upgini 1.2.91a3906.dev1__py3-none-any.whl → 1.2.93__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/unary.py +0 -8
- upgini/dataset.py +58 -109
- upgini/features_enricher.py +225 -187
- upgini/metadata.py +3 -0
- upgini/metrics.py +13 -12
- upgini/resource_bundle/strings.properties +2 -0
- upgini/utils/feature_info.py +2 -2
- upgini/utils/sample_utils.py +416 -0
- upgini/utils/target_utils.py +3 -199
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.93.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.93.dist-info}/RECORD +14 -13
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.93.dist-info}/WHEEL +1 -1
- {upgini-1.2.91a3906.dev1.dist-info → upgini-1.2.93.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.91a3906.dev1"
|
1
|
+
__version__ = "1.2.93"
|
upgini/autofe/unary.py
CHANGED
@@ -190,11 +190,3 @@ class Bin(PandasOperator):
|
|
190
190
|
if isinstance(value, str):
|
191
191
|
return json.loads(value)
|
192
192
|
return value
|
193
|
-
|
194
|
-
|
195
|
-
class Cluster(PandasOperator):
|
196
|
-
name: str = "cluster"
|
197
|
-
is_unary: bool = True
|
198
|
-
input_type: Optional[str] = "vector"
|
199
|
-
output_type: Optional[str] = "category"
|
200
|
-
is_categorical: bool = True
|
upgini/dataset.py
CHANGED
@@ -38,11 +38,7 @@ from upgini.metadata import (
|
|
38
38
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
39
39
|
from upgini.search_task import SearchTask
|
40
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
41
|
-
from upgini.utils.target_utils import (
|
42
|
-
balance_undersample,
|
43
|
-
balance_undersample_forced,
|
44
|
-
balance_undersample_time_series_trunc,
|
45
|
-
)
|
41
|
+
from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample
|
46
42
|
|
47
43
|
try:
|
48
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -52,20 +48,10 @@ except Exception:
|
|
52
48
|
)
|
53
49
|
|
54
50
|
|
55
|
-
class Dataset:
|
51
|
+
class Dataset:
|
56
52
|
MIN_ROWS_COUNT = 100
|
57
53
|
MAX_ROWS = 200_000
|
58
|
-
FIT_SAMPLE_ROWS = 200_000
|
59
|
-
FIT_SAMPLE_THRESHOLD = 200_000
|
60
|
-
FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
|
61
|
-
FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
|
62
|
-
FIT_SAMPLE_THRESHOLD_TS = 54_000
|
63
|
-
FIT_SAMPLE_ROWS_TS = 54_000
|
64
|
-
BINARY_MIN_SAMPLE_THRESHOLD = 5_000
|
65
|
-
MULTICLASS_MIN_SAMPLE_THRESHOLD = 25_000
|
66
54
|
IMBALANCE_THESHOLD = 0.6
|
67
|
-
BINARY_BOOTSTRAP_LOOPS = 5
|
68
|
-
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
69
55
|
MIN_TARGET_CLASS_ROWS = 100
|
70
56
|
MAX_MULTICLASS_CLASS_COUNT = 100
|
71
57
|
MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
|
@@ -88,6 +74,7 @@ class Dataset: # (pd.DataFrame):
|
|
88
74
|
date_column: Optional[str] = None,
|
89
75
|
id_columns: Optional[List[str]] = None,
|
90
76
|
random_state: Optional[int] = None,
|
77
|
+
sample_config: Optional[SampleConfig] = None,
|
91
78
|
rest_client: Optional[_RestClient] = None,
|
92
79
|
logger: Optional[logging.Logger] = None,
|
93
80
|
bundle: Optional[ResourceBundle] = None,
|
@@ -95,6 +82,7 @@ class Dataset: # (pd.DataFrame):
|
|
95
82
|
**kwargs,
|
96
83
|
):
|
97
84
|
self.bundle = bundle or get_custom_bundle()
|
85
|
+
self.sample_config = sample_config or SampleConfig(force_sample_size=self.FORCE_SAMPLE_SIZE)
|
98
86
|
if df is not None:
|
99
87
|
data = df.copy()
|
100
88
|
elif path is not None:
|
@@ -233,109 +221,70 @@ class Dataset: # (pd.DataFrame):
|
|
233
221
|
raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
|
234
222
|
|
235
223
|
def __resample(self, force_downsampling=False):
|
236
|
-
# self.logger.info("Resampling etalon")
|
237
|
-
# Resample imbalanced target. Only train segment (without eval_set)
|
238
|
-
if force_downsampling:
|
239
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
240
|
-
self.data = balance_undersample_forced(
|
241
|
-
df=self.data,
|
242
|
-
target_column=target_column,
|
243
|
-
task_type=self.task_type,
|
244
|
-
cv_type=self.cv_type,
|
245
|
-
date_column=self.date_column,
|
246
|
-
id_columns=self.id_columns,
|
247
|
-
random_state=self.random_state,
|
248
|
-
sample_size=self.FORCE_SAMPLE_SIZE,
|
249
|
-
logger=self.logger,
|
250
|
-
bundle=self.bundle,
|
251
|
-
warning_callback=self.warning_callback,
|
252
|
-
)
|
253
|
-
return
|
254
224
|
|
255
|
-
if EVAL_SET_INDEX in self.data.columns:
|
225
|
+
if EVAL_SET_INDEX in self.data.columns and not force_downsampling:
|
256
226
|
train_segment = self.data[self.data[EVAL_SET_INDEX] == 0]
|
257
227
|
else:
|
258
228
|
train_segment = self.data
|
259
229
|
|
260
|
-
|
261
|
-
self.task_type == ModelTaskType.BINARY and len(train_segment) > self.BINARY_MIN_SAMPLE_THRESHOLD
|
262
|
-
):
|
263
|
-
count = len(train_segment)
|
264
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
265
|
-
target = train_segment[target_column]
|
266
|
-
target_classes_count = target.nunique()
|
230
|
+
self.imbalanced = self.__is_imbalanced(train_segment)
|
267
231
|
|
268
|
-
|
269
|
-
|
270
|
-
|
271
|
-
|
272
|
-
|
273
|
-
|
232
|
+
sample_columns = SampleColumns(
|
233
|
+
ids=self.id_columns,
|
234
|
+
date=self.date_column,
|
235
|
+
target=self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET),
|
236
|
+
eval_set_index=EVAL_SET_INDEX,
|
237
|
+
)
|
274
238
|
|
275
|
-
|
276
|
-
|
277
|
-
|
239
|
+
self.data = sample(
|
240
|
+
train_segment if self.imbalanced else self.data, # for imbalanced data we will be doing transform anyway
|
241
|
+
self.task_type,
|
242
|
+
self.cv_type,
|
243
|
+
self.sample_config,
|
244
|
+
sample_columns,
|
245
|
+
self.random_state,
|
246
|
+
balance=self.imbalanced,
|
247
|
+
force_downsampling=force_downsampling,
|
248
|
+
logger=self.logger,
|
249
|
+
bundle=self.bundle,
|
250
|
+
warning_callback=self.warning_callback,
|
251
|
+
)
|
278
252
|
|
279
|
-
|
280
|
-
|
281
|
-
|
282
|
-
)
|
283
|
-
self.logger.warning(msg)
|
284
|
-
raise ValidationError(msg)
|
253
|
+
def __is_imbalanced(self, data: pd.DataFrame) -> bool:
|
254
|
+
if self.task_type is None or not self.task_type.is_classification():
|
255
|
+
return False
|
285
256
|
|
286
|
-
|
287
|
-
|
288
|
-
|
289
|
-
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
290
|
-
if min_class_count < min_class_threshold:
|
291
|
-
self.imbalanced = True
|
292
|
-
self.data = balance_undersample(
|
293
|
-
df=train_segment,
|
294
|
-
target_column=target_column,
|
295
|
-
task_type=self.task_type,
|
296
|
-
random_state=self.random_state,
|
297
|
-
binary_min_sample_threshold=self.BINARY_MIN_SAMPLE_THRESHOLD,
|
298
|
-
multiclass_min_sample_threshold=self.MULTICLASS_MIN_SAMPLE_THRESHOLD,
|
299
|
-
binary_bootstrap_loops=self.BINARY_BOOTSTRAP_LOOPS,
|
300
|
-
multiclass_bootstrap_loops=self.MULTICLASS_BOOTSTRAP_LOOPS,
|
301
|
-
logger=self.logger,
|
302
|
-
bundle=self.bundle,
|
303
|
-
warning_callback=self.warning_callback,
|
304
|
-
)
|
257
|
+
if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
|
258
|
+
return False
|
305
259
|
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
|
310
|
-
elif not self.imbalanced and EVAL_SET_INDEX in self.data.columns:
|
311
|
-
sample_threshold = self.FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD
|
312
|
-
sample_rows = self.FIT_SAMPLE_WITH_EVAL_SET_ROWS
|
313
|
-
else:
|
314
|
-
sample_threshold = self.FIT_SAMPLE_THRESHOLD
|
315
|
-
sample_rows = self.FIT_SAMPLE_ROWS
|
260
|
+
count = len(data)
|
261
|
+
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
262
|
+
target = data[target_column]
|
263
|
+
target_classes_count = target.nunique()
|
316
264
|
|
317
|
-
if
|
318
|
-
self.
|
319
|
-
|
320
|
-
f"and will be downsampled to {sample_rows}"
|
265
|
+
if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
|
266
|
+
msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
|
267
|
+
target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
|
321
268
|
)
|
322
|
-
|
323
|
-
|
324
|
-
|
325
|
-
|
326
|
-
|
327
|
-
|
328
|
-
|
329
|
-
|
330
|
-
|
331
|
-
|
332
|
-
|
333
|
-
|
334
|
-
|
335
|
-
|
336
|
-
|
337
|
-
|
338
|
-
|
269
|
+
self.logger.warning(msg)
|
270
|
+
raise ValidationError(msg)
|
271
|
+
|
272
|
+
vc = target.value_counts()
|
273
|
+
min_class_value = vc.index[len(vc) - 1]
|
274
|
+
min_class_count = vc[min_class_value]
|
275
|
+
|
276
|
+
if min_class_count < self.MIN_TARGET_CLASS_ROWS:
|
277
|
+
msg = self.bundle.get("dataset_rarest_class_less_min").format(
|
278
|
+
min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
|
279
|
+
)
|
280
|
+
self.logger.warning(msg)
|
281
|
+
raise ValidationError(msg)
|
282
|
+
|
283
|
+
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
284
|
+
min_class_threshold = min_class_percent * count
|
285
|
+
|
286
|
+
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
287
|
+
return bool(min_class_count < min_class_threshold)
|
339
288
|
|
340
289
|
def __validate_dataset(self, validate_target: bool, silent_mode: bool):
|
341
290
|
"""Validate DataSet"""
|
@@ -617,8 +566,8 @@ class Dataset: # (pd.DataFrame):
|
|
617
566
|
def _set_sample_size(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|
618
567
|
if runtime_parameters is not None and runtime_parameters.properties is not None:
|
619
568
|
if self.cv_type is not None and self.cv_type.is_time_series():
|
620
|
-
runtime_parameters.properties["sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
621
|
-
runtime_parameters.properties["iter0_sample_size"] = self.FIT_SAMPLE_ROWS_TS
|
569
|
+
runtime_parameters.properties["sample_size"] = self.sample_config.fit_sample_rows_ts
|
570
|
+
runtime_parameters.properties["iter0_sample_size"] = self.sample_config.fit_sample_rows_ts
|
622
571
|
return runtime_parameters
|
623
572
|
|
624
573
|
def _clean_generate_features(self, runtime_parameters: Optional[RuntimeParameters]) -> Optional[RuntimeParameters]:
|