upgini-1.2.113a3974.dev1-py3-none-any.whl → upgini-1.2.114-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their public registry, and is provided for informational purposes only.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +8 -4
- upgini/dataset.py +48 -78
- upgini/features_enricher.py +726 -516
- upgini/http.py +15 -19
- upgini/metadata.py +1 -10
- upgini/metrics.py +6 -2
- upgini/resource_bundle/strings.properties +8 -6
- upgini/sampler/base.py +3 -1
- upgini/sampler/random_under_sampler.py +18 -8
- upgini/search_task.py +6 -0
- upgini/utils/config.py +43 -0
- upgini/utils/deduplicate_utils.py +57 -9
- upgini/utils/display_utils.py +1 -1
- upgini/utils/feature_info.py +5 -0
- upgini/utils/hash_utils.py +159 -0
- upgini/utils/psi.py +300 -0
- upgini/utils/sample_utils.py +45 -42
- upgini/utils/target_utils.py +53 -2
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/METADATA +62 -32
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/RECORD +23 -20
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/WHEEL +1 -1
- {upgini-1.2.113a3974.dev1.dist-info → upgini-1.2.114.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.113a3974.dev1"
+__version__ = "1.2.114"
upgini/autofe/date.py
CHANGED
@@ -244,7 +244,8 @@ class DateListDiffBounded(DateListDiff, ParametrizedOperator):
 
 class DatePercentileBase(PandasOperator, abc.ABC):
     is_binary: bool = True
-
+    is_categorical: bool = True
+    output_type: Optional[str] = "category"
 
     date_unit: Optional[str] = None
 
@@ -254,7 +255,12 @@ class DatePercentileBase(PandasOperator, abc.ABC):
 
         bounds = self._get_bounds(left)
 
-        return
+        return (
+            right.index.to_series()
+            .apply(lambda i: self._perc(right[i], bounds[i]))
+            .astype(pd.Int64Dtype())
+            .astype("category")
+        )
 
     @abc.abstractmethod
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
@@ -318,8 +324,6 @@ class DatePercentile(DatePercentileBase):
 
 class DatePercentileMethod2(DatePercentileBase):
     name: str = "date_per_method2"
-    is_categorical: bool = True
-    output_type: Optional[str] = "category"
 
     def _get_bounds(self, date_col: pd.Series) -> pd.Series:
         pass
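Note: the reworked calc pipeline now returns nullable integer percentile bins cast to a pandas categorical, matching the `output_type = "category"` that `DatePercentileBase` now declares for all subclasses. A minimal standalone sketch of that chain, with a hypothetical `perc` standing in for the operator's `_perc` lookup:

import pandas as pd

values = pd.Series([5.0, 42.0, None, 17.0])
bounds = pd.Series([[10, 30]] * 4)  # per-row percentile bounds

def perc(value, bnds):
    # Hypothetical stand-in for DatePercentileBase._perc:
    # index of the first bound that contains the value.
    if pd.isna(value):
        return None
    return next((i for i, b in enumerate(bnds) if value <= b), len(bnds))

codes = (
    values.index.to_series()
    .apply(lambda i: perc(values[i], bounds[i]))
    .astype(pd.Int64Dtype())   # nullable ints: missing rows stay <NA>, not float NaN
    .astype("category")        # matches the declared output_type
)
print(codes.tolist())  # [0, 2, nan, 1] as categorical codes

The intermediate `Int64Dtype` cast keeps rows with missing bins as `<NA>` instead of silently promoting the codes to float before the categorical cast.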
upgini/dataset.py
CHANGED
@@ -25,7 +25,6 @@ from upgini.metadata import (
     AutoFEParameters,
     CVType,
     DataType,
-    FeaturesFilter,
     FileColumnMeaningType,
     FileColumnMetadata,
     FileMetadata,
@@ -37,8 +36,9 @@ from upgini.metadata import (
 )
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.search_task import SearchTask
+from upgini.utils.config import SampleConfig
 from upgini.utils.email_utils import EmailSearchKeyConverter
-from upgini.utils.sample_utils import SampleColumns,
+from upgini.utils.sample_utils import SampleColumns, sample
 
 try:
     from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
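`SampleConfig` comes from the new `upgini/utils/config.py` and is now threaded through `Dataset` in place of hard-coded class constants. Its fields aren't visible in this diff; the only one referenced elsewhere (in the removed `__is_imbalanced` below) is `binary_min_sample_threshold`, so this sketch is an assumption-laden stand-in, not the real class:

from dataclasses import dataclass

@dataclass
class SampleConfigSketch:
    # Hypothetical shape; the real upgini.utils.config.SampleConfig may differ.
    # Only binary_min_sample_threshold is actually referenced in this diff.
    binary_min_sample_threshold: int = 5_000  # assumed default

# Wiring per the constructor change below: Dataset(..., sample_config=SampleConfigSketch())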
@@ -50,10 +50,7 @@ except Exception:
 
 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS =
-    IMBALANCE_THESHOLD = 0.6
-    MIN_TARGET_CLASS_ROWS = 100
-    MAX_MULTICLASS_CLASS_COUNT = 100
+    MAX_ROWS = 200_000
     MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
     MAX_FEATURES_COUNT = 3500
     MAX_UPLOADING_FILE_SIZE = 268435456  # 256 Mb
@@ -73,6 +70,7 @@ class Dataset:
         cv_type: Optional[CVType] = None,
         date_column: Optional[str] = None,
         id_columns: Optional[List[str]] = None,
+        is_imbalanced: bool = False,
         random_state: Optional[int] = None,
         sample_config: Optional[SampleConfig] = None,
         rest_client: Optional[_RestClient] = None,
@@ -117,8 +115,9 @@ class Dataset:
         self.rest_client = rest_client
         self.random_state = random_state
         self.columns_renaming: Dict[str, str] = {}
-        self.imbalanced: bool = False
+        self.is_imbalanced: bool = False
         self.id_columns = id_columns
+        self.is_imbalanced = is_imbalanced
         self.date_column = date_column
         if logger is not None:
             self.logger = logger
@@ -184,7 +183,19 @@ class Dataset:
     def __validate_target(self):
         # self.logger.info("Validating target")
         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-        target = self.data[target_column]
+
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target_column].isna().all():
+                    oot_indices.append(eval_set_index)
+
+        df_to_check = self.data.copy()
+        if oot_indices:
+            df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+        target = df_to_check[target_column]
 
         if self.task_type == ModelTaskType.BINARY:
             if not is_integer_dtype(target):
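Both `__validate_target` here and `__validate_dataset` below now use the same convention: an eval segment whose target is entirely NaN is treated as out-of-time (OOT) and excluded before target checks. A self-contained sketch of the pattern; the `"eval_set_index"` column name is an assumed stand-in for the literal value of `upgini.metadata.EVAL_SET_INDEX`:

import numpy as np
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumed literal value

df = pd.DataFrame({
    EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
    "target": [1.0, 0.0, 0.0, 1.0, np.nan, np.nan],  # segment 2 has no labels -> OOT
})

oot_indices = [
    idx for idx in df[EVAL_SET_INDEX].unique()
    if df.loc[df[EVAL_SET_INDEX] == idx, "target"].isna().all()
]
df_to_check = df[~df[EVAL_SET_INDEX].isin(oot_indices)]
print(oot_indices)       # [2]
print(len(df_to_check))  # 4 rows kept for target validation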
@@ -201,7 +212,7 @@ class Dataset:
         elif self.task_type == ModelTaskType.MULTICLASS:
             if not is_integer_dtype(target):
                 try:
-                    target =
+                    target = target.astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
                     raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
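For multiclass targets that aren't integer-typed, the new code derives labels via pandas category codes; the surrounding try/except turns any failed cast into a `ValidationError`. A toy illustration of what the cast produces:

import pandas as pd

target = pd.Series(["cat", "dog", "cat", "bird"])
codes = target.astype("category").cat.codes
print(codes.tolist())  # [1, 2, 1, 0] -- codes follow sorted category order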
@@ -227,8 +238,6 @@ class Dataset:
         else:
             train_segment = self.data
 
-        self.imbalanced = self.__is_imbalanced(train_segment)
-
         sample_columns = SampleColumns(
             ids=self.id_columns,
             date=self.date_column,
@@ -237,55 +246,19 @@ class Dataset:
         )
 
         self.data = sample(
-            train_segment if self.imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
+            train_segment if self.is_imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
             self.task_type,
             self.cv_type,
             self.sample_config,
             sample_columns,
             self.random_state,
-            balance=self.imbalanced,
+            balance=self.is_imbalanced,
             force_downsampling=force_downsampling,
             logger=self.logger,
             bundle=self.bundle,
             warning_callback=self.warning_callback,
         )
 
-    def __is_imbalanced(self, data: pd.DataFrame) -> bool:
-        if self.task_type is None or not self.task_type.is_classification():
-            return False
-
-        if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
-            return False
-
-        count = len(data)
-        target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-        target = data[target_column]
-        target_classes_count = target.nunique()
-
-        if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-            msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
-                target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
-            )
-            self.logger.warning(msg)
-            raise ValidationError(msg)
-
-        vc = target.value_counts()
-        min_class_value = vc.index[len(vc) - 1]
-        min_class_count = vc[min_class_value]
-
-        if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-            msg = self.bundle.get("dataset_rarest_class_less_min").format(
-                min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
-            )
-            self.logger.warning(msg)
-            raise ValidationError(msg)
-
-        min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
-        min_class_threshold = min_class_percent * count
-
-        # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
-        return bool(min_class_count < min_class_threshold)
-
     def __validate_dataset(self, validate_target: bool, silent_mode: bool):
         """Validate DataSet"""
         # self.logger.info("validating etalon")
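The removed `__is_imbalanced` heuristic is worth spelling out, since its verdict now arrives pre-computed through the new `is_imbalanced` constructor flag (where the caller computes it isn't shown in this section; `features_enricher.py` or `utils/target_utils.py` are plausible homes given their changed line counts, but that is an inference). The rule: the rarest class must hold at least `IMBALANCE_THESHOLD / n_classes` of the rows, i.e. 30% for binary, 20% for three classes, and so on. A standalone re-implementation of just the maths, minus the validation and logging plumbing:

import pandas as pd

IMBALANCE_THRESHOLD = 0.6  # spelled IMBALANCE_THESHOLD in the removed code

def is_imbalanced(target: pd.Series) -> bool:
    # Mirrors the removed Dataset.__is_imbalanced maths (class-count and
    # min-rows validations omitted).
    vc = target.value_counts()
    min_class_count = vc.iloc[-1]  # rarest class
    min_class_threshold = (IMBALANCE_THRESHOLD / target.nunique()) * len(target)
    return bool(min_class_count < min_class_threshold)

binary = pd.Series([0] * 7_500 + [1] * 2_500)
print(is_imbalanced(binary))  # True: 2_500 < 0.3 * 10_000 = 3_000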
@@ -335,15 +308,37 @@ class Dataset:
         all_valid_message = self.bundle.get("validation_all_valid_message")
         invalid_message = self.bundle.get("validation_invalid_message")
 
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target].isna().all():
+                    oot_indices.append(eval_set_index)
+
         for col in columns_to_validate:
-            self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-
+                if oot_indices:
+                    mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                    invalid_target_mask = (
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+                    # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = True
+                    self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                else:
+                    # No OOT: mark invalid where target is NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = ~(
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+            else:
+                self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
 
-
+            # Use stable pandas API across versions: Series.unique keeps order
+            # and collapses multiple NaNs into a single NaN
+            invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
             valid_share = self.data[f"{col}_is_valid"].sum() / nrows
             original_col_name = self.columns_renaming[col]
             validation_stats[original_col_name] = {}
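The reworked validity loop treats NaN and ±inf as invalid target values, but only outside OOT segments. The no-OOT branch in isolation, on toy data:

import numpy as np
import pandas as pd

col = pd.Series([1.0, np.nan, np.inf, -np.inf, 0.0])
is_valid = ~(col.isnull() | (col == np.inf) | (col == -np.inf))
print(is_valid.tolist())  # [True, False, False, False, True]

With OOT segments present, the column is initialized to all-True and only non-OOT rows matching the invalid mask are flipped to False, so unlabeled OOT rows are never counted as invalid.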
@@ -503,9 +498,6 @@ class Dataset:
         return_scores: bool,
         extract_features: bool,
         accurate_model: Optional[bool] = None,
-        importance_threshold: Optional[float] = None,
-        max_features: Optional[int] = None,
-        filter_features: Optional[dict] = None,
         runtime_parameters: Optional[RuntimeParameters] = None,
         metrics_calculation: Optional[bool] = False,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
@@ -514,28 +506,12 @@ class Dataset:
         search_customization = SearchCustomization(
             extractFeatures=extract_features,
             accurateModel=accurate_model,
-            importanceThreshold=importance_threshold,
-            maxFeatures=max_features,
             returnScores=return_scores,
             runtimeParameters=runtime_parameters,
             metricsCalculation=metrics_calculation,
         )
-        if filter_features:
-            if [
-                key
-                for key in filter_features
-                if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
-            ]:
-                raise ValidationError(self.bundle.get("dataset_invalid_filter"))
-            feature_filter = FeaturesFilter(
-                minImportance=filter_features.get("min_importance"),
-                maxPSI=filter_features.get("max_psi"),
-                maxCount=filter_features.get("max_count"),
-                selectedFeatures=filter_features.get("selected_features"),
-            )
-            search_customization.featuresFilter = feature_filter
 
-        search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
+        search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
         if auto_fe_parameters is not None:
             search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
                 auto_fe_parameters.ts_gap_days
@@ -590,9 +566,6 @@ class Dataset:
         extract_features: bool = False,
         accurate_model: bool = False,
         exclude_features_sources: Optional[List[str]] = None,
-        importance_threshold: Optional[float] = None,  # deprecated
-        max_features: Optional[int] = None,  # deprecated
-        filter_features: Optional[dict] = None,  # deprecated
         runtime_parameters: Optional[RuntimeParameters] = None,
         auto_fe_parameters: Optional[AutoFEParameters] = None,
         force_downsampling: bool = False,
@@ -609,9 +582,6 @@ class Dataset:
             return_scores=return_scores,
             extract_features=extract_features,
             accurate_model=accurate_model,
-            importance_threshold=importance_threshold,
-            max_features=max_features,
-            filter_features=filter_features,
             runtime_parameters=runtime_parameters,
             auto_fe_parameters=auto_fe_parameters,
         )