upgini 1.2.114a1.tar.gz → 1.2.114a2.tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.114a1 → upgini-1.2.114a2}/PKG-INFO +1 -1
- upgini-1.2.114a2/src/upgini/__about__.py +1 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/dataset.py +37 -5
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/features_enricher.py +31 -15
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings.properties +2 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/deduplicate_utils.py +30 -18
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/sample_utils.py +30 -2
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/target_utils.py +6 -1
- upgini-1.2.114a1/src/upgini/__about__.py +0 -1
- {upgini-1.2.114a1 → upgini-1.2.114a2}/.gitignore +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/LICENSE +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/README.md +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/pyproject.toml +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/ads.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/errors.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/http.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/metadata.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/metrics.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/search_task.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/spinner.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/psi.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/version_validator.py +0 -0
upgini-1.2.114a2/src/upgini/__about__.py
@@ -0,0 +1 @@
+__version__ = "1.2.114a2"
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/dataset.py
@@ -50,7 +50,7 @@ except Exception:
 
 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS =
+    MAX_ROWS = 200_000
     IMBALANCE_THESHOLD = 0.6
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
@@ -184,7 +184,19 @@ class Dataset:
     def __validate_target(self):
         # self.logger.info("Validating target")
         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-        target = self.data[target_column]
+
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target_column].isna().all():
+                    oot_indices.append(eval_set_index)
+
+        df_to_check = self.data.copy()
+        if oot_indices:
+            df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+        target = df_to_check[target_column]
 
         if self.task_type == ModelTaskType.BINARY:
             if not is_integer_dtype(target):
@@ -201,7 +213,7 @@ class Dataset:
         elif self.task_type == ModelTaskType.MULTICLASS:
             if not is_integer_dtype(target):
                 try:
-                    target =
+                    target = target.astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
                     raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
@@ -335,10 +347,30 @@ class Dataset:
         all_valid_message = self.bundle.get("validation_all_valid_message")
         invalid_message = self.bundle.get("validation_invalid_message")
 
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target].isna().all():
+                    oot_indices.append(eval_set_index)
+
         for col in columns_to_validate:
-            self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-
+                if oot_indices:
+                    mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                    invalid_target_mask = (
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+                    # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = True
+                    self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                else:
+                    # No OOT: mark invalid where target is NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = ~(
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+            else:
+                self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
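The dataset.py changes above all rely on one convention: an eval_set part whose target column is entirely NaN is treated as out-of-time (OOT) and excluded from target validation. A minimal standalone sketch of that detection, assuming simplified column names in place of upgini's metadata constants and a hypothetical helper name:

import numpy as np
import pandas as pd

EVAL_SET_INDEX = "eval_set_index"  # assumption: stands in for the library's constant
TARGET = "target"


def find_oot_indices(df: pd.DataFrame) -> list:
    """Return eval_set_index values whose target is entirely NaN (treated as OOT)."""
    if EVAL_SET_INDEX not in df.columns:
        return []
    return [idx for idx, part in df.groupby(EVAL_SET_INDEX) if part[TARGET].isna().all()]


df = pd.DataFrame(
    {
        EVAL_SET_INDEX: [0, 0, 1, 1, 2, 2],
        TARGET: [1, 0, 0, 1, np.nan, np.nan],  # eval set 2 has no labels -> OOT
    }
)
oot = find_oot_indices(df)                        # [2]
train_like = df[~df[EVAL_SET_INDEX].isin(oot)]    # rows that still go through NaN/inf target checks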
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/features_enricher.py
@@ -398,16 +398,15 @@ class FeaturesEnricher(TransformerMixin):
 
     api_key = property(_get_api_key, _set_api_key)
 
-    @staticmethod
-    def _check_eval_set(eval_set, X, bundle: ResourceBundle):
+    def _check_eval_set(self, eval_set, X):
         checked_eval_set = []
         if eval_set is None:
             return checked_eval_set
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         if not isinstance(eval_set, list):
-            raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
-        for eval_pair in eval_set or []:
+            raise ValidationError(self.bundle.get("unsupported_type_eval_set").format(type(eval_set)))
+        for i, eval_pair in enumerate(eval_set or [], 1):
             # Handle OOT
             if isinstance(eval_pair, pd.DataFrame):
                 empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
@@ -417,12 +416,17 @@ class FeaturesEnricher(TransformerMixin):
                 eval_pair = (eval_pair[0], empty_target)
 
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
-                raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+                raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
             if eval_pair[1] is None:
                 empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
                 eval_pair = (eval_pair[0], empty_target)
-
+
+            if not is_frames_equal(X, eval_pair[0], self.bundle):
                 checked_eval_set.append(eval_pair)
+            else:
+                msg = f"Eval set {i} is equal to train set and will be ignored"
+                self.logger.warning(msg)
+                print(msg)
         return checked_eval_set
 
     def fit(
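_check_eval_set is now an instance method: it normalizes entries without labels (a bare DataFrame, or a pair with y=None) into an all-NaN target Series that marks the part as OOT, and drops eval parts identical to the training frame. A rough standalone approximation of that normalization; DataFrame.equals stands in for upgini's is_frames_equal helper here:

import numpy as np
import pandas as pd


def normalize_eval_set(eval_set, X):
    """Sketch: coerce eval_set entries to (X_eval, y_eval) pairs; an all-NaN y marks OOT."""
    if eval_set is None:
        return []
    if isinstance(eval_set, tuple):
        eval_set = [eval_set]

    checked = []
    for i, pair in enumerate(eval_set, 1):
        if isinstance(pair, pd.DataFrame):
            # Bare frame without labels -> out-of-time eval set
            pair = (pair, pd.Series([np.nan] * len(pair), index=pair.index))
        elif pair[1] is None:
            pair = (pair[0], pd.Series([np.nan] * len(pair[0]), index=pair[0].index))

        if pair[0].equals(X):  # stand-in for the is_frames_equal check
            print(f"Eval set {i} is equal to train set and will be ignored")
            continue
        checked.append(pair)
    return checked


X = pd.DataFrame({"date": ["2024-01-01", "2024-02-01"], "feature": [1, 2]})
oot_frame = pd.DataFrame({"date": ["2024-06-01"], "feature": [3]})
checked = normalize_eval_set([oot_frame, (X, None)], X)
print(len(checked))  # 1 -- the pair that duplicates the train frame is dropped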
@@ -517,7 +521,7 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.dump_input(trace_id, X, y, self.eval_set)
             self.__set_select_features(select_features)
             self.__inner_fit(
@@ -678,7 +682,7 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.__set_select_features(select_features)
             self.dump_input(trace_id, X, y, self.eval_set)
 
@@ -953,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
         effective_X = X if X is not None else self.X
         effective_y = y if y is not None else self.y
         effective_eval_set = eval_set if eval_set is not None else self.eval_set
-        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
+        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X)
 
         if (
             self._search_task is None
@@ -1471,14 +1475,17 @@ class FeaturesEnricher(TransformerMixin):
         date_column = self._get_date_column(search_keys)
 
         # Get minimum date from main dataset X
-        main_min_date = X[date_column].min()
+        main_min_date = X[date_column].dropna().min()
 
         # Find minimum date for each eval_set and compare with main dataset
         eval_dates = []
         for i, (eval_x, _) in enumerate(eval_set):
             if date_column in eval_x.columns:
-                eval_min_date = eval_x[date_column].min()
-                eval_max_date = eval_x[date_column].max()
+                if len(eval_x) < 1000:
+                    self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
+                    continue
+                eval_min_date = eval_x[date_column].dropna().min()
+                eval_max_date = eval_x[date_column].dropna().max()
                 eval_dates.append((i, eval_min_date, eval_max_date))
 
         if not eval_dates:
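The stability-check change ignores missing dates and skips eval sets under 1000 rows when collecting date ranges. A small illustrative sketch under those assumptions (the 1000-row cutoff mirrors the diff; the function and column names are placeholders):

import pandas as pd

MIN_STABILITY_ROWS = 1000  # cutoff taken from the diff


def eval_date_ranges(eval_set, date_column="date"):
    """Collect (index, min_date, max_date) per eval frame, skipping small frames and NaT values."""
    ranges = []
    for i, (eval_x, _) in enumerate(eval_set):
        if date_column not in eval_x.columns or len(eval_x) < MIN_STABILITY_ROWS:
            continue  # no date key, or too few rows to judge stability
        dates = pd.to_datetime(eval_x[date_column]).dropna()
        ranges.append((i, dates.min(), dates.max()))
    return ranges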
@@ -1679,7 +1686,7 @@ class FeaturesEnricher(TransformerMixin):
         if X is None:
             return True, self.X, self.y, self.eval_set
 
-        checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
+        checked_eval_set = self._check_eval_set(eval_set, X)
 
         if (
             X is self.X
@@ -1783,7 +1790,7 @@ class FeaturesEnricher(TransformerMixin):
         ):
             is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
             is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-            checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
+            checked_eval_set = self._check_eval_set(eval_set, X)
             validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
             sampled_data = self._get_enriched_for_metrics(
@@ -3246,6 +3253,15 @@ if response.status_code == 200:
         else:
             self.__log_warning(full_duplicates_warning)
 
+        # Check if OOT eval set still more than 1000 rows
+        if EVAL_SET_INDEX in df.columns:
+            for eval_set_index in df[EVAL_SET_INDEX].unique():
+                if eval_set_index == 0:
+                    continue
+                eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+                if np.all(pd.isna(eval_set_df[TARGET])) and len(eval_set_df) < 1000:
+                    self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))
+
         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
             df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
@@ -3823,7 +3839,7 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-            is_oot = eval[1]
+            is_oot = np.all(pd.isna(eval[1]))
             if not is_oot:
                 if self.baseline_score_column not in eval[0].columns:
                     raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings.properties
@@ -165,6 +165,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -185,6 +186,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation
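Both new message keys are plain str.format templates: the OOT duplicates message takes three placeholders, the size warning takes one. A quick illustration of how the code fills them (the values below are made up):

dataset_diff_target_duplicates_oot = (
    "{:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. "
    "These rows will be deleted from OOT\nSample of incorrect row indexes: {}"
)
oot_eval_set_too_small_after_dedup = (
    "OOT eval set {} has less than 1000 rows after deduplication. "
    "It will be ignored for stability check"
)

print(dataset_diff_target_duplicates_oot.format(1.2345, 37, [10, 42, 77]))
print(oot_eval_set_too_small_after_dedup.format(2))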
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/deduplicate_utils.py
@@ -134,10 +134,12 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
         # Skip OOT
         if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
             continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
@@ -148,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -196,30 +198,30 @@ def clean_full_duplicates(
 
     # Separate rows to exclude from deduplication:
     # for each eval_set_index != 0 check separately, all TARGET values are NaN
-    excluded_from_dedup = pd.DataFrame()
     df_for_dedup = df
+    oot_df = None
 
     if EVAL_SET_INDEX in df.columns:
-
-
-
-
-        for eval_idx in unique_eval_indices:
+        oot_eval_dfs = []
+        other_dfs = []
+        for eval_idx in df[EVAL_SET_INDEX].unique():
             eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
             # Check that all TARGET values for this specific eval_set_index are NaN
-            if
-
+            if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                oot_eval_dfs.append(eval_subset)
                 logger.info(
                     f"Excluded {len(eval_subset)} rows from deduplication "
                     f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
                 )
+            else:
+                other_dfs.append(eval_subset)
+
+        if oot_eval_dfs:
+            oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+            df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+        else:
+            df_for_dedup = df
 
-    # Combine all excluded parts
-    if excluded_parts:
-        excluded_from_dedup = pd.concat(excluded_parts, ignore_index=False)
-        # Remove excluded rows from dataframe for deduplication
-        excluded_indices = excluded_from_dedup.index
-        df_for_dedup = df[~df.index.isin(excluded_indices)]
     marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
     if marked_duplicates.sum() > 0:
         dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
@@ -231,8 +233,18 @@ def clean_full_duplicates(
     df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
     logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
     # Combine back excluded rows
-    if
-        df = pd.concat([df_for_dedup,
+    if oot_df is not None:
+        df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+        if marked_duplicates.sum() > 0:
+            dups_indices = df[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df)
+            msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                share_tgt_dedup, num_dup_rows, dups_indices
+            )
+        df = df.drop_duplicates(subset=unique_columns, keep="first")
         logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
     else:
         df = df_for_dedup
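The clean_full_duplicates change keeps unlabeled (all-NaN target) eval parts out of the main duplicate pass, then concatenates them back and drops any OOT row that collides with a kept train/eval row. A compressed standalone sketch of that flow, with hypothetical column and key names:

import numpy as np
import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"
unique_columns = ["date", "phone"]  # assumption: stand-in for the real search-key columns


def clean_duplicates_with_oot(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate labeled rows strictly; then drop OOT rows duplicating kept rows."""
    is_oot = df.groupby(EVAL_SET_INDEX)[TARGET].transform(lambda t: t.isna().all()) & (
        df[EVAL_SET_INDEX] != 0
    )
    labeled, oot = df[~is_oot], df[is_oot]

    # Labeled rows: rows sharing the same keys are dropped entirely (keep=False)
    labeled = labeled.drop_duplicates(subset=unique_columns, keep=False)

    # OOT rows: keep the first occurrence when they collide with labeled rows
    combined = pd.concat([labeled, oot], ignore_index=False)
    return combined.drop_duplicates(subset=unique_columns, keep="first")


df = pd.DataFrame(
    {
        EVAL_SET_INDEX: [0, 0, 1, 1],
        TARGET: [1, 0, np.nan, np.nan],
        "date": ["d1", "d2", "d1", "d3"],
        "phone": ["p1", "p2", "p1", "p3"],
    }
)
print(clean_duplicates_with_oot(df))  # the OOT row colliding with ("d1", "p1") is dropped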
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/sample_utils.py
@@ -5,7 +5,7 @@ from typing import Callable, List, Optional
 import numpy as np
 import pandas as pd
 
-from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, CVType, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils.target_utils import balance_undersample
 from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
@@ -117,6 +117,22 @@ def sample(
         **kwargs,
     )
 
+    # separate OOT
+    oot_dfs = []
+    other_dfs = []
+    if EVAL_SET_INDEX in df.columns:
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                oot_dfs.append(eval_df)
+            else:
+                other_dfs.append(eval_df)
+    if len(oot_dfs) > 0:
+        oot_df = pd.concat(oot_dfs, ignore_index=False)
+        df = pd.concat(other_dfs, ignore_index=False)
+    else:
+        oot_df = None
+
     num_samples = _num_samples(df)
     if num_samples > fit_sample_threshold:
         logger.info(
@@ -126,6 +142,18 @@ def sample(
         df = df.sample(n=fit_sample_rows, random_state=random_state)
         logger.info(f"Shape after threshold resampling: {df.shape}")
 
+    if oot_df is not None:
+        num_samples_oot = _num_samples(oot_df)
+        if num_samples_oot > fit_sample_threshold:
+            logger.info(
+                f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                f"and will be downsampled to {fit_sample_rows}"
+            )
+            oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+        df = pd.concat([df, oot_df], ignore_index=False)
+
+        logger.info(f"Dataset size after downsampling: {len(df)}")
+
     return df
 
 
@@ -175,7 +203,7 @@ def sample_time_series_train_eval(
         )
         if logger is not None:
             logger.info(f"Eval set size: {len(eval_df)}")
-        df = pd.concat([train_df, eval_df])
+        df = pd.concat([train_df, eval_df], ignore_index=False)
 
     elif len(train_df) > max_rows:
         df = sample_time_series_trunc(
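The pattern in sample(): unlabeled (OOT) rows are split off before the fit-row downsampling so they are capped independently of the labeled train/eval rows, then appended back. A rough standalone sketch under assumed thresholds and column names:

import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"
FIT_SAMPLE_THRESHOLD, FIT_SAMPLE_ROWS = 200_000, 100_000  # assumption: illustrative values only


def downsample_with_oot(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Downsample labeled rows and OOT rows separately, then recombine."""
    is_oot = (
        df.groupby(EVAL_SET_INDEX)[TARGET].transform(lambda t: t.isna().all())
        if EVAL_SET_INDEX in df.columns
        else pd.Series(False, index=df.index)
    )
    labeled, oot = df[~is_oot], df[is_oot]

    if len(labeled) > FIT_SAMPLE_THRESHOLD:
        labeled = labeled.sample(n=FIT_SAMPLE_ROWS, random_state=random_state)
    if len(oot) > FIT_SAMPLE_THRESHOLD:
        oot = oot.sample(n=FIT_SAMPLE_ROWS, random_state=random_state)

    return pd.concat([labeled, oot], ignore_index=False)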
{upgini-1.2.114a1 → upgini-1.2.114a2}/src/upgini/utils/target_utils.py
@@ -6,7 +6,7 @@ import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 
@@ -132,6 +132,11 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
 
+    # Rebalance and send to server only train data
+    # because eval set data will be sent separately in transform for metrics
+    if EVAL_SET_INDEX in df.columns:
+        df = df[df[EVAL_SET_INDEX] == 0]
+
     target = df[target_column].copy()
 
     vc = target.value_counts()
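balance_undersample now restricts rebalancing to the train partition (eval_set_index == 0), since eval rows are sent separately for metrics. A minimal sketch of that guard plus a naive undersampling step; the rebalancing here is simplified, whereas the library uses its RandomUnderSampler:

import pandas as pd

EVAL_SET_INDEX, TARGET = "eval_set_index", "target"


def undersample_train_only(df: pd.DataFrame, random_state: int = 42) -> pd.DataFrame:
    """Keep only train rows, then naively cap every class at the minority-class size."""
    if EVAL_SET_INDEX in df.columns:
        df = df[df[EVAL_SET_INDEX] == 0]  # eval/OOT rows are excluded from rebalancing

    min_class = df[TARGET].value_counts().min()
    return df.groupby(TARGET, group_keys=False).apply(
        lambda g: g.sample(n=min_class, random_state=random_state)
    )


df = pd.DataFrame({EVAL_SET_INDEX: [0] * 6 + [1] * 2, TARGET: [0, 0, 0, 0, 1, 1, 0, 1]})
print(undersample_train_only(df)[TARGET].value_counts())  # classes 0 and 1 both capped at 2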
upgini-1.2.114a1/src/upgini/__about__.py
@@ -1 +0,0 @@
-__version__ = "1.2.114a1"