upgini 1.2.114a1__tar.gz → 1.2.114a3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. {upgini-1.2.114a1 → upgini-1.2.114a3}/PKG-INFO +1 -1
  2. upgini-1.2.114a3/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/dataset.py +37 -5
  4. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/features_enricher.py +35 -15
  5. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/resource_bundle/strings.properties +2 -0
  6. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/deduplicate_utils.py +30 -18
  7. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/psi.py +3 -3
  8. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/sample_utils.py +30 -2
  9. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/target_utils.py +6 -1
  10. upgini-1.2.114a1/src/upgini/__about__.py +0 -1
  11. {upgini-1.2.114a1 → upgini-1.2.114a3}/.gitignore +0 -0
  12. {upgini-1.2.114a1 → upgini-1.2.114a3}/LICENSE +0 -0
  13. {upgini-1.2.114a1 → upgini-1.2.114a3}/README.md +0 -0
  14. {upgini-1.2.114a1 → upgini-1.2.114a3}/pyproject.toml +0 -0
  15. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/__init__.py +0 -0
  16. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/ads.py +0 -0
  17. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/all_operators.py +0 -0
  21. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/date.py +0 -0
  23. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/feature.py +0 -0
  24. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/groupby.py +0 -0
  25. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/operator.py +0 -0
  26. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/__init__.py +0 -0
  27. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/base.py +0 -0
  28. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/cross.py +0 -0
  29. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/delta.py +0 -0
  30. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/lag.py +0 -0
  31. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/roll.py +0 -0
  32. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/trend.py +0 -0
  33. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/timeseries/volatility.py +0 -0
  34. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/unary.py +0 -0
  35. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/utils.py +0 -0
  36. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/autofe/vector.py +0 -0
  37. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/data_source/__init__.py +0 -0
  38. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/data_source/data_source_publisher.py +0 -0
  39. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/errors.py +0 -0
  40. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/http.py +0 -0
  41. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/mdc/__init__.py +0 -0
  42. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/mdc/context.py +0 -0
  43. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/metadata.py +0 -0
  44. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/metrics.py +0 -0
  45. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/normalizer/__init__.py +0 -0
  46. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/normalizer/normalize_utils.py +0 -0
  47. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/resource_bundle/__init__.py +0 -0
  48. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/resource_bundle/exceptions.py +0 -0
  49. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  50. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/sampler/__init__.py +0 -0
  51. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/sampler/base.py +0 -0
  52. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/sampler/random_under_sampler.py +0 -0
  53. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/sampler/utils.py +0 -0
  54. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/search_task.py +0 -0
  55. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/spinner.py +0 -0
  56. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  57. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/__init__.py +0 -0
  58. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/base_search_key_detector.py +0 -0
  59. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/blocked_time_series.py +0 -0
  60. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/country_utils.py +0 -0
  61. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/custom_loss_utils.py +0 -0
  62. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/cv_utils.py +0 -0
  63. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/datetime_utils.py +0 -0
  64. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/display_utils.py +0 -0
  65. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/email_utils.py +0 -0
  66. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/fallback_progress_bar.py +0 -0
  67. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/feature_info.py +0 -0
  68. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/features_validator.py +0 -0
  69. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/format.py +0 -0
  70. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/sklearn_ext.py +0 -0
  76. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/sort.py +0 -0
  77. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/track_info.py +0 -0
  78. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/ts_utils.py +0 -0
  79. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/warning_counter.py +0 -0
  80. {upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/version_validator.py +0 -0

{upgini-1.2.114a1 → upgini-1.2.114a3}/PKG-INFO
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.114a1
+Version: 1.2.114a3
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/

upgini-1.2.114a3/src/upgini/__about__.py
@@ -0,0 +1 @@
+__version__ = "1.2.114a3"

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/dataset.py
@@ -50,7 +50,7 @@ except Exception:
 
 class Dataset:
     MIN_ROWS_COUNT = 100
-    MAX_ROWS = 100_000
+    MAX_ROWS = 200_000
     IMBALANCE_THESHOLD = 0.6
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
@@ -184,7 +184,19 @@ class Dataset:
     def __validate_target(self):
         # self.logger.info("Validating target")
         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-        target = self.data[target_column]
+
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target_column].isna().all():
+                    oot_indices.append(eval_set_index)
+
+        df_to_check = self.data.copy()
+        if oot_indices:
+            df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+        target = df_to_check[target_column]
 
         if self.task_type == ModelTaskType.BINARY:
             if not is_integer_dtype(target):
@@ -201,7 +213,7 @@ class Dataset:
         elif self.task_type == ModelTaskType.MULTICLASS:
             if not is_integer_dtype(target):
                 try:
-                    target = self.data[target_column].astype("category").cat.codes
+                    target = target.astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
                     raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
@@ -335,10 +347,30 @@ class Dataset:
         all_valid_message = self.bundle.get("validation_all_valid_message")
         invalid_message = self.bundle.get("validation_invalid_message")
 
+        oot_indices = []
+        if EVAL_SET_INDEX in self.data.columns:
+            for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                if eval_set[target].isna().all():
+                    oot_indices.append(eval_set_index)
+
         for col in columns_to_validate:
-            self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
+                if oot_indices:
+                    mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                    invalid_target_mask = (
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+                    # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = True
+                    self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                else:
+                    # No OOT: mark invalid where target is NaN or +/-inf
+                    self.data[f"{col}_is_valid"] = ~(
+                        self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                    )
+            else:
+                self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
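
The two dataset.py hunks above use the same pattern for detecting out-of-time (OOT) eval parts: an eval_set_index group whose target is entirely NaN is excluded from target validation. A minimal standalone sketch of that detection, assuming a pandas DataFrame with illustrative eval_set_index / target column names (not the library's internals):

    import numpy as np
    import pandas as pd

    def find_oot_indices(df, eval_col="eval_set_index", target_col="target"):
        # An eval part counts as OOT when every target value in it is NaN
        if eval_col not in df.columns:
            return []
        return [idx for idx, part in df.groupby(eval_col)
                if idx != 0 and part[target_col].isna().all()]

    df = pd.DataFrame({
        "eval_set_index": [0, 0, 1, 1, 2, 2],
        "target": [1, 0, 1, 0, np.nan, np.nan],
    })
    print(find_oot_indices(df))  # [2] -> rows with eval_set_index 2 are skipped during target validation
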

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/features_enricher.py
@@ -398,16 +398,15 @@ class FeaturesEnricher(TransformerMixin):
 
     api_key = property(_get_api_key, _set_api_key)
 
-    @staticmethod
-    def _check_eval_set(eval_set, X, bundle: ResourceBundle):
+    def _check_eval_set(self, eval_set, X):
         checked_eval_set = []
         if eval_set is None:
             return checked_eval_set
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         if not isinstance(eval_set, list):
-            raise ValidationError(bundle.get("unsupported_type_eval_set").format(type(eval_set)))
-        for eval_pair in eval_set or []:
+            raise ValidationError(self.bundle.get("unsupported_type_eval_set").format(type(eval_set)))
+        for i, eval_pair in enumerate(eval_set or [], 1):
             # Handle OOT
             if isinstance(eval_pair, pd.DataFrame):
                 empty_target = pd.Series([np.nan] * len(eval_pair), index=eval_pair.index)
@@ -417,12 +416,17 @@ class FeaturesEnricher(TransformerMixin):
                 eval_pair = (eval_pair[0], empty_target)
 
             if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
-                raise ValidationError(bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
+                raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
             if eval_pair[1] is None:
                 empty_target = pd.Series([np.nan] * len(eval_pair[0]), index=eval_pair[0].index)
                 eval_pair = (eval_pair[0], empty_target)
-            if not is_frames_equal(X, eval_pair[0], bundle):
+
+            if not is_frames_equal(X, eval_pair[0], self.bundle):
                 checked_eval_set.append(eval_pair)
+            else:
+                msg = f"Eval set {i} is equal to train set and will be ignored"
+                self.logger.warning(msg)
+                print(msg)
         return checked_eval_set
 
     def fit(
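
The _check_eval_set hunks above accept three shapes for an eval_set entry: an (X_eval, y_eval) tuple, a bare DataFrame, or (X_eval, None); the last two get an all-NaN target and are treated as out-of-time, and entries equal to the train frame are now skipped with a warning. A rough sketch of just the normalization step, using hypothetical names outside the class:

    import numpy as np
    import pandas as pd

    def normalize_eval_pair(eval_pair):
        # Bare DataFrame -> (X, None); a missing y then becomes an all-NaN Series (the OOT marker)
        if isinstance(eval_pair, pd.DataFrame):
            eval_pair = (eval_pair, None)
        if not isinstance(eval_pair, tuple) or len(eval_pair) != 2:
            raise ValueError(f"Unsupported eval_set entry of type {type(eval_pair)}")
        x_eval, y_eval = eval_pair
        if y_eval is None:
            y_eval = pd.Series([np.nan] * len(x_eval), index=x_eval.index)
        return x_eval, y_eval

    oot_frame = pd.DataFrame({"feature": [1, 2, 3]})
    _, y = normalize_eval_pair(oot_frame)
    print(y.isna().all())  # True -> this eval part would be handled as OOT
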
@@ -517,7 +521,7 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.dump_input(trace_id, X, y, self.eval_set)
             self.__set_select_features(select_features)
             self.__inner_fit(
@@ -678,7 +682,7 @@ class FeaturesEnricher(TransformerMixin):
         try:
             self.X = X
             self.y = y
-            self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
+            self.eval_set = self._check_eval_set(eval_set, X)
             self.__set_select_features(select_features)
             self.dump_input(trace_id, X, y, self.eval_set)
 
@@ -953,7 +957,7 @@ class FeaturesEnricher(TransformerMixin):
         effective_X = X if X is not None else self.X
         effective_y = y if y is not None else self.y
         effective_eval_set = eval_set if eval_set is not None else self.eval_set
-        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X, self.bundle)
+        effective_eval_set = self._check_eval_set(effective_eval_set, effective_X)
 
         if (
             self._search_task is None
@@ -1471,14 +1475,17 @@ class FeaturesEnricher(TransformerMixin):
         date_column = self._get_date_column(search_keys)
 
         # Get minimum date from main dataset X
-        main_min_date = X[date_column].min()
+        main_min_date = X[date_column].dropna().min()
 
         # Find minimum date for each eval_set and compare with main dataset
         eval_dates = []
         for i, (eval_x, _) in enumerate(eval_set):
             if date_column in eval_x.columns:
-                eval_min_date = eval_x[date_column].min()
-                eval_max_date = eval_x[date_column].max()
+                if len(eval_x) < 1000:
+                    self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
+                    continue
+                eval_min_date = eval_x[date_column].dropna().min()
+                eval_max_date = eval_x[date_column].dropna().max()
                 eval_dates.append((i, eval_min_date, eval_max_date))
 
         if not eval_dates:
@@ -1509,6 +1516,8 @@ class FeaturesEnricher(TransformerMixin):
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
         )
 
+        self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
+
         unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
         if unstable_by_sparsity:
             self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
@@ -1517,6 +1526,8 @@ class FeaturesEnricher(TransformerMixin):
             checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
         )
 
+        self.logger.info(f"PSI values by value: {psi_values}")
+
         unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
         if unstable_by_value:
             self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
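
The stability checks above compare per-feature PSI values against stability_threshold, both for sparsity and for value distributions, and now log the raw PSI dictionaries. For reference, PSI (population stability index) between a reference and a current distribution is commonly computed per bin as (cur% - ref%) * ln(cur% / ref%); a generic sketch of that formula, not the implementation in upgini/utils/psi.py:

    import numpy as np

    def psi(reference, current, bins=10):
        # Bin on the reference values, then sum (cur - ref) * ln(cur / ref) over bin shares
        edges = np.histogram_bin_edges(reference, bins=bins)
        ref_cnt, _ = np.histogram(reference, bins=edges)
        cur_cnt, _ = np.histogram(current, bins=edges)
        ref_pct = np.clip(ref_cnt / ref_cnt.sum(), 1e-6, None)
        cur_pct = np.clip(cur_cnt / cur_cnt.sum(), 1e-6, None)
        return float(np.sum((cur_pct - ref_pct) * np.log(cur_pct / ref_pct)))

    rng = np.random.default_rng(0)
    print(psi(rng.normal(0.0, 1.0, 10_000), rng.normal(0.3, 1.0, 10_000)))  # roughly 0.09 for this mild shift
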
@@ -1679,7 +1690,7 @@ class FeaturesEnricher(TransformerMixin):
         if X is None:
             return True, self.X, self.y, self.eval_set
 
-        checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
+        checked_eval_set = self._check_eval_set(eval_set, X)
 
         if (
             X is self.X
@@ -1783,7 +1794,7 @@ class FeaturesEnricher(TransformerMixin):
     ):
         is_input_same_as_fit, X, y, eval_set = self._is_input_same_as_fit(X, y, eval_set)
         is_demo_dataset = hash_input(X, y, eval_set) in DEMO_DATASET_HASHES
-        checked_eval_set = self._check_eval_set(eval_set, X, self.bundle)
+        checked_eval_set = self._check_eval_set(eval_set, X)
         validated_X, validated_y, validated_eval_set = self._validate_train_eval(X, y, checked_eval_set, silent=True)
 
         sampled_data = self._get_enriched_for_metrics(
@@ -3246,6 +3257,15 @@ if response.status_code == 200:
         else:
             self.__log_warning(full_duplicates_warning)
 
+        # Check if OOT eval set still more than 1000 rows
+        if EVAL_SET_INDEX in df.columns:
+            for eval_set_index in df[EVAL_SET_INDEX].unique():
+                if eval_set_index == 0:
+                    continue
+                eval_set_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+                if np.all(pd.isna(eval_set_df[TARGET])) and len(eval_set_df) < 1000:
+                    self.__log_warning(self.bundle.get("oot_eval_set_too_small_after_dedup").format(eval_set_index + 1))
+
         # Explode multiple search keys
         df = self.__add_fit_system_record_id(
             df, self.fit_search_keys, ENTITY_SYSTEM_RECORD_ID, TARGET, self.fit_columns_renaming
@@ -3823,7 +3843,7 @@ if response.status_code == 200:
         if isinstance(eval_set, tuple):
             eval_set = [eval_set]
         for eval in eval_set:
-            is_oot = eval[1].isna().all()
+            is_oot = np.all(pd.isna(eval[1]))
            if not is_oot:
                if self.baseline_score_column not in eval[0].columns:
                    raise ValidationError(self.bundle.get("baseline_score_column_not_exists"))

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/resource_bundle/strings.properties
@@ -165,6 +165,7 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
 dataset_empty_column_names=Some column names are empty. Add names please
 dataset_full_duplicates={:.5f}% of the rows are fully duplicated
 dataset_diff_target_duplicates={:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nSample of incorrect row indexes: {}
+dataset_diff_target_duplicates_oot={:.4f}% of rows ({}) in OOT eval_set are duplicates with train or another eval_set. These rows will be deleted from OOT\nSample of incorrect row indexes: {}
 dataset_train_diff_target_duplicates_fintech={:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_eval_diff_target_duplicates_fintech={:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
 dataset_drop_old_dates=We don't have data before '2000-01-01' and removed all earlier records from the search dataset
@@ -185,6 +186,7 @@ dataset_invalid_column_type=Unsupported data type of column {}: {}
 dataset_invalid_filter=Unknown field in filter_features. Should be {'min_importance', 'max_psi', 'max_count', 'selected_features'}.
 dataset_too_big_file=Too big size of dataframe X for processing. Please reduce number of rows or columns
 dataset_transform_diff_fit=You try to enrich dataset that column names are different from the train dataset column names that you used on the fit stage. Please make the column names the same as in the train dataset and restart.
+oot_eval_set_too_small_after_dedup=OOT eval set {} has less than 1000 rows after deduplication. It will be ignored for stability check
 binary_small_dataset=The least populated class in Target contains less than 1000 rows.\nSmall numbers of observations may negatively affect the number of selected features and quality of your ML model.\nUpgini recommends you increase the number of observations in the least populated class.\n
 all_search_keys_invalid=All search keys are invalid
 all_emails_invalid=All values in column {} are invalid emails # Metrics validation

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/deduplicate_utils.py
@@ -134,10 +134,12 @@ def remove_fintech_duplicates(
     logger.info(f"Train dataset shape after clean fintech duplicates: {train_df.shape}")
 
     # Process each eval_set part separately
+    oot_eval_dfs = []
     new_eval_dfs = []
     for i, eval_df in enumerate(eval_dfs, 1):
         # Skip OOT
         if eval_df[TARGET].isna().all():
+            oot_eval_dfs.append(eval_df)
             continue
         logger.info(f"Eval {i} dataset shape before clean fintech duplicates: {eval_df.shape}")
         cleaned_eval_df, eval_warning = process_df(eval_df, i)
@@ -148,8 +150,8 @@ def remove_fintech_duplicates(
 
     # Combine the processed train and eval parts back into one dataset
     logger.info(f"Dataset shape before clean fintech duplicates: {df.shape}")
-    if new_eval_dfs:
-        df = pd.concat([train_df] + new_eval_dfs)
+    if new_eval_dfs or oot_eval_dfs:
+        df = pd.concat([train_df] + new_eval_dfs + oot_eval_dfs, ignore_index=False)
     else:
         df = train_df
     logger.info(f"Dataset shape after clean fintech duplicates: {df.shape}")
@@ -196,30 +198,30 @@ def clean_full_duplicates(
 
     # Separate rows to exclude from deduplication:
     # for each eval_set_index != 0 check separately, all TARGET values are NaN
-    excluded_from_dedup = pd.DataFrame()
     df_for_dedup = df
+    oot_df = None
 
     if EVAL_SET_INDEX in df.columns:
-        excluded_parts = []
-        # Get all unique eval_set_index values, except 0
-        unique_eval_indices = df[df[EVAL_SET_INDEX] != 0][EVAL_SET_INDEX].unique()
-
-        for eval_idx in unique_eval_indices:
+        oot_eval_dfs = []
+        other_dfs = []
+        for eval_idx in df[EVAL_SET_INDEX].unique():
             eval_subset = df[df[EVAL_SET_INDEX] == eval_idx]
             # Check that all TARGET values for this specific eval_set_index are NaN
-            if len(eval_subset) > 0 and eval_subset[TARGET].isna().all():
-                excluded_parts.append(eval_subset)
+            if eval_idx != 0 and eval_subset[TARGET].isna().all():
+                oot_eval_dfs.append(eval_subset)
                 logger.info(
                     f"Excluded {len(eval_subset)} rows from deduplication "
                     f"(eval_set_index={eval_idx} and all TARGET values are NaN)"
                 )
+            else:
+                other_dfs.append(eval_subset)
+
+        if oot_eval_dfs:
+            oot_df = pd.concat(oot_eval_dfs, ignore_index=False)
+            df_for_dedup = pd.concat(other_dfs, ignore_index=False)
+        else:
+            df_for_dedup = df
 
-        # Combine all excluded parts
-        if excluded_parts:
-            excluded_from_dedup = pd.concat(excluded_parts, ignore_index=False)
-            # Remove excluded rows from dataframe for deduplication
-            excluded_indices = excluded_from_dedup.index
-            df_for_dedup = df[~df.index.isin(excluded_indices)]
     marked_duplicates = df_for_dedup.duplicated(subset=unique_columns, keep=False)
     if marked_duplicates.sum() > 0:
         dups_indices = df_for_dedup[marked_duplicates].index.to_list()[:100]
@@ -231,8 +233,18 @@ def clean_full_duplicates(
         df_for_dedup = df_for_dedup.drop_duplicates(subset=unique_columns, keep=False)
         logger.info(f"Dataset shape after clean invalid target duplicates: {df_for_dedup.shape}")
     # Combine back excluded rows
-    if len(excluded_from_dedup) > 0:
-        df = pd.concat([df_for_dedup, excluded_from_dedup], ignore_index=False)
+    if oot_df is not None:
+        df = pd.concat([df_for_dedup, oot_df], ignore_index=False)
+        marked_duplicates = df.duplicated(subset=unique_columns, keep=False)
+        if marked_duplicates.sum() > 0:
+            dups_indices = df[marked_duplicates].index.to_list()[:100]
+            nrows_after_tgt_dedup = len(df.drop_duplicates(subset=unique_columns, keep=False))
+            num_dup_rows = len(df) - nrows_after_tgt_dedup
+            share_tgt_dedup = 100 * num_dup_rows / len(df)
+            msg = bundle.get("dataset_diff_target_duplicates_oot").format(
+                share_tgt_dedup, num_dup_rows, dups_indices
+            )
+            df = df.drop_duplicates(subset=unique_columns, keep="first")
         logger.info(f"Final dataset shape after adding back excluded rows: {df.shape}")
     else:
         df = df_for_dedup
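
The clean_full_duplicates hunks above follow a set-aside pattern: OOT parts (all-NaN target, eval_set_index != 0) are pulled out, labelled rows are deduplicated strictly (keep=False, so rows with conflicting targets are dropped), and the OOT rows are concatenated back with a final keep="first" pass so labelled rows win over OOT copies. A condensed, simplified sketch of that flow (not the library function):

    import numpy as np
    import pandas as pd

    def dedup_with_oot(df, keys):
        # Rows belonging to unlabelled (OOT) eval parts are set aside
        is_oot = df["target"].isna() & (df["eval_set_index"] != 0)
        labelled, oot = df[~is_oot], df[is_oot]
        # Labelled duplicates are removed entirely (conflicting targets are not trusted)
        labelled = labelled.drop_duplicates(subset=keys, keep=False)
        # OOT copies of a labelled row are dropped; the labelled row is kept
        return pd.concat([labelled, oot]).drop_duplicates(subset=keys, keep="first")

    df = pd.DataFrame({
        "key": ["a", "a", "b", "b", "c"],
        "eval_set_index": [0, 0, 0, 1, 1],
        "target": [1, 0, 1, np.nan, np.nan],
    })
    print(dedup_with_oot(df, keys=["key"]))  # both "a" rows removed, OOT "b" dropped, OOT "c" kept
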

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/psi.py
@@ -77,7 +77,7 @@ def calculate_features_psi(
     psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
     psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
 ) -> Dict[str, float]:
-    empty_res = pd.Series(index=df.columns, data=0)
+    empty_res = {col: 0.0 for col in df.columns if col not in [TARGET, date_column]}
 
     if not is_numeric_dtype(df[date_column]):
         df[date_column] = pd.to_datetime(df[date_column]).dt.floor("D").astype(np.int64) / 10**6
@@ -113,9 +113,9 @@ def calculate_features_psi(
         cat_top_pct=psi_target_params.cat_top_pct,
         agg_func=target_agg_func,
     )
-    if target_psi is None:
+    if target_psi is None or np.isnan(target_psi):
         logger.info("Cannot determine target PSI. Skip feature PSI check")
-        return pd.Series(index=df.columns, data=0)
+        return empty_res
 
     if target_psi > psi_target_params.threshold:
         logger.info(

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/sample_utils.py
@@ -5,7 +5,7 @@ from typing import Callable, List, Optional
 import numpy as np
 import pandas as pd
 
-from upgini.metadata import SYSTEM_RECORD_ID, CVType, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, TARGET, CVType, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.utils.target_utils import balance_undersample
 from upgini.utils.ts_utils import get_most_frequent_time_unit, trunc_datetime
@@ -117,6 +117,22 @@ def sample(
             **kwargs,
         )
 
+    # separate OOT
+    oot_dfs = []
+    other_dfs = []
+    if EVAL_SET_INDEX in df.columns:
+        for eval_set_index in df[EVAL_SET_INDEX].unique():
+            eval_df = df[df[EVAL_SET_INDEX] == eval_set_index]
+            if TARGET in eval_df.columns and eval_df[TARGET].isna().all():
+                oot_dfs.append(eval_df)
+            else:
+                other_dfs.append(eval_df)
+    if len(oot_dfs) > 0:
+        oot_df = pd.concat(oot_dfs, ignore_index=False)
+        df = pd.concat(other_dfs, ignore_index=False)
+    else:
+        oot_df = None
+
     num_samples = _num_samples(df)
     if num_samples > fit_sample_threshold:
         logger.info(
@@ -126,6 +142,18 @@ def sample(
         df = df.sample(n=fit_sample_rows, random_state=random_state)
         logger.info(f"Shape after threshold resampling: {df.shape}")
 
+    if oot_df is not None:
+        num_samples_oot = _num_samples(oot_df)
+        if num_samples_oot > fit_sample_threshold:
+            logger.info(
+                f"OOT has size {num_samples_oot} more than threshold {fit_sample_threshold} "
+                f"and will be downsampled to {fit_sample_rows}"
+            )
+            oot_df = oot_df.sample(n=fit_sample_rows, random_state=random_state)
+        df = pd.concat([df, oot_df], ignore_index=False)
+
+    logger.info(f"Dataset size after downsampling: {len(df)}")
+
     return df
 
 
@@ -175,7 +203,7 @@ def sample_time_series_train_eval(
         )
         if logger is not None:
             logger.info(f"Eval set size: {len(eval_df)}")
-        df = pd.concat([train_df, eval_df])
+        df = pd.concat([train_df, eval_df], ignore_index=False)
 
     elif len(train_df) > max_rows:
         df = sample_time_series_trunc(
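
In sample() above, labelled rows and OOT rows are now downsampled independently against the same threshold, so an oversized OOT eval set cannot crowd out training rows, and the two parts are recombined afterwards. A rough standalone sketch of that idea (function and parameter names are illustrative, not the library's API):

    from typing import Optional

    import pandas as pd

    def downsample_parts(labelled: pd.DataFrame, oot: Optional[pd.DataFrame],
                         threshold: int, n_rows: int, random_state: int = 42) -> pd.DataFrame:
        # Each part is sampled on its own, and only if it exceeds the threshold
        if len(labelled) > threshold:
            labelled = labelled.sample(n=n_rows, random_state=random_state)
        if oot is None:
            return labelled
        if len(oot) > threshold:
            oot = oot.sample(n=n_rows, random_state=random_state)
        return pd.concat([labelled, oot])

    train = pd.DataFrame({"x": range(5)})
    oot = pd.DataFrame({"x": range(20)})
    print(len(downsample_parts(train, oot, threshold=10, n_rows=10)))  # 5 labelled + 10 sampled OOT = 15
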

{upgini-1.2.114a1 → upgini-1.2.114a3}/src/upgini/utils/target_utils.py
@@ -6,7 +6,7 @@ import pandas as pd
 from pandas.api.types import is_bool_dtype, is_datetime64_any_dtype, is_numeric_dtype
 
 from upgini.errors import ValidationError
-from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
+from upgini.metadata import EVAL_SET_INDEX, SYSTEM_RECORD_ID, ModelTaskType
 from upgini.resource_bundle import ResourceBundle, get_custom_bundle, bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 
@@ -132,6 +132,11 @@ def balance_undersample(
     if SYSTEM_RECORD_ID not in df.columns:
         raise Exception("System record id must be presented for undersampling")
 
+    # Rebalance and send to server only train data
+    # because eval set data will be sent separately in transform for metrics
+    if EVAL_SET_INDEX in df.columns:
+        df = df[df[EVAL_SET_INDEX] == 0]
+
     target = df[target_column].copy()
 
     vc = target.value_counts()

upgini-1.2.114a1/src/upgini/__about__.py
@@ -1 +0,0 @@
-__version__ = "1.2.114a1"