upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/PKG-INFO +31 -17
  2. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/README.md +30 -16
  3. upgini-1.2.114a2/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/dataset.py +40 -6
  5. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/features_enricher.py +489 -147
  6. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/metadata.py +1 -0
  7. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/metrics.py +6 -2
  8. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings.properties +6 -1
  9. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/base.py +3 -1
  10. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/random_under_sampler.py +18 -8
  11. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/deduplicate_utils.py +57 -9
  12. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/feature_info.py +5 -0
  13. upgini-1.2.114a2/src/upgini/utils/psi.py +294 -0
  14. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sample_utils.py +30 -2
  15. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/target_utils.py +6 -1
  16. upgini-1.2.113a3974.dev2/src/upgini/__about__.py +0 -1
  17. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/.gitignore +0 -0
  18. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/LICENSE +0 -0
  19. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/pyproject.toml +0 -0
  20. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/__init__.py +0 -0
  21. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads.py +0 -0
  22. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads_management/__init__.py +0 -0
  23. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/ads_management/ads_manager.py +0 -0
  24. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/__init__.py +0 -0
  25. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/all_operators.py +0 -0
  26. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/binary.py +0 -0
  27. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/date.py +0 -0
  28. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/feature.py +0 -0
  29. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/groupby.py +0 -0
  30. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/operator.py +0 -0
  31. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/__init__.py +0 -0
  32. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/base.py +0 -0
  33. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/cross.py +0 -0
  34. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/delta.py +0 -0
  35. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/lag.py +0 -0
  36. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/roll.py +0 -0
  37. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/trend.py +0 -0
  38. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/timeseries/volatility.py +0 -0
  39. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/unary.py +0 -0
  40. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/utils.py +0 -0
  41. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/autofe/vector.py +0 -0
  42. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/data_source/__init__.py +0 -0
  43. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  44. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/errors.py +0 -0
  45. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/http.py +0 -0
  46. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/mdc/__init__.py +0 -0
  47. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/mdc/context.py +0 -0
  48. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/normalizer/__init__.py +0 -0
  49. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  50. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/__init__.py +0 -0
  51. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  52. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  53. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/__init__.py +0 -0
  54. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/sampler/utils.py +0 -0
  55. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/search_task.py +0 -0
  56. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/spinner.py +0 -0
  57. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  58. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/__init__.py +0 -0
  59. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  60. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/blocked_time_series.py +0 -0
  61. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/country_utils.py +0 -0
  62. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  63. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/cv_utils.py +0 -0
  64. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/datetime_utils.py +0 -0
  65. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/display_utils.py +0 -0
  66. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/email_utils.py +0 -0
  67. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  68. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/features_validator.py +0 -0
  69. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/format.py +0 -0
  70. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sklearn_ext.py +0 -0
  76. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/sort.py +0 -0
  77. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/track_info.py +0 -0
  78. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/ts_utils.py +0 -0
  79. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/utils/warning_counter.py +0 -0
  80. {upgini-1.2.113a3974.dev2 → upgini-1.2.114a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.113a3974.dev2
3
+ Version: 1.2.114a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
270
270
  enricher = FeaturesEnricher(
271
271
  search_keys={
272
272
  "subscription_activation_date": SearchKey.DATE,
273
- "country": SearchKey.COUNTRY,
274
- "zip_code": SearchKey.POSTAL_CODE,
275
- "hashed_email": SearchKey.HEM,
273
+ "country": SearchKey.COUNTRY,
274
+ "zip_code": SearchKey.POSTAL_CODE,
275
+ "hashed_email": SearchKey.HEM,
276
276
  "last_visit_ip_address": SearchKey.IP,
277
277
  "registered_with_phone": SearchKey.PHONE
278
278
  })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
358
358
  enricher = FeaturesEnricher(
359
359
  search_keys={
360
360
  "subscription_activation_date": SearchKey.DATE,
361
- "country": SearchKey.COUNTRY,
362
- "zip_code": SearchKey.POSTAL_CODE,
363
- "hashed_email": SearchKey.HEM,
361
+ "country": SearchKey.COUNTRY,
362
+ "zip_code": SearchKey.POSTAL_CODE,
363
+ "hashed_email": SearchKey.HEM,
364
364
  "last_visit_ip_address": SearchKey.IP,
365
365
  "registered_with_phone": SearchKey.PHONE
366
366
  },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
381
381
  enricher = FeaturesEnricher(
382
382
  search_keys={
383
383
  "subscription_activation_date": SearchKey.DATE,
384
- "zip_code": SearchKey.POSTAL_CODE,
384
+ "zip_code": SearchKey.POSTAL_CODE,
385
385
  },
386
386
  country_code = "US",
387
387
  date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
409
409
  enricher = FeaturesEnricher(
410
410
  search_keys={
411
411
  "subscription_activation_date": SearchKey.DATE,
412
- "country": SearchKey.COUNTRY,
413
- "zip_code": SearchKey.POSTAL_CODE
412
+ "country": SearchKey.COUNTRY,
413
+ "zip_code": SearchKey.POSTAL_CODE
414
414
  })
415
415
 
416
416
  # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
464
464
  enricher = FeaturesEnricher(
465
465
  search_keys={
466
466
  "subscription_activation_date": SearchKey.DATE,
467
- "country": SearchKey.COUNTRY,
468
- "zip_code": SearchKey.POSTAL_CODE,
467
+ "country": SearchKey.COUNTRY,
468
+ "zip_code": SearchKey.POSTAL_CODE,
469
469
  },
470
470
  )
471
471
  ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
516
516
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
517
517
  ```python
518
518
  enricher = FeaturesEnricher(
519
- search_keys={
520
- "sales_date": SearchKey.DATE,
519
+ search_keys={
520
+ "sales_date": SearchKey.DATE,
521
521
  },
522
522
  id_columns=["store_id", "product_id"],
523
523
  cv=CVType.time_series
@@ -733,9 +733,22 @@ enricher.fit(
733
733
  )
734
734
  ```
735
735
  #### ⚠️ Requirements for out-of-time dataset
736
- - Same data schema as for search initialization dataset
736
+ - Same data schema as for search initialization X dataset
737
737
  - Pandas dataframe representation
738
738
 
739
+ There are 3 options to pass an out-of-time dataset without labels:
740
+ ```python
741
+ enricher.fit(
742
+ train_ids_and_features,
743
+ train_label,
744
+ eval_set = [
745
+ (eval_ids_and_features_1,), # Just tuple of 1 element
746
+ (eval_ids_and_features_2, None), # None as labels
747
+ (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
748
+ ]
749
+ )
750
+ ```
751
+
739
752
  ### Use custom loss function in feature selection & metrics calculation
740
753
 
741
754
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -797,7 +810,7 @@ enricher = FeaturesEnricher(
797
810
  enricher.fit(X, y)
798
811
  ```
799
812
 
800
- ## Turn off removing of target outliers
813
+ ### Turn off removing of target outliers
801
814
  Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in the fit, fit_transform or calculate_metrics methods:
802
815
 
803
816
  ```python
@@ -808,7 +821,7 @@ enricher = FeaturesEnricher(
808
821
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
809
822
  ```
810
823
 
811
- ## Turn off generating features on search keys
824
+ ### Turn off generating features on search keys
812
825
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
813
826
 
814
827
  ```python
@@ -816,6 +829,7 @@ enricher = FeaturesEnricher(
816
829
  search_keys={"date": SearchKey.DATE},
817
830
  generate_search_key_features=False,
818
831
  )
832
+ ```
819
833
 
820
834
  ## 🔑 Open up all capabilities of Upgini
821
835
 
@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
224
224
  enricher = FeaturesEnricher(
225
225
  search_keys={
226
226
  "subscription_activation_date": SearchKey.DATE,
227
- "country": SearchKey.COUNTRY,
228
- "zip_code": SearchKey.POSTAL_CODE,
229
- "hashed_email": SearchKey.HEM,
227
+ "country": SearchKey.COUNTRY,
228
+ "zip_code": SearchKey.POSTAL_CODE,
229
+ "hashed_email": SearchKey.HEM,
230
230
  "last_visit_ip_address": SearchKey.IP,
231
231
  "registered_with_phone": SearchKey.PHONE
232
232
  })
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
312
312
  enricher = FeaturesEnricher(
313
313
  search_keys={
314
314
  "subscription_activation_date": SearchKey.DATE,
315
- "country": SearchKey.COUNTRY,
316
- "zip_code": SearchKey.POSTAL_CODE,
317
- "hashed_email": SearchKey.HEM,
315
+ "country": SearchKey.COUNTRY,
316
+ "zip_code": SearchKey.POSTAL_CODE,
317
+ "hashed_email": SearchKey.HEM,
318
318
  "last_visit_ip_address": SearchKey.IP,
319
319
  "registered_with_phone": SearchKey.PHONE
320
320
  },
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
335
335
  enricher = FeaturesEnricher(
336
336
  search_keys={
337
337
  "subscription_activation_date": SearchKey.DATE,
338
- "zip_code": SearchKey.POSTAL_CODE,
338
+ "zip_code": SearchKey.POSTAL_CODE,
339
339
  },
340
340
  country_code = "US",
341
341
  date_format = "%Y-%d-%m"
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
363
363
  enricher = FeaturesEnricher(
364
364
  search_keys={
365
365
  "subscription_activation_date": SearchKey.DATE,
366
- "country": SearchKey.COUNTRY,
367
- "zip_code": SearchKey.POSTAL_CODE
366
+ "country": SearchKey.COUNTRY,
367
+ "zip_code": SearchKey.POSTAL_CODE
368
368
  })
369
369
 
370
370
  # everything is ready to fit! For 200k records fitting should take around 10 minutes,
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
418
418
  enricher = FeaturesEnricher(
419
419
  search_keys={
420
420
  "subscription_activation_date": SearchKey.DATE,
421
- "country": SearchKey.COUNTRY,
422
- "zip_code": SearchKey.POSTAL_CODE,
421
+ "country": SearchKey.COUNTRY,
422
+ "zip_code": SearchKey.POSTAL_CODE,
423
423
  },
424
424
  )
425
425
  ```
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
470
470
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
471
471
  ```python
472
472
  enricher = FeaturesEnricher(
473
- search_keys={
474
- "sales_date": SearchKey.DATE,
473
+ search_keys={
474
+ "sales_date": SearchKey.DATE,
475
475
  },
476
476
  id_columns=["store_id", "product_id"],
477
477
  cv=CVType.time_series
@@ -687,9 +687,22 @@ enricher.fit(
687
687
  )
688
688
  ```
689
689
  #### ⚠️ Requirements for out-of-time dataset
690
- - Same data schema as for search initialization dataset
690
+ - Same data schema as for search initialization X dataset
691
691
  - Pandas dataframe representation
692
692
 
693
+ There are 3 options to pass an out-of-time dataset without labels:
694
+ ```python
695
+ enricher.fit(
696
+ train_ids_and_features,
697
+ train_label,
698
+ eval_set = [
699
+ (eval_ids_and_features_1,), # Just tuple of 1 element
700
+ (eval_ids_and_features_2, None), # None as labels
701
+ (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
702
+ ]
703
+ )
704
+ ```
705
+
693
706
  ### Use custom loss function in feature selection & metrics calculation
694
707
 
695
708
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -751,7 +764,7 @@ enricher = FeaturesEnricher(
751
764
  enricher.fit(X, y)
752
765
  ```
753
766
 
754
- ## Turn off removing of target outliers
767
+ ### Turn off removing of target outliers
755
768
  Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use the parameter `remove_outliers_calc_metrics=False` in the fit, fit_transform or calculate_metrics methods:
756
769
 
757
770
  ```python
@@ -762,7 +775,7 @@ enricher = FeaturesEnricher(
762
775
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
763
776
  ```
764
777
 
765
- ## Turn off generating features on search keys
778
+ ### Turn off generating features on search keys
766
779
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it, use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
767
780
 
768
781
  ```python
@@ -770,6 +783,7 @@ enricher = FeaturesEnricher(
770
783
  search_keys={"date": SearchKey.DATE},
771
784
  generate_search_key_features=False,
772
785
  )
786
+ ```
773
787
 
774
788
  ## 🔑 Open up all capabilities of Upgini
775
789
 
@@ -0,0 +1 @@
1
+ __version__ = "1.2.114a2"
@@ -50,7 +50,7 @@ except Exception:
50
50
 
51
51
  class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
- MAX_ROWS = 100_000
53
+ MAX_ROWS = 200_000
54
54
  IMBALANCE_THESHOLD = 0.6
55
55
  MIN_TARGET_CLASS_ROWS = 100
56
56
  MAX_MULTICLASS_CLASS_COUNT = 100
@@ -184,7 +184,19 @@ class Dataset:
184
184
  def __validate_target(self):
185
185
  # self.logger.info("Validating target")
186
186
  target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
187
- target = self.data[target_column]
187
+
188
+ oot_indices = []
189
+ if EVAL_SET_INDEX in self.data.columns:
190
+ for eval_set_index in self.data[EVAL_SET_INDEX].unique():
191
+ eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
192
+ if eval_set[target_column].isna().all():
193
+ oot_indices.append(eval_set_index)
194
+
195
+ df_to_check = self.data.copy()
196
+ if oot_indices:
197
+ df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
198
+
199
+ target = df_to_check[target_column]
188
200
 
189
201
  if self.task_type == ModelTaskType.BINARY:
190
202
  if not is_integer_dtype(target):
@@ -201,7 +213,7 @@ class Dataset:
201
213
  elif self.task_type == ModelTaskType.MULTICLASS:
202
214
  if not is_integer_dtype(target):
203
215
  try:
204
- target = self.data[target_column].astype("category").cat.codes
216
+ target = target.astype("category").cat.codes
205
217
  except Exception:
206
218
  self.logger.exception("Failed to cast target to category codes for multiclass task type")
207
219
  raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
@@ -335,15 +347,37 @@ class Dataset:
335
347
  all_valid_message = self.bundle.get("validation_all_valid_message")
336
348
  invalid_message = self.bundle.get("validation_invalid_message")
337
349
 
350
+ oot_indices = []
351
+ if EVAL_SET_INDEX in self.data.columns:
352
+ for eval_set_index in self.data[EVAL_SET_INDEX].unique():
353
+ eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
354
+ if eval_set[target].isna().all():
355
+ oot_indices.append(eval_set_index)
356
+
338
357
  for col in columns_to_validate:
339
- self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
340
358
  if validate_target and target is not None and col == target:
341
- self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
359
+ if oot_indices:
360
+ mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
361
+ invalid_target_mask = (
362
+ self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
363
+ )
364
+ # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
365
+ self.data[f"{col}_is_valid"] = True
366
+ self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
367
+ else:
368
+ # No OOT: mark invalid where target is NaN or +/-inf
369
+ self.data[f"{col}_is_valid"] = ~(
370
+ self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
371
+ )
372
+ else:
373
+ self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
342
374
 
343
375
  if col in mandatory_columns:
344
376
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
345
377
 
346
- invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
378
+ # Use stable pandas API across versions: Series.unique keeps order
379
+ # and collapses multiple NaNs into a single NaN
380
+ invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
347
381
  valid_share = self.data[f"{col}_is_valid"].sum() / nrows
348
382
  original_col_name = self.columns_renaming[col]
349
383
  validation_stats[original_col_name] = {}