upgini 1.2.113a3974.dev2__tar.gz → 1.2.114__tar.gz

This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
Files changed (82)
  1. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/PKG-INFO +62 -32
  2. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/README.md +60 -30
  3. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/pyproject.toml +1 -1
  4. upgini-1.2.114/src/upgini/__about__.py +1 -0
  5. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/dataset.py +48 -78
  6. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/features_enricher.py +726 -516
  7. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/http.py +15 -19
  8. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/metadata.py +1 -10
  9. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/metrics.py +6 -2
  10. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/strings.properties +8 -6
  11. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/base.py +3 -1
  12. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/random_under_sampler.py +18 -8
  13. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/search_task.py +6 -0
  14. upgini-1.2.114/src/upgini/utils/config.py +43 -0
  15. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/deduplicate_utils.py +57 -9
  16. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/display_utils.py +1 -1
  17. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/feature_info.py +5 -0
  18. upgini-1.2.114/src/upgini/utils/hash_utils.py +159 -0
  19. upgini-1.2.114/src/upgini/utils/psi.py +300 -0
  20. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sample_utils.py +45 -42
  21. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/target_utils.py +53 -2
  22. upgini-1.2.113a3974.dev2/src/upgini/__about__.py +0 -1
  23. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/.gitignore +0 -0
  24. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/LICENSE +0 -0
  25. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/__init__.py +0 -0
  26. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads.py +0 -0
  27. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads_management/__init__.py +0 -0
  28. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads_management/ads_manager.py +0 -0
  29. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/__init__.py +0 -0
  30. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/all_operators.py +0 -0
  31. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/binary.py +0 -0
  32. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/date.py +0 -0
  33. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/feature.py +0 -0
  34. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/groupby.py +0 -0
  35. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/operator.py +0 -0
  36. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/__init__.py +0 -0
  37. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/base.py +0 -0
  38. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/cross.py +0 -0
  39. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/delta.py +0 -0
  40. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/lag.py +0 -0
  41. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/roll.py +0 -0
  42. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/trend.py +0 -0
  43. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/volatility.py +0 -0
  44. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/unary.py +0 -0
  45. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/utils.py +0 -0
  46. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/vector.py +0 -0
  47. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/data_source/__init__.py +0 -0
  48. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/data_source/data_source_publisher.py +0 -0
  49. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/errors.py +0 -0
  50. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/mdc/__init__.py +0 -0
  51. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/mdc/context.py +0 -0
  52. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/normalizer/__init__.py +0 -0
  53. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/normalizer/normalize_utils.py +0 -0
  54. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/__init__.py +0 -0
  55. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/exceptions.py +0 -0
  56. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  57. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/__init__.py +0 -0
  58. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/utils.py +0 -0
  59. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/spinner.py +0 -0
  60. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  61. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/__init__.py +0 -0
  62. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/base_search_key_detector.py +0 -0
  63. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/blocked_time_series.py +0 -0
  64. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/country_utils.py +0 -0
  65. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/custom_loss_utils.py +0 -0
  66. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/cv_utils.py +0 -0
  67. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/datetime_utils.py +0 -0
  68. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/email_utils.py +0 -0
  69. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/fallback_progress_bar.py +0 -0
  70. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/features_validator.py +0 -0
  71. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/format.py +0 -0
  72. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/ip_utils.py +0 -0
  73. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/mstats.py +0 -0
  74. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/phone_utils.py +0 -0
  75. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/postal_code_utils.py +0 -0
  76. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/progress_bar.py +0 -0
  77. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sklearn_ext.py +0 -0
  78. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sort.py +0 -0
  79. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/version_validator.py +0 -0
{upgini-1.2.113a3974.dev2 → upgini-1.2.114}/PKG-INFO

@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.113a3974.dev2
+ Version: 1.2.114
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
@@ -38,7 +38,7 @@ Requires-Dist: python-bidi==0.4.2
  Requires-Dist: python-dateutil>=2.8.0
  Requires-Dist: python-json-logger>=3.3.0
  Requires-Dist: requests>=2.8.0
- Requires-Dist: scikit-learn>=1.3.0
+ Requires-Dist: scikit-learn<1.8.0,>=1.3.0
  Requires-Dist: scipy>=1.10.0
  Requires-Dist: shap>=0.44.0
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
-         "hashed_email": SearchKey.HEM,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
+         "hashed_email": SearchKey.HEM,
          "last_visit_ip_address": SearchKey.IP,
          "registered_with_phone": SearchKey.PHONE
      })
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
-         "hashed_email": SearchKey.HEM,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
+         "hashed_email": SearchKey.HEM,
          "last_visit_ip_address": SearchKey.IP,
          "registered_with_phone": SearchKey.PHONE
      },
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "zip_code": SearchKey.POSTAL_CODE,
+         "zip_code": SearchKey.POSTAL_CODE,
      },
      country_code = "US",
      date_format = "%Y-%d-%m"
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE
      })

  # everything is ready to fit! For 200к records fitting should take around 10 minutes,
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
      },
  )
  ```
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
  ```python
  enricher = FeaturesEnricher(
-     search_keys={
-         "sales_date": SearchKey.DATE,
+     search_keys={
+         "sales_date": SearchKey.DATE,
      },
      id_columns=["store_id", "product_id"],
      cv=CVType.time_series
@@ -733,9 +733,52 @@ enricher.fit(
  )
  ```
  #### ⚠️ Requirements for out-of-time dataset
- - Same data schema as for search initialization dataset
+ - Same data schema as for search initialization X dataset
  - Pandas dataframe representation

+ There are 3 options to pass out-of-time without labels:
+ ```python
+ enricher.fit(
+     train_ids_and_features,
+     train_label,
+     eval_set = [
+         (eval_ids_and_features_1,),  # Just tuple of 1 element
+         (eval_ids_and_features_2, None),  # None as labels
+         (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)),  # List or Series of the same size as eval X
+     ]
+ )
+ ```
+
+ ### Control feature stability with PSI parameters
+
+ `FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
+
+ ```python
+ enricher = FeaturesEnricher(
+     search_keys={"registration_date": SearchKey.DATE}
+ )
+
+ # Control feature stability during fit
+ enricher.fit(
+     X, y,
+     stability_threshold=0.2,  # PSI threshold: features with PSI above this value will be dropped
+     stability_agg_func="max"  # Aggregation function for stability values: "max", "min", "mean"
+ )
+
+ # Same parameters work for fit_transform
+ enriched_df = enricher.fit_transform(
+     X, y,
+     stability_threshold=0.1,  # Stricter threshold for more stable features
+     stability_agg_func="mean"  # Use mean aggregation instead of max
+ )
+ ```
+
+ **Stability parameters:**
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
+ - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
+
+ **PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
+
  ### Use custom loss function in feature selection & metrics calculation

  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
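
The PSI section added in the hunk above is backed by the new `src/upgini/utils/psi.py` module (file 19 in the list, +300 lines), whose body is not shown in this diff. For orientation only, a minimal PSI computation can be sketched as follows; the function name, binning strategy, and epsilon guard are illustrative assumptions, not upgini's actual implementation:

```python
# Minimal PSI sketch (illustrative, not upgini's psi.py).
# Bin edges come from the reference sample; PSI grows as the
# current sample's bin shares drift away from the reference shares.
import numpy as np

def population_stability_index(reference: np.ndarray, current: np.ndarray, bins: int = 10) -> float:
    edges = np.histogram_bin_edges(reference, bins=bins)
    ref_share = np.histogram(reference, bins=edges)[0] / len(reference)
    cur_share = np.histogram(current, bins=edges)[0] / len(current)
    eps = 1e-6  # guard against log(0) and division by zero in empty bins
    ref_share = np.clip(ref_share, eps, None)
    cur_share = np.clip(cur_share, eps, None)
    return float(np.sum((cur_share - ref_share) * np.log(cur_share / ref_share)))
```

Under such a definition, a feature scoring above `stability_threshold` on an eval interval would be dropped, and `stability_agg_func` decides how per-interval scores combine: with the default "max", a single bad interval is enough to reject a feature.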
@@ -756,20 +799,6 @@ enricher = FeaturesEnricher(
  enriched_dataframe.fit(X, y)
  ```

- ### Return initial dataframe enriched with TOP external features by importance
-
- `transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
- - `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
- - `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
-
- And `keep_input=True` will keep all initial columns from search dataset X:
- ```python
- enricher = FeaturesEnricher(
-     search_keys={"subscription_activation_date": SearchKey.DATE}
- )
- enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
- ```
-
  ### Exclude premium data sources from fit, transform and metrics calculation

  `fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows to exclude Trial or Paid features from Premium data sources:
@@ -797,7 +826,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y)
  ```

- ## Turn off removing of target outliers
+ ### Turn off removing of target outliers
  Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:

  ```python
@@ -808,7 +837,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
  ```

- ## Turn off generating features on search keys
+ ### Turn off generating features on search keys
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:

  ```python
@@ -816,6 +845,7 @@ enricher = FeaturesEnricher(
      search_keys={"date": SearchKey.DATE},
      generate_search_key_features=False,
  )
+ ```

  ## 🔑 Open up all capabilities of Upgini

{upgini-1.2.113a3974.dev2 → upgini-1.2.114}/README.md

The README changes mirror the description embedded in PKG-INFO above.

@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
-         "hashed_email": SearchKey.HEM,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
+         "hashed_email": SearchKey.HEM,
          "last_visit_ip_address": SearchKey.IP,
          "registered_with_phone": SearchKey.PHONE
      })
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
-         "hashed_email": SearchKey.HEM,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
+         "hashed_email": SearchKey.HEM,
          "last_visit_ip_address": SearchKey.IP,
          "registered_with_phone": SearchKey.PHONE
      },
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "zip_code": SearchKey.POSTAL_CODE,
+         "zip_code": SearchKey.POSTAL_CODE,
      },
      country_code = "US",
      date_format = "%Y-%d-%m"
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE
      })

  # everything is ready to fit! For 200к records fitting should take around 10 minutes,
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
  enricher = FeaturesEnricher(
      search_keys={
          "subscription_activation_date": SearchKey.DATE,
-         "country": SearchKey.COUNTRY,
-         "zip_code": SearchKey.POSTAL_CODE,
+         "country": SearchKey.COUNTRY,
+         "zip_code": SearchKey.POSTAL_CODE,
      },
  )
  ```
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
  If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
  ```python
  enricher = FeaturesEnricher(
-     search_keys={
-         "sales_date": SearchKey.DATE,
+     search_keys={
+         "sales_date": SearchKey.DATE,
      },
      id_columns=["store_id", "product_id"],
      cv=CVType.time_series
@@ -687,9 +687,52 @@ enricher.fit(
  )
  ```
  #### ⚠️ Requirements for out-of-time dataset
- - Same data schema as for search initialization dataset
+ - Same data schema as for search initialization X dataset
  - Pandas dataframe representation

+ There are 3 options to pass out-of-time without labels:
+ ```python
+ enricher.fit(
+     train_ids_and_features,
+     train_label,
+     eval_set = [
+         (eval_ids_and_features_1,),  # Just tuple of 1 element
+         (eval_ids_and_features_2, None),  # None as labels
+         (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)),  # List or Series of the same size as eval X
+     ]
+ )
+ ```
+
+ ### Control feature stability with PSI parameters
+
+ `FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
+
+ ```python
+ enricher = FeaturesEnricher(
+     search_keys={"registration_date": SearchKey.DATE}
+ )
+
+ # Control feature stability during fit
+ enricher.fit(
+     X, y,
+     stability_threshold=0.2,  # PSI threshold: features with PSI above this value will be dropped
+     stability_agg_func="max"  # Aggregation function for stability values: "max", "min", "mean"
+ )
+
+ # Same parameters work for fit_transform
+ enriched_df = enricher.fit_transform(
+     X, y,
+     stability_threshold=0.1,  # Stricter threshold for more stable features
+     stability_agg_func="mean"  # Use mean aggregation instead of max
+ )
+ ```
+
+ **Stability parameters:**
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
+ - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
+
+ **PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
+
  ### Use custom loss function in feature selection & metrics calculation

  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
@@ -710,20 +753,6 @@ enricher = FeaturesEnricher(
  enriched_dataframe.fit(X, y)
  ```

- ### Return initial dataframe enriched with TOP external features by importance
-
- `transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
- - `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
- - `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
-
- And `keep_input=True` will keep all initial columns from search dataset X:
- ```python
- enricher = FeaturesEnricher(
-     search_keys={"subscription_activation_date": SearchKey.DATE}
- )
- enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
- ```
-
  ### Exclude premium data sources from fit, transform and metrics calculation

  `fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows to exclude Trial or Paid features from Premium data sources:
@@ -751,7 +780,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y)
  ```

- ## Turn off removing of target outliers
+ ### Turn off removing of target outliers
  Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:

  ```python
@@ -762,7 +791,7 @@ enricher = FeaturesEnricher(
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
  ```

- ## Turn off generating features on search keys
+ ### Turn off generating features on search keys
  Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:

  ```python
@@ -770,6 +799,7 @@ enricher = FeaturesEnricher(
      search_keys={"date": SearchKey.DATE},
      generate_search_key_features=False,
  )
+ ```

  ## 🔑 Open up all capabilities of Upgini

{upgini-1.2.113a3974.dev2 → upgini-1.2.114}/pyproject.toml

@@ -46,7 +46,7 @@ dependencies = [
      "python-dateutil>=2.8.0",
      "python-json-logger>=3.3.0",
      "requests>=2.8.0",
-     "scikit-learn>=1.3.0",
+     "scikit-learn>=1.3.0,<1.8.0",
      "scipy>=1.10.0",
      "python-bidi==0.4.2",
      "xhtml2pdf>=0.2.11,<0.3.0",
upgini-1.2.114/src/upgini/__about__.py (new file)

@@ -0,0 +1 @@
+ __version__ = "1.2.114"
{upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/dataset.py

@@ -25,7 +25,6 @@ from upgini.metadata import (
      AutoFEParameters,
      CVType,
      DataType,
-     FeaturesFilter,
      FileColumnMeaningType,
      FileColumnMetadata,
      FileMetadata,
@@ -37,8 +36,9 @@ from upgini.metadata import (
  )
  from upgini.resource_bundle import ResourceBundle, get_custom_bundle
  from upgini.search_task import SearchTask
+ from upgini.utils.config import SampleConfig
  from upgini.utils.email_utils import EmailSearchKeyConverter
- from upgini.utils.sample_utils import SampleColumns, SampleConfig, sample
+ from upgini.utils.sample_utils import SampleColumns, sample

  try:
      from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
@@ -50,10 +50,7 @@ except Exception:

  class Dataset:
      MIN_ROWS_COUNT = 100
-     MAX_ROWS = 100_000
-     IMBALANCE_THESHOLD = 0.6
-     MIN_TARGET_CLASS_ROWS = 100
-     MAX_MULTICLASS_CLASS_COUNT = 100
+     MAX_ROWS = 200_000
      MIN_SUPPORTED_DATE_TS = 946684800000  # 2000-01-01
      MAX_FEATURES_COUNT = 3500
      MAX_UPLOADING_FILE_SIZE = 268435456  # 256 Mb
@@ -73,6 +70,7 @@ class Dataset:
          cv_type: Optional[CVType] = None,
          date_column: Optional[str] = None,
          id_columns: Optional[List[str]] = None,
+         is_imbalanced: bool = False,
          random_state: Optional[int] = None,
          sample_config: Optional[SampleConfig] = None,
          rest_client: Optional[_RestClient] = None,
@@ -117,8 +115,9 @@ class Dataset:
          self.rest_client = rest_client
          self.random_state = random_state
          self.columns_renaming: Dict[str, str] = {}
-         self.imbalanced: bool = False
+         self.is_imbalanced: bool = False
          self.id_columns = id_columns
+         self.is_imbalanced = is_imbalanced
          self.date_column = date_column
          if logger is not None:
              self.logger = logger
@@ -184,7 +183,19 @@
      def __validate_target(self):
          # self.logger.info("Validating target")
          target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
-         target = self.data[target_column]
+
+         oot_indices = []
+         if EVAL_SET_INDEX in self.data.columns:
+             for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                 eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                 if eval_set[target_column].isna().all():
+                     oot_indices.append(eval_set_index)
+
+         df_to_check = self.data.copy()
+         if oot_indices:
+             df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
+
+         target = df_to_check[target_column]

          if self.task_type == ModelTaskType.BINARY:
              if not is_integer_dtype(target):
@@ -201,7 +212,7 @@
          elif self.task_type == ModelTaskType.MULTICLASS:
              if not is_integer_dtype(target):
                  try:
-                     target = self.data[target_column].astype("category").cat.codes
+                     target = target.astype("category").cat.codes
                  except Exception:
                      self.logger.exception("Failed to cast target to category codes for multiclass task type")
                      raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
@@ -227,8 +238,6 @@
          else:
              train_segment = self.data

-         self.imbalanced = self.__is_imbalanced(train_segment)
-
          sample_columns = SampleColumns(
              ids=self.id_columns,
              date=self.date_column,
@@ -237,55 +246,19 @@
          )

          self.data = sample(
-             train_segment if self.imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
+             train_segment if self.is_imbalanced else self.data,  # for imbalanced data we will be doing transform anyway
              self.task_type,
              self.cv_type,
              self.sample_config,
              sample_columns,
              self.random_state,
-             balance=self.imbalanced,
+             balance=self.is_imbalanced,
              force_downsampling=force_downsampling,
              logger=self.logger,
              bundle=self.bundle,
              warning_callback=self.warning_callback,
          )

-     def __is_imbalanced(self, data: pd.DataFrame) -> bool:
-         if self.task_type is None or not self.task_type.is_classification():
-             return False
-
-         if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
-             return False
-
-         count = len(data)
-         target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
-         target = data[target_column]
-         target_classes_count = target.nunique()
-
-         if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-             msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
-                 target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
-             )
-             self.logger.warning(msg)
-             raise ValidationError(msg)
-
-         vc = target.value_counts()
-         min_class_value = vc.index[len(vc) - 1]
-         min_class_count = vc[min_class_value]
-
-         if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-             msg = self.bundle.get("dataset_rarest_class_less_min").format(
-                 min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
-             )
-             self.logger.warning(msg)
-             raise ValidationError(msg)
-
-         min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
-         min_class_threshold = min_class_percent * count
-
-         # If min class count less than 30% for binary or (60 / classes_count)% for multiclass
-         return bool(min_class_count < min_class_threshold)
-
      def __validate_dataset(self, validate_target: bool, silent_mode: bool):
          """Validate DataSet"""
          # self.logger.info("validating etalon")
@@ -335,15 +308,37 @@
          all_valid_message = self.bundle.get("validation_all_valid_message")
          invalid_message = self.bundle.get("validation_invalid_message")

+         oot_indices = []
+         if EVAL_SET_INDEX in self.data.columns:
+             for eval_set_index in self.data[EVAL_SET_INDEX].unique():
+                 eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
+                 if eval_set[target].isna().all():
+                     oot_indices.append(eval_set_index)
+
          for col in columns_to_validate:
-             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
              if validate_target and target is not None and col == target:
-                 self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
+                 if oot_indices:
+                     mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
+                     invalid_target_mask = (
+                         self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                     )
+                     # Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
+                     self.data[f"{col}_is_valid"] = True
+                     self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
+                 else:
+                     # No OOT: mark invalid where target is NaN or +/-inf
+                     self.data[f"{col}_is_valid"] = ~(
+                         self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
+                     )
+             else:
+                 self.data[f"{col}_is_valid"] = ~self.data[col].isnull()

              if col in mandatory_columns:
                  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]

-             invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
+             # Use stable pandas API across versions: Series.unique keeps order
+             # and collapses multiple NaNs into a single NaN
+             invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
              valid_share = self.data[f"{col}_is_valid"].sum() / nrows
              original_col_name = self.columns_renaming[col]
              validation_stats[original_col_name] = {}
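
The comment introduced with the `invalid_values` replacement relies on two pandas behaviors: `Series.unique` preserves order of first appearance and returns NaN at most once. A quick standalone check (our snippet, any recent pandas):

```python
import numpy as np
import pandas as pd

s = pd.Series([np.nan, "b", np.nan, "a", "b"])
print(s.unique().tolist())  # [nan, 'b', 'a']: order of appearance kept, NaN collapsed to one
```

Note the behavior change as well: the old code deduplicated only the first five invalid rows, while the new code reports up to five distinct invalid values drawn from the whole column.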
@@ -503,9 +498,6 @@
          return_scores: bool,
          extract_features: bool,
          accurate_model: Optional[bool] = None,
-         importance_threshold: Optional[float] = None,
-         max_features: Optional[int] = None,
-         filter_features: Optional[dict] = None,
          runtime_parameters: Optional[RuntimeParameters] = None,
          metrics_calculation: Optional[bool] = False,
          auto_fe_parameters: Optional[AutoFEParameters] = None,
@@ -514,28 +506,12 @@
          search_customization = SearchCustomization(
              extractFeatures=extract_features,
              accurateModel=accurate_model,
-             importanceThreshold=importance_threshold,
-             maxFeatures=max_features,
              returnScores=return_scores,
              runtimeParameters=runtime_parameters,
              metricsCalculation=metrics_calculation,
          )
-         if filter_features:
-             if [
-                 key
-                 for key in filter_features
-                 if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
-             ]:
-                 raise ValidationError(self.bundle.get("dataset_invalid_filter"))
-             feature_filter = FeaturesFilter(
-                 minImportance=filter_features.get("min_importance"),
-                 maxPSI=filter_features.get("max_psi"),
-                 maxCount=filter_features.get("max_count"),
-                 selectedFeatures=filter_features.get("selected_features"),
-             )
-             search_customization.featuresFilter = feature_filter

-         search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.imbalanced
+         search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
          if auto_fe_parameters is not None:
              search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
                  auto_fe_parameters.ts_gap_days
@@ -590,9 +566,6 @@
          extract_features: bool = False,
          accurate_model: bool = False,
          exclude_features_sources: Optional[List[str]] = None,
-         importance_threshold: Optional[float] = None,  # deprecated
-         max_features: Optional[int] = None,  # deprecated
-         filter_features: Optional[dict] = None,  # deprecated
          runtime_parameters: Optional[RuntimeParameters] = None,
          auto_fe_parameters: Optional[AutoFEParameters] = None,
          force_downsampling: bool = False,
@@ -609,9 +582,6 @@
              return_scores=return_scores,
              extract_features=extract_features,
              accurate_model=accurate_model,
-             importance_threshold=importance_threshold,
-             max_features=max_features,
-             filter_features=filter_features,
              runtime_parameters=runtime_parameters,
              auto_fe_parameters=auto_fe_parameters,
          )