upgini 1.2.113a3974.dev2__tar.gz → 1.2.114__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/PKG-INFO +62 -32
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/README.md +60 -30
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/pyproject.toml +1 -1
- upgini-1.2.114/src/upgini/__about__.py +1 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/dataset.py +48 -78
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/features_enricher.py +726 -516
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/http.py +15 -19
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/metadata.py +1 -10
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/metrics.py +6 -2
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/strings.properties +8 -6
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/base.py +3 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/random_under_sampler.py +18 -8
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/search_task.py +6 -0
- upgini-1.2.114/src/upgini/utils/config.py +43 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/deduplicate_utils.py +57 -9
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/display_utils.py +1 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/feature_info.py +5 -0
- upgini-1.2.114/src/upgini/utils/hash_utils.py +159 -0
- upgini-1.2.114/src/upgini/utils/psi.py +300 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sample_utils.py +45 -42
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/target_utils.py +53 -2
- upgini-1.2.113a3974.dev2/src/upgini/__about__.py +0 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/.gitignore +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/LICENSE +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/errors.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/spinner.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.
|
3
|
+
Version: 1.2.114
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -38,7 +38,7 @@ Requires-Dist: python-bidi==0.4.2
|
|
38
38
|
Requires-Dist: python-dateutil>=2.8.0
|
39
39
|
Requires-Dist: python-json-logger>=3.3.0
|
40
40
|
Requires-Dist: requests>=2.8.0
|
41
|
-
Requires-Dist: scikit-learn
|
41
|
+
Requires-Dist: scikit-learn<1.8.0,>=1.3.0
|
42
42
|
Requires-Dist: scipy>=1.10.0
|
43
43
|
Requires-Dist: shap>=0.44.0
|
44
44
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
|
|
270
270
|
enricher = FeaturesEnricher(
|
271
271
|
search_keys={
|
272
272
|
"subscription_activation_date": SearchKey.DATE,
|
273
|
-
|
274
|
-
|
275
|
-
|
273
|
+
"country": SearchKey.COUNTRY,
|
274
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
275
|
+
"hashed_email": SearchKey.HEM,
|
276
276
|
"last_visit_ip_address": SearchKey.IP,
|
277
277
|
"registered_with_phone": SearchKey.PHONE
|
278
278
|
})
|
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
|
|
358
358
|
enricher = FeaturesEnricher(
|
359
359
|
search_keys={
|
360
360
|
"subscription_activation_date": SearchKey.DATE,
|
361
|
-
|
362
|
-
|
363
|
-
|
361
|
+
"country": SearchKey.COUNTRY,
|
362
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
363
|
+
"hashed_email": SearchKey.HEM,
|
364
364
|
"last_visit_ip_address": SearchKey.IP,
|
365
365
|
"registered_with_phone": SearchKey.PHONE
|
366
366
|
},
|
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
|
|
381
381
|
enricher = FeaturesEnricher(
|
382
382
|
search_keys={
|
383
383
|
"subscription_activation_date": SearchKey.DATE,
|
384
|
-
|
384
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
385
385
|
},
|
386
386
|
country_code = "US",
|
387
387
|
date_format = "%Y-%d-%m"
|
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
|
|
409
409
|
enricher = FeaturesEnricher(
|
410
410
|
search_keys={
|
411
411
|
"subscription_activation_date": SearchKey.DATE,
|
412
|
-
|
413
|
-
|
412
|
+
"country": SearchKey.COUNTRY,
|
413
|
+
"zip_code": SearchKey.POSTAL_CODE
|
414
414
|
})
|
415
415
|
|
416
416
|
# everything is ready to fit! For 200k records fitting should take around 10 minutes,
|
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
|
|
464
464
|
enricher = FeaturesEnricher(
|
465
465
|
search_keys={
|
466
466
|
"subscription_activation_date": SearchKey.DATE,
|
467
|
-
|
468
|
-
|
467
|
+
"country": SearchKey.COUNTRY,
|
468
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
469
469
|
},
|
470
470
|
)
|
471
471
|
```
|
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
|
|
516
516
|
If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
|
517
517
|
```python
|
518
518
|
enricher = FeaturesEnricher(
|
519
|
-
|
520
|
-
|
519
|
+
search_keys={
|
520
|
+
"sales_date": SearchKey.DATE,
|
521
521
|
},
|
522
522
|
id_columns=["store_id", "product_id"],
|
523
523
|
cv=CVType.time_series
|
@@ -733,9 +733,52 @@ enricher.fit(
|
|
733
733
|
)
|
734
734
|
```
|
735
735
|
#### ⚠️ Requirements for out-of-time dataset
|
736
|
-
- Same data schema as for search initialization dataset
|
736
|
+
- Same data schema as for search initialization X dataset
|
737
737
|
- Pandas dataframe representation
|
738
738
|
|
739
|
+
There are 3 options to pass an out-of-time dataset without labels:
|
740
|
+
```python
|
741
|
+
enricher.fit(
|
742
|
+
train_ids_and_features,
|
743
|
+
train_label,
|
744
|
+
eval_set = [
|
745
|
+
(eval_ids_and_features_1,), # Just tuple of 1 element
|
746
|
+
(eval_ids_and_features_2, None), # None as labels
|
747
|
+
(eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
|
748
|
+
]
|
749
|
+
)
|
750
|
+
```
|
751
|
+
|
752
|
+
### Control feature stability with PSI parameters
|
753
|
+
|
754
|
+
`FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
|
755
|
+
|
756
|
+
```python
|
757
|
+
enricher = FeaturesEnricher(
|
758
|
+
search_keys={"registration_date": SearchKey.DATE}
|
759
|
+
)
|
760
|
+
|
761
|
+
# Control feature stability during fit
|
762
|
+
enricher.fit(
|
763
|
+
X, y,
|
764
|
+
stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
|
765
|
+
stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
|
766
|
+
)
|
767
|
+
|
768
|
+
# Same parameters work for fit_transform
|
769
|
+
enriched_df = enricher.fit_transform(
|
770
|
+
X, y,
|
771
|
+
stability_threshold=0.1, # Stricter threshold for more stable features
|
772
|
+
stability_agg_func="mean" # Use mean aggregation instead of max
|
773
|
+
)
|
774
|
+
```
|
775
|
+
|
776
|
+
**Stability parameters:**
|
777
|
+
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
|
778
|
+
- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
|
779
|
+
|
780
|
+
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
|
781
|
+
|
739
782
|
### Use custom loss function in feature selection & metrics calculation
|
740
783
|
|
741
784
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -756,20 +799,6 @@ enricher = FeaturesEnricher(
|
|
756
799
|
enriched_dataframe.fit(X, y)
|
757
800
|
```
|
758
801
|
|
759
|
-
### Return initial dataframe enriched with TOP external features by importance
|
760
|
-
|
761
|
-
`transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
|
762
|
-
- `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
|
763
|
-
- `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
|
764
|
-
|
765
|
-
And `keep_input=True` will keep all initial columns from search dataset X:
|
766
|
-
```python
|
767
|
-
enricher = FeaturesEnricher(
|
768
|
-
search_keys={"subscription_activation_date": SearchKey.DATE}
|
769
|
-
)
|
770
|
-
enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
|
771
|
-
```
|
772
|
-
|
773
802
|
### Exclude premium data sources from fit, transform and metrics calculation
|
774
803
|
|
775
804
|
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows you to exclude Trial or Paid features from Premium data sources:
|
@@ -797,7 +826,7 @@ enricher = FeaturesEnricher(
|
|
797
826
|
enricher.fit(X, y)
|
798
827
|
```
|
799
828
|
|
800
|
-
|
829
|
+
### Turn off removing of target outliers
|
801
830
|
Upgini detects rows with target outliers for regression tasks. By default, such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use parameter `remove_outliers_calc_metrics=False` in the `fit`, `fit_transform` or `calculate_metrics` methods:
|
802
831
|
|
803
832
|
```python
|
@@ -808,7 +837,7 @@ enricher = FeaturesEnricher(
|
|
808
837
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
809
838
|
```
|
810
839
|
|
811
|
-
|
840
|
+
### Turn off generating features on search keys
|
812
841
|
Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
|
813
842
|
|
814
843
|
```python
|
@@ -816,6 +845,7 @@ enricher = FeaturesEnricher(
|
|
816
845
|
search_keys={"date": SearchKey.DATE},
|
817
846
|
generate_search_key_features=False,
|
818
847
|
)
|
848
|
+
```
|
819
849
|
|
820
850
|
## 🔑 Open up all capabilities of Upgini
|
821
851
|
|
@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
|
|
224
224
|
enricher = FeaturesEnricher(
|
225
225
|
search_keys={
|
226
226
|
"subscription_activation_date": SearchKey.DATE,
|
227
|
-
|
228
|
-
|
229
|
-
|
227
|
+
"country": SearchKey.COUNTRY,
|
228
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
229
|
+
"hashed_email": SearchKey.HEM,
|
230
230
|
"last_visit_ip_address": SearchKey.IP,
|
231
231
|
"registered_with_phone": SearchKey.PHONE
|
232
232
|
})
|
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
|
|
312
312
|
enricher = FeaturesEnricher(
|
313
313
|
search_keys={
|
314
314
|
"subscription_activation_date": SearchKey.DATE,
|
315
|
-
|
316
|
-
|
317
|
-
|
315
|
+
"country": SearchKey.COUNTRY,
|
316
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
317
|
+
"hashed_email": SearchKey.HEM,
|
318
318
|
"last_visit_ip_address": SearchKey.IP,
|
319
319
|
"registered_with_phone": SearchKey.PHONE
|
320
320
|
},
|
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
|
|
335
335
|
enricher = FeaturesEnricher(
|
336
336
|
search_keys={
|
337
337
|
"subscription_activation_date": SearchKey.DATE,
|
338
|
-
|
338
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
339
339
|
},
|
340
340
|
country_code = "US",
|
341
341
|
date_format = "%Y-%d-%m"
|
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
|
|
363
363
|
enricher = FeaturesEnricher(
|
364
364
|
search_keys={
|
365
365
|
"subscription_activation_date": SearchKey.DATE,
|
366
|
-
|
367
|
-
|
366
|
+
"country": SearchKey.COUNTRY,
|
367
|
+
"zip_code": SearchKey.POSTAL_CODE
|
368
368
|
})
|
369
369
|
|
370
370
|
# everything is ready to fit! For 200k records fitting should take around 10 minutes,
|
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
|
|
418
418
|
enricher = FeaturesEnricher(
|
419
419
|
search_keys={
|
420
420
|
"subscription_activation_date": SearchKey.DATE,
|
421
|
-
|
422
|
-
|
421
|
+
"country": SearchKey.COUNTRY,
|
422
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
423
423
|
},
|
424
424
|
)
|
425
425
|
```
|
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
|
|
470
470
|
If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
|
471
471
|
```python
|
472
472
|
enricher = FeaturesEnricher(
|
473
|
-
|
474
|
-
|
473
|
+
search_keys={
|
474
|
+
"sales_date": SearchKey.DATE,
|
475
475
|
},
|
476
476
|
id_columns=["store_id", "product_id"],
|
477
477
|
cv=CVType.time_series
|
@@ -687,9 +687,52 @@ enricher.fit(
|
|
687
687
|
)
|
688
688
|
```
|
689
689
|
#### ⚠️ Requirements for out-of-time dataset
|
690
|
-
- Same data schema as for search initialization dataset
|
690
|
+
- Same data schema as for search initialization X dataset
|
691
691
|
- Pandas dataframe representation
|
692
692
|
|
693
|
+
There are 3 options to pass an out-of-time dataset without labels:
|
694
|
+
```python
|
695
|
+
enricher.fit(
|
696
|
+
train_ids_and_features,
|
697
|
+
train_label,
|
698
|
+
eval_set = [
|
699
|
+
(eval_ids_and_features_1,), # Just tuple of 1 element
|
700
|
+
(eval_ids_and_features_2, None), # None as labels
|
701
|
+
(eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
|
702
|
+
]
|
703
|
+
)
|
704
|
+
```
|
705
|
+
|
706
|
+
### Control feature stability with PSI parameters
|
707
|
+
|
708
|
+
`FeaturesEnricher` supports Population Stability Index (PSI) calculation on eval_set to evaluate feature stability over time. You can control this behavior using stability parameters in `fit` and `fit_transform` methods:
|
709
|
+
|
710
|
+
```python
|
711
|
+
enricher = FeaturesEnricher(
|
712
|
+
search_keys={"registration_date": SearchKey.DATE}
|
713
|
+
)
|
714
|
+
|
715
|
+
# Control feature stability during fit
|
716
|
+
enricher.fit(
|
717
|
+
X, y,
|
718
|
+
stability_threshold=0.2, # PSI threshold: features with PSI above this value will be dropped
|
719
|
+
stability_agg_func="max" # Aggregation function for stability values: "max", "min", "mean"
|
720
|
+
)
|
721
|
+
|
722
|
+
# Same parameters work for fit_transform
|
723
|
+
enriched_df = enricher.fit_transform(
|
724
|
+
X, y,
|
725
|
+
stability_threshold=0.1, # Stricter threshold for more stable features
|
726
|
+
stability_agg_func="mean" # Use mean aggregation instead of max
|
727
|
+
)
|
728
|
+
```
|
729
|
+
|
730
|
+
**Stability parameters:**
|
731
|
+
- `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
|
732
|
+
- `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
|
733
|
+
|
734
|
+
**PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
|
735
|
+
|
693
736
|
### Use custom loss function in feature selection & metrics calculation
|
694
737
|
|
695
738
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -710,20 +753,6 @@ enricher = FeaturesEnricher(
|
|
710
753
|
enriched_dataframe.fit(X, y)
|
711
754
|
```
|
712
755
|
|
713
|
-
### Return initial dataframe enriched with TOP external features by importance
|
714
|
-
|
715
|
-
`transform` and `fit_transform` methods of `FeaturesEnricher` can be used with two additional parameters:
|
716
|
-
- `importance_threshold`: float = 0 - only features with *importance >= threshold* will be added to the output dataframe
|
717
|
-
- `max_features`: int - only first TOP N features by importance will be returned, where *N = max_features*
|
718
|
-
|
719
|
-
And `keep_input=True` will keep all initial columns from search dataset X:
|
720
|
-
```python
|
721
|
-
enricher = FeaturesEnricher(
|
722
|
-
search_keys={"subscription_activation_date": SearchKey.DATE}
|
723
|
-
)
|
724
|
-
enriched_dataframe.fit_transform(X, y, keep_input=True, max_features=2)
|
725
|
-
```
|
726
|
-
|
727
756
|
### Exclude premium data sources from fit, transform and metrics calculation
|
728
757
|
|
729
758
|
`fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows you to exclude Trial or Paid features from Premium data sources:
|
@@ -751,7 +780,7 @@ enricher = FeaturesEnricher(
|
|
751
780
|
enricher.fit(X, y)
|
752
781
|
```
|
753
782
|
|
754
|
-
|
783
|
+
### Turn off removing of target outliers
|
755
784
|
Upgini detects rows with target outliers for regression tasks. By default, such rows are dropped during metrics calculation. To turn off the removal of target outlier rows, use parameter `remove_outliers_calc_metrics=False` in the `fit`, `fit_transform` or `calculate_metrics` methods:
|
756
785
|
|
757
786
|
```python
|
@@ -762,7 +791,7 @@ enricher = FeaturesEnricher(
|
|
762
791
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
763
792
|
```
|
764
793
|
|
765
|
-
|
794
|
+
### Turn off generating features on search keys
|
766
795
|
Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
|
767
796
|
|
768
797
|
```python
|
@@ -770,6 +799,7 @@ enricher = FeaturesEnricher(
|
|
770
799
|
search_keys={"date": SearchKey.DATE},
|
771
800
|
generate_search_key_features=False,
|
772
801
|
)
|
802
|
+
```
|
773
803
|
|
774
804
|
## 🔑 Open up all capabilities of Upgini
|
775
805
|
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.114"
|
@@ -25,7 +25,6 @@ from upgini.metadata import (
|
|
25
25
|
AutoFEParameters,
|
26
26
|
CVType,
|
27
27
|
DataType,
|
28
|
-
FeaturesFilter,
|
29
28
|
FileColumnMeaningType,
|
30
29
|
FileColumnMetadata,
|
31
30
|
FileMetadata,
|
@@ -37,8 +36,9 @@ from upgini.metadata import (
|
|
37
36
|
)
|
38
37
|
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
39
38
|
from upgini.search_task import SearchTask
|
39
|
+
from upgini.utils.config import SampleConfig
|
40
40
|
from upgini.utils.email_utils import EmailSearchKeyConverter
|
41
|
-
from upgini.utils.sample_utils import SampleColumns,
|
41
|
+
from upgini.utils.sample_utils import SampleColumns, sample
|
42
42
|
|
43
43
|
try:
|
44
44
|
from upgini.utils.progress_bar import CustomProgressBar as ProgressBar
|
@@ -50,10 +50,7 @@ except Exception:
|
|
50
50
|
|
51
51
|
class Dataset:
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
|
-
MAX_ROWS =
|
54
|
-
IMBALANCE_THESHOLD = 0.6
|
55
|
-
MIN_TARGET_CLASS_ROWS = 100
|
56
|
-
MAX_MULTICLASS_CLASS_COUNT = 100
|
53
|
+
MAX_ROWS = 200_000
|
57
54
|
MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
|
58
55
|
MAX_FEATURES_COUNT = 3500
|
59
56
|
MAX_UPLOADING_FILE_SIZE = 268435456 # 256 Mb
|
@@ -73,6 +70,7 @@ class Dataset:
|
|
73
70
|
cv_type: Optional[CVType] = None,
|
74
71
|
date_column: Optional[str] = None,
|
75
72
|
id_columns: Optional[List[str]] = None,
|
73
|
+
is_imbalanced: bool = False,
|
76
74
|
random_state: Optional[int] = None,
|
77
75
|
sample_config: Optional[SampleConfig] = None,
|
78
76
|
rest_client: Optional[_RestClient] = None,
|
@@ -117,8 +115,9 @@ class Dataset:
|
|
117
115
|
self.rest_client = rest_client
|
118
116
|
self.random_state = random_state
|
119
117
|
self.columns_renaming: Dict[str, str] = {}
|
120
|
-
self.
|
118
|
+
self.is_imbalanced: bool = False
|
121
119
|
self.id_columns = id_columns
|
120
|
+
self.is_imbalanced = is_imbalanced
|
122
121
|
self.date_column = date_column
|
123
122
|
if logger is not None:
|
124
123
|
self.logger = logger
|
@@ -184,7 +183,19 @@ class Dataset:
|
|
184
183
|
def __validate_target(self):
|
185
184
|
# self.logger.info("Validating target")
|
186
185
|
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, "")
|
187
|
-
|
186
|
+
|
187
|
+
oot_indices = []
|
188
|
+
if EVAL_SET_INDEX in self.data.columns:
|
189
|
+
for eval_set_index in self.data[EVAL_SET_INDEX].unique():
|
190
|
+
eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
|
191
|
+
if eval_set[target_column].isna().all():
|
192
|
+
oot_indices.append(eval_set_index)
|
193
|
+
|
194
|
+
df_to_check = self.data.copy()
|
195
|
+
if oot_indices:
|
196
|
+
df_to_check = df_to_check[~df_to_check[EVAL_SET_INDEX].isin(oot_indices)]
|
197
|
+
|
198
|
+
target = df_to_check[target_column]
|
188
199
|
|
189
200
|
if self.task_type == ModelTaskType.BINARY:
|
190
201
|
if not is_integer_dtype(target):
|
@@ -201,7 +212,7 @@ class Dataset:
|
|
201
212
|
elif self.task_type == ModelTaskType.MULTICLASS:
|
202
213
|
if not is_integer_dtype(target):
|
203
214
|
try:
|
204
|
-
target =
|
215
|
+
target = target.astype("category").cat.codes
|
205
216
|
except Exception:
|
206
217
|
self.logger.exception("Failed to cast target to category codes for multiclass task type")
|
207
218
|
raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
|
@@ -227,8 +238,6 @@ class Dataset:
|
|
227
238
|
else:
|
228
239
|
train_segment = self.data
|
229
240
|
|
230
|
-
self.imbalanced = self.__is_imbalanced(train_segment)
|
231
|
-
|
232
241
|
sample_columns = SampleColumns(
|
233
242
|
ids=self.id_columns,
|
234
243
|
date=self.date_column,
|
@@ -237,55 +246,19 @@ class Dataset:
|
|
237
246
|
)
|
238
247
|
|
239
248
|
self.data = sample(
|
240
|
-
train_segment if self.
|
249
|
+
train_segment if self.is_imbalanced else self.data, # for imbalanced data we will be doing transform anyway
|
241
250
|
self.task_type,
|
242
251
|
self.cv_type,
|
243
252
|
self.sample_config,
|
244
253
|
sample_columns,
|
245
254
|
self.random_state,
|
246
|
-
balance=self.
|
255
|
+
balance=self.is_imbalanced,
|
247
256
|
force_downsampling=force_downsampling,
|
248
257
|
logger=self.logger,
|
249
258
|
bundle=self.bundle,
|
250
259
|
warning_callback=self.warning_callback,
|
251
260
|
)
|
252
261
|
|
253
|
-
def __is_imbalanced(self, data: pd.DataFrame) -> bool:
|
254
|
-
if self.task_type is None or not self.task_type.is_classification():
|
255
|
-
return False
|
256
|
-
|
257
|
-
if self.task_type == ModelTaskType.BINARY and len(data) <= self.sample_config.binary_min_sample_threshold:
|
258
|
-
return False
|
259
|
-
|
260
|
-
count = len(data)
|
261
|
-
target_column = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value, TARGET)
|
262
|
-
target = data[target_column]
|
263
|
-
target_classes_count = target.nunique()
|
264
|
-
|
265
|
-
if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
|
266
|
-
msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
|
267
|
-
target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
|
268
|
-
)
|
269
|
-
self.logger.warning(msg)
|
270
|
-
raise ValidationError(msg)
|
271
|
-
|
272
|
-
vc = target.value_counts()
|
273
|
-
min_class_value = vc.index[len(vc) - 1]
|
274
|
-
min_class_count = vc[min_class_value]
|
275
|
-
|
276
|
-
if min_class_count < self.MIN_TARGET_CLASS_ROWS:
|
277
|
-
msg = self.bundle.get("dataset_rarest_class_less_min").format(
|
278
|
-
min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
|
279
|
-
)
|
280
|
-
self.logger.warning(msg)
|
281
|
-
raise ValidationError(msg)
|
282
|
-
|
283
|
-
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
284
|
-
min_class_threshold = min_class_percent * count
|
285
|
-
|
286
|
-
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
287
|
-
return bool(min_class_count < min_class_threshold)
|
288
|
-
|
289
262
|
def __validate_dataset(self, validate_target: bool, silent_mode: bool):
|
290
263
|
"""Validate DataSet"""
|
291
264
|
# self.logger.info("validating etalon")
|
@@ -335,15 +308,37 @@ class Dataset:
|
|
335
308
|
all_valid_message = self.bundle.get("validation_all_valid_message")
|
336
309
|
invalid_message = self.bundle.get("validation_invalid_message")
|
337
310
|
|
311
|
+
oot_indices = []
|
312
|
+
if EVAL_SET_INDEX in self.data.columns:
|
313
|
+
for eval_set_index in self.data[EVAL_SET_INDEX].unique():
|
314
|
+
eval_set = self.data[self.data[EVAL_SET_INDEX] == eval_set_index]
|
315
|
+
if eval_set[target].isna().all():
|
316
|
+
oot_indices.append(eval_set_index)
|
317
|
+
|
338
318
|
for col in columns_to_validate:
|
339
|
-
self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
|
340
319
|
if validate_target and target is not None and col == target:
|
341
|
-
|
320
|
+
if oot_indices:
|
321
|
+
mask_not_oot = ~self.data[EVAL_SET_INDEX].isin(oot_indices)
|
322
|
+
invalid_target_mask = (
|
323
|
+
self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
|
324
|
+
)
|
325
|
+
# Initialize as valid and mark invalid only for non-OOT rows with NaN or +/-inf
|
326
|
+
self.data[f"{col}_is_valid"] = True
|
327
|
+
self.data.loc[mask_not_oot & invalid_target_mask, f"{col}_is_valid"] = False
|
328
|
+
else:
|
329
|
+
# No OOT: mark invalid where target is NaN or +/-inf
|
330
|
+
self.data[f"{col}_is_valid"] = ~(
|
331
|
+
self.data[col].isnull() | (self.data[col] == np.inf) | (self.data[col] == -np.inf)
|
332
|
+
)
|
333
|
+
else:
|
334
|
+
self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
|
342
335
|
|
343
336
|
if col in mandatory_columns:
|
344
337
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
345
338
|
|
346
|
-
|
339
|
+
# Use stable pandas API across versions: Series.unique keeps order
|
340
|
+
# and collapses multiple NaNs into a single NaN
|
341
|
+
invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
|
347
342
|
valid_share = self.data[f"{col}_is_valid"].sum() / nrows
|
348
343
|
original_col_name = self.columns_renaming[col]
|
349
344
|
validation_stats[original_col_name] = {}
|
@@ -503,9 +498,6 @@ class Dataset:
|
|
503
498
|
return_scores: bool,
|
504
499
|
extract_features: bool,
|
505
500
|
accurate_model: Optional[bool] = None,
|
506
|
-
importance_threshold: Optional[float] = None,
|
507
|
-
max_features: Optional[int] = None,
|
508
|
-
filter_features: Optional[dict] = None,
|
509
501
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
510
502
|
metrics_calculation: Optional[bool] = False,
|
511
503
|
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
@@ -514,28 +506,12 @@ class Dataset:
|
|
514
506
|
search_customization = SearchCustomization(
|
515
507
|
extractFeatures=extract_features,
|
516
508
|
accurateModel=accurate_model,
|
517
|
-
importanceThreshold=importance_threshold,
|
518
|
-
maxFeatures=max_features,
|
519
509
|
returnScores=return_scores,
|
520
510
|
runtimeParameters=runtime_parameters,
|
521
511
|
metricsCalculation=metrics_calculation,
|
522
512
|
)
|
523
|
-
if filter_features:
|
524
|
-
if [
|
525
|
-
key
|
526
|
-
for key in filter_features
|
527
|
-
if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
|
528
|
-
]:
|
529
|
-
raise ValidationError(self.bundle.get("dataset_invalid_filter"))
|
530
|
-
feature_filter = FeaturesFilter(
|
531
|
-
minImportance=filter_features.get("min_importance"),
|
532
|
-
maxPSI=filter_features.get("max_psi"),
|
533
|
-
maxCount=filter_features.get("max_count"),
|
534
|
-
selectedFeatures=filter_features.get("selected_features"),
|
535
|
-
)
|
536
|
-
search_customization.featuresFilter = feature_filter
|
537
513
|
|
538
|
-
search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.
|
514
|
+
search_customization.runtimeParameters.properties["etalon_imbalanced"] = self.is_imbalanced
|
539
515
|
if auto_fe_parameters is not None:
|
540
516
|
search_customization.runtimeParameters.properties["feature_generation_params.ts.gap_days"] = (
|
541
517
|
auto_fe_parameters.ts_gap_days
|
@@ -590,9 +566,6 @@ class Dataset:
|
|
590
566
|
extract_features: bool = False,
|
591
567
|
accurate_model: bool = False,
|
592
568
|
exclude_features_sources: Optional[List[str]] = None,
|
593
|
-
importance_threshold: Optional[float] = None, # deprecated
|
594
|
-
max_features: Optional[int] = None, # deprecated
|
595
|
-
filter_features: Optional[dict] = None, # deprecated
|
596
569
|
runtime_parameters: Optional[RuntimeParameters] = None,
|
597
570
|
auto_fe_parameters: Optional[AutoFEParameters] = None,
|
598
571
|
force_downsampling: bool = False,
|
@@ -609,9 +582,6 @@ class Dataset:
|
|
609
582
|
return_scores=return_scores,
|
610
583
|
extract_features=extract_features,
|
611
584
|
accurate_model=accurate_model,
|
612
|
-
importance_threshold=importance_threshold,
|
613
|
-
max_features=max_features,
|
614
|
-
filter_features=filter_features,
|
615
585
|
runtime_parameters=runtime_parameters,
|
616
586
|
auto_fe_parameters=auto_fe_parameters,
|
617
587
|
)
|