upgini 1.2.113a3974.dev2__tar.gz → 1.2.114a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/PKG-INFO +31 -17
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/README.md +30 -16
- upgini-1.2.114a1/src/upgini/__about__.py +1 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/dataset.py +3 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/features_enricher.py +462 -136
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/metadata.py +1 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/metrics.py +6 -2
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/resource_bundle/strings.properties +4 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/sampler/base.py +3 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/sampler/random_under_sampler.py +18 -8
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/deduplicate_utils.py +43 -7
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/feature_info.py +5 -0
- upgini-1.2.114a1/src/upgini/utils/psi.py +294 -0
- upgini-1.2.113a3974.dev2/src/upgini/__about__.py +0 -1
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/.gitignore +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/LICENSE +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/pyproject.toml +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/ads.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/errors.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/http.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/sample_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.113a3974.dev2 → upgini-1.2.114a1}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.113a3974.dev2
|
3
|
+
Version: 1.2.114a1
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -270,9 +270,9 @@ from upgini.metadata import SearchKey
|
|
270
270
|
enricher = FeaturesEnricher(
|
271
271
|
search_keys={
|
272
272
|
"subscription_activation_date": SearchKey.DATE,
|
273
|
-
|
274
|
-
|
275
|
-
|
273
|
+
"country": SearchKey.COUNTRY,
|
274
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
275
|
+
"hashed_email": SearchKey.HEM,
|
276
276
|
"last_visit_ip_address": SearchKey.IP,
|
277
277
|
"registered_with_phone": SearchKey.PHONE
|
278
278
|
})
|
@@ -358,9 +358,9 @@ from upgini.metadata import SearchKey
|
|
358
358
|
enricher = FeaturesEnricher(
|
359
359
|
search_keys={
|
360
360
|
"subscription_activation_date": SearchKey.DATE,
|
361
|
-
|
362
|
-
|
363
|
-
|
361
|
+
"country": SearchKey.COUNTRY,
|
362
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
363
|
+
"hashed_email": SearchKey.HEM,
|
364
364
|
"last_visit_ip_address": SearchKey.IP,
|
365
365
|
"registered_with_phone": SearchKey.PHONE
|
366
366
|
},
|
@@ -381,7 +381,7 @@ from upgini.metadata import SearchKey
|
|
381
381
|
enricher = FeaturesEnricher(
|
382
382
|
search_keys={
|
383
383
|
"subscription_activation_date": SearchKey.DATE,
|
384
|
-
|
384
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
385
385
|
},
|
386
386
|
country_code = "US",
|
387
387
|
date_format = "%Y-%d-%m"
|
@@ -409,8 +409,8 @@ y = train_df["churn_flag"]
|
|
409
409
|
enricher = FeaturesEnricher(
|
410
410
|
search_keys={
|
411
411
|
"subscription_activation_date": SearchKey.DATE,
|
412
|
-
|
413
|
-
|
412
|
+
"country": SearchKey.COUNTRY,
|
413
|
+
"zip_code": SearchKey.POSTAL_CODE
|
414
414
|
})
|
415
415
|
|
416
416
|
# everything is ready to fit! For 200k records fitting should take around 10 minutes,
|
@@ -464,8 +464,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
|
|
464
464
|
enricher = FeaturesEnricher(
|
465
465
|
search_keys={
|
466
466
|
"subscription_activation_date": SearchKey.DATE,
|
467
|
-
|
468
|
-
|
467
|
+
"country": SearchKey.COUNTRY,
|
468
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
469
469
|
},
|
470
470
|
)
|
471
471
|
```
|
@@ -516,8 +516,8 @@ enricher = FeaturesEnricher(
|
|
516
516
|
If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
|
517
517
|
```python
|
518
518
|
enricher = FeaturesEnricher(
|
519
|
-
|
520
|
-
|
519
|
+
search_keys={
|
520
|
+
"sales_date": SearchKey.DATE,
|
521
521
|
},
|
522
522
|
id_columns=["store_id", "product_id"],
|
523
523
|
cv=CVType.time_series
|
@@ -733,9 +733,22 @@ enricher.fit(
|
|
733
733
|
)
|
734
734
|
```
|
735
735
|
#### ⚠️ Requirements for out-of-time dataset
|
736
|
-
- Same data schema as for search initialization dataset
|
736
|
+
- Same data schema as for search initialization X dataset
|
737
737
|
- Pandas dataframe representation
|
738
738
|
|
739
|
+
There are 3 ways to pass an out-of-time dataset without labels:
|
740
|
+
```python
|
741
|
+
enricher.fit(
|
742
|
+
train_ids_and_features,
|
743
|
+
train_label,
|
744
|
+
eval_set = [
|
745
|
+
(eval_ids_and_features_1,), # Just tuple of 1 element
|
746
|
+
(eval_ids_and_features_2, None), # None as labels
|
747
|
+
(eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
|
748
|
+
]
|
749
|
+
)
|
750
|
+
```
|
751
|
+
|
739
752
|
### Use custom loss function in feature selection & metrics calculation
|
740
753
|
|
741
754
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -797,7 +810,7 @@ enricher = FeaturesEnricher(
|
|
797
810
|
enricher.fit(X, y)
|
798
811
|
```
|
799
812
|
|
800
|
-
|
813
|
+
### Turn off removing of target outliers
|
801
814
|
Upgini detects rows with target outliers for regression tasks. By default, such rows are dropped during metrics calculation. To keep these rows, pass `remove_outliers_calc_metrics=False` to the `fit`, `fit_transform` or `calculate_metrics` methods:
|
802
815
|
|
803
816
|
```python
|
@@ -808,7 +821,7 @@ enricher = FeaturesEnricher(
|
|
808
821
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
809
822
|
```
|
810
823
|
|
811
|
-
|
824
|
+
### Turn off generating features on search keys
|
812
825
|
Upgini tries to generate features from email, date and datetime search keys. This generation is enabled by default. To disable it, set the `generate_search_key_features` parameter of the `FeaturesEnricher` constructor to `False`:
|
813
826
|
|
814
827
|
```python
|
@@ -816,6 +829,7 @@ enricher = FeaturesEnricher(
|
|
816
829
|
search_keys={"date": SearchKey.DATE},
|
817
830
|
generate_search_key_features=False,
|
818
831
|
)
|
832
|
+
```
|
819
833
|
|
820
834
|
## 🔑 Open up all capabilities of Upgini
|
821
835
|
|
@@ -224,9 +224,9 @@ from upgini.metadata import SearchKey
|
|
224
224
|
enricher = FeaturesEnricher(
|
225
225
|
search_keys={
|
226
226
|
"subscription_activation_date": SearchKey.DATE,
|
227
|
-
|
228
|
-
|
229
|
-
|
227
|
+
"country": SearchKey.COUNTRY,
|
228
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
229
|
+
"hashed_email": SearchKey.HEM,
|
230
230
|
"last_visit_ip_address": SearchKey.IP,
|
231
231
|
"registered_with_phone": SearchKey.PHONE
|
232
232
|
})
|
@@ -312,9 +312,9 @@ from upgini.metadata import SearchKey
|
|
312
312
|
enricher = FeaturesEnricher(
|
313
313
|
search_keys={
|
314
314
|
"subscription_activation_date": SearchKey.DATE,
|
315
|
-
|
316
|
-
|
317
|
-
|
315
|
+
"country": SearchKey.COUNTRY,
|
316
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
317
|
+
"hashed_email": SearchKey.HEM,
|
318
318
|
"last_visit_ip_address": SearchKey.IP,
|
319
319
|
"registered_with_phone": SearchKey.PHONE
|
320
320
|
},
|
@@ -335,7 +335,7 @@ from upgini.metadata import SearchKey
|
|
335
335
|
enricher = FeaturesEnricher(
|
336
336
|
search_keys={
|
337
337
|
"subscription_activation_date": SearchKey.DATE,
|
338
|
-
|
338
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
339
339
|
},
|
340
340
|
country_code = "US",
|
341
341
|
date_format = "%Y-%d-%m"
|
@@ -363,8 +363,8 @@ y = train_df["churn_flag"]
|
|
363
363
|
enricher = FeaturesEnricher(
|
364
364
|
search_keys={
|
365
365
|
"subscription_activation_date": SearchKey.DATE,
|
366
|
-
|
367
|
-
|
366
|
+
"country": SearchKey.COUNTRY,
|
367
|
+
"zip_code": SearchKey.POSTAL_CODE
|
368
368
|
})
|
369
369
|
|
370
370
|
# everything is ready to fit! For 200k records fitting should take around 10 minutes,
|
@@ -418,8 +418,8 @@ And then, for `transform` in a production ML pipeline, you'll get enrichment wit
|
|
418
418
|
enricher = FeaturesEnricher(
|
419
419
|
search_keys={
|
420
420
|
"subscription_activation_date": SearchKey.DATE,
|
421
|
-
|
422
|
-
|
421
|
+
"country": SearchKey.COUNTRY,
|
422
|
+
"zip_code": SearchKey.POSTAL_CODE,
|
423
423
|
},
|
424
424
|
)
|
425
425
|
```
|
@@ -470,8 +470,8 @@ enricher = FeaturesEnricher(
|
|
470
470
|
If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
|
471
471
|
```python
|
472
472
|
enricher = FeaturesEnricher(
|
473
|
-
|
474
|
-
|
473
|
+
search_keys={
|
474
|
+
"sales_date": SearchKey.DATE,
|
475
475
|
},
|
476
476
|
id_columns=["store_id", "product_id"],
|
477
477
|
cv=CVType.time_series
|
@@ -687,9 +687,22 @@ enricher.fit(
|
|
687
687
|
)
|
688
688
|
```
|
689
689
|
#### ⚠️ Requirements for out-of-time dataset
|
690
|
-
- Same data schema as for search initialization dataset
|
690
|
+
- Same data schema as for search initialization X dataset
|
691
691
|
- Pandas dataframe representation
|
692
692
|
|
693
|
+
There are 3 ways to pass an out-of-time dataset without labels:
|
694
|
+
```python
|
695
|
+
enricher.fit(
|
696
|
+
train_ids_and_features,
|
697
|
+
train_label,
|
698
|
+
eval_set = [
|
699
|
+
(eval_ids_and_features_1,), # Just tuple of 1 element
|
700
|
+
(eval_ids_and_features_2, None), # None as labels
|
701
|
+
(eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
|
702
|
+
]
|
703
|
+
)
|
704
|
+
```
|
705
|
+
|
693
706
|
### Use custom loss function in feature selection & metrics calculation
|
694
707
|
|
695
708
|
`FeaturesEnricher` can be initialized with additional string parameter `loss`.
|
@@ -751,7 +764,7 @@ enricher = FeaturesEnricher(
|
|
751
764
|
enricher.fit(X, y)
|
752
765
|
```
|
753
766
|
|
754
|
-
|
767
|
+
### Turn off removing of target outliers
|
755
768
|
Upgini detects rows with target outliers for regression tasks. By default, such rows are dropped during metrics calculation. To keep these rows, pass `remove_outliers_calc_metrics=False` to the `fit`, `fit_transform` or `calculate_metrics` methods:
|
756
769
|
|
757
770
|
```python
|
@@ -762,7 +775,7 @@ enricher = FeaturesEnricher(
|
|
762
775
|
enricher.fit(X, y, remove_outliers_calc_metrics=False)
|
763
776
|
```
|
764
777
|
|
765
|
-
|
778
|
+
### Turn off generating features on search keys
|
766
779
|
Upgini tries to generate features from email, date and datetime search keys. This generation is enabled by default. To disable it, set the `generate_search_key_features` parameter of the `FeaturesEnricher` constructor to `False`:
|
767
780
|
|
768
781
|
```python
|
@@ -770,6 +783,7 @@ enricher = FeaturesEnricher(
|
|
770
783
|
search_keys={"date": SearchKey.DATE},
|
771
784
|
generate_search_key_features=False,
|
772
785
|
)
|
786
|
+
```
|
773
787
|
|
774
788
|
## 🔑 Open up all capabilities of Upgini
|
775
789
|
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.114a1"
|
@@ -343,7 +343,9 @@ class Dataset:
|
|
343
343
|
if col in mandatory_columns:
|
344
344
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
345
345
|
|
346
|
-
|
346
|
+
# Use stable pandas API across versions: Series.unique keeps order
|
347
|
+
# and collapses multiple NaNs into a single NaN
|
348
|
+
invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
|
347
349
|
valid_share = self.data[f"{col}_is_valid"].sum() / nrows
|
348
350
|
original_col_name = self.columns_renaming[col]
|
349
351
|
validation_stats[original_col_name] = {}
|