upgini 1.2.39a3769.dev2__py3-none-any.whl → 1.2.41a3758.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +42 -16
- upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.39a3769.dev2.dist-info → upgini-1.2.41a3758.dev1.dist-info}/METADATA +15 -3
- {upgini-1.2.39a3769.dev2.dist-info → upgini-1.2.41a3758.dev1.dist-info}/RECORD +7 -7
- {upgini-1.2.39a3769.dev2.dist-info → upgini-1.2.41a3758.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.39a3769.dev2.dist-info → upgini-1.2.41a3758.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.39a3769.dev2"
|
|
1
|
+
__version__ = "1.2.41a3758.dev1"
|
upgini/features_enricher.py
CHANGED
|
@@ -165,10 +165,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
165
165
|
|
|
166
166
|
shared_datasets: list of str, optional (default=None)
|
|
167
167
|
List of private shared dataset ids for custom search
|
|
168
|
-
|
|
169
|
-
select_features: bool, optional (default=False)
|
|
170
|
-
If True, return only selected features both from input and data sources.
|
|
171
|
-
Otherwise, return all features from input and only selected features from data sources.
|
|
172
168
|
"""
|
|
173
169
|
|
|
174
170
|
TARGET_NAME = "target"
|
|
@@ -235,7 +231,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
235
231
|
client_visitorid: Optional[str] = None,
|
|
236
232
|
custom_bundle_config: Optional[str] = None,
|
|
237
233
|
add_date_if_missing: bool = True,
|
|
238
|
-
select_features: bool = False,
|
|
239
234
|
disable_force_downsampling: bool = False,
|
|
240
235
|
id_columns: Optional[List[str]] = None,
|
|
241
236
|
**kwargs,
|
|
@@ -297,7 +292,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
297
292
|
self.dropped_client_feature_names_ = []
|
|
298
293
|
self.feature_importances_ = []
|
|
299
294
|
self.search_id = search_id
|
|
300
|
-
self.select_features = select_features
|
|
301
295
|
self.disable_force_downsampling = disable_force_downsampling
|
|
302
296
|
|
|
303
297
|
if search_id:
|
|
@@ -405,6 +399,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
405
399
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
406
400
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
407
401
|
search_id_callback: Optional[Callable[[str], Any]] = None,
|
|
402
|
+
select_features: bool = False,
|
|
408
403
|
**kwargs,
|
|
409
404
|
):
|
|
410
405
|
"""Fit to data.
|
|
@@ -440,6 +435,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
440
435
|
|
|
441
436
|
remove_outliers_calc_metrics, optional (default=True)
|
|
442
437
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
438
|
+
|
|
439
|
+
select_features: bool, optional (default=False)
|
|
440
|
+
If True, return only selected features both from input and data sources.
|
|
441
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
443
442
|
"""
|
|
444
443
|
trace_id = str(uuid.uuid4())
|
|
445
444
|
start_time = time.time()
|
|
@@ -474,6 +473,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
474
473
|
self.y = y
|
|
475
474
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
476
475
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
476
|
+
self.__set_select_features(select_features)
|
|
477
477
|
self.__inner_fit(
|
|
478
478
|
trace_id,
|
|
479
479
|
X,
|
|
@@ -523,6 +523,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
523
523
|
finally:
|
|
524
524
|
self.logger.info(f"Fit elapsed time: {time.time() - start_time}")
|
|
525
525
|
|
|
526
|
+
def __set_select_features(self, select_features: bool):
|
|
527
|
+
self.fit_select_features = select_features
|
|
528
|
+
self.runtime_parameters.properties["select_features"] = select_features
|
|
529
|
+
|
|
526
530
|
def fit_transform(
|
|
527
531
|
self,
|
|
528
532
|
X: Union[pd.DataFrame, pd.Series, np.ndarray],
|
|
@@ -538,6 +542,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
538
542
|
estimator: Optional[Any] = None,
|
|
539
543
|
remove_outliers_calc_metrics: Optional[bool] = None,
|
|
540
544
|
progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
|
|
545
|
+
select_features: bool = False,
|
|
541
546
|
**kwargs,
|
|
542
547
|
) -> pd.DataFrame:
|
|
543
548
|
"""Fit to data, then transform it.
|
|
@@ -578,6 +583,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
578
583
|
remove_outliers_calc_metrics, optional (default=True)
|
|
579
584
|
If True then rows with target ouliers will be dropped on metrics calculation
|
|
580
585
|
|
|
586
|
+
select_features: bool, optional (default=False)
|
|
587
|
+
If True, return only selected features both from input and data sources.
|
|
588
|
+
Otherwise, return all features from input and only selected features from data sources.
|
|
589
|
+
|
|
581
590
|
Returns
|
|
582
591
|
-------
|
|
583
592
|
X_new: pandas.DataFrame of shape (n_samples, n_features_new)
|
|
@@ -612,6 +621,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
612
621
|
self.X = X
|
|
613
622
|
self.y = y
|
|
614
623
|
self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
|
|
624
|
+
self.__set_select_features(select_features)
|
|
615
625
|
self.dump_input(trace_id, X, y, self.eval_set)
|
|
616
626
|
|
|
617
627
|
if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
|
|
@@ -1231,8 +1241,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1231
1241
|
self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
|
|
1232
1242
|
|
|
1233
1243
|
def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
|
|
1244
|
+
renaming = self.fit_columns_renaming or {}
|
|
1234
1245
|
new_shaps = {
|
|
1235
|
-
feature: _round_shap_value(shap)
|
|
1246
|
+
renaming.get(feature, feature): _round_shap_value(shap)
|
|
1247
|
+
for feature, shap in new_shaps.items()
|
|
1248
|
+
if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
|
|
1236
1249
|
}
|
|
1237
1250
|
self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
|
|
1238
1251
|
|
|
@@ -1461,7 +1474,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1461
1474
|
c
|
|
1462
1475
|
for c in X_sampled.columns.to_list()
|
|
1463
1476
|
if (
|
|
1464
|
-
not self.select_features
|
|
1477
|
+
not self.fit_select_features
|
|
1465
1478
|
or c in self.feature_names_
|
|
1466
1479
|
or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
|
|
1467
1480
|
)
|
|
@@ -2008,7 +2021,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2008
2021
|
trace_id = trace_id or uuid.uuid4()
|
|
2009
2022
|
return search_task.get_progress(trace_id)
|
|
2010
2023
|
|
|
2011
|
-
def get_transactional_transform_api(self):
|
|
2024
|
+
def get_transactional_transform_api(self, only_online_sources=False):
|
|
2012
2025
|
if self.api_key is None:
|
|
2013
2026
|
raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
|
|
2014
2027
|
if self._search_task is None:
|
|
@@ -2066,7 +2079,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2066
2079
|
api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
|
|
2067
2080
|
-H 'Authorization: {self.api_key}' \\
|
|
2068
2081
|
-H 'Content-Type: application/json' \\
|
|
2069
|
-
-d '{{"search_keys": {keys}{features_section}}}'"""
|
|
2082
|
+
-d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
|
|
2070
2083
|
return api_example
|
|
2071
2084
|
|
|
2072
2085
|
def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
|
|
@@ -2110,13 +2123,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2110
2123
|
return None, {c: c for c in X.columns}, []
|
|
2111
2124
|
|
|
2112
2125
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
|
2113
|
-
online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
|
|
2126
|
+
online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
|
|
2114
2127
|
if len(online_api_features) > 0:
|
|
2115
2128
|
self.logger.warning(
|
|
2116
2129
|
f"There are important features for transform, that generated by online API: {online_api_features}"
|
|
2117
2130
|
)
|
|
2118
|
-
|
|
2119
|
-
|
|
2131
|
+
msg = self.bundle.get("online_api_features_transform").format(online_api_features)
|
|
2132
|
+
self.logger.warning(msg)
|
|
2133
|
+
print(msg)
|
|
2134
|
+
print(self.get_transactional_transform_api(only_online_sources=True))
|
|
2120
2135
|
|
|
2121
2136
|
if not metrics_calculation:
|
|
2122
2137
|
transform_usage = self.rest_client.get_current_transform_usage(trace_id)
|
|
@@ -2702,6 +2717,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2702
2717
|
self.fit_search_keys,
|
|
2703
2718
|
self.fit_columns_renaming,
|
|
2704
2719
|
list(unnest_search_keys.keys()),
|
|
2720
|
+
self.bundle,
|
|
2705
2721
|
self.logger,
|
|
2706
2722
|
)
|
|
2707
2723
|
df = converter.convert(df)
|
|
@@ -3269,6 +3285,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3269
3285
|
f"Generate features: {self.generate_features}\n"
|
|
3270
3286
|
f"Round embeddings: {self.round_embeddings}\n"
|
|
3271
3287
|
f"Detect missing search keys: {self.detect_missing_search_keys}\n"
|
|
3288
|
+
f"Exclude columns: {self.exclude_columns}\n"
|
|
3272
3289
|
f"Exclude features sources: {exclude_features_sources}\n"
|
|
3273
3290
|
f"Calculate metrics: {calculate_metrics}\n"
|
|
3274
3291
|
f"Scoring: {scoring}\n"
|
|
@@ -3276,6 +3293,15 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3276
3293
|
f"Remove target outliers: {remove_outliers_calc_metrics}\n"
|
|
3277
3294
|
f"Exclude columns: {self.exclude_columns}\n"
|
|
3278
3295
|
f"Search id: {self.search_id}\n"
|
|
3296
|
+
f"Custom loss: {self.loss}\n"
|
|
3297
|
+
f"Logs enabled: {self.logs_enabled}\n"
|
|
3298
|
+
f"Raise validation error: {self.raise_validation_error}\n"
|
|
3299
|
+
f"Baseline score column: {self.baseline_score_column}\n"
|
|
3300
|
+
f"Client ip: {self.client_ip}\n"
|
|
3301
|
+
f"Client visitorId: {self.client_visitorid}\n"
|
|
3302
|
+
f"Add date if missing: {self.add_date_if_missing}\n"
|
|
3303
|
+
f"Disable force downsampling: {self.disable_force_downsampling}\n"
|
|
3304
|
+
f"Id columns: {self.id_columns}\n"
|
|
3279
3305
|
)
|
|
3280
3306
|
|
|
3281
3307
|
def sample(df):
|
|
@@ -3662,7 +3688,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3662
3688
|
is_client_feature = feature_meta.name in x_columns
|
|
3663
3689
|
|
|
3664
3690
|
if feature_meta.shap_value == 0.0:
|
|
3665
|
-
if self.select_features:
|
|
3691
|
+
if self.fit_select_features:
|
|
3666
3692
|
self.dropped_client_feature_names_.append(feature_meta.name)
|
|
3667
3693
|
continue
|
|
3668
3694
|
|
|
@@ -3671,7 +3697,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3671
3697
|
feature_meta.name in self.fit_generated_features
|
|
3672
3698
|
or feature_meta.name == COUNTRY
|
|
3673
3699
|
# In select_features mode we select also from etalon features and need to show them
|
|
3674
|
-
or (not self.select_features and is_client_feature)
|
|
3700
|
+
or (not self.fit_select_features and is_client_feature)
|
|
3675
3701
|
):
|
|
3676
3702
|
continue
|
|
3677
3703
|
|
|
@@ -3959,7 +3985,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3959
3985
|
display_html_dataframe(self.metrics, self.metrics, msg)
|
|
3960
3986
|
|
|
3961
3987
|
def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
|
|
3962
|
-
search_key_names = search_keys.keys()
|
|
3988
|
+
search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
|
|
3963
3989
|
if self.fit_columns_renaming:
|
|
3964
3990
|
search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
|
|
3965
3991
|
msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
|
|
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
|
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
|
|
219
|
+
online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
|
|
219
220
|
|
|
220
221
|
# Validation table
|
|
221
222
|
validation_column_name_header=Column name
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.39a3769.dev2
|
|
3
|
+
Version: 1.2.41a3758.dev1
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -382,6 +382,7 @@ enricher = FeaturesEnricher(
|
|
|
382
382
|
date_format = "%Y-%d-%m"
|
|
383
383
|
)
|
|
384
384
|
```
|
|
385
|
+
|
|
385
386
|
### 4. 🔍 Start your first feature search!
|
|
386
387
|
The main abstraction you interact is `FeaturesEnricher`, a Scikit-learn compatible estimator. You can easily add it into your existing ML pipelines.
|
|
387
388
|
Create instance of the `FeaturesEnricher` class and call:
|
|
@@ -412,7 +413,7 @@ enricher = FeaturesEnricher(
|
|
|
412
413
|
enricher.fit(X, y)
|
|
413
414
|
```
|
|
414
415
|
|
|
415
|
-
That's all
|
|
416
|
+
That's all! We've fit `FeaturesEnricher`.
|
|
416
417
|
### 5. 📈 Evaluate feature importances (SHAP values) from the search result
|
|
417
418
|
|
|
418
419
|
`FeaturesEnricher` class has two properties for feature importances, which will be filled after fit - `feature_names_` and `feature_importances_`:
|
|
@@ -464,7 +465,7 @@ enricher = FeaturesEnricher(
|
|
|
464
465
|
)
|
|
465
466
|
```
|
|
466
467
|
|
|
467
|
-
## 💻 How it
|
|
468
|
+
## 💻 How does it work?
|
|
468
469
|
|
|
469
470
|
### 🧹 Search dataset validation
|
|
470
471
|
We validate and clean search initialization dataset under the hood:
|
|
@@ -506,6 +507,17 @@ enricher = FeaturesEnricher(
|
|
|
506
507
|
cv=CVType.time_series
|
|
507
508
|
)
|
|
508
509
|
```
|
|
510
|
+
|
|
511
|
+
If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
|
|
512
|
+
```python
|
|
513
|
+
enricher = FeaturesEnricher(
|
|
514
|
+
search_keys={
|
|
515
|
+
"sales_date": SearchKey.DATE,
|
|
516
|
+
},
|
|
517
|
+
id_columns=["store_id", "product_id"],
|
|
518
|
+
cv=CVType.time_series
|
|
519
|
+
)
|
|
520
|
+
```
|
|
509
521
|
⚠️ **Pre-process search dataset** in case of time series prediction:
|
|
510
522
|
sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
|
|
511
523
|
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=KQ5_UqUf1j9QhJsdY2vLVTEcHPCYbzp5HHMntbtpDpE,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=d9VlOs9hTf6eL8TX_9bO400HQj3y_jVGthABvQJqONs,33350
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=c-NKv3UfMGqcyHb4KZjuCzLj6hW19_1ysi0IWDXYstI,198633
|
|
7
7
|
upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
|
|
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
30
30
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
33
|
+
upgini/resource_bundle/strings.properties,sha256=uQWmbcd9TJh-xE0QpmHpHYKw-20utvXeHwFA-U_iTLw,27302
|
|
34
34
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
35
35
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
36
36
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
59
59
|
upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
|
|
60
60
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
61
61
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.39a3769.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
-
upgini-1.2.39a3769.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
-
upgini-1.2.39a3769.dev2.dist-info/RECORD,,
|
|
62
|
+
upgini-1.2.41a3758.dev1.dist-info/METADATA,sha256=gfveQriK3BlEZTWtxNrMlApMona-ghB5CzCN0HRVGMs,49064
|
|
63
|
+
upgini-1.2.41a3758.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
64
|
+
upgini-1.2.41a3758.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
65
|
+
upgini-1.2.41a3758.dev1.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|