upgini 1.2.39a3769.dev2__py3-none-any.whl → 1.2.41a3758.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.39a3769.dev2"
1
+ __version__ = "1.2.41a3758.dev1"
@@ -165,10 +165,6 @@ class FeaturesEnricher(TransformerMixin):
165
165
 
166
166
  shared_datasets: list of str, optional (default=None)
167
167
  List of private shared dataset ids for custom search
168
-
169
- select_features: bool, optional (default=False)
170
- If True, return only selected features both from input and data sources.
171
- Otherwise, return all features from input and only selected features from data sources.
172
168
  """
173
169
 
174
170
  TARGET_NAME = "target"
@@ -235,7 +231,6 @@ class FeaturesEnricher(TransformerMixin):
235
231
  client_visitorid: Optional[str] = None,
236
232
  custom_bundle_config: Optional[str] = None,
237
233
  add_date_if_missing: bool = True,
238
- select_features: bool = False,
239
234
  disable_force_downsampling: bool = False,
240
235
  id_columns: Optional[List[str]] = None,
241
236
  **kwargs,
@@ -297,7 +292,6 @@ class FeaturesEnricher(TransformerMixin):
297
292
  self.dropped_client_feature_names_ = []
298
293
  self.feature_importances_ = []
299
294
  self.search_id = search_id
300
- self.select_features = select_features
301
295
  self.disable_force_downsampling = disable_force_downsampling
302
296
 
303
297
  if search_id:
@@ -405,6 +399,7 @@ class FeaturesEnricher(TransformerMixin):
405
399
  remove_outliers_calc_metrics: Optional[bool] = None,
406
400
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
407
401
  search_id_callback: Optional[Callable[[str], Any]] = None,
402
+ select_features: bool = False,
408
403
  **kwargs,
409
404
  ):
410
405
  """Fit to data.
@@ -440,6 +435,10 @@ class FeaturesEnricher(TransformerMixin):
440
435
 
441
436
  remove_outliers_calc_metrics, optional (default=True)
442
437
  If True then rows with target outliers will be dropped on metrics calculation
438
+
439
+ select_features: bool, optional (default=False)
440
+ If True, return only selected features both from input and data sources.
441
+ Otherwise, return all features from input and only selected features from data sources.
443
442
  """
444
443
  trace_id = str(uuid.uuid4())
445
444
  start_time = time.time()
@@ -474,6 +473,7 @@ class FeaturesEnricher(TransformerMixin):
474
473
  self.y = y
475
474
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
476
475
  self.dump_input(trace_id, X, y, self.eval_set)
476
+ self.__set_select_features(select_features)
477
477
  self.__inner_fit(
478
478
  trace_id,
479
479
  X,
@@ -523,6 +523,10 @@ class FeaturesEnricher(TransformerMixin):
523
523
  finally:
524
524
  self.logger.info(f"Fit elapsed time: {time.time() - start_time}")
525
525
 
526
+ def __set_select_features(self, select_features: bool):
527
+ self.fit_select_features = select_features
528
+ self.runtime_parameters.properties["select_features"] = select_features
529
+
526
530
  def fit_transform(
527
531
  self,
528
532
  X: Union[pd.DataFrame, pd.Series, np.ndarray],
@@ -538,6 +542,7 @@ class FeaturesEnricher(TransformerMixin):
538
542
  estimator: Optional[Any] = None,
539
543
  remove_outliers_calc_metrics: Optional[bool] = None,
540
544
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
545
+ select_features: bool = False,
541
546
  **kwargs,
542
547
  ) -> pd.DataFrame:
543
548
  """Fit to data, then transform it.
@@ -578,6 +583,10 @@ class FeaturesEnricher(TransformerMixin):
578
583
  remove_outliers_calc_metrics, optional (default=True)
579
584
  If True then rows with target outliers will be dropped on metrics calculation
580
585
 
586
+ select_features: bool, optional (default=False)
587
+ If True, return only selected features both from input and data sources.
588
+ Otherwise, return all features from input and only selected features from data sources.
589
+
581
590
  Returns
582
591
  -------
583
592
  X_new: pandas.DataFrame of shape (n_samples, n_features_new)
@@ -612,6 +621,7 @@ class FeaturesEnricher(TransformerMixin):
612
621
  self.X = X
613
622
  self.y = y
614
623
  self.eval_set = self._check_eval_set(eval_set, X, self.bundle)
624
+ self.__set_select_features(select_features)
615
625
  self.dump_input(trace_id, X, y, self.eval_set)
616
626
 
617
627
  if _num_samples(drop_duplicates(X)) > Dataset.MAX_ROWS:
@@ -1231,8 +1241,11 @@ class FeaturesEnricher(TransformerMixin):
1231
1241
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1232
1242
 
1233
1243
  def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
1244
+ renaming = self.fit_columns_renaming or {}
1234
1245
  new_shaps = {
1235
- feature: _round_shap_value(shap) for feature, shap in new_shaps.items() if feature in self.feature_names_
1246
+ renaming.get(feature, feature): _round_shap_value(shap)
1247
+ for feature, shap in new_shaps.items()
1248
+ if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1236
1249
  }
1237
1250
  self.__prepare_feature_importances(trace_id, x_columns, new_shaps, silent=True)
1238
1251
 
@@ -1461,7 +1474,7 @@ class FeaturesEnricher(TransformerMixin):
1461
1474
  c
1462
1475
  for c in X_sampled.columns.to_list()
1463
1476
  if (
1464
- not self.select_features
1477
+ not self.fit_select_features
1465
1478
  or c in self.feature_names_
1466
1479
  or (self.fit_columns_renaming is not None and self.fit_columns_renaming.get(c) in self.feature_names_)
1467
1480
  )
@@ -2008,7 +2021,7 @@ class FeaturesEnricher(TransformerMixin):
2008
2021
  trace_id = trace_id or uuid.uuid4()
2009
2022
  return search_task.get_progress(trace_id)
2010
2023
 
2011
- def get_transactional_transform_api(self):
2024
+ def get_transactional_transform_api(self, only_online_sources=False):
2012
2025
  if self.api_key is None:
2013
2026
  raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
2014
2027
  if self._search_task is None:
@@ -2066,7 +2079,7 @@ class FeaturesEnricher(TransformerMixin):
2066
2079
  api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
2067
2080
  -H 'Authorization: {self.api_key}' \\
2068
2081
  -H 'Content-Type: application/json' \\
2069
- -d '{{"search_keys": {keys}{features_section}}}'"""
2082
+ -d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
2070
2083
  return api_example
2071
2084
 
2072
2085
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -2110,13 +2123,15 @@ class FeaturesEnricher(TransformerMixin):
2110
2123
  return None, {c: c for c in X.columns}, []
2111
2124
 
2112
2125
  features_meta = self._search_task.get_all_features_metadata_v2()
2113
- online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
2126
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
2114
2127
  if len(online_api_features) > 0:
2115
2128
  self.logger.warning(
2116
2129
  f"There are important features for transform, that generated by online API: {online_api_features}"
2117
2130
  )
2118
- # TODO
2119
- raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2131
+ msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2132
+ self.logger.warning(msg)
2133
+ print(msg)
2134
+ print(self.get_transactional_transform_api(only_online_sources=True))
2120
2135
 
2121
2136
  if not metrics_calculation:
2122
2137
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -2702,6 +2717,7 @@ class FeaturesEnricher(TransformerMixin):
2702
2717
  self.fit_search_keys,
2703
2718
  self.fit_columns_renaming,
2704
2719
  list(unnest_search_keys.keys()),
2720
+ self.bundle,
2705
2721
  self.logger,
2706
2722
  )
2707
2723
  df = converter.convert(df)
@@ -3269,6 +3285,7 @@ class FeaturesEnricher(TransformerMixin):
3269
3285
  f"Generate features: {self.generate_features}\n"
3270
3286
  f"Round embeddings: {self.round_embeddings}\n"
3271
3287
  f"Detect missing search keys: {self.detect_missing_search_keys}\n"
3288
+ f"Exclude columns: {self.exclude_columns}\n"
3272
3289
  f"Exclude features sources: {exclude_features_sources}\n"
3273
3290
  f"Calculate metrics: {calculate_metrics}\n"
3274
3291
  f"Scoring: {scoring}\n"
@@ -3276,6 +3293,15 @@ class FeaturesEnricher(TransformerMixin):
3276
3293
  f"Remove target outliers: {remove_outliers_calc_metrics}\n"
3277
3294
  f"Exclude columns: {self.exclude_columns}\n"
3278
3295
  f"Search id: {self.search_id}\n"
3296
+ f"Custom loss: {self.loss}\n"
3297
+ f"Logs enabled: {self.logs_enabled}\n"
3298
+ f"Raise validation error: {self.raise_validation_error}\n"
3299
+ f"Baseline score column: {self.baseline_score_column}\n"
3300
+ f"Client ip: {self.client_ip}\n"
3301
+ f"Client visitorId: {self.client_visitorid}\n"
3302
+ f"Add date if missing: {self.add_date_if_missing}\n"
3303
+ f"Disable force downsampling: {self.disable_force_downsampling}\n"
3304
+ f"Id columns: {self.id_columns}\n"
3279
3305
  )
3280
3306
 
3281
3307
  def sample(df):
@@ -3662,7 +3688,7 @@ class FeaturesEnricher(TransformerMixin):
3662
3688
  is_client_feature = feature_meta.name in x_columns
3663
3689
 
3664
3690
  if feature_meta.shap_value == 0.0:
3665
- if self.select_features:
3691
+ if self.fit_select_features:
3666
3692
  self.dropped_client_feature_names_.append(feature_meta.name)
3667
3693
  continue
3668
3694
 
@@ -3671,7 +3697,7 @@ class FeaturesEnricher(TransformerMixin):
3671
3697
  feature_meta.name in self.fit_generated_features
3672
3698
  or feature_meta.name == COUNTRY
3673
3699
  # In select_features mode we select also from etalon features and need to show them
3674
- or (not self.select_features and is_client_feature)
3700
+ or (not self.fit_select_features and is_client_feature)
3675
3701
  ):
3676
3702
  continue
3677
3703
 
@@ -3959,7 +3985,7 @@ class FeaturesEnricher(TransformerMixin):
3959
3985
  display_html_dataframe(self.metrics, self.metrics, msg)
3960
3986
 
3961
3987
  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
3962
- search_key_names = search_keys.keys()
3988
+ search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
3963
3989
  if self.fit_columns_renaming:
3964
3990
  search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
3965
3991
  msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
  forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
219
+ online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
219
220
 
220
221
  # Validation table
221
222
  validation_column_name_header=Column name
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.39a3769.dev2
3
+ Version: 1.2.41a3758.dev1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -382,6 +382,7 @@ enricher = FeaturesEnricher(
382
382
  date_format = "%Y-%d-%m"
383
383
  )
384
384
  ```
385
+
385
386
  ### 4. 🔍 Start your first feature search!
386
387
  The main abstraction you interact with is `FeaturesEnricher`, a Scikit-learn compatible estimator. You can easily add it into your existing ML pipelines.
387
388
  Create instance of the `FeaturesEnricher` class and call:
@@ -412,7 +413,7 @@ enricher = FeaturesEnricher(
412
413
  enricher.fit(X, y)
413
414
  ```
414
415
 
415
- That's all). We've fitted `FeaturesEnricher`.
416
+ That's all! We've fit `FeaturesEnricher`.
416
417
  ### 5. 📈 Evaluate feature importances (SHAP values) from the search result
417
418
 
418
419
  `FeaturesEnricher` class has two properties for feature importances, which will be filled after fit - `feature_names_` and `feature_importances_`:
@@ -464,7 +465,7 @@ enricher = FeaturesEnricher(
464
465
  )
465
466
  ```
466
467
 
467
- ## 💻 How it works?
468
+ ## 💻 How does it work?
468
469
 
469
470
  ### 🧹 Search dataset validation
470
471
  We validate and clean the search initialization dataset under the hood:
@@ -506,6 +507,17 @@ enricher = FeaturesEnricher(
506
507
  cv=CVType.time_series
507
508
  )
508
509
  ```
510
+
511
+ If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
512
+ ```python
513
+ enricher = FeaturesEnricher(
514
+ search_keys={
515
+ "sales_date": SearchKey.DATE,
516
+ },
517
+ id_columns=["store_id", "product_id"],
518
+ cv=CVType.time_series
519
+ )
520
+ ```
509
521
  ⚠️ **Pre-process search dataset** in case of time series prediction:
510
522
  sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
511
523
 
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=2ilnzZVy_WdaVJ8AG6XQ1dEDOf4Mo3p6WiWCjIzOxF8,33
1
+ upgini/__about__.py,sha256=KQ5_UqUf1j9QhJsdY2vLVTEcHPCYbzp5HHMntbtpDpE,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=d9VlOs9hTf6eL8TX_9bO400HQj3y_jVGthABvQJqONs,33350
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=HY7FBC-ioH5hNg2NVMLMV_YAqu4rThgrJoK0JT8cdhU,196975
6
+ upgini/features_enricher.py,sha256=c-NKv3UfMGqcyHb4KZjuCzLj6hW19_1ysi0IWDXYstI,198633
7
7
  upgini/http.py,sha256=plZGTGoi1h2edd8Cnjt4eYB8t4NbBGnZz7DtPTByiNc,42885
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=-ibqiNjD7dTagqg53FoEJNEqvAYbwgfyn9PGTRQ_YKU,12054
@@ -30,7 +30,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
30
30
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
31
31
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
32
32
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
33
- upgini/resource_bundle/strings.properties,sha256=TiYWmFnuhOq0R3aVg2nbA3F5AWLgjrgh68Yj6MhG-x8,27088
33
+ upgini/resource_bundle/strings.properties,sha256=uQWmbcd9TJh-xE0QpmHpHYKw-20utvXeHwFA-U_iTLw,27302
34
34
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
35
35
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
36
36
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -59,7 +59,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
59
59
  upgini/utils/target_utils.py,sha256=RlpKGss9kMibVSlA8iZuO_qxmyeplqzn7X8g6hiGGGs,14341
60
60
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
61
61
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
62
- upgini-1.2.39a3769.dev2.dist-info/METADATA,sha256=Vh1Rr3q2Osl1_Ee7uetOp8LROY2nVUb_kvZwyxEDcHc,48604
63
- upgini-1.2.39a3769.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
- upgini-1.2.39a3769.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
- upgini-1.2.39a3769.dev2.dist-info/RECORD,,
62
+ upgini-1.2.41a3758.dev1.dist-info/METADATA,sha256=gfveQriK3BlEZTWtxNrMlApMona-ghB5CzCN0HRVGMs,49064
63
+ upgini-1.2.41a3758.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
64
+ upgini-1.2.41a3758.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
65
+ upgini-1.2.41a3758.dev1.dist-info/RECORD,,