upgini 1.2.38a3769.dev11__tar.gz → 1.2.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (67) hide show
  1. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/PKG-INFO +15 -3
  2. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/README.md +14 -2
  3. upgini-1.2.39/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/dataset.py +3 -0
  5. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/features_enricher.py +28 -12
  6. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/resource_bundle/strings.properties +1 -0
  7. upgini-1.2.38a3769.dev11/src/upgini/__about__.py +0 -1
  8. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/.gitignore +0 -0
  9. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/LICENSE +0 -0
  10. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/pyproject.toml +0 -0
  11. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/errors.py +0 -0
  27. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/http.py +0 -0
  28. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/lazy_import.py +0 -0
  29. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/mdc/__init__.py +0 -0
  30. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/mdc/context.py +0 -0
  31. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/metadata.py +0 -0
  32. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/metrics.py +0 -0
  33. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  38. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/sampler/__init__.py +0 -0
  39. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/sampler/base.py +0 -0
  40. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/sampler/random_under_sampler.py +0 -0
  41. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/sampler/utils.py +0 -0
  42. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/search_task.py +0 -0
  43. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/spinner.py +0 -0
  44. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  45. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/feature_info.py +0 -0
  57. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/sklearn_ext.py +0 -0
  64. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/target_utils.py +0 -0
  65. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/track_info.py +0 -0
  66. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/utils/warning_counter.py +0 -0
  67. {upgini-1.2.38a3769.dev11 → upgini-1.2.39}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.38a3769.dev11
3
+ Version: 1.2.39
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -382,6 +382,7 @@ enricher = FeaturesEnricher(
382
382
  date_format = "%Y-%d-%m"
383
383
  )
384
384
  ```
385
+
385
386
  ### 4. 🔍 Start your first feature search!
386
387
  The main abstraction you interact with is `FeaturesEnricher`, a Scikit-learn compatible estimator. You can easily add it into your existing ML pipelines.
387
388
  Create an instance of the `FeaturesEnricher` class and call:
@@ -412,7 +413,7 @@ enricher = FeaturesEnricher(
412
413
  enricher.fit(X, y)
413
414
  ```
414
415
 
415
- That's all). We've fitted `FeaturesEnricher`.
416
+ That's all! We've fit `FeaturesEnricher`.
416
417
  ### 5. 📈 Evaluate feature importances (SHAP values) from the search result
417
418
 
418
419
  `FeaturesEnricher` class has two properties for feature importances, which will be filled after fit - `feature_names_` and `feature_importances_`:
@@ -464,7 +465,7 @@ enricher = FeaturesEnricher(
464
465
  )
465
466
  ```
466
467
 
467
- ## 💻 How it works?
468
+ ## 💻 How does it work?
468
469
 
469
470
  ### 🧹 Search dataset validation
470
471
  We validate and clean the search initialization dataset under the hood:
@@ -506,6 +507,17 @@ enricher = FeaturesEnricher(
506
507
  cv=CVType.time_series
507
508
  )
508
509
  ```
510
+
511
+ If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
512
+ ```python
513
+ enricher = FeaturesEnricher(
514
+ search_keys={
515
+ "sales_date": SearchKey.DATE,
516
+ },
517
+ id_columns=["store_id", "product_id"],
518
+ cv=CVType.time_series
519
+ )
520
+ ```
509
521
  ⚠️ **Pre-process search dataset** in case of time series prediction:
510
522
  sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
511
523
 
@@ -340,6 +340,7 @@ enricher = FeaturesEnricher(
340
340
  date_format = "%Y-%d-%m"
341
341
  )
342
342
  ```
343
+
343
344
  ### 4. 🔍 Start your first feature search!
344
345
  The main abstraction you interact with is `FeaturesEnricher`, a Scikit-learn compatible estimator. You can easily add it into your existing ML pipelines.
345
346
  Create an instance of the `FeaturesEnricher` class and call:
@@ -370,7 +371,7 @@ enricher = FeaturesEnricher(
370
371
  enricher.fit(X, y)
371
372
  ```
372
373
 
373
- That's all). We've fitted `FeaturesEnricher`.
374
+ That's all! We've fit `FeaturesEnricher`.
374
375
  ### 5. 📈 Evaluate feature importances (SHAP values) from the search result
375
376
 
376
377
  `FeaturesEnricher` class has two properties for feature importances, which will be filled after fit - `feature_names_` and `feature_importances_`:
@@ -422,7 +423,7 @@ enricher = FeaturesEnricher(
422
423
  )
423
424
  ```
424
425
 
425
- ## 💻 How it works?
426
+ ## 💻 How does it work?
426
427
 
427
428
  ### 🧹 Search dataset validation
428
429
  We validate and clean the search initialization dataset under the hood:
@@ -464,6 +465,17 @@ enricher = FeaturesEnricher(
464
465
  cv=CVType.time_series
465
466
  )
466
467
  ```
468
+
469
+ If you're working with multivariate time series, you should specify id columns of individual univariate series in `FeaturesEnricher`. For example, if you have a dataset predicting sales for different stores and products, you should specify store and product id columns as follows:
470
+ ```python
471
+ enricher = FeaturesEnricher(
472
+ search_keys={
473
+ "sales_date": SearchKey.DATE,
474
+ },
475
+ id_columns=["store_id", "product_id"],
476
+ cv=CVType.time_series
477
+ )
478
+ ```
467
479
  ⚠️ **Pre-process search dataset** in case of time series prediction:
468
480
  sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
469
481
 
@@ -0,0 +1 @@
1
+ __version__ = "1.2.39"
@@ -77,6 +77,7 @@ class Dataset: # (pd.DataFrame):
77
77
  unnest_search_keys: Optional[Dict[str, str]] = None,
78
78
  model_task_type: Optional[ModelTaskType] = None,
79
79
  cv_type: Optional[CVType] = None,
80
+ date_column: Optional[str] = None,
80
81
  id_columns: Optional[List[str]] = None,
81
82
  random_state: Optional[int] = None,
82
83
  rest_client: Optional[_RestClient] = None,
@@ -122,6 +123,7 @@ class Dataset: # (pd.DataFrame):
122
123
  self.columns_renaming: Dict[str, str] = {}
123
124
  self.imbalanced: bool = False
124
125
  self.id_columns = id_columns
126
+ self.date_column = date_column
125
127
  if logger is not None:
126
128
  self.logger = logger
127
129
  else:
@@ -232,6 +234,7 @@ class Dataset: # (pd.DataFrame):
232
234
  target_column=target_column,
233
235
  task_type=self.task_type,
234
236
  cv_type=self.cv_type,
237
+ date_column=self.date_column,
235
238
  id_columns=self.id_columns,
236
239
  random_state=self.random_state,
237
240
  sample_size=self.FORCE_SAMPLE_SIZE,
@@ -932,9 +932,7 @@ class FeaturesEnricher(TransformerMixin):
932
932
  cat_features, search_keys_for_metrics = self._get_client_cat_features(
933
933
  estimator, validated_X, self.search_keys
934
934
  )
935
- search_keys_for_metrics.extend(
936
- [c for c in self.id_columns or [] if c not in search_keys_for_metrics]
937
- )
935
+ search_keys_for_metrics.extend([c for c in self.id_columns or [] if c not in search_keys_for_metrics])
938
936
  self.logger.info(f"Search keys for metrics: {search_keys_for_metrics}")
939
937
 
940
938
  prepared_data = self._prepare_data_for_metrics(
@@ -2010,7 +2008,7 @@ class FeaturesEnricher(TransformerMixin):
2010
2008
  trace_id = trace_id or uuid.uuid4()
2011
2009
  return search_task.get_progress(trace_id)
2012
2010
 
2013
- def get_transactional_transform_api(self):
2011
+ def get_transactional_transform_api(self, only_online_sources=False):
2014
2012
  if self.api_key is None:
2015
2013
  raise ValidationError(self.bundle.get("transactional_transform_unregistered"))
2016
2014
  if self._search_task is None:
@@ -2068,7 +2066,7 @@ class FeaturesEnricher(TransformerMixin):
2068
2066
  api_example = f"""curl 'https://search.upgini.com/online/api/http_inference_trigger?search_id={search_id}' \\
2069
2067
  -H 'Authorization: {self.api_key}' \\
2070
2068
  -H 'Content-Type: application/json' \\
2071
- -d '{{"search_keys": {keys}{features_section}}}'"""
2069
+ -d '{{"search_keys": {keys}{features_section}, "only_online_sources": {str(only_online_sources).lower()}}}'"""
2072
2070
  return api_example
2073
2071
 
2074
2072
  def _get_copy_of_runtime_parameters(self) -> RuntimeParameters:
@@ -2112,13 +2110,15 @@ class FeaturesEnricher(TransformerMixin):
2112
2110
  return None, {c: c for c in X.columns}, []
2113
2111
 
2114
2112
  features_meta = self._search_task.get_all_features_metadata_v2()
2115
- online_api_features = [fm.name for fm in features_meta if fm.from_online_api]
2113
+ online_api_features = [fm.name for fm in features_meta if fm.from_online_api and fm.shap_value > 0]
2116
2114
  if len(online_api_features) > 0:
2117
2115
  self.logger.warning(
2118
2116
  f"There are important features for transform, that generated by online API: {online_api_features}"
2119
2117
  )
2120
- # TODO
2121
- raise Exception("There are features selected that are paid. Contact support (sales@upgini.com)")
2118
+ msg = self.bundle.get("online_api_features_transform").format(online_api_features)
2119
+ self.logger.warning(msg)
2120
+ print(msg)
2121
+ print(self.get_transactional_transform_api(only_online_sources=True))
2122
2122
 
2123
2123
  if not metrics_calculation:
2124
2124
  transform_usage = self.rest_client.get_current_transform_usage(trace_id)
@@ -2150,6 +2150,9 @@ class FeaturesEnricher(TransformerMixin):
2150
2150
  validated_X = validated_X.drop(columns=columns_to_drop)
2151
2151
 
2152
2152
  search_keys = self.search_keys.copy()
2153
+ if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
2154
+ self.search_keys.update({col: SearchKey.CUSTOM_KEY for col in self.id_columns})
2155
+
2153
2156
  search_keys = self.__prepare_search_keys(
2154
2157
  validated_X, search_keys, is_demo_dataset, is_transform=True, silent_mode=silent_mode
2155
2158
  )
@@ -2307,7 +2310,8 @@ class FeaturesEnricher(TransformerMixin):
2307
2310
  meaning_types=meaning_types,
2308
2311
  search_keys=combined_search_keys,
2309
2312
  unnest_search_keys=unnest_search_keys,
2310
- id_columns=self.__get_renamed_id_columns(),
2313
+ id_columns=self.__get_renamed_id_columns(columns_renaming),
2314
+ date_column=self._get_date_column(search_keys),
2311
2315
  date_format=self.date_format,
2312
2316
  rest_client=self.rest_client,
2313
2317
  logger=self.logger,
@@ -2794,6 +2798,7 @@ class FeaturesEnricher(TransformerMixin):
2794
2798
  model_task_type=self.model_task_type,
2795
2799
  cv_type=self.cv,
2796
2800
  id_columns=self.__get_renamed_id_columns(),
2801
+ date_column=self._get_date_column(self.fit_search_keys),
2797
2802
  date_format=self.date_format,
2798
2803
  random_state=self.random_state,
2799
2804
  rest_client=self.rest_client,
@@ -2953,8 +2958,9 @@ class FeaturesEnricher(TransformerMixin):
2953
2958
  def __should_add_date_column(self):
2954
2959
  return self.add_date_if_missing or (self.cv is not None and self.cv.is_time_series())
2955
2960
 
2956
- def __get_renamed_id_columns(self):
2957
- reverse_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2961
+ def __get_renamed_id_columns(self, renaming: Optional[Dict[str, str]] = None):
2962
+ renaming = renaming or self.fit_columns_renaming
2963
+ reverse_renaming = {v: k for k, v in renaming.items()}
2958
2964
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
2959
2965
 
2960
2966
  def __adjust_cv(self, df: pd.DataFrame):
@@ -3265,6 +3271,7 @@ class FeaturesEnricher(TransformerMixin):
3265
3271
  f"Generate features: {self.generate_features}\n"
3266
3272
  f"Round embeddings: {self.round_embeddings}\n"
3267
3273
  f"Detect missing search keys: {self.detect_missing_search_keys}\n"
3274
+ f"Exclude columns: {self.exclude_columns}\n"
3268
3275
  f"Exclude features sources: {exclude_features_sources}\n"
3269
3276
  f"Calculate metrics: {calculate_metrics}\n"
3270
3277
  f"Scoring: {scoring}\n"
@@ -3272,6 +3279,15 @@ class FeaturesEnricher(TransformerMixin):
3272
3279
  f"Remove target outliers: {remove_outliers_calc_metrics}\n"
3273
3280
  f"Exclude columns: {self.exclude_columns}\n"
3274
3281
  f"Search id: {self.search_id}\n"
3282
+ f"Custom loss: {self.loss}\n"
3283
+ f"Logs enabled: {self.logs_enabled}\n"
3284
+ f"Raise validation error: {self.raise_validation_error}\n"
3285
+ f"Baseline score column: {self.baseline_score_column}\n"
3286
+ f"Client ip: {self.client_ip}\n"
3287
+ f"Client visitorId: {self.client_visitorid}\n"
3288
+ f"Add date if missing: {self.add_date_if_missing}\n"
3289
+ f"Select features: {self.select_features}\n"
3290
+ f"Disable force downsampling: {self.disable_force_downsampling}\n"
3275
3291
  )
3276
3292
 
3277
3293
  def sample(df):
@@ -3955,7 +3971,7 @@ class FeaturesEnricher(TransformerMixin):
3955
3971
  display_html_dataframe(self.metrics, self.metrics, msg)
3956
3972
 
3957
3973
  def __show_selected_features(self, search_keys: Dict[str, SearchKey]):
3958
- search_key_names = search_keys.keys()
3974
+ search_key_names = [col for col, tpe in search_keys.items() if tpe != SearchKey.CUSTOM_KEY]
3959
3975
  if self.fit_columns_renaming:
3960
3976
  search_key_names = [self.fit_columns_renaming.get(col, col) for col in search_key_names]
3961
3977
  msg = self.bundle.get("features_info_header").format(len(self.feature_names_), search_key_names)
@@ -216,6 +216,7 @@ imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of
216
216
  loss_selection_info=Using loss `{}` for feature selection
217
217
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
218
218
  forced_balance_undersample=For quick data retrieval, your dataset has been sampled. To use data search without data sampling please contact support (sales@upgini.com)
219
+ online_api_features_transform=Please note that some of the selected features {} are provided through a slow enrichment interface and are not available via transformation. However, they can be accessed via the API:
219
220
 
220
221
  # Validation table
221
222
  validation_column_name_header=Column name
@@ -1 +0,0 @@
1
- __version__ = "1.2.38a3769.dev11"
File without changes
File without changes