upgini 1.2.125__py3-none-any.whl → 1.2.128a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.125"
1
+ __version__ = "1.2.128a1"
@@ -123,17 +123,9 @@ class DataSourcePublisher:
123
123
  set(search_keys.values()) == {SearchKey.IP_RANGE_FROM, SearchKey.IP_RANGE_TO}
124
124
  or set(search_keys.values()) == {SearchKey.IPV6_RANGE_FROM, SearchKey.IPV6_RANGE_TO}
125
125
  or set(search_keys.values()) == {SearchKey.MSISDN_RANGE_FROM, SearchKey.MSISDN_RANGE_TO}
126
+ or snapshot_frequency_days is not None or join_date_abs_limit_days is not None
126
127
  ) and sort_column is None:
127
128
  raise ValidationError("Sort column is required for passed search keys")
128
- if (
129
- set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
130
- and snapshot_frequency_days is None
131
- and join_date_abs_limit_days is None
132
- ):
133
- raise ValidationError(
134
- "With MSISDN and DATE keys one of the snapshot_frequency_days or"
135
- " join_date_abs_limit_days parameters is required"
136
- )
137
129
  if (
138
130
  set(search_keys.values()) == {SearchKey.PHONE, SearchKey.DATE}
139
131
  or set(search_keys.values()) == {SearchKey.HEM, SearchKey.DATE}
@@ -1479,11 +1479,7 @@ class FeaturesEnricher(TransformerMixin):
1479
1479
 
1480
1480
  self.logger.info(f"PSI values by sparsity: {psi_values_sparse}")
1481
1481
 
1482
- unstable_by_sparsity = [
1483
- feature
1484
- for feature, psi in psi_values_sparse.items()
1485
- if psi > stability_threshold
1486
- ]
1482
+ unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
1487
1483
  if unstable_by_sparsity:
1488
1484
  self.logger.info(f"Unstable by sparsity features ({stability_threshold}): {sorted(unstable_by_sparsity)}")
1489
1485
 
@@ -1493,11 +1489,7 @@ class FeaturesEnricher(TransformerMixin):
1493
1489
 
1494
1490
  self.logger.info(f"PSI values by value: {psi_values}")
1495
1491
 
1496
- unstable_by_value = [
1497
- feature
1498
- for feature, psi in psi_values.items()
1499
- if psi > stability_threshold
1500
- ]
1492
+ unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
1501
1493
  if unstable_by_value:
1502
1494
  self.logger.info(f"Unstable by value features ({stability_threshold}): {sorted(unstable_by_value)}")
1503
1495
 
@@ -2561,10 +2553,15 @@ if response.status_code == 200:
2561
2553
  if transform_usage.has_limit:
2562
2554
  if len(X) > transform_usage.rest_rows:
2563
2555
  rest_rows = max(transform_usage.rest_rows, 0)
2564
- msg = self.bundle.get("transform_usage_warning").format(len(X), rest_rows)
2556
+ bundle_msg = (
2557
+ "transform_usage_warning_registered"
2558
+ if self.__is_registered
2559
+ else "transform_usage_warning_demo"
2560
+ )
2561
+ msg = self.bundle.get(bundle_msg).format(len(X), rest_rows)
2565
2562
  self.logger.warning(msg)
2566
2563
  print(msg)
2567
- show_request_quote_button()
2564
+ show_request_quote_button(is_registered=self.__is_registered)
2568
2565
  return None, {}, [], {}
2569
2566
  else:
2570
2567
  msg = self.bundle.get("transform_usage_info").format(
@@ -2749,8 +2746,13 @@ if response.status_code == 200:
2749
2746
 
2750
2747
  meaning_types = {}
2751
2748
  meaning_types.update(
2752
- {col: FileColumnMeaningType.FEATURE for col in features_for_transform if col not in date_features}
2749
+ {
2750
+ col: FileColumnMeaningType.FEATURE
2751
+ for col in features_for_transform
2752
+ if col not in date_features and col not in generated_features
2753
+ }
2753
2754
  )
2755
+ meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in generated_features})
2754
2756
  meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
2755
2757
  meaning_types.update({col: key.value for col, key in search_keys.items()})
2756
2758
 
@@ -3304,9 +3306,10 @@ if response.status_code == 200:
3304
3306
  **{
3305
3307
  str(c): FileColumnMeaningType.FEATURE
3306
3308
  for c in df.columns
3307
- if c not in non_feature_columns and c not in date_features
3309
+ if c not in non_feature_columns and c not in date_features and c not in self.fit_generated_features
3308
3310
  },
3309
3311
  }
3312
+ meaning_types.update({col: FileColumnMeaningType.GENERATED_FEATURE for col in self.fit_generated_features})
3310
3313
  meaning_types.update({col: FileColumnMeaningType.DATE_FEATURE for col in date_features})
3311
3314
  meaning_types[self.TARGET_NAME] = FileColumnMeaningType.TARGET
3312
3315
  meaning_types[ENTITY_SYSTEM_RECORD_ID] = FileColumnMeaningType.ENTITY_SYSTEM_RECORD_ID
@@ -3368,7 +3371,12 @@ if response.status_code == 200:
3368
3371
  self.passed_features = [
3369
3372
  column
3370
3373
  for column, meaning_type in meaning_types.items()
3371
- if meaning_type in [FileColumnMeaningType.FEATURE, FileColumnMeaningType.DATE_FEATURE]
3374
+ if meaning_type
3375
+ in [
3376
+ FileColumnMeaningType.FEATURE,
3377
+ FileColumnMeaningType.DATE_FEATURE,
3378
+ FileColumnMeaningType.GENERATED_FEATURE,
3379
+ ]
3372
3380
  ]
3373
3381
 
3374
3382
  self._search_task = dataset.search(
@@ -4433,7 +4441,9 @@ if response.status_code == 200:
4433
4441
  raise Exception(self.bundle.get("missing_features_meta"))
4434
4442
  features_meta = deepcopy(features_meta)
4435
4443
 
4436
- original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
4444
+ file_metadata_columns = self._search_task.get_file_metadata(trace_id).columns
4445
+ file_meta_by_orig_name = {c.originalName: c for c in file_metadata_columns}
4446
+ original_names_dict = {c.name: c.originalName for c in file_metadata_columns}
4437
4447
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
4438
4448
 
4439
4449
  # To be sure that names with hash suffixes
@@ -4453,7 +4463,11 @@ if response.status_code == 200:
4453
4463
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4454
4464
  feature_meta.name = original_name
4455
4465
 
4456
- is_client_feature = original_name in clients_features_df.columns
4466
+ file_meta = file_meta_by_orig_name.get(original_name)
4467
+ is_generated_feature = (
4468
+ file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
4469
+ )
4470
+ is_client_feature = original_name in clients_features_df.columns and not is_generated_feature
4457
4471
 
4458
4472
  if selected_features is not None and feature_meta.name not in selected_features:
4459
4473
  self.logger.info(f"Feature {feature_meta.name} is not selected before and skipped")
@@ -4476,9 +4490,13 @@ if response.status_code == 200:
4476
4490
 
4477
4491
  for feature_meta in selected_features_meta:
4478
4492
  original_name = original_names_dict.get(feature_meta.name, feature_meta.name)
4479
- is_client_feature = original_name in clients_features_df.columns
4493
+ file_meta = file_meta_by_orig_name.get(original_name)
4494
+ is_generated_feature = (
4495
+ file_meta is not None and file_meta.meaningType == FileColumnMeaningType.GENERATED_FEATURE
4496
+ )
4497
+ is_client_feature = original_name in clients_features_df.columns and not is_generated_feature
4480
4498
 
4481
- if not is_client_feature:
4499
+ if not is_client_feature and not is_generated_feature:
4482
4500
  self.external_source_feature_names.append(original_name)
4483
4501
 
4484
4502
  if self.psi_values is not None:
@@ -4509,9 +4527,10 @@ if response.status_code == 200:
4509
4527
 
4510
4528
  self.feature_names_.append(feature_meta.name)
4511
4529
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
4512
-
4513
4530
  df_for_sample = features_df if feature_meta.name in features_df.columns else clients_features_df
4514
- feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
4531
+ feature_info = FeatureInfo.from_metadata(
4532
+ feature_meta, df_for_sample, is_client_feature, is_generated_feature
4533
+ )
4515
4534
  features_info.append(feature_info.to_row(self.bundle))
4516
4535
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
4517
4536
  internal_features_info.append(feature_info.to_internal_row(self.bundle))
@@ -4522,7 +4541,7 @@ if response.status_code == 200:
4522
4541
  if len(features_info) > 0:
4523
4542
  self.features_info = pd.DataFrame(features_info)
4524
4543
  # If all psi values are 0 or null, drop psi column
4525
- if self.features_info[self.bundle.get("features_info_psi")].fillna(0.0).eq(0.0).all():
4544
+ if self.features_info[self.bundle.get("features_info_psi")].astype(np.float64).fillna(0.0).eq(0.0).all():
4526
4545
  self.features_info.drop(columns=[self.bundle.get("features_info_psi")], inplace=True)
4527
4546
  self._features_info_without_links = pd.DataFrame(features_info_without_links)
4528
4547
  self._internal_features_info = pd.DataFrame(internal_features_info)
upgini/metadata.py CHANGED
@@ -36,6 +36,7 @@ class FileColumnMeaningType(Enum):
36
36
  SCORE = "SCORE"
37
37
  TARGET = "TARGET"
38
38
  FEATURE = "FEATURE"
39
+ GENERATED_FEATURE = "GENERATED_FEATURE"
39
40
  DATE_FEATURE = "DATE_FEATURE"
40
41
  CUSTOM_KEY = "CUSTOM_KEY"
41
42
  COUNTRY = "COUNTRY"
@@ -12,7 +12,8 @@ polling_unregister_information=We'll send email notification once it's completed
12
12
  ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
13
13
  demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
14
14
  transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
15
- transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
15
+ transform_usage_warning_demo=You requested enrichment for {} rows, but the unregistered-user limit has only {} rows remaining.
16
+ transform_usage_warning_registered=You requested enrichment for {} rows, but the free-tier limit has only {} rows remaining.
16
17
 
17
18
  # Warnings
18
19
  support_link=https://upgini.com/support
@@ -339,17 +339,54 @@ def show_button_download_pdf(
339
339
  return display(HTML(html), display_id=display_id)
340
340
 
341
341
 
342
- def show_request_quote_button():
342
+ def show_request_quote_button(is_registered: bool):
343
343
  if not ipython_available():
344
- print("https://upgini.com/request-a-quote")
344
+ if is_registered:
345
+ print("https://upgini.com/request-a-quote")
346
+ else:
347
+ print("https://profile.upgini.com/login")
345
348
  else:
346
- import ipywidgets as widgets
347
- from IPython.display import Javascript, display
348
-
349
- button = widgets.Button(description="Request a quote", button_style="danger")
349
+ from IPython.display import HTML, display, Javascript
350
+ from ipywidgets import Layout, Button
351
+
352
+ if is_registered:
353
+ display(HTML("""
354
+ <style>
355
+ button.custom-button {
356
+ border: 1px solid black !important;
357
+ background: white !important;
358
+ color: black !important;
359
+ white-space: nowrap;
360
+ }
361
+ </style>
362
+ """))
363
+ description = "Request a quote"
364
+ tooltip = "Ask a quote"
365
+ url = "https://upgini.com/request-a-quote"
366
+ else:
367
+ display(HTML("""
368
+ <style>
369
+ button.custom-button {
370
+ border: 1px solid #d00 !important;
371
+ background: #fff !important;
372
+ color: #d00 !important;
373
+ white-space: nowrap;
374
+ }
375
+ </style>
376
+ """))
377
+ description = "Get an API KEY"
378
+ tooltip = "Register"
379
+ url = "https://profile.upgini.com/login"
380
+
381
+ button = Button(
382
+ description=description,
383
+ layout=Layout(width='auto'),
384
+ tooltip=tooltip
385
+ )
386
+ button.add_class("custom-button")
350
387
 
351
388
  def on_button_clicked(b):
352
- display(Javascript('window.open("https://upgini.com/request-a-quote");'))
389
+ display(Javascript('window.open("' + url + '");'))
353
390
 
354
391
  button.on_click(on_button_clicked)
355
392
 
@@ -31,7 +31,10 @@ class FeatureInfo:
31
31
 
32
32
  @staticmethod
33
33
  def from_metadata(
34
- feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame], is_client_feature: bool
34
+ feature_meta: FeaturesMetadataV2,
35
+ data: Optional[pd.DataFrame],
36
+ is_client_feature: bool,
37
+ is_generated_feature: bool,
35
38
  ) -> "FeatureInfo":
36
39
  return FeatureInfo(
37
40
  name=_get_name(feature_meta),
@@ -41,8 +44,8 @@ class FeatureInfo:
41
44
  value_preview=_get_feature_sample(feature_meta, data),
42
45
  provider=_get_provider(feature_meta, is_client_feature),
43
46
  internal_provider=_get_internal_provider(feature_meta, is_client_feature),
44
- source=_get_source(feature_meta, is_client_feature),
45
- internal_source=_get_internal_source(feature_meta, is_client_feature),
47
+ source=_get_source(feature_meta, is_client_feature, is_generated_feature),
48
+ internal_source=_get_internal_source(feature_meta, is_client_feature, is_generated_feature),
46
49
  update_frequency=feature_meta.update_frequency,
47
50
  commercial_schema=feature_meta.commercial_schema,
48
51
  doc_link=feature_meta.doc_link,
@@ -139,22 +142,30 @@ def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature:
139
142
  return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
140
143
 
141
144
 
142
- def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
145
+ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
146
+ if is_generated_feature:
147
+ return "AutoFE: features from Training dataset"
148
+
143
149
  sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
144
150
  source_links = _list_or_single(feature_meta.data_source_links, feature_meta.data_source_link)
145
151
  if sources:
146
152
  source = _make_links(sources, source_links)
147
153
  else:
148
- source = _get_internal_source(feature_meta, is_client_feature)
154
+ source = _get_internal_source(feature_meta, is_client_feature, is_generated_feature)
149
155
  return source
150
156
 
151
157
 
152
- def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
158
+ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool, is_generated_feature: bool) -> str:
159
+ if is_generated_feature:
160
+ return "AutoFE: features from Training dataset"
161
+
153
162
  sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
154
163
  if sources:
155
164
  return ", ".join(sources)
165
+ elif feature_meta.data_source:
166
+ return feature_meta.data_source
156
167
  else:
157
- return feature_meta.data_source or (
168
+ return (
158
169
  LLM_SOURCE
159
170
  if not feature_meta.name.endswith("_country")
160
171
  and not feature_meta.name.endswith("_postal_code")
@@ -54,12 +54,16 @@ def _get_execution_ide() -> str:
54
54
  @lru_cache
55
55
  def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optional[str] = None) -> dict:
56
56
  # default values
57
+ print("Start getting track metrics")
57
58
  track = {"ide": _get_execution_ide()}
59
+ print("Track ide: ", track["ide"])
58
60
  ident_res = "https://api64.ipify.org"
59
61
 
60
62
  try:
61
63
  track["hostname"] = socket.gethostname()
64
+ print("Track hostname: ", track["hostname"])
62
65
  track["whoami"] = getuser()
66
+ print("Track whoami: ", track["whoami"])
63
67
  except Exception as e:
64
68
  track["hostname"] = "localhost"
65
69
  track["whoami"] = "root"
@@ -83,7 +87,9 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
83
87
  """
84
88
  )
85
89
  )
90
+ print("After display JS with visitorId")
86
91
  track["visitorId"] = output.eval_js("getVisitorId()", timeout_sec=30)
92
+ print("Track visitorId: ", track["visitorId"])
87
93
  except Exception as e:
88
94
  track["err"] = str(e)
89
95
  if "visitorId" not in track:
@@ -106,7 +112,9 @@ def get_track_metrics(client_ip: Optional[str] = None, client_visitorid: Optiona
106
112
  """
107
113
  )
108
114
  )
115
+ print("After display JS with ip")
109
116
  track["ip"] = output.eval_js("getIP()", timeout_sec=10)
117
+ print("Track ip: ", track["ip"])
110
118
  except Exception as e:
111
119
  track["err"] = str(e)
112
120
  if "ip" not in track:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.125
3
+ Version: 1.2.128a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -51,7 +51,7 @@ Description-Content-Type: text/markdown
51
51
  <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
52
52
  <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
53
53
  <h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
54
- <p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
54
+ <p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community, and premium external data sources, </br>including open & commercial LLMs</b> </p>
55
55
  <p align="center">
56
56
  <br />
57
57
  <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -59,7 +59,7 @@ Description-Content-Type: text/markdown
59
59
  <a href="https://profile.upgini.com">Register / Sign In</a> |
60
60
  <!-- <a href="https://gitter.im/upgini/community?utm_source=share-link&utm_medium=link&utm_campaign=share-link">Gitter Community</a> | -->
61
61
  <a href="https://4mlg.short.gy/join-upgini-community">Slack Community</a> |
62
- <a href="https://forms.gle/pH99gb5hPxBEfNdR7"><strong>Propose new Data source</strong></a>
62
+ <a href="https://forms.gle/pH99gb5hPxBEfNdR7"><strong>Propose a new data source</strong></a>
63
63
  </p>
64
64
  <p align=center>
65
65
  <a href="/LICENSE"><img alt="BSD-3 license" src="https://img.shields.io/badge/license-BSD--3%20Clause-green"></a>
@@ -75,19 +75,19 @@ Description-Content-Type: text/markdown
75
75
  [![Gitter Сommunity](https://img.shields.io/badge/gitter-@upgini-teal.svg?logo=gitter)](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
76
76
  ## ❔ Overview
77
77
 
78
- **Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
78
+ **Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of ML features using large language models (LLMs), GNNs (graph neural networks), and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
79
79
 
80
- **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.
80
+ **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want to radically simplify feature search and enrichment to make external data a standard approach. Like hyperparameter tuning in machine learning today.
81
81
 
82
82
  **Mission:** Democratize access to data sources for data science community.
83
83
 
84
84
  ## 🚀 Awesome features
85
- ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
86
- ⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
87
- ⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
88
- ⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
89
- ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
90
- ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
85
+ ⭐️ Automatically find only relevant features that *improve your model’s accuracy*. Not just correlated with the target variable, which in 9 out of 10 cases yields zero accuracy improvement
86
+ ⭐️ Automated feature generation from the sources: feature generation with LLM‑based data augmentation, RNNs, and GNNs (graph neural networks); ensembling across multiple data sources
87
+ ⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/ZIP code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
88
+ ⭐️ Calculate accuracy metrics and uplift after enriching an existing ML model with external features
89
+ ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate the risks of unstable external data dependencies in the ML pipeline
90
+ ⭐️ Easy to use - a single request to enrich the training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
91
91
  <table>
92
92
  <tr>
93
93
  <td> date / datetime </td>
@@ -103,7 +103,7 @@ Description-Content-Type: text/markdown
103
103
  </tr>
104
104
  </table>
105
105
 
106
- ⭐️ Scikit-learn compatible interface for quick data integration with existing ML pipelines
106
+ ⭐️ Scikit-learn-compatible interface for quick data integration with existing ML pipelines
107
107
  ⭐️ Support for most common supervised ML tasks on tabular data:
108
108
  <table>
109
109
  <tr>
@@ -112,7 +112,7 @@ Description-Content-Type: text/markdown
112
112
  </tr>
113
113
  <tr>
114
114
  <td><a href="https://en.wikipedia.org/wiki/Regression_analysis">☑️ regression</a></td>
115
- <td><a href="https://en.wikipedia.org/wiki/Time_series#Prediction_and_forecasting">☑️ time series prediction</a></td>
115
+ <td><a href="https://en.wikipedia.org/wiki/Time_series#Prediction_and_forecasting">☑️ time-series prediction</a></td>
116
116
  </tr>
117
117
  </table>
118
118
 
@@ -124,13 +124,13 @@ Description-Content-Type: text/markdown
124
124
 
125
125
  ## 🌎 Connected data sources and coverage
126
126
 
127
- - **Public data** : public sector, academic institutions, other sources through open data portals. Curated and updated by the Upgini team
128
- - **Community shared data**: royalty / license free datasets or features from Data science community (our users). It's both a public and a scraped data
127
+ - **Public data**: public sector, academic institutions, other sources through open data portals. Curated and updated by the Upgini team
128
+ - **Community-shared data**: royalty- or license-free datasets or features from the data science community (our users). This includes both public and scraped data
129
129
  - **Premium data providers**: commercial data sources verified by the Upgini team in real-world use cases
130
130
 
131
- 👉 [**Details on datasets and features**](https://upgini.com/#data_sources)
131
+ 👉 [**Details on datasets and features**](https://upgini.com/#data_sources)
132
132
  #### 📊 Total: **239 countries** and **up to 41 years** of history
133
- |Data sources|Countries|History, years|# sources for ensemble|Update|Search keys|API Key required
133
+ |Data sources|Countries|History (years)|# sources for ensembling|Update frequency|Search keys|API Key required
134
134
  |--|--|--|--|--|--|--|
135
135
  |Historical weather & Climate normals | 68 |22|-|Monthly|date, country, postal/ZIP code|No
136
136
  |Location/Places/POI/Area/Proximity information from OpenStreetMap | 221 |2|-|Monthly|date, country, postal/ZIP code|No
@@ -138,7 +138,7 @@ Description-Content-Type: text/markdown
138
138
  |Consumer Confidence index| 44 |22|-|Monthly|date, country|No
139
139
  |World economic indicators|191 |41|-|Monthly|date, country|No
140
140
  |Markets data|-|17|-|Monthly|date, datetime|No
141
- |World mobile & fixed broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
141
+ |World mobile & fixed-broadband network coverage and performance |167|-|3|Monthly|country, postal/ZIP code|No
142
142
  |World demographic data |90|-|2|Annual|country, postal/ZIP code|No
143
143
  |World house prices |44|-|3|Annual|country, postal/ZIP code|No
144
144
  |Public social media profile data |104|-|-|Monthly|date, email/HEM, phone |Yes
@@ -153,8 +153,8 @@ Description-Content-Type: text/markdown
153
153
 
154
154
  ### [Search of relevant external features & Automated feature generation for Salary prediction task (use as a template)](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb)
155
155
 
156
- * The goal is to predict salary for data science job postning based on information about employer and job description.
157
- * Following this guide, you'll learn how to **search & auto generate new relevant features with Upgini library**
156
+ * The goal is to predict salary for a data science job posting based on information about the employer and job description.
157
+ * Following this guide, you'll learn how to **search and autogenerate new relevant features with the Upgini library**
158
158
  * The evaluation metric is [Mean Absolute Error (MAE)](https://en.wikipedia.org/wiki/Mean_absolute_error).
159
159
 
160
160
  Run [Feature search & generation notebook](https://github.com/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb) inside your browser:
@@ -169,7 +169,7 @@ Run [Feature search & generation notebook](https://github.com/upgini/upgini/blob
169
169
  ### ❓ [Simple sales prediction for retail stores](https://github.com/upgini/upgini/blob/main/notebooks/kaggle_example.ipynb)
170
170
 
171
171
  * The goal is to **predict future sales of different goods in stores** based on a 5-year history of sales.
172
- * Kaggle Competition [Store Item Demand Forecasting Challenge](https://www.kaggle.com/c/demand-forecasting-kernels-only) is a product sales forecasting. The evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
172
+ * Kaggle Competition [Store Item Demand Forecasting Challenge](https://www.kaggle.com/c/demand-forecasting-kernels-only) is a product sales forecasting competition. The evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
173
173
 
174
174
  Run [Simple sales prediction for retail stores](https://github.com/upgini/upgini/blob/main/notebooks/kaggle_example.ipynb) inside your browser:
175
175
 
@@ -181,25 +181,25 @@ Run [Simple sales prediction for retail stores](https://github.com/upgini/upgini
181
181
  [![Open example in Gitpod](https://img.shields.io/badge/run_example_in-gitpod-orange?style=for-the-badge&logo=gitpod)](https://gitpod.io/#/github.com/upgini/upgini)
182
182
  -->
183
183
 
184
- ### ❓ [How to boost ML model accuracy for Kaggle TOP1 Leaderboard in 10 minutes](https://www.kaggle.com/code/romaupgini/more-external-features-for-top1-private-lb-4-54/notebook)
184
+ ### ❓ [How to boost ML model accuracy for Kaggle Top-1 leaderboard in 10 minutes](https://www.kaggle.com/code/romaupgini/more-external-features-for-top1-private-lb-4-54/notebook)
185
185
 
186
- * The goal is **accuracy improvement for TOP1 winning Kaggle solution** from new relevant external features & data.
187
- * [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting, evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
186
+ * The goal is **to improve a Top‑1 winning Kaggle solution** by adding new relevant external features and data.
187
+ * [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting competition; the evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
188
188
 
189
189
  ### ❓ [How to do low-code feature engineering for AutoML tools](https://www.kaggle.com/code/romaupgini/zero-feature-engineering-with-upgini-pycaret/notebook)
190
190
 
191
191
  * **Save time on feature search and engineering**. Use ready-to-use external features and data sources to maximize overall AutoML accuracy, right out of the box.
192
192
  * [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting competition; the evaluation metric is [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
193
- * Low-code AutoML tools: [Upgini](https://github.com/upgini/upgini) and [PyCaret](https://github.com/pycaret/pycaret)
193
+ * Low-code AutoML frameworks: [Upgini](https://github.com/upgini/upgini) and [PyCaret](https://github.com/pycaret/pycaret)
194
194
 
195
- ### ❓ [How to improve accuracy of Multivariate Time Series forecast from external features & data](https://www.kaggle.com/code/romaupgini/guide-external-data-features-for-multivariatets/notebook)
195
+ ### ❓ [How to improve accuracy of Multivariate time-series forecast from external features & data](https://www.kaggle.com/code/romaupgini/guide-external-data-features-for-multivariatets/notebook)
196
196
 
197
- * The goal is **accuracy improvement of Multivariate Time Series prediction** from new relevant external features & data. The main challenge here is a strategy of data & feature enrichment, when a component of Multivariate TS depends not only on its past values but also has **some dependency on other components**.
197
+ * The goal is **to improve the accuracy of multivariate time‑series forecasting** using new relevant external features and data. The main challenge is the data and feature enrichment strategy, in which a component of a multivariate time series depends not only on its past values but also on other components.
198
198
  * [Kaggle Competition](https://www.kaggle.com/competitions/tabular-playground-series-jan-2022/) is a product sales forecasting competition; the evaluation metric is [RMSLE](https://www.kaggle.com/code/carlmcbrideellis/store-sales-using-the-average-of-the-last-16-days#Note-regarding-calculating-the-average).
199
199
 
200
200
  ### ❓ [How to speed up feature engineering hypothesis tests with ready-to-use external features](https://www.kaggle.com/code/romaupgini/statement-dates-to-use-or-not-to-use/notebook)
201
201
 
202
- * **Save time on external data wrangling and feature calculation code** for hypothesis tests. The key challenge here is a time-dependent representation of information in a training dataset, which is uncommon for credit default prediction tasks. As a result, special data enrichment strategy is used.
202
+ * **Save time on external data wrangling and feature calculation code** for hypothesis tests. The key challenge is the timedependent representation of information in the training dataset, which is uncommon for credit default prediction tasks. As a result, special data enrichment strategy is used.
203
203
  * [Kaggle Competition](https://www.kaggle.com/competitions/amex-default-prediction) is a credit default prediction, evaluation metric is [normalized Gini coefficient](https://www.kaggle.com/competitions/amex-default-prediction/discussion/327464).
204
204
 
205
205
  ## 🏁 Quick start
@@ -228,19 +228,19 @@ docker build -t upgini .</i></br>
228
228
  <i>
229
229
  docker run -p 8888:8888 upgini</br>
230
230
  </i></br>
231
- 3. Open http://localhost:8888?token="<"your_token_from_console_output">" in your browser
231
+ 3. Open http://localhost:8888?token=&lt;your_token_from_console_output&gt; in your browser
232
232
  </details>
233
233
 
234
234
 
235
235
  ### 2. 💡 Use your labeled training dataset for search
236
236
 
237
237
  You can use your labeled training datasets "as is" to initiate the search. Under the hood, we'll search for relevant data using:
238
- - **[search keys](#-search-key-types-we-support-more-to-come)** from training dataset to match records from potential data sources with a new features
239
- - **labels** from training dataset to estimate relevancy of feature or dataset for your ML task and calculate feature importance metrics
240
- - **your features** from training dataset to find external datasets and features which only give accuracy improvement to your existing data and estimate accuracy uplift ([optional](#find-features-only-give-accuracy-gain-to-existing-data-in-the-ml-model))
238
+ - **[search keys](#-search-key-types-we-support-more-to-come)** from the training dataset to match records from potential data sources with new features
239
+ - **labels** from the training dataset to estimate the relevance of features or datasets for your ML task and calculate feature importance metrics
240
+ - **your features** from the training dataset to find external datasets and features that improve accuracy of your existing data and estimate accuracy uplift ([optional](#find-features-only-give-accuracy-gain-to-existing-data-in-the-ml-model))
241
241
 
242
242
 
243
- Load training dataset into pandas dataframe and separate features' columns from label column in a Scikit-learn way:
243
+ Load the training dataset into a Pandas DataFrame and separate feature columns from the label column in a Scikit-learn way:
244
244
  ```python
245
245
  import pandas as pd
246
246
  # labeled training dataset - customer_churn_prediction_train.csv
@@ -251,7 +251,7 @@ y = train_df["churn_flag"]
251
251
  <table border=1 cellpadding=10><tr><td>
252
252
  ⚠️ <b>Requirements for search initialization dataset</b>
253
253
  <br>
254
- We do dataset verification and cleaning under the hood, but still there are some requirements to follow:
254
+ We perform dataset verification and cleaning under the hood, but still there are some requirements to follow:
255
255
  <br>
256
256
  1. <b>pandas.DataFrame</b>, <b>pandas.Series</b> or <b>numpy.ndarray</b> representation;
257
257
  <br>
@@ -259,12 +259,12 @@ We do dataset verification and cleaning under the hood, but still there are some
259
259
  <br>
260
260
  3. at least one column selected as a <a href="#-search-key-types-we-support-more-to-come">search key</a>;
261
261
  <br>
262
- 4. min size after deduplication by search key column and NaNs removal: <i>100 records</i>
262
+ 4. min size after deduplication by search-key columns and removal of NaNs: <i>100 records</i>
263
263
  </td></tr></table>
264
264
 
265
- ### 3. 🔦 Choose one or multiple columns as a search keys
266
- *Search keys* columns will be used to match records from all potential external data sources / features.
267
- Define one or multiple columns as a search keys with `FeaturesEnricher` class initialization.
265
+ ### 3. 🔦 Choose one or more columns as search keys
266
+ *Search keys* columns will be used to match records from all potential external data sources/features.
267
+ Define one or more columns as search keys when initializing the `FeaturesEnricher` class.
268
268
  ```python
269
269
  from upgini.features_enricher import FeaturesEnricher
270
270
  from upgini.metadata import SearchKey
@@ -284,7 +284,7 @@ enricher = FeaturesEnricher(
284
284
  <tr>
285
285
  <th> Search Key<br/>Meaning Type </th>
286
286
  <th> Description </th>
287
- <th> Allowed pandas dtypes (python types) </th>
287
+ <th> Allowed pandas dtypes (Python types) </th>
288
288
  <th> Example </th>
289
289
  </tr>
290
290
  <tr>
@@ -301,13 +301,13 @@ enricher = FeaturesEnricher(
301
301
  </tr>
302
302
  <tr>
303
303
  <td> SearchKey.IP </td>
304
- <td> IP address (version 4) </td>
305
- <td> <tt>object(str, ipaddress.IPv4Address)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> </td>
304
+ <td> IPv4 or IPv6 address</td>
305
+ <td> <tt>object(str, ipaddress.IPv4Address, ipaddress.IPv6Address)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> </td>
306
306
  <td> <tt>192.168.0.1 </tt> </td>
307
307
  </tr>
308
308
  <tr>
309
309
  <td> SearchKey.PHONE </td>
310
- <td> phone number, <a href="https://en.wikipedia.org/wiki/E.164">E.164 standard</a> </td>
310
+ <td> phone number (<a href="https://en.wikipedia.org/wiki/E.164">E.164 standard</a>) </td>
311
311
  <td> <tt>object(str)</tt> <br/> <tt>string</tt> <br/> <tt>int64</tt> <br/> <tt>float64</tt> </td>
312
312
  <td> <tt>443451925138 </tt> </td>
313
313
  </tr>
@@ -322,7 +322,7 @@ enricher = FeaturesEnricher(
322
322
  </td>
323
323
  <td>
324
324
  <tt>2020-02-12 </tt>&nbsp;(<a href="https://en.wikipedia.org/wiki/ISO_8601">ISO-8601 standard</a>)
325
- <br/> <tt>12.02.2020 </tt>&nbsp;(non standard notation)
325
+ <br/> <tt>12.02.2020 </tt>&nbsp;(nonstandard notation)
326
326
  </td>
327
327
  </tr>
328
328
  <tr>
@@ -344,7 +344,7 @@ enricher = FeaturesEnricher(
344
344
  </tr>
345
345
  <tr>
346
346
  <td> SearchKey.POSTAL_CODE </td>
347
- <td> Postal code a.k.a. ZIP code. Could be used only with SearchKey.COUNTRY </td>
347
+ <td> Postal code a.k.a. ZIP code. Can only be used with SearchKey.COUNTRY </td>
348
348
  <td> <tt>object(str)</tt> <br/> <tt>string</tt> </td>
349
349
  <td> <tt>21174 </tt> <br/> <tt>061107 </tt> <br/> <tt>SE-999-99 </tt> </td>
350
350
  </tr>
@@ -352,7 +352,7 @@ enricher = FeaturesEnricher(
352
352
 
353
353
  </details>
354
354
 
355
- For the meaning types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to clarify date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
355
+ For the search key types <tt>SearchKey.DATE</tt>/<tt>SearchKey.DATETIME</tt> with dtypes <tt>object</tt> or <tt>string</tt> you have to specify the date/datetime format by passing <tt>date_format</tt> parameter to `FeaturesEnricher`. For example:
356
356
  ```python
357
357
  from upgini.features_enricher import FeaturesEnricher
358
358
  from upgini.metadata import SearchKey
@@ -370,12 +370,12 @@ enricher = FeaturesEnricher(
370
370
  )
371
371
  ```
372
372
 
373
- To use datetime not in UTC timezone, you can cast datetime column explicitly to your timezone (example for Warsaw):
373
+ To use a non-UTC timezone for datetime, you can cast datetime column explicitly to your timezone (example for Warsaw):
374
374
  ```python
375
375
  df["date"] = df.date.astype("datetime64").dt.tz_localize("Europe/Warsaw")
376
376
  ```
377
377
 
378
- Single country for the whole training dataset can be passed with `country_code` parameter:
378
+ A single country for the whole training dataset can be passed via `country_code` parameter:
379
379
  ```python
380
380
  from upgini.features_enricher import FeaturesEnricher
381
381
  from upgini.metadata import SearchKey
@@ -391,10 +391,10 @@ enricher = FeaturesEnricher(
391
391
  ```
392
392
 
393
393
  ### 4. 🔍 Start your first feature search!
394
- The main abstraction you interact is `FeaturesEnricher`, a Scikit-learn compatible estimator. You can easily add it into your existing ML pipelines.
395
- Create instance of the `FeaturesEnricher` class and call:
394
+ The main abstraction you interact with is `FeaturesEnricher`, a Scikit-learn-compatible estimator. You can easily add it to your existing ML pipelines.
395
+ Create an instance of the `FeaturesEnricher` class and call:
396
396
  - `fit` to search relevant datasets & features
397
- - than `transform` to enrich your dataset with features from search result
397
+ - then `transform` to enrich your dataset with features from the search result
398
398
 
399
399
  Let's try it out!
400
400
  ```python
@@ -407,7 +407,7 @@ train_df = pd.read_csv("customer_churn_prediction_train.csv")
407
407
  X = train_df.drop(columns="churn_flag")
408
408
  y = train_df["churn_flag"]
409
409
 
410
- # now we're going to create `FeaturesEnricher` class
410
+ # now we're going to create an instance of the `FeaturesEnricher` class
411
411
  enricher = FeaturesEnricher(
412
412
  search_keys={
413
413
  "subscription_activation_date": SearchKey.DATE,
@@ -415,15 +415,15 @@ enricher = FeaturesEnricher(
415
415
  "zip_code": SearchKey.POSTAL_CODE
416
416
  })
417
417
 
418
- # everything is ready to fit! For 200к records fitting should take around 10 minutes,
419
- # we send email notification, just register on profile.upgini.com
418
+ # Everything is ready to fit! For 100k records, fitting should take around 10 minutes
419
+ # We'll send an email notification; just register on profile.upgini.com
420
420
  enricher.fit(X, y)
421
421
  ```
422
422
 
423
- That's all! We've fit `FeaturesEnricher`.
423
+ That's it! The `FeaturesEnricher` is now fitted.
424
424
  ### 5. 📈 Evaluate feature importances (SHAP values) from the search result
425
425
 
426
- `FeaturesEnricher` class has two properties for feature importances, which will be filled after fit - `feature_names_` and `feature_importances_`:
426
+ `FeaturesEnricher` class has two properties for feature importances, that are populated after fit - `feature_names_` and `feature_importances_`:
427
427
  - `feature_names_` - feature names from the search result, and if parameter `keep_input=True` was used, initial columns from search dataset as well
428
428
  - `feature_importances_` - SHAP values for features from the search result, same order as in `feature_names_`
429
429
 
@@ -434,8 +434,8 @@ enricher.get_features_info()
434
434
  Get more details about `FeaturesEnricher` at runtime using docstrings via `help(FeaturesEnricher)` or `help(FeaturesEnricher.fit)`.
435
435
 
436
436
  ### 6. 🏭 Enrich Production ML pipeline with relevant external features
437
- `FeaturesEnricher` is a Scikit-learn compatible estimator, so any pandas dataframe can be enriched with external features from a search result (after `fit` ).
438
- Use `transform` method of `FeaturesEnricher` , and let magic to do the rest 🪄
437
+ `FeaturesEnricher` is a Scikit-learn-compatible estimator, so any pandas dataframe can be enriched with external features from a search result (after `fit`).
438
+ Use the `transform` method of `FeaturesEnricher`, and let the magic do the rest 🪄
439
439
  ```python
440
440
  # load dataset for enrichment
441
441
  test_x = pd.read_csv("test.csv")
@@ -444,24 +444,24 @@ enriched_test_features = enricher.transform(test_x)
444
444
  ```
445
445
  #### 6.1 Reuse completed search for enrichment without 'fit' run
446
446
 
447
- `FeaturesEnricher` can be initiated with a `search_id` parameter from completed search after fit method call.
447
+ `FeaturesEnricher` can be initialized with `search_id` from a completed search (after a fit call).
448
448
  Just use `enricher.get_search_id()` or copy search id string from the `fit()` output.
449
- Search keys and features in X should be the same as for `fit()`
449
+ Search keys and features in X must be the same as for `fit()`
450
450
  ```python
451
451
  enricher = FeaturesEnricher(
452
- #same set of a search keys as for the fit step
452
+ # same set of search keys as for the fit step
453
453
  search_keys={"date": SearchKey.DATE},
454
- api_key="<YOUR API_KEY>", # if you fit enricher with api_key then you should use it here
454
+ api_key="<YOUR API_KEY>", # if you fitted the enricher with an api_key, then you should use it here
455
455
  search_id = "abcdef00-0000-0000-0000-999999999999"
456
456
  )
457
- enriched_prod_dataframe=enricher.transform(input_dataframe)
457
+ enriched_prod_dataframe = enricher.transform(input_dataframe)
458
458
  ```
459
- #### 6.2 Enrichment with an updated external data sources and features
460
- For most of the ML cases, training step requires labeled dataset with a historical observations from the past. But for production step you'll need an updated and actual data sources and features for the present time, to calculate a prediction.
461
- `FeaturesEnricher`, when initiated with set of search keys which includes `SearchKey.DATE`, will match records from all potential external data sources **exactly on a the specific date/datetime** based on `SearchKey.DATE`. To avoid enrichment with features "form the future" for the `fit` step.
462
- And then, for `transform` in a production ML pipeline, you'll get enrichment with relevant features, actual for the present date.
459
+ #### 6.2 Enrichment with updated external data sources and features
460
+ In most ML cases, the training step requires a labeled dataset with historical observations. For production, you'll need updated, current data sources and features to generate predictions.
461
+ `FeaturesEnricher`, when initialized with a set of search keys that includes `SearchKey.DATE`, will match records from all potential external data sources **exactly on the specified date/datetime** based on `SearchKey.DATE`, to avoid enrichment with features "from the future" during the `fit` step.
462
+ And then, for `transform` in a production ML pipeline, you'll get enrichment with relevant features, current as of the present date.
463
463
 
464
- ⚠️ Initiate `FeaturesEnricher` with `SearchKey.DATE` search key in a key set to get actual features for production and avoid features from the future for the training:
464
+ ⚠️ Include `SearchKey.DATE` in the set of search keys to get current features for production and avoid features from the future during training:
465
465
  ```python
466
466
  enricher = FeaturesEnricher(
467
467
  search_keys={
@@ -475,13 +475,13 @@ enricher = FeaturesEnricher(
475
475
  ## 💻 How does it work?
476
476
 
477
477
  ### 🧹 Search dataset validation
478
- We validate and clean search initialization dataset under the hood:
478
+ We validate and clean the search initialization dataset under the hood:
479
479
 
480
- - сheck you **search keys** columns format;
480
+ - check your **search keys** columns' formats;
481
481
  - check zero variance for label column;
482
- - check dataset for full row duplicates. If we find any, we remove duplicated rows and make a note on share of row duplicates;
483
- - check inconsistent labels - rows with the same features and keys but different labels, we remove them and make a note on share of row duplicates;
484
- - remove columns with zero variance - we treat any non **search key** column in search dataset as a feature, so columns with zero variance will be removed
482
+ - check dataset for full row duplicates. If we find any, we remove them and report their share;
483
+ - check inconsistent labels - rows with the same features and keys but different labels, we remove them and report their share;
484
+ - remove columns with zero variance - we treat any non **search key** column in the search dataset as a feature, so columns with zero variance will be removed
485
485
 
486
486
  ### ❔ Supervised ML tasks detection
487
487
  We detect ML task under the hood based on label column values. Currently we support:
@@ -489,7 +489,7 @@ We detect ML task under the hood based on label column values. Currently we supp
489
489
  - ModelTaskType.MULTICLASS
490
490
  - ModelTaskType.REGRESSION
491
491
 
492
- But for certain search datasets you can pass parameter to `FeaturesEnricher` with correct ML taks type:
492
+ But for certain search datasets you can pass parameter to `FeaturesEnricher` with correct ML task type:
493
493
  ```python
494
494
  from upgini.features_enricher import FeaturesEnricher
495
495
  from upgini.metadata import SearchKey, ModelTaskType
@@ -499,12 +499,12 @@ enricher = FeaturesEnricher(
499
499
  model_task_type=ModelTaskType.REGRESSION
500
500
  )
501
501
  ```
502
- #### ⏰ Time Series prediction support
503
- *Time series prediction* supported as `ModelTaskType.REGRESSION` or `ModelTaskType.BINARY` tasks with time series specific cross-validation split:
504
- * [Scikit-learn time series cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#time-series-split) - `CVType.time_series` parameter
505
- * [Blocked time series cross-validation](https://goldinlocks.github.io/Time-Series-Cross-Validation/#Blocked-and-Time-Series-Split-Cross-Validation) - `CVType.blocked_time_series` parameter
502
+ #### ⏰ Time-series prediction support
503
+ *Time-series prediction* is supported as `ModelTaskType.REGRESSION` or `ModelTaskType.BINARY` tasks with time-series-specific cross-validation splits:
504
+ * [Scikit-learn time-series cross-validation](https://scikit-learn.org/stable/modules/cross_validation.html#time-series-split) - `CVType.time_series` parameter
505
+ * [Blocked time-series cross-validation](https://goldinlocks.github.io/Time-Series-Cross-Validation/#Blocked-and-Time-Series-Split-Cross-Validation) - `CVType.blocked_time_series` parameter
506
506
 
507
- To initiate feature search you can pass cross-validation type parameter to `FeaturesEnricher` with time series specific CV type:
507
+ To initiate feature search, you can pass the cross-validation type parameter to `FeaturesEnricher` with a time-series-specific CV type:
508
508
  ```python
509
509
  from upgini.features_enricher import FeaturesEnricher
510
510
  from upgini.metadata import SearchKey, CVType
@@ -525,12 +525,12 @@ enricher = FeaturesEnricher(
525
525
  cv=CVType.time_series
526
526
  )
527
527
  ```
528
- ⚠️ **Pre-process search dataset** in case of time series prediction:
528
+ ⚠️ **Preprocess the dataset** in case of time-series prediction:
529
529
  sort rows in dataset according to observation order, in most cases - ascending order by date/datetime.
530
530
 
531
531
  ### 🆙 Accuracy and uplift metrics calculations
532
- `FeaturesEnricher` automaticaly calculates model metrics and uplift from new relevant features either using `calculate_metrics()` method or `calculate_metrics=True` parameter in `fit` or `fit_transform` methods (example below).
533
- You can use any model estimator with scikit-learn compartible interface, some examples are:
532
+ `FeaturesEnricher` automatically calculates model metrics and uplift from new relevant features either using `calculate_metrics()` method or `calculate_metrics=True` parameter in `fit` or `fit_transform` methods (example below).
533
+ You can use any model estimator with scikit-learn-compatible interface, some examples are:
534
534
  * [All Scikit-Learn supervised models](https://scikit-learn.org/stable/supervised_learning.html)
535
535
  * [Xgboost](https://xgboost.readthedocs.io/en/stable/python/python_api.html#module-xgboost.sklearn)
536
536
  * [LightGBM](https://lightgbm.readthedocs.io/en/latest/Python-API.html#scikit-learn-api)
@@ -538,8 +538,8 @@ You can use any model estimator with scikit-learn compartible interface, some ex
538
538
 
539
539
  <details>
540
540
  <summary>
541
- 👈 Evaluation metric should be passed to <i>calculate_metrics()</i> by <i>scoring</i> parameter,<br/>
542
- out-of-the box Upgini supports
541
+ 👈 Evaluation metric should be passed to <i>calculate_metrics()</i> by the <i>scoring</i> parameter,<br/>
542
+ out-of-the-box Upgini supports
543
543
  </summary>
544
544
  <table style="table-layout: fixed;">
545
545
  <tr>
@@ -646,10 +646,10 @@ You can use any model estimator with scikit-learn compartible interface, some ex
646
646
  </table>
647
647
  </details>
648
648
 
649
- In addition to that list, you can define custom evaluation metric function using [scikit-learn make_scorer](https://scikit-learn.org/0.15/modules/model_evaluation.html#defining-your-scoring-strategy-from-score-functions), for example [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
649
+ In addition to that list, you can define a custom evaluation metric function using [scikit-learn make_scorer](https://scikit-learn.org/1.7/modules/model_evaluation.html#defining-your-scoring-strategy-from-score-functions), for example [SMAPE](https://en.wikipedia.org/wiki/Symmetric_mean_absolute_percentage_error).
650
650
 
651
- By default, `calculate_metrics()` method calculates evaluation metric with the same cross-validation split as selected for `FeaturesEnricher.fit()` by parameter `cv = CVType.<cross-validation-split>`.
652
- But you can easily define new split by passing child of BaseCrossValidator to parameter `cv` in `calculate_metrics()`.
651
+ By default, the `calculate_metrics()` method calculates the evaluation metric with the same cross-validation split as selected for `FeaturesEnricher.fit()` by the parameter `cv = CVType.<cross-validation-split>`.
652
+ But you can easily define a new split by passing a subclass of `BaseCrossValidator` to the `cv` parameter in `calculate_metrics()`.
653
653
 
654
654
  Example with more tips-and-tricks:
655
655
  ```python
@@ -674,7 +674,7 @@ enricher.calculate_metrics(scoring=custom_scoring)
674
674
  custom_cv = TimeSeriesSplit(n_splits=5)
675
675
  enricher.calculate_metrics(cv=custom_cv)
676
676
 
677
- # All this custom parameters could be combined in both methods: fit, fit_transform and calculate_metrics:
677
+ # All of these custom parameters can be combined in both methods: fit, fit_transform and calculate_metrics:
678
678
  enricher.fit(X, y, eval_set, calculate_metrics=True, estimator=custom_estimator, scoring=custom_scoring, cv=custom_cv)
679
679
  ```
680
680
 
@@ -684,9 +684,9 @@ enricher.fit(X, y, eval_set, calculate_metrics=True, estimator=custom_estimator,
684
684
 
685
685
  ### 🤖 Automated feature generation from columns in a search dataset
686
686
 
687
- If a training dataset has a text column, you can generate additional embeddings from it using instructed embeddings generation with LLMs and data augmentation from external sources, just like Upgini does for all records from connected data sources.
687
+ If a training dataset has a text column, you can generate additional embeddings from it using instruction-guided embedding generation with LLMs and data augmentation from external sources, just like Upgini does for all records from connected data sources.
688
688
 
689
- For most cases, this gives better results than direct embeddings generation from a text field. Currently, Upgini has two LLMs connected to a search engine - GPT-3.5 from OpenAI and GPT-J.
689
+ In most cases, this gives better results than direct embeddings generation from a text field. Currently, Upgini has two LLMs connected to the search engine - GPT-3.5 from OpenAI and GPT-J.
690
690
 
691
691
  To use this feature, pass the column names as arguments to the `generate_features` parameter. You can use up to 2 columns.
692
692
 
@@ -701,17 +701,17 @@ enricher = FeaturesEnricher(
701
701
 
702
702
  With this code, Upgini will generate LLM embeddings from text columns and then check them for predictive power for your ML task.
703
703
 
704
- Finally, Upgini will return a dataset enriched by only relevant components of LLM embeddings.
704
+ Finally, Upgini will return a dataset enriched with only the relevant components of LLM embeddings.
705
705
 
706
- ### Find features only give accuracy gain to existing data in the ML model
706
+ ### Find features that only provide accuracy gains to existing data in the ML model
707
707
 
708
- If you already have features or other external data sources, you can specifically search new datasets & features only give accuracy gain "on top" of them.
708
+ If you already have features or other external data sources, you can specifically search for new datasets and features that only provide accuracy gains "on top" of them.
709
709
 
710
- Just leave all these existing features in the labeled training dataset and Upgini library automatically use them during feature search process and as a baseline ML model to calculate accuracy metric uplift. Only features which improve accuracy will return.
710
+ Just leave all these existing features in the labeled training dataset and the Upgini library automatically uses them during the feature search process and as a baseline ML model to calculate accuracy metric uplift. Only features that improve accuracy will be returned.
711
711
 
712
712
  ### Check robustness of accuracy improvement from external features
713
713
 
714
- You can validate external features robustness on out-of-time dataset using `eval_set` parameter:
714
+ You can validate the robustness of external features on an out-of-time dataset using the `eval_set` parameter:
715
715
  ```python
716
716
  # load train dataset
717
717
  train_df = pd.read_csv("train.csv")
@@ -738,13 +738,13 @@ enricher.fit(
738
738
  - Same data schema as for search initialization X dataset
739
739
  - Pandas dataframe representation
740
740
 
741
- There are 3 options to pass out-of-time without labels:
741
+ The out-of-time dataset can be without labels. There are 3 options to pass out-of-time without labels:
742
742
  ```python
743
743
  enricher.fit(
744
744
  train_ids_and_features,
745
745
  train_label,
746
746
  eval_set = [
747
- (eval_ids_and_features_1,), # Just tuple of 1 element
747
+ (eval_ids_and_features_1,), # A tuple with 1 element
748
748
  (eval_ids_and_features_2, None), # None as labels
749
749
  (eval_ids_and_features_3, [np.nan] * len(eval_ids_and_features_3)), # List or Series of the same size as eval X
750
750
  ]
@@ -776,15 +776,15 @@ enriched_df = enricher.fit_transform(
776
776
  ```
777
777
 
778
778
  **Stability parameters:**
779
- - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI below this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
779
+ - `stability_threshold` (float, default=0.2): PSI threshold value. Features with PSI above this threshold will be excluded from the final feature set. Lower values mean stricter stability requirements.
780
780
  - `stability_agg_func` (str, default="max"): Function to aggregate PSI values across time intervals. Options: "max" (most conservative), "min" (least conservative), "mean" (balanced approach).
781
781
 
782
- **PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models.
782
+ **PSI (Population Stability Index)** measures how much feature distribution changes over time. Lower PSI values indicate more stable features, which are generally more reliable for production ML models. PSI is calculated on the eval_set, which should contain the most recent dates relative to the training dataset.
783
783
 
784
784
  ### Use custom loss function in feature selection & metrics calculation
785
785
 
786
786
  `FeaturesEnricher` can be initialized with additional string parameter `loss`.
787
- Depending on ML-task, you can use the following loss functions:
787
+ Depending on the ML task, you can use the following loss functions:
788
788
  - `regression`: regression, regression_l1, huber, poisson, quantile, mape, gamma, tweedie;
789
789
  - `binary`: binary;
790
790
  - `multiclass`: multiclass, multiclassova.
@@ -803,7 +803,7 @@ enriched_dataframe.fit(X, y)
803
803
 
804
804
  ### Exclude premium data sources from fit, transform and metrics calculation
805
805
 
806
- `fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with parameter `exclude_features_sources` that allows to exclude Trial or Paid features from Premium data sources:
806
+ `fit`, `fit_transform`, `transform` and `calculate_metrics` methods of `FeaturesEnricher` can be used with the `exclude_features_sources` parameter to exclude Trial or Paid features from Premium data sources:
807
807
  ```python
808
808
  enricher = FeaturesEnricher(
809
809
  search_keys={"subscription_activation_date": SearchKey.DATE}
@@ -816,7 +816,7 @@ enricher.transform(X, exclude_features_sources=(trial_features + paid_features))
816
816
  ```
817
817
 
818
818
  ### Turn off autodetection for search key columns
819
- Upgini has autodetection of search keys on by default.
819
+ Upgini has autodetection of search keys enabled by default.
820
820
  To turn off use `autodetect_search_keys=False`:
821
821
 
822
822
  ```python
@@ -828,8 +828,8 @@ enricher = FeaturesEnricher(
828
828
  enricher.fit(X, y)
829
829
  ```
830
830
 
831
- ### Turn off removing of target outliers
832
- Upgini detect rows with target outlier for regression tasks. By default such rows are dropped on metrics calculation. To turn off removing of target outlier rows use parameter `remove_outliers_calc_metrics=False` in fit, fit_transform or calculate_metrics methods:
831
+ ### Turn off removal of target outliers
832
+ Upgini detects rows with target outliers for regression tasks. By default such rows are dropped during metrics calculation. To turn off the removal of target-outlier rows, use the `remove_outliers_calc_metrics=False` parameter in the fit, fit_transform, or calculate_metrics methods:
833
833
 
834
834
  ```python
835
835
  enricher = FeaturesEnricher(
@@ -839,8 +839,8 @@ enricher = FeaturesEnricher(
839
839
  enricher.fit(X, y, remove_outliers_calc_metrics=False)
840
840
  ```
841
841
 
842
- ### Turn off generating features on search keys
843
- Upgini tries to generate features on email, date and datetime search keys. By default this generation is enabled. To disable it use parameter `generate_search_key_features` of FeaturesEnricher constructor:
842
+ ### Turn off feature generation on search keys
843
+ Upgini attempts to generate features for email, date and datetime search keys. By default this generation is enabled. To disable it use the `generate_search_key_features` parameter of the FeaturesEnricher constructor:
844
844
 
845
845
  ```python
846
846
  enricher = FeaturesEnricher(
@@ -851,37 +851,37 @@ enricher = FeaturesEnricher(
851
851
 
852
852
  ## 🔑 Open up all capabilities of Upgini
853
853
 
854
- [Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600 mln+ phone numbers, 350 mln+ emails, 2^32 IP addresses
854
+ [Register](https://profile.upgini.com) and get a free API key for exclusive data sources and features: 600M+ phone numbers, 350M+ emails, 2^32 IP addresses
855
855
 
856
856
  |Benefit|No Sign-up | Registered user |
857
857
  |--|--|--|
858
858
  |Enrichment with **date/datetime, postal/ZIP code and country keys** | Yes | Yes |
859
- |Enrichment with **phone number, hashed email/HEM and IP-address keys** | No | Yes |
859
+ |Enrichment with **phone number, hashed email/HEM and IP address keys** | No | Yes |
860
860
  |Email notification on **search task completion** | No | Yes |
861
861
  |Automated **feature generation with LLMs** from columns in a search dataset| Yes, *till 12/05/23* | Yes |
862
862
  |Email notification on **new data source activation** 🔜 | No | Yes |
863
863
 
864
- ## 👩🏻‍💻 How to share data/features with a community ?
865
- You may publish ANY data which you consider as royalty / license free ([Open Data](http://opendatahandbook.org/guide/en/what-is-open-data/)) and potentially valuable for ML applications for **community usage**:
864
+ ## 👩🏻‍💻 How to share data/features with the community?
865
+ You may publish ANY data which you consider as royalty- or license-free ([Open Data](http://opendatahandbook.org/guide/en/what-is-open-data/)) and potentially valuable for ML applications for **community usage**:
866
866
  1. Please Sign Up [here](https://profile.upgini.com)
867
- 2. Copy *Upgini API key* from profile and upload your data from Upgini python library with this key:
867
+ 2. Copy *Upgini API key* from your profile and upload your data from the Upgini Python library with this key:
868
868
  ```python
869
869
  import pandas as pd
870
870
  from upgini.metadata import SearchKey
871
871
  from upgini.ads import upload_user_ads
872
872
  import os
873
873
  os.environ["UPGINI_API_KEY"] = "your_long_string_api_key_goes_here"
874
- #you can define custom search key which might not be supported yet, just use SearchKey.CUSTOM_KEY type
874
+ #you can define a custom search key that might not yet be supported; just use SearchKey.CUSTOM_KEY type
875
875
  sample_df = pd.read_csv("path_to_data_sample_file")
876
876
  upload_user_ads("test", sample_df, {
877
877
  "city": SearchKey.CUSTOM_KEY,
878
878
  "stats_date": SearchKey.DATE
879
879
  })
880
880
  ```
881
- 3. After data verification, search results on community data will be available usual way.
881
+ 3. After data verification, search results on community data will be available in the usual way.
882
882
 
883
883
  ## 🛠 Getting Help & Community
884
- Please note, that we are still in a beta stage.
884
+ Please note that we are still in beta.
885
885
  Requests and support, in preferred order
886
886
  [![Claim help in slack](https://img.shields.io/badge/slack-@upgini-orange.svg?style=for-the-badge&logo=slack)](https://4mlg.short.gy/join-upgini-community)
887
887
  [![Open GitHub issue](https://img.shields.io/badge/open%20issue%20on-github-blue?style=for-the-badge&logo=github)](https://github.com/upgini/upgini/issues)
@@ -894,22 +894,22 @@ Requests and support, in preferred order
894
894
 
895
895
  ## 🧩 Contributing
896
896
  We are not a large team, so we probably won't be able to:
897
- - implement smooth integration with most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc. )
897
+ - implement smooth integration with the most common low-code ML libraries and platforms ([PyCaret](https://www.github.com/pycaret/pycaret), [H2O AutoML](https://github.com//h2oai/h2o-3/blob/master/h2o-docs/src/product/automl.rst), etc.)
898
898
  - implement all possible data verification and normalization capabilities for different types of search keys
899
899
  And we need some help from the community!
900
900
 
901
- So, we'll be happy about every **pull request** you open and **issue** you find to make this library **more incredible**. Please note that it might sometimes take us a while to get back to you.
902
- **For major changes**, please open an issue first to discuss what you would like to change
901
+ So, we'll be happy about every **pull request** you open and every **issue** you report to make this library **even better**. Please note that it might sometimes take us a while to get back to you.
902
+ **For major changes**, please open an issue first to discuss what you would like to change.
903
903
  #### Developing
904
904
  Some convenient ways to start contributing are:
905
905
  ⚙️ [**Open in Visual Studio Code**](https://open.vscode.dev/upgini/upgini) You can remotely open this repo in VS Code without cloning or automatically clone and open it inside a docker container.
906
906
  ⚙️ **Gitpod** [![Gitpod Ready-to-Code](https://img.shields.io/badge/Gitpod-Ready--to--Code-blue?logo=gitpod)](https://gitpod.io/#https://github.com/upgini/upgini) You can use Gitpod to launch a fully functional development environment right in your browser.
907
907
 
908
908
  ## 🔗 Useful links
909
- - [Simple sales predictions as a template notebook](#-simple-sales-prediction-for-retail-stores)
909
+ - [Simple sales prediction template notebook](#-simple-sales-prediction-for-retail-stores)
910
910
  - [Full list of Kaggle Guides & Examples](https://www.kaggle.com/romaupgini/code)
911
911
  - [Project on PyPI](https://pypi.org/project/upgini)
912
912
  - [More perks for registered users](https://profile.upgini.com)
913
913
 
914
- <sup>😔 Found mistype or a bug in code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
914
+ <sup>😔 Found a typo or a bug in a code snippet? Our bad! <a href="https://github.com/upgini/upgini/issues/new?assignees=&title=readme%2Fbug">
915
915
  Please report it here</a></sup>
@@ -1,11 +1,11 @@
1
- upgini/__about__.py,sha256=khvL6Ma3KHnaaXtUCPR9kKBJFG5qg7emKoKVlrbEt0k,24
1
+ upgini/__about__.py,sha256=0IfOlHrvtIj9lMq5rIqaMXXHCEFy-5GmtuHRuC-AsAw,26
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=Nm2ZmwyQqvTnymYpGUwyJWy7y2ebXlHMyYmGeGcyA_s,31652
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=tmKeERG2b0YfJ47g-UXQQ3S-9tyagwUOhI4oqN3kG2w,233058
6
+ upgini/features_enricher.py,sha256=wC9hWu47gdn-dXs5yLHO9etjm3t7XVF-xpafF1gakWI,234470
7
7
  upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
- upgini/metadata.py,sha256=CL9bFytdUZlbQYtTgNgAkt_sxO9klARQtULDBgb2Hlg,12575
8
+ upgini/metadata.py,sha256=H3wiN37k-yqWZgbPD0tJzx8DzaCIkgmX5cybhByQWLg,12619
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
10
10
  upgini/search_task.py,sha256=5mL_qV5mVtDkIumM9xCOgfa9Lc2B8mxJ1qI21iaScnQ,18656
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
@@ -31,14 +31,14 @@ upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_a
31
31
  upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
32
32
  upgini/autofe/timeseries/volatility.py,sha256=SvZfhM_ZAWCNpTf87WjSnZsnlblARgruDlu4By4Zvhc,8078
33
33
  upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
34
- upgini/data_source/data_source_publisher.py,sha256=qXQUYErhCmkWHm2FWgTL0FYZ2aJbxtSDV94OCM3eqUU,26653
34
+ upgini/data_source/data_source_publisher.py,sha256=CQi3fEukaStV-RiadSEvEFLThOlZJzA6PzleQQgGfGk,26286
35
35
  upgini/mdc/__init__.py,sha256=iHJlXQg6xRM1-ZOUtaPSJqw5SpQDszvxp4LyqviNLIQ,1027
36
36
  upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
37
37
  upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  upgini/normalizer/normalize_utils.py,sha256=w9f_9udrwqbhXgFMTs2keuce-6X_j6h3D7EdNo_2X7g,8493
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
41
+ upgini/resource_bundle/strings.properties,sha256=3aK2sxXYuvSLuoOyLq8IcyekfINH0Il5nLvVXMsuEpY,29353
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -54,10 +54,10 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
54
54
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
55
  upgini/utils/datetime_utils.py,sha256=l85UzSQLhtMeI2G6m-m8y8bCColCLSXNHb2-G6fKpLM,16988
56
56
  upgini/utils/deduplicate_utils.py,sha256=6czbn1q0p-lOmrNvbAzueBpDHmfIP4TfV4poWqbjX5w,11255
57
- upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc,11973
57
+ upgini/utils/display_utils.py,sha256=p6o0VlYtGpU6bXv3B-fjQM9PeZEkl05OylHXSRyP0us,13219
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
59
59
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
60
- upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
60
+ upgini/utils/feature_info.py,sha256=SQTRbSxJDkh2G2c0KGBmOv8f69gVzWbTtcXn0_2Qb-8,7945
61
61
  upgini/utils/features_validator.py,sha256=A_3AX7X5u5AH7RLgkTiS6dHxaOiq5vm8w4ijQWLGcMY,4871
62
62
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
63
63
  upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
@@ -71,10 +71,10 @@ upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,
71
71
  upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
72
72
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
73
73
  upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,10882
74
- upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
74
+ upgini/utils/track_info.py,sha256=HAMk4d-TYd0szl4fo6Gczvh-sabj3jFcxCoJ19zsNAo,6056
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.125.dist-info/METADATA,sha256=CAoP8m15syLZEVmnYuUjUMI1Jo-XvMCGhz-CZnRYwy4,50781
78
- upgini-1.2.125.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.125.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.125.dist-info/RECORD,,
77
+ upgini-1.2.128a1.dist-info/METADATA,sha256=_P3GkmDolD-Q-FUcfMYMsCerxOWduvD6VqB3KfhDymo,51144
78
+ upgini-1.2.128a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
+ upgini-1.2.128a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.128a1.dist-info/RECORD,,