upgini 1.1.244a25__tar.gz → 1.1.245a1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of upgini might be problematic.

Files changed (82)
  1. {upgini-1.1.244a25/src/upgini.egg-info → upgini-1.1.245a1}/PKG-INFO +7 -7
  2. {upgini-1.1.244a25 → upgini-1.1.245a1}/README.md +6 -6
  3. {upgini-1.1.244a25 → upgini-1.1.245a1}/setup.py +1 -1
  4. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/dataset.py +59 -53
  5. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/features_enricher.py +198 -185
  6. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/metrics.py +1 -0
  7. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/__init__.py +14 -1
  8. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/target_utils.py +1 -0
  9. {upgini-1.1.244a25 → upgini-1.1.245a1/src/upgini.egg-info}/PKG-INFO +7 -7
  10. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_features_enricher.py +14 -14
  11. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_metrics.py +24 -30
  12. {upgini-1.1.244a25 → upgini-1.1.245a1}/LICENSE +0 -0
  13. {upgini-1.1.244a25 → upgini-1.1.245a1}/pyproject.toml +0 -0
  14. {upgini-1.1.244a25 → upgini-1.1.245a1}/setup.cfg +0 -0
  15. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/__init__.py +0 -0
  16. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads.py +0 -0
  17. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads_management/__init__.py +0 -0
  18. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads_management/ads_manager.py +0 -0
  19. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/__init__.py +0 -0
  20. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/all_operands.py +0 -0
  21. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/binary.py +0 -0
  22. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/feature.py +0 -0
  23. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/groupby.py +0 -0
  24. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/operand.py +0 -0
  25. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/unary.py +0 -0
  26. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/vector.py +0 -0
  27. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/data_source/__init__.py +0 -0
  28. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/data_source/data_source_publisher.py +0 -0
  29. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/errors.py +0 -0
  30. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/fingerprint.js +0 -0
  31. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/http.py +0 -0
  32. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/metadata.py +0 -0
  35. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
  37. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/search_task.py +0 -0
  44. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/spinner.py +0 -0
  45. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/__init__.py +0 -0
  46. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/base_search_key_detector.py +0 -0
  47. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/blocked_time_series.py +0 -0
  48. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/country_utils.py +0 -0
  49. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/custom_loss_utils.py +0 -0
  50. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/cv_utils.py +0 -0
  51. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/datetime_utils.py +0 -0
  52. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/version_validator.py +0 -0
  66. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/SOURCES.txt +0 -0
  67. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/dependency_links.txt +0 -0
  68. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/requires.txt +0 -0
  69. {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/top_level.txt +0 -0
  70. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_binary_dataset.py +0 -0
  71. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_blocked_time_series.py +0 -0
  72. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_categorical_dataset.py +0 -0
  73. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_continuous_dataset.py +0 -0
  74. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_country_utils.py +0 -0
  75. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_custom_loss_utils.py +0 -0
  76. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_datetime_utils.py +0 -0
  77. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_email_utils.py +0 -0
  78. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_etalon_validation.py +0 -0
  79. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_phone_utils.py +0 -0
  80. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_postal_code_utils.py +0 -0
  81. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_target_utils.py +0 -0
  82. {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_widget.py +0 -0
PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.244a25
+Version: 1.1.245a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -32,8 +32,8 @@ License-File: LICENSE
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>optimized for ML models with LLMs and other neural networks</b> </p>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -57,7 +57,7 @@ License-File: LICENSE
 [![Gitter Сommunity](https://img.shields.io/badge/gitter-@upgini-teal.svg?logo=gitter)](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview
 
-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features from the source data using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
 
 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.
 
@@ -65,9 +65,9 @@ License-File: LICENSE
 
 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️ Data source optimizations: automated feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
-⭐️ *Automatic search key augmentation* from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
-⭐️ Calculate *accuracy metrics and uplifts* after enrichment existing ML model with external features
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
README.md

@@ -2,8 +2,8 @@
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>optimized for ML models with LLMs and other neural networks</b> </p>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -27,7 +27,7 @@
 [![Gitter Сommunity](https://img.shields.io/badge/gitter-@upgini-teal.svg?logo=gitter)](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview
 
-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features from the source data using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
 
 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.
 
@@ -35,9 +35,9 @@
 
 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️ Data source optimizations: automated feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
-⭐️ *Automatic search key augmentation* from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
-⭐️ Calculate *accuracy metrics and uplifts* after enrichment existing ML model with external features
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
setup.py

@@ -40,7 +40,7 @@ def send_log(msg: str):
 
 
 here = Path(__file__).parent.resolve()
-version = "1.1.244a25"
+version = "1.1.245a1"
 try:
     send_log(f"Start setup PyLib version {version}")
     setup(
src/upgini/dataset.py

@@ -38,7 +38,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
-from upgini.resource_bundle import bundle
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
@@ -81,8 +81,10 @@ class Dataset: # (pd.DataFrame):
         rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
         warning_counter: Optional[WarningCounter] = None,
+        bundle: Optional[ResourceBundle] = None,
         **kwargs,
     ):
+        self.bundle = bundle or get_custom_bundle()
         if df is not None:
            data = df.copy()
        elif path is not None:
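
This constructor hunk is the core of the dataset.py refactor: `Dataset` now carries a per-instance `ResourceBundle`, falling back to `get_custom_bundle()` when none is given, and every remaining hunk in this file mechanically rewrites the module-level `bundle.get(...)` lookups to `self.bundle.get(...)`. A minimal sketch of the new wiring from the caller's side, assuming only what the hunks show (the `bundle` keyword, the no-argument `get_custom_bundle()` fallback, and `get(key)` message lookup); the DataFrame and dataset name are hypothetical, and a real `Dataset` may need further setup before validation:

```python
import pandas as pd

from upgini.dataset import Dataset
from upgini.resource_bundle import get_custom_bundle

# Without an explicit bundle, Dataset falls back to get_custom_bundle(),
# matching the old module-level behavior.
ds_default = Dataset(dataset_name="demo", df=pd.DataFrame({"f": [1, 2, 3]}))

# An explicit ResourceBundle lets the caller control the message strings
# that the validation errors in the hunks below are rendered with.
ds_custom = Dataset(
    dataset_name="demo",
    df=pd.DataFrame({"f": [1, 2, 3]}),
    bundle=get_custom_bundle(),  # new optional keyword in 1.1.245a1
)

# Error messages now resolve through the instance, e.g.:
msg = ds_custom.bundle.get("dataset_too_few_rows")  # key used in a hunk below
```

Per the files-changed list, `resource_bundle/__init__.py` (+14 -1) gains `get_custom_bundle`, and the large `features_enricher.py` change (+198 -185) is presumably the caller that passes a customized bundle through.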
@@ -95,13 +97,13 @@ class Dataset: # (pd.DataFrame):
             kwargs["sep"] = sep
             data = pd.read_csv(path, **kwargs)
         else:
-            raise ValueError(bundle.get("dataset_dataframe_or_path_empty"))
+            raise ValueError(self.bundle.get("dataset_dataframe_or_path_empty"))
         if isinstance(data, pd.DataFrame):
             self.data = data
         elif isinstance(data, pd.io.parsers.TextFileReader):  # type: ignore
-            raise ValueError(bundle.get("dataset_dataframe_iterator"))
+            raise ValueError(self.bundle.get("dataset_dataframe_iterator"))
         else:
-            raise ValueError(bundle.get("dataset_dataframe_not_pandas"))
+            raise ValueError(self.bundle.get("dataset_dataframe_not_pandas"))
 
         self.dataset_name = dataset_name
         self.task_type = model_task_type
@@ -134,14 +136,14 @@ class Dataset: # (pd.DataFrame):
     @property
     def meaning_types_checked(self) -> Dict[str, FileColumnMeaningType]:
         if self.meaning_types is None:
-            raise ValueError(bundle.get("dataset_empty_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_empty_meaning_types"))
         else:
             return self.meaning_types
 
     @property
     def search_keys_checked(self) -> List[Tuple[str, ...]]:
         if self.search_keys is None:
-            raise ValueError(bundle.get("dataset_empty_search_keys"))
+            raise ValueError(self.bundle.get("dataset_empty_search_keys"))
         else:
             return self.search_keys
 
@@ -156,11 +158,11 @@ class Dataset: # (pd.DataFrame):
 
     def __validate_min_rows_count(self):
         if len(self.data) < self.MIN_ROWS_COUNT:
-            raise ValidationError(bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
+            raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
 
     def __validate_max_row_count(self):
         if len(self.data) > self.MAX_ROWS:
-            raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
+            raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
 
     def __rename_columns(self):
         # self.logger.info("Replace restricted symbols in column names")
@@ -175,7 +177,7 @@ class Dataset: # (pd.DataFrame):
             new_column = str(column)
             suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
             if len(new_column) == 0:
-                raise ValidationError(bundle.get("dataset_empty_column_names"))
+                raise ValidationError(self.bundle.get("dataset_empty_column_names"))
             # db limit for column length
             if len(new_column) > 250:
                 new_column = new_column[:250]
@@ -235,7 +237,7 @@ class Dataset: # (pd.DataFrame):
         nrows_after_full_dedup = len(self.data)
         share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
         if share_full_dedup > 0:
-            msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+            msg = self.bundle.get("dataset_full_duplicates").format(share_full_dedup)
             self.logger.warning(msg)
             # if not silent_mode:
             #     print(msg)
@@ -250,7 +252,9 @@ class Dataset: # (pd.DataFrame):
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
 
-            msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
+            msg = self.bundle.get("dataset_diff_target_duplicates").format(
+                share_tgt_dedup, num_dup_rows, dups_indices
+            )
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -342,7 +346,7 @@ class Dataset: # (pd.DataFrame):
 
         self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
         if self.data[ip].isnull().all():
-            raise ValidationError(bundle.get("invalid_ip").format(ip))
+            raise ValidationError(self.bundle.get("invalid_ip").format(ip))
 
         if self.data[ip].apply(self._is_ipv4).any():
             ipv4 = ip + "_v4"
@@ -379,7 +383,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace("UK", "GB", regex=False)
             )
             if (self.data[iso_code] == "").all():
-                raise ValidationError(bundle.get("invalid_country").format(iso_code))
+                raise ValidationError(self.bundle.get("invalid_country").format(iso_code))
 
     def __normalize_postal_code(self):
         postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
@@ -402,7 +406,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace(r"^0+\B", "", regex=True)  # remove leading zeros
             )
             if (self.data[postal_code] == "").all():
-                raise ValidationError(bundle.get("invalid_postal_code").format(postal_code))
+                raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))
 
     def __normalize_hem(self):
         hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
@@ -420,9 +424,9 @@ class Dataset: # (pd.DataFrame):
             self.data.drop(index=old_subset.index, inplace=True)  # type: ignore
             self.logger.info(f"df after dropping old rows: {self.data.shape}")
             if len(self.data) == 0:
-                raise ValidationError(bundle.get("dataset_all_dates_old"))
+                raise ValidationError(self.bundle.get("dataset_all_dates_old"))
             else:
-                msg = bundle.get("dataset_drop_old_dates")
+                msg = self.bundle.get("dataset_drop_old_dates")
                 self.logger.warning(msg)
                 if not silent_mode:
                     print(msg)
@@ -458,10 +462,10 @@ class Dataset: # (pd.DataFrame):
                 target = target.astype("category").cat.codes
             except ValueError:
                 self.logger.exception("Failed to cast target to category codes for binary task type")
-                raise ValidationError(bundle.get("dataset_invalid_target_type").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_target_type").format(target.dtype))
             target_classes_count = target.nunique()
             if target_classes_count != 2:
-                msg = bundle.get("dataset_invalid_binary_target").format(target_classes_count)
+                msg = self.bundle.get("dataset_invalid_binary_target").format(target_classes_count)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
         elif self.task_type == ModelTaskType.MULTICLASS:
@@ -470,21 +474,21 @@ class Dataset: # (pd.DataFrame):
                 target = self.data[target_column].astype("category").cat.codes
             except Exception:
                 self.logger.exception("Failed to cast target to category codes for multiclass task type")
-                raise ValidationError(bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
         elif self.task_type == ModelTaskType.REGRESSION:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for regression task type")
-                    raise ValidationError(bundle.get("dataset_invalid_regression_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
         elif self.task_type == ModelTaskType.TIMESERIES:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for timeseries task type")
-                    raise ValidationError(bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
 
     def __resample(self):
         # self.logger.info("Resampling etalon")
@@ -505,7 +509,7 @@ class Dataset: # (pd.DataFrame):
             target_classes_count = target.nunique()
 
             if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-                msg = bundle.get("dataset_to_many_multiclass_targets").format(
+                msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
                     target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
                 )
                 self.logger.warning(msg)
@@ -519,7 +523,7 @@ class Dataset: # (pd.DataFrame):
                     min_class_value = v
 
             if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-                msg = bundle.get("dataset_rarest_class_less_min").format(
+                msg = self.bundle.get("dataset_rarest_class_less_min").format(
                     min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
                 )
                 self.logger.warning(msg)
@@ -529,7 +533,7 @@ class Dataset: # (pd.DataFrame):
             min_class_threshold = min_class_percent * count
 
             if min_class_count < min_class_threshold:
-                msg = bundle.get("dataset_rarest_class_less_threshold").format(
+                msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
                     min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
                 )
                 self.logger.warning(msg)
@@ -543,7 +547,7 @@ class Dataset: # (pd.DataFrame):
             quantile25_idx = int(0.75 * len(classes))
             quantile25_class = classes[quantile25_idx]
             count_of_quantile25_class = len(target[target == quantile25_class])
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+            msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
             self.logger.warning(msg)
             print(msg)
             # 25% and lower classes will stay as is. Higher classes will be downsampled
@@ -621,7 +625,7 @@ class Dataset: # (pd.DataFrame):
                 del self.meaning_types_checked[f]
 
         if removed_features:
-            msg = bundle.get("dataset_date_features").format(removed_features)
+            msg = self.bundle.get("dataset_date_features").format(removed_features)
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -629,7 +633,7 @@ class Dataset: # (pd.DataFrame):
 
     def __validate_features_count(self):
         if len(self.__features()) > self.MAX_FEATURES_COUNT:
-            msg = bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
+            msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
             self.logger.warning(msg)
             raise ValidationError(msg)
 
@@ -646,14 +650,14 @@ class Dataset: # (pd.DataFrame):
         target = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value)
         if validate_target:
             if target is None:
-                raise ValidationError(bundle.get("dataset_missing_target"))
+                raise ValidationError(self.bundle.get("dataset_missing_target"))
 
             target_value = self.__target_value()
             target_items = target_value.nunique()
             if target_items == 1:
-                raise ValidationError(bundle.get("dataset_constant_target"))
+                raise ValidationError(self.bundle.get("dataset_constant_target"))
             elif target_items == 0:
-                raise ValidationError(bundle.get("dataset_empty_target"))
+                raise ValidationError(self.bundle.get("dataset_empty_target"))
 
         # if self.task_type != ModelTaskType.MULTICLASS:
         #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
@@ -682,11 +686,11 @@ class Dataset: # (pd.DataFrame):
         self.data["valid_keys"] = 0
         self.data["valid_mandatory"] = True
 
-        all_valid_status = bundle.get("validation_all_valid_status")
-        some_invalid_status = bundle.get("validation_some_invalid_status")
-        all_invalid_status = bundle.get("validation_all_invalid_status")
-        all_valid_message = bundle.get("validation_all_valid_message")
-        invalid_message = bundle.get("validation_invalid_message")
+        all_valid_status = self.bundle.get("validation_all_valid_status")
+        some_invalid_status = self.bundle.get("validation_some_invalid_status")
+        all_invalid_status = self.bundle.get("validation_all_invalid_status")
+        all_valid_message = self.bundle.get("validation_all_valid_message")
+        invalid_message = self.bundle.get("validation_invalid_message")
 
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
@@ -727,9 +731,9 @@ class Dataset: # (pd.DataFrame):
         if not silent_mode:
             df_stats = pd.DataFrame.from_dict(validation_stats, orient="index")
             df_stats.reset_index(inplace=True)
-            name_header = bundle.get("validation_column_name_header")
-            status_header = bundle.get("validation_status_header")
-            description_header = bundle.get("validation_descr_header")
+            name_header = self.bundle.get("validation_column_name_header")
+            status_header = self.bundle.get("validation_status_header")
+            description_header = self.bundle.get("validation_descr_header")
             df_stats.columns = [name_header, status_header, description_header]
             try:
                 import html
@@ -738,11 +742,11 @@ class Dataset: # (pd.DataFrame):
 
                 _ = get_ipython()  # type: ignore
 
-                text_color = bundle.get("validation_text_color")
+                text_color = self.bundle.get("validation_text_color")
                 colormap = {
-                    all_valid_status: bundle.get("validation_all_valid_color"),
-                    some_invalid_status: bundle.get("validation_some_invalid_color"),
-                    all_invalid_status: bundle.get("validation_all_invalid_color"),
+                    all_valid_status: self.bundle.get("validation_all_valid_color"),
+                    some_invalid_status: self.bundle.get("validation_some_invalid_color"),
+                    all_invalid_status: self.bundle.get("validation_all_invalid_color"),
                 }
 
                 def map_color(text) -> str:
@@ -766,31 +770,33 @@ class Dataset: # (pd.DataFrame):
             print(df_stats)
 
         if len(self.data) == 0:
-            raise ValidationError(bundle.get("all_search_keys_invalid"))
+            raise ValidationError(self.bundle.get("all_search_keys_invalid"))
 
     def __validate_meaning_types(self, validate_target: bool):
         # self.logger.info("Validating meaning types")
         if self.meaning_types is None or len(self.meaning_types) == 0:
-            raise ValueError(bundle.get("dataset_missing_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_missing_meaning_types"))
 
         if SYSTEM_RECORD_ID not in self.data.columns:
            raise ValueError("Internal error")
 
        for column in self.meaning_types:
            if column not in self.data.columns:
-                raise ValueError(bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
+                raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
        if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
-            raise ValueError(bundle.get("dataset_missing_target"))
+            raise ValueError(self.bundle.get("dataset_missing_target"))
 
     def __validate_search_keys(self):
         # self.logger.info("Validating search keys")
         if self.search_keys is None or len(self.search_keys) == 0:
-            raise ValueError(bundle.get("dataset_missing_search_keys"))
+            raise ValueError(self.bundle.get("dataset_missing_search_keys"))
         for keys_group in self.search_keys:
             for key in keys_group:
                 if key not in self.data.columns:
                     showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
-                    raise ValidationError(bundle.get("dataset_missing_search_key_column").format(key, showing_columns))
+                    raise ValidationError(
+                        self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
+                    )
 
     def validate(self, validate_target: bool = True, silent_mode: bool = False):
         # self.logger.info("Validating dataset")
@@ -895,7 +901,7 @@ class Dataset: # (pd.DataFrame):
         elif is_string_dtype(pandas_data_type):
             return DataType.STRING
         else:
-            msg = bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
+            msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
             self.logger.warning(msg)
             raise ValidationError(msg)
 
@@ -926,7 +932,7 @@ class Dataset: # (pd.DataFrame):
             for key in filter_features
             if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
         ]:
-            raise ValidationError(bundle.get("dataset_invalid_filter"))
+            raise ValidationError(self.bundle.get("dataset_invalid_filter"))
         feature_filter = FeaturesFilter(
             minImportance=filter_features.get("min_importance"),
             maxPSI=filter_features.get("max_psi"),
@@ -1017,7 +1023,7 @@ class Dataset: # (pd.DataFrame):
             trace_id, parquet_file_path, file_metadata, file_metrics, search_customization
         )
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.MATCHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.MATCHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.MATCHING))
         self.file_upload_id = search_task_response.file_upload_id
@@ -1088,7 +1094,7 @@ class Dataset: # (pd.DataFrame):
         )
         self.file_upload_id = search_task_response.file_upload_id
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.ENRICHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.ENRICHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.ENRICHING))
 
@@ -1108,5 +1114,5 @@ class Dataset: # (pd.DataFrame):
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
-            raise ValidationError(bundle.get("dataset_too_big_file"))
+            raise ValidationError(self.bundle.get("dataset_too_big_file"))
         return parquet_file_path