upgini 1.1.244a25__tar.gz → 1.1.245a1__tar.gz
This diff shows the changes between two publicly released versions of the package, as they appear in the supported public registries. It is provided for informational purposes only.
Potentially problematic release: this version of upgini might be problematic.
- {upgini-1.1.244a25/src/upgini.egg-info → upgini-1.1.245a1}/PKG-INFO +7 -7
- {upgini-1.1.244a25 → upgini-1.1.245a1}/README.md +6 -6
- {upgini-1.1.244a25 → upgini-1.1.245a1}/setup.py +1 -1
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/dataset.py +59 -53
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/features_enricher.py +198 -185
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/metrics.py +1 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/__init__.py +14 -1
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/target_utils.py +1 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1/src/upgini.egg-info}/PKG-INFO +7 -7
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_features_enricher.py +14 -14
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_metrics.py +24 -30
- {upgini-1.1.244a25 → upgini-1.1.245a1}/LICENSE +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/pyproject.toml +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/setup.cfg +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/errors.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/http.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_country_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_email_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_target_utils.py +0 -0
- {upgini-1.1.244a25 → upgini-1.1.245a1}/tests/test_widget.py +0 -0
PKG-INFO:

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.244a25
+Version: 1.1.245a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -32,8 +32,8 @@ License-File: LICENSE
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -57,7 +57,7 @@ License-File: LICENSE
 [](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview

-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).

 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.

@@ -65,9 +65,9 @@ License-File: LICENSE

 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️
-⭐️
-⭐️ Calculate
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
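The last bullet's claim — one request carrying all of the keys at once — maps onto the library's `FeaturesEnricher` entry point. A minimal sketch against the public API; the CSV file and column names are hypothetical placeholders:

```python
# Hypothetical example of enriching with several search key types in one request.
import pandas as pd
from upgini import FeaturesEnricher, SearchKey

train = pd.read_csv("train.csv")  # hypothetical dataset
X, y = train.drop(columns=["target"]), train["target"]

enricher = FeaturesEnricher(
    search_keys={                      # all keys passed in a single request
        "reg_date": SearchKey.DATE,
        "phone": SearchKey.PHONE,
        "country": SearchKey.COUNTRY,
        "zip_code": SearchKey.POSTAL_CODE,
    }
)
enricher.fit(X, y)                  # search for relevant external features
X_enriched = enricher.transform(X)  # append the selected features
```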
README.md:

@@ -2,8 +2,8 @@
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -27,7 +27,7 @@
 [](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview

-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).

 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.

@@ -35,9 +35,9 @@

 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️
-⭐️
-⭐️ Calculate
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
src/upgini/dataset.py:

@@ -38,7 +38,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
-from upgini.resource_bundle import bundle
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
@@ -81,8 +81,10 @@ class Dataset:  # (pd.DataFrame):
         rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
         warning_counter: Optional[WarningCounter] = None,
+        bundle: Optional[ResourceBundle] = None,
         **kwargs,
     ):
+        self.bundle = bundle or get_custom_bundle()
         if df is not None:
             data = df.copy()
         elif path is not None:
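The two added lines above replace the module-level `bundle` singleton with an injectable `ResourceBundle`: callers may pass their own bundle, and `get_custom_bundle()` supplies the default. A simplified sketch of the same injection pattern — `Bundle` and the message text below are illustrative stand-ins, not upgini classes:

```python
from typing import Dict, Optional


class Bundle:
    """Illustrative stand-in for upgini's ResourceBundle."""

    def __init__(self, strings: Dict[str, str]):
        self._strings = strings

    def get(self, key: str) -> str:
        return self._strings[key]


DEFAULT = Bundle({"dataset_missing_target": "Target column is missing"})


class Dataset:
    def __init__(self, bundle: Optional[Bundle] = None):
        # Mirror of `self.bundle = bundle or get_custom_bundle()`:
        # fall back to the default bundle when none is injected.
        self.bundle = bundle or DEFAULT

    def validate(self, has_target: bool) -> None:
        if not has_target:
            raise ValueError(self.bundle.get("dataset_missing_target"))


# Callers can now swap message texts per instance (e.g. for localization)
# without mutating global state:
spanish = Bundle({"dataset_missing_target": "Falta la columna objetivo"})
Dataset(bundle=spanish)
```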
@@ -95,13 +97,13 @@ class Dataset:  # (pd.DataFrame):
             kwargs["sep"] = sep
             data = pd.read_csv(path, **kwargs)
         else:
-            raise ValueError(bundle.get("dataset_dataframe_or_path_empty"))
+            raise ValueError(self.bundle.get("dataset_dataframe_or_path_empty"))
         if isinstance(data, pd.DataFrame):
             self.data = data
         elif isinstance(data, pd.io.parsers.TextFileReader):  # type: ignore
-            raise ValueError(bundle.get("dataset_dataframe_iterator"))
+            raise ValueError(self.bundle.get("dataset_dataframe_iterator"))
         else:
-            raise ValueError(bundle.get("dataset_dataframe_not_pandas"))
+            raise ValueError(self.bundle.get("dataset_dataframe_not_pandas"))

         self.dataset_name = dataset_name
         self.task_type = model_task_type
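Every key in these hunks (`dataset_dataframe_or_path_empty`, `dataset_dataframe_iterator`, and so on) resolves to a template in `src/upgini/resource_bundle/strings.properties`, which is unchanged in this release. A rough sketch of how a `.properties`-style file can back those `get()` calls — the parser and the sample lookup are illustrative, not upgini's actual loader:

```python
from pathlib import Path
from typing import Dict


def parse_properties(path: Path) -> Dict[str, str]:
    """Parse simple `key=value` lines, skipping blanks and # comments."""
    strings: Dict[str, str] = {}
    for line in path.read_text(encoding="utf-8").splitlines():
        line = line.strip()
        if line and not line.startswith("#") and "=" in line:
            key, _, value = line.partition("=")
            strings[key.strip()] = value.strip()
    return strings


# Values are str.format templates, filled at the call site, e.g.
# self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT)
strings = parse_properties(Path("src/upgini/resource_bundle/strings.properties"))
print(strings["dataset_too_few_rows"].format(100))
```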
@@ -134,14 +136,14 @@ class Dataset:  # (pd.DataFrame):
     @property
     def meaning_types_checked(self) -> Dict[str, FileColumnMeaningType]:
         if self.meaning_types is None:
-            raise ValueError(bundle.get("dataset_empty_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_empty_meaning_types"))
         else:
             return self.meaning_types

     @property
     def search_keys_checked(self) -> List[Tuple[str, ...]]:
         if self.search_keys is None:
-            raise ValueError(bundle.get("dataset_empty_search_keys"))
+            raise ValueError(self.bundle.get("dataset_empty_search_keys"))
         else:
             return self.search_keys

@@ -156,11 +158,11 @@ class Dataset:  # (pd.DataFrame):

     def __validate_min_rows_count(self):
         if len(self.data) < self.MIN_ROWS_COUNT:
-            raise ValidationError(bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
+            raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))

     def __validate_max_row_count(self):
         if len(self.data) > self.MAX_ROWS:
-            raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
+            raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))

     def __rename_columns(self):
         # self.logger.info("Replace restricted symbols in column names")
@@ -175,7 +177,7 @@ class Dataset:  # (pd.DataFrame):
             new_column = str(column)
             suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
             if len(new_column) == 0:
-                raise ValidationError(bundle.get("dataset_empty_column_names"))
+                raise ValidationError(self.bundle.get("dataset_empty_column_names"))
             # db limit for column length
             if len(new_column) > 250:
                 new_column = new_column[:250]
@@ -235,7 +237,7 @@ class Dataset:  # (pd.DataFrame):
         nrows_after_full_dedup = len(self.data)
         share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
         if share_full_dedup > 0:
-            msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+            msg = self.bundle.get("dataset_full_duplicates").format(share_full_dedup)
             self.logger.warning(msg)
             # if not silent_mode:
             #     print(msg)
@@ -250,7 +252,9 @@ class Dataset:  # (pd.DataFrame):
         num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
         share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup

-        msg = bundle.get("dataset_diff_target_duplicates").format(share_tgt_dedup, num_dup_rows, dups_indices)
+        msg = self.bundle.get("dataset_diff_target_duplicates").format(
+            share_tgt_dedup, num_dup_rows, dups_indices
+        )
         self.logger.warning(msg)
         if not silent_mode:
             print(msg)
@@ -342,7 +346,7 @@ class Dataset:  # (pd.DataFrame):

         self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
         if self.data[ip].isnull().all():
-            raise ValidationError(bundle.get("invalid_ip").format(ip))
+            raise ValidationError(self.bundle.get("invalid_ip").format(ip))

         if self.data[ip].apply(self._is_ipv4).any():
             ipv4 = ip + "_v4"
@@ -379,7 +383,7 @@ class Dataset:  # (pd.DataFrame):
             .str.replace("UK", "GB", regex=False)
         )
         if (self.data[iso_code] == "").all():
-            raise ValidationError(bundle.get("invalid_country").format(iso_code))
+            raise ValidationError(self.bundle.get("invalid_country").format(iso_code))

     def __normalize_postal_code(self):
         postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
@@ -402,7 +406,7 @@ class Dataset:  # (pd.DataFrame):
             .str.replace(r"^0+\B", "", regex=True)  # remove leading zeros
         )
         if (self.data[postal_code] == "").all():
-            raise ValidationError(bundle.get("invalid_postal_code").format(postal_code))
+            raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))

     def __normalize_hem(self):
         hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
@@ -420,9 +424,9 @@ class Dataset:  # (pd.DataFrame):
         self.data.drop(index=old_subset.index, inplace=True)  # type: ignore
         self.logger.info(f"df after dropping old rows: {self.data.shape}")
         if len(self.data) == 0:
-            raise ValidationError(bundle.get("dataset_all_dates_old"))
+            raise ValidationError(self.bundle.get("dataset_all_dates_old"))
         else:
-            msg = bundle.get("dataset_drop_old_dates")
+            msg = self.bundle.get("dataset_drop_old_dates")
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -458,10 +462,10 @@ class Dataset:  # (pd.DataFrame):
                 target = target.astype("category").cat.codes
             except ValueError:
                 self.logger.exception("Failed to cast target to category codes for binary task type")
-                raise ValidationError(bundle.get("dataset_invalid_target_type").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_target_type").format(target.dtype))
             target_classes_count = target.nunique()
             if target_classes_count != 2:
-                msg = bundle.get("dataset_invalid_binary_target").format(target_classes_count)
+                msg = self.bundle.get("dataset_invalid_binary_target").format(target_classes_count)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
         elif self.task_type == ModelTaskType.MULTICLASS:
@@ -470,21 +474,21 @@ class Dataset:  # (pd.DataFrame):
                 target = self.data[target_column].astype("category").cat.codes
             except Exception:
                 self.logger.exception("Failed to cast target to category codes for multiclass task type")
-                raise ValidationError(bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
+                raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
         elif self.task_type == ModelTaskType.REGRESSION:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for regression task type")
-                    raise ValidationError(bundle.get("dataset_invalid_regression_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
         elif self.task_type == ModelTaskType.TIMESERIES:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for timeseries task type")
-                    raise ValidationError(bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))

     def __resample(self):
         # self.logger.info("Resampling etalon")
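The validation above normalizes the target per task type: binary and multiclass targets become integer category codes, regression and timeseries targets are cast to float, and a failed cast is surfaced as a task-specific bundle message. A small self-contained illustration of those pandas casts:

```python
import pandas as pd

# Classification targets: strings become stable integer codes
# (categories are sorted, so bird=0, cat=1, dog=2 here).
target = pd.Series(["cat", "dog", "cat", "bird"])
print(target.astype("category").cat.codes.tolist())  # [1, 2, 1, 0]

# Regression/timeseries targets: numeric strings cast cleanly to float...
print(pd.Series(["1.5", "2", "3.25"]).astype("float").dtype)  # float64

# ...while non-numeric values raise ValueError, the case the code above
# converts into a ValidationError with a bundle message.
try:
    pd.Series(["1.5", "oops"]).astype("float")
except ValueError as e:
    print(f"cast failed: {e}")
```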
@@ -505,7 +509,7 @@ class Dataset:  # (pd.DataFrame):
         target_classes_count = target.nunique()

         if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-            msg = bundle.get("dataset_to_many_multiclass_targets").format(
+            msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
                 target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
             )
             self.logger.warning(msg)
@@ -519,7 +523,7 @@ class Dataset:  # (pd.DataFrame):
                 min_class_value = v

         if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-            msg = bundle.get("dataset_rarest_class_less_min").format(
+            msg = self.bundle.get("dataset_rarest_class_less_min").format(
                 min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
             )
             self.logger.warning(msg)
@@ -529,7 +533,7 @@ class Dataset:  # (pd.DataFrame):
         min_class_threshold = min_class_percent * count

         if min_class_count < min_class_threshold:
-            msg = bundle.get("dataset_rarest_class_less_threshold").format(
+            msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
                 min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
             )
             self.logger.warning(msg)
@@ -543,7 +547,7 @@ class Dataset:  # (pd.DataFrame):
             quantile25_idx = int(0.75 * len(classes))
             quantile25_class = classes[quantile25_idx]
             count_of_quantile25_class = len(target[target == quantile25_class])
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+            msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
             self.logger.warning(msg)
             print(msg)
             # 25% and lower classes will stay as is. Higher classes will be downsampled
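The imbalance branch above indexes `classes[int(0.75 * len(classes))]`. Assuming `classes` is ordered by descending frequency (the ordering is established outside this hunk), that picks the class at the 25th percentile by count; per the trailing comment, rarer classes are kept as is while more frequent ones are downsampled. A toy illustration under that assumption:

```python
import pandas as pd

target = pd.Series(["a"] * 50 + ["b"] * 20 + ["c"] * 10 + ["d"] * 5)

# value_counts() sorts by descending frequency: ['a', 'b', 'c', 'd']
classes = target.value_counts().index.to_list()

quantile25_idx = int(0.75 * len(classes))             # 3
quantile25_class = classes[quantile25_idx]            # 'd'
count_of_quantile25_class = int((target == quantile25_class).sum())  # 5

# Classes 'a', 'b', 'c' would be downsampled toward this count.
print(quantile25_class, count_of_quantile25_class)
```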
@@ -621,7 +625,7 @@ class Dataset:  # (pd.DataFrame):
                 del self.meaning_types_checked[f]

         if removed_features:
-            msg = bundle.get("dataset_date_features").format(removed_features)
+            msg = self.bundle.get("dataset_date_features").format(removed_features)
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -629,7 +633,7 @@ class Dataset:  # (pd.DataFrame):

     def __validate_features_count(self):
         if len(self.__features()) > self.MAX_FEATURES_COUNT:
-            msg = bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
+            msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
             self.logger.warning(msg)
             raise ValidationError(msg)

@@ -646,14 +650,14 @@ class Dataset:  # (pd.DataFrame):
         target = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value)
         if validate_target:
             if target is None:
-                raise ValidationError(bundle.get("dataset_missing_target"))
+                raise ValidationError(self.bundle.get("dataset_missing_target"))

             target_value = self.__target_value()
             target_items = target_value.nunique()
             if target_items == 1:
-                raise ValidationError(bundle.get("dataset_constant_target"))
+                raise ValidationError(self.bundle.get("dataset_constant_target"))
             elif target_items == 0:
-                raise ValidationError(bundle.get("dataset_empty_target"))
+                raise ValidationError(self.bundle.get("dataset_empty_target"))

             # if self.task_type != ModelTaskType.MULTICLASS:
             #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
@@ -682,11 +686,11 @@ class Dataset:  # (pd.DataFrame):
         self.data["valid_keys"] = 0
         self.data["valid_mandatory"] = True

-        all_valid_status = bundle.get("validation_all_valid_status")
-        some_invalid_status = bundle.get("validation_some_invalid_status")
-        all_invalid_status = bundle.get("validation_all_invalid_status")
-        all_valid_message = bundle.get("validation_all_valid_message")
-        invalid_message = bundle.get("validation_invalid_message")
+        all_valid_status = self.bundle.get("validation_all_valid_status")
+        some_invalid_status = self.bundle.get("validation_some_invalid_status")
+        all_invalid_status = self.bundle.get("validation_all_invalid_status")
+        all_valid_message = self.bundle.get("validation_all_valid_message")
+        invalid_message = self.bundle.get("validation_invalid_message")

         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
@@ -727,9 +731,9 @@ class Dataset:  # (pd.DataFrame):
         if not silent_mode:
             df_stats = pd.DataFrame.from_dict(validation_stats, orient="index")
             df_stats.reset_index(inplace=True)
-            name_header = bundle.get("validation_column_name_header")
-            status_header = bundle.get("validation_status_header")
-            description_header = bundle.get("validation_descr_header")
+            name_header = self.bundle.get("validation_column_name_header")
+            status_header = self.bundle.get("validation_status_header")
+            description_header = self.bundle.get("validation_descr_header")
             df_stats.columns = [name_header, status_header, description_header]
             try:
                 import html
@@ -738,11 +742,11 @@ class Dataset:  # (pd.DataFrame):

                 _ = get_ipython()  # type: ignore

-                text_color = bundle.get("validation_text_color")
+                text_color = self.bundle.get("validation_text_color")
                 colormap = {
-                    all_valid_status: bundle.get("validation_all_valid_color"),
-                    some_invalid_status: bundle.get("validation_some_invalid_color"),
-                    all_invalid_status: bundle.get("validation_all_invalid_color"),
+                    all_valid_status: self.bundle.get("validation_all_valid_color"),
+                    some_invalid_status: self.bundle.get("validation_some_invalid_color"),
+                    all_invalid_status: self.bundle.get("validation_all_invalid_color"),
                 }

                 def map_color(text) -> str:
@@ -766,31 +770,33 @@ class Dataset:  # (pd.DataFrame):
                 print(df_stats)

         if len(self.data) == 0:
-            raise ValidationError(bundle.get("all_search_keys_invalid"))
+            raise ValidationError(self.bundle.get("all_search_keys_invalid"))

     def __validate_meaning_types(self, validate_target: bool):
         # self.logger.info("Validating meaning types")
         if self.meaning_types is None or len(self.meaning_types) == 0:
-            raise ValueError(bundle.get("dataset_missing_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_missing_meaning_types"))

         if SYSTEM_RECORD_ID not in self.data.columns:
             raise ValueError("Internal error")

         for column in self.meaning_types:
             if column not in self.data.columns:
-                raise ValueError(bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
+                raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
         if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
-            raise ValueError(bundle.get("dataset_missing_target"))
+            raise ValueError(self.bundle.get("dataset_missing_target"))

     def __validate_search_keys(self):
         # self.logger.info("Validating search keys")
         if self.search_keys is None or len(self.search_keys) == 0:
-            raise ValueError(bundle.get("dataset_missing_search_keys"))
+            raise ValueError(self.bundle.get("dataset_missing_search_keys"))
         for keys_group in self.search_keys:
             for key in keys_group:
                 if key not in self.data.columns:
                     showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
-                    raise ValidationError(bundle.get("dataset_missing_search_key_column").format(key, showing_columns))
+                    raise ValidationError(
+                        self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
+                    )

     def validate(self, validate_target: bool = True, silent_mode: bool = False):
         # self.logger.info("Validating dataset")
@@ -895,7 +901,7 @@ class Dataset:  # (pd.DataFrame):
         elif is_string_dtype(pandas_data_type):
             return DataType.STRING
         else:
-            msg = bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
+            msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
             self.logger.warning(msg)
             raise ValidationError(msg)

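This hunk is the tail of a dtype dispatch that maps pandas dtypes onto the search service's column types, rejecting unsupported dtypes through a bundle message. A minimal sketch of that dispatch using real pandas type-checking helpers — the `DataType` members other than `STRING` are simplified assumptions here:

```python
from enum import Enum

import pandas as pd
from pandas.api.types import is_float_dtype, is_integer_dtype, is_string_dtype


class DataType(Enum):
    INT = "INT"          # assumed member
    DECIMAL = "DECIMAL"  # assumed member
    STRING = "STRING"    # shown in the hunk above


def to_data_type(pandas_data_type) -> DataType:
    if is_integer_dtype(pandas_data_type):
        return DataType.INT
    if is_float_dtype(pandas_data_type):
        return DataType.DECIMAL
    if is_string_dtype(pandas_data_type):
        return DataType.STRING
    raise ValueError(f"Unsupported column type: {pandas_data_type}")


print(to_data_type(pd.Series(["a", "b"]).dtype))  # DataType.STRING
```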
@@ -926,7 +932,7 @@ class Dataset:  # (pd.DataFrame):
             for key in filter_features
             if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
         ]:
-            raise ValidationError(bundle.get("dataset_invalid_filter"))
+            raise ValidationError(self.bundle.get("dataset_invalid_filter"))
         feature_filter = FeaturesFilter(
             minImportance=filter_features.get("min_importance"),
             maxPSI=filter_features.get("max_psi"),
@@ -1017,7 +1023,7 @@ class Dataset:  # (pd.DataFrame):
             trace_id, parquet_file_path, file_metadata, file_metrics, search_customization
         )
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.MATCHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.MATCHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.MATCHING))
         self.file_upload_id = search_task_response.file_upload_id
@@ -1088,7 +1094,7 @@ class Dataset:  # (pd.DataFrame):
         )
         self.file_upload_id = search_task_response.file_upload_id
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.ENRICHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.ENRICHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.ENRICHING))

@@ -1108,5 +1114,5 @@ class Dataset:  # (pd.DataFrame):
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
-            raise ValidationError(bundle.get("dataset_too_big_file"))
+            raise ValidationError(self.bundle.get("dataset_too_big_file"))
         return parquet_file_path