upgini 1.1.244a24__tar.gz → 1.1.245a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release. This version of upgini might be problematic.
- {upgini-1.1.244a24/src/upgini.egg-info → upgini-1.1.245a1}/PKG-INFO +7 -7
- {upgini-1.1.244a24 → upgini-1.1.245a1}/README.md +6 -6
- {upgini-1.1.244a24 → upgini-1.1.245a1}/setup.py +1 -1
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/dataset.py +67 -55
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/features_enricher.py +202 -186
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/metrics.py +1 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/resource_bundle/__init__.py +14 -1
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/target_utils.py +1 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1/src/upgini.egg-info}/PKG-INFO +7 -7
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_features_enricher.py +14 -14
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_metrics.py +24 -30
- {upgini-1.1.244a24 → upgini-1.1.245a1}/LICENSE +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/pyproject.toml +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/setup.cfg +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/ads.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/errors.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/http.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/metadata.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/search_task.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/spinner.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_country_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_email_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_target_utils.py +0 -0
- {upgini-1.1.244a24 → upgini-1.1.245a1}/tests/test_widget.py +0 -0

{upgini-1.1.244a24/src/upgini.egg-info → upgini-1.1.245a1}/PKG-INFO

```diff
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: upgini
-Version: 1.1.244a24
+Version: 1.1.245a1
 Summary: Intelligent data search & enrichment for Machine Learning
 Home-page: https://upgini.com/
 Author: Upgini Developers
@@ -32,8 +32,8 @@ License-File: LICENSE
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -57,7 +57,7 @@ License-File: LICENSE
 [](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview
 
-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
 
 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.
 
@@ -65,9 +65,9 @@ License-File: LICENSE
 
 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️
-⭐️
-⭐️ Calculate
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
```
{upgini-1.1.244a24 → upgini-1.1.245a1}/README.md

```diff
@@ -2,8 +2,8 @@
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : low-code feature search and enrichment library for machine learning </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> : Free automated data enrichment library for machine learning: </br>only the accuracy improving features in 2 minutes </h2> -->
 <!-- <h2 align="center"> <a href="https://upgini.com/">Upgini</a> • Free production-ready automated data enrichment library for machine learning</h2>-->
-<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning</a></h2>
-<p align="center"> <b>Easily find and add relevant features to your ML pipeline from</br> hundreds of public, community and premium external data sources, </br>
+<h2 align="center"> <a href="https://upgini.com/">Upgini • Intelligent data search & enrichment for Machine Learning and AI</a></h2>
+<p align="center"> <b>Easily find and add relevant features to your ML & AI pipeline from</br> hundreds of public, community and premium external data sources, </br>including open & commercial LLMs</b> </p>
 <p align="center">
 <br />
 <a href="https://colab.research.google.com/github/upgini/upgini/blob/main/notebooks/Upgini_Features_search%26generation.ipynb"><strong>Quick Start in Colab »</strong></a> |
@@ -27,7 +27,7 @@
 [](https://gitter.im/upgini/community?utm_source=badge&utm_medium=badge&utm_campaign=pr-badge) -->
 ## ❔ Overview
 
-**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features
+**Upgini** is an intelligent data search engine with a Python library that helps you find and add relevant features to your ML pipeline from hundreds of public, community, and premium external data sources. Under the hood, Upgini automatically optimizes all connected data sources by [generating an optimal set of machine ML features using large language models (LLMs), GraphNNs and recurrent neural networks (RNNs)](https://upgini.com/#optimized_external_data).
 
 **Motivation:** for most supervised ML models external data & features boost accuracy significantly better than any hyperparameters tuning. But lack of automated and time-efficient enrichment tools for external data blocks massive adoption of external features in ML pipelines. We want radically simplify features search and enrichment to make external data a standard approach. Like a hyperparameter tuning for machine learning nowadays.
 
@@ -35,9 +35,9 @@
 
 ## 🚀 Awesome features
 ⭐️ Automatically find only relevant features that *give accuracy improvement for ML model*. Not just correlated with target variable, what 9 out of 10 cases gives zero accuracy improvement
-⭐️
-⭐️
-⭐️ Calculate
+⭐️ Automated feature generation from the sources: feature generation with Large Language Models' data augmentation, RNNs, GraphNN; multiple data source ensembling
+⭐️ Automatic search key augmentation from all connected sources. If you do not have all search keys in your search request, such as postal/zip code, Upgini will try to add those keys based on the provided set of search keys. This will broaden the search across all available data sources
+⭐️ Calculate accuracy metrics and uplifts after enrichment existing ML model with external features
 ⭐️ Check the stability of accuracy gain from external data on out-of-time intervals and verification datasets. Mitigate risks of unstable external data dependencies in ML pipeline
 ⭐️ Easy to use - single request to enrich training dataset with [*all of the keys at once*](#-search-key-types-we-support-more-to-come):
 <table>
```
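For reference, the "single request to enrich" flow these README bullets describe looks roughly like the sketch below. The `FeaturesEnricher`/`SearchKey` names and import path follow Upgini's documented quick-start usage rather than this diff, and the file and column names are made up for illustration:

```python
# Minimal sketch of enrichment with several search keys at once.
# "train.csv", "rep_date", "country" and "zip" are illustrative names.
import pandas as pd
from upgini import FeaturesEnricher, SearchKey

train = pd.read_csv("train.csv")
X, y = train.drop(columns="target"), train["target"]

enricher = FeaturesEnricher(
    search_keys={
        "rep_date": SearchKey.DATE,
        "country": SearchKey.COUNTRY,
        "zip": SearchKey.POSTAL_CODE,
    }
)

# Searches connected sources, keeps only accuracy-improving features,
# and returns X with the selected external features appended.
enriched_X = enricher.fit_transform(X, y)
```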
{upgini-1.1.244a24 → upgini-1.1.245a1}/src/upgini/dataset.py

```diff
@@ -38,7 +38,7 @@ from upgini.metadata import (
     SearchCustomization,
 )
 from upgini.normalizer.phone_normalizer import PhoneNormalizer
-from upgini.resource_bundle import
+from upgini.resource_bundle import ResourceBundle, get_custom_bundle
 from upgini.sampler.random_under_sampler import RandomUnderSampler
 from upgini.search_task import SearchTask
 from upgini.utils import combine_search_keys
@@ -81,8 +81,10 @@ class Dataset: # (pd.DataFrame):
         rest_client: Optional[_RestClient] = None,
         logger: Optional[logging.Logger] = None,
         warning_counter: Optional[WarningCounter] = None,
+        bundle: Optional[ResourceBundle] = None,
         **kwargs,
     ):
+        self.bundle = bundle or get_custom_bundle()
         if df is not None:
             data = df.copy()
         elif path is not None:
```
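Taken together, the two hunks above change how `Dataset` resolves user-facing messages: an optional `ResourceBundle` can now be injected, and every module-level `bundle.get(...)` call in the rest of this diff becomes `self.bundle.get(...)`. A minimal sketch of the new keyword, assuming the constructor arguments not shown in this diff keep their defaults:

```python
# Sketch only: the `bundle` keyword added to Dataset in 1.1.245a1.
import pandas as pd
from upgini.dataset import Dataset
from upgini.resource_bundle import get_custom_bundle

df = pd.DataFrame({"phone": ["+14155550123"] * 200, "target": [0, 1] * 100})

# Passing the bundle explicitly is equivalent to the default:
# the constructor falls back to `bundle or get_custom_bundle()`.
dataset = Dataset(df=df, bundle=get_custom_bundle())

# Validation messages below now come from dataset.bundle.get("<message key>")
```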
```diff
@@ -95,13 +97,13 @@ class Dataset: # (pd.DataFrame):
             kwargs["sep"] = sep
             data = pd.read_csv(path, **kwargs)
         else:
-            raise ValueError(bundle.get("dataset_dataframe_or_path_empty"))
+            raise ValueError(self.bundle.get("dataset_dataframe_or_path_empty"))
         if isinstance(data, pd.DataFrame):
             self.data = data
         elif isinstance(data, pd.io.parsers.TextFileReader):  # type: ignore
-            raise ValueError(bundle.get("dataset_dataframe_iterator"))
+            raise ValueError(self.bundle.get("dataset_dataframe_iterator"))
         else:
-            raise ValueError(bundle.get("dataset_dataframe_not_pandas"))
+            raise ValueError(self.bundle.get("dataset_dataframe_not_pandas"))
 
         self.dataset_name = dataset_name
         self.task_type = model_task_type
@@ -134,14 +136,14 @@ class Dataset: # (pd.DataFrame):
     @property
     def meaning_types_checked(self) -> Dict[str, FileColumnMeaningType]:
         if self.meaning_types is None:
-            raise ValueError(bundle.get("dataset_empty_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_empty_meaning_types"))
         else:
             return self.meaning_types
 
     @property
     def search_keys_checked(self) -> List[Tuple[str, ...]]:
         if self.search_keys is None:
-            raise ValueError(bundle.get("dataset_empty_search_keys"))
+            raise ValueError(self.bundle.get("dataset_empty_search_keys"))
         else:
             return self.search_keys
 
@@ -156,11 +158,11 @@ class Dataset: # (pd.DataFrame):
 
     def __validate_min_rows_count(self):
         if len(self.data) < self.MIN_ROWS_COUNT:
-            raise ValidationError(bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
+            raise ValidationError(self.bundle.get("dataset_too_few_rows").format(self.MIN_ROWS_COUNT))
 
     def __validate_max_row_count(self):
         if len(self.data) > self.MAX_ROWS:
-            raise ValidationError(bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
+            raise ValidationError(self.bundle.get("dataset_too_many_rows_registered").format(self.MAX_ROWS))
 
     def __rename_columns(self):
         # self.logger.info("Replace restricted symbols in column names")
@@ -175,7 +177,7 @@ class Dataset: # (pd.DataFrame):
             new_column = str(column)
             suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
             if len(new_column) == 0:
-                raise ValidationError(bundle.get("dataset_empty_column_names"))
+                raise ValidationError(self.bundle.get("dataset_empty_column_names"))
             # db limit for column length
             if len(new_column) > 250:
                 new_column = new_column[:250]
@@ -235,7 +237,7 @@ class Dataset: # (pd.DataFrame):
         nrows_after_full_dedup = len(self.data)
         share_full_dedup = 100 * (1 - nrows_after_full_dedup / nrows)
         if share_full_dedup > 0:
-            msg = bundle.get("dataset_full_duplicates").format(share_full_dedup)
+            msg = self.bundle.get("dataset_full_duplicates").format(share_full_dedup)
             self.logger.warning(msg)
             # if not silent_mode:
             #     print(msg)
@@ -250,7 +252,9 @@ class Dataset: # (pd.DataFrame):
             num_dup_rows = nrows_after_full_dedup - nrows_after_tgt_dedup
             share_tgt_dedup = 100 * num_dup_rows / nrows_after_full_dedup
 
-            msg = bundle.get("dataset_diff_target_duplicates").format(
+            msg = self.bundle.get("dataset_diff_target_duplicates").format(
+                share_tgt_dedup, num_dup_rows, dups_indices
+            )
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -342,7 +346,7 @@ class Dataset: # (pd.DataFrame):
 
             self.data[ip] = self.data[ip].apply(self._safe_ip_parse)
             if self.data[ip].isnull().all():
-                raise ValidationError(bundle.get("invalid_ip").format(ip))
+                raise ValidationError(self.bundle.get("invalid_ip").format(ip))
 
             if self.data[ip].apply(self._is_ipv4).any():
                 ipv4 = ip + "_v4"
@@ -379,7 +383,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace("UK", "GB", regex=False)
             )
             if (self.data[iso_code] == "").all():
-                raise ValidationError(bundle.get("invalid_country").format(iso_code))
+                raise ValidationError(self.bundle.get("invalid_country").format(iso_code))
 
     def __normalize_postal_code(self):
         postal_code = self.etalon_def_checked.get(FileColumnMeaningType.POSTAL_CODE.value)
@@ -402,7 +406,7 @@ class Dataset: # (pd.DataFrame):
                 .str.replace(r"^0+\B", "", regex=True)  # remove leading zeros
             )
             if (self.data[postal_code] == "").all():
-                raise ValidationError(bundle.get("invalid_postal_code").format(postal_code))
+                raise ValidationError(self.bundle.get("invalid_postal_code").format(postal_code))
 
     def __normalize_hem(self):
         hem = self.etalon_def_checked.get(FileColumnMeaningType.HEM.value)
@@ -420,9 +424,9 @@ class Dataset: # (pd.DataFrame):
             self.data.drop(index=old_subset.index, inplace=True)  # type: ignore
             self.logger.info(f"df after dropping old rows: {self.data.shape}")
             if len(self.data) == 0:
-                raise ValidationError(bundle.get("dataset_all_dates_old"))
+                raise ValidationError(self.bundle.get("dataset_all_dates_old"))
             else:
-                msg = bundle.get("dataset_drop_old_dates")
+                msg = self.bundle.get("dataset_drop_old_dates")
                 self.logger.warning(msg)
                 if not silent_mode:
                     print(msg)
@@ -458,10 +462,10 @@ class Dataset: # (pd.DataFrame):
                     target = target.astype("category").cat.codes
                 except ValueError:
                     self.logger.exception("Failed to cast target to category codes for binary task type")
-                    raise ValidationError(bundle.get("dataset_invalid_target_type").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_target_type").format(target.dtype))
             target_classes_count = target.nunique()
             if target_classes_count != 2:
-                msg = bundle.get("dataset_invalid_binary_target").format(target_classes_count)
+                msg = self.bundle.get("dataset_invalid_binary_target").format(target_classes_count)
                 self.logger.warning(msg)
                 raise ValidationError(msg)
         elif self.task_type == ModelTaskType.MULTICLASS:
@@ -470,21 +474,21 @@ class Dataset: # (pd.DataFrame):
                     target = self.data[target_column].astype("category").cat.codes
                 except Exception:
                     self.logger.exception("Failed to cast target to category codes for multiclass task type")
-                    raise ValidationError(bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_multiclass_target").format(target.dtype))
         elif self.task_type == ModelTaskType.REGRESSION:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for regression task type")
-                    raise ValidationError(bundle.get("dataset_invalid_regression_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_regression_target").format(target.dtype))
         elif self.task_type == ModelTaskType.TIMESERIES:
             if not is_float_dtype(target):
                 try:
                     self.data[target_column] = self.data[target_column].astype("float")
                 except ValueError:
                     self.logger.exception("Failed to cast target to float for timeseries task type")
-                    raise ValidationError(bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
+                    raise ValidationError(self.bundle.get("dataset_invalid_timeseries_target").format(target.dtype))
 
     def __resample(self):
         # self.logger.info("Resampling etalon")
```
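The casts in the branches above are plain pandas operations; a standalone refresher on what they do (pandas only, not upgini code):

```python
import pandas as pd

# Binary/multiclass branches: labels become integer category codes.
labels = pd.Series(["no", "yes", "yes", "maybe"])
print(labels.astype("category").cat.codes.tolist())  # [1, 2, 2, 0], categories sorted alphabetically

# Regression/timeseries branches: values must cast cleanly to float;
# a failure raises ValueError, which the code above turns into a ValidationError.
pd.Series(["1.5", "2.0"]).astype("float")    # ok
# pd.Series(["1.5", "abc"]).astype("float")  # raises ValueError
```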
```diff
@@ -505,7 +509,7 @@ class Dataset: # (pd.DataFrame):
         target_classes_count = target.nunique()
 
         if target_classes_count > self.MAX_MULTICLASS_CLASS_COUNT:
-            msg = bundle.get("dataset_to_many_multiclass_targets").format(
+            msg = self.bundle.get("dataset_to_many_multiclass_targets").format(
                 target_classes_count, self.MAX_MULTICLASS_CLASS_COUNT
             )
             self.logger.warning(msg)
@@ -519,7 +523,7 @@ class Dataset: # (pd.DataFrame):
                 min_class_value = v
 
         if min_class_count < self.MIN_TARGET_CLASS_ROWS:
-            msg = bundle.get("dataset_rarest_class_less_min").format(
+            msg = self.bundle.get("dataset_rarest_class_less_min").format(
                 min_class_value, min_class_count, self.MIN_TARGET_CLASS_ROWS
             )
             self.logger.warning(msg)
@@ -529,7 +533,7 @@ class Dataset: # (pd.DataFrame):
         min_class_threshold = min_class_percent * count
 
         if min_class_count < min_class_threshold:
-            msg = bundle.get("dataset_rarest_class_less_threshold").format(
+            msg = self.bundle.get("dataset_rarest_class_less_threshold").format(
                 min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
             )
             self.logger.warning(msg)
@@ -543,7 +547,7 @@ class Dataset: # (pd.DataFrame):
             quantile25_idx = int(0.75 * len(classes))
             quantile25_class = classes[quantile25_idx]
             count_of_quantile25_class = len(target[target == quantile25_class])
-            msg = bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
+            msg = self.bundle.get("imbalance_multiclass").format(quantile25_class, count_of_quantile25_class)
             self.logger.warning(msg)
             print(msg)
             # 25% and lower classes will stay as is. Higher classes will be downsampled
@@ -621,7 +625,7 @@ class Dataset: # (pd.DataFrame):
                 del self.meaning_types_checked[f]
 
         if removed_features:
-            msg = bundle.get("dataset_date_features").format(removed_features)
+            msg = self.bundle.get("dataset_date_features").format(removed_features)
             self.logger.warning(msg)
             if not silent_mode:
                 print(msg)
@@ -629,7 +633,7 @@ class Dataset: # (pd.DataFrame):
 
     def __validate_features_count(self):
         if len(self.__features()) > self.MAX_FEATURES_COUNT:
-            msg = bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
+            msg = self.bundle.get("dataset_too_many_features").format(self.MAX_FEATURES_COUNT)
             self.logger.warning(msg)
             raise ValidationError(msg)
 
@@ -646,14 +650,14 @@ class Dataset: # (pd.DataFrame):
         target = self.etalon_def_checked.get(FileColumnMeaningType.TARGET.value)
         if validate_target:
             if target is None:
-                raise ValidationError(bundle.get("dataset_missing_target"))
+                raise ValidationError(self.bundle.get("dataset_missing_target"))
 
             target_value = self.__target_value()
             target_items = target_value.nunique()
             if target_items == 1:
-                raise ValidationError(bundle.get("dataset_constant_target"))
+                raise ValidationError(self.bundle.get("dataset_constant_target"))
             elif target_items == 0:
-                raise ValidationError(bundle.get("dataset_empty_target"))
+                raise ValidationError(self.bundle.get("dataset_empty_target"))
 
             # if self.task_type != ModelTaskType.MULTICLASS:
             #     self.data[target] = self.data[target].apply(pd.to_numeric, errors="coerce")
@@ -664,23 +668,29 @@ class Dataset: # (pd.DataFrame):
             for key in search_group
             if self.columns_renaming.get(key) != EmailSearchKeyConverter.EMAIL_ONE_DOMAIN_COLUMN_NAME
         ]
+        ipv4_column = self.etalon_def_checked.get(FileColumnMeaningType.IP_ADDRESS)
+        if (
+            FileColumnMeaningType.IPV6_ADDRESS in self.etalon_def_checked
+            and ipv4_column is not None
+            and ipv4_column in keys_to_validate
+        ):
+            keys_to_validate.remove(ipv4_column)
+
         mandatory_columns = [target]
         columns_to_validate = mandatory_columns.copy()
         columns_to_validate.extend(keys_to_validate)
         columns_to_validate = set([i for i in columns_to_validate if i is not None])
 
-        # TODO remove ipv4 from validation if ipv6 is presented
-
         nrows = len(self.data)
         validation_stats = {}
         self.data["valid_keys"] = 0
         self.data["valid_mandatory"] = True
 
-        all_valid_status = bundle.get("validation_all_valid_status")
-        some_invalid_status = bundle.get("validation_some_invalid_status")
-        all_invalid_status = bundle.get("validation_all_invalid_status")
-        all_valid_message = bundle.get("validation_all_valid_message")
-        invalid_message = bundle.get("validation_invalid_message")
+        all_valid_status = self.bundle.get("validation_all_valid_status")
+        some_invalid_status = self.bundle.get("validation_some_invalid_status")
+        all_invalid_status = self.bundle.get("validation_all_invalid_status")
+        all_valid_message = self.bundle.get("validation_all_valid_message")
+        invalid_message = self.bundle.get("validation_invalid_message")
 
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
```
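The hunk above replaces the old `TODO remove ipv4 from validation if ipv6 is presented` comment with real logic: when the dataset declares both IPv4 and IPv6 meaning types, the IPv4 column is dropped from key validation. A simplified standalone illustration of the rule (plain dicts stand in for the `FileColumnMeaningType` mapping):

```python
# Illustrative only: simplified stand-in for the keys_to_validate logic above.
etalon_def = {"IP_ADDRESS": "ip", "IPV6_ADDRESS": "ip_v6"}  # hypothetical mapping
keys_to_validate = ["ip", "ip_v6", "target_date"]

ipv4_column = etalon_def.get("IP_ADDRESS")
if "IPV6_ADDRESS" in etalon_def and ipv4_column in keys_to_validate:
    keys_to_validate.remove(ipv4_column)

print(keys_to_validate)  # ['ip_v6', 'target_date']: the IPv4 key is skipped
```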
```diff
@@ -721,9 +731,9 @@ class Dataset: # (pd.DataFrame):
         if not silent_mode:
             df_stats = pd.DataFrame.from_dict(validation_stats, orient="index")
             df_stats.reset_index(inplace=True)
-            name_header = bundle.get("validation_column_name_header")
-            status_header = bundle.get("validation_status_header")
-            description_header = bundle.get("validation_descr_header")
+            name_header = self.bundle.get("validation_column_name_header")
+            status_header = self.bundle.get("validation_status_header")
+            description_header = self.bundle.get("validation_descr_header")
             df_stats.columns = [name_header, status_header, description_header]
             try:
                 import html
@@ -732,11 +742,11 @@ class Dataset: # (pd.DataFrame):
 
                 _ = get_ipython()  # type: ignore
 
-                text_color = bundle.get("validation_text_color")
+                text_color = self.bundle.get("validation_text_color")
                 colormap = {
-                    all_valid_status: bundle.get("validation_all_valid_color"),
-                    some_invalid_status: bundle.get("validation_some_invalid_color"),
-                    all_invalid_status: bundle.get("validation_all_invalid_color"),
+                    all_valid_status: self.bundle.get("validation_all_valid_color"),
+                    some_invalid_status: self.bundle.get("validation_some_invalid_color"),
+                    all_invalid_status: self.bundle.get("validation_all_invalid_color"),
                 }
 
                 def map_color(text) -> str:
@@ -760,31 +770,33 @@ class Dataset: # (pd.DataFrame):
             print(df_stats)
 
         if len(self.data) == 0:
-            raise ValidationError(bundle.get("all_search_keys_invalid"))
+            raise ValidationError(self.bundle.get("all_search_keys_invalid"))
 
     def __validate_meaning_types(self, validate_target: bool):
         # self.logger.info("Validating meaning types")
         if self.meaning_types is None or len(self.meaning_types) == 0:
-            raise ValueError(bundle.get("dataset_missing_meaning_types"))
+            raise ValueError(self.bundle.get("dataset_missing_meaning_types"))
 
         if SYSTEM_RECORD_ID not in self.data.columns:
             raise ValueError("Internal error")
 
         for column in self.meaning_types:
             if column not in self.data.columns:
-                raise ValueError(bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
+                raise ValueError(self.bundle.get("dataset_missing_meaning_column").format(column, self.data.columns))
         if validate_target and FileColumnMeaningType.TARGET not in self.meaning_types.values():
-            raise ValueError(bundle.get("dataset_missing_target"))
+            raise ValueError(self.bundle.get("dataset_missing_target"))
 
     def __validate_search_keys(self):
         # self.logger.info("Validating search keys")
         if self.search_keys is None or len(self.search_keys) == 0:
-            raise ValueError(bundle.get("dataset_missing_search_keys"))
+            raise ValueError(self.bundle.get("dataset_missing_search_keys"))
         for keys_group in self.search_keys:
             for key in keys_group:
                 if key not in self.data.columns:
                     showing_columns = set(self.data.columns) - SYSTEM_COLUMNS
-                    raise ValidationError(
+                    raise ValidationError(
+                        self.bundle.get("dataset_missing_search_key_column").format(key, showing_columns)
+                    )
 
     def validate(self, validate_target: bool = True, silent_mode: bool = False):
         # self.logger.info("Validating dataset")
@@ -889,7 +901,7 @@ class Dataset: # (pd.DataFrame):
         elif is_string_dtype(pandas_data_type):
             return DataType.STRING
         else:
-            msg = bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
+            msg = self.bundle.get("dataset_invalid_column_type").format(column_name, pandas_data_type)
             self.logger.warning(msg)
             raise ValidationError(msg)
 
@@ -920,7 +932,7 @@ class Dataset: # (pd.DataFrame):
             for key in filter_features
             if key not in {"min_importance", "max_psi", "max_count", "selected_features"}
         ]:
-            raise ValidationError(bundle.get("dataset_invalid_filter"))
+            raise ValidationError(self.bundle.get("dataset_invalid_filter"))
         feature_filter = FeaturesFilter(
             minImportance=filter_features.get("min_importance"),
             maxPSI=filter_features.get("max_psi"),
@@ -1011,7 +1023,7 @@ class Dataset: # (pd.DataFrame):
             trace_id, parquet_file_path, file_metadata, file_metrics, search_customization
         )
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.MATCHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.MATCHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.MATCHING))
         self.file_upload_id = search_task_response.file_upload_id
@@ -1082,7 +1094,7 @@ class Dataset: # (pd.DataFrame):
         )
         self.file_upload_id = search_task_response.file_upload_id
         # if progress_bar is not None:
-        #     progress_bar.progress = (6.0, bundle.get(ProgressStage.ENRICHING.value))
+        #     progress_bar.progress = (6.0, self.bundle.get(ProgressStage.ENRICHING.value))
         # if progress_callback is not None:
         #     progress_callback(SearchProgress(6.0, ProgressStage.ENRICHING))
 
@@ -1102,5 +1114,5 @@ class Dataset: # (pd.DataFrame):
         uploading_file_size = Path(parquet_file_path).stat().st_size
         self.logger.info(f"Size of prepared uploading file: {uploading_file_size}")
         if uploading_file_size > self.MAX_UPLOADING_FILE_SIZE:
-            raise ValidationError(bundle.get("dataset_too_big_file"))
+            raise ValidationError(self.bundle.get("dataset_too_big_file"))
         return parquet_file_path
```