upgini 1.2.25a1__tar.gz → 1.2.27__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.25a1 → upgini-1.2.27}/PKG-INFO +1 -1
- {upgini-1.2.25a1 → upgini-1.2.27}/pyproject.toml +0 -6
- upgini-1.2.27/src/upgini/__about__.py +1 -0
- upgini-1.2.27/src/upgini/__init__.py +5 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/features_enricher.py +17 -14
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/resource_bundle/strings.properties +1 -1
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/display_utils.py +0 -1
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/target_utils.py +18 -15
- upgini-1.2.25a1/src/upgini/__about__.py +0 -1
- upgini-1.2.25a1/src/upgini/__init__.py +0 -13
- {upgini-1.2.25a1 → upgini-1.2.27}/.gitignore +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/LICENSE +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/README.md +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/ads.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/dataset.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/errors.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/http.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/metadata.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/metrics.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/search_task.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/spinner.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.25a1 → upgini-1.2.27}/src/upgini/version_validator.py +0 -0
|
@@ -52,12 +52,6 @@ dependencies = [
|
|
|
52
52
|
"levenshtein>=0.25.1",
|
|
53
53
|
]
|
|
54
54
|
|
|
55
|
-
[tool.setuptools]
|
|
56
|
-
include-package-data = true
|
|
57
|
-
|
|
58
|
-
[tool.setuptools.package-data]
|
|
59
|
-
"upgini" = ["utils/Roboto-Regular.ttf"]
|
|
60
|
-
|
|
61
55
|
[project.urls]
|
|
62
56
|
"Bug Reports" = "https://github.com/upgini/upgini/issues"
|
|
63
57
|
Homepage = "https://upgini.com/"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.27"
|
|
@@ -2546,9 +2546,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2546
2546
|
self.fit_generated_features.extend(generator.generated_features)
|
|
2547
2547
|
|
|
2548
2548
|
# Checks that need validated date
|
|
2549
|
-
|
|
2550
|
-
|
|
2551
|
-
|
|
2549
|
+
try:
|
|
2550
|
+
if not is_dates_distribution_valid(df, self.fit_search_keys):
|
|
2551
|
+
self.__log_warning(bundle.get("x_unstable_by_date"))
|
|
2552
|
+
except Exception:
|
|
2553
|
+
self.logger.exception("Failed to check dates distribution validity")
|
|
2552
2554
|
|
|
2553
2555
|
if (
|
|
2554
2556
|
is_numeric_dtype(df[self.TARGET_NAME])
|
|
@@ -3194,9 +3196,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3194
3196
|
|
|
3195
3197
|
return df
|
|
3196
3198
|
|
|
3197
|
-
@staticmethod
|
|
3198
3199
|
def _add_current_date_as_key(
|
|
3199
|
-
df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
3200
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], logger: logging.Logger, bundle: ResourceBundle
|
|
3200
3201
|
) -> pd.DataFrame:
|
|
3201
3202
|
if (
|
|
3202
3203
|
set(search_keys.values()) == {SearchKey.PHONE}
|
|
@@ -3204,9 +3205,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3204
3205
|
or set(search_keys.values()) == {SearchKey.HEM}
|
|
3205
3206
|
or set(search_keys.values()) == {SearchKey.COUNTRY, SearchKey.POSTAL_CODE}
|
|
3206
3207
|
):
|
|
3207
|
-
|
|
3208
|
-
print(msg)
|
|
3209
|
-
logger.warning(msg)
|
|
3208
|
+
self.__log_warning(bundle.get("current_date_added"))
|
|
3210
3209
|
df[FeaturesEnricher.CURRENT_DATE] = datetime.date.today()
|
|
3211
3210
|
search_keys[FeaturesEnricher.CURRENT_DATE] = SearchKey.DATE
|
|
3212
3211
|
converter = DateTimeSearchKeyConverter(FeaturesEnricher.CURRENT_DATE)
|
|
@@ -4041,15 +4040,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4041
4040
|
half_train = round(len(train) / 2)
|
|
4042
4041
|
part1 = train[:half_train]
|
|
4043
4042
|
part2 = train[half_train:]
|
|
4044
|
-
|
|
4045
|
-
if
|
|
4046
|
-
self.
|
|
4043
|
+
train_psi_result = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
4044
|
+
if isinstance(train_psi_result, Exception):
|
|
4045
|
+
self.logger.exception("Failed to calculate train PSI", train_psi_result)
|
|
4046
|
+
elif train_psi_result > 0.2:
|
|
4047
|
+
self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi_result))
|
|
4047
4048
|
|
|
4048
4049
|
# 2. Check train-test PSI
|
|
4049
4050
|
if eval1 is not None:
|
|
4050
|
-
|
|
4051
|
-
if
|
|
4052
|
-
self.
|
|
4051
|
+
train_test_psi_result = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
4052
|
+
if isinstance(train_test_psi_result, Exception):
|
|
4053
|
+
self.logger.exception("Failed to calculate test PSI", train_test_psi_result)
|
|
4054
|
+
elif train_test_psi_result > 0.2:
|
|
4055
|
+
self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi_result))
|
|
4053
4056
|
|
|
4054
4057
|
def _dump_python_libs(self):
|
|
4055
4058
|
try:
|
|
@@ -9,7 +9,7 @@ search_stopped=Search request stopped
|
|
|
9
9
|
polling_search_task=\nRunning search request, search_id={}
|
|
10
10
|
polling_unregister_information=We'll send email notification once it's completed, just use your personal api_key from profile.upgini.com
|
|
11
11
|
ads_upload_finish=Thank you for your submission!\nWe'll check your data sharing proposal and get back to you
|
|
12
|
-
demo_dataset_info=Demo training dataset detected. Registration for an API key is not required
|
|
12
|
+
demo_dataset_info=Demo training dataset detected. Registration for an API key is not required.\n
|
|
13
13
|
transform_usage_info=You use Trial access to Upgini data enrichment. Limit for Trial: {} rows. You have already enriched: {} rows.
|
|
14
14
|
transform_usage_warning=You are trying to launch enrichment for {} rows, which will exceed the rest limit {}.
|
|
15
15
|
|
|
@@ -169,7 +169,6 @@ def make_html_report(
|
|
|
169
169
|
from pkg_resources import resource_filename
|
|
170
170
|
font_path = resource_filename('upgini.utils', 'Roboto-Regular.ttf')
|
|
171
171
|
|
|
172
|
-
print(font_path)
|
|
173
172
|
return f"""<html>
|
|
174
173
|
<head>
|
|
175
174
|
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
|
@@ -3,7 +3,7 @@ from typing import Optional, Union
|
|
|
3
3
|
|
|
4
4
|
import numpy as np
|
|
5
5
|
import pandas as pd
|
|
6
|
-
from pandas.api.types import is_numeric_dtype
|
|
6
|
+
from pandas.api.types import is_numeric_dtype, is_bool_dtype
|
|
7
7
|
|
|
8
8
|
from upgini.errors import ValidationError
|
|
9
9
|
from upgini.metadata import SYSTEM_RECORD_ID, ModelTaskType
|
|
@@ -229,22 +229,25 @@ def balance_undersample(
|
|
|
229
229
|
return resampled_data
|
|
230
230
|
|
|
231
231
|
|
|
232
|
-
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
233
|
-
|
|
232
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
233
|
+
try:
|
|
234
|
+
df = pd.concat([expected, actual])
|
|
234
235
|
|
|
235
|
-
|
|
236
|
-
|
|
237
|
-
df_max = df.max()
|
|
238
|
-
bins = [df_min, (df_min + df_max) / 2, df_max]
|
|
236
|
+
if is_bool_dtype(df):
|
|
237
|
+
df = np.where(df, 1, 0)
|
|
239
238
|
|
|
240
|
-
|
|
241
|
-
|
|
239
|
+
# Define the bins for the target variable
|
|
240
|
+
df_min = df.min()
|
|
241
|
+
df_max = df.max()
|
|
242
|
+
bins = [df_min, (df_min + df_max) / 2, df_max]
|
|
242
243
|
|
|
243
|
-
|
|
244
|
-
|
|
244
|
+
# Calculate the base distribution
|
|
245
|
+
train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
|
|
245
246
|
|
|
246
|
-
|
|
247
|
-
|
|
247
|
+
# Calculate the target distribution
|
|
248
|
+
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
249
|
+
|
|
250
|
+
# Calculate the PSI
|
|
248
251
|
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
249
|
-
except Exception:
|
|
250
|
-
return
|
|
252
|
+
except Exception as e:
|
|
253
|
+
return e
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.25a1"
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
4
|
-
from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
5
|
-
# from .lazy_import import LazyImport
|
|
6
|
-
|
|
7
|
-
os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
|
|
8
|
-
|
|
9
|
-
# FeaturesEnricher = LazyImport("upgini.features_enricher", "FeaturesEnricher")
|
|
10
|
-
# SearchKey = LazyImport("upgini.metadata", "SearchKey")
|
|
11
|
-
# RuntimeParameters = LazyImport("upgini.metadata", "RuntimeParameters")
|
|
12
|
-
# CVType = LazyImport("upgini.metadata", "CVType")
|
|
13
|
-
# ModelTaskType = LazyImport("upgini.metadata", "ModelTaskType")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|