upgini 1.2.26__tar.gz → 1.2.28__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.26 → upgini-1.2.28}/PKG-INFO +1 -1
- upgini-1.2.28/src/upgini/__about__.py +1 -0
- upgini-1.2.28/src/upgini/__init__.py +5 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/features_enricher.py +29 -16
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/strings.properties +2 -2
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/target_utils.py +16 -16
- upgini-1.2.26/src/upgini/__about__.py +0 -1
- upgini-1.2.26/src/upgini/__init__.py +0 -13
- {upgini-1.2.26 → upgini-1.2.28}/.gitignore +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/LICENSE +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/README.md +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/pyproject.toml +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/dataset.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/errors.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/http.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/metadata.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/metrics.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/search_task.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/spinner.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.26 → upgini-1.2.28}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.28"
|
|
@@ -2026,7 +2026,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2026
2026
|
start_time = time.time()
|
|
2027
2027
|
with MDC(trace_id=trace_id):
|
|
2028
2028
|
self.logger.info("Start transform")
|
|
2029
|
-
|
|
2029
|
+
|
|
2030
|
+
validated_X = self._validate_X(X, is_transform=True)
|
|
2031
|
+
|
|
2032
|
+
self.__log_debug_information(validated_X, exclude_features_sources=exclude_features_sources)
|
|
2030
2033
|
|
|
2031
2034
|
self.__validate_search_keys(self.search_keys, self.search_id)
|
|
2032
2035
|
|
|
@@ -2058,8 +2061,6 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2058
2061
|
self.logger.info(msg)
|
|
2059
2062
|
print(msg)
|
|
2060
2063
|
|
|
2061
|
-
validated_X = self._validate_X(X, is_transform=True)
|
|
2062
|
-
|
|
2063
2064
|
is_demo_dataset = hash_input(validated_X) in DEMO_DATASET_HASHES
|
|
2064
2065
|
|
|
2065
2066
|
columns_to_drop = [c for c in validated_X.columns if c in self.feature_names_]
|
|
@@ -2476,9 +2477,9 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2476
2477
|
validate_scoring_argument(scoring)
|
|
2477
2478
|
|
|
2478
2479
|
self.__log_debug_information(
|
|
2479
|
-
|
|
2480
|
-
|
|
2481
|
-
|
|
2480
|
+
validated_X,
|
|
2481
|
+
validated_y,
|
|
2482
|
+
validated_eval_set,
|
|
2482
2483
|
exclude_features_sources=exclude_features_sources,
|
|
2483
2484
|
calculate_metrics=calculate_metrics,
|
|
2484
2485
|
scoring=scoring,
|
|
@@ -2546,9 +2547,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2546
2547
|
self.fit_generated_features.extend(generator.generated_features)
|
|
2547
2548
|
|
|
2548
2549
|
# Checks that need validated date
|
|
2549
|
-
|
|
2550
|
-
|
|
2551
|
-
|
|
2550
|
+
try:
|
|
2551
|
+
if not is_dates_distribution_valid(df, self.fit_search_keys):
|
|
2552
|
+
self.__log_warning(bundle.get("x_unstable_by_date"))
|
|
2553
|
+
except Exception:
|
|
2554
|
+
self.logger.exception("Failed to check dates distribution validity")
|
|
2552
2555
|
|
|
2553
2556
|
if (
|
|
2554
2557
|
is_numeric_dtype(df[self.TARGET_NAME])
|
|
@@ -3760,11 +3763,17 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3760
3763
|
if len(passed_unsupported_search_keys) > 0:
|
|
3761
3764
|
raise ValidationError(self.bundle.get("unsupported_search_key").format(passed_unsupported_search_keys))
|
|
3762
3765
|
|
|
3766
|
+
x_columns = [
|
|
3767
|
+
c
|
|
3768
|
+
for c in x.columns
|
|
3769
|
+
if c not in [TARGET, EVAL_SET_INDEX, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID, SEARCH_KEY_UNNEST]
|
|
3770
|
+
]
|
|
3771
|
+
|
|
3763
3772
|
for column_id, meaning_type in search_keys.items():
|
|
3764
3773
|
column_name = None
|
|
3765
3774
|
if isinstance(column_id, str):
|
|
3766
3775
|
if column_id not in x.columns:
|
|
3767
|
-
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id,
|
|
3776
|
+
raise ValidationError(self.bundle.get("search_key_not_found").format(column_id, x_columns))
|
|
3768
3777
|
column_name = column_id
|
|
3769
3778
|
valid_search_keys[column_name] = meaning_type
|
|
3770
3779
|
elif isinstance(column_id, int):
|
|
@@ -4038,15 +4047,19 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
4038
4047
|
half_train = round(len(train) / 2)
|
|
4039
4048
|
part1 = train[:half_train]
|
|
4040
4049
|
part2 = train[half_train:]
|
|
4041
|
-
|
|
4042
|
-
if
|
|
4043
|
-
self.
|
|
4050
|
+
train_psi_result = calculate_psi(part1[self.TARGET_NAME], part2[self.TARGET_NAME])
|
|
4051
|
+
if isinstance(train_psi_result, Exception):
|
|
4052
|
+
self.logger.exception("Failed to calculate train PSI", train_psi_result)
|
|
4053
|
+
elif train_psi_result > 0.2:
|
|
4054
|
+
self.__log_warning(self.bundle.get("train_unstable_target").format(train_psi_result))
|
|
4044
4055
|
|
|
4045
4056
|
# 2. Check train-test PSI
|
|
4046
4057
|
if eval1 is not None:
|
|
4047
|
-
|
|
4048
|
-
if
|
|
4049
|
-
self.
|
|
4058
|
+
train_test_psi_result = calculate_psi(train[self.TARGET_NAME], eval1[self.TARGET_NAME])
|
|
4059
|
+
if isinstance(train_test_psi_result, Exception):
|
|
4060
|
+
self.logger.exception("Failed to calculate test PSI", train_test_psi_result)
|
|
4061
|
+
elif train_test_psi_result > 0.2:
|
|
4062
|
+
self.__log_warning(self.bundle.get("eval_unstable_target").format(train_test_psi_result))
|
|
4050
4063
|
|
|
4051
4064
|
def _dump_python_libs(self):
|
|
4052
4065
|
try:
|
|
@@ -201,7 +201,7 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
|
|
|
201
201
|
email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
202
202
|
phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
203
203
|
phone_detected_not_registered=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
|
|
204
|
-
target_type_detected
|
|
204
|
+
target_type_detected=\nDetected task type: {}. Reason: {}\nYou can set task type manually with argument `model_task_type` of FeaturesEnricher constructor if task type detected incorrectly\n
|
|
205
205
|
binary_target_reason=only two unique label-values observed
|
|
206
206
|
non_numeric_multiclass_reason=non-numeric label values observed
|
|
207
207
|
few_unique_label_multiclass_reason=few unique label-values observed and can be considered as categorical
|
|
@@ -212,7 +212,7 @@ limited_int_multiclass_reason=integer-like values with limited unique values obs
|
|
|
212
212
|
all_ok_community_invite=❓ Support request
|
|
213
213
|
too_small_for_metrics=Your train dataset or one of eval datasets contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
|
|
214
214
|
imbalance_multiclass=Class {0} is on 25% quantile of classes distribution ({1} records in train dataset). \nDownsample classes with records more than {1}.
|
|
215
|
-
imbalanced_target
|
|
215
|
+
imbalanced_target=\nTarget is imbalanced and will be undersampled. Frequency of the rarest class `{}` is {}
|
|
216
216
|
loss_selection_info=Using loss `{}` for feature selection
|
|
217
217
|
loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
|
|
218
218
|
|
|
@@ -229,25 +229,25 @@ def balance_undersample(
|
|
|
229
229
|
return resampled_data
|
|
230
230
|
|
|
231
231
|
|
|
232
|
-
def calculate_psi(expected: pd.Series, actual: pd.Series) -> float:
|
|
233
|
-
|
|
232
|
+
def calculate_psi(expected: pd.Series, actual: pd.Series) -> Union[float, Exception]:
|
|
233
|
+
try:
|
|
234
|
+
df = pd.concat([expected, actual])
|
|
234
235
|
|
|
235
|
-
|
|
236
|
-
|
|
236
|
+
if is_bool_dtype(df):
|
|
237
|
+
df = np.where(df, 1, 0)
|
|
237
238
|
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
|
|
241
|
-
|
|
239
|
+
# Define the bins for the target variable
|
|
240
|
+
df_min = df.min()
|
|
241
|
+
df_max = df.max()
|
|
242
|
+
bins = [df_min, (df_min + df_max) / 2, df_max]
|
|
242
243
|
|
|
243
|
-
|
|
244
|
-
|
|
244
|
+
# Calculate the base distribution
|
|
245
|
+
train_distribution = expected.value_counts(bins=bins, normalize=True).sort_index().values
|
|
245
246
|
|
|
246
|
-
|
|
247
|
-
|
|
247
|
+
# Calculate the target distribution
|
|
248
|
+
test_distribution = actual.value_counts(bins=bins, normalize=True).sort_index().values
|
|
248
249
|
|
|
249
|
-
|
|
250
|
-
try:
|
|
250
|
+
# Calculate the PSI
|
|
251
251
|
return np.sum((train_distribution - test_distribution) * np.log(train_distribution / test_distribution))
|
|
252
|
-
except Exception:
|
|
253
|
-
return
|
|
252
|
+
except Exception as e:
|
|
253
|
+
return e
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.26"
|
|
@@ -1,13 +0,0 @@
|
|
|
1
|
-
import os
|
|
2
|
-
|
|
3
|
-
from upgini.features_enricher import FeaturesEnricher # noqa: F401
|
|
4
|
-
from upgini.metadata import SearchKey, CVType, RuntimeParameters, ModelTaskType # noqa: F401
|
|
5
|
-
# from .lazy_import import LazyImport
|
|
6
|
-
|
|
7
|
-
os.environ["SETUPTOOLS_USE_DISTUTILS"] = "stdlib"
|
|
8
|
-
|
|
9
|
-
# FeaturesEnricher = LazyImport("upgini.features_enricher", "FeaturesEnricher")
|
|
10
|
-
# SearchKey = LazyImport("upgini.metadata", "SearchKey")
|
|
11
|
-
# RuntimeParameters = LazyImport("upgini.metadata", "RuntimeParameters")
|
|
12
|
-
# CVType = LazyImport("upgini.metadata", "CVType")
|
|
13
|
-
# ModelTaskType = LazyImport("upgini.metadata", "ModelTaskType")
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|