upgini 1.2.14__tar.gz → 1.2.14a1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- {upgini-1.2.14 → upgini-1.2.14a1}/PKG-INFO +1 -1
- upgini-1.2.14a1/src/upgini/__about__.py +1 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/dataset.py +1 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/features_enricher.py +25 -22
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/normalizer/normalize_utils.py +15 -22
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/target_utils.py +22 -18
- upgini-1.2.14/src/upgini/__about__.py +0 -1
- {upgini-1.2.14 → upgini-1.2.14a1}/.gitignore +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/LICENSE +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/README.md +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/pyproject.toml +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/ads.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/errors.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/http.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/metadata.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/metrics.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.14 → upgini-1.2.14a1}/src/upgini/version_validator.py +0 -0
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "1.2.14a1"
|
|
@@ -253,6 +253,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
253
253
|
min_class_percent = self.IMBALANCE_THESHOLD / target_classes_count
|
|
254
254
|
min_class_threshold = min_class_percent * count
|
|
255
255
|
|
|
256
|
+
# If min class count less than 30% for binary or (60 / classes_count)% for multiclass
|
|
256
257
|
if min_class_count < min_class_threshold:
|
|
257
258
|
self.imbalanced = True
|
|
258
259
|
self.data = balance_undersample(
|
|
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1577
1577
|
df = generator.generate(df)
|
|
1578
1578
|
generated_features.extend(generator.generated_features)
|
|
1579
1579
|
|
|
1580
|
-
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
1581
|
-
df
|
|
1580
|
+
normalizer = Normalizer(search_keys, generated_features, self.bundle, self.logger, self.warning_counter)
|
|
1581
|
+
df = normalizer.normalize(df)
|
|
1582
1582
|
columns_renaming = normalizer.columns_renaming
|
|
1583
1583
|
|
|
1584
1584
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
@@ -2017,8 +2017,10 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2017
2017
|
df = generator.generate(df)
|
|
2018
2018
|
generated_features.extend(generator.generated_features)
|
|
2019
2019
|
|
|
2020
|
-
normalizer = Normalizer(
|
|
2021
|
-
|
|
2020
|
+
normalizer = Normalizer(
|
|
2021
|
+
search_keys, generated_features, self.bundle, self.logger, self.warning_counter, silent_mode
|
|
2022
|
+
)
|
|
2023
|
+
df = normalizer.normalize(df)
|
|
2022
2024
|
columns_renaming = normalizer.columns_renaming
|
|
2023
2025
|
|
|
2024
2026
|
# Don't pass all features in backend on transform
|
|
@@ -2447,13 +2449,14 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2447
2449
|
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2448
2450
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2449
2451
|
|
|
2450
|
-
|
|
2451
|
-
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2452
|
-
df, self.fit_search_keys, self.fit_generated_features
|
|
2453
|
-
)
|
|
2454
|
-
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2452
|
+
self.__adjust_cv(df, maybe_date_column, self.model_task_type)
|
|
2455
2453
|
|
|
2456
|
-
|
|
2454
|
+
normalizer = Normalizer(
|
|
2455
|
+
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2456
|
+
)
|
|
2457
|
+
df = normalizer.normalize(df)
|
|
2458
|
+
columns_renaming = normalizer.columns_renaming
|
|
2459
|
+
self.fit_columns_renaming = columns_renaming
|
|
2457
2460
|
|
|
2458
2461
|
df = remove_fintech_duplicates(
|
|
2459
2462
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2467,7 +2470,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2467
2470
|
self.df_with_original_index = df.copy()
|
|
2468
2471
|
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2469
2472
|
|
|
2470
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys,
|
|
2473
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, columns_renaming)
|
|
2471
2474
|
|
|
2472
2475
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2473
2476
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
@@ -2477,7 +2480,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2477
2480
|
email_column,
|
|
2478
2481
|
hem_column,
|
|
2479
2482
|
self.fit_search_keys,
|
|
2480
|
-
|
|
2483
|
+
columns_renaming,
|
|
2481
2484
|
list(unnest_search_keys.keys()),
|
|
2482
2485
|
self.logger,
|
|
2483
2486
|
)
|
|
@@ -2488,7 +2491,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2488
2491
|
converter = IpSearchKeyConverter(
|
|
2489
2492
|
ip_column,
|
|
2490
2493
|
self.fit_search_keys,
|
|
2491
|
-
|
|
2494
|
+
columns_renaming,
|
|
2492
2495
|
list(unnest_search_keys.keys()),
|
|
2493
2496
|
self.bundle,
|
|
2494
2497
|
self.logger,
|
|
@@ -2519,7 +2522,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2519
2522
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2520
2523
|
|
|
2521
2524
|
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2522
|
-
df, features_columns, self.generate_features, self.warning_counter,
|
|
2525
|
+
df, features_columns, self.generate_features, self.warning_counter, columns_renaming
|
|
2523
2526
|
)
|
|
2524
2527
|
self.fit_dropped_features.update(features_to_drop)
|
|
2525
2528
|
df = df.drop(columns=features_to_drop)
|
|
@@ -2560,7 +2563,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2560
2563
|
rest_client=self.rest_client,
|
|
2561
2564
|
logger=self.logger,
|
|
2562
2565
|
)
|
|
2563
|
-
dataset.columns_renaming =
|
|
2566
|
+
dataset.columns_renaming = columns_renaming
|
|
2564
2567
|
|
|
2565
2568
|
self.passed_features = [
|
|
2566
2569
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2707,24 +2710,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2707
2710
|
if not self.warning_counter.has_warnings():
|
|
2708
2711
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2709
2712
|
|
|
2710
|
-
def __adjust_cv(self, df: pd.DataFrame):
|
|
2711
|
-
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2713
|
+
def __adjust_cv(self, df: pd.DataFrame, date_column: pd.Series, model_task_type: ModelTaskType):
|
|
2712
2714
|
# Check Multivariate time series
|
|
2713
2715
|
if (
|
|
2714
2716
|
self.cv is None
|
|
2715
2717
|
and date_column
|
|
2716
|
-
and
|
|
2718
|
+
and model_task_type == ModelTaskType.REGRESSION
|
|
2717
2719
|
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2718
2720
|
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2719
2721
|
):
|
|
2720
2722
|
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2721
2723
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2722
|
-
elif
|
|
2724
|
+
elif (
|
|
2725
|
+
self.cv is None
|
|
2726
|
+
and model_task_type != ModelTaskType.REGRESSION
|
|
2727
|
+
and self._get_group_columns(df, self.fit_search_keys)
|
|
2728
|
+
):
|
|
2723
2729
|
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2724
2730
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2725
|
-
group_columns = self._get_group_columns(df, self.fit_search_keys)
|
|
2726
|
-
self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
|
|
2727
|
-
self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
|
|
2728
2731
|
|
|
2729
2732
|
def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
|
|
2730
2733
|
if print_warning:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from logging import Logger, getLogger
|
|
3
|
-
from typing import Dict, List, Tuple
|
|
3
|
+
from typing import Dict, List
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -35,25 +35,22 @@ class Normalizer:
|
|
|
35
35
|
|
|
36
36
|
def __init__(
|
|
37
37
|
self,
|
|
38
|
+
search_keys: Dict[str, SearchKey],
|
|
39
|
+
generated_features: List[str],
|
|
38
40
|
bundle: ResourceBundle = None,
|
|
39
41
|
logger: Logger = None,
|
|
40
42
|
warnings_counter: WarningCounter = None,
|
|
41
43
|
silent_mode=False,
|
|
42
44
|
):
|
|
45
|
+
self.search_keys = search_keys
|
|
46
|
+
self.generated_features = generated_features
|
|
43
47
|
self.bundle = bundle or get_custom_bundle()
|
|
44
48
|
self.logger = logger or getLogger()
|
|
45
49
|
self.warnings_counter = warnings_counter or WarningCounter()
|
|
46
50
|
self.silent_mode = silent_mode
|
|
47
51
|
self.columns_renaming = {}
|
|
48
|
-
self.search_keys = {}
|
|
49
|
-
self.generated_features = []
|
|
50
|
-
|
|
51
|
-
def normalize(
|
|
52
|
-
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
53
|
-
) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
|
|
54
|
-
self.search_keys = search_keys.copy()
|
|
55
|
-
self.generated_features = generated_features.copy()
|
|
56
52
|
|
|
53
|
+
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
57
54
|
df = df.copy()
|
|
58
55
|
df = self._rename_columns(df)
|
|
59
56
|
|
|
@@ -71,25 +68,21 @@ class Normalizer:
|
|
|
71
68
|
|
|
72
69
|
df = self.__convert_features_types(df)
|
|
73
70
|
|
|
74
|
-
return df, self.search_keys, self.generated_features
|
|
71
|
+
return df
|
|
75
72
|
|
|
76
73
|
def _rename_columns(self, df: pd.DataFrame):
|
|
77
74
|
# logger.info("Replace restricted symbols in column names")
|
|
78
75
|
new_columns = []
|
|
79
76
|
dup_counter = 0
|
|
80
77
|
for column in df.columns:
|
|
81
|
-
if
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
DateTimeSearchKeyConverter.DATETIME_COL,
|
|
90
|
-
]
|
|
91
|
-
+ self.generated_features
|
|
92
|
-
):
|
|
78
|
+
if column in [
|
|
79
|
+
TARGET,
|
|
80
|
+
EVAL_SET_INDEX,
|
|
81
|
+
SYSTEM_RECORD_ID,
|
|
82
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
83
|
+
SEARCH_KEY_UNNEST,
|
|
84
|
+
DateTimeSearchKeyConverter.DATETIME_COL,
|
|
85
|
+
] + self.generated_features:
|
|
93
86
|
self.columns_renaming[column] = column
|
|
94
87
|
new_columns.append(column)
|
|
95
88
|
continue
|
|
@@ -150,30 +150,34 @@ def balance_undersample(
|
|
|
150
150
|
# fill up to min_sample_threshold by majority class
|
|
151
151
|
minority_class = df[df[target_column] == min_class_value]
|
|
152
152
|
majority_class = df[df[target_column] != min_class_value]
|
|
153
|
-
sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
153
|
+
# sample_size = min(len(majority_class), min_sample_threshold - min_class_count)
|
|
154
|
+
sample_size = min(
|
|
155
|
+
max_class_count,
|
|
156
|
+
binary_bootstrap_loops * (min_class_count + max(min_sample_threshold - 2 * min_class_count, 0)),
|
|
157
|
+
)
|
|
154
158
|
sampled_majority_class = majority_class.sample(n=sample_size, random_state=random_state)
|
|
155
159
|
resampled_data = df[
|
|
156
160
|
(df[SYSTEM_RECORD_ID].isin(minority_class[SYSTEM_RECORD_ID]))
|
|
157
161
|
| (df[SYSTEM_RECORD_ID].isin(sampled_majority_class[SYSTEM_RECORD_ID]))
|
|
158
162
|
]
|
|
159
163
|
|
|
160
|
-
elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
161
|
-
|
|
162
|
-
|
|
163
|
-
|
|
164
|
-
|
|
165
|
-
|
|
166
|
-
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
|
|
175
|
-
|
|
176
|
-
|
|
164
|
+
# elif max_class_count > min_class_count * binary_bootstrap_loops:
|
|
165
|
+
# msg = bundle.get("dataset_rarest_class_less_threshold").format(
|
|
166
|
+
# min_class_value, min_class_count, min_class_threshold, min_class_percent * 100
|
|
167
|
+
# )
|
|
168
|
+
# logger.warning(msg)
|
|
169
|
+
# print(msg)
|
|
170
|
+
# if warning_counter:
|
|
171
|
+
# warning_counter.increment()
|
|
172
|
+
|
|
173
|
+
# sampler = RandomUnderSampler(
|
|
174
|
+
# sampling_strategy={max_class_value: binary_bootstrap_loops * min_class_count}, random_state=random_state
|
|
175
|
+
# )
|
|
176
|
+
# X = df[SYSTEM_RECORD_ID]
|
|
177
|
+
# X = X.to_frame(SYSTEM_RECORD_ID)
|
|
178
|
+
# new_x, _ = sampler.fit_resample(X, target) # type: ignore
|
|
179
|
+
|
|
180
|
+
# resampled_data = df[df[SYSTEM_RECORD_ID].isin(new_x[SYSTEM_RECORD_ID])]
|
|
177
181
|
|
|
178
182
|
logger.info(f"Shape after rebalance resampling: {resampled_data}")
|
|
179
183
|
return resampled_data
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
__version__ = "1.2.14"
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|