upgini 1.2.13a3__py3-none-any.whl → 1.2.14__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/binary.py +6 -12
- upgini/features_enricher.py +22 -25
- upgini/normalizer/normalize_utils.py +22 -15
- {upgini-1.2.13a3.dist-info → upgini-1.2.14.dist-info}/METADATA +1 -1
- {upgini-1.2.13a3.dist-info → upgini-1.2.14.dist-info}/RECORD +8 -8
- {upgini-1.2.13a3.dist-info → upgini-1.2.14.dist-info}/WHEEL +1 -1
- {upgini-1.2.13a3.dist-info → upgini-1.2.14.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.13a3"
|
|
1
|
+
__version__ = "1.2.14"
|
upgini/autofe/binary.py
CHANGED
|
@@ -140,27 +140,21 @@ class Distance(PandasOperand):
|
|
|
140
140
|
has_symmetry_importance: bool = True
|
|
141
141
|
|
|
142
142
|
def calculate_binary(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
143
|
-
# Handle None values by replacing them with 0 in the dot product and norm calculations
|
|
144
|
-
left = left.apply(lambda x: np.array(x) if x is not None else np.zeros_like(right[0]))
|
|
145
|
-
right = right.apply(lambda x: np.array(x) if x is not None else np.zeros_like(left[0]))
|
|
146
|
-
|
|
147
143
|
return pd.Series(
|
|
148
144
|
1 - self.__dot(left, right) / (self.__norm(left) * self.__norm(right)), index=left.index
|
|
149
|
-
)
|
|
145
|
+
).astype(np.float64)
|
|
150
146
|
|
|
151
147
|
# row-wise dot product, handling None values
|
|
152
148
|
def __dot(self, left: pd.Series, right: pd.Series) -> pd.Series:
|
|
153
|
-
left = left.apply(lambda x: np.array(x)
|
|
154
|
-
right = right.apply(lambda x: np.array(x)
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
res = (left * right).apply(np.sum)
|
|
149
|
+
left = left.apply(lambda x: np.array(x))
|
|
150
|
+
right = right.apply(lambda x: np.array(x))
|
|
151
|
+
res = (left.dropna() * right.dropna()).apply(np.sum)
|
|
152
|
+
res = res.reindex(left.index.union(right.index))
|
|
158
153
|
return res
|
|
159
154
|
|
|
160
155
|
# Calculate the norm of a vector, handling None values
|
|
161
156
|
def __norm(self, vector: pd.Series) -> pd.Series:
|
|
162
|
-
|
|
163
|
-
vector = vector.apply(lambda x: np.array(x) if x is not None else np.zeros_like(vector[0]))
|
|
157
|
+
vector = vector.fillna(np.nan)
|
|
164
158
|
return np.sqrt(self.__dot(vector, vector))
|
|
165
159
|
|
|
166
160
|
|
upgini/features_enricher.py
CHANGED
|
@@ -1577,8 +1577,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
1577
1577
|
df = generator.generate(df)
|
|
1578
1578
|
generated_features.extend(generator.generated_features)
|
|
1579
1579
|
|
|
1580
|
-
normalizer = Normalizer(
|
|
1581
|
-
df = normalizer.normalize(df)
|
|
1580
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
1581
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
1582
1582
|
columns_renaming = normalizer.columns_renaming
|
|
1583
1583
|
|
|
1584
1584
|
df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
|
|
@@ -2017,10 +2017,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2017
2017
|
df = generator.generate(df)
|
|
2018
2018
|
generated_features.extend(generator.generated_features)
|
|
2019
2019
|
|
|
2020
|
-
normalizer = Normalizer(
|
|
2021
|
-
|
|
2022
|
-
)
|
|
2023
|
-
df = normalizer.normalize(df)
|
|
2020
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
|
|
2021
|
+
df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
|
|
2024
2022
|
columns_renaming = normalizer.columns_renaming
|
|
2025
2023
|
|
|
2026
2024
|
# Don't pass all features in backend on transform
|
|
@@ -2449,14 +2447,13 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2449
2447
|
if is_numeric_dtype(df[self.TARGET_NAME]) and has_date:
|
|
2450
2448
|
self._validate_PSI(df.sort_values(by=maybe_date_column))
|
|
2451
2449
|
|
|
2452
|
-
self.
|
|
2453
|
-
|
|
2454
|
-
|
|
2455
|
-
self.fit_search_keys, self.fit_generated_features, self.bundle, self.logger, self.warning_counter
|
|
2450
|
+
normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
|
|
2451
|
+
df, self.fit_search_keys, self.fit_generated_features = normalizer.normalize(
|
|
2452
|
+
df, self.fit_search_keys, self.fit_generated_features
|
|
2456
2453
|
)
|
|
2457
|
-
|
|
2458
|
-
|
|
2459
|
-
self.
|
|
2454
|
+
self.fit_columns_renaming = normalizer.columns_renaming
|
|
2455
|
+
|
|
2456
|
+
self.__adjust_cv(df)
|
|
2460
2457
|
|
|
2461
2458
|
df = remove_fintech_duplicates(
|
|
2462
2459
|
df, self.fit_search_keys, date_format=self.date_format, logger=self.logger, bundle=self.bundle
|
|
@@ -2470,7 +2467,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2470
2467
|
self.df_with_original_index = df.copy()
|
|
2471
2468
|
# TODO check maybe need to drop _time column from df_with_original_index
|
|
2472
2469
|
|
|
2473
|
-
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys,
|
|
2470
|
+
df, unnest_search_keys = self._explode_multiple_search_keys(df, self.fit_search_keys, self.fit_columns_renaming)
|
|
2474
2471
|
|
|
2475
2472
|
# Convert EMAIL to HEM after unnesting to do it only with one column
|
|
2476
2473
|
email_column = self._get_email_column(self.fit_search_keys)
|
|
@@ -2480,7 +2477,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2480
2477
|
email_column,
|
|
2481
2478
|
hem_column,
|
|
2482
2479
|
self.fit_search_keys,
|
|
2483
|
-
|
|
2480
|
+
self.fit_columns_renaming,
|
|
2484
2481
|
list(unnest_search_keys.keys()),
|
|
2485
2482
|
self.logger,
|
|
2486
2483
|
)
|
|
@@ -2491,7 +2488,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2491
2488
|
converter = IpSearchKeyConverter(
|
|
2492
2489
|
ip_column,
|
|
2493
2490
|
self.fit_search_keys,
|
|
2494
|
-
|
|
2491
|
+
self.fit_columns_renaming,
|
|
2495
2492
|
list(unnest_search_keys.keys()),
|
|
2496
2493
|
self.bundle,
|
|
2497
2494
|
self.logger,
|
|
@@ -2522,7 +2519,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2522
2519
|
features_columns = [c for c in df.columns if c not in non_feature_columns]
|
|
2523
2520
|
|
|
2524
2521
|
features_to_drop = FeaturesValidator(self.logger).validate(
|
|
2525
|
-
df, features_columns, self.generate_features, self.warning_counter,
|
|
2522
|
+
df, features_columns, self.generate_features, self.warning_counter, self.fit_columns_renaming
|
|
2526
2523
|
)
|
|
2527
2524
|
self.fit_dropped_features.update(features_to_drop)
|
|
2528
2525
|
df = df.drop(columns=features_to_drop)
|
|
@@ -2563,7 +2560,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2563
2560
|
rest_client=self.rest_client,
|
|
2564
2561
|
logger=self.logger,
|
|
2565
2562
|
)
|
|
2566
|
-
dataset.columns_renaming =
|
|
2563
|
+
dataset.columns_renaming = self.fit_columns_renaming
|
|
2567
2564
|
|
|
2568
2565
|
self.passed_features = [
|
|
2569
2566
|
column for column, meaning_type in meaning_types.items() if meaning_type == FileColumnMeaningType.FEATURE
|
|
@@ -2710,24 +2707,24 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
2710
2707
|
if not self.warning_counter.has_warnings():
|
|
2711
2708
|
self.__display_support_link(self.bundle.get("all_ok_community_invite"))
|
|
2712
2709
|
|
|
2713
|
-
def __adjust_cv(self, df: pd.DataFrame
|
|
2710
|
+
def __adjust_cv(self, df: pd.DataFrame):
|
|
2711
|
+
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
|
2714
2712
|
# Check Multivariate time series
|
|
2715
2713
|
if (
|
|
2716
2714
|
self.cv is None
|
|
2717
2715
|
and date_column
|
|
2718
|
-
and model_task_type == ModelTaskType.REGRESSION
|
|
2716
|
+
and self.model_task_type == ModelTaskType.REGRESSION
|
|
2719
2717
|
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
|
2720
2718
|
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
|
2721
2719
|
):
|
|
2722
2720
|
msg = self.bundle.get("multivariate_timeseries_detected")
|
|
2723
2721
|
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
|
2724
|
-
elif
|
|
2725
|
-
self.cv is None
|
|
2726
|
-
and model_task_type != ModelTaskType.REGRESSION
|
|
2727
|
-
and self._get_group_columns(df, self.fit_search_keys)
|
|
2728
|
-
):
|
|
2722
|
+
elif self.cv is None and self.model_task_type != ModelTaskType.REGRESSION:
|
|
2729
2723
|
msg = self.bundle.get("group_k_fold_in_classification")
|
|
2730
2724
|
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
|
2725
|
+
group_columns = self._get_group_columns(df, self.fit_search_keys)
|
|
2726
|
+
self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
|
|
2727
|
+
self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
|
|
2731
2728
|
|
|
2732
2729
|
def __override_cv(self, cv: CVType, msg: str, print_warning: bool = True):
|
|
2733
2730
|
if print_warning:
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
import hashlib
|
|
2
2
|
from logging import Logger, getLogger
|
|
3
|
-
from typing import Dict, List
|
|
3
|
+
from typing import Dict, List, Tuple
|
|
4
4
|
|
|
5
5
|
import numpy as np
|
|
6
6
|
import pandas as pd
|
|
@@ -35,22 +35,25 @@ class Normalizer:
|
|
|
35
35
|
|
|
36
36
|
def __init__(
|
|
37
37
|
self,
|
|
38
|
-
search_keys: Dict[str, SearchKey],
|
|
39
|
-
generated_features: List[str],
|
|
40
38
|
bundle: ResourceBundle = None,
|
|
41
39
|
logger: Logger = None,
|
|
42
40
|
warnings_counter: WarningCounter = None,
|
|
43
41
|
silent_mode=False,
|
|
44
42
|
):
|
|
45
|
-
self.search_keys = search_keys
|
|
46
|
-
self.generated_features = generated_features
|
|
47
43
|
self.bundle = bundle or get_custom_bundle()
|
|
48
44
|
self.logger = logger or getLogger()
|
|
49
45
|
self.warnings_counter = warnings_counter or WarningCounter()
|
|
50
46
|
self.silent_mode = silent_mode
|
|
51
47
|
self.columns_renaming = {}
|
|
48
|
+
self.search_keys = {}
|
|
49
|
+
self.generated_features = []
|
|
50
|
+
|
|
51
|
+
def normalize(
|
|
52
|
+
self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
|
|
53
|
+
) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
|
|
54
|
+
self.search_keys = search_keys.copy()
|
|
55
|
+
self.generated_features = generated_features.copy()
|
|
52
56
|
|
|
53
|
-
def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
|
|
54
57
|
df = df.copy()
|
|
55
58
|
df = self._rename_columns(df)
|
|
56
59
|
|
|
@@ -68,21 +71,25 @@ class Normalizer:
|
|
|
68
71
|
|
|
69
72
|
df = self.__convert_features_types(df)
|
|
70
73
|
|
|
71
|
-
return df
|
|
74
|
+
return df, self.search_keys, self.generated_features
|
|
72
75
|
|
|
73
76
|
def _rename_columns(self, df: pd.DataFrame):
|
|
74
77
|
# logger.info("Replace restricted symbols in column names")
|
|
75
78
|
new_columns = []
|
|
76
79
|
dup_counter = 0
|
|
77
80
|
for column in df.columns:
|
|
78
|
-
if
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
|
|
84
|
-
|
|
85
|
-
|
|
81
|
+
if (
|
|
82
|
+
column
|
|
83
|
+
in [
|
|
84
|
+
TARGET,
|
|
85
|
+
EVAL_SET_INDEX,
|
|
86
|
+
SYSTEM_RECORD_ID,
|
|
87
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
88
|
+
SEARCH_KEY_UNNEST,
|
|
89
|
+
DateTimeSearchKeyConverter.DATETIME_COL,
|
|
90
|
+
]
|
|
91
|
+
+ self.generated_features
|
|
92
|
+
):
|
|
86
93
|
self.columns_renaming[column] = column
|
|
87
94
|
new_columns.append(column)
|
|
88
95
|
continue
|
|
@@ -1,9 +1,9 @@
|
|
|
1
|
-
upgini/__about__.py,sha256
|
|
1
|
+
upgini/__about__.py,sha256=mxzjGyB-ihJR05pHYcBiUe_XT5X9wj6cBHLxOXBhAeM,23
|
|
2
2
|
upgini/__init__.py,sha256=M64LwQTBa-5Jz24Zm2h8rWwlKQQ1J8nP7gGgIciS0WU,589
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=olZ-OHSfBNoBSCo7R5t7uCLukI2nO7afpx_A-HCiJLk,31067
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=vRC7g6n6XQxSrvzXk6NJjP0ZytDQhWR4sTAo4Hp7gmA,188319
|
|
7
7
|
upgini/http.py,sha256=21asexflvavydzCOONJDGQBtQanCElrbnqLXakJ9Cu8,42880
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=osmzdNESeh7yP3BZday6N9Q3eaIHfzhhRM1d6NSgcf0,11223
|
|
@@ -15,7 +15,7 @@ upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9Jvf
|
|
|
15
15
|
upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo-R-nGdw,2648
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operands.py,sha256=cCCB44qvkmuWyiRM5Xykx8tkHPIjQthrWyj67STWN80,2578
|
|
18
|
-
upgini/autofe/binary.py,sha256=
|
|
18
|
+
upgini/autofe/binary.py,sha256=zMhtHVuGUAFLUqem-XiXqJj-GRXxS88tdz8tFuDfSNM,7659
|
|
19
19
|
upgini/autofe/date.py,sha256=OpFc3Al0xO3qlESn2Uokfxw51ArVqmh3xngWwdrsaqE,9762
|
|
20
20
|
upgini/autofe/feature.py,sha256=eL7wABUhDKZzv3E-RPJNcyGwSfB0UptcfU2RbvsOks4,15082
|
|
21
21
|
upgini/autofe/groupby.py,sha256=r-xl_keZZgm_tpiEoDhjYSkT6NHv7a4cRQR4wJ4uCp8,3263
|
|
@@ -27,7 +27,7 @@ upgini/data_source/data_source_publisher.py,sha256=X-8aGtVgzGmxyXkMVBoBLIGDMb4lY
|
|
|
27
27
|
upgini/mdc/__init__.py,sha256=aM08nIWFc2gWdWUa3_IuEnNND0cQPkBGnYpRMnfFN8k,1019
|
|
28
28
|
upgini/mdc/context.py,sha256=3u1B-jXt7tXEvNcV3qmR9SDCseudnY7KYsLclBdwVLk,1405
|
|
29
29
|
upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
30
|
-
upgini/normalizer/normalize_utils.py,sha256=
|
|
30
|
+
upgini/normalizer/normalize_utils.py,sha256=Lv75lq7M46z9cAIutwkdKZtPZkWblgoRzToAJ1BwY8A,7709
|
|
31
31
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
32
32
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
33
33
|
upgini/resource_bundle/strings.properties,sha256=9kvmcUrsSFUCrzOiN0Ozf-lQ2H8Igz5gATUPoHMOaU4,26456
|
|
@@ -57,7 +57,7 @@ upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,4
|
|
|
57
57
|
upgini/utils/target_utils.py,sha256=BVtDmrmFMKerSUWaNOIEdzsYHIFiODdpnWbE50QDPDc,7864
|
|
58
58
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
59
59
|
upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
|
|
60
|
-
upgini-1.2.
|
|
61
|
-
upgini-1.2.
|
|
62
|
-
upgini-1.2.
|
|
63
|
-
upgini-1.2.
|
|
60
|
+
upgini-1.2.14.dist-info/METADATA,sha256=xIbSnwYAnie1HKKRe_6MEUGG0BaZRtvWAsRlc2vzlBw,48577
|
|
61
|
+
upgini-1.2.14.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
62
|
+
upgini-1.2.14.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
63
|
+
upgini-1.2.14.dist-info/RECORD,,
|
|
File without changes
|