upgini 1.2.121a3__py3-none-any.whl → 1.2.122__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +17 -0
- upgini/autofe/operator.py +26 -0
- upgini/features_enricher.py +32 -24
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/features_validator.py +5 -1
- upgini/utils/psi.py +0 -1
- {upgini-1.2.121a3.dist-info → upgini-1.2.122.dist-info}/METADATA +1 -1
- {upgini-1.2.121a3.dist-info → upgini-1.2.122.dist-info}/RECORD +11 -11
- {upgini-1.2.121a3.dist-info → upgini-1.2.122.dist-info}/WHEEL +0 -0
- {upgini-1.2.121a3.dist-info → upgini-1.2.122.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.121a3"
|
1
|
+
__version__ = "1.2.122"
|
upgini/autofe/feature.py
CHANGED
@@ -42,6 +42,9 @@ class Column:
|
|
42
42
|
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
43
43
|
return self.get_columns(**kwargs)[0]
|
44
44
|
|
45
|
+
def reset_display_indices(self) -> "Column":
|
46
|
+
return self
|
47
|
+
|
45
48
|
def _unhash(self, feature_name: str) -> str:
|
46
49
|
last_component_idx = feature_name.rfind("_")
|
47
50
|
if not feature_name.startswith("f_"):
|
@@ -147,6 +150,13 @@ class Feature:
|
|
147
150
|
self.cached_display_name = None
|
148
151
|
return self
|
149
152
|
|
153
|
+
def rename_op_params(self, mapping: Dict[str, str]) -> "Feature":
|
154
|
+
self.op.rename_params(mapping)
|
155
|
+
for child in self.children:
|
156
|
+
if isinstance(child, Feature):
|
157
|
+
child.rename_op_params(mapping)
|
158
|
+
return self
|
159
|
+
|
150
160
|
def get_column_nodes(self) -> List[Union[Column, "Feature"]]:
|
151
161
|
res = []
|
152
162
|
for child in self.children:
|
@@ -212,6 +222,13 @@ class Feature:
|
|
212
222
|
self.cached_display_name = None
|
213
223
|
return self
|
214
224
|
|
225
|
+
def reset_display_indices(self) -> "Feature":
|
226
|
+
for child in self.children:
|
227
|
+
child.reset_display_indices()
|
228
|
+
self.display_index = None
|
229
|
+
self.cached_display_name = None
|
230
|
+
return self
|
231
|
+
|
215
232
|
def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
|
216
233
|
if self.op.output_type:
|
217
234
|
return self.op.output_type
|
upgini/autofe/operator.py
CHANGED
@@ -89,6 +89,32 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
|
|
89
89
|
def delete_data(self):
|
90
90
|
pass
|
91
91
|
|
92
|
+
def rename_params(self, columns_renaming: Dict[str, str]) -> "Operator":
|
93
|
+
# Rename occurrences of column names inside self.params keys according to columns_renaming
|
94
|
+
if not self.params or not columns_renaming:
|
95
|
+
return self
|
96
|
+
|
97
|
+
# Replace longer keys first to avoid partial overlaps
|
98
|
+
replacements = sorted(columns_renaming.items(), key=lambda kv: -len(kv[0]))
|
99
|
+
|
100
|
+
renamed_params: Dict[str, str] = {}
|
101
|
+
for param_key, param_value in self.params.items():
|
102
|
+
new_key = param_key
|
103
|
+
for old, new in replacements:
|
104
|
+
if old and old in new_key:
|
105
|
+
new_key = new_key.replace(old, new)
|
106
|
+
|
107
|
+
if new_key in renamed_params and new_key != param_key:
|
108
|
+
self._logger.warning(
|
109
|
+
"Param key collision after rename: '%s' -> '%s'. Overwriting value.",
|
110
|
+
param_key,
|
111
|
+
new_key,
|
112
|
+
)
|
113
|
+
renamed_params[new_key] = param_value
|
114
|
+
|
115
|
+
self.params = renamed_params
|
116
|
+
return self
|
117
|
+
|
92
118
|
|
93
119
|
class ParametrizedOperator(Operator, abc.ABC):
|
94
120
|
|
upgini/features_enricher.py
CHANGED
@@ -1028,7 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1028
1028
|
columns_renaming,
|
1029
1029
|
_,
|
1030
1030
|
) = prepared_data
|
1031
|
-
|
1031
|
+
|
1032
1032
|
gc.collect()
|
1033
1033
|
|
1034
1034
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
@@ -1406,7 +1406,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1406
1406
|
self,
|
1407
1407
|
X: pd.DataFrame,
|
1408
1408
|
eval_set: list[tuple[pd.DataFrame, pd.Series]],
|
1409
|
-
enriched_eval_set: dict,
|
1409
|
+
enriched_eval_set: dict[int, tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]],
|
1410
1410
|
eval_set_dates: dict[int, pd.Series],
|
1411
1411
|
search_keys: dict[str, SearchKey],
|
1412
1412
|
stability_threshold: float,
|
@@ -1417,31 +1417,42 @@ class FeaturesEnricher(TransformerMixin):
|
|
1417
1417
|
# Find latest eval set or earliest if all eval sets are before train set
|
1418
1418
|
date_column = self._get_date_column(search_keys)
|
1419
1419
|
|
1420
|
+
date_converter = DateTimeSearchKeyConverter(
|
1421
|
+
date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
|
1422
|
+
)
|
1423
|
+
|
1424
|
+
X = date_converter.convert(X)
|
1425
|
+
|
1420
1426
|
x_date = X[date_column].dropna()
|
1421
|
-
if
|
1422
|
-
|
1423
|
-
|
1427
|
+
if len(x_date) == 0:
|
1428
|
+
self.logger.warning("Empty date column in X")
|
1429
|
+
return []
|
1424
1430
|
|
1425
|
-
|
1426
|
-
eval_x_date = eval_x[date_column].dropna()
|
1427
|
-
if not is_numeric_dtype(eval_x_date):
|
1428
|
-
eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
|
1431
|
+
main_min_date = x_date.min()
|
1429
1432
|
|
1430
1433
|
# Find minimum date for each eval_set and compare with main dataset
|
1431
1434
|
eval_dates = []
|
1432
1435
|
for i, (eval_x, _) in enumerate(eval_set):
|
1433
|
-
if date_column in eval_x.columns:
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1436
|
+
if date_column not in eval_x.columns:
|
1437
|
+
self.logger.warning(f"Date column not found in eval_set {i + 1}")
|
1438
|
+
continue
|
1439
|
+
eval_x = date_converter.convert(eval_x)
|
1440
|
+
eval_x_date = eval_x[date_column].dropna()
|
1441
|
+
if len(eval_x_date) < 1000:
|
1442
|
+
self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
|
1443
|
+
continue
|
1444
|
+
if len(enriched_eval_set[i][2]) < 1000:
|
1445
|
+
self.logger.warning(
|
1446
|
+
f"Enriched eval_set {i} has less than 1000 rows. It will be ignored for stability check"
|
1447
|
+
)
|
1448
|
+
continue
|
1449
|
+
|
1450
|
+
eval_min_date = eval_x_date.min()
|
1451
|
+
eval_max_date = eval_x_date.max()
|
1452
|
+
eval_dates.append((i, eval_min_date, eval_max_date))
|
1443
1453
|
|
1444
1454
|
if not eval_dates:
|
1455
|
+
self.logger.warning("There are no correct eval_sets for stability check")
|
1445
1456
|
return []
|
1446
1457
|
|
1447
1458
|
# Check if any eval_set has minimum date >= main dataset minimum date
|
@@ -1464,10 +1475,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1464
1475
|
checking_eval_set_df = checking_eval_set_df.copy()
|
1465
1476
|
|
1466
1477
|
checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
|
1467
|
-
|
1468
|
-
checking_eval_set_df[date_column] = (
|
1469
|
-
pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
|
1470
|
-
)
|
1478
|
+
checking_eval_set_df = date_converter.convert(checking_eval_set_df)
|
1471
1479
|
|
1472
1480
|
psi_values_sparse = calculate_sparsity_psi(
|
1473
1481
|
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
|
@@ -3378,8 +3386,8 @@ if response.status_code == 200:
|
|
3378
3386
|
except KeyboardInterrupt as e:
|
3379
3387
|
print(self.bundle.get("search_stopping"))
|
3380
3388
|
self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
|
3381
|
-
self._search_task = None
|
3382
3389
|
self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
|
3390
|
+
self._search_task = None
|
3383
3391
|
print(self.bundle.get("search_stopped"))
|
3384
3392
|
raise e
|
3385
3393
|
|
upgini/resource_bundle/strings.properties
CHANGED
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
|
|
155
155
|
# features validation
|
156
156
|
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
157
157
|
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
158
|
-
one_hot_encoded_features=One hot encoded features detected
|
158
|
+
one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
159
159
|
|
160
160
|
# Dataset validation
|
161
161
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
upgini/utils/features_validator.py
CHANGED
@@ -46,7 +46,7 @@ class FeaturesValidator:
|
|
46
46
|
|
47
47
|
if one_hot_encoded_features:
|
48
48
|
msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
49
|
-
|
49
|
+
warnings.append(msg)
|
50
50
|
|
51
51
|
columns_renaming = columns_renaming or {}
|
52
52
|
|
@@ -100,6 +100,10 @@ class FeaturesValidator:
|
|
100
100
|
@staticmethod
|
101
101
|
def is_one_hot_encoded(series: pd.Series) -> bool:
|
102
102
|
try:
|
103
|
+
# All rows should be the same type
|
104
|
+
if series.apply(lambda x: type(x)).nunique() != 1:
|
105
|
+
return False
|
106
|
+
|
103
107
|
# First, handle string representations of True/False
|
104
108
|
series_copy = series.copy()
|
105
109
|
if series_copy.dtype == "object" or series_copy.dtype == "string":
|
upgini/utils/psi.py
CHANGED
{upgini-1.2.121a3.dist-info → upgini-1.2.122.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256
|
1
|
+
upgini/__about__.py,sha256=-JqzGEBlhFUnCWmxu0lqdTawM1jUPGK4oP4I-0hFJNI,24
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=44_WWpTiJXZzmp2iAoY2SSYgHuaB_RqnLZ35zkNssK8,231839
|
7
7
|
upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
@@ -16,9 +16,9 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
|
18
18
|
upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
|
19
|
-
upgini/autofe/feature.py,sha256=
|
19
|
+
upgini/autofe/feature.py,sha256=W9sZHdz5Vi0H_oPyY5saZAPjyd5wunpULnCqrGLpQc4,16879
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
|
-
upgini/autofe/operator.py,sha256=
|
21
|
+
upgini/autofe/operator.py,sha256=3i4aWqlRomgTIVAPnivwFb3St87UoWMtZBTzQNJCyuU,6278
|
22
22
|
upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
|
23
23
|
upgini/autofe/utils.py,sha256=dYrtyAM8Vcc_R8u4dNo54IsGrHKagTHDJTKhGho0bRg,2967
|
24
24
|
upgini/autofe/vector.py,sha256=r5H6DKT5f3KNjERpV2OOloZ96nDWkModXnpsqw_A77Q,2313
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
|
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
59
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
60
60
|
upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
|
61
|
-
upgini/utils/features_validator.py,sha256=
|
61
|
+
upgini/utils/features_validator.py,sha256=A_3AX7X5u5AH7RLgkTiS6dHxaOiq5vm8w4ijQWLGcMY,4871
|
62
62
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
63
63
|
upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
@@ -66,7 +66,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
67
67
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
69
|
-
upgini/utils/psi.py,sha256=
|
69
|
+
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
71
71
|
upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
|
72
72
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.121a3.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
-
upgini-1.2.121a3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
-
upgini-1.2.121a3.dist-info/RECORD,,
|
77
|
+
upgini-1.2.122.dist-info/METADATA,sha256=e9lV45Du_2DKcMVvqgXpI1TkicMzXsiPApqm6b9tsYU,50743
|
78
|
+
upgini-1.2.122.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.122.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.122.dist-info/RECORD,,
|
File without changes
|
File without changes
|