upgini 1.2.121a2__py3-none-any.whl → 1.2.122a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +10 -0
- upgini/features_enricher.py +31 -23
- upgini/resource_bundle/strings.properties +1 -1
- upgini/utils/features_validator.py +18 -7
- upgini/utils/psi.py +0 -1
- {upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/METADATA +1 -1
- {upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/RECORD +10 -10
- {upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/WHEEL +0 -0
- {upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.121a2"
|
1
|
+
__version__ = "1.2.122a1"
|
upgini/autofe/feature.py
CHANGED
@@ -42,6 +42,9 @@ class Column:
|
|
42
42
|
def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
|
43
43
|
return self.get_columns(**kwargs)[0]
|
44
44
|
|
45
|
+
def reset_display_indices(self) -> "Column":
|
46
|
+
return self
|
47
|
+
|
45
48
|
def _unhash(self, feature_name: str) -> str:
|
46
49
|
last_component_idx = feature_name.rfind("_")
|
47
50
|
if not feature_name.startswith("f_"):
|
@@ -212,6 +215,13 @@ class Feature:
|
|
212
215
|
self.cached_display_name = None
|
213
216
|
return self
|
214
217
|
|
218
|
+
def reset_display_indices(self) -> "Feature":
|
219
|
+
for child in self.children:
|
220
|
+
child.reset_display_indices()
|
221
|
+
self.display_index = None
|
222
|
+
self.cached_display_name = None
|
223
|
+
return self
|
224
|
+
|
215
225
|
def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
|
216
226
|
if self.op.output_type:
|
217
227
|
return self.op.output_type
|
upgini/features_enricher.py
CHANGED
@@ -1028,7 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1028
1028
|
columns_renaming,
|
1029
1029
|
_,
|
1030
1030
|
) = prepared_data
|
1031
|
-
|
1031
|
+
|
1032
1032
|
gc.collect()
|
1033
1033
|
|
1034
1034
|
if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
|
@@ -1406,7 +1406,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1406
1406
|
self,
|
1407
1407
|
X: pd.DataFrame,
|
1408
1408
|
eval_set: list[tuple[pd.DataFrame, pd.Series]],
|
1409
|
-
enriched_eval_set: dict,
|
1409
|
+
enriched_eval_set: dict[int, tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]],
|
1410
1410
|
eval_set_dates: dict[int, pd.Series],
|
1411
1411
|
search_keys: dict[str, SearchKey],
|
1412
1412
|
stability_threshold: float,
|
@@ -1417,31 +1417,42 @@ class FeaturesEnricher(TransformerMixin):
|
|
1417
1417
|
# Find latest eval set or earliest if all eval sets are before train set
|
1418
1418
|
date_column = self._get_date_column(search_keys)
|
1419
1419
|
|
1420
|
+
date_converter = DateTimeSearchKeyConverter(
|
1421
|
+
date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
|
1422
|
+
)
|
1423
|
+
|
1424
|
+
X = date_converter.convert(X)
|
1425
|
+
|
1420
1426
|
x_date = X[date_column].dropna()
|
1421
|
-
if
|
1422
|
-
|
1423
|
-
|
1427
|
+
if len(x_date) == 0:
|
1428
|
+
self.logger.warning("Empty date column in X")
|
1429
|
+
return []
|
1424
1430
|
|
1425
|
-
|
1426
|
-
eval_x_date = eval_x[date_column].dropna()
|
1427
|
-
if not is_numeric_dtype(eval_x_date):
|
1428
|
-
eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
|
1431
|
+
main_min_date = x_date.min()
|
1429
1432
|
|
1430
1433
|
# Find minimum date for each eval_set and compare with main dataset
|
1431
1434
|
eval_dates = []
|
1432
1435
|
for i, (eval_x, _) in enumerate(eval_set):
|
1433
|
-
if date_column in eval_x.columns:
|
1434
|
-
|
1435
|
-
|
1436
|
-
|
1437
|
-
|
1438
|
-
|
1439
|
-
|
1440
|
-
|
1441
|
-
|
1442
|
-
|
1436
|
+
if date_column not in eval_x.columns:
|
1437
|
+
self.logger.warning(f"Date column not found in eval_set {i + 1}")
|
1438
|
+
continue
|
1439
|
+
eval_x = date_converter.convert(eval_x)
|
1440
|
+
eval_x_date = eval_x[date_column].dropna()
|
1441
|
+
if len(eval_x_date) < 1000:
|
1442
|
+
self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
|
1443
|
+
continue
|
1444
|
+
if len(enriched_eval_set[i][2]) < 1000:
|
1445
|
+
self.logger.warning(
|
1446
|
+
f"Enriched eval_set {i} has less than 1000 rows. It will be ignored for stability check"
|
1447
|
+
)
|
1448
|
+
continue
|
1449
|
+
|
1450
|
+
eval_min_date = eval_x_date.min()
|
1451
|
+
eval_max_date = eval_x_date.max()
|
1452
|
+
eval_dates.append((i, eval_min_date, eval_max_date))
|
1443
1453
|
|
1444
1454
|
if not eval_dates:
|
1455
|
+
self.logger.warning("There are no correct eval_sets for stability check")
|
1445
1456
|
return []
|
1446
1457
|
|
1447
1458
|
# Check if any eval_set has minimum date >= main dataset minimum date
|
@@ -1464,10 +1475,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
1464
1475
|
checking_eval_set_df = checking_eval_set_df.copy()
|
1465
1476
|
|
1466
1477
|
checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
|
1467
|
-
|
1468
|
-
checking_eval_set_df[date_column] = (
|
1469
|
-
pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
|
1470
|
-
)
|
1478
|
+
checking_eval_set_df = date_converter.convert(checking_eval_set_df)
|
1471
1479
|
|
1472
1480
|
psi_values_sparse = calculate_sparsity_psi(
|
1473
1481
|
checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
|
upgini/resource_bundle/strings.properties
CHANGED
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
|
|
155
155
|
# features validation
|
156
156
|
empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
|
157
157
|
high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
|
158
|
-
one_hot_encoded_features=One hot encoded features detected
|
158
|
+
one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
|
159
159
|
|
160
160
|
# Dataset validation
|
161
161
|
dataset_too_few_rows=X size should be at least {} rows after validation
|
upgini/utils/features_validator.py
CHANGED
@@ -46,7 +46,7 @@ class FeaturesValidator:
|
|
46
46
|
|
47
47
|
if one_hot_encoded_features:
|
48
48
|
msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
|
49
|
-
|
49
|
+
warnings.append(msg)
|
50
50
|
|
51
51
|
columns_renaming = columns_renaming or {}
|
52
52
|
|
@@ -100,18 +100,29 @@ class FeaturesValidator:
|
|
100
100
|
@staticmethod
|
101
101
|
def is_one_hot_encoded(series: pd.Series) -> bool:
|
102
102
|
try:
|
103
|
-
#
|
104
|
-
series
|
105
|
-
|
103
|
+
# All rows should be the same type
|
104
|
+
if series.apply(lambda x: type(x)).nunique() != 1:
|
105
|
+
return False
|
106
|
+
|
107
|
+
# First, handle string representations of True/False
|
108
|
+
series_copy = series.copy()
|
109
|
+
if series_copy.dtype == "object" or series_copy.dtype == "string":
|
110
|
+
# Convert string representations of boolean values to numeric
|
111
|
+
series_copy = series_copy.astype(str).str.strip().str.lower()
|
112
|
+
series_copy = series_copy.replace({"true": "1", "false": "0"})
|
113
|
+
|
114
|
+
# Column contains only 0 and 1 (as strings or numbers or booleans)
|
115
|
+
series_copy = series_copy.astype(float)
|
116
|
+
if set(series_copy.unique()) != {0.0, 1.0}:
|
106
117
|
return False
|
107
118
|
|
108
|
-
|
119
|
+
series_copy = series_copy.astype(int)
|
109
120
|
|
110
121
|
# Column doesn't contain any NaN, np.NaN, space, null, etc.
|
111
|
-
if not (
|
122
|
+
if not (series_copy.isin([0, 1])).all():
|
112
123
|
return False
|
113
124
|
|
114
|
-
vc =
|
125
|
+
vc = series_copy.value_counts()
|
115
126
|
# Column should contain both 0 and 1
|
116
127
|
if len(vc) != 2:
|
117
128
|
return False
|
upgini/utils/psi.py
CHANGED
{upgini-1.2.121a2.dist-info → upgini-1.2.122a1.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=hzzmPAt8OZIX5YRwSKl5dj9LWowWDEnOpFN5Xq2xARQ,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=lBaecwDHkKpYWTz8fxs5Q12bDJGPLcDOesCPh0xX96s,231839
|
7
7
|
upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
|
8
8
|
upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
|
9
9
|
upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
|
@@ -16,7 +16,7 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
16
16
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
17
17
|
upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
|
18
18
|
upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
|
19
|
-
upgini/autofe/feature.py,sha256=
|
19
|
+
upgini/autofe/feature.py,sha256=2jOdTTnUqdUewznxsveuTLgKcPLPNtFWS0YQsYYBbPk,16622
|
20
20
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
21
21
|
upgini/autofe/operator.py,sha256=RB3rKMjFi5Cx81RiYXN3OTCuXjmvzmFKQrxn4h0Oclo,5219
|
22
22
|
upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
44
44
|
upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
|
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
|
|
58
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
59
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
60
60
|
upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
|
61
|
-
upgini/utils/features_validator.py,sha256=
|
61
|
+
upgini/utils/features_validator.py,sha256=A_3AX7X5u5AH7RLgkTiS6dHxaOiq5vm8w4ijQWLGcMY,4871
|
62
62
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
63
63
|
upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
|
64
64
|
upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
|
@@ -66,7 +66,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
66
66
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
67
67
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
68
68
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
69
|
-
upgini/utils/psi.py,sha256=
|
69
|
+
upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
|
70
70
|
upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
|
71
71
|
upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
|
72
72
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
|
|
74
74
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
75
75
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
76
76
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
79
|
-
upgini-1.2.
|
80
|
-
upgini-1.2.
|
77
|
+
upgini-1.2.122a1.dist-info/METADATA,sha256=3pPdEVaYucgJB5Klks339i5-JTM7hJpEZUmZS7dEWi8,50745
|
78
|
+
upgini-1.2.122a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
79
|
+
upgini-1.2.122a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
80
|
+
upgini-1.2.122a1.dist-info/RECORD,,
|
File without changes
|
File without changes
|