upgini 1.2.113a5__py3-none-any.whl → 1.2.113a6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +3 -1
- upgini/features_enricher.py +32 -30
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a6.dist-info}/METADATA +1 -1
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a6.dist-info}/RECORD +7 -7
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a6.dist-info}/WHEEL +0 -0
- {upgini-1.2.113a5.dist-info → upgini-1.2.113a6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.113a5"
|
1
|
+
__version__ = "1.2.113a6"
|
upgini/dataset.py
CHANGED
@@ -343,7 +343,9 @@ class Dataset:
|
|
343
343
|
if col in mandatory_columns:
|
344
344
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
345
345
|
|
346
|
-
|
346
|
+
# Use stable pandas API across versions: Series.unique keeps order
|
347
|
+
# and collapses multiple NaNs into a single NaN
|
348
|
+
invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
|
347
349
|
valid_share = self.data[f"{col}_is_valid"].sum() / nrows
|
348
350
|
original_col_name = self.columns_renaming[col]
|
349
351
|
validation_stats[original_col_name] = {}
|
upgini/features_enricher.py
CHANGED
@@ -1012,15 +1012,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
1012
1012
|
if self.id_columns and self.id_columns_encoder is not None:
|
1013
1013
|
if cat_features_from_backend:
|
1014
1014
|
cat_features_from_backend = [
|
1015
|
-
c
|
1016
|
-
for c in cat_features_from_backend
|
1017
|
-
if c not in self.id_columns_encoder.feature_names_in_
|
1015
|
+
c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
|
1018
1016
|
]
|
1019
1017
|
if client_cat_features:
|
1020
1018
|
client_cat_features = [
|
1021
|
-
c
|
1022
|
-
for c in client_cat_features
|
1023
|
-
if c not in self.id_columns_encoder.feature_names_in_
|
1019
|
+
c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
|
1024
1020
|
]
|
1025
1021
|
for cat_feature in cat_features_from_backend:
|
1026
1022
|
if cat_feature in self.search_keys:
|
@@ -1384,15 +1380,11 @@ class FeaturesEnricher(TransformerMixin):
|
|
1384
1380
|
if self.id_columns and self.id_columns_encoder is not None:
|
1385
1381
|
if cat_features_from_backend:
|
1386
1382
|
cat_features_from_backend = [
|
1387
|
-
c
|
1388
|
-
for c in cat_features_from_backend
|
1389
|
-
if c not in self.id_columns_encoder.feature_names_in_
|
1383
|
+
c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
|
1390
1384
|
]
|
1391
1385
|
if client_cat_features:
|
1392
1386
|
client_cat_features = [
|
1393
|
-
c
|
1394
|
-
for c in client_cat_features
|
1395
|
-
if c not in self.id_columns_encoder.feature_names_in_
|
1387
|
+
c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
|
1396
1388
|
]
|
1397
1389
|
|
1398
1390
|
prepared_data = self._prepare_data_for_metrics(
|
@@ -1829,7 +1821,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1829
1821
|
or c in set(self.feature_names_).union(self.id_columns or [])
|
1830
1822
|
or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
|
1831
1823
|
)
|
1832
|
-
and c
|
1824
|
+
and c
|
1825
|
+
not in (
|
1833
1826
|
excluding_search_keys
|
1834
1827
|
+ list(self.fit_dropped_features)
|
1835
1828
|
+ [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
|
@@ -3220,7 +3213,15 @@ if response.status_code == 200:
|
|
3220
3213
|
self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
|
3221
3214
|
|
3222
3215
|
# Group columns should have normalized names
|
3223
|
-
self.
|
3216
|
+
if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
|
3217
|
+
original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
|
3218
|
+
self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
|
3219
|
+
[
|
3220
|
+
original_to_hash.get(c, c)
|
3221
|
+
for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
|
3222
|
+
]
|
3223
|
+
)
|
3224
|
+
|
3224
3225
|
if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
|
3225
3226
|
id_columns = self.__get_renamed_id_columns()
|
3226
3227
|
if id_columns:
|
@@ -3525,23 +3526,24 @@ if response.status_code == 200:
|
|
3525
3526
|
reverse_renaming = {v: k for k, v in renaming.items()}
|
3526
3527
|
return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
|
3527
3528
|
|
3528
|
-
def __adjust_cv(self, df: pd.DataFrame
|
3529
|
-
if self.cv is
|
3530
|
-
|
3529
|
+
def __adjust_cv(self, df: pd.DataFrame):
|
3530
|
+
if self.cv is None:
|
3531
|
+
date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
|
3532
|
+
# Check Multivariate time series
|
3533
|
+
if (
|
3534
|
+
date_column
|
3535
|
+
and self.model_task_type == ModelTaskType.REGRESSION
|
3536
|
+
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
|
3537
|
+
== 0
|
3538
|
+
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
3539
|
+
):
|
3540
|
+
msg = self.bundle.get("multivariate_timeseries_detected")
|
3541
|
+
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
3542
|
+
elif self.model_task_type != ModelTaskType.REGRESSION:
|
3543
|
+
msg = self.bundle.get("group_k_fold_in_classification")
|
3544
|
+
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
3531
3545
|
|
3532
|
-
|
3533
|
-
# Check Multivariate time series
|
3534
|
-
if (
|
3535
|
-
date_column
|
3536
|
-
and self.model_task_type == ModelTaskType.REGRESSION
|
3537
|
-
and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
|
3538
|
-
and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
|
3539
|
-
):
|
3540
|
-
msg = self.bundle.get("multivariate_timeseries_detected")
|
3541
|
-
self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
|
3542
|
-
elif self.model_task_type != ModelTaskType.REGRESSION:
|
3543
|
-
msg = self.bundle.get("group_k_fold_in_classification")
|
3544
|
-
self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
|
3546
|
+
if self.cv == CVType.group_k_fold:
|
3545
3547
|
group_columns = self._get_group_columns(df, self.fit_search_keys)
|
3546
3548
|
self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
|
3547
3549
|
self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=hA793gAu-mC2Lw0M27RABL7IKbRk6aGmyLjnqBIPNOc,26
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=TU_Kk574JCNlx_PaeDu1HN1qwi5IzlwkkAbUqU_M_QM,32860
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=liJcrINi_NvPLHJqELYqF9Gcga2PA3l4UYvD3ieBkB8,234815
|
7
7
|
upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
|
8
8
|
upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
|
9
9
|
upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
|
@@ -72,7 +72,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
|
|
72
72
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
73
73
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
74
74
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
77
|
-
upgini-1.2.
|
78
|
-
upgini-1.2.
|
75
|
+
upgini-1.2.113a6.dist-info/METADATA,sha256=xMMMKyiR7qAg6VHYaxUnshC75w7h3REvo0NGYIEWwJM,49531
|
76
|
+
upgini-1.2.113a6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
77
|
+
upgini-1.2.113a6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
78
|
+
upgini-1.2.113a6.dist-info/RECORD,,
|
File without changes
|
File without changes
|