upgini 1.2.113a4__py3-none-any.whl → 1.2.113a6__py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to their public registry. It is provided for informational purposes only and reflects the changes between those versions.
- upgini/__about__.py +1 -1
- upgini/dataset.py +3 -1
- upgini/features_enricher.py +49 -39
- upgini/utils/psi.py +26 -0
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/METADATA +1 -1
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/RECORD +8 -8
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/WHEEL +0 -0
- {upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.113a4"
+__version__ = "1.2.113a6"
upgini/dataset.py
CHANGED
@@ -343,7 +343,9 @@ class Dataset:
            if col in mandatory_columns:
                self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]

-
+            # Use stable pandas API across versions: Series.unique keeps order
+            # and collapses multiple NaNs into a single NaN
+            invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
            valid_share = self.data[f"{col}_is_valid"].sum() / nrows
            original_col_name = self.columns_renaming[col]
            validation_stats[original_col_name] = {}
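To make the new invalid-value sampling concrete, here is a minimal, hedged sketch (toy column names, not the library's validation code) of why `Series.unique().tolist()[:5]` keeps the reported examples short and collapses repeated missing values into a single entry:

import pandas as pd

# Hypothetical validation frame: the "_is_valid" flag marks rows that failed a check.
data = pd.DataFrame({
    "phone": ["123", None, "bad", None, "bad"],
    "phone_is_valid": [1, 0, 0, 0, 0],
})

# Series.unique preserves first-seen order and reports repeated missing values once,
# so the sample of invalid values stays short and readable.
invalid_values = data.loc[data["phone_is_valid"] == 0, "phone"].unique().tolist()[:5]
print(invalid_values)  # [None, 'bad']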
upgini/features_enricher.py
CHANGED
@@ -112,7 +112,7 @@ except Exception:
        CustomFallbackProgressBar as ProgressBar,
    )

-from upgini.utils.psi import calculate_features_psi
+from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
 from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
 from upgini.utils.sort import sort_columns
 from upgini.utils.target_utils import calculate_psi, define_task
@@ -1012,15 +1012,11 @@ class FeaturesEnricher(TransformerMixin):
        if self.id_columns and self.id_columns_encoder is not None:
            if cat_features_from_backend:
                cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
                ]
            if client_cat_features:
                client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
                ]
        for cat_feature in cat_features_from_backend:
            if cat_feature in self.search_keys:
@@ -1384,15 +1380,11 @@ class FeaturesEnricher(TransformerMixin):
        if self.id_columns and self.id_columns_encoder is not None:
            if cat_features_from_backend:
                cat_features_from_backend = [
-                    c
-                    for c in cat_features_from_backend
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
                ]
            if client_cat_features:
                client_cat_features = [
-                    c
-                    for c in client_cat_features
-                    if c not in self.id_columns_encoder.feature_names_in_
+                    c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
                ]

        prepared_data = self._prepare_data_for_metrics(
@@ -1513,15 +1505,29 @@

            checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]

+        psi_values_sparse = calculate_sparsity_psi(
+            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+        )
+
+        unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+        if unstable_by_sparsity:
+            self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
        psi_values = calculate_features_psi(
            checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
        )

+        unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+        if unstable_by_value:
+            self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
        self.psi_values = {
            feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
        }

-
+        total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+        return total_unstable_features

    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
        renaming = self.fit_columns_renaming or {}
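The stability check above now feeds two PSI passes into one exclusion list. A hedged toy sketch of how the two passes combine (feature names, PSI values, and the threshold below are invented for illustration; in the enricher they come from calculate_sparsity_psi and calculate_features_psi):

# Toy PSI results: sparsity PSI measures drift of the missingness mask,
# value PSI measures drift of the feature values themselves.
psi_values_sparse = {"feature_a": 0.05, "feature_b": 0.31}
psi_values = {"feature_a": 0.27, "feature_b": 0.02}
stability_threshold = 0.2  # hypothetical threshold

unstable_by_sparsity = [f for f, psi in psi_values_sparse.items() if psi > stability_threshold]
unstable_by_value = [f for f, psi in psi_values.items() if psi > stability_threshold]

# A feature is reported as unstable if either its values or its sparsity pattern drift.
total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
print(total_unstable_features)  # ['feature_a', 'feature_b']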
@@ -1815,7 +1821,8 @@
                or c in set(self.feature_names_).union(self.id_columns or [])
                or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
            )
-            and c not in (
+            and c
+            not in (
                excluding_search_keys
                + list(self.fit_dropped_features)
                + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -2273,13 +2280,7 @@
            enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
        )

-
-        reversed_renaming = {v: k for k, v in columns_renaming.items()}
-        X_sampled.rename(columns=reversed_renaming, inplace=True)
-        enriched_X.rename(columns=reversed_renaming, inplace=True)
-        for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
-            eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
-            enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+        search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}

        # Cache and return results
        datasets_hash = hash_input(validated_X, validated_y, eval_set)
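Instead of renaming every sampled frame back to the user's column names, the replacement above only translates the keys of search_keys. A hedged illustration with invented names of the same dict-comprehension trick:

# columns_renaming maps internal (renamed) column names back to the user's originals.
columns_renaming = {"col_9f3a": "signup_date"}
search_keys = {"col_9f3a": "DATE"}

# Translate only the mapping; the sampled frames keep their internal names.
search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
print(search_keys)  # {'signup_date': 'DATE'}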
@@ -3212,7 +3213,15 @@ if response.status_code == 200:
        self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]

        # Group columns should have normalized names
-        self.
+        if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
+            original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
+            self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
+                [
+                    original_to_hash.get(c, c)
+                    for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
+                ]
+            )
+
        if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
            id_columns = self.__get_renamed_id_columns()
            if id_columns:
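The normalization above rewrites user-supplied group column names into the internal names used during fit. A hedged sketch of the same mapping step, with invented names (fit_columns_renaming here is assumed to map internal names back to originals, as in the hunk):

# Reverse the renaming so it maps original -> internal.
fit_columns_renaming = {"f_ab12": "user_id", "f_cd34": "region"}
original_to_hash = {v: k for k, v in fit_columns_renaming.items()}

group_columns = "user_id,region,unknown_col"
normalized = ",".join(original_to_hash.get(c, c) for c in group_columns.split(","))
print(normalized)  # f_ab12,f_cd34,unknown_col  (unknown names pass through unchanged)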
@@ -3517,23 +3526,24 @@
        reverse_renaming = {v: k for k, v in renaming.items()}
        return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]

-    def __adjust_cv(self, df: pd.DataFrame
-        if self.cv is
-
+    def __adjust_cv(self, df: pd.DataFrame):
+        if self.cv is None:
+            date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+            # Check Multivariate time series
+            if (
+                date_column
+                and self.model_task_type == ModelTaskType.REGRESSION
+                and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
+                == 0
+                and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+            ):
+                msg = self.bundle.get("multivariate_timeseries_detected")
+                self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+            elif self.model_task_type != ModelTaskType.REGRESSION:
+                msg = self.bundle.get("group_k_fold_in_classification")
+                self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)

-
-        # Check Multivariate time series
-        if (
-            date_column
-            and self.model_task_type == ModelTaskType.REGRESSION
-            and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
-            and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
-        ):
-            msg = self.bundle.get("multivariate_timeseries_detected")
-            self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
-        elif self.model_task_type != ModelTaskType.REGRESSION:
-            msg = self.bundle.get("group_k_fold_in_classification")
-            self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+        if self.cv == CVType.group_k_fold:
            group_columns = self._get_group_columns(df, self.fit_search_keys)
            self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
            self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
upgini/utils/psi.py
CHANGED
@@ -42,6 +42,32 @@ DEFAULT_FEATURES_PARAMS = StabilityParams(
 )


+def calculate_sparsity_psi(
+    df: pd.DataFrame,
+    cat_features: list[str],
+    date_column: str,
+    logger: logging.Logger,
+    model_task_type: ModelTaskType,
+    psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+    psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+) -> Dict[str, float]:
+    sparse_features = df.columns[df.isna().sum() > 0].to_list()
+    if len(sparse_features) > 0:
+        logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+        sparse_df = df[sparse_features].notna()
+        sparse_df[date_column] = df[date_column]
+        return calculate_features_psi(
+            sparse_df,
+            cat_features,
+            date_column,
+            logger,
+            model_task_type,
+            psi_target_params,
+            psi_features_params,
+        )
+    return {}
+
+
 def calculate_features_psi(
     df: pd.DataFrame,
     cat_features: list[str],
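To make the new helper concrete: it compares the missingness pattern of a feature over time rather than its values, by running PSI on the boolean notna mask. Below is a hedged toy sketch of that idea with a simplified two-bin PSI; the real helper delegates binning and the PSI math to calculate_features_psi, so treat this only as an illustration:

import numpy as np
import pandas as pd

# Toy feature whose sparsity changes between two periods.
feature = pd.Series([1.0, None, 2.0, None, None, None, None, None])
mask = feature.notna()                      # True where a value is present

old, new = mask.iloc[:4], mask.iloc[4:]     # two hypothetical date bins
p = np.clip([old.mean(), 1 - old.mean()], 1e-6, None)   # [present, missing] shares, old bin
q = np.clip([new.mean(), 1 - new.mean()], 1e-6, None)   # [present, missing] shares, new bin

psi = float(np.sum((p - q) * np.log(p / q)))
print(round(psi, 2))  # large PSI -> the feature's sparsity is unstable over time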
{upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/RECORD
CHANGED
@@ -1,9 +1,9 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=hA793gAu-mC2Lw0M27RABL7IKbRk6aGmyLjnqBIPNOc,26
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=TU_Kk574JCNlx_PaeDu1HN1qwi5IzlwkkAbUqU_M_QM,32860
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=liJcrINi_NvPLHJqELYqF9Gcga2PA3l4UYvD3ieBkB8,234815
 upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
 upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
 upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
@@ -64,7 +64,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/psi.py,sha256=
+upgini/utils/psi.py,sha256=pLtECcCeco_WRqMjFnQvhUB4vHArjHtD5HzJFP9ICMc,10972
 upgini/utils/sample_utils.py,sha256=lZJ4yf9Jiq9Em2Ny9m3RIiF7WSxBPrc4E3xxn_8sQk8,15417
 upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -72,7 +72,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
+upgini-1.2.113a6.dist-info/METADATA,sha256=xMMMKyiR7qAg6VHYaxUnshC75w7h3REvo0NGYIEWwJM,49531
+upgini-1.2.113a6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.113a6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.113a6.dist-info/RECORD,,
{upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/WHEEL
File without changes

{upgini-1.2.113a4.dist-info → upgini-1.2.113a6.dist-info}/licenses/LICENSE
File without changes