upgini 1.2.113a4__py3-none-any.whl → 1.2.113a6__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package as they appear in the supported public registries. It is provided for informational purposes only.
upgini/__about__.py CHANGED
@@ -1 +1 @@
- __version__ = "1.2.113a4"
+ __version__ = "1.2.113a6"
upgini/dataset.py CHANGED
@@ -343,7 +343,9 @@ class Dataset:
  if col in mandatory_columns:
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]

- invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
+ # Use stable pandas API across versions: Series.unique keeps order
+ # and collapses multiple NaNs into a single NaN
+ invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
  valid_share = self.data[f"{col}_is_valid"].sum() / nrows
  original_col_name = self.columns_renaming[col]
  validation_stats[original_col_name] = {}
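
The change above replaces `list(set(Series.head().values))` with `Series.unique().tolist()[:5]`. A toy sketch with made-up data (not upgini code) of the pandas behaviour this relies on: `unique()` keeps first-seen order and reports missing values once, while `set()` has no stable order and may retain multiple NaN objects.

```python
# Toy illustration (not upgini code): Series.unique() preserves first-seen order and
# collapses NaN into a single entry; slicing afterwards keeps at most five examples.
import numpy as np
import pandas as pd

col = pd.Series(["b", "a", np.nan, "b", np.nan, "c"])

old_style = list(set(col.head().values))  # arbitrary order, inspects only the first 5 rows
new_style = col.unique().tolist()[:5]     # ['b', 'a', nan, 'c'] - ordered, single NaN, whole column

print(old_style)
print(new_style)
```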
upgini/features_enricher.py CHANGED
@@ -112,7 +112,7 @@ except Exception:
  CustomFallbackProgressBar as ProgressBar,
  )

- from upgini.utils.psi import calculate_features_psi
+ from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
  from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
  from upgini.utils.sort import sort_columns
  from upgini.utils.target_utils import calculate_psi, define_task
@@ -1012,15 +1012,11 @@ class FeaturesEnricher(TransformerMixin):
  if self.id_columns and self.id_columns_encoder is not None:
  if cat_features_from_backend:
  cat_features_from_backend = [
- c
- for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
  ]
  if client_cat_features:
  client_cat_features = [
- c
- for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
  ]
  for cat_feature in cat_features_from_backend:
  if cat_feature in self.search_keys:
@@ -1384,15 +1380,11 @@ class FeaturesEnricher(TransformerMixin):
  if self.id_columns and self.id_columns_encoder is not None:
  if cat_features_from_backend:
  cat_features_from_backend = [
- c
- for c in cat_features_from_backend
- if c not in self.id_columns_encoder.feature_names_in_
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
  ]
  if client_cat_features:
  client_cat_features = [
- c
- for c in client_cat_features
- if c not in self.id_columns_encoder.feature_names_in_
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
  ]

  prepared_data = self._prepare_data_for_metrics(
@@ -1513,15 +1505,29 @@ class FeaturesEnricher(TransformerMixin):

  checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]

+ psi_values_sparse = calculate_sparsity_psi(
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
+ )
+
+ unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
+ if unstable_by_sparsity:
+ self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
+
  psi_values = calculate_features_psi(
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
  )

+ unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+ if unstable_by_value:
+ self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
+
  self.psi_values = {
  feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
  }

- return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
+ total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
+
+ return total_unstable_features

  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
  renaming = self.fit_columns_renaming or {}
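
Taken together, the new logic marks a feature as unstable when either its sparsity pattern or its value distribution drifts past `stability_threshold`, and keeps only the value-stable features in `self.psi_values`. A toy walk-through with made-up feature names and PSI values:

```python
# Toy walk-through (made-up names and PSI values) of the selection logic in the hunk above.
stability_threshold = 0.2
psi_values_sparse = {"f_income": 0.05, "f_clicks": 0.31}             # PSI of the missing/present pattern
psi_values = {"f_income": 0.27, "f_clicks": 0.04, "f_region": 0.01}  # PSI of the value distribution

unstable_by_sparsity = [f for f, psi in psi_values_sparse.items() if psi > stability_threshold]
unstable_by_value = [f for f, psi in psi_values.items() if psi > stability_threshold]
kept_psi_values = {f: psi for f, psi in psi_values.items() if psi <= stability_threshold}
total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))

print(total_unstable_features)  # ['f_clicks', 'f_income'] - unstable in either check
print(kept_psi_values)          # {'f_clicks': 0.04, 'f_region': 0.01}
```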
@@ -1815,7 +1821,8 @@ class FeaturesEnricher(TransformerMixin):
  or c in set(self.feature_names_).union(self.id_columns or [])
  or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
  )
- and c not in (
+ and c
+ not in (
  excluding_search_keys
  + list(self.fit_dropped_features)
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -2273,13 +2280,7 @@ class FeaturesEnricher(TransformerMixin):
  enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
  )

- # Add hash-suffixes because output of transform has original names
- reversed_renaming = {v: k for k, v in columns_renaming.items()}
- X_sampled.rename(columns=reversed_renaming, inplace=True)
- enriched_X.rename(columns=reversed_renaming, inplace=True)
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
- eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
- enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
+ search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}

  # Cache and return results
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -3212,7 +3213,15 @@ if response.status_code == 200:
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]

  # Group columns should have normalized names
- self.__adjust_cv(df, force=True)
+ if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
+ original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
+ self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
+ [
+ original_to_hash.get(c, c)
+ for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
+ ]
+ )
+
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
  id_columns = self.__get_renamed_id_columns()
  if id_columns:
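
The replacement block translates the user-facing group column names stored in `cv_params.group_columns` into their fit-time hashed names by inverting `fit_columns_renaming`. A toy sketch of that step with made-up column names:

```python
# Toy sketch (made-up names): fit_columns_renaming maps hashed -> original column names,
# so inverting it lets the comma-separated group columns be rewritten to hashed form.
fit_columns_renaming = {"country_a1b2": "country", "date_9f8e": "date"}  # hashed -> original
original_to_hash = {v: k for k, v in fit_columns_renaming.items()}

group_columns = "country,store_id"  # value of cv_params.group_columns
renamed = ",".join(original_to_hash.get(c, c) for c in group_columns.split(","))
print(renamed)  # country_a1b2,store_id - unmapped columns pass through unchanged
```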
@@ -3517,23 +3526,24 @@ if response.status_code == 200:
  reverse_renaming = {v: k for k, v in renaming.items()}
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]

- def __adjust_cv(self, df: pd.DataFrame, force: bool = False):
- if self.cv is not None and not force:
- return
+ def __adjust_cv(self, df: pd.DataFrame):
+ if self.cv is None:
+ date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
+ # Check Multivariate time series
+ if (
+ date_column
+ and self.model_task_type == ModelTaskType.REGRESSION
+ and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
+ == 0
+ and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
+ ):
+ msg = self.bundle.get("multivariate_timeseries_detected")
+ self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
+ elif self.model_task_type != ModelTaskType.REGRESSION:
+ msg = self.bundle.get("group_k_fold_in_classification")
+ self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)

- date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
- # Check Multivariate time series
- if (
- date_column
- and self.model_task_type == ModelTaskType.REGRESSION
- and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
- and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
- ):
- msg = self.bundle.get("multivariate_timeseries_detected")
- self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
- elif self.model_task_type != ModelTaskType.REGRESSION:
- msg = self.bundle.get("group_k_fold_in_classification")
- self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
+ if self.cv == CVType.group_k_fold:
  group_columns = self._get_group_columns(df, self.fit_search_keys)
  self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
  self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
upgini/utils/psi.py CHANGED
@@ -42,6 +42,32 @@ DEFAULT_FEATURES_PARAMS = StabilityParams(
  )


+ def calculate_sparsity_psi(
+ df: pd.DataFrame,
+ cat_features: list[str],
+ date_column: str,
+ logger: logging.Logger,
+ model_task_type: ModelTaskType,
+ psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
+ psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
+ ) -> Dict[str, float]:
+ sparse_features = df.columns[df.isna().sum() > 0].to_list()
+ if len(sparse_features) > 0:
+ logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
+ sparse_df = df[sparse_features].notna()
+ sparse_df[date_column] = df[date_column]
+ return calculate_features_psi(
+ sparse_df,
+ cat_features,
+ date_column,
+ logger,
+ model_task_type,
+ psi_target_params,
+ psi_features_params,
+ )
+ return {}
+
+
  def calculate_features_psi(
  df: pd.DataFrame,
  cat_features: list[str],
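
`calculate_sparsity_psi` reduces every column that contains missing values to its present/missing indicator and then reuses the regular feature PSI computation on those indicators. A self-contained sketch of the underlying idea (generic PSI arithmetic on a two-bin present/missing distribution; the bucketing and epsilon handling are assumptions, not the upgini implementation):

```python
# Generic sketch, not the upgini implementation: PSI of the present/missing distribution
# of one feature between a baseline period and a newer period.
import numpy as np
import pandas as pd

def sparsity_psi(base: pd.Series, current: pd.Series, eps: float = 1e-6) -> float:
    """PSI over the two-bin distribution {present, missing}."""
    def dist(s: pd.Series) -> np.ndarray:
        present = s.notna().mean()
        return np.clip(np.array([present, 1.0 - present]), eps, None)

    p, q = dist(base), dist(current)
    return float(np.sum((p - q) * np.log(p / q)))

# Toy data: the feature becomes much sparser in the newer period, so PSI is large.
rng = np.random.default_rng(0)
old_period = pd.Series(np.where(rng.random(1_000) < 0.05, np.nan, 1.0))
new_period = pd.Series(np.where(rng.random(1_000) < 0.40, np.nan, 1.0))
print(round(sparsity_psi(old_period, new_period), 3))  # well above a 0.2-style threshold
```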
upgini-1.2.113a4.dist-info/METADATA → upgini-1.2.113a6.dist-info/METADATA RENAMED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.3
  Name: upgini
- Version: 1.2.113a4
+ Version: 1.2.113a6
  Summary: Intelligent data search & enrichment for Machine Learning
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.113a4.dist-info/RECORD → upgini-1.2.113a6.dist-info/RECORD RENAMED
@@ -1,9 +1,9 @@
- upgini/__about__.py,sha256=e2daN14w8eJvl4oPzSLzI5_ozEF8dUoUUXkaatWm-og,26
+ upgini/__about__.py,sha256=hA793gAu-mC2Lw0M27RABL7IKbRk6aGmyLjnqBIPNOc,26
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
- upgini/dataset.py,sha256=xFi0a-A3uvtxVwFM6JOyitkEPd1I2slIBj5SWfys3hQ,32724
+ upgini/dataset.py,sha256=TU_Kk574JCNlx_PaeDu1HN1qwi5IzlwkkAbUqU_M_QM,32860
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
- upgini/features_enricher.py,sha256=kJQEX0FaaCdBdyMlrx7xLi5J617Meue1etq3H2vVn6U,234271
+ upgini/features_enricher.py,sha256=liJcrINi_NvPLHJqELYqF9Gcga2PA3l4UYvD3ieBkB8,234815
  upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
  upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
  upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
@@ -64,7 +64,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
- upgini/utils/psi.py,sha256=gYeZ3FOwGriVUnuO3BbdicSgXGQqPU14f7yleyO55f0,10108
+ upgini/utils/psi.py,sha256=pLtECcCeco_WRqMjFnQvhUB4vHArjHtD5HzJFP9ICMc,10972
  upgini/utils/sample_utils.py,sha256=lZJ4yf9Jiq9Em2Ny9m3RIiF7WSxBPrc4E3xxn_8sQk8,15417
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -72,7 +72,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
- upgini-1.2.113a4.dist-info/METADATA,sha256=kcZUW8fN8paJpTxumveHf8pDeQ6h1nXptLK_F9vGttI,49531
- upgini-1.2.113a4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
- upgini-1.2.113a4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
- upgini-1.2.113a4.dist-info/RECORD,,
+ upgini-1.2.113a6.dist-info/METADATA,sha256=xMMMKyiR7qAg6VHYaxUnshC75w7h3REvo0NGYIEWwJM,49531
+ upgini-1.2.113a6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+ upgini-1.2.113a6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+ upgini-1.2.113a6.dist-info/RECORD,,