upgini 1.2.113a5__py3-none-any.whl → 1.2.113a6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.113a5"
1
+ __version__ = "1.2.113a6"
upgini/dataset.py CHANGED
@@ -343,7 +343,9 @@ class Dataset:
343
343
  if col in mandatory_columns:
344
344
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
345
345
 
346
- invalid_values = list(set(self.data.loc[self.data[f"{col}_is_valid"] == 0, col].head().values))
346
+ # Use stable pandas API across versions: Series.unique keeps order
347
+ # and collapses multiple NaNs into a single NaN
348
+ invalid_values = self.data.loc[self.data[f"{col}_is_valid"] == 0, col].unique().tolist()[:5]
347
349
  valid_share = self.data[f"{col}_is_valid"].sum() / nrows
348
350
  original_col_name = self.columns_renaming[col]
349
351
  validation_stats[original_col_name] = {}
@@ -1012,15 +1012,11 @@ class FeaturesEnricher(TransformerMixin):
1012
1012
  if self.id_columns and self.id_columns_encoder is not None:
1013
1013
  if cat_features_from_backend:
1014
1014
  cat_features_from_backend = [
1015
- c
1016
- for c in cat_features_from_backend
1017
- if c not in self.id_columns_encoder.feature_names_in_
1015
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
1018
1016
  ]
1019
1017
  if client_cat_features:
1020
1018
  client_cat_features = [
1021
- c
1022
- for c in client_cat_features
1023
- if c not in self.id_columns_encoder.feature_names_in_
1019
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
1024
1020
  ]
1025
1021
  for cat_feature in cat_features_from_backend:
1026
1022
  if cat_feature in self.search_keys:
@@ -1384,15 +1380,11 @@ class FeaturesEnricher(TransformerMixin):
1384
1380
  if self.id_columns and self.id_columns_encoder is not None:
1385
1381
  if cat_features_from_backend:
1386
1382
  cat_features_from_backend = [
1387
- c
1388
- for c in cat_features_from_backend
1389
- if c not in self.id_columns_encoder.feature_names_in_
1383
+ c for c in cat_features_from_backend if c not in self.id_columns_encoder.feature_names_in_
1390
1384
  ]
1391
1385
  if client_cat_features:
1392
1386
  client_cat_features = [
1393
- c
1394
- for c in client_cat_features
1395
- if c not in self.id_columns_encoder.feature_names_in_
1387
+ c for c in client_cat_features if c not in self.id_columns_encoder.feature_names_in_
1396
1388
  ]
1397
1389
 
1398
1390
  prepared_data = self._prepare_data_for_metrics(
@@ -1829,7 +1821,8 @@ class FeaturesEnricher(TransformerMixin):
1829
1821
  or c in set(self.feature_names_).union(self.id_columns or [])
1830
1822
  or (self.fit_columns_renaming or {}).get(c, c) in set(self.feature_names_).union(self.id_columns or [])
1831
1823
  )
1832
- and c not in (
1824
+ and c
1825
+ not in (
1833
1826
  excluding_search_keys
1834
1827
  + list(self.fit_dropped_features)
1835
1828
  + [DateTimeSearchKeyConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
@@ -3220,7 +3213,15 @@ if response.status_code == 200:
3220
3213
  self.fit_generated_features = [f for f in self.fit_generated_features if f not in self.fit_dropped_features]
3221
3214
 
3222
3215
  # Group columns should have normalized names
3223
- self.__adjust_cv(df, force=True)
3216
+ if self.runtime_parameters.properties.get("cv_params.group_columns") is not None:
3217
+ original_to_hash = {v: k for k, v in self.fit_columns_renaming.items()}
3218
+ self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(
3219
+ [
3220
+ original_to_hash.get(c, c)
3221
+ for c in self.runtime_parameters.properties["cv_params.group_columns"].split(",")
3222
+ ]
3223
+ )
3224
+
3224
3225
  if self.id_columns is not None and self.cv is not None and self.cv.is_time_series():
3225
3226
  id_columns = self.__get_renamed_id_columns()
3226
3227
  if id_columns:
@@ -3525,23 +3526,24 @@ if response.status_code == 200:
3525
3526
  reverse_renaming = {v: k for k, v in renaming.items()}
3526
3527
  return None if self.id_columns is None else [reverse_renaming.get(c) or c for c in self.id_columns]
3527
3528
 
3528
- def __adjust_cv(self, df: pd.DataFrame, force: bool = False):
3529
- if self.cv is not None and not force:
3530
- return
3529
+ def __adjust_cv(self, df: pd.DataFrame):
3530
+ if self.cv is None:
3531
+ date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3532
+ # Check Multivariate time series
3533
+ if (
3534
+ date_column
3535
+ and self.model_task_type == ModelTaskType.REGRESSION
3536
+ and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys()))
3537
+ == 0
3538
+ and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
3539
+ ):
3540
+ msg = self.bundle.get("multivariate_timeseries_detected")
3541
+ self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
3542
+ elif self.model_task_type != ModelTaskType.REGRESSION:
3543
+ msg = self.bundle.get("group_k_fold_in_classification")
3544
+ self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
3531
3545
 
3532
- date_column = SearchKey.find_key(self.fit_search_keys, [SearchKey.DATE, SearchKey.DATETIME])
3533
- # Check Multivariate time series
3534
- if (
3535
- date_column
3536
- and self.model_task_type == ModelTaskType.REGRESSION
3537
- and len({SearchKey.PHONE, SearchKey.EMAIL, SearchKey.HEM}.intersection(self.fit_search_keys.keys())) == 0
3538
- and is_blocked_time_series(df, date_column, list(self.fit_search_keys.keys()) + [TARGET])
3539
- ):
3540
- msg = self.bundle.get("multivariate_timeseries_detected")
3541
- self.__override_cv(CVType.blocked_time_series, msg, print_warning=False)
3542
- elif self.model_task_type != ModelTaskType.REGRESSION:
3543
- msg = self.bundle.get("group_k_fold_in_classification")
3544
- self.__override_cv(CVType.group_k_fold, msg, print_warning=self.cv is not None)
3546
+ if self.cv == CVType.group_k_fold:
3545
3547
  group_columns = self._get_group_columns(df, self.fit_search_keys)
3546
3548
  self.runtime_parameters.properties["cv_params.group_columns"] = ",".join(group_columns)
3547
3549
  self.runtime_parameters.properties["cv_params.shuffle_kfold"] = "True"
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.113a5
3
+ Version: 1.2.113a6
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=QdA0r4M8wEBY37BMjK9uA_83s1sWkyXy2XJhfn7vl3A,26
1
+ upgini/__about__.py,sha256=hA793gAu-mC2Lw0M27RABL7IKbRk6aGmyLjnqBIPNOc,26
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=xFi0a-A3uvtxVwFM6JOyitkEPd1I2slIBj5SWfys3hQ,32724
4
+ upgini/dataset.py,sha256=TU_Kk574JCNlx_PaeDu1HN1qwi5IzlwkkAbUqU_M_QM,32860
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=wifdmDP-3e3y51KYhCHPYuN6vU8mj2m3SYo-kMWcNz0,234523
6
+ upgini/features_enricher.py,sha256=liJcrINi_NvPLHJqELYqF9Gcga2PA3l4UYvD3ieBkB8,234815
7
7
  upgini/http.py,sha256=zeAZvT6IAzOs9jQ3WG8mJBANLajgvv2LZePFzKz004w,45482
8
8
  upgini/metadata.py,sha256=sx4X9fPkyCgXB6FPk9Rq_S1Kx8ibkbaWA-qNDVCuSmg,12811
9
9
  upgini/metrics.py,sha256=O19UqmgZ6SA136eCYV5lVU3J26ecgZlGXnxGblMvZJc,45869
@@ -72,7 +72,7 @@ upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,
72
72
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
73
73
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
74
74
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
75
- upgini-1.2.113a5.dist-info/METADATA,sha256=VOeoK4hhJyhb0OJWG2cgsN-hES6xe3QIRyZMovxP8ek,49531
76
- upgini-1.2.113a5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
77
- upgini-1.2.113a5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
78
- upgini-1.2.113a5.dist-info/RECORD,,
75
+ upgini-1.2.113a6.dist-info/METADATA,sha256=xMMMKyiR7qAg6VHYaxUnshC75w7h3REvo0NGYIEWwJM,49531
76
+ upgini-1.2.113a6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
77
+ upgini-1.2.113a6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
78
+ upgini-1.2.113a6.dist-info/RECORD,,