upgini 1.2.121a2__py3-none-any.whl → 1.2.122__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.121a2"
1
+ __version__ = "1.2.122"
upgini/autofe/feature.py CHANGED
@@ -42,6 +42,9 @@ class Column:
42
42
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
43
43
  return self.get_columns(**kwargs)[0]
44
44
 
45
+ def reset_display_indices(self) -> "Column":
46
+ return self
47
+
45
48
  def _unhash(self, feature_name: str) -> str:
46
49
  last_component_idx = feature_name.rfind("_")
47
50
  if not feature_name.startswith("f_"):
@@ -147,6 +150,13 @@ class Feature:
147
150
  self.cached_display_name = None
148
151
  return self
149
152
 
153
+ def rename_op_params(self, mapping: Dict[str, str]) -> "Feature":
154
+ self.op.rename_params(mapping)
155
+ for child in self.children:
156
+ if isinstance(child, Feature):
157
+ child.rename_op_params(mapping)
158
+ return self
159
+
150
160
  def get_column_nodes(self) -> List[Union[Column, "Feature"]]:
151
161
  res = []
152
162
  for child in self.children:
@@ -212,6 +222,13 @@ class Feature:
212
222
  self.cached_display_name = None
213
223
  return self
214
224
 
225
+ def reset_display_indices(self) -> "Feature":
226
+ for child in self.children:
227
+ child.reset_display_indices()
228
+ self.display_index = None
229
+ self.cached_display_name = None
230
+ return self
231
+
215
232
  def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
216
233
  if self.op.output_type:
217
234
  return self.op.output_type
upgini/autofe/operator.py CHANGED
@@ -89,6 +89,32 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
89
89
  def delete_data(self):
90
90
  pass
91
91
 
92
+ def rename_params(self, columns_renaming: Dict[str, str]) -> "Operator":
93
+ # Rename occurrences of column names inside self.params keys according to columns_renaming
94
+ if not self.params or not columns_renaming:
95
+ return self
96
+
97
+ # Replace longer keys first to avoid partial overlaps
98
+ replacements = sorted(columns_renaming.items(), key=lambda kv: -len(kv[0]))
99
+
100
+ renamed_params: Dict[str, str] = {}
101
+ for param_key, param_value in self.params.items():
102
+ new_key = param_key
103
+ for old, new in replacements:
104
+ if old and old in new_key:
105
+ new_key = new_key.replace(old, new)
106
+
107
+ if new_key in renamed_params and new_key != param_key:
108
+ self._logger.warning(
109
+ "Param key collision after rename: '%s' -> '%s'. Overwriting value.",
110
+ param_key,
111
+ new_key,
112
+ )
113
+ renamed_params[new_key] = param_value
114
+
115
+ self.params = renamed_params
116
+ return self
117
+
92
118
 
93
119
  class ParametrizedOperator(Operator, abc.ABC):
94
120
 
@@ -1028,7 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
1028
1028
  columns_renaming,
1029
1029
  _,
1030
1030
  ) = prepared_data
1031
-
1031
+
1032
1032
  gc.collect()
1033
1033
 
1034
1034
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1406,7 +1406,7 @@ class FeaturesEnricher(TransformerMixin):
1406
1406
  self,
1407
1407
  X: pd.DataFrame,
1408
1408
  eval_set: list[tuple[pd.DataFrame, pd.Series]],
1409
- enriched_eval_set: dict,
1409
+ enriched_eval_set: dict[int, tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]],
1410
1410
  eval_set_dates: dict[int, pd.Series],
1411
1411
  search_keys: dict[str, SearchKey],
1412
1412
  stability_threshold: float,
@@ -1417,31 +1417,42 @@ class FeaturesEnricher(TransformerMixin):
1417
1417
  # Find latest eval set or earliest if all eval sets are before train set
1418
1418
  date_column = self._get_date_column(search_keys)
1419
1419
 
1420
+ date_converter = DateTimeSearchKeyConverter(
1421
+ date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
1422
+ )
1423
+
1424
+ X = date_converter.convert(X)
1425
+
1420
1426
  x_date = X[date_column].dropna()
1421
- if not is_numeric_dtype(x_date):
1422
- x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
1423
- main_min_date = x_date.min()
1427
+ if len(x_date) == 0:
1428
+ self.logger.warning("Empty date column in X")
1429
+ return []
1424
1430
 
1425
- for eval_x, _ in eval_set:
1426
- eval_x_date = eval_x[date_column].dropna()
1427
- if not is_numeric_dtype(eval_x_date):
1428
- eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1431
+ main_min_date = x_date.min()
1429
1432
 
1430
1433
  # Find minimum date for each eval_set and compare with main dataset
1431
1434
  eval_dates = []
1432
1435
  for i, (eval_x, _) in enumerate(eval_set):
1433
- if date_column in eval_x.columns:
1434
- if len(eval_x) < 1000:
1435
- self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
1436
- continue
1437
- eval_x_date = eval_x[date_column].dropna()
1438
- if not is_numeric_dtype(eval_x_date):
1439
- eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1440
- eval_min_date = eval_x_date.min()
1441
- eval_max_date = eval_x_date.max()
1442
- eval_dates.append((i, eval_min_date, eval_max_date))
1436
+ if date_column not in eval_x.columns:
1437
+ self.logger.warning(f"Date column not found in eval_set {i + 1}")
1438
+ continue
1439
+ eval_x = date_converter.convert(eval_x)
1440
+ eval_x_date = eval_x[date_column].dropna()
1441
+ if len(eval_x_date) < 1000:
1442
+ self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
1443
+ continue
1444
+ if len(enriched_eval_set[i][2]) < 1000:
1445
+ self.logger.warning(
1446
+ f"Enriched eval_set {i} has less than 1000 rows. It will be ignored for stability check"
1447
+ )
1448
+ continue
1449
+
1450
+ eval_min_date = eval_x_date.min()
1451
+ eval_max_date = eval_x_date.max()
1452
+ eval_dates.append((i, eval_min_date, eval_max_date))
1443
1453
 
1444
1454
  if not eval_dates:
1455
+ self.logger.warning("There are no correct eval_sets for stability check")
1445
1456
  return []
1446
1457
 
1447
1458
  # Check if any eval_set has minimum date >= main dataset minimum date
@@ -1464,10 +1475,7 @@ class FeaturesEnricher(TransformerMixin):
1464
1475
  checking_eval_set_df = checking_eval_set_df.copy()
1465
1476
 
1466
1477
  checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
1467
- if not is_numeric_dtype(checking_eval_set_df[date_column]):
1468
- checking_eval_set_df[date_column] = (
1469
- pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
1470
- )
1478
+ checking_eval_set_df = date_converter.convert(checking_eval_set_df)
1471
1479
 
1472
1480
  psi_values_sparse = calculate_sparsity_psi(
1473
1481
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -3378,8 +3386,8 @@ if response.status_code == 200:
3378
3386
  except KeyboardInterrupt as e:
3379
3387
  print(self.bundle.get("search_stopping"))
3380
3388
  self.rest_client.stop_search_task_v2(trace_id, self._search_task.search_task_id)
3381
- self._search_task = None
3382
3389
  self.logger.warning(f"Search {self._search_task.search_task_id} stopped by user")
3390
+ self._search_task = None
3383
3391
  print(self.bundle.get("search_stopped"))
3384
3392
  raise e
3385
3393
 
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
155
155
  # features validation
156
156
  empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
157
157
  high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
158
- one_hot_encoded_features=One hot encoded features detected: {}
158
+ one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
159
159
 
160
160
  # Dataset validation
161
161
  dataset_too_few_rows=X size should be at least {} rows after validation
@@ -46,7 +46,7 @@ class FeaturesValidator:
46
46
 
47
47
  if one_hot_encoded_features:
48
48
  msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
49
- self.logger.info(msg)
49
+ warnings.append(msg)
50
50
 
51
51
  columns_renaming = columns_renaming or {}
52
52
 
@@ -100,18 +100,29 @@ class FeaturesValidator:
100
100
  @staticmethod
101
101
  def is_one_hot_encoded(series: pd.Series) -> bool:
102
102
  try:
103
- # Column contains only 0 and 1 (as strings or numbers)
104
- series = series.astype(float)
105
- if set(series.unique()) != {0.0, 1.0}:
103
+ # All rows should be the same type
104
+ if series.apply(lambda x: type(x)).nunique() != 1:
105
+ return False
106
+
107
+ # First, handle string representations of True/False
108
+ series_copy = series.copy()
109
+ if series_copy.dtype == "object" or series_copy.dtype == "string":
110
+ # Convert string representations of boolean values to numeric
111
+ series_copy = series_copy.astype(str).str.strip().str.lower()
112
+ series_copy = series_copy.replace({"true": "1", "false": "0"})
113
+
114
+ # Column contains only 0 and 1 (as strings or numbers or booleans)
115
+ series_copy = series_copy.astype(float)
116
+ if set(series_copy.unique()) != {0.0, 1.0}:
106
117
  return False
107
118
 
108
- series = series.astype(int)
119
+ series_copy = series_copy.astype(int)
109
120
 
110
121
  # Column doesn't contain any NaN, np.NaN, space, null, etc.
111
- if not (series.isin([0, 1])).all():
122
+ if not (series_copy.isin([0, 1])).all():
112
123
  return False
113
124
 
114
- vc = series.value_counts()
125
+ vc = series_copy.value_counts()
115
126
  # Column should contain both 0 and 1
116
127
  if len(vc) != 2:
117
128
  return False
upgini/utils/psi.py CHANGED
@@ -7,7 +7,6 @@ from typing import Callable, Dict, Optional
7
7
  import more_itertools
8
8
  import numpy as np
9
9
  import pandas as pd
10
- from pandas.api.types import is_numeric_dtype
11
10
  from pydantic import BaseModel
12
11
 
13
12
  from upgini.metadata import TARGET, ModelTaskType
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.121a2
3
+ Version: 1.2.122
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=Dv8DzHbPAHs_fY_MACW4HNqnYW7CilejShdVPFkTaYM,26
1
+ upgini/__about__.py,sha256=-JqzGEBlhFUnCWmxu0lqdTawM1jUPGK4oP4I-0hFJNI,24
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=pQ8JQe0cdygD-W9GefJmfE6bnj4EYzXsjlgWdIS9nS8,31578
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=Du1S72F55cqyKbHT3VGSPnJO3XicWABFVkA2-G3chdA,231696
6
+ upgini/features_enricher.py,sha256=44_WWpTiJXZzmp2iAoY2SSYgHuaB_RqnLZ35zkNssK8,231839
7
7
  upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
8
  upgini/metadata.py,sha256=VzgtgEbPPtNxTrj9LM5qSDP3DujHwAXqbUSKBjPcb9c,12477
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -16,9 +16,9 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
16
16
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
17
17
  upgini/autofe/binary.py,sha256=oOEECc4nRzZN2tYaiqx8F2XHnfWpk1bVvb7ZkZJ0lO8,7709
18
18
  upgini/autofe/date.py,sha256=RvexgrL1_6ISYPVrl9HUQmPgpVSGQsTNv8YhNQWs-5M,11329
19
- upgini/autofe/feature.py,sha256=b4Ps_sCPui9b4h0K3ya85cfL1SWpLVrlHc40zkKVfAY,16329
19
+ upgini/autofe/feature.py,sha256=W9sZHdz5Vi0H_oPyY5saZAPjyd5wunpULnCqrGLpQc4,16879
20
20
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
21
- upgini/autofe/operator.py,sha256=RB3rKMjFi5Cx81RiYXN3OTCuXjmvzmFKQrxn4h0Oclo,5219
21
+ upgini/autofe/operator.py,sha256=3i4aWqlRomgTIVAPnivwFb3St87UoWMtZBTzQNJCyuU,6278
22
22
  upgini/autofe/unary.py,sha256=FFtvkQaT0cu_zPZ1jCLcsjik-UUh12qQFF3tUW8NqsE,6675
23
23
  upgini/autofe/utils.py,sha256=dYrtyAM8Vcc_R8u4dNo54IsGrHKagTHDJTKhGho0bRg,2967
24
24
  upgini/autofe/vector.py,sha256=r5H6DKT5f3KNjERpV2OOloZ96nDWkModXnpsqw_A77Q,2313
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=mDh2mBW3aQMB4EFP2aHbf2dGMVkOcWnp4sKKvKDBh8w,8511
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=Kmc6ZHpo0hK-bEQuoQkU0SPIQCnIDYRKqkfN3a_gvRU,29237
41
+ upgini/resource_bundle/strings.properties,sha256=KcXm1Nl6c3zswL91tIbG0DjuuNpzxUdCg1cY9f2-9cg,29283
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=Fva2FEhLiNRPZ9Q6uOtJRtRzwsayjv7aphalAZO_4lc,6452
@@ -58,7 +58,7 @@ upgini/utils/display_utils.py,sha256=uSG3JwpwCIgRJXsp-8ktuJ0Dh-WFti7IrRLMUfHfoDc
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
59
59
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
60
60
  upgini/utils/feature_info.py,sha256=6vihytwKma_TlXtTn4l6Aj4kqlOj0ouLy-yWVV6VUw8,7551
61
- upgini/utils/features_validator.py,sha256=RAnfX80GBFcz6-SlTSR0DF6BZzf7A7IL8dlIqEoSz_s,4265
61
+ upgini/utils/features_validator.py,sha256=A_3AX7X5u5AH7RLgkTiS6dHxaOiq5vm8w4ijQWLGcMY,4871
62
62
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
63
63
  upgini/utils/hash_utils.py,sha256=mP2yHyzvDNdpa5g3B4MHzulxBeEz_ZSoGl1YF_VnAyE,5538
64
64
  upgini/utils/ip_utils.py,sha256=wmnnwVQdjX9o1cNQw6VQMk6maHhvsq6hNsZBYf9knrw,6585
@@ -66,7 +66,7 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
66
66
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
67
67
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
68
68
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
69
- upgini/utils/psi.py,sha256=vw8QEktXSx29IiMJMxmDeFU_4lJInJBXt_XL5Muekzo,11114
69
+ upgini/utils/psi.py,sha256=D_DMMBVkU4nwMospTwdMpYzNFACDxhqTuNesDngPwyY,11068
70
70
  upgini/utils/sample_utils.py,sha256=xpfYaZ2cYP7I2JrcooVc13QNBFawB81cJRuh38451Q4,15123
71
71
  upgini/utils/sklearn_ext.py,sha256=Pcy8sWD6f4YcE5Bu0UmXD4j0ICmXtrT8DJlTArM-_a0,49356
72
72
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=GCPn4QeJ83JJ_vyBJ3IhY5fyIRkLC9q9BE59S2FRO1I,
74
74
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.121a2.dist-info/METADATA,sha256=1XVh2jWKC2I3ElN4ftyEveTny9C1pU5z69Osnp6q7_s,50745
78
- upgini-1.2.121a2.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.121a2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.121a2.dist-info/RECORD,,
77
+ upgini-1.2.122.dist-info/METADATA,sha256=e9lV45Du_2DKcMVvqgXpI1TkicMzXsiPApqm6b9tsYU,50743
78
+ upgini-1.2.122.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
+ upgini-1.2.122.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.122.dist-info/RECORD,,