upgini 1.2.121a3__tar.gz → 1.2.122a2__tar.gz

This diff compares the contents of two publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the package versions as they appear in their respective public registries.
Files changed (82)
  1. {upgini-1.2.121a3 → upgini-1.2.122a2}/PKG-INFO +1 -1
  2. upgini-1.2.122a2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/feature.py +11 -0
  4. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/operator.py +26 -0
  5. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/features_enricher.py +31 -23
  6. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/resource_bundle/strings.properties +1 -1
  7. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/features_validator.py +5 -1
  8. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/psi.py +0 -1
  9. upgini-1.2.121a3/src/upgini/__about__.py +0 -1
  10. {upgini-1.2.121a3 → upgini-1.2.122a2}/.gitignore +0 -0
  11. {upgini-1.2.121a3 → upgini-1.2.122a2}/LICENSE +0 -0
  12. {upgini-1.2.121a3 → upgini-1.2.122a2}/README.md +0 -0
  13. {upgini-1.2.121a3 → upgini-1.2.122a2}/pyproject.toml +0 -0
  14. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/__init__.py +0 -0
  15. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/ads.py +0 -0
  16. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/ads_management/__init__.py +0 -0
  17. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/ads_management/ads_manager.py +0 -0
  18. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/__init__.py +0 -0
  19. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/all_operators.py +0 -0
  20. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/binary.py +0 -0
  21. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/date.py +0 -0
  22. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/__init__.py +0 -0
  24. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/base.py +0 -0
  25. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/cross.py +0 -0
  26. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/delta.py +0 -0
  27. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/lag.py +0 -0
  28. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/roll.py +0 -0
  29. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/trend.py +0 -0
  30. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/timeseries/volatility.py +0 -0
  31. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/unary.py +0 -0
  32. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/utils.py +0 -0
  33. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/autofe/vector.py +0 -0
  34. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/data_source/__init__.py +0 -0
  35. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/data_source/data_source_publisher.py +0 -0
  36. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/dataset.py +0 -0
  37. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/errors.py +0 -0
  38. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/http.py +0 -0
  39. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/mdc/__init__.py +0 -0
  40. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/mdc/context.py +0 -0
  41. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/metadata.py +0 -0
  42. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/metrics.py +0 -0
  43. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/normalizer/__init__.py +0 -0
  44. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/normalizer/normalize_utils.py +0 -0
  45. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/resource_bundle/__init__.py +0 -0
  46. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/resource_bundle/exceptions.py +0 -0
  47. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  48. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/sampler/__init__.py +0 -0
  49. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/sampler/base.py +0 -0
  50. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/sampler/random_under_sampler.py +0 -0
  51. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/sampler/utils.py +0 -0
  52. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/search_task.py +0 -0
  53. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/spinner.py +0 -0
  54. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  55. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/__init__.py +0 -0
  56. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/base_search_key_detector.py +0 -0
  57. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/blocked_time_series.py +0 -0
  58. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/config.py +0 -0
  59. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/country_utils.py +0 -0
  60. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/custom_loss_utils.py +0 -0
  61. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/cv_utils.py +0 -0
  62. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/datetime_utils.py +0 -0
  63. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/deduplicate_utils.py +0 -0
  64. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/display_utils.py +0 -0
  65. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/email_utils.py +0 -0
  66. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  67. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/feature_info.py +0 -0
  68. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/hash_utils.py +0 -0
  70. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/ip_utils.py +0 -0
  71. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/mstats.py +0 -0
  72. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/phone_utils.py +0 -0
  73. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/postal_code_utils.py +0 -0
  74. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/progress_bar.py +0 -0
  75. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/sample_utils.py +0 -0
  76. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/sklearn_ext.py +0 -0
  77. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/sort.py +0 -0
  78. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/target_utils.py +0 -0
  79. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/track_info.py +0 -0
  80. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/ts_utils.py +0 -0
  81. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/utils/warning_counter.py +0 -0
  82. {upgini-1.2.121a3 → upgini-1.2.122a2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.121a3
3
+ Version: 1.2.122a2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.122a2"
@@ -42,6 +42,9 @@ class Column:
42
42
  def get_display_name(self, cache: bool = True, shorten: bool = False, **kwargs) -> str:
43
43
  return self.get_columns(**kwargs)[0]
44
44
 
45
+ def reset_display_indices(self) -> "Column":
46
+ return self
47
+
45
48
  def _unhash(self, feature_name: str) -> str:
46
49
  last_component_idx = feature_name.rfind("_")
47
50
  if not feature_name.startswith("f_"):
@@ -142,6 +145,7 @@ class Feature:
142
145
  )
143
146
 
144
147
  def rename_columns(self, mapping: Dict[str, str]) -> "Feature":
148
+ self.op.rename_columns(mapping)
145
149
  for child in self.children:
146
150
  child.rename_columns(mapping)
147
151
  self.cached_display_name = None
@@ -212,6 +216,13 @@ class Feature:
212
216
  self.cached_display_name = None
213
217
  return self
214
218
 
219
+ def reset_display_indices(self) -> "Feature":
220
+ for child in self.children:
221
+ child.reset_display_indices()
222
+ self.display_index = None
223
+ self.cached_display_name = None
224
+ return self
225
+
215
226
  def infer_type(self, data: pd.DataFrame) -> Union[str, DtypeObj]:
216
227
  if self.op.output_type:
217
228
  return self.op.output_type
@@ -89,6 +89,32 @@ class Operator(BaseModel, metaclass=OperatorRegistry):
89
89
  def delete_data(self):
90
90
  pass
91
91
 
92
+ def rename_columns(self, columns_renaming: Dict[str, str]) -> "Operator":
93
+ # Rename occurrences of column names inside self.params keys according to columns_renaming
94
+ if not self.params or not columns_renaming:
95
+ return self
96
+
97
+ # Replace longer keys first to avoid partial overlaps
98
+ replacements = sorted(columns_renaming.items(), key=lambda kv: -len(kv[0]))
99
+
100
+ renamed_params: Dict[str, str] = {}
101
+ for param_key, param_value in self.params.items():
102
+ new_key = param_key
103
+ for old, new in replacements:
104
+ if old and old in new_key:
105
+ new_key = new_key.replace(old, new)
106
+
107
+ if new_key in renamed_params and new_key != param_key:
108
+ self._logger.warning(
109
+ "Param key collision after rename: '%s' -> '%s'. Overwriting value.",
110
+ param_key,
111
+ new_key,
112
+ )
113
+ renamed_params[new_key] = param_value
114
+
115
+ self.params = renamed_params
116
+ return self
117
+
92
118
 
93
119
  class ParametrizedOperator(Operator, abc.ABC):
94
120
 
@@ -1028,7 +1028,7 @@ class FeaturesEnricher(TransformerMixin):
1028
1028
  columns_renaming,
1029
1029
  _,
1030
1030
  ) = prepared_data
1031
-
1031
+
1032
1032
  gc.collect()
1033
1033
 
1034
1034
  if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
@@ -1406,7 +1406,7 @@ class FeaturesEnricher(TransformerMixin):
1406
1406
  self,
1407
1407
  X: pd.DataFrame,
1408
1408
  eval_set: list[tuple[pd.DataFrame, pd.Series]],
1409
- enriched_eval_set: dict,
1409
+ enriched_eval_set: dict[int, tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]],
1410
1410
  eval_set_dates: dict[int, pd.Series],
1411
1411
  search_keys: dict[str, SearchKey],
1412
1412
  stability_threshold: float,
@@ -1417,31 +1417,42 @@ class FeaturesEnricher(TransformerMixin):
1417
1417
  # Find latest eval set or earliest if all eval sets are before train set
1418
1418
  date_column = self._get_date_column(search_keys)
1419
1419
 
1420
+ date_converter = DateTimeSearchKeyConverter(
1421
+ date_column, self.date_format, self.logger, self.bundle, generate_cyclical_features=False
1422
+ )
1423
+
1424
+ X = date_converter.convert(X)
1425
+
1420
1426
  x_date = X[date_column].dropna()
1421
- if not is_numeric_dtype(x_date):
1422
- x_date = pd.to_datetime(x_date).dt.floor("D").astype(np.int64) / 10**6
1423
- main_min_date = x_date.min()
1427
+ if len(x_date) == 0:
1428
+ self.logger.warning("Empty date column in X")
1429
+ return []
1424
1430
 
1425
- for eval_x, _ in eval_set:
1426
- eval_x_date = eval_x[date_column].dropna()
1427
- if not is_numeric_dtype(eval_x_date):
1428
- eval_x[date_column] = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1431
+ main_min_date = x_date.min()
1429
1432
 
1430
1433
  # Find minimum date for each eval_set and compare with main dataset
1431
1434
  eval_dates = []
1432
1435
  for i, (eval_x, _) in enumerate(eval_set):
1433
- if date_column in eval_x.columns:
1434
- if len(eval_x) < 1000:
1435
- self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
1436
- continue
1437
- eval_x_date = eval_x[date_column].dropna()
1438
- if not is_numeric_dtype(eval_x_date):
1439
- eval_x_date = pd.to_datetime(eval_x_date).dt.floor("D").astype(np.int64) / 10**6
1440
- eval_min_date = eval_x_date.min()
1441
- eval_max_date = eval_x_date.max()
1442
- eval_dates.append((i, eval_min_date, eval_max_date))
1436
+ if date_column not in eval_x.columns:
1437
+ self.logger.warning(f"Date column not found in eval_set {i + 1}")
1438
+ continue
1439
+ eval_x = date_converter.convert(eval_x)
1440
+ eval_x_date = eval_x[date_column].dropna()
1441
+ if len(eval_x_date) < 1000:
1442
+ self.logger.warning(f"Eval_set {i} has less than 1000 rows. It will be ignored for stability check")
1443
+ continue
1444
+ if len(enriched_eval_set[i][2]) < 1000:
1445
+ self.logger.warning(
1446
+ f"Enriched eval_set {i} has less than 1000 rows. It will be ignored for stability check"
1447
+ )
1448
+ continue
1449
+
1450
+ eval_min_date = eval_x_date.min()
1451
+ eval_max_date = eval_x_date.max()
1452
+ eval_dates.append((i, eval_min_date, eval_max_date))
1443
1453
 
1444
1454
  if not eval_dates:
1455
+ self.logger.warning("There are no correct eval_sets for stability check")
1445
1456
  return []
1446
1457
 
1447
1458
  # Check if any eval_set has minimum date >= main dataset minimum date
@@ -1464,10 +1475,7 @@ class FeaturesEnricher(TransformerMixin):
1464
1475
  checking_eval_set_df = checking_eval_set_df.copy()
1465
1476
 
1466
1477
  checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
1467
- if not is_numeric_dtype(checking_eval_set_df[date_column]):
1468
- checking_eval_set_df[date_column] = (
1469
- pd.to_datetime(checking_eval_set_df[date_column]).dt.floor("D").astype(np.int64) / 10**6
1470
- )
1478
+ checking_eval_set_df = date_converter.convert(checking_eval_set_df)
1471
1479
 
1472
1480
  psi_values_sparse = calculate_sparsity_psi(
1473
1481
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
@@ -155,7 +155,7 @@ target_outliers_warning=We detected {} outliers in your sample.\nExamples of out
155
155
  # features validation
156
156
  empty_or_contant_features=Columns {} has value with frequency more than 99%, removed from X
157
157
  high_cardinality_features=Columns {} has high cardinality (>90% unique values), removed from X
158
- one_hot_encoded_features=One hot encoded features detected: {}
158
+ one_hot_encoded_features=One hot encoded features detected. Use int encoding for correct results of fit.\n{}
159
159
 
160
160
  # Dataset validation
161
161
  dataset_too_few_rows=X size should be at least {} rows after validation
@@ -46,7 +46,7 @@ class FeaturesValidator:
46
46
 
47
47
  if one_hot_encoded_features:
48
48
  msg = bundle.get("one_hot_encoded_features").format(one_hot_encoded_features)
49
- self.logger.info(msg)
49
+ warnings.append(msg)
50
50
 
51
51
  columns_renaming = columns_renaming or {}
52
52
 
@@ -100,6 +100,10 @@ class FeaturesValidator:
100
100
  @staticmethod
101
101
  def is_one_hot_encoded(series: pd.Series) -> bool:
102
102
  try:
103
+ # All rows should be the same type
104
+ if series.apply(lambda x: type(x)).nunique() != 1:
105
+ return False
106
+
103
107
  # First, handle string representations of True/False
104
108
  series_copy = series.copy()
105
109
  if series_copy.dtype == "object" or series_copy.dtype == "string":
@@ -7,7 +7,6 @@ from typing import Callable, Dict, Optional
7
7
  import more_itertools
8
8
  import numpy as np
9
9
  import pandas as pd
10
- from pandas.api.types import is_numeric_dtype
11
10
  from pydantic import BaseModel
12
11
 
13
12
  from upgini.metadata import TARGET, ModelTaskType
@@ -1 +0,0 @@
1
- __version__ = "1.2.121a3"
File without changes
File without changes
File without changes
File without changes
File without changes