upgini 1.2.133a1__py3-none-any.whl → 1.2.135a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.133a1"
1
+ __version__ = "1.2.135a1"
upgini/features_enricher.py CHANGED
@@ -751,7 +751,6 @@ class FeaturesEnricher(TransformerMixin):
751
751
  exclude_features_sources: list[str] | None = None,
752
752
  keep_input: bool = True,
753
753
  trace_id: str | None = None,
754
- metrics_calculation: bool = False,
755
754
  silent_mode=False,
756
755
  progress_bar: ProgressBar | None = None,
757
756
  progress_callback: Callable[[SearchProgress], Any] | None = None,
@@ -810,11 +809,12 @@ class FeaturesEnricher(TransformerMixin):
810
809
  X,
811
810
  y=y,
812
811
  exclude_features_sources=exclude_features_sources,
813
- metrics_calculation=metrics_calculation,
814
812
  silent_mode=silent_mode,
815
813
  progress_bar=progress_bar,
816
814
  keep_input=keep_input,
817
815
  )
816
+ if TARGET in result.columns:
817
+ result.drop(columns=TARGET, inplace=True)
818
818
  self.logger.info("Transform finished successfully")
819
819
  search_progress = SearchProgress(100.0, ProgressStage.FINISHED)
820
820
  if progress_bar is not None:
@@ -1637,7 +1637,7 @@ class FeaturesEnricher(TransformerMixin):
1637
1637
 
1638
1638
  if not isinstance(_cv, BaseCrossValidator):
1639
1639
  date_column = self._get_date_column(search_keys)
1640
- date_series = X[date_column] if date_column is not None else None
1640
+ date_series = X[date_column] if date_column is not None and date_column in X.columns else None
1641
1641
  _cv, groups = CVConfig(
1642
1642
  _cv, date_series, self.random_state, self._search_task.get_shuffle_kfold(), group_columns=group_columns
1643
1643
  ).get_cv_and_groups(X)
@@ -1738,7 +1738,7 @@ class FeaturesEnricher(TransformerMixin):
1738
1738
 
1739
1739
  client_features = [
1740
1740
  c
1741
- for c in (validated_X.columns.to_list() + generated_features)
1741
+ for c in validated_X.columns.to_list()
1742
1742
  if (not self.fit_select_features or c in set(self.feature_names_).union(self.id_columns or []))
1743
1743
  and c
1744
1744
  not in (
@@ -1747,6 +1747,7 @@ class FeaturesEnricher(TransformerMixin):
1747
1747
  + [DateTimeConverter.DATETIME_COL, SYSTEM_RECORD_ID, ENTITY_SYSTEM_RECORD_ID]
1748
1748
  )
1749
1749
  ]
1750
+ client_features.extend(f for f in generated_features if f in self.feature_names_)
1750
1751
  if self.baseline_score_column is not None and self.baseline_score_column not in client_features:
1751
1752
  client_features.append(self.baseline_score_column)
1752
1753
  self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
@@ -1847,7 +1848,7 @@ class FeaturesEnricher(TransformerMixin):
1847
1848
  enriched_eval_X_sorted, enriched_eval_y_sorted = self._sort_by_system_record_id(
1848
1849
  enriched_eval_X, eval_y_sampled, self.cv
1849
1850
  )
1850
- if date_column is not None:
1851
+ if date_column is not None and date_column in eval_X_sorted.columns:
1851
1852
  eval_set_dates[idx] = eval_X_sorted[date_column]
1852
1853
  fitting_eval_X = eval_X_sorted[fitting_x_columns].copy()
1853
1854
  fitting_enriched_eval_X = enriched_eval_X_sorted[fitting_enriched_x_columns].copy()
@@ -1936,7 +1937,9 @@ class FeaturesEnricher(TransformerMixin):
1936
1937
  and self.df_with_original_index is not None
1937
1938
  ):
1938
1939
  self.logger.info("Dataset is not imbalanced, so use enriched_X from fit")
1939
- return self.__get_enriched_from_fit(eval_set, trace_id, remove_outliers_calc_metrics)
1940
+ return self.__get_enriched_from_fit(
1941
+ validated_X, validated_y, eval_set, trace_id, remove_outliers_calc_metrics
1942
+ )
1940
1943
  else:
1941
1944
  self.logger.info(
1942
1945
  "Dataset is imbalanced or exclude_features_sources or X was passed or this is saved search."
@@ -2074,6 +2077,8 @@ class FeaturesEnricher(TransformerMixin):
2074
2077
 
2075
2078
  def __get_enriched_from_fit(
2076
2079
  self,
2080
+ validated_X: pd.DataFrame,
2081
+ validated_y: pd.Series,
2077
2082
  eval_set: list[tuple] | None,
2078
2083
  trace_id: str,
2079
2084
  remove_outliers_calc_metrics: bool | None,
@@ -2124,6 +2129,24 @@ class FeaturesEnricher(TransformerMixin):
2124
2129
  drop_system_record_id=False,
2125
2130
  )
2126
2131
 
2132
+ enriched_Xy.rename(columns=self.fit_columns_renaming, inplace=True)
2133
+ search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
2134
+ generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
2135
+
2136
+ validated_Xy = validated_X.copy()
2137
+ validated_Xy[TARGET] = validated_y
2138
+
2139
+ selecting_columns = self._selecting_input_and_generated_columns(
2140
+ validated_Xy, self.fit_generated_features, keep_input=True, trace_id=trace_id
2141
+ )
2142
+ selecting_columns.extend(
2143
+ c
2144
+ for c in enriched_Xy.columns
2145
+ if (c in self.feature_names_ and c not in selecting_columns and c not in validated_X.columns)
2146
+ or c in [EVAL_SET_INDEX, ENTITY_SYSTEM_RECORD_ID, SYSTEM_RECORD_ID]
2147
+ )
2148
+ enriched_Xy = enriched_Xy[selecting_columns]
2149
+
2127
2150
  # Handle eval sets extraction based on EVAL_SET_INDEX
2128
2151
  if EVAL_SET_INDEX in enriched_Xy.columns:
2129
2152
  eval_set_indices = list(enriched_Xy[EVAL_SET_INDEX].unique())
@@ -2135,7 +2158,11 @@ class FeaturesEnricher(TransformerMixin):
2135
2158
  ].copy()
2136
2159
  enriched_Xy = enriched_Xy.loc[enriched_Xy[EVAL_SET_INDEX] == 0].copy()
2137
2160
 
2138
- x_columns = [c for c in self.df_with_original_index.columns if c not in [EVAL_SET_INDEX, TARGET]]
2161
+ x_columns = [
2162
+ c
2163
+ for c in [self.fit_columns_renaming.get(k, k) for k in self.df_with_original_index.columns]
2164
+ if c not in [EVAL_SET_INDEX, TARGET] and c in selecting_columns
2165
+ ]
2139
2166
  X_sampled = enriched_Xy[x_columns].copy()
2140
2167
  y_sampled = enriched_Xy[TARGET].copy()
2141
2168
  enriched_X = enriched_Xy.drop(columns=[TARGET, EVAL_SET_INDEX], errors="ignore")
@@ -2157,15 +2184,6 @@ class FeaturesEnricher(TransformerMixin):
2157
2184
  enriched_eval_X = enriched_eval_sets[idx + 1][enriched_X_columns].copy()
2158
2185
  eval_set_sampled_dict[idx] = (eval_X_sampled, enriched_eval_X, eval_y_sampled)
2159
2186
 
2160
- # reversed_renaming = {v: k for k, v in self.fit_columns_renaming.items()}
2161
- X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2162
- enriched_X.rename(columns=self.fit_columns_renaming, inplace=True)
2163
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
2164
- eval_X_sampled.rename(columns=self.fit_columns_renaming, inplace=True)
2165
- enriched_eval_X.rename(columns=self.fit_columns_renaming, inplace=True)
2166
- search_keys = {self.fit_columns_renaming.get(k, k): v for k, v in search_keys.items()}
2167
- generated_features = [self.fit_columns_renaming.get(c, c) for c in self.fit_generated_features]
2168
-
2169
2187
  datasets_hash = hash_input(self.X, self.y, self.eval_set)
2170
2188
  return self.__cache_and_return_results(
2171
2189
  datasets_hash,
@@ -2642,7 +2660,7 @@ if response.status_code == 200:
2642
2660
  generated_features = [columns_renaming.get(c, c) for c in generated_features]
2643
2661
  search_keys = {columns_renaming.get(c, c): t for c, t in search_keys.items()}
2644
2662
  selecting_columns = self._selecting_input_and_generated_columns(
2645
- validated_Xy, generated_features, keep_input, trace_id
2663
+ validated_Xy, generated_features, keep_input, trace_id, is_transform=True
2646
2664
  )
2647
2665
  self.logger.warning(f"Filtered columns by existance in dataframe: {selecting_columns}")
2648
2666
  if add_fit_system_record_id:
@@ -2895,7 +2913,7 @@ if response.status_code == 200:
2895
2913
  )
2896
2914
 
2897
2915
  selecting_columns = self._selecting_input_and_generated_columns(
2898
- validated_Xy, generated_features, keep_input, trace_id
2916
+ validated_Xy, generated_features, keep_input, trace_id, is_transform=True
2899
2917
  )
2900
2918
  selecting_columns.extend(
2901
2919
  c
@@ -2933,20 +2951,19 @@ if response.status_code == 200:
2933
2951
  generated_features: list[str],
2934
2952
  keep_input: bool,
2935
2953
  trace_id: str,
2954
+ is_transform: bool = False,
2936
2955
  ):
2937
2956
  fit_input_columns = [c.originalName for c in self._search_task.get_file_metadata(trace_id).columns]
2938
2957
  new_columns_on_transform = [c for c in validated_Xy.columns if c not in fit_input_columns]
2939
2958
 
2940
- selected_generated_features = [
2941
- c for c in generated_features if not self.fit_select_features or c in self.feature_names_
2942
- ]
2959
+ selected_generated_features = [c for c in generated_features if c in self.feature_names_]
2943
2960
  if keep_input is True:
2944
2961
  selected_input_columns = [
2945
2962
  c
2946
2963
  for c in validated_Xy.columns
2947
2964
  if not self.fit_select_features
2948
2965
  or c in self.feature_names_
2949
- or c in new_columns_on_transform
2966
+ or (c in new_columns_on_transform and is_transform)
2950
2967
  or c in self.search_keys
2951
2968
  or c in (self.id_columns or [])
2952
2969
  or c in [EVAL_SET_INDEX, TARGET] # transform for metrics calculation
@@ -3245,7 +3262,7 @@ if response.status_code == 200:
3245
3262
  if fintech_warnings:
3246
3263
  for fintech_warning in fintech_warnings:
3247
3264
  self.__log_warning(fintech_warning)
3248
- df, full_duplicates_warning = clean_full_duplicates(df, self.logger, bundle=self.bundle)
3265
+ df, full_duplicates_warning = clean_full_duplicates(df, logger=self.logger, bundle=self.bundle)
3249
3266
  if full_duplicates_warning:
3250
3267
  if len(df) == 0:
3251
3268
  raise ValidationError(full_duplicates_warning)
upgini/utils/datetime_utils.py CHANGED
@@ -1,6 +1,5 @@
1
1
  import datetime
2
2
  import logging
3
- import re
4
3
  from typing import Dict, List, Optional
5
4
 
6
5
  import numpy as np
@@ -67,7 +66,7 @@ class DateTimeConverter:
67
66
  try:
68
67
  if s is None or len(str(s).strip()) == 0:
69
68
  return None
70
- if not re.match(DATETIME_PATTERN, str(s)):
69
+ if sum(ch.isdigit() for ch in str(s)) < 6:
71
70
  return None
72
71
  return s
73
72
  except Exception:
@@ -84,30 +83,31 @@ class DateTimeConverter:
84
83
  return parsed is not None and not parsed.isna().all()
85
84
 
86
85
  def parse_datetime(self, df: pd.DataFrame, raise_errors=True) -> pd.Series | None:
87
- df = df.copy()
88
86
  if len(df) == 0 or df[self.date_column].isna().all():
89
87
  return None
90
88
 
89
+ date_col = df[self.date_column].copy()
90
+
91
91
  try:
92
- if df[self.date_column].apply(lambda x: isinstance(x, datetime.datetime)).all():
93
- parsed_datetime = df[self.date_column].apply(lambda x: x.replace(tzinfo=None))
94
- elif isinstance(df[self.date_column].dropna().values[0], datetime.date):
95
- parsed_datetime = pd.to_datetime(df[self.date_column], errors="coerce")
96
- elif isinstance(df[self.date_column].dtype, pd.PeriodDtype):
97
- parsed_datetime = df[self.date_column].dt.to_timestamp()
98
- elif is_numeric_dtype(df[self.date_column]):
92
+ if date_col.apply(lambda x: isinstance(x, datetime.datetime)).all():
93
+ parsed_datetime = date_col.apply(lambda x: x.replace(tzinfo=None))
94
+ elif isinstance(date_col.dropna().values[0], datetime.date):
95
+ parsed_datetime = pd.to_datetime(date_col, errors="coerce")
96
+ elif isinstance(date_col.dtype, pd.PeriodDtype):
97
+ parsed_datetime = date_col.dt.to_timestamp()
98
+ elif is_numeric_dtype(date_col):
99
99
  # 315532801 - 2524608001 - seconds
100
100
  # 315532801000 - 2524608001000 - milliseconds
101
101
  # 315532801000000 - 2524608001000000 - microseconds
102
102
  # 315532801000000000 - 2524608001000000000 - nanoseconds
103
- if df[self.date_column].apply(lambda x: 10**16 < x).all():
104
- parsed_datetime = pd.to_datetime(df[self.date_column], unit="ns")
105
- elif df[self.date_column].apply(lambda x: 10**14 < x < 10**16).all():
106
- parsed_datetime = pd.to_datetime(df[self.date_column], unit="us")
107
- elif df[self.date_column].apply(lambda x: 10**11 < x < 10**14).all():
108
- parsed_datetime = pd.to_datetime(df[self.date_column], unit="ms")
109
- elif df[self.date_column].apply(lambda x: 10**8 < x < 10**11).all():
110
- parsed_datetime = pd.to_datetime(df[self.date_column], unit="s")
103
+ if date_col.apply(lambda x: 10**16 < x).all():
104
+ parsed_datetime = pd.to_datetime(date_col, unit="ns")
105
+ elif date_col.apply(lambda x: 10**14 < x < 10**16).all():
106
+ parsed_datetime = pd.to_datetime(date_col, unit="us")
107
+ elif date_col.apply(lambda x: 10**11 < x < 10**14).all():
108
+ parsed_datetime = pd.to_datetime(date_col, unit="ms")
109
+ elif date_col.apply(lambda x: 10**8 < x < 10**11).all():
110
+ parsed_datetime = pd.to_datetime(date_col, unit="s")
111
111
  else:
112
112
  msg = self.bundle.get("unsupported_date_type").format(self.date_column)
113
113
  if raise_errors:
@@ -115,8 +115,10 @@ class DateTimeConverter:
115
115
  else:
116
116
  return None
117
117
  else:
118
- df[self.date_column] = df[self.date_column].astype("string").apply(self.clean_date)
119
- parsed_datetime = self.parse_string_date(df, raise_errors)
118
+ date_col = date_col.astype("string").apply(self.clean_date)
119
+ parsed_datetime = self.parse_string_date(date_col.to_frame(self.date_column), raise_errors)
120
+ if parsed_datetime.isna().all():
121
+ raise ValidationError(self.bundle.get("invalid_date_format").format(self.date_column))
120
122
  parsed_datetime = parsed_datetime.dt.tz_localize(None)
121
123
  return parsed_datetime
122
124
  except Exception as e:
upgini-1.2.135a1.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
- Metadata-Version: 2.3
1
+ Metadata-Version: 2.4
2
2
  Name: upgini
3
- Version: 1.2.133a1
3
+ Version: 1.2.135a1
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.135a1.dist-info/RECORD CHANGED
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=2J0xRzQRVTKW9-UjHayhhp4WFFpAteaH5RVfrXavaz0,26
1
+ upgini/__about__.py,sha256=ut2rbJ0xiGgZg547NgkkPDfF6sWBeWwDef4pISy7Ipc,26
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=Nm2ZmwyQqvTnymYpGUwyJWy7y2ebXlHMyYmGeGcyA_s,31652
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=Na-W1f5xQVUKF4_m2Bw8mM29kLD8N3U7p9-FU0J9bi8,234415
6
+ upgini/features_enricher.py,sha256=YvVLu2Fa0XQb-J8sUKH0W0_k-WLyfwWQ7646raObva4,235033
7
7
  upgini/http.py,sha256=-J_wOpnwVnT0ebPC6sOs6fN3AWtCD0LJLu6nlYmxaqk,44348
8
8
  upgini/metadata.py,sha256=H3wiN37k-yqWZgbPD0tJzx8DzaCIkgmX5cybhByQWLg,12619
9
9
  upgini/metrics.py,sha256=KCPE_apPN-9BIdv6GqASbJVaB_gBcy8wzNApAcyaGo4,46020
@@ -52,7 +52,7 @@ upgini/utils/config.py,sha256=zFdnjchykfp_1Tm3Qep7phLzXBpXIOzr2tIuXchRBLw,1754
52
52
  upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk,6937
53
53
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
54
54
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
- upgini/utils/datetime_utils.py,sha256=l85UzSQLhtMeI2G6m-m8y8bCColCLSXNHb2-G6fKpLM,16988
55
+ upgini/utils/datetime_utils.py,sha256=3_FQoa_ywgEeznaEPN2kuH_ES-LZJWSN2AI39sM9NRg,16988
56
56
  upgini/utils/deduplicate_utils.py,sha256=CLX0QapRxB-ZVQT7yGvv1vSd2zac5SwRjCJavujdCps,11332
57
57
  upgini/utils/display_utils.py,sha256=MoTqXZJvC6pAqgOaI3V0FG-IU_LnMfrn4TDcNvUqsdg,13316
58
58
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
@@ -74,7 +74,7 @@ upgini/utils/target_utils.py,sha256=CihpV6SC95HwtlMH60rGAUzVDa4Id0Bva8ySprmNHlE,
74
74
  upgini/utils/track_info.py,sha256=NDKeQTUlZaYp15UoP-xLKGoDoJQ0drbDMwB0g9R0PUg,6427
75
75
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
76
76
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
77
- upgini-1.2.133a1.dist-info/METADATA,sha256=oveLN_pPi2K1BqqAnu5ZnGXVMl7TeD65Jg1biA1drE0,51135
78
- upgini-1.2.133a1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
79
- upgini-1.2.133a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
- upgini-1.2.133a1.dist-info/RECORD,,
77
+ upgini-1.2.135a1.dist-info/METADATA,sha256=0E6YorGA-6HKO5wnsp75qUKG-BxNKXOVEEH5snEVBvI,51135
78
+ upgini-1.2.135a1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
79
+ upgini-1.2.135a1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
80
+ upgini-1.2.135a1.dist-info/RECORD,,
upgini-1.2.135a1.dist-info/WHEEL CHANGED
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.25.0
2
+ Generator: hatchling 1.27.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any