upgini 1.2.66a3818.dev1__py3-none-any.whl → 1.2.68__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.66a3818.dev1"
1
+ __version__ = "1.2.68"
upgini/autofe/feature.py CHANGED
@@ -112,11 +112,7 @@ class Feature:
112
112
 
113
113
  def get_hash(self) -> str:
114
114
  return hashlib.sha256(
115
- "_".join(
116
- [self.op.get_hash_component()]
117
- + [ch.op.get_hash_component() for ch in self.children if isinstance(ch, Feature)]
118
- + [ch.get_display_name() for ch in self.children]
119
- ).encode("utf-8")
115
+ "_".join([self.op.get_hash_component()] + [ch.get_display_name() for ch in self.children]).encode("utf-8")
120
116
  ).hexdigest()[:8]
121
117
 
122
118
  def set_alias(self, alias: str) -> "Feature":
@@ -308,7 +308,8 @@ class FeaturesEnricher(TransformerMixin):
308
308
  self._search_task = search_task.poll_result(trace_id, quiet=True, check_fit=True)
309
309
  file_metadata = self._search_task.get_file_metadata(trace_id)
310
310
  x_columns = [c.originalName or c.name for c in file_metadata.columns]
311
- self.__prepare_feature_importances(trace_id, x_columns, silent=True)
311
+ df = pd.DataFrame(columns=x_columns)
312
+ self.__prepare_feature_importances(trace_id, df, silent=True)
312
313
  # TODO validate search_keys with search_keys from file_metadata
313
314
  print(self.bundle.get("search_by_task_id_finish"))
314
315
  self.logger.debug(f"Successfully initialized with search_id: {search_id}")
@@ -1087,7 +1088,7 @@ class FeaturesEnricher(TransformerMixin):
1087
1088
  enriched_shaps = enriched_cv_result.shap_values
1088
1089
 
1089
1090
  if enriched_shaps is not None:
1090
- self._update_shap_values(trace_id, validated_X.columns.to_list(), enriched_shaps)
1091
+ self._update_shap_values(trace_id, fitting_X, enriched_shaps)
1091
1092
 
1092
1093
  if enriched_metric is None:
1093
1094
  self.logger.warning(
@@ -1255,14 +1256,14 @@ class FeaturesEnricher(TransformerMixin):
1255
1256
  finally:
1256
1257
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1257
1258
 
1258
- def _update_shap_values(self, trace_id: str, x_columns: List[str], new_shaps: Dict[str, float]):
1259
+ def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
1259
1260
  renaming = self.fit_columns_renaming or {}
1260
1261
  new_shaps = {
1261
1262
  renaming.get(feature, feature): _round_shap_value(shap)
1262
1263
  for feature, shap in new_shaps.items()
1263
1264
  if feature in self.feature_names_ or renaming.get(feature, feature) in self.feature_names_
1264
1265
  }
1265
- self.__prepare_feature_importances(trace_id, x_columns, new_shaps)
1266
+ self.__prepare_feature_importances(trace_id, df, new_shaps)
1266
1267
 
1267
1268
  if self.features_info_display_handle is not None:
1268
1269
  try:
@@ -3021,7 +3022,7 @@ if response.status_code == 200:
3021
3022
  msg = self.bundle.get("features_not_generated").format(unused_features_for_generation)
3022
3023
  self.__log_warning(msg)
3023
3024
 
3024
- self.__prepare_feature_importances(trace_id, validated_X.columns.to_list() + self.fit_generated_features)
3025
+ self.__prepare_feature_importances(trace_id, df)
3025
3026
 
3026
3027
  self.__show_selected_features(self.fit_search_keys)
3027
3028
 
@@ -3796,7 +3797,7 @@ if response.status_code == 200:
3796
3797
  return result_train, result_eval_sets
3797
3798
 
3798
3799
  def __prepare_feature_importances(
3799
- self, trace_id: str, x_columns: List[str], updated_shaps: Optional[Dict[str, float]] = None, silent=False
3800
+ self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
3800
3801
  ):
3801
3802
  if self._search_task is None:
3802
3803
  raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
@@ -3807,6 +3808,8 @@ if response.status_code == 200:
3807
3808
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
3808
3809
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
3809
3810
 
3811
+ df = df.rename(columns=original_names_dict)
3812
+
3810
3813
  self.feature_names_ = []
3811
3814
  self.dropped_client_feature_names_ = []
3812
3815
  self.feature_importances_ = []
@@ -3825,7 +3828,7 @@ if response.status_code == 200:
3825
3828
  if feature_meta.name in original_names_dict.keys():
3826
3829
  feature_meta.name = original_names_dict[feature_meta.name]
3827
3830
 
3828
- is_client_feature = feature_meta.name in x_columns
3831
+ is_client_feature = feature_meta.name in df.columns
3829
3832
 
3830
3833
  # TODO make a decision about selected features based on special flag from mlb
3831
3834
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
@@ -3845,7 +3848,7 @@ if response.status_code == 200:
3845
3848
  self.feature_names_.append(feature_meta.name)
3846
3849
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3847
3850
 
3848
- df_for_sample = features_df if feature_meta.name in features_df.columns else self.X
3851
+ df_for_sample = features_df if feature_meta.name in features_df.columns else df
3849
3852
  feature_info = FeatureInfo.from_metadata(feature_meta, df_for_sample, is_client_feature)
3850
3853
  features_info.append(feature_info.to_row(self.bundle))
3851
3854
  features_info_without_links.append(feature_info.to_row_without_links(self.bundle))
@@ -88,8 +88,11 @@ class FeatureInfo:
88
88
 
89
89
 
90
90
  def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
91
- if data is not None and feature_meta.name in data.columns:
92
- feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
91
+ if data is not None and len(data) > 0 and feature_meta.name in data.columns:
92
+ if len(data) > 3:
93
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
94
+ else:
95
+ feature_sample = data[feature_meta.name].dropna().unique().tolist()
93
96
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
94
97
  feature_sample = [round(f, 4) for f in feature_sample]
95
98
  feature_sample = [str(f) for f in feature_sample]
@@ -123,7 +126,11 @@ def _get_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) ->
123
126
 
124
127
 
125
128
  def _get_internal_provider(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
126
- return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
129
+ providers = _list_or_single(feature_meta.data_providers, feature_meta.data_provider)
130
+ if providers:
131
+ return ", ".join(providers)
132
+ else:
133
+ return "" if is_client_feature else (feature_meta.data_provider or "Upgini")
127
134
 
128
135
 
129
136
  def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
@@ -137,13 +144,17 @@ def _get_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> st
137
144
 
138
145
 
139
146
  def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bool) -> str:
140
- return feature_meta.data_source or (
141
- LLM_SOURCE
142
- if not feature_meta.name.endswith("_country")
143
- and not feature_meta.name.endswith("_postal_code")
144
- and not is_client_feature
145
- else ""
146
- )
147
+ sources = _list_or_single(feature_meta.data_sources, feature_meta.data_source)
148
+ if sources:
149
+ return ", ".join(sources)
150
+ else:
151
+ return feature_meta.data_source or (
152
+ LLM_SOURCE
153
+ if not feature_meta.name.endswith("_country")
154
+ and not feature_meta.name.endswith("_postal_code")
155
+ and not is_client_feature
156
+ else ""
157
+ )
147
158
 
148
159
 
149
160
  def _list_or_single(lst: List[str], single: str):
@@ -161,7 +172,7 @@ def _to_anchor(link: str, value: str) -> str:
161
172
  return f"<a href='{link}' target='_blank' rel='noopener noreferrer'>{value}</a>"
162
173
 
163
174
 
164
- def _make_links(names: List[str], links: List[str]):
175
+ def _make_links(names: List[str], links: List[str]) -> str:
165
176
  all_links = [_to_anchor(link, name) for name, link in itertools.zip_longest(names, links)]
166
177
  return ",".join(all_links)
167
178
 
upgini/utils/sort.py CHANGED
@@ -39,6 +39,11 @@ def sort_columns(
39
39
  sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
40
40
  sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
41
41
 
42
+ duplicate_names = df.columns[df.columns.duplicated()].unique()
43
+ if len(duplicate_names) > 0:
44
+ logger.warning(f"WARNING: Found columns with duplicate names: {list(duplicate_names)}")
45
+ df = df[list(set(df.columns))]
46
+
42
47
  other_columns = sorted(
43
48
  [
44
49
  c
@@ -49,7 +54,7 @@ def sort_columns(
49
54
  target = target_column if isinstance(target_column, pd.Series) else df[target_column]
50
55
  target = prepare_target(target, model_task_type)
51
56
  sort_dict = get_sort_columns_dict(
52
- df[sorted_keys + other_columns], target, sorted_keys, omit_nan=True, sort_all_columns=sort_all_columns
57
+ df[sorted_keys + other_columns], target, sorted_keys, sort_all_columns=sort_all_columns
53
58
  )
54
59
  other_columns = [c for c in other_columns if c in sort_dict]
55
60
  columns_for_sort = sorted_keys + sorted(other_columns, key=lambda e: sort_dict[e], reverse=True)
@@ -60,7 +65,6 @@ def get_sort_columns_dict(
60
65
  df: pd.DataFrame,
61
66
  target: pd.Series,
62
67
  sorted_keys: List[str],
63
- omit_nan: bool,
64
68
  n_jobs: Optional[int] = None,
65
69
  sort_all_columns: bool = False,
66
70
  ) -> Dict[str, Any]:
@@ -78,6 +82,13 @@ def get_sort_columns_dict(
78
82
  return {}
79
83
 
80
84
  df = df[columns_for_sort]
85
+ df_with_target = pd.concat([df, target], axis=1)
86
+ # Drop rows where target is NaN
87
+ df_with_target = df_with_target.loc[~target.isna()]
88
+ df = df_with_target.iloc[:, :-1]
89
+ target = df_with_target.iloc[:, -1]
90
+ df = df.fillna(df.mean())
91
+ omit_nan = False
81
92
  hashes = [hash_series(df[col]) for col in columns_for_sort]
82
93
  df = np.asarray(df, dtype=np.float32)
83
94
  correlations = get_sort_columns_correlations(df, target, omit_nan, n_jobs)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.66a3818.dev1
3
+ Version: 1.2.68
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=NbAl7_TAPRLWAiByFYGbEOi4eRvu1Erxk-b19Z5nTRs,33
1
+ upgini/__about__.py,sha256=36MiqZwik3NwGoXvug_voP86Q4bwFpauhq0m3rJ2Avc,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=nXGBMC42VPAmqQKXbEqZJFIHiGj6F_G2AwhurA8LuQs,205351
6
+ upgini/features_enricher.py,sha256=YXG5uUBN1Qo-3X5EUV4Y--Pyqbvg4Gta3WIoWQMTYkU,205359
7
7
  upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
8
  upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
9
9
  upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
@@ -17,7 +17,7 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
18
18
  upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
19
19
  upgini/autofe/date.py,sha256=I07psJerrxOcHao91PdSCk9X6KWu61IBVyFRLjGNgK8,10730
20
- upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
20
+ upgini/autofe/feature.py,sha256=xgu6bVIlUJ5PCUgoXQRNcGkcMOhj-_BdDRmkB_qRFS4,14766
21
21
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
22
22
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
23
23
  upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
@@ -56,7 +56,7 @@ upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuM
56
56
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
57
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
59
- upgini/utils/feature_info.py,sha256=0rOXSyCj-sw-8migWP0ge8qrOzGU50dQvH0JUJUrDfQ,6766
59
+ upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
60
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
61
61
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
62
62
  upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,12 +65,12 @@ upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,1
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
68
- upgini/utils/sort.py,sha256=GfWfCIbfK7e7BvSPZZNJD-PEtiN19DnTCEQkeefHHxI,6491
68
+ upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
69
69
  upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.66a3818.dev1.dist-info/METADATA,sha256=RlvM_n0dDfEJ6-4PCEiyh7bXHCDZjjdTOOP7uGjQd-M,49123
74
- upgini-1.2.66a3818.dev1.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
75
- upgini-1.2.66a3818.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.66a3818.dev1.dist-info/RECORD,,
73
+ upgini-1.2.68.dist-info/METADATA,sha256=XJ67N1OUXdu0fMTjlA9hFcg3HPnN4KXiiqXYvNWD3Dk,49113
74
+ upgini-1.2.68.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.68.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.68.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any