upgini 1.2.81a3832.dev4__py3-none-any.whl → 1.2.81a3832.dev6__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.81a3832.dev4"
1
+ __version__ = "1.2.81a3832.dev6"
upgini/features_enricher.py CHANGED
@@ -1023,12 +1023,12 @@ class FeaturesEnricher(TransformerMixin):
1023
1023
  self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
1024
1024
  return None
1025
1025
 
1026
- maybe_phone_column = self._get_phone_column(self.search_keys)
1027
- text_features = (
1028
- [f for f in self.generate_features if f != maybe_phone_column]
1029
- if self.generate_features is not None
1030
- else None
1031
- )
1026
+ text_features = self.generate_features.copy() if self.generate_features else None
1027
+ if text_features:
1028
+ for renamed, original in columns_renaming.items():
1029
+ if original in text_features:
1030
+ text_features.remove(original)
1031
+ text_features.append(renamed)
1032
1032
 
1033
1033
  print(self.bundle.get("metrics_start"))
1034
1034
  with Spinner():
@@ -1041,9 +1041,7 @@ class FeaturesEnricher(TransformerMixin):
1041
1041
  enriched_cat_features = [f for f in cat_features if f in fitting_enriched_X.columns]
1042
1042
  if len(enriched_cat_features) < len(cat_features):
1043
1043
  missing_cat_features = [f for f in cat_features if f not in fitting_enriched_X.columns]
1044
- self.logger.warning(
1045
- f"Some cat_features were not found in enriched_X: {missing_cat_features}"
1046
- )
1044
+ self.logger.warning(f"Some cat_features were not found in enriched_X: {missing_cat_features}")
1047
1045
 
1048
1046
  _, metric, multiplier = define_scorer(model_task_type, scoring)
1049
1047
 
@@ -2750,7 +2748,9 @@ if response.status_code == 200:
2750
2748
  if self.id_columns is not None:
2751
2749
  for id_column in self.id_columns:
2752
2750
  if id_column not in validated_X.columns:
2753
- raise ValidationError(self.bundle.get("missing_id_column").format(id_column))
2751
+ raise ValidationError(
2752
+ self.bundle.get("missing_id_column").format(id_column, list(validated_X.columns))
2753
+ )
2754
2754
 
2755
2755
  validate_scoring_argument(scoring)
2756
2756
 
@@ -3092,7 +3092,7 @@ if response.status_code == 200:
3092
3092
  self.__show_selected_features(self.fit_search_keys)
3093
3093
 
3094
3094
  autofe_description = self.get_autofe_features_description()
3095
- if autofe_description is not None:
3095
+ if autofe_description is not None and len(autofe_description) > 0:
3096
3096
  self.logger.info(f"AutoFE descriptions: {autofe_description}")
3097
3097
  self.autofe_features_display_handle = display_html_dataframe(
3098
3098
  df=autofe_description,
@@ -4260,12 +4260,13 @@ if response.status_code == 200:
4260
4260
  display_id=f"features_info_{uuid.uuid4()}",
4261
4261
  )
4262
4262
 
4263
- self.data_sources_display_handle = display_html_dataframe(
4264
- self.relevant_data_sources,
4265
- self._relevant_data_sources_wo_links,
4266
- self.bundle.get("relevant_data_sources_header"),
4267
- display_id=f"data_sources_{uuid.uuid4()}",
4268
- )
4263
+ if len(self.relevant_data_sources) > 0:
4264
+ self.data_sources_display_handle = display_html_dataframe(
4265
+ self.relevant_data_sources,
4266
+ self._relevant_data_sources_wo_links,
4267
+ self.bundle.get("relevant_data_sources_header"),
4268
+ display_id=f"data_sources_{uuid.uuid4()}",
4269
+ )
4269
4270
  else:
4270
4271
  msg = self.bundle.get("features_info_zero_important_features")
4271
4272
  self.__log_warning(msg, show_support_link=True)
upgini/metrics.py CHANGED
@@ -18,7 +18,6 @@ from numpy import log1p
18
18
  from pandas.api.types import is_numeric_dtype
19
19
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
20
20
 
21
- # from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
22
21
  from upgini.utils.features_validator import FeaturesValidator
23
22
  from upgini.utils.sklearn_ext import cross_validate
24
23
 
@@ -100,7 +99,6 @@ LIGHTGBM_REGRESSION_PARAMS = {
100
99
  "min_sum_hessian_in_leaf": 0.01,
101
100
  "objective": "huber",
102
101
  "deterministic": "true",
103
- # "force_col_wise": "true",
104
102
  "verbosity": -1,
105
103
  }
106
104
 
@@ -115,12 +113,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
115
113
  "cat_smooth": 18,
116
114
  "cat_l2": 8,
117
115
  "objective": "multiclass",
118
- # "class_weight": "balanced",
119
116
  "use_quantized_grad": "true",
120
117
  "num_grad_quant_bins": "8",
121
118
  "stochastic_rounding": "true",
122
119
  "deterministic": "true",
123
- # "force_col_wise": "true",
124
120
  "verbosity": -1,
125
121
  }
126
122
 
@@ -131,13 +127,11 @@ LIGHTGBM_BINARY_PARAMS = {
131
127
  "max_depth": 5,
132
128
  "learning_rate": 0.05,
133
129
  "objective": "binary",
134
- # "class_weight": "balanced",
135
130
  "max_cat_threshold": 80,
136
131
  "min_data_per_group": 20,
137
132
  "cat_smooth": 18,
138
133
  "cat_l2": 8,
139
134
  "deterministic": "true",
140
- # "force_col_wise": "true",
141
135
  "verbosity": -1,
142
136
  }
143
137
 
@@ -146,34 +140,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
146
140
  N_FOLDS = 5
147
141
  BLOCKED_TS_TEST_SIZE = 0.2
148
142
 
149
- # NA_VALUES = [
150
- # "",
151
- # " ",
152
- # " ",
153
- # "#n/a",
154
- # "#n/a n/a",
155
- # "#na",
156
- # "-1.#ind",
157
- # "-1.#qnan",
158
- # "-nan",
159
- # "1.#ind",
160
- # "1.#qnan",
161
- # "n/a",
162
- # "na",
163
- # "null",
164
- # "nan",
165
- # "n/a",
166
- # "nan",
167
- # "none",
168
- # "-",
169
- # "undefined",
170
- # "[[unknown]]",
171
- # "[not provided]",
172
- # "[unknown]",
173
- # ]
174
-
175
- # NA_REPLACEMENT = "NA"
176
-
177
143
  SUPPORTED_CATBOOST_METRICS = {
178
144
  s.upper(): s
179
145
  for s in (
@@ -975,7 +941,8 @@ def _get_cat_features(
975
941
 
976
942
  logger.info(f"Selected categorical features: {cat_features}")
977
943
 
978
- features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
944
+ non_encode_features = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
945
+ features_to_encode = [f for f in cat_features if f not in non_encode_features]
979
946
 
980
947
  logger.info(f"Features to encode: {features_to_encode}")
981
948
 
@@ -1067,12 +1034,3 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
1067
1034
  multioutput=multioutput,
1068
1035
  )
1069
1036
  return mse if squared else np.sqrt(mse)
1070
-
1071
-
1072
- # def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
1073
- # for c in cat_features:
1074
- # if c in df.columns:
1075
- # df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
1076
- # na_filter = df[c].str.lower().isin(NA_VALUES)
1077
- # df.loc[na_filter, c] = NA_REPLACEMENT
1078
- # return df
upgini/resource_bundle/strings.properties CHANGED
@@ -140,7 +140,7 @@ baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input
140
140
  baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
141
141
  missing_features_for_transform=Missing some features for transform that were presented on fit: {}
142
142
  missing_target_for_transform=Search contains features on target. Please add y to the call and try again
143
- missing_id_column=Id column {} not found in X
143
+ missing_id_column=Id column {} not found in X: {}
144
144
  # target validation
145
145
  empty_target=Target is empty in all rows
146
146
  # non_numeric_target=Binary target should be numerical type
upgini/utils/display_utils.py CHANGED
@@ -92,9 +92,9 @@ def display_html_dataframe(
92
92
  if table_tsv is not None:
93
93
  copy_and_share = f"""
94
94
  <div style="text-align: right">
95
- <button onclick=navigator.clipboard.writeText(decodeURI('{table_tsv}'))>\U0001F4C2 Copy</button>
95
+ <button onclick=navigator.clipboard.writeText(decodeURI('{table_tsv}'))>\U0001f4c2 Copy</button>
96
96
  <a href='mailto:<Share with...>?subject={email_subject}&body={table_tsv}'>
97
- <button>\U0001F4E8 Share</button>
97
+ <button>\U0001f4e8 Share</button>
98
98
  </a>
99
99
  </div>"""
100
100
  else:
@@ -112,6 +112,7 @@ def display_html_dataframe(
112
112
 
113
113
  .upgini-df tbody td {{
114
114
  padding: 0.5em;
115
+ color: black;
115
116
  }}
116
117
 
117
118
  .upgini-df tbody tr:nth-child(odd) {{
@@ -164,10 +165,12 @@ def make_html_report(
164
165
 
165
166
  try:
166
167
  from importlib.resources import files
167
- font_path = files('upgini.utils').joinpath('Roboto-Regular.ttf')
168
+
169
+ font_path = files("upgini.utils").joinpath("Roboto-Regular.ttf")
168
170
  except Exception:
169
171
  from pkg_resources import resource_filename
170
- font_path = resource_filename('upgini.utils', 'Roboto-Regular.ttf')
172
+
173
+ font_path = resource_filename("upgini.utils", "Roboto-Regular.ttf")
171
174
 
172
175
  return f"""<html>
173
176
  <head>
@@ -274,8 +277,10 @@ def make_html_report(
274
277
  if metrics_df is not None
275
278
  else ""
276
279
  }
277
- <h3>Relevant data sources</h3>
278
- {make_table(relevant_datasources_df)}
280
+ {"<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
281
+ if len(relevant_datasources_df) > 0
282
+ else ""
283
+ }
279
284
  <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
280
285
  {make_table(relevant_features_df, wrap_long_string=25)}
281
286
  {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
@@ -311,7 +316,7 @@ def prepare_and_show_report(
311
316
 
312
317
 
313
318
  def show_button_download_pdf(
314
- source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
319
+ source: str, title="\U0001f4ca Download PDF report", display_id: Optional[str] = None, display_handle=None
315
320
  ):
316
321
  from IPython.display import HTML, display
317
322
 
upgini-1.2.81a3832.dev6.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.81a3832.dev4
3
+ Version: 1.2.81a3832.dev6
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.81a3832.dev6.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=pCwu34y7lahquDqU9yxAiDAS0JO7ZeCM2pE16RyFW4w,33
1
+ upgini/__about__.py,sha256=yNrgPKOedmyNgT4TYavHML3irFQc9hNEAf0TxhtzLzA,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=WiSVfmlHI9oKJQbyf46FH0yY80hBJ6hheFpugw0f_vE,210583
6
+ upgini/features_enricher.py,sha256=ODCSzFw62y_8vUrfbcZtDu0dWMIDCGYKWD2F54QDFII,210787
7
7
  upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=I3ad9pcB2ODf4HNETXtVeM6YA8aymUamWtYA0nvIt7Y,39720
9
+ upgini/metrics.py,sha256=lWFF_dQAWcgI7EOQlTXiLjsAEoPLxNv1PCp_egoKolc,38821
10
10
  upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
38
38
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
39
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
40
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
41
- upgini/resource_bundle/strings.properties,sha256=o9iLgAJSxXm6mkQgtNwBVQKIRVwLF1__Dn9gSXb1kLY,27953
41
+ upgini/resource_bundle/strings.properties,sha256=GmkTgxowpykuuviubVH5cMF_lNFQJEqfRoBJaj3c72E,27957
42
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
43
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
44
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,7 +53,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
53
53
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
54
54
  upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
55
55
  upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
56
- upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
56
+ upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
57
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
59
59
  upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.81a3832.dev4.dist-info/METADATA,sha256=JTAyzDU5xBuTcqNjI4mkTOCWSKPMlANhADa9iTM-wxc,49172
74
- upgini-1.2.81a3832.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.81a3832.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.81a3832.dev4.dist-info/RECORD,,
73
+ upgini-1.2.81a3832.dev6.dist-info/METADATA,sha256=WjpXtnU3FUqspcRA2Zl-5iMqo5fqT2xIhHPJXFPcPN4,49172
74
+ upgini-1.2.81a3832.dev6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.81a3832.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.81a3832.dev6.dist-info/RECORD,,