upgini 1.2.81a3832.dev5__py3-none-any.whl → 1.2.81a3832.dev7__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.81a3832.dev5"
1
+ __version__ = "1.2.81a3832.dev7"
upgini/features_enricher.py CHANGED
@@ -1023,12 +1023,12 @@ class FeaturesEnricher(TransformerMixin):
1023
1023
  self.__log_warning(self.bundle.get("metrics_no_important_free_features"))
1024
1024
  return None
1025
1025
 
1026
- maybe_phone_column = self._get_phone_column(self.search_keys)
1027
- text_features = (
1028
- [f for f in self.generate_features if f != maybe_phone_column]
1029
- if self.generate_features is not None
1030
- else None
1031
- )
1026
+ text_features = self.generate_features.copy() if self.generate_features else None
1027
+ if text_features:
1028
+ for renamed, original in columns_renaming.items():
1029
+ if original in text_features:
1030
+ text_features.remove(original)
1031
+ text_features.append(renamed)
1032
1032
 
1033
1033
  print(self.bundle.get("metrics_start"))
1034
1034
  with Spinner():
@@ -3092,7 +3092,7 @@ if response.status_code == 200:
3092
3092
  self.__show_selected_features(self.fit_search_keys)
3093
3093
 
3094
3094
  autofe_description = self.get_autofe_features_description()
3095
- if autofe_description is not None:
3095
+ if autofe_description is not None and len(autofe_description) > 0:
3096
3096
  self.logger.info(f"AutoFE descriptions: {autofe_description}")
3097
3097
  self.autofe_features_display_handle = display_html_dataframe(
3098
3098
  df=autofe_description,
@@ -3934,6 +3934,7 @@ if response.status_code == 200:
3934
3934
  continue
3935
3935
 
3936
3936
  # Use only important features
3937
+ # If select_features is False, we don't show etalon features in the report
3937
3938
  if (
3938
3939
  # feature_meta.name in self.fit_generated_features or
3939
3940
  feature_meta.name == COUNTRY # constant synthetic column
@@ -4260,12 +4261,13 @@ if response.status_code == 200:
4260
4261
  display_id=f"features_info_{uuid.uuid4()}",
4261
4262
  )
4262
4263
 
4263
- self.data_sources_display_handle = display_html_dataframe(
4264
- self.relevant_data_sources,
4265
- self._relevant_data_sources_wo_links,
4266
- self.bundle.get("relevant_data_sources_header"),
4267
- display_id=f"data_sources_{uuid.uuid4()}",
4268
- )
4264
+ if len(self.relevant_data_sources) > 0:
4265
+ self.data_sources_display_handle = display_html_dataframe(
4266
+ self.relevant_data_sources,
4267
+ self._relevant_data_sources_wo_links,
4268
+ self.bundle.get("relevant_data_sources_header"),
4269
+ display_id=f"data_sources_{uuid.uuid4()}",
4270
+ )
4269
4271
  else:
4270
4272
  msg = self.bundle.get("features_info_zero_important_features")
4271
4273
  self.__log_warning(msg, show_support_link=True)
upgini/metrics.py CHANGED
@@ -18,7 +18,6 @@ from numpy import log1p
18
18
  from pandas.api.types import is_numeric_dtype
19
19
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
20
20
 
21
- # from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
22
21
  from upgini.utils.features_validator import FeaturesValidator
23
22
  from upgini.utils.sklearn_ext import cross_validate
24
23
 
@@ -100,7 +99,6 @@ LIGHTGBM_REGRESSION_PARAMS = {
100
99
  "min_sum_hessian_in_leaf": 0.01,
101
100
  "objective": "huber",
102
101
  "deterministic": "true",
103
- # "force_col_wise": "true",
104
102
  "verbosity": -1,
105
103
  }
106
104
 
@@ -115,12 +113,10 @@ LIGHTGBM_MULTICLASS_PARAMS = {
115
113
  "cat_smooth": 18,
116
114
  "cat_l2": 8,
117
115
  "objective": "multiclass",
118
- # "class_weight": "balanced",
119
116
  "use_quantized_grad": "true",
120
117
  "num_grad_quant_bins": "8",
121
118
  "stochastic_rounding": "true",
122
119
  "deterministic": "true",
123
- # "force_col_wise": "true",
124
120
  "verbosity": -1,
125
121
  }
126
122
 
@@ -131,13 +127,11 @@ LIGHTGBM_BINARY_PARAMS = {
131
127
  "max_depth": 5,
132
128
  "learning_rate": 0.05,
133
129
  "objective": "binary",
134
- # "class_weight": "balanced",
135
130
  "max_cat_threshold": 80,
136
131
  "min_data_per_group": 20,
137
132
  "cat_smooth": 18,
138
133
  "cat_l2": 8,
139
134
  "deterministic": "true",
140
- # "force_col_wise": "true",
141
135
  "verbosity": -1,
142
136
  }
143
137
 
@@ -146,34 +140,6 @@ LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
146
140
  N_FOLDS = 5
147
141
  BLOCKED_TS_TEST_SIZE = 0.2
148
142
 
149
- # NA_VALUES = [
150
- # "",
151
- # " ",
152
- # " ",
153
- # "#n/a",
154
- # "#n/a n/a",
155
- # "#na",
156
- # "-1.#ind",
157
- # "-1.#qnan",
158
- # "-nan",
159
- # "1.#ind",
160
- # "1.#qnan",
161
- # "n/a",
162
- # "na",
163
- # "null",
164
- # "nan",
165
- # "n/a",
166
- # "nan",
167
- # "none",
168
- # "-",
169
- # "undefined",
170
- # "[[unknown]]",
171
- # "[not provided]",
172
- # "[unknown]",
173
- # ]
174
-
175
- # NA_REPLACEMENT = "NA"
176
-
177
143
  SUPPORTED_CATBOOST_METRICS = {
178
144
  s.upper(): s
179
145
  for s in (
@@ -975,7 +941,8 @@ def _get_cat_features(
975
941
 
976
942
  logger.info(f"Selected categorical features: {cat_features}")
977
943
 
978
- features_to_encode = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
944
+ non_encode_features = list(set(x.select_dtypes(exclude=[np.number, np.datetime64, pd.CategoricalDtype()]).columns))
945
+ features_to_encode = [f for f in cat_features if f not in non_encode_features]
979
946
 
980
947
  logger.info(f"Features to encode: {features_to_encode}")
981
948
 
@@ -1067,12 +1034,3 @@ def _ext_mean_squared_log_error(y_true, y_pred, *, sample_weight=None, multioutp
1067
1034
  multioutput=multioutput,
1068
1035
  )
1069
1036
  return mse if squared else np.sqrt(mse)
1070
-
1071
-
1072
- # def fill_na_cat_features(df: pd.DataFrame, cat_features: List[str]) -> pd.DataFrame:
1073
- # for c in cat_features:
1074
- # if c in df.columns:
1075
- # df[c] = df[c].astype("string").fillna(NA_REPLACEMENT).astype(str)
1076
- # na_filter = df[c].str.lower().isin(NA_VALUES)
1077
- # df.loc[na_filter, c] = NA_REPLACEMENT
1078
- # return df
upgini/utils/display_utils.py CHANGED
@@ -92,9 +92,9 @@ def display_html_dataframe(
92
92
  if table_tsv is not None:
93
93
  copy_and_share = f"""
94
94
  <div style="text-align: right">
95
- <button onclick=navigator.clipboard.writeText(decodeURI('{table_tsv}'))>\U0001F4C2 Copy</button>
95
+ <button onclick=navigator.clipboard.writeText(decodeURI('{table_tsv}'))>\U0001f4c2 Copy</button>
96
96
  <a href='mailto:<Share with...>?subject={email_subject}&body={table_tsv}'>
97
- <button>\U0001F4E8 Share</button>
97
+ <button>\U0001f4e8 Share</button>
98
98
  </a>
99
99
  </div>"""
100
100
  else:
@@ -112,6 +112,7 @@ def display_html_dataframe(
112
112
 
113
113
  .upgini-df tbody td {{
114
114
  padding: 0.5em;
115
+ color: black;
115
116
  }}
116
117
 
117
118
  .upgini-df tbody tr:nth-child(odd) {{
@@ -164,10 +165,12 @@ def make_html_report(
164
165
 
165
166
  try:
166
167
  from importlib.resources import files
167
- font_path = files('upgini.utils').joinpath('Roboto-Regular.ttf')
168
+
169
+ font_path = files("upgini.utils").joinpath("Roboto-Regular.ttf")
168
170
  except Exception:
169
171
  from pkg_resources import resource_filename
170
- font_path = resource_filename('upgini.utils', 'Roboto-Regular.ttf')
172
+
173
+ font_path = resource_filename("upgini.utils", "Roboto-Regular.ttf")
171
174
 
172
175
  return f"""<html>
173
176
  <head>
@@ -274,8 +277,10 @@ def make_html_report(
274
277
  if metrics_df is not None
275
278
  else ""
276
279
  }
277
- <h3>Relevant data sources</h3>
278
- {make_table(relevant_datasources_df)}
280
+ {"<h3>Relevant data sources</h3>" + make_table(relevant_datasources_df)
281
+ if len(relevant_datasources_df) > 0
282
+ else ""
283
+ }
279
284
  <h3>All relevant features. Listing ({len(relevant_features_df)} items)</h3>
280
285
  {make_table(relevant_features_df, wrap_long_string=25)}
281
286
  {"<h3>Description of AutoFE feature names</h3>" + make_table(autofe_descriptions_df, wrap_long_string=25)
@@ -311,7 +316,7 @@ def prepare_and_show_report(
311
316
 
312
317
 
313
318
  def show_button_download_pdf(
314
- source: str, title="\U0001F4CA Download PDF report", display_id: Optional[str] = None, display_handle=None
319
+ source: str, title="\U0001f4ca Download PDF report", display_id: Optional[str] = None, display_handle=None
315
320
  ):
316
321
  from IPython.display import HTML, display
317
322
 
upgini-1.2.81a3832.dev7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.81a3832.dev5
3
+ Version: 1.2.81a3832.dev7
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
upgini-1.2.81a3832.dev7.dist-info/RECORD CHANGED
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=ixqR9YBpWXVR7UIN7EjNA5u0StPom5moacbDEkTJTUs,33
1
+ upgini/__about__.py,sha256=RVMSywROOgx43djBaCB4g_TyIw1r_t2n34999sThuLw,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=JYMZG712gAJbk1iymcm_JB5VaXADtKhI_IuTUdMlmvM,210602
6
+ upgini/features_enricher.py,sha256=WCX50iuq8_hf9AYuEfs_ZWNR7FbFc44zuXg27Z40r2s,210874
7
7
  upgini/http.py,sha256=AfaJ3c8z_tK2hZFEehNybDKE0mp1tYcyAP_l0_p8bLQ,43933
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=I3ad9pcB2ODf4HNETXtVeM6YA8aymUamWtYA0nvIt7Y,39720
9
+ upgini/metrics.py,sha256=lWFF_dQAWcgI7EOQlTXiLjsAEoPLxNv1PCp_egoKolc,38821
10
10
  upgini/search_task.py,sha256=RcvAE785yksWTsTNWuZFVNlk32jHElMoEna1T_C5N8Q,17823
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -53,7 +53,7 @@ upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDc
53
53
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
54
54
  upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
55
55
  upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
56
- upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
56
+ upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
57
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
59
59
  upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=LRN840dzx78-wg7ftdxAkp2c1eu8-JDvkACiRThm4HE,
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.81a3832.dev5.dist-info/METADATA,sha256=etw4taZHb8GlFA_w6POAXt0Wracb4cM4WNYE0SW6tI8,49172
74
- upgini-1.2.81a3832.dev5.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.81a3832.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.81a3832.dev5.dist-info/RECORD,,
73
+ upgini-1.2.81a3832.dev7.dist-info/METADATA,sha256=BrDfaRLGuSFtMudHxPC_sYI8_G9iWzidJO0vIihGtUE,49172
74
+ upgini-1.2.81a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.81a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.81a3832.dev7.dist-info/RECORD,,