upgini 1.2.91a3884.dev5__py3-none-any.whl → 1.2.92__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.91a3884.dev5"
1
+ __version__ = "1.2.92"
upgini/dataset.py CHANGED
@@ -52,8 +52,6 @@ class Dataset:
52
52
  MIN_ROWS_COUNT = 100
53
53
  MAX_ROWS = 200_000
54
54
  IMBALANCE_THESHOLD = 0.6
55
- BINARY_BOOTSTRAP_LOOPS = 5
56
- MULTICLASS_BOOTSTRAP_LOOPS = 2
57
55
  MIN_TARGET_CLASS_ROWS = 100
58
56
  MAX_MULTICLASS_CLASS_COUNT = 100
59
57
  MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
@@ -5,7 +5,6 @@ import hashlib
5
5
  import itertools
6
6
  import json
7
7
  import logging
8
- import numbers
9
8
  import os
10
9
  import sys
11
10
  import tempfile
@@ -303,6 +302,7 @@ class FeaturesEnricher(TransformerMixin):
303
302
  self.metrics: Optional[pd.DataFrame] = None
304
303
  self.feature_names_ = []
305
304
  self.external_source_feature_names = []
305
+ self.zero_shap_client_features = []
306
306
  self.feature_importances_ = []
307
307
  self.search_id = search_id
308
308
  self.disable_force_downsampling = disable_force_downsampling
@@ -2049,8 +2049,7 @@ class FeaturesEnricher(TransformerMixin):
2049
2049
  for idx, eval_pair in enumerate(eval_set):
2050
2050
  eval_x, eval_y = eval_pair
2051
2051
  eval_df_with_index = eval_x.copy()
2052
- if eval_y is not None:
2053
- eval_df_with_index[TARGET] = eval_y
2052
+ eval_df_with_index[TARGET] = eval_y
2054
2053
  eval_df_with_index[EVAL_SET_INDEX] = idx + 1
2055
2054
  df = pd.concat([df, eval_df_with_index])
2056
2055
 
@@ -2669,7 +2668,7 @@ if response.status_code == 200:
2669
2668
  selecting_columns = [
2670
2669
  c
2671
2670
  for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
2672
- if c not in self.external_source_feature_names or c in (self.id_columns or [])
2671
+ if c not in self.zero_shap_client_features or c in (self.id_columns or [])
2673
2672
  ]
2674
2673
  selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
2675
2674
  if add_fit_system_record_id:
@@ -3296,7 +3295,7 @@ if response.status_code == 200:
3296
3295
  is_transform: bool = False,
3297
3296
  ) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
3298
3297
  validated_X = self._validate_X(X, is_transform)
3299
- validated_y = self._validate_y(validated_X, y)
3298
+ validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
3300
3299
  validated_eval_set = self._validate_eval_set(validated_X, eval_set)
3301
3300
  return validated_X, validated_y, validated_eval_set
3302
3301
 
@@ -3380,8 +3379,8 @@ if response.status_code == 200:
3380
3379
 
3381
3380
  return validated_X
3382
3381
 
3383
- def _validate_y(self, X: pd.DataFrame, y) -> Optional[pd.Series]:
3384
- if y is None:
3382
+ def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
3383
+ if y is None and not enforce_y:
3385
3384
  return None
3386
3385
  if (
3387
3386
  not isinstance(y, pd.Series)
@@ -4064,6 +4063,7 @@ if response.status_code == 200:
4064
4063
 
4065
4064
  self.feature_names_ = []
4066
4065
  self.external_source_feature_names = []
4066
+ self.zero_shap_client_features = []
4067
4067
  self.feature_importances_ = []
4068
4068
  features_info = []
4069
4069
  features_info_without_links = []
@@ -4099,6 +4099,8 @@ if response.status_code == 200:
4099
4099
 
4100
4100
  # TODO make a decision about selected features based on special flag from mlb
4101
4101
  if original_shaps.get(feature_meta.name, 0.0) == 0.0:
4102
+ if is_client_feature and self.fit_select_features:
4103
+ self.zero_shap_client_features.append(original_name)
4102
4104
  continue
4103
4105
 
4104
4106
  # Use only important features
@@ -155,7 +155,7 @@ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bo
155
155
  and not feature_meta.name.endswith("_postal_code")
156
156
  and not is_client_feature
157
157
  else ""
158
- )
158
+ )
159
159
 
160
160
 
161
161
  def _list_or_single(lst: List[str], single: str):
@@ -179,7 +179,7 @@ def _make_links(names: List[str], links: List[str]) -> str:
179
179
 
180
180
 
181
181
  def _round_shap_value(shap: float) -> float:
182
- if shap > 0.0 and shap < 0.0001:
182
+ if shap >= 0.0 and shap < 0.0001:
183
183
  return 0.0001
184
184
  else:
185
185
  return round(shap, 4)
@@ -173,7 +173,8 @@ def sample_time_series_train_eval(
173
173
  logger=logger,
174
174
  **kwargs,
175
175
  )
176
- logger.info(f"Eval set size: {len(eval_df)}")
176
+ if logger is not None:
177
+ logger.info(f"Eval set size: {len(eval_df)}")
177
178
  df = pd.concat([train_df, eval_df])
178
179
 
179
180
  elif len(train_df) > max_rows:
@@ -189,7 +190,8 @@ def sample_time_series_train_eval(
189
190
  else:
190
191
  df = train_df
191
192
 
192
- logger.info(f"Train set size: {len(df)}")
193
+ if logger is not None:
194
+ logger.info(f"Train set size: {len(df)}")
193
195
 
194
196
  return df
195
197
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.91a3884.dev5
3
+ Version: 1.2.92
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,9 +1,9 @@
1
- upgini/__about__.py,sha256=tqdF5EG5u2XotiGWCSnsQ61GODo019Lr-s097ItdAHs,33
1
+ upgini/__about__.py,sha256=wXo9Q87kBdNAVEzs4oUkI_3AmrQDgiMvfXa7xRn9cOE,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=MituLJZTDdIwSk-Ia5G1pS52PERNHQ2P99FgCH2kTjQ,32790
4
+ upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=udQdXpqVO4YAwEHjUzS195k2jxe5_CtZ-KTpWRicjfs,218225
6
+ upgini/features_enricher.py,sha256=wFeqZ30Dkhiyath--Jg6uVVoTwCdPJ42Rbe_smr1ue4,218465
7
7
  upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
8
8
  upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
9
9
  upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
@@ -56,7 +56,7 @@ upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ
56
56
  upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
57
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
58
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
59
- upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
59
+ upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
60
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
61
61
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
62
62
  upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
64
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
65
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
- upgini/utils/sample_utils.py,sha256=jQ1em2FRnMFLul9ujuBgs5XZ9jAZ4eM4FHT3aDSjOy8,15351
67
+ upgini/utils/sample_utils.py,sha256=ETLPKQU_YngiYbdlnEoF2h7QS-3oN8et54q3Qs2ZAbA,15417
68
68
  upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
69
69
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
70
70
  upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
71
71
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
72
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
73
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.91a3884.dev5.dist-info/METADATA,sha256=A9MDv_VCFrWDDw2Xyo7Wsx3ps6ECwTul36gmw-wujgI,49546
75
- upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.91a3884.dev5.dist-info/RECORD,,
74
+ upgini-1.2.92.dist-info/METADATA,sha256=yXqDsCwRNGqlytVFuoBL04Swo6xYo5lsk9_YHj-6PfQ,49536
75
+ upgini-1.2.92.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
+ upgini-1.2.92.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
+ upgini-1.2.92.dist-info/RECORD,,