upgini 1.2.91a3884.dev5__py3-none-any.whl → 1.2.92__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/dataset.py +0 -2
- upgini/features_enricher.py +9 -7
- upgini/utils/feature_info.py +2 -2
- upgini/utils/sample_utils.py +4 -2
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.92.dist-info}/METADATA +1 -1
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.92.dist-info}/RECORD +9 -9
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.92.dist-info}/WHEEL +0 -0
- {upgini-1.2.91a3884.dev5.dist-info → upgini-1.2.92.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.91a3884.dev5"
|
1
|
+
__version__ = "1.2.92"
|
upgini/dataset.py
CHANGED
@@ -52,8 +52,6 @@ class Dataset:
|
|
52
52
|
MIN_ROWS_COUNT = 100
|
53
53
|
MAX_ROWS = 200_000
|
54
54
|
IMBALANCE_THESHOLD = 0.6
|
55
|
-
BINARY_BOOTSTRAP_LOOPS = 5
|
56
|
-
MULTICLASS_BOOTSTRAP_LOOPS = 2
|
57
55
|
MIN_TARGET_CLASS_ROWS = 100
|
58
56
|
MAX_MULTICLASS_CLASS_COUNT = 100
|
59
57
|
MIN_SUPPORTED_DATE_TS = 946684800000 # 2000-01-01
|
upgini/features_enricher.py
CHANGED
@@ -5,7 +5,6 @@ import hashlib
|
|
5
5
|
import itertools
|
6
6
|
import json
|
7
7
|
import logging
|
8
|
-
import numbers
|
9
8
|
import os
|
10
9
|
import sys
|
11
10
|
import tempfile
|
@@ -303,6 +302,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
303
302
|
self.metrics: Optional[pd.DataFrame] = None
|
304
303
|
self.feature_names_ = []
|
305
304
|
self.external_source_feature_names = []
|
305
|
+
self.zero_shap_client_features = []
|
306
306
|
self.feature_importances_ = []
|
307
307
|
self.search_id = search_id
|
308
308
|
self.disable_force_downsampling = disable_force_downsampling
|
@@ -2049,8 +2049,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
2049
2049
|
for idx, eval_pair in enumerate(eval_set):
|
2050
2050
|
eval_x, eval_y = eval_pair
|
2051
2051
|
eval_df_with_index = eval_x.copy()
|
2052
|
-
|
2053
|
-
eval_df_with_index[TARGET] = eval_y
|
2052
|
+
eval_df_with_index[TARGET] = eval_y
|
2054
2053
|
eval_df_with_index[EVAL_SET_INDEX] = idx + 1
|
2055
2054
|
df = pd.concat([df, eval_df_with_index])
|
2056
2055
|
|
@@ -2669,7 +2668,7 @@ if response.status_code == 200:
|
|
2669
2668
|
selecting_columns = [
|
2670
2669
|
c
|
2671
2670
|
for c in itertools.chain(validated_Xy.columns.tolist(), selected_generated_features)
|
2672
|
-
if c not in self.
|
2671
|
+
if c not in self.zero_shap_client_features or c in (self.id_columns or [])
|
2673
2672
|
]
|
2674
2673
|
selecting_columns.extend(c for c in result.columns if c in filtered_columns and c not in selecting_columns)
|
2675
2674
|
if add_fit_system_record_id:
|
@@ -3296,7 +3295,7 @@ if response.status_code == 200:
|
|
3296
3295
|
is_transform: bool = False,
|
3297
3296
|
) -> Tuple[pd.DataFrame, pd.Series, Optional[List[Tuple[pd.DataFrame, pd.Series]]]]:
|
3298
3297
|
validated_X = self._validate_X(X, is_transform)
|
3299
|
-
validated_y = self._validate_y(validated_X, y)
|
3298
|
+
validated_y = self._validate_y(validated_X, y, enforce_y=not is_transform)
|
3300
3299
|
validated_eval_set = self._validate_eval_set(validated_X, eval_set)
|
3301
3300
|
return validated_X, validated_y, validated_eval_set
|
3302
3301
|
|
@@ -3380,8 +3379,8 @@ if response.status_code == 200:
|
|
3380
3379
|
|
3381
3380
|
return validated_X
|
3382
3381
|
|
3383
|
-
def _validate_y(self, X: pd.DataFrame, y) -> Optional[pd.Series]:
|
3384
|
-
if y is None:
|
3382
|
+
def _validate_y(self, X: pd.DataFrame, y, enforce_y: bool = True) -> Optional[pd.Series]:
|
3383
|
+
if y is None and not enforce_y:
|
3385
3384
|
return None
|
3386
3385
|
if (
|
3387
3386
|
not isinstance(y, pd.Series)
|
@@ -4064,6 +4063,7 @@ if response.status_code == 200:
|
|
4064
4063
|
|
4065
4064
|
self.feature_names_ = []
|
4066
4065
|
self.external_source_feature_names = []
|
4066
|
+
self.zero_shap_client_features = []
|
4067
4067
|
self.feature_importances_ = []
|
4068
4068
|
features_info = []
|
4069
4069
|
features_info_without_links = []
|
@@ -4099,6 +4099,8 @@ if response.status_code == 200:
|
|
4099
4099
|
|
4100
4100
|
# TODO make a decision about selected features based on special flag from mlb
|
4101
4101
|
if original_shaps.get(feature_meta.name, 0.0) == 0.0:
|
4102
|
+
if is_client_feature and self.fit_select_features:
|
4103
|
+
self.zero_shap_client_features.append(original_name)
|
4102
4104
|
continue
|
4103
4105
|
|
4104
4106
|
# Use only important features
|
upgini/utils/feature_info.py
CHANGED
@@ -155,7 +155,7 @@ def _get_internal_source(feature_meta: FeaturesMetadataV2, is_client_feature: bo
|
|
155
155
|
and not feature_meta.name.endswith("_postal_code")
|
156
156
|
and not is_client_feature
|
157
157
|
else ""
|
158
|
-
|
158
|
+
)
|
159
159
|
|
160
160
|
|
161
161
|
def _list_or_single(lst: List[str], single: str):
|
@@ -179,7 +179,7 @@ def _make_links(names: List[str], links: List[str]) -> str:
|
|
179
179
|
|
180
180
|
|
181
181
|
def _round_shap_value(shap: float) -> float:
|
182
|
-
if shap
|
182
|
+
if shap >= 0.0 and shap < 0.0001:
|
183
183
|
return 0.0001
|
184
184
|
else:
|
185
185
|
return round(shap, 4)
|
upgini/utils/sample_utils.py
CHANGED
@@ -173,7 +173,8 @@ def sample_time_series_train_eval(
|
|
173
173
|
logger=logger,
|
174
174
|
**kwargs,
|
175
175
|
)
|
176
|
-
logger
|
176
|
+
if logger is not None:
|
177
|
+
logger.info(f"Eval set size: {len(eval_df)}")
|
177
178
|
df = pd.concat([train_df, eval_df])
|
178
179
|
|
179
180
|
elif len(train_df) > max_rows:
|
@@ -189,7 +190,8 @@ def sample_time_series_train_eval(
|
|
189
190
|
else:
|
190
191
|
df = train_df
|
191
192
|
|
192
|
-
logger
|
193
|
+
if logger is not None:
|
194
|
+
logger.info(f"Train set size: {len(df)}")
|
193
195
|
|
194
196
|
return df
|
195
197
|
|
@@ -1,9 +1,9 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=wXo9Q87kBdNAVEzs4oUkI_3AmrQDgiMvfXa7xRn9cOE,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
|
-
upgini/dataset.py,sha256=
|
4
|
+
upgini/dataset.py,sha256=e6JDYTZ2AwC5aF-dqclKZKkiKrHo2f6cFmMQO2ZZmjM,32724
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=wFeqZ30Dkhiyath--Jg6uVVoTwCdPJ42Rbe_smr1ue4,218465
|
7
7
|
upgini/http.py,sha256=4i7fQwrwU3WzDUOWzrgR-4C8eJwj_5dBwRAR-UjUtlc,44345
|
8
8
|
upgini/metadata.py,sha256=vsbbHyPCP3Rs8WkeDgQg99uAA_zmsbDStAT-NwDYhO4,12455
|
9
9
|
upgini/metrics.py,sha256=Bc1L9DUmEL8OWwNvIEjPjw5EyHSZbiu3v2hWyBmedis,45313
|
@@ -56,7 +56,7 @@ upgini/utils/deduplicate_utils.py,sha256=EpBVCov42-FJIAPfa4jY_ZRct3N2MFaC7i-oJNZ
|
|
56
56
|
upgini/utils/display_utils.py,sha256=hAeWEcJtPDg8fAVcMNrNB-azFD2WJp1nvbPAhR7SeP4,12071
|
57
57
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
58
58
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
59
|
-
upgini/utils/feature_info.py,sha256=
|
59
|
+
upgini/utils/feature_info.py,sha256=b3RvAeOHSEu-ZXWTrf42Dll_3ZUBL0pw7sdk7hgUKD0,7284
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
62
62
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
@@ -64,14 +64,14 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
|
-
upgini/utils/sample_utils.py,sha256=
|
67
|
+
upgini/utils/sample_utils.py,sha256=ETLPKQU_YngiYbdlnEoF2h7QS-3oN8et54q3Qs2ZAbA,15417
|
68
68
|
upgini/utils/sklearn_ext.py,sha256=jLJWAKkqQinV15Z4y1ZnsN3c-fKFwXTsprs00COnyVU,49315
|
69
69
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
70
70
|
upgini/utils/target_utils.py,sha256=i3Xt5l9ybB2_nF_ma5cfPuL3OeFTs2dY2xDI0p4Azpg,9049
|
71
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
72
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
73
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.91a3884.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
-
upgini-1.2.91a3884.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
-
upgini-1.2.91a3884.dev5.dist-info/RECORD,,
|
74
|
+
upgini-1.2.92.dist-info/METADATA,sha256=yXqDsCwRNGqlytVFuoBL04Swo6xYo5lsk9_YHj-6PfQ,49536
|
75
|
+
upgini-1.2.92.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
76
|
+
upgini-1.2.92.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
77
|
+
upgini-1.2.92.dist-info/RECORD,,
|
File without changes
|
File without changes
|