upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +15 -8
- upgini/metrics.py +22 -29
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/RECORD +8 -8
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.71a3832.dev12"
|
1
|
+
__version__ = "1.2.72"
|
upgini/features_enricher.py
CHANGED
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
|
|
3250
3250
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
3251
3251
|
if len(eval_pair) != 2:
|
3252
3252
|
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
3253
|
-
eval_X = eval_pair
|
3254
|
-
eval_y = eval_pair[1]
|
3253
|
+
eval_X, eval_y = eval_pair
|
3255
3254
|
|
3256
3255
|
if _num_samples(eval_X) == 0:
|
3257
3256
|
raise ValidationError(self.bundle.get("eval_x_is_empty"))
|
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
|
|
3872
3871
|
|
3873
3872
|
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
3874
3873
|
|
3875
|
-
if updated_shaps is not None:
|
3876
|
-
for fm in features_meta:
|
3877
|
-
fm.shap_value = updated_shaps.get(fm.name, 0.0)
|
3878
|
-
|
3879
|
-
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
3880
3874
|
for feature_meta in features_meta:
|
3881
3875
|
if feature_meta.name in original_names_dict.keys():
|
3882
3876
|
feature_meta.name = original_names_dict[feature_meta.name]
|
3883
3877
|
|
3878
|
+
if updated_shaps is not None:
|
3879
|
+
updating_shap = updated_shaps.get(feature_meta.name)
|
3880
|
+
if updating_shap is None:
|
3881
|
+
self.logger.warning(
|
3882
|
+
f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
|
3883
|
+
)
|
3884
|
+
updating_shap = 0.0
|
3885
|
+
feature_meta.shap_value = updating_shap
|
3886
|
+
|
3887
|
+
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
3888
|
+
|
3889
|
+
for feature_meta in features_meta:
|
3890
|
+
|
3884
3891
|
is_client_feature = feature_meta.name in df.columns
|
3885
3892
|
|
3886
3893
|
# TODO make a decision about selected features based on special flag from mlb
|
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
|
|
3892
3899
|
# Use only important features
|
3893
3900
|
if (
|
3894
3901
|
# feature_meta.name in self.fit_generated_features or
|
3895
|
-
feature_meta.name == COUNTRY
|
3902
|
+
feature_meta.name == COUNTRY # constant synthetic column
|
3896
3903
|
# In select_features mode we select also from etalon features and need to show them
|
3897
3904
|
or (not self.fit_select_features and is_client_feature)
|
3898
3905
|
):
|
upgini/metrics.py
CHANGED
@@ -8,16 +8,18 @@ from copy import deepcopy
|
|
8
8
|
from dataclasses import dataclass
|
9
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
10
10
|
|
11
|
+
import lightgbm as lgb
|
11
12
|
import numpy as np
|
12
13
|
import pandas as pd
|
13
14
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
14
|
-
import lightgbm as lgb
|
15
15
|
from numpy import log1p
|
16
16
|
from pandas.api.types import is_numeric_dtype
|
17
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
18
|
+
from sklearn.preprocessing import OrdinalEncoder
|
18
19
|
|
19
20
|
from upgini.utils.features_validator import FeaturesValidator
|
20
21
|
from upgini.utils.sklearn_ext import cross_validate
|
22
|
+
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
21
23
|
|
22
24
|
try:
|
23
25
|
from sklearn.metrics import get_scorer_names
|
@@ -29,7 +31,7 @@ except ImportError:
|
|
29
31
|
available_scorers = SCORERS
|
30
32
|
from sklearn.metrics import mean_squared_error
|
31
33
|
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
32
|
-
from sklearn.model_selection import BaseCrossValidator
|
34
|
+
from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
|
33
35
|
|
34
36
|
from upgini.errors import ValidationError
|
35
37
|
from upgini.metadata import ModelTaskType
|
@@ -83,22 +85,6 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
83
85
|
"auto_class_weights": "Balanced",
|
84
86
|
}
|
85
87
|
|
86
|
-
LIGHTGBM_PARAMS = {
|
87
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
88
|
-
# "num_leaves": 16,
|
89
|
-
# "n_estimators": 150,
|
90
|
-
# "min_child_weight": 1,
|
91
|
-
"max_depth": 4,
|
92
|
-
"max_cat_threshold": 80,
|
93
|
-
"min_data_per_group": 25,
|
94
|
-
"num_boost_round": 150,
|
95
|
-
"cat_l2": 10,
|
96
|
-
"cat_smooth": 12,
|
97
|
-
"learning_rate": 0.05,
|
98
|
-
"feature_fraction": 1.0,
|
99
|
-
"min_sum_hessian_in_leaf": 0.01,
|
100
|
-
}
|
101
|
-
|
102
88
|
LIGHTGBM_REGRESSION_PARAMS = {
|
103
89
|
"random_state": DEFAULT_RANDOM_STATE,
|
104
90
|
"deterministic": True,
|
@@ -125,9 +111,9 @@ LIGHTGBM_MULTICLASS_PARAMS = {
|
|
125
111
|
"max_cat_threshold": 80,
|
126
112
|
"min_data_per_group": 20,
|
127
113
|
"cat_smooth": 18,
|
128
|
-
"cat_l2"
|
114
|
+
"cat_l2": 8,
|
129
115
|
"objective": "multiclass",
|
130
|
-
"class_weight": "balanced",
|
116
|
+
# "class_weight": "balanced",
|
131
117
|
"use_quantized_grad": "true",
|
132
118
|
"num_grad_quant_bins": "8",
|
133
119
|
"stochastic_rounding": "true",
|
@@ -141,12 +127,12 @@ LIGHTGBM_BINARY_PARAMS = {
|
|
141
127
|
"max_depth": 5,
|
142
128
|
"learning_rate": 0.05,
|
143
129
|
"objective": "binary",
|
144
|
-
"class_weight": "balanced",
|
130
|
+
# "class_weight": "balanced",
|
145
131
|
"deterministic": True,
|
146
132
|
"max_cat_threshold": 80,
|
147
133
|
"min_data_per_group": 20,
|
148
134
|
"cat_smooth": 18,
|
149
|
-
"cat_l2"
|
135
|
+
"cat_l2": 8,
|
150
136
|
"verbosity": -1,
|
151
137
|
}
|
152
138
|
|
@@ -505,7 +491,8 @@ class EstimatorWrapper:
|
|
505
491
|
params = _get_add_params(params, add_params)
|
506
492
|
estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
|
507
493
|
elif target_type == ModelTaskType.REGRESSION:
|
508
|
-
|
494
|
+
if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
|
495
|
+
params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
|
509
496
|
params = _get_add_params(params, add_params)
|
510
497
|
estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
|
511
498
|
else:
|
@@ -754,6 +741,7 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
754
741
|
logger=logger,
|
755
742
|
)
|
756
743
|
self.cat_features = None
|
744
|
+
self.cat_encoder = None
|
757
745
|
self.n_classes = None
|
758
746
|
|
759
747
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
@@ -764,10 +752,13 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
764
752
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
765
753
|
self.cat_features = _get_cat_features(x)
|
766
754
|
if self.cat_features:
|
767
|
-
|
768
|
-
|
769
|
-
|
770
|
-
|
755
|
+
x = fill_na_cat_features(x, self.cat_features)
|
756
|
+
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
757
|
+
encoded = pd.DataFrame(
|
758
|
+
encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
|
759
|
+
)
|
760
|
+
x[self.cat_features] = encoded
|
761
|
+
self.cat_encoder = encoder
|
771
762
|
if not is_numeric_dtype(y_numpy):
|
772
763
|
y_numpy = correct_string_target(y_numpy)
|
773
764
|
|
@@ -777,8 +768,10 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
777
768
|
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
778
769
|
if self.cat_features is not None:
|
779
770
|
x = fill_na_cat_features(x, self.cat_features)
|
780
|
-
|
781
|
-
x[
|
771
|
+
if self.cat_encoder is not None:
|
772
|
+
x[self.cat_features] = pd.DataFrame(
|
773
|
+
self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
|
774
|
+
)
|
782
775
|
if not is_numeric_dtype(y):
|
783
776
|
y_numpy = correct_string_target(y_numpy)
|
784
777
|
return x, y_numpy, params
|
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
|
|
204
204
|
def balance_undersample_forced(
|
205
205
|
df: pd.DataFrame,
|
206
206
|
target_column: str,
|
207
|
-
id_columns: List[str],
|
207
|
+
id_columns: Optional[List[str]],
|
208
208
|
date_column: str,
|
209
209
|
task_type: ModelTaskType,
|
210
210
|
cv_type: Optional[CVType],
|
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
|
287
287
|
|
288
288
|
def balance_undersample_time_series_trunc(
|
289
289
|
df: pd.DataFrame,
|
290
|
-
id_columns: List[str],
|
290
|
+
id_columns: Optional[List[str]],
|
291
291
|
date_column: str,
|
292
292
|
sample_size: int,
|
293
293
|
random_state: int = 42,
|
@@ -297,6 +297,8 @@ def balance_undersample_time_series_trunc(
|
|
297
297
|
time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
|
298
298
|
**kwargs,
|
299
299
|
):
|
300
|
+
if id_columns is None:
|
301
|
+
id_columns = []
|
300
302
|
# Convert date column to datetime
|
301
303
|
dates_df = df[id_columns + [date_column]].copy()
|
302
304
|
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256
|
1
|
+
upgini/__about__.py,sha256=-EK4ypqJTIRrg6g1P6PtLXT9vC4Vq7zblqFi389VgwA,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
|
7
7
|
upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
|
10
10
|
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
67
|
upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
|
68
68
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
69
|
-
upgini/utils/target_utils.py,sha256=
|
69
|
+
upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.72.dist-info/METADATA,sha256=OpaT2gblO8qGzEJBNf36-dPwbedHPP93bX0fPAOMl38,49091
|
74
|
+
upgini-1.2.72.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.72.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.72.dist-info/RECORD,,
|
File without changes
|
File without changes
|