upgini 1.2.71a3832.dev13__py3-none-any.whl → 1.2.72__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +1 -2
- upgini/metrics.py +15 -49
- upgini/utils/target_utils.py +2 -2
- {upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/RECORD +8 -8
- {upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev13.dist-info → upgini-1.2.72.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.71a3832.dev13"
|
1
|
+
__version__ = "1.2.72"
|
upgini/features_enricher.py
CHANGED
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
|
|
3250
3250
|
def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
|
3251
3251
|
if len(eval_pair) != 2:
|
3252
3252
|
raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
|
3253
|
-
eval_X = eval_pair
|
3254
|
-
eval_y = eval_pair[1]
|
3253
|
+
eval_X, eval_y = eval_pair
|
3255
3254
|
|
3256
3255
|
if _num_samples(eval_X) == 0:
|
3257
3256
|
raise ValidationError(self.bundle.get("eval_x_is_empty"))
|
upgini/metrics.py
CHANGED
@@ -8,18 +8,18 @@ from copy import deepcopy
|
|
8
8
|
from dataclasses import dataclass
|
9
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
10
10
|
|
11
|
+
import lightgbm as lgb
|
11
12
|
import numpy as np
|
12
13
|
import pandas as pd
|
13
14
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
14
|
-
import lightgbm as lgb
|
15
15
|
from numpy import log1p
|
16
16
|
from pandas.api.types import is_numeric_dtype
|
17
|
-
# from sklearn.calibration import LabelEncoder
|
18
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
19
18
|
from sklearn.preprocessing import OrdinalEncoder
|
20
19
|
|
21
20
|
from upgini.utils.features_validator import FeaturesValidator
|
22
21
|
from upgini.utils.sklearn_ext import cross_validate
|
22
|
+
from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
|
23
23
|
|
24
24
|
try:
|
25
25
|
from sklearn.metrics import get_scorer_names
|
@@ -31,7 +31,7 @@ except ImportError:
|
|
31
31
|
available_scorers = SCORERS
|
32
32
|
from sklearn.metrics import mean_squared_error
|
33
33
|
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
34
|
-
from sklearn.model_selection import BaseCrossValidator
|
34
|
+
from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
|
35
35
|
|
36
36
|
from upgini.errors import ValidationError
|
37
37
|
from upgini.metadata import ModelTaskType
|
@@ -85,22 +85,6 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
85
85
|
"auto_class_weights": "Balanced",
|
86
86
|
}
|
87
87
|
|
88
|
-
LIGHTGBM_PARAMS = {
|
89
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
90
|
-
# "num_leaves": 16,
|
91
|
-
# "n_estimators": 150,
|
92
|
-
# "min_child_weight": 1,
|
93
|
-
"max_depth": 4,
|
94
|
-
"max_cat_threshold": 80,
|
95
|
-
"min_data_per_group": 25,
|
96
|
-
"num_boost_round": 150,
|
97
|
-
"cat_l2": 10,
|
98
|
-
"cat_smooth": 12,
|
99
|
-
"learning_rate": 0.05,
|
100
|
-
"feature_fraction": 1.0,
|
101
|
-
"min_sum_hessian_in_leaf": 0.01,
|
102
|
-
}
|
103
|
-
|
104
88
|
LIGHTGBM_REGRESSION_PARAMS = {
|
105
89
|
"random_state": DEFAULT_RANDOM_STATE,
|
106
90
|
"deterministic": True,
|
@@ -127,9 +111,9 @@ LIGHTGBM_MULTICLASS_PARAMS = {
|
|
127
111
|
"max_cat_threshold": 80,
|
128
112
|
"min_data_per_group": 20,
|
129
113
|
"cat_smooth": 18,
|
130
|
-
"cat_l2"
|
114
|
+
"cat_l2": 8,
|
131
115
|
"objective": "multiclass",
|
132
|
-
"class_weight": "balanced",
|
116
|
+
# "class_weight": "balanced",
|
133
117
|
"use_quantized_grad": "true",
|
134
118
|
"num_grad_quant_bins": "8",
|
135
119
|
"stochastic_rounding": "true",
|
@@ -143,12 +127,12 @@ LIGHTGBM_BINARY_PARAMS = {
|
|
143
127
|
"max_depth": 5,
|
144
128
|
"learning_rate": 0.05,
|
145
129
|
"objective": "binary",
|
146
|
-
"class_weight": "balanced",
|
130
|
+
# "class_weight": "balanced",
|
147
131
|
"deterministic": True,
|
148
132
|
"max_cat_threshold": 80,
|
149
133
|
"min_data_per_group": 20,
|
150
134
|
"cat_smooth": 18,
|
151
|
-
"cat_l2"
|
135
|
+
"cat_l2": 8,
|
152
136
|
"verbosity": -1,
|
153
137
|
}
|
154
138
|
|
@@ -507,7 +491,8 @@ class EstimatorWrapper:
|
|
507
491
|
params = _get_add_params(params, add_params)
|
508
492
|
estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
|
509
493
|
elif target_type == ModelTaskType.REGRESSION:
|
510
|
-
|
494
|
+
if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
|
495
|
+
params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
|
511
496
|
params = _get_add_params(params, add_params)
|
512
497
|
estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
|
513
498
|
else:
|
@@ -756,7 +741,6 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
756
741
|
logger=logger,
|
757
742
|
)
|
758
743
|
self.cat_features = None
|
759
|
-
# self.cat_features_encoders = dict()
|
760
744
|
self.cat_encoder = None
|
761
745
|
self.n_classes = None
|
762
746
|
|
@@ -768,23 +752,13 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
768
752
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
769
753
|
self.cat_features = _get_cat_features(x)
|
770
754
|
if self.cat_features:
|
771
|
-
params["categorical_feature"] = self.cat_features
|
772
|
-
# params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
|
773
|
-
# params["categorical_feature"] = "notauto"
|
774
|
-
# params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
|
775
|
-
# cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
|
776
|
-
# params["categorical_feature"] = ",".join(cat_indices)
|
777
|
-
pass
|
778
755
|
x = fill_na_cat_features(x, self.cat_features)
|
779
756
|
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
780
|
-
encoded =
|
757
|
+
encoded = pd.DataFrame(
|
758
|
+
encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
|
759
|
+
)
|
781
760
|
x[self.cat_features] = encoded
|
782
761
|
self.cat_encoder = encoder
|
783
|
-
# for feature in self.cat_features:
|
784
|
-
# encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
785
|
-
# x[feature] = encoder.fit_transform(x[feature])
|
786
|
-
# self.cat_features_encoders[feature] = encoder
|
787
|
-
# x[feature] = x[feature].astype("category").cat.codes
|
788
762
|
if not is_numeric_dtype(y_numpy):
|
789
763
|
y_numpy = correct_string_target(y_numpy)
|
790
764
|
|
@@ -793,19 +767,11 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
793
767
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
794
768
|
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
795
769
|
if self.cat_features is not None:
|
796
|
-
params["categorical_feature"] = self.cat_features
|
797
|
-
# params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
|
798
|
-
# params["categorical_feature"] = "notauto"
|
799
|
-
# params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
|
800
|
-
# cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
|
801
|
-
# params["categorical_feature"] = ",".join(cat_indices)
|
802
770
|
x = fill_na_cat_features(x, self.cat_features)
|
803
771
|
if self.cat_encoder is not None:
|
804
|
-
x[self.cat_features] =
|
805
|
-
|
806
|
-
|
807
|
-
# x[feature] = encoder.transform(x[feature])
|
808
|
-
# x[feature] = x[feature].astype("category").cat.codes
|
772
|
+
x[self.cat_features] = pd.DataFrame(
|
773
|
+
self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
|
774
|
+
)
|
809
775
|
if not is_numeric_dtype(y):
|
810
776
|
y_numpy = correct_string_target(y_numpy)
|
811
777
|
return x, y_numpy, params
|
upgini/utils/target_utils.py
CHANGED
@@ -297,9 +297,9 @@ def balance_undersample_time_series_trunc(
|
|
297
297
|
time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
|
298
298
|
**kwargs,
|
299
299
|
):
|
300
|
-
# Convert date column to datetime
|
301
300
|
if id_columns is None:
|
302
|
-
id_columns = [
|
301
|
+
id_columns = []
|
302
|
+
# Convert date column to datetime
|
303
303
|
dates_df = df[id_columns + [date_column]].copy()
|
304
304
|
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
305
305
|
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256
|
1
|
+
upgini/__about__.py,sha256=-EK4ypqJTIRrg6g1P6PtLXT9vC4Vq7zblqFi389VgwA,23
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
|
7
7
|
upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
|
10
10
|
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
67
|
upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
|
68
68
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
69
|
-
upgini/utils/target_utils.py,sha256=
|
69
|
+
upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.
|
74
|
-
upgini-1.2.
|
75
|
-
upgini-1.2.
|
76
|
-
upgini-1.2.
|
73
|
+
upgini-1.2.72.dist-info/METADATA,sha256=OpaT2gblO8qGzEJBNf36-dPwbedHPP93bX0fPAOMl38,49091
|
74
|
+
upgini-1.2.72.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.72.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.72.dist-info/RECORD,,
|
File without changes
|
File without changes
|