upgini-1.2.68a3832.dev5-py3-none-any.whl → upgini-1.2.68a3832.dev7-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini might be problematic.
- upgini/__about__.py +1 -1
- upgini/metrics.py +72 -14
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/METADATA +1 -1
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/RECORD +6 -6
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/WHEEL +0 -0
- {upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.68a3832.dev5"
+__version__ = "1.2.68a3832.dev7"
upgini/metrics.py
CHANGED
@@ -1,11 +1,11 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 import inspect
 import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
 import numpy as np
@@ -26,11 +26,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS
 
     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator
 
 from upgini.errors import ValidationError
@@ -101,6 +98,58 @@ LIGHTGBM_PARAMS = {
     "min_sum_hessian_in_leaf": 0.01,
 }
 
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 3,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.25,  # CatBoost 0.25
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "softmax",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
+}
+
+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "binary",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
+}
+
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
@@ -220,6 +269,7 @@ SUPPORTED_CATBOOST_METRICS = {
 def is_catboost_estimator(estimator):
     try:
         from catboost import CatBoostClassifier, CatBoostRegressor
+
         return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
     except ImportError:
         return False
@@ -441,28 +491,27 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
             # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
             #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
                 # params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
                 # params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
                 # params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
                 # params = _get_add_params(params, add_params)
                 # estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
                 estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
@@ -474,15 +523,14 @@ class EstimatorWrapper:
             estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
             if is_catboost_estimator(estimator):
+                params["has_time"] = has_date
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features)
                 estimator = CatBoostWrapper(**kwargs)
             else:
                 if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
@@ -530,8 +578,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)
 
         # Find embeddings
-        from catboost import CatBoostClassifier
         import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -655,6 +704,7 @@ class CatBoostWrapper(EstimatorWrapper):
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
             from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -712,6 +762,14 @@ class LightGBMWrapper(EstimatorWrapper):
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
         x, y, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type == ModelTaskType.MULTICLASS:
+            params["num_class"] = y.nunique()
+        emb_pattern = r"(.+)_emb\d+"
+        emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
+        max_bin_by_feature_type = {
+            feature: 63 if feature in emb_features else 255 for feature in x.columns
+        }
+        params["max_bin_by_feature_type"] = max_bin_by_feature_type
         self.cat_features = _get_cat_features(x)
         x = fill_na_cat_features(x, self.cat_features)
         for feature in self.cat_features:
@@ -733,8 +791,8 @@ class LightGBMWrapper(EstimatorWrapper):
 
     def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
-            import shap
             import lightgbm as lgb
+            import shap
 
             if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
                 return None
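For context, here is a minimal sketch (not upgini's actual API) of how a per-task parameter dictionary such as the LIGHTGBM_BINARY_PARAMS added above would typically be combined with caller overrides and handed to a LightGBM estimator, mirroring the _get_add_params(params, add_params) pattern in the hunks. The DEFAULT_RANDOM_STATE value and the glue code are illustrative assumptions, and the dictionary is abbreviated from the diff.

# Illustrative sketch only; requires lightgbm and scikit-learn.
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

DEFAULT_RANDOM_STATE = 42  # assumption: the real constant is defined in upgini.metrics

# Abbreviated copy of the LIGHTGBM_BINARY_PARAMS dict introduced in this release.
LIGHTGBM_BINARY_PARAMS = {
    "random_state": DEFAULT_RANDOM_STATE,
    "n_estimators": 275,
    "max_depth": 5,
    "learning_rate": 0.05,
    "objective": "binary",
    "class_weight": "balanced",
    "verbosity": -1,
}

# Caller-supplied overrides win over the defaults, as in
# params = _get_add_params(params, add_params) above.
add_params = {"n_estimators": 100}
params = {**LIGHTGBM_BINARY_PARAMS, **add_params}

X, y = make_classification(n_samples=500, n_features=10, random_state=0)
model = LGBMClassifier(**params)  # unknown kwargs are passed through to LightGBM as booster params
model.fit(X, y)
print(model.predict(X[:5]))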
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.68a3832.dev5
+Version: 1.2.68a3832.dev7
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=CR4sN9ZhYNXzf0xJ61KtPk3O9k_pn-EljmzdfIfJyoM,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=1rb6BzyuiQFGVCTDmKL2wox3UFRNjtNaIJOwQnZ801A,34956
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,2
 upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
 upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
 upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=onr-wFpP0idy0SH3Wxv2xnqxT5H5MiP70o44f1PhfFw,38808
 upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
-upgini-1.2.68a3832.
+upgini-1.2.68a3832.dev7.dist-info/METADATA,sha256=LZ6mg6092FrqWu_yNWdMKTMgOC9lYclcPCQFLyMQAW0,49149
+upgini-1.2.68a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.68a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.68a3832.dev7.dist-info/RECORD,,
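For reference, each RECORD entry has the form path,sha256=<digest>,<size-in-bytes>, where the digest is the urlsafe-base64-encoded SHA-256 hash of the file with the trailing '=' padding stripped. A short sketch (the helper name is hypothetical):

# Sketch of how a wheel RECORD line is derived from file contents.
import base64
import hashlib

def record_entry(path: str, data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    b64 = base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")
    return f"{path},sha256={b64},{len(data)}"

# If upgini/__about__.py contains exactly this line, the result should
# reproduce the new RECORD entry above, including the 33-byte size.
print(record_entry("upgini/__about__.py", b'__version__ = "1.2.68a3832.dev7"\n'))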
{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/WHEEL
File without changes

{upgini-1.2.68a3832.dev5.dist-info → upgini-1.2.68a3832.dev7.dist-info}/licenses/LICENSE
File without changes