upgini 1.2.68a3832.dev6__py3-none-any.whl → 1.2.68a3832.dev7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/metrics.py +38 -28
- {upgini-1.2.68a3832.dev6.dist-info → upgini-1.2.68a3832.dev7.dist-info}/METADATA +1 -1
- {upgini-1.2.68a3832.dev6.dist-info → upgini-1.2.68a3832.dev7.dist-info}/RECORD +6 -6
- {upgini-1.2.68a3832.dev6.dist-info → upgini-1.2.68a3832.dev7.dist-info}/WHEEL +0 -0
- {upgini-1.2.68a3832.dev6.dist-info → upgini-1.2.68a3832.dev7.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.68a3832.dev6"
|
|
1
|
+
__version__ = "1.2.68a3832.dev7"
|
upgini/metrics.py
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
-
from dataclasses import dataclass
|
|
4
3
|
import inspect
|
|
5
4
|
import logging
|
|
6
5
|
import re
|
|
7
6
|
from collections import defaultdict
|
|
8
7
|
from copy import deepcopy
|
|
8
|
+
from dataclasses import dataclass
|
|
9
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
11
|
import numpy as np
|
|
@@ -26,11 +26,8 @@ except ImportError:
|
|
|
26
26
|
from sklearn.metrics._scorer import SCORERS
|
|
27
27
|
|
|
28
28
|
available_scorers = SCORERS
|
|
29
|
-
from sklearn.metrics._regression import (
|
|
30
|
-
_check_reg_targets,
|
|
31
|
-
check_consistent_length,
|
|
32
|
-
)
|
|
33
29
|
from sklearn.metrics import mean_squared_error
|
|
30
|
+
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
|
34
31
|
from sklearn.model_selection import BaseCrossValidator
|
|
35
32
|
|
|
36
33
|
from upgini.errors import ValidationError
|
|
@@ -102,37 +99,43 @@ LIGHTGBM_PARAMS = {
|
|
|
102
99
|
}
|
|
103
100
|
|
|
104
101
|
LIGHTGBM_REGRESSION_PARAMS = {
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
102
|
+
"random_state": DEFAULT_RANDOM_STATE,
|
|
103
|
+
"deterministic": True,
|
|
104
|
+
"min_gain_to_split": 0.001,
|
|
105
|
+
"n_estimators": 275,
|
|
106
|
+
"max_depth": 5,
|
|
107
|
+
"max_cat_threshold": 80,
|
|
108
|
+
"min_data_per_group": 25,
|
|
109
|
+
"cat_l2": 10,
|
|
110
|
+
"cat_smooth": 12,
|
|
111
|
+
"learning_rate": 0.05,
|
|
112
|
+
"feature_fraction": 1.0,
|
|
113
|
+
"min_sum_hessian_in_leaf": 0.01,
|
|
114
|
+
"objective": "huber",
|
|
115
|
+
"verbosity": -1,
|
|
117
116
|
}
|
|
118
117
|
|
|
119
118
|
LIGHTGBM_MULTICLASS_PARAMS = {
|
|
120
119
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
120
|
+
"deterministic": True,
|
|
121
|
+
"min_gain_to_split": 0.001,
|
|
121
122
|
"n_estimators": 275,
|
|
122
123
|
"max_depth": 3,
|
|
123
124
|
"max_cat_threshold": 80,
|
|
124
125
|
"min_data_per_group": 25,
|
|
125
126
|
"cat_l2": 10,
|
|
126
127
|
"cat_smooth": 12,
|
|
127
|
-
"learning_rate": 0.25,
|
|
128
|
+
"learning_rate": 0.25, # CatBoost 0.25
|
|
128
129
|
"min_sum_hessian_in_leaf": 0.01,
|
|
129
|
-
"objective": "
|
|
130
|
-
"class_weight": "balanced",
|
|
131
|
-
"verbosity":
|
|
130
|
+
"objective": "softmax",
|
|
131
|
+
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
132
|
+
"verbosity": -1,
|
|
132
133
|
}
|
|
133
134
|
|
|
134
135
|
LIGHTGBM_BINARY_PARAMS = {
|
|
135
136
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
137
|
+
"deterministic": True,
|
|
138
|
+
"min_gain_to_split": 0.001,
|
|
136
139
|
"n_estimators": 275,
|
|
137
140
|
"max_depth": 5,
|
|
138
141
|
"max_cat_threshold": 80,
|
|
@@ -143,8 +146,8 @@ LIGHTGBM_BINARY_PARAMS = {
|
|
|
143
146
|
"feature_fraction": 1.0,
|
|
144
147
|
"min_sum_hessian_in_leaf": 0.01,
|
|
145
148
|
"objective": "binary",
|
|
146
|
-
"class_weight": "balanced",
|
|
147
|
-
"verbosity":
|
|
149
|
+
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
150
|
+
"verbosity": -1,
|
|
148
151
|
}
|
|
149
152
|
|
|
150
153
|
N_FOLDS = 5
|
|
@@ -266,6 +269,7 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
266
269
|
def is_catboost_estimator(estimator):
|
|
267
270
|
try:
|
|
268
271
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
272
|
+
|
|
269
273
|
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
270
274
|
except ImportError:
|
|
271
275
|
return False
|
|
@@ -526,9 +530,7 @@ class EstimatorWrapper:
|
|
|
526
530
|
logger.error(
|
|
527
531
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
528
532
|
)
|
|
529
|
-
estimator_copy.set_params(
|
|
530
|
-
cat_features=cat_features
|
|
531
|
-
)
|
|
533
|
+
estimator_copy.set_params(cat_features=cat_features)
|
|
532
534
|
estimator = CatBoostWrapper(**kwargs)
|
|
533
535
|
else:
|
|
534
536
|
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
@@ -576,8 +578,9 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
576
578
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
577
579
|
|
|
578
580
|
# Find embeddings
|
|
579
|
-
from catboost import CatBoostClassifier
|
|
580
581
|
import catboost
|
|
582
|
+
from catboost import CatBoostClassifier
|
|
583
|
+
|
|
581
584
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
582
585
|
emb_pattern = r"(.+)_emb\d+"
|
|
583
586
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
@@ -701,6 +704,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
701
704
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
702
705
|
try:
|
|
703
706
|
from catboost import Pool
|
|
707
|
+
|
|
704
708
|
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
705
709
|
fold_pool = Pool(
|
|
706
710
|
x,
|
|
@@ -760,6 +764,12 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
760
764
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
761
765
|
if self.target_type == ModelTaskType.MULTICLASS:
|
|
762
766
|
params["num_class"] = y.nunique()
|
|
767
|
+
emb_pattern = r"(.+)_emb\d+"
|
|
768
|
+
emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
769
|
+
max_bin_by_feature_type = {
|
|
770
|
+
feature: 63 if feature in emb_features else 255 for feature in x.columns
|
|
771
|
+
}
|
|
772
|
+
params["max_bin_by_feature_type"] = max_bin_by_feature_type
|
|
763
773
|
self.cat_features = _get_cat_features(x)
|
|
764
774
|
x = fill_na_cat_features(x, self.cat_features)
|
|
765
775
|
for feature in self.cat_features:
|
|
@@ -781,8 +791,8 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
781
791
|
|
|
782
792
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
783
793
|
try:
|
|
784
|
-
import shap
|
|
785
794
|
import lightgbm as lgb
|
|
795
|
+
import shap
|
|
786
796
|
|
|
787
797
|
if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
|
|
788
798
|
return None
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.68a3832.dev6
|
|
3
|
+
Version: 1.2.68a3832.dev7
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=CR4sN9ZhYNXzf0xJ61KtPk3O9k_pn-EljmzdfIfJyoM,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
4
|
upgini/dataset.py,sha256=1rb6BzyuiQFGVCTDmKL2wox3UFRNjtNaIJOwQnZ801A,34956
|
|
@@ -7,7 +7,7 @@ upgini/features_enricher.py,sha256=GXXx14jwf3F26_KrfJ6O40Vcu1hRx5iBjUB_jxy3Xvg,2
|
|
|
7
7
|
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=onr-wFpP0idy0SH3Wxv2xnqxT5H5MiP70o44f1PhfFw,38808
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
@@ -70,7 +70,7 @@ upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,
|
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
73
|
-
upgini-1.2.68a3832.
|
|
74
|
-
upgini-1.2.68a3832.dev6.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
75
|
-
upgini-1.2.68a3832.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
-
upgini-1.2.68a3832.dev6.dist-info/RECORD,,
|
|
73
|
+
upgini-1.2.68a3832.dev7.dist-info/METADATA,sha256=LZ6mg6092FrqWu_yNWdMKTMgOC9lYclcPCQFLyMQAW0,49149
|
|
74
|
+
upgini-1.2.68a3832.dev7.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
75
|
+
upgini-1.2.68a3832.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
+
upgini-1.2.68a3832.dev7.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|