upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72a3659.dev1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/autofe/vector.py +23 -1
- upgini/features_enricher.py +15 -8
- upgini/metrics.py +16 -9
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/RECORD +9 -9
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.71a3832.dev12"
+__version__ = "1.2.72a3659.dev1"
upgini/autofe/vector.py
CHANGED
@@ -1,4 +1,4 @@
-from typing import List, Optional
+from typing import Dict, List, Optional
 
 import pandas as pd
 
@@ -22,3 +22,25 @@ class Sum(PandasOperator, VectorizableMixin):
 
     def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
         return pd.DataFrame(data).T.fillna(0).sum(axis=1)
+
+
+class OnnxModel(PandasOperator):
+    name: str = "onnx"
+    is_vector: bool = True
+    output_type: Optional[str] = "float"
+    model_name: str
+
+    def get_params(self) -> Dict[str, Optional[str]]:
+        res = super().get_params()
+        res.update(
+            {
+                "model_name": self.model_name,
+            }
+        )
+        return res
+
+    # def load_model(self):
+    #     ...
+
+    # def calculate_vector(self, data: List[pd.Series]) -> pd.Series:
+    #     ...
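For orientation, a minimal sketch (not part of the diff) of how the new OnnxModel operator exposes its parameters, assuming PandasOperator fields are pydantic-style and using a made-up model name:

    from upgini.autofe.vector import OnnxModel

    # model_name is a required field of the operator; the value here is hypothetical
    op = OnnxModel(model_name="example_onnx_model")

    # get_params() extends the base operator parameters with the model reference,
    # so downstream code can tell which ONNX model the operator refers to
    params = op.get_params()
    print(params["model_name"])  # -> "example_onnx_model"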
upgini/features_enricher.py
CHANGED
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair
 
         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
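A tiny illustration (not taken from the package) of what the unpacking fix above changes: the old assignment bound eval_X to the whole tuple instead of its first element, so later checks inspected the pair rather than the eval features.

    import pandas as pd

    eval_pair = (pd.DataFrame({"f": [1, 2]}), pd.Series([0, 1]))  # hypothetical (X, y) pair

    eval_X_old = eval_pair          # old behavior: eval_X was the tuple itself
    eval_X, eval_y = eval_pair      # fixed behavior: both elements are unpacked
    assert isinstance(eval_X, pd.DataFrame) and isinstance(eval_y, pd.Series)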
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
 
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
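A small illustration (names invented, not the enricher's code) of why the SHAP update above now runs after internal names are mapped back to the original ones: updated_shaps is keyed by the original client-facing names, so a lookup by the internal name would miss and fall back to 0.0 with a warning.

    # assumption: updated_shaps is keyed by original feature names
    updated_shaps = {"feature_a": 0.42}
    original_names_dict = {"f_autofe_feature_a": "feature_a"}

    internal_name = "f_autofe_feature_a"
    original_name = original_names_dict.get(internal_name, internal_name)

    print(updated_shaps.get(original_name))  # 0.42 -> becomes the new shap_value
    print(updated_shaps.get(internal_name))  # None -> warning is logged, shap_value set to 0.0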
upgini/metrics.py
CHANGED
@@ -8,13 +8,14 @@ from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
 from lightgbm import LGBMClassifier, LGBMRegressor
-import lightgbm as lgb
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -125,7 +126,7 @@ LIGHTGBM_MULTICLASS_PARAMS = {
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2"
+    "cat_l2": 8,
     "objective": "multiclass",
     "class_weight": "balanced",
     "use_quantized_grad": "true",
@@ -146,7 +147,7 @@ LIGHTGBM_BINARY_PARAMS = {
     "max_cat_threshold": 80,
     "min_data_per_group": 20,
     "cat_smooth": 18,
-    "cat_l2"
+    "cat_l2": 8,
     "verbosity": -1,
 }
 
@@ -754,6 +755,7 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
         self.n_classes = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -764,10 +766,13 @@ class LightGBMWrapper(EstimatorWrapper):
         params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
         if self.cat_features:
-
-
-
-
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
         if not is_numeric_dtype(y_numpy):
             y_numpy = correct_string_target(y_numpy)
 
@@ -777,8 +782,10 @@ class LightGBMWrapper(EstimatorWrapper):
         x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-
-            x[
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
             y_numpy = correct_string_target(y_numpy)
         return x, y_numpy, params
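To make the new categorical handling concrete, here is a standalone sketch of the pattern the wrapper now applies (column names and data are illustrative, not from upgini): categories are ordinal-encoded with unknowns mapped to -1, cast to the pandas category dtype so LightGBM treats them as categorical, and the fitted encoder is reused at scoring time.

    import pandas as pd
    from sklearn.preprocessing import OrdinalEncoder

    cat_features = ["city"]
    train = pd.DataFrame({"city": ["Berlin", "Paris", "Paris"]})
    test = pd.DataFrame({"city": ["Berlin", "Madrid"]})  # "Madrid" was never seen at fit time

    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    train[cat_features] = pd.DataFrame(
        encoder.fit_transform(train[cat_features]), columns=cat_features, dtype="category"
    )
    # the same fitted encoder is applied at predict time, so unseen categories become -1 instead of raising
    test[cat_features] = pd.DataFrame(
        encoder.transform(test[cat_features]), columns=cat_features, dtype="category"
    )
    print(test["city"].tolist())  # [0.0, -1.0]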
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
 
 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
 
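A quick illustration (function and column names invented) of the new default above: when id_columns is not provided, the time-series truncation falls back to grouping by the date column alone, so df[id_columns + [date_column]] no longer fails on None.

    from typing import List, Optional

    def resolve_id_columns(id_columns: Optional[List[str]], date_column: str) -> List[str]:
        # mirrors the guard added in balance_undersample_time_series_trunc
        return [date_column] if id_columns is None else id_columns

    print(resolve_id_columns(None, "event_date"))         # ['event_date']
    print(resolve_id_columns(["user_id"], "event_date"))  # ['user_id']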
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3832.dev12
+Version: 1.2.72a3659.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=n3Di7UqdUYABUquK0tXIme5xiFjO7fpJ3AKGXnT-Jec,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
 upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
 upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=jobZL_Hg7guufDYH2XdanxgbyJTuC9ZAMZodeptE3I4,39177
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -21,7 +21,7 @@ upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
-upgini/autofe/vector.py,sha256=
+upgini/autofe/vector.py,sha256=-aLI4cA5HI2p42Skj4Sfb3XAPAFfbcu7FjukWsxVFdM,1161
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
 upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
 upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
 upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
 upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
-upgini/utils/target_utils.py,sha256=
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
-upgini-1.2.
+upgini-1.2.72a3659.dev1.dist-info/METADATA,sha256=tuv9DtWEtwHVjoIMPK4LKOvrmaQ3suMZS43JeEcEDiY,49101
+upgini-1.2.72a3659.dev1.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
+upgini-1.2.72a3659.dev1.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.72a3659.dev1.dist-info/RECORD,,
{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/WHEEL
File without changes

{upgini-1.2.71a3832.dev12.dist-info → upgini-1.2.72a3659.dev1.dist-info}/licenses/LICENSE
File without changes