upgini 1.2.71a3810.dev4__py3-none-any.whl → 1.2.71a3810.dev6__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of upgini has been flagged as potentially problematic.
- upgini/__about__.py +1 -1
- upgini/autofe/feature.py +9 -7
- upgini/autofe/timeseries/base.py +2 -2
- upgini/autofe/timeseries/cross.py +1 -1
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +86 -27
- upgini/http.py +9 -4
- upgini/metrics.py +178 -54
- upgini/resource_bundle/strings.properties +1 -0
- upgini/utils/deduplicate_utils.py +2 -0
- upgini/utils/feature_info.py +2 -1
- upgini/utils/sklearn_ext.py +20 -2
- upgini/utils/sort.py +1 -1
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/METADATA +5 -4
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/RECORD +18 -19
- upgini/lazy_import.py +0 -35
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
-__version__ = "1.2.71a3810.dev4"
+__version__ = "1.2.71a3810.dev6"
upgini/autofe/feature.py
CHANGED
@@ -162,16 +162,18 @@ class Feature:
             return self.cached_display_name

         should_stack_op = not isinstance(self.children[0], Column) if self.op.is_unary else False
-
+        components = []

         if self.alias:
-            components
-        elif
-            components
-
-            components = (
-                ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe"] + prev_name + [self.get_op_display_name()]
+            components.extend(["f_autofe", self.alias])
+        elif should_stack_op:
+            components.extend(
+                [self.children[0].get_display_name(cache=cache, shorten=shorten, **kwargs), self.get_op_display_name()]
             )
+        elif shorten:
+            components.extend(["f_autofe", self.get_op_display_name()])
+        else:
+            components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe", self.get_op_display_name()]
         components.extend([str(self.display_index)] if self.display_index is not None else [])
         display_name = "_".join(components)

(Several removed lines are truncated in this diff view and are left as-is above.)
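Note: the rewritten branch builds the display name incrementally, adding an explicit `elif shorten` case and stacking the child's display name for unary operators. A minimal sketch of the naming scheme implied by the new branches (the Feature class and its should_stack_op logic are simplified away; names are illustrative):

    def display_name(columns, op_name, alias=None, shorten=False, display_index=None):
        # Mirrors the diff's branch order: alias wins, then the shortened
        # "f_autofe" form, then the fully spelled-out column list.
        if alias:
            components = ["f_autofe", alias]
        elif shorten:
            components = ["f_autofe", op_name]
        else:
            components = ["f_" + "_f_".join(columns), "autofe", op_name]
        if display_index is not None:
            components.append(str(display_index))
        return "_".join(components)

    display_name(["age", "income"], "div")                # 'f_age_f_income_autofe_div'
    display_name(["age", "income"], "div", shorten=True)  # 'f_autofe_div'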
upgini/autofe/timeseries/base.py
CHANGED
@@ -1,5 +1,5 @@
 import abc
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple

 import pandas as pd
 from upgini.autofe.operator import PandasOperator
@@ -64,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
         return base_formula

     @classmethod
-    def _parse_offset_from_formula(cls, formula: str, base_regex: str) ->
+    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
        """
        Parse the offset component from a formula.
upgini/dataset.py
CHANGED
@@ -389,7 +389,7 @@ class Dataset:  # (pd.DataFrame):
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.
+                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False

             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
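Note: the target-validation line now compares against np.inf, the lowercase spelling that NumPy 2.x still accepts (the removed spelling is truncated in this diff view). A minimal sketch of the validity flag, with illustrative column names:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"target": [1.0, np.inf, 3.0]})
    df["target_is_valid"] = ~df["target"].isnull()
    df.loc[df["target"] == np.inf, "target_is_valid"] = False
    print(df["target_is_valid"].tolist())  # [True, False, True]

Note that `== np.inf` only catches positive infinity; `np.isinf` would flag -inf as well.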
upgini/features_enricher.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -841,7 +842,7 @@ class FeaturesEnricher(TransformerMixin):
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1095,7 +1096,7 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_shaps = enriched_cv_result.shap_values

                 if enriched_shaps is not None:
-                    self._update_shap_values(trace_id, fitting_X, enriched_shaps)
+                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)

                 if enriched_metric is None:
                     self.logger.warning(
@@ -1256,14 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
                 if self.raise_validation_error:
                     raise e
                 else:
-                    if not
+                    if not internal_call:
                         self._dump_python_libs()
                         self.__display_support_link()
                     raise e
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")

-    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
@@ -1272,7 +1273,7 @@ class FeaturesEnricher(TransformerMixin):
         }
         self.__prepare_feature_importances(trace_id, df, new_shaps)

-        if self.features_info_display_handle is not None:
+        if not silent and self.features_info_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1284,7 +1285,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.data_sources_display_handle is not None:
+        if not silent and self.data_sources_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1296,7 +1297,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.autofe_features_display_handle is not None:
+        if not silent and self.autofe_features_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
                 autofe_descriptions_df = self.get_autofe_features_description()
@@ -1309,7 +1310,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.report_button_handle is not None:
+        if not silent and self.report_button_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore

@@ -1512,8 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")

         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]

@@ -2541,7 +2541,9 @@ if response.status_code == 200:
             for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
             if c not in self.dropped_client_feature_names_
         ]
-        filtered_columns = self.__filtered_enriched_features(
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
         selecting_columns.extend(
             c for c in filtered_columns if c in result.columns and c not in validated_X.columns
         )
@@ -3248,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair

         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3805,6 +3806,47 @@ if response.status_code == 200:

         return result_features

+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
+
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        importances = {}
+
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+
+            is_client_feature = feature_meta.name in df.columns
+
+            if feature_meta.shap_value == 0.0:
+                continue
+
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+
+            importances[feature_meta.name] = feature_meta.shap_value
+
+        return importances
+
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
@@ -3813,6 +3855,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)

         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3828,15 +3871,23 @@ if response.status_code == 200:

         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}

-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]

+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns

             # TODO make a decision about selected features based on special flag from mlb
@@ -3848,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -3990,16 +4041,19 @@ if response.status_code == 200:
         )

     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-
-
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)

-        filtered_importances
+        if len(filtered_importances) == 0:
+            return []

         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance)
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]
@@ -4084,7 +4138,10 @@ if response.status_code == 200:
         )

         if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
             self.logger.warning(msg + f" Provided search keys: {search_keys}")
             raise ValidationError(msg)

@@ -4135,7 +4192,7 @@ if response.status_code == 200:
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
@@ -4209,11 +4266,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)

-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)

     def __detect_missing_search_keys(
         self,

(Several removed lines are truncated in this diff view and are left as-is above.)
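Note: the thread running through this file is that trace_id and the validated dataframe are now passed into __filtered_enriched_features, so importances are re-fetched from the server (__get_features_importance_from_server) instead of read from a cached list, while the new internal_call/silent flags keep recursive metric calls from re-rendering the notebook display handles. A minimal sketch of the threshold/max_features filtering applied to the server response, assuming it arrives as a name-to-SHAP dict already sorted by importance:

    def filter_importances(importances: dict, importance_threshold=None, max_features=None) -> list:
        items = list(importances.items())
        if importance_threshold is not None:
            items = [(name, imp) for name, imp in items if imp > importance_threshold]
        if max_features is not None:
            items = items[:max_features]
        return [name for name, _ in items]

    filter_importances({"a": 0.5, "b": 0.05, "c": 0.004}, importance_threshold=0.01, max_features=1)
    # ['a']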
upgini/http.py
CHANGED
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin

 import jwt
+
 # import pandas as pd
 import requests
 from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
         else:
             return self._syncronized_refresh_access_token()

-    def _with_unauth_retry(
+    def _with_unauth_retry(
+        self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
+    ):
         try:
             return request()
         except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
             elif "more than one concurrent search request" in e.message.lower():
                 raise ValidationError(bundle.get("concurrent_request"))
             else:
-
-
+                if not silent:
+                    print(e)
+                    show_status_error()
                 raise e

     @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
                     silent=True,
                 ),
                 need_connection_retry=False,
+                silent=True,
             )
         except Exception:
             self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
         try:
             requests.post(
                 url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
-                json=log_event.
+                json=log_event.model_dump(exclude_none=True),
                 headers=_RestClient._get_base_headers(content_type="application/json"),
             )
         except Exception:

(Several removed lines are truncated in this diff view and are left as-is above.)
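Note: json=log_event.model_dump(exclude_none=True) is the pydantic v2 spelling; the removed call is truncated above, but v1 exposed the same serialization as .dict(). A minimal sketch of the v2 API, with an illustrative model:

    from typing import Optional

    from pydantic import BaseModel

    class LogEvent(BaseModel):
        source: str
        tag: Optional[str] = None

    event = LogEvent(source="python")
    print(event.model_dump(exclude_none=True))  # {'source': 'python'}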
upgini/metrics.py
CHANGED
@@ -1,20 +1,21 @@
 from __future__ import annotations

-from dataclasses import dataclass
 import inspect
 import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union

-import
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
-from
+from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder

 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -27,11 +28,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS

     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator

 from upgini.errors import ValidationError
@@ -88,13 +86,73 @@ CATBOOST_MULTICLASS_PARAMS = {

 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+}
+
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "n_estimators": 275,
+    "max_depth": 5,
     "learning_rate": 0.05,
-    "
+    "min_gain_to_split": 0.001,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "objective": "multiclass",
+    "class_weight": "balanced",
+    "use_quantized_grad": "true",
+    "num_grad_quant_bins": "8",
+    "stochastic_rounding": "true",
+    "verbosity": -1,
 }

+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "learning_rate": 0.05,
+    "objective": "binary",
+    "class_weight": "balanced",
+    "deterministic": True,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "verbosity": -1,
+}
+
+LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
+
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
@@ -211,6 +269,15 @@ SUPPORTED_CATBOOST_METRICS = {
 }


+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
@@ -274,7 +341,7 @@ class EstimatorWrapper:
         for c in x.columns:
             if is_numeric_dtype(x[c]):
                 x[c] = x[c].astype(float)
-
+            elif not x[c].dtype == "category":
                 x[c] = x[c].astype(str)

         if not isinstance(y, pd.Series):
@@ -292,7 +359,7 @@ class EstimatorWrapper:
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups

-    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame,
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
         joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
@@ -346,12 +413,15 @@ class EstimatorWrapper:
         for estimator, split in zip(self.cv_estimators, splits):
             _, validation_idx = split
             cv_x = x.iloc[validation_idx]
-
+            if isinstance(y, pd.Series):
+                cv_y = y.iloc[validation_idx]
+            else:
+                cv_y = y[validation_idx]
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
                     # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                    shap_values_all_folds[feature].
+                    shap_values_all_folds[feature].append(shap_value)

         if shap_values_all_folds:
             average_shap_values = {
@@ -427,21 +497,18 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
-            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
-            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -450,31 +517,21 @@ class EstimatorWrapper:
             else:
                 estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if
+            if is_catboost_estimator(estimator):
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
-
-
-
-
-
-                else:
-                    logger.warning(
-                        f"Unexpected estimator is used for metrics: {estimator}. "
-                        "Default strategy for category features will be used"
-                    )
-                    estimator = OtherEstimatorWrapper(**kwargs)
-        except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                elif is_catboost_estimator(estimator):
+                    estimator = CatBoostWrapper(**kwargs)
+                else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
                         "Default strategy for category features will be used"
@@ -487,7 +544,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +574,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)

         # Find embeddings
+        import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +697,10 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 raise e

-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
+            from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -693,27 +755,89 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
+        self.n_classes = None

     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        x,
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
+            self.n_classes = len(np.unique(y_numpy))
+        if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
+            params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
-
-
-
-        if
-
+        print("prepare to fit")
+        print(x.dtypes.to_dict())
+        print(self.cat_features)
+        if self.cat_features:
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        if not is_numeric_dtype(y_numpy):
+            y_numpy = correct_string_target(y_numpy)

-        return x,
+        return x, y_numpy, groups, params

     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x,
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
+        print("prepare to calculate")
+        print(x.dtypes.to_dict())
+        print(self.cat_features)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-
-            x[
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
-
-        return x,
+            y_numpy = correct_string_target(y_numpy)
+        return x, y_numpy, params
+
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            shap_matrix = estimator.predict(
+                x,
+                predict_disable_shape_check=True,
+                raw_score=True,
+                pred_leaf=False,
+                pred_early_stop=True,
+                pred_contrib=True,
+            )
+
+            if self.target_type == ModelTaskType.MULTICLASS:
+                n_feat = x.shape[1]
+                shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
+                shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
+
+            # exclude base value
+            shap_matrix = shap_matrix[:, :-1]
+
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+
+            # # exclude last column (base value)
+            # shap_values_only = shap_values[:, :-1]
+            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
+
+            # # For classification, shap_values is returned as a list for each class
+            # # Take values for the positive class
+            # if isinstance(shap_values, list):
+            #     shap_values = shap_values[1]
+
+            # # Calculate mean absolute SHAP value for each feature
+            # feature_importance = {}
+            # for i, col in enumerate(x.columns):
+            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+
+            return feature_importance
+
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None


 class OtherEstimatorWrapper(EstimatorWrapper):

(Several removed lines are truncated in this diff view and are left as-is above.)
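Note: this file carries the core of the release: CatBoost becomes an optional, lazily imported backend (is_catboost_estimator), LightGBM becomes the default estimator with per-task parameter sets and early stopping, categorical columns are ordinal-encoded before fitting, and SHAP-style importances come straight from LightGBM's pred_contrib prediction. Two stray print(...) debug statements remain in _prepare_to_fit and _prepare_to_calculate. A standalone sketch of the pred_contrib technique for the binary case (dataset and parameters are illustrative):

    import lightgbm as lgb
    import numpy as np
    from sklearn.datasets import make_classification

    X, y = make_classification(n_samples=200, n_features=5, random_state=42)
    model = lgb.LGBMClassifier(n_estimators=20, verbosity=-1).fit(X, y)

    # pred_contrib=True returns (n_samples, n_features + 1); the last column
    # is the base value, so it is dropped before averaging, as in the diff.
    contrib = model.predict(X, pred_contrib=True)
    shap_matrix = contrib[:, :-1]
    importance = {f"f{i}": np.mean(np.abs(shap_matrix[:, i])) for i in range(X.shape[1])}

For multiclass, the flat contribution matrix is reshaped to (n_samples, n_classes, n_features + 1) and averaged over classes first, which is what the n_classes attribute recorded in _prepare_to_fit is for.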
upgini/resource_bundle/strings.properties
CHANGED
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}
upgini/utils/deduplicate_utils.py
CHANGED
@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
     # Checking for different dates by the same personal keys
     uniques = grouped_by_personal_cols[date_col].nunique()
     total = len(uniques)
+    if total == 0:
+        return segment_df, None
     diff_dates = len(uniques[uniques > 1])
     if diff_dates / total >= 0.6:
         return segment_df, None
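Note: the added early return prevents a ZeroDivisionError in the diff_dates / total ratio when the grouped series is empty. The guard in isolation (a sketch; the real function returns the segment dataframe alongside None):

    from typing import Optional

    def diff_dates_ratio(uniques) -> Optional[float]:
        total = len(uniques)
        if total == 0:
            return None  # mirrors the new `return segment_df, None` short-circuit
        return len(uniques[uniques > 1]) / total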
upgini/utils/feature_info.py
CHANGED
@@ -90,7 +90,8 @@ class FeatureInfo:
 def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
     if data is not None and len(data) > 0 and feature_meta.name in data.columns:
         if len(data) > 3:
-
+            rand = np.random.RandomState(42)
+            feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
         else:
             feature_sample = data[feature_meta.name].dropna().unique().tolist()
         if len(feature_sample) > 0 and isinstance(feature_sample[0], float):

(The removed line is truncated in this diff view and is left as-is above.)
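Note: drawing the three example values through a seeded RandomState makes the feature-sample column of the report reproducible across runs. In isolation:

    import numpy as np
    import pandas as pd

    values = pd.Series([10, 20, 30, 40, 50]).dropna().unique()
    rand = np.random.RandomState(42)
    print(rand.choice(values, 3).tolist())  # same three values every run

Since choice samples with replacement by default, the sample can contain duplicates; choice(values, 3, replace=False) would avoid that.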
upgini/utils/sklearn_ext.py
CHANGED
@@ -9,7 +9,6 @@ from traceback import format_exc

 import numpy as np
 import scipy.sparse as sp
-from catboost import CatBoostClassifier, CatBoostRegressor
 from joblib import Parallel, logger
 from scipy.sparse import issparse
 from sklearn import config_context, get_config
@@ -342,6 +341,22 @@ def cross_validate(
         raise e


+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
+def is_lightgbm_estimator(estimator):
+    try:
+        from lightgbm import LGBMClassifier, LGBMRegressor
+        return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
+    except ImportError:
+        return False
+
+
 def _fit_and_score(
     estimator,
     X,
@@ -497,7 +512,10 @@ def _fit_and_score(
         if y_train is None:
             estimator.fit(X_train, **fit_params)
         else:
-            if
+            if is_catboost_estimator(estimator):
+                fit_params = fit_params.copy()
+                fit_params["eval_set"] = [(X_test, y_test)]
+            elif is_lightgbm_estimator(estimator):
                 fit_params = fit_params.copy()
                 fit_params["eval_set"] = [(X_test, y_test)]
             estimator.fit(X_train, y_train, **fit_params)

(The removed condition is truncated in this diff view and is left as-is above.)
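Note: dropping the module-level catboost import means sklearn_ext no longer fails to import when catboost is absent; the estimator family is instead detected at call time by the two try/except helpers above. A usage sketch that collapses the two identical branches of _fit_and_score into one (a simplification; the real code keeps them separate):

    def with_eval_set(estimator, fit_params, X_test, y_test):
        # Both backends accept eval_set for validation-based early stopping.
        if is_catboost_estimator(estimator) or is_lightgbm_estimator(estimator):
            fit_params = fit_params.copy()
            fit_params["eval_set"] = [(X_test, y_test)]
        return fit_params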
upgini/utils/sort.py
CHANGED
@@ -87,7 +87,7 @@ def get_sort_columns_dict(
     df_with_target = df_with_target.loc[~target.isna()]
     df = df_with_target.iloc[:, :-1]
     target = df_with_target.iloc[:, -1]
-    df = df.fillna(df.mean())
+    df = df.fillna(df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean()))
     omit_nan = False
     hashes = [hash_series(df[col]) for col in columns_for_sort]
     df = np.asarray(df, dtype=np.float32)
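Note: df.mean() is float-valued, and filling an integer column with a float mean upcasts it, which changes the series hashes computed on the next line; the fix truncates the mean back to int for integer dtypes. In isolation:

    import pandas as pd

    df = pd.DataFrame({"i": pd.array([1, 2, None], dtype="Int64"), "f": [1.5, None, 2.5]})
    fill = df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean())
    print(df.fillna(fill).dtypes.to_dict())  # 'i' keeps its integer dtype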
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)

 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
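Note: both balancing helpers now accept id_columns=None, and balance_undersample_time_series_trunc falls back to grouping by the date column alone. The fallback pattern in isolation:

    from typing import List, Optional

    def resolve_id_columns(id_columns: Optional[List[str]], date_column: str) -> List[str]:
        return id_columns if id_columns is not None else [date_column]

With the fallback applied, the subsequent df[id_columns + [date_column]] selects the date column twice; deduplicating the combined list would avoid the duplicate column.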
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3810.dev4
+Version: 1.2.71a3810.dev6
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,14 +22,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist:
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: psutil>=
+Requires-Dist: psutil>=5.9.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
@@ -38,6 +38,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown

(Two removed Requires-Dist lines are truncated in this diff view and are left as-is above.)
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/RECORD
CHANGED
@@ -1,13 +1,12 @@
-upgini/__about__.py,sha256=
+upgini/__about__.py,sha256=pP2-JWkoPVosnW6bKUy6ajRXus3pPBdc2hG-HO-Ztao,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=
-upgini/http.py,sha256=
-upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
+upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=B4sFcz-OWkVMQ7d_Y8vZwDo-xXkF6H2oAaCIgImSC0k,39410
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -17,15 +16,15 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
 upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
-upgini/autofe/feature.py,sha256=
+upgini/autofe/feature.py,sha256=md43NwDof0s_nWn_WfOO0l2wYItQ416nEzHm5u29XOA,14945
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
-upgini/autofe/timeseries/base.py,sha256=
-upgini/autofe/timeseries/cross.py,sha256=
+upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
+upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
 upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=
+upgini/resource_bundle/strings.properties,sha256=mwQrerdJj3adzT-fHqvs6Qjf-rqDccsUzELDIXJKAmY,27791
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
-upgini/utils/deduplicate_utils.py,sha256=
+upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
 upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=
+upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sklearn_ext.py,sha256=
-upgini/utils/sort.py,sha256=
-upgini/utils/target_utils.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
+upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3810.dev4.dist-info/METADATA,sha256=
-upgini-1.2.71a3810.dev4.dist-info/WHEEL,sha256=
-upgini-1.2.71a3810.dev4.dist-info/licenses/LICENSE,sha256=
-upgini-1.2.71a3810.dev4.dist-info/RECORD,,
+upgini-1.2.71a3810.dev6.dist-info/METADATA,sha256=hGSStg6uah4fD-YtMrBOLCF6EPf9Uq59DfbYsspPQkI,49101
+upgini-1.2.71a3810.dev6.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.71a3810.dev6.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3810.dev6.dist-info/RECORD,,

(The sha256 digests of several removed entries are truncated in this diff view and are left as-is above.)
upgini/lazy_import.py
DELETED
@@ -1,35 +0,0 @@
-import importlib
-import importlib.util
-import importlib.machinery
-
-
-class LazyImport:
-    def __init__(self, module_name, class_name):
-        self.module_name = module_name
-        self.class_name = class_name
-        self._module = None
-        self._class = None
-
-    def _load(self):
-        if self._module is None:
-            # Load module and save link to it
-            spec = importlib.util.find_spec(self.module_name)
-            if spec is None:
-                raise ImportError(f"Module {self.module_name} not found")
-
-            # Create module
-            self._module = importlib.util.module_from_spec(spec)
-
-            # Execute module
-            spec.loader.exec_module(self._module)
-
-            # Get class from module
-            self._class = getattr(self._module, self.class_name)
-
-    def __call__(self, *args, **kwargs):
-        self._load()
-        return self._class(*args, **kwargs)
-
-    def __getattr__(self, name):
-        self._load()
-        return getattr(self._class, name)
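Note: LazyImport's spec/loader machinery is gone; the release covers the same need with plain function-local imports that fail soft (see is_catboost_estimator in metrics.py and sklearn_ext.py above). The replacement pattern in isolation:

    def make_catboost_classifier(**params):
        # Resolved only when the function is called, so importing this module
        # never requires catboost to be installed.
        try:
            from catboost import CatBoostClassifier
        except ImportError as e:
            raise ImportError("catboost is required for this estimator") from e
        return CatBoostClassifier(**params)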
{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/WHEEL
File without changes

{upgini-1.2.71a3810.dev4.dist-info → upgini-1.2.71a3810.dev6.dist-info}/licenses/LICENSE
File without changes