upgini 1.2.69__tar.gz → 1.2.70a3832.dev1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/PKG-INFO +4 -3
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/pyproject.toml +3 -2
- upgini-1.2.70a3832.dev1/src/upgini/__about__.py +1 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/dataset.py +1 -1
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/features_enricher.py +4 -1
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/http.py +9 -4
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/metrics.py +145 -48
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/resource_bundle/strings.properties +1 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/deduplicate_utils.py +2 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/feature_info.py +2 -1
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/sklearn_ext.py +9 -2
- upgini-1.2.69/src/upgini/__about__.py +0 -1
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/.gitignore +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/LICENSE +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/README.md +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/ads.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/errors.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/lazy_import.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/metadata.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/search_task.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/spinner.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/version_validator.py +0 -0
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.69
+Version: 1.2.70a3832.dev1
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -23,12 +23,12 @@ Classifier: Programming Language :: Python :: 3.10
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.8
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist: numpy>=1.19.0
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
 Requires-Dist: psutil>=6.0.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
@@ -39,6 +39,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown
 
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/pyproject.toml

@@ -35,10 +35,11 @@ classifiers = [
   "Topic :: Scientific/Engineering :: Information Analysis",
 ]
 dependencies = [
-  "catboost>=1.0.3",
+  "lightgbm>=4.6.0",
+  "shap>=0.44.0",
   "fastparquet>=0.8.1",
   "ipywidgets>=8.1.0",
-  "numpy>=1.19.0",
+  "numpy>=1.19.0,<3.0.0",
   "pandas>=1.1.0,<3.0.0",
   "pydantic>1.0.0,<3.0.0",
   "pyjwt>=2.8.0",
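The dependency changes above drop catboost as a hard requirement, add lightgbm and shap, and cap numpy below 3.0.0. A small environment check (illustrative only, not part of the package) using the standard library:

```python
# Hypothetical check, not shipped with upgini: print installed versions against the new pins.
from importlib.metadata import PackageNotFoundError, version

for name, requirement in [("lightgbm", ">=4.6.0"), ("shap", ">=0.44.0"), ("numpy", ">=1.19.0,<3.0.0")]:
    try:
        print(f"{name} {version(name)} (required {requirement})")
    except PackageNotFoundError:
        print(f"{name} is not installed (required {requirement})")
```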
upgini-1.2.70a3832.dev1/src/upgini/__about__.py (added)

@@ -0,0 +1 @@
+__version__ = "1.2.70a3832.dev1"
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/dataset.py

@@ -388,7 +388,7 @@ class Dataset: # (pd.DataFrame):
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
+                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
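The dataset.py change swaps in np.inf, the spelling that works on both NumPy 1.x and 2.x (the capitalized legacy alias was removed in NumPy 2.0, which matters once the numpy<3.0.0 pin allows 2.x). A toy sketch of the same masking, with an illustrative column name:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"target": [1.0, np.inf, 3.0]})
df["target_is_valid"] = ~df["target"].isnull()
# np.inf exists on NumPy 1.x and 2.x; the np.Inf alias was dropped in NumPy 2.0.
df.loc[df["target"] == np.inf, "target_is_valid"] = False
print(df)
```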
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/features_enricher.py

@@ -4075,7 +4075,10 @@ if response.status_code == 200:
         )
 
         if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-            msg = self.bundle.get("unregistered_only_personal_keys")
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
             self.logger.warning(msg + f" Provided search keys: {search_keys}")
             raise ValidationError(msg)
 
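features_enricher.py now raises a different message depending on registration status when every provided search key is CUSTOM_KEY. A simplified, self-contained sketch of that branching (SearchKey and the messages are stubbed here; the real ones live in upgini's metadata module and resource bundle):

```python
# Illustrative stand-ins, not the real upgini classes.
from enum import Enum


class SearchKey(Enum):
    CUSTOM_KEY = "CUSTOM_KEY"
    DATE = "DATE"


def validate_keys(valid_search_keys: dict, is_registered: bool) -> None:
    if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
        if is_registered:
            msg = "Only CUSTOM_KEY search keys were provided..."  # bundle key: only_custom_keys
        else:
            msg = "Only personal search keys used..."             # bundle key: unregistered_only_personal_keys
        raise ValueError(msg)  # upgini raises its own ValidationError here


validate_keys({"col": SearchKey.DATE}, is_registered=True)  # passes silently
```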
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/http.py

@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin
 
 import jwt
+
 # import pandas as pd
 import requests
 from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
         else:
             return self._syncronized_refresh_access_token()
 
-    def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
+    def _with_unauth_retry(
+        self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
+    ):
         try:
             return request()
         except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
             elif "more than one concurrent search request" in e.message.lower():
                 raise ValidationError(bundle.get("concurrent_request"))
             else:
-                print(e)
-                show_status_error()
+                if not silent:
+                    print(e)
+                    show_status_error()
                 raise e
 
     @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
                     silent=True,
                 ),
                 need_connection_retry=False,
+                silent=True,
             )
         except Exception:
             self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
         try:
             requests.post(
                 url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
-                json=log_event.dict(exclude_none=True),
+                json=log_event.model_dump(exclude_none=True),
                 headers=_RestClient._get_base_headers(content_type="application/json"),
             )
         except Exception:
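In http.py the unauthenticated log-event call now serializes with pydantic v2's model_dump(exclude_none=True), and the retry helper gains a silent flag so logging failures stop printing to the console. A minimal sketch of the serialization change, assuming pydantic v2 is installed; the LogEvent fields below are invented for illustration:

```python
from typing import Optional

from pydantic import BaseModel


class LogEvent(BaseModel):  # hypothetical fields, only to show the call
    source: str
    tags: Optional[str] = None


event = LogEvent(source="python")
payload = event.model_dump(exclude_none=True)  # pydantic v2; .dict() is the deprecated v1 spelling
print(payload)  # {'source': 'python'}
```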
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/metrics.py

@@ -1,17 +1,17 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 import inspect
 import logging
 import re
+import warnings
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import catboost
 import numpy as np
 import pandas as pd
-from catboost import CatBoostClassifier, CatBoostRegressor, Pool
+from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -27,11 +27,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS
 
     available_scorers = SCORERS
-from sklearn.metrics._regression import (
-    _check_reg_targets,
-    check_consistent_length,
-)
 from sklearn.metrics import mean_squared_error
+from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator
 
 from upgini.errors import ValidationError
@@ -88,11 +85,73 @@ CATBOOST_MULTICLASS_PARAMS = {
 
 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "n_estimators": 150,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+}
+
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
     "learning_rate": 0.05,
-    "min_child_weight": 1,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 3,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.25,  # CatBoost 0.25
+    "min_sum_hessian_in_leaf": 0.01,
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "objective": "multiclass",
+    "use_quantized_grad": "true",
+    "num_grad_quant_bins": "8",
+    "stochastic_rounding": "true",
+    "verbosity": -1,
+}
+
+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "binary",
+    "class_weight": "balanced",  # TODO pass dict with weights for each class
+    "verbosity": -1,
 }
 
 N_FOLDS = 5
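metrics.py replaces the single LIGHTGBM_PARAMS dictionary with task-specific parameter sets for binary, multiclass and regression models. These dictionaries are unpacked into LightGBM's scikit-learn wrappers, which forward unrecognized keys to the booster. A hedged sketch with a trimmed-down parameter set (not the exact constants above) on synthetic data:

```python
import numpy as np
from lightgbm import LGBMRegressor

params = {  # small subset in the style of LIGHTGBM_REGRESSION_PARAMS
    "random_state": 42,
    "n_estimators": 50,
    "max_depth": 5,
    "learning_rate": 0.05,
    "objective": "huber",
    "verbosity": -1,
}
X = np.random.rand(200, 4)
y = X @ np.array([1.0, -2.0, 0.5, 0.0]) + np.random.normal(0, 0.1, 200)
model = LGBMRegressor(**params).fit(X, y)
print(model.predict(X[:3]))
```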
@@ -211,6 +270,15 @@ SUPPORTED_CATBOOST_METRICS = {
 }
 
 
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
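Since catboost is now an optional dependency, the new module-level helper only imports it on demand and falls back to False. Assuming the helper is importable from upgini.metrics exactly as added in this diff, usage looks like:

```python
from lightgbm import LGBMClassifier

from upgini.metrics import is_catboost_estimator  # added in this release; import path is an assumption

print(is_catboost_estimator(LGBMClassifier()))  # False, and no ImportError even if catboost is absent
```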
@@ -292,7 +360,7 @@ class EstimatorWrapper:
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
-    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
         joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
@@ -346,12 +414,15 @@ class EstimatorWrapper:
         for estimator, split in zip(self.cv_estimators, splits):
             _, validation_idx = split
             cv_x = x.iloc[validation_idx]
-            cv_y = y[validation_idx]
+            if isinstance(y, pd.Series):
+                cv_y = y.iloc[validation_idx]
+            else:
+                cv_y = y[validation_idx]
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
                     # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                    shap_values_all_folds[feature].extend(shap_value.tolist())
+                    shap_values_all_folds[feature].append(shap_value)
 
         if shap_values_all_folds:
             average_shap_values = {
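The SHAP aggregation loop now appends one importance value per feature per fold and averages them afterwards. The aggregation itself reduces to a mean over folds, roughly:

```python
import numpy as np

# Hypothetical per-fold mean |SHAP| values collected for two features over three folds.
shap_values_all_folds = {"feature_a": [0.12, 0.10, 0.15], "feature_b": [0.02, 0.03, 0.01]}
average_shap_values = {feature: float(np.mean(values)) for feature, values in shap_values_all_folds.items()}
print(average_shap_values)
```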
@@ -427,21 +498,18 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
-            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
-            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params,
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator =
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -450,31 +518,21 @@ class EstimatorWrapper:
         else:
             estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
-
-
-
-
-
-                else:
-                    logger.warning(
-                        f"Unexpected estimator is used for metrics: {estimator}. "
-                        "Default strategy for category features will be used"
-                    )
-                    estimator = OtherEstimatorWrapper(**kwargs)
-            except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                elif is_catboost_estimator(estimator):
+                    estimator = CatBoostWrapper(**kwargs)
+                else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
                         "Default strategy for category features will be used"
@@ -487,7 +545,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator: Union[CatBoostClassifier, CatBoostRegressor],
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +575,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)
 
         # Find embeddings
+        import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +698,10 @@ class CatBoostWrapper(EstimatorWrapper):
         else:
             raise e
 
-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: Union[CatBoostClassifier, CatBoostRegressor]) -> Optional[Dict[str, float]]:
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
+            from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -695,25 +758,59 @@ class LightGBMWrapper(EstimatorWrapper):
         self.cat_features = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
         self.cat_features = _get_cat_features(x)
         x = fill_na_cat_features(x, self.cat_features)
         for feature in self.cat_features:
             x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
+        if not is_numeric_dtype(y_numpy):
+            y_numpy = correct_string_target(y_numpy)
 
-        return x, y, groups, params
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, params = super()._prepare_to_calculate(x, y)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
             for feature in self.cat_features:
                 x[feature] = x[feature].astype("category").cat.codes
         if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, params
+            y_numpy = correct_string_target(y_numpy)
+        return x, y_numpy, params
+
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            # Suppress specific warning from SHAP for LightGBM binary classifier
+            warnings.filterwarnings(
+                "ignore",
+                message=(
+                    "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
+                ),
+            )
+            from shap import TreeExplainer
+
+            if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
+                return None
+
+            explainer = TreeExplainer(estimator)
+
+            shap_values = explainer.shap_values(x)
+
+            # For classification, shap_values is returned as a list for each class
+            # Take values for the positive class
+            if isinstance(shap_values, list):
+                shap_values = shap_values[1]
+
+            # Calculate mean absolute SHAP value for each feature
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+
+            return feature_importance
+
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None
 
 
 class OtherEstimatorWrapper(EstimatorWrapper):
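The new LightGBMWrapper.calculate_shap builds feature importances from shap's TreeExplainer, taking the positive-class array when older shap releases return a per-class list. A standalone sketch of the same flow on synthetic data, assuming lightgbm and shap are installed:

```python
import numpy as np
from lightgbm import LGBMClassifier
from shap import TreeExplainer

X = np.random.rand(300, 3)
y = (X[:, 0] + 0.5 * X[:, 1] > 0.8).astype(int)
model = LGBMClassifier(n_estimators=30, verbosity=-1).fit(X, y)

shap_values = TreeExplainer(model).shap_values(X)
if isinstance(shap_values, list):  # older shap: one array per class
    shap_values = shap_values[1]

# Mean absolute SHAP value per feature, as in the wrapper above.
importance = {f"f{i}": float(np.mean(np.abs(shap_values[:, i]))) for i in range(X.shape[1])}
print(importance)
```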
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/resource_bundle/strings.properties

@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/deduplicate_utils.py

@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
         # Checking for different dates by the same personal keys
         uniques = grouped_by_personal_cols[date_col].nunique()
         total = len(uniques)
+        if total == 0:
+            return segment_df, None
         diff_dates = len(uniques[uniques > 1])
         if diff_dates / total >= 0.6:
             return segment_df, None
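The deduplication helper now returns early when the grouping produced no rows, which avoids dividing by zero in the diff_dates / total share. The guarded computation in isolation:

```python
import pandas as pd

uniques = pd.Series([], dtype="int64")  # no groups at all
total = len(uniques)
if total == 0:
    print("empty segment: skip the duplicate-share check")
else:
    diff_dates = len(uniques[uniques > 1])
    print(diff_dates / total >= 0.6)
```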
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/feature_info.py

@@ -90,7 +90,8 @@ class FeatureInfo:
     def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
         if data is not None and len(data) > 0 and feature_meta.name in data.columns:
             if len(data) > 3:
-                feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+                rand = np.random.RandomState(42)
+                feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
             else:
                 feature_sample = data[feature_meta.name].dropna().unique().tolist()
             if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
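Feature samples shown in reports are now drawn from a seeded RandomState, so the three example values are stable across runs. A small sketch:

```python
import numpy as np
import pandas as pd

data = pd.DataFrame({"feature": ["a", "b", None, "c", "d", "e"]})
rand = np.random.RandomState(42)
sample = rand.choice(data["feature"].dropna().unique(), 3).tolist()
print(sample)  # the same three values on every run
```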
{upgini-1.2.69 → upgini-1.2.70a3832.dev1}/src/upgini/utils/sklearn_ext.py

@@ -9,7 +9,6 @@ from traceback import format_exc
 
 import numpy as np
 import scipy.sparse as sp
-from catboost import CatBoostClassifier, CatBoostRegressor
 from joblib import Parallel, logger
 from scipy.sparse import issparse
 from sklearn import config_context, get_config
@@ -342,6 +341,14 @@ def cross_validate(
         raise e
 
 
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 def _fit_and_score(
     estimator,
     X,
@@ -497,7 +504,7 @@ def _fit_and_score(
     if y_train is None:
         estimator.fit(X_train, **fit_params)
     else:
-        if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+        if is_catboost_estimator(estimator):
            fit_params = fit_params.copy()
            fit_params["eval_set"] = [(X_test, y_test)]
        estimator.fit(X_train, y_train, **fit_params)
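sklearn_ext.py mirrors the same lazy CatBoost detection: inside _fit_and_score an eval_set is only injected for CatBoost models, and everything else is fitted as-is. A condensed sketch of that branch (the helper is repeated here so the snippet is self-contained; estimator and data are placeholders):

```python
def is_catboost_estimator(estimator):
    # Lazy check so catboost stays an optional dependency.
    try:
        from catboost import CatBoostClassifier, CatBoostRegressor

        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
    except ImportError:
        return False


def fit_with_optional_eval_set(estimator, X_train, y_train, X_test, y_test, fit_params):
    # Mirrors the _fit_and_score branch: only CatBoost receives an eval_set here.
    if is_catboost_estimator(estimator):
        fit_params = fit_params.copy()
        fit_params["eval_set"] = [(X_test, y_test)]
    estimator.fit(X_train, y_train, **fit_params)
    return estimator
```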
upgini-1.2.69/src/upgini/__about__.py (removed)

@@ -1 +0,0 @@
-__version__ = "1.2.69"