upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- upgini/__about__.py +1 -1
- upgini/features_enricher.py +20 -9
- upgini/metrics.py +32 -5
- upgini/utils/target_utils.py +4 -2
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/METADATA +1 -1
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/RECORD +8 -8
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/WHEEL +0 -0
- {upgini-1.2.71a3832.dev11.dist-info → upgini-1.2.71a3832.dev13.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
@@ -1 +1 @@
|
|
1
|
-
__version__ = "1.2.71a3832.dev11"
|
1
|
+
__version__ = "1.2.71a3832.dev13"
|
upgini/features_enricher.py
CHANGED
@@ -12,6 +12,7 @@ import tempfile
|
|
12
12
|
import time
|
13
13
|
import uuid
|
14
14
|
from collections import Counter
|
15
|
+
from copy import deepcopy
|
15
16
|
from dataclasses import dataclass
|
16
17
|
from threading import Thread
|
17
18
|
from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
|
@@ -3812,6 +3813,7 @@ if response.status_code == 200:
|
|
3812
3813
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
3813
3814
|
if features_meta is None:
|
3814
3815
|
raise Exception(self.bundle.get("missing_features_meta"))
|
3816
|
+
features_meta = deepcopy(features_meta)
|
3815
3817
|
|
3816
3818
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
3817
3819
|
df = df.rename(columns=original_names_dict)
|
@@ -3854,6 +3856,7 @@ if response.status_code == 200:
|
|
3854
3856
|
features_meta = self._search_task.get_all_features_metadata_v2()
|
3855
3857
|
if features_meta is None:
|
3856
3858
|
raise Exception(self.bundle.get("missing_features_meta"))
|
3859
|
+
features_meta = deepcopy(features_meta)
|
3857
3860
|
|
3858
3861
|
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
3859
3862
|
features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
|
@@ -3869,15 +3872,23 @@ if response.status_code == 200:
|
|
3869
3872
|
|
3870
3873
|
original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
|
3871
3874
|
|
3872
|
-
if updated_shaps is not None:
|
3873
|
-
for fm in features_meta:
|
3874
|
-
fm.shap_value = updated_shaps.get(fm.name, 0.0)
|
3875
|
-
|
3876
|
-
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
3877
3875
|
for feature_meta in features_meta:
|
3878
3876
|
if feature_meta.name in original_names_dict.keys():
|
3879
3877
|
feature_meta.name = original_names_dict[feature_meta.name]
|
3880
3878
|
|
3879
|
+
if updated_shaps is not None:
|
3880
|
+
updating_shap = updated_shaps.get(feature_meta.name)
|
3881
|
+
if updating_shap is None:
|
3882
|
+
self.logger.warning(
|
3883
|
+
f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
|
3884
|
+
)
|
3885
|
+
updating_shap = 0.0
|
3886
|
+
feature_meta.shap_value = updating_shap
|
3887
|
+
|
3888
|
+
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
3889
|
+
|
3890
|
+
for feature_meta in features_meta:
|
3891
|
+
|
3881
3892
|
is_client_feature = feature_meta.name in df.columns
|
3882
3893
|
|
3883
3894
|
# TODO make a decision about selected features based on special flag from mlb
|
@@ -3889,7 +3900,7 @@ if response.status_code == 200:
|
|
3889
3900
|
# Use only important features
|
3890
3901
|
if (
|
3891
3902
|
# feature_meta.name in self.fit_generated_features or
|
3892
|
-
feature_meta.name == COUNTRY
|
3903
|
+
feature_meta.name == COUNTRY # constant synthetic column
|
3893
3904
|
# In select_features mode we select also from etalon features and need to show them
|
3894
3905
|
or (not self.fit_select_features and is_client_feature)
|
3895
3906
|
):
|
@@ -4039,11 +4050,11 @@ if response.status_code == 200:
|
|
4039
4050
|
if len(filtered_importances) == 0:
|
4040
4051
|
return []
|
4041
4052
|
|
4042
|
-
filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
|
4043
|
-
|
4044
4053
|
if importance_threshold is not None:
|
4045
4054
|
filtered_importances = [
|
4046
|
-
(name, importance)
|
4055
|
+
(name, importance)
|
4056
|
+
for name, importance in filtered_importances.items()
|
4057
|
+
if importance > importance_threshold
|
4047
4058
|
]
|
4048
4059
|
if max_features is not None:
|
4049
4060
|
filtered_importances = list(filtered_importances)[:max_features]
|
upgini/metrics.py
CHANGED
@@ -14,7 +14,9 @@ from lightgbm import LGBMClassifier, LGBMRegressor
|
|
14
14
|
import lightgbm as lgb
|
15
15
|
from numpy import log1p
|
16
16
|
from pandas.api.types import is_numeric_dtype
|
17
|
+
# from sklearn.calibration import LabelEncoder
|
17
18
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
19
|
+
from sklearn.preprocessing import OrdinalEncoder
|
18
20
|
|
19
21
|
from upgini.utils.features_validator import FeaturesValidator
|
20
22
|
from upgini.utils.sklearn_ext import cross_validate
|
@@ -754,6 +756,8 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
754
756
|
logger=logger,
|
755
757
|
)
|
756
758
|
self.cat_features = None
|
759
|
+
# self.cat_features_encoders = dict()
|
760
|
+
self.cat_encoder = None
|
757
761
|
self.n_classes = None
|
758
762
|
|
759
763
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
@@ -765,9 +769,22 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
765
769
|
self.cat_features = _get_cat_features(x)
|
766
770
|
if self.cat_features:
|
767
771
|
params["categorical_feature"] = self.cat_features
|
768
|
-
|
769
|
-
|
770
|
-
|
772
|
+
# params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
|
773
|
+
# params["categorical_feature"] = "notauto"
|
774
|
+
# params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
|
775
|
+
# cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
|
776
|
+
# params["categorical_feature"] = ",".join(cat_indices)
|
777
|
+
pass
|
778
|
+
x = fill_na_cat_features(x, self.cat_features)
|
779
|
+
encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
780
|
+
encoded = encoder.fit_transform(x[self.cat_features], y_numpy)
|
781
|
+
x[self.cat_features] = encoded
|
782
|
+
self.cat_encoder = encoder
|
783
|
+
# for feature in self.cat_features:
|
784
|
+
# encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
|
785
|
+
# x[feature] = encoder.fit_transform(x[feature])
|
786
|
+
# self.cat_features_encoders[feature] = encoder
|
787
|
+
# x[feature] = x[feature].astype("category").cat.codes
|
771
788
|
if not is_numeric_dtype(y_numpy):
|
772
789
|
y_numpy = correct_string_target(y_numpy)
|
773
790
|
|
@@ -776,9 +793,19 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
776
793
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
777
794
|
x, y_numpy, params = super()._prepare_to_calculate(x, y)
|
778
795
|
if self.cat_features is not None:
|
796
|
+
params["categorical_feature"] = self.cat_features
|
797
|
+
# params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
|
798
|
+
# params["categorical_feature"] = "notauto"
|
799
|
+
# params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
|
800
|
+
# cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
|
801
|
+
# params["categorical_feature"] = ",".join(cat_indices)
|
779
802
|
x = fill_na_cat_features(x, self.cat_features)
|
780
|
-
|
781
|
-
x[
|
803
|
+
if self.cat_encoder is not None:
|
804
|
+
x[self.cat_features] = self.cat_encoder.transform(x[self.cat_features])
|
805
|
+
# for feature in self.cat_features:
|
806
|
+
# encoder = self.cat_features_encoders[feature]
|
807
|
+
# x[feature] = encoder.transform(x[feature])
|
808
|
+
# x[feature] = x[feature].astype("category").cat.codes
|
782
809
|
if not is_numeric_dtype(y):
|
783
810
|
y_numpy = correct_string_target(y_numpy)
|
784
811
|
return x, y_numpy, params
|
upgini/utils/target_utils.py
CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
|
|
204
204
|
def balance_undersample_forced(
|
205
205
|
df: pd.DataFrame,
|
206
206
|
target_column: str,
|
207
|
-
id_columns: List[str],
|
207
|
+
id_columns: Optional[List[str]],
|
208
208
|
date_column: str,
|
209
209
|
task_type: ModelTaskType,
|
210
210
|
cv_type: Optional[CVType],
|
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
|
|
287
287
|
|
288
288
|
def balance_undersample_time_series_trunc(
|
289
289
|
df: pd.DataFrame,
|
290
|
-
id_columns: List[str],
|
290
|
+
id_columns: Optional[List[str]],
|
291
291
|
date_column: str,
|
292
292
|
sample_size: int,
|
293
293
|
random_state: int = 42,
|
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
|
|
298
298
|
**kwargs,
|
299
299
|
):
|
300
300
|
# Convert date column to datetime
|
301
|
+
if id_columns is None:
|
302
|
+
id_columns = [date_column]
|
301
303
|
dates_df = df[id_columns + [date_column]].copy()
|
302
304
|
dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
|
303
305
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.71a3832.dev11
|
3
|
+
Version: 1.2.71a3832.dev13
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -1,12 +1,12 @@
|
|
1
|
-
upgini/__about__.py,sha256=
|
1
|
+
upgini/__about__.py,sha256=buorll9F2OX4EgV8VmlIrj09nqmsSmqAG8T8p6hRCls,34
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
4
4
|
upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
6
|
-
upgini/features_enricher.py,sha256=
|
6
|
+
upgini/features_enricher.py,sha256=lk80Bx9U36lva6T4lPHBFk88ivrpZ-2uwwMwQg0LglE,207023
|
7
7
|
upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
|
8
8
|
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
9
|
-
upgini/metrics.py,sha256=
|
9
|
+
upgini/metrics.py,sha256=ot6AhxfRRTzM-dNApWTvmteLBAmGjD9OyAuKmtUTprE,40630
|
10
10
|
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
11
11
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
12
12
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
67
67
|
upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
|
68
68
|
upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
|
69
|
-
upgini/utils/target_utils.py,sha256=
|
69
|
+
upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
73
|
-
upgini-1.2.71a3832.
|
74
|
-
upgini-1.2.71a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
-
upgini-1.2.71a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
-
upgini-1.2.71a3832.dev11.dist-info/RECORD,,
|
73
|
+
upgini-1.2.71a3832.dev13.dist-info/METADATA,sha256=JdRugxJAMW4KLyRuz7yIX_PqSz_nObynmhkW5-g_lVs,49102
|
74
|
+
upgini-1.2.71a3832.dev13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
75
|
+
upgini-1.2.71a3832.dev13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
76
|
+
upgini-1.2.71a3832.dev13.dist-info/RECORD,,
|
File without changes
|
File without changes
|