upgini 1.2.71a3832.dev9__tar.gz → 1.2.71a3832.dev10__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/PKG-INFO +1 -1
- upgini-1.2.71a3832.dev10/src/upgini/__about__.py +1 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/features_enricher.py +50 -3
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/metrics.py +31 -20
- upgini-1.2.71a3832.dev9/src/upgini/__about__.py +0 -1
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/.gitignore +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/LICENSE +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/README.md +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/pyproject.toml +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/all_operators.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/date.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/operator.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/base.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/cross.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/delta.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/lag.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/roll.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/trend.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/volatility.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/dataset.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/errors.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/http.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/context.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/metadata.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/normalize_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/base.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/search_task.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/spinner.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/Roboto-Regular.ttf +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/feature_info.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/format.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/mstats.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sklearn_ext.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sort.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ts_utils.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.3
|
2
2
|
Name: upgini
|
3
|
-
Version: 1.2.71a3832.dev9
|
3
|
+
Version: 1.2.71a3832.dev10
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
@@ -0,0 +1 @@
|
|
1
|
+
__version__ = "1.2.71a3832.dev10"
|
@@ -1514,6 +1514,8 @@ class FeaturesEnricher(TransformerMixin):
|
|
1514
1514
|
filtered_enriched_features = self.__filtered_enriched_features(
|
1515
1515
|
importance_threshold,
|
1516
1516
|
max_features,
|
1517
|
+
trace_id,
|
1518
|
+
validated_X
|
1517
1519
|
)
|
1518
1520
|
filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
|
1519
1521
|
|
@@ -3805,6 +3807,46 @@ if response.status_code == 200:
|
|
3805
3807
|
|
3806
3808
|
return result_features
|
3807
3809
|
|
3810
|
+
def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
|
3811
|
+
if self._search_task is None:
|
3812
|
+
raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
|
3813
|
+
features_meta = self._search_task.get_all_features_metadata_v2()
|
3814
|
+
if features_meta is None:
|
3815
|
+
raise Exception(self.bundle.get("missing_features_meta"))
|
3816
|
+
|
3817
|
+
original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
|
3818
|
+
df = df.rename(columns=original_names_dict)
|
3819
|
+
|
3820
|
+
features_meta.sort(key=lambda m: (-m.shap_value, m.name))
|
3821
|
+
|
3822
|
+
importances = {}
|
3823
|
+
|
3824
|
+
for feature_meta in features_meta:
|
3825
|
+
if feature_meta.name in original_names_dict.keys():
|
3826
|
+
feature_meta.name = original_names_dict[feature_meta.name]
|
3827
|
+
|
3828
|
+
is_client_feature = feature_meta.name in df.columns
|
3829
|
+
|
3830
|
+
if feature_meta.shap_value == 0.0:
|
3831
|
+
continue
|
3832
|
+
|
3833
|
+
# Use only important features
|
3834
|
+
if (
|
3835
|
+
feature_meta.name == COUNTRY
|
3836
|
+
# In select_features mode we select also from etalon features and need to show them
|
3837
|
+
or (not self.fit_select_features and is_client_feature)
|
3838
|
+
):
|
3839
|
+
continue
|
3840
|
+
|
3841
|
+
# Temporary workaround for duplicate features metadata
|
3842
|
+
if feature_meta.name in self.feature_names_:
|
3843
|
+
self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
|
3844
|
+
continue
|
3845
|
+
|
3846
|
+
importances[feature_meta.name] = feature_meta.shap_value
|
3847
|
+
|
3848
|
+
return importances
|
3849
|
+
|
3808
3850
|
def __prepare_feature_importances(
|
3809
3851
|
self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
|
3810
3852
|
):
|
@@ -3990,9 +4032,12 @@ if response.status_code == 200:
|
|
3990
4032
|
)
|
3991
4033
|
|
3992
4034
|
def __filtered_importance_names(
|
3993
|
-
self, importance_threshold: Optional[float], max_features: Optional[int]
|
4035
|
+
self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
|
3994
4036
|
) -> List[str]:
|
3995
|
-
|
4037
|
+
# get features importance from server
|
4038
|
+
filtered_importances = self.__get_features_importance_from_server(trace_id, df)
|
4039
|
+
|
4040
|
+
if len(filtered_importances) == 0:
|
3996
4041
|
return []
|
3997
4042
|
|
3998
4043
|
filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
|
@@ -4212,11 +4257,13 @@ if response.status_code == 200:
|
|
4212
4257
|
self,
|
4213
4258
|
importance_threshold: Optional[float],
|
4214
4259
|
max_features: Optional[int],
|
4260
|
+
trace_id: str,
|
4261
|
+
df: pd.DataFrame,
|
4215
4262
|
) -> List[str]:
|
4216
4263
|
importance_threshold = self.__validate_importance_threshold(importance_threshold)
|
4217
4264
|
max_features = self.__validate_max_features(max_features)
|
4218
4265
|
|
4219
|
-
return self.__filtered_importance_names(importance_threshold, max_features)
|
4266
|
+
return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
|
4220
4267
|
|
4221
4268
|
def __detect_missing_search_keys(
|
4222
4269
|
self,
|
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|
3
3
|
import inspect
|
4
4
|
import logging
|
5
5
|
import re
|
6
|
-
import warnings
|
7
6
|
from collections import defaultdict
|
8
7
|
from copy import deepcopy
|
9
8
|
from dataclasses import dataclass
|
@@ -755,9 +754,12 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
755
754
|
logger=logger,
|
756
755
|
)
|
757
756
|
self.cat_features = None
|
757
|
+
self.n_classes = None
|
758
758
|
|
759
759
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
760
760
|
x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
|
761
|
+
if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
|
762
|
+
self.n_classes = len(np.unique(y_numpy))
|
761
763
|
if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
|
762
764
|
params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
|
763
765
|
self.cat_features = _get_cat_features(x)
|
@@ -783,31 +785,40 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
783
785
|
|
784
786
|
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
785
787
|
try:
|
786
|
-
|
787
|
-
|
788
|
-
|
789
|
-
|
790
|
-
|
791
|
-
|
788
|
+
shap_matrix = estimator.predict(
|
789
|
+
x,
|
790
|
+
predict_disable_shape_check=True,
|
791
|
+
raw_score=True,
|
792
|
+
pred_leaf=False,
|
793
|
+
pred_early_stop=True,
|
794
|
+
pred_contrib=True,
|
792
795
|
)
|
793
|
-
from shap import TreeExplainer
|
794
|
-
|
795
|
-
if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
|
796
|
-
return None
|
797
796
|
|
798
|
-
|
799
|
-
|
800
|
-
|
797
|
+
if self.target_type == ModelTaskType.MULTICLASS:
|
798
|
+
n_feat = x.shape[1]
|
799
|
+
shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
|
800
|
+
shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
|
801
801
|
|
802
|
-
# For classification, shap_values is returned as a list for each class
|
803
|
-
|
804
|
-
if isinstance(shap_values, list):
|
805
|
-
shap_values = shap_values[1]
|
802
|
+
# exclude base value
|
803
|
+
shap_matrix = shap_matrix[:, :-1]
|
806
804
|
|
807
|
-
# Calculate mean absolute SHAP value for each feature
|
808
805
|
feature_importance = {}
|
809
806
|
for i, col in enumerate(x.columns):
|
810
|
-
feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
807
|
+
feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
|
808
|
+
|
809
|
+
# # exclude last column (base value)
|
810
|
+
# shap_values_only = shap_values[:, :-1]
|
811
|
+
# mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
|
812
|
+
|
813
|
+
# # For classification, shap_values is returned as a list for each class
|
814
|
+
# # Take values for the positive class
|
815
|
+
# if isinstance(shap_values, list):
|
816
|
+
# shap_values = shap_values[1]
|
817
|
+
|
818
|
+
# # Calculate mean absolute SHAP value for each feature
|
819
|
+
# feature_importance = {}
|
820
|
+
# for i, col in enumerate(x.columns):
|
821
|
+
# feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
811
822
|
|
812
823
|
return feature_importance
|
813
824
|
|
@@ -1 +0,0 @@
|
|
1
|
-
__version__ = "1.2.71a3832.dev9"
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/ads_manager.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/__init__.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/volatility.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/data_source_publisher.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/normalize_utils.py
RENAMED
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/exceptions.py
RENAMED
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings.properties
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/random_under_sampler.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/base_search_key_detector.py
RENAMED
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/blocked_time_series.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
{upgini-1.2.71a3832.dev9 → upgini-1.2.71a3832.dev10}/src/upgini/utils/fallback_progress_bar.py
RENAMED
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|