upgini 1.2.71a3832.dev8__tar.gz → 1.2.71a3832.dev10__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/PKG-INFO +1 -1
  2. upgini-1.2.71a3832.dev10/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/features_enricher.py +50 -3
  4. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/metrics.py +44 -37
  5. upgini-1.2.71a3832.dev8/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/.gitignore +0 -0
  7. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/LICENSE +0 -0
  8. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/README.md +0 -0
  9. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/pyproject.toml +0 -0
  10. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/http.py +0 -0
  37. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/__init__.py +0 -0
  41. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/normalizer/normalize_utils.py +0 -0
  42. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/__init__.py +0 -0
  43. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/exceptions.py +0 -0
  44. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings.properties +0 -0
  45. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  46. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/__init__.py +0 -0
  47. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/base.py +0 -0
  48. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/random_under_sampler.py +0 -0
  49. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/sampler/utils.py +0 -0
  50. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/search_task.py +0 -0
  51. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/spinner.py +0 -0
  52. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  53. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/__init__.py +0 -0
  54. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/base_search_key_detector.py +0 -0
  55. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/blocked_time_series.py +0 -0
  56. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/country_utils.py +0 -0
  57. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/custom_loss_utils.py +0 -0
  58. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/cv_utils.py +0 -0
  59. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/datetime_utils.py +0 -0
  60. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/deduplicate_utils.py +0 -0
  61. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/display_utils.py +0 -0
  62. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/email_utils.py +0 -0
  63. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/fallback_progress_bar.py +0 -0
  64. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/feature_info.py +0 -0
  65. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/features_validator.py +0 -0
  66. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/format.py +0 -0
  67. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ip_utils.py +0 -0
  68. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/mstats.py +0 -0
  69. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/phone_utils.py +0 -0
  70. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/postal_code_utils.py +0 -0
  71. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/progress_bar.py +0 -0
  72. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sklearn_ext.py +0 -0
  73. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/sort.py +0 -0
  74. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/target_utils.py +0 -0
  75. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/track_info.py +0 -0
  76. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/ts_utils.py +0 -0
  77. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/utils/warning_counter.py +0 -0
  78. {upgini-1.2.71a3832.dev8 → upgini-1.2.71a3832.dev10}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev8
3
+ Version: 1.2.71a3832.dev10
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.71a3832.dev10"
@@ -1514,6 +1514,8 @@ class FeaturesEnricher(TransformerMixin):
1514
1514
  filtered_enriched_features = self.__filtered_enriched_features(
1515
1515
  importance_threshold,
1516
1516
  max_features,
1517
+ trace_id,
1518
+ validated_X
1517
1519
  )
1518
1520
  filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
1519
1521
 
@@ -3805,6 +3807,46 @@ if response.status_code == 200:
3805
3807
 
3806
3808
  return result_features
3807
3809
 
3810
def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame) -> Dict[str, float]:
    """Collect non-zero SHAP importances from the search task's features metadata.

    Side effects (kept from the original implementation): the list returned by
    ``self._search_task.get_all_features_metadata_v2()`` is sorted in place by
    descending ``shap_value`` and then ``name``, and every entry whose name is a
    key of the file-metadata column mapping is renamed in place to that
    column's ``originalName``.

    Args:
        trace_id: Forwarded to ``self._search_task.get_file_metadata``.
        df: Frame whose columns, after being renamed to their original names,
            mark a feature as client-provided.

    Returns:
        Mapping of feature name to SHAP value, inserted in sorted order.
        Entries are left out when their SHAP value is ``0.0``, when the name
        equals ``COUNTRY``, when the feature is a client column and
        ``self.fit_select_features`` is falsy, or when the same name was
        already collected earlier in this call.

    Raises:
        NotFittedError: ``self._search_task`` is ``None``.
        Exception: the search task returned no features metadata.
    """
    if self._search_task is None:
        raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
    features_meta = self._search_task.get_all_features_metadata_v2()
    if features_meta is None:
        raise Exception(self.bundle.get("missing_features_meta"))

    original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
    df = df.rename(columns=original_names_dict)

    features_meta.sort(key=lambda m: (-m.shap_value, m.name))

    importances: Dict[str, float] = {}

    for feature_meta in features_meta:
        # Map the name back to the original column name when one is known.
        feature_meta.name = original_names_dict.get(feature_meta.name, feature_meta.name)

        if feature_meta.shap_value == 0.0:
            continue

        is_client_feature = feature_meta.name in df.columns

        # Use only important features
        if (
            feature_meta.name == COUNTRY
            # In select_features mode we select also from etalon features and need to show them
            or (not self.fit_select_features and is_client_feature)
        ):
            continue

        # Temporary workaround for duplicate features metadata.
        # The guard must look at what THIS loop has collected: checking
        # self.feature_names_ (which this method never updates) would miss
        # repeated entries and would drop every feature already known there.
        # Because of the sort above, the first occurrence kept is the one
        # with the highest shap value.
        if feature_meta.name in importances:
            self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
            continue

        importances[feature_meta.name] = feature_meta.shap_value

    return importances
3849
+
3808
3850
  def __prepare_feature_importances(
3809
3851
  self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
3810
3852
  ):
@@ -3990,9 +4032,12 @@ if response.status_code == 200:
3990
4032
  )
3991
4033
 
3992
4034
  def __filtered_importance_names(
3993
- self, importance_threshold: Optional[float], max_features: Optional[int]
4035
+ self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
3994
4036
  ) -> List[str]:
3995
- if len(self.feature_names_) == 0:
4037
+ # get features importance from server
4038
+ filtered_importances = self.__get_features_importance_from_server(trace_id, df)
4039
+
4040
+ if len(filtered_importances) == 0:
3996
4041
  return []
3997
4042
 
3998
4043
  filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
@@ -4212,11 +4257,13 @@ if response.status_code == 200:
4212
4257
  self,
4213
4258
  importance_threshold: Optional[float],
4214
4259
  max_features: Optional[int],
4260
+ trace_id: str,
4261
+ df: pd.DataFrame,
4215
4262
  ) -> List[str]:
4216
4263
  importance_threshold = self.__validate_importance_threshold(importance_threshold)
4217
4264
  max_features = self.__validate_max_features(max_features)
4218
4265
 
4219
- return self.__filtered_importance_names(importance_threshold, max_features)
4266
+ return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
4220
4267
 
4221
4268
  def __detect_missing_search_keys(
4222
4269
  self,
@@ -3,7 +3,6 @@ from __future__ import annotations
3
3
  import inspect
4
4
  import logging
5
5
  import re
6
- import warnings
7
6
  from collections import defaultdict
8
7
  from copy import deepcopy
9
8
  from dataclasses import dataclass
@@ -119,18 +118,16 @@ LIGHTGBM_REGRESSION_PARAMS = {
119
118
 
120
119
  LIGHTGBM_MULTICLASS_PARAMS = {
121
120
  "random_state": DEFAULT_RANDOM_STATE,
122
- "deterministic": True,
123
- "min_gain_to_split": 0.001,
124
121
  "n_estimators": 275,
125
- "max_depth": 3,
122
+ "max_depth": 5,
123
+ "learning_rate": 0.05,
124
+ "min_gain_to_split": 0.001,
126
125
  "max_cat_threshold": 80,
127
- "min_data_per_group": 25,
128
- "cat_l2": 10,
129
- "cat_smooth": 12,
130
- "learning_rate": 0.25, # CatBoost 0.25
131
- "min_sum_hessian_in_leaf": 0.01,
132
- "class_weight": "balanced", # TODO pass dict with weights for each class
126
+ "min_data_per_group": 20,
127
+ "cat_smooth": 18,
128
+ "cat_l2" : 8,
133
129
  "objective": "multiclass",
130
+ "class_weight": "balanced",
134
131
  "use_quantized_grad": "true",
135
132
  "num_grad_quant_bins": "8",
136
133
  "stochastic_rounding": "true",
@@ -139,19 +136,17 @@ LIGHTGBM_MULTICLASS_PARAMS = {
139
136
 
140
137
  LIGHTGBM_BINARY_PARAMS = {
141
138
  "random_state": DEFAULT_RANDOM_STATE,
142
- "deterministic": True,
143
139
  "min_gain_to_split": 0.001,
144
140
  "n_estimators": 275,
145
141
  "max_depth": 5,
146
- "max_cat_threshold": 80,
147
- "min_data_per_group": 25,
148
- "cat_l2": 10,
149
- "cat_smooth": 12,
150
142
  "learning_rate": 0.05,
151
- "feature_fraction": 1.0,
152
- "min_sum_hessian_in_leaf": 0.01,
153
143
  "objective": "binary",
154
- "class_weight": "balanced", # TODO pass dict with weights for each class
144
+ "class_weight": "balanced",
145
+ "deterministic": True,
146
+ "max_cat_threshold": 80,
147
+ "min_data_per_group": 20,
148
+ "cat_smooth": 18,
149
+ "cat_l2" : 8,
155
150
  "verbosity": -1,
156
151
  }
157
152
 
@@ -759,9 +754,12 @@ class LightGBMWrapper(EstimatorWrapper):
759
754
  logger=logger,
760
755
  )
761
756
  self.cat_features = None
757
+ self.n_classes = None
762
758
 
763
759
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
764
760
  x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
761
+ if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
762
+ self.n_classes = len(np.unique(y_numpy))
765
763
  if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
766
764
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
767
765
  self.cat_features = _get_cat_features(x)
@@ -787,31 +785,40 @@ class LightGBMWrapper(EstimatorWrapper):
787
785
 
788
786
  def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
789
787
  try:
790
- # Suppress specific warning from SHAP for LightGBM binary classifier
791
- warnings.filterwarnings(
792
- "ignore",
793
- message=(
794
- "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
795
- ),
788
+ shap_matrix = estimator.predict(
789
+ x,
790
+ predict_disable_shape_check=True,
791
+ raw_score=True,
792
+ pred_leaf=False,
793
+ pred_early_stop=True,
794
+ pred_contrib=True,
796
795
  )
797
- from shap import TreeExplainer
798
796
 
799
- if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
800
- return None
801
-
802
- explainer = TreeExplainer(estimator)
803
-
804
- shap_values = explainer.shap_values(x)
797
+ if self.target_type == ModelTaskType.MULTICLASS:
798
+ n_feat = x.shape[1]
799
+ shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
800
+ shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
805
801
 
806
- # For classification, shap_values is returned as a list for each class
807
- # Take values for the positive class
808
- if isinstance(shap_values, list):
809
- shap_values = shap_values[1]
802
+ # exclude base value
803
+ shap_matrix = shap_matrix[:, :-1]
810
804
 
811
- # Calculate mean absolute SHAP value for each feature
812
805
  feature_importance = {}
813
806
  for i, col in enumerate(x.columns):
814
- feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
807
+ feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
808
+
809
+ # # exclude last column (base value)
810
+ # shap_values_only = shap_values[:, :-1]
811
+ # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
812
+
813
+ # # For classification, shap_values is returned as a list for each class
814
+ # # Take values for the positive class
815
+ # if isinstance(shap_values, list):
816
+ # shap_values = shap_values[1]
817
+
818
+ # # Calculate mean absolute SHAP value for each feature
819
+ # feature_importance = {}
820
+ # for i, col in enumerate(x.columns):
821
+ # feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
815
822
 
816
823
  return feature_importance
817
824
 
@@ -1 +0,0 @@
1
- __version__ = "1.2.71a3832.dev8"