upgini 1.2.71a3832.dev11__py3-none-any.whl → 1.2.71a3832.dev13__py3-none-any.whl

This diff shows the changes between two publicly available versions of a package, each of which has been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3832.dev11"
1
+ __version__ = "1.2.71a3832.dev13"
upgini/features_enricher.py CHANGED
@@ -12,6 +12,7 @@ import tempfile
12
12
  import time
13
13
  import uuid
14
14
  from collections import Counter
15
+ from copy import deepcopy
15
16
  from dataclasses import dataclass
16
17
  from threading import Thread
17
18
  from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -3812,6 +3813,7 @@ if response.status_code == 200:
3812
3813
  features_meta = self._search_task.get_all_features_metadata_v2()
3813
3814
  if features_meta is None:
3814
3815
  raise Exception(self.bundle.get("missing_features_meta"))
3816
+ features_meta = deepcopy(features_meta)
3815
3817
 
3816
3818
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
3817
3819
  df = df.rename(columns=original_names_dict)
@@ -3854,6 +3856,7 @@ if response.status_code == 200:
3854
3856
  features_meta = self._search_task.get_all_features_metadata_v2()
3855
3857
  if features_meta is None:
3856
3858
  raise Exception(self.bundle.get("missing_features_meta"))
3859
+ features_meta = deepcopy(features_meta)
3857
3860
 
3858
3861
  original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
3859
3862
  features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3869,15 +3872,23 @@ if response.status_code == 200:
3869
3872
 
3870
3873
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
3871
3874
 
3872
- if updated_shaps is not None:
3873
- for fm in features_meta:
3874
- fm.shap_value = updated_shaps.get(fm.name, 0.0)
3875
-
3876
- features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3877
3875
  for feature_meta in features_meta:
3878
3876
  if feature_meta.name in original_names_dict.keys():
3879
3877
  feature_meta.name = original_names_dict[feature_meta.name]
3880
3878
 
3879
+ if updated_shaps is not None:
3880
+ updating_shap = updated_shaps.get(feature_meta.name)
3881
+ if updating_shap is None:
3882
+ self.logger.warning(
3883
+ f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
3884
+ )
3885
+ updating_shap = 0.0
3886
+ feature_meta.shap_value = updating_shap
3887
+
3888
+ features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3889
+
3890
+ for feature_meta in features_meta:
3891
+
3881
3892
  is_client_feature = feature_meta.name in df.columns
3882
3893
 
3883
3894
  # TODO make a decision about selected features based on special flag from mlb
@@ -3889,7 +3900,7 @@ if response.status_code == 200:
3889
3900
  # Use only important features
3890
3901
  if (
3891
3902
  # feature_meta.name in self.fit_generated_features or
3892
- feature_meta.name == COUNTRY
3903
+ feature_meta.name == COUNTRY # constant synthetic column
3893
3904
  # In select_features mode we select also from etalon features and need to show them
3894
3905
  or (not self.fit_select_features and is_client_feature)
3895
3906
  ):
@@ -4039,11 +4050,11 @@ if response.status_code == 200:
4039
4050
  if len(filtered_importances) == 0:
4040
4051
  return []
4041
4052
 
4042
- filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
4043
-
4044
4053
  if importance_threshold is not None:
4045
4054
  filtered_importances = [
4046
- (name, importance) for name, importance in filtered_importances if importance > importance_threshold
4055
+ (name, importance)
4056
+ for name, importance in filtered_importances.items()
4057
+ if importance > importance_threshold
4047
4058
  ]
4048
4059
  if max_features is not None:
4049
4060
  filtered_importances = list(filtered_importances)[:max_features]
upgini/metrics.py CHANGED
@@ -14,7 +14,9 @@ from lightgbm import LGBMClassifier, LGBMRegressor
14
14
  import lightgbm as lgb
15
15
  from numpy import log1p
16
16
  from pandas.api.types import is_numeric_dtype
17
+ # from sklearn.calibration import LabelEncoder
17
18
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
19
+ from sklearn.preprocessing import OrdinalEncoder
18
20
 
19
21
  from upgini.utils.features_validator import FeaturesValidator
20
22
  from upgini.utils.sklearn_ext import cross_validate
@@ -754,6 +756,8 @@ class LightGBMWrapper(EstimatorWrapper):
754
756
  logger=logger,
755
757
  )
756
758
  self.cat_features = None
759
+ # self.cat_features_encoders = dict()
760
+ self.cat_encoder = None
757
761
  self.n_classes = None
758
762
 
759
763
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -765,9 +769,22 @@ class LightGBMWrapper(EstimatorWrapper):
765
769
  self.cat_features = _get_cat_features(x)
766
770
  if self.cat_features:
767
771
  params["categorical_feature"] = self.cat_features
768
- x = fill_na_cat_features(x, self.cat_features)
769
- for feature in self.cat_features:
770
- x[feature] = x[feature].astype("category").cat.codes
772
+ # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features] Works
773
+ # params["categorical_feature"] = "notauto"
774
+ # params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
775
+ # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features] Doesn't work
776
+ # params["categorical_feature"] = ",".join(cat_indices)
777
+ pass
778
+ x = fill_na_cat_features(x, self.cat_features)
779
+ encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
780
+ encoded = encoder.fit_transform(x[self.cat_features], y_numpy)
781
+ x[self.cat_features] = encoded
782
+ self.cat_encoder = encoder
783
+ # for feature in self.cat_features:
784
+ # encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
785
+ # x[feature] = encoder.fit_transform(x[feature])
786
+ # self.cat_features_encoders[feature] = encoder
787
+ # x[feature] = x[feature].astype("category").cat.codes
771
788
  if not is_numeric_dtype(y_numpy):
772
789
  y_numpy = correct_string_target(y_numpy)
773
790
 
@@ -776,9 +793,19 @@ class LightGBMWrapper(EstimatorWrapper):
776
793
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
777
794
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
778
795
  if self.cat_features is not None:
796
+ params["categorical_feature"] = self.cat_features
797
+ # params["categorical_feature"] = [x.columns.get_loc(c) for c in self.cat_features]
798
+ # params["categorical_feature"] = "notauto"
799
+ # params["categorical_feature"] = "name:" + ",".join(self.cat_features) # Doesn't work
800
+ # cat_indices = [str(x.columns.get_loc(c)) for c in self.cat_features]
801
+ # params["categorical_feature"] = ",".join(cat_indices)
779
802
  x = fill_na_cat_features(x, self.cat_features)
780
- for feature in self.cat_features:
781
- x[feature] = x[feature].astype("category").cat.codes
803
+ if self.cat_encoder is not None:
804
+ x[self.cat_features] = self.cat_encoder.transform(x[self.cat_features])
805
+ # for feature in self.cat_features:
806
+ # encoder = self.cat_features_encoders[feature]
807
+ # x[feature] = encoder.transform(x[feature])
808
+ # x[feature] = x[feature].astype("category").cat.codes
782
809
  if not is_numeric_dtype(y):
783
810
  y_numpy = correct_string_target(y_numpy)
784
811
  return x, y_numpy, params
upgini/utils/target_utils.py CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
204
204
  def balance_undersample_forced(
205
205
  df: pd.DataFrame,
206
206
  target_column: str,
207
- id_columns: List[str],
207
+ id_columns: Optional[List[str]],
208
208
  date_column: str,
209
209
  task_type: ModelTaskType,
210
210
  cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
287
287
 
288
288
  def balance_undersample_time_series_trunc(
289
289
  df: pd.DataFrame,
290
- id_columns: List[str],
290
+ id_columns: Optional[List[str]],
291
291
  date_column: str,
292
292
  sample_size: int,
293
293
  random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
298
298
  **kwargs,
299
299
  ):
300
300
  # Convert date column to datetime
301
+ if id_columns is None:
302
+ id_columns = [date_column]
301
303
  dates_df = df[id_columns + [date_column]].copy()
302
304
  dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
303
305
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev11
3
+ Version: 1.2.71a3832.dev13
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=MPYFg9v0SOhqTxe0IfYh4m6Nh3TlmyfHR9sua58WXBM,34
1
+ upgini/__about__.py,sha256=buorll9F2OX4EgV8VmlIrj09nqmsSmqAG8T8p6hRCls,34
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=oYOBaHIyPjm-EEZvJT9pU35_DW8bArEQKymZyhW8LbE,206592
6
+ upgini/features_enricher.py,sha256=lk80Bx9U36lva6T4lPHBFk88ivrpZ-2uwwMwQg0LglE,207023
7
7
  upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
9
+ upgini/metrics.py,sha256=ot6AhxfRRTzM-dNApWTvmteLBAmGjD9OyAuKmtUTprE,40630
10
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
- upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
69
+ upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.71a3832.dev11.dist-info/METADATA,sha256=QuI4m49RjcWmDJ74fXMWfNqBKPXGKDsKGhhO_wR1Kfw,49102
74
- upgini-1.2.71a3832.dev11.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.71a3832.dev11.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.71a3832.dev11.dist-info/RECORD,,
73
+ upgini-1.2.71a3832.dev13.dist-info/METADATA,sha256=JdRugxJAMW4KLyRuz7yIX_PqSz_nObynmhkW5-g_lVs,49102
74
+ upgini-1.2.71a3832.dev13.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.71a3832.dev13.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.71a3832.dev13.dist-info/RECORD,,