upgini 1.2.71a3832.dev12__py3-none-any.whl → 1.2.72__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3832.dev12"
1
+ __version__ = "1.2.72"
@@ -3250,8 +3250,7 @@ if response.status_code == 200:
3250
3250
  def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
3251
3251
  if len(eval_pair) != 2:
3252
3252
  raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
3253
- eval_X = eval_pair[0]
3254
- eval_y = eval_pair[1]
3253
+ eval_X, eval_y = eval_pair
3255
3254
 
3256
3255
  if _num_samples(eval_X) == 0:
3257
3256
  raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3872,15 +3871,23 @@ if response.status_code == 200:
3872
3871
 
3873
3872
  original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
3874
3873
 
3875
- if updated_shaps is not None:
3876
- for fm in features_meta:
3877
- fm.shap_value = updated_shaps.get(fm.name, 0.0)
3878
-
3879
- features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3880
3874
  for feature_meta in features_meta:
3881
3875
  if feature_meta.name in original_names_dict.keys():
3882
3876
  feature_meta.name = original_names_dict[feature_meta.name]
3883
3877
 
3878
+ if updated_shaps is not None:
3879
+ updating_shap = updated_shaps.get(feature_meta.name)
3880
+ if updating_shap is None:
3881
+ self.logger.warning(
3882
+ f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
3883
+ )
3884
+ updating_shap = 0.0
3885
+ feature_meta.shap_value = updating_shap
3886
+
3887
+ features_meta.sort(key=lambda m: (-m.shap_value, m.name))
3888
+
3889
+ for feature_meta in features_meta:
3890
+
3884
3891
  is_client_feature = feature_meta.name in df.columns
3885
3892
 
3886
3893
  # TODO make a decision about selected features based on special flag from mlb
@@ -3892,7 +3899,7 @@ if response.status_code == 200:
3892
3899
  # Use only important features
3893
3900
  if (
3894
3901
  # feature_meta.name in self.fit_generated_features or
3895
- feature_meta.name == COUNTRY
3902
+ feature_meta.name == COUNTRY # constant synthetic column
3896
3903
  # In select_features mode we select also from etalon features and need to show them
3897
3904
  or (not self.fit_select_features and is_client_feature)
3898
3905
  ):
upgini/metrics.py CHANGED
@@ -8,16 +8,18 @@ from copy import deepcopy
8
8
  from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
+ import lightgbm as lgb
11
12
  import numpy as np
12
13
  import pandas as pd
13
14
  from lightgbm import LGBMClassifier, LGBMRegressor
14
- import lightgbm as lgb
15
15
  from numpy import log1p
16
16
  from pandas.api.types import is_numeric_dtype
17
17
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
18
+ from sklearn.preprocessing import OrdinalEncoder
18
19
 
19
20
  from upgini.utils.features_validator import FeaturesValidator
20
21
  from upgini.utils.sklearn_ext import cross_validate
22
+ from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit
21
23
 
22
24
  try:
23
25
  from sklearn.metrics import get_scorer_names
@@ -29,7 +31,7 @@ except ImportError:
29
31
  available_scorers = SCORERS
30
32
  from sklearn.metrics import mean_squared_error
31
33
  from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
32
- from sklearn.model_selection import BaseCrossValidator
34
+ from sklearn.model_selection import BaseCrossValidator, TimeSeriesSplit
33
35
 
34
36
  from upgini.errors import ValidationError
35
37
  from upgini.metadata import ModelTaskType
@@ -83,22 +85,6 @@ CATBOOST_MULTICLASS_PARAMS = {
83
85
  "auto_class_weights": "Balanced",
84
86
  }
85
87
 
86
- LIGHTGBM_PARAMS = {
87
- "random_state": DEFAULT_RANDOM_STATE,
88
- # "num_leaves": 16,
89
- # "n_estimators": 150,
90
- # "min_child_weight": 1,
91
- "max_depth": 4,
92
- "max_cat_threshold": 80,
93
- "min_data_per_group": 25,
94
- "num_boost_round": 150,
95
- "cat_l2": 10,
96
- "cat_smooth": 12,
97
- "learning_rate": 0.05,
98
- "feature_fraction": 1.0,
99
- "min_sum_hessian_in_leaf": 0.01,
100
- }
101
-
102
88
  LIGHTGBM_REGRESSION_PARAMS = {
103
89
  "random_state": DEFAULT_RANDOM_STATE,
104
90
  "deterministic": True,
@@ -125,9 +111,9 @@ LIGHTGBM_MULTICLASS_PARAMS = {
125
111
  "max_cat_threshold": 80,
126
112
  "min_data_per_group": 20,
127
113
  "cat_smooth": 18,
128
- "cat_l2" : 8,
114
+ "cat_l2": 8,
129
115
  "objective": "multiclass",
130
- "class_weight": "balanced",
116
+ # "class_weight": "balanced",
131
117
  "use_quantized_grad": "true",
132
118
  "num_grad_quant_bins": "8",
133
119
  "stochastic_rounding": "true",
@@ -141,12 +127,12 @@ LIGHTGBM_BINARY_PARAMS = {
141
127
  "max_depth": 5,
142
128
  "learning_rate": 0.05,
143
129
  "objective": "binary",
144
- "class_weight": "balanced",
130
+ # "class_weight": "balanced",
145
131
  "deterministic": True,
146
132
  "max_cat_threshold": 80,
147
133
  "min_data_per_group": 20,
148
134
  "cat_smooth": 18,
149
- "cat_l2" : 8,
135
+ "cat_l2": 8,
150
136
  "verbosity": -1,
151
137
  }
152
138
 
@@ -505,7 +491,8 @@ class EstimatorWrapper:
505
491
  params = _get_add_params(params, add_params)
506
492
  estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
507
493
  elif target_type == ModelTaskType.REGRESSION:
508
- params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
494
+ if not isinstance(cv, TimeSeriesSplit) and not isinstance(cv, BlockedTimeSeriesSplit):
495
+ params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
509
496
  params = _get_add_params(params, add_params)
510
497
  estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
511
498
  else:
@@ -754,6 +741,7 @@ class LightGBMWrapper(EstimatorWrapper):
754
741
  logger=logger,
755
742
  )
756
743
  self.cat_features = None
744
+ self.cat_encoder = None
757
745
  self.n_classes = None
758
746
 
759
747
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
@@ -764,10 +752,13 @@ class LightGBMWrapper(EstimatorWrapper):
764
752
  params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
765
753
  self.cat_features = _get_cat_features(x)
766
754
  if self.cat_features:
767
- params["categorical_feature"] = self.cat_features
768
- x = fill_na_cat_features(x, self.cat_features)
769
- for feature in self.cat_features:
770
- x[feature] = x[feature].astype("category").cat.codes
755
+ x = fill_na_cat_features(x, self.cat_features)
756
+ encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
757
+ encoded = pd.DataFrame(
758
+ encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
759
+ )
760
+ x[self.cat_features] = encoded
761
+ self.cat_encoder = encoder
771
762
  if not is_numeric_dtype(y_numpy):
772
763
  y_numpy = correct_string_target(y_numpy)
773
764
 
@@ -777,8 +768,10 @@ class LightGBMWrapper(EstimatorWrapper):
777
768
  x, y_numpy, params = super()._prepare_to_calculate(x, y)
778
769
  if self.cat_features is not None:
779
770
  x = fill_na_cat_features(x, self.cat_features)
780
- for feature in self.cat_features:
781
- x[feature] = x[feature].astype("category").cat.codes
771
+ if self.cat_encoder is not None:
772
+ x[self.cat_features] = pd.DataFrame(
773
+ self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
774
+ )
782
775
  if not is_numeric_dtype(y):
783
776
  y_numpy = correct_string_target(y_numpy)
784
777
  return x, y_numpy, params
@@ -204,7 +204,7 @@ def balance_undersample(
204
204
  def balance_undersample_forced(
205
205
  df: pd.DataFrame,
206
206
  target_column: str,
207
- id_columns: List[str],
207
+ id_columns: Optional[List[str]],
208
208
  date_column: str,
209
209
  task_type: ModelTaskType,
210
210
  cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
287
287
 
288
288
  def balance_undersample_time_series_trunc(
289
289
  df: pd.DataFrame,
290
- id_columns: List[str],
290
+ id_columns: Optional[List[str]],
291
291
  date_column: str,
292
292
  sample_size: int,
293
293
  random_state: int = 42,
@@ -297,6 +297,8 @@ def balance_undersample_time_series_trunc(
297
297
  time_unit_threshold: pd.Timedelta = DEFAULT_TIME_UNIT_THRESHOLD,
298
298
  **kwargs,
299
299
  ):
300
+ if id_columns is None:
301
+ id_columns = []
300
302
  # Convert date column to datetime
301
303
  dates_df = df[id_columns + [date_column]].copy()
302
304
  dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3832.dev12
3
+ Version: 1.2.72
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -1,12 +1,12 @@
1
- upgini/__about__.py,sha256=okx02f-XOrtKlpdtJnV-aHreGFvFkxZ5NQhd5zxvhMk,34
1
+ upgini/__about__.py,sha256=-EK4ypqJTIRrg6g1P6PtLXT9vC4Vq7zblqFi389VgwA,23
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
4
  upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=DgWboHEhr5BQT87MaAo2iUtrhapP3iqczLeZtWLRkDs,206664
6
+ upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
7
7
  upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
8
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
9
- upgini/metrics.py,sha256=9AaQi7Yb22ZNnycUOAUpcP7TWF5Pfy_NGACcDj10aMs,38820
9
+ upgini/metrics.py,sha256=a0bY4oTMb-MgB1yC1IuTcEtotKZxAxjgV_QV2Z4V8u4,38988
10
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
11
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
12
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -66,11 +66,11 @@ upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml
66
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
67
67
  upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
68
68
  upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
69
- upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
69
+ upgini/utils/target_utils.py,sha256=P0cCVRaakWLydYwFjk3TEaQfr0p0hfsJCvKRD8qcxiE,16650
70
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
71
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
72
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
73
- upgini-1.2.71a3832.dev12.dist-info/METADATA,sha256=8jmuNEDPwjc-Wa6Bds0FjYqYgqf3LFMYyRGUDy5DME8,49102
74
- upgini-1.2.71a3832.dev12.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
- upgini-1.2.71a3832.dev12.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
- upgini-1.2.71a3832.dev12.dist-info/RECORD,,
73
+ upgini-1.2.72.dist-info/METADATA,sha256=OpaT2gblO8qGzEJBNf36-dPwbedHPP93bX0fPAOMl38,49091
74
+ upgini-1.2.72.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.72.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.72.dist-info/RECORD,,