upgini 1.2.71a3810.dev3__py3-none-any.whl → 1.2.71a3832.dev4__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

upgini/__about__.py CHANGED
@@ -1 +1 @@
1
- __version__ = "1.2.71a3810.dev3"
1
+ __version__ = "1.2.71a3832.dev4"
@@ -1,5 +1,5 @@
1
1
  import abc
2
- from typing import Dict, List, Optional
2
+ from typing import Dict, List, Optional, Tuple
3
3
 
4
4
  import pandas as pd
5
5
  from upgini.autofe.operator import PandasOperator
@@ -64,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
64
64
  return base_formula
65
65
 
66
66
  @classmethod
67
- def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> tuple[Optional[dict], Optional[str]]:
67
+ def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
68
68
  """
69
69
  Parse the offset component from a formula.
70
70
 
@@ -93,7 +93,7 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
93
93
 
94
94
  return cls(**params)
95
95
 
96
- def get_params(self) -> Dict[str, str | None]:
96
+ def get_params(self) -> Dict[str, Optional[str]]:
97
97
  res = super().get_params()
98
98
  res.update(
99
99
  {
upgini/autofe/unary.py CHANGED
@@ -1,10 +1,8 @@
1
- import json
2
- from typing import Dict, List, Optional
1
+ from typing import Dict, Optional
3
2
  import numpy as np
4
3
  import pandas as pd
5
4
 
6
5
  from upgini.autofe.operator import PandasOperator, VectorizableMixin
7
- from upgini.autofe.utils import pydantic_validator
8
6
 
9
7
 
10
8
  class Abs(PandasOperator, VectorizableMixin):
@@ -155,38 +153,3 @@ class Embeddings(PandasOperator):
155
153
  is_unary: bool = True
156
154
  input_type: Optional[str] = "string"
157
155
  output_type: Optional[str] = "vector"
158
-
159
-
160
- class Bin(PandasOperator):
161
- name: str = "bin"
162
- is_unary: bool = True
163
- output_type: Optional[str] = "string"
164
- bin_bounds: List[int] = []
165
- is_categorical: bool = True
166
-
167
- def calculate_unary(self, data: pd.Series) -> pd.Series:
168
- return data.apply(self._bin, bounds=self.bin_bounds).fillna(-1).astype(int).astype(str)
169
-
170
- def _bin(self, f, bounds):
171
- if f is None or np.isnan(f):
172
- return np.nan
173
- hit = np.where(f >= np.array(bounds))[0]
174
- if hit.size > 0:
175
- return np.max(hit) + 1
176
- else:
177
- return np.nan
178
-
179
- def get_params(self) -> Dict[str, Optional[str]]:
180
- res = super().get_params()
181
- res.update(
182
- {
183
- "bin_bounds": json.dumps(self.bin_bounds),
184
- }
185
- )
186
- return res
187
-
188
- @pydantic_validator("bin_bounds", mode="before")
189
- def parse_bin_bounds(cls, value):
190
- if isinstance(value, str):
191
- return json.loads(value)
192
- return value
upgini/dataset.py CHANGED
@@ -389,7 +389,7 @@ class Dataset: # (pd.DataFrame):
389
389
  for col in columns_to_validate:
390
390
  self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
391
391
  if validate_target and target is not None and col == target:
392
- self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
392
+ self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
393
393
 
394
394
  if col in mandatory_columns:
395
395
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
@@ -841,7 +841,7 @@ class FeaturesEnricher(TransformerMixin):
841
841
  max_features: Optional[int] = None,
842
842
  remove_outliers_calc_metrics: Optional[bool] = None,
843
843
  trace_id: Optional[str] = None,
844
- silent: bool = False,
844
+ internal_call: bool = False,
845
845
  progress_bar: Optional[ProgressBar] = None,
846
846
  progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
847
847
  **kwargs,
@@ -1095,7 +1095,7 @@ class FeaturesEnricher(TransformerMixin):
1095
1095
  enriched_shaps = enriched_cv_result.shap_values
1096
1096
 
1097
1097
  if enriched_shaps is not None:
1098
- self._update_shap_values(trace_id, fitting_X, enriched_shaps)
1098
+ self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
1099
1099
 
1100
1100
  if enriched_metric is None:
1101
1101
  self.logger.warning(
@@ -1256,14 +1256,14 @@ class FeaturesEnricher(TransformerMixin):
1256
1256
  if self.raise_validation_error:
1257
1257
  raise e
1258
1258
  else:
1259
- if not silent:
1259
+ if not internal_call:
1260
1260
  self._dump_python_libs()
1261
1261
  self.__display_support_link()
1262
1262
  raise e
1263
1263
  finally:
1264
1264
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1265
1265
 
1266
- def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
1266
+ def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
1267
1267
  renaming = self.fit_columns_renaming or {}
1268
1268
  new_shaps = {
1269
1269
  renaming.get(feature, feature): _round_shap_value(shap)
@@ -1272,7 +1272,7 @@ class FeaturesEnricher(TransformerMixin):
1272
1272
  }
1273
1273
  self.__prepare_feature_importances(trace_id, df, new_shaps)
1274
1274
 
1275
- if self.features_info_display_handle is not None:
1275
+ if not silent and self.features_info_display_handle is not None:
1276
1276
  try:
1277
1277
  _ = get_ipython() # type: ignore
1278
1278
 
@@ -1284,7 +1284,7 @@ class FeaturesEnricher(TransformerMixin):
1284
1284
  )
1285
1285
  except (ImportError, NameError):
1286
1286
  pass
1287
- if self.data_sources_display_handle is not None:
1287
+ if not silent and self.data_sources_display_handle is not None:
1288
1288
  try:
1289
1289
  _ = get_ipython() # type: ignore
1290
1290
 
@@ -1296,7 +1296,7 @@ class FeaturesEnricher(TransformerMixin):
1296
1296
  )
1297
1297
  except (ImportError, NameError):
1298
1298
  pass
1299
- if self.autofe_features_display_handle is not None:
1299
+ if not silent and self.autofe_features_display_handle is not None:
1300
1300
  try:
1301
1301
  _ = get_ipython() # type: ignore
1302
1302
  autofe_descriptions_df = self.get_autofe_features_description()
@@ -1309,7 +1309,7 @@ class FeaturesEnricher(TransformerMixin):
1309
1309
  )
1310
1310
  except (ImportError, NameError):
1311
1311
  pass
1312
- if self.report_button_handle is not None:
1312
+ if not silent and self.report_button_handle is not None:
1313
1313
  try:
1314
1314
  _ = get_ipython() # type: ignore
1315
1315
 
@@ -4084,7 +4084,10 @@ if response.status_code == 200:
4084
4084
  )
4085
4085
 
4086
4086
  if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
4087
- msg = self.bundle.get("unregistered_only_personal_keys")
4087
+ if self.__is_registered:
4088
+ msg = self.bundle.get("only_custom_keys")
4089
+ else:
4090
+ msg = self.bundle.get("unregistered_only_personal_keys")
4088
4091
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
4089
4092
  raise ValidationError(msg)
4090
4093
 
@@ -4135,7 +4138,7 @@ if response.status_code == 200:
4135
4138
  max_features=max_features,
4136
4139
  remove_outliers_calc_metrics=remove_outliers_calc_metrics,
4137
4140
  trace_id=trace_id,
4138
- silent=True,
4141
+ internal_call=True,
4139
4142
  progress_bar=progress_bar,
4140
4143
  progress_callback=progress_callback,
4141
4144
  )
upgini/http.py CHANGED
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
16
16
  from urllib.parse import urljoin
17
17
 
18
18
  import jwt
19
+
19
20
  # import pandas as pd
20
21
  import requests
21
22
  from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
342
343
  else:
343
344
  return self._syncronized_refresh_access_token()
344
345
 
345
- def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
346
+ def _with_unauth_retry(
347
+ self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
348
+ ):
346
349
  try:
347
350
  return request()
348
351
  except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
373
376
  elif "more than one concurrent search request" in e.message.lower():
374
377
  raise ValidationError(bundle.get("concurrent_request"))
375
378
  else:
376
- print(e)
377
- show_status_error()
379
+ if not silent:
380
+ print(e)
381
+ show_status_error()
378
382
  raise e
379
383
 
380
384
  @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
706
710
  silent=True,
707
711
  ),
708
712
  need_connection_retry=False,
713
+ silent=True,
709
714
  )
710
715
  except Exception:
711
716
  self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
716
721
  try:
717
722
  requests.post(
718
723
  url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
719
- json=log_event.dict(exclude_none=True),
724
+ json=log_event.model_dump(exclude_none=True),
720
725
  headers=_RestClient._get_base_headers(content_type="application/json"),
721
726
  )
722
727
  except Exception:
upgini/metrics.py CHANGED
@@ -1,17 +1,18 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
3
  import inspect
5
4
  import logging
6
5
  import re
6
+ import warnings
7
7
  from collections import defaultdict
8
8
  from copy import deepcopy
9
+ from dataclasses import dataclass
9
10
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
11
 
11
- import catboost
12
12
  import numpy as np
13
13
  import pandas as pd
14
- from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
14
+ from lightgbm import LGBMClassifier, LGBMRegressor
15
+ import lightgbm as lgb
15
16
  from numpy import log1p
16
17
  from pandas.api.types import is_numeric_dtype
17
18
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -27,11 +28,8 @@ except ImportError:
27
28
  from sklearn.metrics._scorer import SCORERS
28
29
 
29
30
  available_scorers = SCORERS
30
- from sklearn.metrics._regression import (
31
- _check_reg_targets,
32
- check_consistent_length,
33
- )
34
31
  from sklearn.metrics import mean_squared_error
32
+ from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
35
33
  from sklearn.model_selection import BaseCrossValidator
36
34
 
37
35
  from upgini.errors import ValidationError
@@ -88,11 +86,73 @@ CATBOOST_MULTICLASS_PARAMS = {
88
86
 
89
87
  LIGHTGBM_PARAMS = {
90
88
  "random_state": DEFAULT_RANDOM_STATE,
91
- "num_leaves": 16,
89
+ # "num_leaves": 16,
90
+ # "n_estimators": 150,
91
+ # "min_child_weight": 1,
92
92
  "max_depth": 4,
93
- "n_estimators": 150,
93
+ "max_cat_threshold": 80,
94
+ "min_data_per_group": 25,
95
+ "num_boost_round": 150,
96
+ "cat_l2": 10,
97
+ "cat_smooth": 12,
98
+ "learning_rate": 0.05,
99
+ "feature_fraction": 1.0,
100
+ "min_sum_hessian_in_leaf": 0.01,
101
+ }
102
+
103
+ LIGHTGBM_REGRESSION_PARAMS = {
104
+ "random_state": DEFAULT_RANDOM_STATE,
105
+ "deterministic": True,
106
+ "min_gain_to_split": 0.001,
107
+ "n_estimators": 275,
108
+ "max_depth": 5,
109
+ "max_cat_threshold": 80,
110
+ "min_data_per_group": 25,
111
+ "cat_l2": 10,
112
+ "cat_smooth": 12,
94
113
  "learning_rate": 0.05,
95
- "min_child_weight": 1,
114
+ "feature_fraction": 1.0,
115
+ "min_sum_hessian_in_leaf": 0.01,
116
+ "objective": "huber",
117
+ "verbosity": -1,
118
+ }
119
+
120
+ LIGHTGBM_MULTICLASS_PARAMS = {
121
+ "random_state": DEFAULT_RANDOM_STATE,
122
+ "deterministic": True,
123
+ "min_gain_to_split": 0.001,
124
+ "n_estimators": 275,
125
+ "max_depth": 3,
126
+ "max_cat_threshold": 80,
127
+ "min_data_per_group": 25,
128
+ "cat_l2": 10,
129
+ "cat_smooth": 12,
130
+ "learning_rate": 0.25, # CatBoost 0.25
131
+ "min_sum_hessian_in_leaf": 0.01,
132
+ "class_weight": "balanced", # TODO pass dict with weights for each class
133
+ "objective": "multiclass",
134
+ "use_quantized_grad": "true",
135
+ "num_grad_quant_bins": "8",
136
+ "stochastic_rounding": "true",
137
+ "verbosity": -1,
138
+ }
139
+
140
+ LIGHTGBM_BINARY_PARAMS = {
141
+ "random_state": DEFAULT_RANDOM_STATE,
142
+ "deterministic": True,
143
+ "min_gain_to_split": 0.001,
144
+ "n_estimators": 275,
145
+ "max_depth": 5,
146
+ "max_cat_threshold": 80,
147
+ "min_data_per_group": 25,
148
+ "cat_l2": 10,
149
+ "cat_smooth": 12,
150
+ "learning_rate": 0.05,
151
+ "feature_fraction": 1.0,
152
+ "min_sum_hessian_in_leaf": 0.01,
153
+ "objective": "binary",
154
+ "class_weight": "balanced", # TODO pass dict with weights for each class
155
+ "verbosity": -1,
96
156
  }
97
157
 
98
158
  N_FOLDS = 5
@@ -211,6 +271,15 @@ SUPPORTED_CATBOOST_METRICS = {
211
271
  }
212
272
 
213
273
 
274
+ def is_catboost_estimator(estimator):
275
+ try:
276
+ from catboost import CatBoostClassifier, CatBoostRegressor
277
+
278
+ return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
279
+ except ImportError:
280
+ return False
281
+
282
+
214
283
  @dataclass
215
284
  class _CrossValResults:
216
285
  metric: Optional[float]
@@ -292,7 +361,7 @@ class EstimatorWrapper:
292
361
  self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
293
362
  return x, y, groups
294
363
 
295
- def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
364
+ def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
296
365
  joined = pd.concat([x, y], axis=1)
297
366
  joined = joined[joined[y.name].notna()]
298
367
  joined = joined.reset_index(drop=True)
@@ -346,12 +415,15 @@ class EstimatorWrapper:
346
415
  for estimator, split in zip(self.cv_estimators, splits):
347
416
  _, validation_idx = split
348
417
  cv_x = x.iloc[validation_idx]
349
- cv_y = y[validation_idx]
418
+ if isinstance(y, pd.Series):
419
+ cv_y = y.iloc[validation_idx]
420
+ else:
421
+ cv_y = y[validation_idx]
350
422
  shaps = self.calculate_shap(cv_x, cv_y, estimator)
351
423
  if shaps is not None:
352
424
  for feature, shap_value in shaps.items():
353
425
  # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
354
- shap_values_all_folds[feature].extend(shap_value.tolist())
426
+ shap_values_all_folds[feature].append(shap_value)
355
427
 
356
428
  if shap_values_all_folds:
357
429
  average_shap_values = {
@@ -427,21 +499,18 @@ class EstimatorWrapper:
427
499
  }
428
500
  if estimator is None:
429
501
  params = {}
430
- params["has_time"] = has_date
431
- # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
432
- # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
433
502
  if target_type == ModelTaskType.MULTICLASS:
434
- params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
503
+ params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
435
504
  params = _get_add_params(params, add_params)
436
- estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
505
+ estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
437
506
  elif target_type == ModelTaskType.BINARY:
438
- params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
507
+ params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
439
508
  params = _get_add_params(params, add_params)
440
- estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
509
+ estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
441
510
  elif target_type == ModelTaskType.REGRESSION:
442
- params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
511
+ params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
443
512
  params = _get_add_params(params, add_params)
444
- estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
513
+ estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
445
514
  else:
446
515
  raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
447
516
  else:
@@ -450,31 +519,21 @@ class EstimatorWrapper:
450
519
  else:
451
520
  estimator_copy = deepcopy(estimator)
452
521
  kwargs["estimator"] = estimator_copy
453
- if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
522
+ if is_catboost_estimator(estimator):
454
523
  if cat_features is not None:
455
524
  for cat_feature in cat_features:
456
525
  if cat_feature not in x.columns:
457
526
  logger.error(
458
527
  f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
459
528
  )
460
- estimator_copy.set_params(
461
- # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
462
- cat_features=cat_features
463
- )
529
+ estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
464
530
  estimator = CatBoostWrapper(**kwargs)
465
531
  else:
466
- try:
467
- from lightgbm import LGBMClassifier, LGBMRegressor
468
-
469
- if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
470
- estimator = LightGBMWrapper(**kwargs)
471
- else:
472
- logger.warning(
473
- f"Unexpected estimator is used for metrics: {estimator}. "
474
- "Default strategy for category features will be used"
475
- )
476
- estimator = OtherEstimatorWrapper(**kwargs)
477
- except ModuleNotFoundError:
532
+ if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
533
+ estimator = LightGBMWrapper(**kwargs)
534
+ elif is_catboost_estimator(estimator):
535
+ estimator = CatBoostWrapper(**kwargs)
536
+ else:
478
537
  logger.warning(
479
538
  f"Unexpected estimator is used for metrics: {estimator}. "
480
539
  "Default strategy for category features will be used"
@@ -487,7 +546,7 @@ class EstimatorWrapper:
487
546
  class CatBoostWrapper(EstimatorWrapper):
488
547
  def __init__(
489
548
  self,
490
- estimator: Union[CatBoostClassifier, CatBoostRegressor],
549
+ estimator,
491
550
  scorer: Callable,
492
551
  metric_name: str,
493
552
  multiplier: int,
@@ -517,6 +576,9 @@ class CatBoostWrapper(EstimatorWrapper):
517
576
  x, y, groups, params = super()._prepare_to_fit(x, y)
518
577
 
519
578
  # Find embeddings
579
+ import catboost
580
+ from catboost import CatBoostClassifier
581
+
520
582
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
521
583
  emb_pattern = r"(.+)_emb\d+"
522
584
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +699,10 @@ class CatBoostWrapper(EstimatorWrapper):
637
699
  else:
638
700
  raise e
639
701
 
640
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
702
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
641
703
  try:
704
+ from catboost import Pool
705
+
642
706
  # Create Pool for fold data, if need (for example, when categorical features are present)
643
707
  fold_pool = Pool(
644
708
  x,
@@ -695,25 +759,60 @@ class LightGBMWrapper(EstimatorWrapper):
695
759
  self.cat_features = None
696
760
 
697
761
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
698
- x, y, groups, params = super()._prepare_to_fit(x, y)
762
+ x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
763
+ params["callbacks"] = [lgb.early_stopping(stopping_rounds=20)]
699
764
  self.cat_features = _get_cat_features(x)
700
765
  x = fill_na_cat_features(x, self.cat_features)
701
766
  for feature in self.cat_features:
702
767
  x[feature] = x[feature].astype("category").cat.codes
703
- if not is_numeric_dtype(y):
704
- y = correct_string_target(y)
768
+ if not is_numeric_dtype(y_numpy):
769
+ y_numpy = correct_string_target(y_numpy)
705
770
 
706
- return x, y, groups, params
771
+ return x, y_numpy, groups, params
707
772
 
708
773
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
709
- x, y, params = super()._prepare_to_calculate(x, y)
774
+ x, y_numpy, params = super()._prepare_to_calculate(x, y)
710
775
  if self.cat_features is not None:
711
776
  x = fill_na_cat_features(x, self.cat_features)
712
777
  for feature in self.cat_features:
713
778
  x[feature] = x[feature].astype("category").cat.codes
714
779
  if not is_numeric_dtype(y):
715
- y = correct_string_target(y)
716
- return x, y, params
780
+ y_numpy = correct_string_target(y_numpy)
781
+ return x, y_numpy, params
782
+
783
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
784
+ try:
785
+ # Suppress specific warning from SHAP for LightGBM binary classifier
786
+ warnings.filterwarnings(
787
+ "ignore",
788
+ message=(
789
+ "LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
790
+ ),
791
+ )
792
+ from shap import TreeExplainer
793
+
794
+ if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
795
+ return None
796
+
797
+ explainer = TreeExplainer(estimator)
798
+
799
+ shap_values = explainer.shap_values(x)
800
+
801
+ # For classification, shap_values is returned as a list for each class
802
+ # Take values for the positive class
803
+ if isinstance(shap_values, list):
804
+ shap_values = shap_values[1]
805
+
806
+ # Calculate mean absolute SHAP value for each feature
807
+ feature_importance = {}
808
+ for i, col in enumerate(x.columns):
809
+ feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
810
+
811
+ return feature_importance
812
+
813
+ except Exception as e:
814
+ self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
815
+ return None
717
816
 
718
817
 
719
818
  class OtherEstimatorWrapper(EstimatorWrapper):
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
80
80
  postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
81
81
  multiple_search_key=Search key {} passed multiple times
82
82
  unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
83
+ only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
83
84
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
84
85
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
85
86
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
74
74
  # Checking for different dates by the same personal keys
75
75
  uniques = grouped_by_personal_cols[date_col].nunique()
76
76
  total = len(uniques)
77
+ if total == 0:
78
+ return segment_df, None
77
79
  diff_dates = len(uniques[uniques > 1])
78
80
  if diff_dates / total >= 0.6:
79
81
  return segment_df, None
@@ -90,7 +90,8 @@ class FeatureInfo:
90
90
  def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
91
91
  if data is not None and len(data) > 0 and feature_meta.name in data.columns:
92
92
  if len(data) > 3:
93
- feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
93
+ rand = np.random.RandomState(42)
94
+ feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
94
95
  else:
95
96
  feature_sample = data[feature_meta.name].dropna().unique().tolist()
96
97
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
@@ -9,7 +9,6 @@ from traceback import format_exc
9
9
 
10
10
  import numpy as np
11
11
  import scipy.sparse as sp
12
- from catboost import CatBoostClassifier, CatBoostRegressor
13
12
  from joblib import Parallel, logger
14
13
  from scipy.sparse import issparse
15
14
  from sklearn import config_context, get_config
@@ -342,6 +341,22 @@ def cross_validate(
342
341
  raise e
343
342
 
344
343
 
344
+ def is_catboost_estimator(estimator):
345
+ try:
346
+ from catboost import CatBoostClassifier, CatBoostRegressor
347
+ return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
348
+ except ImportError:
349
+ return False
350
+
351
+
352
+ def is_lightgbm_estimator(estimator):
353
+ try:
354
+ from lightgbm import LGBMClassifier, LGBMRegressor
355
+ return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
356
+ except ImportError:
357
+ return False
358
+
359
+
345
360
  def _fit_and_score(
346
361
  estimator,
347
362
  X,
@@ -497,7 +512,10 @@ def _fit_and_score(
497
512
  if y_train is None:
498
513
  estimator.fit(X_train, **fit_params)
499
514
  else:
500
- if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
515
+ if is_catboost_estimator(estimator):
516
+ fit_params = fit_params.copy()
517
+ fit_params["eval_set"] = [(X_test, y_test)]
518
+ elif is_lightgbm_estimator(estimator):
501
519
  fit_params = fit_params.copy()
502
520
  fit_params["eval_set"] = [(X_test, y_test)]
503
521
  estimator.fit(X_train, y_train, **fit_params)
upgini/utils/sort.py CHANGED
@@ -87,7 +87,7 @@ def get_sort_columns_dict(
87
87
  df_with_target = df_with_target.loc[~target.isna()]
88
88
  df = df_with_target.iloc[:, :-1]
89
89
  target = df_with_target.iloc[:, -1]
90
- df = df.fillna(df.mean())
90
+ df = df.fillna(df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean()))
91
91
  omit_nan = False
92
92
  hashes = [hash_series(df[col]) for col in columns_for_sort]
93
93
  df = np.asarray(df, dtype=np.float32)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.71a3810.dev3
3
+ Version: 1.2.71a3832.dev4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -22,14 +22,14 @@ Classifier: Programming Language :: Python :: 3.11
22
22
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
23
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
24
24
  Requires-Python: <3.12,>=3.10
25
- Requires-Dist: catboost>=1.0.3
26
25
  Requires-Dist: fastparquet>=0.8.1
27
26
  Requires-Dist: ipywidgets>=8.1.0
28
27
  Requires-Dist: jarowinkler>=2.0.0
29
28
  Requires-Dist: levenshtein>=0.25.1
30
- Requires-Dist: numpy<=1.26.4,>=1.19.0
29
+ Requires-Dist: lightgbm>=4.6.0
30
+ Requires-Dist: numpy<3.0.0,>=1.19.0
31
31
  Requires-Dist: pandas<3.0.0,>=1.1.0
32
- Requires-Dist: psutil>=6.0.0
32
+ Requires-Dist: psutil>=5.9.0
33
33
  Requires-Dist: pydantic<3.0.0,>1.0.0
34
34
  Requires-Dist: pyjwt>=2.8.0
35
35
  Requires-Dist: python-bidi==0.4.2
@@ -38,6 +38,7 @@ Requires-Dist: python-json-logger>=3.3.0
38
38
  Requires-Dist: requests>=2.8.0
39
39
  Requires-Dist: scikit-learn>=1.3.0
40
40
  Requires-Dist: scipy>=1.10.0
41
+ Requires-Dist: shap>=0.44.0
41
42
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
42
43
  Description-Content-Type: text/markdown
43
44
 
@@ -1,13 +1,12 @@
1
- upgini/__about__.py,sha256=HJUMYeAgyipX6d-hLqm0G7l9lH2D5uJGT9KFNz20JM0,33
1
+ upgini/__about__.py,sha256=xZJ4YiYa1ZXgmCQ3SYjASYcXSx3CrMdke97pR0PB16E,33
2
2
  upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
3
3
  upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
4
- upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
4
+ upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
5
5
  upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
6
- upgini/features_enricher.py,sha256=KqDQ29sU1Aty5Z40DDqO869Y_CClQfmU58nE9rScxRc,204434
7
- upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
8
- upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
6
+ upgini/features_enricher.py,sha256=Z6RSjqcqneGwWflsq1Q5rjf83awPNYqKpAgHRh7jils,204680
7
+ upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
9
8
  upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
10
- upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
9
+ upgini/metrics.py,sha256=LI0wwTUSnxX62lVSM7J8Pq_RSbruq93QUhbMXilWM30,38301
11
10
  upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
12
11
  upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
13
12
  upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -20,12 +19,12 @@ upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
20
19
  upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
21
20
  upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
22
21
  upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
23
- upgini/autofe/unary.py,sha256=3lvwtWrgIHziypwUTetrUv1iCqwDhabbKH4OySkQDhs,5722
22
+ upgini/autofe/unary.py,sha256=yVgPvtfnPSOhrii0YgezddmgWPwyOBCR0JutaIkdTTc,4658
24
23
  upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
25
24
  upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
26
25
  upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
27
- upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
28
- upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
26
+ upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
27
+ upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
29
28
  upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
30
29
  upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
31
30
  upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
39
38
  upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
40
39
  upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
41
40
  upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
42
- upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
41
+ upgini/resource_bundle/strings.properties,sha256=mwQrerdJj3adzT-fHqvs6Qjf-rqDccsUzELDIXJKAmY,27791
43
42
  upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
44
43
  upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
45
44
  upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
53
52
  upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
54
53
  upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
55
54
  upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
56
- upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
55
+ upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
57
56
  upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
58
57
  upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
59
58
  upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
60
- upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
59
+ upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
61
60
  upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
62
61
  upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
63
62
  upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
65
64
  upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
66
65
  upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
67
66
  upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
68
- upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
69
- upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
67
+ upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
68
+ upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
70
69
  upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
71
70
  upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
72
71
  upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
73
72
  upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
74
- upgini-1.2.71a3810.dev3.dist-info/METADATA,sha256=tZWeZpg4Bh8rhogD7KDK-Sq7oFXDHzH0ljKi3Q1Z6AQ,49075
75
- upgini-1.2.71a3810.dev3.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
76
- upgini-1.2.71a3810.dev3.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
77
- upgini-1.2.71a3810.dev3.dist-info/RECORD,,
73
+ upgini-1.2.71a3832.dev4.dist-info/METADATA,sha256=XWxCzwoYpOeebCAtVb_H4-x-9VeHLDwYc7DkputGaAc,49101
74
+ upgini-1.2.71a3832.dev4.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
75
+ upgini-1.2.71a3832.dev4.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
76
+ upgini-1.2.71a3832.dev4.dist-info/RECORD,,
@@ -1,4 +1,4 @@
1
1
  Wheel-Version: 1.0
2
- Generator: hatchling 1.24.2
2
+ Generator: hatchling 1.25.0
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
upgini/lazy_import.py DELETED
@@ -1,35 +0,0 @@
1
- import importlib
2
- import importlib.util
3
- import importlib.machinery
4
-
5
-
6
- class LazyImport:
7
- def __init__(self, module_name, class_name):
8
- self.module_name = module_name
9
- self.class_name = class_name
10
- self._module = None
11
- self._class = None
12
-
13
- def _load(self):
14
- if self._module is None:
15
- # Load module and save link to it
16
- spec = importlib.util.find_spec(self.module_name)
17
- if spec is None:
18
- raise ImportError(f"Module {self.module_name} not found")
19
-
20
- # Create module
21
- self._module = importlib.util.module_from_spec(spec)
22
-
23
- # Execute module
24
- spec.loader.exec_module(self._module)
25
-
26
- # Get class from module
27
- self._class = getattr(self._module, self.class_name)
28
-
29
- def __call__(self, *args, **kwargs):
30
- self._load()
31
- return self._class(*args, **kwargs)
32
-
33
- def __getattr__(self, name):
34
- self._load()
35
- return getattr(self._class, name)