upgini 1.2.71a3810.dev5__py3-none-any.whl → 1.2.71a3810.dev7__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

upgini/__about__.py CHANGED
@@ -1 +1 @@
-__version__ = "1.2.71a3810.dev5"
+__version__ = "1.2.71a3810.dev7"
upgini/autofe/feature.py CHANGED
@@ -170,7 +170,7 @@ class Feature:
             components.extend(
                 [self.children[0].get_display_name(cache=cache, shorten=shorten, **kwargs), self.get_op_display_name()]
             )
-        elif shorten:
+        elif shorten and not self.op.is_unary:
             components.extend(["f_autofe", self.get_op_display_name()])
         else:
             components = ["f_" + "_f_".join(self.get_columns(**kwargs))] + ["autofe", self.get_op_display_name()]
upgini/autofe/timeseries/base.py CHANGED
@@ -1,5 +1,5 @@
 import abc
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Tuple
 
 import pandas as pd
 from upgini.autofe.operator import PandasOperator
@@ -64,7 +64,7 @@ class TimeSeriesBase(PandasOperator, abc.ABC):
         return base_formula
 
     @classmethod
-    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> tuple[Optional[dict], Optional[str]]:
+    def _parse_offset_from_formula(cls, formula: str, base_regex: str) -> Tuple[Optional[Dict], Optional[str]]:
         """
         Parse the offset component from a formula.
 
upgini/autofe/timeseries/cross.py CHANGED
@@ -93,7 +93,7 @@ class CrossSeriesInteraction(TimeSeriesBase, ParametrizedOperator):
 
         return cls(**params)
 
-    def get_params(self) -> Dict[str, str | None]:
+    def get_params(self) -> Dict[str, Optional[str]]:
         res = super().get_params()
         res.update(
             {
upgini/dataset.py CHANGED
@@ -389,7 +389,7 @@ class Dataset: # (pd.DataFrame):
         for col in columns_to_validate:
             self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
             if validate_target and target is not None and col == target:
-                self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
+                self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
 
             if col in mandatory_columns:
                 self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
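Note: np.Inf was a legacy alias that NumPy 2.0 removed; np.inf is the spelling that works on both 1.x and 2.x, which is what allows the relaxed numpy<3.0.0 pin in the METADATA hunk below. A minimal sketch of the same validity check, with an illustrative column name:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"target": [1.0, np.inf, None]})
    df["target_is_valid"] = ~df["target"].isnull()
    # np.Inf raises AttributeError on NumPy >= 2.0; np.inf is stable
    df.loc[df["target"] == np.inf, "target_is_valid"] = False
    # target_is_valid -> [True, False, False]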
upgini/features_enricher.py CHANGED
@@ -12,6 +12,7 @@ import tempfile
 import time
 import uuid
 from collections import Counter
+from copy import deepcopy
 from dataclasses import dataclass
 from threading import Thread
 from typing import Any, Callable, Dict, List, Optional, Set, Tuple, Union
@@ -841,7 +842,7 @@ class FeaturesEnricher(TransformerMixin):
         max_features: Optional[int] = None,
         remove_outliers_calc_metrics: Optional[bool] = None,
         trace_id: Optional[str] = None,
-        silent: bool = False,
+        internal_call: bool = False,
         progress_bar: Optional[ProgressBar] = None,
         progress_callback: Optional[Callable[[SearchProgress], Any]] = None,
         **kwargs,
@@ -1095,7 +1096,7 @@ class FeaturesEnricher(TransformerMixin):
                 enriched_shaps = enriched_cv_result.shap_values
 
                 if enriched_shaps is not None:
-                    self._update_shap_values(trace_id, fitting_X, enriched_shaps)
+                    self._update_shap_values(trace_id, fitting_X, enriched_shaps, silent=not internal_call)
 
                 if enriched_metric is None:
                     self.logger.warning(
@@ -1256,14 +1257,14 @@ class FeaturesEnricher(TransformerMixin):
             if self.raise_validation_error:
                 raise e
             else:
-                if not silent:
+                if not internal_call:
                     self._dump_python_libs()
                     self.__display_support_link()
                 raise e
         finally:
             self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
 
-    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float]):
+    def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
         renaming = self.fit_columns_renaming or {}
         new_shaps = {
             renaming.get(feature, feature): _round_shap_value(shap)
@@ -1272,7 +1273,7 @@ class FeaturesEnricher(TransformerMixin):
         }
         self.__prepare_feature_importances(trace_id, df, new_shaps)
 
-        if self.features_info_display_handle is not None:
+        if not silent and self.features_info_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
 
@@ -1284,7 +1285,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.data_sources_display_handle is not None:
+        if not silent and self.data_sources_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
 
@@ -1296,7 +1297,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.autofe_features_display_handle is not None:
+        if not silent and self.autofe_features_display_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
                 autofe_descriptions_df = self.get_autofe_features_description()
@@ -1309,7 +1310,7 @@ class FeaturesEnricher(TransformerMixin):
                 )
             except (ImportError, NameError):
                 pass
-        if self.report_button_handle is not None:
+        if not silent and self.report_button_handle is not None:
             try:
                 _ = get_ipython()  # type: ignore
 
@@ -1512,8 +1513,7 @@ class FeaturesEnricher(TransformerMixin):
         self.logger.info(f"Client features column on prepare data for metrics: {client_features}")
 
         filtered_enriched_features = self.__filtered_enriched_features(
-            importance_threshold,
-            max_features,
+            importance_threshold, max_features, trace_id, validated_X
         )
         filtered_enriched_features = [c for c in filtered_enriched_features if c not in client_features]
 
@@ -2541,7 +2541,9 @@ if response.status_code == 200:
             for c in itertools.chain(validated_Xy.columns.tolist(), generated_features)
             if c not in self.dropped_client_feature_names_
         ]
-        filtered_columns = self.__filtered_enriched_features(importance_threshold, max_features)
+        filtered_columns = self.__filtered_enriched_features(
+            importance_threshold, max_features, trace_id, validated_X
+        )
         selecting_columns.extend(
             c for c in filtered_columns if c in result.columns and c not in validated_X.columns
         )
@@ -3248,8 +3250,7 @@ if response.status_code == 200:
     def _validate_eval_set_pair(self, X: pd.DataFrame, eval_pair: Tuple) -> Tuple[pd.DataFrame, pd.Series]:
         if len(eval_pair) != 2:
             raise ValidationError(self.bundle.get("eval_set_invalid_tuple_size").format(len(eval_pair)))
-        eval_X = eval_pair[0]
-        eval_y = eval_pair[1]
+        eval_X, eval_y = eval_pair
 
         if _num_samples(eval_X) == 0:
             raise ValidationError(self.bundle.get("eval_x_is_empty"))
@@ -3805,6 +3806,47 @@ if response.status_code == 200:
 
         return result_features
 
+    def __get_features_importance_from_server(self, trace_id: str, df: pd.DataFrame):
+        if self._search_task is None:
+            raise NotFittedError(self.bundle.get("transform_unfitted_enricher"))
+        features_meta = self._search_task.get_all_features_metadata_v2()
+        if features_meta is None:
+            raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
+
+        original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
+        df = df.rename(columns=original_names_dict)
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        importances = {}
+
+        for feature_meta in features_meta:
+            if feature_meta.name in original_names_dict.keys():
+                feature_meta.name = original_names_dict[feature_meta.name]
+
+            is_client_feature = feature_meta.name in df.columns
+
+            if feature_meta.shap_value == 0.0:
+                continue
+
+            # Use only important features
+            if (
+                feature_meta.name == COUNTRY
+                # In select_features mode we select also from etalon features and need to show them
+                or (not self.fit_select_features and is_client_feature)
+            ):
+                continue
+
+            # Temporary workaround for duplicate features metadata
+            if feature_meta.name in importances:
+                self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
+                continue
+
+            importances[feature_meta.name] = feature_meta.shap_value
+
+        return importances
+
     def __prepare_feature_importances(
         self, trace_id: str, df: pd.DataFrame, updated_shaps: Optional[Dict[str, float]] = None, silent=False
     ):
@@ -3813,6 +3855,7 @@ if response.status_code == 200:
         features_meta = self._search_task.get_all_features_metadata_v2()
         if features_meta is None:
             raise Exception(self.bundle.get("missing_features_meta"))
+        features_meta = deepcopy(features_meta)
 
         original_names_dict = {c.name: c.originalName for c in self._search_task.get_file_metadata(trace_id).columns}
         features_df = self._search_task.get_all_initial_raw_features(trace_id, metrics_calculation=True)
@@ -3828,15 +3871,23 @@ if response.status_code == 200:
 
         original_shaps = {original_names_dict.get(fm.name, fm.name): fm.shap_value for fm in features_meta}
 
-        if updated_shaps is not None:
-            for fm in features_meta:
-                fm.shap_value = updated_shaps.get(fm.name, 0.0)
-
-        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
         for feature_meta in features_meta:
             if feature_meta.name in original_names_dict.keys():
                 feature_meta.name = original_names_dict[feature_meta.name]
 
+            if updated_shaps is not None:
+                updating_shap = updated_shaps.get(feature_meta.name)
+                if updating_shap is None:
+                    self.logger.warning(
+                        f"WARNING: Shap value for feature {feature_meta.name} not found and will be set to 0.0"
+                    )
+                    updating_shap = 0.0
+                feature_meta.shap_value = updating_shap
+
+        features_meta.sort(key=lambda m: (-m.shap_value, m.name))
+
+        for feature_meta in features_meta:
+
             is_client_feature = feature_meta.name in df.columns
 
             # TODO make a decision about selected features based on special flag from mlb
@@ -3848,7 +3899,7 @@ if response.status_code == 200:
             # Use only important features
             if (
                 # feature_meta.name in self.fit_generated_features or
-                feature_meta.name == COUNTRY
+                feature_meta.name == COUNTRY  # constant synthetic column
                 # In select_features mode we select also from etalon features and need to show them
                 or (not self.fit_select_features and is_client_feature)
             ):
@@ -3990,16 +4041,19 @@ if response.status_code == 200:
         )
 
     def __filtered_importance_names(
-        self, importance_threshold: Optional[float], max_features: Optional[int]
+        self, importance_threshold: Optional[float], max_features: Optional[int], trace_id: str, df: pd.DataFrame
     ) -> List[str]:
-        if len(self.feature_names_) == 0:
-            return []
+        # get features importance from server
+        filtered_importances = self.__get_features_importance_from_server(trace_id, df)
 
-        filtered_importances = list(zip(self.feature_names_, self.feature_importances_))
+        if len(filtered_importances) == 0:
+            return []
 
         if importance_threshold is not None:
             filtered_importances = [
-                (name, importance) for name, importance in filtered_importances if importance > importance_threshold
+                (name, importance)
+                for name, importance in filtered_importances.items()
+                if importance > importance_threshold
             ]
         if max_features is not None:
             filtered_importances = list(filtered_importances)[:max_features]
@@ -4084,7 +4138,10 @@ if response.status_code == 200:
         )
 
         if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
-            msg = self.bundle.get("unregistered_only_personal_keys")
+            if self.__is_registered:
+                msg = self.bundle.get("only_custom_keys")
+            else:
+                msg = self.bundle.get("unregistered_only_personal_keys")
             self.logger.warning(msg + f" Provided search keys: {search_keys}")
             raise ValidationError(msg)
 
@@ -4135,7 +4192,7 @@ if response.status_code == 200:
             max_features=max_features,
             remove_outliers_calc_metrics=remove_outliers_calc_metrics,
             trace_id=trace_id,
-            silent=True,
+            internal_call=True,
             progress_bar=progress_bar,
             progress_callback=progress_callback,
         )
@@ -4209,11 +4266,13 @@ if response.status_code == 200:
         self,
         importance_threshold: Optional[float],
         max_features: Optional[int],
+        trace_id: str,
+        df: pd.DataFrame,
     ) -> List[str]:
         importance_threshold = self.__validate_importance_threshold(importance_threshold)
         max_features = self.__validate_max_features(max_features)
 
-        return self.__filtered_importance_names(importance_threshold, max_features)
+        return self.__filtered_importance_names(importance_threshold, max_features, trace_id, df)
 
     def __detect_missing_search_keys(
         self,
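Note: taken together, the hunks above rework feature filtering. __filtered_importance_names no longer reads the locally cached feature_names_/feature_importances_ pair; it calls the new __get_features_importance_from_server, which returns a name-to-SHAP dict already sorted by descending importance, then applies the same importance_threshold and max_features cuts (dict insertion order makes the [:max_features] slice a top-k). A toy sketch of that filtering step, with made-up names and values:

    # dict as returned by the server-side importance call, already sorted
    importances = {"f_autofe_div": 0.41, "postal_code_population": 0.07, "country_flag": 0.004}

    importance_threshold, max_features = 0.01, 2
    filtered = [
        (name, importance)
        for name, importance in importances.items()
        if importance > importance_threshold
    ]
    names = [name for name, _ in filtered[:max_features]]  # ["f_autofe_div", "postal_code_population"]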
upgini/http.py CHANGED
@@ -16,6 +16,7 @@ from typing import Any, Dict, List, Optional, Tuple
 from urllib.parse import urljoin
 
 import jwt
+
 # import pandas as pd
 import requests
 from pydantic import BaseModel
@@ -342,7 +343,9 @@ class _RestClient:
         else:
             return self._syncronized_refresh_access_token()
 
-    def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
+    def _with_unauth_retry(
+        self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
+    ):
         try:
             return request()
         except RequestException as e:
@@ -373,8 +376,9 @@ class _RestClient:
             elif "more than one concurrent search request" in e.message.lower():
                 raise ValidationError(bundle.get("concurrent_request"))
             else:
-                print(e)
-                show_status_error()
+                if not silent:
+                    print(e)
+                    show_status_error()
                 raise e
 
     @staticmethod
@@ -706,6 +710,7 @@ class _RestClient:
                     silent=True,
                 ),
                 need_connection_retry=False,
+                silent=True,
            )
        except Exception:
            self.send_log_event_unauth(log_event)
@@ -716,7 +721,7 @@ class _RestClient:
         try:
             requests.post(
                 url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
-                json=log_event.dict(exclude_none=True),
+                json=log_event.model_dump(exclude_none=True),
                 headers=_RestClient._get_base_headers(content_type="application/json"),
             )
         except Exception:
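Note: .dict() is the Pydantic v1 serialization method; v2 deprecates it in favor of .model_dump(), which this hunk adopts (the dependency pin pydantic<3.0.0,>1.0.0 already admits v2). A minimal sketch:

    from typing import Optional

    from pydantic import BaseModel

    class LogEvent(BaseModel):
        source: str
        tags: Optional[str] = None

    event = LogEvent(source="python")
    payload = event.model_dump(exclude_none=True)  # v2 spelling; v1 used event.dict(exclude_none=True)
    # payload == {"source": "python"}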
upgini/metrics.py CHANGED
@@ -1,20 +1,21 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 import inspect
 import logging
 import re
 from collections import defaultdict
 from copy import deepcopy
+from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
-import catboost
+import lightgbm as lgb
 import numpy as np
 import pandas as pd
-from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
+from lightgbm import LGBMClassifier, LGBMRegressor
 from numpy import log1p
 from pandas.api.types import is_numeric_dtype
 from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
+from sklearn.preprocessing import OrdinalEncoder
 
 from upgini.utils.features_validator import FeaturesValidator
 from upgini.utils.sklearn_ext import cross_validate
@@ -27,11 +28,8 @@ except ImportError:
     from sklearn.metrics._scorer import SCORERS
 
     available_scorers = SCORERS
-    from sklearn.metrics._regression import (
-        _check_reg_targets,
-        check_consistent_length,
-    )
     from sklearn.metrics import mean_squared_error
+    from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
 from sklearn.model_selection import BaseCrossValidator
 
 from upgini.errors import ValidationError
@@ -88,13 +86,73 @@ CATBOOST_MULTICLASS_PARAMS = {
 
 LIGHTGBM_PARAMS = {
     "random_state": DEFAULT_RANDOM_STATE,
-    "num_leaves": 16,
+    # "num_leaves": 16,
+    # "n_estimators": 150,
+    # "min_child_weight": 1,
     "max_depth": 4,
-    "n_estimators": 150,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "num_boost_round": 150,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+}
+
+LIGHTGBM_REGRESSION_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "deterministic": True,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 25,
+    "cat_l2": 10,
+    "cat_smooth": 12,
+    "learning_rate": 0.05,
+    "feature_fraction": 1.0,
+    "min_sum_hessian_in_leaf": 0.01,
+    "objective": "huber",
+    "verbosity": -1,
+}
+
+LIGHTGBM_MULTICLASS_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "n_estimators": 275,
+    "max_depth": 5,
     "learning_rate": 0.05,
-    "min_child_weight": 1,
+    "min_gain_to_split": 0.001,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "objective": "multiclass",
+    "class_weight": "balanced",
+    "use_quantized_grad": "true",
+    "num_grad_quant_bins": "8",
+    "stochastic_rounding": "true",
+    "verbosity": -1,
 }
 
+LIGHTGBM_BINARY_PARAMS = {
+    "random_state": DEFAULT_RANDOM_STATE,
+    "min_gain_to_split": 0.001,
+    "n_estimators": 275,
+    "max_depth": 5,
+    "learning_rate": 0.05,
+    "objective": "binary",
+    "class_weight": "balanced",
+    "deterministic": True,
+    "max_cat_threshold": 80,
+    "min_data_per_group": 20,
+    "cat_smooth": 18,
+    "cat_l2": 8,
+    "verbosity": -1,
+}
+
+LIGHTGBM_EARLY_STOPPING_ROUNDS = 20
+
 N_FOLDS = 5
 BLOCKED_TS_TEST_SIZE = 0.2
 
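Note: these dicts replace the CatBoost defaults with LightGBM ones, and LIGHTGBM_EARLY_STOPPING_ROUNDS pairs with the early-stopping callback wired into LightGBMWrapper further down. A minimal sketch of how a parameter dict plus that callback fit together, on hypothetical toy data:

    import lightgbm as lgb
    from lightgbm import LGBMClassifier
    from sklearn.datasets import make_classification
    from sklearn.model_selection import train_test_split

    X, y = make_classification(n_samples=500, n_features=10, random_state=42)
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

    # a subset of LIGHTGBM_BINARY_PARAMS above
    model = LGBMClassifier(
        random_state=42,
        n_estimators=275,
        max_depth=5,
        learning_rate=0.05,
        objective="binary",
        class_weight="balanced",
        verbosity=-1,
    )
    # early stopping as a callback plus a validation eval_set, mirroring the wrapper code
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.early_stopping(stopping_rounds=20, verbose=False)],
    )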
@@ -211,6 +269,15 @@ SUPPORTED_CATBOOST_METRICS = {
 }
 
 
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
 @dataclass
 class _CrossValResults:
     metric: Optional[float]
@@ -274,7 +341,7 @@ class EstimatorWrapper:
         for c in x.columns:
             if is_numeric_dtype(x[c]):
                 x[c] = x[c].astype(float)
-            else:
+            elif not x[c].dtype == "category":
                 x[c] = x[c].astype(str)
 
         if not isinstance(y, pd.Series):
@@ -292,7 +359,7 @@ class EstimatorWrapper:
         self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
         return x, y, groups
 
-    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
+    def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray]:
         joined = pd.concat([x, y], axis=1)
         joined = joined[joined[y.name].notna()]
         joined = joined.reset_index(drop=True)
@@ -346,12 +413,15 @@ class EstimatorWrapper:
         for estimator, split in zip(self.cv_estimators, splits):
             _, validation_idx = split
             cv_x = x.iloc[validation_idx]
-            cv_y = y[validation_idx]
+            if isinstance(y, pd.Series):
+                cv_y = y.iloc[validation_idx]
+            else:
+                cv_y = y[validation_idx]
             shaps = self.calculate_shap(cv_x, cv_y, estimator)
             if shaps is not None:
                 for feature, shap_value in shaps.items():
                     # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
-                    shap_values_all_folds[feature].extend(shap_value.tolist())
+                    shap_values_all_folds[feature].append(shap_value)
 
         if shap_values_all_folds:
             average_shap_values = {
@@ -427,21 +497,18 @@ class EstimatorWrapper:
         }
         if estimator is None:
             params = {}
-            params["has_time"] = has_date
-            # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
-            #     params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
             if target_type == ModelTaskType.MULTICLASS:
-                params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.BINARY:
-                params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
             elif target_type == ModelTaskType.REGRESSION:
-                params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
+                params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
                 params = _get_add_params(params, add_params)
-                estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
+                estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
             else:
                 raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
         else:
@@ -450,31 +517,21 @@ class EstimatorWrapper:
         else:
             estimator_copy = deepcopy(estimator)
             kwargs["estimator"] = estimator_copy
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
                 if cat_features is not None:
                     for cat_feature in cat_features:
                         if cat_feature not in x.columns:
                             logger.error(
                                 f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
                             )
-                    estimator_copy.set_params(
-                        # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
-                        cat_features=cat_features
-                    )
+                    estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
                 estimator = CatBoostWrapper(**kwargs)
             else:
-                try:
-                    from lightgbm import LGBMClassifier, LGBMRegressor
-
-                    if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
-                        estimator = LightGBMWrapper(**kwargs)
-                    else:
-                        logger.warning(
-                            f"Unexpected estimator is used for metrics: {estimator}. "
-                            "Default strategy for category features will be used"
-                        )
-                        estimator = OtherEstimatorWrapper(**kwargs)
-                except ModuleNotFoundError:
+                if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
+                    estimator = LightGBMWrapper(**kwargs)
+                elif is_catboost_estimator(estimator):
+                    estimator = CatBoostWrapper(**kwargs)
+                else:
                     logger.warning(
                         f"Unexpected estimator is used for metrics: {estimator}. "
                         "Default strategy for category features will be used"
@@ -487,7 +544,7 @@ class EstimatorWrapper:
 class CatBoostWrapper(EstimatorWrapper):
     def __init__(
         self,
-        estimator: Union[CatBoostClassifier, CatBoostRegressor],
+        estimator,
         scorer: Callable,
         metric_name: str,
         multiplier: int,
@@ -517,6 +574,9 @@ class CatBoostWrapper(EstimatorWrapper):
         x, y, groups, params = super()._prepare_to_fit(x, y)
 
         # Find embeddings
+        import catboost
+        from catboost import CatBoostClassifier
+
         if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
             emb_pattern = r"(.+)_emb\d+"
             self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -637,8 +697,10 @@ class CatBoostWrapper(EstimatorWrapper):
             else:
                 raise e
 
-    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
         try:
+            from catboost import Pool
+
             # Create Pool for fold data, if need (for example, when categorical features are present)
             fold_pool = Pool(
                 x,
@@ -693,33 +755,83 @@ class LightGBMWrapper(EstimatorWrapper):
             logger=logger,
         )
         self.cat_features = None
+        self.cat_encoder = None
+        self.n_classes = None
 
     def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
-        x, y, groups, params = super()._prepare_to_fit(x, y)
+        x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
+        if self.target_type in [ModelTaskType.BINARY, ModelTaskType.MULTICLASS]:
+            self.n_classes = len(np.unique(y_numpy))
+        if LIGHTGBM_EARLY_STOPPING_ROUNDS is not None:
+            params["callbacks"] = [lgb.early_stopping(stopping_rounds=LIGHTGBM_EARLY_STOPPING_ROUNDS, verbose=False)]
         self.cat_features = _get_cat_features(x)
-        print("prepare to fit")
-        print(x.dtypes.to_dict())
-        print(self.cat_features)
-        x = fill_na_cat_features(x, self.cat_features)
-        for feature in self.cat_features:
-            x[feature] = x[feature].astype("category").cat.codes
-        if not is_numeric_dtype(y):
-            y = correct_string_target(y)
+        if self.cat_features:
+            x = fill_na_cat_features(x, self.cat_features)
+            encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
+            encoded = pd.DataFrame(
+                encoder.fit_transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+            )
+            x[self.cat_features] = encoded
+            self.cat_encoder = encoder
+        if not is_numeric_dtype(y_numpy):
+            y_numpy = correct_string_target(y_numpy)
 
-        return x, y, groups, params
+        return x, y_numpy, groups, params
 
     def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
-        x, y, params = super()._prepare_to_calculate(x, y)
-        print("prepare to calculate")
-        print(x.dtypes.to_dict())
-        print(self.cat_features)
+        x, y_numpy, params = super()._prepare_to_calculate(x, y)
         if self.cat_features is not None:
             x = fill_na_cat_features(x, self.cat_features)
-            for feature in self.cat_features:
-                x[feature] = x[feature].astype("category").cat.codes
+            if self.cat_encoder is not None:
+                x[self.cat_features] = pd.DataFrame(
+                    self.cat_encoder.transform(x[self.cat_features]), columns=self.cat_features, dtype="category"
+                )
         if not is_numeric_dtype(y):
-            y = correct_string_target(y)
-        return x, y, params
+            y_numpy = correct_string_target(y_numpy)
+        return x, y_numpy, params
+
+    def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
+        try:
+            shap_matrix = estimator.predict(
+                x,
+                predict_disable_shape_check=True,
+                raw_score=True,
+                pred_leaf=False,
+                pred_early_stop=True,
+                pred_contrib=True,
+            )
+
+            if self.target_type == ModelTaskType.MULTICLASS:
+                n_feat = x.shape[1]
+                shap_matrix.shape = (shap_matrix.shape[0], self.n_classes, n_feat + 1)
+                shap_matrix = np.mean(np.abs(shap_matrix), axis=1)
+
+            # exclude base value
+            shap_matrix = shap_matrix[:, :-1]
+
+            feature_importance = {}
+            for i, col in enumerate(x.columns):
+                feature_importance[col] = np.mean(np.abs(shap_matrix[:, i]))
+
+            # # exclude last column (base value)
+            # shap_values_only = shap_values[:, :-1]
+            # mean_abs_shap = np.mean(np.abs(shap_values_only), axis=0)
+
+            # # For classification, shap_values is returned as a list for each class
+            # # Take values for the positive class
+            # if isinstance(shap_values, list):
+            #     shap_values = shap_values[1]
+
+            # # Calculate mean absolute SHAP value for each feature
+            # feature_importance = {}
+            # for i, col in enumerate(x.columns):
+            #     feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
+
+            return feature_importance
+
+        except Exception as e:
+            self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
+            return None
 
 
 class OtherEstimatorWrapper(EstimatorWrapper):
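Note: two techniques above are worth unpacking. Categorical columns are ordinal-encoded (unknown categories map to -1 at transform time) and kept as pandas category dtype, which LightGBM handles natively; and SHAP-style importances come from LightGBM's built-in pred_contrib=True prediction mode, which returns one contribution per feature plus a trailing base value (for multiclass, a flat row per class that the code reshapes and averages). A minimal sketch of both, on toy data with illustrative column names:

    import numpy as np
    import pandas as pd
    from lightgbm import LGBMClassifier
    from sklearn.preprocessing import OrdinalEncoder

    x = pd.DataFrame({"num": [0.1, 0.4, 0.3, 0.9] * 25, "cat": ["a", "b", "a", "c"] * 25})
    y = np.array([0, 1, 0, 1] * 25)

    # same encoder settings as _prepare_to_fit: unseen categories become -1
    encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
    x[["cat"]] = pd.DataFrame(encoder.fit_transform(x[["cat"]]), columns=["cat"], dtype="category")

    model = LGBMClassifier(verbosity=-1).fit(x, y)

    contrib = model.predict(x, pred_contrib=True)  # (n_rows, n_features + 1) for binary tasks
    shap_matrix = contrib[:, :-1]                  # drop the trailing base-value column
    importance = {c: np.mean(np.abs(shap_matrix[:, i])) for i, c in enumerate(x.columns)}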
upgini/resource_bundle/strings.properties CHANGED
@@ -80,6 +80,7 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
 postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
 multiple_search_key=Search key {} passed multiple times
 unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
+only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
 search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
 numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
 unsupported_search_key_type=Unsupported type of key in search_keys: {}
upgini/utils/deduplicate_utils.py CHANGED
@@ -74,6 +74,8 @@ def remove_fintech_duplicates(
     # Checking for different dates by the same personal keys
     uniques = grouped_by_personal_cols[date_col].nunique()
     total = len(uniques)
+    if total == 0:
+        return segment_df, None
     diff_dates = len(uniques[uniques > 1])
     if diff_dates / total >= 0.6:
         return segment_df, None
upgini/utils/feature_info.py CHANGED
@@ -90,7 +90,8 @@ class FeatureInfo:
     def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
         if data is not None and len(data) > 0 and feature_meta.name in data.columns:
             if len(data) > 3:
-                feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
+                rand = np.random.RandomState(42)
+                feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
             else:
                 feature_sample = data[feature_meta.name].dropna().unique().tolist()
             if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
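Note: seeding with np.random.RandomState(42) makes the three displayed sample values deterministic across renders instead of changing on every call:

    import numpy as np

    values = np.array(["red", "green", "blue", "cyan"])
    rand = np.random.RandomState(42)
    sample = rand.choice(values, 3).tolist()  # same three values on every run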
upgini/utils/sklearn_ext.py CHANGED
@@ -9,7 +9,6 @@ from traceback import format_exc
 
 import numpy as np
 import scipy.sparse as sp
-from catboost import CatBoostClassifier, CatBoostRegressor
 from joblib import Parallel, logger
 from scipy.sparse import issparse
 from sklearn import config_context, get_config
@@ -342,6 +341,22 @@ def cross_validate(
             raise e
 
 
+def is_catboost_estimator(estimator):
+    try:
+        from catboost import CatBoostClassifier, CatBoostRegressor
+        return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
+    except ImportError:
+        return False
+
+
+def is_lightgbm_estimator(estimator):
+    try:
+        from lightgbm import LGBMClassifier, LGBMRegressor
+        return isinstance(estimator, (LGBMClassifier, LGBMRegressor))
+    except ImportError:
+        return False
+
+
 def _fit_and_score(
     estimator,
     X,
@@ -497,7 +512,10 @@ def _fit_and_score(
         if y_train is None:
             estimator.fit(X_train, **fit_params)
         else:
-            if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
+            if is_catboost_estimator(estimator):
+                fit_params = fit_params.copy()
+                fit_params["eval_set"] = [(X_test, y_test)]
+            elif is_lightgbm_estimator(estimator):
                 fit_params = fit_params.copy()
                 fit_params["eval_set"] = [(X_test, y_test)]
             estimator.fit(X_train, y_train, **fit_params)
upgini/utils/sort.py CHANGED
@@ -87,7 +87,7 @@ def get_sort_columns_dict(
     df_with_target = df_with_target.loc[~target.isna()]
     df = df_with_target.iloc[:, :-1]
     target = df_with_target.iloc[:, -1]
-    df = df.fillna(df.mean())
+    df = df.fillna(df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean()))
     omit_nan = False
     hashes = [hash_series(df[col]) for col in columns_for_sort]
     df = np.asarray(df, dtype=np.float32)
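Note: casting the fill value to int for integer-typed columns keeps their dtype intact; filling a nullable-integer column with a fractional mean would otherwise fail or upcast it to float. A sketch of the difference, on a made-up frame:

    import pandas as pd

    df = pd.DataFrame({"i": pd.array([1, 2, None], dtype="Int64"), "f": [1.0, None, 3.0]})
    fill = df.apply(lambda x: int(x.mean()) if pd.api.types.is_integer_dtype(x) else x.mean())
    filled = df.fillna(fill)  # "i" stays Int64 (filled with int(1.5) == 1), "f" gets its float mean 2.0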
upgini/utils/target_utils.py CHANGED
@@ -204,7 +204,7 @@ def balance_undersample(
 def balance_undersample_forced(
     df: pd.DataFrame,
     target_column: str,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     task_type: ModelTaskType,
     cv_type: Optional[CVType],
@@ -287,7 +287,7 @@ DEFAULT_TIME_UNIT_THRESHOLD = pd.Timedelta(weeks=4)
 
 def balance_undersample_time_series_trunc(
     df: pd.DataFrame,
-    id_columns: List[str],
+    id_columns: Optional[List[str]],
     date_column: str,
     sample_size: int,
     random_state: int = 42,
@@ -298,6 +298,8 @@ def balance_undersample_time_series_trunc(
     **kwargs,
 ):
     # Convert date column to datetime
+    if id_columns is None:
+        id_columns = [date_column]
     dates_df = df[id_columns + [date_column]].copy()
     dates_df[date_column] = pd.to_datetime(dates_df[date_column], unit="ms")
upgini-1.2.71a3810.dev7.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.3
 Name: upgini
-Version: 1.2.71a3810.dev5
+Version: 1.2.71a3810.dev7
 Summary: Intelligent data search & enrichment for Machine Learning
 Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
 Project-URL: Homepage, https://upgini.com/
@@ -22,14 +22,14 @@ Classifier: Programming Language :: Python :: 3.11
 Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
 Classifier: Topic :: Scientific/Engineering :: Information Analysis
 Requires-Python: <3.12,>=3.10
-Requires-Dist: catboost>=1.0.3
 Requires-Dist: fastparquet>=0.8.1
 Requires-Dist: ipywidgets>=8.1.0
 Requires-Dist: jarowinkler>=2.0.0
 Requires-Dist: levenshtein>=0.25.1
-Requires-Dist: numpy<=1.26.4,>=1.19.0
+Requires-Dist: lightgbm>=4.6.0
+Requires-Dist: numpy<3.0.0,>=1.19.0
 Requires-Dist: pandas<3.0.0,>=1.1.0
-Requires-Dist: psutil>=6.0.0
+Requires-Dist: psutil>=5.9.0
 Requires-Dist: pydantic<3.0.0,>1.0.0
 Requires-Dist: pyjwt>=2.8.0
 Requires-Dist: python-bidi==0.4.2
@@ -38,6 +38,7 @@ Requires-Dist: python-json-logger>=3.3.0
 Requires-Dist: requests>=2.8.0
 Requires-Dist: scikit-learn>=1.3.0
 Requires-Dist: scipy>=1.10.0
+Requires-Dist: shap>=0.44.0
 Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
 Description-Content-Type: text/markdown
 
upgini-1.2.71a3810.dev7.dist-info/RECORD CHANGED
@@ -1,13 +1,12 @@
-upgini/__about__.py,sha256=QR5uw20nhIxDI34CnHAuXHBaXYnOBpyCMHIjl0vktNQ,33
+upgini/__about__.py,sha256=8L5b0YAc11O_l7JTUIlZ33k196Ga-M-OqrDVycquS2g,33
 upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
 upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
-upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
+upgini/dataset.py,sha256=aspri7ZAgwkNNUiIgQ1GRXvw8XQii3F4RfNXSrF4wrw,35365
 upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
-upgini/features_enricher.py,sha256=KqDQ29sU1Aty5Z40DDqO869Y_CClQfmU58nE9rScxRc,204434
-upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
-upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
+upgini/features_enricher.py,sha256=Li1sPihWVkPUPcma8HRbPFwpCqd9V9d2p5zQUgkpdpU,206998
+upgini/http.py,sha256=RvzcShpDXssLs6ycGN8xilkKi8ZV9XGUrrk8bwdUzbw,43607
 upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
-upgini/metrics.py,sha256=KqSoT-TOnSpYGEY3ZC7Hq8YrYdxNXbjtyorCAk86MzU,35681
+upgini/metrics.py,sha256=UpEfJHTai3grXgicRKaoC89qwjU0WxENmQwbb2NyGrw,39206
 upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
 upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
 upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
@@ -17,15 +16,15 @@ upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
 upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
 upgini/autofe/date.py,sha256=MM1S-6imNSzCDOhbNnmsc_bwSqUWBcS8vWAdHF8j1kY,11134
-upgini/autofe/feature.py,sha256=md43NwDof0s_nWn_WfOO0l2wYItQ416nEzHm5u29XOA,14945
+upgini/autofe/feature.py,sha256=uoNRZlI_6koGLT0cyoPlnOIA6E9_WQXvalMqu6KGGB4,14970
 upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
 upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
 upgini/autofe/unary.py,sha256=Sx11IoHRh5nwyALzjgG9GQOrVNIs8NZ1JzunAJuN66A,5731
 upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
 upgini/autofe/vector.py,sha256=zehv1J9ChHdZKWjKlkRf6RpfQMCJduZmqCEePYNUfkQ,943
 upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
-upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
-upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
+upgini/autofe/timeseries/base.py,sha256=rWJqRuFAzTZEsUdWG5s1Vhif9zzRRmalASXvarufRxI,3610
+upgini/autofe/timeseries/cross.py,sha256=BTINVwuZSbm_4NKkVm0FGM68SrvZLENZKXN7-UyvhYI,5319
 upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
 upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
 upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
@@ -39,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
 upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
 upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
 upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
-upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
+upgini/resource_bundle/strings.properties,sha256=mwQrerdJj3adzT-fHqvs6Qjf-rqDccsUzELDIXJKAmY,27791
 upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
 upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
@@ -53,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
 upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
 upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
 upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
-upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
+upgini/utils/deduplicate_utils.py,sha256=AcMLoObMjhOTQ_fMS1LWy0GKp6WXnZ-FNux_8V3nbZU,8914
 upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
 upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
 upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
-upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
+upgini/utils/feature_info.py,sha256=Q9HN6A-fvfVD-irFWrmOqqZG9RsUSvh5MTY_k0xu-tE,7287
 upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
 upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
 upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
@@ -65,13 +64,13 @@ upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
 upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
 upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
 upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
-upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
-upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
-upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
+upgini/utils/sklearn_ext.py,sha256=HpaNQaKJisgNE7IZ71n7uswxTj7kbPglU2G3s1sORAc,45042
+upgini/utils/sort.py,sha256=8uuHs2nfSMVnz8GgvbOmgMB1PgEIZP1uhmeRFxcwnYw,7039
+upgini/utils/target_utils.py,sha256=KNFzJta1SpGU4sp07dHKSeVJlDs_9qgD2wcw5YuJfOc,16661
 upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
 upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
 upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
-upgini-1.2.71a3810.dev5.dist-info/METADATA,sha256=F7wqhbZWwxUebgf0hxfovlBTpUh9-mz4d5LX8TcJP5Q,49075
-upgini-1.2.71a3810.dev5.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
-upgini-1.2.71a3810.dev5.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
-upgini-1.2.71a3810.dev5.dist-info/RECORD,,
+upgini-1.2.71a3810.dev7.dist-info/METADATA,sha256=IJ7cupE5ReCxICsHNhZwoGihsiyYZ2U68rQ0taW9VQ0,49101
+upgini-1.2.71a3810.dev7.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
+upgini-1.2.71a3810.dev7.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.2.71a3810.dev7.dist-info/RECORD,,
upgini/lazy_import.py DELETED
@@ -1,35 +0,0 @@
-import importlib
-import importlib.util
-import importlib.machinery
-
-
-class LazyImport:
-    def __init__(self, module_name, class_name):
-        self.module_name = module_name
-        self.class_name = class_name
-        self._module = None
-        self._class = None
-
-    def _load(self):
-        if self._module is None:
-            # Load module and save link to it
-            spec = importlib.util.find_spec(self.module_name)
-            if spec is None:
-                raise ImportError(f"Module {self.module_name} not found")
-
-            # Create module
-            self._module = importlib.util.module_from_spec(spec)
-
-            # Execute module
-            spec.loader.exec_module(self._module)
-
-            # Get class from module
-            self._class = getattr(self._module, self.class_name)
-
-    def __call__(self, *args, **kwargs):
-        self._load()
-        return self._class(*args, **kwargs)
-
-    def __getattr__(self, name):
-        self._load()
-        return getattr(self._class, name)
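Note: with catboost dropped from the hard requirements (see the METADATA hunk above), the LazyImport indirection has no remaining callers; the replacement pattern throughout this diff is a plain guarded import at each call site, as in the new is_catboost_estimator helpers. A sketch of the general pattern, with a hypothetical helper name:

    def make_pool(x, y, cat_features=None):
        """Build a catboost.Pool, importing catboost only when it is actually used."""
        try:
            from catboost import Pool  # optional dependency as of this release
        except ImportError as e:
            raise ImportError("catboost is now optional; install it to use CatBoost estimators") from e
        return Pool(x, y, cat_features=cat_features)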