upgini 1.2.68a3832.dev11__tar.gz → 1.2.69__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (78)
  1. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/PKG-INFO +3 -4
  2. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/pyproject.toml +2 -3
  3. upgini-1.2.69/src/upgini/__about__.py +1 -0
  4. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/dataset.py +1 -1
  5. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/features_enricher.py +6 -4
  6. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/http.py +4 -6
  7. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/metrics.py +41 -128
  8. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/resource_bundle/strings.properties +0 -1
  9. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/deduplicate_utils.py +0 -2
  10. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/feature_info.py +1 -2
  11. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/sklearn_ext.py +2 -9
  12. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/sort.py +5 -0
  13. upgini-1.2.68a3832.dev11/src/upgini/__about__.py +0 -1
  14. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/.gitignore +0 -0
  15. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/LICENSE +0 -0
  16. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/README.md +0 -0
  17. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/__init__.py +0 -0
  18. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/ads.py +0 -0
  19. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/ads_management/__init__.py +0 -0
  20. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/ads_management/ads_manager.py +0 -0
  21. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/__init__.py +0 -0
  22. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/all_operators.py +0 -0
  23. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/binary.py +0 -0
  24. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/date.py +0 -0
  25. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/feature.py +0 -0
  26. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/groupby.py +0 -0
  27. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/operator.py +0 -0
  28. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/__init__.py +0 -0
  29. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/base.py +0 -0
  30. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/cross.py +0 -0
  31. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/delta.py +0 -0
  32. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/lag.py +0 -0
  33. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/roll.py +0 -0
  34. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/trend.py +0 -0
  35. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/timeseries/volatility.py +0 -0
  36. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/unary.py +0 -0
  37. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/autofe/vector.py +0 -0
  38. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/data_source/__init__.py +0 -0
  39. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/data_source/data_source_publisher.py +0 -0
  40. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/errors.py +0 -0
  41. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/lazy_import.py +0 -0
  42. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/mdc/__init__.py +0 -0
  43. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/mdc/context.py +0 -0
  44. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/metadata.py +0 -0
  45. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/normalizer/__init__.py +0 -0
  46. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/normalizer/normalize_utils.py +0 -0
  47. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/resource_bundle/__init__.py +0 -0
  48. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/resource_bundle/exceptions.py +0 -0
  49. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  50. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/sampler/__init__.py +0 -0
  51. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/sampler/base.py +0 -0
  52. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/sampler/random_under_sampler.py +0 -0
  53. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/sampler/utils.py +0 -0
  54. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/search_task.py +0 -0
  55. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/spinner.py +0 -0
  56. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  57. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/__init__.py +0 -0
  58. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/base_search_key_detector.py +0 -0
  59. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/blocked_time_series.py +0 -0
  60. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/country_utils.py +0 -0
  61. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/custom_loss_utils.py +0 -0
  62. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/cv_utils.py +0 -0
  63. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/datetime_utils.py +0 -0
  64. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/display_utils.py +0 -0
  65. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/email_utils.py +0 -0
  66. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/fallback_progress_bar.py +0 -0
  67. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/features_validator.py +0 -0
  68. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/format.py +0 -0
  69. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/ip_utils.py +0 -0
  70. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/mstats.py +0 -0
  71. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/phone_utils.py +0 -0
  72. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/postal_code_utils.py +0 -0
  73. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/progress_bar.py +0 -0
  74. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/target_utils.py +0 -0
  75. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/track_info.py +0 -0
  76. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/ts_utils.py +0 -0
  77. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/utils/warning_counter.py +0 -0
  78. {upgini-1.2.68a3832.dev11 → upgini-1.2.69}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.68a3832.dev11
3
+ Version: 1.2.69
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -23,12 +23,12 @@ Classifier: Programming Language :: Python :: 3.10
23
23
  Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
24
24
  Classifier: Topic :: Scientific/Engineering :: Information Analysis
25
25
  Requires-Python: <3.12,>=3.8
26
+ Requires-Dist: catboost>=1.0.3
26
27
  Requires-Dist: fastparquet>=0.8.1
27
28
  Requires-Dist: ipywidgets>=8.1.0
28
29
  Requires-Dist: jarowinkler>=2.0.0
29
30
  Requires-Dist: levenshtein>=0.25.1
30
- Requires-Dist: lightgbm>=4.6.0
31
- Requires-Dist: numpy<3.0.0,>=1.19.0
31
+ Requires-Dist: numpy<=1.26.4,>=1.19.0
32
32
  Requires-Dist: pandas<3.0.0,>=1.1.0
33
33
  Requires-Dist: psutil>=6.0.0
34
34
  Requires-Dist: pydantic<3.0.0,>1.0.0
@@ -39,7 +39,6 @@ Requires-Dist: python-json-logger>=3.3.0
39
39
  Requires-Dist: requests>=2.8.0
40
40
  Requires-Dist: scikit-learn>=1.3.0
41
41
  Requires-Dist: scipy>=1.10.0
42
- Requires-Dist: shap>=0.44.0
43
42
  Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
44
43
  Description-Content-Type: text/markdown
45
44
 
@@ -35,11 +35,10 @@ classifiers = [
35
35
  "Topic :: Scientific/Engineering :: Information Analysis",
36
36
  ]
37
37
  dependencies = [
38
- "lightgbm>=4.6.0",
39
- "shap>=0.44.0",
38
+ "catboost>=1.0.3",
40
39
  "fastparquet>=0.8.1",
41
40
  "ipywidgets>=8.1.0",
42
- "numpy>=1.19.0,<3.0.0",
41
+ "numpy>=1.19.0,<=1.26.4",
43
42
  "pandas>=1.1.0,<3.0.0",
44
43
  "pydantic>1.0.0,<3.0.0",
45
44
  "pyjwt>=2.8.0",
@@ -0,0 +1 @@
1
+ __version__ = "1.2.69"
@@ -388,7 +388,7 @@ class Dataset: # (pd.DataFrame):
388
388
  for col in columns_to_validate:
389
389
  self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
390
390
  if validate_target and target is not None and col == target:
391
- self.data.loc[self.data[target] == np.inf, f"{col}_is_valid"] = False
391
+ self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
392
392
 
393
393
  if col in mandatory_columns:
394
394
  self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
@@ -3845,6 +3845,11 @@ if response.status_code == 200:
3845
3845
  ):
3846
3846
  continue
3847
3847
 
3848
+ # Temporary workaround for duplicate features metadata
3849
+ if feature_meta.name in self.feature_names_:
3850
+ self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
3851
+ continue
3852
+
3848
3853
  self.feature_names_.append(feature_meta.name)
3849
3854
  self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
3850
3855
 
@@ -4070,10 +4075,7 @@ if response.status_code == 200:
4070
4075
  )
4071
4076
 
4072
4077
  if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
4073
- if self.__is_registered:
4074
- msg = self.bundle.get("only_custom_keys")
4075
- else:
4076
- msg = self.bundle.get("unregistered_only_personal_keys")
4078
+ msg = self.bundle.get("unregistered_only_personal_keys")
4077
4079
  self.logger.warning(msg + f" Provided search keys: {search_keys}")
4078
4080
  raise ValidationError(msg)
4079
4081
 
@@ -342,7 +342,7 @@ class _RestClient:
342
342
  else:
343
343
  return self._syncronized_refresh_access_token()
344
344
 
345
- def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False):
345
+ def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
346
346
  try:
347
347
  return request()
348
348
  except RequestException as e:
@@ -373,9 +373,8 @@ class _RestClient:
373
373
  elif "more than one concurrent search request" in e.message.lower():
374
374
  raise ValidationError(bundle.get("concurrent_request"))
375
375
  else:
376
- if not silent:
377
- print(e)
378
- show_status_error()
376
+ print(e)
377
+ show_status_error()
379
378
  raise e
380
379
 
381
380
  @staticmethod
@@ -707,7 +706,6 @@ class _RestClient:
707
706
  silent=True,
708
707
  ),
709
708
  need_connection_retry=False,
710
- silent=True,
711
709
  )
712
710
  except Exception:
713
711
  self.send_log_event_unauth(log_event)
@@ -718,7 +716,7 @@ class _RestClient:
718
716
  try:
719
717
  requests.post(
720
718
  url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
721
- json=log_event.model_dump(exclude_none=True),
719
+ json=log_event.dict(exclude_none=True),
722
720
  headers=_RestClient._get_base_headers(content_type="application/json"),
723
721
  )
724
722
  except Exception:
@@ -1,16 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
6
7
  from collections import defaultdict
7
8
  from copy import deepcopy
8
- from dataclasses import dataclass
9
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
10
 
11
+ import catboost
11
12
  import numpy as np
12
13
  import pandas as pd
13
- from lightgbm import LGBMClassifier, LGBMRegressor
14
+ from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
14
15
  from numpy import log1p
15
16
  from pandas.api.types import is_numeric_dtype
16
17
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -26,8 +27,11 @@ except ImportError:
26
27
  from sklearn.metrics._scorer import SCORERS
27
28
 
28
29
  available_scorers = SCORERS
30
+ from sklearn.metrics._regression import (
31
+ _check_reg_targets,
32
+ check_consistent_length,
33
+ )
29
34
  from sklearn.metrics import mean_squared_error
30
- from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
31
35
  from sklearn.model_selection import BaseCrossValidator
32
36
 
33
37
  from upgini.errors import ValidationError
@@ -84,73 +88,11 @@ CATBOOST_MULTICLASS_PARAMS = {
84
88
 
85
89
  LIGHTGBM_PARAMS = {
86
90
  "random_state": DEFAULT_RANDOM_STATE,
87
- # "num_leaves": 16,
88
- # "n_estimators": 150,
89
- # "min_child_weight": 1,
91
+ "num_leaves": 16,
90
92
  "max_depth": 4,
91
- "max_cat_threshold": 80,
92
- "min_data_per_group": 25,
93
- "num_boost_round": 150,
94
- "cat_l2": 10,
95
- "cat_smooth": 12,
96
- "learning_rate": 0.05,
97
- "feature_fraction": 1.0,
98
- "min_sum_hessian_in_leaf": 0.01,
99
- }
100
-
101
- LIGHTGBM_REGRESSION_PARAMS = {
102
- "random_state": DEFAULT_RANDOM_STATE,
103
- "deterministic": True,
104
- "min_gain_to_split": 0.001,
105
- "n_estimators": 275,
106
- "max_depth": 5,
107
- "max_cat_threshold": 80,
108
- "min_data_per_group": 25,
109
- "cat_l2": 10,
110
- "cat_smooth": 12,
111
- "learning_rate": 0.05,
112
- "feature_fraction": 1.0,
113
- "min_sum_hessian_in_leaf": 0.01,
114
- "objective": "huber",
115
- "verbosity": -1,
116
- }
117
-
118
- LIGHTGBM_MULTICLASS_PARAMS = {
119
- "random_state": DEFAULT_RANDOM_STATE,
120
- "deterministic": True,
121
- "min_gain_to_split": 0.001,
122
- "n_estimators": 275,
123
- "max_depth": 3,
124
- "max_cat_threshold": 80,
125
- "min_data_per_group": 25,
126
- "cat_l2": 10,
127
- "cat_smooth": 12,
128
- "learning_rate": 0.25, # CatBoost 0.25
129
- "min_sum_hessian_in_leaf": 0.01,
130
- "class_weight": "balanced", # TODO pass dict with weights for each class
131
- "objective": "multiclass",
132
- "use_quantized_grad": "true",
133
- "num_grad_quant_bins": "8",
134
- "stochastic_rounding": "true",
135
- "verbosity": -1,
136
- }
137
-
138
- LIGHTGBM_BINARY_PARAMS = {
139
- "random_state": DEFAULT_RANDOM_STATE,
140
- "deterministic": True,
141
- "min_gain_to_split": 0.001,
142
- "n_estimators": 275,
143
- "max_depth": 5,
144
- "max_cat_threshold": 80,
145
- "min_data_per_group": 25,
146
- "cat_l2": 10,
147
- "cat_smooth": 12,
93
+ "n_estimators": 150,
148
94
  "learning_rate": 0.05,
149
- "feature_fraction": 1.0,
150
- "min_sum_hessian_in_leaf": 0.01,
151
- "objective": "binary",
152
- "class_weight": "balanced", # TODO pass dict with weights for each class
153
- "verbosity": -1,
95
+ "min_child_weight": 1,
154
96
  }
155
97
 
156
98
  N_FOLDS = 5
@@ -269,15 +211,6 @@ SUPPORTED_CATBOOST_METRICS = {
269
211
  }
270
212
 
271
213
 
272
- def is_catboost_estimator(estimator):
273
- try:
274
- from catboost import CatBoostClassifier, CatBoostRegressor
275
-
276
- return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
277
- except ImportError:
278
- return False
279
-
280
-
281
214
  @dataclass
282
215
  class _CrossValResults:
283
216
  metric: Optional[float]
@@ -418,7 +351,7 @@ class EstimatorWrapper:
418
351
  if shaps is not None:
419
352
  for feature, shap_value in shaps.items():
420
353
  # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
421
- shap_values_all_folds[feature].append(shap_value)
354
+ shap_values_all_folds[feature].extend(shap_value.tolist())
422
355
 
423
356
  if shap_values_all_folds:
424
357
  average_shap_values = {
@@ -494,18 +427,21 @@ class EstimatorWrapper:
494
427
  }
495
428
  if estimator is None:
496
429
  params = {}
430
+ params["has_time"] = has_date
431
+ # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
432
+ # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
497
433
  if target_type == ModelTaskType.MULTICLASS:
498
- params = _get_add_params(params, LIGHTGBM_MULTICLASS_PARAMS)
434
+ params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
499
435
  params = _get_add_params(params, add_params)
500
- estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
436
+ estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
501
437
  elif target_type == ModelTaskType.BINARY:
502
- params = _get_add_params(params, LIGHTGBM_BINARY_PARAMS)
438
+ params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
503
439
  params = _get_add_params(params, add_params)
504
- estimator = LightGBMWrapper(LGBMClassifier(**params), **kwargs)
440
+ estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
505
441
  elif target_type == ModelTaskType.REGRESSION:
506
- params = _get_add_params(params, LIGHTGBM_REGRESSION_PARAMS)
442
+ params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
507
443
  params = _get_add_params(params, add_params)
508
- estimator = LightGBMWrapper(LGBMRegressor(**params), **kwargs)
444
+ estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
509
445
  else:
510
446
  raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
511
447
  else:
@@ -514,21 +450,31 @@ class EstimatorWrapper:
514
450
  else:
515
451
  estimator_copy = deepcopy(estimator)
516
452
  kwargs["estimator"] = estimator_copy
517
- if is_catboost_estimator(estimator):
453
+ if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
518
454
  if cat_features is not None:
519
455
  for cat_feature in cat_features:
520
456
  if cat_feature not in x.columns:
521
457
  logger.error(
522
458
  f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
523
459
  )
524
- estimator_copy.set_params(cat_features=cat_features, has_time=has_date)
460
+ estimator_copy.set_params(
461
+ # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
462
+ cat_features=cat_features
463
+ )
525
464
  estimator = CatBoostWrapper(**kwargs)
526
465
  else:
527
- if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
528
- estimator = LightGBMWrapper(**kwargs)
529
- elif is_catboost_estimator(estimator):
530
- estimator = CatBoostWrapper(**kwargs)
531
- else:
466
+ try:
467
+ from lightgbm import LGBMClassifier, LGBMRegressor
468
+
469
+ if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
470
+ estimator = LightGBMWrapper(**kwargs)
471
+ else:
472
+ logger.warning(
473
+ f"Unexpected estimator is used for metrics: {estimator}. "
474
+ "Default strategy for category features will be used"
475
+ )
476
+ estimator = OtherEstimatorWrapper(**kwargs)
477
+ except ModuleNotFoundError:
532
478
  logger.warning(
533
479
  f"Unexpected estimator is used for metrics: {estimator}. "
534
480
  "Default strategy for category features will be used"
@@ -541,7 +487,7 @@ class EstimatorWrapper:
541
487
  class CatBoostWrapper(EstimatorWrapper):
542
488
  def __init__(
543
489
  self,
544
- estimator,
490
+ estimator: Union[CatBoostClassifier, CatBoostRegressor],
545
491
  scorer: Callable,
546
492
  metric_name: str,
547
493
  multiplier: int,
@@ -571,9 +517,6 @@ class CatBoostWrapper(EstimatorWrapper):
571
517
  x, y, groups, params = super()._prepare_to_fit(x, y)
572
518
 
573
519
  # Find embeddings
574
- import catboost
575
- from catboost import CatBoostClassifier
576
-
577
520
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
578
521
  emb_pattern = r"(.+)_emb\d+"
579
522
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
@@ -694,10 +637,8 @@ class CatBoostWrapper(EstimatorWrapper):
694
637
  else:
695
638
  raise e
696
639
 
697
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
640
+ def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
698
641
  try:
699
- from catboost import Pool
700
-
701
642
  # Create Pool for fold data, if need (for example, when categorical features are present)
702
643
  fold_pool = Pool(
703
644
  x,
@@ -754,12 +695,12 @@ class LightGBMWrapper(EstimatorWrapper):
754
695
  self.cat_features = None
755
696
 
756
697
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
757
- x, y_numpy, groups, params = super()._prepare_to_fit(x, y)
698
+ x, y, groups, params = super()._prepare_to_fit(x, y)
758
699
  self.cat_features = _get_cat_features(x)
759
700
  x = fill_na_cat_features(x, self.cat_features)
760
701
  for feature in self.cat_features:
761
702
  x[feature] = x[feature].astype("category").cat.codes
762
- if not is_numeric_dtype(y_numpy):
703
+ if not is_numeric_dtype(y):
763
704
  y = correct_string_target(y)
764
705
 
765
706
  return x, y, groups, params
@@ -774,34 +715,6 @@ class LightGBMWrapper(EstimatorWrapper):
774
715
  y = correct_string_target(y)
775
716
  return x, y, params
776
717
 
777
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
778
- try:
779
- import lightgbm as lgb
780
- import shap
781
-
782
- if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
783
- return None
784
-
785
- explainer = shap.TreeExplainer(estimator)
786
-
787
- shap_values = explainer.shap_values(x)
788
-
789
- # For classification, shap_values is returned as a list for each class
790
- # Take values for the positive class
791
- if isinstance(shap_values, list):
792
- shap_values = shap_values[1]
793
-
794
- # Calculate mean absolute SHAP value for each feature
795
- feature_importance = {}
796
- for i, col in enumerate(x.columns):
797
- feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
798
-
799
- return feature_importance
800
-
801
- except Exception as e:
802
- self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
803
- return None
804
-
805
718
 
806
719
  class OtherEstimatorWrapper(EstimatorWrapper):
807
720
  def __init__(
@@ -80,7 +80,6 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
80
80
  postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
81
81
  multiple_search_key=Search key {} passed multiple times
82
82
  unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
83
- only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
84
83
  search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
85
84
  numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
86
85
  unsupported_search_key_type=Unsupported type of key in search_keys: {}
@@ -74,8 +74,6 @@ def remove_fintech_duplicates(
74
74
  # Checking for different dates by the same personal keys
75
75
  uniques = grouped_by_personal_cols[date_col].nunique()
76
76
  total = len(uniques)
77
- if total == 0:
78
- return segment_df, None
79
77
  diff_dates = len(uniques[uniques > 1])
80
78
  if diff_dates / total >= 0.6:
81
79
  return segment_df, None
@@ -90,8 +90,7 @@ class FeatureInfo:
90
90
  def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
91
91
  if data is not None and len(data) > 0 and feature_meta.name in data.columns:
92
92
  if len(data) > 3:
93
- rand = np.random.RandomState(42)
94
- feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
93
+ feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
95
94
  else:
96
95
  feature_sample = data[feature_meta.name].dropna().unique().tolist()
97
96
  if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
@@ -9,6 +9,7 @@ from traceback import format_exc
9
9
 
10
10
  import numpy as np
11
11
  import scipy.sparse as sp
12
+ from catboost import CatBoostClassifier, CatBoostRegressor
12
13
  from joblib import Parallel, logger
13
14
  from scipy.sparse import issparse
14
15
  from sklearn import config_context, get_config
@@ -341,14 +342,6 @@ def cross_validate(
341
342
  raise e
342
343
 
343
344
 
344
- def is_catboost_estimator(estimator):
345
- try:
346
- from catboost import CatBoostClassifier, CatBoostRegressor
347
- return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
348
- except ImportError:
349
- return False
350
-
351
-
352
345
  def _fit_and_score(
353
346
  estimator,
354
347
  X,
@@ -504,7 +497,7 @@ def _fit_and_score(
504
497
  if y_train is None:
505
498
  estimator.fit(X_train, **fit_params)
506
499
  else:
507
- if is_catboost_estimator(estimator):
500
+ if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
508
501
  fit_params = fit_params.copy()
509
502
  fit_params["eval_set"] = [(X_test, y_test)]
510
503
  estimator.fit(X_train, y_train, **fit_params)
@@ -39,6 +39,11 @@ def sort_columns(
39
39
  sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
40
40
  sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
41
41
 
42
+ duplicate_names = df.columns[df.columns.duplicated()].unique()
43
+ if len(duplicate_names) > 0:
44
+ logger.warning(f"WARNING: Found columns with duplicate names: {list(duplicate_names)}")
45
+ df = df[list(set(df.columns))]
46
+
42
47
  other_columns = sorted(
43
48
  [
44
49
  c
@@ -1 +0,0 @@
1
- __version__ = "1.2.68a3832.dev11"
File without changes
File without changes
File without changes