upgini 1.2.68a3832.dev11__py3-none-any.whl → 1.2.69__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +6 -4
- upgini/http.py +4 -6
- upgini/metrics.py +41 -128
- upgini/resource_bundle/strings.properties +0 -1
- upgini/utils/deduplicate_utils.py +0 -2
- upgini/utils/feature_info.py +1 -2
- upgini/utils/sklearn_ext.py +2 -9
- upgini/utils/sort.py +5 -0
- {upgini-1.2.68a3832.dev11.dist-info → upgini-1.2.69.dist-info}/METADATA +3 -4
- {upgini-1.2.68a3832.dev11.dist-info → upgini-1.2.69.dist-info}/RECORD +14 -14
- {upgini-1.2.68a3832.dev11.dist-info → upgini-1.2.69.dist-info}/WHEEL +0 -0
- {upgini-1.2.68a3832.dev11.dist-info → upgini-1.2.69.dist-info}/licenses/LICENSE +0 -0
upgini/__about__.py
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
__version__ = "1.2.68a3832.dev11"
|
|
1
|
+
__version__ = "1.2.69"
|
upgini/dataset.py
CHANGED
|
@@ -388,7 +388,7 @@ class Dataset: # (pd.DataFrame):
|
|
|
388
388
|
for col in columns_to_validate:
|
|
389
389
|
self.data[f"{col}_is_valid"] = ~self.data[col].isnull()
|
|
390
390
|
if validate_target and target is not None and col == target:
|
|
391
|
-
self.data.loc[self.data[target] == np.
|
|
391
|
+
self.data.loc[self.data[target] == np.Inf, f"{col}_is_valid"] = False
|
|
392
392
|
|
|
393
393
|
if col in mandatory_columns:
|
|
394
394
|
self.data["valid_mandatory"] = self.data["valid_mandatory"] & self.data[f"{col}_is_valid"]
|
upgini/features_enricher.py
CHANGED
|
@@ -3845,6 +3845,11 @@ if response.status_code == 200:
|
|
|
3845
3845
|
):
|
|
3846
3846
|
continue
|
|
3847
3847
|
|
|
3848
|
+
# Temporary workaround for duplicate features metadata
|
|
3849
|
+
if feature_meta.name in self.feature_names_:
|
|
3850
|
+
self.logger.warning(f"WARNING: Duplicate feature metadata: {feature_meta}")
|
|
3851
|
+
continue
|
|
3852
|
+
|
|
3848
3853
|
self.feature_names_.append(feature_meta.name)
|
|
3849
3854
|
self.feature_importances_.append(_round_shap_value(feature_meta.shap_value))
|
|
3850
3855
|
|
|
@@ -4070,10 +4075,7 @@ if response.status_code == 200:
|
|
|
4070
4075
|
)
|
|
4071
4076
|
|
|
4072
4077
|
if all(k == SearchKey.CUSTOM_KEY for k in valid_search_keys.values()):
|
|
4073
|
-
|
|
4074
|
-
msg = self.bundle.get("only_custom_keys")
|
|
4075
|
-
else:
|
|
4076
|
-
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4078
|
+
msg = self.bundle.get("unregistered_only_personal_keys")
|
|
4077
4079
|
self.logger.warning(msg + f" Provided search keys: {search_keys}")
|
|
4078
4080
|
raise ValidationError(msg)
|
|
4079
4081
|
|
upgini/http.py
CHANGED
|
@@ -342,7 +342,7 @@ class _RestClient:
|
|
|
342
342
|
else:
|
|
343
343
|
return self._syncronized_refresh_access_token()
|
|
344
344
|
|
|
345
|
-
def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True
|
|
345
|
+
def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
|
|
346
346
|
try:
|
|
347
347
|
return request()
|
|
348
348
|
except RequestException as e:
|
|
@@ -373,9 +373,8 @@ class _RestClient:
|
|
|
373
373
|
elif "more than one concurrent search request" in e.message.lower():
|
|
374
374
|
raise ValidationError(bundle.get("concurrent_request"))
|
|
375
375
|
else:
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
show_status_error()
|
|
376
|
+
print(e)
|
|
377
|
+
show_status_error()
|
|
379
378
|
raise e
|
|
380
379
|
|
|
381
380
|
@staticmethod
|
|
@@ -707,7 +706,6 @@ class _RestClient:
|
|
|
707
706
|
silent=True,
|
|
708
707
|
),
|
|
709
708
|
need_connection_retry=False,
|
|
710
|
-
silent=True,
|
|
711
709
|
)
|
|
712
710
|
except Exception:
|
|
713
711
|
self.send_log_event_unauth(log_event)
|
|
@@ -718,7 +716,7 @@ class _RestClient:
|
|
|
718
716
|
try:
|
|
719
717
|
requests.post(
|
|
720
718
|
url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
|
|
721
|
-
json=log_event.
|
|
719
|
+
json=log_event.dict(exclude_none=True),
|
|
722
720
|
headers=_RestClient._get_base_headers(content_type="application/json"),
|
|
723
721
|
)
|
|
724
722
|
except Exception:
|
upgini/metrics.py
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
6
7
|
from collections import defaultdict
|
|
7
8
|
from copy import deepcopy
|
|
8
|
-
from dataclasses import dataclass
|
|
9
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
10
10
|
|
|
11
|
+
import catboost
|
|
11
12
|
import numpy as np
|
|
12
13
|
import pandas as pd
|
|
13
|
-
from
|
|
14
|
+
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
|
|
14
15
|
from numpy import log1p
|
|
15
16
|
from pandas.api.types import is_numeric_dtype
|
|
16
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -26,8 +27,11 @@ except ImportError:
|
|
|
26
27
|
from sklearn.metrics._scorer import SCORERS
|
|
27
28
|
|
|
28
29
|
available_scorers = SCORERS
|
|
30
|
+
from sklearn.metrics._regression import (
|
|
31
|
+
_check_reg_targets,
|
|
32
|
+
check_consistent_length,
|
|
33
|
+
)
|
|
29
34
|
from sklearn.metrics import mean_squared_error
|
|
30
|
-
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
|
31
35
|
from sklearn.model_selection import BaseCrossValidator
|
|
32
36
|
|
|
33
37
|
from upgini.errors import ValidationError
|
|
@@ -84,73 +88,11 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
|
84
88
|
|
|
85
89
|
LIGHTGBM_PARAMS = {
|
|
86
90
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
87
|
-
|
|
88
|
-
# "n_estimators": 150,
|
|
89
|
-
# "min_child_weight": 1,
|
|
91
|
+
"num_leaves": 16,
|
|
90
92
|
"max_depth": 4,
|
|
91
|
-
"
|
|
92
|
-
"min_data_per_group": 25,
|
|
93
|
-
"num_boost_round": 150,
|
|
94
|
-
"cat_l2": 10,
|
|
95
|
-
"cat_smooth": 12,
|
|
96
|
-
"learning_rate": 0.05,
|
|
97
|
-
"feature_fraction": 1.0,
|
|
98
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
99
|
-
}
|
|
100
|
-
|
|
101
|
-
LIGHTGBM_REGRESSION_PARAMS = {
|
|
102
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
103
|
-
"deterministic": True,
|
|
104
|
-
"min_gain_to_split": 0.001,
|
|
105
|
-
"n_estimators": 275,
|
|
106
|
-
"max_depth": 5,
|
|
107
|
-
"max_cat_threshold": 80,
|
|
108
|
-
"min_data_per_group": 25,
|
|
109
|
-
"cat_l2": 10,
|
|
110
|
-
"cat_smooth": 12,
|
|
111
|
-
"learning_rate": 0.05,
|
|
112
|
-
"feature_fraction": 1.0,
|
|
113
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
114
|
-
"objective": "huber",
|
|
115
|
-
"verbosity": -1,
|
|
116
|
-
}
|
|
117
|
-
|
|
118
|
-
LIGHTGBM_MULTICLASS_PARAMS = {
|
|
119
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
120
|
-
"deterministic": True,
|
|
121
|
-
"min_gain_to_split": 0.001,
|
|
122
|
-
"n_estimators": 275,
|
|
123
|
-
"max_depth": 3,
|
|
124
|
-
"max_cat_threshold": 80,
|
|
125
|
-
"min_data_per_group": 25,
|
|
126
|
-
"cat_l2": 10,
|
|
127
|
-
"cat_smooth": 12,
|
|
128
|
-
"learning_rate": 0.25, # CatBoost 0.25
|
|
129
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
130
|
-
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
131
|
-
"objective": "multiclass",
|
|
132
|
-
"use_quantized_grad": "true",
|
|
133
|
-
"num_grad_quant_bins": "8",
|
|
134
|
-
"stochastic_rounding": "true",
|
|
135
|
-
"verbosity": -1,
|
|
136
|
-
}
|
|
137
|
-
|
|
138
|
-
LIGHTGBM_BINARY_PARAMS = {
|
|
139
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
140
|
-
"deterministic": True,
|
|
141
|
-
"min_gain_to_split": 0.001,
|
|
142
|
-
"n_estimators": 275,
|
|
143
|
-
"max_depth": 5,
|
|
144
|
-
"max_cat_threshold": 80,
|
|
145
|
-
"min_data_per_group": 25,
|
|
146
|
-
"cat_l2": 10,
|
|
147
|
-
"cat_smooth": 12,
|
|
93
|
+
"n_estimators": 150,
|
|
148
94
|
"learning_rate": 0.05,
|
|
149
|
-
"
|
|
150
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
151
|
-
"objective": "binary",
|
|
152
|
-
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
153
|
-
"verbosity": -1,
|
|
95
|
+
"min_child_weight": 1,
|
|
154
96
|
}
|
|
155
97
|
|
|
156
98
|
N_FOLDS = 5
|
|
@@ -269,15 +211,6 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
269
211
|
}
|
|
270
212
|
|
|
271
213
|
|
|
272
|
-
def is_catboost_estimator(estimator):
|
|
273
|
-
try:
|
|
274
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
275
|
-
|
|
276
|
-
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
277
|
-
except ImportError:
|
|
278
|
-
return False
|
|
279
|
-
|
|
280
|
-
|
|
281
214
|
@dataclass
|
|
282
215
|
class _CrossValResults:
|
|
283
216
|
metric: Optional[float]
|
|
@@ -418,7 +351,7 @@ class EstimatorWrapper:
|
|
|
418
351
|
if shaps is not None:
|
|
419
352
|
for feature, shap_value in shaps.items():
|
|
420
353
|
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
421
|
-
shap_values_all_folds[feature].
|
|
354
|
+
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
422
355
|
|
|
423
356
|
if shap_values_all_folds:
|
|
424
357
|
average_shap_values = {
|
|
@@ -494,18 +427,21 @@ class EstimatorWrapper:
|
|
|
494
427
|
}
|
|
495
428
|
if estimator is None:
|
|
496
429
|
params = {}
|
|
430
|
+
params["has_time"] = has_date
|
|
431
|
+
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
432
|
+
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
497
433
|
if target_type == ModelTaskType.MULTICLASS:
|
|
498
|
-
params = _get_add_params(params,
|
|
434
|
+
params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
|
|
499
435
|
params = _get_add_params(params, add_params)
|
|
500
|
-
estimator =
|
|
436
|
+
estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
501
437
|
elif target_type == ModelTaskType.BINARY:
|
|
502
|
-
params = _get_add_params(params,
|
|
438
|
+
params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
|
|
503
439
|
params = _get_add_params(params, add_params)
|
|
504
|
-
estimator =
|
|
440
|
+
estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
505
441
|
elif target_type == ModelTaskType.REGRESSION:
|
|
506
|
-
params = _get_add_params(params,
|
|
442
|
+
params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
|
|
507
443
|
params = _get_add_params(params, add_params)
|
|
508
|
-
estimator =
|
|
444
|
+
estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
|
|
509
445
|
else:
|
|
510
446
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
|
|
511
447
|
else:
|
|
@@ -514,21 +450,31 @@ class EstimatorWrapper:
|
|
|
514
450
|
else:
|
|
515
451
|
estimator_copy = deepcopy(estimator)
|
|
516
452
|
kwargs["estimator"] = estimator_copy
|
|
517
|
-
if is_catboost_estimator(estimator):
|
|
453
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
518
454
|
if cat_features is not None:
|
|
519
455
|
for cat_feature in cat_features:
|
|
520
456
|
if cat_feature not in x.columns:
|
|
521
457
|
logger.error(
|
|
522
458
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
523
459
|
)
|
|
524
|
-
estimator_copy.set_params(
|
|
460
|
+
estimator_copy.set_params(
|
|
461
|
+
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
462
|
+
cat_features=cat_features
|
|
463
|
+
)
|
|
525
464
|
estimator = CatBoostWrapper(**kwargs)
|
|
526
465
|
else:
|
|
527
|
-
|
|
528
|
-
|
|
529
|
-
|
|
530
|
-
estimator
|
|
531
|
-
|
|
466
|
+
try:
|
|
467
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
468
|
+
|
|
469
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
470
|
+
estimator = LightGBMWrapper(**kwargs)
|
|
471
|
+
else:
|
|
472
|
+
logger.warning(
|
|
473
|
+
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
474
|
+
"Default strategy for category features will be used"
|
|
475
|
+
)
|
|
476
|
+
estimator = OtherEstimatorWrapper(**kwargs)
|
|
477
|
+
except ModuleNotFoundError:
|
|
532
478
|
logger.warning(
|
|
533
479
|
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
534
480
|
"Default strategy for category features will be used"
|
|
@@ -541,7 +487,7 @@ class EstimatorWrapper:
|
|
|
541
487
|
class CatBoostWrapper(EstimatorWrapper):
|
|
542
488
|
def __init__(
|
|
543
489
|
self,
|
|
544
|
-
estimator,
|
|
490
|
+
estimator: Union[CatBoostClassifier, CatBoostRegressor],
|
|
545
491
|
scorer: Callable,
|
|
546
492
|
metric_name: str,
|
|
547
493
|
multiplier: int,
|
|
@@ -571,9 +517,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
571
517
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
572
518
|
|
|
573
519
|
# Find embeddings
|
|
574
|
-
import catboost
|
|
575
|
-
from catboost import CatBoostClassifier
|
|
576
|
-
|
|
577
520
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
578
521
|
emb_pattern = r"(.+)_emb\d+"
|
|
579
522
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
@@ -694,10 +637,8 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
694
637
|
else:
|
|
695
638
|
raise e
|
|
696
639
|
|
|
697
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
640
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
|
|
698
641
|
try:
|
|
699
|
-
from catboost import Pool
|
|
700
|
-
|
|
701
642
|
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
702
643
|
fold_pool = Pool(
|
|
703
644
|
x,
|
|
@@ -754,12 +695,12 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
754
695
|
self.cat_features = None
|
|
755
696
|
|
|
756
697
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
757
|
-
x,
|
|
698
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
758
699
|
self.cat_features = _get_cat_features(x)
|
|
759
700
|
x = fill_na_cat_features(x, self.cat_features)
|
|
760
701
|
for feature in self.cat_features:
|
|
761
702
|
x[feature] = x[feature].astype("category").cat.codes
|
|
762
|
-
if not is_numeric_dtype(
|
|
703
|
+
if not is_numeric_dtype(y):
|
|
763
704
|
y = correct_string_target(y)
|
|
764
705
|
|
|
765
706
|
return x, y, groups, params
|
|
@@ -774,34 +715,6 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
774
715
|
y = correct_string_target(y)
|
|
775
716
|
return x, y, params
|
|
776
717
|
|
|
777
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
778
|
-
try:
|
|
779
|
-
import lightgbm as lgb
|
|
780
|
-
import shap
|
|
781
|
-
|
|
782
|
-
if not isinstance(estimator, (lgb.LGBMRegressor, lgb.LGBMClassifier)):
|
|
783
|
-
return None
|
|
784
|
-
|
|
785
|
-
explainer = shap.TreeExplainer(estimator)
|
|
786
|
-
|
|
787
|
-
shap_values = explainer.shap_values(x)
|
|
788
|
-
|
|
789
|
-
# For classification, shap_values is returned as a list for each class
|
|
790
|
-
# Take values for the positive class
|
|
791
|
-
if isinstance(shap_values, list):
|
|
792
|
-
shap_values = shap_values[1]
|
|
793
|
-
|
|
794
|
-
# Calculate mean absolute SHAP value for each feature
|
|
795
|
-
feature_importance = {}
|
|
796
|
-
for i, col in enumerate(x.columns):
|
|
797
|
-
feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
|
798
|
-
|
|
799
|
-
return feature_importance
|
|
800
|
-
|
|
801
|
-
except Exception as e:
|
|
802
|
-
self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
|
|
803
|
-
return None
|
|
804
|
-
|
|
805
718
|
|
|
806
719
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
807
720
|
def __init__(
|
|
@@ -80,7 +80,6 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
|
|
|
80
80
|
postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
|
|
81
81
|
multiple_search_key=Search key {} passed multiple times
|
|
82
82
|
unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
|
|
83
|
-
only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
|
|
84
83
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
85
84
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
86
85
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
@@ -74,8 +74,6 @@ def remove_fintech_duplicates(
|
|
|
74
74
|
# Checking for different dates by the same personal keys
|
|
75
75
|
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
76
76
|
total = len(uniques)
|
|
77
|
-
if total == 0:
|
|
78
|
-
return segment_df, None
|
|
79
77
|
diff_dates = len(uniques[uniques > 1])
|
|
80
78
|
if diff_dates / total >= 0.6:
|
|
81
79
|
return segment_df, None
|
upgini/utils/feature_info.py
CHANGED
|
@@ -90,8 +90,7 @@ class FeatureInfo:
|
|
|
90
90
|
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
|
|
91
91
|
if data is not None and len(data) > 0 and feature_meta.name in data.columns:
|
|
92
92
|
if len(data) > 3:
|
|
93
|
-
|
|
94
|
-
feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
93
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
95
94
|
else:
|
|
96
95
|
feature_sample = data[feature_meta.name].dropna().unique().tolist()
|
|
97
96
|
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
upgini/utils/sklearn_ext.py
CHANGED
|
@@ -9,6 +9,7 @@ from traceback import format_exc
|
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
import scipy.sparse as sp
|
|
12
|
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
12
13
|
from joblib import Parallel, logger
|
|
13
14
|
from scipy.sparse import issparse
|
|
14
15
|
from sklearn import config_context, get_config
|
|
@@ -341,14 +342,6 @@ def cross_validate(
|
|
|
341
342
|
raise e
|
|
342
343
|
|
|
343
344
|
|
|
344
|
-
def is_catboost_estimator(estimator):
|
|
345
|
-
try:
|
|
346
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
347
|
-
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
348
|
-
except ImportError:
|
|
349
|
-
return False
|
|
350
|
-
|
|
351
|
-
|
|
352
345
|
def _fit_and_score(
|
|
353
346
|
estimator,
|
|
354
347
|
X,
|
|
@@ -504,7 +497,7 @@ def _fit_and_score(
|
|
|
504
497
|
if y_train is None:
|
|
505
498
|
estimator.fit(X_train, **fit_params)
|
|
506
499
|
else:
|
|
507
|
-
if is_catboost_estimator(estimator):
|
|
500
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
508
501
|
fit_params = fit_params.copy()
|
|
509
502
|
fit_params["eval_set"] = [(X_test, y_test)]
|
|
510
503
|
estimator.fit(X_train, y_train, **fit_params)
|
upgini/utils/sort.py
CHANGED
|
@@ -39,6 +39,11 @@ def sort_columns(
|
|
|
39
39
|
sorted_keys = sorted(search_keys.keys(), key=lambda x: str(search_keys.get(x)))
|
|
40
40
|
sorted_keys = [k for k in sorted_keys if k in df.columns and k not in exclude_columns]
|
|
41
41
|
|
|
42
|
+
duplicate_names = df.columns[df.columns.duplicated()].unique()
|
|
43
|
+
if len(duplicate_names) > 0:
|
|
44
|
+
logger.warning(f"WARNING: Found columns with duplicate names: {list(duplicate_names)}")
|
|
45
|
+
df = df[list(set(df.columns))]
|
|
46
|
+
|
|
42
47
|
other_columns = sorted(
|
|
43
48
|
[
|
|
44
49
|
c
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.68a3832.dev11
|
|
3
|
+
Version: 1.2.69
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -23,12 +23,12 @@ Classifier: Programming Language :: Python :: 3.10
|
|
|
23
23
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
24
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
25
|
Requires-Python: <3.12,>=3.8
|
|
26
|
+
Requires-Dist: catboost>=1.0.3
|
|
26
27
|
Requires-Dist: fastparquet>=0.8.1
|
|
27
28
|
Requires-Dist: ipywidgets>=8.1.0
|
|
28
29
|
Requires-Dist: jarowinkler>=2.0.0
|
|
29
30
|
Requires-Dist: levenshtein>=0.25.1
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist: numpy<3.0.0,>=1.19.0
|
|
31
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
32
32
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
33
33
|
Requires-Dist: psutil>=6.0.0
|
|
34
34
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
@@ -39,7 +39,6 @@ Requires-Dist: python-json-logger>=3.3.0
|
|
|
39
39
|
Requires-Dist: requests>=2.8.0
|
|
40
40
|
Requires-Dist: scikit-learn>=1.3.0
|
|
41
41
|
Requires-Dist: scipy>=1.10.0
|
|
42
|
-
Requires-Dist: shap>=0.44.0
|
|
43
42
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
44
43
|
Description-Content-Type: text/markdown
|
|
45
44
|
|
|
@@ -1,13 +1,13 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=b5n5Ah2b8KdU4qEsuokdYRRb9Cz2Tg3GOvmqydpG060,23
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=OGjpeFHbj3lWiZTOHTpWEoMMDmFY1FlNC44FKktoZvU,34956
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=J5U6nprU-oEGUM54fS1W6daG6j4C2xYJE1lx3p6lcBc,205601
|
|
7
|
+
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
9
|
upgini/metadata.py,sha256=Jh6YTaS00m_nbaOY_owvlSyn9zgkErkqu8iTr9ZjKI8,12279
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
10
|
+
upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
|
|
11
11
|
upgini/search_task.py,sha256=qxUxAD-bed-FpZYmTB_4orW7YJsW_O6a1TcgnZIRFr4,17307
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
@@ -38,7 +38,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
38
38
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
39
39
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
40
40
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
41
|
+
upgini/resource_bundle/strings.properties,sha256=3zctRNQDJ1STTvLUfryBT72wYeHYnrllV4rG1C3HtfI,27542
|
|
42
42
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
43
43
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
44
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -52,11 +52,11 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
|
|
|
52
52
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
53
53
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
54
54
|
upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
|
|
55
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
55
|
+
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
56
56
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
57
57
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
58
58
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
59
|
-
upgini/utils/feature_info.py,sha256=
|
|
59
|
+
upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
|
|
60
60
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
61
61
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
62
62
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
@@ -64,13 +64,13 @@ upgini/utils/mstats.py,sha256=dLJQr5Ak5BAoV-pDPpnfvMURZVkZ3_v250QzAsSlqY4,6286
|
|
|
64
64
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
65
65
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
66
66
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
67
|
-
upgini/utils/sklearn_ext.py,sha256=
|
|
68
|
-
upgini/utils/sort.py,sha256=
|
|
67
|
+
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
68
|
+
upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
|
|
69
69
|
upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
|
|
70
70
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
71
71
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
72
72
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
73
|
-
upgini-1.2.
|
|
74
|
-
upgini-1.2.
|
|
75
|
-
upgini-1.2.
|
|
76
|
-
upgini-1.2.
|
|
73
|
+
upgini-1.2.69.dist-info/METADATA,sha256=Z8doK3pmiKqcbPbXbG-JZwaqGwtAEsc6YJg8zfqb7cM,49113
|
|
74
|
+
upgini-1.2.69.dist-info/WHEEL,sha256=1yFddiXMmvYK7QYTqtRNtX66WJ0Mz8PYEiEUoOUUxRY,87
|
|
75
|
+
upgini-1.2.69.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
76
|
+
upgini-1.2.69.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|