upgini 1.2.70a3832.dev3__py3-none-any.whl → 1.2.71a3810.dev2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/autofe/date.py +15 -21
- upgini/autofe/feature.py +5 -1
- upgini/autofe/timeseries/base.py +1 -7
- upgini/autofe/timeseries/cross.py +21 -11
- upgini/autofe/timeseries/roll.py +2 -7
- upgini/autofe/timeseries/trend.py +2 -1
- upgini/autofe/unary.py +38 -1
- upgini/autofe/utils.py +83 -0
- upgini/dataset.py +9 -2
- upgini/features_enricher.py +265 -259
- upgini/http.py +4 -9
- upgini/metadata.py +4 -0
- upgini/metrics.py +48 -145
- upgini/resource_bundle/strings.properties +1 -1
- upgini/search_task.py +7 -1
- upgini/utils/deduplicate_utils.py +0 -2
- upgini/utils/feature_info.py +1 -2
- upgini/utils/mstats.py +1 -1
- upgini/utils/sklearn_ext.py +2 -9
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71a3810.dev2.dist-info}/METADATA +6 -8
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71a3810.dev2.dist-info}/RECORD +24 -23
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71a3810.dev2.dist-info}/WHEEL +1 -1
- {upgini-1.2.70a3832.dev3.dist-info → upgini-1.2.71a3810.dev2.dist-info}/licenses/LICENSE +0 -0
upgini/http.py
CHANGED
|
@@ -16,7 +16,6 @@ from typing import Any, Dict, List, Optional, Tuple
|
|
|
16
16
|
from urllib.parse import urljoin
|
|
17
17
|
|
|
18
18
|
import jwt
|
|
19
|
-
|
|
20
19
|
# import pandas as pd
|
|
21
20
|
import requests
|
|
22
21
|
from pydantic import BaseModel
|
|
@@ -343,9 +342,7 @@ class _RestClient:
|
|
|
343
342
|
else:
|
|
344
343
|
return self._syncronized_refresh_access_token()
|
|
345
344
|
|
|
346
|
-
def _with_unauth_retry(
|
|
347
|
-
self, request, try_number: int = 0, need_connection_retry: bool = True, silent: bool = False
|
|
348
|
-
):
|
|
345
|
+
def _with_unauth_retry(self, request, try_number: int = 0, need_connection_retry: bool = True):
|
|
349
346
|
try:
|
|
350
347
|
return request()
|
|
351
348
|
except RequestException as e:
|
|
@@ -376,9 +373,8 @@ class _RestClient:
|
|
|
376
373
|
elif "more than one concurrent search request" in e.message.lower():
|
|
377
374
|
raise ValidationError(bundle.get("concurrent_request"))
|
|
378
375
|
else:
|
|
379
|
-
|
|
380
|
-
|
|
381
|
-
show_status_error()
|
|
376
|
+
print(e)
|
|
377
|
+
show_status_error()
|
|
382
378
|
raise e
|
|
383
379
|
|
|
384
380
|
@staticmethod
|
|
@@ -710,7 +706,6 @@ class _RestClient:
|
|
|
710
706
|
silent=True,
|
|
711
707
|
),
|
|
712
708
|
need_connection_retry=False,
|
|
713
|
-
silent=True,
|
|
714
709
|
)
|
|
715
710
|
except Exception:
|
|
716
711
|
self.send_log_event_unauth(log_event)
|
|
@@ -721,7 +716,7 @@ class _RestClient:
|
|
|
721
716
|
try:
|
|
722
717
|
requests.post(
|
|
723
718
|
url=urljoin(_RestClient.PROD_BACKEND_URL, api_path),
|
|
724
|
-
json=log_event.
|
|
719
|
+
json=log_event.dict(exclude_none=True),
|
|
725
720
|
headers=_RestClient._get_base_headers(content_type="application/json"),
|
|
726
721
|
)
|
|
727
722
|
except Exception:
|
upgini/metadata.py
CHANGED
|
@@ -325,6 +325,10 @@ class RuntimeParameters(BaseModel):
|
|
|
325
325
|
properties: Dict[str, Any] = {}
|
|
326
326
|
|
|
327
327
|
|
|
328
|
+
class AutoFEParameters(BaseModel):
|
|
329
|
+
ts_gap_days: Optional[int] = None
|
|
330
|
+
|
|
331
|
+
|
|
328
332
|
class SearchCustomization(BaseModel):
|
|
329
333
|
featuresFilter: Optional[FeaturesFilter] = None
|
|
330
334
|
extractFeatures: Optional[bool] = None
|
upgini/metrics.py
CHANGED
|
@@ -1,17 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
6
|
-
import warnings
|
|
7
7
|
from collections import defaultdict
|
|
8
8
|
from copy import deepcopy
|
|
9
|
-
from dataclasses import dataclass
|
|
10
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
11
10
|
|
|
11
|
+
import catboost
|
|
12
12
|
import numpy as np
|
|
13
13
|
import pandas as pd
|
|
14
|
-
from
|
|
14
|
+
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
|
|
15
15
|
from numpy import log1p
|
|
16
16
|
from pandas.api.types import is_numeric_dtype
|
|
17
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -27,8 +27,11 @@ except ImportError:
|
|
|
27
27
|
from sklearn.metrics._scorer import SCORERS
|
|
28
28
|
|
|
29
29
|
available_scorers = SCORERS
|
|
30
|
+
from sklearn.metrics._regression import (
|
|
31
|
+
_check_reg_targets,
|
|
32
|
+
check_consistent_length,
|
|
33
|
+
)
|
|
30
34
|
from sklearn.metrics import mean_squared_error
|
|
31
|
-
from sklearn.metrics._regression import _check_reg_targets, check_consistent_length
|
|
32
35
|
from sklearn.model_selection import BaseCrossValidator
|
|
33
36
|
|
|
34
37
|
from upgini.errors import ValidationError
|
|
@@ -85,73 +88,11 @@ CATBOOST_MULTICLASS_PARAMS = {
|
|
|
85
88
|
|
|
86
89
|
LIGHTGBM_PARAMS = {
|
|
87
90
|
"random_state": DEFAULT_RANDOM_STATE,
|
|
88
|
-
|
|
89
|
-
# "n_estimators": 150,
|
|
90
|
-
# "min_child_weight": 1,
|
|
91
|
+
"num_leaves": 16,
|
|
91
92
|
"max_depth": 4,
|
|
92
|
-
"
|
|
93
|
-
"min_data_per_group": 25,
|
|
94
|
-
"num_boost_round": 150,
|
|
95
|
-
"cat_l2": 10,
|
|
96
|
-
"cat_smooth": 12,
|
|
97
|
-
"learning_rate": 0.05,
|
|
98
|
-
"feature_fraction": 1.0,
|
|
99
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
100
|
-
}
|
|
101
|
-
|
|
102
|
-
LIGHTGBM_REGRESSION_PARAMS = {
|
|
103
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
104
|
-
"deterministic": True,
|
|
105
|
-
"min_gain_to_split": 0.001,
|
|
106
|
-
"n_estimators": 275,
|
|
107
|
-
"max_depth": 5,
|
|
108
|
-
"max_cat_threshold": 80,
|
|
109
|
-
"min_data_per_group": 25,
|
|
110
|
-
"cat_l2": 10,
|
|
111
|
-
"cat_smooth": 12,
|
|
93
|
+
"n_estimators": 150,
|
|
112
94
|
"learning_rate": 0.05,
|
|
113
|
-
"
|
|
114
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
115
|
-
"objective": "huber",
|
|
116
|
-
"verbosity": -1,
|
|
117
|
-
}
|
|
118
|
-
|
|
119
|
-
LIGHTGBM_MULTICLASS_PARAMS = {
|
|
120
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
121
|
-
"deterministic": True,
|
|
122
|
-
"min_gain_to_split": 0.001,
|
|
123
|
-
"n_estimators": 275,
|
|
124
|
-
"max_depth": 3,
|
|
125
|
-
"max_cat_threshold": 80,
|
|
126
|
-
"min_data_per_group": 25,
|
|
127
|
-
"cat_l2": 10,
|
|
128
|
-
"cat_smooth": 12,
|
|
129
|
-
"learning_rate": 0.25, # CatBoost 0.25
|
|
130
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
131
|
-
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
132
|
-
"objective": "multiclass",
|
|
133
|
-
"use_quantized_grad": "true",
|
|
134
|
-
"num_grad_quant_bins": "8",
|
|
135
|
-
"stochastic_rounding": "true",
|
|
136
|
-
"verbosity": -1,
|
|
137
|
-
}
|
|
138
|
-
|
|
139
|
-
LIGHTGBM_BINARY_PARAMS = {
|
|
140
|
-
"random_state": DEFAULT_RANDOM_STATE,
|
|
141
|
-
"deterministic": True,
|
|
142
|
-
"min_gain_to_split": 0.001,
|
|
143
|
-
"n_estimators": 275,
|
|
144
|
-
"max_depth": 5,
|
|
145
|
-
"max_cat_threshold": 80,
|
|
146
|
-
"min_data_per_group": 25,
|
|
147
|
-
"cat_l2": 10,
|
|
148
|
-
"cat_smooth": 12,
|
|
149
|
-
"learning_rate": 0.05,
|
|
150
|
-
"feature_fraction": 1.0,
|
|
151
|
-
"min_sum_hessian_in_leaf": 0.01,
|
|
152
|
-
"objective": "binary",
|
|
153
|
-
"class_weight": "balanced", # TODO pass dict with weights for each class
|
|
154
|
-
"verbosity": -1,
|
|
95
|
+
"min_child_weight": 1,
|
|
155
96
|
}
|
|
156
97
|
|
|
157
98
|
N_FOLDS = 5
|
|
@@ -270,15 +211,6 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
270
211
|
}
|
|
271
212
|
|
|
272
213
|
|
|
273
|
-
def is_catboost_estimator(estimator):
|
|
274
|
-
try:
|
|
275
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
276
|
-
|
|
277
|
-
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
278
|
-
except ImportError:
|
|
279
|
-
return False
|
|
280
|
-
|
|
281
|
-
|
|
282
214
|
@dataclass
|
|
283
215
|
class _CrossValResults:
|
|
284
216
|
metric: Optional[float]
|
|
@@ -360,7 +292,7 @@ class EstimatorWrapper:
|
|
|
360
292
|
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
361
293
|
return x, y, groups
|
|
362
294
|
|
|
363
|
-
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame,
|
|
295
|
+
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
364
296
|
joined = pd.concat([x, y], axis=1)
|
|
365
297
|
joined = joined[joined[y.name].notna()]
|
|
366
298
|
joined = joined.reset_index(drop=True)
|
|
@@ -414,15 +346,12 @@ class EstimatorWrapper:
|
|
|
414
346
|
for estimator, split in zip(self.cv_estimators, splits):
|
|
415
347
|
_, validation_idx = split
|
|
416
348
|
cv_x = x.iloc[validation_idx]
|
|
417
|
-
|
|
418
|
-
cv_y = y.iloc[validation_idx]
|
|
419
|
-
else:
|
|
420
|
-
cv_y = y[validation_idx]
|
|
349
|
+
cv_y = y[validation_idx]
|
|
421
350
|
shaps = self.calculate_shap(cv_x, cv_y, estimator)
|
|
422
351
|
if shaps is not None:
|
|
423
352
|
for feature, shap_value in shaps.items():
|
|
424
353
|
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
425
|
-
shap_values_all_folds[feature].
|
|
354
|
+
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
426
355
|
|
|
427
356
|
if shap_values_all_folds:
|
|
428
357
|
average_shap_values = {
|
|
@@ -498,18 +427,21 @@ class EstimatorWrapper:
|
|
|
498
427
|
}
|
|
499
428
|
if estimator is None:
|
|
500
429
|
params = {}
|
|
430
|
+
params["has_time"] = has_date
|
|
431
|
+
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
432
|
+
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
501
433
|
if target_type == ModelTaskType.MULTICLASS:
|
|
502
|
-
params = _get_add_params(params,
|
|
434
|
+
params = _get_add_params(params, CATBOOST_MULTICLASS_PARAMS)
|
|
503
435
|
params = _get_add_params(params, add_params)
|
|
504
|
-
estimator =
|
|
436
|
+
estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
505
437
|
elif target_type == ModelTaskType.BINARY:
|
|
506
|
-
params = _get_add_params(params,
|
|
438
|
+
params = _get_add_params(params, CATBOOST_BINARY_PARAMS)
|
|
507
439
|
params = _get_add_params(params, add_params)
|
|
508
|
-
estimator =
|
|
440
|
+
estimator = CatBoostWrapper(CatBoostClassifier(**params), **kwargs)
|
|
509
441
|
elif target_type == ModelTaskType.REGRESSION:
|
|
510
|
-
params = _get_add_params(params,
|
|
442
|
+
params = _get_add_params(params, CATBOOST_REGRESSION_PARAMS)
|
|
511
443
|
params = _get_add_params(params, add_params)
|
|
512
|
-
estimator =
|
|
444
|
+
estimator = CatBoostWrapper(CatBoostRegressor(**params), **kwargs)
|
|
513
445
|
else:
|
|
514
446
|
raise Exception(bundle.get("metrics_unsupported_target_type").format(target_type))
|
|
515
447
|
else:
|
|
@@ -518,21 +450,31 @@ class EstimatorWrapper:
|
|
|
518
450
|
else:
|
|
519
451
|
estimator_copy = deepcopy(estimator)
|
|
520
452
|
kwargs["estimator"] = estimator_copy
|
|
521
|
-
if
|
|
453
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
522
454
|
if cat_features is not None:
|
|
523
455
|
for cat_feature in cat_features:
|
|
524
456
|
if cat_feature not in x.columns:
|
|
525
457
|
logger.error(
|
|
526
458
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
527
459
|
)
|
|
528
|
-
estimator_copy.set_params(
|
|
460
|
+
estimator_copy.set_params(
|
|
461
|
+
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
462
|
+
cat_features=cat_features
|
|
463
|
+
)
|
|
529
464
|
estimator = CatBoostWrapper(**kwargs)
|
|
530
465
|
else:
|
|
531
|
-
|
|
532
|
-
|
|
533
|
-
|
|
534
|
-
estimator
|
|
535
|
-
|
|
466
|
+
try:
|
|
467
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
468
|
+
|
|
469
|
+
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
470
|
+
estimator = LightGBMWrapper(**kwargs)
|
|
471
|
+
else:
|
|
472
|
+
logger.warning(
|
|
473
|
+
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
474
|
+
"Default strategy for category features will be used"
|
|
475
|
+
)
|
|
476
|
+
estimator = OtherEstimatorWrapper(**kwargs)
|
|
477
|
+
except ModuleNotFoundError:
|
|
536
478
|
logger.warning(
|
|
537
479
|
f"Unexpected estimator is used for metrics: {estimator}. "
|
|
538
480
|
"Default strategy for category features will be used"
|
|
@@ -545,7 +487,7 @@ class EstimatorWrapper:
|
|
|
545
487
|
class CatBoostWrapper(EstimatorWrapper):
|
|
546
488
|
def __init__(
|
|
547
489
|
self,
|
|
548
|
-
estimator,
|
|
490
|
+
estimator: Union[CatBoostClassifier, CatBoostRegressor],
|
|
549
491
|
scorer: Callable,
|
|
550
492
|
metric_name: str,
|
|
551
493
|
multiplier: int,
|
|
@@ -575,9 +517,6 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
575
517
|
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
576
518
|
|
|
577
519
|
# Find embeddings
|
|
578
|
-
import catboost
|
|
579
|
-
from catboost import CatBoostClassifier
|
|
580
|
-
|
|
581
520
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
582
521
|
emb_pattern = r"(.+)_emb\d+"
|
|
583
522
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
@@ -698,10 +637,8 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
698
637
|
else:
|
|
699
638
|
raise e
|
|
700
639
|
|
|
701
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
640
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
|
|
702
641
|
try:
|
|
703
|
-
from catboost import Pool
|
|
704
|
-
|
|
705
642
|
# Create Pool for fold data, if need (for example, when categorical features are present)
|
|
706
643
|
fold_pool = Pool(
|
|
707
644
|
x,
|
|
@@ -758,59 +695,25 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
758
695
|
self.cat_features = None
|
|
759
696
|
|
|
760
697
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
|
|
761
|
-
x,
|
|
698
|
+
x, y, groups, params = super()._prepare_to_fit(x, y)
|
|
762
699
|
self.cat_features = _get_cat_features(x)
|
|
763
700
|
x = fill_na_cat_features(x, self.cat_features)
|
|
764
701
|
for feature in self.cat_features:
|
|
765
702
|
x[feature] = x[feature].astype("category").cat.codes
|
|
766
|
-
if not is_numeric_dtype(
|
|
767
|
-
|
|
703
|
+
if not is_numeric_dtype(y):
|
|
704
|
+
y = correct_string_target(y)
|
|
768
705
|
|
|
769
|
-
return x,
|
|
706
|
+
return x, y, groups, params
|
|
770
707
|
|
|
771
708
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
772
|
-
x,
|
|
709
|
+
x, y, params = super()._prepare_to_calculate(x, y)
|
|
773
710
|
if self.cat_features is not None:
|
|
774
711
|
x = fill_na_cat_features(x, self.cat_features)
|
|
775
712
|
for feature in self.cat_features:
|
|
776
713
|
x[feature] = x[feature].astype("category").cat.codes
|
|
777
714
|
if not is_numeric_dtype(y):
|
|
778
|
-
|
|
779
|
-
return x,
|
|
780
|
-
|
|
781
|
-
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
|
|
782
|
-
try:
|
|
783
|
-
# Suppress specific warning from SHAP for LightGBM binary classifier
|
|
784
|
-
warnings.filterwarnings(
|
|
785
|
-
"ignore",
|
|
786
|
-
message=(
|
|
787
|
-
"LightGBM binary classifier with TreeExplainer shap values output has changed to a list of ndarray"
|
|
788
|
-
),
|
|
789
|
-
)
|
|
790
|
-
from shap import TreeExplainer
|
|
791
|
-
|
|
792
|
-
if not isinstance(estimator, (LGBMRegressor, LGBMClassifier)):
|
|
793
|
-
return None
|
|
794
|
-
|
|
795
|
-
explainer = TreeExplainer(estimator)
|
|
796
|
-
|
|
797
|
-
shap_values = explainer.shap_values(x)
|
|
798
|
-
|
|
799
|
-
# For classification, shap_values is returned as a list for each class
|
|
800
|
-
# Take values for the positive class
|
|
801
|
-
if isinstance(shap_values, list):
|
|
802
|
-
shap_values = shap_values[1]
|
|
803
|
-
|
|
804
|
-
# Calculate mean absolute SHAP value for each feature
|
|
805
|
-
feature_importance = {}
|
|
806
|
-
for i, col in enumerate(x.columns):
|
|
807
|
-
feature_importance[col] = np.mean(np.abs(shap_values[:, i]))
|
|
808
|
-
|
|
809
|
-
return feature_importance
|
|
810
|
-
|
|
811
|
-
except Exception as e:
|
|
812
|
-
self.logger.warning(f"Failed to calculate SHAP values: {str(e)}")
|
|
813
|
-
return None
|
|
715
|
+
y = correct_string_target(y)
|
|
716
|
+
return x, y, params
|
|
814
717
|
|
|
815
718
|
|
|
816
719
|
class OtherEstimatorWrapper(EstimatorWrapper):
|
|
@@ -80,7 +80,6 @@ email_and_hem_simultanious=EMAIL and HEM search keys cannot be used simultaneous
|
|
|
80
80
|
postal_code_without_country=COUNTRY search key required if POSTAL_CODE is present
|
|
81
81
|
multiple_search_key=Search key {} passed multiple times
|
|
82
82
|
unregistered_only_personal_keys=Only personal search keys used. Api_key from profile.upgini.com required for EMAIL/HEM, PHONE NUMBER or IPv4/IPv6 search keys\nSee docs https://github.com/upgini/upgini#-open-up-all-capabilities-of-upgini
|
|
83
|
-
only_custom_keys=Only CUSTOM_KEY search keys were provided. At least one of DATE, COUNTRY, POSTAL_CODE, PHONE, EMAIL, HEM, IP should be provided
|
|
84
83
|
search_key_not_found=Column `{}` from search_keys was not found in X dataframe: {}
|
|
85
84
|
numeric_search_key_not_found=Index {} in search_keys is out of bounds for {} columns of X dataframe
|
|
86
85
|
unsupported_search_key_type=Unsupported type of key in search_keys: {}
|
|
@@ -137,6 +136,7 @@ x_and_eval_x_diff_types=X and eval_set X has different types: {} and {}
|
|
|
137
136
|
baseline_score_column_not_exists=baseline_score_column {} doesn't exist in input dataframe
|
|
138
137
|
baseline_score_column_has_na=baseline_score_column contains NaN. Clear it and and retry
|
|
139
138
|
missing_features_for_transform=Missing some features for transform that were presented on fit: {}
|
|
139
|
+
missing_target_for_transform=Search contains features on target. Please add y to the call and try again
|
|
140
140
|
missing_id_column=Id column {} not found in X
|
|
141
141
|
# target validation
|
|
142
142
|
empty_target=Target is empty in all rows
|
upgini/search_task.py
CHANGED
|
@@ -168,7 +168,13 @@ class SearchTask:
|
|
|
168
168
|
for meta in self.provider_metadata_v2:
|
|
169
169
|
if meta.features_used_for_embeddings is not None:
|
|
170
170
|
features_for_transform.update(meta.features_used_for_embeddings)
|
|
171
|
-
|
|
171
|
+
if meta.generated_features:
|
|
172
|
+
features_for_transform.update(
|
|
173
|
+
c.original_name
|
|
174
|
+
for f in meta.generated_features
|
|
175
|
+
for c in f.base_columns
|
|
176
|
+
if c.ads_definition_id is None
|
|
177
|
+
)
|
|
172
178
|
return list(features_for_transform)
|
|
173
179
|
|
|
174
180
|
def get_shuffle_kfold(self) -> Optional[bool]:
|
|
@@ -74,8 +74,6 @@ def remove_fintech_duplicates(
|
|
|
74
74
|
# Checking for different dates by the same personal keys
|
|
75
75
|
uniques = grouped_by_personal_cols[date_col].nunique()
|
|
76
76
|
total = len(uniques)
|
|
77
|
-
if total == 0:
|
|
78
|
-
return segment_df, None
|
|
79
77
|
diff_dates = len(uniques[uniques > 1])
|
|
80
78
|
if diff_dates / total >= 0.6:
|
|
81
79
|
return segment_df, None
|
upgini/utils/feature_info.py
CHANGED
|
@@ -90,8 +90,7 @@ class FeatureInfo:
|
|
|
90
90
|
def _get_feature_sample(feature_meta: FeaturesMetadataV2, data: Optional[pd.DataFrame]) -> str:
|
|
91
91
|
if data is not None and len(data) > 0 and feature_meta.name in data.columns:
|
|
92
92
|
if len(data) > 3:
|
|
93
|
-
|
|
94
|
-
feature_sample = rand.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
93
|
+
feature_sample = np.random.choice(data[feature_meta.name].dropna().unique(), 3).tolist()
|
|
95
94
|
else:
|
|
96
95
|
feature_sample = data[feature_meta.name].dropna().unique().tolist()
|
|
97
96
|
if len(feature_sample) > 0 and isinstance(feature_sample[0], float):
|
upgini/utils/mstats.py
CHANGED
|
@@ -118,7 +118,7 @@ def spearmanr(
|
|
|
118
118
|
# - dof: degrees of freedom
|
|
119
119
|
# - t_stat: t-statistic
|
|
120
120
|
# - alternative: 'two-sided', 'greater', 'less'
|
|
121
|
-
def compute_t_pvalue(t_stat, dof, alternative=
|
|
121
|
+
def compute_t_pvalue(t_stat, dof, alternative="two-sided"):
|
|
122
122
|
from scipy.stats import t
|
|
123
123
|
|
|
124
124
|
if alternative == "two-sided":
|
upgini/utils/sklearn_ext.py
CHANGED
|
@@ -9,6 +9,7 @@ from traceback import format_exc
|
|
|
9
9
|
|
|
10
10
|
import numpy as np
|
|
11
11
|
import scipy.sparse as sp
|
|
12
|
+
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
12
13
|
from joblib import Parallel, logger
|
|
13
14
|
from scipy.sparse import issparse
|
|
14
15
|
from sklearn import config_context, get_config
|
|
@@ -341,14 +342,6 @@ def cross_validate(
|
|
|
341
342
|
raise e
|
|
342
343
|
|
|
343
344
|
|
|
344
|
-
def is_catboost_estimator(estimator):
|
|
345
|
-
try:
|
|
346
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
347
|
-
return isinstance(estimator, (CatBoostClassifier, CatBoostRegressor))
|
|
348
|
-
except ImportError:
|
|
349
|
-
return False
|
|
350
|
-
|
|
351
|
-
|
|
352
345
|
def _fit_and_score(
|
|
353
346
|
estimator,
|
|
354
347
|
X,
|
|
@@ -504,7 +497,7 @@ def _fit_and_score(
|
|
|
504
497
|
if y_train is None:
|
|
505
498
|
estimator.fit(X_train, **fit_params)
|
|
506
499
|
else:
|
|
507
|
-
if
|
|
500
|
+
if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
|
|
508
501
|
fit_params = fit_params.copy()
|
|
509
502
|
fit_params["eval_set"] = [(X_test, y_test)]
|
|
510
503
|
estimator.fit(X_train, y_train, **fit_params)
|
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.3
|
|
2
2
|
Name: upgini
|
|
3
|
-
Version: 1.2.
|
|
3
|
+
Version: 1.2.71a3810.dev2
|
|
4
4
|
Summary: Intelligent data search & enrichment for Machine Learning
|
|
5
5
|
Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
|
|
6
6
|
Project-URL: Homepage, https://upgini.com/
|
|
@@ -17,20 +17,19 @@ Classifier: Intended Audience :: Science/Research
|
|
|
17
17
|
Classifier: Intended Audience :: Telecommunications Industry
|
|
18
18
|
Classifier: License :: OSI Approved :: BSD License
|
|
19
19
|
Classifier: Operating System :: OS Independent
|
|
20
|
-
Classifier: Programming Language :: Python :: 3.8
|
|
21
|
-
Classifier: Programming Language :: Python :: 3.9
|
|
22
20
|
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
23
22
|
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
24
23
|
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
25
|
-
Requires-Python: <3.12,>=3.
|
|
24
|
+
Requires-Python: <3.12,>=3.10
|
|
25
|
+
Requires-Dist: catboost>=1.0.3
|
|
26
26
|
Requires-Dist: fastparquet>=0.8.1
|
|
27
27
|
Requires-Dist: ipywidgets>=8.1.0
|
|
28
28
|
Requires-Dist: jarowinkler>=2.0.0
|
|
29
29
|
Requires-Dist: levenshtein>=0.25.1
|
|
30
|
-
Requires-Dist:
|
|
31
|
-
Requires-Dist: numpy<3.0.0,>=1.19.0
|
|
30
|
+
Requires-Dist: numpy<=1.26.4,>=1.19.0
|
|
32
31
|
Requires-Dist: pandas<3.0.0,>=1.1.0
|
|
33
|
-
Requires-Dist: psutil>=
|
|
32
|
+
Requires-Dist: psutil>=6.0.0
|
|
34
33
|
Requires-Dist: pydantic<3.0.0,>1.0.0
|
|
35
34
|
Requires-Dist: pyjwt>=2.8.0
|
|
36
35
|
Requires-Dist: python-bidi==0.4.2
|
|
@@ -39,7 +38,6 @@ Requires-Dist: python-json-logger>=3.3.0
|
|
|
39
38
|
Requires-Dist: requests>=2.8.0
|
|
40
39
|
Requires-Dist: scikit-learn>=1.3.0
|
|
41
40
|
Requires-Dist: scipy>=1.10.0
|
|
42
|
-
Requires-Dist: shap>=0.44.0
|
|
43
41
|
Requires-Dist: xhtml2pdf<0.3.0,>=0.2.11
|
|
44
42
|
Description-Content-Type: text/markdown
|
|
45
43
|
|
|
@@ -1,14 +1,14 @@
|
|
|
1
|
-
upgini/__about__.py,sha256=
|
|
1
|
+
upgini/__about__.py,sha256=MTGUBBTe5h0uDXYCCEi_Ls0ph00v8U1H8Ryg234maxU,33
|
|
2
2
|
upgini/__init__.py,sha256=LXSfTNU0HnlOkE69VCxkgIKDhWP-JFo_eBQ71OxTr5Y,261
|
|
3
3
|
upgini/ads.py,sha256=nvuRxRx5MHDMgPr9SiU-fsqRdFaBv8p4_v1oqiysKpc,2714
|
|
4
|
-
upgini/dataset.py,sha256=
|
|
4
|
+
upgini/dataset.py,sha256=nCPfkQIlAanLgCpcmsDfxFXmg99dRm9m0K_ibdLUr-4,35365
|
|
5
5
|
upgini/errors.py,sha256=2b_Wbo0OYhLUbrZqdLIx5jBnAsiD1Mcenh-VjR4HCTw,950
|
|
6
|
-
upgini/features_enricher.py,sha256=
|
|
7
|
-
upgini/http.py,sha256=
|
|
6
|
+
upgini/features_enricher.py,sha256=KqDQ29sU1Aty5Z40DDqO869Y_CClQfmU58nE9rScxRc,204434
|
|
7
|
+
upgini/http.py,sha256=ud0Cp7h0jNeHuuZGpU_1dAAEiabGoJjGxc1X5oeBQr4,43496
|
|
8
8
|
upgini/lazy_import.py,sha256=74gQ8JuA48BGRLxAo7lNHNKY2D2emMxrUxKGdxVGhuY,1012
|
|
9
|
-
upgini/metadata.py,sha256=
|
|
10
|
-
upgini/metrics.py,sha256=
|
|
11
|
-
upgini/search_task.py,sha256=
|
|
9
|
+
upgini/metadata.py,sha256=Yd6iW2f7Wz6vUkg5uvR4xylN16ANnCKVKqAsAkap7p8,12354
|
|
10
|
+
upgini/metrics.py,sha256=t7uOOnlDYvP6E3DLjPMQcFBjyhJfUQY8aUlx7N0Mh-s,35477
|
|
11
|
+
upgini/search_task.py,sha256=EuCGp0iCWz2fpuJgN6M47aP_CtIi3Oq9zw78w0mkKiU,17595
|
|
12
12
|
upgini/spinner.py,sha256=4iMd-eIe_BnkqFEMIliULTbj6rNI2HkN_VJ4qYe0cUc,1118
|
|
13
13
|
upgini/version_validator.py,sha256=DvbaAvuYFoJqYt0fitpsk6Xcv-H1BYDJYHUMxaKSH_Y,1509
|
|
14
14
|
upgini/ads_management/__init__.py,sha256=qzyisOToVRP-tquAJD1PblZhNtMrOB8FiyF9JvfkvgE,50
|
|
@@ -16,19 +16,20 @@ upgini/ads_management/ads_manager.py,sha256=igVbN2jz80Umb2BUJixmJVj-zx8unoKpecVo
|
|
|
16
16
|
upgini/autofe/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
upgini/autofe/all_operators.py,sha256=rdjF5eaE4bC6Q4eu_el5Z7ekYt8DjOFermz2bePPbUc,333
|
|
18
18
|
upgini/autofe/binary.py,sha256=MnQuFiERpocjCPQUjOljlsq5FE-04GPfwtNjzvfNMyU,7671
|
|
19
|
-
upgini/autofe/date.py,sha256=
|
|
20
|
-
upgini/autofe/feature.py,sha256=
|
|
19
|
+
upgini/autofe/date.py,sha256=C86F7sPiscUGq2a45UtQA9ADWBWg0kt54mePHHzjbLE,10633
|
|
20
|
+
upgini/autofe/feature.py,sha256=y1x3wijhTVBmloayQAHiscqKU9Ll8kLcGm1PdvS357I,14910
|
|
21
21
|
upgini/autofe/groupby.py,sha256=IYmQV9uoCdRcpkeWZj_kI3ObzoNCNx3ff3h8sTL01tk,3603
|
|
22
22
|
upgini/autofe/operator.py,sha256=EOffJw6vKXpEh5yymqb1RFNJPxGxmnHdFRo9dB5SCFo,4969
|
|
23
|
-
upgini/autofe/unary.py,sha256=
|
|
23
|
+
upgini/autofe/unary.py,sha256=_4F3ZyuPUz2nbkJFMJi2Dk5FirGZngUammstgK1Fq34,5720
|
|
24
|
+
upgini/autofe/utils.py,sha256=fK1am2_tQj3fL2vDslblye8lmyfWgGIUOX1beYVBz4k,2420
|
|
24
25
|
upgini/autofe/vector.py,sha256=l0KdKg-txlZxDSE4hPPfCtfGQofYbl7oaABPr830sPI,667
|
|
25
26
|
upgini/autofe/timeseries/__init__.py,sha256=PGwwDAMwvkXl3el12tXVEmZUgDUvlmIPlXtROm6bD18,738
|
|
26
|
-
upgini/autofe/timeseries/base.py,sha256=
|
|
27
|
-
upgini/autofe/timeseries/cross.py,sha256=
|
|
27
|
+
upgini/autofe/timeseries/base.py,sha256=MYK260n3h9kEbgunbyp0cpR0pgNHml3N2WDLGW5BLDU,3603
|
|
28
|
+
upgini/autofe/timeseries/cross.py,sha256=xpHHVITXYUK20BgEZlqKN1Uy2uxKnHz72gngjt7BxVE,5316
|
|
28
29
|
upgini/autofe/timeseries/delta.py,sha256=h0YhmI1TlPJnjwFpN_GQxLb6r59DQuucnG5tQAXSgjU,3520
|
|
29
30
|
upgini/autofe/timeseries/lag.py,sha256=LfQtg484vuqM0mgY4Wft1swHX_Srq7OKKgZswCXoiXI,1882
|
|
30
|
-
upgini/autofe/timeseries/roll.py,sha256=
|
|
31
|
-
upgini/autofe/timeseries/trend.py,sha256=
|
|
31
|
+
upgini/autofe/timeseries/roll.py,sha256=zADKXU-eYWQnQ5R3am1yEal8uU6Tm0jLAixwPb_aCHg,2794
|
|
32
|
+
upgini/autofe/timeseries/trend.py,sha256=K1_iw2ko_LIUU8YCUgrvN3n0MkHtsi7-63-8x9er1k4,2129
|
|
32
33
|
upgini/autofe/timeseries/volatility.py,sha256=9shUmIKjpWTHVYjj80YBsk0XheBJ9uBuLv5NW9Mchnk,7953
|
|
33
34
|
upgini/data_source/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
34
35
|
upgini/data_source/data_source_publisher.py,sha256=4S9qwlAklD8vg9tUU_c1pHE2_glUHAh15-wr5hMwKFw,22879
|
|
@@ -38,7 +39,7 @@ upgini/normalizer/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
|
|
|
38
39
|
upgini/normalizer/normalize_utils.py,sha256=Ft2MwSgVoBilXAORAOYAuwPD79GOLfwn4qQE3IUFzzg,7218
|
|
39
40
|
upgini/resource_bundle/__init__.py,sha256=S5F2G47pnJd2LDpmFsjDqEwiKkP8Hm-hcseDbMka6Ko,8345
|
|
40
41
|
upgini/resource_bundle/exceptions.py,sha256=5fRvx0_vWdE1-7HcSgF0tckB4A9AKyf5RiinZkInTsI,621
|
|
41
|
-
upgini/resource_bundle/strings.properties,sha256=
|
|
42
|
+
upgini/resource_bundle/strings.properties,sha256=LDT-jtYlrD1IXvWjFSf-dtvapje0qSrqI9W3v7y2zVo,27646
|
|
42
43
|
upgini/resource_bundle/strings_widget.properties,sha256=gOdqvZWntP2LCza_tyVk1_yRYcG4c04K9sQOAVhF_gw,1577
|
|
43
44
|
upgini/sampler/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
45
|
upgini/sampler/base.py,sha256=7GpjYqjOp58vYcJLiX__1R5wjUlyQbxvHJ2klFnup_M,6389
|
|
@@ -52,25 +53,25 @@ upgini/utils/country_utils.py,sha256=lY-eXWwFVegdVENFttbvLcgGDjFO17Sex8hd2PyJaRk
|
|
|
52
53
|
upgini/utils/custom_loss_utils.py,sha256=kieNZYBYZm5ZGBltF1F_jOSF4ea6C29rYuCyiDcqVNY,3857
|
|
53
54
|
upgini/utils/cv_utils.py,sha256=w6FQb9nO8BWDx88EF83NpjPLarK4eR4ia0Wg0kLBJC4,3525
|
|
54
55
|
upgini/utils/datetime_utils.py,sha256=_jq-kn_dGNFfs-DGXcWCGzy9bkplfAjrZ8SsmN28zXc,13535
|
|
55
|
-
upgini/utils/deduplicate_utils.py,sha256=
|
|
56
|
+
upgini/utils/deduplicate_utils.py,sha256=SMZx9IKIhWI5HqXepfKiQb3uDJrogQZtG6jcWuMo5Z4,8855
|
|
56
57
|
upgini/utils/display_utils.py,sha256=DsBjJ8jEYAh8BPgfAbzq5imoGFV6IACP20PQ78BQCX0,11964
|
|
57
58
|
upgini/utils/email_utils.py,sha256=pZ2vCfNxLIPUhxr0-OlABNXm12jjU44isBk8kGmqQzA,5277
|
|
58
59
|
upgini/utils/fallback_progress_bar.py,sha256=PDaKb8dYpVZaWMroNcOHsTc3pSjgi9mOm0--cOFTwJ0,1074
|
|
59
|
-
upgini/utils/feature_info.py,sha256=
|
|
60
|
+
upgini/utils/feature_info.py,sha256=m1tQcT3hTChPAiXzpk0WQcEqElj8KgeCifEJFa7-gss,7247
|
|
60
61
|
upgini/utils/features_validator.py,sha256=lEfmk4DoxZ4ooOE1HC0ZXtUb_lFKRFHIrnFULZ4_rL8,3746
|
|
61
62
|
upgini/utils/format.py,sha256=Yv5cvvSs2bOLUzzNu96Pu33VMDNbabio92QepUj41jU,243
|
|
62
63
|
upgini/utils/ip_utils.py,sha256=TSQ_qDsLlVnm09X1HacpabEf_HNqSWpxBF4Sdc2xs08,6580
|
|
63
|
-
upgini/utils/mstats.py,sha256=
|
|
64
|
+
upgini/utils/mstats.py,sha256=u3gQVUtDRbyrOQK6V1UJ2Rx1QbkSNYGjXa6m3Z_dPVs,6286
|
|
64
65
|
upgini/utils/phone_utils.py,sha256=IrbztLuOJBiePqqxllfABWfYlfAjYevPhXKipl95wUI,10432
|
|
65
66
|
upgini/utils/postal_code_utils.py,sha256=5M0sUqH2DAr33kARWCTXR-ACyzWbjDq_-0mmEml6ZcU,1716
|
|
66
67
|
upgini/utils/progress_bar.py,sha256=N-Sfdah2Hg8lXP_fV9EfUTXz_PyRt4lo9fAHoUDOoLc,1550
|
|
67
|
-
upgini/utils/sklearn_ext.py,sha256=
|
|
68
|
+
upgini/utils/sklearn_ext.py,sha256=13jQS_k7v0aUtudXV6nGUEWjttPQzAW9AFYL5wgEz9k,44511
|
|
68
69
|
upgini/utils/sort.py,sha256=H79A17NMoHtLbqLCPFx_MBUloLZcDKjOba_H4gCE3t8,6965
|
|
69
70
|
upgini/utils/target_utils.py,sha256=b1GzO8_gMcwXSZ2v98CY50MJJBzKbWHId_BJGybXfkM,16579
|
|
70
71
|
upgini/utils/track_info.py,sha256=G5Lu1xxakg2_TQjKZk4b5SvrHsATTXNVV3NbvWtT8k8,5663
|
|
71
72
|
upgini/utils/ts_utils.py,sha256=26vhC0pN7vLXK6R09EEkMK3Lwb9IVPH7LRdqFIQ3kPs,1383
|
|
72
73
|
upgini/utils/warning_counter.py,sha256=-GRY8EUggEBKODPSuXAkHn9KnEQwAORC0mmz_tim-PM,254
|
|
73
|
-
upgini-1.2.
|
|
74
|
-
upgini-1.2.
|
|
75
|
-
upgini-1.2.
|
|
76
|
-
upgini-1.2.
|
|
74
|
+
upgini-1.2.71a3810.dev2.dist-info/METADATA,sha256=KShCDNaZiUeH7OC7TETgJwx-UCZ9QWlaMcML-eZPJGY,49075
|
|
75
|
+
upgini-1.2.71a3810.dev2.dist-info/WHEEL,sha256=zEMcRr9Kr03x1ozGwg5v9NQBKn3kndp6LSoSlVg-jhU,87
|
|
76
|
+
upgini-1.2.71a3810.dev2.dist-info/licenses/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
|
|
77
|
+
upgini-1.2.71a3810.dev2.dist-info/RECORD,,
|
|
File without changes
|