upgini 1.1.267__tar.gz → 1.1.268__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {upgini-1.1.267/src/upgini.egg-info → upgini-1.1.268}/PKG-INFO +1 -1
- {upgini-1.1.267 → upgini-1.1.268}/setup.py +1 -1
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/features_enricher.py +1 -1
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/metrics.py +57 -7
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/sklearn_ext.py +1 -2
- {upgini-1.1.267 → upgini-1.1.268/src/upgini.egg-info}/PKG-INFO +1 -1
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_target_utils.py +6 -6
- {upgini-1.1.267 → upgini-1.1.268}/LICENSE +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/README.md +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/pyproject.toml +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/setup.cfg +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads_management/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads_management/ads_manager.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/all_operands.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/binary.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/date.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/feature.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/groupby.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/operand.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/unary.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/vector.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/data_source/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/data_source/data_source_publisher.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/dataset.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/errors.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/fingerprint.js +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/http.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/mdc/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/mdc/context.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/metadata.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/normalizer/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/normalizer/phone_normalizer.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/exceptions.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/strings.properties +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/strings_widget.properties +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/base.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/random_under_sampler.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/search_task.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/spinner.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/__init__.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/base_search_key_detector.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/blocked_time_series.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/country_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/custom_loss_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/cv_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/datetime_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/deduplicate_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/display_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/email_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/fallback_progress_bar.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/features_validator.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/format.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/ip_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/phone_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/postal_code_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/progress_bar.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/target_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/track_info.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/warning_counter.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini/version_validator.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/SOURCES.txt +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/dependency_links.txt +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/requires.txt +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/top_level.txt +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_autofe_operands.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_binary_dataset.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_blocked_time_series.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_categorical_dataset.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_continuous_dataset.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_country_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_custom_loss_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_datetime_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_email_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_etalon_validation.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_features_enricher.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_metrics.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_phone_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_postal_code_utils.py +0 -0
- {upgini-1.1.267 → upgini-1.1.268}/tests/test_widget.py +0 -0
|
@@ -3665,7 +3665,7 @@ class FeaturesEnricher(TransformerMixin):
|
|
|
3665
3665
|
if y is not None:
|
|
3666
3666
|
with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
|
|
3667
3667
|
pickle.dump(sample(y, xy_sample_index), y_file)
|
|
3668
|
-
if eval_set
|
|
3668
|
+
if eval_set:
|
|
3669
3669
|
eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
|
|
3670
3670
|
with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
|
|
3671
3671
|
pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
|
|
@@ -3,15 +3,16 @@ import re
|
|
|
3
3
|
from copy import deepcopy
|
|
4
4
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
5
5
|
|
|
6
|
+
import catboost
|
|
6
7
|
import numpy as np
|
|
7
8
|
import pandas as pd
|
|
8
9
|
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
9
|
-
import catboost
|
|
10
10
|
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
11
11
|
from numpy import log1p
|
|
12
12
|
from pandas.api.types import is_numeric_dtype
|
|
13
13
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
14
14
|
|
|
15
|
+
from upgini.utils.features_validator import FeaturesValidator
|
|
15
16
|
from upgini.utils.sklearn_ext import cross_validate
|
|
16
17
|
|
|
17
18
|
try:
|
|
@@ -352,6 +353,7 @@ class EstimatorWrapper:
|
|
|
352
353
|
"target_type": target_type,
|
|
353
354
|
"groups": groups,
|
|
354
355
|
"text_features": text_features,
|
|
356
|
+
"logger": logger,
|
|
355
357
|
}
|
|
356
358
|
if estimator is None:
|
|
357
359
|
params = dict()
|
|
@@ -414,12 +416,22 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
414
416
|
target_type: ModelTaskType,
|
|
415
417
|
groups: Optional[List[str]] = None,
|
|
416
418
|
text_features: Optional[List[str]] = None,
|
|
419
|
+
logger: Optional[logging.Logger] = None,
|
|
417
420
|
):
|
|
418
421
|
super(CatBoostWrapper, self).__init__(
|
|
419
|
-
estimator,
|
|
422
|
+
estimator,
|
|
423
|
+
scorer,
|
|
424
|
+
metric_name,
|
|
425
|
+
multiplier,
|
|
426
|
+
cv,
|
|
427
|
+
target_type,
|
|
428
|
+
groups=groups,
|
|
429
|
+
text_features=text_features,
|
|
430
|
+
logger=logger,
|
|
420
431
|
)
|
|
421
432
|
self.cat_features = None
|
|
422
433
|
self.emb_features = None
|
|
434
|
+
self.exclude_features = []
|
|
423
435
|
|
|
424
436
|
def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
425
437
|
X, y, groups, params = super()._prepare_to_fit(X, y)
|
|
@@ -437,9 +449,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
437
449
|
X, embedding_features = self.group_embeddings(X)
|
|
438
450
|
params["embedding_features"] = embedding_features
|
|
439
451
|
else:
|
|
440
|
-
self.logger.info(
|
|
441
|
-
f"Embedding features count less than 3, so use them separately: {self.emb_features}"
|
|
442
|
-
)
|
|
452
|
+
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
443
453
|
self.emb_features = []
|
|
444
454
|
else:
|
|
445
455
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
@@ -498,6 +508,8 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
498
508
|
return df, [emb_name]
|
|
499
509
|
|
|
500
510
|
def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
|
|
511
|
+
if self.exclude_features:
|
|
512
|
+
X = X.drop(columns=self.exclude_features)
|
|
501
513
|
X, y, params = super()._prepare_to_calculate(X, y)
|
|
502
514
|
if self.text_features:
|
|
503
515
|
params["text_features"] = self.text_features
|
|
@@ -510,6 +522,26 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
510
522
|
|
|
511
523
|
return X, y, params
|
|
512
524
|
|
|
525
|
+
def cross_val_predict(
|
|
526
|
+
self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
527
|
+
) -> Optional[float]:
|
|
528
|
+
try:
|
|
529
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
530
|
+
except Exception as e:
|
|
531
|
+
if "Dictionary size is 0" in e.args[0] and self.text_features:
|
|
532
|
+
high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
|
|
533
|
+
self.logger.warning(
|
|
534
|
+
"Failed to calculate metrics. Try to remove high cardinality"
|
|
535
|
+
f" text features {high_cardinality_features} and retry"
|
|
536
|
+
)
|
|
537
|
+
for f in high_cardinality_features:
|
|
538
|
+
self.text_features.remove(f)
|
|
539
|
+
self.exclude_features.append(f)
|
|
540
|
+
X = X.drop(columns=f)
|
|
541
|
+
return super().cross_val_predict(X, y, baseline_score_column)
|
|
542
|
+
else:
|
|
543
|
+
raise e
|
|
544
|
+
|
|
513
545
|
|
|
514
546
|
class LightGBMWrapper(EstimatorWrapper):
|
|
515
547
|
def __init__(
|
|
@@ -522,9 +554,18 @@ class LightGBMWrapper(EstimatorWrapper):
|
|
|
522
554
|
target_type: ModelTaskType,
|
|
523
555
|
groups: Optional[List[str]] = None,
|
|
524
556
|
text_features: Optional[List[str]] = None,
|
|
557
|
+
logger: Optional[logging.Logger] = None,
|
|
525
558
|
):
|
|
526
559
|
super(LightGBMWrapper, self).__init__(
|
|
527
|
-
estimator,
|
|
560
|
+
estimator,
|
|
561
|
+
scorer,
|
|
562
|
+
metric_name,
|
|
563
|
+
multiplier,
|
|
564
|
+
cv,
|
|
565
|
+
target_type,
|
|
566
|
+
groups=groups,
|
|
567
|
+
text_features=text_features,
|
|
568
|
+
logger=logger,
|
|
528
569
|
)
|
|
529
570
|
self.cat_features = None
|
|
530
571
|
|
|
@@ -561,9 +602,18 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
561
602
|
target_type: ModelTaskType,
|
|
562
603
|
groups: Optional[List[str]] = None,
|
|
563
604
|
text_features: Optional[List[str]] = None,
|
|
605
|
+
logger: Optional[logging.Logger] = None,
|
|
564
606
|
):
|
|
565
607
|
super(OtherEstimatorWrapper, self).__init__(
|
|
566
|
-
estimator,
|
|
608
|
+
estimator,
|
|
609
|
+
scorer,
|
|
610
|
+
metric_name,
|
|
611
|
+
multiplier,
|
|
612
|
+
cv,
|
|
613
|
+
target_type,
|
|
614
|
+
groups=groups,
|
|
615
|
+
text_features=text_features,
|
|
616
|
+
logger=logger,
|
|
567
617
|
)
|
|
568
618
|
self.cat_features = None
|
|
569
619
|
|
|
@@ -1,5 +1,4 @@
|
|
|
1
1
|
import functools
|
|
2
|
-
import logging
|
|
3
2
|
import numbers
|
|
4
3
|
import time
|
|
5
4
|
import warnings
|
|
@@ -313,7 +312,7 @@ def cross_validate(
|
|
|
313
312
|
|
|
314
313
|
return ret
|
|
315
314
|
except Exception:
|
|
316
|
-
logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
315
|
+
# logging.exception("Failed to execute overriden cross_validate. Fallback to original")
|
|
317
316
|
raise
|
|
318
317
|
# fit_params["use_best_model"] = False
|
|
319
318
|
# return original_cross_validate(
|
|
@@ -140,7 +140,7 @@ def test_binary_psi_calculation():
|
|
|
140
140
|
"target": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]
|
|
141
141
|
})
|
|
142
142
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
143
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
143
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
144
144
|
enricher._validate_PSI(df)
|
|
145
145
|
assert not enricher.warning_counter.has_warnings()
|
|
146
146
|
|
|
@@ -148,7 +148,7 @@ def test_binary_psi_calculation():
|
|
|
148
148
|
"target": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]
|
|
149
149
|
})
|
|
150
150
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
151
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
151
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
152
152
|
enricher._validate_PSI(df)
|
|
153
153
|
assert enricher.warning_counter._count == 1
|
|
154
154
|
|
|
@@ -157,7 +157,7 @@ def test_binary_psi_calculation():
|
|
|
157
157
|
"eval_set_index": [0] * 10 + [1] * 10,
|
|
158
158
|
})
|
|
159
159
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
160
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
160
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
161
161
|
enricher._validate_PSI(df)
|
|
162
162
|
assert enricher.warning_counter._count == 1
|
|
163
163
|
|
|
@@ -166,7 +166,7 @@ def test_binary_psi_calculation():
|
|
|
166
166
|
"eval_set_index": [0] * 10 + [1] * 10,
|
|
167
167
|
})
|
|
168
168
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
169
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
169
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
170
170
|
enricher._validate_PSI(df)
|
|
171
171
|
assert enricher.warning_counter._count == 2
|
|
172
172
|
|
|
@@ -177,7 +177,7 @@ def test_regression_psi_calculation():
|
|
|
177
177
|
"target": random.rand(20)
|
|
178
178
|
})
|
|
179
179
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
180
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
180
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
181
181
|
enricher._validate_PSI(df)
|
|
182
182
|
assert enricher.warning_counter._count == 1
|
|
183
183
|
|
|
@@ -189,6 +189,6 @@ def test_regression_psi_calculation():
|
|
|
189
189
|
"target": list(values1) + list(values2)
|
|
190
190
|
})
|
|
191
191
|
df["date"] = pd.date_range("2020-01-01", "2020-01-20")
|
|
192
|
-
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
|
|
192
|
+
enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
|
|
193
193
|
enricher._validate_PSI(df)
|
|
194
194
|
assert not enricher.warning_counter.has_warnings()
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|