upgini 1.1.244a7__py3-none-any.whl → 1.1.244a8__py3-none-any.whl
This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
- upgini/dataset.py +1 -1
- upgini/features_enricher.py +3 -0
- upgini/metrics.py +68 -15
- upgini/utils/sklearn_ext.py +18 -15
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/METADATA +1 -1
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/RECORD +9 -9
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/LICENSE +0 -0
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/WHEEL +0 -0
- {upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/top_level.txt +0 -0
upgini/dataset.py
CHANGED
@@ -61,7 +61,7 @@ class Dataset:  # (pd.DataFrame):
     FIT_SAMPLE_THRESHOLD = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_ROWS = 200_000
     FIT_SAMPLE_WITH_EVAL_SET_THRESHOLD = 200_000
-    MIN_SAMPLE_THRESHOLD =
+    MIN_SAMPLE_THRESHOLD = 5_000
     IMBALANCE_THESHOLD = 0.4
     MIN_TARGET_CLASS_ROWS = 100
     MAX_MULTICLASS_CLASS_COUNT = 100
upgini/features_enricher.py
CHANGED
@@ -955,6 +955,7 @@ class FeaturesEnricher(TransformerMixin):
                 fitting_enriched_X,
                 scoring,
                 groups=groups,
+                text_features=self.generate_features,
             )
             metric = wrapper.metric_name
             multiplier = wrapper.multiplier
@@ -980,6 +981,7 @@ class FeaturesEnricher(TransformerMixin):
                     cat_features,
                     add_params=custom_loss_add_params,
                     groups=groups,
+                    text_features=self.generate_features,
                 )
                 etalon_metric = baseline_estimator.cross_val_predict(
                     fitting_X, y_sorted, self.baseline_score_column
@@ -1004,6 +1006,7 @@ class FeaturesEnricher(TransformerMixin):
                     cat_features,
                     add_params=custom_loss_add_params,
                     groups=groups,
+                    text_features=self.generate_features,
                 )
                 enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
                 self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
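For context, all three call sites above now forward generate_features into the metric estimator wrappers as text_features. A minimal usage sketch with hypothetical column names (it assumes the public FeaturesEnricher(search_keys=..., generate_features=...) constructor; fit() needs real data and an Upgini API key, so those calls are left commented out):

import pandas as pd
from upgini import FeaturesEnricher, SearchKey

# Hypothetical frame: "description" is a free-text column we want treated as text.
train = pd.DataFrame(
    {
        "phone": ["+14155550101", "+14155550102", "+14155550103"],
        "description": ["red running shoe", "blue leather boot", "green wool hat"],
        "target": [0, 1, 0],
    }
)

enricher = FeaturesEnricher(
    search_keys={"phone": SearchKey.PHONE},
    generate_features=["description"],  # text columns; with this release also passed as text_features for metrics
)

# Requires an Upgini API key and network access:
# enricher.fit(train.drop(columns="target"), train["target"])
# enricher.calculate_metrics()  # builds the wrappers below with text_features=["description"]

With this change, columns listed in generate_features are excluded from the categorical handling during metric calculation and handed to the CatBoost wrapper as text features instead.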
upgini/metrics.py
CHANGED
@@ -1,4 +1,5 @@
 import logging
+import re
 from copy import deepcopy
 from typing import Any, Callable, Dict, List, Optional, Tuple, Union
 
@@ -201,6 +202,7 @@ class EstimatorWrapper:
         target_type: ModelTaskType,
         add_params: Optional[Dict[str, Any]] = None,
         groups: Optional[np.ndarray] = None,
+        text_features: Optional[List[str]] = None,
     ):
         self.estimator = estimator
         self.scorer = scorer
@@ -213,6 +215,7 @@ class EstimatorWrapper:
         self.add_params = add_params
         self.cv_estimators = None
         self.groups = groups
+        self.text_features = text_features
 
     def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
         X, y, _, fit_params = self._prepare_to_fit(X, y)
@@ -285,6 +288,7 @@ class EstimatorWrapper:
             groups=groups,
             fit_params=fit_params,
             return_estimator=True,
+            error_score="raise",
         )
         metrics_by_fold = cv_results["test_score"]
         self.cv_estimators = cv_results["estimator"]
@@ -330,6 +334,7 @@ class EstimatorWrapper:
             "cv": cv,
             "target_type": target_type,
             "groups": groups,
+            "text_features": text_features,
         }
         if estimator is None:
             params = dict()
@@ -391,27 +396,56 @@ class CatBoostWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(CatBoostWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
         )
         self.cat_features = None
         self.cat_features_idx = None
+        self.emb_groups = None
 
     def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
         X, y, groups, params = super()._prepare_to_fit(X, y)
+
+        # Find embeddings
+        emb_pattern = r"(.+)_emb\d+"
+        emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
+        embedding_features = []
+        if len(emb_features) > 0:
+            # group by source feature
+            self.emb_groups = dict()
+            for emb in emb_features:
+                source_feature = re.match(emb_pattern, emb).group(1)
+                embs = self.emb_groups.get(source_feature, [])
+                embs.append(emb)
+                self.emb_groups[source_feature] = embs
+            self.emb_groups = {
+                source_feature: embs for source_feature, embs in self.emb_groups.items() if len(embs) > 1
+            }
+            X, embedding_features = self.group_embeddings(X)
+            params["embedding_features"] = embedding_features
+
+        # Find text features from passed in generate_features
+        if self.text_features is not None:
+            self.text_features = [f for f in self.text_features if not is_numeric_dtype(X[f])]
+            params["text_features"] = self.text_features
+
+        # Find rest categorical features
         self.cat_features = _get_cat_features(X)
+        if self.text_features is not None:
+            self.cat_features = [
+                f for f in self.cat_features if f not in self.text_features and f not in embedding_features
+            ]
         X = fill_na_cat_features(X, self.cat_features)
-
-
-
-
-
-
-
-
-        # cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
-        self.cat_features_idx = [X.columns.get_loc(c) for c in self.cat_features]
+        unique_cat_features = []
+        for name in self.cat_features:
+            # Remove constant categorical features
+            if X[name].nunique() > 1:
+                unique_cat_features.append(name)
+            else:
+                X = X.drop(columns=name)
+        self.cat_features_idx = [X.columns.get_loc(c) for c in unique_cat_features]
         if (
             hasattr(self.estimator, "get_param")
             and hasattr(self.estimator, "_init_params")
@@ -422,15 +456,32 @@ class CatBoostWrapper(EstimatorWrapper):
             self.cat_features_idx = list(cat_features_set)
             del self.estimator._init_params["cat_features"]
 
-        params
+        params["cat_features"] = self.cat_features_idx
+
         return X, y, groups, params
 
+    def group_embeddings(self, df: pd.DataFrame):
+        emb_columns = []
+        for source_feature, embs in self.emb_groups.items():
+            emb_name = f"{source_feature}_emb"
+            df[embs] = df[embs].fillna(0.0)
+            df[emb_name] = df[embs].values.tolist()
+            df = df.drop(columns=embs)
+            emb_columns.append(emb_name)
+        return df, emb_columns
+
     def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
         X, y, params = super()._prepare_to_calculate(X, y)
+        if self.text_features is not None:
+            params["text_features"] = self.text_features
+        if self.emb_groups is not None:
+            X, emb_columns = self.group_embeddings(X)
+            params["embedding_features"] = emb_columns
         if self.cat_features is not None:
            X = fill_na_cat_features(X, self.cat_features)
         if self.cat_features_idx is not None:
-            params
+            params["cat_features"] = self.cat_features_idx
+
         return X, y, params
 
 
@@ -444,9 +495,10 @@ class LightGBMWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(LightGBMWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
        )
         self.cat_features = None
 
@@ -482,9 +534,10 @@ class OtherEstimatorWrapper(EstimatorWrapper):
         cv: BaseCrossValidator,
         target_type: ModelTaskType,
         groups: Optional[List[str]] = None,
+        text_features: Optional[List[str]] = None,
     ):
         super(OtherEstimatorWrapper, self).__init__(
-            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups
+            estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
         )
         self.cat_features = None
 
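To make the new CatBoostWrapper preparation concrete, here is a standalone sketch with toy column names (not package code) that mirrors the grouping and parameter logic added above; the resulting keys correspond to the cat_features / text_features / embedding_features arguments CatBoost's fit accepts:

import re
import pandas as pd
from pandas.api.types import is_numeric_dtype

# Toy frame: two embedding columns generated from "name", one text column, one categorical.
X = pd.DataFrame(
    {
        "name_emb0": [0.1, 0.2, None],
        "name_emb1": [0.3, 0.4, 0.5],
        "description": ["red shoe", "blue shoe", "green hat"],
        "country": ["US", "DE", "US"],
    }
)

emb_pattern = r"(.+)_emb\d+"
emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]

# Group <source>_embN columns by their source feature (same pattern as _prepare_to_fit).
emb_groups = {}
for emb in emb_features:
    source = re.match(emb_pattern, emb).group(1)
    emb_groups.setdefault(source, []).append(emb)
emb_groups = {src: cols for src, cols in emb_groups.items() if len(cols) > 1}

# Collapse each group into one list-valued column, as group_embeddings does.
embedding_features = []
for source, cols in emb_groups.items():
    X[cols] = X[cols].fillna(0.0)
    X[f"{source}_emb"] = X[cols].values.tolist()
    X = X.drop(columns=cols)
    embedding_features.append(f"{source}_emb")

# These keys end up in the fit params the wrapper hands to CatBoost.
params = {
    "embedding_features": embedding_features,  # ["name_emb"]; each row holds [emb0, emb1]
    "text_features": ["description"],          # non-numeric columns from generate_features
    "cat_features": ["country"],               # remaining categoricals (the wrapper uses column indices)
}
print(X)
print(params)

In short, the wrapper now collapses <source>_emb0 … <source>_embN columns into one list-valued embedding column per source feature, routes generate_features columns to CatBoost as text features, and drops constant categorical columns before computing categorical indices.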
upgini/utils/sklearn_ext.py
CHANGED
@@ -21,6 +21,7 @@ from sklearn.metrics._scorer import _MultimetricScorer
 from sklearn.model_selection import check_cv
 from sklearn.utils.fixes import np_version, parse_version
 from sklearn.utils.validation import indexable
+from sklearn.model_selection import cross_validate as original_cross_validate
 
 _DEFAULT_TAGS = {
     "non_deterministic": False,
@@ -313,21 +314,23 @@ def cross_validate(
         return ret
     except Exception:
         logging.exception("Failed to execute overriden cross_validate. Fallback to original")
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        raise
+        # fit_params["use_best_model"] = False
+        # return original_cross_validate(
+        #     estimator,
+        #     X,
+        #     y,
+        #     groups=groups,
+        #     scoring=scoring,
+        #     cv=cv,
+        #     n_jobs=n_jobs,
+        #     verbose=verbose,
+        #     fit_params=fit_params,
+        #     pre_dispatch=pre_dispatch,
+        #     return_train_score=return_train_score,
+        #     return_estimator=return_estimator,
+        #     error_score=error_score,
+        # )
 
 
 def _fit_and_score(
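Together with the error_score="raise" addition in metrics.py, the overridden cross_validate no longer falls back silently: a failing fold now surfaces as an exception instead of being hidden behind NaN scores and a warning. A small illustration with the standard scikit-learn API (toy data, not upgini code):

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_validate

X = np.random.rand(20, 3)
y = np.array([0, 1] * 10)

# Deliberately misconfigured estimator: fit() raises for an unknown solver.
bad_estimator = LogisticRegression(solver="no-such-solver")

# With error_score="raise" (what metrics.py now passes), the per-fold error
# propagates immediately rather than being recorded as a NaN score.
try:
    cross_validate(bad_estimator, X, y, cv=5, error_score="raise")
except ValueError as exc:
    print(f"cross-validation failed: {exc}")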
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/RECORD
CHANGED
@@ -1,12 +1,12 @@
 upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
 upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
-upgini/dataset.py,sha256=
+upgini/dataset.py,sha256=2oOmBe8_mpwJ8Fw14gw4uZ1GgLU4PtjozkXhvIXhRq0,50022
 upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
-upgini/features_enricher.py,sha256=
+upgini/features_enricher.py,sha256=tQI3qhwMqBfmPD3pygmT6Jrg6SiuLoc7FIXMUQRj1W4,165007
 upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
 upgini/http.py,sha256=eSG4gOpmCGlXmB6KIPNzAG8tRZNUjyYpMeUeHw_2li4,42264
 upgini/metadata.py,sha256=55t0uQI910tzTcnwxZCUL1413BhTiSm8oqiwp-94NyA,9613
-upgini/metrics.py,sha256=
+upgini/metrics.py,sha256=6KglRDnHOotP5HttlkPj2oQMM0MDjY_QtUMrczpl3gQ,26065
 upgini/search_task.py,sha256=5n4qGJmtu48s0-FHAtF3L5qVLMd1JVW3FJlM8dFbh-s,17063
 upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
 upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -50,12 +50,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
 upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
 upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
 upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
-upgini/utils/sklearn_ext.py,sha256=
+upgini/utils/sklearn_ext.py,sha256=fbRQ2ggX2Ock61RJZ-QqvMasqy8-x71knjQrj19GTMM,44025
 upgini/utils/target_utils.py,sha256=qyj-bGsIEl9X2Vc5gwXtsuRaocvId8bn46F7mZ9dy9A,1707
 upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
 upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
-upgini-1.1.
+upgini-1.1.244a8.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
+upgini-1.1.244a8.dist-info/METADATA,sha256=2y08SoG74Ck2fqeRZy8OazKjnGNd5_S1G4I5JAYEY5M,48264
+upgini-1.1.244a8.dist-info/WHEEL,sha256=pkctZYzUS4AYVn6dJ-7367OJZivF2e8RA9b_ZBjif18,92
+upgini-1.1.244a8.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
+upgini-1.1.244a8.dist-info/RECORD,,
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/LICENSE
File without changes
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/WHEEL
File without changes
{upgini-1.1.244a7.dist-info → upgini-1.1.244a8.dist-info}/top_level.txt
File without changes