upgini 1.1.280.dev0__py3-none-any.whl → 1.2.31a1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of upgini might be problematic. Click here for more details.
- upgini/__about__.py +1 -1
- upgini/__init__.py +4 -20
- upgini/autofe/all_operands.py +39 -9
- upgini/autofe/binary.py +148 -45
- upgini/autofe/date.py +197 -26
- upgini/autofe/feature.py +102 -19
- upgini/autofe/groupby.py +22 -22
- upgini/autofe/operand.py +9 -6
- upgini/autofe/unary.py +83 -41
- upgini/autofe/vector.py +8 -8
- upgini/data_source/data_source_publisher.py +128 -5
- upgini/dataset.py +50 -386
- upgini/features_enricher.py +931 -542
- upgini/http.py +27 -16
- upgini/lazy_import.py +35 -0
- upgini/metadata.py +84 -59
- upgini/metrics.py +164 -34
- upgini/normalizer/normalize_utils.py +197 -0
- upgini/resource_bundle/strings.properties +66 -51
- upgini/search_task.py +10 -4
- upgini/utils/Roboto-Regular.ttf +0 -0
- upgini/utils/base_search_key_detector.py +14 -12
- upgini/utils/country_utils.py +16 -0
- upgini/utils/custom_loss_utils.py +39 -36
- upgini/utils/datetime_utils.py +98 -45
- upgini/utils/deduplicate_utils.py +135 -112
- upgini/utils/display_utils.py +46 -15
- upgini/utils/email_utils.py +54 -16
- upgini/utils/feature_info.py +172 -0
- upgini/utils/features_validator.py +34 -20
- upgini/utils/ip_utils.py +100 -1
- upgini/utils/phone_utils.py +343 -0
- upgini/utils/postal_code_utils.py +34 -0
- upgini/utils/sklearn_ext.py +28 -19
- upgini/utils/target_utils.py +113 -57
- upgini/utils/warning_counter.py +1 -0
- upgini/version_validator.py +8 -4
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
- upgini-1.2.31a1.dist-info/RECORD +65 -0
- upgini/normalizer/phone_normalizer.py +0 -340
- upgini-1.1.280.dev0.dist-info/RECORD +0 -62
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
- {upgini-1.1.280.dev0.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/metrics.py
CHANGED
|
@@ -1,16 +1,17 @@
|
|
|
1
1
|
from __future__ import annotations
|
|
2
2
|
|
|
3
|
+
from dataclasses import dataclass
|
|
3
4
|
import inspect
|
|
4
5
|
import logging
|
|
5
6
|
import re
|
|
7
|
+
from collections import defaultdict
|
|
6
8
|
from copy import deepcopy
|
|
7
9
|
from typing import Any, Callable, Dict, List, Optional, Tuple, Union
|
|
8
10
|
|
|
9
11
|
import catboost
|
|
10
12
|
import numpy as np
|
|
11
13
|
import pandas as pd
|
|
12
|
-
from catboost import CatBoostClassifier, CatBoostRegressor
|
|
13
|
-
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
14
|
+
from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
|
|
14
15
|
from numpy import log1p
|
|
15
16
|
from pandas.api.types import is_numeric_dtype
|
|
16
17
|
from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
|
|
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
|
|
|
210
211
|
}
|
|
211
212
|
|
|
212
213
|
|
|
214
|
+
@dataclass
class _CrossValResults:
    """Result of a metric calculation: the metric, its spread and optional SHAP values."""

    # Aggregated metric value; None when nothing was calculated (e.g. no feature columns).
    metric: Optional[float]
    # Standard deviation of the metric across folds; None when it is not available.
    metric_std: Optional[float]
    # Per-feature SHAP values keyed by feature name; None when they were not calculated.
    shap_values: Optional[Dict[str, float]]

    def get_display_metric(self) -> Optional[str]:
        """Return the metric formatted with 3 decimals, followed by "± std" when the std is known.

        Returns None when there is no metric at all.
        """
        if self.metric is None:
            return None
        if self.metric_std is not None:
            return f"{self.metric:.3f} ± {self.metric_std:.3f}"
        return f"{self.metric:.3f}"
|
|
227
|
+
|
|
228
|
+
|
|
213
229
|
class EstimatorWrapper:
|
|
214
230
|
def __init__(
|
|
215
231
|
self,
|
|
@@ -254,6 +270,7 @@ class EstimatorWrapper:
|
|
|
254
270
|
def _prepare_data(
|
|
255
271
|
self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
|
|
256
272
|
) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
|
|
273
|
+
self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
|
|
257
274
|
for c in x.columns:
|
|
258
275
|
if is_numeric_dtype(x[c]):
|
|
259
276
|
x[c] = x[c].astype(float)
|
|
@@ -272,6 +289,10 @@ class EstimatorWrapper:
|
|
|
272
289
|
else:
|
|
273
290
|
x, y = self._remove_empty_target_rows(x, y)
|
|
274
291
|
|
|
292
|
+
# Make order of columns idempotent
|
|
293
|
+
x = x[sorted(x.columns)]
|
|
294
|
+
|
|
295
|
+
self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
|
|
275
296
|
return x, y, groups
|
|
276
297
|
|
|
277
298
|
def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
|
|
@@ -287,17 +308,22 @@ class EstimatorWrapper:
|
|
|
287
308
|
x, y, _ = self._prepare_data(x, y)
|
|
288
309
|
return x, y, {}
|
|
289
310
|
|
|
311
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
    """Return SHAP values of ``estimator`` on ``x``/``y``, or None when they are not available.

    This default implementation calculates nothing and always returns None;
    the arguments are accepted only so that overriding implementations share
    the same signature.
    """
    return None
|
|
313
|
+
|
|
290
314
|
def cross_val_predict(
|
|
291
315
|
self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
|
|
292
|
-
) ->
|
|
316
|
+
) -> _CrossValResults:
|
|
293
317
|
x, y, groups, fit_params = self._prepare_to_fit(x, y)
|
|
294
318
|
|
|
295
319
|
if x.shape[1] == 0:
|
|
296
|
-
return None
|
|
320
|
+
return _CrossValResults(metric=None, metric_std=None, shap_values=None)
|
|
297
321
|
|
|
298
322
|
scorer = check_scoring(self.estimator, scoring=self.scorer)
|
|
299
323
|
|
|
324
|
+
shap_values_all_folds = defaultdict(list)
|
|
300
325
|
if baseline_score_column is not None and self.metric_name == "GINI":
|
|
326
|
+
self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
|
|
301
327
|
metric = roc_auc_score(y, x[baseline_score_column])
|
|
302
328
|
else:
|
|
303
329
|
cv_results = cross_validate(
|
|
@@ -314,25 +340,68 @@ class EstimatorWrapper:
|
|
|
314
340
|
metrics_by_fold = cv_results["test_score"]
|
|
315
341
|
self.cv_estimators = cv_results["estimator"]
|
|
316
342
|
|
|
317
|
-
|
|
318
|
-
|
|
343
|
+
self.check_fold_metrics(metrics_by_fold)
|
|
344
|
+
|
|
345
|
+
metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
|
|
346
|
+
|
|
347
|
+
splits = self.cv.split(x, y, groups)
|
|
348
|
+
|
|
349
|
+
for estimator, split in zip(self.cv_estimators, splits):
|
|
350
|
+
_, validation_idx = split
|
|
351
|
+
cv_x = x.iloc[validation_idx]
|
|
352
|
+
cv_y = y[validation_idx]
|
|
353
|
+
shaps = self.calculate_shap(cv_x, cv_y, estimator)
|
|
354
|
+
if shaps is not None:
|
|
355
|
+
for feature, shap_value in shaps.items():
|
|
356
|
+
# shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
|
|
357
|
+
shap_values_all_folds[feature].extend(shap_value.tolist())
|
|
358
|
+
|
|
359
|
+
if shap_values_all_folds:
|
|
360
|
+
average_shap_values = {
|
|
361
|
+
feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
|
|
362
|
+
}
|
|
363
|
+
if len(average_shap_values) == 0:
|
|
364
|
+
average_shap_values = None
|
|
365
|
+
else:
|
|
366
|
+
average_shap_values = self.process_shap_values(average_shap_values)
|
|
367
|
+
else:
|
|
368
|
+
average_shap_values = None
|
|
369
|
+
|
|
370
|
+
return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
|
|
371
|
+
|
|
372
|
+
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
    """Post-process averaged SHAP values before they are returned to the caller.

    This default implementation is the identity: the very same mapping object
    is handed back unchanged.
    """
    return shap_values
|
|
374
|
+
|
|
375
|
+
def check_fold_metrics(self, metrics_by_fold: List[float]):
    """Log a warning when the per-fold metrics do not all share the sign of the first fold.

    The first fold defines the reference sign (zero counts as non-negative).
    A later fold differs when its product with that sign is strictly negative,
    so a later zero never triggers the warning. Nothing is returned.

    Args:
        metrics_by_fold: metric values, one per cross-validation fold.
    """
    # len() rather than truthiness: the caller passes the "test_score" entry of
    # cross_validate, and truth-testing a multi-element numpy array raises.
    if len(metrics_by_fold) == 0:
        return

    first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
    # One warning is enough: the message already contains every fold metric.
    if any(first_metric_sign * metric < 0 for metric in metrics_by_fold[1:]):
        self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
|
|
319
380
|
|
|
320
381
|
def post_process_metric(self, metric: float) -> float:
    """Convert a raw score into the reported metric value.

    When ``self.metric_name`` is "GINI" the score is rescaled as ``2 * metric - 1``;
    every other metric is returned unchanged.
    """
    if self.metric_name != "GINI":
        return metric
    return 2 * metric - 1
|
|
324
385
|
|
|
325
|
-
def calculate_metric(
    self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
) -> _CrossValResults:
    """Score the estimators kept in ``self.cv_estimators`` on ``x``/``y``.

    When ``baseline_score_column`` is given and the metric is "GINI", the
    result is the ROC AUC of that column against ``y`` and no std is reported.
    Otherwise every estimator in ``self.cv_estimators`` is scored with
    ``self.scorer`` and the scores are aggregated by
    ``_calculate_metric_from_folds``. SHAP values are never calculated here.
    """
    x, y, _ = self._prepare_to_calculate(x, y)

    use_baseline = baseline_score_column is not None and self.metric_name == "GINI"
    if use_baseline:
        metric = roc_auc_score(y, x[baseline_score_column])
        metric_std = None
    else:
        fold_scores = [self.scorer(fitted, x, y) for fitted in self.cv_estimators]
        metric, metric_std = self._calculate_metric_from_folds(fold_scores)

    return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
|
|
399
|
+
|
|
400
|
+
def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
    """Aggregate per-fold scores into ``(metric, metric_std)``.

    Each score first goes through ``post_process_metric``. The metric is the
    mean of the processed scores times ``self.multiplier``; the std is their
    standard deviation times the absolute value of the multiplier, so it is
    never made negative by a negative multiplier.
    """
    processed = [self.post_process_metric(fold_score) for fold_score in metrics_by_fold]
    scale = self.multiplier
    mean_metric = np.mean(processed) * scale
    spread = np.std(processed) * np.abs(scale)
    return mean_metric, spread
|
|
336
405
|
|
|
337
406
|
@staticmethod
|
|
338
407
|
def create(
|
|
@@ -346,6 +415,7 @@ class EstimatorWrapper:
|
|
|
346
415
|
text_features: Optional[List[str]] = None,
|
|
347
416
|
add_params: Optional[Dict[str, Any]] = None,
|
|
348
417
|
groups: Optional[List[str]] = None,
|
|
418
|
+
has_date: Optional[bool] = None,
|
|
349
419
|
) -> EstimatorWrapper:
|
|
350
420
|
scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
|
|
351
421
|
kwargs = {
|
|
@@ -359,7 +429,8 @@ class EstimatorWrapper:
|
|
|
359
429
|
"logger": logger,
|
|
360
430
|
}
|
|
361
431
|
if estimator is None:
|
|
362
|
-
params =
|
|
432
|
+
params = {}
|
|
433
|
+
params["has_time"] = has_date
|
|
363
434
|
# if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
|
|
364
435
|
# params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
|
|
365
436
|
if target_type == ModelTaskType.MULTICLASS:
|
|
@@ -390,11 +461,14 @@ class EstimatorWrapper:
|
|
|
390
461
|
f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
|
|
391
462
|
)
|
|
392
463
|
estimator_copy.set_params(
|
|
393
|
-
cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
464
|
+
# cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
|
|
465
|
+
cat_features=cat_features
|
|
394
466
|
)
|
|
395
467
|
estimator = CatBoostWrapper(**kwargs)
|
|
396
468
|
else:
|
|
397
469
|
try:
|
|
470
|
+
from lightgbm import LGBMClassifier, LGBMRegressor
|
|
471
|
+
|
|
398
472
|
if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
|
|
399
473
|
estimator = LightGBMWrapper(**kwargs)
|
|
400
474
|
else:
|
|
@@ -439,6 +513,7 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
439
513
|
)
|
|
440
514
|
self.cat_features = None
|
|
441
515
|
self.emb_features = None
|
|
516
|
+
self.grouped_embedding_features = None
|
|
442
517
|
self.exclude_features = []
|
|
443
518
|
|
|
444
519
|
def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
|
|
@@ -448,17 +523,16 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
448
523
|
if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
|
|
449
524
|
emb_pattern = r"(.+)_emb\d+"
|
|
450
525
|
self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
|
|
451
|
-
embedding_features = []
|
|
452
526
|
if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
|
|
453
527
|
self.logger.info(
|
|
454
528
|
"Embedding features count more than 3, so group them into one vector for CatBoost: "
|
|
455
529
|
f"{self.emb_features}"
|
|
456
530
|
)
|
|
457
|
-
x,
|
|
458
|
-
params["embedding_features"] =
|
|
531
|
+
x, self.grouped_embedding_features = self.group_embeddings(x)
|
|
532
|
+
params["embedding_features"] = self.grouped_embedding_features
|
|
459
533
|
else:
|
|
460
534
|
self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
|
|
461
|
-
self.
|
|
535
|
+
self.grouped_embedding_features = None
|
|
462
536
|
else:
|
|
463
537
|
self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
|
|
464
538
|
|
|
@@ -474,15 +548,17 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
474
548
|
self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
|
|
475
549
|
|
|
476
550
|
# Find rest categorical features
|
|
477
|
-
self.cat_features = _get_cat_features(x, self.text_features,
|
|
478
|
-
x = fill_na_cat_features(x, self.cat_features)
|
|
551
|
+
self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
|
|
552
|
+
# x = fill_na_cat_features(x, self.cat_features)
|
|
479
553
|
unique_cat_features = []
|
|
480
554
|
for name in self.cat_features:
|
|
481
555
|
# Remove constant categorical features
|
|
482
556
|
if x[name].nunique() > 1:
|
|
483
557
|
unique_cat_features.append(name)
|
|
484
558
|
else:
|
|
559
|
+
self.logger.info(f"Drop column {name} on preparing data for fit")
|
|
485
560
|
x = x.drop(columns=name)
|
|
561
|
+
self.exclude_features.append(name)
|
|
486
562
|
self.cat_features = unique_cat_features
|
|
487
563
|
if (
|
|
488
564
|
hasattr(self.estimator, "get_param")
|
|
@@ -510,46 +586,90 @@ class CatBoostWrapper(EstimatorWrapper):
|
|
|
510
586
|
emb_name = "__grouped_embeddings"
|
|
511
587
|
df = df.copy()
|
|
512
588
|
df[self.emb_features] = df[self.emb_features].fillna(0.0)
|
|
513
|
-
df[emb_name] = df[self.emb_features].values.tolist()
|
|
589
|
+
df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
|
|
514
590
|
df = df.drop(columns=self.emb_features)
|
|
515
591
|
|
|
516
592
|
return df, [emb_name]
|
|
517
593
|
|
|
594
|
+
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
    """Spread the SHAP value of the grouped embedding column over the original embedding features.

    If ``shap_values`` holds the "__grouped_embeddings" key, its value is
    assigned to every name in ``self.emb_features`` and the grouped key is
    removed. The mapping is modified in place and the same object is returned.
    """
    grouped_key = "__grouped_embeddings"
    if grouped_key not in shap_values:
        return shap_values

    grouped_value = shap_values[grouped_key]
    shap_values.update(dict.fromkeys(self.emb_features, grouped_value))
    del shap_values[grouped_key]
    return shap_values
|
|
600
|
+
|
|
518
601
|
def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
    """Prepare ``x``/``y`` for scoring and collect the feature-role parameters.

    Columns listed in ``self.exclude_features`` are dropped first, then the
    parent preparation runs. The returned params gain "text_features",
    "embedding_features" (after regrouping the embedding columns of ``x``)
    and "cat_features" whenever the corresponding attribute is non-empty.
    """
    excluded = self.exclude_features
    if excluded:
        x = x.drop(columns=excluded)

    x, y, params = super()._prepare_to_calculate(x, y)

    if self.text_features:
        params["text_features"] = self.text_features

    if self.grouped_embedding_features:
        x, params["embedding_features"] = self.group_embeddings(x)

    if self.cat_features:
        # Missing values in categorical features are not filled here
        # (the former fill_na_cat_features call is disabled).
        params["cat_features"] = self.cat_features

    return x, y, params
|
|
532
615
|
|
|
533
616
|
def cross_val_predict(
    self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
) -> _CrossValResults:
    """Run the parent cross-validation, retrying once without problematic text features.

    If the parent call fails with an error mentioning "Dictionary size is 0"
    and text features are configured, the high-cardinality text features are
    removed and the parent call is repeated. When no high-cardinality feature
    is found, all text features are removed instead. Removed features are
    taken out of ``self.text_features``, appended to ``self.exclude_features``
    and dropped from ``x``.

    Raises:
        Exception: any other error from the parent call is re-raised unchanged,
            as is an error raised by the retry.
    """
    try:
        return super().cross_val_predict(x, y, baseline_score_column)
    except Exception as e:
        # str(e) instead of e.args[0]: args may be empty or hold a non-string,
        # which would raise a new error inside this handler.
        if not ("Dictionary size is 0" in str(e) and self.text_features):
            raise

        high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
        if len(high_cardinality_features) == 0:
            # Copy the list: the loop below removes items from self.text_features,
            # and iterating over that same list would skip every second feature.
            high_cardinality_features = list(self.text_features)
            self.logger.warning(
                "Calculate metrics has problem with CatBoost text features. High cardinality features not found"
                f". Try to remove all text features {high_cardinality_features} and retry"
            )
        else:
            self.logger.warning(
                "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
                f" text features {high_cardinality_features} and retry"
            )

        # Iterate over a snapshot so removals can never disturb the iteration.
        for f in list(high_cardinality_features):
            self.text_features.remove(f)
            self.exclude_features.append(f)
            x = x.drop(columns=f, errors="ignore")
        return super().cross_val_predict(x, y, baseline_score_column)
|
|
552
642
|
|
|
643
|
+
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
    """Calculate absolute SHAP values of ``estimator`` on one fold.

    Returns a mapping from ``estimator.feature_names_`` to the absolute SHAP
    values of that feature, or None if anything fails (the failure is logged).

    NOTE(review): despite the annotation, each mapped value is an array of
    per-row values rather than a single float.
    """
    try:
        # The Pool carries the feature roles (categorical / text / embedding) of the fold data.
        fold_pool = Pool(
            x,
            y,
            cat_features=self.cat_features,
            text_features=self.text_features,
            embedding_features=self.grouped_embedding_features,
        )

        raw_shaps = estimator.get_feature_importance(data=fold_pool, type="ShapValues")

        # The last entry along the feature axis is the base value, so it is cut off.
        # The remaining values are then collected per feature as flat arrays.
        if self.target_type == ModelTaskType.MULTICLASS:
            trimmed = raw_shaps[:, :, :-1]
            per_feature = [trimmed[:, :, idx].flatten() for idx in range(trimmed.shape[2])]
        else:
            trimmed = raw_shaps[:, :-1]
            per_feature = [trimmed[:, idx].flatten() for idx in range(trimmed.shape[1])]

        return dict(zip(estimator.feature_names_, np.abs(per_feature)))

    except Exception:
        self.logger.exception("Failed to recalculate new SHAP values")
        return None
|
|
672
|
+
|
|
553
673
|
|
|
554
674
|
class LightGBMWrapper(EstimatorWrapper):
|
|
555
675
|
def __init__(
|
|
@@ -653,14 +773,24 @@ class OtherEstimatorWrapper(EstimatorWrapper):
|
|
|
653
773
|
|
|
654
774
|
|
|
655
775
|
def validate_scoring_argument(scoring: Union[Callable, str, None]):
    """Validate the user-supplied ``scoring`` argument.

    ``None`` is accepted as is. A string is resolved with ``_get_scorer_by_name``,
    which is expected to reject unknown names. Anything else must be a callable
    that can be called with 3 positional arguments: estimator, x, y.

    Raises:
        ValidationError: if ``scoring`` is neither None, a string nor a callable,
            or if the callable cannot accept 3 positional arguments.
    """
    if scoring is None:
        return

    if isinstance(scoring, str):
        _get_scorer_by_name(scoring)
        return

    if not callable(scoring):
        raise ValidationError(
            f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
            " that accepts 3 input arguments: estimator, x, y"
        )

    # inspect.signature, unlike getfullargspec, leaves out the bound "self" of
    # methods and callable objects, so only arguments the caller supplies are counted.
    try:
        parameters = list(inspect.signature(scoring).parameters.values())
    except (TypeError, ValueError):
        # No introspectable signature (e.g. some C-implemented callables):
        # the arity cannot be verified, so the callable is accepted.
        return

    accepts_varargs = any(p.kind is inspect.Parameter.VAR_POSITIONAL for p in parameters)
    positional_count = sum(
        p.kind in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD) for p in parameters
    )
    if not accepts_varargs and positional_count < 3:
        raise ValidationError(
            f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
        )
|
|
664
794
|
|
|
665
795
|
|
|
666
796
|
def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
|
|
@@ -0,0 +1,197 @@
|
|
|
1
|
+
import hashlib
|
|
2
|
+
from logging import Logger, getLogger
|
|
3
|
+
from typing import Dict, List, Tuple
|
|
4
|
+
|
|
5
|
+
import numpy as np
|
|
6
|
+
import pandas as pd
|
|
7
|
+
from pandas.api.types import is_bool_dtype as is_bool
|
|
8
|
+
from pandas.api.types import is_datetime64_any_dtype as is_datetime
|
|
9
|
+
from pandas.api.types import (
|
|
10
|
+
is_float_dtype,
|
|
11
|
+
is_numeric_dtype,
|
|
12
|
+
is_object_dtype,
|
|
13
|
+
is_string_dtype,
|
|
14
|
+
)
|
|
15
|
+
|
|
16
|
+
from upgini.errors import ValidationError
|
|
17
|
+
from upgini.metadata import (
|
|
18
|
+
ENTITY_SYSTEM_RECORD_ID,
|
|
19
|
+
EVAL_SET_INDEX,
|
|
20
|
+
SEARCH_KEY_UNNEST,
|
|
21
|
+
SYSTEM_RECORD_ID,
|
|
22
|
+
TARGET,
|
|
23
|
+
SearchKey,
|
|
24
|
+
)
|
|
25
|
+
from upgini.resource_bundle import ResourceBundle, get_custom_bundle
|
|
26
|
+
from upgini.utils import find_numbers_with_decimal_comma
|
|
27
|
+
from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
|
|
28
|
+
from upgini.utils.phone_utils import PhoneSearchKeyConverter
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class Normalizer:
    """Normalize a DataFrame: column names, value types and string lengths.

    After ``normalize`` the instance exposes:
      * ``columns_renaming`` - mapping from new column name to the original name (as str);
      * ``search_keys`` - the search keys re-keyed to the new column names;
      * ``generated_features`` - copy of the generated feature names passed in;
      * ``removed_features`` - date/period feature columns that were dropped.
    ``columns_renaming`` and ``removed_features`` are not reset between calls.
    """

    # Maximum number of characters kept in a string value.
    MAX_STRING_FEATURE_LENGTH = 24573

    def __init__(
        self,
        bundle: ResourceBundle = None,
        logger: Logger = None,
    ):
        """Create a normalizer; a default bundle / logger is used for any argument left as None."""
        self.bundle = bundle or get_custom_bundle()
        self.logger = logger or getLogger()
        self.columns_renaming = {}
        self.search_keys = {}
        self.generated_features = []
        self.removed_features = []

    def normalize(
        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
    ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
        """Run all normalization steps on a copy of ``df``.

        The inputs are not modified: ``df``, ``search_keys`` and
        ``generated_features`` are copied first.

        Returns:
            The normalized DataFrame, the search keys re-keyed to the renamed
            columns, and the copy of the generated feature names.
        """
        self.search_keys = search_keys.copy()
        self.generated_features = generated_features.copy()

        df = df.copy()
        df = self._rename_columns(df)

        df = self._remove_dates_from_features(df)

        df = self._cut_too_long_string_values(df)

        df = self._convert_bools(df)

        df = self._convert_float16(df)

        df = self._correct_decimal_comma(df)

        df = self._convert_phone_numbers(df)

        df = self.__convert_features_types(df)

        return df, self.search_keys, self.generated_features

    def _rename_columns(self, df: pd.DataFrame):
        """Rename columns to lowercase names built only from a-z, 0-9 and "_".

        System columns and generated features keep their names. Every other
        column name is converted to str, cut to 250 characters, suffixed with
        the first 6 hex digits of the SHA-256 of the full name, lowercased,
        prefixed with "a" if it does not start with a-z, and has every other
        character replaced by "_". ``columns_renaming`` and ``search_keys``
        are updated accordingly.

        Raises:
            ValidationError: if a column name is empty.
        """
        # Built once: these names do not change while the columns are processed.
        reserved_columns = {
            TARGET,
            EVAL_SET_INDEX,
            SYSTEM_RECORD_ID,
            ENTITY_SYSTEM_RECORD_ID,
            SEARCH_KEY_UNNEST,
            DateTimeSearchKeyConverter.DATETIME_COL,
            *self.generated_features,
        }
        new_columns = []
        # Mirror of new_columns for constant-time duplicate checks.
        used_names = set()
        dup_counter = 0
        for column in df.columns:
            if column in reserved_columns:
                self.columns_renaming[column] = column
                new_columns.append(column)
                used_names.add(column)
                continue

            new_column = str(column)
            if len(new_column) == 0:
                raise ValidationError(self.bundle.get("dataset_empty_column_names"))
            # The suffix is calculated from the full name, before any truncation.
            suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]
            # db limit for column length
            if len(new_column) > 250:
                new_column = new_column[:250]

            # make column name unique relative to server features
            new_column = f"{new_column}_{suffix}"

            new_column = new_column.lower()

            # if column starts with non alphabetic symbol then add "a" to the beginning of string
            if not "a" <= new_column[0] <= "z":
                new_column = "a" + new_column

            # replace unsupported characters to "_"
            new_column = "".join(c if ("a" <= c <= "z" or "0" <= c <= "9") else "_" for c in new_column)

            if new_column in used_names:
                new_column = f"{new_column}_{dup_counter}"
                dup_counter += 1
            new_columns.append(new_column)
            used_names.add(new_column)

            if new_column != column and column in self.search_keys:
                self.search_keys[new_column] = self.search_keys[column]
                del self.search_keys[column]
            self.columns_renaming[new_column] = str(column)
        df.columns = new_columns
        return df

    def _get_features(self, df: pd.DataFrame) -> List[str]:
        """Return the sorted column names that are neither search keys nor system columns."""
        system_columns = [ENTITY_SYSTEM_RECORD_ID, EVAL_SET_INDEX, SEARCH_KEY_UNNEST, SYSTEM_RECORD_ID, TARGET]
        features = set(df.columns) - set(self.search_keys.keys()) - set(system_columns)
        return sorted(features)

    def _remove_dates_from_features(self, df: pd.DataFrame):
        """Drop feature columns with a datetime or period dtype and record them in ``removed_features``."""
        date_features = [
            f for f in self._get_features(df) if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype)
        ]
        if date_features:
            self.removed_features.extend(date_features)
            df.drop(columns=date_features, inplace=True)

        return df

    def _cut_too_long_string_values(self, df: pd.DataFrame):
        """Cut string values down to ``MAX_STRING_FEATURE_LENGTH`` characters.

        Only string/object columns holding at least one too-long value are rewritten.
        NOTE(review): a rewritten column is fully cast to str, so missing values
        in it become literal strings - confirm this is intended.
        """
        for col in df.columns:
            if is_string_dtype(df[col]) or is_object_dtype(df[col]):
                as_str = df[col].astype("str")
                if as_str.str.len().max() > self.MAX_STRING_FEATURE_LENGTH:
                    df[col] = as_str.str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)

        return df

    @staticmethod
    def _convert_bools(df: pd.DataFrame):
        """Convert bool columns to string"""
        for col in df.columns:
            if is_bool(df[col]):
                df[col] = df[col].astype("str")
        return df

    @staticmethod
    def _convert_float16(df: pd.DataFrame):
        """Convert every float column (float16 included) to float64"""
        for col in df.columns:
            if is_float_dtype(df[col]):
                df[col] = df[col].astype("float64")
        return df

    def _correct_decimal_comma(self, df: pd.DataFrame):
        """Convert columns holding numbers written with a decimal comma to float64.

        The columns are those reported by ``find_numbers_with_decimal_comma``;
        a warning listing them is logged when there are any.
        """
        columns_to_fix = find_numbers_with_decimal_comma(df)
        if len(columns_to_fix) > 0:
            self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
            for col in columns_to_fix:
                df[col] = df[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
        return df

    def _convert_phone_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run ``PhoneSearchKeyConverter`` for every PHONE search key column.

        The COUNTRY search key column, if one is found, is passed to each converter.
        """
        maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
        for phone_col in SearchKey.find_all_keys(self.search_keys, SearchKey.PHONE):
            converter = PhoneSearchKeyConverter(phone_col, maybe_country_col)
            df = converter.convert(df)
        return df

    def __convert_features_types(self, df: pd.DataFrame):
        """Cast every non-numeric feature column to the pandas "string" dtype."""
        for f in self._get_features(df):
            if not is_numeric_dtype(df[f]):
                df[f] = df[f].astype("string")
        return df
|