upgini 1.1.280a3418.post2__py3-none-any.whl → 1.2.31a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (43) hide show
  1. upgini/__about__.py +1 -1
  2. upgini/__init__.py +4 -20
  3. upgini/autofe/all_operands.py +39 -10
  4. upgini/autofe/binary.py +148 -45
  5. upgini/autofe/date.py +197 -26
  6. upgini/autofe/feature.py +102 -19
  7. upgini/autofe/groupby.py +22 -22
  8. upgini/autofe/operand.py +9 -6
  9. upgini/autofe/unary.py +78 -54
  10. upgini/autofe/vector.py +8 -8
  11. upgini/data_source/data_source_publisher.py +128 -5
  12. upgini/dataset.py +50 -386
  13. upgini/features_enricher.py +936 -541
  14. upgini/http.py +27 -16
  15. upgini/lazy_import.py +35 -0
  16. upgini/metadata.py +84 -59
  17. upgini/metrics.py +164 -34
  18. upgini/normalizer/normalize_utils.py +197 -0
  19. upgini/resource_bundle/strings.properties +66 -51
  20. upgini/search_task.py +10 -4
  21. upgini/utils/Roboto-Regular.ttf +0 -0
  22. upgini/utils/base_search_key_detector.py +14 -12
  23. upgini/utils/country_utils.py +16 -0
  24. upgini/utils/custom_loss_utils.py +39 -36
  25. upgini/utils/datetime_utils.py +98 -45
  26. upgini/utils/deduplicate_utils.py +135 -112
  27. upgini/utils/display_utils.py +46 -15
  28. upgini/utils/email_utils.py +54 -16
  29. upgini/utils/feature_info.py +172 -0
  30. upgini/utils/features_validator.py +34 -20
  31. upgini/utils/ip_utils.py +100 -1
  32. upgini/utils/phone_utils.py +343 -0
  33. upgini/utils/postal_code_utils.py +34 -0
  34. upgini/utils/sklearn_ext.py +28 -19
  35. upgini/utils/target_utils.py +113 -57
  36. upgini/utils/warning_counter.py +1 -0
  37. upgini/version_validator.py +8 -4
  38. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/METADATA +31 -16
  39. upgini-1.2.31a1.dist-info/RECORD +65 -0
  40. upgini/normalizer/phone_normalizer.py +0 -340
  41. upgini-1.1.280a3418.post2.dist-info/RECORD +0 -62
  42. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/WHEEL +0 -0
  43. {upgini-1.1.280a3418.post2.dist-info → upgini-1.2.31a1.dist-info}/licenses/LICENSE +0 -0
upgini/metrics.py CHANGED
@@ -1,16 +1,17 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
7
+ from collections import defaultdict
6
8
  from copy import deepcopy
7
9
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
8
10
 
9
11
  import catboost
10
12
  import numpy as np
11
13
  import pandas as pd
12
- from catboost import CatBoostClassifier, CatBoostRegressor
13
- from lightgbm import LGBMClassifier, LGBMRegressor
14
+ from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
14
15
  from numpy import log1p
15
16
  from pandas.api.types import is_numeric_dtype
16
17
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
210
211
  }
211
212
 
212
213
 
214
+ @dataclass
215
+ class _CrossValResults:
216
+ metric: Optional[float]
217
+ metric_std: Optional[float]
218
+ shap_values: Optional[Dict[str, float]]
219
+
220
+ def get_display_metric(self) -> Optional[str]:
221
+ if self.metric is None:
222
+ return None
223
+ elif self.metric_std is None:
224
+ return f"{self.metric:.3f}"
225
+ else:
226
+ return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
+
228
+
213
229
  class EstimatorWrapper:
214
230
  def __init__(
215
231
  self,
@@ -254,6 +270,7 @@ class EstimatorWrapper:
254
270
  def _prepare_data(
255
271
  self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
256
272
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
273
+ self.logger.info(f"Before preparing data columns: {x.columns.to_list()}")
257
274
  for c in x.columns:
258
275
  if is_numeric_dtype(x[c]):
259
276
  x[c] = x[c].astype(float)
@@ -272,6 +289,10 @@ class EstimatorWrapper:
272
289
  else:
273
290
  x, y = self._remove_empty_target_rows(x, y)
274
291
 
292
+ # Make order of columns idempotent
293
+ x = x[sorted(x.columns)]
294
+
295
+ self.logger.info(f"After preparing data columns: {x.columns.to_list()}")
275
296
  return x, y, groups
276
297
 
277
298
  def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
@@ -287,17 +308,22 @@ class EstimatorWrapper:
287
308
  x, y, _ = self._prepare_data(x, y)
288
309
  return x, y, {}
289
310
 
311
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
    """Hook for per-fold SHAP calculation; this base implementation computes nothing.

    Always returns None. Subclasses that can compute SHAP values for their
    estimator are expected to override this method.
    """
    return None
313
+
290
314
  def cross_val_predict(
291
315
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
292
- ) -> Optional[float]:
316
+ ) -> _CrossValResults:
293
317
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
294
318
 
295
319
  if x.shape[1] == 0:
296
- return None
320
+ return _CrossValResults(metric=None, metric_std=None, shap_values=None)
297
321
 
298
322
  scorer = check_scoring(self.estimator, scoring=self.scorer)
299
323
 
324
+ shap_values_all_folds = defaultdict(list)
300
325
  if baseline_score_column is not None and self.metric_name == "GINI":
326
+ self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
301
327
  metric = roc_auc_score(y, x[baseline_score_column])
302
328
  else:
303
329
  cv_results = cross_validate(
@@ -314,25 +340,68 @@ class EstimatorWrapper:
314
340
  metrics_by_fold = cv_results["test_score"]
315
341
  self.cv_estimators = cv_results["estimator"]
316
342
 
317
- metric = np.mean(metrics_by_fold) * self.multiplier
318
- return self.post_process_metric(metric)
343
+ self.check_fold_metrics(metrics_by_fold)
344
+
345
+ metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
346
+
347
+ splits = self.cv.split(x, y, groups)
348
+
349
+ for estimator, split in zip(self.cv_estimators, splits):
350
+ _, validation_idx = split
351
+ cv_x = x.iloc[validation_idx]
352
+ cv_y = y[validation_idx]
353
+ shaps = self.calculate_shap(cv_x, cv_y, estimator)
354
+ if shaps is not None:
355
+ for feature, shap_value in shaps.items():
356
+ # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
357
+ shap_values_all_folds[feature].extend(shap_value.tolist())
358
+
359
+ if shap_values_all_folds:
360
+ average_shap_values = {
361
+ feature: np.mean(np.array(shaps)) for feature, shaps in shap_values_all_folds.items() if len(shaps) > 0
362
+ }
363
+ if len(average_shap_values) == 0:
364
+ average_shap_values = None
365
+ else:
366
+ average_shap_values = self.process_shap_values(average_shap_values)
367
+ else:
368
+ average_shap_values = None
369
+
370
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
371
+
372
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
    """Hook for post-processing averaged SHAP values; this base implementation is the identity.

    Returns the very same mapping that was passed in, unmodified.
    """
    return shap_values
374
+
375
def check_fold_metrics(self, metrics_by_fold: List[float]):
    """Warn once when fold metrics do not all share the sign of the first fold's metric.

    A zero metric in the first fold counts as non-negative. Nothing is logged
    (and nothing is raised) for an empty sequence of fold metrics.

    Args:
        metrics_by_fold: per-fold metric values (a list or an array).
    """
    # len() rather than truthiness: the value may be a numpy array.
    if len(metrics_by_fold) == 0:
        return

    first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
    # One warning is enough: the message already contains every fold's metric.
    if any(first_metric_sign * metric < 0 for metric in metrics_by_fold[1:]):
        self.logger.warning(f"Sign of metrics differs between folds: {metrics_by_fold}")
319
380
 
320
381
  def post_process_metric(self, metric: float) -> float:
321
382
  if self.metric_name == "GINI":
322
383
  metric = 2 * metric - 1
323
384
  return metric
324
385
 
325
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
386
+ def calculate_metric(
387
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
+ ) -> _CrossValResults:
326
389
  x, y, _ = self._prepare_to_calculate(x, y)
327
390
  if baseline_score_column is not None and self.metric_name == "GINI":
328
- metric = roc_auc_score(y, x[baseline_score_column])
391
+ metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
329
392
  else:
330
393
  metrics = []
331
394
  for est in self.cv_estimators:
332
395
  metrics.append(self.scorer(est, x, y))
333
396
 
334
- metric = np.mean(metrics) * self.multiplier
335
- return self.post_process_metric(metric)
397
+ metric, metric_std = self._calculate_metric_from_folds(metrics)
398
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
+
400
+ def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
+ metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
+ metric = np.mean(metrics_by_fold) * self.multiplier
403
+ metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
+ return metric, metric_std
336
405
 
337
406
  @staticmethod
338
407
  def create(
@@ -346,6 +415,7 @@ class EstimatorWrapper:
346
415
  text_features: Optional[List[str]] = None,
347
416
  add_params: Optional[Dict[str, Any]] = None,
348
417
  groups: Optional[List[str]] = None,
418
+ has_date: Optional[bool] = None,
349
419
  ) -> EstimatorWrapper:
350
420
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
351
421
  kwargs = {
@@ -359,7 +429,8 @@ class EstimatorWrapper:
359
429
  "logger": logger,
360
430
  }
361
431
  if estimator is None:
362
- params = dict()
432
+ params = {}
433
+ params["has_time"] = has_date
363
434
  # if metric_name.upper() in SUPPORTED_CATBOOST_METRICS:
364
435
  # params["eval_metric"] = SUPPORTED_CATBOOST_METRICS[metric_name.upper()]
365
436
  if target_type == ModelTaskType.MULTICLASS:
@@ -390,11 +461,14 @@ class EstimatorWrapper:
390
461
  f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
391
462
  )
392
463
  estimator_copy.set_params(
393
- cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
464
+ # cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
465
+ cat_features=cat_features
394
466
  )
395
467
  estimator = CatBoostWrapper(**kwargs)
396
468
  else:
397
469
  try:
470
+ from lightgbm import LGBMClassifier, LGBMRegressor
471
+
398
472
  if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
399
473
  estimator = LightGBMWrapper(**kwargs)
400
474
  else:
@@ -439,6 +513,7 @@ class CatBoostWrapper(EstimatorWrapper):
439
513
  )
440
514
  self.cat_features = None
441
515
  self.emb_features = None
516
+ self.grouped_embedding_features = None
442
517
  self.exclude_features = []
443
518
 
444
519
  def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
@@ -448,17 +523,16 @@ class CatBoostWrapper(EstimatorWrapper):
448
523
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
449
524
  emb_pattern = r"(.+)_emb\d+"
450
525
  self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
451
- embedding_features = []
452
526
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
453
527
  self.logger.info(
454
528
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
455
529
  f"{self.emb_features}"
456
530
  )
457
- x, embedding_features = self.group_embeddings(x)
458
- params["embedding_features"] = embedding_features
531
+ x, self.grouped_embedding_features = self.group_embeddings(x)
532
+ params["embedding_features"] = self.grouped_embedding_features
459
533
  else:
460
534
  self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
461
- self.emb_features = []
535
+ self.grouped_embedding_features = None
462
536
  else:
463
537
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
464
538
 
@@ -474,15 +548,17 @@ class CatBoostWrapper(EstimatorWrapper):
474
548
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
475
549
 
476
550
  # Find rest categorical features
477
- self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
- x = fill_na_cat_features(x, self.cat_features)
551
+ self.cat_features = _get_cat_features(x, self.text_features, self.grouped_embedding_features)
552
+ # x = fill_na_cat_features(x, self.cat_features)
479
553
  unique_cat_features = []
480
554
  for name in self.cat_features:
481
555
  # Remove constant categorical features
482
556
  if x[name].nunique() > 1:
483
557
  unique_cat_features.append(name)
484
558
  else:
559
+ self.logger.info(f"Drop column {name} on preparing data for fit")
485
560
  x = x.drop(columns=name)
561
+ self.exclude_features.append(name)
486
562
  self.cat_features = unique_cat_features
487
563
  if (
488
564
  hasattr(self.estimator, "get_param")
@@ -510,46 +586,90 @@ class CatBoostWrapper(EstimatorWrapper):
510
586
  emb_name = "__grouped_embeddings"
511
587
  df = df.copy()
512
588
  df[self.emb_features] = df[self.emb_features].fillna(0.0)
513
- df[emb_name] = df[self.emb_features].values.tolist()
589
+ df[emb_name] = pd.Series(df[self.emb_features].values.tolist())
514
590
  df = df.drop(columns=self.emb_features)
515
591
 
516
592
  return df, [emb_name]
517
593
 
594
def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
    """Replace the grouped-embeddings SHAP entry with one entry per original embedding column.

    Each name in ``self.emb_features`` receives the value stored under
    "__grouped_embeddings", and that grouped key is then removed. The mapping is
    modified in place and returned; without the grouped key it is returned as is.
    """
    if "__grouped_embeddings" not in shap_values:
        return shap_values

    grouped_value = shap_values["__grouped_embeddings"]
    shap_values.update({name: grouped_value for name in self.emb_features})
    del shap_values["__grouped_embeddings"]
    return shap_values
600
+
518
601
  def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
519
602
  if self.exclude_features:
520
603
  x = x.drop(columns=self.exclude_features)
521
604
  x, y, params = super()._prepare_to_calculate(x, y)
522
605
  if self.text_features:
523
606
  params["text_features"] = self.text_features
524
- if self.emb_features:
607
+ if self.grouped_embedding_features:
525
608
  x, emb_columns = self.group_embeddings(x)
526
609
  params["embedding_features"] = emb_columns
527
610
  if self.cat_features:
528
- x = fill_na_cat_features(x, self.cat_features)
611
+ # x = fill_na_cat_features(x, self.cat_features)
529
612
  params["cat_features"] = self.cat_features
530
613
 
531
614
  return x, y, params
532
615
 
533
616
  def cross_val_predict(
534
617
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
535
- ) -> Optional[float]:
618
+ ) -> _CrossValResults:
536
619
  try:
537
620
  return super().cross_val_predict(x, y, baseline_score_column)
538
621
  except Exception as e:
539
622
  if "Dictionary size is 0" in e.args[0] and self.text_features:
540
623
  high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
541
- self.logger.warning(
542
- "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
543
- f" text features {high_cardinality_features} and retry"
544
- )
624
+ if len(high_cardinality_features) == 0:
625
+ high_cardinality_features = self.text_features
626
+ self.logger.warning(
627
+ "Calculate metrics has problem with CatBoost text features. High cardinality features not found"
628
+ f". Try to remove all text features {high_cardinality_features} and retry"
629
+ )
630
+ else:
631
+ self.logger.warning(
632
+ "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
633
+ f" text features {high_cardinality_features} and retry"
634
+ )
545
635
  for f in high_cardinality_features:
546
636
  self.text_features.remove(f)
547
637
  self.exclude_features.append(f)
548
- x = x.drop(columns=f)
638
+ x = x.drop(columns=f, errors="ignore")
549
639
  return super().cross_val_predict(x, y, baseline_score_column)
550
640
  else:
551
641
  raise e
552
642
 
643
def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
    """Compute absolute SHAP values per feature for one fitted fold estimator.

    Returns a mapping from ``estimator.feature_names_`` to the absolute SHAP
    values of that feature, or None when anything fails (the failure is logged
    with its traceback).
    """
    try:
        # The Pool tells CatBoost which columns are categorical, text or embedding features.
        pool_kwargs = {
            "cat_features": self.cat_features,
            "text_features": self.text_features,
            "embedding_features": self.grouped_embedding_features,
        }
        fold_data = Pool(x, y, **pool_kwargs)

        raw_shaps = estimator.get_feature_importance(data=fold_data, type="ShapValues")

        # Drop the trailing base-value column, then collect one flat array per remaining column.
        if self.target_type == ModelTaskType.MULTICLASS:
            trimmed = raw_shaps[:, :, :-1]
            per_feature = [trimmed[:, :, idx].flatten() for idx in range(trimmed.shape[2])]
        else:
            trimmed = raw_shaps[:, :-1]
            per_feature = [trimmed[:, idx].flatten() for idx in range(trimmed.shape[1])]

        return dict(zip(estimator.feature_names_, np.abs(per_feature)))

    except Exception:
        # Best effort: SHAP values are optional, so a failure must not break the metric calculation.
        self.logger.exception("Failed to recalculate new SHAP values")
        return None
672
+
553
673
 
554
674
  class LightGBMWrapper(EstimatorWrapper):
555
675
  def __init__(
@@ -653,14 +773,24 @@ class OtherEstimatorWrapper(EstimatorWrapper):
653
773
 
654
774
 
655
775
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
656
- if isinstance(scoring, str) and scoring is not None:
776
+ if scoring is None:
777
+ return
778
+
779
+ if isinstance(scoring, str):
657
780
  _get_scorer_by_name(scoring)
658
- elif isinstance(scoring, Callable):
659
- spec = inspect.getfullargspec(scoring)
660
- if len(spec.args) < 3:
661
- raise ValidationError(
662
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
663
- )
781
+ return
782
+
783
+ if not isinstance(scoring, Callable):
784
+ raise ValidationError(
785
+ f"Invalid scoring argument passed {scoring}. It should be string with scoring name or function"
786
+ " that accepts 3 input arguments: estimator, x, y"
787
+ )
788
+
789
+ spec = inspect.getfullargspec(scoring)
790
+ if len(spec.args) < 3:
791
+ raise ValidationError(
792
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
793
+ )
664
794
 
665
795
 
666
796
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
@@ -0,0 +1,197 @@
1
+ import hashlib
2
+ from logging import Logger, getLogger
3
+ from typing import Dict, List, Tuple
4
+
5
+ import numpy as np
6
+ import pandas as pd
7
+ from pandas.api.types import is_bool_dtype as is_bool
8
+ from pandas.api.types import is_datetime64_any_dtype as is_datetime
9
+ from pandas.api.types import (
10
+ is_float_dtype,
11
+ is_numeric_dtype,
12
+ is_object_dtype,
13
+ is_string_dtype,
14
+ )
15
+
16
+ from upgini.errors import ValidationError
17
+ from upgini.metadata import (
18
+ ENTITY_SYSTEM_RECORD_ID,
19
+ EVAL_SET_INDEX,
20
+ SEARCH_KEY_UNNEST,
21
+ SYSTEM_RECORD_ID,
22
+ TARGET,
23
+ SearchKey,
24
+ )
25
+ from upgini.resource_bundle import ResourceBundle, get_custom_bundle
26
+ from upgini.utils import find_numbers_with_decimal_comma
27
+ from upgini.utils.datetime_utils import DateTimeSearchKeyConverter
28
+ from upgini.utils.phone_utils import PhoneSearchKeyConverter
29
+
30
+
31
class Normalizer:
    """Rename the columns of a DataFrame and coerce its values into supported types.

    After ``normalize`` the instance exposes ``columns_renaming`` (new column name
    -> original name as string), ``search_keys`` (keyed by the new column names)
    and ``removed_features`` (datetime/period feature columns that were dropped).
    """

    # Longest string value kept untouched; longer values are cut to this many characters.
    MAX_STRING_FEATURE_LENGTH = 24573

    def __init__(
        self,
        bundle: ResourceBundle = None,
        logger: Logger = None,
    ):
        """Create a normalizer.

        Args:
            bundle: message bundle for error texts; ``get_custom_bundle()`` is used when omitted.
            logger: logger for warnings; the root logger is used when omitted.
        """
        self.bundle = bundle or get_custom_bundle()
        self.logger = logger or getLogger()
        self.columns_renaming = {}
        self.search_keys = {}
        self.generated_features = []
        self.removed_features = []

    def normalize(
        self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
    ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
        """Run every normalization step on a copy of ``df``.

        Args:
            df: input data; it is copied, the caller's frame is not modified.
            search_keys: column name -> search key type; copied, then re-keyed to the new names.
            generated_features: columns that must keep their names; copied.

        Returns:
            The normalized frame, the re-keyed search keys and the generated features.

        Raises:
            ValidationError: if a column name is empty.
        """
        self.search_keys = search_keys.copy()
        self.generated_features = generated_features.copy()

        df = df.copy()
        df = self._rename_columns(df)
        df = self._remove_dates_from_features(df)
        df = self._cut_too_long_string_values(df)
        df = self._convert_bools(df)
        df = self._convert_float16(df)
        df = self._correct_decimal_comma(df)
        df = self._convert_phone_numbers(df)
        df = self.__convert_features_types(df)

        return df, self.search_keys, self.generated_features

    def _rename_columns(self, df: pd.DataFrame):
        """Give every user column a lowercase ``[a-z0-9_]`` name with a hash suffix.

        System columns and generated features keep their names. ``search_keys`` is
        re-keyed to the new names and ``columns_renaming`` records new -> original.

        Raises:
            ValidationError: if a column name is an empty string.
        """
        # Built once instead of on every iteration; a list keeps the original `==` membership semantics.
        preserved_columns = [
            TARGET,
            EVAL_SET_INDEX,
            SYSTEM_RECORD_ID,
            ENTITY_SYSTEM_RECORD_ID,
            SEARCH_KEY_UNNEST,
            DateTimeSearchKeyConverter.DATETIME_COL,
        ] + self.generated_features

        new_columns = []
        # Mirrors new_columns for O(1) duplicate detection.
        seen_columns = set()
        dup_counter = 0
        for column in df.columns:
            if column in preserved_columns:
                self.columns_renaming[column] = column
                new_columns.append(column)
                seen_columns.add(column)
                continue

            new_column = str(column)
            if len(new_column) == 0:
                raise ValidationError(self.bundle.get("dataset_empty_column_names"))

            # The suffix is the hash of the full original name, taken before truncation.
            suffix = hashlib.sha256(new_column.encode()).hexdigest()[:6]

            # db limit for column length
            new_column = new_column[:250]

            # make column name unique relative to server features
            new_column = f"{new_column}_{suffix}".lower()

            # if column starts with non alphabetic symbol then add "a" to the beginning of string
            if not "a" <= new_column[0] <= "z":
                new_column = "a" + new_column

            # replace unsupported characters to "_"
            new_column = "".join(c if ("a" <= c <= "z" or "0" <= c <= "9") else "_" for c in new_column)

            if new_column in seen_columns:
                new_column = f"{new_column}_{dup_counter}"
                dup_counter += 1
            new_columns.append(new_column)
            seen_columns.add(new_column)

            if new_column != column and column in self.search_keys:
                self.search_keys[new_column] = self.search_keys.pop(column)
            self.columns_renaming[new_column] = str(column)

        df.columns = new_columns
        return df

    def _get_features(self, df: pd.DataFrame) -> List[str]:
        """Return the sorted columns that are neither search keys nor system columns."""
        system_columns = [ENTITY_SYSTEM_RECORD_ID, EVAL_SET_INDEX, SEARCH_KEY_UNNEST, SYSTEM_RECORD_ID, TARGET]
        features = set(df.columns) - set(self.search_keys.keys()) - set(system_columns)
        return sorted(features)

    def _remove_dates_from_features(self, df: pd.DataFrame):
        """Drop feature columns with datetime or period dtype and record them in ``removed_features``."""
        date_features = [
            f for f in self._get_features(df) if is_datetime(df[f]) or isinstance(df[f].dtype, pd.PeriodDtype)
        ]
        if date_features:
            self.removed_features.extend(date_features)
            df.drop(columns=date_features, inplace=True)

        return df

    def _cut_too_long_string_values(self, df: pd.DataFrame):
        """Cut string values longer than ``MAX_STRING_FEATURE_LENGTH`` characters.

        Only the values that exceed the limit are replaced by their truncated string
        form. Every other value in the column, missing values included, is left
        untouched, so a truncation never turns NaN/None into the text "nan"/"None".
        """
        for col in df.columns:
            if is_string_dtype(df[col]) or is_object_dtype(df[col]):
                too_long = df[col].astype("str").str.len() > self.MAX_STRING_FEATURE_LENGTH
                if too_long.any():
                    df.loc[too_long, col] = (
                        df.loc[too_long, col].astype("str").str.slice(stop=self.MAX_STRING_FEATURE_LENGTH)
                    )

        return df

    @staticmethod
    def _convert_bools(df: pd.DataFrame):
        """Convert bool columns to string ("True" / "False")."""
        for col in df.columns:
            if is_bool(df[col]):
                df[col] = df[col].astype("str")
        return df

    @staticmethod
    def _convert_float16(df: pd.DataFrame):
        """Convert every float column (float16 included) to float64."""
        for col in df.columns:
            if is_float_dtype(df[col]):
                df[col] = df[col].astype("float64")
        return df

    def _correct_decimal_comma(self, df: pd.DataFrame):
        """Convert string columns holding numbers with a decimal comma into float64."""
        columns_to_fix = find_numbers_with_decimal_comma(df)
        if len(columns_to_fix) > 0:
            self.logger.warning(f"Convert strings with decimal comma to float: {columns_to_fix}")
            for col in columns_to_fix:
                df[col] = df[col].astype("string").str.replace(",", ".", regex=False).astype(np.float64)
        return df

    def _convert_phone_numbers(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run ``PhoneSearchKeyConverter`` on every PHONE search key column.

        The COUNTRY search key column, if one is found, is passed to the converter.
        """
        maybe_country_col = SearchKey.find_key(self.search_keys, SearchKey.COUNTRY)
        for phone_col in SearchKey.find_all_keys(self.search_keys, SearchKey.PHONE):
            converter = PhoneSearchKeyConverter(phone_col, maybe_country_col)
            df = converter.convert(df)
        return df

    def __convert_features_types(self, df: pd.DataFrame):
        """Cast every non-numeric feature column to the pandas "string" dtype."""
        for f in self._get_features(df):
            if not is_numeric_dtype(df[f]):
                df[f] = df[f].astype("string")
        return df