upgini 1.1.262a3250.post4__py3-none-any.whl → 1.1.280a3418.post2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (49) hide show
  1. upgini/__about__.py +1 -0
  2. upgini/ads.py +6 -2
  3. upgini/ads_management/ads_manager.py +4 -2
  4. upgini/autofe/all_operands.py +16 -4
  5. upgini/autofe/binary.py +2 -1
  6. upgini/autofe/date.py +74 -7
  7. upgini/autofe/feature.py +1 -1
  8. upgini/autofe/groupby.py +3 -1
  9. upgini/autofe/operand.py +4 -3
  10. upgini/autofe/unary.py +20 -1
  11. upgini/autofe/vector.py +2 -0
  12. upgini/data_source/data_source_publisher.py +14 -4
  13. upgini/dataset.py +8 -7
  14. upgini/errors.py +1 -1
  15. upgini/features_enricher.py +156 -63
  16. upgini/http.py +11 -10
  17. upgini/mdc/__init__.py +1 -3
  18. upgini/mdc/context.py +4 -6
  19. upgini/metadata.py +3 -0
  20. upgini/metrics.py +160 -96
  21. upgini/normalizer/phone_normalizer.py +2 -2
  22. upgini/resource_bundle/__init__.py +5 -5
  23. upgini/resource_bundle/strings.properties +9 -4
  24. upgini/sampler/base.py +1 -4
  25. upgini/sampler/random_under_sampler.py +2 -5
  26. upgini/search_task.py +4 -4
  27. upgini/spinner.py +1 -1
  28. upgini/utils/__init__.py +3 -2
  29. upgini/utils/base_search_key_detector.py +2 -2
  30. upgini/utils/blocked_time_series.py +4 -2
  31. upgini/utils/country_utils.py +2 -2
  32. upgini/utils/custom_loss_utils.py +3 -2
  33. upgini/utils/cv_utils.py +2 -2
  34. upgini/utils/datetime_utils.py +75 -18
  35. upgini/utils/deduplicate_utils.py +61 -18
  36. upgini/utils/email_utils.py +3 -3
  37. upgini/utils/fallback_progress_bar.py +1 -1
  38. upgini/utils/features_validator.py +2 -1
  39. upgini/utils/progress_bar.py +1 -1
  40. upgini/utils/sklearn_ext.py +15 -15
  41. upgini/utils/target_utils.py +21 -7
  42. upgini/utils/track_info.py +27 -15
  43. upgini/version_validator.py +2 -2
  44. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/METADATA +21 -23
  45. upgini-1.1.280a3418.post2.dist-info/RECORD +62 -0
  46. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info}/WHEEL +1 -2
  47. upgini-1.1.262a3250.post4.dist-info/RECORD +0 -62
  48. upgini-1.1.262a3250.post4.dist-info/top_level.txt +0 -1
  49. {upgini-1.1.262a3250.post4.dist-info → upgini-1.1.280a3418.post2.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py CHANGED
@@ -1,17 +1,21 @@
1
+ from __future__ import annotations
2
+
3
+ import inspect
1
4
  import logging
2
5
  import re
3
6
  from copy import deepcopy
4
7
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
8
 
9
+ import catboost
6
10
  import numpy as np
7
11
  import pandas as pd
8
12
  from catboost import CatBoostClassifier, CatBoostRegressor
9
- import catboost
10
13
  from lightgbm import LGBMClassifier, LGBMRegressor
11
14
  from numpy import log1p
12
15
  from pandas.api.types import is_numeric_dtype
13
16
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
14
17
 
18
+ from upgini.utils.features_validator import FeaturesValidator
15
19
  from upgini.utils.sklearn_ext import cross_validate
16
20
 
17
21
  try:
@@ -123,7 +127,7 @@ NA_REPLACEMENT = "NA"
123
127
 
124
128
  SUPPORTED_CATBOOST_METRICS = {
125
129
  s.upper(): s
126
- for s in {
130
+ for s in (
127
131
  "Logloss",
128
132
  "CrossEntropy",
129
133
  "CtrFactor",
@@ -202,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
202
206
  "MultiLogloss",
203
207
  "MultiCrossEntropy",
204
208
  "Combination",
205
- }
209
+ )
206
210
  }
207
211
 
208
212
 
@@ -234,71 +238,71 @@ class EstimatorWrapper:
234
238
  self.text_features = text_features
235
239
  self.logger = logger or logging.getLogger()
236
240
 
237
- def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
238
- X, y, _, fit_params = self._prepare_to_fit(X, y)
241
+ def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
242
+ x, y, _, fit_params = self._prepare_to_fit(x, y)
239
243
  kwargs.update(fit_params)
240
- self.estimator.fit(X, y, **kwargs)
244
+ self.estimator.fit(x, y, **kwargs)
241
245
  return self
242
246
 
243
247
  def predict(self, **kwargs):
244
248
  return self.estimator.predict(**kwargs)
245
249
 
246
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
247
- X, y, groups = self._prepare_data(X, y, groups=self.groups)
248
- return X, y, groups, {}
250
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
251
+ x, y, groups = self._prepare_data(x, y, groups=self.groups)
252
+ return x, y, groups, {}
249
253
 
250
254
  def _prepare_data(
251
- self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
255
+ self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
252
256
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
253
- for c in X.columns:
254
- if is_numeric_dtype(X[c]):
255
- X[c] = X[c].astype(float)
257
+ for c in x.columns:
258
+ if is_numeric_dtype(x[c]):
259
+ x[c] = x[c].astype(float)
256
260
  else:
257
- X[c] = X[c].astype(str)
261
+ x[c] = x[c].astype(str)
258
262
 
259
263
  if not isinstance(y, pd.Series):
260
264
  raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
261
265
 
262
266
  if groups is not None:
263
- X = X.copy()
264
- X["__groups"] = groups
265
- X, y = self._remove_empty_target_rows(X, y)
266
- groups = X["__groups"]
267
- X = X.drop(columns="__groups")
267
+ x = x.copy()
268
+ x["__groups"] = groups
269
+ x, y = self._remove_empty_target_rows(x, y)
270
+ groups = x["__groups"]
271
+ x = x.drop(columns="__groups")
268
272
  else:
269
- X, y = self._remove_empty_target_rows(X, y)
273
+ x, y = self._remove_empty_target_rows(x, y)
270
274
 
271
- return X, y, groups
275
+ return x, y, groups
272
276
 
273
- def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
274
- joined = pd.concat([X, y], axis=1)
277
+ def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
278
+ joined = pd.concat([x, y], axis=1)
275
279
  joined = joined[joined[y.name].notna()]
276
280
  joined = joined.reset_index(drop=True)
277
- X = joined.drop(columns=y.name)
281
+ x = joined.drop(columns=y.name)
278
282
  y = np.array(list(joined[y.name].values))
279
283
 
280
- return X, y
284
+ return x, y
281
285
 
282
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
283
- X, y, _ = self._prepare_data(X, y)
284
- return X, y, {}
286
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
287
+ x, y, _ = self._prepare_data(x, y)
288
+ return x, y, {}
285
289
 
286
290
  def cross_val_predict(
287
- self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
291
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
288
292
  ) -> Optional[float]:
289
- X, y, groups, fit_params = self._prepare_to_fit(X, y)
293
+ x, y, groups, fit_params = self._prepare_to_fit(x, y)
290
294
 
291
- if X.shape[1] == 0:
295
+ if x.shape[1] == 0:
292
296
  return None
293
297
 
294
298
  scorer = check_scoring(self.estimator, scoring=self.scorer)
295
299
 
296
300
  if baseline_score_column is not None and self.metric_name == "GINI":
297
- metric = roc_auc_score(y, X[baseline_score_column])
301
+ metric = roc_auc_score(y, x[baseline_score_column])
298
302
  else:
299
303
  cv_results = cross_validate(
300
304
  estimator=self.estimator,
301
- X=X,
305
+ x=x,
302
306
  y=y,
303
307
  scoring=scorer,
304
308
  cv=self.cv,
@@ -318,14 +322,14 @@ class EstimatorWrapper:
318
322
  metric = 2 * metric - 1
319
323
  return metric
320
324
 
321
- def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
322
- X, y, _ = self._prepare_to_calculate(X, y)
325
+ def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
326
+ x, y, _ = self._prepare_to_calculate(x, y)
323
327
  if baseline_score_column is not None and self.metric_name == "GINI":
324
- metric = roc_auc_score(y, X[baseline_score_column])
328
+ metric = roc_auc_score(y, x[baseline_score_column])
325
329
  else:
326
330
  metrics = []
327
331
  for est in self.cv_estimators:
328
- metrics.append(self.scorer(est, X, y))
332
+ metrics.append(self.scorer(est, x, y))
329
333
 
330
334
  metric = np.mean(metrics) * self.multiplier
331
335
  return self.post_process_metric(metric)
@@ -336,13 +340,13 @@ class EstimatorWrapper:
336
340
  logger: logging.Logger,
337
341
  target_type: ModelTaskType,
338
342
  cv: BaseCrossValidator,
339
- X: pd.DataFrame,
343
+ x: pd.DataFrame,
340
344
  scoring: Union[Callable, str, None] = None,
341
345
  cat_features: Optional[List[str]] = None,
342
346
  text_features: Optional[List[str]] = None,
343
347
  add_params: Optional[Dict[str, Any]] = None,
344
348
  groups: Optional[List[str]] = None,
345
- ) -> "EstimatorWrapper":
349
+ ) -> EstimatorWrapper:
346
350
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
347
351
  kwargs = {
348
352
  "scorer": scorer,
@@ -352,6 +356,7 @@ class EstimatorWrapper:
352
356
  "target_type": target_type,
353
357
  "groups": groups,
354
358
  "text_features": text_features,
359
+ "logger": logger,
355
360
  }
356
361
  if estimator is None:
357
362
  params = dict()
@@ -377,15 +382,20 @@ class EstimatorWrapper:
377
382
  else:
378
383
  estimator_copy = deepcopy(estimator)
379
384
  kwargs["estimator"] = estimator_copy
380
- if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
385
+ if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
381
386
  if cat_features is not None:
387
+ for cat_feature in cat_features:
388
+ if cat_feature not in x.columns:
389
+ logger.error(
390
+ f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
391
+ )
382
392
  estimator_copy.set_params(
383
- cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
393
+ cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
384
394
  )
385
395
  estimator = CatBoostWrapper(**kwargs)
386
396
  else:
387
397
  try:
388
- if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
398
+ if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
389
399
  estimator = LightGBMWrapper(**kwargs)
390
400
  else:
391
401
  logger.warning(
@@ -414,32 +424,40 @@ class CatBoostWrapper(EstimatorWrapper):
414
424
  target_type: ModelTaskType,
415
425
  groups: Optional[List[str]] = None,
416
426
  text_features: Optional[List[str]] = None,
427
+ logger: Optional[logging.Logger] = None,
417
428
  ):
418
429
  super(CatBoostWrapper, self).__init__(
419
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
430
+ estimator,
431
+ scorer,
432
+ metric_name,
433
+ multiplier,
434
+ cv,
435
+ target_type,
436
+ groups=groups,
437
+ text_features=text_features,
438
+ logger=logger,
420
439
  )
421
440
  self.cat_features = None
422
441
  self.emb_features = None
442
+ self.exclude_features = []
423
443
 
424
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
425
- X, y, groups, params = super()._prepare_to_fit(X, y)
444
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
445
+ x, y, groups, params = super()._prepare_to_fit(x, y)
426
446
 
427
447
  # Find embeddings
428
448
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
429
449
  emb_pattern = r"(.+)_emb\d+"
430
- self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
450
+ self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
431
451
  embedding_features = []
432
452
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
433
453
  self.logger.info(
434
454
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
435
455
  f"{self.emb_features}"
436
456
  )
437
- X, embedding_features = self.group_embeddings(X)
457
+ x, embedding_features = self.group_embeddings(x)
438
458
  params["embedding_features"] = embedding_features
439
459
  else:
440
- self.logger.info(
441
- f"Embedding features count less than 3, so use them separately: {self.emb_features}"
442
- )
460
+ self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
443
461
  self.emb_features = []
444
462
  else:
445
463
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
@@ -448,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
448
466
  if hasattr(CatBoostClassifier, "get_text_feature_indices"):
449
467
  if self.text_features is not None:
450
468
  self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
451
- self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
469
+ self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
452
470
  self.logger.info(f"Rest text features after checks: {self.text_features}")
453
471
  params["text_features"] = self.text_features
454
472
  else:
@@ -456,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
456
474
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
457
475
 
458
476
  # Find rest categorical features
459
- self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
460
- X = fill_na_cat_features(X, self.cat_features)
477
+ self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
+ x = fill_na_cat_features(x, self.cat_features)
461
479
  unique_cat_features = []
462
480
  for name in self.cat_features:
463
481
  # Remove constant categorical features
464
- if X[name].nunique() > 1:
482
+ if x[name].nunique() > 1:
465
483
  unique_cat_features.append(name)
466
484
  else:
467
- X = X.drop(columns=name)
485
+ x = x.drop(columns=name)
468
486
  self.cat_features = unique_cat_features
469
487
  if (
470
488
  hasattr(self.estimator, "get_param")
@@ -473,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
473
491
  ):
474
492
  estimator_cat_features = self.estimator.get_param("cat_features")
475
493
  if all([isinstance(c, int) for c in estimator_cat_features]):
476
- cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
494
+ cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
477
495
  cat_features_idx.update(estimator_cat_features)
478
- self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
496
+ self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
479
497
  elif all([isinstance(c, str) for c in estimator_cat_features]):
480
498
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
481
499
  else:
@@ -486,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
486
504
  self.logger.info(f"Selected categorical features: {self.cat_features}")
487
505
  params["cat_features"] = self.cat_features
488
506
 
489
- return X, y, groups, params
507
+ return x, y, groups, params
490
508
 
491
509
  def group_embeddings(self, df: pd.DataFrame):
492
510
  emb_name = "__grouped_embeddings"
@@ -497,18 +515,40 @@ class CatBoostWrapper(EstimatorWrapper):
497
515
 
498
516
  return df, [emb_name]
499
517
 
500
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
501
- X, y, params = super()._prepare_to_calculate(X, y)
518
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
519
+ if self.exclude_features:
520
+ x = x.drop(columns=self.exclude_features)
521
+ x, y, params = super()._prepare_to_calculate(x, y)
502
522
  if self.text_features:
503
523
  params["text_features"] = self.text_features
504
524
  if self.emb_features:
505
- X, emb_columns = self.group_embeddings(X)
525
+ x, emb_columns = self.group_embeddings(x)
506
526
  params["embedding_features"] = emb_columns
507
527
  if self.cat_features:
508
- X = fill_na_cat_features(X, self.cat_features)
528
+ x = fill_na_cat_features(x, self.cat_features)
509
529
  params["cat_features"] = self.cat_features
510
530
 
511
- return X, y, params
531
+ return x, y, params
532
+
533
+ def cross_val_predict(
534
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
535
+ ) -> Optional[float]:
536
+ try:
537
+ return super().cross_val_predict(x, y, baseline_score_column)
538
+ except Exception as e:
539
+ if "Dictionary size is 0" in e.args[0] and self.text_features:
540
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
541
+ self.logger.warning(
542
+ "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
543
+ f" text features {high_cardinality_features} and retry"
544
+ )
545
+ for f in high_cardinality_features:
546
+ self.text_features.remove(f)
547
+ self.exclude_features.append(f)
548
+ x = x.drop(columns=f)
549
+ return super().cross_val_predict(x, y, baseline_score_column)
550
+ else:
551
+ raise e
512
552
 
513
553
 
514
554
  class LightGBMWrapper(EstimatorWrapper):
@@ -522,32 +562,41 @@ class LightGBMWrapper(EstimatorWrapper):
522
562
  target_type: ModelTaskType,
523
563
  groups: Optional[List[str]] = None,
524
564
  text_features: Optional[List[str]] = None,
565
+ logger: Optional[logging.Logger] = None,
525
566
  ):
526
567
  super(LightGBMWrapper, self).__init__(
527
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
568
+ estimator,
569
+ scorer,
570
+ metric_name,
571
+ multiplier,
572
+ cv,
573
+ target_type,
574
+ groups=groups,
575
+ text_features=text_features,
576
+ logger=logger,
528
577
  )
529
578
  self.cat_features = None
530
579
 
531
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
532
- X, y, groups, params = super()._prepare_to_fit(X, y)
533
- self.cat_features = _get_cat_features(X)
534
- X = fill_na_cat_features(X, self.cat_features)
580
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
581
+ x, y, groups, params = super()._prepare_to_fit(x, y)
582
+ self.cat_features = _get_cat_features(x)
583
+ x = fill_na_cat_features(x, self.cat_features)
535
584
  for feature in self.cat_features:
536
- X[feature] = X[feature].astype("category").cat.codes
585
+ x[feature] = x[feature].astype("category").cat.codes
537
586
  if not is_numeric_dtype(y):
538
587
  y = correct_string_target(y)
539
588
 
540
- return X, y, groups, params
589
+ return x, y, groups, params
541
590
 
542
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
543
- X, y, params = super()._prepare_to_calculate(X, y)
591
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
592
+ x, y, params = super()._prepare_to_calculate(x, y)
544
593
  if self.cat_features is not None:
545
- X = fill_na_cat_features(X, self.cat_features)
594
+ x = fill_na_cat_features(x, self.cat_features)
546
595
  for feature in self.cat_features:
547
- X[feature] = X[feature].astype("category").cat.codes
596
+ x[feature] = x[feature].astype("category").cat.codes
548
597
  if not is_numeric_dtype(y):
549
598
  y = correct_string_target(y)
550
- return X, y, params
599
+ return x, y, params
551
600
 
552
601
 
553
602
  class OtherEstimatorWrapper(EstimatorWrapper):
@@ -561,54 +610,69 @@ class OtherEstimatorWrapper(EstimatorWrapper):
561
610
  target_type: ModelTaskType,
562
611
  groups: Optional[List[str]] = None,
563
612
  text_features: Optional[List[str]] = None,
613
+ logger: Optional[logging.Logger] = None,
564
614
  ):
565
615
  super(OtherEstimatorWrapper, self).__init__(
566
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
616
+ estimator,
617
+ scorer,
618
+ metric_name,
619
+ multiplier,
620
+ cv,
621
+ target_type,
622
+ groups=groups,
623
+ text_features=text_features,
624
+ logger=logger,
567
625
  )
568
626
  self.cat_features = None
569
627
 
570
- def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
571
- X, y, groups, params = super()._prepare_to_fit(X, y)
572
- self.cat_features = _get_cat_features(X)
573
- num_features = [col for col in X.columns if col not in self.cat_features]
574
- X[num_features] = X[num_features].fillna(-999)
575
- X = fill_na_cat_features(X, self.cat_features)
628
+ def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
629
+ x, y, groups, params = super()._prepare_to_fit(x, y)
630
+ self.cat_features = _get_cat_features(x)
631
+ num_features = [col for col in x.columns if col not in self.cat_features]
632
+ x[num_features] = x[num_features].fillna(-999)
633
+ x = fill_na_cat_features(x, self.cat_features)
576
634
  # TODO use one-hot encoding if cardinality is less 50
577
635
  for feature in self.cat_features:
578
- X[feature] = X[feature].astype("category").cat.codes
636
+ x[feature] = x[feature].astype("category").cat.codes
579
637
  if not is_numeric_dtype(y):
580
638
  y = correct_string_target(y)
581
- return X, y, groups, params
639
+ return x, y, groups, params
582
640
 
583
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
584
- X, y, params = super()._prepare_to_calculate(X, y)
641
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
642
+ x, y, params = super()._prepare_to_calculate(x, y)
585
643
  if self.cat_features is not None:
586
- num_features = [col for col in X.columns if col not in self.cat_features]
587
- X[num_features] = X[num_features].fillna(-999)
588
- X = fill_na_cat_features(X, self.cat_features)
644
+ num_features = [col for col in x.columns if col not in self.cat_features]
645
+ x[num_features] = x[num_features].fillna(-999)
646
+ x = fill_na_cat_features(x, self.cat_features)
589
647
  # TODO use one-hot encoding if cardinality is less 50
590
648
  for feature in self.cat_features:
591
- X[feature] = X[feature].astype("category").cat.codes
649
+ x[feature] = x[feature].astype("category").cat.codes
592
650
  if not is_numeric_dtype(y):
593
651
  y = correct_string_target(y)
594
- return X, y, params
652
+ return x, y, params
595
653
 
596
654
 
597
655
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
598
656
  if isinstance(scoring, str) and scoring is not None:
599
657
  _get_scorer_by_name(scoring)
658
+ elif isinstance(scoring, Callable):
659
+ spec = inspect.getfullargspec(scoring)
660
+ if len(spec.args) < 3:
661
+ raise ValidationError(
662
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
663
+ )
600
664
 
601
665
 
602
666
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
603
667
  metric_name = scoring
604
668
  multiplier = 1
605
- if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
669
+ if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
606
670
  scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
607
671
  multiplier = -1
608
- elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
672
+ elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
609
673
  scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
610
674
  multiplier = -1
611
- elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
675
+ elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
612
676
  scoring = get_scorer("neg_root_mean_squared_error")
613
677
  multiplier = -1
614
678
  elif scoring in available_scorers:
@@ -660,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
660
724
 
661
725
 
662
726
  def _get_cat_features(
663
- X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
727
+ x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
664
728
  ) -> List[str]:
665
729
  text_features = text_features or []
666
730
  emb_features = emb_features or []
667
731
  exclude_features = text_features + emb_features
668
- return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
732
+ return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
669
733
 
670
734
 
671
735
  def _get_add_params(input_params, add_params):
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
@@ -44,7 +44,7 @@ class PhoneNormalizer:
44
44
  Method will remove all non numeric chars from string and convert it to int.
45
45
  None will be set for phone numbers that couldn't be converted to int
46
46
  """
47
- if is_string_dtype(self.df[self.phone_column_name]):
47
+ if is_string_dtype(self.df[self.phone_column_name]) or is_object_dtype(self.df[self.phone_column_name]):
48
48
  convert_func = self.phone_str_to_int_safe
49
49
  elif is_float_dtype(self.df[self.phone_column_name]):
50
50
  convert_func = self.phone_float_to_int_safe
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
17
17
  __email__ = "felix.zenk@web.de"
18
18
 
19
19
 
20
- class _Parser(object):
20
+ class _Parser:
21
21
  """
22
22
  A parser for the .properties file format.
23
23
  """
@@ -49,7 +49,7 @@ class _Parser(object):
49
49
  return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
50
50
 
51
51
  # I/O read
52
- with open(file_path, mode="r", encoding="utf-8") as f:
52
+ with open(file_path, encoding="utf-8") as f:
53
53
  lines = f.readlines()
54
54
 
55
55
  # parse
@@ -83,7 +83,7 @@ class _Parser(object):
83
83
  return mapping
84
84
 
85
85
 
86
- class ResourceBundle(object):
86
+ class ResourceBundle:
87
87
  """
88
88
  A ResourceBundle manages internationalization of string resources
89
89
  """
@@ -199,7 +199,7 @@ class ResourceBundle(object):
199
199
  raise NotInResourceBundleError(self.name, item)
200
200
 
201
201
 
202
- def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
202
+ def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
203
203
  """
204
204
  Return a new :class:`ResourceBundle` after parsing the locale
205
205
 
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
224
224
  custom_bundles = dict()
225
225
 
226
226
 
227
- def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
227
+ def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
228
228
  global custom_bundles
229
229
  if custom_cfg is not None:
230
230
  custom_bundle = custom_bundles.get(custom_cfg)
@@ -38,6 +38,7 @@ loss_selection_warn=\nWARNING: Loss `{0}` is not supported for feature selection
38
38
  loss_calc_metrics_warn=\nWARNING: Loss `{0}` is not supported for metrics calculation with {1}
39
39
  multivariate_timeseries_detected=\nWARNING: Multivariate TimeSeries detected. Blocked time series cross-validation split selected.\nMore details: https://github.com/upgini/upgini#-time-series-prediction-support
40
40
  group_k_fold_in_classification=\nWARNING: Using group K-fold cross-validation split for classification task.
41
+ current_date_added=\nWARNING: No date/datetime column was detected in X to be used as a search key. The current date will be used to match the latest version of data sources
41
42
 
42
43
  # Errors
43
44
  failed_search_by_task_id=Failed to retrieve the specified search results
@@ -111,6 +112,9 @@ x_is_empty=X is empty
111
112
  y_is_empty=y is empty
112
113
  x_contains_reserved_column_name=Column name {} is reserved. Please rename column and try again
113
114
  missing_generate_feature=\nWARNING: Feature {} specified in `generate_features` is not present in input columns: {}
115
+ x_unstable_by_date=\nWARNING: Your training sample is unstable in number of rows per date. It is recommended to redesign the training sample
116
+ train_unstable_target=\nWARNING: Your training sample contains an unstable target event, PSI = {}. This will lead to unstable scoring on deferred samples. It is recommended to redesign the training sample
117
+ eval_unstable_target=\nWARNING: Your training and evaluation samples have a difference in target distribution. PSI = {}. The results will be unstable. It is recommended to redesign the training and evaluation samples
114
118
  # eval set validation
115
119
  unsupported_type_eval_set=Unsupported type of eval_set: {}. It should be list of tuples with two elements: X and y
116
120
  eval_set_invalid_tuple_size=eval_set contains a tuple of size {}. It should contain only pairs of X and y
@@ -145,7 +149,8 @@ dataset_too_many_rows_registered=X rows limit for transform is {}. Please sample
145
149
  dataset_empty_column_names=Some column names are empty. Add names please
146
150
  dataset_full_duplicates=\nWARNING: {:.5f}% of the rows are fully duplicated
147
151
  dataset_diff_target_duplicates=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates with different y values. These rows will be deleted as incorrect\nIncorrect row indexes: {}
148
- dataset_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X and eval_set are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
152
+ dataset_train_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
153
+ dataset_eval_diff_target_duplicates_fintech=\nWARNING: {:.4f}% of rows ({}) in eval{} X are duplicates, not taking into consideration dates, IP addresses and features from the training set, but have different y values. These rows have been removed to optimize search results.\nRemoved row indexes: {}
149
154
  dataset_drop_old_dates=\nWARNING: We don't have data before '2000-01-01' and removed all earlier records from the search dataset
150
155
  dataset_all_dates_old=There is empty train dataset after removing data before '2000-01-01'
151
156
  dataset_invalid_target_type=Unexpected dtype of target for binary task type: {}. Expected int or bool
@@ -154,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
154
159
  dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
155
160
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
156
161
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
157
- dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
162
+ dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
158
163
  dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
159
164
  dataset_date_features=\nWARNING: Columns {} are of datetime or period type but not used as a search key, removed from X
160
165
  dataset_too_many_features=Too many features. Maximum number of features is {}
@@ -196,10 +201,10 @@ email_detected=Emails detected in column `{}`. It will be used as a search key\n
196
201
  email_detected_not_registered=Emails detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
197
202
  phone_detected=Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
198
203
  phone_detected_not_registered=\nWARNING: Phone numbers detected in column `{}`. It can be used only with api_key from profile.upgini.com\nSee docs to turn off the automatic detection: https://github.com/upgini/upgini/blob/main/README.md#turn-off-autodetection-for-search-key-columns
199
- target_type_detected=Detected task type: {}\n
204
+ target_type_detected=\nDetected task type: {}\n
200
205
  # all_ok_community_invite=Chat with us in Slack community:
201
206
  all_ok_community_invite=❓ Support request
202
- too_small_for_metrics=Your train dataset contains less than 500 rows. For such dataset Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
207
+ too_small_for_metrics=Your train dataset or one of the eval datasets contains fewer than 500 rows. For such datasets Upgini will not calculate accuracy metrics. Please increase the number of rows in the training dataset to calculate accuracy metrics
203
208
  imbalance_multiclass=Class {0} is at the 25% quantile of the class distribution ({1} records in train dataset). \nDownsample classes with more than {1} records.
204
209
  loss_selection_info=Using loss `{}` for feature selection
205
210
  loss_calc_metrics_info=Using loss `{}` for metrics calculation with default estimator
upgini/sampler/base.py CHANGED
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
9
9
  from typing import List, Optional
10
10
 
11
11
  import numpy as np
12
-
13
12
  from sklearn.base import BaseEstimator
14
13
  from sklearn.preprocessing import label_binarize
15
14
  from sklearn.utils.multiclass import check_classification_targets
16
15
 
17
- from .utils import check_sampling_strategy, check_target_type
18
- from .utils import ArraysTransformer
16
+ from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
19
17
 
20
18
 
21
19
  class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
107
105
  The corresponding label of `X_resampled`.
108
106
 
109
107
  """
110
- pass
111
108
 
112
109
  @abstractmethod
113
110
  def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):