upgini 1.1.274a4__py3-none-any.whl → 1.1.280.dev0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. upgini/__about__.py +1 -0
  2. upgini/ads.py +6 -2
  3. upgini/ads_management/ads_manager.py +4 -2
  4. upgini/autofe/all_operands.py +3 -2
  5. upgini/autofe/binary.py +2 -1
  6. upgini/autofe/date.py +9 -2
  7. upgini/autofe/feature.py +1 -1
  8. upgini/autofe/groupby.py +3 -1
  9. upgini/autofe/operand.py +4 -3
  10. upgini/autofe/unary.py +2 -1
  11. upgini/autofe/vector.py +2 -0
  12. upgini/dataset.py +7 -6
  13. upgini/errors.py +1 -1
  14. upgini/features_enricher.py +52 -27
  15. upgini/http.py +11 -10
  16. upgini/mdc/__init__.py +1 -3
  17. upgini/mdc/context.py +4 -6
  18. upgini/metadata.py +3 -0
  19. upgini/metrics.py +110 -97
  20. upgini/normalizer/phone_normalizer.py +1 -1
  21. upgini/resource_bundle/__init__.py +5 -5
  22. upgini/resource_bundle/strings.properties +1 -1
  23. upgini/sampler/base.py +1 -4
  24. upgini/sampler/random_under_sampler.py +2 -5
  25. upgini/search_task.py +4 -4
  26. upgini/spinner.py +1 -1
  27. upgini/utils/__init__.py +3 -2
  28. upgini/utils/base_search_key_detector.py +2 -2
  29. upgini/utils/blocked_time_series.py +4 -2
  30. upgini/utils/country_utils.py +2 -2
  31. upgini/utils/custom_loss_utils.py +3 -2
  32. upgini/utils/cv_utils.py +2 -2
  33. upgini/utils/datetime_utils.py +25 -19
  34. upgini/utils/email_utils.py +3 -3
  35. upgini/utils/fallback_progress_bar.py +1 -1
  36. upgini/utils/features_validator.py +2 -1
  37. upgini/utils/progress_bar.py +1 -1
  38. upgini/utils/sklearn_ext.py +14 -13
  39. upgini/utils/target_utils.py +1 -1
  40. upgini/utils/track_info.py +27 -15
  41. upgini/version_validator.py +2 -2
  42. {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/METADATA +21 -23
  43. upgini-1.1.280.dev0.dist-info/RECORD +62 -0
  44. {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info}/WHEEL +1 -2
  45. upgini/fingerprint.js +0 -8
  46. upgini-1.1.274a4.dist-info/RECORD +0 -63
  47. upgini-1.1.274a4.dist-info/top_level.txt +0 -1
  48. {upgini-1.1.274a4.dist-info → upgini-1.1.280.dev0.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py CHANGED
@@ -1,3 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ import inspect
1
4
  import logging
2
5
  import re
3
6
  from copy import deepcopy
@@ -124,7 +127,7 @@ NA_REPLACEMENT = "NA"
124
127
 
125
128
  SUPPORTED_CATBOOST_METRICS = {
126
129
  s.upper(): s
127
- for s in {
130
+ for s in (
128
131
  "Logloss",
129
132
  "CrossEntropy",
130
133
  "CtrFactor",
@@ -203,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
203
206
  "MultiLogloss",
204
207
  "MultiCrossEntropy",
205
208
  "Combination",
206
- }
209
+ )
207
210
  }
208
211
 
209
212
 
@@ -235,71 +238,71 @@ class EstimatorWrapper:
235
238
  self.text_features = text_features
236
239
  self.logger = logger or logging.getLogger()
237
240
 
238
- def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
239
- X, y, _, fit_params = self._prepare_to_fit(X, y)
241
+ def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
242
+ x, y, _, fit_params = self._prepare_to_fit(x, y)
240
243
  kwargs.update(fit_params)
241
- self.estimator.fit(X, y, **kwargs)
244
+ self.estimator.fit(x, y, **kwargs)
242
245
  return self
243
246
 
244
247
  def predict(self, **kwargs):
245
248
  return self.estimator.predict(**kwargs)
246
249
 
247
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
248
- X, y, groups = self._prepare_data(X, y, groups=self.groups)
249
- return X, y, groups, {}
250
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
251
+ x, y, groups = self._prepare_data(x, y, groups=self.groups)
252
+ return x, y, groups, {}
250
253
 
251
254
  def _prepare_data(
252
- self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
255
+ self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
253
256
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
254
- for c in X.columns:
255
- if is_numeric_dtype(X[c]):
256
- X[c] = X[c].astype(float)
257
+ for c in x.columns:
258
+ if is_numeric_dtype(x[c]):
259
+ x[c] = x[c].astype(float)
257
260
  else:
258
- X[c] = X[c].astype(str)
261
+ x[c] = x[c].astype(str)
259
262
 
260
263
  if not isinstance(y, pd.Series):
261
264
  raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
262
265
 
263
266
  if groups is not None:
264
- X = X.copy()
265
- X["__groups"] = groups
266
- X, y = self._remove_empty_target_rows(X, y)
267
- groups = X["__groups"]
268
- X = X.drop(columns="__groups")
267
+ x = x.copy()
268
+ x["__groups"] = groups
269
+ x, y = self._remove_empty_target_rows(x, y)
270
+ groups = x["__groups"]
271
+ x = x.drop(columns="__groups")
269
272
  else:
270
- X, y = self._remove_empty_target_rows(X, y)
273
+ x, y = self._remove_empty_target_rows(x, y)
271
274
 
272
- return X, y, groups
275
+ return x, y, groups
273
276
 
274
- def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
275
- joined = pd.concat([X, y], axis=1)
277
+ def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
278
+ joined = pd.concat([x, y], axis=1)
276
279
  joined = joined[joined[y.name].notna()]
277
280
  joined = joined.reset_index(drop=True)
278
- X = joined.drop(columns=y.name)
281
+ x = joined.drop(columns=y.name)
279
282
  y = np.array(list(joined[y.name].values))
280
283
 
281
- return X, y
284
+ return x, y
282
285
 
283
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
284
- X, y, _ = self._prepare_data(X, y)
285
- return X, y, {}
286
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
287
+ x, y, _ = self._prepare_data(x, y)
288
+ return x, y, {}
286
289
 
287
290
  def cross_val_predict(
288
- self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
291
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
289
292
  ) -> Optional[float]:
290
- X, y, groups, fit_params = self._prepare_to_fit(X, y)
293
+ x, y, groups, fit_params = self._prepare_to_fit(x, y)
291
294
 
292
- if X.shape[1] == 0:
295
+ if x.shape[1] == 0:
293
296
  return None
294
297
 
295
298
  scorer = check_scoring(self.estimator, scoring=self.scorer)
296
299
 
297
300
  if baseline_score_column is not None and self.metric_name == "GINI":
298
- metric = roc_auc_score(y, X[baseline_score_column])
301
+ metric = roc_auc_score(y, x[baseline_score_column])
299
302
  else:
300
303
  cv_results = cross_validate(
301
304
  estimator=self.estimator,
302
- X=X,
305
+ x=x,
303
306
  y=y,
304
307
  scoring=scorer,
305
308
  cv=self.cv,
@@ -319,14 +322,14 @@ class EstimatorWrapper:
319
322
  metric = 2 * metric - 1
320
323
  return metric
321
324
 
322
- def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
323
- X, y, _ = self._prepare_to_calculate(X, y)
325
+ def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
326
+ x, y, _ = self._prepare_to_calculate(x, y)
324
327
  if baseline_score_column is not None and self.metric_name == "GINI":
325
- metric = roc_auc_score(y, X[baseline_score_column])
328
+ metric = roc_auc_score(y, x[baseline_score_column])
326
329
  else:
327
330
  metrics = []
328
331
  for est in self.cv_estimators:
329
- metrics.append(self.scorer(est, X, y))
332
+ metrics.append(self.scorer(est, x, y))
330
333
 
331
334
  metric = np.mean(metrics) * self.multiplier
332
335
  return self.post_process_metric(metric)
@@ -337,13 +340,13 @@ class EstimatorWrapper:
337
340
  logger: logging.Logger,
338
341
  target_type: ModelTaskType,
339
342
  cv: BaseCrossValidator,
340
- X: pd.DataFrame,
343
+ x: pd.DataFrame,
341
344
  scoring: Union[Callable, str, None] = None,
342
345
  cat_features: Optional[List[str]] = None,
343
346
  text_features: Optional[List[str]] = None,
344
347
  add_params: Optional[Dict[str, Any]] = None,
345
348
  groups: Optional[List[str]] = None,
346
- ) -> "EstimatorWrapper":
349
+ ) -> EstimatorWrapper:
347
350
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
348
351
  kwargs = {
349
352
  "scorer": scorer,
@@ -379,15 +382,20 @@ class EstimatorWrapper:
379
382
  else:
380
383
  estimator_copy = deepcopy(estimator)
381
384
  kwargs["estimator"] = estimator_copy
382
- if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
385
+ if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
383
386
  if cat_features is not None:
387
+ for cat_feature in cat_features:
388
+ if cat_feature not in x.columns:
389
+ logger.error(
390
+ f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
391
+ )
384
392
  estimator_copy.set_params(
385
- cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
393
+ cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
386
394
  )
387
395
  estimator = CatBoostWrapper(**kwargs)
388
396
  else:
389
397
  try:
390
- if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
398
+ if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
391
399
  estimator = LightGBMWrapper(**kwargs)
392
400
  else:
393
401
  logger.warning(
@@ -433,20 +441,20 @@ class CatBoostWrapper(EstimatorWrapper):
433
441
  self.emb_features = None
434
442
  self.exclude_features = []
435
443
 
436
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
437
- X, y, groups, params = super()._prepare_to_fit(X, y)
444
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
445
+ x, y, groups, params = super()._prepare_to_fit(x, y)
438
446
 
439
447
  # Find embeddings
440
448
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
441
449
  emb_pattern = r"(.+)_emb\d+"
442
- self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
450
+ self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
443
451
  embedding_features = []
444
452
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
445
453
  self.logger.info(
446
454
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
447
455
  f"{self.emb_features}"
448
456
  )
449
- X, embedding_features = self.group_embeddings(X)
457
+ x, embedding_features = self.group_embeddings(x)
450
458
  params["embedding_features"] = embedding_features
451
459
  else:
452
460
  self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
@@ -458,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
458
466
  if hasattr(CatBoostClassifier, "get_text_feature_indices"):
459
467
  if self.text_features is not None:
460
468
  self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
461
- self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
469
+ self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
462
470
  self.logger.info(f"Rest text features after checks: {self.text_features}")
463
471
  params["text_features"] = self.text_features
464
472
  else:
@@ -466,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
466
474
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
467
475
 
468
476
  # Find rest categorical features
469
- self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
470
- X = fill_na_cat_features(X, self.cat_features)
477
+ self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
+ x = fill_na_cat_features(x, self.cat_features)
471
479
  unique_cat_features = []
472
480
  for name in self.cat_features:
473
481
  # Remove constant categorical features
474
- if X[name].nunique() > 1:
482
+ if x[name].nunique() > 1:
475
483
  unique_cat_features.append(name)
476
484
  else:
477
- X = X.drop(columns=name)
485
+ x = x.drop(columns=name)
478
486
  self.cat_features = unique_cat_features
479
487
  if (
480
488
  hasattr(self.estimator, "get_param")
@@ -483,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
483
491
  ):
484
492
  estimator_cat_features = self.estimator.get_param("cat_features")
485
493
  if all([isinstance(c, int) for c in estimator_cat_features]):
486
- cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
494
+ cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
487
495
  cat_features_idx.update(estimator_cat_features)
488
- self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
496
+ self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
489
497
  elif all([isinstance(c, str) for c in estimator_cat_features]):
490
498
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
491
499
  else:
@@ -496,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
496
504
  self.logger.info(f"Selected categorical features: {self.cat_features}")
497
505
  params["cat_features"] = self.cat_features
498
506
 
499
- return X, y, groups, params
507
+ return x, y, groups, params
500
508
 
501
509
  def group_embeddings(self, df: pd.DataFrame):
502
510
  emb_name = "__grouped_embeddings"
@@ -507,38 +515,38 @@ class CatBoostWrapper(EstimatorWrapper):
507
515
 
508
516
  return df, [emb_name]
509
517
 
510
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
518
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
511
519
  if self.exclude_features:
512
- X = X.drop(columns=self.exclude_features)
513
- X, y, params = super()._prepare_to_calculate(X, y)
520
+ x = x.drop(columns=self.exclude_features)
521
+ x, y, params = super()._prepare_to_calculate(x, y)
514
522
  if self.text_features:
515
523
  params["text_features"] = self.text_features
516
524
  if self.emb_features:
517
- X, emb_columns = self.group_embeddings(X)
525
+ x, emb_columns = self.group_embeddings(x)
518
526
  params["embedding_features"] = emb_columns
519
527
  if self.cat_features:
520
- X = fill_na_cat_features(X, self.cat_features)
528
+ x = fill_na_cat_features(x, self.cat_features)
521
529
  params["cat_features"] = self.cat_features
522
530
 
523
- return X, y, params
531
+ return x, y, params
524
532
 
525
533
  def cross_val_predict(
526
- self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
534
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
527
535
  ) -> Optional[float]:
528
536
  try:
529
- return super().cross_val_predict(X, y, baseline_score_column)
537
+ return super().cross_val_predict(x, y, baseline_score_column)
530
538
  except Exception as e:
531
539
  if "Dictionary size is 0" in e.args[0] and self.text_features:
532
- high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
540
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
533
541
  self.logger.warning(
534
- "Failed to calculate metrics. Try to remove high cardinality"
542
+ "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
535
543
  f" text features {high_cardinality_features} and retry"
536
544
  )
537
545
  for f in high_cardinality_features:
538
546
  self.text_features.remove(f)
539
547
  self.exclude_features.append(f)
540
- X = X.drop(columns=f)
541
- return super().cross_val_predict(X, y, baseline_score_column)
548
+ x = x.drop(columns=f)
549
+ return super().cross_val_predict(x, y, baseline_score_column)
542
550
  else:
543
551
  raise e
544
552
 
@@ -569,26 +577,26 @@ class LightGBMWrapper(EstimatorWrapper):
569
577
  )
570
578
  self.cat_features = None
571
579
 
572
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
573
- X, y, groups, params = super()._prepare_to_fit(X, y)
574
- self.cat_features = _get_cat_features(X)
575
- X = fill_na_cat_features(X, self.cat_features)
580
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
581
+ x, y, groups, params = super()._prepare_to_fit(x, y)
582
+ self.cat_features = _get_cat_features(x)
583
+ x = fill_na_cat_features(x, self.cat_features)
576
584
  for feature in self.cat_features:
577
- X[feature] = X[feature].astype("category").cat.codes
585
+ x[feature] = x[feature].astype("category").cat.codes
578
586
  if not is_numeric_dtype(y):
579
587
  y = correct_string_target(y)
580
588
 
581
- return X, y, groups, params
589
+ return x, y, groups, params
582
590
 
583
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
584
- X, y, params = super()._prepare_to_calculate(X, y)
591
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
592
+ x, y, params = super()._prepare_to_calculate(x, y)
585
593
  if self.cat_features is not None:
586
- X = fill_na_cat_features(X, self.cat_features)
594
+ x = fill_na_cat_features(x, self.cat_features)
587
595
  for feature in self.cat_features:
588
- X[feature] = X[feature].astype("category").cat.codes
596
+ x[feature] = x[feature].astype("category").cat.codes
589
597
  if not is_numeric_dtype(y):
590
598
  y = correct_string_target(y)
591
- return X, y, params
599
+ return x, y, params
592
600
 
593
601
 
594
602
  class OtherEstimatorWrapper(EstimatorWrapper):
@@ -617,49 +625,54 @@ class OtherEstimatorWrapper(EstimatorWrapper):
617
625
  )
618
626
  self.cat_features = None
619
627
 
620
- def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
621
- X, y, groups, params = super()._prepare_to_fit(X, y)
622
- self.cat_features = _get_cat_features(X)
623
- num_features = [col for col in X.columns if col not in self.cat_features]
624
- X[num_features] = X[num_features].fillna(-999)
625
- X = fill_na_cat_features(X, self.cat_features)
628
+ def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
629
+ x, y, groups, params = super()._prepare_to_fit(x, y)
630
+ self.cat_features = _get_cat_features(x)
631
+ num_features = [col for col in x.columns if col not in self.cat_features]
632
+ x[num_features] = x[num_features].fillna(-999)
633
+ x = fill_na_cat_features(x, self.cat_features)
626
634
  # TODO use one-hot encoding if cardinality is less 50
627
635
  for feature in self.cat_features:
628
- X[feature] = X[feature].astype("category").cat.codes
636
+ x[feature] = x[feature].astype("category").cat.codes
629
637
  if not is_numeric_dtype(y):
630
638
  y = correct_string_target(y)
631
- return X, y, groups, params
639
+ return x, y, groups, params
632
640
 
633
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
634
- X, y, params = super()._prepare_to_calculate(X, y)
641
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
642
+ x, y, params = super()._prepare_to_calculate(x, y)
635
643
  if self.cat_features is not None:
636
- num_features = [col for col in X.columns if col not in self.cat_features]
637
- X[num_features] = X[num_features].fillna(-999)
638
- X = fill_na_cat_features(X, self.cat_features)
644
+ num_features = [col for col in x.columns if col not in self.cat_features]
645
+ x[num_features] = x[num_features].fillna(-999)
646
+ x = fill_na_cat_features(x, self.cat_features)
639
647
  # TODO use one-hot encoding if cardinality is less 50
640
648
  for feature in self.cat_features:
641
- X[feature] = X[feature].astype("category").cat.codes
649
+ x[feature] = x[feature].astype("category").cat.codes
642
650
  if not is_numeric_dtype(y):
643
651
  y = correct_string_target(y)
644
- return X, y, params
652
+ return x, y, params
645
653
 
646
654
 
647
655
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
648
- # TODO validate that if it is Callable then it accepts 3 arguments
649
656
  if isinstance(scoring, str) and scoring is not None:
650
657
  _get_scorer_by_name(scoring)
658
+ elif isinstance(scoring, Callable):
659
+ spec = inspect.getfullargspec(scoring)
660
+ if len(spec.args) < 3:
661
+ raise ValidationError(
662
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
663
+ )
651
664
 
652
665
 
653
666
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
654
667
  metric_name = scoring
655
668
  multiplier = 1
656
- if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
669
+ if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
657
670
  scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
658
671
  multiplier = -1
659
- elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
672
+ elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
660
673
  scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
661
674
  multiplier = -1
662
- elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
675
+ elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
663
676
  scoring = get_scorer("neg_root_mean_squared_error")
664
677
  multiplier = -1
665
678
  elif scoring in available_scorers:
@@ -711,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
711
724
 
712
725
 
713
726
  def _get_cat_features(
714
- X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
727
+ x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
715
728
  ) -> List[str]:
716
729
  text_features = text_features or []
717
730
  emb_features = emb_features or []
718
731
  exclude_features = text_features + emb_features
719
- return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
732
+ return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
720
733
 
721
734
 
722
735
  def _get_add_params(input_params, add_params):
upgini/normalizer/phone_normalizer.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
upgini/resource_bundle/__init__.py CHANGED
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
17
17
  __email__ = "felix.zenk@web.de"
18
18
 
19
19
 
20
- class _Parser(object):
20
+ class _Parser:
21
21
  """
22
22
  A parser for the .properties file format.
23
23
  """
@@ -49,7 +49,7 @@ class _Parser(object):
49
49
  return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
50
50
 
51
51
  # I/O read
52
- with open(file_path, mode="r", encoding="utf-8") as f:
52
+ with open(file_path, encoding="utf-8") as f:
53
53
  lines = f.readlines()
54
54
 
55
55
  # parse
@@ -83,7 +83,7 @@ class _Parser(object):
83
83
  return mapping
84
84
 
85
85
 
86
- class ResourceBundle(object):
86
+ class ResourceBundle:
87
87
  """
88
88
  A ResourceBundle manages internationalization of string resources
89
89
  """
@@ -199,7 +199,7 @@ class ResourceBundle(object):
199
199
  raise NotInResourceBundleError(self.name, item)
200
200
 
201
201
 
202
- def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
202
+ def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
203
203
  """
204
204
  Return a new :class:`ResourceBundle` after parsing the locale
205
205
 
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
224
224
  custom_bundles = dict()
225
225
 
226
226
 
227
- def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
227
+ def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
228
228
  global custom_bundles
229
229
  if custom_cfg is not None:
230
230
  custom_bundle = custom_bundles.get(custom_cfg)
upgini/resource_bundle/strings.properties CHANGED
@@ -159,7 +159,7 @@ dataset_invalid_multiclass_target=Unexpected dtype of target for multiclass task
159
159
  dataset_invalid_regression_target=Unexpected dtype of target for regression task type: {}. Expected float
160
160
  dataset_invalid_timeseries_target=Unexpected dtype of target for timeseries task type: {}. Expected float
161
161
  dataset_to_many_multiclass_targets=The number of target classes {} exceeds the allowed threshold: {}. Please, correct your data and try again
162
- dataset_rarest_class_less_min=Frequency of the rarest class `{}` is {}, minimum frequency must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
162
+ dataset_rarest_class_less_min=Count of rows with the rarest class `{}` is {}, minimum count must be > {} for each class\nPlease, remove rows with rarest class from your dataframe
163
163
  dataset_rarest_class_less_threshold=\nWARNING: Target is imbalanced and will be undersampled to the rarest class. Frequency of the rarest class `{}` is {}\nMinimum number of observations for each class to avoid undersampling {} ({}%)
164
164
  dataset_date_features=\nWARNING: Columns {} is a datetime or period type but not used as a search key, removed from X
165
165
  dataset_too_many_features=Too many features. Maximum number of features is {}
upgini/sampler/base.py CHANGED
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
9
9
  from typing import List, Optional
10
10
 
11
11
  import numpy as np
12
-
13
12
  from sklearn.base import BaseEstimator
14
13
  from sklearn.preprocessing import label_binarize
15
14
  from sklearn.utils.multiclass import check_classification_targets
16
15
 
17
- from .utils import check_sampling_strategy, check_target_type
18
- from .utils import ArraysTransformer
16
+ from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
19
17
 
20
18
 
21
19
  class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
107
105
  The corresponding label of `X_resampled`.
108
106
 
109
107
  """
110
- pass
111
108
 
112
109
  @abstractmethod
113
110
  def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
upgini/sampler/random_under_sampler.py CHANGED
@@ -5,13 +5,10 @@
5
5
  # License: MIT
6
6
 
7
7
  import numpy as np
8
-
9
- from sklearn.utils import check_random_state
10
- from sklearn.utils import _safe_indexing
8
+ from sklearn.utils import _safe_indexing, check_random_state
11
9
 
12
10
  from .base import BaseUnderSampler
13
- from .utils import check_target_type
14
- from .utils import _deprecate_positional_args
11
+ from .utils import _deprecate_positional_args, check_target_type
15
12
 
16
13
 
17
14
  class RandomUnderSampler(BaseUnderSampler):
upgini/search_task.py CHANGED
@@ -8,10 +8,10 @@ import pandas as pd
8
8
 
9
9
  from upgini import dataset
10
10
  from upgini.http import (
11
- _RestClient,
12
11
  ProviderTaskSummary,
13
12
  SearchProgress,
14
13
  SearchTaskSummary,
14
+ _RestClient,
15
15
  get_rest_client,
16
16
  is_demo_api_key,
17
17
  )
@@ -295,7 +295,7 @@ class SearchTask:
295
295
  return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
296
296
 
297
297
 
298
- @lru_cache()
298
+ @lru_cache
299
299
  def _get_all_initial_raw_features_cached(
300
300
  endpoint: Optional[str],
301
301
  api_key: Optional[str],
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
328
328
  return result_df
329
329
 
330
330
 
331
- @lru_cache()
331
+ @lru_cache
332
332
  def _get_all_validation_raw_features_cached(
333
333
  endpoint: Optional[str],
334
334
  api_key: Optional[str],
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
357
357
  return result_df
358
358
 
359
359
 
360
- @lru_cache()
360
+ @lru_cache
361
361
  def _get_target_outliers_cached(
362
362
  endpoint: Optional[str],
363
363
  api_key: Optional[str],
upgini/spinner.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import threading
2
- from typing import Optional, List
3
2
  import time
3
+ from typing import List, Optional
4
4
 
5
5
 
6
6
  class Spinner:
upgini/utils/__init__.py CHANGED
@@ -2,7 +2,7 @@ import itertools
2
2
  from typing import List, Tuple
3
3
 
4
4
  import pandas as pd
5
- from pandas.api.types import is_string_dtype
5
+ from pandas.api.types import is_object_dtype, is_string_dtype
6
6
 
7
7
 
8
8
  def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
@@ -20,5 +20,6 @@ def find_numbers_with_decimal_comma(df: pd.DataFrame) -> pd.DataFrame:
20
20
  return [
21
21
  col
22
22
  for col in tmp.columns
23
- if is_string_dtype(tmp[col]) and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
23
+ if (is_string_dtype(tmp[col]) or is_object_dtype(tmp[col]))
24
+ and tmp[col].astype("string").str.match("^[0-9]+,[0-9]*$").any()
24
25
  ]
upgini/utils/base_search_key_detector.py CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
5
5
 
6
6
  class BaseSearchKeyDetector:
7
7
  def _is_search_key_by_name(self, column_name: str) -> bool:
8
- raise NotImplementedError()
8
+ raise NotImplementedError
9
9
 
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
- raise NotImplementedError()
11
+ raise NotImplementedError
12
12
 
13
13
  def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
14
  for column_name in column_names:
upgini/utils/blocked_time_series.py CHANGED
@@ -1,8 +1,10 @@
1
- import numpy as np
2
1
  import numbers
2
+
3
+ import numpy as np
4
+ from sklearn.model_selection import BaseCrossValidator
3
5
  from sklearn.utils import indexable
4
6
  from sklearn.utils.validation import _num_samples
5
- from sklearn.model_selection import BaseCrossValidator
7
+
6
8
  from upgini.resource_bundle import bundle
7
9
 
8
10
 
upgini/utils/country_utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import pandas as pd
2
- from pandas.api.types import is_string_dtype
2
+ from pandas.api.types import is_object_dtype, is_string_dtype
3
3
 
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
@@ -9,7 +9,7 @@ class CountrySearchKeyDetector(BaseSearchKeyDetector):
9
9
  return "country" in str(column_name).lower()
10
10
 
11
11
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
12
- if not is_string_dtype(column):
12
+ if not is_string_dtype(column) and not is_object_dtype(column):
13
13
  return False
14
14
 
15
15
  all_count = len(column)