upgini 1.1.279a2__py3-none-any.whl → 1.1.279a2.dev1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (44)
  1. upgini/__about__.py +1 -0
  2. upgini/ads_management/ads_manager.py +4 -2
  3. upgini/autofe/all_operands.py +3 -2
  4. upgini/autofe/binary.py +2 -1
  5. upgini/autofe/date.py +2 -1
  6. upgini/autofe/feature.py +1 -1
  7. upgini/autofe/groupby.py +3 -1
  8. upgini/autofe/operand.py +4 -3
  9. upgini/autofe/unary.py +2 -1
  10. upgini/autofe/vector.py +2 -0
  11. upgini/dataset.py +4 -4
  12. upgini/errors.py +1 -1
  13. upgini/features_enricher.py +4 -4
  14. upgini/http.py +11 -10
  15. upgini/mdc/__init__.py +1 -3
  16. upgini/mdc/context.py +4 -6
  17. upgini/metadata.py +3 -0
  18. upgini/metrics.py +101 -99
  19. upgini/normalizer/phone_normalizer.py +1 -1
  20. upgini/resource_bundle/__init__.py +5 -5
  21. upgini/sampler/base.py +1 -4
  22. upgini/sampler/random_under_sampler.py +2 -5
  23. upgini/search_task.py +4 -4
  24. upgini/spinner.py +1 -1
  25. upgini/utils/__init__.py +1 -1
  26. upgini/utils/base_search_key_detector.py +2 -2
  27. upgini/utils/blocked_time_series.py +4 -2
  28. upgini/utils/country_utils.py +1 -1
  29. upgini/utils/custom_loss_utils.py +3 -2
  30. upgini/utils/cv_utils.py +2 -2
  31. upgini/utils/datetime_utils.py +9 -3
  32. upgini/utils/email_utils.py +2 -2
  33. upgini/utils/fallback_progress_bar.py +1 -1
  34. upgini/utils/progress_bar.py +1 -1
  35. upgini/utils/sklearn_ext.py +14 -13
  36. upgini/utils/track_info.py +2 -2
  37. upgini/version_validator.py +2 -2
  38. {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info}/METADATA +21 -23
  39. upgini-1.1.279a2.dev1.dist-info/RECORD +62 -0
  40. {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info}/WHEEL +1 -2
  41. upgini/fingerprint.js +0 -8
  42. upgini-1.1.279a2.dist-info/RECORD +0 -63
  43. upgini-1.1.279a2.dist-info/top_level.txt +0 -1
  44. {upgini-1.1.279a2.dist-info → upgini-1.1.279a2.dev1.dist-info/licenses}/LICENSE +0 -0
upgini/metrics.py CHANGED
@@ -1,3 +1,5 @@
1
+ from __future__ import annotations
2
+
1
3
  import inspect
2
4
  import logging
3
5
  import re
@@ -125,7 +127,7 @@ NA_REPLACEMENT = "NA"
125
127
 
126
128
  SUPPORTED_CATBOOST_METRICS = {
127
129
  s.upper(): s
128
- for s in {
130
+ for s in (
129
131
  "Logloss",
130
132
  "CrossEntropy",
131
133
  "CtrFactor",
@@ -204,7 +206,7 @@ SUPPORTED_CATBOOST_METRICS = {
204
206
  "MultiLogloss",
205
207
  "MultiCrossEntropy",
206
208
  "Combination",
207
- }
209
+ )
208
210
  }
209
211
 
210
212
 
@@ -236,71 +238,71 @@ class EstimatorWrapper:
236
238
  self.text_features = text_features
237
239
  self.logger = logger or logging.getLogger()
238
240
 
239
- def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
240
- X, y, _, fit_params = self._prepare_to_fit(X, y)
241
+ def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
242
+ x, y, _, fit_params = self._prepare_to_fit(x, y)
241
243
  kwargs.update(fit_params)
242
- self.estimator.fit(X, y, **kwargs)
244
+ self.estimator.fit(x, y, **kwargs)
243
245
  return self
244
246
 
245
247
  def predict(self, **kwargs):
246
248
  return self.estimator.predict(**kwargs)
247
249
 
248
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
249
- X, y, groups = self._prepare_data(X, y, groups=self.groups)
250
- return X, y, groups, {}
250
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
251
+ x, y, groups = self._prepare_data(x, y, groups=self.groups)
252
+ return x, y, groups, {}
251
253
 
252
254
  def _prepare_data(
253
- self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
255
+ self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
254
256
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
255
- for c in X.columns:
256
- if is_numeric_dtype(X[c]):
257
- X[c] = X[c].astype(float)
257
+ for c in x.columns:
258
+ if is_numeric_dtype(x[c]):
259
+ x[c] = x[c].astype(float)
258
260
  else:
259
- X[c] = X[c].astype(str)
261
+ x[c] = x[c].astype(str)
260
262
 
261
263
  if not isinstance(y, pd.Series):
262
264
  raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
263
265
 
264
266
  if groups is not None:
265
- X = X.copy()
266
- X["__groups"] = groups
267
- X, y = self._remove_empty_target_rows(X, y)
268
- groups = X["__groups"]
269
- X = X.drop(columns="__groups")
267
+ x = x.copy()
268
+ x["__groups"] = groups
269
+ x, y = self._remove_empty_target_rows(x, y)
270
+ groups = x["__groups"]
271
+ x = x.drop(columns="__groups")
270
272
  else:
271
- X, y = self._remove_empty_target_rows(X, y)
273
+ x, y = self._remove_empty_target_rows(x, y)
272
274
 
273
- return X, y, groups
275
+ return x, y, groups
274
276
 
275
- def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
276
- joined = pd.concat([X, y], axis=1)
277
+ def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
278
+ joined = pd.concat([x, y], axis=1)
277
279
  joined = joined[joined[y.name].notna()]
278
280
  joined = joined.reset_index(drop=True)
279
- X = joined.drop(columns=y.name)
281
+ x = joined.drop(columns=y.name)
280
282
  y = np.array(list(joined[y.name].values))
281
283
 
282
- return X, y
284
+ return x, y
283
285
 
284
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
285
- X, y, _ = self._prepare_data(X, y)
286
- return X, y, {}
286
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
287
+ x, y, _ = self._prepare_data(x, y)
288
+ return x, y, {}
287
289
 
288
290
  def cross_val_predict(
289
- self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
291
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
290
292
  ) -> Optional[float]:
291
- X, y, groups, fit_params = self._prepare_to_fit(X, y)
293
+ x, y, groups, fit_params = self._prepare_to_fit(x, y)
292
294
 
293
- if X.shape[1] == 0:
295
+ if x.shape[1] == 0:
294
296
  return None
295
297
 
296
298
  scorer = check_scoring(self.estimator, scoring=self.scorer)
297
299
 
298
300
  if baseline_score_column is not None and self.metric_name == "GINI":
299
- metric = roc_auc_score(y, X[baseline_score_column])
301
+ metric = roc_auc_score(y, x[baseline_score_column])
300
302
  else:
301
303
  cv_results = cross_validate(
302
304
  estimator=self.estimator,
303
- X=X,
305
+ x=x,
304
306
  y=y,
305
307
  scoring=scorer,
306
308
  cv=self.cv,
@@ -320,14 +322,14 @@ class EstimatorWrapper:
320
322
  metric = 2 * metric - 1
321
323
  return metric
322
324
 
323
- def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
324
- X, y, _ = self._prepare_to_calculate(X, y)
325
+ def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
326
+ x, y, _ = self._prepare_to_calculate(x, y)
325
327
  if baseline_score_column is not None and self.metric_name == "GINI":
326
- metric = roc_auc_score(y, X[baseline_score_column])
328
+ metric = roc_auc_score(y, x[baseline_score_column])
327
329
  else:
328
330
  metrics = []
329
331
  for est in self.cv_estimators:
330
- metrics.append(self.scorer(est, X, y))
332
+ metrics.append(self.scorer(est, x, y))
331
333
 
332
334
  metric = np.mean(metrics) * self.multiplier
333
335
  return self.post_process_metric(metric)
@@ -338,13 +340,13 @@ class EstimatorWrapper:
338
340
  logger: logging.Logger,
339
341
  target_type: ModelTaskType,
340
342
  cv: BaseCrossValidator,
341
- X: pd.DataFrame,
343
+ x: pd.DataFrame,
342
344
  scoring: Union[Callable, str, None] = None,
343
345
  cat_features: Optional[List[str]] = None,
344
346
  text_features: Optional[List[str]] = None,
345
347
  add_params: Optional[Dict[str, Any]] = None,
346
348
  groups: Optional[List[str]] = None,
347
- ) -> "EstimatorWrapper":
349
+ ) -> EstimatorWrapper:
348
350
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
349
351
  kwargs = {
350
352
  "scorer": scorer,
@@ -380,20 +382,20 @@ class EstimatorWrapper:
380
382
  else:
381
383
  estimator_copy = deepcopy(estimator)
382
384
  kwargs["estimator"] = estimator_copy
383
- if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
385
+ if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
384
386
  if cat_features is not None:
385
387
  for cat_feature in cat_features:
386
- if cat_feature not in X.columns:
388
+ if cat_feature not in x.columns:
387
389
  logger.error(
388
- f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
390
+ f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
389
391
  )
390
392
  estimator_copy.set_params(
391
- cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
393
+ cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
392
394
  )
393
395
  estimator = CatBoostWrapper(**kwargs)
394
396
  else:
395
397
  try:
396
- if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
398
+ if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
397
399
  estimator = LightGBMWrapper(**kwargs)
398
400
  else:
399
401
  logger.warning(
@@ -439,20 +441,20 @@ class CatBoostWrapper(EstimatorWrapper):
439
441
  self.emb_features = None
440
442
  self.exclude_features = []
441
443
 
442
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
443
- X, y, groups, params = super()._prepare_to_fit(X, y)
444
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
445
+ x, y, groups, params = super()._prepare_to_fit(x, y)
444
446
 
445
447
  # Find embeddings
446
448
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
447
449
  emb_pattern = r"(.+)_emb\d+"
448
- self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
450
+ self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
449
451
  embedding_features = []
450
452
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
451
453
  self.logger.info(
452
454
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
453
455
  f"{self.emb_features}"
454
456
  )
455
- X, embedding_features = self.group_embeddings(X)
457
+ x, embedding_features = self.group_embeddings(x)
456
458
  params["embedding_features"] = embedding_features
457
459
  else:
458
460
  self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
@@ -464,7 +466,7 @@ class CatBoostWrapper(EstimatorWrapper):
464
466
  if hasattr(CatBoostClassifier, "get_text_feature_indices"):
465
467
  if self.text_features is not None:
466
468
  self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
467
- self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
469
+ self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
468
470
  self.logger.info(f"Rest text features after checks: {self.text_features}")
469
471
  params["text_features"] = self.text_features
470
472
  else:
@@ -472,15 +474,15 @@ class CatBoostWrapper(EstimatorWrapper):
472
474
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
473
475
 
474
476
  # Find rest categorical features
475
- self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
476
- X = fill_na_cat_features(X, self.cat_features)
477
+ self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
+ x = fill_na_cat_features(x, self.cat_features)
477
479
  unique_cat_features = []
478
480
  for name in self.cat_features:
479
481
  # Remove constant categorical features
480
- if X[name].nunique() > 1:
482
+ if x[name].nunique() > 1:
481
483
  unique_cat_features.append(name)
482
484
  else:
483
- X = X.drop(columns=name)
485
+ x = x.drop(columns=name)
484
486
  self.cat_features = unique_cat_features
485
487
  if (
486
488
  hasattr(self.estimator, "get_param")
@@ -489,9 +491,9 @@ class CatBoostWrapper(EstimatorWrapper):
489
491
  ):
490
492
  estimator_cat_features = self.estimator.get_param("cat_features")
491
493
  if all([isinstance(c, int) for c in estimator_cat_features]):
492
- cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
494
+ cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
493
495
  cat_features_idx.update(estimator_cat_features)
494
- self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
496
+ self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
495
497
  elif all([isinstance(c, str) for c in estimator_cat_features]):
496
498
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
497
499
  else:
@@ -502,7 +504,7 @@ class CatBoostWrapper(EstimatorWrapper):
502
504
  self.logger.info(f"Selected categorical features: {self.cat_features}")
503
505
  params["cat_features"] = self.cat_features
504
506
 
505
- return X, y, groups, params
507
+ return x, y, groups, params
506
508
 
507
509
  def group_embeddings(self, df: pd.DataFrame):
508
510
  emb_name = "__grouped_embeddings"
@@ -513,38 +515,38 @@ class CatBoostWrapper(EstimatorWrapper):
513
515
 
514
516
  return df, [emb_name]
515
517
 
516
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
518
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
517
519
  if self.exclude_features:
518
- X = X.drop(columns=self.exclude_features)
519
- X, y, params = super()._prepare_to_calculate(X, y)
520
+ x = x.drop(columns=self.exclude_features)
521
+ x, y, params = super()._prepare_to_calculate(x, y)
520
522
  if self.text_features:
521
523
  params["text_features"] = self.text_features
522
524
  if self.emb_features:
523
- X, emb_columns = self.group_embeddings(X)
525
+ x, emb_columns = self.group_embeddings(x)
524
526
  params["embedding_features"] = emb_columns
525
527
  if self.cat_features:
526
- X = fill_na_cat_features(X, self.cat_features)
528
+ x = fill_na_cat_features(x, self.cat_features)
527
529
  params["cat_features"] = self.cat_features
528
530
 
529
- return X, y, params
531
+ return x, y, params
530
532
 
531
533
  def cross_val_predict(
532
- self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
534
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
533
535
  ) -> Optional[float]:
534
536
  try:
535
- return super().cross_val_predict(X, y, baseline_score_column)
537
+ return super().cross_val_predict(x, y, baseline_score_column)
536
538
  except Exception as e:
537
539
  if "Dictionary size is 0" in e.args[0] and self.text_features:
538
- high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
540
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
539
541
  self.logger.warning(
540
- "Failed to calculate metrics. Try to remove high cardinality"
542
+ "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
541
543
  f" text features {high_cardinality_features} and retry"
542
544
  )
543
545
  for f in high_cardinality_features:
544
546
  self.text_features.remove(f)
545
547
  self.exclude_features.append(f)
546
- X = X.drop(columns=f)
547
- return super().cross_val_predict(X, y, baseline_score_column)
548
+ x = x.drop(columns=f)
549
+ return super().cross_val_predict(x, y, baseline_score_column)
548
550
  else:
549
551
  raise e
550
552
 
@@ -575,26 +577,26 @@ class LightGBMWrapper(EstimatorWrapper):
575
577
  )
576
578
  self.cat_features = None
577
579
 
578
- def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
579
- X, y, groups, params = super()._prepare_to_fit(X, y)
580
- self.cat_features = _get_cat_features(X)
581
- X = fill_na_cat_features(X, self.cat_features)
580
+ def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
581
+ x, y, groups, params = super()._prepare_to_fit(x, y)
582
+ self.cat_features = _get_cat_features(x)
583
+ x = fill_na_cat_features(x, self.cat_features)
582
584
  for feature in self.cat_features:
583
- X[feature] = X[feature].astype("category").cat.codes
585
+ x[feature] = x[feature].astype("category").cat.codes
584
586
  if not is_numeric_dtype(y):
585
587
  y = correct_string_target(y)
586
588
 
587
- return X, y, groups, params
589
+ return x, y, groups, params
588
590
 
589
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
590
- X, y, params = super()._prepare_to_calculate(X, y)
591
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
592
+ x, y, params = super()._prepare_to_calculate(x, y)
591
593
  if self.cat_features is not None:
592
- X = fill_na_cat_features(X, self.cat_features)
594
+ x = fill_na_cat_features(x, self.cat_features)
593
595
  for feature in self.cat_features:
594
- X[feature] = X[feature].astype("category").cat.codes
596
+ x[feature] = x[feature].astype("category").cat.codes
595
597
  if not is_numeric_dtype(y):
596
598
  y = correct_string_target(y)
597
- return X, y, params
599
+ return x, y, params
598
600
 
599
601
 
600
602
  class OtherEstimatorWrapper(EstimatorWrapper):
@@ -623,31 +625,31 @@ class OtherEstimatorWrapper(EstimatorWrapper):
623
625
  )
624
626
  self.cat_features = None
625
627
 
626
- def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
627
- X, y, groups, params = super()._prepare_to_fit(X, y)
628
- self.cat_features = _get_cat_features(X)
629
- num_features = [col for col in X.columns if col not in self.cat_features]
630
- X[num_features] = X[num_features].fillna(-999)
631
- X = fill_na_cat_features(X, self.cat_features)
628
+ def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
629
+ x, y, groups, params = super()._prepare_to_fit(x, y)
630
+ self.cat_features = _get_cat_features(x)
631
+ num_features = [col for col in x.columns if col not in self.cat_features]
632
+ x[num_features] = x[num_features].fillna(-999)
633
+ x = fill_na_cat_features(x, self.cat_features)
632
634
  # TODO use one-hot encoding if cardinality is less 50
633
635
  for feature in self.cat_features:
634
- X[feature] = X[feature].astype("category").cat.codes
636
+ x[feature] = x[feature].astype("category").cat.codes
635
637
  if not is_numeric_dtype(y):
636
638
  y = correct_string_target(y)
637
- return X, y, groups, params
639
+ return x, y, groups, params
638
640
 
639
- def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
640
- X, y, params = super()._prepare_to_calculate(X, y)
641
+ def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
642
+ x, y, params = super()._prepare_to_calculate(x, y)
641
643
  if self.cat_features is not None:
642
- num_features = [col for col in X.columns if col not in self.cat_features]
643
- X[num_features] = X[num_features].fillna(-999)
644
- X = fill_na_cat_features(X, self.cat_features)
644
+ num_features = [col for col in x.columns if col not in self.cat_features]
645
+ x[num_features] = x[num_features].fillna(-999)
646
+ x = fill_na_cat_features(x, self.cat_features)
645
647
  # TODO use one-hot encoding if cardinality is less 50
646
648
  for feature in self.cat_features:
647
- X[feature] = X[feature].astype("category").cat.codes
649
+ x[feature] = x[feature].astype("category").cat.codes
648
650
  if not is_numeric_dtype(y):
649
651
  y = correct_string_target(y)
650
- return X, y, params
652
+ return x, y, params
651
653
 
652
654
 
653
655
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -657,20 +659,20 @@ def validate_scoring_argument(scoring: Union[Callable, str, None]):
657
659
  spec = inspect.getfullargspec(scoring)
658
660
  if len(spec.args) < 3:
659
661
  raise ValidationError(
660
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
662
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
661
663
  )
662
664
 
663
665
 
664
666
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
665
667
  metric_name = scoring
666
668
  multiplier = 1
667
- if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
669
+ if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
668
670
  scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
669
671
  multiplier = -1
670
- elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
672
+ elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
671
673
  scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
672
674
  multiplier = -1
673
- elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
675
+ elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
674
676
  scoring = get_scorer("neg_root_mean_squared_error")
675
677
  multiplier = -1
676
678
  elif scoring in available_scorers:
@@ -722,12 +724,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
722
724
 
723
725
 
724
726
  def _get_cat_features(
725
- X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
727
+ x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
726
728
  ) -> List[str]:
727
729
  text_features = text_features or []
728
730
  emb_features = emb_features or []
729
731
  exclude_features = text_features + emb_features
730
- return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
732
+ return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
731
733
 
732
734
 
733
735
  def _get_add_params(input_params, add_params):
upgini/normalizer/phone_normalizer.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
upgini/resource_bundle/__init__.py CHANGED
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
17
17
  __email__ = "felix.zenk@web.de"
18
18
 
19
19
 
20
- class _Parser(object):
20
+ class _Parser:
21
21
  """
22
22
  A parser for the .properties file format.
23
23
  """
@@ -49,7 +49,7 @@ class _Parser(object):
49
49
  return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
50
50
 
51
51
  # I/O read
52
- with open(file_path, mode="r", encoding="utf-8") as f:
52
+ with open(file_path, encoding="utf-8") as f:
53
53
  lines = f.readlines()
54
54
 
55
55
  # parse
@@ -83,7 +83,7 @@ class _Parser(object):
83
83
  return mapping
84
84
 
85
85
 
86
- class ResourceBundle(object):
86
+ class ResourceBundle:
87
87
  """
88
88
  A ResourceBundle manages internationalization of string resources
89
89
  """
@@ -199,7 +199,7 @@ class ResourceBundle(object):
199
199
  raise NotInResourceBundleError(self.name, item)
200
200
 
201
201
 
202
- def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
202
+ def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
203
203
  """
204
204
  Return a new :class:`ResourceBundle` after parsing the locale
205
205
 
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
224
224
  custom_bundles = dict()
225
225
 
226
226
 
227
- def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
227
+ def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
228
228
  global custom_bundles
229
229
  if custom_cfg is not None:
230
230
  custom_bundle = custom_bundles.get(custom_cfg)
upgini/sampler/base.py CHANGED
@@ -9,13 +9,11 @@ from abc import ABCMeta, abstractmethod
9
9
  from typing import List, Optional
10
10
 
11
11
  import numpy as np
12
-
13
12
  from sklearn.base import BaseEstimator
14
13
  from sklearn.preprocessing import label_binarize
15
14
  from sklearn.utils.multiclass import check_classification_targets
16
15
 
17
- from .utils import check_sampling_strategy, check_target_type
18
- from .utils import ArraysTransformer
16
+ from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
19
17
 
20
18
 
21
19
  class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -107,7 +105,6 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
107
105
  The corresponding label of `X_resampled`.
108
106
 
109
107
  """
110
- pass
111
108
 
112
109
  @abstractmethod
113
110
  def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
upgini/sampler/random_under_sampler.py CHANGED
@@ -5,13 +5,10 @@
5
5
  # License: MIT
6
6
 
7
7
  import numpy as np
8
-
9
- from sklearn.utils import check_random_state
10
- from sklearn.utils import _safe_indexing
8
+ from sklearn.utils import _safe_indexing, check_random_state
11
9
 
12
10
  from .base import BaseUnderSampler
13
- from .utils import check_target_type
14
- from .utils import _deprecate_positional_args
11
+ from .utils import _deprecate_positional_args, check_target_type
15
12
 
16
13
 
17
14
  class RandomUnderSampler(BaseUnderSampler):
upgini/search_task.py CHANGED
@@ -8,10 +8,10 @@ import pandas as pd
8
8
 
9
9
  from upgini import dataset
10
10
  from upgini.http import (
11
- _RestClient,
12
11
  ProviderTaskSummary,
13
12
  SearchProgress,
14
13
  SearchTaskSummary,
14
+ _RestClient,
15
15
  get_rest_client,
16
16
  is_demo_api_key,
17
17
  )
@@ -295,7 +295,7 @@ class SearchTask:
295
295
  return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
296
296
 
297
297
 
298
- @lru_cache()
298
+ @lru_cache
299
299
  def _get_all_initial_raw_features_cached(
300
300
  endpoint: Optional[str],
301
301
  api_key: Optional[str],
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
328
328
  return result_df
329
329
 
330
330
 
331
- @lru_cache()
331
+ @lru_cache
332
332
  def _get_all_validation_raw_features_cached(
333
333
  endpoint: Optional[str],
334
334
  api_key: Optional[str],
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
357
357
  return result_df
358
358
 
359
359
 
360
- @lru_cache()
360
+ @lru_cache
361
361
  def _get_target_outliers_cached(
362
362
  endpoint: Optional[str],
363
363
  api_key: Optional[str],
upgini/spinner.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import threading
2
- from typing import Optional, List
3
2
  import time
3
+ from typing import List, Optional
4
4
 
5
5
 
6
6
  class Spinner:
upgini/utils/__init__.py CHANGED
@@ -2,7 +2,7 @@ import itertools
2
2
  from typing import List, Tuple
3
3
 
4
4
  import pandas as pd
5
- from pandas.api.types import is_string_dtype, is_object_dtype
5
+ from pandas.api.types import is_object_dtype, is_string_dtype
6
6
 
7
7
 
8
8
  def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
upgini/utils/base_search_key_detector.py CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
5
5
 
6
6
  class BaseSearchKeyDetector:
7
7
  def _is_search_key_by_name(self, column_name: str) -> bool:
8
- raise NotImplementedError()
8
+ raise NotImplementedError
9
9
 
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
- raise NotImplementedError()
11
+ raise NotImplementedError
12
12
 
13
13
  def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
14
  for column_name in column_names:
upgini/utils/blocked_time_series.py CHANGED
@@ -1,8 +1,10 @@
1
- import numpy as np
2
1
  import numbers
2
+
3
+ import numpy as np
4
+ from sklearn.model_selection import BaseCrossValidator
3
5
  from sklearn.utils import indexable
4
6
  from sklearn.utils.validation import _num_samples
5
- from sklearn.model_selection import BaseCrossValidator
7
+
6
8
  from upgini.resource_bundle import bundle
7
9
 
8
10
 
upgini/utils/country_utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import pandas as pd
2
- from pandas.api.types import is_string_dtype, is_object_dtype
2
+ from pandas.api.types import is_object_dtype, is_string_dtype
3
3
 
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
upgini/utils/custom_loss_utils.py CHANGED
@@ -1,6 +1,7 @@
1
- from upgini.metadata import ModelTaskType, RuntimeParameters
2
- from typing import Optional, Dict, Any
3
1
  import logging
2
+ from typing import Any, Dict, Optional
3
+
4
+ from upgini.metadata import ModelTaskType, RuntimeParameters
4
5
  from upgini.resource_bundle import bundle
5
6
 
6
7
 
upgini/utils/cv_utils.py CHANGED
@@ -1,9 +1,9 @@
1
1
  from functools import reduce
2
2
  from typing import Any, Dict, List, Optional, Tuple, Union
3
- import numpy as np
4
3
 
4
+ import numpy as np
5
5
  import pandas as pd
6
- from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
6
+ from sklearn.model_selection import BaseCrossValidator, GroupKFold, GroupShuffleSplit, KFold, TimeSeriesSplit
7
7
 
8
8
  from upgini.metadata import CVType
9
9
  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit