upgini 1.1.279__py3-none-any.whl → 1.1.279a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of upgini might be problematic. Click here for more details.

Files changed (44)
  1. upgini/ads_management/ads_manager.py +2 -4
  2. upgini/autofe/all_operands.py +2 -3
  3. upgini/autofe/binary.py +1 -2
  4. upgini/autofe/date.py +1 -2
  5. upgini/autofe/feature.py +1 -1
  6. upgini/autofe/groupby.py +1 -3
  7. upgini/autofe/operand.py +3 -4
  8. upgini/autofe/unary.py +1 -2
  9. upgini/autofe/vector.py +0 -2
  10. upgini/dataset.py +4 -4
  11. upgini/errors.py +1 -1
  12. upgini/features_enricher.py +4 -4
  13. upgini/fingerprint.js +8 -0
  14. upgini/http.py +10 -11
  15. upgini/mdc/__init__.py +3 -1
  16. upgini/mdc/context.py +6 -4
  17. upgini/metadata.py +0 -3
  18. upgini/metrics.py +99 -101
  19. upgini/normalizer/phone_normalizer.py +1 -1
  20. upgini/resource_bundle/__init__.py +5 -5
  21. upgini/sampler/base.py +4 -1
  22. upgini/sampler/random_under_sampler.py +5 -2
  23. upgini/search_task.py +4 -4
  24. upgini/spinner.py +1 -1
  25. upgini/utils/__init__.py +1 -1
  26. upgini/utils/base_search_key_detector.py +2 -2
  27. upgini/utils/blocked_time_series.py +2 -4
  28. upgini/utils/country_utils.py +1 -1
  29. upgini/utils/custom_loss_utils.py +2 -3
  30. upgini/utils/cv_utils.py +2 -2
  31. upgini/utils/datetime_utils.py +3 -9
  32. upgini/utils/email_utils.py +2 -2
  33. upgini/utils/fallback_progress_bar.py +1 -1
  34. upgini/utils/progress_bar.py +1 -1
  35. upgini/utils/sklearn_ext.py +13 -14
  36. upgini/utils/track_info.py +2 -2
  37. upgini/version_validator.py +2 -2
  38. {upgini-1.1.279.dist-info → upgini-1.1.279a2.dist-info}/METADATA +23 -21
  39. upgini-1.1.279a2.dist-info/RECORD +63 -0
  40. {upgini-1.1.279.dist-info → upgini-1.1.279a2.dist-info}/WHEEL +2 -1
  41. upgini-1.1.279a2.dist-info/top_level.txt +1 -0
  42. upgini/__about__.py +0 -1
  43. upgini-1.1.279.dist-info/RECORD +0 -62
  44. {upgini-1.1.279.dist-info/licenses → upgini-1.1.279a2.dist-info}/LICENSE +0 -0
upgini/metrics.py CHANGED
@@ -1,5 +1,3 @@
1
- from __future__ import annotations
2
-
3
1
  import inspect
4
2
  import logging
5
3
  import re
@@ -127,7 +125,7 @@ NA_REPLACEMENT = "NA"
127
125
 
128
126
  SUPPORTED_CATBOOST_METRICS = {
129
127
  s.upper(): s
130
- for s in (
128
+ for s in {
131
129
  "Logloss",
132
130
  "CrossEntropy",
133
131
  "CtrFactor",
@@ -206,7 +204,7 @@ SUPPORTED_CATBOOST_METRICS = {
206
204
  "MultiLogloss",
207
205
  "MultiCrossEntropy",
208
206
  "Combination",
209
- )
207
+ }
210
208
  }
211
209
 
212
210
 
@@ -238,71 +236,71 @@ class EstimatorWrapper:
238
236
  self.text_features = text_features
239
237
  self.logger = logger or logging.getLogger()
240
238
 
241
- def fit(self, x: pd.DataFrame, y: np.ndarray, **kwargs):
242
- x, y, _, fit_params = self._prepare_to_fit(x, y)
239
+ def fit(self, X: pd.DataFrame, y: np.ndarray, **kwargs):
240
+ X, y, _, fit_params = self._prepare_to_fit(X, y)
243
241
  kwargs.update(fit_params)
244
- self.estimator.fit(x, y, **kwargs)
242
+ self.estimator.fit(X, y, **kwargs)
245
243
  return self
246
244
 
247
245
  def predict(self, **kwargs):
248
246
  return self.estimator.predict(**kwargs)
249
247
 
250
- def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
251
- x, y, groups = self._prepare_data(x, y, groups=self.groups)
252
- return x, y, groups, {}
248
+ def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
249
+ X, y, groups = self._prepare_data(X, y, groups=self.groups)
250
+ return X, y, groups, {}
253
251
 
254
252
  def _prepare_data(
255
- self, x: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
253
+ self, X: pd.DataFrame, y: pd.Series, groups: Optional[np.ndarray] = None
256
254
  ) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray]:
257
- for c in x.columns:
258
- if is_numeric_dtype(x[c]):
259
- x[c] = x[c].astype(float)
255
+ for c in X.columns:
256
+ if is_numeric_dtype(X[c]):
257
+ X[c] = X[c].astype(float)
260
258
  else:
261
- x[c] = x[c].astype(str)
259
+ X[c] = X[c].astype(str)
262
260
 
263
261
  if not isinstance(y, pd.Series):
264
262
  raise Exception(bundle.get("metrics_unsupported_target_type").format(type(y)))
265
263
 
266
264
  if groups is not None:
267
- x = x.copy()
268
- x["__groups"] = groups
269
- x, y = self._remove_empty_target_rows(x, y)
270
- groups = x["__groups"]
271
- x = x.drop(columns="__groups")
265
+ X = X.copy()
266
+ X["__groups"] = groups
267
+ X, y = self._remove_empty_target_rows(X, y)
268
+ groups = X["__groups"]
269
+ X = X.drop(columns="__groups")
272
270
  else:
273
- x, y = self._remove_empty_target_rows(x, y)
271
+ X, y = self._remove_empty_target_rows(X, y)
274
272
 
275
- return x, y, groups
273
+ return X, y, groups
276
274
 
277
- def _remove_empty_target_rows(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
278
- joined = pd.concat([x, y], axis=1)
275
+ def _remove_empty_target_rows(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series]:
276
+ joined = pd.concat([X, y], axis=1)
279
277
  joined = joined[joined[y.name].notna()]
280
278
  joined = joined.reset_index(drop=True)
281
- x = joined.drop(columns=y.name)
279
+ X = joined.drop(columns=y.name)
282
280
  y = np.array(list(joined[y.name].values))
283
281
 
284
- return x, y
282
+ return X, y
285
283
 
286
- def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
287
- x, y, _ = self._prepare_data(x, y)
288
- return x, y, {}
284
+ def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
285
+ X, y, _ = self._prepare_data(X, y)
286
+ return X, y, {}
289
287
 
290
288
  def cross_val_predict(
291
- self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
289
+ self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
292
290
  ) -> Optional[float]:
293
- x, y, groups, fit_params = self._prepare_to_fit(x, y)
291
+ X, y, groups, fit_params = self._prepare_to_fit(X, y)
294
292
 
295
- if x.shape[1] == 0:
293
+ if X.shape[1] == 0:
296
294
  return None
297
295
 
298
296
  scorer = check_scoring(self.estimator, scoring=self.scorer)
299
297
 
300
298
  if baseline_score_column is not None and self.metric_name == "GINI":
301
- metric = roc_auc_score(y, x[baseline_score_column])
299
+ metric = roc_auc_score(y, X[baseline_score_column])
302
300
  else:
303
301
  cv_results = cross_validate(
304
302
  estimator=self.estimator,
305
- x=x,
303
+ X=X,
306
304
  y=y,
307
305
  scoring=scorer,
308
306
  cv=self.cv,
@@ -322,14 +320,14 @@ class EstimatorWrapper:
322
320
  metric = 2 * metric - 1
323
321
  return metric
324
322
 
325
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
326
- x, y, _ = self._prepare_to_calculate(x, y)
323
+ def calculate_metric(self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
324
+ X, y, _ = self._prepare_to_calculate(X, y)
327
325
  if baseline_score_column is not None and self.metric_name == "GINI":
328
- metric = roc_auc_score(y, x[baseline_score_column])
326
+ metric = roc_auc_score(y, X[baseline_score_column])
329
327
  else:
330
328
  metrics = []
331
329
  for est in self.cv_estimators:
332
- metrics.append(self.scorer(est, x, y))
330
+ metrics.append(self.scorer(est, X, y))
333
331
 
334
332
  metric = np.mean(metrics) * self.multiplier
335
333
  return self.post_process_metric(metric)
@@ -340,13 +338,13 @@ class EstimatorWrapper:
340
338
  logger: logging.Logger,
341
339
  target_type: ModelTaskType,
342
340
  cv: BaseCrossValidator,
343
- x: pd.DataFrame,
341
+ X: pd.DataFrame,
344
342
  scoring: Union[Callable, str, None] = None,
345
343
  cat_features: Optional[List[str]] = None,
346
344
  text_features: Optional[List[str]] = None,
347
345
  add_params: Optional[Dict[str, Any]] = None,
348
346
  groups: Optional[List[str]] = None,
349
- ) -> EstimatorWrapper:
347
+ ) -> "EstimatorWrapper":
350
348
  scorer, metric_name, multiplier = _get_scorer(target_type, scoring)
351
349
  kwargs = {
352
350
  "scorer": scorer,
@@ -382,20 +380,20 @@ class EstimatorWrapper:
382
380
  else:
383
381
  estimator_copy = deepcopy(estimator)
384
382
  kwargs["estimator"] = estimator_copy
385
- if isinstance(estimator, (CatBoostClassifier, CatBoostRegressor)):
383
+ if isinstance(estimator, CatBoostClassifier) or isinstance(estimator, CatBoostRegressor):
386
384
  if cat_features is not None:
387
385
  for cat_feature in cat_features:
388
- if cat_feature not in x.columns:
386
+ if cat_feature not in X.columns:
389
387
  logger.error(
390
- f"Client cat_feature `{cat_feature}` not found in x columns: {x.columns.to_list()}"
388
+ f"Client cat_feature `{cat_feature}` not found in X columns: {X.columns.to_list()}"
391
389
  )
392
390
  estimator_copy.set_params(
393
- cat_features=[x.columns.get_loc(cat_feature) for cat_feature in cat_features]
391
+ cat_features=[X.columns.get_loc(cat_feature) for cat_feature in cat_features]
394
392
  )
395
393
  estimator = CatBoostWrapper(**kwargs)
396
394
  else:
397
395
  try:
398
- if isinstance(estimator, (LGBMClassifier, LGBMRegressor)):
396
+ if isinstance(estimator, LGBMClassifier) or isinstance(estimator, LGBMRegressor):
399
397
  estimator = LightGBMWrapper(**kwargs)
400
398
  else:
401
399
  logger.warning(
@@ -441,20 +439,20 @@ class CatBoostWrapper(EstimatorWrapper):
441
439
  self.emb_features = None
442
440
  self.exclude_features = []
443
441
 
444
- def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
445
- x, y, groups, params = super()._prepare_to_fit(x, y)
442
+ def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
443
+ X, y, groups, params = super()._prepare_to_fit(X, y)
446
444
 
447
445
  # Find embeddings
448
446
  if hasattr(CatBoostClassifier, "get_embedding_feature_indices"):
449
447
  emb_pattern = r"(.+)_emb\d+"
450
- self.emb_features = [c for c in x.columns if re.match(emb_pattern, c) and is_numeric_dtype(x[c])]
448
+ self.emb_features = [c for c in X.columns if re.match(emb_pattern, c) and is_numeric_dtype(X[c])]
451
449
  embedding_features = []
452
450
  if len(self.emb_features) > 3: # There is no reason to reduce embeddings dimension with less than 4
453
451
  self.logger.info(
454
452
  "Embedding features count more than 3, so group them into one vector for CatBoost: "
455
453
  f"{self.emb_features}"
456
454
  )
457
- x, embedding_features = self.group_embeddings(x)
455
+ X, embedding_features = self.group_embeddings(X)
458
456
  params["embedding_features"] = embedding_features
459
457
  else:
460
458
  self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
@@ -466,7 +464,7 @@ class CatBoostWrapper(EstimatorWrapper):
466
464
  if hasattr(CatBoostClassifier, "get_text_feature_indices"):
467
465
  if self.text_features is not None:
468
466
  self.logger.info(f"Passed text features for CatBoost: {self.text_features}")
469
- self.text_features = [f for f in self.text_features if f in x.columns and not is_numeric_dtype(x[f])]
467
+ self.text_features = [f for f in self.text_features if f in X.columns and not is_numeric_dtype(X[f])]
470
468
  self.logger.info(f"Rest text features after checks: {self.text_features}")
471
469
  params["text_features"] = self.text_features
472
470
  else:
@@ -474,15 +472,15 @@ class CatBoostWrapper(EstimatorWrapper):
474
472
  self.logger.warning(f"Text features are not supported by this Catboost version {catboost.__version__}")
475
473
 
476
474
  # Find rest categorical features
477
- self.cat_features = _get_cat_features(x, self.text_features, embedding_features)
478
- x = fill_na_cat_features(x, self.cat_features)
475
+ self.cat_features = _get_cat_features(X, self.text_features, embedding_features)
476
+ X = fill_na_cat_features(X, self.cat_features)
479
477
  unique_cat_features = []
480
478
  for name in self.cat_features:
481
479
  # Remove constant categorical features
482
- if x[name].nunique() > 1:
480
+ if X[name].nunique() > 1:
483
481
  unique_cat_features.append(name)
484
482
  else:
485
- x = x.drop(columns=name)
483
+ X = X.drop(columns=name)
486
484
  self.cat_features = unique_cat_features
487
485
  if (
488
486
  hasattr(self.estimator, "get_param")
@@ -491,9 +489,9 @@ class CatBoostWrapper(EstimatorWrapper):
491
489
  ):
492
490
  estimator_cat_features = self.estimator.get_param("cat_features")
493
491
  if all([isinstance(c, int) for c in estimator_cat_features]):
494
- cat_features_idx = {x.columns.get_loc(c) for c in self.cat_features}
492
+ cat_features_idx = {X.columns.get_loc(c) for c in self.cat_features}
495
493
  cat_features_idx.update(estimator_cat_features)
496
- self.cat_features = [x.columns[idx] for idx in sorted(cat_features_idx)]
494
+ self.cat_features = [X.columns[idx] for idx in sorted(cat_features_idx)]
497
495
  elif all([isinstance(c, str) for c in estimator_cat_features]):
498
496
  self.cat_features = list(set(self.cat_features + estimator_cat_features))
499
497
  else:
@@ -504,7 +502,7 @@ class CatBoostWrapper(EstimatorWrapper):
504
502
  self.logger.info(f"Selected categorical features: {self.cat_features}")
505
503
  params["cat_features"] = self.cat_features
506
504
 
507
- return x, y, groups, params
505
+ return X, y, groups, params
508
506
 
509
507
  def group_embeddings(self, df: pd.DataFrame):
510
508
  emb_name = "__grouped_embeddings"
@@ -515,38 +513,38 @@ class CatBoostWrapper(EstimatorWrapper):
515
513
 
516
514
  return df, [emb_name]
517
515
 
518
- def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
516
+ def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
519
517
  if self.exclude_features:
520
- x = x.drop(columns=self.exclude_features)
521
- x, y, params = super()._prepare_to_calculate(x, y)
518
+ X = X.drop(columns=self.exclude_features)
519
+ X, y, params = super()._prepare_to_calculate(X, y)
522
520
  if self.text_features:
523
521
  params["text_features"] = self.text_features
524
522
  if self.emb_features:
525
- x, emb_columns = self.group_embeddings(x)
523
+ X, emb_columns = self.group_embeddings(X)
526
524
  params["embedding_features"] = emb_columns
527
525
  if self.cat_features:
528
- x = fill_na_cat_features(x, self.cat_features)
526
+ X = fill_na_cat_features(X, self.cat_features)
529
527
  params["cat_features"] = self.cat_features
530
528
 
531
- return x, y, params
529
+ return X, y, params
532
530
 
533
531
  def cross_val_predict(
534
- self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
532
+ self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
535
533
  ) -> Optional[float]:
536
534
  try:
537
- return super().cross_val_predict(x, y, baseline_score_column)
535
+ return super().cross_val_predict(X, y, baseline_score_column)
538
536
  except Exception as e:
539
537
  if "Dictionary size is 0" in e.args[0] and self.text_features:
540
- high_cardinality_features = FeaturesValidator.find_high_cardinality(x[self.text_features])
538
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
541
539
  self.logger.warning(
542
- "Calculate metrics has problem with CatBoost text features. Try to remove high cardinality"
540
+ "Failed to calculate metrics. Try to remove high cardinality"
543
541
  f" text features {high_cardinality_features} and retry"
544
542
  )
545
543
  for f in high_cardinality_features:
546
544
  self.text_features.remove(f)
547
545
  self.exclude_features.append(f)
548
- x = x.drop(columns=f)
549
- return super().cross_val_predict(x, y, baseline_score_column)
546
+ X = X.drop(columns=f)
547
+ return super().cross_val_predict(X, y, baseline_score_column)
550
548
  else:
551
549
  raise e
552
550
 
@@ -577,26 +575,26 @@ class LightGBMWrapper(EstimatorWrapper):
577
575
  )
578
576
  self.cat_features = None
579
577
 
580
- def _prepare_to_fit(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
581
- x, y, groups, params = super()._prepare_to_fit(x, y)
582
- self.cat_features = _get_cat_features(x)
583
- x = fill_na_cat_features(x, self.cat_features)
578
+ def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, pd.Series, np.ndarray, dict]:
579
+ X, y, groups, params = super()._prepare_to_fit(X, y)
580
+ self.cat_features = _get_cat_features(X)
581
+ X = fill_na_cat_features(X, self.cat_features)
584
582
  for feature in self.cat_features:
585
- x[feature] = x[feature].astype("category").cat.codes
583
+ X[feature] = X[feature].astype("category").cat.codes
586
584
  if not is_numeric_dtype(y):
587
585
  y = correct_string_target(y)
588
586
 
589
- return x, y, groups, params
587
+ return X, y, groups, params
590
588
 
591
- def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
592
- x, y, params = super()._prepare_to_calculate(x, y)
589
+ def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
590
+ X, y, params = super()._prepare_to_calculate(X, y)
593
591
  if self.cat_features is not None:
594
- x = fill_na_cat_features(x, self.cat_features)
592
+ X = fill_na_cat_features(X, self.cat_features)
595
593
  for feature in self.cat_features:
596
- x[feature] = x[feature].astype("category").cat.codes
594
+ X[feature] = X[feature].astype("category").cat.codes
597
595
  if not is_numeric_dtype(y):
598
596
  y = correct_string_target(y)
599
- return x, y, params
597
+ return X, y, params
600
598
 
601
599
 
602
600
  class OtherEstimatorWrapper(EstimatorWrapper):
@@ -625,31 +623,31 @@ class OtherEstimatorWrapper(EstimatorWrapper):
625
623
  )
626
624
  self.cat_features = None
627
625
 
628
- def _prepare_to_fit(self, x: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
629
- x, y, groups, params = super()._prepare_to_fit(x, y)
630
- self.cat_features = _get_cat_features(x)
631
- num_features = [col for col in x.columns if col not in self.cat_features]
632
- x[num_features] = x[num_features].fillna(-999)
633
- x = fill_na_cat_features(x, self.cat_features)
626
+ def _prepare_to_fit(self, X: pd.DataFrame, y: np.ndarray) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
627
+ X, y, groups, params = super()._prepare_to_fit(X, y)
628
+ self.cat_features = _get_cat_features(X)
629
+ num_features = [col for col in X.columns if col not in self.cat_features]
630
+ X[num_features] = X[num_features].fillna(-999)
631
+ X = fill_na_cat_features(X, self.cat_features)
634
632
  # TODO use one-hot encoding if cardinality is less 50
635
633
  for feature in self.cat_features:
636
- x[feature] = x[feature].astype("category").cat.codes
634
+ X[feature] = X[feature].astype("category").cat.codes
637
635
  if not is_numeric_dtype(y):
638
636
  y = correct_string_target(y)
639
- return x, y, groups, params
637
+ return X, y, groups, params
640
638
 
641
- def _prepare_to_calculate(self, x: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
642
- x, y, params = super()._prepare_to_calculate(x, y)
639
+ def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
640
+ X, y, params = super()._prepare_to_calculate(X, y)
643
641
  if self.cat_features is not None:
644
- num_features = [col for col in x.columns if col not in self.cat_features]
645
- x[num_features] = x[num_features].fillna(-999)
646
- x = fill_na_cat_features(x, self.cat_features)
642
+ num_features = [col for col in X.columns if col not in self.cat_features]
643
+ X[num_features] = X[num_features].fillna(-999)
644
+ X = fill_na_cat_features(X, self.cat_features)
647
645
  # TODO use one-hot encoding if cardinality is less 50
648
646
  for feature in self.cat_features:
649
- x[feature] = x[feature].astype("category").cat.codes
647
+ X[feature] = X[feature].astype("category").cat.codes
650
648
  if not is_numeric_dtype(y):
651
649
  y = correct_string_target(y)
652
- return x, y, params
650
+ return X, y, params
653
651
 
654
652
 
655
653
  def validate_scoring_argument(scoring: Union[Callable, str, None]):
@@ -659,20 +657,20 @@ def validate_scoring_argument(scoring: Union[Callable, str, None]):
659
657
  spec = inspect.getfullargspec(scoring)
660
658
  if len(spec.args) < 3:
661
659
  raise ValidationError(
662
- f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, x, y"
660
+ f"Invalid scoring function passed {scoring}. It should accept 3 input arguments: estimator, X, y"
663
661
  )
664
662
 
665
663
 
666
664
  def _get_scorer_by_name(scoring: str) -> Tuple[Callable, str, int]:
667
665
  metric_name = scoring
668
666
  multiplier = 1
669
- if metric_name == "mean_squared_log_error" or metric_name == "MSLE" or metric_name == "msle":
667
+ if "mean_squared_log_error" == metric_name or "MSLE" == metric_name or "msle" == metric_name:
670
668
  scoring = make_scorer(_ext_mean_squared_log_error, greater_is_better=False)
671
669
  multiplier = -1
672
- elif "root_mean_squared_log_error" in metric_name or metric_name == "RMSLE" or metric_name == "rmsle":
670
+ elif "root_mean_squared_log_error" in metric_name or "RMSLE" == metric_name or "rmsle" == metric_name:
673
671
  scoring = make_scorer(_ext_root_mean_squared_log_error, greater_is_better=False)
674
672
  multiplier = -1
675
- elif metric_name == "root_mean_squared_error" or metric_name == "RMSE" or metric_name == "rmse":
673
+ elif "root_mean_squared_error" == metric_name or "RMSE" == metric_name or "rmse" == metric_name:
676
674
  scoring = get_scorer("neg_root_mean_squared_error")
677
675
  multiplier = -1
678
676
  elif scoring in available_scorers:
@@ -724,12 +722,12 @@ def _get_scorer(target_type: ModelTaskType, scoring: Union[Callable, str, None])
724
722
 
725
723
 
726
724
  def _get_cat_features(
727
- x: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
725
+ X: pd.DataFrame, text_features: Optional[List[str]] = None, emb_features: Optional[List[str]] = None
728
726
  ) -> List[str]:
729
727
  text_features = text_features or []
730
728
  emb_features = emb_features or []
731
729
  exclude_features = text_features + emb_features
732
- return [c for c in x.columns if c not in exclude_features and not is_numeric_dtype(x[c])]
730
+ return [c for c in X.columns if c not in exclude_features and not is_numeric_dtype(X[c])]
733
731
 
734
732
 
735
733
  def _get_add_params(input_params, add_params):
upgini/normalizer/phone_normalizer.py CHANGED
@@ -1,7 +1,7 @@
1
1
  from typing import Optional
2
2
 
3
3
  import pandas as pd
4
- from pandas.api.types import is_float_dtype, is_int64_dtype, is_object_dtype, is_string_dtype
4
+ from pandas.api.types import is_float_dtype, is_int64_dtype, is_string_dtype, is_object_dtype
5
5
 
6
6
  from upgini.errors import ValidationError
7
7
 
upgini/resource_bundle/__init__.py CHANGED
@@ -17,7 +17,7 @@ __author__ = "Felix Zenk"
17
17
  __email__ = "felix.zenk@web.de"
18
18
 
19
19
 
20
- class _Parser:
20
+ class _Parser(object):
21
21
  """
22
22
  A parser for the .properties file format.
23
23
  """
@@ -49,7 +49,7 @@ class _Parser:
49
49
  return re.sub(pattern, lambda match: codecs.decode(match.group(0), "unicode-escape"), arg)
50
50
 
51
51
  # I/O read
52
- with open(file_path, encoding="utf-8") as f:
52
+ with open(file_path, mode="r", encoding="utf-8") as f:
53
53
  lines = f.readlines()
54
54
 
55
55
  # parse
@@ -83,7 +83,7 @@ class _Parser:
83
83
  return mapping
84
84
 
85
85
 
86
- class ResourceBundle:
86
+ class ResourceBundle(object):
87
87
  """
88
88
  A ResourceBundle manages internationalization of string resources
89
89
  """
@@ -199,7 +199,7 @@ class ResourceBundle:
199
199
  raise NotInResourceBundleError(self.name, item)
200
200
 
201
201
 
202
- def get_bundle(bundle_name: str, locale: str | Sequence[str] = None, path: Path | str = None) -> ResourceBundle:
202
+ def get_bundle(bundle_name: str, locale: str | Sequence[str | str] = None, path: Path | str = None) -> ResourceBundle:
203
203
  """
204
204
  Return a new :class:`ResourceBundle` after parsing the locale
205
205
 
@@ -224,7 +224,7 @@ bundle = ResourceBundle("strings", None, path=os.path.dirname(os.path.realpath(_
224
224
  custom_bundles = dict()
225
225
 
226
226
 
227
- def get_custom_bundle(custom_cfg: Optional[str] = None) -> ResourceBundle:
227
+ def get_custom_bundle(custom_cfg: Optional[str] = None) -> "ResourceBundle":
228
228
  global custom_bundles
229
229
  if custom_cfg is not None:
230
230
  custom_bundle = custom_bundles.get(custom_cfg)
upgini/sampler/base.py CHANGED
@@ -9,11 +9,13 @@ from abc import ABCMeta, abstractmethod
9
9
  from typing import List, Optional
10
10
 
11
11
  import numpy as np
12
+
12
13
  from sklearn.base import BaseEstimator
13
14
  from sklearn.preprocessing import label_binarize
14
15
  from sklearn.utils.multiclass import check_classification_targets
15
16
 
16
- from .utils import ArraysTransformer, check_sampling_strategy, check_target_type
17
+ from .utils import check_sampling_strategy, check_target_type
18
+ from .utils import ArraysTransformer
17
19
 
18
20
 
19
21
  class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
@@ -105,6 +107,7 @@ class SamplerMixin(BaseEstimator, metaclass=ABCMeta):
105
107
  The corresponding label of `X_resampled`.
106
108
 
107
109
  """
110
+ pass
108
111
 
109
112
  @abstractmethod
110
113
  def _check_X_y(self, X, y, accept_sparse: Optional[List[str]] = None):
upgini/sampler/random_under_sampler.py CHANGED
@@ -5,10 +5,13 @@
5
5
  # License: MIT
6
6
 
7
7
  import numpy as np
8
- from sklearn.utils import _safe_indexing, check_random_state
8
+
9
+ from sklearn.utils import check_random_state
10
+ from sklearn.utils import _safe_indexing
9
11
 
10
12
  from .base import BaseUnderSampler
11
- from .utils import _deprecate_positional_args, check_target_type
13
+ from .utils import check_target_type
14
+ from .utils import _deprecate_positional_args
12
15
 
13
16
 
14
17
  class RandomUnderSampler(BaseUnderSampler):
upgini/search_task.py CHANGED
@@ -8,10 +8,10 @@ import pandas as pd
8
8
 
9
9
  from upgini import dataset
10
10
  from upgini.http import (
11
+ _RestClient,
11
12
  ProviderTaskSummary,
12
13
  SearchProgress,
13
14
  SearchTaskSummary,
14
- _RestClient,
15
15
  get_rest_client,
16
16
  is_demo_api_key,
17
17
  )
@@ -295,7 +295,7 @@ class SearchTask:
295
295
  return self.rest_client.get_search_file_metadata(self.search_task_id, trace_id)
296
296
 
297
297
 
298
- @lru_cache
298
+ @lru_cache()
299
299
  def _get_all_initial_raw_features_cached(
300
300
  endpoint: Optional[str],
301
301
  api_key: Optional[str],
@@ -328,7 +328,7 @@ def _get_all_initial_raw_features_cached(
328
328
  return result_df
329
329
 
330
330
 
331
- @lru_cache
331
+ @lru_cache()
332
332
  def _get_all_validation_raw_features_cached(
333
333
  endpoint: Optional[str],
334
334
  api_key: Optional[str],
@@ -357,7 +357,7 @@ def _get_all_validation_raw_features_cached(
357
357
  return result_df
358
358
 
359
359
 
360
- @lru_cache
360
+ @lru_cache()
361
361
  def _get_target_outliers_cached(
362
362
  endpoint: Optional[str],
363
363
  api_key: Optional[str],
upgini/spinner.py CHANGED
@@ -1,6 +1,6 @@
1
1
  import threading
2
+ from typing import Optional, List
2
3
  import time
3
- from typing import List, Optional
4
4
 
5
5
 
6
6
  class Spinner:
upgini/utils/__init__.py CHANGED
@@ -2,7 +2,7 @@ import itertools
2
2
  from typing import List, Tuple
3
3
 
4
4
  import pandas as pd
5
- from pandas.api.types import is_object_dtype, is_string_dtype
5
+ from pandas.api.types import is_string_dtype, is_object_dtype
6
6
 
7
7
 
8
8
  def combine_search_keys(search_keys: List[str]) -> List[Tuple[str]]:
upgini/utils/base_search_key_detector.py CHANGED
@@ -5,10 +5,10 @@ import pandas as pd
5
5
 
6
6
  class BaseSearchKeyDetector:
7
7
  def _is_search_key_by_name(self, column_name: str) -> bool:
8
- raise NotImplementedError
8
+ raise NotImplementedError()
9
9
 
10
10
  def _is_search_key_by_values(self, column: pd.Series) -> bool:
11
- raise NotImplementedError
11
+ raise NotImplementedError()
12
12
 
13
13
  def _get_search_key_by_name(self, column_names: List[str]) -> Optional[str]:
14
14
  for column_name in column_names:
upgini/utils/blocked_time_series.py CHANGED
@@ -1,10 +1,8 @@
1
- import numbers
2
-
3
1
  import numpy as np
4
- from sklearn.model_selection import BaseCrossValidator
2
+ import numbers
5
3
  from sklearn.utils import indexable
6
4
  from sklearn.utils.validation import _num_samples
7
-
5
+ from sklearn.model_selection import BaseCrossValidator
8
6
  from upgini.resource_bundle import bundle
9
7
 
10
8
 
upgini/utils/country_utils.py CHANGED
@@ -1,5 +1,5 @@
1
1
  import pandas as pd
2
- from pandas.api.types import is_object_dtype, is_string_dtype
2
+ from pandas.api.types import is_string_dtype, is_object_dtype
3
3
 
4
4
  from upgini.utils.base_search_key_detector import BaseSearchKeyDetector
5
5
 
upgini/utils/custom_loss_utils.py CHANGED
@@ -1,7 +1,6 @@
1
- import logging
2
- from typing import Any, Dict, Optional
3
-
4
1
  from upgini.metadata import ModelTaskType, RuntimeParameters
2
+ from typing import Optional, Dict, Any
3
+ import logging
5
4
  from upgini.resource_bundle import bundle
6
5
 
7
6
 
upgini/utils/cv_utils.py CHANGED
@@ -1,9 +1,9 @@
1
1
  from functools import reduce
2
2
  from typing import Any, Dict, List, Optional, Tuple, Union
3
-
4
3
  import numpy as np
4
+
5
5
  import pandas as pd
6
- from sklearn.model_selection import BaseCrossValidator, GroupKFold, GroupShuffleSplit, KFold, TimeSeriesSplit
6
+ from sklearn.model_selection import BaseCrossValidator, KFold, TimeSeriesSplit, GroupKFold, GroupShuffleSplit
7
7
 
8
8
  from upgini.metadata import CVType
9
9
  from upgini.utils.blocked_time_series import BlockedTimeSeriesSplit