upgini 1.1.267__py3-none-any.whl → 1.1.268__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
upgini/features_enricher.py CHANGED
@@ -3665,7 +3665,7 @@ class FeaturesEnricher(TransformerMixin):
3665
3665
  if y is not None:
3666
3666
  with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
3667
3667
  pickle.dump(sample(y, xy_sample_index), y_file)
3668
- if eval_set is not None:
3668
+ if eval_set:
3669
3669
  eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
3670
3670
  with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
3671
3671
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
upgini/metrics.py CHANGED
@@ -3,15 +3,16 @@ import re
3
3
  from copy import deepcopy
4
4
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
5
 
6
+ import catboost
6
7
  import numpy as np
7
8
  import pandas as pd
8
9
  from catboost import CatBoostClassifier, CatBoostRegressor
9
- import catboost
10
10
  from lightgbm import LGBMClassifier, LGBMRegressor
11
11
  from numpy import log1p
12
12
  from pandas.api.types import is_numeric_dtype
13
13
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
14
14
 
15
+ from upgini.utils.features_validator import FeaturesValidator
15
16
  from upgini.utils.sklearn_ext import cross_validate
16
17
 
17
18
  try:
@@ -352,6 +353,7 @@ class EstimatorWrapper:
352
353
  "target_type": target_type,
353
354
  "groups": groups,
354
355
  "text_features": text_features,
356
+ "logger": logger,
355
357
  }
356
358
  if estimator is None:
357
359
  params = dict()
@@ -414,12 +416,22 @@ class CatBoostWrapper(EstimatorWrapper):
414
416
  target_type: ModelTaskType,
415
417
  groups: Optional[List[str]] = None,
416
418
  text_features: Optional[List[str]] = None,
419
+ logger: Optional[logging.Logger] = None,
417
420
  ):
418
421
  super(CatBoostWrapper, self).__init__(
419
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
422
+ estimator,
423
+ scorer,
424
+ metric_name,
425
+ multiplier,
426
+ cv,
427
+ target_type,
428
+ groups=groups,
429
+ text_features=text_features,
430
+ logger=logger,
420
431
  )
421
432
  self.cat_features = None
422
433
  self.emb_features = None
434
+ self.exclude_features = []
423
435
 
424
436
  def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
425
437
  X, y, groups, params = super()._prepare_to_fit(X, y)
@@ -437,9 +449,7 @@ class CatBoostWrapper(EstimatorWrapper):
437
449
  X, embedding_features = self.group_embeddings(X)
438
450
  params["embedding_features"] = embedding_features
439
451
  else:
440
- self.logger.info(
441
- f"Embedding features count less than 3, so use them separately: {self.emb_features}"
442
- )
452
+ self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
443
453
  self.emb_features = []
444
454
  else:
445
455
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
@@ -498,6 +508,8 @@ class CatBoostWrapper(EstimatorWrapper):
498
508
  return df, [emb_name]
499
509
 
500
510
  def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
511
+ if self.exclude_features:
512
+ X = X.drop(columns=self.exclude_features)
501
513
  X, y, params = super()._prepare_to_calculate(X, y)
502
514
  if self.text_features:
503
515
  params["text_features"] = self.text_features
@@ -510,6 +522,26 @@ class CatBoostWrapper(EstimatorWrapper):
510
522
 
511
523
  return X, y, params
512
524
 
525
+ def cross_val_predict(
526
+ self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
527
+ ) -> Optional[float]:
528
+ try:
529
+ return super().cross_val_predict(X, y, baseline_score_column)
530
+ except Exception as e:
531
+ if "Dictionary size is 0" in e.args[0] and self.text_features:
532
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
533
+ self.logger.warning(
534
+ "Failed to calculate metrics. Try to remove high cardinality"
535
+ f" text features {high_cardinality_features} and retry"
536
+ )
537
+ for f in high_cardinality_features:
538
+ self.text_features.remove(f)
539
+ self.exclude_features.append(f)
540
+ X = X.drop(columns=f)
541
+ return super().cross_val_predict(X, y, baseline_score_column)
542
+ else:
543
+ raise e
544
+
513
545
 
514
546
  class LightGBMWrapper(EstimatorWrapper):
515
547
  def __init__(
@@ -522,9 +554,18 @@ class LightGBMWrapper(EstimatorWrapper):
522
554
  target_type: ModelTaskType,
523
555
  groups: Optional[List[str]] = None,
524
556
  text_features: Optional[List[str]] = None,
557
+ logger: Optional[logging.Logger] = None,
525
558
  ):
526
559
  super(LightGBMWrapper, self).__init__(
527
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
560
+ estimator,
561
+ scorer,
562
+ metric_name,
563
+ multiplier,
564
+ cv,
565
+ target_type,
566
+ groups=groups,
567
+ text_features=text_features,
568
+ logger=logger,
528
569
  )
529
570
  self.cat_features = None
530
571
 
@@ -561,9 +602,18 @@ class OtherEstimatorWrapper(EstimatorWrapper):
561
602
  target_type: ModelTaskType,
562
603
  groups: Optional[List[str]] = None,
563
604
  text_features: Optional[List[str]] = None,
605
+ logger: Optional[logging.Logger] = None,
564
606
  ):
565
607
  super(OtherEstimatorWrapper, self).__init__(
566
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
608
+ estimator,
609
+ scorer,
610
+ metric_name,
611
+ multiplier,
612
+ cv,
613
+ target_type,
614
+ groups=groups,
615
+ text_features=text_features,
616
+ logger=logger,
567
617
  )
568
618
  self.cat_features = None
569
619
 
upgini/utils/sklearn_ext.py CHANGED
@@ -1,5 +1,4 @@
1
1
  import functools
2
- import logging
3
2
  import numbers
4
3
  import time
5
4
  import warnings
@@ -313,7 +312,7 @@ def cross_validate(
313
312
 
314
313
  return ret
315
314
  except Exception:
316
- logging.exception("Failed to execute overriden cross_validate. Fallback to original")
315
+ # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
317
316
  raise
318
317
  # fit_params["use_best_model"] = False
319
318
  # return original_cross_validate(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.267
3
+ Version: 1.1.268
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
upgini-1.1.268.dist-info/RECORD CHANGED
@@ -2,11 +2,11 @@ upgini/__init__.py,sha256=asENHgEVHQBIkV-e_0IhE_ZWqkCG6398U3ZLrNzAH6k,407
2
2
  upgini/ads.py,sha256=mre6xn44wcC_fg63iLT_kTh4mViZqR9AKRJZAtpQz8Y,2592
3
3
  upgini/dataset.py,sha256=xb4gIANyGbdcuM8Awyq2pJPiH_3k_LEbETApJgAoRBA,45529
4
4
  upgini/errors.py,sha256=pdzQl3MKuK52yvncxMWMRWeSIOGhUFzpQoszoRFBOk0,958
5
- upgini/features_enricher.py,sha256=poGGf5MZgangMFmfTxRWtE6FDPDy5VUtXLmW2tGiorI,174170
5
+ upgini/features_enricher.py,sha256=1vHhSQBnFsq6IoYaG_oJbgEWqRZMpkt1rkoLOD-6nl4,174158
6
6
  upgini/fingerprint.js,sha256=VygVIQlN1v4NGZfjHqtRogOw8zjTnnMNJg_f7M5iGQU,33442
7
7
  upgini/http.py,sha256=zaO86LBBLmkieGbgYifk29eVoPCxXimZQ8YkQtKcM0I,42244
8
8
  upgini/metadata.py,sha256=fwVxtkR6Mn4iRoOqV6BfMJvJrx65I3YwZUMbZjhPyOI,9673
9
- upgini/metrics.py,sha256=3VvSZW1cCOIPHImXuqcnWzD3fWcpPzVa9k8eulLbUmY,27426
9
+ upgini/metrics.py,sha256=VmxVc-plbRPZ1U3Ve3E-FZkhYqi0X2r7x8H5L-shux4,29058
10
10
  upgini/search_task.py,sha256=tmJ17WUxv3J5NWrYUJB_NKdZ792Ifz8Z8UnDXeQnpss,17077
11
11
  upgini/spinner.py,sha256=Dm1dQ5F_z_Ua2odLxZX7OypcOX9tSx_vE5MGaKtUmfw,1118
12
12
  upgini/version_validator.py,sha256=rDIncP6BEko4J2F2hUcMOtKm_vZbI4ICWcNcw8hrwM4,1400
@@ -52,12 +52,12 @@ upgini/utils/ip_utils.py,sha256=Zf3F2cnQmOCH09QLQHetpjMFu1PnD0cTmDymn0SnSy8,1672
52
52
  upgini/utils/phone_utils.py,sha256=JNSkF8G6mgsN8Czy11pamaJdsY6rBINEMpi7jbVt_RA,408
53
53
  upgini/utils/postal_code_utils.py,sha256=_8CR9tBqsPptQsmMUvnrCAmBaMIQSWH3JfJ4ly3x_zs,409
54
54
  upgini/utils/progress_bar.py,sha256=iNXyqT3vKCeHpfiG5HHwr7Lk2cTtKViM93Fl8iZnjGc,1564
55
- upgini/utils/sklearn_ext.py,sha256=fvuTWJ5AnT3ED9KSaQu_yIgW2JR19hFlaGDoVP3k60g,44027
55
+ upgini/utils/sklearn_ext.py,sha256=e1aMNXk1zUt7uFnl0FcUF0zOnaXSE7z5xBHmJPknUVs,44014
56
56
  upgini/utils/target_utils.py,sha256=9K67tkY7LWhQMO-vbbPqBaO-KriAmg_6fVz5RQRaLQc,7802
57
57
  upgini/utils/track_info.py,sha256=EPcJ13Jqa17_T0JjM37Ac9kWDz5Zk0GVsIZKutOb8aU,5207
58
58
  upgini/utils/warning_counter.py,sha256=dIWBB4dI5XRRJZudvIlqlIYKEiwLLPcXarsZuYRt338,227
59
- upgini-1.1.267.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
- upgini-1.1.267.dist-info/METADATA,sha256=TiFi7bLKF7TP0gGesfvnN_rs-2htvjOYQko0K4GKdDM,48156
61
- upgini-1.1.267.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
- upgini-1.1.267.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
- upgini-1.1.267.dist-info/RECORD,,
59
+ upgini-1.1.268.dist-info/LICENSE,sha256=5RRzgvdJUu3BUDfv4bzVU6FqKgwHlIay63pPCSmSgzw,1514
60
+ upgini-1.1.268.dist-info/METADATA,sha256=gTuBYet6-H97ppvX37qAJuC8tQCZ7bPetPl11O9XEFY,48156
61
+ upgini-1.1.268.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
62
+ upgini-1.1.268.dist-info/top_level.txt,sha256=OFhTGiDIWKl5gFI49qvWq1R9IKflPaE2PekcbDXDtx4,7
63
+ upgini-1.1.268.dist-info/RECORD,,