upgini 1.1.267__tar.gz → 1.1.268__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85):
  1. {upgini-1.1.267/src/upgini.egg-info → upgini-1.1.268}/PKG-INFO +1 -1
  2. {upgini-1.1.267 → upgini-1.1.268}/setup.py +1 -1
  3. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/features_enricher.py +1 -1
  4. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/metrics.py +57 -7
  5. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/sklearn_ext.py +1 -2
  6. {upgini-1.1.267 → upgini-1.1.268/src/upgini.egg-info}/PKG-INFO +1 -1
  7. {upgini-1.1.267 → upgini-1.1.268}/tests/test_target_utils.py +6 -6
  8. {upgini-1.1.267 → upgini-1.1.268}/LICENSE +0 -0
  9. {upgini-1.1.267 → upgini-1.1.268}/README.md +0 -0
  10. {upgini-1.1.267 → upgini-1.1.268}/pyproject.toml +0 -0
  11. {upgini-1.1.267 → upgini-1.1.268}/setup.cfg +0 -0
  12. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/__init__.py +0 -0
  13. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads.py +0 -0
  14. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads_management/__init__.py +0 -0
  15. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/ads_management/ads_manager.py +0 -0
  16. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/__init__.py +0 -0
  17. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/all_operands.py +0 -0
  18. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/binary.py +0 -0
  19. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/date.py +0 -0
  20. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/feature.py +0 -0
  21. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/groupby.py +0 -0
  22. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/operand.py +0 -0
  23. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/unary.py +0 -0
  24. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/autofe/vector.py +0 -0
  25. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/data_source/__init__.py +0 -0
  26. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/data_source/data_source_publisher.py +0 -0
  27. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/dataset.py +0 -0
  28. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/errors.py +0 -0
  29. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/fingerprint.js +0 -0
  30. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/http.py +0 -0
  31. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/mdc/__init__.py +0 -0
  32. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/mdc/context.py +0 -0
  33. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/metadata.py +0 -0
  34. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/normalizer/__init__.py +0 -0
  35. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/normalizer/phone_normalizer.py +0 -0
  36. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/search_task.py +0 -0
  45. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/spinner.py +0 -0
  46. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/display_utils.py +0 -0
  55. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/email_utils.py +0 -0
  56. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/fallback_progress_bar.py +0 -0
  57. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/features_validator.py +0 -0
  58. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/format.py +0 -0
  59. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/ip_utils.py +0 -0
  60. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/phone_utils.py +0 -0
  61. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/postal_code_utils.py +0 -0
  62. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/progress_bar.py +0 -0
  63. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/target_utils.py +0 -0
  64. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/track_info.py +0 -0
  65. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/utils/warning_counter.py +0 -0
  66. {upgini-1.1.267 → upgini-1.1.268}/src/upgini/version_validator.py +0 -0
  67. {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/SOURCES.txt +0 -0
  68. {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/dependency_links.txt +0 -0
  69. {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/requires.txt +0 -0
  70. {upgini-1.1.267 → upgini-1.1.268}/src/upgini.egg-info/top_level.txt +0 -0
  71. {upgini-1.1.267 → upgini-1.1.268}/tests/test_autofe_operands.py +0 -0
  72. {upgini-1.1.267 → upgini-1.1.268}/tests/test_binary_dataset.py +0 -0
  73. {upgini-1.1.267 → upgini-1.1.268}/tests/test_blocked_time_series.py +0 -0
  74. {upgini-1.1.267 → upgini-1.1.268}/tests/test_categorical_dataset.py +0 -0
  75. {upgini-1.1.267 → upgini-1.1.268}/tests/test_continuous_dataset.py +0 -0
  76. {upgini-1.1.267 → upgini-1.1.268}/tests/test_country_utils.py +0 -0
  77. {upgini-1.1.267 → upgini-1.1.268}/tests/test_custom_loss_utils.py +0 -0
  78. {upgini-1.1.267 → upgini-1.1.268}/tests/test_datetime_utils.py +0 -0
  79. {upgini-1.1.267 → upgini-1.1.268}/tests/test_email_utils.py +0 -0
  80. {upgini-1.1.267 → upgini-1.1.268}/tests/test_etalon_validation.py +0 -0
  81. {upgini-1.1.267 → upgini-1.1.268}/tests/test_features_enricher.py +0 -0
  82. {upgini-1.1.267 → upgini-1.1.268}/tests/test_metrics.py +0 -0
  83. {upgini-1.1.267 → upgini-1.1.268}/tests/test_phone_utils.py +0 -0
  84. {upgini-1.1.267 → upgini-1.1.268}/tests/test_postal_code_utils.py +0 -0
  85. {upgini-1.1.267 → upgini-1.1.268}/tests/test_widget.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.267
3
+ Version: 1.1.268
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -40,7 +40,7 @@ def send_log(msg: str):
40
40
 
41
41
 
42
42
  here = Path(__file__).parent.resolve()
43
- version = "1.1.267"
43
+ version = "1.1.268"
44
44
  try:
45
45
  send_log(f"Start setup PyLib version {version}")
46
46
  setup(
@@ -3665,7 +3665,7 @@ class FeaturesEnricher(TransformerMixin):
3665
3665
  if y is not None:
3666
3666
  with open(f"{tmp_dir}/y.pickle", "wb") as y_file:
3667
3667
  pickle.dump(sample(y, xy_sample_index), y_file)
3668
- if eval_set is not None:
3668
+ if eval_set:
3669
3669
  eval_xy_sample_index = rnd.randint(0, _num_samples(eval_set[0][0]), size=1000)
3670
3670
  with open(f"{tmp_dir}/eval_x.pickle", "wb") as eval_x_file:
3671
3671
  pickle.dump(sample(eval_set[0][0], eval_xy_sample_index), eval_x_file)
@@ -3,15 +3,16 @@ import re
3
3
  from copy import deepcopy
4
4
  from typing import Any, Callable, Dict, List, Optional, Tuple, Union
5
5
 
6
+ import catboost
6
7
  import numpy as np
7
8
  import pandas as pd
8
9
  from catboost import CatBoostClassifier, CatBoostRegressor
9
- import catboost
10
10
  from lightgbm import LGBMClassifier, LGBMRegressor
11
11
  from numpy import log1p
12
12
  from pandas.api.types import is_numeric_dtype
13
13
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
14
14
 
15
+ from upgini.utils.features_validator import FeaturesValidator
15
16
  from upgini.utils.sklearn_ext import cross_validate
16
17
 
17
18
  try:
@@ -352,6 +353,7 @@ class EstimatorWrapper:
352
353
  "target_type": target_type,
353
354
  "groups": groups,
354
355
  "text_features": text_features,
356
+ "logger": logger,
355
357
  }
356
358
  if estimator is None:
357
359
  params = dict()
@@ -414,12 +416,22 @@ class CatBoostWrapper(EstimatorWrapper):
414
416
  target_type: ModelTaskType,
415
417
  groups: Optional[List[str]] = None,
416
418
  text_features: Optional[List[str]] = None,
419
+ logger: Optional[logging.Logger] = None,
417
420
  ):
418
421
  super(CatBoostWrapper, self).__init__(
419
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
422
+ estimator,
423
+ scorer,
424
+ metric_name,
425
+ multiplier,
426
+ cv,
427
+ target_type,
428
+ groups=groups,
429
+ text_features=text_features,
430
+ logger=logger,
420
431
  )
421
432
  self.cat_features = None
422
433
  self.emb_features = None
434
+ self.exclude_features = []
423
435
 
424
436
  def _prepare_to_fit(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, np.ndarray, dict]:
425
437
  X, y, groups, params = super()._prepare_to_fit(X, y)
@@ -437,9 +449,7 @@ class CatBoostWrapper(EstimatorWrapper):
437
449
  X, embedding_features = self.group_embeddings(X)
438
450
  params["embedding_features"] = embedding_features
439
451
  else:
440
- self.logger.info(
441
- f"Embedding features count less than 3, so use them separately: {self.emb_features}"
442
- )
452
+ self.logger.info(f"Embedding features count less than 3, so use them separately: {self.emb_features}")
443
453
  self.emb_features = []
444
454
  else:
445
455
  self.logger.warning(f"Embedding features are not supported by Catboost version {catboost.__version__}")
@@ -498,6 +508,8 @@ class CatBoostWrapper(EstimatorWrapper):
498
508
  return df, [emb_name]
499
509
 
500
510
  def _prepare_to_calculate(self, X: pd.DataFrame, y: pd.Series) -> Tuple[pd.DataFrame, np.ndarray, dict]:
511
+ if self.exclude_features:
512
+ X = X.drop(columns=self.exclude_features)
501
513
  X, y, params = super()._prepare_to_calculate(X, y)
502
514
  if self.text_features:
503
515
  params["text_features"] = self.text_features
@@ -510,6 +522,26 @@ class CatBoostWrapper(EstimatorWrapper):
510
522
 
511
523
  return X, y, params
512
524
 
525
+ def cross_val_predict(
526
+ self, X: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
527
+ ) -> Optional[float]:
528
+ try:
529
+ return super().cross_val_predict(X, y, baseline_score_column)
530
+ except Exception as e:
531
+ if "Dictionary size is 0" in e.args[0] and self.text_features:
532
+ high_cardinality_features = FeaturesValidator.find_high_cardinality(X[self.text_features])
533
+ self.logger.warning(
534
+ "Failed to calculate metrics. Try to remove high cardinality"
535
+ f" text features {high_cardinality_features} and retry"
536
+ )
537
+ for f in high_cardinality_features:
538
+ self.text_features.remove(f)
539
+ self.exclude_features.append(f)
540
+ X = X.drop(columns=f)
541
+ return super().cross_val_predict(X, y, baseline_score_column)
542
+ else:
543
+ raise e
544
+
513
545
 
514
546
  class LightGBMWrapper(EstimatorWrapper):
515
547
  def __init__(
@@ -522,9 +554,18 @@ class LightGBMWrapper(EstimatorWrapper):
522
554
  target_type: ModelTaskType,
523
555
  groups: Optional[List[str]] = None,
524
556
  text_features: Optional[List[str]] = None,
557
+ logger: Optional[logging.Logger] = None,
525
558
  ):
526
559
  super(LightGBMWrapper, self).__init__(
527
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
560
+ estimator,
561
+ scorer,
562
+ metric_name,
563
+ multiplier,
564
+ cv,
565
+ target_type,
566
+ groups=groups,
567
+ text_features=text_features,
568
+ logger=logger,
528
569
  )
529
570
  self.cat_features = None
530
571
 
@@ -561,9 +602,18 @@ class OtherEstimatorWrapper(EstimatorWrapper):
561
602
  target_type: ModelTaskType,
562
603
  groups: Optional[List[str]] = None,
563
604
  text_features: Optional[List[str]] = None,
605
+ logger: Optional[logging.Logger] = None,
564
606
  ):
565
607
  super(OtherEstimatorWrapper, self).__init__(
566
- estimator, scorer, metric_name, multiplier, cv, target_type, groups=groups, text_features=text_features
608
+ estimator,
609
+ scorer,
610
+ metric_name,
611
+ multiplier,
612
+ cv,
613
+ target_type,
614
+ groups=groups,
615
+ text_features=text_features,
616
+ logger=logger,
567
617
  )
568
618
  self.cat_features = None
569
619
 
@@ -1,5 +1,4 @@
1
1
  import functools
2
- import logging
3
2
  import numbers
4
3
  import time
5
4
  import warnings
@@ -313,7 +312,7 @@ def cross_validate(
313
312
 
314
313
  return ret
315
314
  except Exception:
316
- logging.exception("Failed to execute overriden cross_validate. Fallback to original")
315
+ # logging.exception("Failed to execute overriden cross_validate. Fallback to original")
317
316
  raise
318
317
  # fit_params["use_best_model"] = False
319
318
  # return original_cross_validate(
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.1
2
2
  Name: upgini
3
- Version: 1.1.267
3
+ Version: 1.1.268
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Home-page: https://upgini.com/
6
6
  Author: Upgini Developers
@@ -140,7 +140,7 @@ def test_binary_psi_calculation():
140
140
  "target": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1]
141
141
  })
142
142
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
143
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
143
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
144
144
  enricher._validate_PSI(df)
145
145
  assert not enricher.warning_counter.has_warnings()
146
146
 
@@ -148,7 +148,7 @@ def test_binary_psi_calculation():
148
148
  "target": [0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1]
149
149
  })
150
150
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
151
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
151
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
152
152
  enricher._validate_PSI(df)
153
153
  assert enricher.warning_counter._count == 1
154
154
 
@@ -157,7 +157,7 @@ def test_binary_psi_calculation():
157
157
  "eval_set_index": [0] * 10 + [1] * 10,
158
158
  })
159
159
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
160
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
160
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
161
161
  enricher._validate_PSI(df)
162
162
  assert enricher.warning_counter._count == 1
163
163
 
@@ -166,7 +166,7 @@ def test_binary_psi_calculation():
166
166
  "eval_set_index": [0] * 10 + [1] * 10,
167
167
  })
168
168
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
169
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
169
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
170
170
  enricher._validate_PSI(df)
171
171
  assert enricher.warning_counter._count == 2
172
172
 
@@ -177,7 +177,7 @@ def test_regression_psi_calculation():
177
177
  "target": random.rand(20)
178
178
  })
179
179
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
180
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
180
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
181
181
  enricher._validate_PSI(df)
182
182
  assert enricher.warning_counter._count == 1
183
183
 
@@ -189,6 +189,6 @@ def test_regression_psi_calculation():
189
189
  "target": list(values1) + list(values2)
190
190
  })
191
191
  df["date"] = pd.date_range("2020-01-01", "2020-01-20")
192
- enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE})
192
+ enricher = FeaturesEnricher(search_keys={"date": SearchKey.DATE}, logs_enabled=False)
193
193
  enricher._validate_PSI(df)
194
194
  assert not enricher.warning_counter.has_warnings()
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes