upgini 1.2.16a1__tar.gz → 1.2.16a3654.dev2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only. It reflects changes between package versions as they appear in their respective public registries.
Files changed (65)
  1. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/PKG-INFO +1 -1
  2. upgini-1.2.16a3654.dev2/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/features_enricher.py +22 -39
  4. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/metrics.py +6 -56
  5. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/normalizer/normalize_utils.py +1 -4
  6. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/display_utils.py +2 -2
  7. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/features_validator.py +0 -4
  8. upgini-1.2.16a1/src/upgini/__about__.py +0 -1
  9. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/.gitignore +0 -0
  10. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/LICENSE +0 -0
  11. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/README.md +0 -0
  12. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/pyproject.toml +0 -0
  13. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/__init__.py +0 -0
  14. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads.py +0 -0
  15. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads_management/__init__.py +0 -0
  16. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/ads_management/ads_manager.py +0 -0
  17. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/__init__.py +0 -0
  18. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/all_operands.py +0 -0
  19. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/binary.py +0 -0
  20. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/date.py +0 -0
  21. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/feature.py +0 -0
  22. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/groupby.py +0 -0
  23. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/operand.py +0 -0
  24. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/unary.py +0 -0
  25. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/autofe/vector.py +0 -0
  26. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/data_source/__init__.py +0 -0
  27. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/data_source/data_source_publisher.py +0 -0
  28. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/dataset.py +0 -0
  29. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/errors.py +0 -0
  30. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/http.py +0 -0
  31. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/lazy_import.py +0 -0
  32. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/mdc/__init__.py +0 -0
  33. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/mdc/context.py +0 -0
  34. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/metadata.py +0 -0
  35. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/normalizer/__init__.py +0 -0
  36. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/__init__.py +0 -0
  37. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/exceptions.py +0 -0
  38. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/strings.properties +0 -0
  39. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  40. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/__init__.py +0 -0
  41. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/base.py +0 -0
  42. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/random_under_sampler.py +0 -0
  43. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/sampler/utils.py +0 -0
  44. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/search_task.py +0 -0
  45. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/spinner.py +0 -0
  46. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/datetime_utils.py +0 -0
  53. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/deduplicate_utils.py +0 -0
  54. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/format.py +0 -0
  57. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/ip_utils.py +0 -0
  58. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/phone_utils.py +0 -0
  59. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/postal_code_utils.py +0 -0
  60. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/progress_bar.py +0 -0
  61. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/sklearn_ext.py +0 -0
  62. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/target_utils.py +0 -0
  63. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/track_info.py +0 -0
  64. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/utils/warning_counter.py +0 -0
  65. {upgini-1.2.16a1 → upgini-1.2.16a3654.dev2}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.16a1
3
+ Version: 1.2.16a3654.dev2
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.16a3654.dev2"
@@ -165,6 +165,7 @@ class FeaturesEnricher(TransformerMixin):
165
165
  RANDOM_STATE = 42
166
166
  CALCULATE_METRICS_THRESHOLD = 50_000_000
167
167
  CALCULATE_METRICS_MIN_THRESHOLD = 500
168
+ TEXT_FEATURES_THRESHOLD = 5_000
168
169
  GENERATE_FEATURES_LIMIT = 10
169
170
  EMPTY_FEATURES_INFO = pd.DataFrame(
170
171
  columns=[
@@ -871,13 +872,6 @@ class FeaturesEnricher(TransformerMixin):
871
872
  else None
872
873
  )
873
874
 
874
- if self.X is None:
875
- self.X = X
876
- if self.y is None:
877
- self.y = y
878
- if self.eval_set is None:
879
- self.eval_set = effective_eval_set
880
-
881
875
  try:
882
876
  self.__log_debug_information(
883
877
  validated_X,
@@ -945,14 +939,16 @@ class FeaturesEnricher(TransformerMixin):
945
939
 
946
940
  gc.collect()
947
941
 
948
- if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
949
- print(self.bundle.get("metrics_no_important_free_features"))
950
- self.logger.warning("No client or free relevant ADS features found to calculate metrics")
951
- self.warning_counter.increment()
952
- return None
942
+ text_features = self.generate_features if fitting_X.shape[0] >= self.TEXT_FEATURES_THRESHOLD else []
953
943
 
954
944
  print(self.bundle.get("metrics_start"))
955
945
  with Spinner():
946
+ if fitting_X.shape[1] == 0 and fitting_enriched_X.shape[1] == 0:
947
+ print(self.bundle.get("metrics_no_important_free_features"))
948
+ self.logger.warning("No client or free relevant ADS features found to calculate metrics")
949
+ self.warning_counter.increment()
950
+ return None
951
+
956
952
  self._check_train_and_eval_target_distribution(y_sorted, fitting_eval_set_dict)
957
953
 
958
954
  has_date = SearchKey.find_key(search_keys, [SearchKey.DATE, SearchKey.DATETIME]) is not None
@@ -966,7 +962,7 @@ class FeaturesEnricher(TransformerMixin):
966
962
  fitting_enriched_X,
967
963
  scoring,
968
964
  groups=groups,
969
- text_features=self.generate_features,
965
+ text_features=text_features,
970
966
  has_date=has_date,
971
967
  )
972
968
  metric = wrapper.metric_name
@@ -993,10 +989,10 @@ class FeaturesEnricher(TransformerMixin):
993
989
  cat_features,
994
990
  add_params=custom_loss_add_params,
995
991
  groups=groups,
996
- text_features=self.generate_features,
992
+ text_features=text_features,
997
993
  has_date=has_date,
998
994
  )
999
- etalon_metric, _ = baseline_estimator.cross_val_predict(
995
+ etalon_metric = baseline_estimator.cross_val_predict(
1000
996
  fitting_X, y_sorted, self.baseline_score_column
1001
997
  )
1002
998
  if etalon_metric is None:
@@ -1027,14 +1023,10 @@ class FeaturesEnricher(TransformerMixin):
1027
1023
  cat_features,
1028
1024
  add_params=custom_loss_add_params,
1029
1025
  groups=groups,
1030
- text_features=self.generate_features,
1026
+ text_features=text_features,
1031
1027
  has_date=has_date,
1032
1028
  )
1033
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1034
- fitting_enriched_X, enriched_y_sorted
1035
- )
1036
-
1037
- print(f"Calculated enriched shaps: {enriched_shaps}")
1029
+ enriched_metric = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1038
1030
  if enriched_metric is None:
1039
1031
  self.logger.warning(
1040
1032
  f"Enriched {metric} on train combined features is None (maybe all features was removed)"
@@ -1167,6 +1159,13 @@ class FeaturesEnricher(TransformerMixin):
1167
1159
  elif uplift_col in metrics_df.columns and (metrics_df[uplift_col] < 0).any():
1168
1160
  self.logger.warning("Uplift is negative")
1169
1161
 
1162
+ if self.X is None:
1163
+ self.X = X
1164
+ if self.y is None:
1165
+ self.y = y
1166
+ if self.eval_set is None:
1167
+ self.eval_set = effective_eval_set
1168
+
1170
1169
  return metrics_df
1171
1170
  except Exception as e:
1172
1171
  error_message = "Failed to calculate metrics" + (
@@ -1191,18 +1190,6 @@ class FeaturesEnricher(TransformerMixin):
1191
1190
  finally:
1192
1191
  self.logger.info(f"Calculating metrics elapsed time: {time.time() - start_time}")
1193
1192
 
1194
- def _update_shap_values(self, new_shaps: Dict[str, float]):
1195
- feature_name_header = self.bundle.get("features_info_name")
1196
- shap_value_header = self.bundle.get("features_info_shap")
1197
-
1198
- def update_shap(row):
1199
- return new_shaps.get(row[feature_name_header], row[shap_value_header])
1200
-
1201
- self.features_info[shap_value_header] = self.features_info.apply(update_shap, axis=1)
1202
- self.features_info.sort_values(by=shap_value_header, ascending=False, inplace=True)
1203
-
1204
- # TODO redraw
1205
-
1206
1193
  def _check_train_and_eval_target_distribution(self, y, eval_set_dict):
1207
1194
  uneven_distribution = False
1208
1195
  for eval_set in eval_set_dict.values():
@@ -1594,9 +1581,7 @@ class FeaturesEnricher(TransformerMixin):
1594
1581
  generated_features.extend(generator.generated_features)
1595
1582
 
1596
1583
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter)
1597
- df, search_keys, generated_features = normalizer.normalize(
1598
- df, search_keys, generated_features
1599
- )
1584
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
1600
1585
  columns_renaming = normalizer.columns_renaming
1601
1586
 
1602
1587
  df = clean_full_duplicates(df, logger=self.logger, silent=True, bundle=self.bundle)
@@ -2036,9 +2021,7 @@ class FeaturesEnricher(TransformerMixin):
2036
2021
  generated_features.extend(generator.generated_features)
2037
2022
 
2038
2023
  normalizer = Normalizer(self.bundle, self.logger, self.warning_counter, silent_mode)
2039
- df, search_keys, generated_features = normalizer.normalize(
2040
- df, search_keys, generated_features
2041
- )
2024
+ df, search_keys, generated_features = normalizer.normalize(df, search_keys, generated_features)
2042
2025
  columns_renaming = normalizer.columns_renaming
2043
2026
 
2044
2027
  # Don't pass all features in backend on transform
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
2
 
3
- from collections import defaultdict
4
3
  import inspect
5
4
  import logging
6
5
  import re
@@ -10,7 +9,7 @@ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
10
9
  import catboost
11
10
  import numpy as np
12
11
  import pandas as pd
13
- from catboost import CatBoost, CatBoostClassifier, CatBoostRegressor, Pool
12
+ from catboost import CatBoostClassifier, CatBoostRegressor
14
13
  from numpy import log1p
15
14
  from pandas.api.types import is_numeric_dtype
16
15
  from sklearn.metrics import check_scoring, get_scorer, make_scorer, roc_auc_score
@@ -64,7 +63,7 @@ CATBOOST_BINARY_PARAMS = {
64
63
  "verbose": False,
65
64
  "random_state": DEFAULT_RANDOM_STATE,
66
65
  "allow_writing_files": False,
67
- "auto_class_weights": "Balanced",
66
+ "auto_class_weights": "SqrtBalanced",
68
67
  }
69
68
 
70
69
  CATBOOST_MULTICLASS_PARAMS = {
@@ -82,7 +81,7 @@ CATBOOST_MULTICLASS_PARAMS = {
82
81
  "verbose": False,
83
82
  "random_state": DEFAULT_RANDOM_STATE,
84
83
  "allow_writing_files": False,
85
- "auto_class_weights": "Balanced",
84
+ "auto_class_weights": "SqrtBalanced",
86
85
  }
87
86
 
88
87
  LIGHTGBM_PARAMS = {
@@ -289,12 +288,9 @@ class EstimatorWrapper:
289
288
  x, y, _ = self._prepare_data(x, y)
290
289
  return x, y, {}
291
290
 
292
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator) -> Optional[Dict[str, float]]:
293
- return None
294
-
295
291
  def cross_val_predict(
296
292
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
297
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
293
+ ) -> Optional[float]:
298
294
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
299
295
 
300
296
  if x.shape[1] == 0:
@@ -302,7 +298,6 @@ class EstimatorWrapper:
302
298
 
303
299
  scorer = check_scoring(self.estimator, scoring=self.scorer)
304
300
 
305
- shap_values_all_folds = defaultdict(list)
306
301
  if baseline_score_column is not None and self.metric_name == "GINI":
307
302
  self.logger.info("Calculate baseline GINI on passed baseline_score_column and target")
308
303
  metric = roc_auc_score(y, x[baseline_score_column])
@@ -324,29 +319,7 @@ class EstimatorWrapper:
324
319
  self.check_fold_metrics(metrics_by_fold)
325
320
 
326
321
  metric = np.mean(metrics_by_fold) * self.multiplier
327
-
328
- splits = self.cv.split(x, y, groups)
329
-
330
- for estimator, split in zip(self.cv_estimators, splits):
331
- _, validation_idx = split
332
- cv_x = x.iloc[validation_idx]
333
- cv_y = y[validation_idx]
334
- shaps = self.calculate_shap(cv_x, cv_y, estimator)
335
- if shaps is not None:
336
- for feature, shap_value in shaps.items():
337
- # shap_values_all_folds[feature] = shap_values_all_folds.get(feature, []) + shap_value.tolist()
338
- shap_values_all_folds[feature].extend(shap_value.tolist())
339
-
340
- if shap_values_all_folds:
341
- average_shap_values = {
342
- feature: np.mean(shaps)
343
- for feature, shaps
344
- in shap_values_all_folds.items()
345
- }
346
- else:
347
- average_shap_values = None
348
-
349
- return self.post_process_metric(metric), average_shap_values
322
+ return self.post_process_metric(metric)
350
323
 
351
324
  def check_fold_metrics(self, metrics_by_fold: List[float]):
352
325
  first_metric_sign = 1 if metrics_by_fold[0] >= 0 else -1
@@ -575,7 +548,7 @@ class CatBoostWrapper(EstimatorWrapper):
575
548
 
576
549
  def cross_val_predict(
577
550
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
578
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
551
+ ) -> Optional[float]:
579
552
  try:
580
553
  return super().cross_val_predict(x, y, baseline_score_column)
581
554
  except Exception as e:
@@ -600,29 +573,6 @@ class CatBoostWrapper(EstimatorWrapper):
600
573
  else:
601
574
  raise e
602
575
 
603
- def calculate_shap(self, x: pd.DataFrame, y: pd.Series, estimator: CatBoost) -> Optional[Dict[str, float]]:
604
- try:
605
- # Create Pool for fold data, if need (for example, when categorical features are present)
606
- fold_pool = Pool(x, y, cat_features=self.cat_features)
607
-
608
- # Get SHAP values of current estimator
609
- shap_values_fold = estimator.get_feature_importance(data=fold_pool, type="ShapValues")
610
-
611
- # Remove last columns (base value) and flatten
612
- if self.target_type == ModelTaskType.MULTICLASS:
613
- all_shaps = shap_values_fold[:, :, :-1]
614
- all_shaps = [all_shaps[:, :, k].flatten() for k in range(all_shaps.shape[2])]
615
- else:
616
- all_shaps = shap_values_fold[:, :-1]
617
- all_shaps = [all_shaps[:, k].flatten() for k in range(all_shaps.shape[1])]
618
-
619
- all_shaps = np.abs(all_shaps)
620
-
621
- return dict(zip(estimator.feature_names_, all_shaps))
622
-
623
- except Exception:
624
- return None
625
-
626
576
 
627
577
  class LightGBMWrapper(EstimatorWrapper):
628
578
  def __init__(
@@ -49,10 +49,7 @@ class Normalizer:
49
49
  self.generated_features = []
50
50
 
51
51
  def normalize(
52
- self,
53
- df: pd.DataFrame,
54
- search_keys: Dict[str, SearchKey],
55
- generated_features: List[str],
52
+ self, df: pd.DataFrame, search_keys: Dict[str, SearchKey], generated_features: List[str]
56
53
  ) -> Tuple[pd.DataFrame, Dict[str, SearchKey], List[str]]:
57
54
  self.search_keys = search_keys.copy()
58
55
  self.generated_features = generated_features.copy()
@@ -72,7 +72,7 @@ def make_table(df: pd.DataFrame, wrap_long_string=None) -> str:
72
72
  )
73
73
 
74
74
 
75
- def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str, display_id: str):
75
+ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header: str):
76
76
  if not ipython_available():
77
77
  print(header)
78
78
  print(internal_df)
@@ -133,7 +133,7 @@ def display_html_dataframe(df: pd.DataFrame, internal_df: pd.DataFrame, header:
133
133
  {table_html}
134
134
  </div>
135
135
  """
136
- return display(HTML(result_html))
136
+ display(HTML(result_html))
137
137
 
138
138
 
139
139
  def make_html_report(
@@ -58,10 +58,6 @@ class FeaturesValidator:
58
58
 
59
59
  columns_renaming = columns_renaming or {}
60
60
 
61
- if features_for_generate:
62
- empty_or_constant_features = [
63
- f for f in empty_or_constant_features if columns_renaming.get(f, f) not in features_for_generate
64
- ]
65
61
  if empty_or_constant_features:
66
62
  msg = bundle.get("empty_or_contant_features").format(
67
63
  [columns_renaming.get(f, f) for f in empty_or_constant_features]
@@ -1 +0,0 @@
1
- __version__ = "1.2.16a1"
File without changes
File without changes
File without changes