upgini 1.2.29a2__tar.gz → 1.2.29a4__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. {upgini-1.2.29a2 → upgini-1.2.29a4}/PKG-INFO +1 -1
  2. upgini-1.2.29a4/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/features_enricher.py +11 -8
  4. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/metrics.py +33 -9
  5. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/datetime_utils.py +4 -4
  6. upgini-1.2.29a2/src/upgini/__about__.py +0 -1
  7. {upgini-1.2.29a2 → upgini-1.2.29a4}/.gitignore +0 -0
  8. {upgini-1.2.29a2 → upgini-1.2.29a4}/LICENSE +0 -0
  9. {upgini-1.2.29a2 → upgini-1.2.29a4}/README.md +0 -0
  10. {upgini-1.2.29a2 → upgini-1.2.29a4}/pyproject.toml +0 -0
  11. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/__init__.py +0 -0
  12. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/ads.py +0 -0
  13. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/ads_management/__init__.py +0 -0
  14. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/ads_management/ads_manager.py +0 -0
  15. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/__init__.py +0 -0
  16. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/all_operands.py +0 -0
  17. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/binary.py +0 -0
  18. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/date.py +0 -0
  19. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/feature.py +0 -0
  20. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/groupby.py +0 -0
  21. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/operand.py +0 -0
  22. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/unary.py +0 -0
  23. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/autofe/vector.py +0 -0
  24. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/data_source/__init__.py +0 -0
  25. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/data_source/data_source_publisher.py +0 -0
  26. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/dataset.py +0 -0
  27. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/errors.py +0 -0
  28. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/http.py +0 -0
  29. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/lazy_import.py +0 -0
  30. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/mdc/__init__.py +0 -0
  31. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/mdc/context.py +0 -0
  32. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/metadata.py +0 -0
  33. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/normalizer/__init__.py +0 -0
  34. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/normalizer/normalize_utils.py +0 -0
  35. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/resource_bundle/__init__.py +0 -0
  36. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/resource_bundle/exceptions.py +0 -0
  37. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/resource_bundle/strings.properties +0 -0
  38. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  39. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/sampler/__init__.py +0 -0
  40. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/sampler/base.py +0 -0
  41. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/sampler/random_under_sampler.py +0 -0
  42. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/sampler/utils.py +0 -0
  43. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/search_task.py +0 -0
  44. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/spinner.py +0 -0
  45. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  46. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/__init__.py +0 -0
  47. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/base_search_key_detector.py +0 -0
  48. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/blocked_time_series.py +0 -0
  49. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/country_utils.py +0 -0
  50. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/custom_loss_utils.py +0 -0
  51. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/cv_utils.py +0 -0
  52. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/deduplicate_utils.py +0 -0
  53. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/display_utils.py +0 -0
  54. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/email_utils.py +0 -0
  55. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/fallback_progress_bar.py +0 -0
  56. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/features_validator.py +0 -0
  57. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/format.py +0 -0
  58. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/ip_utils.py +0 -0
  59. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/phone_utils.py +0 -0
  60. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/postal_code_utils.py +0 -0
  61. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/progress_bar.py +0 -0
  62. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/sklearn_ext.py +0 -0
  63. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/target_utils.py +0 -0
  64. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/track_info.py +0 -0
  65. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/utils/warning_counter.py +0 -0
  66. {upgini-1.2.29a2 → upgini-1.2.29a4}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.29a2
3
+ Version: 1.2.29a4
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.29a4"
@@ -999,9 +999,10 @@ class FeaturesEnricher(TransformerMixin):
999
999
  text_features=self.generate_features,
1000
1000
  has_date=has_date,
1001
1001
  )
1002
- etalon_metric, _ = baseline_estimator.cross_val_predict(
1002
+ etalon_cv_result = baseline_estimator.cross_val_predict(
1003
1003
  fitting_X, y_sorted, self.baseline_score_column
1004
1004
  )
1005
+ etalon_metric = etalon_cv_result.get_display_metric()
1005
1006
  if etalon_metric is None:
1006
1007
  self.logger.info(
1007
1008
  f"Baseline {metric} on train client features is None (maybe all features was removed)"
@@ -1033,9 +1034,9 @@ class FeaturesEnricher(TransformerMixin):
1033
1034
  text_features=self.generate_features,
1034
1035
  has_date=has_date,
1035
1036
  )
1036
- enriched_metric, enriched_shaps = enriched_estimator.cross_val_predict(
1037
- fitting_enriched_X, enriched_y_sorted
1038
- )
1037
+ enriched_cv_result = enriched_estimator.cross_val_predict(fitting_enriched_X, enriched_y_sorted)
1038
+ enriched_metric = enriched_cv_result.get_display_metric()
1039
+ enriched_shaps = enriched_cv_result.shap_values
1039
1040
 
1040
1041
  if enriched_shaps is not None:
1041
1042
  self._update_shap_values(enriched_shaps)
@@ -1048,7 +1049,7 @@ class FeaturesEnricher(TransformerMixin):
1048
1049
  else:
1049
1050
  self.logger.info(f"Enriched {metric} on train combined features: {enriched_metric}")
1050
1051
  if etalon_metric is not None and enriched_metric is not None:
1051
- uplift = (enriched_metric - etalon_metric) * multiplier
1052
+ uplift = (enriched_cv_result.metric - etalon_cv_result.metric) * multiplier
1052
1053
 
1053
1054
  train_metrics = {
1054
1055
  self.bundle.get("quality_metrics_segment_header"): self.bundle.get(
@@ -1091,9 +1092,10 @@ class FeaturesEnricher(TransformerMixin):
1091
1092
  f"Calculate baseline {metric} on eval set {idx + 1} "
1092
1093
  f"on client features: {eval_X_sorted.columns.to_list()}"
1093
1094
  )
1094
- etalon_eval_metric = baseline_estimator.calculate_metric(
1095
+ etalon_eval_results = baseline_estimator.calculate_metric(
1095
1096
  eval_X_sorted, eval_y_sorted, self.baseline_score_column
1096
1097
  )
1098
+ etalon_eval_metric = etalon_eval_results.get_display_metric()
1097
1099
  self.logger.info(
1098
1100
  f"Baseline {metric} on eval set {idx + 1} client features: {etalon_eval_metric}"
1099
1101
  )
@@ -1105,9 +1107,10 @@ class FeaturesEnricher(TransformerMixin):
1105
1107
  f"Calculate enriched {metric} on eval set {idx + 1} "
1106
1108
  f"on combined features: {enriched_eval_X_sorted.columns.to_list()}"
1107
1109
  )
1108
- enriched_eval_metric = enriched_estimator.calculate_metric(
1110
+ enriched_eval_results = enriched_estimator.calculate_metric(
1109
1111
  enriched_eval_X_sorted, enriched_eval_y_sorted
1110
1112
  )
1113
+ enriched_eval_metric = enriched_eval_results.get_display_metric()
1111
1114
  self.logger.info(
1112
1115
  f"Enriched {metric} on eval set {idx + 1} combined features: {enriched_eval_metric}"
1113
1116
  )
@@ -1115,7 +1118,7 @@ class FeaturesEnricher(TransformerMixin):
1115
1118
  enriched_eval_metric = None
1116
1119
 
1117
1120
  if etalon_eval_metric is not None and enriched_eval_metric is not None:
1118
- eval_uplift = (enriched_eval_metric - etalon_eval_metric) * multiplier
1121
+ eval_uplift = (enriched_eval_results.metric - etalon_eval_results.metric) * multiplier
1119
1122
  else:
1120
1123
  eval_uplift = None
1121
1124
 
@@ -1,5 +1,6 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from dataclasses import dataclass
3
4
  import inspect
4
5
  import logging
5
6
  import re
@@ -210,6 +211,21 @@ SUPPORTED_CATBOOST_METRICS = {
210
211
  }
211
212
 
212
213
 
214
+ @dataclass
215
+ class _CrossValResults:
216
+ metric: Optional[float]
217
+ metric_std: Optional[float]
218
+ shap_values: Optional[Dict[str, float]]
219
+
220
+ def get_display_metric(self) -> Optional[str]:
221
+ if self.metric is None:
222
+ return None
223
+ elif self.metric_std is None:
224
+ return f"{self.metric:.3f}"
225
+ else:
226
+ return f"{self.metric:.3f} ± {self.metric_std:.3f}"
227
+
228
+
213
229
  class EstimatorWrapper:
214
230
  def __init__(
215
231
  self,
@@ -297,11 +313,11 @@ class EstimatorWrapper:
297
313
 
298
314
  def cross_val_predict(
299
315
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
300
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
316
+ ) -> _CrossValResults:
301
317
  x, y, groups, fit_params = self._prepare_to_fit(x, y)
302
318
 
303
319
  if x.shape[1] == 0:
304
- return None
320
+ return _CrossValResults(metric=None, metric_std=None, shap_values=None)
305
321
 
306
322
  scorer = check_scoring(self.estimator, scoring=self.scorer)
307
323
 
@@ -326,7 +342,7 @@ class EstimatorWrapper:
326
342
 
327
343
  self.check_fold_metrics(metrics_by_fold)
328
344
 
329
- metric = np.mean(metrics_by_fold) * self.multiplier
345
+ metric, metric_std = self._calculate_metric_from_folds(metrics_by_fold)
330
346
 
331
347
  splits = self.cv.split(x, y, groups)
332
348
 
@@ -351,7 +367,7 @@ class EstimatorWrapper:
351
367
  else:
352
368
  average_shap_values = None
353
369
 
354
- return self.post_process_metric(metric), average_shap_values
370
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=average_shap_values)
355
371
 
356
372
  def process_shap_values(self, shap_values: Dict[str, float]) -> Dict[str, float]:
357
373
  return shap_values
@@ -367,17 +383,25 @@ class EstimatorWrapper:
367
383
  metric = 2 * metric - 1
368
384
  return metric
369
385
 
370
- def calculate_metric(self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None) -> float:
386
+ def calculate_metric(
387
+ self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
388
+ ) -> _CrossValResults:
371
389
  x, y, _ = self._prepare_to_calculate(x, y)
372
390
  if baseline_score_column is not None and self.metric_name == "GINI":
373
- metric = roc_auc_score(y, x[baseline_score_column])
391
+ metric, metric_std = roc_auc_score(y, x[baseline_score_column]), None
374
392
  else:
375
393
  metrics = []
376
394
  for est in self.cv_estimators:
377
395
  metrics.append(self.scorer(est, x, y))
378
396
 
379
- metric = np.mean(metrics) * self.multiplier
380
- return self.post_process_metric(metric)
397
+ metric, metric_std = self._calculate_metric_from_folds(metrics)
398
+ return _CrossValResults(metric=metric, metric_std=metric_std, shap_values=None)
399
+
400
+ def _calculate_metric_from_folds(self, metrics_by_fold: List[float]) -> Tuple[float, float]:
401
+ metrics_by_fold = [self.post_process_metric(m) for m in metrics_by_fold]
402
+ metric = np.mean(metrics_by_fold) * self.multiplier
403
+ metric_std = np.std(metrics_by_fold) * np.abs(self.multiplier)
404
+ return metric, metric_std
381
405
 
382
406
  @staticmethod
383
407
  def create(
@@ -591,7 +615,7 @@ class CatBoostWrapper(EstimatorWrapper):
591
615
 
592
616
  def cross_val_predict(
593
617
  self, x: pd.DataFrame, y: np.ndarray, baseline_score_column: Optional[Any] = None
594
- ) -> Tuple[Optional[float], Optional[Dict[str, float]]]:
618
+ ) -> _CrossValResults:
595
619
  try:
596
620
  return super().cross_val_predict(x, y, baseline_score_column)
597
621
  except Exception as e:
@@ -111,7 +111,7 @@ class DateTimeSearchKeyConverter:
111
111
 
112
112
  # Define function to apply sine and cosine transformations
113
113
  def add_cyclical_features(df, column, period):
114
- period_suffix = f"_{period}" if column != 'day_in_quarter' else ""
114
+ period_suffix = f"_{period}" if column != "day_in_quarter" else ""
115
115
  sin_feature = f"datetime_{column}_sin{period_suffix}"
116
116
  cos_feature = f"datetime_{column}_cos{period_suffix}"
117
117
  df[sin_feature] = np.sin(2 * np.pi * df[column] / period)
@@ -122,10 +122,10 @@ class DateTimeSearchKeyConverter:
122
122
  df["quarter"] = df[self.date_column].dt.quarter
123
123
 
124
124
  # Calculate the start date of the quarter for each timestamp
125
- df["quarter_start"] = df["timestamp"].dt.to_period("Q").dt.start_time
125
+ df["quarter_start"] = df[self.date_column].dt.to_period("Q").dt.start_time
126
126
 
127
127
  # Calculate the day in the quarter
128
- df["day_in_quarter"] = (df["timestamp"] - df["quarter_start"]).dt.days + 1
128
+ df["day_in_quarter"] = (df[self.date_column] - df["quarter_start"]).dt.days + 1
129
129
 
130
130
  # Vectorized calculation of days_in_quarter
131
131
  quarter = df["quarter"]
@@ -138,7 +138,7 @@ class DateTimeSearchKeyConverter:
138
138
 
139
139
  end = pd.to_datetime({"year": quarter_end_year, "month": quarter_end_month, "day": 1})
140
140
 
141
- df["days_in_quarter"] = (end - start).dt.days
141
+ df["days_in_quarter"] = (end.reset_index(drop=True) - start.reset_index(drop=True)).dt.days
142
142
 
143
143
  add_cyclical_features(df, "day_in_quarter", df["days_in_quarter"]) # Days in the quarter
144
144
 
@@ -1 +0,0 @@
1
- __version__ = "1.2.29a2"
File without changes
File without changes
File without changes
File without changes
File without changes
File without changes