upgini 1.2.113a4__tar.gz → 1.2.113a5__tar.gz

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
Files changed (80)
  1. {upgini-1.2.113a4 → upgini-1.2.113a5}/PKG-INFO +1 -1
  2. upgini-1.2.113a5/src/upgini/__about__.py +1 -0
  3. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/features_enricher.py +17 -9
  4. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/psi.py +26 -0
  5. upgini-1.2.113a4/src/upgini/__about__.py +0 -1
  6. {upgini-1.2.113a4 → upgini-1.2.113a5}/.gitignore +0 -0
  7. {upgini-1.2.113a4 → upgini-1.2.113a5}/LICENSE +0 -0
  8. {upgini-1.2.113a4 → upgini-1.2.113a5}/README.md +0 -0
  9. {upgini-1.2.113a4 → upgini-1.2.113a5}/pyproject.toml +0 -0
  10. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/__init__.py +0 -0
  11. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/ads.py +0 -0
  12. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/ads_management/__init__.py +0 -0
  13. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/ads_management/ads_manager.py +0 -0
  14. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/__init__.py +0 -0
  15. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/all_operators.py +0 -0
  16. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/binary.py +0 -0
  17. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/date.py +0 -0
  18. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/feature.py +0 -0
  19. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/groupby.py +0 -0
  20. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/operator.py +0 -0
  21. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/__init__.py +0 -0
  22. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/base.py +0 -0
  23. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/cross.py +0 -0
  24. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/delta.py +0 -0
  25. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/lag.py +0 -0
  26. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/roll.py +0 -0
  27. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/trend.py +0 -0
  28. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/timeseries/volatility.py +0 -0
  29. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/unary.py +0 -0
  30. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/utils.py +0 -0
  31. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/autofe/vector.py +0 -0
  32. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/data_source/__init__.py +0 -0
  33. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/data_source/data_source_publisher.py +0 -0
  34. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/dataset.py +0 -0
  35. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/errors.py +0 -0
  36. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/http.py +0 -0
  37. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/mdc/__init__.py +0 -0
  38. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/mdc/context.py +0 -0
  39. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/metadata.py +0 -0
  40. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/metrics.py +0 -0
  41. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/normalizer/__init__.py +0 -0
  42. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/normalizer/normalize_utils.py +0 -0
  43. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/resource_bundle/__init__.py +0 -0
  44. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/resource_bundle/exceptions.py +0 -0
  45. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/resource_bundle/strings.properties +0 -0
  46. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/resource_bundle/strings_widget.properties +0 -0
  47. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/sampler/__init__.py +0 -0
  48. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/sampler/base.py +0 -0
  49. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/sampler/random_under_sampler.py +0 -0
  50. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/sampler/utils.py +0 -0
  51. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/search_task.py +0 -0
  52. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/spinner.py +0 -0
  53. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/Roboto-Regular.ttf +0 -0
  54. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/__init__.py +0 -0
  55. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/base_search_key_detector.py +0 -0
  56. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/blocked_time_series.py +0 -0
  57. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/country_utils.py +0 -0
  58. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/custom_loss_utils.py +0 -0
  59. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/cv_utils.py +0 -0
  60. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/datetime_utils.py +0 -0
  61. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/deduplicate_utils.py +0 -0
  62. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/display_utils.py +0 -0
  63. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/email_utils.py +0 -0
  64. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/fallback_progress_bar.py +0 -0
  65. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/feature_info.py +0 -0
  66. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/features_validator.py +0 -0
  67. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/format.py +0 -0
  68. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/ip_utils.py +0 -0
  69. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/mstats.py +0 -0
  70. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/phone_utils.py +0 -0
  71. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/postal_code_utils.py +0 -0
  72. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/progress_bar.py +0 -0
  73. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/sample_utils.py +0 -0
  74. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/sklearn_ext.py +0 -0
  75. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/sort.py +0 -0
  76. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/target_utils.py +0 -0
  77. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/track_info.py +0 -0
  78. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/ts_utils.py +0 -0
  79. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/utils/warning_counter.py +0 -0
  80. {upgini-1.2.113a4 → upgini-1.2.113a5}/src/upgini/version_validator.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.3
2
2
  Name: upgini
3
- Version: 1.2.113a4
3
+ Version: 1.2.113a5
4
4
  Summary: Intelligent data search & enrichment for Machine Learning
5
5
  Project-URL: Bug Reports, https://github.com/upgini/upgini/issues
6
6
  Project-URL: Homepage, https://upgini.com/
@@ -0,0 +1 @@
1
+ __version__ = "1.2.113a5"
@@ -112,7 +112,7 @@ except Exception:
112
112
  CustomFallbackProgressBar as ProgressBar,
113
113
  )
114
114
 
115
- from upgini.utils.psi import calculate_features_psi
115
+ from upgini.utils.psi import calculate_features_psi, calculate_sparsity_psi
116
116
  from upgini.utils.sample_utils import SampleColumns, SampleConfig, _num_samples, sample
117
117
  from upgini.utils.sort import sort_columns
118
118
  from upgini.utils.target_utils import calculate_psi, define_task
@@ -1513,15 +1513,29 @@ class FeaturesEnricher(TransformerMixin):
1513
1513
 
1514
1514
  checking_eval_set_df[date_column] = eval_set_dates[selected_eval_set_idx]
1515
1515
 
1516
+ psi_values_sparse = calculate_sparsity_psi(
1517
+ checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
1518
+ )
1519
+
1520
+ unstable_by_sparsity = [feature for feature, psi in psi_values_sparse.items() if psi > stability_threshold]
1521
+ if unstable_by_sparsity:
1522
+ self.logger.info(f"Unstable by sparsity features: {sorted(unstable_by_sparsity)}")
1523
+
1516
1524
  psi_values = calculate_features_psi(
1517
1525
  checking_eval_set_df, cat_features, date_column, self.logger, model_task_type
1518
1526
  )
1519
1527
 
1528
+ unstable_by_value = [feature for feature, psi in psi_values.items() if psi > stability_threshold]
1529
+ if unstable_by_value:
1530
+ self.logger.info(f"Unstable by value features: {sorted(unstable_by_value)}")
1531
+
1520
1532
  self.psi_values = {
1521
1533
  feature: psi_value for feature, psi_value in psi_values.items() if psi_value <= stability_threshold
1522
1534
  }
1523
1535
 
1524
- return [feature for feature, psi in psi_values.items() if psi > stability_threshold]
1536
+ total_unstable_features = sorted(set(unstable_by_sparsity + unstable_by_value))
1537
+
1538
+ return total_unstable_features
1525
1539
 
1526
1540
  def _update_shap_values(self, trace_id: str, df: pd.DataFrame, new_shaps: Dict[str, float], silent: bool = False):
1527
1541
  renaming = self.fit_columns_renaming or {}
@@ -2273,13 +2287,7 @@ class FeaturesEnricher(TransformerMixin):
2273
2287
  enriched_df, x_columns, enriched_X.columns.tolist(), len(eval_set) if has_eval_set else 0
2274
2288
  )
2275
2289
 
2276
- # Add hash-suffixes because output of transform has original names
2277
- reversed_renaming = {v: k for k, v in columns_renaming.items()}
2278
- X_sampled.rename(columns=reversed_renaming, inplace=True)
2279
- enriched_X.rename(columns=reversed_renaming, inplace=True)
2280
- for _, (eval_X_sampled, enriched_eval_X, _) in eval_set_sampled_dict.items():
2281
- eval_X_sampled.rename(columns=reversed_renaming, inplace=True)
2282
- enriched_eval_X.rename(columns=reversed_renaming, inplace=True)
2290
+ search_keys = {columns_renaming.get(k, k): v for k, v in search_keys.items()}
2283
2291
 
2284
2292
  # Cache and return results
2285
2293
  datasets_hash = hash_input(validated_X, validated_y, eval_set)
@@ -42,6 +42,32 @@ DEFAULT_FEATURES_PARAMS = StabilityParams(
42
42
  )
43
43
 
44
44
 
45
+ def calculate_sparsity_psi(
46
+ df: pd.DataFrame,
47
+ cat_features: list[str],
48
+ date_column: str,
49
+ logger: logging.Logger,
50
+ model_task_type: ModelTaskType,
51
+ psi_features_params: StabilityParams = DEFAULT_FEATURES_PARAMS,
52
+ psi_target_params: StabilityParams = DEFAULT_TARGET_PARAMS,
53
+ ) -> Dict[str, float]:
54
+ sparse_features = df.columns[df.isna().sum() > 0].to_list()
55
+ if len(sparse_features) > 0:
56
+ logger.info(f"Calculating sparsity stability for {len(sparse_features)} sparse features")
57
+ sparse_df = df[sparse_features].notna()
58
+ sparse_df[date_column] = df[date_column]
59
+ return calculate_features_psi(
60
+ sparse_df,
61
+ cat_features,
62
+ date_column,
63
+ logger,
64
+ model_task_type,
65
+ psi_target_params,
66
+ psi_features_params,
67
+ )
68
+ return {}
69
+
70
+
45
71
  def calculate_features_psi(
46
72
  df: pd.DataFrame,
47
73
  cat_features: list[str],
@@ -1 +0,0 @@
1
- __version__ = "1.2.113a4"
File without changes
File without changes
File without changes
File without changes
File without changes