churnkit 0.75.0a3__py3-none-any.whl → 0.75.1a2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +72 -72
  2. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +134 -134
  3. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +207 -207
  4. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +109 -109
  5. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +283 -283
  6. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +145 -145
  7. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +126 -126
  8. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +149 -149
  9. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +172 -172
  10. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +130 -130
  11. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +163 -163
  12. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +99 -99
  13. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +126 -126
  14. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +89 -89
  15. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +132 -132
  16. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +197 -197
  17. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +27 -27
  18. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/METADATA +2 -2
  19. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/RECORD +48 -48
  20. customer_retention/__init__.py +1 -1
  21. customer_retention/analysis/business/fairness_analyzer.py +2 -2
  22. customer_retention/analysis/diagnostics/segment_analyzer.py +3 -3
  23. customer_retention/analysis/interpretability/cohort_analyzer.py +4 -4
  24. customer_retention/analysis/notebook_progress.py +4 -2
  25. customer_retention/core/compat/__init__.py +20 -0
  26. customer_retention/stages/features/behavioral_features.py +3 -3
  27. customer_retention/stages/features/customer_segmentation.py +10 -10
  28. customer_retention/stages/features/feature_selector.py +2 -2
  29. customer_retention/stages/profiling/column_profiler.py +2 -2
  30. customer_retention/stages/profiling/relationship_recommender.py +2 -2
  31. customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
  32. customer_retention/stages/profiling/temporal_feature_engineer.py +7 -7
  33. customer_retention/stages/profiling/temporal_pattern_analyzer.py +16 -12
  34. customer_retention/stages/profiling/temporal_quality_checks.py +7 -6
  35. customer_retention/stages/profiling/time_series_profiler.py +15 -8
  36. customer_retention/stages/profiling/time_window_aggregator.py +22 -15
  37. customer_retention/stages/transformation/categorical_encoder.py +2 -2
  38. customer_retention/stages/transformation/pipeline.py +2 -2
  39. customer_retention/stages/validation/data_quality_gate.py +5 -5
  40. customer_retention/stages/validation/data_validators.py +3 -3
  41. customer_retention/stages/validation/leakage_gate.py +4 -4
  42. customer_retention/stages/validation/timeseries_detector.py +9 -6
  43. customer_retention/transforms/ops.py +2 -2
  44. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
  45. {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
  46. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/WHEEL +0 -0
  47. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/entry_points.txt +0 -0
  48. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/licenses/LICENSE +0 -0
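Every module diff below follows the same pattern: direct pandas calls (pd.to_datetime, pd.qcut, pd.Timestamp.now(), pd.api.types.is_numeric_dtype, ...) are replaced with names imported from customer_retention.core.compat, and several DataFrame entry points gain a to_pandas() coercion. The compat package itself gains 20 lines (file 25 above) but is not expanded in this diff; the sketch below is only a plausible reading of what such a re-export shim might contain, inferred from the names the changed modules import. The to_pandas() fallback logic in particular is an assumption, not the actual implementation.

# Hypothetical sketch of customer_retention/core/compat/__init__.py; the real
# module is not shown in this diff, so treat everything here as an assumption.
import pandas as pd
from pandas import (
    DataFrame,
    Series,
    Timedelta,
    Timestamp,
    cut,
    get_dummies,
    notna,
    qcut,
    to_datetime,
    to_numeric,
)
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype


def to_pandas(df):
    # Assumed behavior: pass pandas objects through unchanged and convert
    # Spark-style (toPandas) or Polars/Arrow-style (to_pandas) frames.
    if isinstance(df, pd.DataFrame):
        return df
    if hasattr(df, "toPandas"):
        return df.toPandas()
    if hasattr(df, "to_pandas"):
        return df.to_pandas()
    return pd.DataFrame(df)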
customer_retention/stages/profiling/temporal_feature_analyzer.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, pd, qcut, to_datetime
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -627,7 +627,7 @@ class TemporalFeatureAnalyzer:
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
         df = df.copy()
-        df[self.time_column] = pd.to_datetime(df[self.time_column])
+        df[self.time_column] = to_datetime(df[self.time_column])
         return df
 
     def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
@@ -646,7 +646,7 @@ class TemporalFeatureAnalyzer:
         if len(df_iv) < bins * 2:
             return 0.0
         try:
-            df_iv["bin"] = pd.qcut(df_iv["feature"], q=bins, duplicates="drop")
+            df_iv["bin"] = qcut(df_iv["feature"], q=bins, duplicates="drop")
         except ValueError:
             return 0.0
 
customer_retention/stages/profiling/temporal_feature_engineer.py
@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional
 
 import numpy as np
 
-from customer_retention.core.compat import pd
+from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas
 
 
 class ReferenceMode(Enum):
@@ -179,8 +179,8 @@ class TemporalFeatureEngineer:
         Returns:
             TemporalFeatureResult with features DataFrame and metadata
         """
-        events_df = events_df.copy()
-        events_df[time_col] = pd.to_datetime(events_df[time_col])
+        events_df = to_pandas(events_df).copy()
+        events_df[time_col] = to_datetime(events_df[time_col])
 
         # Determine reference dates per entity
         ref_dates = self._get_reference_dates(
@@ -313,9 +313,9 @@ class TemporalFeatureEngineer:
         })
 
        if reference_dates is not None and reference_col is not None:
-            ref_df = reference_dates[[entity_col, reference_col]].copy()
+            ref_df = to_pandas(reference_dates)[[entity_col, reference_col]].copy()
             ref_df.columns = [entity_col, "reference_date"]
-            ref_df["reference_date"] = pd.to_datetime(ref_df["reference_date"])
+            ref_df["reference_date"] = to_datetime(ref_df["reference_date"])
             return ref_df
 
         # Default: Use last event date per entity
@@ -511,8 +511,8 @@ class TemporalFeatureEngineer:
             entity_df["last_event"].iloc[0]
 
             # Calculate split boundaries
-            split1 = first_event + pd.Timedelta(days=history_days * splits[0])
-            split2 = first_event + pd.Timedelta(days=history_days * (splits[0] + splits[1]))
+            split1 = first_event + Timedelta(days=history_days * splits[0])
+            split2 = first_event + Timedelta(days=history_days * (splits[0] + splits[1]))
 
             for col in value_cols:
                 beginning_val = entity_df[entity_df[time_col] < split1][col].sum()
customer_retention/stages/profiling/temporal_pattern_analyzer.py
@@ -5,7 +5,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -229,15 +229,16 @@ MONOTONIC_TOLERANCE = 0.05
 
 def compute_recency_buckets(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: pd.Timestamp, bucket_edges: Optional[List[float]] = None
+    reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
+    df = to_pandas(df)
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
     entity_last["recency_days"] = (reference_date - entity_last[time_column]).dt.days
     entity_target = df.groupby(entity_column)[target_column].first().reset_index()
     entity_data = entity_last.merge(entity_target, on=entity_column)
-    entity_data["bucket"] = pd.cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
+    entity_data["bucket"] = cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
     bucket_stats = []
     for i, label in enumerate(labels):
         bucket_data = entity_data[entity_data["bucket"] == label]
@@ -296,6 +297,7 @@ def classify_distribution_pattern(buckets: List[RecencyBucketStats]) -> str:
 def _diagnose_anomaly_pattern(
     df: DataFrame, entity_column: str, time_column: str, target_column: str
 ) -> AnomalyDiagnostics:
+    df = to_pandas(df)
     entity_target = df.groupby(entity_column)[target_column].first()
     target_1_pct = float(entity_target.mean() * 100)
     target_1_is_minority = target_1_pct < 50
@@ -429,8 +431,9 @@ def _extract_threshold_from_bucket(bucket_label: str) -> int:
 
 def compare_recency_by_target(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: Optional[pd.Timestamp] = None, cap_percentile: float = 0.99
+    reference_date: Optional[Timestamp] = None, cap_percentile: float = 0.99
 ) -> Optional[RecencyComparisonResult]:
+    df = to_pandas(df)
     if target_column not in df.columns:
         return None
     ref_date = reference_date or df[time_column].max()
@@ -495,11 +498,11 @@ class TemporalPatternAnalyzer:
         if len(df) < 3:
             return self._unknown_trend()
 
-        df_clean = df[[self.time_column, value_column]].dropna()
+        df_clean = to_pandas(df)[[self.time_column, value_column]].dropna()
         if len(df_clean) < 3:
             return self._unknown_trend()
 
-        time_col = pd.to_datetime(df_clean[self.time_column])
+        time_col = to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values
 
@@ -583,10 +586,10 @@ class TemporalPatternAnalyzer:
         if len(df) == 0:
             return pd.DataFrame()
 
-        df_copy = df.copy()
+        df_copy = to_pandas(df).copy()
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] = pd.to_datetime(df_copy["_cohort"]).dt.to_period(period)
+        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)
 
         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -607,15 +610,16 @@ class TemporalPatternAnalyzer:
 
         return cohort_stats.sort_values("cohort")
 
-    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[pd.Timestamp] = None) -> RecencyResult:
+    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[Timestamp] = None) -> RecencyResult:
         if len(df) == 0:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
 
-        ref_date = reference_date or pd.Timestamp.now()
-        pd.to_datetime(df[self.time_column])
+        df = to_pandas(df)
+        ref_date = reference_date or Timestamp.now()
+        to_datetime(df[self.time_column])
 
         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last = pd.to_datetime(entity_last)
+        entity_last = to_datetime(entity_last)
 
         recency_days = (ref_date - entity_last).dt.days
         target_correlation = None
customer_retention/stages/profiling/temporal_quality_checks.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, to_datetime, to_pandas
 from customer_retention.core.components.enums import Severity
 
 
@@ -73,7 +73,8 @@ class TemporalGapCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")
 
-        time_col = pd.to_datetime(df.sort_values(self.time_column)[self.time_column])
+        df = to_pandas(df)
+        time_col = to_datetime(df.sort_values(self.time_column)[self.time_column])
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -101,16 +102,16 @@ class TemporalGapCheck(TemporalQualityCheck):
 
 
 class FutureDateCheck(TemporalQualityCheck):
-    def __init__(self, time_column: str, reference_date: Optional[pd.Timestamp] = None):
+    def __init__(self, time_column: str, reference_date: Optional[Timestamp] = None):
         super().__init__("TQ003", "Future Dates", Severity.HIGH)
         self.time_column = time_column
-        self.reference_date = reference_date or pd.Timestamp.now()
+        self.reference_date = reference_date or Timestamp.now()
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
         if len(df) == 0:
             return self._pass_result("No data to check")
 
-        time_col = pd.to_datetime(df[self.time_column])
+        time_col = to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()
 
@@ -140,7 +141,7 @@ class EventOrderCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")
 
-        df_check = df.assign(_parsed_time=pd.to_datetime(df[self.time_column]))
+        df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)
customer_retention/stages/profiling/time_series_profiler.py
@@ -3,7 +3,14 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    is_datetime64_any_dtype,
+    pd,
+    to_datetime,
+    to_pandas,
+)
 
 
 @dataclass
@@ -161,8 +168,8 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
 @dataclass
 class EntityLifecycle:
     entity: str
-    first_event: pd.Timestamp
-    last_event: pd.Timestamp
+    first_event: Timestamp
+    last_event: Timestamp
     duration_days: int
     event_count: int
 
@@ -177,8 +184,8 @@ class TimeSeriesProfile:
     events_per_entity: DistributionStats
     entity_lifecycles: DataFrame
     avg_inter_event_days: Optional[float] = None
-    first_event_date: Optional[pd.Timestamp] = None
-    last_event_date: Optional[pd.Timestamp] = None
+    first_event_date: Optional[Timestamp] = None
+    last_event_date: Optional[Timestamp] = None
 
 
 class TimeSeriesProfiler:
@@ -223,9 +230,9 @@ class TimeSeriesProfiler:
             raise KeyError(f"Time column '{self.time_column}' not found")
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
-        df = df.copy()
-        if not pd.api.types.is_datetime64_any_dtype(df[self.time_column]):
-            df[self.time_column] = pd.to_datetime(df[self.time_column])
+        df = to_pandas(df).copy()
+        if not is_datetime64_any_dtype(df[self.time_column]):
+            df[self.time_column] = to_datetime(df[self.time_column])
         return df
 
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
customer_retention/stages/profiling/time_window_aggregator.py
@@ -6,7 +6,14 @@ from typing import Dict, List, Optional, Union
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import (
+    DataFrame,
+    Timedelta,
+    Timestamp,
+    is_numeric_dtype,
+    pd,
+    to_datetime,
+)
 
 
 class AggregationType(str, Enum):
@@ -71,7 +78,7 @@ class TimeWindowAggregator:
     def aggregate(
         self, df: DataFrame, windows: Optional[List[str]] = None,
         value_columns: Optional[List[str]] = None, agg_funcs: Optional[List[str]] = None,
-        reference_date: Optional[pd.Timestamp] = None, include_event_count: bool = False,
+        reference_date: Optional[Timestamp] = None, include_event_count: bool = False,
        include_recency: bool = False, include_tenure: bool = False,
         exclude_columns: Optional[List[str]] = None,
     ) -> DataFrame:
@@ -79,7 +86,7 @@ class TimeWindowAggregator:
             return pd.DataFrame()
 
         df = df.copy()
-        df[self.time_column] = pd.to_datetime(df[self.time_column])
+        df[self.time_column] = to_datetime(df[self.time_column])
         reference_date = self._validate_reference_date(df, reference_date)
         parsed_windows = [TimeWindow.from_string(w) for w in (windows or ["30d"])]
 
@@ -107,13 +114,13 @@ class TimeWindowAggregator:
         result = pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
-        result.attrs["aggregation_timestamp"] = pd.Timestamp.now().isoformat()
+        result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
         return result
 
     def _add_value_aggregations(
         self, result_data: Dict, df: DataFrame, entities: np.ndarray,
         windows: List[TimeWindow], value_columns: List[str], agg_funcs: List[str],
-        reference_date: pd.Timestamp,
+        reference_date: Timestamp,
     ) -> None:
         for window in windows:
             for col in value_columns:
@@ -169,9 +176,9 @@ class TimeWindowAggregator:
 
         return feature_columns, value_counts_categories
 
-    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[pd.Timestamp]) -> pd.Timestamp:
+    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[Timestamp]) -> Timestamp:
         data_min, data_max = df[self.time_column].min(), df[self.time_column].max()
-        current_date = pd.Timestamp.now()
+        current_date = Timestamp.now()
 
         if reference_date is None:
             warnings.warn(
@@ -196,16 +203,16 @@ class TimeWindowAggregator:
         return reference_date
 
     def _compute_event_counts(
-        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: pd.Timestamp,
+        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         counts = filtered_df.groupby(self.entity_column).size()
         return np.array([counts.get(e, 0) for e in entities])
 
-    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: pd.Timestamp) -> DataFrame:
+    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: Timestamp) -> DataFrame:
         if window.days is None:
             return df
-        cutoff = reference_date - pd.Timedelta(days=window.days)
+        cutoff = reference_date - Timedelta(days=window.days)
         return df[df[self.time_column] >= cutoff]
 
     def _compute_aggregation(
@@ -215,14 +222,14 @@ class TimeWindowAggregator:
         value_column: str,
         agg_func: str,
         window: TimeWindow,
-        reference_date: pd.Timestamp,
+        reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         if len(filtered_df) == 0:
             default = 0 if agg_func in ["sum", "count", "nunique"] else np.nan
             return np.full(len(entities), default)
 
-        is_numeric = pd.api.types.is_numeric_dtype(df[value_column])
+        is_numeric = is_numeric_dtype(df[value_column])
         if agg_func in CATEGORICAL_AGG_FUNCS:
             return self._compute_categorical_agg(filtered_df, entities, value_column, agg_func)
         elif agg_func in NUMERIC_AGG_FUNCS and not is_numeric:
@@ -288,7 +295,7 @@ class TimeWindowAggregator:
         return np.array([entropy_result.get(e, np.nan) for e in entities])
 
     def _compute_value_counts(
-        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: pd.Timestamp
+        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: Timestamp
     ) -> Dict[str, np.ndarray]:
         filtered_df = self._filter_by_window(df, window, reference_date)
         unique_values = df[col].dropna().unique()
@@ -302,12 +309,12 @@ class TimeWindowAggregator:
             result[col_name] = np.array([counts.get(e, 0) for e in entities])
         return result
 
-    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: pd.Timestamp) -> np.ndarray:
+    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         last_dates = df.groupby(self.entity_column)[self.time_column].max()
         days_since_last = (reference_date - last_dates).dt.days
         return np.array([days_since_last.get(e, np.nan) for e in entities])
 
-    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: pd.Timestamp) -> np.ndarray:
+    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         first_dates = df.groupby(self.entity_column)[self.time_column].min()
         days_since_first = (reference_date - first_dates).dt.days
         return np.array([days_since_first.get(e, np.nan) for e in entities])
customer_retention/stages/transformation/categorical_encoder.py
@@ -4,7 +4,7 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, Series, pd
+from customer_retention.core.compat import DataFrame, Series, to_numeric
 
 
 class EncodingStrategy(str, Enum):
@@ -212,7 +212,7 @@ class CategoricalEncoder:
         if hasattr(self, '_cyclical_mapping') and self._cyclical_mapping is not None:
             numeric = series.map(self._cyclical_mapping)
         else:
-            numeric = pd.to_numeric(series, errors='coerce')
+            numeric = to_numeric(series, errors='coerce')
 
         sin_vals = np.sin(2 * np.pi * numeric / self.period)
         cos_vals = np.cos(2 * np.pi * numeric / self.period)
customer_retention/stages/transformation/pipeline.py
@@ -4,7 +4,7 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, notna
 from customer_retention.core.config import ColumnType
 from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler, OutlierTreatmentStrategy
 
@@ -198,7 +198,7 @@ class TransformationPipeline:
             if col in working_df.columns and working_df[col].isna().any():
                 # Fill with median for extracted datetime features
                 median_val = working_df[col].median()
-                if pd.notna(median_val):
+                if notna(median_val):
                     working_df[col] = working_df[col].fillna(median_val)
 
         for col, transformer in self._numeric_transformers.items():
customer_retention/stages/validation/data_quality_gate.py
@@ -1,6 +1,6 @@
 import time
 
-from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, pd
+from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, to_datetime, to_numeric
 from customer_retention.core.config.column_config import ColumnType
 from customer_retention.core.config.pipeline_config import BronzeConfig, PipelineConfig
 
@@ -159,7 +159,7 @@ class DataQualityGate(ValidationGate):
                 continue
 
             if not is_datetime64_any_dtype(df_temp):
-                df_temp = pd.to_datetime(df_temp, errors='coerce', format='mixed')
+                df_temp = to_datetime(df_temp, errors='coerce', format='mixed')
 
             future_dates = df_temp > Timestamp.now()
             future_count = future_dates.sum()
@@ -185,8 +185,8 @@ class DataQualityGate(ValidationGate):
         if len(df_temp) == 0:
             return issues
 
-        created = pd.to_datetime(df_temp['created'], errors='coerce', format='mixed')
-        firstorder = pd.to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')
+        created = to_datetime(df_temp['created'], errors='coerce', format='mixed')
+        firstorder = to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')
 
         violations = created > firstorder
         violation_count = violations.sum()
@@ -214,7 +214,7 @@ class DataQualityGate(ValidationGate):
 
         if col_config.is_numeric() and column_data.dtype == 'object':
             try:
-                pd.to_numeric(column_data.dropna(), errors='raise')
+                to_numeric(column_data.dropna(), errors='raise')
                 issues.append(self.create_issue(
                     "DQ040", "Numeric column stored as string",
                     Severity.MEDIUM, col_config.name, len(df), len(df),
customer_retention/stages/validation/data_validators.py
@@ -8,7 +8,7 @@ including duplicate detection, date logic validation, and value range validation
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime
 from customer_retention.core.components.enums import Severity
 
 
@@ -249,8 +249,8 @@ class DataValidator:
         # Convert to datetime if needed
         df_dates = df[order].copy()
         for col in order:
-            if not pd.api.types.is_datetime64_any_dtype(df_dates[col]):
-                df_dates[col] = pd.to_datetime(df_dates[col], errors='coerce', format='mixed')
+            if not is_datetime64_any_dtype(df_dates[col]):
+                df_dates[col] = to_datetime(df_dates[col], errors='coerce', format='mixed')
 
         # Check sequential ordering
         violations = []
customer_retention/stages/validation/leakage_gate.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
-from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, pd
+from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, notna, to_datetime
 from customer_retention.core.components.enums import Severity
 
 if TYPE_CHECKING:
@@ -124,7 +124,7 @@ class LeakageGate:
         for feature in numeric_features:
             try:
                 corr = df[feature].corr(df[self.target_column])
-                if pd.notna(corr):
+                if notna(corr):
                     correlations[feature] = corr
             except Exception:
                 continue
@@ -153,7 +153,7 @@ class LeakageGate:
 
     @staticmethod
     def _parse_datetime(series, errors="coerce"):
-        return pd.to_datetime(series, errors=errors, format='mixed')
+        return to_datetime(series, errors=errors, format='mixed')
 
     def _check_perfect_separation(
         self,
@@ -236,7 +236,7 @@ class LeakageGate:
             mean_0 = df[df[self.target_column] == target_values[0]][feature].mean()
             mean_1 = df[df[self.target_column] == target_values[1]][feature].mean()
 
-            if (pd.notna(var_0) and pd.notna(var_1) and
+            if (notna(var_0) and notna(var_1) and
                 var_0 < 0.01 and var_1 < 0.01 and
                 abs(mean_0 - mean_1) > 0.1):
                 issues.append(LeakageIssue(
customer_retention/stages/validation/timeseries_detector.py
@@ -11,7 +11,7 @@ from datetime import timedelta
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime, to_pandas
 
 
 class DatasetType(Enum):
@@ -181,6 +181,7 @@ class TimeSeriesDetector:
         TimeSeriesCharacteristics
             Detected characteristics of the dataset
         """
+        df = to_pandas(df)
         evidence = []
 
         # Auto-detect entity column if not provided
@@ -255,7 +256,7 @@ class TimeSeriesDetector:
 
         if timestamp_column and timestamp_column in df.columns:
             # Convert to datetime if needed
-            ts_series = pd.to_datetime(
+            ts_series = to_datetime(
                 df[timestamp_column], errors='coerce', format='mixed'
             )
             valid_ts = ts_series.notna()
@@ -342,7 +343,7 @@ class TimeSeriesDetector:
             name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)
 
             # Check if column is datetime type
-            is_datetime = pd.api.types.is_datetime64_any_dtype(df[col])
+            is_datetime = is_datetime64_any_dtype(df[col])
 
             # Try to parse as datetime
             can_parse = False
@@ -350,7 +351,7 @@ class TimeSeriesDetector:
             try:
                 with warnings.catch_warnings():
                     warnings.filterwarnings('ignore', category=FutureWarning)
-                    parsed = pd.to_datetime(
+                    parsed = to_datetime(
                         df[col].head(100), errors='coerce', format='mixed'
                     )
                     can_parse = parsed.notna().mean() > 0.8
@@ -389,7 +390,7 @@ class TimeSeriesDetector:
             if len(entity_data) < 2:
                 continue
 
-            ts = pd.to_datetime(
+            ts = to_datetime(
                 entity_data[timestamp_column], errors='coerce', format='mixed'
             )
             ts = ts.dropna().sort_values()
@@ -510,6 +511,8 @@ class TimeSeriesValidator:
         """
         issues = []
 
+        df = to_pandas(df)
+
         # Validate inputs
         if entity_column not in df.columns:
             return TimeSeriesValidationResult(
@@ -525,7 +528,7 @@ class TimeSeriesValidator:
 
         # Convert timestamp
         df_copy = df.copy()
-        df_copy['_ts'] = pd.to_datetime(
+        df_copy['_ts'] = to_datetime(
             df_copy[timestamp_column], errors='coerce', format='mixed'
         )
 
customer_retention/transforms/ops.py
@@ -12,7 +12,7 @@ from typing import Any
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, get_dummies, pd
 
 
 def _requires_column(fn):
@@ -113,7 +113,7 @@ def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:
 
 @_requires_column
 def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
-    return pd.get_dummies(df, columns=[column], prefix=column)
+    return get_dummies(df, columns=[column], prefix=column)
 
 
 def apply_feature_select(df: DataFrame, column: str) -> DataFrame: