churnkit 0.75.0a3__py3-none-any.whl → 0.75.1a1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +72 -72
  2. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +134 -134
  3. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +207 -207
  4. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +109 -109
  5. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +283 -283
  6. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +145 -145
  7. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +126 -126
  8. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +149 -149
  9. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +172 -172
  10. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +130 -130
  11. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +163 -163
  12. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +99 -99
  13. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +126 -126
  14. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +89 -89
  15. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +132 -132
  16. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +197 -197
  17. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +27 -27
  18. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/METADATA +2 -2
  19. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/RECORD +45 -45
  20. customer_retention/__init__.py +1 -1
  21. customer_retention/analysis/business/fairness_analyzer.py +2 -2
  22. customer_retention/analysis/diagnostics/segment_analyzer.py +3 -3
  23. customer_retention/analysis/interpretability/cohort_analyzer.py +4 -4
  24. customer_retention/core/compat/__init__.py +20 -0
  25. customer_retention/stages/features/behavioral_features.py +3 -3
  26. customer_retention/stages/features/customer_segmentation.py +10 -10
  27. customer_retention/stages/features/feature_selector.py +2 -2
  28. customer_retention/stages/profiling/relationship_recommender.py +2 -2
  29. customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
  30. customer_retention/stages/profiling/temporal_pattern_analyzer.py +10 -10
  31. customer_retention/stages/profiling/temporal_quality_checks.py +6 -6
  32. customer_retention/stages/profiling/time_series_profiler.py +13 -7
  33. customer_retention/stages/profiling/time_window_aggregator.py +22 -15
  34. customer_retention/stages/transformation/categorical_encoder.py +2 -2
  35. customer_retention/stages/transformation/pipeline.py +2 -2
  36. customer_retention/stages/validation/data_quality_gate.py +5 -5
  37. customer_retention/stages/validation/data_validators.py +3 -3
  38. customer_retention/stages/validation/leakage_gate.py +4 -4
  39. customer_retention/stages/validation/timeseries_detector.py +6 -6
  40. customer_retention/transforms/ops.py +2 -2
  41. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
  42. {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
  43. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/WHEEL +0 -0
  44. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/entry_points.txt +0 -0
  45. {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/licenses/LICENSE +0 -0
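Nearly every source change in this release is the same mechanical migration: call sites that previously reached pandas through the pd namespace (pd.to_datetime, pd.Timestamp, pd.cut, pd.notna, pd.get_dummies, pd.api.types.*) now import those names directly from customer_retention.core.compat, whose __init__.py gains 20 lines. The compat module itself is not shown in this diff; a minimal sketch of the kind of re-export shim the new imports suggest, using only the names that appear in the hunks below and assuming everything else, could look like:

    # Hypothetical sketch of customer_retention/core/compat/__init__.py, not the shipped file.
    import pandas as pd
    from pandas import (
        DataFrame,
        Series,
        Timedelta,
        Timestamp,
        cut,
        get_dummies,
        notna,
        to_datetime,
        to_numeric,
    )
    from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype

    __all__ = [
        "pd", "DataFrame", "Series", "Timedelta", "Timestamp", "cut", "get_dummies",
        "notna", "to_datetime", "to_numeric", "is_datetime64_any_dtype", "is_numeric_dtype",
    ]

Funneling the pandas surface through one module keeps the dependency easy to pin or swap in a single place; the call-site behaviour below is unchanged.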
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -5,7 +5,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -229,7 +229,7 @@ MONOTONIC_TOLERANCE = 0.05
 
 def compute_recency_buckets(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: pd.Timestamp, bucket_edges: Optional[List[float]] = None
+    reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
@@ -237,7 +237,7 @@ def compute_recency_buckets(
     entity_last["recency_days"] = (reference_date - entity_last[time_column]).dt.days
     entity_target = df.groupby(entity_column)[target_column].first().reset_index()
     entity_data = entity_last.merge(entity_target, on=entity_column)
-    entity_data["bucket"] = pd.cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
+    entity_data["bucket"] = cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
     bucket_stats = []
     for i, label in enumerate(labels):
         bucket_data = entity_data[entity_data["bucket"] == label]
@@ -429,7 +429,7 @@ def _extract_threshold_from_bucket(bucket_label: str) -> int:
 
 def compare_recency_by_target(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: Optional[pd.Timestamp] = None, cap_percentile: float = 0.99
+    reference_date: Optional[Timestamp] = None, cap_percentile: float = 0.99
 ) -> Optional[RecencyComparisonResult]:
     if target_column not in df.columns:
         return None
@@ -499,7 +499,7 @@ class TemporalPatternAnalyzer:
         if len(df_clean) < 3:
             return self._unknown_trend()
 
-        time_col = pd.to_datetime(df_clean[self.time_column])
+        time_col = to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values
 
@@ -586,7 +586,7 @@ class TemporalPatternAnalyzer:
         df_copy = df.copy()
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] = pd.to_datetime(df_copy["_cohort"]).dt.to_period(period)
+        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)
 
         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -607,15 +607,15 @@ class TemporalPatternAnalyzer:
 
         return cohort_stats.sort_values("cohort")
 
-    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[pd.Timestamp] = None) -> RecencyResult:
+    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[Timestamp] = None) -> RecencyResult:
         if len(df) == 0:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
 
-        ref_date = reference_date or pd.Timestamp.now()
-        pd.to_datetime(df[self.time_column])
+        ref_date = reference_date or Timestamp.now()
+        to_datetime(df[self.time_column])
 
         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last = pd.to_datetime(entity_last)
+        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days
 
         target_correlation = None
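Only the annotation and helper spellings change in temporal_pattern_analyzer.py, so call sites are unaffected. An illustrative call to compute_recency_buckets under the new signature (the sample frame, column names, and dates are invented):

    import pandas as pd
    from customer_retention.stages.profiling.temporal_pattern_analyzer import compute_recency_buckets

    events = pd.DataFrame({
        "customer_id": [1, 1, 2, 3],
        "event_ts": pd.to_datetime(["2024-01-05", "2024-03-01", "2024-02-10", "2023-11-20"]),
        "churned": [0, 0, 1, 1],
    })
    buckets = compute_recency_buckets(
        events,
        entity_column="customer_id",
        time_column="event_ts",
        target_column="churned",
        reference_date=pd.Timestamp("2024-04-01"),  # a pandas Timestamp still satisfies the compat annotation
    )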
customer_retention/stages/profiling/temporal_quality_checks.py

@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, to_datetime
 from customer_retention.core.components.enums import Severity
 
 
@@ -73,7 +73,7 @@ class TemporalGapCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")
 
-        time_col = pd.to_datetime(df.sort_values(self.time_column)[self.time_column])
+        time_col = to_datetime(df.sort_values(self.time_column)[self.time_column])
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -101,16 +101,16 @@ class TemporalGapCheck(TemporalQualityCheck):
 
 
 class FutureDateCheck(TemporalQualityCheck):
-    def __init__(self, time_column: str, reference_date: Optional[pd.Timestamp] = None):
+    def __init__(self, time_column: str, reference_date: Optional[Timestamp] = None):
         super().__init__("TQ003", "Future Dates", Severity.HIGH)
         self.time_column = time_column
-        self.reference_date = reference_date or pd.Timestamp.now()
+        self.reference_date = reference_date or Timestamp.now()
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
         if len(df) == 0:
             return self._pass_result("No data to check")
 
-        time_col = pd.to_datetime(df[self.time_column])
+        time_col = to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()
 
@@ -140,7 +140,7 @@ class EventOrderCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")
 
-        df_check = df.assign(_parsed_time=pd.to_datetime(df[self.time_column]))
+        df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)
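FutureDateCheck keeps its behaviour; passing an explicit reference_date makes the check reproducible instead of comparing against Timestamp.now(). Illustrative usage (data and column name invented):

    import pandas as pd
    from customer_retention.stages.profiling.temporal_quality_checks import FutureDateCheck

    events = pd.DataFrame({"event_ts": ["2024-01-15", "2031-07-01"]})
    check = FutureDateCheck(time_column="event_ts", reference_date=pd.Timestamp("2024-06-01"))
    result = check.run(events)  # the 2031 row should be reported as a future date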
customer_retention/stages/profiling/time_series_profiler.py

@@ -3,7 +3,13 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    is_datetime64_any_dtype,
+    pd,
+    to_datetime,
+)
 
 
 @dataclass
@@ -161,8 +167,8 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
 @dataclass
 class EntityLifecycle:
     entity: str
-    first_event: pd.Timestamp
-    last_event: pd.Timestamp
+    first_event: Timestamp
+    last_event: Timestamp
     duration_days: int
     event_count: int
 
@@ -177,8 +183,8 @@ class TimeSeriesProfile:
     events_per_entity: DistributionStats
     entity_lifecycles: DataFrame
     avg_inter_event_days: Optional[float] = None
-    first_event_date: Optional[pd.Timestamp] = None
-    last_event_date: Optional[pd.Timestamp] = None
+    first_event_date: Optional[Timestamp] = None
+    last_event_date: Optional[Timestamp] = None
 
 
 class TimeSeriesProfiler:
@@ -224,8 +230,8 @@ class TimeSeriesProfiler:
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
         df = df.copy()
-        if not pd.api.types.is_datetime64_any_dtype(df[self.time_column]):
-            df[self.time_column] = pd.to_datetime(df[self.time_column])
+        if not is_datetime64_any_dtype(df[self.time_column]):
+            df[self.time_column] = to_datetime(df[self.time_column])
         return df
 
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
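_prepare_dataframe keeps the same dtype guard, now spelled with the re-exported helpers. The equivalent pattern in plain pandas, for reference (the function and column names here are illustrative):

    import pandas as pd
    from pandas.api.types import is_datetime64_any_dtype

    def ensure_datetime(df: pd.DataFrame, time_column: str) -> pd.DataFrame:
        # Parse the time column only when it is not already a datetime64 dtype.
        df = df.copy()
        if not is_datetime64_any_dtype(df[time_column]):
            df[time_column] = pd.to_datetime(df[time_column])
        return df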
customer_retention/stages/profiling/time_window_aggregator.py

@@ -6,7 +6,14 @@ from typing import Dict, List, Optional, Union
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import (
+    DataFrame,
+    Timedelta,
+    Timestamp,
+    is_numeric_dtype,
+    pd,
+    to_datetime,
+)
 
 
 class AggregationType(str, Enum):
@@ -71,7 +78,7 @@ class TimeWindowAggregator:
     def aggregate(
         self, df: DataFrame, windows: Optional[List[str]] = None,
         value_columns: Optional[List[str]] = None, agg_funcs: Optional[List[str]] = None,
-        reference_date: Optional[pd.Timestamp] = None, include_event_count: bool = False,
+        reference_date: Optional[Timestamp] = None, include_event_count: bool = False,
         include_recency: bool = False, include_tenure: bool = False,
         exclude_columns: Optional[List[str]] = None,
     ) -> DataFrame:
@@ -79,7 +86,7 @@ class TimeWindowAggregator:
             return pd.DataFrame()
 
         df = df.copy()
-        df[self.time_column] = pd.to_datetime(df[self.time_column])
+        df[self.time_column] = to_datetime(df[self.time_column])
         reference_date = self._validate_reference_date(df, reference_date)
         parsed_windows = [TimeWindow.from_string(w) for w in (windows or ["30d"])]
 
@@ -107,13 +114,13 @@ class TimeWindowAggregator:
         result = pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
            reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
-        result.attrs["aggregation_timestamp"] = pd.Timestamp.now().isoformat()
+        result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
         return result
 
     def _add_value_aggregations(
         self, result_data: Dict, df: DataFrame, entities: np.ndarray,
         windows: List[TimeWindow], value_columns: List[str], agg_funcs: List[str],
-        reference_date: pd.Timestamp,
+        reference_date: Timestamp,
     ) -> None:
         for window in windows:
             for col in value_columns:
@@ -169,9 +176,9 @@ class TimeWindowAggregator:
 
         return feature_columns, value_counts_categories
 
-    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[pd.Timestamp]) -> pd.Timestamp:
+    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[Timestamp]) -> Timestamp:
         data_min, data_max = df[self.time_column].min(), df[self.time_column].max()
-        current_date = pd.Timestamp.now()
+        current_date = Timestamp.now()
 
         if reference_date is None:
             warnings.warn(
@@ -196,16 +203,16 @@ class TimeWindowAggregator:
         return reference_date
 
     def _compute_event_counts(
-        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: pd.Timestamp,
+        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: Timestamp,
    ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         counts = filtered_df.groupby(self.entity_column).size()
         return np.array([counts.get(e, 0) for e in entities])
 
-    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: pd.Timestamp) -> DataFrame:
+    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: Timestamp) -> DataFrame:
         if window.days is None:
             return df
-        cutoff = reference_date - pd.Timedelta(days=window.days)
+        cutoff = reference_date - Timedelta(days=window.days)
         return df[df[self.time_column] >= cutoff]
 
     def _compute_aggregation(
@@ -215,14 +222,14 @@ class TimeWindowAggregator:
         value_column: str,
         agg_func: str,
         window: TimeWindow,
-        reference_date: pd.Timestamp,
+        reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         if len(filtered_df) == 0:
             default = 0 if agg_func in ["sum", "count", "nunique"] else np.nan
             return np.full(len(entities), default)
 
-        is_numeric = pd.api.types.is_numeric_dtype(df[value_column])
+        is_numeric = is_numeric_dtype(df[value_column])
         if agg_func in CATEGORICAL_AGG_FUNCS:
             return self._compute_categorical_agg(filtered_df, entities, value_column, agg_func)
         elif agg_func in NUMERIC_AGG_FUNCS and not is_numeric:
@@ -288,7 +295,7 @@ class TimeWindowAggregator:
         return np.array([entropy_result.get(e, np.nan) for e in entities])
 
     def _compute_value_counts(
-        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: pd.Timestamp
+        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: Timestamp
     ) -> Dict[str, np.ndarray]:
         filtered_df = self._filter_by_window(df, window, reference_date)
         unique_values = df[col].dropna().unique()
@@ -302,12 +309,12 @@ class TimeWindowAggregator:
             result[col_name] = np.array([counts.get(e, 0) for e in entities])
         return result
 
-    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: pd.Timestamp) -> np.ndarray:
+    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         last_dates = df.groupby(self.entity_column)[self.time_column].max()
         days_since_last = (reference_date - last_dates).dt.days
         return np.array([days_since_last.get(e, np.nan) for e in entities])
 
-    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: pd.Timestamp) -> np.ndarray:
+    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         first_dates = df.groupby(self.entity_column)[self.time_column].min()
         days_since_first = (reference_date - first_dates).dt.days
         return np.array([days_since_first.get(e, np.nan) for e in entities])
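The aggregate entry point changes only in its annotations. A hedged usage sketch with an explicit reference_date, which _validate_reference_date warns about when omitted; the constructor arguments are assumed from the entity_column and time_column attributes used above, and the data is invented:

    import pandas as pd
    from customer_retention.stages.profiling.time_window_aggregator import TimeWindowAggregator

    events = pd.DataFrame({
        "customer_id": [1, 1, 2],
        "event_ts": ["2024-03-01", "2024-03-20", "2024-02-15"],
        "amount": [40.0, 25.0, 80.0],
    })
    agg = TimeWindowAggregator(entity_column="customer_id", time_column="event_ts")  # assumed constructor
    features = agg.aggregate(
        events,
        windows=["30d"],                              # the "30d" window format shown in the default above
        value_columns=["amount"],
        agg_funcs=["sum"],
        reference_date=pd.Timestamp("2024-04-01"),    # explicit, to avoid the missing-reference warning
        include_event_count=True,
        include_recency=True,
    )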
customer_retention/stages/transformation/categorical_encoder.py

@@ -4,7 +4,7 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, Series, pd
+from customer_retention.core.compat import DataFrame, Series, to_numeric
 
 
 class EncodingStrategy(str, Enum):
@@ -212,7 +212,7 @@ class CategoricalEncoder:
         if hasattr(self, '_cyclical_mapping') and self._cyclical_mapping is not None:
             numeric = series.map(self._cyclical_mapping)
         else:
-            numeric = pd.to_numeric(series, errors='coerce')
+            numeric = to_numeric(series, errors='coerce')
 
         sin_vals = np.sin(2 * np.pi * numeric / self.period)
         cos_vals = np.cos(2 * np.pi * numeric / self.period)
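For context, the sin/cos transform above is standard cyclical encoding. A standalone sketch with period=12 for month-of-year (an illustrative choice, not taken from the package):

    import numpy as np
    import pandas as pd

    months = pd.to_numeric(pd.Series(["1", "4", "12"]), errors="coerce")
    period = 12
    sin_vals = np.sin(2 * np.pi * months / period)
    cos_vals = np.cos(2 * np.pi * months / period)
    # December (12) and January (1) land next to each other on the unit circle,
    # which is the point of encoding a cyclic feature this way.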
customer_retention/stages/transformation/pipeline.py

@@ -4,7 +4,7 @@ from typing import Optional
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, notna
 from customer_retention.core.config import ColumnType
 from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler, OutlierTreatmentStrategy
 
@@ -198,7 +198,7 @@ class TransformationPipeline:
             if col in working_df.columns and working_df[col].isna().any():
                 # Fill with median for extracted datetime features
                 median_val = working_df[col].median()
-                if pd.notna(median_val):
+                if notna(median_val):
                     working_df[col] = working_df[col].fillna(median_val)
 
         for col, transformer in self._numeric_transformers.items():
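The notna guard above only skips the fill when the median itself is NaN, i.e. the extracted column is entirely missing. A standalone sketch of the same pattern (values invented):

    import pandas as pd

    col = pd.Series([1.0, None, 3.0])
    median_val = col.median()
    if pd.notna(median_val):          # False only when every value is missing
        col = col.fillna(median_val)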
customer_retention/stages/validation/data_quality_gate.py

@@ -1,6 +1,6 @@
 import time
 
-from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, pd
+from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, to_datetime, to_numeric
 from customer_retention.core.config.column_config import ColumnType
 from customer_retention.core.config.pipeline_config import BronzeConfig, PipelineConfig
 
@@ -159,7 +159,7 @@ class DataQualityGate(ValidationGate):
                 continue
 
             if not is_datetime64_any_dtype(df_temp):
-                df_temp = pd.to_datetime(df_temp, errors='coerce', format='mixed')
+                df_temp = to_datetime(df_temp, errors='coerce', format='mixed')
 
             future_dates = df_temp > Timestamp.now()
             future_count = future_dates.sum()
@@ -185,8 +185,8 @@ class DataQualityGate(ValidationGate):
         if len(df_temp) == 0:
             return issues
 
-        created = pd.to_datetime(df_temp['created'], errors='coerce', format='mixed')
-        firstorder = pd.to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')
+        created = to_datetime(df_temp['created'], errors='coerce', format='mixed')
+        firstorder = to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')
 
         violations = created > firstorder
         violation_count = violations.sum()
@@ -214,7 +214,7 @@ class DataQualityGate(ValidationGate):
 
             if col_config.is_numeric() and column_data.dtype == 'object':
                 try:
-                    pd.to_numeric(column_data.dropna(), errors='raise')
+                    to_numeric(column_data.dropna(), errors='raise')
                     issues.append(self.create_issue(
                         "DQ040", "Numeric column stored as string",
                         Severity.MEDIUM, col_config.name, len(df), len(df),
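The DQ040 branch above flags object-dtype columns whose values all parse as numbers. The same idea in plain pandas (values invented):

    import pandas as pd

    col = pd.Series(["10", "12.5", "7"], dtype="object")
    try:
        pd.to_numeric(col.dropna(), errors="raise")
        numeric_stored_as_string = True   # every value parsed: flag the column
    except (ValueError, TypeError):
        numeric_stored_as_string = False  # genuine strings present: no issue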
customer_retention/stages/validation/data_validators.py

@@ -8,7 +8,7 @@ including duplicate detection, date logic validation, and value range validation
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime
 from customer_retention.core.components.enums import Severity
 
 
@@ -249,8 +249,8 @@ class DataValidator:
         # Convert to datetime if needed
         df_dates = df[order].copy()
         for col in order:
-            if not pd.api.types.is_datetime64_any_dtype(df_dates[col]):
-                df_dates[col] = pd.to_datetime(df_dates[col], errors='coerce', format='mixed')
+            if not is_datetime64_any_dtype(df_dates[col]):
+                df_dates[col] = to_datetime(df_dates[col], errors='coerce', format='mixed')
 
         # Check sequential ordering
         violations = []
customer_retention/stages/validation/leakage_gate.py

@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional
 
-from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, pd
+from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, notna, to_datetime
 from customer_retention.core.components.enums import Severity
 
 if TYPE_CHECKING:
@@ -124,7 +124,7 @@ class LeakageGate:
         for feature in numeric_features:
             try:
                 corr = df[feature].corr(df[self.target_column])
-                if pd.notna(corr):
+                if notna(corr):
                     correlations[feature] = corr
             except Exception:
                 continue
@@ -153,7 +153,7 @@ class LeakageGate:
 
     @staticmethod
     def _parse_datetime(series, errors="coerce"):
-        return pd.to_datetime(series, errors=errors, format='mixed')
+        return to_datetime(series, errors=errors, format='mixed')
 
     def _check_perfect_separation(
         self,
@@ -236,7 +236,7 @@ class LeakageGate:
             mean_0 = df[df[self.target_column] == target_values[0]][feature].mean()
             mean_1 = df[df[self.target_column] == target_values[1]][feature].mean()
 
-            if (pd.notna(var_0) and pd.notna(var_1) and
+            if (notna(var_0) and notna(var_1) and
                     var_0 < 0.01 and var_1 < 0.01 and
                     abs(mean_0 - mean_1) > 0.1):
                 issues.append(LeakageIssue(
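The heuristic above flags a feature that is nearly constant within each target class yet shifted between classes, a common signature of target leakage. A standalone sketch with the same thresholds (data invented):

    import pandas as pd

    df = pd.DataFrame({"feature": [0.0, 0.0, 1.0, 1.0], "target": [0, 0, 1, 1]})
    var_0 = df[df["target"] == 0]["feature"].var()
    var_1 = df[df["target"] == 1]["feature"].var()
    mean_0 = df[df["target"] == 0]["feature"].mean()
    mean_1 = df[df["target"] == 1]["feature"].mean()
    suspicious = (
        pd.notna(var_0) and pd.notna(var_1)
        and var_0 < 0.01 and var_1 < 0.01
        and abs(mean_0 - mean_1) > 0.1
    )  # True for this toy frame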
customer_retention/stages/validation/timeseries_detector.py

@@ -11,7 +11,7 @@ from datetime import timedelta
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime
 
 
 class DatasetType(Enum):
@@ -255,7 +255,7 @@ class TimeSeriesDetector:
 
         if timestamp_column and timestamp_column in df.columns:
             # Convert to datetime if needed
-            ts_series = pd.to_datetime(
+            ts_series = to_datetime(
                 df[timestamp_column], errors='coerce', format='mixed'
             )
             valid_ts = ts_series.notna()
@@ -342,7 +342,7 @@ class TimeSeriesDetector:
            name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)
 
            # Check if column is datetime type
-           is_datetime = pd.api.types.is_datetime64_any_dtype(df[col])
+           is_datetime = is_datetime64_any_dtype(df[col])
 
            # Try to parse as datetime
            can_parse = False
@@ -350,7 +350,7 @@ class TimeSeriesDetector:
                try:
                    with warnings.catch_warnings():
                        warnings.filterwarnings('ignore', category=FutureWarning)
-                       parsed = pd.to_datetime(
+                       parsed = to_datetime(
                            df[col].head(100), errors='coerce', format='mixed'
                        )
                        can_parse = parsed.notna().mean() > 0.8
@@ -389,7 +389,7 @@ class TimeSeriesDetector:
            if len(entity_data) < 2:
                continue
 
-           ts = pd.to_datetime(
+           ts = to_datetime(
                entity_data[timestamp_column], errors='coerce', format='mixed'
            )
            ts = ts.dropna().sort_values()
@@ -525,7 +525,7 @@ class TimeSeriesValidator:
 
         # Convert timestamp
         df_copy = df.copy()
-        df_copy['_ts'] = pd.to_datetime(
+        df_copy['_ts'] = to_datetime(
             df_copy[timestamp_column], errors='coerce', format='mixed'
         )
 
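The can_parse heuristic above samples up to 100 values and treats a column as a timestamp candidate when more than 80% parse. A standalone sketch (values invented):

    import pandas as pd

    col = pd.Series(["2024-01-01", "2024-02-15", "not a date", "2024-03-03"])
    parsed = pd.to_datetime(col.head(100), errors="coerce", format="mixed")
    can_parse = parsed.notna().mean() > 0.8  # 3 of 4 parse (0.75), so this column would not qualify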
customer_retention/transforms/ops.py

@@ -12,7 +12,7 @@ from typing import Any
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, get_dummies, pd
 
 
 def _requires_column(fn):
@@ -113,7 +113,7 @@ def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:
 
 @_requires_column
 def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
-    return pd.get_dummies(df, columns=[column], prefix=column)
+    return get_dummies(df, columns=[column], prefix=column)
 
 
 def apply_feature_select(df: DataFrame, column: str) -> DataFrame:
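apply_one_hot_encode keeps its behaviour: it is still pandas get_dummies with the column name as prefix, only reached through the compat re-export. Illustrative usage (data invented):

    import pandas as pd
    from customer_retention.transforms.ops import apply_one_hot_encode

    df = pd.DataFrame({"plan": ["basic", "pro", "basic"]})
    encoded = apply_one_hot_encode(df, "plan")
    # -> one indicator column per category, e.g. plan_basic and plan_pro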