churnkit 0.75.0a3__py3-none-any.whl → 0.75.1a2__py3-none-any.whl
This diff compares the contents of two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +72 -72
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +134 -134
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +207 -207
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +109 -109
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +283 -283
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +145 -145
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +126 -126
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +149 -149
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +172 -172
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +130 -130
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +163 -163
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +99 -99
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +126 -126
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +89 -89
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +132 -132
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +197 -197
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +27 -27
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/METADATA +2 -2
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/RECORD +48 -48
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/business/fairness_analyzer.py +2 -2
- customer_retention/analysis/diagnostics/segment_analyzer.py +3 -3
- customer_retention/analysis/interpretability/cohort_analyzer.py +4 -4
- customer_retention/analysis/notebook_progress.py +4 -2
- customer_retention/core/compat/__init__.py +20 -0
- customer_retention/stages/features/behavioral_features.py +3 -3
- customer_retention/stages/features/customer_segmentation.py +10 -10
- customer_retention/stages/features/feature_selector.py +2 -2
- customer_retention/stages/profiling/column_profiler.py +2 -2
- customer_retention/stages/profiling/relationship_recommender.py +2 -2
- customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
- customer_retention/stages/profiling/temporal_feature_engineer.py +7 -7
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +16 -12
- customer_retention/stages/profiling/temporal_quality_checks.py +7 -6
- customer_retention/stages/profiling/time_series_profiler.py +15 -8
- customer_retention/stages/profiling/time_window_aggregator.py +22 -15
- customer_retention/stages/transformation/categorical_encoder.py +2 -2
- customer_retention/stages/transformation/pipeline.py +2 -2
- customer_retention/stages/validation/data_quality_gate.py +5 -5
- customer_retention/stages/validation/data_validators.py +3 -3
- customer_retention/stages/validation/leakage_gate.py +4 -4
- customer_retention/stages/validation/timeseries_detector.py +9 -6
- customer_retention/transforms/ops.py +2 -2
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.0a3.data → churnkit-0.75.1a2.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/WHEEL +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a2.dist-info}/licenses/LICENSE +0 -0
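The source changes below share one theme: call sites stop reaching into pandas directly and instead import their datetime, binning, and dtype helpers from customer_retention.core.compat, whose __init__.py grows by 20 lines in this release. The compat module itself is not shown in this diff; the following is only a minimal sketch of what such a facade typically re-exports, and the to_pandas fallback logic here is an assumption:

    # Hypothetical sketch only - the real customer_retention/core/compat/__init__.py
    # is not included in this diff.
    import pandas as pd
    from pandas import DataFrame, Series, Timestamp, Timedelta
    from pandas import cut, qcut, get_dummies, notna, to_datetime, to_numeric
    from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype

    def to_pandas(df):
        # Pass pandas frames through; convert other frame types if present.
        if isinstance(df, DataFrame):
            return df
        if hasattr(df, "toPandas"):    # assumed Spark-style API
            return df.toPandas()
        if hasattr(df, "to_pandas"):   # assumed Polars/Arrow-style API
            return df.to_pandas()
        return DataFrame(df)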

customer_retention/stages/profiling/temporal_feature_analyzer.py
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, pd, qcut, to_datetime
 from customer_retention.core.utils import compute_effect_size


@@ -627,7 +627,7 @@ class TemporalFeatureAnalyzer:

     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
         df = df.copy()
-        df[self.time_column] =
+        df[self.time_column] = to_datetime(df[self.time_column])
         return df

     def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
@@ -646,7 +646,7 @@ class TemporalFeatureAnalyzer:
         if len(df_iv) < bins * 2:
             return 0.0
         try:
-            df_iv["bin"] =
+            df_iv["bin"] = qcut(df_iv["feature"], q=bins, duplicates="drop")
         except ValueError:
             return 0.0

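The new qcut(..., duplicates="drop") call above bins a feature into quantiles for the information-value calculation without raising when quantile edges collide. A standalone illustration with plain pandas, assuming the compat qcut simply forwards to pandas.qcut:

    import numpy as np
    import pandas as pd

    # Half the values are identical, so several quantile edges coincide.
    feature = pd.Series(np.r_[np.zeros(50), np.random.default_rng(0).normal(size=50)])
    bins = pd.qcut(feature, q=10, duplicates="drop")   # fewer bins instead of ValueError
    print(bins.value_counts().sort_index())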

customer_retention/stages/profiling/temporal_feature_engineer.py
@@ -25,7 +25,7 @@ from typing import Any, Dict, List, Optional

 import numpy as np

-from customer_retention.core.compat import pd
+from customer_retention.core.compat import Timedelta, pd, to_datetime, to_pandas


 class ReferenceMode(Enum):
@@ -179,8 +179,8 @@ class TemporalFeatureEngineer:
         Returns:
             TemporalFeatureResult with features DataFrame and metadata
         """
-        events_df = events_df.copy()
-        events_df[time_col] =
+        events_df = to_pandas(events_df).copy()
+        events_df[time_col] = to_datetime(events_df[time_col])

         # Determine reference dates per entity
         ref_dates = self._get_reference_dates(
@@ -313,9 +313,9 @@ class TemporalFeatureEngineer:
         })

         if reference_dates is not None and reference_col is not None:
-            ref_df = reference_dates[[entity_col, reference_col]].copy()
+            ref_df = to_pandas(reference_dates)[[entity_col, reference_col]].copy()
             ref_df.columns = [entity_col, "reference_date"]
-            ref_df["reference_date"] =
+            ref_df["reference_date"] = to_datetime(ref_df["reference_date"])
             return ref_df

         # Default: Use last event date per entity
@@ -511,8 +511,8 @@ class TemporalFeatureEngineer:
             entity_df["last_event"].iloc[0]

         # Calculate split boundaries
-        split1 = first_event +
-        split2 = first_event +
+        split1 = first_event + Timedelta(days=history_days * splits[0])
+        split2 = first_event + Timedelta(days=history_days * (splits[0] + splits[1]))

         for col in value_cols:
             beginning_val = entity_df[entity_df[time_col] < split1][col].sum()
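The split-boundary change above now expresses the sub-window cut points with Timedelta arithmetic. A small sketch of the same computation in plain pandas; the split fractions here are made up for illustration:

    import pandas as pd

    first_event = pd.Timestamp("2023-01-01")
    history_days = 90
    splits = (0.5, 0.25, 0.25)   # assumed beginning/middle/end fractions

    split1 = first_event + pd.Timedelta(days=history_days * splits[0])
    split2 = first_event + pd.Timedelta(days=history_days * (splits[0] + splits[1]))
    print(split1, split2)   # 2023-02-15 00:00:00  2023-03-09 12:00:00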

customer_retention/stages/profiling/temporal_pattern_analyzer.py
@@ -5,7 +5,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime, to_pandas
 from customer_retention.core.utils import compute_effect_size


@@ -229,15 +229,16 @@ MONOTONIC_TOLERANCE = 0.05

 def compute_recency_buckets(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date:
+    reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
+    df = to_pandas(df)
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
     entity_last["recency_days"] = (reference_date - entity_last[time_column]).dt.days
     entity_target = df.groupby(entity_column)[target_column].first().reset_index()
     entity_data = entity_last.merge(entity_target, on=entity_column)
-    entity_data["bucket"] =
+    entity_data["bucket"] = cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
     bucket_stats = []
     for i, label in enumerate(labels):
         bucket_data = entity_data[entity_data["bucket"] == label]
@@ -296,6 +297,7 @@ def classify_distribution_pattern(buckets: List[RecencyBucketStats]) -> str:
 def _diagnose_anomaly_pattern(
     df: DataFrame, entity_column: str, time_column: str, target_column: str
 ) -> AnomalyDiagnostics:
+    df = to_pandas(df)
     entity_target = df.groupby(entity_column)[target_column].first()
     target_1_pct = float(entity_target.mean() * 100)
     target_1_is_minority = target_1_pct < 50
@@ -429,8 +431,9 @@ def _extract_threshold_from_bucket(bucket_label: str) -> int:

 def compare_recency_by_target(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: Optional[
+    reference_date: Optional[Timestamp] = None, cap_percentile: float = 0.99
 ) -> Optional[RecencyComparisonResult]:
+    df = to_pandas(df)
     if target_column not in df.columns:
         return None
     ref_date = reference_date or df[time_column].max()
@@ -495,11 +498,11 @@ class TemporalPatternAnalyzer:
         if len(df) < 3:
             return self._unknown_trend()

-        df_clean = df[[self.time_column, value_column]].dropna()
+        df_clean = to_pandas(df)[[self.time_column, value_column]].dropna()
         if len(df_clean) < 3:
             return self._unknown_trend()

-        time_col =
+        time_col = to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values

@@ -583,10 +586,10 @@ class TemporalPatternAnalyzer:
         if len(df) == 0:
             return pd.DataFrame()

-        df_copy = df.copy()
+        df_copy = to_pandas(df).copy()
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] =
+        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)

         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -607,15 +610,16 @@ class TemporalPatternAnalyzer:

         return cohort_stats.sort_values("cohort")

-    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[
+    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[Timestamp] = None) -> RecencyResult:
         if len(df) == 0:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)

-
-
+        df = to_pandas(df)
+        ref_date = reference_date or Timestamp.now()
+        to_datetime(df[self.time_column])

         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last =
+        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days

         target_correlation = None
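compute_recency_buckets above now assigns each entity's recency to a labelled bucket with cut(..., include_lowest=True). Roughly equivalent plain-pandas behaviour, with example edges and labels (the real DEFAULT_BUCKET_EDGES are not shown in this diff):

    import pandas as pd

    recency_days = pd.Series([1, 5, 12, 40, 95, 400])
    edges = [0, 7, 30, 90, 365, float("inf")]                    # assumed edges
    labels = ["0-7d", "8-30d", "31-90d", "91-365d", "365d+"]

    buckets = pd.cut(recency_days, bins=edges, labels=labels, include_lowest=True)
    print(buckets.tolist())   # ['0-7d', '0-7d', '8-30d', '31-90d', '91-365d', '365d+']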

customer_retention/stages/profiling/temporal_quality_checks.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional

-from customer_retention.core.compat import DataFrame,
+from customer_retention.core.compat import DataFrame, Timestamp, to_datetime, to_pandas
 from customer_retention.core.components.enums import Severity


@@ -73,7 +73,8 @@ class TemporalGapCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")

-
+        df = to_pandas(df)
+        time_col = to_datetime(df.sort_values(self.time_column)[self.time_column])
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -101,16 +102,16 @@ class TemporalGapCheck(TemporalQualityCheck):


 class FutureDateCheck(TemporalQualityCheck):
-    def __init__(self, time_column: str, reference_date: Optional[
+    def __init__(self, time_column: str, reference_date: Optional[Timestamp] = None):
         super().__init__("TQ003", "Future Dates", Severity.HIGH)
         self.time_column = time_column
-        self.reference_date = reference_date or
+        self.reference_date = reference_date or Timestamp.now()

     def run(self, df: DataFrame) -> TemporalQualityResult:
         if len(df) == 0:
             return self._pass_result("No data to check")

-        time_col =
+        time_col = to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()

@@ -140,7 +141,7 @@ class EventOrderCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")

-        df_check = df.assign(_parsed_time=
+        df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)

customer_retention/stages/profiling/time_series_profiler.py
@@ -3,7 +3,14 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    is_datetime64_any_dtype,
+    pd,
+    to_datetime,
+    to_pandas,
+)


 @dataclass
@@ -161,8 +168,8 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
 @dataclass
 class EntityLifecycle:
     entity: str
-    first_event:
-    last_event:
+    first_event: Timestamp
+    last_event: Timestamp
     duration_days: int
     event_count: int

@@ -177,8 +184,8 @@ class TimeSeriesProfile:
     events_per_entity: DistributionStats
     entity_lifecycles: DataFrame
     avg_inter_event_days: Optional[float] = None
-    first_event_date: Optional[
-    last_event_date: Optional[
+    first_event_date: Optional[Timestamp] = None
+    last_event_date: Optional[Timestamp] = None


 class TimeSeriesProfiler:
@@ -223,9 +230,9 @@ class TimeSeriesProfiler:
             raise KeyError(f"Time column '{self.time_column}' not found")

     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
-        df = df.copy()
-        if not
-        df[self.time_column] =
+        df = to_pandas(df).copy()
+        if not is_datetime64_any_dtype(df[self.time_column]):
+            df[self.time_column] = to_datetime(df[self.time_column])
         return df

     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
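The profiler's _prepare_dataframe now normalises its input in two steps: coerce to pandas, then parse the time column only when it is not already datetime-typed. A self-contained equivalent in plain pandas (the function name here is just for illustration):

    import pandas as pd
    from pandas.api.types import is_datetime64_any_dtype

    def prepare(df: pd.DataFrame, time_column: str) -> pd.DataFrame:
        df = df.copy()                                   # leave the caller's frame untouched
        if not is_datetime64_any_dtype(df[time_column]):
            df[time_column] = pd.to_datetime(df[time_column])
        return df

    events = pd.DataFrame({"ts": ["2024-01-01", "2024-01-03"], "amount": [10, 20]})
    print(prepare(events, "ts").dtypes)                  # ts becomes datetime64[ns]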

customer_retention/stages/profiling/time_window_aggregator.py
@@ -6,7 +6,14 @@ from typing import Dict, List, Optional, Union

 import numpy as np

-from customer_retention.core.compat import
+from customer_retention.core.compat import (
+    DataFrame,
+    Timedelta,
+    Timestamp,
+    is_numeric_dtype,
+    pd,
+    to_datetime,
+)


 class AggregationType(str, Enum):
@@ -71,7 +78,7 @@ class TimeWindowAggregator:
     def aggregate(
         self, df: DataFrame, windows: Optional[List[str]] = None,
         value_columns: Optional[List[str]] = None, agg_funcs: Optional[List[str]] = None,
-        reference_date: Optional[
+        reference_date: Optional[Timestamp] = None, include_event_count: bool = False,
         include_recency: bool = False, include_tenure: bool = False,
         exclude_columns: Optional[List[str]] = None,
     ) -> DataFrame:
@@ -79,7 +86,7 @@ class TimeWindowAggregator:
             return pd.DataFrame()

         df = df.copy()
-        df[self.time_column] =
+        df[self.time_column] = to_datetime(df[self.time_column])
         reference_date = self._validate_reference_date(df, reference_date)
         parsed_windows = [TimeWindow.from_string(w) for w in (windows or ["30d"])]

@@ -107,13 +114,13 @@ class TimeWindowAggregator:
         result = pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
-        result.attrs["aggregation_timestamp"] =
+        result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
         return result

     def _add_value_aggregations(
         self, result_data: Dict, df: DataFrame, entities: np.ndarray,
         windows: List[TimeWindow], value_columns: List[str], agg_funcs: List[str],
-        reference_date:
+        reference_date: Timestamp,
     ) -> None:
         for window in windows:
             for col in value_columns:
@@ -169,9 +176,9 @@ class TimeWindowAggregator:

         return feature_columns, value_counts_categories

-    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[
+    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[Timestamp]) -> Timestamp:
         data_min, data_max = df[self.time_column].min(), df[self.time_column].max()
-        current_date =
+        current_date = Timestamp.now()

         if reference_date is None:
             warnings.warn(
@@ -196,16 +203,16 @@ class TimeWindowAggregator:
         return reference_date

     def _compute_event_counts(
-        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date:
+        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         counts = filtered_df.groupby(self.entity_column).size()
         return np.array([counts.get(e, 0) for e in entities])

-    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date:
+    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: Timestamp) -> DataFrame:
         if window.days is None:
             return df
-        cutoff = reference_date -
+        cutoff = reference_date - Timedelta(days=window.days)
         return df[df[self.time_column] >= cutoff]

     def _compute_aggregation(
@@ -215,14 +222,14 @@ class TimeWindowAggregator:
         value_column: str,
         agg_func: str,
         window: TimeWindow,
-        reference_date:
+        reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         if len(filtered_df) == 0:
             default = 0 if agg_func in ["sum", "count", "nunique"] else np.nan
             return np.full(len(entities), default)

-        is_numeric =
+        is_numeric = is_numeric_dtype(df[value_column])
         if agg_func in CATEGORICAL_AGG_FUNCS:
             return self._compute_categorical_agg(filtered_df, entities, value_column, agg_func)
         elif agg_func in NUMERIC_AGG_FUNCS and not is_numeric:
@@ -288,7 +295,7 @@ class TimeWindowAggregator:
         return np.array([entropy_result.get(e, np.nan) for e in entities])

     def _compute_value_counts(
-        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date:
+        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: Timestamp
     ) -> Dict[str, np.ndarray]:
         filtered_df = self._filter_by_window(df, window, reference_date)
         unique_values = df[col].dropna().unique()
@@ -302,12 +309,12 @@ class TimeWindowAggregator:
             result[col_name] = np.array([counts.get(e, 0) for e in entities])
         return result

-    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date:
+    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         last_dates = df.groupby(self.entity_column)[self.time_column].max()
         days_since_last = (reference_date - last_dates).dt.days
         return np.array([days_since_last.get(e, np.nan) for e in entities])

-    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date:
+    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         first_dates = df.groupby(self.entity_column)[self.time_column].min()
         days_since_first = (reference_date - first_dates).dt.days
         return np.array([days_since_first.get(e, np.nan) for e in entities])
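_filter_by_window above now derives the window cutoff as reference_date - Timedelta(days=window.days). The same filtering step in plain pandas, with made-up events:

    import pandas as pd

    events = pd.DataFrame({
        "customer_id": [1, 1, 2],
        "ts": pd.to_datetime(["2024-05-01", "2024-06-20", "2024-03-15"]),
    })
    reference_date = pd.Timestamp("2024-06-30")

    cutoff = reference_date - pd.Timedelta(days=30)      # 30-day window
    recent = events[events["ts"] >= cutoff]              # keeps only the 2024-06-20 event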

customer_retention/stages/transformation/categorical_encoder.py
@@ -4,7 +4,7 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import DataFrame, Series,
+from customer_retention.core.compat import DataFrame, Series, to_numeric


 class EncodingStrategy(str, Enum):
@@ -212,7 +212,7 @@ class CategoricalEncoder:
         if hasattr(self, '_cyclical_mapping') and self._cyclical_mapping is not None:
             numeric = series.map(self._cyclical_mapping)
         else:
-            numeric =
+            numeric = to_numeric(series, errors='coerce')

         sin_vals = np.sin(2 * np.pi * numeric / self.period)
         cos_vals = np.cos(2 * np.pi * numeric / self.period)

customer_retention/stages/transformation/pipeline.py
@@ -4,7 +4,7 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import DataFrame,
+from customer_retention.core.compat import DataFrame, notna
 from customer_retention.core.config import ColumnType
 from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler, OutlierTreatmentStrategy

@@ -198,7 +198,7 @@ class TransformationPipeline:
             if col in working_df.columns and working_df[col].isna().any():
                 # Fill with median for extracted datetime features
                 median_val = working_df[col].median()
-                if
+                if notna(median_val):
                     working_df[col] = working_df[col].fillna(median_val)

         for col, transformer in self._numeric_transformers.items():

customer_retention/stages/validation/data_quality_gate.py
@@ -1,6 +1,6 @@
 import time

-from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype,
+from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, to_datetime, to_numeric
 from customer_retention.core.config.column_config import ColumnType
 from customer_retention.core.config.pipeline_config import BronzeConfig, PipelineConfig

@@ -159,7 +159,7 @@ class DataQualityGate(ValidationGate):
                 continue

             if not is_datetime64_any_dtype(df_temp):
-                df_temp =
+                df_temp = to_datetime(df_temp, errors='coerce', format='mixed')

             future_dates = df_temp > Timestamp.now()
             future_count = future_dates.sum()
@@ -185,8 +185,8 @@ class DataQualityGate(ValidationGate):
         if len(df_temp) == 0:
             return issues

-        created =
-        firstorder =
+        created = to_datetime(df_temp['created'], errors='coerce', format='mixed')
+        firstorder = to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')

         violations = created > firstorder
         violation_count = violations.sum()
@@ -214,7 +214,7 @@ class DataQualityGate(ValidationGate):

             if col_config.is_numeric() and column_data.dtype == 'object':
                 try:
-
+                    to_numeric(column_data.dropna(), errors='raise')
                     issues.append(self.create_issue(
                         "DQ040", "Numeric column stored as string",
                         Severity.MEDIUM, col_config.name, len(df), len(df),

customer_retention/stages/validation/data_validators.py
@@ -8,7 +8,7 @@ including duplicate detection, date logic validation, and value range validation
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime
 from customer_retention.core.components.enums import Severity


@@ -249,8 +249,8 @@ class DataValidator:
         # Convert to datetime if needed
         df_dates = df[order].copy()
         for col in order:
-            if not
-            df_dates[col] =
+            if not is_datetime64_any_dtype(df_dates[col]):
+                df_dates[col] = to_datetime(df_dates[col], errors='coerce', format='mixed')

         # Check sequential ordering
         violations = []

customer_retention/stages/validation/leakage_gate.py
@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

-from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype,
+from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, notna, to_datetime
 from customer_retention.core.components.enums import Severity

 if TYPE_CHECKING:
@@ -124,7 +124,7 @@ class LeakageGate:
         for feature in numeric_features:
             try:
                 corr = df[feature].corr(df[self.target_column])
-                if
+                if notna(corr):
                     correlations[feature] = corr
             except Exception:
                 continue
@@ -153,7 +153,7 @@ class LeakageGate:

     @staticmethod
     def _parse_datetime(series, errors="coerce"):
-        return
+        return to_datetime(series, errors=errors, format='mixed')

     def _check_perfect_separation(
         self,
@@ -236,7 +236,7 @@ class LeakageGate:
         mean_0 = df[df[self.target_column] == target_values[0]][feature].mean()
         mean_1 = df[df[self.target_column] == target_values[1]][feature].mean()

-        if (
+        if (notna(var_0) and notna(var_1) and
             var_0 < 0.01 and var_1 < 0.01 and
             abs(mean_0 - mean_1) > 0.1):
             issues.append(LeakageIssue(

customer_retention/stages/validation/timeseries_detector.py
@@ -11,7 +11,7 @@ from datetime import timedelta
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime, to_pandas


 class DatasetType(Enum):
@@ -181,6 +181,7 @@ class TimeSeriesDetector:
         TimeSeriesCharacteristics
             Detected characteristics of the dataset
         """
+        df = to_pandas(df)
         evidence = []

         # Auto-detect entity column if not provided
@@ -255,7 +256,7 @@ class TimeSeriesDetector:

         if timestamp_column and timestamp_column in df.columns:
             # Convert to datetime if needed
-            ts_series =
+            ts_series = to_datetime(
                 df[timestamp_column], errors='coerce', format='mixed'
             )
             valid_ts = ts_series.notna()
@@ -342,7 +343,7 @@ class TimeSeriesDetector:
         name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)

         # Check if column is datetime type
-        is_datetime =
+        is_datetime = is_datetime64_any_dtype(df[col])

         # Try to parse as datetime
         can_parse = False
@@ -350,7 +351,7 @@ class TimeSeriesDetector:
         try:
             with warnings.catch_warnings():
                 warnings.filterwarnings('ignore', category=FutureWarning)
-                parsed =
+                parsed = to_datetime(
                     df[col].head(100), errors='coerce', format='mixed'
                 )
             can_parse = parsed.notna().mean() > 0.8
@@ -389,7 +390,7 @@ class TimeSeriesDetector:
             if len(entity_data) < 2:
                 continue

-            ts =
+            ts = to_datetime(
                 entity_data[timestamp_column], errors='coerce', format='mixed'
             )
             ts = ts.dropna().sort_values()
@@ -510,6 +511,8 @@ class TimeSeriesValidator:
         """
         issues = []

+        df = to_pandas(df)
+
         # Validate inputs
         if entity_column not in df.columns:
             return TimeSeriesValidationResult(
@@ -525,7 +528,7 @@ class TimeSeriesValidator:

         # Convert timestamp
         df_copy = df.copy()
-        df_copy['_ts'] =
+        df_copy['_ts'] = to_datetime(
             df_copy[timestamp_column], errors='coerce', format='mixed'
         )

customer_retention/transforms/ops.py
@@ -12,7 +12,7 @@ from typing import Any

 import numpy as np

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, get_dummies, pd


 def _requires_column(fn):
@@ -113,7 +113,7 @@ def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:

 @_requires_column
 def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
-    return
+    return get_dummies(df, columns=[column], prefix=column)


 def apply_feature_select(df: DataFrame, column: str) -> DataFrame:
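apply_one_hot_encode now delegates to get_dummies with the column name as prefix. The equivalent call in plain pandas:

    import pandas as pd

    df = pd.DataFrame({"plan": ["basic", "pro", "basic"], "spend": [10, 25, 7]})
    encoded = pd.get_dummies(df, columns=["plan"], prefix="plan")
    print(encoded.columns.tolist())   # ['spend', 'plan_basic', 'plan_pro']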