churnkit-0.75.0a3-py3-none-any.whl → churnkit-0.75.1a1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +72 -72
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +134 -134
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +207 -207
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +109 -109
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +283 -283
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +145 -145
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +126 -126
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +149 -149
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +172 -172
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +130 -130
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +163 -163
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +99 -99
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +126 -126
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +89 -89
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +132 -132
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +197 -197
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +27 -27
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/METADATA +2 -2
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/RECORD +45 -45
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/business/fairness_analyzer.py +2 -2
- customer_retention/analysis/diagnostics/segment_analyzer.py +3 -3
- customer_retention/analysis/interpretability/cohort_analyzer.py +4 -4
- customer_retention/core/compat/__init__.py +20 -0
- customer_retention/stages/features/behavioral_features.py +3 -3
- customer_retention/stages/features/customer_segmentation.py +10 -10
- customer_retention/stages/features/feature_selector.py +2 -2
- customer_retention/stages/profiling/relationship_recommender.py +2 -2
- customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +10 -10
- customer_retention/stages/profiling/temporal_quality_checks.py +6 -6
- customer_retention/stages/profiling/time_series_profiler.py +13 -7
- customer_retention/stages/profiling/time_window_aggregator.py +22 -15
- customer_retention/stages/transformation/categorical_encoder.py +2 -2
- customer_retention/stages/transformation/pipeline.py +2 -2
- customer_retention/stages/validation/data_quality_gate.py +5 -5
- customer_retention/stages/validation/data_validators.py +3 -3
- customer_retention/stages/validation/leakage_gate.py +4 -4
- customer_retention/stages/validation/timeseries_detector.py +6 -6
- customer_retention/transforms/ops.py +2 -2
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.0a3.data → churnkit-0.75.1a1.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/WHEEL +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.0a3.dist-info → churnkit-0.75.1a1.dist-info}/licenses/LICENSE +0 -0
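The common thread through the source changes below is that pandas calls which previously went through the bare `pd` re-export (or were left dangling in the old code) now come from named helpers in `customer_retention.core.compat`, which gains 20 lines in this release. The compat module itself is not shown in the diff; the sketch below only re-exports the names the changed modules import from it, so treat it as an assumption rather than the actual `__init__.py`.

```python
# Hypothetical customer_retention/core/compat/__init__.py after 0.75.1a1.
# Inferred solely from the import lines in this diff; the real module may differ.
import pandas as pd
from pandas import DataFrame, Series, Timedelta, Timestamp
from pandas import cut, get_dummies, notna, to_datetime, to_numeric
from pandas.api.types import is_datetime64_any_dtype, is_numeric_dtype

__all__ = [
    "pd", "DataFrame", "Series", "Timedelta", "Timestamp",
    "cut", "get_dummies", "notna", "to_datetime", "to_numeric",
    "is_datetime64_any_dtype", "is_numeric_dtype",
]
```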
customer_retention/stages/profiling/temporal_pattern_analyzer.py

@@ -5,7 +5,7 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime
 from customer_retention.core.utils import compute_effect_size


@@ -229,7 +229,7 @@ MONOTONIC_TOLERANCE = 0.05

 def compute_recency_buckets(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date:
+    reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
@@ -237,7 +237,7 @@ def compute_recency_buckets(
     entity_last["recency_days"] = (reference_date - entity_last[time_column]).dt.days
     entity_target = df.groupby(entity_column)[target_column].first().reset_index()
     entity_data = entity_last.merge(entity_target, on=entity_column)
-    entity_data["bucket"] =
+    entity_data["bucket"] = cut(entity_data["recency_days"], bins=edges, labels=labels, include_lowest=True)
     bucket_stats = []
     for i, label in enumerate(labels):
         bucket_data = entity_data[entity_data["bucket"] == label]
@@ -429,7 +429,7 @@ def _extract_threshold_from_bucket(bucket_label: str) -> int:

 def compare_recency_by_target(
     df: DataFrame, entity_column: str, time_column: str, target_column: str,
-    reference_date: Optional[
+    reference_date: Optional[Timestamp] = None, cap_percentile: float = 0.99
 ) -> Optional[RecencyComparisonResult]:
     if target_column not in df.columns:
         return None
@@ -499,7 +499,7 @@ class TemporalPatternAnalyzer:
         if len(df_clean) < 3:
             return self._unknown_trend()

-        time_col =
+        time_col = to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values

@@ -586,7 +586,7 @@ class TemporalPatternAnalyzer:
         df_copy = df.copy()
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] =
+        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)

         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -607,15 +607,15 @@ class TemporalPatternAnalyzer:

         return cohort_stats.sort_values("cohort")

-    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[
+    def analyze_recency(self, df: DataFrame, entity_column: str, target_column: Optional[str] = None, reference_date: Optional[Timestamp] = None) -> RecencyResult:
         if len(df) == 0:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)

-        ref_date = reference_date or
-
+        ref_date = reference_date or Timestamp.now()
+        to_datetime(df[self.time_column])

         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last =
+        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days

         target_correlation = None
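The removed lines above are truncated by the diff viewer, but the added lines make the bucketing explicit: each entity's days since its last event is binned with `cut` against configurable edges. A self-contained illustration of that pattern, with invented column names, edges, and dates:

```python
# Hedged sketch of the recency-bucketing pattern above; values are illustrative
# and the edges/labels are not churnkit's defaults.
import pandas as pd

events = pd.DataFrame({
    "customer_id": [1, 1, 2, 3],
    "event_ts": pd.to_datetime(["2024-01-01", "2024-03-01", "2024-02-15", "2023-11-20"]),
})
reference_date = pd.Timestamp("2024-04-01")

last_event = events.groupby("customer_id")["event_ts"].max().reset_index()
last_event["recency_days"] = (reference_date - last_event["event_ts"]).dt.days
edges = [0, 30, 90, 365]
last_event["bucket"] = pd.cut(last_event["recency_days"], bins=edges,
                              labels=["0-30d", "31-90d", "91-365d"], include_lowest=True)
print(last_event[["customer_id", "recency_days", "bucket"]])
```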
customer_retention/stages/profiling/temporal_quality_checks.py

@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional

-from customer_retention.core.compat import DataFrame,
+from customer_retention.core.compat import DataFrame, Timestamp, to_datetime
 from customer_retention.core.components.enums import Severity


@@ -73,7 +73,7 @@ class TemporalGapCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")

-        time_col =
+        time_col = to_datetime(df.sort_values(self.time_column)[self.time_column])
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -101,16 +101,16 @@ class TemporalGapCheck(TemporalQualityCheck):


 class FutureDateCheck(TemporalQualityCheck):
-    def __init__(self, time_column: str, reference_date: Optional[
+    def __init__(self, time_column: str, reference_date: Optional[Timestamp] = None):
         super().__init__("TQ003", "Future Dates", Severity.HIGH)
         self.time_column = time_column
-        self.reference_date = reference_date or
+        self.reference_date = reference_date or Timestamp.now()

     def run(self, df: DataFrame) -> TemporalQualityResult:
         if len(df) == 0:
             return self._pass_result("No data to check")

-        time_col =
+        time_col = to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()

@@ -140,7 +140,7 @@ class EventOrderCheck(TemporalQualityCheck):
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")

-        df_check = df.assign(_parsed_time=
+        df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)
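For context, the gap check above flags inter-event gaps larger than `max_gap_multiple` times the expected cadence. A minimal standalone sketch of that arithmetic in plain pandas, with invented timestamps and thresholds:

```python
# Assumed example: weekly cadence, gaps over 3x the expected 7 days are flagged.
import pandas as pd

ts = pd.to_datetime(pd.Series(["2024-01-01", "2024-01-08", "2024-02-20"])).sort_values()
diffs_days = ts.diff().dropna().dt.total_seconds() / 86400
expected_days, max_gap_multiple = 7, 3
large_gaps = diffs_days[diffs_days > expected_days * max_gap_multiple]
print(large_gaps)  # the 43-day gap between Jan 8 and Feb 20 exceeds the 21-day threshold
```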
customer_retention/stages/profiling/time_series_profiler.py

@@ -3,7 +3,13 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    is_datetime64_any_dtype,
+    pd,
+    to_datetime,
+)


 @dataclass
@@ -161,8 +167,8 @@ def classify_activity_segments(entity_lifecycles: DataFrame) -> ActivitySegmentR
 @dataclass
 class EntityLifecycle:
     entity: str
-    first_event:
-    last_event:
+    first_event: Timestamp
+    last_event: Timestamp
     duration_days: int
     event_count: int

@@ -177,8 +183,8 @@ class TimeSeriesProfile:
     events_per_entity: DistributionStats
     entity_lifecycles: DataFrame
     avg_inter_event_days: Optional[float] = None
-    first_event_date: Optional[
-    last_event_date: Optional[
+    first_event_date: Optional[Timestamp] = None
+    last_event_date: Optional[Timestamp] = None


 class TimeSeriesProfiler:
@@ -224,8 +230,8 @@ class TimeSeriesProfiler:

     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
         df = df.copy()
-        if not
-            df[self.time_column] =
+        if not is_datetime64_any_dtype(df[self.time_column]):
+            df[self.time_column] = to_datetime(df[self.time_column])
         return df

     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
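`_prepare_dataframe` now guards the conversion with `is_datetime64_any_dtype`, so columns that are already datetime-typed are left untouched. The same guard in isolation, with an assumed column name:

```python
# Minimal sketch of the dtype guard, assuming an 'event_ts' column of strings.
import pandas as pd
from pandas.api.types import is_datetime64_any_dtype

df = pd.DataFrame({"event_ts": ["2024-01-01", "2024-01-05"]})
if not is_datetime64_any_dtype(df["event_ts"]):
    df["event_ts"] = pd.to_datetime(df["event_ts"])
print(df.dtypes)  # event_ts is now datetime64[ns]
```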
customer_retention/stages/profiling/time_window_aggregator.py

@@ -6,7 +6,14 @@ from typing import Dict, List, Optional, Union

 import numpy as np

-from customer_retention.core.compat import
+from customer_retention.core.compat import (
+    DataFrame,
+    Timedelta,
+    Timestamp,
+    is_numeric_dtype,
+    pd,
+    to_datetime,
+)


 class AggregationType(str, Enum):
@@ -71,7 +78,7 @@ class TimeWindowAggregator:
     def aggregate(
         self, df: DataFrame, windows: Optional[List[str]] = None,
         value_columns: Optional[List[str]] = None, agg_funcs: Optional[List[str]] = None,
-        reference_date: Optional[
+        reference_date: Optional[Timestamp] = None, include_event_count: bool = False,
         include_recency: bool = False, include_tenure: bool = False,
         exclude_columns: Optional[List[str]] = None,
     ) -> DataFrame:
@@ -79,7 +86,7 @@ class TimeWindowAggregator:
             return pd.DataFrame()

         df = df.copy()
-        df[self.time_column] =
+        df[self.time_column] = to_datetime(df[self.time_column])
         reference_date = self._validate_reference_date(df, reference_date)
         parsed_windows = [TimeWindow.from_string(w) for w in (windows or ["30d"])]

@@ -107,13 +114,13 @@ class TimeWindowAggregator:
         result = pd.DataFrame(result_data)
         result.attrs["aggregation_reference_date"] = (
             reference_date.isoformat() if hasattr(reference_date, "isoformat") else str(reference_date))
-        result.attrs["aggregation_timestamp"] =
+        result.attrs["aggregation_timestamp"] = Timestamp.now().isoformat()
         return result

     def _add_value_aggregations(
         self, result_data: Dict, df: DataFrame, entities: np.ndarray,
         windows: List[TimeWindow], value_columns: List[str], agg_funcs: List[str],
-        reference_date:
+        reference_date: Timestamp,
     ) -> None:
         for window in windows:
             for col in value_columns:
@@ -169,9 +176,9 @@ class TimeWindowAggregator:

         return feature_columns, value_counts_categories

-    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[
+    def _validate_reference_date(self, df: DataFrame, reference_date: Optional[Timestamp]) -> Timestamp:
         data_min, data_max = df[self.time_column].min(), df[self.time_column].max()
-        current_date =
+        current_date = Timestamp.now()

         if reference_date is None:
             warnings.warn(
@@ -196,16 +203,16 @@ class TimeWindowAggregator:
         return reference_date

     def _compute_event_counts(
-        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date:
+        self, df: DataFrame, entities: np.ndarray, window: TimeWindow, reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         counts = filtered_df.groupby(self.entity_column).size()
         return np.array([counts.get(e, 0) for e in entities])

-    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date:
+    def _filter_by_window(self, df: DataFrame, window: TimeWindow, reference_date: Timestamp) -> DataFrame:
         if window.days is None:
             return df
-        cutoff = reference_date -
+        cutoff = reference_date - Timedelta(days=window.days)
         return df[df[self.time_column] >= cutoff]

     def _compute_aggregation(
@@ -215,14 +222,14 @@ class TimeWindowAggregator:
         value_column: str,
         agg_func: str,
         window: TimeWindow,
-        reference_date:
+        reference_date: Timestamp,
     ) -> np.ndarray:
         filtered_df = self._filter_by_window(df, window, reference_date)
         if len(filtered_df) == 0:
             default = 0 if agg_func in ["sum", "count", "nunique"] else np.nan
             return np.full(len(entities), default)

-        is_numeric =
+        is_numeric = is_numeric_dtype(df[value_column])
         if agg_func in CATEGORICAL_AGG_FUNCS:
             return self._compute_categorical_agg(filtered_df, entities, value_column, agg_func)
         elif agg_func in NUMERIC_AGG_FUNCS and not is_numeric:
@@ -288,7 +295,7 @@ class TimeWindowAggregator:
         return np.array([entropy_result.get(e, np.nan) for e in entities])

     def _compute_value_counts(
-        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date:
+        self, df: DataFrame, entities: np.ndarray, col: str, window: TimeWindow, reference_date: Timestamp
     ) -> Dict[str, np.ndarray]:
         filtered_df = self._filter_by_window(df, window, reference_date)
         unique_values = df[col].dropna().unique()
@@ -302,12 +309,12 @@ class TimeWindowAggregator:
             result[col_name] = np.array([counts.get(e, 0) for e in entities])
         return result

-    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date:
+    def _compute_recency(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         last_dates = df.groupby(self.entity_column)[self.time_column].max()
         days_since_last = (reference_date - last_dates).dt.days
         return np.array([days_since_last.get(e, np.nan) for e in entities])

-    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date:
+    def _compute_tenure(self, df: DataFrame, entities: np.ndarray, reference_date: Timestamp) -> np.ndarray:
         first_dates = df.groupby(self.entity_column)[self.time_column].min()
         days_since_first = (reference_date - first_dates).dt.days
         return np.array([days_since_first.get(e, np.nan) for e in entities])
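The aggregator's window filter reduces to `reference_date - Timedelta(days=...)` followed by a per-entity groupby. A hedged, plain-pandas sketch of that pattern; names and values are invented and it does not mirror the `TimeWindowAggregator` API:

```python
# Filter events to a trailing 30-day window relative to an explicit reference_date,
# then aggregate per entity.
import pandas as pd

events = pd.DataFrame({
    "customer_id": [1, 1, 2, 2],
    "event_ts": pd.to_datetime(["2024-03-05", "2024-01-02", "2024-03-20", "2024-03-28"]),
    "amount": [10.0, 99.0, 5.0, 7.5],
})
reference_date = pd.Timestamp("2024-04-01")
cutoff = reference_date - pd.Timedelta(days=30)           # "30d" window
in_window = events[events["event_ts"] >= cutoff]
features = in_window.groupby("customer_id")["amount"].agg(["count", "sum"])
print(features)  # customer 1: one event (10.0); customer 2: two events (12.5)
```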
customer_retention/stages/transformation/categorical_encoder.py

@@ -4,7 +4,7 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import DataFrame, Series,
+from customer_retention.core.compat import DataFrame, Series, to_numeric


 class EncodingStrategy(str, Enum):
@@ -212,7 +212,7 @@ class CategoricalEncoder:
         if hasattr(self, '_cyclical_mapping') and self._cyclical_mapping is not None:
             numeric = series.map(self._cyclical_mapping)
         else:
-            numeric =
+            numeric = to_numeric(series, errors='coerce')

         sin_vals = np.sin(2 * np.pi * numeric / self.period)
         cos_vals = np.cos(2 * np.pi * numeric / self.period)
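The cyclical branch now coerces unmapped values with `to_numeric(..., errors='coerce')` before the sine/cosine projection. Illustration with an assumed month column and a period of 12:

```python
# Coerce to numeric (unparseable values become NaN), then project onto sin/cos.
import numpy as np
import pandas as pd

months = pd.Series(["1", "4", "7", "bad"])           # strings, one unparseable
numeric = pd.to_numeric(months, errors="coerce")      # "bad" becomes NaN
period = 12
sin_vals = np.sin(2 * np.pi * numeric / period)
cos_vals = np.cos(2 * np.pi * numeric / period)
print(pd.DataFrame({"month": months, "sin": sin_vals, "cos": cos_vals}))
```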
customer_retention/stages/transformation/pipeline.py

@@ -4,7 +4,7 @@ from typing import Optional

 import numpy as np

-from customer_retention.core.compat import DataFrame,
+from customer_retention.core.compat import DataFrame, notna
 from customer_retention.core.config import ColumnType
 from customer_retention.stages.cleaning import MissingValueHandler, OutlierHandler, OutlierTreatmentStrategy

@@ -198,7 +198,7 @@ class TransformationPipeline:
         if col in working_df.columns and working_df[col].isna().any():
             # Fill with median for extracted datetime features
             median_val = working_df[col].median()
-            if
+            if notna(median_val):
                 working_df[col] = working_df[col].fillna(median_val)

         for col, transformer in self._numeric_transformers.items():
customer_retention/stages/validation/data_quality_gate.py

@@ -1,6 +1,6 @@
 import time

-from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype,
+from customer_retention.core.compat import DataFrame, Timestamp, is_datetime64_any_dtype, to_datetime, to_numeric
 from customer_retention.core.config.column_config import ColumnType
 from customer_retention.core.config.pipeline_config import BronzeConfig, PipelineConfig

@@ -159,7 +159,7 @@ class DataQualityGate(ValidationGate):
                 continue

             if not is_datetime64_any_dtype(df_temp):
-                df_temp =
+                df_temp = to_datetime(df_temp, errors='coerce', format='mixed')

             future_dates = df_temp > Timestamp.now()
             future_count = future_dates.sum()
@@ -185,8 +185,8 @@ class DataQualityGate(ValidationGate):
         if len(df_temp) == 0:
             return issues

-        created =
-        firstorder =
+        created = to_datetime(df_temp['created'], errors='coerce', format='mixed')
+        firstorder = to_datetime(df_temp['firstorder'], errors='coerce', format='mixed')

         violations = created > firstorder
         violation_count = violations.sum()
@@ -214,7 +214,7 @@ class DataQualityGate(ValidationGate):

             if col_config.is_numeric() and column_data.dtype == 'object':
                 try:
-
+                    to_numeric(column_data.dropna(), errors='raise')
                     issues.append(self.create_issue(
                         "DQ040", "Numeric column stored as string",
                         Severity.MEDIUM, col_config.name, len(df), len(df),
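The DQ040 probe reads slightly backwards at first glance: the issue is appended only when `to_numeric(..., errors='raise')` succeeds, i.e. when an object-dtype column is in fact fully numeric. A standalone sketch of that control flow with invented data:

```python
# Flag an object column whose non-null values all parse as numbers.
import pandas as pd

column_data = pd.Series(["1", "2", "3", None], dtype="object")
try:
    pd.to_numeric(column_data.dropna(), errors="raise")
    print("flag DQ040: numeric column stored as string")
except (ValueError, TypeError):
    pass  # genuinely non-numeric text, nothing to flag
```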
customer_retention/stages/validation/data_validators.py

@@ -8,7 +8,7 @@ including duplicate detection, date logic validation, and value range validation
 from dataclasses import dataclass, field
 from typing import Any, Dict, List, Optional

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime
 from customer_retention.core.components.enums import Severity


@@ -249,8 +249,8 @@ class DataValidator:
         # Convert to datetime if needed
         df_dates = df[order].copy()
         for col in order:
-            if not
-                df_dates[col] =
+            if not is_datetime64_any_dtype(df_dates[col]):
+                df_dates[col] = to_datetime(df_dates[col], errors='coerce', format='mixed')

         # Check sequential ordering
         violations = []
customer_retention/stages/validation/leakage_gate.py

@@ -1,7 +1,7 @@
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Any, Dict, List, Optional

-from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype,
+from customer_retention.core.compat import DataFrame, Timestamp, is_numeric_dtype, notna, to_datetime
 from customer_retention.core.components.enums import Severity

 if TYPE_CHECKING:
@@ -124,7 +124,7 @@ class LeakageGate:
         for feature in numeric_features:
             try:
                 corr = df[feature].corr(df[self.target_column])
-                if
+                if notna(corr):
                     correlations[feature] = corr
             except Exception:
                 continue
@@ -153,7 +153,7 @@ class LeakageGate:

     @staticmethod
     def _parse_datetime(series, errors="coerce"):
-        return
+        return to_datetime(series, errors=errors, format='mixed')

     def _check_perfect_separation(
         self,
@@ -236,7 +236,7 @@ class LeakageGate:
         mean_0 = df[df[self.target_column] == target_values[0]][feature].mean()
         mean_1 = df[df[self.target_column] == target_values[1]][feature].mean()

-        if (
+        if (notna(var_0) and notna(var_1) and
             var_0 < 0.01 and var_1 < 0.01 and
             abs(mean_0 - mean_1) > 0.1):
             issues.append(LeakageIssue(
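The separation check above now guards its variance comparison with `notna`. Reconstructed as a standalone heuristic using the thresholds visible in the diff (0.01 variance, 0.1 mean difference) and invented data; this is a sketch of the idea, not churnkit's implementation:

```python
# A feature that is nearly constant within each target class yet differs between
# classes is treated as a leakage suspect.
import pandas as pd

df = pd.DataFrame({"target": [0, 0, 1, 1], "feature": [0.00, 0.01, 1.00, 1.00]})
grouped = df.groupby("target")["feature"]
var_0, var_1 = grouped.var().fillna(0.0)
mean_0, mean_1 = grouped.mean()
if pd.notna(var_0) and pd.notna(var_1) and var_0 < 0.01 and var_1 < 0.01 and abs(mean_0 - mean_1) > 0.1:
    print("possible leakage: feature nearly separates the target")
```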
customer_retention/stages/validation/timeseries_detector.py

@@ -11,7 +11,7 @@ from datetime import timedelta
 from enum import Enum
 from typing import Any, Dict, List, Optional, Tuple

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, is_datetime64_any_dtype, pd, to_datetime


 class DatasetType(Enum):
@@ -255,7 +255,7 @@ class TimeSeriesDetector:

         if timestamp_column and timestamp_column in df.columns:
             # Convert to datetime if needed
-            ts_series =
+            ts_series = to_datetime(
                 df[timestamp_column], errors='coerce', format='mixed'
             )
             valid_ts = ts_series.notna()
@@ -342,7 +342,7 @@ class TimeSeriesDetector:
            name_match = any(pattern in col_lower for pattern in self.TIMESTAMP_PATTERNS)

            # Check if column is datetime type
-           is_datetime =
+           is_datetime = is_datetime64_any_dtype(df[col])

            # Try to parse as datetime
            can_parse = False
@@ -350,7 +350,7 @@ class TimeSeriesDetector:
            try:
                with warnings.catch_warnings():
                    warnings.filterwarnings('ignore', category=FutureWarning)
-                   parsed =
+                   parsed = to_datetime(
                        df[col].head(100), errors='coerce', format='mixed'
                    )
                can_parse = parsed.notna().mean() > 0.8
@@ -389,7 +389,7 @@ class TimeSeriesDetector:
            if len(entity_data) < 2:
                continue

-           ts =
+           ts = to_datetime(
                entity_data[timestamp_column], errors='coerce', format='mixed'
            )
            ts = ts.dropna().sort_values()
@@ -525,7 +525,7 @@ class TimeSeriesValidator:

         # Convert timestamp
         df_copy = df.copy()
-        df_copy['_ts'] =
+        df_copy['_ts'] = to_datetime(
             df_copy[timestamp_column], errors='coerce', format='mixed'
         )
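Timestamp detection samples the first 100 values of a candidate column and accepts it when more than 80% parse; note that `format='mixed'` assumes pandas 2.0 or newer. A small sketch of that heuristic with invented values:

```python
# Parse a sample of the column leniently and measure the fraction that parses.
import pandas as pd

col = pd.Series(["2024-01-01", "01/02/2024", "not a date", "2024-03-05 10:00"])
parsed = pd.to_datetime(col.head(100), errors="coerce", format="mixed")
can_parse = parsed.notna().mean() > 0.8
print(can_parse)  # False here: only 3 of 4 sampled values parse
```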
customer_retention/transforms/ops.py

@@ -12,7 +12,7 @@ from typing import Any

 import numpy as np

-from customer_retention.core.compat import DataFrame, pd
+from customer_retention.core.compat import DataFrame, get_dummies, pd


 def _requires_column(fn):
@@ -113,7 +113,7 @@ def apply_cap_then_log(df: DataFrame, column: str) -> DataFrame:

 @_requires_column
 def apply_one_hot_encode(df: DataFrame, column: str) -> DataFrame:
-    return
+    return get_dummies(df, columns=[column], prefix=column)


 def apply_feature_select(df: DataFrame, column: str) -> DataFrame:
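`apply_one_hot_encode` is a thin wrapper over `get_dummies` with the column name as the prefix. Usage sketch with an assumed `plan` column:

```python
# One-hot encode a single categorical column, keeping the other columns as-is.
import pandas as pd

df = pd.DataFrame({"plan": ["basic", "pro", "basic"], "spend": [10, 30, 12]})
encoded = pd.get_dummies(df, columns=["plan"], prefix="plan")
print(encoded.columns.tolist())  # ['spend', 'plan_basic', 'plan_pro']
```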
Files without content changes (renamed paths only): 01a_a_temporal_text_deep_dive.ipynb, 02a_text_columns_deep_dive.ipynb, WHEEL, entry_points.txt, licenses/LICENSE.