churnkit 0.75.1a2__py3-none-any.whl → 0.75.1a3__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/METADATA +1 -1
- {churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/RECORD +38 -38
- customer_retention/__init__.py +1 -1
- customer_retention/analysis/visualization/chart_builder.py +6 -7
- customer_retention/core/compat/__init__.py +50 -0
- customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py +2 -1
- customer_retention/generators/pipeline_generator/renderer.py +7 -5
- customer_retention/generators/spec_generator/mlflow_pipeline_generator.py +2 -1
- customer_retention/stages/features/temporal_features.py +12 -12
- customer_retention/stages/profiling/pattern_analysis_config.py +4 -3
- customer_retention/stages/profiling/temporal_feature_analyzer.py +3 -3
- customer_retention/stages/profiling/temporal_pattern_analyzer.py +18 -5
- customer_retention/stages/profiling/temporal_quality_checks.py +9 -5
- customer_retention/stages/profiling/time_series_profiler.py +4 -5
- customer_retention/stages/profiling/time_window_aggregator.py +4 -2
- customer_retention/stages/transformation/datetime_transformer.py +10 -2
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/00_start_here.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb +0 -0
- {churnkit-0.75.1a2.data → churnkit-0.75.1a3.data}/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/WHEEL +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/entry_points.txt +0 -0
- {churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/licenses/LICENSE +0 -0
{churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: churnkit
-Version: 0.75.1a2
+Version: 0.75.1a3
 Summary: Structured ML framework for customer churn prediction -- from exploration notebooks to production pipelines, locally or on Databricks.
 Project-URL: Homepage, https://github.com/aladjov/CR
 Project-URL: Documentation, https://github.com/aladjov/CR/wiki
{churnkit-0.75.1a2.dist-info → churnkit-0.75.1a3.dist-info}/RECORD CHANGED
@@ -1,4 +1,4 @@
-customer_retention/__init__.py,sha256=
+customer_retention/__init__.py,sha256=9vKI748I497pRMAJl1x4_Th5hfFQRDfIHny7dk6gyQU,1114
 customer_retention/cli.py,sha256=Wdl540cZgu_9mV-hWmTV9jD3S8QTDR8Ik-5hQXYCvmg,2466
 customer_retention/analysis/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 customer_retention/analysis/jupyter_save_hook.py,sha256=iiNFIL83yOPX8BGUjCE6Pt5Kc8X-2adtE1_NZTMUaZQ,947
@@ -56,14 +56,14 @@ customer_retention/analysis/recommendations/transform/__init__.py,sha256=z5HPxPG
 customer_retention/analysis/recommendations/transform/power.py,sha256=4S-zZnLWrHVW4Q52xiyCPXJ8OweO28Tnld94kiFY5yw,3738
 customer_retention/analysis/recommendations/transform/scale.py,sha256=mKt6_UV0iQ1AiQwyHr3owhvkFWngecr6sTzgA4DX7Is,5081
 customer_retention/analysis/visualization/__init__.py,sha256=5dVikBgzwJuQZ-W0vN5uMB1lLjVmvJbEhROQw9_87PI,399
-customer_retention/analysis/visualization/chart_builder.py,sha256=
+customer_retention/analysis/visualization/chart_builder.py,sha256=TmeTgMRChrsr4bFevToTBAsYqyy0e9Z5sNFQ37avC48,111799
 customer_retention/analysis/visualization/console.py,sha256=dl_nEo6rXXSRfSnYkkJ4CsvBcE-n3l4mH9MIIjtw8Yw,2853
 customer_retention/analysis/visualization/display.py,sha256=9px602M7GrllJYthHLthjpVYd0jiTTAyY5WK69dd4s0,6625
 customer_retention/analysis/visualization/number_formatter.py,sha256=I1gUB0tEmfTQuDfOGYBZ3KRbq1rUd7ltR0vhDxFNRv8,1171
 customer_retention/artifacts/__init__.py,sha256=zTROqiS6zlkkuCZgR6YOB0Cvlsyr0TpRBYsOEorpDYw,118
 customer_retention/artifacts/fit_artifact_registry.py,sha256=aNfZC0Dgbc6jEwRR5keDEop9jo_tuL82hKO3ouCh5eY,5750
 customer_retention/core/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-customer_retention/core/compat/__init__.py,sha256=
+customer_retention/core/compat/__init__.py,sha256=dwamNiYIDzHEHpcmaphvR7wAwHslIqz6FoYjmQiR8Gg,7245
 customer_retention/core/compat/detection.py,sha256=6W_1LefgQriBtRY2PnvSCUGDt0X63oIUEEVjFqG3qH0,2492
 customer_retention/core/compat/ops.py,sha256=L-tAh4A3UEfRvePS6rAbhqb0QtZ_bN-TV7ZWpTkMFLA,1809
 customer_retention/core/compat/pandas_backend.py,sha256=14JPoYTW6X-a3UwFaemhmPr8zi_GTdZnyitmqPQODR0,1839
@@ -107,7 +107,7 @@ customer_retention/generators/notebook_generator/stages/s01_ingestion.py,sha256=
 customer_retention/generators/notebook_generator/stages/s02_profiling.py,sha256=kpI-3FfTYpr29NBX24bYFXB03eq3cKSQBftCRr15qxY,3794
 customer_retention/generators/notebook_generator/stages/s03_cleaning.py,sha256=cNY9AEoZx2r1hNmz2cD4zy36bV855GKavcWSTjp1Hc4,8084
 customer_retention/generators/notebook_generator/stages/s04_transformation.py,sha256=pzZOnWUfGjtGKzaqGfkN-Dipef1KUfErbSejMJv8Eo0,7623
-customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py,sha256=
+customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py,sha256=XksIe9u36SJyZzQxLv-v7rHEOp30PtwX-K-rpuk6iGc,5985
 customer_retention/generators/notebook_generator/stages/s06_feature_selection.py,sha256=FIPy6Dk6OI2LLo3vikq7i8EWkp_-kMbto1yN7Pgi7f4,4484
 customer_retention/generators/notebook_generator/stages/s07_model_training.py,sha256=yJ-FWSCamvAqjZrvxWaUAviWLPHHS4EQ2nrZMRbPey4,8076
 customer_retention/generators/notebook_generator/stages/s08_deployment.py,sha256=6IS1_9ZMvXBNMCTwGNZgSRU5Gh0kaats_CKJZ-z46wg,3556
@@ -124,11 +124,11 @@ customer_retention/generators/pipeline_generator/__init__.py,sha256=1SRNHmQGM-yY
 customer_retention/generators/pipeline_generator/findings_parser.py,sha256=YvlXmDPDXkNnCvScUDNycwkp1J2HXpbDUO43NiShAig,34527
 customer_retention/generators/pipeline_generator/generator.py,sha256=ZKLr34AM-XEswjoddJXciASUg2mL8jgsXjpQiaKy29M,6097
 customer_retention/generators/pipeline_generator/models.py,sha256=1vSUXzO1uZw194nPdDJ5vU3lZw35Am-UWQY0Ic9CvbE,4874
-customer_retention/generators/pipeline_generator/renderer.py,sha256=
+customer_retention/generators/pipeline_generator/renderer.py,sha256=bvGTU_AkRgFSa0_xiMJawuOg7EswP8GcErVBR661TYM,81872
 customer_retention/generators/spec_generator/__init__.py,sha256=vojlxKgLGnLHH9DNolB8mgL0_FsIfSSLmuHPXyr8bYY,782
 customer_retention/generators/spec_generator/databricks_generator.py,sha256=o_qAik7mXuwzC9c7xUTkno5GHUmfHz5F2dIWqTcaDzw,15416
 customer_retention/generators/spec_generator/generic_generator.py,sha256=I_glnOOsXDbL_v_ffxkeKwSYm5MCEB5qF9WAAZ8Woho,13962
-customer_retention/generators/spec_generator/mlflow_pipeline_generator.py,sha256=
+customer_retention/generators/spec_generator/mlflow_pipeline_generator.py,sha256=8-iUBgGThRJM5EmfJUwEoy8hJGZb7dZfuO6eh_QRH7A,27614
 customer_retention/generators/spec_generator/pipeline_spec.py,sha256=c8v1SWgTdeGmNs96l1hOS0qx1B1ua0iwPhw1I5w9OIo,10705
 customer_retention/integrations/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 customer_retention/integrations/adapters/__init__.py,sha256=Fgdp0ESROTUHnOb2RN9Ubo0A4BdfoenOGuUz61lHz8g,583
@@ -187,7 +187,7 @@ customer_retention/stages/features/feature_engineer.py,sha256=btVsdLHRKYk6E5xI-9
 customer_retention/stages/features/feature_manifest.py,sha256=EEBG7kdU_jWNcnDqdLHONIaJ-n2GcqLkjXjIxo3zn9w,9731
 customer_retention/stages/features/feature_selector.py,sha256=_CG3ZKVuZuqrwV9YvYvlTnXf0ahhsZNLmSRhf4nwNiQ,10089
 customer_retention/stages/features/interaction_features.py,sha256=P7aaHALbFcfEchJsesVPhVmOm-v2VmYkG90t8p2tNVA,4634
-customer_retention/stages/features/temporal_features.py,sha256=
+customer_retention/stages/features/temporal_features.py,sha256=KyXehl56Bt2tW7uP5uG_EJIgWIy8ee5qkjRSKxuJlhQ,9183
 customer_retention/stages/ingestion/__init__.py,sha256=kYVOe8kq7S0I_tjY-BcdZ1IsNWrYYjzDmoAcV2lhijQ,308
 customer_retention/stages/ingestion/load_result.py,sha256=sambVq085Lj1rAfIrbDA2BgPU3HsVVJJpgkVWojkpyc,860
 customer_retention/stages/ingestion/loaders.py,sha256=I0cgJo1XU47y_y7RKk6oELGVu1062qNP2GU5jJfgXVk,7705
@@ -216,7 +216,7 @@ customer_retention/stages/profiling/column_profiler.py,sha256=WZKwPxpDmCQiBJBHB-
 customer_retention/stages/profiling/distribution_analysis.py,sha256=9v-QY41cuQI_Fuvjkqx1Q3QAcsSK8ThU43t8PRgD0uo,17052
 customer_retention/stages/profiling/drift_detector.py,sha256=I1OYr37ew-XB7sVp6VARqjH0eKZA1Rx0eOQNRJZTOMs,12681
 customer_retention/stages/profiling/feature_capacity.py,sha256=fP_sK2KxU6zpdfnIcAW313N451SXqHT1wv9psd5WhSk,19598
-customer_retention/stages/profiling/pattern_analysis_config.py,sha256=
+customer_retention/stages/profiling/pattern_analysis_config.py,sha256=TivC8fY3xNQ561VgHgaSloDVl7zSDOi-no8BSr5Favg,22575
 customer_retention/stages/profiling/profile_result.py,sha256=NKKh1u2FmfBqnIbOEiqBh25IZDMm91h38RT7wzA8yQI,6350
 customer_retention/stages/profiling/quality_checks.py,sha256=ov8opsY4AoM9D6Yr_fGXsVwXfpmO0OeFfhdML-xfoIM,65678
 customer_retention/stages/profiling/relationship_detector.py,sha256=9WMM8YOIl-EWPY2P3PFuOENM9D1nm5lU5sDfZTE_chQ,9477
@@ -228,16 +228,16 @@ customer_retention/stages/profiling/segment_aware_outlier.py,sha256=PS5GXnf_g3D9
 customer_retention/stages/profiling/target_level_analyzer.py,sha256=XPhdHqTdK9zzBDqy-JyrTi6NFf07wRwIGsVEOAiR_dE,10491
 customer_retention/stages/profiling/temporal_analyzer.py,sha256=PXf4pYNcszp7N8_14MKFKXDku-fw2M_NLWN7jUsHd1Q,16102
 customer_retention/stages/profiling/temporal_coverage.py,sha256=r23s1qyB7o11ab_TTLOgb4q29OPA_crRshFpMLt4t_w,18561
-customer_retention/stages/profiling/temporal_feature_analyzer.py,sha256=
+customer_retention/stages/profiling/temporal_feature_analyzer.py,sha256=Gl8GLxPlDIzh-shUYrePYnjzYQUwsBB-sB4Voqf69O8,32364
 customer_retention/stages/profiling/temporal_feature_engineer.py,sha256=kTp5avXNsGGCYF_TBUg4KpbzfL79zz50zQ7ywVOxPkg,27141
-customer_retention/stages/profiling/temporal_pattern_analyzer.py,sha256=
-customer_retention/stages/profiling/temporal_quality_checks.py,sha256=
+customer_retention/stages/profiling/temporal_pattern_analyzer.py,sha256=-DBNhBfyEGhl0-rIgbpEGDJikyINDG55FP15JURKm_A,26814
+customer_retention/stages/profiling/temporal_quality_checks.py,sha256=SosW3omX2c025UIdlXpLEBJCsAsIvoGXMbxw6tzBocA,13750
 customer_retention/stages/profiling/temporal_target_analyzer.py,sha256=eeZlUhTWZfCftwgm_dySi1feRLuoU9SRLL_r_4jgN5g,8785
 customer_retention/stages/profiling/text_embedder.py,sha256=ck7WIq7pGC7xgEzMQr7fYdHcJegYR6wfdh3z32WUiK8,3038
 customer_retention/stages/profiling/text_processor.py,sha256=spdfwVSEU07aYbl2bIsg_INOBt3Js-IA15WVkjf1ask,4474
 customer_retention/stages/profiling/text_reducer.py,sha256=ilSuUAu0dHUyRGTNg8TzoCEd-EAyXKvoAm4uGqwlSQs,2409
-customer_retention/stages/profiling/time_series_profiler.py,sha256=
-customer_retention/stages/profiling/time_window_aggregator.py,sha256=
+customer_retention/stages/profiling/time_series_profiler.py,sha256=RRpaHrd6CXzat6HTdowIFxoZQyzqC3LlO9y-q_tsv2g,10315
+customer_retention/stages/profiling/time_window_aggregator.py,sha256=SD53z3Itz2F3ptfYHRmlW4d7IbrXvJoJbsPw0VOoUWI,15909
 customer_retention/stages/profiling/type_detector.py,sha256=VgYHWcBGepyJKNdY1FKgb9scOaosN6fDY_-WiTjfoAg,14726
 customer_retention/stages/profiling/window_recommendation.py,sha256=Apd_PDFpo49HJJzldTcwzzgJjBzEfd8mbGboBwHhzGw,13354
 customer_retention/stages/temporal/__init__.py,sha256=f86XiSUMKQgeTLyOsu89IJcafOPjdBIR9bH_hhrY8b8,6135
@@ -254,7 +254,7 @@ customer_retention/stages/temporal/timestamp_manager.py,sha256=EisQM4_e14wsdqVxz
 customer_retention/stages/transformation/__init__.py,sha256=6XQGYKYNqdOuxlX6IujtVqRZ099pS8X_ATd6mLqwVtQ,783
 customer_retention/stages/transformation/binary_handler.py,sha256=ObwL90YP3ivwOJONBikzZouUoBz-YCTcxWybfwA5ddc,3201
 customer_retention/stages/transformation/categorical_encoder.py,sha256=T0mLgJ6cf2kLkha4HclAeeaxlz7cVJBWYEsEt8fs5KA,10145
-customer_retention/stages/transformation/datetime_transformer.py,sha256=
+customer_retention/stages/transformation/datetime_transformer.py,sha256=60qQUizDS_h-i6BNOAzDoOJxC1T1OEJE3ZguSA3mimI,3716
 customer_retention/stages/transformation/numeric_transformer.py,sha256=wqC2aUfXargeOph8d9F4P2wLet4lnFOKoI9x1mpJucw,6367
 customer_retention/stages/transformation/pipeline.py,sha256=qqbpisjN4uZ050eishlEj037u2mPKEwxGG0o7GruoQM,11278
 customer_retention/stages/validation/__init__.py,sha256=8Klgpez2ApVM1n1HUWcaGjaa21-aC-ReaZIVj7zHFh4,2380
@@ -276,27 +276,27 @@ customer_retention/transforms/artifact_store.py,sha256=FYLpDcv2N6-dUTX5RPEIK3aCW
 customer_retention/transforms/executor.py,sha256=oML5dCidxbW_q6YUkAwWcutYP6bIFB6IdD3BvemK45A,6304
 customer_retention/transforms/fitted.py,sha256=3pNvnae-P3t3bKMeZz1Bl0xww-feapIYdoeTY6aUtI8,3278
 customer_retention/transforms/ops.py,sha256=Xg2g9UOOudq_y9Hf3oWsjpqw3dEoykQR5pDSoyW8GX0,4294
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
-churnkit-0.75.
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/00_start_here.ipynb,sha256=zCyhftEd3v9fc0Ta6wvA6b-9LcoGzRi8bS1tMZ3iu9w,21911
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01_data_discovery.ipynb,sha256=up0X3oDJ5sAo1-tbqMyZj_f1h6D542G2uAxjVmtYCOI,46430
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01a_a_temporal_text_deep_dive.ipynb,sha256=uai8T3iJSqOrabBQnVi8Z0k8zZGVgs_VVQWRHyXN8QU,33690
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01a_temporal_deep_dive.ipynb,sha256=fC1ASNtvI8X1lAe-Lzcw3oX2cptDC-ymPeEtKKWhg20,67326
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01b_temporal_quality.ipynb,sha256=RU5hxgrTVMZs1ytChVv1t49WpTO0Oj6B_Fu8g0xS0To,23039
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01c_temporal_patterns.ipynb,sha256=ZGYfztP6JhOEwPmTYdC0l7w579fKXcNEJXq-PnCLc2I,153167
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/01d_event_aggregation.ipynb,sha256=-FT3SoBU0fhaZxGeTo-_UQl6riCrtoJaFnUg31opk64,63244
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/02_column_deep_dive.ipynb,sha256=mbP2LQWsXDyTsWg0bhrCBHEfHsEer_XOXRYV9f8JxAk,60250
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/02a_text_columns_deep_dive.ipynb,sha256=M9YN8yAjjuC6ZaUlc-rVqVLEkWd7Rc_GNILHS9qO3PU,29704
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/03_quality_assessment.ipynb,sha256=H49LLmn1PHbcbAvSQfteESRGk125QwkPI5qbLk3yZgc,68595
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/04_relationship_analysis.ipynb,sha256=Rr-B4-xg0ILuAIgztlZkiGJdTzLuNjOqBFxO8W4o9iU,78624
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/05_multi_dataset.ipynb,sha256=bBxkuZyTl1yZg4kMXO87WRjgZMhj_6hwLGX6m3XC270,62664
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/06_feature_opportunities.ipynb,sha256=cBJF5o4z3Z-dustQ4CVklnfTcQ8saG97tlgswWK9uWE,67409
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/07_modeling_readiness.ipynb,sha256=IiA04fyb-l097Glp3MtR03vPjQsZlS1Icg-hjEHa_Dg,28376
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/08_baseline_experiments.ipynb,sha256=KmjhnDf1JdpEiIcdfQ-ZFo_at6t9JRC30B6NmmvMBmg,34226
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/09_business_alignment.ipynb,sha256=tMNfGM7AH50N1ONzHhGW2HZLpQwraIxVzOiVnI-10X8,17214
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/10_spec_generation.ipynb,sha256=KeUdfL9Mvdi6023XpnfZ6oLEDNZaWiIHUfsAWig24mE,42847
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/11_scoring_validation.ipynb,sha256=5fi3eHMm03ZKZgdFAXMgydtZ3qX2TtR3L9bZS2MpWPE,49937
+churnkit-0.75.1a3.data/data/share/churnkit/exploration_notebooks/12_view_documentation.ipynb,sha256=aQF7CG8HxckqUKOKqnmZgMkSvfVzyO2LlYPrymLYjBY,4405
+churnkit-0.75.1a3.dist-info/METADATA,sha256=hYbCUfYKPP5jeW_YjZRN8j4M6msQHdApmOx7-KEJDmU,12736
+churnkit-0.75.1a3.dist-info/WHEEL,sha256=WLgqFyCfm_KASv4WHyYy0P3pM_m7J5L9k2skdKLirC8,87
+churnkit-0.75.1a3.dist-info/entry_points.txt,sha256=swQFVe-jjgQSBJQNO2Ulkz2F5odaE-TsnlTor3HQBjw,70
+churnkit-0.75.1a3.dist-info/licenses/LICENSE,sha256=Bud8Oj25tnpoIuXCWW0xcSfmGPeEZAAHrDRoKdSYtZY,11344
+churnkit-0.75.1a3.dist-info/RECORD,,
customer_retention/__init__.py CHANGED
customer_retention/analysis/visualization/chart_builder.py CHANGED
@@ -5,7 +5,7 @@ import numpy as np
 import plotly.express as px
 import plotly.graph_objects as go
 
-from customer_retention.core.compat import DataFrame, Series, ensure_pandas_series, to_pandas
+from customer_retention.core.compat import DataFrame, Series, ensure_pandas_series, safe_to_datetime, to_pandas
 
 from .number_formatter import NumberFormatter
 
@@ -532,9 +532,8 @@ class ChartBuilder:
         dates: Series,
         title: Optional[str] = None,
     ) -> go.Figure:
-        import pandas as pd
         dates = ensure_pandas_series(dates)
-        parsed = pd.to_datetime(dates, errors="coerce").dropna()
+        parsed = safe_to_datetime(dates, errors="coerce").dropna()
 
         if len(parsed) == 0:
             fig = go.Figure()
@@ -1029,7 +1028,7 @@ class ChartBuilder:
         """
         import pandas as pd
         dates = ensure_pandas_series(dates)
-        parsed = pd.to_datetime(dates, errors="coerce")
+        parsed = safe_to_datetime(dates, errors="coerce")
 
         if values is not None:
             values = ensure_pandas_series(values)
@@ -1078,7 +1077,7 @@ class ChartBuilder:
         """Create a month x day-of-week heatmap for pattern discovery."""
         import pandas as pd
         dates = ensure_pandas_series(dates)
-        parsed = pd.to_datetime(dates, errors="coerce").dropna()
+        parsed = safe_to_datetime(dates, errors="coerce").dropna()
 
         if values is not None:
             values = ensure_pandas_series(values)
@@ -1127,7 +1126,7 @@ class ChartBuilder:
         dates = ensure_pandas_series(dates)
         values = ensure_pandas_series(values)
 
-        df = pd.DataFrame({"date": pd.to_datetime(dates), "value": values}).dropna()
+        df = pd.DataFrame({"date": safe_to_datetime(dates), "value": values}).dropna()
         df = df.sort_values("date")
 
         df["rolling_mean"] = df["value"].rolling(window=window, center=True, min_periods=1).mean()
@@ -2222,7 +2221,7 @@ class ChartBuilder:
         import pandas as pd
         with warnings.catch_warnings():
             warnings.simplefilter("ignore")
-            dates = pd.to_datetime(pd.Series(series), errors='coerce').dropna()
+            dates = safe_to_datetime(pd.Series(series), errors='coerce').dropna()
         if len(dates) == 0:
             return
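A side note on the errors="coerce" idiom recurring in these ChartBuilder call sites: unparseable entries become NaT and dropna() then discards them, so charts are built from the parseable subset rather than raising. A minimal plain-pandas illustration with hypothetical values:

    import pandas as pd

    s = pd.Series(["2024-01-01", "not a date", None])
    parsed = pd.to_datetime(s, errors="coerce")  # invalid entries become NaT
    print(parsed.dropna())                       # only 2024-01-01 survives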
customer_retention/core/compat/__init__.py CHANGED
@@ -147,6 +147,54 @@ def is_float_dtype(arr_or_dtype: Any) -> bool:
     return _pandas.api.types.is_float_dtype(arr_or_dtype)
 
 
+def _infer_epoch_unit(value: int) -> str:
+    """Infer the epoch unit from a representative integer timestamp value.
+
+    Spark LongType timestamps become int64 after ``to_pandas()``. The bare
+    ``pd.to_datetime()`` call assumes nanoseconds for large integers, which
+    silently produces wrong dates when the source used seconds or milliseconds.
+    This helper picks the right ``unit`` based on magnitude.
+    """
+    abs_val = abs(int(value))
+    if abs_val > 1e17:
+        return "ns"
+    if abs_val > 1e14:
+        return "us"
+    if abs_val > 1e11:
+        return "ms"
+    return "s"
+
+
+def safe_to_datetime(series: Any, **kwargs: Any) -> _pandas.Series:
+    """Convert a Series to datetime, handling Spark LongType epoch integers.
+
+    Like ``pd.to_datetime`` but automatically detects integer epoch columns
+    and passes the correct ``unit`` parameter. Any extra *kwargs* are
+    forwarded to ``pd.to_datetime``.
+    """
+    series = ensure_pandas_series(series)
+    if _pandas.api.types.is_datetime64_any_dtype(series):
+        return series
+    if _pandas.api.types.is_integer_dtype(series):
+        non_null = series.dropna()
+        if len(non_null) > 0:
+            unit = _infer_epoch_unit(non_null.iloc[0])
+            return _pandas.to_datetime(series, unit=unit, **kwargs)
+    return _pandas.to_datetime(series, **kwargs)
+
+
+def ensure_datetime_column(df: _pandas.DataFrame, column: str) -> _pandas.DataFrame:
+    """Ensure *column* in a **pandas** DataFrame is ``datetime64``.
+
+    Call this after ``to_pandas()`` to safely convert columns that may have
+    arrived as int64 epoch values from Spark. Returns the DataFrame
+    (modified in-place).
+    """
+    if not _pandas.api.types.is_datetime64_any_dtype(df[column]):
+        df[column] = safe_to_datetime(df[column])
+    return df
+
+
 class PandasCompat:
     @staticmethod
     def value_counts_normalize(series: Any, normalize: bool = False) -> Any:
@@ -208,6 +256,8 @@ __all__ = [
     "is_notebook",
     "get_display_function",
     "get_dbutils",
+    "safe_to_datetime",
+    "ensure_datetime_column",
    "ops",
     "DataOps",
 ]
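The magnitude thresholds in _infer_epoch_unit are easy to sanity-check. A small sketch (hypothetical values, plain pandas rather than the compat wrappers) of the failure mode the docstring describes and the unit-aware fix:

    import pandas as pd

    # Epoch seconds for 2024-01-01T00:00:00Z, the shape a Spark LongType
    # timestamp takes after conversion to pandas.
    s = pd.Series([1_704_067_200])

    # Bare pd.to_datetime treats large integers as nanoseconds since epoch:
    print(pd.to_datetime(s).iloc[0])            # 1970-01-01 00:00:01.704067200 (wrong)

    # 1.7e9 is below the 1e11 millisecond threshold, so the unit resolves to "s":
    print(pd.to_datetime(s, unit="s").iloc[0])  # 2024-01-01 00:00:00 (intended)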
customer_retention/generators/notebook_generator/stages/s05_feature_engineering.py CHANGED
@@ -55,7 +55,8 @@ else:
 else:
     print("Warning: No feature_timestamp column found. Using current date (may cause leakage).")
     if "signup_date" in df.columns:
-        df["tenure_days"] = (pd.Timestamp.now() - pd.to_datetime(df["signup_date"])).dt.days'''),
+        from customer_retention.core.compat import safe_to_datetime
+        df["tenure_days"] = (pd.Timestamp.now() - safe_to_datetime(df["signup_date"])).dt.days'''),
             self.cb.section("Validate Point-in-Time Correctness"),
             self.cb.code('''if "feature_timestamp" in df.columns:
     pit_report = PointInTimeJoiner.validate_temporal_integrity(df)
customer_retention/generators/pipeline_generator/renderer.py CHANGED
@@ -290,6 +290,7 @@ from pathlib import Path
 {% if ops %}
 from customer_retention.transforms import {{ ops | sort | join(', ') }}
 {% endif %}
+from customer_retention.core.compat import ensure_datetime_column, safe_to_datetime
 from config import SOURCES, get_bronze_path{{ ', RAW_SOURCES' if config.lifecycle else '' }}
 
 SOURCE_NAME = "{{ source }}"
@@ -356,7 +357,7 @@ def _load_raw_events():
 {% if config.lifecycle.include_recency_bucket %}
 
 def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
-    raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
+    ensure_datetime_column(raw_df, TIME_COLUMN)
     reference_date = raw_df[TIME_COLUMN].max()
     entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
     entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
@@ -398,7 +399,7 @@ def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
 {% if config.lifecycle.include_cyclical_features %}
 
 def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
-    raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
+    ensure_datetime_column(raw_df, TIME_COLUMN)
     mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
     df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
     df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
@@ -1447,6 +1448,7 @@ from pathlib import Path
 {% if ops %}
 from customer_retention.transforms import {{ ops | sort | join(', ') }}
 {% endif %}
+from customer_retention.core.compat import ensure_datetime_column, safe_to_datetime
 from config import PRODUCTION_DIR, RAW_SOURCES, TARGET_COLUMN
 
 SOURCE_NAME = "{{ source }}"
@@ -1502,7 +1504,7 @@ AGG_FUNCS = {{ config.aggregation.agg_funcs }}
 
 def apply_reshaping(df: pd.DataFrame) -> pd.DataFrame:
 {% if config.aggregation %}
-    df[TIME_COLUMN] = pd.to_datetime(df[TIME_COLUMN])
+    ensure_datetime_column(df, TIME_COLUMN)
     reference_date = df[TIME_COLUMN].max()
     result = df.groupby(ENTITY_COLUMN).agg("first")[[]]
     if TARGET_COLUMN in df.columns:
@@ -1535,7 +1537,7 @@ def _load_raw_events():
 {% if config.lifecycle.include_recency_bucket %}
 
 def add_recency_tenure(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
-    raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
+    ensure_datetime_column(raw_df, TIME_COLUMN)
     reference_date = raw_df[TIME_COLUMN].max()
     entity_stats = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].agg(["min", "max"])
     entity_stats["days_since_last"] = (reference_date - entity_stats["max"]).dt.days
@@ -1577,7 +1579,7 @@ def add_lifecycle_quadrant(df: pd.DataFrame) -> pd.DataFrame:
 {% if config.lifecycle.include_cyclical_features %}
 
 def add_cyclical_features(df: pd.DataFrame, raw_df: pd.DataFrame) -> pd.DataFrame:
-    raw_df[TIME_COLUMN] = pd.to_datetime(raw_df[TIME_COLUMN])
+    ensure_datetime_column(raw_df, TIME_COLUMN)
     mean_dow = raw_df.groupby(ENTITY_COLUMN)[TIME_COLUMN].apply(lambda x: x.dt.dayofweek.mean())
     df = df.merge(mean_dow.rename("mean_dow"), left_on=ENTITY_COLUMN, right_index=True, how="left")
     df["dow_sin"] = np.sin(2 * np.pi * df["mean_dow"] / 7)
customer_retention/generators/spec_generator/mlflow_pipeline_generator.py CHANGED
@@ -395,6 +395,7 @@ def log_data_quality_metrics(df: pd.DataFrame, prefix: str = "data"):
     code_lines = [
         "def engineer_features(df: pd.DataFrame) -> pd.DataFrame:",
        '    """Engineer features based on exploration findings."""',
+        "    from customer_retention.core.compat import safe_to_datetime",
         "    df = df.copy()",
         "    new_features = []",
         "",
@@ -411,7 +412,7 @@ def log_data_quality_metrics(df: pd.DataFrame, prefix: str = "data"):
         code_lines.extend([
             f"    # Datetime features from {col_name}",
             f"    if '{col_name}' in df.columns:",
-            f"        df['{col_name}'] = pd.to_datetime(df['{col_name}'], errors='coerce')",
+            f"        df['{col_name}'] = safe_to_datetime(df['{col_name}'], errors='coerce')",
             "",
         ])
customer_retention/stages/features/temporal_features.py CHANGED
@@ -10,7 +10,7 @@ from dataclasses import dataclass, field
 from enum import Enum
 from typing import List, Optional, Union
 
-from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd
+from customer_retention.core.compat import DataFrame, Series, Timedelta, Timestamp, pd, safe_to_datetime, to_pandas
 
 
 class ReferenceDateSource(Enum):
@@ -122,19 +122,19 @@ class TemporalFeatureGenerator:
         if not self._is_fitted:
             raise ValueError("Generator not fitted. Call fit() first.")
 
-        result = df.copy()
+        result = to_pandas(df).copy()
         self.generated_features = []
         warnings_list = []
 
         # Get reference date(s) for this transform
         if self.reference_date_source in [ReferenceDateSource.COLUMN, ReferenceDateSource.FEATURE_TIMESTAMP]:
-            ref_dates = pd.to_datetime(df[self.reference_date_column])
+            ref_dates = safe_to_datetime(df[self.reference_date_column])
         else:
             ref_dates = self.reference_date
 
         # Tenure features
         if self.created_column and self.created_column in df.columns:
-            created = pd.to_datetime(df[self.created_column])
+            created = safe_to_datetime(df[self.created_column])
             tenure_days = self._compute_days_diff(ref_dates, created)
             result["tenure_days"] = tenure_days
             self.generated_features.append("tenure_days")
@@ -154,7 +154,7 @@ class TemporalFeatureGenerator:
 
         # Recency features
         if self.last_order_column and self.last_order_column in df.columns:
-            last_order = pd.to_datetime(df[self.last_order_column])
+            last_order = safe_to_datetime(df[self.last_order_column])
             days_since_last = self._compute_days_diff(ref_dates, last_order)
             result["days_since_last_order"] = days_since_last
             self.generated_features.append("days_since_last_order")
@@ -162,8 +162,8 @@ class TemporalFeatureGenerator:
         # Activation features
         if (self.first_order_column and self.first_order_column in df.columns and
                 self.created_column and self.created_column in df.columns):
-            created = pd.to_datetime(df[self.created_column])
-            first_order = pd.to_datetime(df[self.first_order_column])
+            created = safe_to_datetime(df[self.created_column])
+            first_order = safe_to_datetime(df[self.first_order_column])
             days_to_first = self._compute_days_diff(first_order, created)
             result["days_to_first_order"] = days_to_first
             self.generated_features.append("days_to_first_order")
@@ -171,8 +171,8 @@ class TemporalFeatureGenerator:
         # Active period
         if (self.first_order_column and self.first_order_column in df.columns and
                 self.last_order_column and self.last_order_column in df.columns):
-            first_order = pd.to_datetime(df[self.first_order_column])
-            last_order = pd.to_datetime(df[self.last_order_column])
+            first_order = safe_to_datetime(df[self.first_order_column])
+            last_order = safe_to_datetime(df[self.last_order_column])
             active_period = self._compute_days_diff(last_order, first_order)
             result["active_period_days"] = active_period
             self.generated_features.append("active_period_days")
@@ -210,21 +210,21 @@ class TemporalFeatureGenerator:
                 raise ValueError(
                     "date_column must be provided when source is MAX_DATE"
                 )
-            self.reference_date = pd.to_datetime(df[self.date_column]).max()
+            self.reference_date = safe_to_datetime(df[self.date_column]).max()
 
         elif self.reference_date_source == ReferenceDateSource.COLUMN:
             if self.reference_date_column is None:
                 raise ValueError(
                     "reference_date_column must be provided when source is COLUMN"
                 )
-            self.reference_date = pd.to_datetime(df[self.reference_date_column])
+            self.reference_date = safe_to_datetime(df[self.reference_date_column])
 
         elif self.reference_date_source == ReferenceDateSource.FEATURE_TIMESTAMP:
             if "feature_timestamp" not in df.columns:
                 raise ValueError(
                     "feature_timestamp column required when source is FEATURE_TIMESTAMP"
                 )
-            self.reference_date = pd.to_datetime(df["feature_timestamp"])
+            self.reference_date = safe_to_datetime(df["feature_timestamp"])
             self.reference_date_column = "feature_timestamp"
 
     def _compute_days_diff(
customer_retention/stages/profiling/pattern_analysis_config.py CHANGED
@@ -4,7 +4,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 import pandas as pd
 
-from customer_retention.core.compat import DataFrame
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, to_pandas
 
 
 @dataclass
@@ -216,12 +216,13 @@ class SparklineDataBuilder:
         self.freq = freq
 
     def build(self, df: DataFrame, columns: List[str]) -> Tuple[List[SparklineData], bool]:
-
+        df = to_pandas(df)
         has_target = self.target_column is not None and self.target_column in df.columns
         if has_target:
             validate_not_event_level(df, self.entity_column, self.target_column)
         df_work = self._prepare_working_df(df, has_target)
-        df_work['_period'] = pd.to_datetime(df_work[self.time_column]).dt.to_period(self.freq).dt.start_time
+        ensure_datetime_column(df_work, self.time_column)
+        df_work['_period'] = df_work[self.time_column].dt.to_period(self.freq).dt.start_time
         results = [self._build_sparkline_for_column(df_work, col, has_target)
                    for col in columns if col in df_work.columns]
         return results, has_target
customer_retention/stages/profiling/temporal_feature_analyzer.py CHANGED
@@ -5,7 +5,7 @@ from typing import Any, Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, pd, qcut, to_datetime
+from customer_retention.core.compat import DataFrame, ensure_datetime_column, pd, qcut, to_pandas
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -626,8 +626,8 @@ class TemporalFeatureAnalyzer:
         return next_priority
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
-        df = df.copy()
-        df[self.time_column] = to_datetime(df[self.time_column])
+        df = to_pandas(df).copy()
+        ensure_datetime_column(df, self.time_column)
         return df
 
     def _validate_event_level_target_usage(self, df: DataFrame, target_column: Optional[str]) -> None:
customer_retention/stages/profiling/temporal_pattern_analyzer.py CHANGED
@@ -5,7 +5,15 @@ from typing import Dict, List, Optional, Tuple
 import numpy as np
 from scipy import stats
 
-from customer_retention.core.compat import DataFrame, Timestamp, cut, pd, to_datetime, to_pandas
+from customer_retention.core.compat import (
+    DataFrame,
+    Timestamp,
+    cut,
+    ensure_datetime_column,
+    pd,
+    safe_to_datetime,
+    to_pandas,
+)
 from customer_retention.core.utils import compute_effect_size
 
 
@@ -177,6 +185,8 @@ def generate_trend_recommendations(trend: TrendResult, mean_value: float = 1.0)
 
 
 def analyze_cohort_distribution(first_events: DataFrame, time_column: str) -> CohortDistribution:
+    first_events = to_pandas(first_events)
+    ensure_datetime_column(first_events, time_column)
     years = first_events[time_column].dt.year
     year_counts = years.value_counts().sort_index().to_dict()
     total = len(first_events)
@@ -232,6 +242,7 @@ def compute_recency_buckets(
     reference_date: Timestamp, bucket_edges: Optional[List[float]] = None
 ) -> List[RecencyBucketStats]:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
     edges = bucket_edges or DEFAULT_BUCKET_EDGES
     labels = _generate_bucket_labels(edges)
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
@@ -298,6 +309,7 @@ def _diagnose_anomaly_pattern(
     df: DataFrame, entity_column: str, time_column: str, target_column: str
 ) -> AnomalyDiagnostics:
     df = to_pandas(df)
+    ensure_datetime_column(df, time_column)
     entity_target = df.groupby(entity_column)[target_column].first()
     target_1_pct = float(entity_target.mean() * 100)
     target_1_is_minority = target_1_pct < 50
@@ -436,6 +448,7 @@ def compare_recency_by_target(
     df = to_pandas(df)
     if target_column not in df.columns:
         return None
+    ensure_datetime_column(df, time_column)
     ref_date = reference_date or df[time_column].max()
     entity_last = df.groupby(entity_column)[time_column].max().reset_index()
     entity_last["recency_days"] = (ref_date - entity_last[time_column]).dt.days
@@ -502,7 +515,7 @@ class TemporalPatternAnalyzer:
         if len(df_clean) < 3:
             return self._unknown_trend()
 
-        time_col = to_datetime(df_clean[self.time_column])
+        time_col = safe_to_datetime(df_clean[self.time_column])
         x = (time_col - time_col.min()).dt.total_seconds() / 86400
         y = df_clean[value_column].values
 
@@ -587,9 +600,10 @@ class TemporalPatternAnalyzer:
             return pd.DataFrame()
 
         df_copy = to_pandas(df).copy()
+        ensure_datetime_column(df_copy, cohort_column)
         entity_first_event = df_copy.groupby(entity_column)[cohort_column].min()
         df_copy["_cohort"] = df_copy[entity_column].map(entity_first_event)
-        df_copy["_cohort"] = to_datetime(df_copy["_cohort"]).dt.to_period(period)
+        df_copy["_cohort"] = df_copy["_cohort"].dt.to_period(period)
 
         entity_cohorts = df_copy.groupby(entity_column)["_cohort"].first().reset_index()
         entity_cohorts.columns = [entity_column, "_cohort"]
@@ -615,11 +629,10 @@ class TemporalPatternAnalyzer:
             return RecencyResult(avg_recency_days=0, median_recency_days=0, min_recency_days=0, max_recency_days=0)
 
         df = to_pandas(df)
+        ensure_datetime_column(df, self.time_column)
         ref_date = reference_date or Timestamp.now()
-        to_datetime(df[self.time_column])
 
         entity_last = df.groupby(entity_column)[self.time_column].max()
-        entity_last = to_datetime(entity_last)
         recency_days = (ref_date - entity_last).dt.days
 
         target_correlation = None
customer_retention/stages/profiling/temporal_quality_checks.py CHANGED
@@ -1,7 +1,7 @@
 from dataclasses import dataclass, field
 from typing import Optional
 
-from customer_retention.core.compat import DataFrame, Timestamp, to_datetime
+from customer_retention.core.compat import DataFrame, Timestamp, ensure_datetime_column, safe_to_datetime, to_pandas
 from customer_retention.core.components.enums import Severity
 
 
@@ -38,6 +38,7 @@ class DuplicateEventCheck(TemporalQualityCheck):
         self.time_column = time_column
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) == 0:
             return self._pass_result("No data to check")
 
@@ -70,11 +71,12 @@ class TemporalGapCheck(TemporalQualityCheck):
         self.max_gap_multiple = max_gap_multiple
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) < 2:
             return self._pass_result("Insufficient data to check gaps")
 
-        df = df.sort_values(self.time_column)
-        time_col = to_datetime(df[self.time_column])
+        ensure_datetime_column(df, self.time_column)
+        time_col = df.sort_values(self.time_column)[self.time_column]
         diffs_days = time_col.diff().dropna().dt.total_seconds() / 86400
         expected_days = self.FREQ_TO_DAYS.get(self.expected_frequency, 1)
         threshold_days = expected_days * self.max_gap_multiple
@@ -108,10 +110,11 @@ class FutureDateCheck(TemporalQualityCheck):
         self.reference_date = reference_date or Timestamp.now()
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) == 0:
             return self._pass_result("No data to check")
 
-        time_col = to_datetime(df[self.time_column])
+        time_col = safe_to_datetime(df[self.time_column])
         future_mask = time_col > self.reference_date
         future_count = future_mask.sum()
 
@@ -138,10 +141,11 @@ class EventOrderCheck(TemporalQualityCheck):
         self.time_column = time_column
 
     def run(self, df: DataFrame) -> TemporalQualityResult:
+        df = to_pandas(df)
         if len(df) < 2:
             return self._pass_result("Insufficient data to check ordering")
 
-        df_check = df.assign(_parsed_time=to_datetime(df[self.time_column]))
+        df_check = df.assign(_parsed_time=safe_to_datetime(df[self.time_column]))
         collision_counts = df_check.groupby([self.entity_column, "_parsed_time"]).size()
         ambiguous = collision_counts[collision_counts > 1]
         ambiguous_count = ambiguous.sum() - len(ambiguous)
customer_retention/stages/profiling/time_series_profiler.py CHANGED
@@ -6,9 +6,8 @@ import numpy as np
 from customer_retention.core.compat import (
     DataFrame,
     Timestamp,
-
+    ensure_datetime_column,
     pd,
-    to_datetime,
     to_pandas,
 )
 
@@ -196,11 +195,12 @@ class TimeSeriesProfiler:
         self.time_column = time_column
 
     def profile(self, df: DataFrame) -> TimeSeriesProfile:
+        df = self._prepare_dataframe(df)
+
         if len(df) == 0:
             return self._empty_profile()
 
         self._validate_columns(df)
-        df = self._prepare_dataframe(df)
 
         total_events = len(df)
         unique_entities = df[self.entity_column].nunique()
@@ -231,8 +231,7 @@ class TimeSeriesProfiler:
 
     def _prepare_dataframe(self, df: DataFrame) -> DataFrame:
         df = to_pandas(df).copy()
-
-        df[self.time_column] = to_datetime(df[self.time_column])
+        ensure_datetime_column(df, self.time_column)
         return df
 
     def _compute_entity_lifecycles(self, df: DataFrame) -> DataFrame:
customer_retention/stages/profiling/time_window_aggregator.py CHANGED
@@ -10,9 +10,10 @@ from customer_retention.core.compat import (
     DataFrame,
     Timedelta,
     Timestamp,
+    ensure_datetime_column,
     is_numeric_dtype,
     pd,
-    to_datetime,
+    to_pandas,
 )
 
 
@@ -82,11 +83,12 @@ class TimeWindowAggregator:
         include_recency: bool = False, include_tenure: bool = False,
         exclude_columns: Optional[List[str]] = None,
     ) -> DataFrame:
+        df = to_pandas(df)
         if len(df) == 0:
             return pd.DataFrame()
 
         df = df.copy()
-        df[self.time_column] = to_datetime(df[self.time_column])
+        ensure_datetime_column(df, self.time_column)
         reference_date = self._validate_reference_date(df, reference_date)
         parsed_windows = [TimeWindow.from_string(w) for w in (windows or ["30d"])]
 
customer_retention/stages/transformation/datetime_transformer.py CHANGED
@@ -3,7 +3,14 @@ from typing import Optional, Union
 
 import numpy as np
 
-from customer_retention.core.compat import DataFrame, Series, Timestamp, is_datetime64_any_dtype, to_datetime
+from customer_retention.core.compat import (
+    DataFrame,
+    Series,
+    Timestamp,
+    ensure_pandas_series,
+    is_datetime64_any_dtype,
+    safe_to_datetime,
+)
 
 
 @dataclass
@@ -92,6 +99,7 @@ class DatetimeTransformer:
         )
 
     def _ensure_datetime(self, series: Series) -> Series:
+        series = ensure_pandas_series(series)
         if is_datetime64_any_dtype(series):
             return series
-        return to_datetime(series, errors='coerce')
+        return safe_to_datetime(series, errors='coerce')
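The same two-step pattern runs through the profiling and transformation changes above: materialize with to_pandas(), then normalize the time column in place. A usage sketch of the new helpers (frame and column names hypothetical; the int64 values are epoch milliseconds, which fall in the (1e11, 1e14] band and so resolve to unit="ms"):

    import pandas as pd

    from customer_retention.core.compat import ensure_datetime_column, safe_to_datetime

    # Hypothetical frame whose time column arrived as int64 epoch milliseconds.
    df = pd.DataFrame({"event_ts": [1_704_067_200_000, 1_706_745_600_000]})

    ensure_datetime_column(df, "event_ts")  # converts in place; no-op if already datetime64
    recency_days = (df["event_ts"].max() - df["event_ts"]).dt.days

    # For a standalone Series, extra kwargs are forwarded to pd.to_datetime:
    parsed = safe_to_datetime(pd.Series(["2024-01-01", "bad"]), errors="coerce")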
File without changes: the 22 renamed files above (19 exploration notebooks, WHEEL, entry_points.txt, licenses/LICENSE).