tanml 0.1.6 (tanml-0.1.6-py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of tanml might be problematic.

Files changed (62)
  1. tanml/__init__.py +1 -0
  2. tanml/check_runners/__init__.py +0 -0
  3. tanml/check_runners/base_runner.py +6 -0
  4. tanml/check_runners/cleaning_repro_runner.py +18 -0
  5. tanml/check_runners/correlation_runner.py +15 -0
  6. tanml/check_runners/data_quality_runner.py +24 -0
  7. tanml/check_runners/eda_runner.py +21 -0
  8. tanml/check_runners/explainability_runner.py +28 -0
  9. tanml/check_runners/input_cluster_runner.py +43 -0
  10. tanml/check_runners/logistic_stats_runner.py +28 -0
  11. tanml/check_runners/model_meta_runner.py +23 -0
  12. tanml/check_runners/performance_runner.py +28 -0
  13. tanml/check_runners/raw_data_runner.py +41 -0
  14. tanml/check_runners/rule_engine_runner.py +5 -0
  15. tanml/check_runners/stress_test_runner.py +26 -0
  16. tanml/check_runners/vif_runner.py +54 -0
  17. tanml/checks/__init__.py +0 -0
  18. tanml/checks/base.py +20 -0
  19. tanml/checks/cleaning_repro.py +47 -0
  20. tanml/checks/correlation.py +61 -0
  21. tanml/checks/data_quality.py +26 -0
  22. tanml/checks/eda.py +67 -0
  23. tanml/checks/explainability/shap_check.py +55 -0
  24. tanml/checks/input_cluster.py +109 -0
  25. tanml/checks/logit_stats.py +59 -0
  26. tanml/checks/model_contents.py +40 -0
  27. tanml/checks/model_meta.py +50 -0
  28. tanml/checks/performance.py +90 -0
  29. tanml/checks/raw_data.py +47 -0
  30. tanml/checks/rule_engine.py +45 -0
  31. tanml/checks/stress_test.py +64 -0
  32. tanml/checks/vif.py +51 -0
  33. tanml/cli/__init__.py +0 -0
  34. tanml/cli/arg_parser.py +31 -0
  35. tanml/cli/init_cmd.py +8 -0
  36. tanml/cli/main.py +27 -0
  37. tanml/cli/validate_cmd.py +7 -0
  38. tanml/config_templates/__init__.py +0 -0
  39. tanml/config_templates/rules_multiple_models_datasets.yaml +144 -0
  40. tanml/config_templates/rules_one_dataset_segment_column.yaml +140 -0
  41. tanml/config_templates/rules_one_model_one_dataset.yaml +143 -0
  42. tanml/engine/__init__.py +0 -0
  43. tanml/engine/check_agent_registry.py +42 -0
  44. tanml/engine/core_engine_agent.py +115 -0
  45. tanml/engine/segmentation_agent.py +118 -0
  46. tanml/engine/validation_agent.py +91 -0
  47. tanml/report/report_builder.py +230 -0
  48. tanml/report/templates/report_template.docx +0 -0
  49. tanml/utils/__init__.py +0 -0
  50. tanml/utils/data_loader.py +17 -0
  51. tanml/utils/model_loader.py +35 -0
  52. tanml/utils/r_loader.py +30 -0
  53. tanml/utils/sas_loader.py +50 -0
  54. tanml/utils/yaml_generator.py +34 -0
  55. tanml/utils/yaml_loader.py +5 -0
  56. tanml/validate.py +209 -0
  57. tanml-0.1.6.dist-info/METADATA +317 -0
  58. tanml-0.1.6.dist-info/RECORD +62 -0
  59. tanml-0.1.6.dist-info/WHEEL +5 -0
  60. tanml-0.1.6.dist-info/entry_points.txt +2 -0
  61. tanml-0.1.6.dist-info/licenses/LICENSE +21 -0
  62. tanml-0.1.6.dist-info/top_level.txt +1 -0
tanml/config_templates/rules_multiple_models_datasets.yaml
@@ -0,0 +1,144 @@
+ # ============================================================
+ # TanML Validation Configuration File: Scenario B
+ # ------------------------------------------------------------
+ # 🧪 Scenario: One model and one cleaned dataset per segment
+ #
+ # ✅ Required:
+ # - Define segment-wise cleaned data and model path (or retrain)
+ # - Choose ONE model source strategy (Option A or Option B)
+ # - Set input features and target column
+ # - Optional: provide global raw data
+ # - Adjust thresholds and check options as needed
+ # ============================================================
+
+ # ------------------------------------------
+ # REQUIRED: Model Input Schema
+ # ------------------------------------------
+ model:
+   features:
+     - feature_0 # 👉 replace with actual feature names used in all segment models
+     - feature_1 # 👉 remove or add more lines as needed
+     - feature_2
+   target: default_flag # 👉 replace with your actual target column
+
+ # ------------------------------------------
+ # OPTIONAL: Raw Data Path
+ # ------------------------------------------
+ paths:
+   raw_data: data/raw.csv # 👉 optional – full path to the original raw dataset; use null if unavailable
+
+ output:
+   report_path_template: /absolute/path/to/output/{segment} # 👉 output folder template – {segment} is replaced dynamically
+
+ # ------------------------------------------
+ # ✅ OPTION A – Pretrained Model per Segment (.pkl)
+ # ------------------------------------------
+ # Use this option when each segment has its own trained model and cleaned dataset.
+ # You must provide:
+ # - model: path to the pretrained `.pkl` file
+ # - cleaned: path to the cleaned CSV used for that segment
+ #
+ # 👉 Comment out the OPTION B block below if using this
+ # ------------------------------------------
+ segment:
+   runs:
+     segment_A: # 👉 rename (e.g., high_risk, bronze, tier_1)
+       model: models/logistic/model_b_segment1.pkl
+       cleaned: data/scenario_b_segment1.csv
+
+     segment_B:
+       model: models/logistic/model_b_segment1.pkl
+       cleaned: data/scenario_b_segment2.csv
+
+ # ------------------------------------------
+ # 🔁 OPTION B – Retrain Model from Cleaned Data (Per Segment)
+ # ------------------------------------------
+ # Use this when you want to retrain a model for each segment
+ # using the corresponding cleaned dataset and shared model config.
+ #
+ # 👉 If using this, comment out OPTION A (i.e., the segment.runs[].model lines)
+ # 👉 Each segment must still have its own cleaned dataset
+ # ------------------------------------------
+
+ # segment:
+ #   runs:
+ #     segment_A:
+ #       cleaned: data/cleaned_a.csv
+ #     segment_B:
+ #       cleaned: data/cleaned_b.csv
+
+ # ------------------------------------------
+ # MODEL SOURCE CONFIG
+ # ------------------------------------------
+ # This tells TanML how to load or build the model(s) for each segment.
+ #
+ # 👉 If using pretrained models (Option A): set `from_pickle: true`
+ # 👉 If retraining per segment (Option B): set `from_pickle: false`
+ # ------------------------------------------
+ model_source:
+   from_pickle: true # 👉 change to false if retraining (Option B)
+   type: LogisticRegression
+   module: sklearn.linear_model
+   hyperparameters:
+     penalty: "l2"
+     solver: "liblinear"
+     random_state: 42
+     class_weight: "balanced"
+     max_iter: 100
+
+ # ------------------------------------------
+ # PERFORMANCE THRESHOLDS
+ # ------------------------------------------
+ auc_roc:
+   min: 0.60
+
+ f1:
+   min: 0.60
+
+ ks:
+   min: 0.20
+
+ # ------------------------------------------
+ # VALIDATION CHECKS
+ # ------------------------------------------
+
+ EDACheck:
+   enabled: true
+   max_plots: -1 # -1 = all numeric columns; or set a number of columns
+
+ correlation:
+   enabled: true
+
+ VIFCheck:
+   enabled: true
+
+ raw_data_check:
+   enabled: true
+
+ model_meta:
+   enabled: true
+
+ # ------------------------------------------
+ # STRESS TESTING (Robustness Check)
+ # ------------------------------------------
+ StressTestCheck:
+   enabled: true
+   epsilon: 0.01 # ➜ 1% noise
+   perturb_fraction: 0.2 # ➜ 20% of rows
+
+ # ------------------------------------------
+ # INPUT CLUSTER COVERAGE
+ # ------------------------------------------
+ InputClusterCoverageCheck:
+   enabled: true
+   n_clusters: 5 # ➜ fixed clusters for coverage bar chart
+   max_k: 10 # ➜ elbow-method search (if needed)
+
+ # ------------------------------------------
+ # EXPLAINABILITY
+ # ------------------------------------------
+ explainability:
+   shap:
+     enabled: true
+     background_sample_size: 100 # ➜ background sample for the SHAP explainer
+     test_sample_size: 200 # ➜ test rows to explain
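
The `report_path_template` above relies on a `{segment}` placeholder. As a rough illustration only (not code from the package; the helper name is hypothetical), such a template can be expanded per segment with plain `str.format`:

    # Hypothetical helper, not part of tanml: expand a {segment} path template.
    from pathlib import Path

    def expand_report_path(template: str, segment: str) -> Path:
        out_dir = Path(template.format(segment=segment))  # substitute segment name
        out_dir.mkdir(parents=True, exist_ok=True)        # template is a folder path here
        return out_dir

    for seg in ("segment_A", "segment_B"):
        print(expand_report_path("/absolute/path/to/output/{segment}", seg))
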
tanml/config_templates/rules_one_dataset_segment_column.yaml
@@ -0,0 +1,140 @@
+ # ============================================================
+ # TanML Validation Configuration File: Scenario C
+ # ------------------------------------------------------------
+ # 🧪 Scenario: One dataset with a segment column, one model per segment
+ #
+ # ✅ Required:
+ # - Provide cleaned data (includes all segments)
+ # - Define the segment column used to split the data
+ # - Choose ONE model source strategy (Option A or Option B)
+ # - Set input features and target column
+ # - Optional: provide global raw data
+ # - Adjust thresholds and check options as needed
+ # ============================================================
+
+ # ------------------------------------------
+ # REQUIRED: Model Input Schema
+ # ------------------------------------------
+ model:
+   features:
+     - feature_0 # 👉 replace with actual feature names used across all segment models
+     - feature_1
+     - feature_2
+   target: default_flag # 👉 replace with your actual target column
+
+ # ------------------------------------------
+ # REQUIRED: File Paths
+ # ------------------------------------------
+ paths:
+   cleaned_data: data/cleaned.csv # 👉 path to full cleaned dataset (includes all segments)
+   raw_data: data/raw.csv # 👉 optional – use null if raw data not available
+
+ # ------------------------------------------
+ # OUTPUT CONFIGURATION
+ # ------------------------------------------
+ # Path template where validation reports will be saved.
+ # 👉 Use `{segment}` as a placeholder for the segment name.
+ # ------------------------------------------
+ output:
+   report_path_template: reports/scenario_c/{segment}_report.docx # 👉 customize this path as needed
+
+ # ------------------------------------------
+ # ✅ OPTION A – Pretrained Models per Segment (.pkl)
+ # ------------------------------------------
+ # Provide one model per segment (already trained)
+ # This is the default option.
+ # 👉 Comment out the OPTION B block below if using this
+ # ------------------------------------------
+ segment:
+   column: customer_segment # 👉 column used to split segments
+
+   runs:
+     segment_A:
+       model: models/logistic/model_a.pkl
+     segment_B:
+       model: models/logistic/model_b.pkl
+
+ # ------------------------------------------
+ # 🔁 OPTION B – Retrain Models per Segment from Cleaned Data
+ # ------------------------------------------
+ # Use this if you want TanML to retrain a model for each segment
+ # from the common cleaned dataset.
+ # 👉 Comment out the OPTION A block above if using this
+ # 👉 `segment.runs` must list segment values (no model paths needed)
+ # ------------------------------------------
+ # segment:
+ #   column: customer_segment
+ #   runs:
+ #     segment_A: {}
+ #     segment_B: {}
+
+ # ------------------------------------------
+ # MODEL SOURCE CONFIGURATION
+ # ------------------------------------------
+ # 👉 If using pretrained models: set `from_pickle: true`
+ # 👉 If retraining per segment: set `from_pickle: false`
+ # ------------------------------------------
+ model_source:
+   from_pickle: true
+   type: LogisticRegression
+   module: sklearn.linear_model
+   hyperparameters:
+     penalty: "l2"
+     solver: "liblinear"
+     random_state: 42
+     class_weight: "balanced"
+     max_iter: 100
+
+ # ------------------------------------------
+ # PERFORMANCE THRESHOLDS
+ # ------------------------------------------
+ auc_roc:
+   min: 0.60
+ f1:
+   min: 0.60
+ ks:
+   min: 0.20
+
+ # ------------------------------------------
+ # VALIDATION CHECKS
+ # ------------------------------------------
+ EDACheck:
+   enabled: true
+   max_plots: -1
+
+ correlation:
+   enabled: true
+
+ VIFCheck:
+   enabled: true
+
+ raw_data_check:
+   enabled: true
+
+ model_meta:
+   enabled: true
+
+ # ------------------------------------------
+ # STRESS TESTING (Robustness Check)
+ # ------------------------------------------
+ StressTestCheck:
+   enabled: true
+   epsilon: 0.01
+   perturb_fraction: 0.2
+
+ # ------------------------------------------
+ # INPUT CLUSTER COVERAGE
+ # ------------------------------------------
+ InputClusterCoverageCheck:
+   enabled: true
+   n_clusters: 5
+   max_k: 10
+
+ # ------------------------------------------
+ # EXPLAINABILITY
+ # ------------------------------------------
+ explainability:
+   shap:
+     enabled: true
+     background_sample_size: 100
+     test_sample_size: 200
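
Scenario C drives all segments from one file: the dataset at `paths.cleaned_data` is split on `segment.column`, and one report per segment value comes out of `report_path_template`. A minimal pandas sketch of that split (illustrative only, not TanML's implementation; file and column names are taken from the template above):

    # Illustrative only: split one cleaned dataset by the configured segment column.
    import pandas as pd

    df = pd.read_csv("data/cleaned.csv")  # paths.cleaned_data
    for segment_name, segment_df in df.groupby("customer_segment"):  # segment.column
        report_path = "reports/scenario_c/{segment}_report.docx".format(segment=segment_name)
        print(segment_name, len(segment_df), "->", report_path)
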
tanml/config_templates/rules_one_model_one_dataset.yaml
@@ -0,0 +1,143 @@
+ # ============================================================
+ # TanML Validation Configuration File: Scenario A
+ # ------------------------------------------------------------
+ # 🧪 Scenario: One model, one cleaned dataset (no segmentation)
+ #
+ # ✅ Required:
+ # - Choose ONE set of paths (Option A or Option B)
+ # - Choose ONE model source block (Option A or Option B)
+ # - Fill in your feature names and target column
+ # - Adjust any thresholds or check options as needed
+ # ============================================================
+
+
+ # ------------------------------------------
+ # REQUIRED: Model Input Schema
+ # ------------------------------------------
+ model:
+   features:
+     - feature_0 # 👉 replace with actual feature names used in your model
+     - feature_1
+     - feature_2
+     - feature_3
+     - feature_4
+   target: default_flag # 👉 replace with actual target column
+
+ # ------------------------------------------
+ # OUTPUT: Report Path
+ # ------------------------------------------
+ output:
+   report_path: /mnt/c/Users/you/Desktop/tanml_output/scenario_a_report.docx # 👉 customize as needed
+
+
+ # ------------------------------------------
+ # ✅ OPTION A – Use a Pretrained Model (.pkl)
+ # ------------------------------------------
+ # Use this option when you already have a trained model saved as a `.pkl` file.
+ # You must provide the path to:
+ # - The cleaned dataset used during training
+ # - The raw dataset (optional, but recommended)
+ # - The trained model file via `paths.model`
+ #
+ # Be sure to:
+ # - Keep `from_pickle: true` in `model_source`
+ # - Match the `path` in `model_source` and `paths.model`
+ # - Comment out the OPTION B block if using this
+ # ------------------------------------------
+ paths:
+   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv # 👉 optional – use null if raw data is not available
+   cleaned_data: data/cleaned_a.csv
+   model: models/logistic/model_a.pkl
+
+ model_source:
+   from_pickle: true
+
+
+ # ------------------------------------------
+ # 🔁 OPTION B – Retrain Model from Scratch
+ # ------------------------------------------
+ # Use this option if you want TanML to retrain the model for you
+ # using the cleaned dataset and the specified algorithm + hyperparameters.
+ #
+ # You must provide:
+ # - Path to the raw dataset (optional)
+ # - Path to the cleaned dataset (required)
+ # - The full model definition: type, module, and hyperparameters
+ #
+ # Be sure to:
+ # - Set `from_pickle: false`
+ # - Omit or comment out the `paths.model` line
+ # - Comment out the OPTION A block above
+ # ------------------------------------------
+ # paths:
+ #   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv
+ #   cleaned_data: data/cleaned_a.csv
+
+ # model_source:
+ #   from_pickle: false
+ #   type: LogisticRegression
+ #   module: sklearn.linear_model
+ #   hyperparameters:
+ #     penalty: "l2"
+ #     C: 1.0
+ #     solver: "liblinear"
+ #     class_weight: "balanced"
+ #     max_iter: 100
+ #     random_state: 42
+
+ # ------------------------------------------
+ # PERFORMANCE THRESHOLDS
+ # ------------------------------------------
+ auc_roc:
+   min: 0.60
+
+ f1:
+   min: 0.60
+
+ ks:
+   min: 0.20
+
+ # ------------------------------------------
+ # VALIDATION CHECKS
+ # ------------------------------------------
+
+ EDACheck:
+   enabled: true
+   max_plots: -1 # -1 = all numeric columns; or set a number of columns
+
+ correlation:
+   enabled: true
+
+ VIFCheck:
+   enabled: true
+
+ raw_data_check:
+   enabled: true
+
+ model_meta:
+   enabled: true
+
+ # ------------------------------------------
+ # STRESS TESTING (Robustness Check)
+ # ------------------------------------------
+ StressTestCheck:
+   enabled: true
+   epsilon: 0.01 # ➜ 1% noise
+   perturb_fraction: 0.2 # ➜ 20% of rows
+
+ # ------------------------------------------
+ # INPUT CLUSTER COVERAGE
+ # ------------------------------------------
+ InputClusterCoverageCheck:
+   enabled: true
+   n_clusters: 5 # ➜ fixed clusters for coverage bar chart
+   max_k: 10 # ➜ elbow-method search (if needed)
+
+ # ------------------------------------------
+ # EXPLAINABILITY
+ # ------------------------------------------
+ explainability:
+   shap:
+     enabled: true
+     background_sample_size: 100 # ➜ background sample for the SHAP explainer
+     test_sample_size: 200 # ➜ test rows to explain
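
In the Option B block above, `model_source` fully specifies the estimator as `module` + `type` + `hyperparameters`. A sketch of one plausible way such a block maps onto an sklearn object, using `importlib` (the package's actual loader, presumably tanml/utils/model_loader.py, may differ):

    # Sketch under assumptions: build an estimator from a model_source-style dict.
    import importlib

    model_source = {
        "type": "LogisticRegression",
        "module": "sklearn.linear_model",
        "hyperparameters": {"penalty": "l2", "C": 1.0, "solver": "liblinear",
                            "class_weight": "balanced", "max_iter": 100,
                            "random_state": 42},
    }

    cls = getattr(importlib.import_module(model_source["module"]), model_source["type"])
    model = cls(**model_source["hyperparameters"])  # LogisticRegression(C=1.0, ...)
    print(model)
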
File without changes
tanml/engine/check_agent_registry.py
@@ -0,0 +1,42 @@
+ from tanml.check_runners.performance_runner import run_performance_check
+ from tanml.check_runners.data_quality_runner import run_data_quality_check
+ from tanml.check_runners.stress_test_runner import run_stress_test_check
+ from tanml.check_runners.input_cluster_runner import run_input_cluster_check
+ from tanml.check_runners.logistic_stats_runner import run_logistic_stats_check
+ from tanml.check_runners.raw_data_runner import run_raw_data_check
+ # from tanml.check_runners.cleaning_repro_runner import run_cleaning_repro_check
+ from tanml.check_runners.model_meta_runner import ModelMetaCheckRunner
+ from tanml.check_runners.correlation_runner import CorrelationCheckRunner
+ from tanml.check_runners.eda_runner import EDACheckRunner
+ from tanml.check_runners.explainability_runner import run_shap_check
+ from tanml.check_runners.vif_runner import VIFCheckRunner
+ from tanml.check_runners.rule_engine_runner import RuleEngineCheckRunner
+
+
+ # Wrapper for InputClusterCheck to inject expected_features from the model
+ def input_cluster_wrapper(model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, *args, **kwargs):
+     if hasattr(model, "feature_names_in_"):
+         expected_features = list(model.feature_names_in_)
+     else:
+         raise ValueError("Model does not have 'feature_names_in_' attribute required for InputClusterCheck.")
+
+     return run_input_cluster_check(
+         model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, expected_features
+     )
+
+ CHECK_RUNNER_REGISTRY = {
+     "PerformanceCheck": run_performance_check,
+     "DataQualityCheck": run_data_quality_check,
+     "StressTestCheck": run_stress_test_check,
+     "InputClusterCheck": input_cluster_wrapper,
+     "LogisticStatsCheck": run_logistic_stats_check,
+     "RawDataCheck": run_raw_data_check,
+     # "CleaningReproCheck": run_cleaning_repro_check,
+     "ModelMetaCheck": ModelMetaCheckRunner,
+     "CorrelationCheck": CorrelationCheckRunner,
+     "EDACheck": EDACheckRunner,
+     "SHAPCheck": run_shap_check,
+     "VIFCheck": VIFCheckRunner,
+     "RuleEngineCheck": RuleEngineCheckRunner,
+
+ }
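
Every runner in the registry shares one calling convention, which is what lets the engine below iterate over them blindly. A quick lookup sketch (assuming the wheel is installed; the commented call shape mirrors how the engine invokes runners):

    # Look up a runner by check name; all runners share the engine's call shape:
    # runner(model, X_train, X_test, y_train, y_test, config, cleaned_df, raw_df=...)
    from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY

    print(sorted(CHECK_RUNNER_REGISTRY))  # registered check names
    runner = CHECK_RUNNER_REGISTRY["PerformanceCheck"]
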
tanml/engine/core_engine_agent.py
@@ -0,0 +1,115 @@
+ """
+ ValidationEngine – runs all registered check-runners and assembles a
+ single results dictionary that the ReportBuilder / Jinja template expects.
+ """
+
+ from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY
+ # from tanml.checks.cleaning_repro import CleaningReproCheck
+
+
+ KEEP_AS_NESTED = {
+     "DataQualityCheck",
+     "StressTestCheck",
+     "InputClusterCheck",
+     "RawDataCheck",
+     # "CleaningReproCheck",
+     "SHAPCheck",
+     "VIFCheck",
+     "CorrelationCheck",
+     "EDACheck",
+ }
+
+
+ class ValidationEngine:
+     def __init__(
+         self,
+         model,
+         X_train,
+         X_test,
+         y_train,
+         y_test,
+         config,
+         cleaned_data,
+         raw_df=None,
+         ctx=None,
+     ):
+         self.model = model
+         self.X_train = X_train
+         self.X_test = X_test
+         self.y_train = y_train
+         self.y_test = y_test
+         self.config = config
+         self.cleaned_data = cleaned_data
+         self.raw_df = raw_df
+
+         self.results = dict(config.get("check_results", {}))
+         self.ctx = ctx or {}
+     def run_all_checks(self):
+         for check_name, runner_func in CHECK_RUNNER_REGISTRY.items():
+             if check_name in self.config.get("skip_checks", []):
+                 continue
+
+             print(f"✅ Running {check_name}")
+             try:
+                 result = runner_func(
+                     self.model,
+                     self.X_train,
+                     self.X_test,
+                     self.y_train,
+                     self.y_test,
+                     self.config,
+                     self.cleaned_data,
+                     raw_df=self.raw_df,
+                 )
+
+                 self._integrate(check_name, result)
+
+             except Exception as e:
+                 print(f"⚠️ {check_name} failed: {e}")
+                 self.results[check_name] = {"error": str(e)}
+
+         # Add CleaningReproCheck manually
+         # if self.raw_df is not None:
+         #     print("✅ Running CleaningReproCheck")
+         #     try:
+         #         check = CleaningReproCheck(self.raw_df, self.cleaned_data)
+
+         #         self.results["CleaningReproCheck"] = check.run()
+         #     except Exception as e:
+         #         print(f"⚠️ CleaningReproCheck failed: {e}")
+         #         self.results["CleaningReproCheck"] = {"error": str(e)}
+         # else:
+         #     print("⚠️ Skipping CleaningReproCheck – raw_df not provided")
+         #     self.results["CleaningReproCheck"] = {"error": "raw_data not available"}
+
+         # convenience copy for the template
+         self.results["check_results"] = dict(self.results)
+         return self.results
+
+     def _integrate(self, check_name: str, result):
+         """Merge a check result into self.results, respecting the template layout."""
+         if not result:
+             return
+
+         # Special flatten for LogisticStatsCheck
+         if check_name == "LogisticStatsCheck":
+             self.results.update(result)
+             return
+
+         # If it's a simple object (rare), store it as-is
+         if not isinstance(result, dict):
+             self.results[check_name] = result
+             return
+
+         # Keep the entire dict nested
+         if check_name in KEEP_AS_NESTED:
+             self.results[check_name] = result
+             return
+
+         # If the runner returns {"CheckName": {...}}, unwrap it
+         if set(result.keys()) == {check_name}:
+             self.results[check_name] = result[check_name]
+             return
+
+         # Default: merge into the root
+         self.results.update(result)
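
A hedged end-to-end sketch of driving `ValidationEngine` directly with synthetic data (normally tanml/validate.py presumably constructs it from the YAML config; the toy frame and config here are placeholders):

    # Synthetic smoke test for ValidationEngine; per-check failures are
    # captured as {"error": ...} entries rather than raised.
    import pandas as pd
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    from tanml.engine.core_engine_agent import ValidationEngine

    df = pd.DataFrame({"feature_0": range(100),
                       "feature_1": range(100, 200),
                       "default_flag": [0, 1] * 50})
    X, y = df[["feature_0", "feature_1"]], df["default_flag"]
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=42)

    model = LogisticRegression().fit(X_tr, y_tr)
    config = {"skip_checks": ["SHAPCheck"]}  # skip the slow check in this sketch
    engine = ValidationEngine(model, X_tr, X_te, y_tr, y_te, config, cleaned_data=df)
    results = engine.run_all_checks()
    print(sorted(results))
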