tanml-0.1.6-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of tanml might be problematic.
- tanml/__init__.py +1 -0
- tanml/check_runners/__init__.py +0 -0
- tanml/check_runners/base_runner.py +6 -0
- tanml/check_runners/cleaning_repro_runner.py +18 -0
- tanml/check_runners/correlation_runner.py +15 -0
- tanml/check_runners/data_quality_runner.py +24 -0
- tanml/check_runners/eda_runner.py +21 -0
- tanml/check_runners/explainability_runner.py +28 -0
- tanml/check_runners/input_cluster_runner.py +43 -0
- tanml/check_runners/logistic_stats_runner.py +28 -0
- tanml/check_runners/model_meta_runner.py +23 -0
- tanml/check_runners/performance_runner.py +28 -0
- tanml/check_runners/raw_data_runner.py +41 -0
- tanml/check_runners/rule_engine_runner.py +5 -0
- tanml/check_runners/stress_test_runner.py +26 -0
- tanml/check_runners/vif_runner.py +54 -0
- tanml/checks/__init__.py +0 -0
- tanml/checks/base.py +20 -0
- tanml/checks/cleaning_repro.py +47 -0
- tanml/checks/correlation.py +61 -0
- tanml/checks/data_quality.py +26 -0
- tanml/checks/eda.py +67 -0
- tanml/checks/explainability/shap_check.py +55 -0
- tanml/checks/input_cluster.py +109 -0
- tanml/checks/logit_stats.py +59 -0
- tanml/checks/model_contents.py +40 -0
- tanml/checks/model_meta.py +50 -0
- tanml/checks/performance.py +90 -0
- tanml/checks/raw_data.py +47 -0
- tanml/checks/rule_engine.py +45 -0
- tanml/checks/stress_test.py +64 -0
- tanml/checks/vif.py +51 -0
- tanml/cli/__init__.py +0 -0
- tanml/cli/arg_parser.py +31 -0
- tanml/cli/init_cmd.py +8 -0
- tanml/cli/main.py +27 -0
- tanml/cli/validate_cmd.py +7 -0
- tanml/config_templates/__init__.py +0 -0
- tanml/config_templates/rules_multiple_models_datasets.yaml +144 -0
- tanml/config_templates/rules_one_dataset_segment_column.yaml +140 -0
- tanml/config_templates/rules_one_model_one_dataset.yaml +143 -0
- tanml/engine/__init__.py +0 -0
- tanml/engine/check_agent_registry.py +42 -0
- tanml/engine/core_engine_agent.py +115 -0
- tanml/engine/segmentation_agent.py +118 -0
- tanml/engine/validation_agent.py +91 -0
- tanml/report/report_builder.py +230 -0
- tanml/report/templates/report_template.docx +0 -0
- tanml/utils/__init__.py +0 -0
- tanml/utils/data_loader.py +17 -0
- tanml/utils/model_loader.py +35 -0
- tanml/utils/r_loader.py +30 -0
- tanml/utils/sas_loader.py +50 -0
- tanml/utils/yaml_generator.py +34 -0
- tanml/utils/yaml_loader.py +5 -0
- tanml/validate.py +209 -0
- tanml-0.1.6.dist-info/METADATA +317 -0
- tanml-0.1.6.dist-info/RECORD +62 -0
- tanml-0.1.6.dist-info/WHEEL +5 -0
- tanml-0.1.6.dist-info/entry_points.txt +2 -0
- tanml-0.1.6.dist-info/licenses/LICENSE +21 -0
- tanml-0.1.6.dist-info/top_level.txt +1 -0
tanml/config_templates/rules_multiple_models_datasets.yaml
ADDED
@@ -0,0 +1,144 @@
# ============================================================
# TanML Validation Configuration File: Scenario B
# ------------------------------------------------------------
# 🧪 Scenario: One model and one cleaned dataset per segment
#
# ✅ Required:
# - Define segment-wise cleaned data and model path (or retrain)
# - Choose ONE model source strategy (Option A or Option B)
# - Set input features and target column
# - Optional: provide global raw data
# - Adjust thresholds and check options as needed
# ============================================================

# ------------------------------------------
# REQUIRED: Model Input Schema
# ------------------------------------------
model:
  features:
    - feature_0          # 👉 replace with the actual feature names used in all segment models
    - feature_1          # 👉 remove or add lines as needed
    - feature_2
  target: default_flag   # 👉 replace with your actual target column

# ------------------------------------------
# OPTIONAL: Raw Data Path
# ------------------------------------------
paths:
  raw_data: data/raw.csv   # 👉 optional - full path to the original raw dataset; use null if unavailable

output:
  report_path_template: /absolute/path/to/output/{segment}   # 👉 output folder template; {segment} is replaced dynamically

# ------------------------------------------
# ✅ OPTION A - Pretrained Model per Segment (.pkl)
# ------------------------------------------
# Use this option when each segment has its own trained model and cleaned dataset.
# You must provide:
# - model: path to the pretrained `.pkl` file
# - cleaned: path to the cleaned CSV used for that segment
#
# 👉 Comment out the OPTION B block below if using this
# ------------------------------------------
segment:
  runs:
    segment_A:           # 👉 rename (e.g., high_risk, bronze, tier_1)
      model: models/logistic/model_b_segment1.pkl
      cleaned: data/scenario_b_segment1.csv

    segment_B:
      model: models/logistic/model_b_segment2.pkl   # 👉 each segment points at its own model
      cleaned: data/scenario_b_segment2.csv

# ------------------------------------------
# OPTION B - Retrain Model from Cleaned Data (Per Segment)
# ------------------------------------------
# Use this when you want to retrain a model for each segment
# using the corresponding cleaned dataset and the shared model config.
#
# 👉 If using this, comment out OPTION A (i.e., the segment.runs[].model lines)
# 👉 Each segment must still have its own cleaned dataset
# ------------------------------------------

# segment:
#   runs:
#     segment_A:
#       cleaned: data/cleaned_a.csv
#     segment_B:
#       cleaned: data/cleaned_b.csv

# ------------------------------------------
# MODEL SOURCE CONFIG
# ------------------------------------------
# This tells TanML how to load or build the model(s) for each segment.
#
# 👉 If using pretrained models (Option A): set `from_pickle: true`
# 👉 If retraining per segment (Option B): set `from_pickle: false`
# ------------------------------------------
model_source:
  from_pickle: true      # 👉 change to false if retraining (Option B)
  type: LogisticRegression
  module: sklearn.linear_model
  hyperparameters:
    penalty: "l2"
    solver: "liblinear"
    random_state: 42
    class_weight: "balanced"
    max_iter: 100

# ------------------------------------------
# PERFORMANCE THRESHOLDS
# ------------------------------------------
auc_roc:
  min: 0.60

f1:
  min: 0.60

ks:
  min: 0.20

# ------------------------------------------
# VALIDATION CHECKS
# ------------------------------------------

EDACheck:
  enabled: true
  max_plots: -1          # -1 = all numeric columns; or set a number of columns

correlation:
  enabled: true

VIFCheck:
  enabled: true

raw_data_check:
  enabled: true

model_meta:
  enabled: true

# ------------------------------------------
# STRESS TESTING (Robustness Check)
# ------------------------------------------
StressTestCheck:
  enabled: true
  epsilon: 0.01            # ← 1% noise
  perturb_fraction: 0.2    # ← 20% of rows

# ------------------------------------------
# INPUT CLUSTER COVERAGE
# ------------------------------------------
InputClusterCoverageCheck:
  enabled: true
  n_clusters: 5            # ← fixed number of clusters for the coverage bar chart
  max_k: 10                # ← upper bound for the elbow-method search (if needed)

# ------------------------------------------
# EXPLAINABILITY
# ------------------------------------------
explainability:
  shap:
    enabled: true
    background_sample_size: 100   # ← background sample for training the SHAP explainer
    test_sample_size: 200         # ← test rows to explain
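For orientation, here is a minimal sketch of how a template like this can be consumed. It uses PyYAML directly rather than TanML's own loader, and the config file name is hypothetical:

    import yaml

    # Parse the Scenario B template (hypothetical file name) with PyYAML.
    with open("scenario_b.yaml") as fh:
        cfg = yaml.safe_load(fh)

    features = cfg["model"]["features"]
    target = cfg["model"]["target"]

    # Expand the {segment} placeholder once per configured run.
    for name, run in cfg["segment"]["runs"].items():
        report_dir = cfg["output"]["report_path_template"].format(segment=name)
        print(name, run["model"], run["cleaned"], "->", report_dir)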
tanml/config_templates/rules_one_dataset_segment_column.yaml
ADDED
@@ -0,0 +1,140 @@
# ============================================================
# TanML Validation Configuration File: Scenario C
# ------------------------------------------------------------
# 🧪 Scenario: One dataset with a segment column, one model per segment
#
# ✅ Required:
# - Provide cleaned data (includes all segments)
# - Define the segment column used to split the data
# - Choose ONE model source strategy (Option A or Option B)
# - Set input features and target column
# - Optional: provide global raw data
# - Adjust thresholds and check options as needed
# ============================================================

# ------------------------------------------
# REQUIRED: Model Input Schema
# ------------------------------------------
model:
  features:
    - feature_0          # 👉 replace with the actual feature names used across all segment models
    - feature_1
    - feature_2
  target: default_flag   # 👉 replace with your actual target column

# ------------------------------------------
# REQUIRED: File Paths
# ------------------------------------------
paths:
  cleaned_data: data/cleaned.csv   # 👉 path to the full cleaned dataset (includes all segments)
  raw_data: data/raw.csv           # 👉 optional - use null if raw data is not available

# ------------------------------------------
# OUTPUT CONFIGURATION
# ------------------------------------------
# Path template where validation reports will be saved.
# 👉 Use `{segment}` as a placeholder for the segment name.
# ------------------------------------------
output:
  report_path_template: reports/scenario_c/{segment}_report.docx   # 👉 customize this path as needed

# ------------------------------------------
# ✅ OPTION A - Pretrained Models per Segment (.pkl)
# ------------------------------------------
# Provide one model per segment (already trained).
# This is the default option.
# 👉 Comment out the OPTION B block below if using this
# ------------------------------------------
segment:
  column: customer_segment   # 👉 column used to split segments

  runs:
    segment_A:
      model: models/logistic/model_a.pkl
    segment_B:
      model: models/logistic/model_b.pkl

# ------------------------------------------
# OPTION B - Retrain Models per Segment from Cleaned Data
# ------------------------------------------
# Use this if you want TanML to retrain a model for each segment
# from the common cleaned dataset.
# 👉 Comment out the OPTION A block above if using this
# 👉 `segment.runs` must list the segment values (no model paths needed)
# ------------------------------------------
# segment:
#   column: customer_segment
#   runs:
#     segment_A: {}
#     segment_B: {}

# ------------------------------------------
# MODEL SOURCE CONFIGURATION
# ------------------------------------------
# 👉 If using pretrained models: set `from_pickle: true`
# 👉 If retraining per segment: set `from_pickle: false`
# ------------------------------------------
model_source:
  from_pickle: true
  type: LogisticRegression
  module: sklearn.linear_model
  hyperparameters:
    penalty: "l2"
    solver: "liblinear"
    random_state: 42
    class_weight: "balanced"
    max_iter: 100

# ------------------------------------------
# PERFORMANCE THRESHOLDS
# ------------------------------------------
auc_roc:
  min: 0.60
f1:
  min: 0.60
ks:
  min: 0.20

# ------------------------------------------
# VALIDATION CHECKS
# ------------------------------------------
EDACheck:
  enabled: true
  max_plots: -1

correlation:
  enabled: true

VIFCheck:
  enabled: true

raw_data_check:
  enabled: true

model_meta:
  enabled: true

# ------------------------------------------
# STRESS TESTING (Robustness Check)
# ------------------------------------------
StressTestCheck:
  enabled: true
  epsilon: 0.01
  perturb_fraction: 0.2

# ------------------------------------------
# INPUT CLUSTER COVERAGE
# ------------------------------------------
InputClusterCoverageCheck:
  enabled: true
  n_clusters: 5
  max_k: 10

# ------------------------------------------
# EXPLAINABILITY
# ------------------------------------------
explainability:
  shap:
    enabled: true
    background_sample_size: 100
    test_sample_size: 200
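The `segment.column` key is what distinguishes Scenario C: one shared cleaned file is split into per-segment slices before validation. A rough pandas sketch of that split (not TanML's internal code; it assumes the `segment.runs` keys match the values stored in the column):

    import pandas as pd

    # Load the shared cleaned dataset and slice it by the configured segment column.
    df = pd.read_csv("data/cleaned.csv")
    segment_col = "customer_segment"

    for name in ["segment_A", "segment_B"]:   # keys of segment.runs
        part = df[df[segment_col] == name]
        print(f"{name}: {len(part)} rows")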
tanml/config_templates/rules_one_model_one_dataset.yaml
ADDED
@@ -0,0 +1,143 @@
# ============================================================
# TanML Validation Configuration File: Scenario A
# ------------------------------------------------------------
# 🧪 Scenario: One model, one cleaned dataset (no segmentation)
#
# ✅ Required:
# - Choose ONE set of paths (Option A or Option B)
# - Choose ONE model source block (Option A or Option B)
# - Fill in your feature names and target column
# - Adjust any thresholds or check options as needed
# ============================================================


# ------------------------------------------
# REQUIRED: Model Input Schema
# ------------------------------------------
model:
  features:
    - feature_0          # 👉 replace with the actual feature names used in your model
    - feature_1
    - feature_2
    - feature_3
    - feature_4
  target: default_flag   # 👉 replace with your actual target column

# ------------------------------------------
# OUTPUT: Report Path
# ------------------------------------------
output:
  report_path: /mnt/c/Users/you/Desktop/tanml_output/scenario_a_report.docx   # 👉 customize as needed


# ------------------------------------------
# ✅ OPTION A - Use a Pretrained Model (.pkl)
# ------------------------------------------
# Use this option when you already have a trained model saved as a `.pkl` file.
# You must provide the path to:
# - The cleaned dataset used during training
# - The raw dataset (optional, but recommended)
# - The trained model file via `paths.model`
#
# Be sure to:
# - Keep `from_pickle: true` in `model_source`
# - Match the `path` in `model_source` and `paths.model`
# - Comment out the OPTION B block if using this
# ------------------------------------------
paths:
  raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv   # 👉 optional - use null if raw data is not available
  cleaned_data: data/cleaned_a.csv
  model: models/logistic/model_a.pkl

model_source:
  from_pickle: true


# ------------------------------------------
# OPTION B - Retrain Model from Scratch
# ------------------------------------------
# Use this option if you want TanML to retrain the model for you
# using the cleaned dataset and the specified algorithm + hyperparameters.
#
# You must provide:
# - Path to the raw dataset (optional)
# - Path to the cleaned dataset (required)
# - The full model definition: type, module, and hyperparameters
#
# Be sure to:
# - Set `from_pickle: false`
# - Omit or comment out the `paths.model` line
# - Comment out the OPTION A block above
# ------------------------------------------
# paths:
#   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv
#   cleaned_data: data/cleaned_a.csv

# model_source:
#   from_pickle: false
#   type: LogisticRegression
#   module: sklearn.linear_model
#   hyperparameters:
#     penalty: "l2"
#     C: 1.0
#     solver: "liblinear"
#     class_weight: "balanced"
#     max_iter: 100
#     random_state: 42

# ------------------------------------------
# PERFORMANCE THRESHOLDS
# ------------------------------------------
auc_roc:
  min: 0.60

f1:
  min: 0.60

ks:
  min: 0.20

# ------------------------------------------
# VALIDATION CHECKS
# ------------------------------------------

EDACheck:
  enabled: true
  max_plots: -1          # -1 = all numeric columns; or set a number of columns

correlation:
  enabled: true

VIFCheck:
  enabled: true

raw_data_check:
  enabled: true

model_meta:
  enabled: true

# ------------------------------------------
# STRESS TESTING (Robustness Check)
# ------------------------------------------
StressTestCheck:
  enabled: true
  epsilon: 0.01            # ← 1% noise
  perturb_fraction: 0.2    # ← 20% of rows

# ------------------------------------------
# INPUT CLUSTER COVERAGE
# ------------------------------------------
InputClusterCoverageCheck:
  enabled: true
  n_clusters: 5            # ← fixed number of clusters for the coverage bar chart
  max_k: 10                # ← upper bound for the elbow-method search (if needed)

# ------------------------------------------
# EXPLAINABILITY
# ------------------------------------------
explainability:
  shap:
    enabled: true
    background_sample_size: 100   # ← background sample for training the SHAP explainer
    test_sample_size: 200         # ← test rows to explain
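Across all three templates, the Option A / Option B choice ultimately comes down to the `from_pickle` flag in `model_source`. A hedged sketch of that dispatch (TanML's actual loader in tanml/utils/model_loader.py may differ; `cfg` is the dict parsed from one of these YAML files):

    import importlib
    import pickle

    def build_model(cfg):
        src = cfg["model_source"]
        if src.get("from_pickle"):
            # Option A: load the pretrained estimator from paths.model
            with open(cfg["paths"]["model"], "rb") as fh:
                return pickle.load(fh)
        # Option B: instantiate the class named by module/type, to be retrained
        cls = getattr(importlib.import_module(src["module"]), src["type"])
        return cls(**src.get("hyperparameters", {}))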
tanml/engine/__init__.py
ADDED
File without changes
tanml/engine/check_agent_registry.py
ADDED
@@ -0,0 +1,42 @@
from tanml.check_runners.performance_runner import run_performance_check
from tanml.check_runners.data_quality_runner import run_data_quality_check
from tanml.check_runners.stress_test_runner import run_stress_test_check
from tanml.check_runners.input_cluster_runner import run_input_cluster_check
from tanml.check_runners.logistic_stats_runner import run_logistic_stats_check
from tanml.check_runners.raw_data_runner import run_raw_data_check
# from tanml.check_runners.cleaning_repro_runner import run_cleaning_repro_check
from tanml.check_runners.model_meta_runner import ModelMetaCheckRunner
from tanml.check_runners.correlation_runner import CorrelationCheckRunner
from tanml.check_runners.eda_runner import EDACheckRunner
from tanml.check_runners.explainability_runner import run_shap_check
from tanml.check_runners.vif_runner import VIFCheckRunner
from tanml.check_runners.rule_engine_runner import RuleEngineCheckRunner


# Wrapper for InputClusterCheck that injects expected_features from the model
def input_cluster_wrapper(model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, *args, **kwargs):
    if hasattr(model, "feature_names_in_"):
        expected_features = list(model.feature_names_in_)
    else:
        raise ValueError("Model does not have the 'feature_names_in_' attribute required for InputClusterCheck.")

    return run_input_cluster_check(
        model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, expected_features
    )


CHECK_RUNNER_REGISTRY = {
    "PerformanceCheck": run_performance_check,
    "DataQualityCheck": run_data_quality_check,
    "StressTestCheck": run_stress_test_check,
    "InputClusterCheck": input_cluster_wrapper,
    "LogisticStatsCheck": run_logistic_stats_check,
    "RawDataCheck": run_raw_data_check,
    # "CleaningReproCheck": run_cleaning_repro_check,
    "ModelMetaCheck": ModelMetaCheckRunner,
    "CorrelationCheck": CorrelationCheckRunner,
    "EDACheck": EDACheckRunner,
    "SHAPCheck": run_shap_check,
    "VIFCheck": VIFCheckRunner,
    "RuleEngineCheck": RuleEngineCheckRunner,
}
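Every entry in `CHECK_RUNNER_REGISTRY` is a callable with the same positional contract, which is what lets the engine below iterate over them uniformly. An illustrative dispatch helper, not part of the package:

    from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY

    def run_one(name, model, X_train, X_test, y_train, y_test, config, cleaned_df, raw_df=None):
        # Look up a runner by its registry key and call it with the shared signature.
        runner = CHECK_RUNNER_REGISTRY[name]
        return runner(model, X_train, X_test, y_train, y_test, config, cleaned_df, raw_df=raw_df)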
tanml/engine/core_engine_agent.py
ADDED
@@ -0,0 +1,115 @@
"""
ValidationEngine - runs all registered check-runners and assembles a
single results dictionary that the ReportBuilder / Jinja template expects.
"""

from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY
# from tanml.checks.cleaning_repro import CleaningReproCheck


KEEP_AS_NESTED = {
    "DataQualityCheck",
    "StressTestCheck",
    "InputClusterCheck",
    "RawDataCheck",
    # "CleaningReproCheck",
    "SHAPCheck",
    "VIFCheck",
    "CorrelationCheck",
    "EDACheck",
}


class ValidationEngine:
    def __init__(
        self,
        model,
        X_train,
        X_test,
        y_train,
        y_test,
        config,
        cleaned_data,
        raw_df=None,
        ctx=None,
    ):
        self.model = model
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        self.config = config
        self.cleaned_data = cleaned_data
        self.raw_df = raw_df

        self.results = dict(config.get("check_results", {}))
        self.ctx = ctx or {}

    def run_all_checks(self):
        for check_name, runner_func in CHECK_RUNNER_REGISTRY.items():
            if check_name in self.config.get("skip_checks", []):
                continue

            print(f"✅ Running {check_name}")
            try:
                result = runner_func(
                    self.model,
                    self.X_train,
                    self.X_test,
                    self.y_train,
                    self.y_test,
                    self.config,
                    self.cleaned_data,
                    raw_df=self.raw_df,
                )

                self._integrate(check_name, result)

            except Exception as e:
                print(f"⚠️ {check_name} failed: {e}")
                self.results[check_name] = {"error": str(e)}

        # Add CleaningReproCheck manually
        # if self.raw_df is not None:
        #     print("✅ Running CleaningReproCheck")
        #     try:
        #         check = CleaningReproCheck(self.raw_df, self.cleaned_data)
        #         self.results["CleaningReproCheck"] = check.run()
        #     except Exception as e:
        #         print(f"⚠️ CleaningReproCheck failed: {e}")
        #         self.results["CleaningReproCheck"] = {"error": str(e)}
        # else:
        #     print("⚠️ Skipping CleaningReproCheck - raw_df not provided")
        #     self.results["CleaningReproCheck"] = {"error": "raw_data not available"}

        # Convenience copy for the template
        self.results["check_results"] = dict(self.results)
        return self.results

    def _integrate(self, check_name: str, result):
        """Merge a check result into self.results, respecting the template layout."""
        if not result:
            return

        # Special flatten for LogisticStatsCheck
        if check_name == "LogisticStatsCheck":
            self.results.update(result)
            return

        # If it's a simple object (rare), store as-is
        if not isinstance(result, dict):
            self.results[check_name] = result
            return

        # Keep the entire dict nested
        if check_name in KEEP_AS_NESTED:
            self.results[check_name] = result
            return

        # If the runner returns {"CheckName": {...}}, unwrap it
        if set(result.keys()) == {check_name}:
            self.results[check_name] = result[check_name]
            return

        # Default: merge into root
        self.results.update(result)