tanml 0.1.6-py3-none-any.whl → 0.1.7-py3-none-any.whl
This diff shows the published contents of two publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
- tanml/__init__.py +1 -1
- tanml/check_runners/cleaning_repro_runner.py +2 -2
- tanml/check_runners/correlation_runner.py +49 -12
- tanml/check_runners/explainability_runner.py +12 -22
- tanml/check_runners/logistic_stats_runner.py +196 -17
- tanml/check_runners/performance_runner.py +82 -26
- tanml/check_runners/raw_data_runner.py +29 -14
- tanml/check_runners/regression_metrics_runner.py +195 -0
- tanml/check_runners/stress_test_runner.py +23 -6
- tanml/check_runners/vif_runner.py +33 -27
- tanml/checks/correlation.py +241 -41
- tanml/checks/explainability/shap_check.py +261 -29
- tanml/checks/logit_stats.py +186 -54
- tanml/checks/performance_classification.py +305 -0
- tanml/checks/raw_data.py +58 -23
- tanml/checks/regression_metrics.py +167 -0
- tanml/checks/stress_test.py +157 -53
- tanml/cli/main.py +99 -27
- tanml/engine/check_agent_registry.py +20 -10
- tanml/engine/core_engine_agent.py +199 -37
- tanml/models/registry.py +329 -0
- tanml/report/report_builder.py +1180 -147
- tanml/report/templates/report_template_cls.docx +0 -0
- tanml/report/templates/report_template_reg.docx +0 -0
- tanml/ui/app.py +1205 -0
- tanml/utils/data_loader.py +105 -15
- tanml-0.1.7.dist-info/METADATA +164 -0
- tanml-0.1.7.dist-info/RECORD +54 -0
- tanml/cli/arg_parser.py +0 -31
- tanml/cli/init_cmd.py +0 -8
- tanml/cli/validate_cmd.py +0 -7
- tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
- tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
- tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
- tanml/engine/segmentation_agent.py +0 -118
- tanml/engine/validation_agent.py +0 -91
- tanml/report/templates/report_template.docx +0 -0
- tanml/utils/model_loader.py +0 -35
- tanml/utils/r_loader.py +0 -30
- tanml/utils/sas_loader.py +0 -50
- tanml/utils/yaml_generator.py +0 -34
- tanml/utils/yaml_loader.py +0 -5
- tanml/validate.py +0 -209
- tanml-0.1.6.dist-info/METADATA +0 -317
- tanml-0.1.6.dist-info/RECORD +0 -62
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/config_templates/rules_one_model_one_dataset.yaml
DELETED
@@ -1,143 +0,0 @@
-# ============================================================
-# TanML Validation Configuration File: Scenario A
-# ------------------------------------------------------------
-# 🧪 Scenario: One model, one cleaned dataset (no segmentation)
-#
-# ✅ Required:
-#   - Choose ONE set of paths (Option A or Option B)
-#   - Choose ONE model source block (Option A or Option B)
-#   - Fill in your feature names and target column
-#   - Adjust any thresholds or check options as needed
-# ============================================================
-
-
-# ------------------------------------------
-# REQUIRED: Model Input Schema
-# ------------------------------------------
-model:
-  features:
-    - feature_0   # 👉 replace with actual feature names used in your model
-    - feature_1
-    - feature_2
-    - feature_3
-    - feature_4
-  target: default_flag  # 👉 replace with actual target column
-
-# ------------------------------------------
-# OUTPUT: Report Path
-# ------------------------------------------
-output:
-  report_path: /mnt/c/Users/you/Desktop/tanml_output/scenario_a_report.docx  # 👉 customize as needed
-
-
-# ------------------------------------------
-# ✅ OPTION A — Use a Pretrained Model (.pkl)
-# ------------------------------------------
-# Use this option when you already have a trained model saved as a `.pkl` file.
-# You must provide the path to:
-#   - The cleaned dataset used during training
-#   - The raw dataset (optional, but recommended)
-#   - The trained model file via `paths.model`
-#
-# Be sure to:
-#   - Keep `from_pickle: true` in `model_source`
-#   - Match the `path` in `model_source` and `paths.model`
-#   - Comment out OPTION B block if using this
-# ------------------------------------------
-paths:
-  raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv  # 👉 optional — use null if raw data is not available
-  cleaned_data: data/cleaned_a.csv
-  model: models/logistic/model_a.pkl
-
-model_source:
-  from_pickle: true
-
-
-# ------------------------------------------
-# 🔁 OPTION B — Retrain Model from Scratch
-# ------------------------------------------
-# Use this option if you want TanML to retrain the model for you
-# using the cleaned dataset and the specified algorithm + hyperparameters.
-#
-# You must provide:
-#   - Path to the raw dataset (optional)
-#   - Path to the cleaned dataset (required)
-#   - The full model definition: type, module, and hyperparameters
-#
-# Be sure to:
-#   - Set `from_pickle: false`
-#   - Omit or comment out the `paths.model` line
-#   - Comment out the OPTION A block above
-# ------------------------------------------
-# paths:
-#   raw_data: /mnt/c/Users/your_name/Desktop/data/raw.csv
-#   cleaned_data: data/cleaned_a.csv
-
-# model_source:
-#   from_pickle: false
-#   type: LogisticRegression
-#   module: sklearn.linear_model
-#   hyperparameters:
-#     penalty: "l2"
-#     C: 1.0
-#     solver: "liblinear"
-#     class_weight: "balanced"
-#     max_iter: 100
-#     random_state: 42
-
-# ------------------------------------------
-# PERFORMANCE THRESHOLDS
-# ------------------------------------------
-auc_roc:
-  min: 0.60
-
-f1:
-  min: 0.60
-
-ks:
-  min: 0.20
-
-# ------------------------------------------
-# VALIDATION CHECKS
-# ------------------------------------------
-
-EDACheck:
-  enabled: true
-  max_plots: -1  # -1 = all numeric; or set number of columns
-
-correlation:
-  enabled: true
-
-VIFCheck:
-  enabled: true
-
-raw_data_check:
-  enabled: true
-
-model_meta:
-  enabled: true
-
-# ------------------------------------------
-# STRESS TESTING (Robustness Check)
-# ------------------------------------------
-StressTestCheck:
-  enabled: true
-  epsilon: 0.01          # ➜ 1% noise
-  perturb_fraction: 0.2  # ➜ 20% of rows
-
-# ------------------------------------------
-# INPUT CLUSTER COVERAGE
-# ------------------------------------------
-InputClusterCoverageCheck:
-  enabled: true
-  n_clusters: 5  # ➜ fixed clusters for coverage bar chart
-  max_k: 10      # ➜ elbow method search (if needed)
-
-# ------------------------------------------
-# EXPLAINABILITY
-# ------------------------------------------
-explainability:
-  shap:
-    enabled: true
-    background_sample_size: 100  # ➜ SHAP explainer training background
-    test_sample_size: 200        # ➜ test rows to explain
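The template above is plain YAML. As a reference only, the sketch below (ordinary PyYAML, not part of TanML) checks the cross-field constraints that the template's comments describe; "rules.yaml" stands for a hypothetical filled-in copy.

import yaml  # PyYAML

with open("rules.yaml") as f:  # a filled-in copy of the template above
    cfg = yaml.safe_load(f)

# Keys taken directly from the deleted template.
features = cfg["model"]["features"]
target = cfg["model"]["target"]
assert target not in features, "target column must not be listed as a feature"

if cfg.get("model_source", {}).get("from_pickle"):
    # Option A: paths.model must point at the trained .pkl file
    assert cfg["paths"].get("model"), "from_pickle: true requires paths.model"
else:
    # Option B: a full model definition replaces paths.model
    src = cfg.get("model_source", {})
    assert src.get("type") and src.get("module"), "retraining needs model_source.type and module"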
tanml/engine/segmentation_agent.py
DELETED
@@ -1,118 +0,0 @@
-from __future__ import annotations
-
-import os
-import joblib
-import importlib
-from tanml.utils.sas_loader import SASLogisticModel
-from tanml.utils.r_loader import RLogisticModel
-from tanml.engine.validation_agent import SegmentValidator
-from tanml.utils.data_loader import load_dataframe
-
-
-def handle_segmentation(segment_config, rule_config, args=None, report_output=None):
-    global_raw_path = rule_config.get("paths", {}).get("raw_data")
-    segment_col = segment_config.get("column")  # ✅ only required for Scenario C
-    report_template = report_output  # e.g., "reports/report_{segment}.docx"
-
-    print("🔍 Detected segmentation setup in rules.yaml. Running each segment run separately...")
-
-    for name, run_cfg in segment_config["runs"].items():
-        print(f"\n🔹 Validating segment: {name}")
-        model_path = run_cfg["model"]
-
-        # CASE 1: Retrain model from cleaned data (model: train)
-        if isinstance(model_path, str) and model_path.lower() == "train":
-            print(f"🛠️ Retraining model from cleaned data for segment: {name}")
-
-            if "cleaned" in run_cfg:
-                cleaned_df_run = load_dataframe(run_cfg["cleaned"])
-            else:
-                full_cleaned = load_dataframe(rule_config["paths"]["cleaned_data"])
-                if segment_col:
-                    cleaned_df_run = full_cleaned[full_cleaned[segment_col] == name]
-                else:
-                    raise ValueError("Missing segment column for slicing cleaned data.")
-
-            X = cleaned_df_run[rule_config["model"]["features"]]
-            y = cleaned_df_run[rule_config["model"]["target"]]
-
-            model_source = rule_config.get("model_source", {})
-            model_type = model_source.get("type")
-            model_module = model_source.get("module")
-            model_params = model_source.get("hyperparameters", {})
-
-            if not model_type or not model_module:
-                raise ValueError("❌ 'model_source.type' and 'model_source.module' must be defined for retraining.")
-
-            model_class = getattr(importlib.import_module(model_module), model_type)
-            model = model_class(**model_params)
-            print(f"📦 Using model: {model}")
-
-            model.fit(X, y)
-            print(f"✅ Retrained {model_type} for segment '{name}'")
-
-        elif isinstance(model_path, str) and "r_logistic" in model_path.lower():
-            model = RLogisticModel(model_path)
-
-        elif isinstance(model_path, str) and model_path.endswith(".pkl"):
-            model = joblib.load(model_path)
-
-        elif isinstance(model_path, str) and model_path.endswith(".csv"):
-            base = os.path.splitext(model_path)[0]
-            model = SASLogisticModel(
-                coeffs_path=model_path,
-                intercept_path=base + "_intercept.txt",
-                feature_order_path=base + "_features.txt"
-            )
-
-        else:
-            raise ValueError(f"❌ Unsupported model format for segment '{name}': {model_path}")
-
-        # Load and optionally slice raw data
-        raw_df_run = None
-        if global_raw_path and os.path.exists(global_raw_path):
-            full_raw = load_dataframe(global_raw_path)
-            if segment_col:
-                raw_df_run = full_raw[full_raw[segment_col] == name]
-            else:
-                raw_df_run = full_raw
-
-        print(f"[DEBUG] raw_df_run for {name}: {type(raw_df_run)}, rows = {len(raw_df_run) if raw_df_run is not None else 'None'}")
-
-        # Load cleaned data
-        if "cleaned" in run_cfg:
-            cleaned_df_run = load_dataframe(run_cfg["cleaned"])
-        else:
-            full_cleaned = load_dataframe(rule_config["paths"]["cleaned_data"])
-            if segment_col:
-                cleaned_df_run = full_cleaned[full_cleaned[segment_col] == name]
-            else:
-                cleaned_df_run = full_cleaned
-
-        # Format output path
-        report_base = report_template.format(segment=name)
-
-        # Run validation
-        validator = SegmentValidator(
-            segment_column=segment_col,
-            segment_values=[name],
-            model=model,
-            raw_df=raw_df_run,
-            cleaned_df=cleaned_df_run,
-            target_col=rule_config.get("model", {}).get("target"),
-            config=rule_config,
-            segment_name=name,
-            report_base=report_base
-        )
-        results = validator.run()
-
-        # Extract report path from SegmentValidator result
-        report_base_path = report_template.format(segment=name)
-        report_path = os.path.join(report_base_path, f"report_{name}.docx")
-
-        print(f"📄 Report saved: {report_path}")
-        print(f"✅ Segment '{name}' validated.")
-
-
-    print("✅ All segment runs completed.")
-    return True
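For orientation, the segment_config that handle_segmentation() consumed can be reconstructed from the loop above. Everything in this sketch is illustrative (segment names, paths, and the region column are invented, not taken from the package):

segment_config = {
    "column": "region",  # optional; only needed to slice a shared dataset (Scenario C)
    "runs": {
        # "train" hits CASE 1: retrain from cleaned data using model_source
        "east": {"model": "train", "cleaned": "data/cleaned_east.csv"},
        # a .pkl path is loaded with joblib
        "west": {"model": "models/west.pkl"},
        # a .csv path is treated as SAS coefficients, with *_intercept.txt
        # and *_features.txt expected alongside it
        "south": {"model": "models/sas/south_coeffs.csv"},
    },
}

# report_output carried a {segment} placeholder that was formatted per run:
# handle_segmentation(segment_config, rule_config, report_output="reports/{segment}")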
tanml/engine/validation_agent.py
DELETED
@@ -1,91 +0,0 @@
-from __future__ import annotations
-from importlib.resources import files
-from datetime import datetime
-import tzlocal
-import json
-from pathlib import Path
-
-from tanml.engine.core_engine_agent import ValidationEngine
-from tanml.report.report_builder import ReportBuilder
-from tanml.engine.check_agent_registry import CHECK_RUNNER_REGISTRY
-
-
-class SegmentValidator:
-    def __init__(
-        self,
-        model,
-        raw_df,
-        cleaned_df,
-        config,
-        target_col=None,
-        segment_column=None,
-        segment_values=None,
-        segment_name=None,
-        report_base="reports"
-    ):
-        self.model = model
-        self.raw_df = raw_df
-        self.cleaned_df = cleaned_df
-        self.config = config
-        self.target_col = target_col
-        self.segment_column = segment_column
-        self.segment_values = segment_values
-        self.segment_name = segment_name
-        self.report_base = report_base
-
-    def run(self):
-        if self.segment_name:
-            return self._run_single(self.cleaned_df, self.segment_name)
-
-        if not self.segment_column:
-            raise ValueError("Segmentation column not specified in rules.yaml")
-
-        results = {}
-        for segment in self.segment_values:
-            segment_df = self.cleaned_df[self.cleaned_df[self.segment_column] == segment]
-            print(f"🔹 Running validation for segment value: {segment}")
-            result = self._run_single(segment_df, segment)
-            results[segment] = result
-
-        return results
-
-    def _run_single(self, segment_df, segment_name):
-        if self.target_col is None or self.target_col not in segment_df.columns:
-            raise ValueError(f"❌ Target column '{self.target_col}' missing in cleaned data for segment '{segment_name}'")
-
-        y = segment_df[self.target_col]
-        cols_to_drop = [c for c in (self.target_col, self.segment_column) if c in segment_df.columns]
-        X = segment_df.drop(columns=cols_to_drop)
-
-        # Pass raw_df to ValidationEngine so CleaningReproCheck works
-        engine = ValidationEngine(self.model, X, X, y, y, self.config, segment_df, self.raw_df)
-        results = engine.run_all_checks()
-
-        local_tz = tzlocal.get_localzone()
-        now = datetime.now(local_tz)
-        results["validation_date"] = now.strftime("%Y-%m-%d %H:%M:%S %Z (UTC%z)")
-        results["model_path"] = self.model.__class__.__name__
-        results["validated_by"] = "TanML Automated Validator"
-        results["rules"] = self.config
-
-        report_base_formatted = self.report_base.format(segment=segment_name) if "{segment}" in self.report_base else self.report_base
-        output_dir = Path(report_base_formatted)
-        output_dir.mkdir(parents=True, exist_ok=True)
-
-        tpl_cfg = self.config.get("output", {}).get("template_path")  # may be None
-        template_path = (
-            Path(tpl_cfg).expanduser() if tpl_cfg
-            else files("tanml.report.templates").joinpath("report_template.docx")
-        )
-
-        output_path = output_dir / f"report_{segment_name}.docx"
-
-        builder = ReportBuilder(results, template_path, output_path)
-        builder.build()
-
-        print(f"📄 Report saved for segment '{segment_name}': {output_path}")
-        print(json.dumps(results, indent=2, default=str))
-        return results
-
-
-__all__ = ["SegmentValidator", "CHECK_RUNNER_REGISTRY"]
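A sketch of how SegmentValidator was driven before its removal. Only the constructor signature comes from the code above; the data path, model, column names, and rule_config are placeholders:

import pandas as pd
from sklearn.linear_model import LogisticRegression
from tanml.engine.validation_agent import SegmentValidator  # pre-0.1.7 import

df_clean = pd.read_csv("data/cleaned_a.csv")  # placeholder path
model = LogisticRegression().fit(
    df_clean.drop(columns=["default_flag"]), df_clean["default_flag"]
)

validator = SegmentValidator(
    model=model,
    raw_df=None,                      # raw data was optional
    cleaned_df=df_clean,
    config=rule_config,               # parsed rules.yaml dict (placeholder)
    target_col="default_flag",
    segment_name="all",               # a segment_name forces a single run
    report_base="reports/{segment}",  # "{segment}" is substituted before writing
)
results = validator.run()             # writes reports/all/report_all.docx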
tanml/report/templates/report_template.docx
DELETED
Binary file
tanml/utils/model_loader.py
DELETED
@@ -1,35 +0,0 @@
-# tanml/utils/model_loader.py
-
-import os
-import joblib
-from tanml.utils.sas_loader import SASLogisticModel
-from tanml.utils.r_loader import RLogisticModel
-
-def load_model(model_path):
-    """
-    Load a model from path. Supports:
-    - sklearn/xgboost .pkl
-    - SAS .csv with _intercept.txt and _features.txt
-    - R exported logistic CSV
-    """
-    if not model_path:
-        raise ValueError("❌ No model path provided.")
-
-    if "r_logistic" in model_path.lower():
-        print("✅ Detected R Logistic Regression model")
-        return RLogisticModel(model_path)
-
-    elif model_path.endswith(".pkl"):
-        print(f"✅ Loading sklearn/XGB model from {model_path}")
-        return joblib.load(model_path)
-
-    elif model_path.endswith(".csv"):
-        base = os.path.splitext(model_path)[0]
-        return SASLogisticModel(
-            coeffs_path=model_path,
-            intercept_path=base + "_intercept.txt",
-            feature_order_path=base + "_features.txt"
-        )
-
-    else:
-        raise ValueError("❌ Unsupported model format. Use .pkl, .csv, or R model CSV")
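Usage was a single dispatch-by-extension call. The paths below are hypothetical; note that the SAS branch expects *_intercept.txt and *_features.txt siblings next to the coefficients CSV, and the "r_logistic" name check runs before the extension checks:

from tanml.utils.model_loader import load_model  # removed in 0.1.7

sk_model = load_model("models/logistic/model_a.pkl")   # joblib pickle
sas_model = load_model("models/sas/coeffs.csv")        # also reads models/sas/coeffs_intercept.txt
                                                       # and models/sas/coeffs_features.txt
r_model = load_model("scores/r_logistic_scores.csv")   # "r_logistic" substring matches first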
tanml/utils/r_loader.py
DELETED
@@ -1,30 +0,0 @@
-import pandas as pd
-import numpy as np
-
-class RLogisticModel:
-    """
-    Wrapper for logistic regression models exported from R.
-    Assumes CSV with columns: ID, y_true, y_pred_proba
-    """
-
-    def __init__(self, model_path: str):
-        self.model_path = model_path
-        self.df = pd.read_csv(model_path)
-
-        # Check required columns exist
-        expected_cols = {'y_true', 'y_pred_proba'}
-        if not expected_cols.issubset(set(self.df.columns)):
-            raise ValueError(f"R model CSV must contain columns: {expected_cols}")
-
-        self.y_true = self.df['y_true'].values
-        self.y_pred_proba = self.df['y_pred_proba'].values
-
-    def predict_proba(self, X=None):
-        """
-        Mimics sklearn’s predict_proba format: n_samples x 2
-        """
-        proba = self.y_pred_proba.reshape(-1, 1)
-        return np.hstack([1 - proba, proba])
-
-    def get_true_labels(self):
-        return self.y_true
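The only arithmetic here is the two-column stack in predict_proba(); shown standalone with toy numbers (plain NumPy, nothing TanML-specific):

import numpy as np

y_pred_proba = np.array([0.1, 0.8, 0.5])  # P(class 1) as exported from R
proba = y_pred_proba.reshape(-1, 1)
print(np.hstack([1 - proba, proba]))      # column 0 = P(class 0), column 1 = P(class 1)
# [[0.9 0.1]
#  [0.2 0.8]
#  [0.5 0.5]]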
tanml/utils/sas_loader.py
DELETED
@@ -1,50 +0,0 @@
-# File: tanml/utils/sas_loader.py
-
-import pandas as pd
-import numpy as np
-
-
-class SASLogisticModel:
-    def __init__(self, coeffs_path, intercept_path, feature_order_path):
-        self.coeffs_path = coeffs_path
-        self.intercept_path = intercept_path
-        self.feature_order_path = feature_order_path
-
-        self.coefficients = self._load_coefficients()
-        self.intercept = self._load_intercept()
-        self.feature_order = self._load_feature_order()
-
-    def _load_coefficients(self):
-        return pd.read_csv(self.coeffs_path, index_col=0).squeeze("columns")
-
-    def _load_intercept(self):
-        with open(self.intercept_path) as f:
-            return float(f.read().strip())
-
-    def _load_feature_order(self):
-        with open(self.feature_order_path) as f:
-            return [line.strip() for line in f.readlines()]
-
-    def predict_proba(self, X):
-        """
-        Return a NumPy array shaped (n_samples, 2) like sklearn:
-        [:, 0] = P(class 0), [:, 1] = P(class 1)
-        """
-        X = X[self.feature_order]
-        logits = X.dot(self.coefficients) + self.intercept
-
-        # numeric stability clamp
-        logits = logits.clip(-700, 700)
-
-        proba_1 = 1 / (1 + np.exp(-logits))
-        proba_0 = 1 - proba_1
-        return np.vstack([proba_0, proba_1]).T  # shape (n, 2)
-
-    def predict(self, X):
-        """
-        Return class labels (0/1) based on 0.5 threshold.
-        Works with the NumPy array returned by predict_proba().
-        """
-        proba_1 = self.predict_proba(X)[:, 1]  # probability of class 1
-        return (proba_1 >= 0.5).astype(int)
-
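The scoring path reduces to a dot product plus a sigmoid. A standalone rerun of that arithmetic with invented coefficients (not TanML code):

import numpy as np
import pandas as pd

coefficients = pd.Series({"feature_0": 0.8, "feature_1": -1.2})  # invented values
intercept = 0.5
X = pd.DataFrame({"feature_1": [1.0, 0.0], "feature_0": [0.0, 2.0]})

X = X[coefficients.index]                  # reorder columns to the stored feature order
logits = X.dot(coefficients) + intercept   # linear predictor
logits = logits.clip(-700, 700)            # same overflow clamp as the class above
proba_1 = 1 / (1 + np.exp(-logits))        # sigmoid gives P(class 1)
pred = (proba_1 >= 0.5).astype(int)
print(proba_1.round(3).tolist(), pred.tolist())  # [0.332, 0.891] [0, 1]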
tanml/utils/yaml_generator.py
DELETED
@@ -1,34 +0,0 @@
-from pathlib import Path
-import shutil
-
-def generate_rules_yaml(
-    scenario: str = "A",
-    dest_path: str = "rules.yaml",
-    overwrite: bool = False
-):
-
-    scenario = scenario.upper()
-    template_map = {
-        "A": "rules_one_model_one_dataset.yaml",
-        "B": "rules_multiple_models_datasets.yaml",
-        "C": "rules_one_dataset_segment_column.yaml"
-    }
-
-    if scenario not in template_map:
-        raise ValueError("Invalid scenario. Must be 'A', 'B', or 'C'.")
-
-    src = Path(__file__).parent.parent / "config_templates" / template_map[scenario]
-    if not src.exists():
-        raise FileNotFoundError(f"Template not found at {src}")
-
-    dst = Path(dest_path)
-    if dst.exists() and not overwrite:
-        raise FileExistsError(f"{dst} already exists. Use --overwrite to replace it.")
-
-    if dst.parent and not dst.parent.exists():
-        dst.parent.mkdir(parents=True, exist_ok=True)
-
-    shutil.copyfile(src, dst)
-
-    print(f"✅ Created: {dst.resolve()} for Scenario {scenario}")
-    print("👉 Now edit this YAML to fill in your model, data, and feature details.")