tanml 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tanml might be problematic. Click here for more details.
- tanml/__init__.py +1 -1
- tanml/check_runners/cleaning_repro_runner.py +2 -2
- tanml/check_runners/correlation_runner.py +49 -12
- tanml/check_runners/explainability_runner.py +12 -22
- tanml/check_runners/logistic_stats_runner.py +196 -17
- tanml/check_runners/performance_runner.py +82 -26
- tanml/check_runners/raw_data_runner.py +29 -14
- tanml/check_runners/regression_metrics_runner.py +195 -0
- tanml/check_runners/stress_test_runner.py +23 -6
- tanml/check_runners/vif_runner.py +33 -27
- tanml/checks/correlation.py +241 -41
- tanml/checks/explainability/shap_check.py +261 -29
- tanml/checks/logit_stats.py +186 -54
- tanml/checks/performance_classification.py +305 -0
- tanml/checks/raw_data.py +58 -23
- tanml/checks/regression_metrics.py +167 -0
- tanml/checks/stress_test.py +157 -53
- tanml/cli/main.py +99 -27
- tanml/engine/check_agent_registry.py +20 -10
- tanml/engine/core_engine_agent.py +199 -37
- tanml/models/registry.py +329 -0
- tanml/report/report_builder.py +1180 -147
- tanml/report/templates/report_template_cls.docx +0 -0
- tanml/report/templates/report_template_reg.docx +0 -0
- tanml/ui/app.py +1205 -0
- tanml/utils/data_loader.py +105 -15
- tanml-0.1.7.dist-info/METADATA +164 -0
- tanml-0.1.7.dist-info/RECORD +54 -0
- tanml/cli/arg_parser.py +0 -31
- tanml/cli/init_cmd.py +0 -8
- tanml/cli/validate_cmd.py +0 -7
- tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
- tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
- tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
- tanml/engine/segmentation_agent.py +0 -118
- tanml/engine/validation_agent.py +0 -91
- tanml/report/templates/report_template.docx +0 -0
- tanml/utils/model_loader.py +0 -35
- tanml/utils/r_loader.py +0 -30
- tanml/utils/sas_loader.py +0 -50
- tanml/utils/yaml_generator.py +0 -34
- tanml/utils/yaml_loader.py +0 -5
- tanml/validate.py +0 -209
- tanml-0.1.6.dist-info/METADATA +0 -317
- tanml-0.1.6.dist-info/RECORD +0 -62
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/validate.py
DELETED
|
@@ -1,209 +0,0 @@
|
|
|
1
|
-
from __future__ import annotations
|
|
2
|
-
import importlib
|
|
3
|
-
from datetime import datetime
|
|
4
|
-
from pathlib import Path
|
|
5
|
-
from importlib.resources import files
|
|
6
|
-
from typing import Optional, Dict, Any
|
|
7
|
-
|
|
8
|
-
import joblib
|
|
9
|
-
import pandas as pd
|
|
10
|
-
import tzlocal
|
|
11
|
-
from sklearn.model_selection import train_test_split
|
|
12
|
-
|
|
13
|
-
from tanml.cli.arg_parser import parse_args
|
|
14
|
-
from tanml.engine.segmentation_agent import handle_segmentation
|
|
15
|
-
from tanml.engine.core_engine_agent import ValidationEngine
|
|
16
|
-
from tanml.report.report_builder import ReportBuilder
|
|
17
|
-
from tanml.utils.data_loader import load_dataframe
|
|
18
|
-
from tanml.utils.model_loader import load_model
|
|
19
|
-
from tanml.utils.yaml_loader import load_yaml_config
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
load_yaml = load_yaml_config
|
|
23
|
-
|
|
24
|
-
def _resolve_template_path(explicit: Optional[str] = None) -> Path:
|
|
25
|
-
"""Return a Path to the DOCX report template.
|
|
26
|
-
|
|
27
|
-
Priority:
|
|
28
|
-
1. If the caller supplied an absolute/relative path (`explicit`), use it.
|
|
29
|
-
2. Fallback to the *packaged* template shipped inside
|
|
30
|
-
`tanml.report.templates`.
|
|
31
|
-
"""
|
|
32
|
-
if explicit:
|
|
33
|
-
return Path(explicit).expanduser().resolve()
|
|
34
|
-
|
|
35
|
-
# packaged default
|
|
36
|
-
return files("tanml.report.templates").joinpath("report_template.docx")
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
# ---------------------------------------------------------------------------
|
|
40
|
-
# CLI path (Scenario A)
|
|
41
|
-
# ---------------------------------------------------------------------------
|
|
42
|
-
|
|
43
|
-
def validate_from_args() -> None:
    """Run a validation driven by command-line arguments and write the report.

    Flow:
      1. Parse the CLI arguments and load the optional YAML rules (``--rules``).
      2. If the rules contain ``segment.runs`` or ``segment.column``
         (Scenario B / C), delegate to ``handle_segmentation`` and return.
      3. Otherwise (Scenario A) load the model and the cleaned data, split
         train/test, run every check through ``ValidationEngine`` and render
         the DOCX report with ``ReportBuilder``.

    Raises:
        ValueError: if ``--model`` is missing, a listed feature or the target
            column is absent from the cleaned data, the model's
            ``feature_names_in_`` differ from the feature columns, or the
            report path is an existing directory.
    """
    args = parse_args()

    # Read YAML rules if provided via --rules, else an empty dict.
    # `or {}` guards against a loader result / section that is None
    # (presumably what an empty YAML file or an empty `model:` key yields).
    rule_cfg: Dict[str, Any] = (load_yaml(args.rules) if args.rules else None) or {}
    explicit_feats = list((rule_cfg.get("model") or {}).get("features") or [])

    # Optional feature override from CLI flag (only when the YAML gave none).
    # Empty fragments (e.g. from a trailing comma) are dropped.
    if not explicit_feats and args.features:
        explicit_feats = [name.strip() for name in args.features.split(",") if name.strip()]
        print(f"✅ Using CLI feature override: {explicit_feats}")

    # -------------------------------------------------------------------
    # Segmentation scenarios (B / C) are handled by a dedicated agent
    # -------------------------------------------------------------------
    segment_cfg = rule_cfg.get("segment") or {}
    if "runs" in segment_cfg or "column" in segment_cfg:
        print("✅ Detected segmentation scenario (Scenario B or C).")
        handle_segmentation(segment_cfg, rule_cfg, args, args.report_path)
        return

    # -------------------------------------------------------------------
    # Scenario A (single model / dataset)
    # -------------------------------------------------------------------
    if not args.model:
        raise ValueError("❌ No model path provided (use --model).")

    model = load_model(args.model)
    # The raw dataset used to be loaded here and then discarded; nothing in
    # this function consumes it, so the dead load was removed.
    cleaned_df = load_dataframe(args.cleaned)

    # Column sanity check
    if explicit_feats:
        missing = set(explicit_feats) - set(cleaned_df.columns)
        if missing:
            raise ValueError(f"❌ cleaned.csv missing columns: {missing}")

    target = args.target
    if target not in cleaned_df.columns:
        raise ValueError(f"❌ Target column '{target}' not found in cleaned data.")

    y = cleaned_df[target]
    if explicit_feats:
        # Honour the configured feature list (in its given order) instead of
        # silently using every non-target column; the target itself is never
        # allowed to leak into the feature matrix.
        X = cleaned_df[[col for col in explicit_feats if col != target]]
    else:
        X = cleaned_df.drop(columns=[target])

    # Feature-name consistency with persisted model
    if hasattr(model, "feature_names_in_"):
        expected = list(model.feature_names_in_)
        if expected != list(X.columns):
            raise ValueError(
                "❌ Feature mismatch\nExpected: "
                f"{expected}\nGot: {list(X.columns)}"
            )
        print("✅ Feature names match the model expectations.")

    # Train/test split (fixed seed so repeated runs produce the same split)
    test_size = (rule_cfg.get("train_test_split") or {}).get("test_size", 0.3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    engine = ValidationEngine(model, X_train, X_test, y_train, y_test, rule_cfg, cleaned_df)
    results = engine.run_all_checks()

    # Meta-info block for the report
    now = datetime.now(tzlocal.get_localzone())
    results.update(
        {
            "validation_date": now.strftime("%Y-%m-%d %H:%M:%S %Z (UTC%z)"),
            "model_path": args.model,
            "validated_by": "TanML Automated Validator",
            "rules": rule_cfg.get("rules", {}),
        }
    )

    # -------------------------------------------------------------------
    # Report output location
    # -------------------------------------------------------------------
    output_path = Path(args.report_path).expanduser().resolve()
    if output_path.exists() and output_path.is_dir():
        raise ValueError(
            f"'{output_path}' is a directory. Please provide a full .docx filename."
        )
    output_path.parent.mkdir(parents=True, exist_ok=True)

    template_path = _resolve_template_path((rule_cfg.get("output") or {}).get("template_path"))
    ReportBuilder(results, template_path=template_path, output_path=output_path).build()
    print(f"\n✅ Report saved to {output_path}")
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
# ---------------------------------------------------------------------------
|
|
132
|
-
# YAML‑driven path (CLI `tanml validate --rules <file>.yaml`)
|
|
133
|
-
# ---------------------------------------------------------------------------
|
|
134
|
-
|
|
135
|
-
def validate_from_yaml(rules_path: str | Path) -> None:
    """Run a validation described entirely by a YAML rules file.

    Flow:
      1. Load the configuration; if it declares ``segment.runs``, delegate to
         ``handle_segmentation`` and return.
      2. Otherwise (Scenario A) read the cleaned CSV, select the configured
         features/target and split train/test.
      3. Obtain the model: load it from ``paths.model`` (default), or, when
         ``model_source.from_pickle`` is false, instantiate
         ``model_source.module``.``model_source.type`` and fit it.
      4. Run every check through ``ValidationEngine`` and render the DOCX
         report with ``ReportBuilder``.

    Args:
        rules_path: Location of the YAML rules file.

    Raises:
        ValueError: if the model is to be loaded from a pickle but
            ``paths.model`` is not set.
        KeyError: if required keys (``paths.cleaned_data``, ``model.target``,
            ``model.features``) are missing from the configuration.
    """
    config = load_yaml_config(rules_path)
    output_cfg = config.get("output") or {}

    # Segmentation handling first
    report_output_tpl = output_cfg.get("report_path_template", "reports/{segment}")
    segment_cfg = config.get("segment") or {}
    # NOTE(review): only `segment.runs` is detected here, whereas
    # validate_from_args also treats `segment.column` as a segmentation
    # scenario. Confirm whether handle_segmentation supports `column` with
    # args=None before widening this check.
    if "runs" in segment_cfg:
        print("🔍 Detected segmentation setup in rules.yaml. Running segment‑wise validation.")
        handle_segmentation(segment_cfg, config, args=None, report_output=report_output_tpl)
        return

    # --------------------------
    # Scenario A (single run)
    # --------------------------
    cleaned_df = pd.read_csv(config["paths"]["cleaned_data"])
    print(f"✅ Loaded cleaned data from: {config['paths']['cleaned_data']}")
    print("✅ No segmentation detected. Running single model validation.")

    target = config["model"]["target"]
    features = config["model"]["features"]
    X = cleaned_df[features]
    y = cleaned_df[target]

    # Train/test split. Done *before* any fitting so that a retrained model
    # never sees the held-out rows it is subsequently evaluated on.
    test_size = (config.get("train_test_split") or {}).get("test_size", 0.3)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Model loader / trainer
    model_source = config.get("model_source") or {}
    if model_source.get("from_pickle", True):
        model_path = config["paths"].get("model")
        if not model_path:
            raise ValueError("❌ 'paths.model' must be provided when using from_pickle = true")
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only point `paths.model` at trusted artifacts.
        model = joblib.load(model_path)
        print(f"✅ Loaded model from: {model_path}")
        model_label = model_path
    else:
        # SECURITY: the module/class to import comes straight from the YAML
        # file; only run rules files from trusted sources.
        module = importlib.import_module(model_source["module"])
        model_class = getattr(module, model_source["type"])
        model = model_class(**model_source.get("hyperparameters", {}))
        # Fit on the training split only (fitting on all of X, y would leak
        # the test rows into training and inflate every test-set metric).
        model.fit(X_train, y_train)
        print(f"✅ Retrained model from: {model_source['module']}.{model_source['type']}")
        # For SHAP etc.
        if not hasattr(model, "feature_names_in_"):
            model.feature_names_in_ = X.columns.to_numpy()
        # The model did not come from `paths.model`, so do not report that
        # path as its origin even if the key happens to be present.
        model_label = "retrained_from_yaml"

    ctx = {"expected_features": features}
    engine = ValidationEngine(model, X_train, X_test, y_train, y_test, config, cleaned_df, ctx=ctx)
    results = engine.run_all_checks()

    # Meta block
    now = datetime.now(tzlocal.get_localzone())
    results.update(
        {
            "validation_date": now.strftime("%Y-%m-%d %H:%M:%S %Z (UTC%z)"),
            "model_path": model_label,
            "validated_by": "TanML Automated Validator",
            "rules": config.get("rules", {}),
        }
    )

    # Report output
    report_path = Path(output_cfg.get("report_path", "report.docx")).expanduser()
    report_path.parent.mkdir(parents=True, exist_ok=True)

    template_path = _resolve_template_path(output_cfg.get("template_path"))
    ReportBuilder(results, template_path=template_path, output_path=report_path).build()

    print(f"📄 Report saved: {report_path}")
    print("✅ All validations completed.")
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
if __name__ == "__main__":
|
|
208
|
-
validate_from_args()
|
|
209
|
-
|
tanml-0.1.6.dist-info/METADATA
DELETED
|
@@ -1,317 +0,0 @@
|
|
|
1
|
-
Metadata-Version: 2.4
|
|
2
|
-
Name: tanml
|
|
3
|
-
Version: 0.1.6
|
|
4
|
-
Summary: Automated validation toolkit for tabular ML models in finance and regulated domains.
|
|
5
|
-
Author: Dolly Sah
|
|
6
|
-
Author-email: Tanmay Sah <tradertanmay@gmail.com>
|
|
7
|
-
License: MIT
|
|
8
|
-
Project-URL: Homepage, https://github.com/tdlabs-ai/tanml
|
|
9
|
-
Requires-Python: >=3.8
|
|
10
|
-
Description-Content-Type: text/markdown
|
|
11
|
-
License-File: LICENSE
|
|
12
|
-
Requires-Dist: scikit-learn
|
|
13
|
-
Requires-Dist: pandas
|
|
14
|
-
Requires-Dist: matplotlib
|
|
15
|
-
Requires-Dist: seaborn
|
|
16
|
-
Requires-Dist: shap
|
|
17
|
-
Requires-Dist: docxtpl
|
|
18
|
-
Requires-Dist: python-docx
|
|
19
|
-
Requires-Dist: docxcompose
|
|
20
|
-
Requires-Dist: PyYAML
|
|
21
|
-
Requires-Dist: scipy
|
|
22
|
-
Requires-Dist: statsmodels
|
|
23
|
-
Requires-Dist: joblib
|
|
24
|
-
Requires-Dist: tzlocal
|
|
25
|
-
Requires-Dist: tqdm
|
|
26
|
-
Requires-Dist: imgkit
|
|
27
|
-
Requires-Dist: xgboost
|
|
28
|
-
Provides-Extra: dev
|
|
29
|
-
Requires-Dist: pytest; extra == "dev"
|
|
30
|
-
Requires-Dist: black; extra == "dev"
|
|
31
|
-
Requires-Dist: isort; extra == "dev"
|
|
32
|
-
Dynamic: license-file
|
|
33
|
-
|
|
34
|
-
# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
|
|
35
|
-
[TanML on GitHub](https://github.com/tdlabs-ai/tanml)
|
|
36
|
-
|
|
37
|
-
TanML is a modular, automated model validation toolkit for tabular machine learning workflows. It supports end-to-end validation with just a YAML configuration and a single command, performing checks across data quality, robustness, explainability, and model performance.
|
|
38
|
-
|
|
39
|
-
TanML generates structured Word (DOCX) reports suitable for internal model reviews, audit documentation, and stakeholder presentations. It is designed for general-purpose ML validation and works well in domains where interpretability, fairness, and reliability are critical.
|
|
40
|
-
|
|
41
|
-
While TanML currently operates as a command-line toolkit, its architecture is designed to evolve into an intelligent validation agent—capable of integrating with AutoML pipelines, CI/CD workflows, and human-in-the-loop validation systems.
|
|
42
|
-
|
|
43
|
-
## Key Features
|
|
44
|
-
|
|
45
|
-
* One-command validation using CLI and YAML
|
|
46
|
-
* Supports models developed in Python (scikit-learn, XGBoost), R, or SAS
|
|
47
|
-
* Scenario-based flexibility:
|
|
48
|
-
|
|
49
|
-
* Scenario A: Single model and dataset
|
|
50
|
-
* Scenario B: One model per segment with separate datasets
|
|
51
|
-
* Scenario C: One model and dataset with an internal segmentation column
|
|
52
|
-
* Comprehensive validation checks:
|
|
53
|
-
|
|
54
|
-
* Model performance (e.g., accuracy, AUC, KS)
|
|
55
|
-
* Data quality diagnostics
|
|
56
|
-
* Stress testing for input robustness
|
|
57
|
-
* SHAP-based explainability
|
|
58
|
-
* Segment-wise validation
|
|
59
|
-
* Logistic regression coefficient summaries (if applicable)
|
|
60
|
-
* VIF, correlation, and multicollinearity checks
|
|
61
|
-
* EDA summaries
|
|
62
|
-
* Rule-based threshold validation using YAML configuration
|
|
63
|
-
* Professional report generation in Word (DOCX) format
|
|
64
|
-
* Easily extensible architecture to add custom checks or outputs
|
|
65
|
-
|
|
66
|
-
## Installation
|
|
67
|
-
|
|
68
|
-
TanML can be installed directly from PyPI using pip:
|
|
69
|
-
|
|
70
|
-
```bash
|
|
71
|
-
pip install tanml
|
|
72
|
-
```
|
|
73
|
-
|
|
74
|
-
To upgrade to the latest version:
|
|
75
|
-
|
|
76
|
-
```bash
|
|
77
|
-
pip install --upgrade tanml
|
|
78
|
-
```
|
|
79
|
-
|
|
80
|
-
After installation, you can verify the CLI is working by running:
|
|
81
|
-
|
|
82
|
-
```bash
|
|
83
|
-
tanml --help
|
|
84
|
-
```
|
|
85
|
-
|
|
86
|
-
This will display the list of available commands and options.
|
|
87
|
-
|
|
88
|
-
TanML supports Python 3.8 and above. It is recommended to use a virtual environment for clean dependency management:
|
|
89
|
-
|
|
90
|
-
```bash
|
|
91
|
-
python3 -m venv .venv
|
|
92
|
-
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
93
|
-
```
|
|
94
|
-
|
|
95
|
-
## Folder Structure
|
|
96
|
-
|
|
97
|
-
A typical TanML project is organized as follows:
|
|
98
|
-
|
|
99
|
-
```
|
|
100
|
-
TanML/
|
|
101
|
-
├── models/ # Trained model files (e.g., .pkl, .sas7bdat)
|
|
102
|
-
├── data/
|
|
103
|
-
│ ├── raw/ # Raw data files (optional)
|
|
104
|
-
│ └── cleaned/ # Cleaned datasets for validation
|
|
105
|
-
├── examples/
|
|
106
|
-
│ ├── scenario_a/ # Example for single model and dataset
|
|
107
|
-
│ ├── scenario_b/ # One model per segment
|
|
108
|
-
│ └── scenario_c/ # Single model with segmentation column
|
|
109
|
-
├── reports/
|
|
110
|
-
│ ├── images/ # SHAP plots, cluster visualizations
|
|
111
|
-
│ ├── clusters/ # Cluster summary CSVs
|
|
112
|
-
│ └── *.docx # Final validation reports
|
|
113
|
-
├── tanml/ # Core source code
|
|
114
|
-
│ ├── cli/ # CLI logic
|
|
115
|
-
│ ├── engine/ # Orchestration and segmentation logic
|
|
116
|
-
│ ├── checks/ # Validation check classes
|
|
117
|
-
│ ├── check_runners/ # Check runners linked to the engine
|
|
118
|
-
│ ├── report/ # Report generation using docxtpl
|
|
119
|
-
│ └── utils/ # Data/model loaders and helpers
|
|
120
|
-
├── tests/ # Unit tests (WIP or planned)
|
|
121
|
-
├── setup.py # Package installer for pip
|
|
122
|
-
├── pyproject.toml # Modern Python build metadata (PEP 518)
|
|
123
|
-
├── MANIFEST.in # Files to include when packaging
|
|
124
|
-
└── README.md # This documentation
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
## How to Use TanML
|
|
128
|
-
|
|
129
|
-
To use TanML effectively, follow these three main steps. This process is designed to be simple and intuitive, even for users without a programming background.
|
|
130
|
-
|
|
131
|
-
1. **Initialize the Configuration File:**
|
|
132
|
-
Start by generating a YAML configuration file that tells TanML how to run the validation. You do this using a command-line instruction where you select one of the three supported validation scenarios:
|
|
133
|
-
|
|
134
|
-
* Scenario A: For validating one model using a single dataset
|
|
135
|
-
* Scenario B: For validating multiple models, each with its own segment-specific dataset
|
|
136
|
-
* Scenario C: For validating a single model that needs to be run separately on different segments within one dataset
|
|
137
|
-
|
|
138
|
-
You can also choose where the generated YAML file should be saved. If you don’t provide a location, it will be saved in your current directory with a default name.
|
|
139
|
-
|
|
140
|
-
2. **Fill in the Configuration File:**
|
|
141
|
-
Once the YAML file is created, you’ll open it and fill in the necessary details:
|
|
142
|
-
|
|
143
|
-
* Where your model file is located (e.g., a `.pkl` file for Python, or equivalent for SAS or R)
|
|
144
|
-
* Where your cleaned dataset is saved
|
|
145
|
-
* (Optional) Where your raw dataset is saved, if you want data quality comparisons
|
|
146
|
-
* Which input features your model expects, and what the target column is
|
|
147
|
-
* Where you want the final report to be saved (Word `.docx` format)
|
|
148
|
-
|
|
149
|
-
This YAML file acts like a blueprint—TanML reads it and follows the instructions you provide to run all relevant validation checks.
|
|
150
|
-
|
|
151
|
-
3. **Run the Validation Process:**
|
|
152
|
-
After the YAML file is completed, you will run the validation process. This will trigger TanML to:
|
|
153
|
-
|
|
154
|
-
* Load your model and data
|
|
155
|
-
* Perform all configured validation checks (like performance, SHAP explainability, stress testing, etc.)
|
|
156
|
-
* Automatically generate a professional report with summaries, tables, and visuals
|
|
157
|
-
|
|
158
|
-
You can use TanML either through simple command-line instructions or directly in Python by calling its functions. Both methods achieve the same results. The command-line approach is ideal for repeatable, scriptable workflows, while the Python interface is useful for advanced users who want to integrate TanML into larger systems or notebooks.
|
|
159
|
-
|
|
160
|
-
TanML is controlled entirely through a YAML configuration file and a single CLI command. The configuration specifies the model, data, validation rules, and output paths.
|
|
161
|
-
|
|
162
|
-
### Scenario A: Single Model, Single Dataset
|
|
163
|
-
|
|
164
|
-
This is the simplest usage mode. You have one model file and one cleaned dataset.
|
|
165
|
-
|
|
166
|
-
**rules.yaml**
|
|
167
|
-
|
|
168
|
-
```yaml
|
|
169
|
-
model:
|
|
170
|
-
features:
|
|
171
|
-
- age
|
|
172
|
-
- income
|
|
173
|
-
- debt_to_income
|
|
174
|
-
- credit_score
|
|
175
|
-
- employment_length
|
|
176
|
-
target: default_flag
|
|
177
|
-
|
|
178
|
-
paths:
|
|
179
|
-
model: models/model.pkl
|
|
180
|
-
cleaned_data: data/cleaned/cleaned.csv
|
|
181
|
-
raw_data: data/raw/raw.csv
|
|
182
|
-
|
|
183
|
-
output:
|
|
184
|
-
report_path: reports/validation_report.docx
|
|
185
|
-
```
|
|
186
|
-
|
|
187
|
-
**Run the validation:**
|
|
188
|
-
|
|
189
|
-
```bash
|
|
190
|
-
tanml validate --rules rules.yaml
|
|
191
|
-
```
|
|
192
|
-
|
|
193
|
-
This generates a `.docx` report along with SHAP plots and other artifacts.
|
|
194
|
-
|
|
195
|
-
### Scenario B: One Model per Segment
|
|
196
|
-
|
|
197
|
-
Use this if you have multiple segments, each with its own cleaned dataset and model.
|
|
198
|
-
|
|
199
|
-
**rules.yaml**
|
|
200
|
-
|
|
201
|
-
```yaml
|
|
202
|
-
segment:
|
|
203
|
-
runs:
|
|
204
|
-
segment_A:
|
|
205
|
-
model: models/model_segment_A.pkl
|
|
206
|
-
cleaned: data/cleaned/segment_A.csv
|
|
207
|
-
raw: data/raw/segment_A.csv
|
|
208
|
-
output_report: reports/report_segment_A.docx
|
|
209
|
-
|
|
210
|
-
segment_B:
|
|
211
|
-
model: models/model_segment_B.pkl
|
|
212
|
-
cleaned: data/cleaned/segment_B.csv
|
|
213
|
-
raw: data/raw/segment_B.csv
|
|
214
|
-
output_report: reports/report_segment_B.docx
|
|
215
|
-
|
|
216
|
-
model:
|
|
217
|
-
features:
|
|
218
|
-
- age
|
|
219
|
-
- income
|
|
220
|
-
- debt_to_income
|
|
221
|
-
- credit_score
|
|
222
|
-
- employment_length
|
|
223
|
-
target: default_flag
|
|
224
|
-
```
|
|
225
|
-
|
|
226
|
-
**Run the validation:**
|
|
227
|
-
|
|
228
|
-
```bash
|
|
229
|
-
tanml validate --rules rules.yaml
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
Each segment will be validated independently with its own output report.
|
|
233
|
-
|
|
234
|
-
### Scenario C: One Model with Segmentation Column
|
|
235
|
-
|
|
236
|
-
Use this if you have one dataset with a segmentation column (e.g., region, product type).
|
|
237
|
-
|
|
238
|
-
**rules.yaml**
|
|
239
|
-
|
|
240
|
-
```yaml
|
|
241
|
-
segment:
|
|
242
|
-
column: segment_id # This column will be used to split the data automatically
|
|
243
|
-
|
|
244
|
-
model:
|
|
245
|
-
features:
|
|
246
|
-
- age
|
|
247
|
-
- income
|
|
248
|
-
- debt_to_income
|
|
249
|
-
- credit_score
|
|
250
|
-
- employment_length
|
|
251
|
-
target: default_flag
|
|
252
|
-
|
|
253
|
-
paths:
|
|
254
|
-
model: models/credit_model.pkl
|
|
255
|
-
cleaned_data: data/cleaned/combined.csv
|
|
256
|
-
raw_data: data/raw/combined.csv
|
|
257
|
-
|
|
258
|
-
output:
|
|
259
|
-
report_path: reports/report_{segment}.docx
|
|
260
|
-
```
|
|
261
|
-
|
|
262
|
-
**Run the validation:**
|
|
263
|
-
|
|
264
|
-
```bash
|
|
265
|
-
tanml validate --rules rules.yaml
|
|
266
|
-
```
|
|
267
|
-
|
|
268
|
-
This will automatically split the dataset by the `segment_id` column, apply the same model to each subset, and produce one report per segment.
|
|
269
|
-
|
|
270
|
-
## Output Artifacts
|
|
271
|
-
|
|
272
|
-
After a successful validation run, TanML generates a set of output files based on your configuration:
|
|
273
|
-
|
|
274
|
-
- **Validation Report (.docx):**
|
|
275
|
-
A professionally formatted Word document containing:
|
|
276
|
-
- A summary of all validation checks
|
|
277
|
-
- Tables for performance metrics, data quality, logistic regression coefficients, VIF, and more
|
|
278
|
-
- SHAP-based feature importance visualizations (if enabled)
|
|
279
|
-
- Segment-wise validation summaries (for Scenario B and C)
|
|
280
|
-
|
|
281
|
-
- **SHAP Visualizations:**
|
|
282
|
-
Summary bar plots and other SHAP outputs are saved in the `reports/images/` folder.
|
|
283
|
-
|
|
284
|
-
- **Input Cluster Coverage Charts and CSVs:**
|
|
285
|
-
If cluster coverage analysis is enabled, visual and tabular summaries are stored in:
|
|
286
|
-
- `reports/images/` (cluster bar plots)
|
|
287
|
-
- `reports/clusters/` (CSV summaries)
|
|
288
|
-
|
|
289
|
-
- **Logs and Intermediate Results:**
|
|
290
|
-
Optional debug or intermediate outputs (e.g., cleaned data snapshots or rule validation results) can be generated depending on configuration or verbosity level.
|
|
291
|
-
|
|
292
|
-
By default, all outputs are saved to the paths you define in the YAML configuration. Each segment (in Scenario B or C) will generate its own report file if multiple runs are triggered.
|
|
293
|
-
|
|
294
|
-
## Extending TanML
|
|
295
|
-
|
|
296
|
-
TanML is designed with modularity in mind. Advanced users and developers can easily extend its capabilities by adding new validation checks or modifying report components. Here's how extension works:
|
|
297
|
-
|
|
298
|
-
1. **Add a Custom Check:** Create a new check class in the `tanml/checks/` directory and implement the validation logic based on the `BaseCheck` interface.
|
|
299
|
-
|
|
300
|
-
2. **Create a Check Runner:** Add a corresponding runner in `tanml/check_runners/`. This file controls how the check is executed and connected to the engine.
|
|
301
|
-
|
|
302
|
-
3. **Register the Check:** Link your runner in the central registry inside the validation engine. The YAML config will trigger it automatically based on user-defined rules.
|
|
303
|
-
|
|
304
|
-
This modular structure ensures that domain-specific validations (e.g., industry regulations, fairness audits) can be added without modifying core logic.
|
|
305
|
-
|
|
306
|
-
## License and Citation
|
|
307
|
-
|
|
308
|
-
TanML is open-source under the MIT License.
|
|
309
|
-
Copyright © 2025 Tanmay Sah and Dolly Sah.
|
|
310
|
-
|
|
311
|
-
You are free to use, modify, and distribute this software with appropriate attribution.
|
|
312
|
-
|
|
313
|
-
If you use TanML in your work or publication, please cite it as:
|
|
314
|
-
|
|
315
|
-
> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning*. GitHub. https://github.com/tdlabs-ai/tanml
|
|
316
|
-
|
|
317
|
-
📄 A machine-readable citation file (`CITATION.cff`) is included for use with citation tools and GitHub's “Cite this repository” button.
|
tanml-0.1.6.dist-info/RECORD
DELETED
|
@@ -1,62 +0,0 @@
|
|
|
1
|
-
tanml/__init__.py,sha256=5xvN_gb61nKeq5TER5dSfcArTP3DVasZGN_MQq5dNpA,23
|
|
2
|
-
tanml/validate.py,sha256=GgLPaCHXk126hb-En_caQzDW0AViqh5QU0JBJP1cILo,8502
|
|
3
|
-
tanml/check_runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
4
|
-
tanml/check_runners/base_runner.py,sha256=jibUcl6a7SQH9EKnPFZhU1t9FY-Bu_nkPjRfjW2T5x0,210
|
|
5
|
-
tanml/check_runners/cleaning_repro_runner.py,sha256=nr3HlA1rxy6qv3XqBgK1gf6euz6zRLdsTf4tB41Nk0k,921
|
|
6
|
-
tanml/check_runners/correlation_runner.py,sha256=rhcOoZkbALZMGFax-wFMRCJ8JgmVa3k4uui4DSHdpfs,597
|
|
7
|
-
tanml/check_runners/data_quality_runner.py,sha256=IdhWYOtDBPkAwT2Aa6SYSiI2gkLEhkBXHAQop5ZFg0I,883
|
|
8
|
-
tanml/check_runners/eda_runner.py,sha256=Gr5ZmgOvUilej-rhinsL9KmsIQOMWjd6jdOYUQTJ4os,623
|
|
9
|
-
tanml/check_runners/explainability_runner.py,sha256=okA6zfybTeDRiaNeoR8QyR00mqvyRX28wNylR_R-N_g,840
|
|
10
|
-
tanml/check_runners/input_cluster_runner.py,sha256=0O8JJq3HOvNd3_nrZY8FevVYimI_Nl2Iftyyx56LVzc,1251
|
|
11
|
-
tanml/check_runners/logistic_stats_runner.py,sha256=nst-kW-uDLbdXYG9bBzhdDAHd2MX4TnNv94EF9-Jigg,1181
|
|
12
|
-
tanml/check_runners/model_meta_runner.py,sha256=hOIEp6k98o0FwYjgmi_53Ni7H14jerKdf46wpxVkAkk,799
|
|
13
|
-
tanml/check_runners/performance_runner.py,sha256=Dqhm1kNwKdbMc_GBr-yivQGX3nxa__fW9XmXZnnSvXM,1150
|
|
14
|
-
tanml/check_runners/raw_data_runner.py,sha256=XhroQoyiJb2B8URO1pMgI6H0RuwVqKzzPJGjzu5NbZE,1430
|
|
15
|
-
tanml/check_runners/rule_engine_runner.py,sha256=xgij_9S8kUSESgXeWGN7FnOD0hNmd3GAregxuD_x9nI,286
|
|
16
|
-
tanml/check_runners/stress_test_runner.py,sha256=xw3Q8Da_U91MQE9sixUn7RJu-y4IUu2lUQYyGzymOgc,1109
|
|
17
|
-
tanml/check_runners/vif_runner.py,sha256=0CeR5nl_JRVRoSProuJitbzRxw7AFNXt0FtnhBe2aP0,1893
|
|
18
|
-
tanml/checks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
19
|
-
tanml/checks/base.py,sha256=g1GLXHWkE848bDKb4zXLHRqfgucVWFa-n6ZDT5vzs2I,570
|
|
20
|
-
tanml/checks/cleaning_repro.py,sha256=LtPEkikjTY0NMsv1FZ8tOPUkaqF3yK_-DQdLCltMsIA,1795
|
|
21
|
-
tanml/checks/correlation.py,sha256=WT83wAclH73ud8JzA5HRe8VfWd1y3dfPR7VPkDutMqw,2135
|
|
22
|
-
tanml/checks/data_quality.py,sha256=bNrM469c_G-TxU-6ne2UYv5gwoUoyrab4lYbI16l7VU,981
|
|
23
|
-
tanml/checks/eda.py,sha256=7fOIBhCg0oiK4SL22WcKHK8vL3s-YwypiAnZDwoq3lo,2515
|
|
24
|
-
tanml/checks/input_cluster.py,sha256=93RvE-vd_elt4xqnkCkHmHH6NWPOtJF7ylb5mPTsagA,3990
|
|
25
|
-
tanml/checks/logit_stats.py,sha256=0BuuKYG8L-A0CPpq6hiL-uSporUnzyuElSgdYmvUeFo,1850
|
|
26
|
-
tanml/checks/model_contents.py,sha256=jcS4GrUoaQVUBf9pFwvtv9KKvDO5vZroiPz8mOmyAGo,1263
|
|
27
|
-
tanml/checks/model_meta.py,sha256=x2RnT3kUWY4ERWfM9cItpaIJX8Fca9IQByR9zPN2V20,1881
|
|
28
|
-
tanml/checks/performance.py,sha256=vQ_FeIcr3ASDhVd97y2JKMEMfJUrfCRnUBZh1ZI6rCs,3400
|
|
29
|
-
tanml/checks/raw_data.py,sha256=si67YqKVd6v6dzTeUK6ZLD3KSzRpO6ohODPNv7L3zio,1785
|
|
30
|
-
tanml/checks/rule_engine.py,sha256=bqGd0-2MlSPGGUFPyMxgHiwVTvw1Iu7NFqUzl_-0iIg,1694
|
|
31
|
-
tanml/checks/stress_test.py,sha256=ILpgbi-qWvHwbX7aJL-3Xsj-gHxWyaw64tGK7fLiiCo,2438
|
|
32
|
-
tanml/checks/vif.py,sha256=0tghDaiG8z4frgP5OHj3ctgsH4Mak1N7Docu6OcG8JA,1902
|
|
33
|
-
tanml/checks/explainability/shap_check.py,sha256=LIRcFvsYJl37I5Rr6WoNDEIUGL3mhtj4CcqBfj5XvZA,2091
|
|
34
|
-
tanml/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
35
|
-
tanml/cli/arg_parser.py,sha256=m13PWV2DmO3Hl-sGOt-VCqJ3cFocrLhWQT4MaDmjl7E,1674
|
|
36
|
-
tanml/cli/init_cmd.py,sha256=Coj681Zp3CAsTL-wXGM8QVop7me-vV3l4OBA74XjETQ,308
|
|
37
|
-
tanml/cli/main.py,sha256=Z288eLioZdT0k2ApLP-0m0xnSHI446mW823bS3YnQeI,1195
|
|
38
|
-
tanml/cli/validate_cmd.py,sha256=Civ6ELN7jO65TUeXsI9TX41iwoelqYOtWvHsX8CJn3g,218
|
|
39
|
-
tanml/config_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
40
|
-
tanml/config_templates/rules_multiple_models_datasets.yaml,sha256=_swwZsKV5TMGYy2IxHU_d381rqcGU12tZg59yl8NKuk,4872
|
|
41
|
-
tanml/config_templates/rules_one_dataset_segment_column.yaml,sha256=jmUA5TGuSDt7o6WRTNFA9Nv4ObOnDywBgI4jNHYzKpI,4416
|
|
42
|
-
tanml/config_templates/rules_one_model_one_dataset.yaml,sha256=BHggGZnMeqWR4aKisgD8ghH5fZORCRVVD_KP6GpoE-g,4452
|
|
43
|
-
tanml/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
44
|
-
tanml/engine/check_agent_registry.py,sha256=UzJZhnAkMh4s-nYUMFPSspZZav8AHwmLuYveVIdlUcQ,2120
|
|
45
|
-
tanml/engine/core_engine_agent.py,sha256=fvWZa2XjadcKhhvQiGuOlb5x3KQFwFyH6bjJfqz91wk,3653
|
|
46
|
-
tanml/engine/segmentation_agent.py,sha256=ETCGvQ6mUf4qmtfdCrQlO9nGqpIxApfod3tObdDHlTY,4904
|
|
47
|
-
tanml/engine/validation_agent.py,sha256=5js9NDIx7mItXA7SYWr85Oe5wwmDyePAqkB6i49imKk,3393
|
|
48
|
-
tanml/report/report_builder.py,sha256=SIXI6pFDXLRa3DlrA4uM6s-5Do33iTj85Yk6biX4W9A,8538
|
|
49
|
-
tanml/report/templates/report_template.docx,sha256=Pq5QNSclws5PNaGntul4Y4JwWf08MjNBno-UN__XWYY,27170
|
|
50
|
-
tanml/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
51
|
-
tanml/utils/data_loader.py,sha256=pYOcp8bmA9mq9FBxk_0-O5z8GtnxR24ooDCRWUYv83s,545
|
|
52
|
-
tanml/utils/model_loader.py,sha256=bcYyaLfEkgpfCw_BCUqRifZxckMRnR-9O32F_EnCYWg,1121
|
|
53
|
-
tanml/utils/r_loader.py,sha256=88k7deZWvO68zYGK8G8rFHfc_VoVK2h8ce55pD8lw3U,964
|
|
54
|
-
tanml/utils/sas_loader.py,sha256=8-hz2jlS71hbiacP99mnG7Ip5zV0f09htPl1-h34Qvw,1673
|
|
55
|
-
tanml/utils/yaml_generator.py,sha256=sBumacoUAZFoxG9Enr7mljgCSzp9RAbszsVYlDATz4c,1130
|
|
56
|
-
tanml/utils/yaml_loader.py,sha256=QlrwEm685wDiQjkR-BYQ45HIyuQS6_H6ajadAp1nsrg,110
|
|
57
|
-
tanml-0.1.6.dist-info/licenses/LICENSE,sha256=e6xQyG7SdWiD4cLlj7rFdMrjG6H7ABglGOrooZxWLKQ,1102
|
|
58
|
-
tanml-0.1.6.dist-info/METADATA,sha256=655Z4HIVXKeMgjfqESMcv9XSngupFKKg-FrRFk77Iw4,11973
|
|
59
|
-
tanml-0.1.6.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
|
|
60
|
-
tanml-0.1.6.dist-info/entry_points.txt,sha256=WUM_y0uRIL7iXPcxK69Bn5mKZXnDshWDbLyohjND1IE,46
|
|
61
|
-
tanml-0.1.6.dist-info/top_level.txt,sha256=81dIhCm6opwY6E7Pb9G1kdIVmYrUkXX4PaYhQ873gIE,6
|
|
62
|
-
tanml-0.1.6.dist-info/RECORD,,
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|