tanml 0.1.6__tar.gz → 0.1.7__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of tanml might be problematic. Click here for more details.
- {tanml-0.1.6 → tanml-0.1.7}/MANIFEST.in +0 -2
- tanml-0.1.7/PKG-INFO +164 -0
- tanml-0.1.7/README-pypi.md +107 -0
- tanml-0.1.7/README.md +148 -0
- tanml-0.1.7/pyproject.toml +112 -0
- tanml-0.1.7/tanml/__init__.py +1 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/cleaning_repro_runner.py +2 -2
- tanml-0.1.7/tanml/check_runners/correlation_runner.py +52 -0
- tanml-0.1.7/tanml/check_runners/explainability_runner.py +18 -0
- tanml-0.1.7/tanml/check_runners/logistic_stats_runner.py +207 -0
- tanml-0.1.7/tanml/check_runners/performance_runner.py +84 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/raw_data_runner.py +29 -14
- tanml-0.1.7/tanml/check_runners/regression_metrics_runner.py +195 -0
- tanml-0.1.7/tanml/check_runners/stress_test_runner.py +43 -0
- tanml-0.1.7/tanml/check_runners/vif_runner.py +60 -0
- tanml-0.1.7/tanml/checks/correlation.py +261 -0
- tanml-0.1.7/tanml/checks/explainability/shap_check.py +287 -0
- tanml-0.1.7/tanml/checks/logit_stats.py +191 -0
- tanml-0.1.7/tanml/checks/performance_classification.py +305 -0
- tanml-0.1.7/tanml/checks/raw_data.py +82 -0
- tanml-0.1.7/tanml/checks/regression_metrics.py +167 -0
- tanml-0.1.7/tanml/checks/stress_test.py +168 -0
- tanml-0.1.7/tanml/cli/main.py +99 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/engine/check_agent_registry.py +20 -10
- tanml-0.1.7/tanml/engine/core_engine_agent.py +277 -0
- tanml-0.1.7/tanml/models/registry.py +329 -0
- tanml-0.1.7/tanml/report/report_builder.py +1263 -0
- tanml-0.1.7/tanml/report/templates/report_template_cls.docx +0 -0
- tanml-0.1.7/tanml/report/templates/report_template_reg.docx +0 -0
- tanml-0.1.7/tanml/ui/app.py +1205 -0
- tanml-0.1.7/tanml/utils/data_loader.py +107 -0
- tanml-0.1.7/tanml.egg-info/PKG-INFO +164 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml.egg-info/SOURCES.txt +9 -16
- tanml-0.1.7/tanml.egg-info/requires.txt +25 -0
- tanml-0.1.6/PKG-INFO +0 -317
- tanml-0.1.6/README.md +0 -284
- tanml-0.1.6/pyproject.toml +0 -54
- tanml-0.1.6/tanml/__init__.py +0 -1
- tanml-0.1.6/tanml/check_runners/correlation_runner.py +0 -15
- tanml-0.1.6/tanml/check_runners/explainability_runner.py +0 -28
- tanml-0.1.6/tanml/check_runners/logistic_stats_runner.py +0 -28
- tanml-0.1.6/tanml/check_runners/performance_runner.py +0 -28
- tanml-0.1.6/tanml/check_runners/stress_test_runner.py +0 -26
- tanml-0.1.6/tanml/check_runners/vif_runner.py +0 -54
- tanml-0.1.6/tanml/checks/correlation.py +0 -61
- tanml-0.1.6/tanml/checks/explainability/shap_check.py +0 -55
- tanml-0.1.6/tanml/checks/logit_stats.py +0 -59
- tanml-0.1.6/tanml/checks/raw_data.py +0 -47
- tanml-0.1.6/tanml/checks/stress_test.py +0 -64
- tanml-0.1.6/tanml/cli/arg_parser.py +0 -31
- tanml-0.1.6/tanml/cli/init_cmd.py +0 -8
- tanml-0.1.6/tanml/cli/main.py +0 -27
- tanml-0.1.6/tanml/cli/validate_cmd.py +0 -7
- tanml-0.1.6/tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
- tanml-0.1.6/tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
- tanml-0.1.6/tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
- tanml-0.1.6/tanml/engine/core_engine_agent.py +0 -115
- tanml-0.1.6/tanml/engine/segmentation_agent.py +0 -118
- tanml-0.1.6/tanml/engine/validation_agent.py +0 -91
- tanml-0.1.6/tanml/report/report_builder.py +0 -230
- tanml-0.1.6/tanml/report/templates/report_template.docx +0 -0
- tanml-0.1.6/tanml/utils/data_loader.py +0 -17
- tanml-0.1.6/tanml/utils/model_loader.py +0 -35
- tanml-0.1.6/tanml/utils/r_loader.py +0 -30
- tanml-0.1.6/tanml/utils/sas_loader.py +0 -50
- tanml-0.1.6/tanml/utils/yaml_generator.py +0 -34
- tanml-0.1.6/tanml/utils/yaml_loader.py +0 -5
- tanml-0.1.6/tanml/validate.py +0 -209
- tanml-0.1.6/tanml.egg-info/PKG-INFO +0 -317
- tanml-0.1.6/tanml.egg-info/requires.txt +0 -21
- {tanml-0.1.6 → tanml-0.1.7}/LICENSE +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/setup.cfg +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/base_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/data_quality_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/eda_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/input_cluster_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/model_meta_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/check_runners/rule_engine_runner.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/base.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/cleaning_repro.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/data_quality.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/eda.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/input_cluster.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/model_contents.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/model_meta.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/performance.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/rule_engine.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/checks/vif.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/cli/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/config_templates/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/engine/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml/utils/__init__.py +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml.egg-info/dependency_links.txt +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml.egg-info/entry_points.txt +0 -0
- {tanml-0.1.6 → tanml-0.1.7}/tanml.egg-info/top_level.txt +0 -0
tanml-0.1.7/PKG-INFO
ADDED
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: tanml
|
|
3
|
+
Version: 0.1.7
|
|
4
|
+
Summary: Automated validation toolkit for tabular ML models—MRM, credit risk, insurance, and other regulated use cases.
|
|
5
|
+
Author: Tanmay Sah, Dolly Sah
|
|
6
|
+
Maintainer: Tanmay Sah, Dolly Sah
|
|
7
|
+
License-Expression: MIT
|
|
8
|
+
Project-URL: Homepage, https://github.com/tdlabs-ai/tanml
|
|
9
|
+
Project-URL: Source, https://github.com/tdlabs-ai/tanml
|
|
10
|
+
Project-URL: Issues, https://github.com/tdlabs-ai/tanml/issues
|
|
11
|
+
Project-URL: Documentation, https://github.com/tdlabs-ai/tanml#readme
|
|
12
|
+
Keywords: model validation,model risk management,model governance,SR 11-7,tabular ML,credit risk,insurance analytics,explainability,XAI,SHAP,stress testing,reporting,docx,streamlit,xgboost,lightgbm,catboost
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Science/Research
|
|
15
|
+
Classifier: Intended Audience :: Financial and Insurance Industry
|
|
16
|
+
Classifier: Natural Language :: English
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python
|
|
19
|
+
Classifier: Programming Language :: Python :: 3
|
|
20
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
24
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
25
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
26
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
27
|
+
Classifier: Topic :: Scientific/Engineering :: Information Analysis
|
|
28
|
+
Classifier: Topic :: Scientific/Engineering :: Visualization
|
|
29
|
+
Requires-Python: >=3.8
|
|
30
|
+
Description-Content-Type: text/markdown
|
|
31
|
+
License-File: LICENSE
|
|
32
|
+
Requires-Dist: numpy>=1.26
|
|
33
|
+
Requires-Dist: scipy>=1.11
|
|
34
|
+
Requires-Dist: pandas>=2.0
|
|
35
|
+
Requires-Dist: scikit-learn>=1.3
|
|
36
|
+
Requires-Dist: statsmodels>=0.14
|
|
37
|
+
Requires-Dist: xgboost>=2.0
|
|
38
|
+
Requires-Dist: lightgbm>=4.3
|
|
39
|
+
Requires-Dist: catboost>=1.2
|
|
40
|
+
Requires-Dist: shap>=0.44
|
|
41
|
+
Requires-Dist: numba>=0.58
|
|
42
|
+
Requires-Dist: matplotlib>=3.8
|
|
43
|
+
Requires-Dist: seaborn>=0.13
|
|
44
|
+
Requires-Dist: Pillow>=10.0
|
|
45
|
+
Requires-Dist: python-docx>=1.1.2
|
|
46
|
+
Requires-Dist: tzlocal>=5.0
|
|
47
|
+
Requires-Dist: tqdm>=4.66
|
|
48
|
+
Requires-Dist: pyarrow>=14.0
|
|
49
|
+
Requires-Dist: openpyxl>=3.1
|
|
50
|
+
Requires-Dist: pyreadstat>=1.2
|
|
51
|
+
Requires-Dist: streamlit>=1.36
|
|
52
|
+
Provides-Extra: dev
|
|
53
|
+
Requires-Dist: pytest; extra == "dev"
|
|
54
|
+
Requires-Dist: black; extra == "dev"
|
|
55
|
+
Requires-Dist: isort; extra == "dev"
|
|
56
|
+
Dynamic: license-file
|
|
57
|
+
|
|
58
|
+
# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
|
|
59
|
+
|
|
60
|
+
[Cite this repo](https://github.com/tdlabs-ai/tanml#license--citation)
|
|
61
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
62
|
+
[Downloads](https://pepy.tech/project/tanml)
|
|
63
|
+
|
|
64
|
+
**TanML** validates tabular ML models with a zero-config **Streamlit UI** and exports an audit-ready, **editable Word report (.docx)**. It covers data quality, correlation/VIF, performance, explainability (SHAP), and robustness/stress tests—built for regulated settings (MRM, credit risk, insurance, etc.).
|
|
65
|
+
|
|
66
|
+
* **Status:** Beta (`0.x`)
|
|
67
|
+
* **License:** MIT
|
|
68
|
+
* **Python:** 3.8–3.12
|
|
69
|
+
* **OS:** Linux / macOS / Windows (incl. WSL)
|
|
70
|
+
|
|
71
|
+
---
|
|
72
|
+
|
|
73
|
+
## Why TanML?
|
|
74
|
+
|
|
75
|
+
* **Zero-config UI:** launch Streamlit, upload data, click **Run**—no YAML needed.
|
|
76
|
+
* **Audit-ready outputs:** tables/plots + a polished DOCX your stakeholders can edit.
|
|
77
|
+
* **Regulatory alignment:** supports common Model Risk Management themes (e.g., SR 11-7 style).
|
|
78
|
+
* **Works with your stack:** scikit-learn, XGBoost/LightGBM/CatBoost, etc.
|
|
79
|
+
|
|
80
|
+
---
|
|
81
|
+
|
|
82
|
+
## Install
|
|
83
|
+
|
|
84
|
+
```bash
|
|
85
|
+
pip install tanml
|
|
86
|
+
```
|
|
87
|
+
|
|
88
|
+
## Quick Start (UI)
|
|
89
|
+
|
|
90
|
+
```bash
|
|
91
|
+
tanml ui
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
* Opens at **[http://127.0.0.1:8501](http://127.0.0.1:8501)**
|
|
95
|
+
* **Upload limit ~1 GB** (preconfigured)
|
|
96
|
+
* **Telemetry disabled by default**
|
|
97
|
+
|
|
98
|
+
### In the app
|
|
99
|
+
|
|
100
|
+
1. **Load data** — upload a cleaned CSV/XLSX/Parquet (optional: raw or separate Train/Test).
|
|
101
|
+
2. **Select target & features** — target auto-suggested; features default to all non-target columns.
|
|
102
|
+
3. **Pick a model** — choose library/algorithm (scikit-learn, XGBoost, LightGBM, CatBoost) and tweak params.
|
|
103
|
+
4. **Run validation** — click **▶️ Refit & validate**.
|
|
104
|
+
5. **Export** — click **⬇️ Download report** to get a **DOCX** (auto-selects classification/regression template).
|
|
105
|
+
|
|
106
|
+
**Outputs**
|
|
107
|
+
|
|
108
|
+
* Report: `./.ui_runs/<session>/tanml_report_*.docx`
|
|
109
|
+
* Artifacts (CSV/PNGs): `./.ui_runs/<session>/artifacts/*`
|
|
110
|
+
|
|
111
|
+
---
|
|
112
|
+
|
|
113
|
+
## What TanML Checks
|
|
114
|
+
|
|
115
|
+
* **Raw Data (optional):** rows/cols, missingness, duplicates, constant columns
|
|
116
|
+
* **Data Quality & EDA:** summaries, distributions
|
|
117
|
+
* **Correlation & Multicollinearity:** heatmap, top-pairs CSV, **VIF** table
|
|
118
|
+
* **Performance**
|
|
119
|
+
|
|
120
|
+
* **Classification:** AUC, PR-AUC, KS, decile lift, confusion
|
|
121
|
+
* **Regression:** R², MAE, MSE/RMSE, error stats
|
|
122
|
+
* **Explainability:** SHAP (auto explainer; configurable background size)
|
|
123
|
+
* **Robustness/Stress Tests:** feature perturbations → delta-metrics
|
|
124
|
+
* **Model Metadata:** model class, hyperparameters, features, training info
|
|
125
|
+
|
|
126
|
+
---
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
## Templates
|
|
130
|
+
|
|
131
|
+
TanML ships DOCX templates (packaged in wheel & sdist):
|
|
132
|
+
|
|
133
|
+
* `tanml/report/templates/report_template_cls.docx`
|
|
134
|
+
* `tanml/report/templates/report_template_reg.docx`
|
|
135
|
+
|
|
136
|
+
---
|
|
137
|
+
|
|
138
|
+
|
|
139
|
+
## License & Citation
|
|
140
|
+
|
|
141
|
+
**License:** MIT. See [LICENSE](https://github.com/tdlabs-ai/tanml/blob/main/LICENSE).
|
|
142
|
+
SPDX-License-Identifier: MIT
|
|
143
|
+
|
|
144
|
+
© 2025 Tanmay Sah and Dolly Sah. You may use, modify, and distribute this software with appropriate attribution.
|
|
145
|
+
|
|
146
|
+
### How to cite
|
|
147
|
+
|
|
148
|
+
If TanML helps your work or publications, please cite:
|
|
149
|
+
|
|
150
|
+
> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning* [Software]. Available at https://github.com/tdlabs-ai/tanml
|
|
151
|
+
|
|
152
|
+
Or in BibTeX (version-agnostic):
|
|
153
|
+
|
|
154
|
+
```bibtex
|
|
155
|
+
@misc{tanml,
|
|
156
|
+
author = {Sah, Tanmay and Sah, Dolly},
|
|
157
|
+
title = {TanML: Automated Model Validation Toolkit for Tabular Machine Learning},
|
|
158
|
+
year = {2025},
|
|
159
|
+
note = {Software; MIT License},
|
|
160
|
+
url = {https://github.com/tdlabs-ai/tanml}
|
|
161
|
+
}
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
A machine-readable citation file (`CITATION.cff`) is included for citation tools and GitHub’s “Cite this repository” button.
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
|
|
2
|
+
|
|
3
|
+
[Cite this repo](https://github.com/tdlabs-ai/tanml#license--citation)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
[Downloads](https://pepy.tech/project/tanml)
|
|
6
|
+
|
|
7
|
+
**TanML** validates tabular ML models with a zero-config **Streamlit UI** and exports an audit-ready, **editable Word report (.docx)**. It covers data quality, correlation/VIF, performance, explainability (SHAP), and robustness/stress tests—built for regulated settings (MRM, credit risk, insurance, etc.).
|
|
8
|
+
|
|
9
|
+
* **Status:** Beta (`0.x`)
|
|
10
|
+
* **License:** MIT
|
|
11
|
+
* **Python:** 3.8–3.12
|
|
12
|
+
* **OS:** Linux / macOS / Windows (incl. WSL)
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Why TanML?
|
|
17
|
+
|
|
18
|
+
* **Zero-config UI:** launch Streamlit, upload data, click **Run**—no YAML needed.
|
|
19
|
+
* **Audit-ready outputs:** tables/plots + a polished DOCX your stakeholders can edit.
|
|
20
|
+
* **Regulatory alignment:** supports common Model Risk Management themes (e.g., SR 11-7 style).
|
|
21
|
+
* **Works with your stack:** scikit-learn, XGBoost/LightGBM/CatBoost, etc.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install tanml
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start (UI)
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
tanml ui
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
* Opens at **[http://127.0.0.1:8501](http://127.0.0.1:8501)**
|
|
38
|
+
* **Upload limit ~1 GB** (preconfigured)
|
|
39
|
+
* **Telemetry disabled by default**
|
|
40
|
+
|
|
41
|
+
### In the app
|
|
42
|
+
|
|
43
|
+
1. **Load data** — upload a cleaned CSV/XLSX/Parquet (optional: raw or separate Train/Test).
|
|
44
|
+
2. **Select target & features** — target auto-suggested; features default to all non-target columns.
|
|
45
|
+
3. **Pick a model** — choose library/algorithm (scikit-learn, XGBoost, LightGBM, CatBoost) and tweak params.
|
|
46
|
+
4. **Run validation** — click **▶️ Refit & validate**.
|
|
47
|
+
5. **Export** — click **⬇️ Download report** to get a **DOCX** (auto-selects classification/regression template).
|
|
48
|
+
|
|
49
|
+
**Outputs**
|
|
50
|
+
|
|
51
|
+
* Report: `./.ui_runs/<session>/tanml_report_*.docx`
|
|
52
|
+
* Artifacts (CSV/PNGs): `./.ui_runs/<session>/artifacts/*`
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## What TanML Checks
|
|
57
|
+
|
|
58
|
+
* **Raw Data (optional):** rows/cols, missingness, duplicates, constant columns
|
|
59
|
+
* **Data Quality & EDA:** summaries, distributions
|
|
60
|
+
* **Correlation & Multicollinearity:** heatmap, top-pairs CSV, **VIF** table
|
|
61
|
+
* **Performance**
|
|
62
|
+
|
|
63
|
+
* **Classification:** AUC, PR-AUC, KS, decile lift, confusion
|
|
64
|
+
* **Regression:** R², MAE, MSE/RMSE, error stats
|
|
65
|
+
* **Explainability:** SHAP (auto explainer; configurable background size)
|
|
66
|
+
* **Robustness/Stress Tests:** feature perturbations → delta-metrics
|
|
67
|
+
* **Model Metadata:** model class, hyperparameters, features, training info
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
## Templates
|
|
73
|
+
|
|
74
|
+
TanML ships DOCX templates (packaged in wheel & sdist):
|
|
75
|
+
|
|
76
|
+
* `tanml/report/templates/report_template_cls.docx`
|
|
77
|
+
* `tanml/report/templates/report_template_reg.docx`
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
## License & Citation
|
|
83
|
+
|
|
84
|
+
**License:** MIT. See [LICENSE](https://github.com/tdlabs-ai/tanml/blob/main/LICENSE).
|
|
85
|
+
SPDX-License-Identifier: MIT
|
|
86
|
+
|
|
87
|
+
© 2025 Tanmay Sah and Dolly Sah. You may use, modify, and distribute this software with appropriate attribution.
|
|
88
|
+
|
|
89
|
+
### How to cite
|
|
90
|
+
|
|
91
|
+
If TanML helps your work or publications, please cite:
|
|
92
|
+
|
|
93
|
+
> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning* [Software]. Available at https://github.com/tdlabs-ai/tanml
|
|
94
|
+
|
|
95
|
+
Or in BibTeX (version-agnostic):
|
|
96
|
+
|
|
97
|
+
```bibtex
|
|
98
|
+
@misc{tanml,
|
|
99
|
+
author = {Sah, Tanmay and Sah, Dolly},
|
|
100
|
+
title = {TanML: Automated Model Validation Toolkit for Tabular Machine Learning},
|
|
101
|
+
year = {2025},
|
|
102
|
+
note = {Software; MIT License},
|
|
103
|
+
url = {https://github.com/tdlabs-ai/tanml}
|
|
104
|
+
}
|
|
105
|
+
```
|
|
106
|
+
|
|
107
|
+
A machine-readable citation file (`CITATION.cff`) is included for citation tools and GitHub’s “Cite this repository” button.
|
tanml-0.1.7/README.md
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
1
|
+
# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
|
|
2
|
+
|
|
3
|
+
[Cite this repo](https://github.com/tdlabs-ai/tanml#license--citation)
|
|
4
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
|
5
|
+
[Downloads](https://pepy.tech/project/tanml)
|
|
6
|
+
|
|
7
|
+
**TanML** validates tabular ML models with a zero-config **Streamlit UI** and exports an audit-ready, **editable Word report (.docx)**. It covers data quality, correlation/VIF, performance, explainability (SHAP), and robustness/stress tests—built for regulated settings (MRM, credit risk, insurance, etc.).
|
|
8
|
+
|
|
9
|
+
* **Status:** Beta (`0.x`)
|
|
10
|
+
* **License:** MIT
|
|
11
|
+
* **Python:** 3.8–3.12
|
|
12
|
+
* **OS:** Linux / macOS / Windows (incl. WSL)
|
|
13
|
+
|
|
14
|
+
---
|
|
15
|
+
|
|
16
|
+
## Why TanML?
|
|
17
|
+
|
|
18
|
+
* **Zero-config UI:** launch Streamlit, upload data, click **Run**—no YAML needed.
|
|
19
|
+
* **Audit-ready outputs:** tables/plots + a polished DOCX your stakeholders can edit.
|
|
20
|
+
* **Regulatory alignment:** supports common Model Risk Management themes (e.g., SR 11-7 style).
|
|
21
|
+
* **Works with your stack:** scikit-learn, XGBoost/LightGBM/CatBoost, etc.
|
|
22
|
+
|
|
23
|
+
---
|
|
24
|
+
|
|
25
|
+
## Install
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
pip install tanml
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
## Quick Start (UI)
|
|
32
|
+
|
|
33
|
+
```bash
|
|
34
|
+
tanml ui
|
|
35
|
+
```
|
|
36
|
+
|
|
37
|
+
* Opens at **[http://127.0.0.1:8501](http://127.0.0.1:8501)**
|
|
38
|
+
* **Upload limit ~1 GB** (preconfigured)
|
|
39
|
+
* **Telemetry disabled by default**
|
|
40
|
+
|
|
41
|
+
### In the app
|
|
42
|
+
|
|
43
|
+
1. **Load data** — upload a cleaned CSV/XLSX/Parquet (optional: raw or separate Train/Test).
|
|
44
|
+
2. **Select target & features** — target auto-suggested; features default to all non-target columns.
|
|
45
|
+
3. **Pick a model** — choose library/algorithm (scikit-learn, XGBoost, LightGBM, CatBoost) and tweak params.
|
|
46
|
+
4. **Run validation** — click **▶️ Refit & validate**.
|
|
47
|
+
5. **Export** — click **⬇️ Download report** to get a **DOCX** (auto-selects classification/regression template).
|
|
48
|
+
|
|
49
|
+
**Outputs**
|
|
50
|
+
|
|
51
|
+
* Report: `./.ui_runs/<session>/tanml_report_*.docx`
|
|
52
|
+
* Artifacts (CSV/PNGs): `./.ui_runs/<session>/artifacts/*`
|
|
53
|
+
|
|
54
|
+
---
|
|
55
|
+
|
|
56
|
+
## What TanML Checks
|
|
57
|
+
|
|
58
|
+
* **Raw Data (optional):** rows/cols, missingness, duplicates, constant columns
|
|
59
|
+
* **Data Quality & EDA:** summaries, distributions
|
|
60
|
+
* **Correlation & Multicollinearity:** heatmap, top-pairs CSV, **VIF** table
|
|
61
|
+
* **Performance**
|
|
62
|
+
|
|
63
|
+
* **Classification:** AUC, PR-AUC, KS, decile lift, confusion
|
|
64
|
+
* **Regression:** R², MAE, MSE/RMSE, error stats
|
|
65
|
+
* **Explainability:** SHAP (auto explainer; configurable background size)
|
|
66
|
+
* **Robustness/Stress Tests:** feature perturbations → delta-metrics
|
|
67
|
+
* **Model Metadata:** model class, hyperparameters, features, training info
|
|
68
|
+
|
|
69
|
+
---
|
|
70
|
+
|
|
71
|
+
## Optional CLI Flags
|
|
72
|
+
|
|
73
|
+
Most users just run `tanml ui`. These help on teams/servers:
|
|
74
|
+
|
|
75
|
+
```bash
|
|
76
|
+
# Share on LAN
|
|
77
|
+
tanml ui --public
|
|
78
|
+
|
|
79
|
+
# Different port
|
|
80
|
+
tanml ui --port 9000
|
|
81
|
+
|
|
82
|
+
# Headless (server/CI; no auto-open browser)
|
|
83
|
+
tanml ui --headless
|
|
84
|
+
|
|
85
|
+
# Larger limit (e.g., 2 GB)
|
|
86
|
+
tanml ui --max-mb 2048
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Env var equivalents (Linux/macOS bash):
|
|
90
|
+
|
|
91
|
+
```bash
|
|
92
|
+
TANML_SERVER_ADDRESS=0.0.0.0 TANML_PORT=9000 TANML_MAX_MB=2048 tanml ui
|
|
93
|
+
```
|
|
94
|
+
|
|
95
|
+
Windows PowerShell:
|
|
96
|
+
|
|
97
|
+
```powershell
|
|
98
|
+
$env:TANML_SERVER_ADDRESS="0.0.0.0"; $env:TANML_PORT="9000"; $env:TANML_MAX_MB="2048"; tanml ui
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
**Defaults:** address `127.0.0.1`, port `8501`, limit `1024 MB`, telemetry **OFF**.
|
|
102
|
+
|
|
103
|
+
---
|
|
104
|
+
|
|
105
|
+
## Templates
|
|
106
|
+
|
|
107
|
+
TanML ships DOCX templates (packaged in wheel & sdist):
|
|
108
|
+
|
|
109
|
+
* `tanml/report/templates/report_template_cls.docx`
|
|
110
|
+
* `tanml/report/templates/report_template_reg.docx`
|
|
111
|
+
|
|
112
|
+
---
|
|
113
|
+
|
|
114
|
+
## Troubleshooting
|
|
115
|
+
|
|
116
|
+
* **Page didn’t open?** Visit `http://127.0.0.1:8501` or run `tanml ui --port 9000`.
|
|
117
|
+
* **Large CSVs are slow/heavy?** Prefer **Parquet**; CSV → DataFrame can use several GB RAM.
|
|
118
|
+
* **Artifacts missing?** Check `./.ui_runs/<session>/artifacts/`.
|
|
119
|
+
* **Corporate networks:** use `tanml ui --public` to share on LAN.
|
|
120
|
+
|
|
121
|
+
---
|
|
122
|
+
|
|
123
|
+
## License & Citation
|
|
124
|
+
|
|
125
|
+
**License:** MIT. See [LICENSE](https://github.com/tdlabs-ai/tanml/blob/main/LICENSE).
|
|
126
|
+
SPDX-License-Identifier: MIT
|
|
127
|
+
|
|
128
|
+
© 2025 Tanmay Sah and Dolly Sah. You may use, modify, and distribute this software with appropriate attribution.
|
|
129
|
+
|
|
130
|
+
### How to cite
|
|
131
|
+
|
|
132
|
+
If TanML helps your work or publications, please cite:
|
|
133
|
+
|
|
134
|
+
> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning* [Software]. Available at https://github.com/tdlabs-ai/tanml
|
|
135
|
+
|
|
136
|
+
Or in BibTeX (version-agnostic):
|
|
137
|
+
|
|
138
|
+
```bibtex
|
|
139
|
+
@misc{tanml,
|
|
140
|
+
author = {Sah, Tanmay and Sah, Dolly},
|
|
141
|
+
title = {TanML: Automated Model Validation Toolkit for Tabular Machine Learning},
|
|
142
|
+
year = {2025},
|
|
143
|
+
note = {Software; MIT License},
|
|
144
|
+
url = {https://github.com/tdlabs-ai/tanml}
|
|
145
|
+
}
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
A machine-readable citation file (`CITATION.cff`) is included for citation tools and GitHub’s “Cite this repository” button.
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "tanml"
|
|
7
|
+
version = "0.1.7"
|
|
8
|
+
description = "Automated validation toolkit for tabular ML models—MRM, credit risk, insurance, and other regulated use cases."
|
|
9
|
+
readme = { file = "README-pypi.md", content-type = "text/markdown" }
|
|
10
|
+
requires-python = ">=3.8"
|
|
11
|
+
license = "MIT"
|
|
12
|
+
license-files = ["LICENSE"]
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
keywords = [
|
|
16
|
+
"model validation",
|
|
17
|
+
"model risk management",
|
|
18
|
+
"model governance",
|
|
19
|
+
"SR 11-7",
|
|
20
|
+
"tabular ML",
|
|
21
|
+
"credit risk",
|
|
22
|
+
"insurance analytics",
|
|
23
|
+
"explainability",
|
|
24
|
+
"XAI",
|
|
25
|
+
"SHAP",
|
|
26
|
+
"stress testing",
|
|
27
|
+
"reporting",
|
|
28
|
+
"docx",
|
|
29
|
+
"streamlit",
|
|
30
|
+
"xgboost",
|
|
31
|
+
"lightgbm",
|
|
32
|
+
"catboost"
|
|
33
|
+
]
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
classifiers = [
|
|
38
|
+
"Development Status :: 4 - Beta",
|
|
39
|
+
"Intended Audience :: Science/Research",
|
|
40
|
+
"Intended Audience :: Financial and Insurance Industry",
|
|
41
|
+
"Natural Language :: English",
|
|
42
|
+
"Operating System :: OS Independent",
|
|
43
|
+
"Programming Language :: Python",
|
|
44
|
+
"Programming Language :: Python :: 3",
|
|
45
|
+
"Programming Language :: Python :: 3 :: Only",
|
|
46
|
+
"Programming Language :: Python :: 3.8",
|
|
47
|
+
"Programming Language :: Python :: 3.9",
|
|
48
|
+
"Programming Language :: Python :: 3.10",
|
|
49
|
+
"Programming Language :: Python :: 3.11",
|
|
50
|
+
"Programming Language :: Python :: 3.12",
|
|
51
|
+
"Topic :: Scientific/Engineering :: Artificial Intelligence",
|
|
52
|
+
"Topic :: Scientific/Engineering :: Information Analysis",
|
|
53
|
+
"Topic :: Scientific/Engineering :: Visualization"
|
|
54
|
+
]
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
authors = [
|
|
58
|
+
{ name = "Tanmay Sah" },
|
|
59
|
+
{ name = "Dolly Sah" }
|
|
60
|
+
]
|
|
61
|
+
|
|
62
|
+
maintainers = [
|
|
63
|
+
{ name = "Tanmay Sah" },
|
|
64
|
+
{ name = "Dolly Sah" }
|
|
65
|
+
]
|
|
66
|
+
|
|
67
|
+
dependencies = [
|
|
68
|
+
"numpy>=1.26",
|
|
69
|
+
"scipy>=1.11",
|
|
70
|
+
"pandas>=2.0",
|
|
71
|
+
"scikit-learn>=1.3",
|
|
72
|
+
"statsmodels>=0.14",
|
|
73
|
+
"xgboost>=2.0",
|
|
74
|
+
"lightgbm>=4.3",
|
|
75
|
+
"catboost>=1.2",
|
|
76
|
+
"shap>=0.44",
|
|
77
|
+
"numba>=0.58",
|
|
78
|
+
"matplotlib>=3.8",
|
|
79
|
+
"seaborn>=0.13",
|
|
80
|
+
"Pillow>=10.0",
|
|
81
|
+
"python-docx>=1.1.2",
|
|
82
|
+
"tzlocal>=5.0",
|
|
83
|
+
"tqdm>=4.66",
|
|
84
|
+
"pyarrow>=14.0",
|
|
85
|
+
"openpyxl>=3.1",
|
|
86
|
+
"pyreadstat>=1.2",
|
|
87
|
+
"streamlit>=1.36"
|
|
88
|
+
]
|
|
89
|
+
|
|
90
|
+
|
|
91
|
+
[project.optional-dependencies]
|
|
92
|
+
dev = ["pytest", "black", "isort"]
|
|
93
|
+
|
|
94
|
+
[project.scripts]
|
|
95
|
+
tanml = "tanml.cli.main:main"
|
|
96
|
+
|
|
97
|
+
[project.urls]
|
|
98
|
+
Homepage = "https://github.com/tdlabs-ai/tanml"
|
|
99
|
+
Source = "https://github.com/tdlabs-ai/tanml"
|
|
100
|
+
Issues = "https://github.com/tdlabs-ai/tanml/issues"
|
|
101
|
+
Documentation = "https://github.com/tdlabs-ai/tanml#readme"
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
[tool.setuptools]
|
|
105
|
+
include-package-data = true
|
|
106
|
+
|
|
107
|
+
[tool.setuptools.packages.find]
|
|
108
|
+
where = ["."]
|
|
109
|
+
include = ["tanml*"]
|
|
110
|
+
|
|
111
|
+
[tool.setuptools.package-data]
|
|
112
|
+
"tanml.report.templates" = ["*.docx"]
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "0.1.7"
|
|
@@ -3,12 +3,12 @@ from tanml.checks.cleaning_repro import CleaningReproCheck
|
|
|
3
3
|
|
|
4
4
|
def run_cleaning_repro_check(model, X_train, X_test, y_train, y_test,
|
|
5
5
|
config, cleaned_data, *args, **kwargs):
|
|
6
|
-
|
|
6
|
+
|
|
7
7
|
if not config.get("rules", {}).get("CleaningReproCheck", {}).get("enabled", True):
|
|
8
8
|
print("ℹ️ CleaningReproCheck skipped (disabled in rules.yaml)")
|
|
9
9
|
return None
|
|
10
10
|
|
|
11
|
-
|
|
11
|
+
|
|
12
12
|
raw_data = config.get("raw_data") or kwargs.get("raw_df")
|
|
13
13
|
if raw_data is None:
|
|
14
14
|
print("⚠️ Skipping CleaningReproCheck — raw_data missing in config and kwargs")
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# tanml/check_runners/correlation_runner.py
|
|
2
|
+
from __future__ import annotations
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any, Dict, List
|
|
5
|
+
import pandas as pd
|
|
6
|
+
|
|
7
|
+
from tanml.checks.correlation import CorrelationCheck
|
|
8
|
+
|
|
9
|
+
def _resolve_outdir(config: Dict[str, Any]) -> str:
    """Return the directory for correlation artifacts, creating it if needed.

    The parent directory comes from ``config["options"]["save_artifacts_dir"]``.
    When the ``options`` section or that key is missing or falsy, ``"reports"``
    is used instead. The result is always ``<parent>/correlation``.
    """
    options = config.get("options") or {}
    parent = options.get("save_artifacts_dir") or "reports"
    target = os.path.join(parent, "correlation")
    # exist_ok=True makes repeated runs against the same directory harmless.
    os.makedirs(target, exist_ok=True)
    return target
|
|
14
|
+
|
|
15
|
+
def _df_features_only(cleaned_df: pd.DataFrame) -> pd.DataFrame:
    """Return ``cleaned_df`` without its last column.

    ``None``, empty frames, and frames with fewer than two columns are
    returned unchanged (the same object, not a copy).

    NOTE(review): presumably the last column is the target and the rest are
    features -- confirm against the caller that builds ``cleaned_df``.
    """
    if cleaned_df is None or cleaned_df.empty:
        return cleaned_df

    column_names = list(cleaned_df.columns)
    if len(column_names) < 2:
        # A single column cannot be split into features + trailing column.
        return cleaned_df
    return cleaned_df[column_names[:-1]]
|
|
22
|
+
|
|
23
|
+
def CorrelationCheckRunner(
    model,
    X_train,
    X_test,
    y_train,
    y_test,
    config: Dict[str, Any],
    cleaned_df: pd.DataFrame,
    raw_df: pd.DataFrame | None = None,
):
    """Run ``CorrelationCheck`` on the feature columns of ``cleaned_df``.

    The check is enabled unless ``config["CorrelationCheck"]["enabled"]`` is
    falsy; if that key is absent, the legacy ``config["correlation"]["enabled"]``
    flag is consulted, defaulting to ``True``. All other options are read only
    from the ``CorrelationCheck`` section. ``save_csv`` and ``save_fig`` are
    always forced on.

    ``model``, the train/test splits and ``raw_df`` are accepted but not used
    here.

    Returns:
        ``None`` when the check is disabled, otherwise whatever
        ``CorrelationCheck(...).run()`` returns.
    """
    ui_block: Dict[str, Any] = config.get("CorrelationCheck") or {}
    legacy: Dict[str, Any] = config.get("correlation") or {}

    is_enabled = bool(ui_block.get("enabled", legacy.get("enabled", True)))
    if not is_enabled:
        return None

    features = _df_features_only(cleaned_df)

    def option(key: str, default: Any) -> Any:
        # Every tunable comes from the UI block; the legacy block only gates "enabled".
        return ui_block.get(key, default)

    settings: Dict[str, Any] = {
        "method": option("method", "pearson"),
        "high_corr_threshold": float(option("high_corr_threshold", 0.8)),
        "heatmap_max_features_default": int(option("heatmap_max_features_default", 20)),
        "heatmap_max_features_limit": int(option("heatmap_max_features_limit", 60)),
        "subset_strategy": option("subset_strategy", "cluster"),
        "sample_rows": int(option("sample_rows", 150_000)),
        "seed": int(option("seed", 42)),
        "save_csv": True,
        "save_fig": True,
        "appendix_csv_cap": option("appendix_csv_cap", None),
    }

    # Resolve (and create) the output directory only after option coercion succeeded.
    artifact_dir = _resolve_outdir(config)
    check = CorrelationCheck(cleaned_data=features, cfg=settings, output_dir=artifact_dir)
    return check.run()
|
|
@@ -0,0 +1,18 @@
|
|
|
1
|
+
# explainability runner
|
|
2
|
+
from tanml.checks.explainability.shap_check import SHAPCheck
|
|
3
|
+
|
|
4
|
+
def run_shap_check(model, X_train, X_test, y_train, y_test, rule_config, cleaned_df, *args, **kwargs):
    """Run ``SHAPCheck`` and wrap its outcome under the ``"SHAPCheck"`` key.

    The check is enabled unless ``rule_config["SHAPCheck"]["enabled"]`` is
    falsy; if that key is absent, ``rule_config["explainability"]["shap"]["enabled"]``
    is consulted, defaulting to ``True``. Any of these sections may be missing
    or ``None``.

    Extra positional and keyword arguments are accepted and ignored.

    Returns:
        ``{"SHAPCheck": {"skipped": True}}`` when disabled,
        ``{"SHAPCheck": <result of SHAPCheck.run()>}`` on success, or
        ``{"SHAPCheck": {"status": "error", "error": <message>}}`` if anything
        raised. This runner never propagates exceptions.
    """
    try:
        rules = rule_config or {}
        cfg_shapcheck = rules.get("SHAPCheck") or {}
        # Guard each level with `or {}` so a section that is present but None
        # (e.g. `explainability: null` in YAML) does not raise AttributeError
        # and turn a disabled check into an error report.
        cfg_expl = (rules.get("explainability") or {}).get("shap") or {}
        enabled = cfg_shapcheck.get("enabled", cfg_expl.get("enabled", True))
        if not enabled:
            print("ℹ️ SHAPCheck skipped (disabled)")
            return {"SHAPCheck": {"skipped": True}}

        check = SHAPCheck(model, X_train, X_test, y_train, y_test, rule_config=rule_config, cleaned_df=cleaned_df)
        result = check.run()
        return {"SHAPCheck": result}
    except Exception as e:
        # Deliberate best-effort boundary: a failing SHAP run is reported, not raised.
        print(f"⚠️ SHAPCheck failed: {e}")
        return {"SHAPCheck": {"status": "error", "error": str(e)}}
|