tanml 0.1.6__py3-none-any.whl → 0.1.7__py3-none-any.whl
This diff shows the published contents of two package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release: this version of tanml might be problematic.
- tanml/__init__.py +1 -1
- tanml/check_runners/cleaning_repro_runner.py +2 -2
- tanml/check_runners/correlation_runner.py +49 -12
- tanml/check_runners/explainability_runner.py +12 -22
- tanml/check_runners/logistic_stats_runner.py +196 -17
- tanml/check_runners/performance_runner.py +82 -26
- tanml/check_runners/raw_data_runner.py +29 -14
- tanml/check_runners/regression_metrics_runner.py +195 -0
- tanml/check_runners/stress_test_runner.py +23 -6
- tanml/check_runners/vif_runner.py +33 -27
- tanml/checks/correlation.py +241 -41
- tanml/checks/explainability/shap_check.py +261 -29
- tanml/checks/logit_stats.py +186 -54
- tanml/checks/performance_classification.py +305 -0
- tanml/checks/raw_data.py +58 -23
- tanml/checks/regression_metrics.py +167 -0
- tanml/checks/stress_test.py +157 -53
- tanml/cli/main.py +99 -27
- tanml/engine/check_agent_registry.py +20 -10
- tanml/engine/core_engine_agent.py +199 -37
- tanml/models/registry.py +329 -0
- tanml/report/report_builder.py +1180 -147
- tanml/report/templates/report_template_cls.docx +0 -0
- tanml/report/templates/report_template_reg.docx +0 -0
- tanml/ui/app.py +1205 -0
- tanml/utils/data_loader.py +105 -15
- tanml-0.1.7.dist-info/METADATA +164 -0
- tanml-0.1.7.dist-info/RECORD +54 -0
- tanml/cli/arg_parser.py +0 -31
- tanml/cli/init_cmd.py +0 -8
- tanml/cli/validate_cmd.py +0 -7
- tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
- tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
- tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
- tanml/engine/segmentation_agent.py +0 -118
- tanml/engine/validation_agent.py +0 -91
- tanml/report/templates/report_template.docx +0 -0
- tanml/utils/model_loader.py +0 -35
- tanml/utils/r_loader.py +0 -30
- tanml/utils/sas_loader.py +0 -50
- tanml/utils/yaml_generator.py +0 -34
- tanml/utils/yaml_loader.py +0 -5
- tanml/validate.py +0 -209
- tanml-0.1.6.dist-info/METADATA +0 -317
- tanml-0.1.6.dist-info/RECORD +0 -62
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
- {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/utils/data_loader.py
CHANGED
```diff
@@ -1,17 +1,107 @@
-import
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Optional
 import pandas as pd
 
-
-
-
-
-
-
-
-
-
-
-
-
-
+
+def load_dataframe(filepath: str | Path,
+                   sheet_name: Optional[str | int] = None,
+                   sep: Optional[str] = None,
+                   encoding: Optional[str] = None,
+                   **_: Any) -> pd.DataFrame:
+    """
+    Universal table loader with safe fallbacks.
+
+    Supports:
+      .csv, .txt, .tsv
+      .xlsx, .xls        (needs openpyxl)
+      .parquet           (pyarrow or fastparquet)
+      .feather/.ft       (pyarrow)
+      .json
+      .pkl/.pickle
+      .sas7bdat/.xpt     (pyreadstat preferred; falls back to pandas.read_sas)
+      .sav               (SPSS; pyreadstat)
+      .dta               (Stata)
+    """
+    p = Path(filepath)
+    ext = p.suffix.lower()
+
+    # --- Delimited text ---
+    if ext in {".csv", ".txt"}:
+        # If sep is unspecified:
+        #   - CSV: assume comma (keeps fast C engine, no warning)
+        #   - TXT: keep sep=None (we'll use Python engine to infer)
+        eff_sep = sep if sep is not None else ("," if ext == ".csv" else None)
+        engine = "c" if eff_sep is not None else "python"
+        try:
+            return pd.read_csv(p, sep=eff_sep, encoding=encoding, engine=engine)
+        except UnicodeDecodeError:
+            return pd.read_csv(p, sep=eff_sep, encoding=encoding or "latin-1", engine=engine)
+
+    if ext == ".tsv":
+        try:
+            return pd.read_csv(p, sep="\t", encoding=encoding, engine="c")
+        except UnicodeDecodeError:
+            return pd.read_csv(p, sep="\t", encoding=encoding or "latin-1", engine="c")
+
+
+    # --- Excel ---
+    if ext in {".xlsx", ".xls"}:
+        try:
+            return pd.read_excel(p, sheet_name=0 if sheet_name is None else sheet_name)
+        except ImportError as e:
+            raise ModuleNotFoundError(
+                "openpyxl>=3.1 is required for Excel files. Install with: pip install openpyxl"
+            ) from e
+
+    # --- Columnar ---
+    if ext == ".parquet":
+        try:
+            return pd.read_parquet(p, engine="pyarrow")
+        except Exception:
+            return pd.read_parquet(p, engine="fastparquet")
+
+    if ext in {".feather", ".ft"}:
+        try:
+            import pyarrow  # noqa: F401
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(
+                "pyarrow is required for Feather files. Install with: pip install pyarrow"
+            ) from e
+        return pd.read_feather(p)
+
+    # --- Other common formats ---
+    if ext in {".pkl", ".pickle"}:
+        return pd.read_pickle(p)
+
+    if ext == ".json":
+        return pd.read_json(p, convert_dates=True)
+
+    # --- SAS / SPSS / Stata ---
+    if ext in {".sas7bdat", ".xpt"}:
+        try:
+            import pyreadstat  # type: ignore
+            if ext == ".sas7bdat":
+                df, _ = pyreadstat.read_sas7bdat(str(p))
+            else:
+                df, _ = pyreadstat.read_xport(str(p))
+            return df
+        except ModuleNotFoundError:
+            # best-effort fallback
+            return pd.read_sas(p)
+
+    if ext == ".sav":  # SPSS
+        try:
+            import pyreadstat  # type: ignore
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(
+                "pyreadstat is required for SPSS .sav files. Install with: pip install pyreadstat"
+            ) from e
+        df, _ = pyreadstat.read_sav(str(p))
+        return df
+
+    if ext == ".dta":  # Stata
+        return pd.read_stata(p)
+
+    raise ValueError(f"Unsupported file format: {ext} (path={p})")
```
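For reference, a minimal usage sketch of the new loader (file names here are hypothetical):

```python
from tanml.utils.data_loader import load_dataframe

df = load_dataframe("loans.csv")                      # sep omitted: comma assumed, fast C engine
df = load_dataframe("loans.txt")                      # sep stays None: Python engine infers the delimiter
df = load_dataframe("loans.xlsx", sheet_name="2024")  # needs openpyxl; first sheet when omitted
df = load_dataframe("survey.sav")                     # SPSS via pyreadstat, with a clear error if it is missing
```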
tanml-0.1.7.dist-info/METADATA
ADDED

````diff
@@ -0,0 +1,164 @@
+Metadata-Version: 2.4
+Name: tanml
+Version: 0.1.7
+Summary: Automated validation toolkit for tabular ML models—MRM, credit risk, insurance, and other regulated use cases.
+Author: Tanmay Sah, Dolly Sah
+Maintainer: Tanmay Sah, Dolly Sah
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/tdlabs-ai/tanml
+Project-URL: Source, https://github.com/tdlabs-ai/tanml
+Project-URL: Issues, https://github.com/tdlabs-ai/tanml/issues
+Project-URL: Documentation, https://github.com/tdlabs-ai/tanml#readme
+Keywords: model validation,model risk management,model governance,SR 11-7,tabular ML,credit risk,insurance analytics,explainability,XAI,SHAP,stress testing,reporting,docx,streamlit,xgboost,lightgbm,catboost
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Financial and Insurance Industry
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Scientific/Engineering :: Visualization
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.26
+Requires-Dist: scipy>=1.11
+Requires-Dist: pandas>=2.0
+Requires-Dist: scikit-learn>=1.3
+Requires-Dist: statsmodels>=0.14
+Requires-Dist: xgboost>=2.0
+Requires-Dist: lightgbm>=4.3
+Requires-Dist: catboost>=1.2
+Requires-Dist: shap>=0.44
+Requires-Dist: numba>=0.58
+Requires-Dist: matplotlib>=3.8
+Requires-Dist: seaborn>=0.13
+Requires-Dist: Pillow>=10.0
+Requires-Dist: python-docx>=1.1.2
+Requires-Dist: tzlocal>=5.0
+Requires-Dist: tqdm>=4.66
+Requires-Dist: pyarrow>=14.0
+Requires-Dist: openpyxl>=3.1
+Requires-Dist: pyreadstat>=1.2
+Requires-Dist: streamlit>=1.36
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Dynamic: license-file
+
+# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
+
+[](https://github.com/tdlabs-ai/tanml#license--citation)
+[](https://opensource.org/licenses/MIT)
+[](https://pepy.tech/project/tanml)
+
+**TanML** validates tabular ML models with a zero-config **Streamlit UI** and exports an audit-ready, **editable Word report (.docx)**. It covers data quality, correlation/VIF, performance, explainability (SHAP), and robustness/stress tests—built for regulated settings (MRM, credit risk, insurance, etc.).
+
+* **Status:** Beta (`0.x`)
+* **License:** MIT
+* **Python:** 3.8–3.12
+* **OS:** Linux / macOS / Windows (incl. WSL)
+
+---
+
+## Why TanML?
+
+* **Zero-config UI:** launch Streamlit, upload data, click **Run**—no YAML needed.
+* **Audit-ready outputs:** tables/plots + a polished DOCX your stakeholders can edit.
+* **Regulatory alignment:** supports common Model Risk Management themes (e.g., SR 11-7 style).
+* **Works with your stack:** scikit-learn, XGBoost/LightGBM/CatBoost, etc.
+
+---
+
+## Install
+
+```bash
+pip install tanml
+```
+
+## Quick Start (UI)
+
+```bash
+tanml ui
+```
+
+* Opens at **[http://127.0.0.1:8501](http://127.0.0.1:8501)**
+* **Upload limit ~1 GB** (preconfigured)
+* **Telemetry disabled by default**
+
+### In the app
+
+1. **Load data** — upload a cleaned CSV/XLSX/Parquet (optional: raw or separate Train/Test).
+2. **Select target & features** — target auto-suggested; features default to all non-target columns.
+3. **Pick a model** — choose library/algorithm (scikit-learn, XGBoost, LightGBM, CatBoost) and tweak params.
+4. **Run validation** — click **▶️ Refit & validate**.
+5. **Export** — click **⬇️ Download report** to get a **DOCX** (auto-selects classification/regression template).
+
+**Outputs**
+
+* Report: `./.ui_runs/<session>/tanml_report_*.docx`
+* Artifacts (CSV/PNGs): `./.ui_runs/<session>/artifacts/*`
+
+---
+
+## What TanML Checks
+
+* **Raw Data (optional):** rows/cols, missingness, duplicates, constant columns
+* **Data Quality & EDA:** summaries, distributions
+* **Correlation & Multicollinearity:** heatmap, top-pairs CSV, **VIF** table
+* **Performance**
+
+  * **Classification:** AUC, PR-AUC, KS, decile lift, confusion
+  * **Regression:** R², MAE, MSE/RMSE, error stats
+* **Explainability:** SHAP (auto explainer; configurable background size)
+* **Robustness/Stress Tests:** feature perturbations → delta-metrics
+* **Model Metadata:** model class, hyperparameters, features, training info
+
+---
+
+
+## Templates
+
+TanML ships DOCX templates (packaged in wheel & sdist):
+
+* `tanml/report/templates/report_template_cls.docx`
+* `tanml/report/templates/report_template_reg.docx`
+
+---
+
+
+## License & Citation
+
+**License:** MIT. See [LICENSE](https://github.com/tdlabs-ai/tanml/blob/main/LICENSE).
+SPDX-License-Identifier: MIT
+
+© 2025 Tanmay Sah and Dolly Sah. You may use, modify, and distribute this software with appropriate attribution.
+
+### How to cite
+
+If TanML helps your work or publications, please cite:
+
+> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning* [Software]. Available at https://github.com/tdlabs-ai/tanml
+
+Or in BibTeX (version-agnostic):
+
+```bibtex
+@misc{tanml,
+  author = {Sah, Tanmay and Sah, Dolly},
+  title  = {TanML: Automated Model Validation Toolkit for Tabular Machine Learning},
+  year   = {2025},
+  note   = {Software; MIT License},
+  url    = {https://github.com/tdlabs-ai/tanml}
+}
+```
+
+A machine-readable citation file (`CITATION.cff`) is included for citation tools and GitHub’s “Cite this repository” button.
````
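One note on the Performance metrics named above: the KS statistic is conventionally the maximum gap between the empirical score CDFs of the positive and negative classes. A minimal sketch of that statistic (an illustration, not TanML's internal implementation; it ignores exact score ties):

```python
import numpy as np

def ks_statistic(y_true: np.ndarray, y_score: np.ndarray) -> float:
    """Kolmogorov-Smirnov distance between positive- and negative-class score CDFs."""
    order = np.argsort(y_score)                  # sweep the threshold from low to high scores
    y = y_true[order]
    cum_pos = np.cumsum(y) / y.sum()             # F_pos(threshold): share of positives below it
    cum_neg = np.cumsum(1 - y) / (1 - y).sum()   # F_neg(threshold): share of negatives below it
    return float(np.max(np.abs(cum_pos - cum_neg)))

# ks_statistic(np.array([0, 0, 1, 1]), np.array([0.1, 0.4, 0.35, 0.8])) -> 0.5
```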
tanml-0.1.7.dist-info/RECORD
ADDED

```diff
@@ -0,0 +1,54 @@
+tanml/__init__.py,sha256=GhTYFGQ77wbJM35lScp2XZKYnkNHTrdYozPoCZiCVSM,23
+tanml/check_runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/check_runners/base_runner.py,sha256=jibUcl6a7SQH9EKnPFZhU1t9FY-Bu_nkPjRfjW2T5x0,210
+tanml/check_runners/cleaning_repro_runner.py,sha256=fXlujWnuA0Hfzu9uQ31itU8eO-6wGD4eUImiR6kyHfg,817
+tanml/check_runners/correlation_runner.py,sha256=-TxktyjXi8IX4t0DCTe-9Ea5vVb-RcN6gBptbzUJdNQ,1975
+tanml/check_runners/data_quality_runner.py,sha256=IdhWYOtDBPkAwT2Aa6SYSiI2gkLEhkBXHAQop5ZFg0I,883
+tanml/check_runners/eda_runner.py,sha256=Gr5ZmgOvUilej-rhinsL9KmsIQOMWjd6jdOYUQTJ4os,623
+tanml/check_runners/explainability_runner.py,sha256=I40KVRoN1yEzZ6i1WZ6a1g89cPrRHpZ97lbfMEyOq2U,908
+tanml/check_runners/input_cluster_runner.py,sha256=0O8JJq3HOvNd3_nrZY8FevVYimI_Nl2Iftyyx56LVzc,1251
+tanml/check_runners/logistic_stats_runner.py,sha256=vTwnWHwUjl7ML2PKDteOgUc247D7xGXkKa8NfxUxYa4,7739
+tanml/check_runners/model_meta_runner.py,sha256=hOIEp6k98o0FwYjgmi_53Ni7H14jerKdf46wpxVkAkk,799
+tanml/check_runners/performance_runner.py,sha256=Q9r5mqj2ayOSfIN2SVaF4sm_Gc-pVyPfU3Q9NE9uZ8k,2567
+tanml/check_runners/raw_data_runner.py,sha256=qxX2Fst7u2VUwobtWr1StaDj8x5q1P01zCDhB0ewbdY,1531
+tanml/check_runners/regression_metrics_runner.py,sha256=wPDtGs713aX0rVRcUNsWVIybQee8MlHt6dxJxigRAFY,5774
+tanml/check_runners/rule_engine_runner.py,sha256=xgij_9S8kUSESgXeWGN7FnOD0hNmd3GAregxuD_x9nI,286
+tanml/check_runners/stress_test_runner.py,sha256=s4Ax7JnqGzqc3kw0wfCGt3LNIcZzZMX30Uk73zPDT34,1544
+tanml/check_runners/vif_runner.py,sha256=T7ho2jYnHKuz3P0R5CuoNB0LjPoH_Gs_FS0dcoyYbj4,2311
+tanml/checks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/checks/base.py,sha256=g1GLXHWkE848bDKb4zXLHRqfgucVWFa-n6ZDT5vzs2I,570
+tanml/checks/cleaning_repro.py,sha256=LtPEkikjTY0NMsv1FZ8tOPUkaqF3yK_-DQdLCltMsIA,1795
+tanml/checks/correlation.py,sha256=IN86Yc-yUlBKNSaK-YTlGM4V2DH6BPZw5KHJVN-hE2c,10350
+tanml/checks/data_quality.py,sha256=bNrM469c_G-TxU-6ne2UYv5gwoUoyrab4lYbI16l7VU,981
+tanml/checks/eda.py,sha256=7fOIBhCg0oiK4SL22WcKHK8vL3s-YwypiAnZDwoq3lo,2515
+tanml/checks/input_cluster.py,sha256=93RvE-vd_elt4xqnkCkHmHH6NWPOtJF7ylb5mPTsagA,3990
+tanml/checks/logit_stats.py,sha256=OnL2_M4bg-T5RE5Rv_wMJNkaX4Ox1ihSk428_7OJ15w,6958
+tanml/checks/model_contents.py,sha256=jcS4GrUoaQVUBf9pFwvtv9KKvDO5vZroiPz8mOmyAGo,1263
+tanml/checks/model_meta.py,sha256=x2RnT3kUWY4ERWfM9cItpaIJX8Fca9IQByR9zPN2V20,1881
+tanml/checks/performance.py,sha256=vQ_FeIcr3ASDhVd97y2JKMEMfJUrfCRnUBZh1ZI6rCs,3400
+tanml/checks/performance_classification.py,sha256=2MnMeQEBVswCOjT7hFK7gjRBiXmhLgriA6_qw_9jsXk,11982
+tanml/checks/raw_data.py,sha256=asjyURkvBQ1qOfTJkIZblcyELf1HO044jRo1BZGsPOQ,2862
+tanml/checks/regression_metrics.py,sha256=736Hk6tmCwR8uFUJF1Z5kszFiT_qHM86jBPGveb_BkY,6158
+tanml/checks/rule_engine.py,sha256=bqGd0-2MlSPGGUFPyMxgHiwVTvw1Iu7NFqUzl_-0iIg,1694
+tanml/checks/stress_test.py,sha256=64fVn96uzGg7_qvPMwiWhYRSr4FflFMkkjXQXmv2YB0,7460
+tanml/checks/vif.py,sha256=0tghDaiG8z4frgP5OHj3ctgsH4Mak1N7Docu6OcG8JA,1902
+tanml/checks/explainability/shap_check.py,sha256=Di5WD3jUgs1TEwnhCoEU0dhvMlBUf0I3tYNpsjST1wk,12829
+tanml/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/cli/main.py,sha256=A9W7RXYbnvw_O-jJDbwvkAOL4NQF93XNsn3PlzNuqZg,3929
+tanml/config_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/engine/check_agent_registry.py,sha256=CrRAL_BnsXcivXLQydka2R8emHrkHOqZDSzuqZNmknc,2419
+tanml/engine/core_engine_agent.py,sha256=-DE9ehdY60W2IAzNMnv-BTOf3KPJobX8p5dBcWH1aiE,9334
+tanml/models/registry.py,sha256=SMAwKTLVIFYQ_KrYsaxm-dmSfyE_zk1Mo1AL8-JZt1w,15467
+tanml/report/report_builder.py,sha256=_UuaLxJOuvfXZjOWIrI478G8jMUy2CxQNi2YDR-8FYI,55915
+tanml/report/templates/report_template_cls.docx,sha256=1pAKIvC8nG2JGN3H7SpU10QVaeDNgJI1aOi1SH6TCzs,30199
+tanml/report/templates/report_template_reg.docx,sha256=bjr-18WFD-KwtRtNcbRipstyNa29z7hF-JHp86OMKVg,29673
+tanml/ui/app.py,sha256=n6seLRuoWSF6sK6arewyhhEyfEhNlGeyhourRAB_loc,47682
+tanml/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/utils/data_loader.py,sha256=_Ja3XSONwL_NVwpxuBjLacXWX5yPTN9H0moFxSVqGlw,3713
+tanml-0.1.7.dist-info/licenses/LICENSE,sha256=e6xQyG7SdWiD4cLlj7rFdMrjG6H7ABglGOrooZxWLKQ,1102
+tanml-0.1.7.dist-info/METADATA,sha256=gy0-NjrrTIIl9J0kFL2iNTI07AYuNSa-GEf_Iq1wSHo,6118
+tanml-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tanml-0.1.7.dist-info/entry_points.txt,sha256=WUM_y0uRIL7iXPcxK69Bn5mKZXnDshWDbLyohjND1IE,46
+tanml-0.1.7.dist-info/top_level.txt,sha256=81dIhCm6opwY6E7Pb9G1kdIVmYrUkXX4PaYhQ873gIE,6
+tanml-0.1.7.dist-info/RECORD,,
```
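Each RECORD row above is the standard wheel triple `path,sha256=<digest>,size`, with the digest in urlsafe base64, padding stripped. A small sketch that recomputes a file's digest in that notation:

```python
import base64
import hashlib
from pathlib import Path

def record_digest(path: str) -> str:
    """Return a file's hash in wheel RECORD notation: sha256=<urlsafe-b64, no padding>."""
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# Run against the wheel's copy of tanml/utils/data_loader.py, this should reproduce
# the RECORD entry above: sha256=_Ja3XSONwL_NVwpxuBjLacXWX5yPTN9H0moFxSVqGlw
```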
tanml/cli/arg_parser.py
DELETED
```diff
@@ -1,31 +0,0 @@
-import argparse
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run TanML model validation toolkit")
-    subparsers = parser.add_subparsers(dest="command", required=True)
-
-
-    validate_parser = subparsers.add_parser("validate", help="Run validation checks and generate report")
-    validate_parser.add_argument("--model", required=False,
-                                 help="Model path: .pkl for sklearn/xgb, .csv for SAS or R logistic")
-    validate_parser.add_argument("--raw", required=False, help="Path to raw input data file")
-    validate_parser.add_argument("--cleaned", required=False, help="Path to cleaned input data file")
-    validate_parser.add_argument("--rules", required=True, help="Path to rules.yaml config file")
-    validate_parser.add_argument("--target", required=False, help="Target column name (optional)")
-    validate_parser.add_argument("--features", required=False, help="Comma-separated list of features")
-    validate_parser.add_argument(
-        "--report_path",
-        type=str,
-        default="reports/final_report.docx",
-        help="Path to output DOCX report. Example: --report_path my_reports/run1.docx"
-    )
-
-
-    init_parser = subparsers.add_parser("init", help="Generate starter rules.yaml file")
-    init_parser.add_argument("--scenario", required=True, choices=["A", "B", "C"],
-                             help="Choose validation scenario: A (single model), B (multiple segments), C (single dataset + segment column)")
-    init_parser.add_argument(
-        "--output", type=str, default="rules.yaml",
-        help="Destination path for rules YAML file (default: rules.yaml)"
-    )
-    return parser.parse_args()
```
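For context, a quick sketch of how this deleted parser resolved a command line (argv values are illustrative; paste it after the `parse_args` definition above to run it):

```python
import sys

# Simulates: tanml validate --rules rules.yaml --model model.pkl
sys.argv = ["tanml", "validate", "--rules", "rules.yaml", "--model", "model.pkl"]
args = parse_args()
assert args.command == "validate"
assert args.report_path == "reports/final_report.docx"  # the default defined above
```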
tanml/cli/init_cmd.py
DELETED
```diff
@@ -1,8 +0,0 @@
-from tanml.utils.yaml_generator import generate_rules_yaml
-
-def run_init(scenario, dest_path="rules.yaml", overwrite=False):
-    try:
-        generate_rules_yaml(scenario=scenario, dest_path=dest_path, overwrite=overwrite)
-
-    except Exception as e:
-        print(f"❌ Failed to create YAML: {e}")
```
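Its usage was correspondingly thin, e.g. (illustrative; the module is gone in 0.1.7):

```python
from tanml.cli.init_cmd import run_init

run_init(scenario="A", dest_path="rules.yaml")  # writes a starter Scenario-A config, or prints the failure
```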
tanml/cli/validate_cmd.py
DELETED
tanml/config_templates/rules_multiple_models_datasets.yaml
DELETED

```diff
@@ -1,144 +0,0 @@
-# ============================================================
-# TanML Validation Configuration File: Scenario B
-# ------------------------------------------------------------
-# 🧪 Scenario: One model and one cleaned dataset per segment
-#
-# ✅ Required:
-#   - Define segment-wise cleaned data and model path (or retrain)
-#   - Choose ONE model source strategy (Option A or Option B)
-#   - Set input features and target column
-#   - Optional: provide global raw data
-#   - Adjust thresholds and check options as needed
-# ============================================================
-
-# ------------------------------------------
-# REQUIRED: Model Input Schema
-# ------------------------------------------
-model:
-  features:
-    - feature_0        # 👉 replace with actual feature names used in all segment models
-    - feature_1        # 👉 remove or add more lines if needed
-    - feature_2
-  target: default_flag # 👉 replace with your actual target column
-
-# ------------------------------------------
-# OPTIONAL: Raw Data Path
-# ------------------------------------------
-paths:
-  raw_data: data/raw.csv # 👉 optional – provide full path if you have original raw dataset otherwise use null
-
-output:
-  report_path_template: /absolute/path/to/output/{segment} # 👉 Output path template – {segment} will be replaced dynamically, it is folder path
-
-# ------------------------------------------
-# ✅ OPTION A — Pretrained Model per Segment (.pkl)
-# ------------------------------------------
-# Use this option when each segment has its own trained model and cleaned dataset.
-# You must provide:
-#   - model: path to the pretrained `.pkl` file
-#   - cleaned: path to the cleaned CSV used for that segment
-#
-# 👉 Comment out the OPTION B block below if using this
-# ------------------------------------------
-segment:
-  runs:
-    segment_A: # 👉 rename (e.g., high_risk, bronze, tier_1)
-      model: models/logistic/model_b_segment1.pkl
-      cleaned: data/scenario_b_segment1.csv
-
-    segment_B:
-      model: models/logistic/model_b_segment1.pkl
-      cleaned: data/scenario_b_segment2.csv
-
-# ------------------------------------------
-# 🔁 OPTION B — Retrain Model from Cleaned Data (Per Segment)
-# ------------------------------------------
-# Use this when you want to retrain a model for each segment
-# using the corresponding cleaned dataset and shared model config.
-#
-# 👉 If using this, comment out OPTION A (i.e., segment.runs[].model lines)
-# 👉 Each segment must still have its own cleaned dataset
-# ------------------------------------------
-
-# segment:
-#   runs:
-#     segment_A:
-#       cleaned: data/cleaned_a.csv
-#     segment_B:
-#       cleaned: data/cleaned_b.csv
-
-# ------------------------------------------
-# MODEL SOURCE CONFIG
-# ------------------------------------------
-# This tells TanML how to load or build the model(s) for each segment.
-#
-# 👉 If using pretrained models (Option A): set `from_pickle: true`
-# 👉 If retraining per segment (Option B): set `from_pickle: false`
-# ------------------------------------------
-model_source:
-  from_pickle: true # 👉 change to false if using retraining (Option B)
-  type: LogisticRegression
-  module: sklearn.linear_model
-  hyperparameters:
-    penalty: "l2"
-    solver: "liblinear"
-    random_state: 42
-    class_weight: "balanced"
-    max_iter: 100
-
-# ------------------------------------------
-# PERFORMANCE THRESHOLDS
-# ------------------------------------------
-auc_roc:
-  min: 0.60
-
-f1:
-  min: 0.60
-
-ks:
-  min: 0.20
-
-# ------------------------------------------
-# VALIDATION CHECKS
-# ------------------------------------------
-
-EDACheck:
-  enabled: true
-  max_plots: -1 # -1 = all numeric; or set number of columns
-
-correlation:
-  enabled: true
-
-VIFCheck:
-  enabled: true
-
-raw_data_check:
-  enabled: true
-
-model_meta:
-  enabled: true
-
-# ------------------------------------------
-# STRESS TESTING (Robustness Check)
-# ------------------------------------------
-StressTestCheck:
-  enabled: true
-  epsilon: 0.01         # ➜ 1% noise
-  perturb_fraction: 0.2 # ➜ 20% of rows
-
-# ------------------------------------------
-# INPUT CLUSTER COVERAGE
-# ------------------------------------------
-InputClusterCoverageCheck:
-  enabled: true
-  n_clusters: 5 # ➜ fixed clusters for coverage bar chart
-  max_k: 10     # ➜ elbow method search (if needed)
-
-# ------------------------------------------
-# EXPLAINABILITY
-# ------------------------------------------
-explainability:
-  shap:
-    enabled: true
-    background_sample_size: 100 # ➜ SHAP explainer training background
-    test_sample_size: 200       # ➜ test rows to explain
```
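A minimal sketch of a per-segment loop consuming this schema (assumes PyYAML, pandas, and scikit-learn-style estimators; illustrative, not TanML's engine code):

```python
import pickle

import pandas as pd
import yaml
from sklearn.linear_model import LogisticRegression

with open("rules.yaml") as fh:
    cfg = yaml.safe_load(fh)

features, target = cfg["model"]["features"], cfg["model"]["target"]

for name, run in cfg["segment"]["runs"].items():
    df = pd.read_csv(run["cleaned"])
    if cfg["model_source"]["from_pickle"]:          # Option A: pretrained model per segment
        with open(run["model"], "rb") as fh:
            model = pickle.load(fh)
    else:                                           # Option B: retrain on the segment's cleaned data
        model = LogisticRegression(**cfg["model_source"]["hyperparameters"])
        model.fit(df[features], df[target])
    print(name, model.score(df[features], df[target]))
```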
tanml/config_templates/rules_one_dataset_segment_column.yaml
DELETED

```diff
@@ -1,140 +0,0 @@
-# ============================================================
-# TanML Validation Configuration File: Scenario C
-# ------------------------------------------------------------
-# 🧪 Scenario: One dataset with a segment column, one model per segment
-#
-# ✅ Required:
-#   - Provide cleaned data (includes all segments)
-#   - Define the segment column used to split the data
-#   - Choose ONE model source strategy (Option A or Option B)
-#   - Set input features and target column
-#   - Optional: provide global raw data
-#   - Adjust thresholds and check options as needed
-# ============================================================
-
-# ------------------------------------------
-# REQUIRED: Model Input Schema
-# ------------------------------------------
-model:
-  features:
-    - feature_0 # 👉 replace with actual feature names used across all segment models
-    - feature_1
-    - feature_2
-  target: default_flag # 👉 replace with your actual target column
-
-# ------------------------------------------
-# REQUIRED: File Paths
-# ------------------------------------------
-paths:
-  cleaned_data: data/cleaned.csv # 👉 path to full cleaned dataset (includes all segments)
-  raw_data: data/raw.csv         # 👉 optional — use null if raw data not available
-
-# ------------------------------------------
-# OUTPUT CONFIGURATION
-# ------------------------------------------
-# Path template where validation reports will be saved.
-# 👉 Use `{segment}` as a placeholder for the segment name.
-# ------------------------------------------
-output:
-  report_path_template: reports/scenario_c/{segment}_report.docx # 👉 customize this path as needed
-
-# ------------------------------------------
-# ✅ OPTION A — Pretrained Models per Segment (.pkl)
-# ------------------------------------------
-# Provide one model per segment (already trained)
-# This is the default option.
-# 👉 Comment out the OPTION B block below if using this
-# ------------------------------------------
-segment:
-  column: customer_segment # 👉 column used to split segments
-
-  runs:
-    segment_A:
-      model: models/logistic/model_a.pkl
-    segment_B:
-      model: models/logistic/model_b.pkl
-
-# ------------------------------------------
-# 🔁 OPTION B — Retrain Models per Segment from Cleaned Data
-# ------------------------------------------
-# Use this if you want TanML to retrain a model for each segment
-# from the common cleaned dataset.
-# 👉 Comment out the OPTION A block above if using this
-# 👉 `segment.runs` must list segment values (no model paths needed)
-# ------------------------------------------
-# segment:
-#   column: customer_segment
-#   runs:
-#     segment_A: {}
-#     segment_B: {}
-
-# ------------------------------------------
-# MODEL SOURCE CONFIGURATION
-# ------------------------------------------
-# 👉 If using pretrained models: set `from_pickle: true`
-# 👉 If retraining per segment: set `from_pickle: false`
-# ------------------------------------------
-model_source:
-  from_pickle: true
-  type: LogisticRegression
-  module: sklearn.linear_model
-  hyperparameters:
-    penalty: "l2"
-    solver: "liblinear"
-    random_state: 42
-    class_weight: "balanced"
-    max_iter: 100
-
-# ------------------------------------------
-# PERFORMANCE THRESHOLDS
-# ------------------------------------------
-auc_roc:
-  min: 0.60
-f1:
-  min: 0.60
-ks:
-  min: 0.20
-
-# ------------------------------------------
-# VALIDATION CHECKS
-# ------------------------------------------
-EDACheck:
-  enabled: true
-  max_plots: -1
-
-correlation:
-  enabled: true
-
-VIFCheck:
-  enabled: true
-
-raw_data_check:
-  enabled: true
-
-model_meta:
-  enabled: true
-
-# ------------------------------------------
-# STRESS TESTING (Robustness Check)
-# ------------------------------------------
-StressTestCheck:
-  enabled: true
-  epsilon: 0.01
-  perturb_fraction: 0.2
-
-# ------------------------------------------
-# INPUT CLUSTER COVERAGE
-# ------------------------------------------
-InputClusterCoverageCheck:
-  enabled: true
-  n_clusters: 5
-  max_k: 10
-
-# ------------------------------------------
-# EXPLAINABILITY
-# ------------------------------------------
-explainability:
-  shap:
-    enabled: true
-    background_sample_size: 100
-    test_sample_size: 200
```
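Operationally, Scenario C differs from Scenario B mainly in splitting one cleaned file on `segment.column` and interpolating `{segment}` into `output.report_path_template`; roughly (again illustrative, not engine code):

```python
import pandas as pd
import yaml

with open("rules.yaml") as fh:
    cfg = yaml.safe_load(fh)

df = pd.read_csv(cfg["paths"]["cleaned_data"])

for seg_value, seg_df in df.groupby(cfg["segment"]["column"]):
    report_path = cfg["output"]["report_path_template"].format(segment=seg_value)
    # e.g. reports/scenario_c/segment_A_report.docx
    print(seg_value, len(seg_df), "->", report_path)
```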