tanml-0.1.6-py3-none-any.whl → tanml-0.1.7-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


Files changed (49)
  1. tanml/__init__.py +1 -1
  2. tanml/check_runners/cleaning_repro_runner.py +2 -2
  3. tanml/check_runners/correlation_runner.py +49 -12
  4. tanml/check_runners/explainability_runner.py +12 -22
  5. tanml/check_runners/logistic_stats_runner.py +196 -17
  6. tanml/check_runners/performance_runner.py +82 -26
  7. tanml/check_runners/raw_data_runner.py +29 -14
  8. tanml/check_runners/regression_metrics_runner.py +195 -0
  9. tanml/check_runners/stress_test_runner.py +23 -6
  10. tanml/check_runners/vif_runner.py +33 -27
  11. tanml/checks/correlation.py +241 -41
  12. tanml/checks/explainability/shap_check.py +261 -29
  13. tanml/checks/logit_stats.py +186 -54
  14. tanml/checks/performance_classification.py +305 -0
  15. tanml/checks/raw_data.py +58 -23
  16. tanml/checks/regression_metrics.py +167 -0
  17. tanml/checks/stress_test.py +157 -53
  18. tanml/cli/main.py +99 -27
  19. tanml/engine/check_agent_registry.py +20 -10
  20. tanml/engine/core_engine_agent.py +199 -37
  21. tanml/models/registry.py +329 -0
  22. tanml/report/report_builder.py +1180 -147
  23. tanml/report/templates/report_template_cls.docx +0 -0
  24. tanml/report/templates/report_template_reg.docx +0 -0
  25. tanml/ui/app.py +1205 -0
  26. tanml/utils/data_loader.py +105 -15
  27. tanml-0.1.7.dist-info/METADATA +164 -0
  28. tanml-0.1.7.dist-info/RECORD +54 -0
  29. tanml/cli/arg_parser.py +0 -31
  30. tanml/cli/init_cmd.py +0 -8
  31. tanml/cli/validate_cmd.py +0 -7
  32. tanml/config_templates/rules_multiple_models_datasets.yaml +0 -144
  33. tanml/config_templates/rules_one_dataset_segment_column.yaml +0 -140
  34. tanml/config_templates/rules_one_model_one_dataset.yaml +0 -143
  35. tanml/engine/segmentation_agent.py +0 -118
  36. tanml/engine/validation_agent.py +0 -91
  37. tanml/report/templates/report_template.docx +0 -0
  38. tanml/utils/model_loader.py +0 -35
  39. tanml/utils/r_loader.py +0 -30
  40. tanml/utils/sas_loader.py +0 -50
  41. tanml/utils/yaml_generator.py +0 -34
  42. tanml/utils/yaml_loader.py +0 -5
  43. tanml/validate.py +0 -209
  44. tanml-0.1.6.dist-info/METADATA +0 -317
  45. tanml-0.1.6.dist-info/RECORD +0 -62
  46. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/WHEEL +0 -0
  47. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/entry_points.txt +0 -0
  48. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/licenses/LICENSE +0 -0
  49. {tanml-0.1.6.dist-info → tanml-0.1.7.dist-info}/top_level.txt +0 -0
tanml/utils/data_loader.py CHANGED
@@ -1,17 +1,107 @@
-import os
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any, Optional
 
 import pandas as pd
 
-def load_dataframe(filepath):
-    ext = os.path.splitext(filepath)[1].lower()
-    if ext == ".csv":
-        return pd.read_csv(filepath)
-    elif ext in [".xls", ".xlsx"]:
-        return pd.read_excel(filepath)
-    elif ext == ".parquet":
-        return pd.read_parquet(filepath)
-    elif ext == ".sas7bdat":
-        return pd.read_sas(filepath)
-    elif ext in [".txt", ".tsv"]:
-        return pd.read_csv(filepath, sep="\t")
-    else:
-        raise ValueError(f"Unsupported file format: {ext}")
+
+def load_dataframe(filepath: str | Path,
+                   sheet_name: Optional[str | int] = None,
+                   sep: Optional[str] = None,
+                   encoding: Optional[str] = None,
+                   **_: Any) -> pd.DataFrame:
+    """
+    Universal table loader with safe fallbacks.
+
+    Supports:
+      .csv, .txt, .tsv
+      .xlsx, .xls (needs openpyxl)
+      .parquet (pyarrow or fastparquet)
+      .feather/.ft (pyarrow)
+      .json
+      .pkl/.pickle
+      .sas7bdat/.xpt (pyreadstat preferred; falls back to pandas.read_sas)
+      .sav (SPSS; pyreadstat)
+      .dta (Stata)
+    """
+    p = Path(filepath)
+    ext = p.suffix.lower()
+
+    # --- Delimited text ---
+    if ext in {".csv", ".txt"}:
+        # If sep is unspecified:
+        #   - CSV: assume comma (keeps fast C engine, no warning)
+        #   - TXT: keep sep=None (we'll use Python engine to infer)
+        eff_sep = sep if sep is not None else ("," if ext == ".csv" else None)
+        engine = "c" if eff_sep is not None else "python"
+        try:
+            return pd.read_csv(p, sep=eff_sep, encoding=encoding, engine=engine)
+        except UnicodeDecodeError:
+            return pd.read_csv(p, sep=eff_sep, encoding=encoding or "latin-1", engine=engine)
+
+    if ext == ".tsv":
+        try:
+            return pd.read_csv(p, sep="\t", encoding=encoding, engine="c")
+        except UnicodeDecodeError:
+            return pd.read_csv(p, sep="\t", encoding=encoding or "latin-1", engine="c")
+
+
+    # --- Excel ---
+    if ext in {".xlsx", ".xls"}:
+        try:
+            return pd.read_excel(p, sheet_name=0 if sheet_name is None else sheet_name)
+        except ImportError as e:
+            raise ModuleNotFoundError(
+                "openpyxl>=3.1 is required for Excel files. Install with: pip install openpyxl"
+            ) from e
+
+    # --- Columnar ---
+    if ext == ".parquet":
+        try:
+            return pd.read_parquet(p, engine="pyarrow")
+        except Exception:
+            return pd.read_parquet(p, engine="fastparquet")
+
+    if ext in {".feather", ".ft"}:
+        try:
+            import pyarrow  # noqa: F401
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(
+                "pyarrow is required for Feather files. Install with: pip install pyarrow"
+            ) from e
+        return pd.read_feather(p)
+
+    # --- Other common formats ---
+    if ext in {".pkl", ".pickle"}:
+        return pd.read_pickle(p)
+
+    if ext == ".json":
+        return pd.read_json(p, convert_dates=True)
+
+    # --- SAS / SPSS / Stata ---
+    if ext in {".sas7bdat", ".xpt"}:
+        try:
+            import pyreadstat  # type: ignore
+            if ext == ".sas7bdat":
+                df, _ = pyreadstat.read_sas7bdat(str(p))
+            else:
+                df, _ = pyreadstat.read_xport(str(p))
+            return df
+        except ModuleNotFoundError:
+            # best-effort fallback
+            return pd.read_sas(p)
+
+    if ext == ".sav":  # SPSS
+        try:
+            import pyreadstat  # type: ignore
+        except ModuleNotFoundError as e:
+            raise ModuleNotFoundError(
+                "pyreadstat is required for SPSS .sav files. Install with: pip install pyreadstat"
+            ) from e
+        df, _ = pyreadstat.read_sav(str(p))
+        return df
+
+    if ext == ".dta":  # Stata
+        return pd.read_stata(p)
+
+    raise ValueError(f"Unsupported file format: {ext} (path={p})")
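For orientation, a minimal usage sketch of the reworked loader follows. The import path and keyword arguments come from the diff above; the file names are placeholders.

```python
# Usage sketch for the new load_dataframe (file names are hypothetical).
from tanml.utils.data_loader import load_dataframe

df_train = load_dataframe("train.csv")                        # comma-separated, fast C engine
df_notes = load_dataframe("legacy_export.txt")                # delimiter inferred via the Python engine
df_book = load_dataframe("portfolio.xlsx", sheet_name="Q1")   # requires openpyxl
df_sas = load_dataframe("accounts.sas7bdat")                  # pyreadstat if installed, else pandas.read_sas
```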
tanml-0.1.7.dist-info/METADATA ADDED
@@ -0,0 +1,164 @@
+Metadata-Version: 2.4
+Name: tanml
+Version: 0.1.7
+Summary: Automated validation toolkit for tabular ML models—MRM, credit risk, insurance, and other regulated use cases.
+Author: Tanmay Sah, Dolly Sah
+Maintainer: Tanmay Sah, Dolly Sah
+License-Expression: MIT
+Project-URL: Homepage, https://github.com/tdlabs-ai/tanml
+Project-URL: Source, https://github.com/tdlabs-ai/tanml
+Project-URL: Issues, https://github.com/tdlabs-ai/tanml/issues
+Project-URL: Documentation, https://github.com/tdlabs-ai/tanml#readme
+Keywords: model validation,model risk management,model governance,SR 11-7,tabular ML,credit risk,insurance analytics,explainability,XAI,SHAP,stress testing,reporting,docx,streamlit,xgboost,lightgbm,catboost
+Classifier: Development Status :: 4 - Beta
+Classifier: Intended Audience :: Science/Research
+Classifier: Intended Audience :: Financial and Insurance Industry
+Classifier: Natural Language :: English
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3 :: Only
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Classifier: Programming Language :: Python :: 3.11
+Classifier: Programming Language :: Python :: 3.12
+Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
+Classifier: Topic :: Scientific/Engineering :: Information Analysis
+Classifier: Topic :: Scientific/Engineering :: Visualization
+Requires-Python: >=3.8
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.26
+Requires-Dist: scipy>=1.11
+Requires-Dist: pandas>=2.0
+Requires-Dist: scikit-learn>=1.3
+Requires-Dist: statsmodels>=0.14
+Requires-Dist: xgboost>=2.0
+Requires-Dist: lightgbm>=4.3
+Requires-Dist: catboost>=1.2
+Requires-Dist: shap>=0.44
+Requires-Dist: numba>=0.58
+Requires-Dist: matplotlib>=3.8
+Requires-Dist: seaborn>=0.13
+Requires-Dist: Pillow>=10.0
+Requires-Dist: python-docx>=1.1.2
+Requires-Dist: tzlocal>=5.0
+Requires-Dist: tqdm>=4.66
+Requires-Dist: pyarrow>=14.0
+Requires-Dist: openpyxl>=3.1
+Requires-Dist: pyreadstat>=1.2
+Requires-Dist: streamlit>=1.36
+Provides-Extra: dev
+Requires-Dist: pytest; extra == "dev"
+Requires-Dist: black; extra == "dev"
+Requires-Dist: isort; extra == "dev"
+Dynamic: license-file
+
+# TanML: Automated Model Validation Toolkit for Tabular Machine Learning
+
+[![Cite this repo](https://img.shields.io/badge/Cite-this_repo-blue)](https://github.com/tdlabs-ai/tanml#license--citation)
+[![License: MIT](https://img.shields.io/badge/License-MIT-green.svg)](https://opensource.org/licenses/MIT)
+[![Downloads](https://pepy.tech/badge/tanml)](https://pepy.tech/project/tanml)
+
+**TanML** validates tabular ML models with a zero-config **Streamlit UI** and exports an audit-ready, **editable Word report (.docx)**. It covers data quality, correlation/VIF, performance, explainability (SHAP), and robustness/stress tests—built for regulated settings (MRM, credit risk, insurance, etc.).
+
+* **Status:** Beta (`0.x`)
+* **License:** MIT
+* **Python:** 3.8–3.12
+* **OS:** Linux / macOS / Windows (incl. WSL)
+
+---
+
+## Why TanML?
+
+* **Zero-config UI:** launch Streamlit, upload data, click **Run**—no YAML needed.
+* **Audit-ready outputs:** tables/plots + a polished DOCX your stakeholders can edit.
+* **Regulatory alignment:** supports common Model Risk Management themes (e.g., SR 11-7 style).
+* **Works with your stack:** scikit-learn, XGBoost/LightGBM/CatBoost, etc.
+
+---
+
+## Install
+
+```bash
+pip install tanml
+```
+
+## Quick Start (UI)
+
+```bash
+tanml ui
+```
+
+* Opens at **[http://127.0.0.1:8501](http://127.0.0.1:8501)**
+* **Upload limit ~1 GB** (preconfigured)
+* **Telemetry disabled by default**
+
+### In the app
+
+1. **Load data** — upload a cleaned CSV/XLSX/Parquet (optional: raw or separate Train/Test).
+2. **Select target & features** — target auto-suggested; features default to all non-target columns.
+3. **Pick a model** — choose library/algorithm (scikit-learn, XGBoost, LightGBM, CatBoost) and tweak params.
+4. **Run validation** — click **▶️ Refit & validate**.
+5. **Export** — click **⬇️ Download report** to get a **DOCX** (auto-selects classification/regression template).
+
+**Outputs**
+
+* Report: `./.ui_runs/<session>/tanml_report_*.docx`
+* Artifacts (CSV/PNGs): `./.ui_runs/<session>/artifacts/*`
+
+---
+
+## What TanML Checks
+
+* **Raw Data (optional):** rows/cols, missingness, duplicates, constant columns
+* **Data Quality & EDA:** summaries, distributions
+* **Correlation & Multicollinearity:** heatmap, top-pairs CSV, **VIF** table
+* **Performance**
+
+  * **Classification:** AUC, PR-AUC, KS, decile lift, confusion
+  * **Regression:** R², MAE, MSE/RMSE, error stats
+* **Explainability:** SHAP (auto explainer; configurable background size)
+* **Robustness/Stress Tests:** feature perturbations → delta-metrics
+* **Model Metadata:** model class, hyperparameters, features, training info
+
+---
+
+
+## Templates
+
+TanML ships DOCX templates (packaged in wheel & sdist):
+
+* `tanml/report/templates/report_template_cls.docx`
+* `tanml/report/templates/report_template_reg.docx`
+
+---
+
+
+## License & Citation
+
+**License:** MIT. See [LICENSE](https://github.com/tdlabs-ai/tanml/blob/main/LICENSE).
+SPDX-License-Identifier: MIT
+
+© 2025 Tanmay Sah and Dolly Sah. You may use, modify, and distribute this software with appropriate attribution.
+
+### How to cite
+
+If TanML helps your work or publications, please cite:
+
+> Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning* [Software]. Available at https://github.com/tdlabs-ai/tanml
+
+Or in BibTeX (version-agnostic):
+
+```bibtex
+@misc{tanml,
+  author = {Sah, Tanmay and Sah, Dolly},
+  title = {TanML: Automated Model Validation Toolkit for Tabular Machine Learning},
+  year = {2025},
+  note = {Software; MIT License},
+  url = {https://github.com/tdlabs-ai/tanml}
+}
+```
+
+A machine-readable citation file (`CITATION.cff`) is included for citation tools and GitHub’s “Cite this repository” button.
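The README above describes the robustness check as feature perturbations followed by delta-metrics. The sketch below illustrates that idea generically; it is not TanML's internal API, and the function name `stress_delta_auc` is an assumption, with `epsilon` and `perturb_fraction` named after the knobs in the retired YAML templates further down this diff.

```python
# Generic perturbation stress test: add small relative noise to a sample of rows
# and report the change in a performance metric. Not TanML's internal API.
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

def stress_delta_auc(model, X: pd.DataFrame, y, epsilon=0.01, perturb_fraction=0.2, seed=42):
    rng = np.random.default_rng(seed)
    X_pert = X.copy()
    rows = rng.choice(len(X_pert), size=int(perturb_fraction * len(X_pert)), replace=False)
    num_cols = X_pert.select_dtypes("number").columns
    noise = rng.normal(0.0, epsilon, size=(len(rows), len(num_cols)))
    X_pert.loc[X_pert.index[rows], num_cols] *= (1.0 + noise)   # epsilon-scaled relative noise
    base = roc_auc_score(y, model.predict_proba(X)[:, 1])
    stressed = roc_auc_score(y, model.predict_proba(X_pert)[:, 1])
    return {"auc_base": base, "auc_stressed": stressed, "delta_auc": stressed - base}
```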
tanml-0.1.7.dist-info/RECORD ADDED
@@ -0,0 +1,54 @@
+tanml/__init__.py,sha256=GhTYFGQ77wbJM35lScp2XZKYnkNHTrdYozPoCZiCVSM,23
+tanml/check_runners/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/check_runners/base_runner.py,sha256=jibUcl6a7SQH9EKnPFZhU1t9FY-Bu_nkPjRfjW2T5x0,210
+tanml/check_runners/cleaning_repro_runner.py,sha256=fXlujWnuA0Hfzu9uQ31itU8eO-6wGD4eUImiR6kyHfg,817
+tanml/check_runners/correlation_runner.py,sha256=-TxktyjXi8IX4t0DCTe-9Ea5vVb-RcN6gBptbzUJdNQ,1975
+tanml/check_runners/data_quality_runner.py,sha256=IdhWYOtDBPkAwT2Aa6SYSiI2gkLEhkBXHAQop5ZFg0I,883
+tanml/check_runners/eda_runner.py,sha256=Gr5ZmgOvUilej-rhinsL9KmsIQOMWjd6jdOYUQTJ4os,623
+tanml/check_runners/explainability_runner.py,sha256=I40KVRoN1yEzZ6i1WZ6a1g89cPrRHpZ97lbfMEyOq2U,908
+tanml/check_runners/input_cluster_runner.py,sha256=0O8JJq3HOvNd3_nrZY8FevVYimI_Nl2Iftyyx56LVzc,1251
+tanml/check_runners/logistic_stats_runner.py,sha256=vTwnWHwUjl7ML2PKDteOgUc247D7xGXkKa8NfxUxYa4,7739
+tanml/check_runners/model_meta_runner.py,sha256=hOIEp6k98o0FwYjgmi_53Ni7H14jerKdf46wpxVkAkk,799
+tanml/check_runners/performance_runner.py,sha256=Q9r5mqj2ayOSfIN2SVaF4sm_Gc-pVyPfU3Q9NE9uZ8k,2567
+tanml/check_runners/raw_data_runner.py,sha256=qxX2Fst7u2VUwobtWr1StaDj8x5q1P01zCDhB0ewbdY,1531
+tanml/check_runners/regression_metrics_runner.py,sha256=wPDtGs713aX0rVRcUNsWVIybQee8MlHt6dxJxigRAFY,5774
+tanml/check_runners/rule_engine_runner.py,sha256=xgij_9S8kUSESgXeWGN7FnOD0hNmd3GAregxuD_x9nI,286
+tanml/check_runners/stress_test_runner.py,sha256=s4Ax7JnqGzqc3kw0wfCGt3LNIcZzZMX30Uk73zPDT34,1544
+tanml/check_runners/vif_runner.py,sha256=T7ho2jYnHKuz3P0R5CuoNB0LjPoH_Gs_FS0dcoyYbj4,2311
+tanml/checks/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/checks/base.py,sha256=g1GLXHWkE848bDKb4zXLHRqfgucVWFa-n6ZDT5vzs2I,570
+tanml/checks/cleaning_repro.py,sha256=LtPEkikjTY0NMsv1FZ8tOPUkaqF3yK_-DQdLCltMsIA,1795
+tanml/checks/correlation.py,sha256=IN86Yc-yUlBKNSaK-YTlGM4V2DH6BPZw5KHJVN-hE2c,10350
+tanml/checks/data_quality.py,sha256=bNrM469c_G-TxU-6ne2UYv5gwoUoyrab4lYbI16l7VU,981
+tanml/checks/eda.py,sha256=7fOIBhCg0oiK4SL22WcKHK8vL3s-YwypiAnZDwoq3lo,2515
+tanml/checks/input_cluster.py,sha256=93RvE-vd_elt4xqnkCkHmHH6NWPOtJF7ylb5mPTsagA,3990
+tanml/checks/logit_stats.py,sha256=OnL2_M4bg-T5RE5Rv_wMJNkaX4Ox1ihSk428_7OJ15w,6958
+tanml/checks/model_contents.py,sha256=jcS4GrUoaQVUBf9pFwvtv9KKvDO5vZroiPz8mOmyAGo,1263
+tanml/checks/model_meta.py,sha256=x2RnT3kUWY4ERWfM9cItpaIJX8Fca9IQByR9zPN2V20,1881
+tanml/checks/performance.py,sha256=vQ_FeIcr3ASDhVd97y2JKMEMfJUrfCRnUBZh1ZI6rCs,3400
+tanml/checks/performance_classification.py,sha256=2MnMeQEBVswCOjT7hFK7gjRBiXmhLgriA6_qw_9jsXk,11982
+tanml/checks/raw_data.py,sha256=asjyURkvBQ1qOfTJkIZblcyELf1HO044jRo1BZGsPOQ,2862
+tanml/checks/regression_metrics.py,sha256=736Hk6tmCwR8uFUJF1Z5kszFiT_qHM86jBPGveb_BkY,6158
+tanml/checks/rule_engine.py,sha256=bqGd0-2MlSPGGUFPyMxgHiwVTvw1Iu7NFqUzl_-0iIg,1694
+tanml/checks/stress_test.py,sha256=64fVn96uzGg7_qvPMwiWhYRSr4FflFMkkjXQXmv2YB0,7460
+tanml/checks/vif.py,sha256=0tghDaiG8z4frgP5OHj3ctgsH4Mak1N7Docu6OcG8JA,1902
+tanml/checks/explainability/shap_check.py,sha256=Di5WD3jUgs1TEwnhCoEU0dhvMlBUf0I3tYNpsjST1wk,12829
+tanml/cli/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/cli/main.py,sha256=A9W7RXYbnvw_O-jJDbwvkAOL4NQF93XNsn3PlzNuqZg,3929
+tanml/config_templates/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/engine/check_agent_registry.py,sha256=CrRAL_BnsXcivXLQydka2R8emHrkHOqZDSzuqZNmknc,2419
+tanml/engine/core_engine_agent.py,sha256=-DE9ehdY60W2IAzNMnv-BTOf3KPJobX8p5dBcWH1aiE,9334
+tanml/models/registry.py,sha256=SMAwKTLVIFYQ_KrYsaxm-dmSfyE_zk1Mo1AL8-JZt1w,15467
+tanml/report/report_builder.py,sha256=_UuaLxJOuvfXZjOWIrI478G8jMUy2CxQNi2YDR-8FYI,55915
+tanml/report/templates/report_template_cls.docx,sha256=1pAKIvC8nG2JGN3H7SpU10QVaeDNgJI1aOi1SH6TCzs,30199
+tanml/report/templates/report_template_reg.docx,sha256=bjr-18WFD-KwtRtNcbRipstyNa29z7hF-JHp86OMKVg,29673
+tanml/ui/app.py,sha256=n6seLRuoWSF6sK6arewyhhEyfEhNlGeyhourRAB_loc,47682
+tanml/utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+tanml/utils/data_loader.py,sha256=_Ja3XSONwL_NVwpxuBjLacXWX5yPTN9H0moFxSVqGlw,3713
+tanml-0.1.7.dist-info/licenses/LICENSE,sha256=e6xQyG7SdWiD4cLlj7rFdMrjG6H7ABglGOrooZxWLKQ,1102
+tanml-0.1.7.dist-info/METADATA,sha256=gy0-NjrrTIIl9J0kFL2iNTI07AYuNSa-GEf_Iq1wSHo,6118
+tanml-0.1.7.dist-info/WHEEL,sha256=_zCd3N1l69ArxyTb8rzEoP9TpbYXkqRFSNOD5OuxnTs,91
+tanml-0.1.7.dist-info/entry_points.txt,sha256=WUM_y0uRIL7iXPcxK69Bn5mKZXnDshWDbLyohjND1IE,46
+tanml-0.1.7.dist-info/top_level.txt,sha256=81dIhCm6opwY6E7Pb9G1kdIVmYrUkXX4PaYhQ873gIE,6
+tanml-0.1.7.dist-info/RECORD,,
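For readers unfamiliar with the RECORD format: each entry is `path,sha256=<digest>,<size>`, where the digest is the URL-safe base64 encoding of the file's SHA-256 hash with trailing `=` padding stripped (per the wheel/RECORD spec). A quick sketch to reproduce a digest from an unpacked wheel:

```python
# Recompute a wheel RECORD digest for one file (path is relative to the unpacked wheel root).
import base64
import hashlib
from pathlib import Path

def record_digest(path) -> str:
    digest = hashlib.sha256(Path(path).read_bytes()).digest()
    return base64.urlsafe_b64encode(digest).rstrip(b"=").decode("ascii")

# record_digest("tanml/__init__.py") should match the first RECORD entry above,
# i.e. "GhTYFGQ77wbJM35lScp2XZKYnkNHTrdYozPoCZiCVSM".
```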
tanml/cli/arg_parser.py DELETED
@@ -1,31 +0,0 @@
-import argparse
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Run TanML model validation toolkit")
-    subparsers = parser.add_subparsers(dest="command", required=True)
-
-
-    validate_parser = subparsers.add_parser("validate", help="Run validation checks and generate report")
-    validate_parser.add_argument("--model", required=False,
-                                 help="Model path: .pkl for sklearn/xgb, .csv for SAS or R logistic")
-    validate_parser.add_argument("--raw", required=False, help="Path to raw input data file")
-    validate_parser.add_argument("--cleaned", required=False, help="Path to cleaned input data file")
-    validate_parser.add_argument("--rules", required=True, help="Path to rules.yaml config file")
-    validate_parser.add_argument("--target", required=False, help="Target column name (optional)")
-    validate_parser.add_argument("--features", required=False, help="Comma-separated list of features")
-    validate_parser.add_argument(
-        "--report_path",
-        type=str,
-        default="reports/final_report.docx",
-        help="Path to output DOCX report. Example: --report_path my_reports/run1.docx"
-    )
-
-
-    init_parser = subparsers.add_parser("init", help="Generate starter rules.yaml file")
-    init_parser.add_argument("--scenario", required=True, choices=["A", "B", "C"],
-                             help="Choose validation scenario: A (single model), B (multiple segments), C (single dataset + segment column)")
-    init_parser.add_argument(
-        "--output", type=str, default="rules.yaml",
-        help="Destination path for rules YAML file (default: rules.yaml)"
-    )
-    return parser.parse_args()
tanml/cli/init_cmd.py DELETED
@@ -1,8 +0,0 @@
-from tanml.utils.yaml_generator import generate_rules_yaml
-
-def run_init(scenario, dest_path="rules.yaml", overwrite=False):
-    try:
-        generate_rules_yaml(scenario=scenario, dest_path=dest_path, overwrite=overwrite)
-
-    except Exception as e:
-        print(f"❌ Failed to create YAML: {e}")
tanml/cli/validate_cmd.py DELETED
@@ -1,7 +0,0 @@
-# tanml/cli/validate_cmd.py
-
-from tanml.validate import validate_from_yaml
-
-def run_validate(rules_path):
-    print(f"🧪 Starting validation using rules from: {rules_path}")
-    validate_from_yaml(rules_path)
tanml/config_templates/rules_multiple_models_datasets.yaml DELETED
@@ -1,144 +0,0 @@
-# ============================================================
-# TanML Validation Configuration File: Scenario B
-# ------------------------------------------------------------
-# 🧪 Scenario: One model and one cleaned dataset per segment
-#
-# ✅ Required:
-#   - Define segment-wise cleaned data and model path (or retrain)
-#   - Choose ONE model source strategy (Option A or Option B)
-#   - Set input features and target column
-#   - Optional: provide global raw data
-#   - Adjust thresholds and check options as needed
-# ============================================================
-
-# ------------------------------------------
-# REQUIRED: Model Input Schema
-# ------------------------------------------
-model:
-  features:
-    - feature_0          # 👉 replace with actual feature names used in all segment models
-    - feature_1          # 👉 remove or add more lines if needed
-    - feature_2
-  target: default_flag   # 👉 replace with your actual target column
-
-# ------------------------------------------
-# OPTIONAL: Raw Data Path
-# ------------------------------------------
-paths:
-  raw_data: data/raw.csv   # 👉 optional – provide full path if you have original raw dataset otherwise use null
-
-output:
-  report_path_template: /absolute/path/to/output/{segment}   # 👉 Output path template – {segment} will be replaced dynamically, it is folder path
-
-# ------------------------------------------
-# ✅ OPTION A — Pretrained Model per Segment (.pkl)
-# ------------------------------------------
-# Use this option when each segment has its own trained model and cleaned dataset.
-# You must provide:
-#   - model: path to the pretrained `.pkl` file
-#   - cleaned: path to the cleaned CSV used for that segment
-#
-# 👉 Comment out the OPTION B block below if using this
-# ------------------------------------------
-segment:
-  runs:
-    segment_A:   # 👉 rename (e.g., high_risk, bronze, tier_1)
-      model: models/logistic/model_b_segment1.pkl
-      cleaned: data/scenario_b_segment1.csv
-
-    segment_B:
-      model: models/logistic/model_b_segment1.pkl
-      cleaned: data/scenario_b_segment2.csv
-
-# ------------------------------------------
-# 🔁 OPTION B — Retrain Model from Cleaned Data (Per Segment)
-# ------------------------------------------
-# Use this when you want to retrain a model for each segment
-# using the corresponding cleaned dataset and shared model config.
-#
-# 👉 If using this, comment out OPTION A (i.e., segment.runs[].model lines)
-# 👉 Each segment must still have its own cleaned dataset
-# ------------------------------------------
-
-# segment:
-#   runs:
-#     segment_A:
-#       cleaned: data/cleaned_a.csv
-#     segment_B:
-#       cleaned: data/cleaned_b.csv
-
-# ------------------------------------------
-# MODEL SOURCE CONFIG
-# ------------------------------------------
-# This tells TanML how to load or build the model(s) for each segment.
-#
-# 👉 If using pretrained models (Option A): set `from_pickle: true`
-# 👉 If retraining per segment (Option B): set `from_pickle: false`
-# ------------------------------------------
-model_source:
-  from_pickle: true   # 👉 change to false if using retraining (Option B)
-  type: LogisticRegression
-  module: sklearn.linear_model
-  hyperparameters:
-    penalty: "l2"
-    solver: "liblinear"
-    random_state: 42
-    class_weight: "balanced"
-    max_iter: 100
-
-# ------------------------------------------
-# PERFORMANCE THRESHOLDS
-# ------------------------------------------
-auc_roc:
-  min: 0.60
-
-f1:
-  min: 0.60
-
-ks:
-  min: 0.20
-
-# ------------------------------------------
-# VALIDATION CHECKS
-# ------------------------------------------
-
-EDACheck:
-  enabled: true
-  max_plots: -1   # -1 = all numeric; or set number of columns
-
-correlation:
-  enabled: true
-
-VIFCheck:
-  enabled: true
-
-raw_data_check:
-  enabled: true
-
-model_meta:
-  enabled: true
-
-# ------------------------------------------
-# STRESS TESTING (Robustness Check)
-# ------------------------------------------
-StressTestCheck:
-  enabled: true
-  epsilon: 0.01           # ➜ 1% noise
-  perturb_fraction: 0.2   # ➜ 20% of rows
-
-# ------------------------------------------
-# INPUT CLUSTER COVERAGE
-# ------------------------------------------
-InputClusterCoverageCheck:
-  enabled: true
-  n_clusters: 5   # ➜ fixed clusters for coverage bar chart
-  max_k: 10       # ➜ elbow method search (if needed)
-
-# ------------------------------------------
-# EXPLAINABILITY
-# ------------------------------------------
-explainability:
-  shap:
-    enabled: true
-    background_sample_size: 100   # ➜ SHAP explainer training background
-    test_sample_size: 200         # ➜ test rows to explain
tanml/config_templates/rules_one_dataset_segment_column.yaml DELETED
@@ -1,140 +0,0 @@
-# ============================================================
-# TanML Validation Configuration File: Scenario C
-# ------------------------------------------------------------
-# 🧪 Scenario: One dataset with a segment column, one model per segment
-#
-# ✅ Required:
-#   - Provide cleaned data (includes all segments)
-#   - Define the segment column used to split the data
-#   - Choose ONE model source strategy (Option A or Option B)
-#   - Set input features and target column
-#   - Optional: provide global raw data
-#   - Adjust thresholds and check options as needed
-# ============================================================
-
-# ------------------------------------------
-# REQUIRED: Model Input Schema
-# ------------------------------------------
-model:
-  features:
-    - feature_0   # 👉 replace with actual feature names used across all segment models
-    - feature_1
-    - feature_2
-  target: default_flag   # 👉 replace with your actual target column
-
-# ------------------------------------------
-# REQUIRED: File Paths
-# ------------------------------------------
-paths:
-  cleaned_data: data/cleaned.csv   # 👉 path to full cleaned dataset (includes all segments)
-  raw_data: data/raw.csv           # 👉 optional — use null if raw data not available
-
-# ------------------------------------------
-# OUTPUT CONFIGURATION
-# ------------------------------------------
-# Path template where validation reports will be saved.
-# 👉 Use `{segment}` as a placeholder for the segment name.
-# ------------------------------------------
-output:
-  report_path_template: reports/scenario_c/{segment}_report.docx   # 👉 customize this path as needed
-
-# ------------------------------------------
-# ✅ OPTION A — Pretrained Models per Segment (.pkl)
-# ------------------------------------------
-# Provide one model per segment (already trained)
-# This is the default option.
-# 👉 Comment out the OPTION B block below if using this
-# ------------------------------------------
-segment:
-  column: customer_segment   # 👉 column used to split segments
-
-  runs:
-    segment_A:
-      model: models/logistic/model_a.pkl
-    segment_B:
-      model: models/logistic/model_b.pkl
-
-# ------------------------------------------
-# 🔁 OPTION B — Retrain Models per Segment from Cleaned Data
-# ------------------------------------------
-# Use this if you want TanML to retrain a model for each segment
-# from the common cleaned dataset.
-# 👉 Comment out the OPTION A block above if using this
-# 👉 `segment.runs` must list segment values (no model paths needed)
-# ------------------------------------------
-# segment:
-#   column: customer_segment
-#   runs:
-#     segment_A: {}
-#     segment_B: {}
-
-# ------------------------------------------
-# MODEL SOURCE CONFIGURATION
-# ------------------------------------------
-# 👉 If using pretrained models: set `from_pickle: true`
-# 👉 If retraining per segment: set `from_pickle: false`
-# ------------------------------------------
-model_source:
-  from_pickle: true
-  type: LogisticRegression
-  module: sklearn.linear_model
-  hyperparameters:
-    penalty: "l2"
-    solver: "liblinear"
-    random_state: 42
-    class_weight: "balanced"
-    max_iter: 100
-
-# ------------------------------------------
-# PERFORMANCE THRESHOLDS
-# ------------------------------------------
-auc_roc:
-  min: 0.60
-f1:
-  min: 0.60
-ks:
-  min: 0.20
-
-# ------------------------------------------
-# VALIDATION CHECKS
-# ------------------------------------------
-EDACheck:
-  enabled: true
-  max_plots: -1
-
-correlation:
-  enabled: true
-
-VIFCheck:
-  enabled: true
-
-raw_data_check:
-  enabled: true
-
-model_meta:
-  enabled: true
-
-# ------------------------------------------
-# STRESS TESTING (Robustness Check)
-# ------------------------------------------
-StressTestCheck:
-  enabled: true
-  epsilon: 0.01
-  perturb_fraction: 0.2
-
-# ------------------------------------------
-# INPUT CLUSTER COVERAGE
-# ------------------------------------------
-InputClusterCoverageCheck:
-  enabled: true
-  n_clusters: 5
-  max_k: 10
-
-# ------------------------------------------
-# EXPLAINABILITY
-# ------------------------------------------
-explainability:
-  shap:
-    enabled: true
-    background_sample_size: 100
-    test_sample_size: 200