tanml-0.1.6.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (68)
  1. tanml-0.1.6/LICENSE +21 -0
  2. tanml-0.1.6/MANIFEST.in +6 -0
  3. tanml-0.1.6/PKG-INFO +317 -0
  4. tanml-0.1.6/README.md +284 -0
  5. tanml-0.1.6/pyproject.toml +54 -0
  6. tanml-0.1.6/setup.cfg +4 -0
  7. tanml-0.1.6/tanml/__init__.py +1 -0
  8. tanml-0.1.6/tanml/check_runners/__init__.py +0 -0
  9. tanml-0.1.6/tanml/check_runners/base_runner.py +6 -0
  10. tanml-0.1.6/tanml/check_runners/cleaning_repro_runner.py +18 -0
  11. tanml-0.1.6/tanml/check_runners/correlation_runner.py +15 -0
  12. tanml-0.1.6/tanml/check_runners/data_quality_runner.py +24 -0
  13. tanml-0.1.6/tanml/check_runners/eda_runner.py +21 -0
  14. tanml-0.1.6/tanml/check_runners/explainability_runner.py +28 -0
  15. tanml-0.1.6/tanml/check_runners/input_cluster_runner.py +43 -0
  16. tanml-0.1.6/tanml/check_runners/logistic_stats_runner.py +28 -0
  17. tanml-0.1.6/tanml/check_runners/model_meta_runner.py +23 -0
  18. tanml-0.1.6/tanml/check_runners/performance_runner.py +28 -0
  19. tanml-0.1.6/tanml/check_runners/raw_data_runner.py +41 -0
  20. tanml-0.1.6/tanml/check_runners/rule_engine_runner.py +5 -0
  21. tanml-0.1.6/tanml/check_runners/stress_test_runner.py +26 -0
  22. tanml-0.1.6/tanml/check_runners/vif_runner.py +54 -0
  23. tanml-0.1.6/tanml/checks/__init__.py +0 -0
  24. tanml-0.1.6/tanml/checks/base.py +20 -0
  25. tanml-0.1.6/tanml/checks/cleaning_repro.py +47 -0
  26. tanml-0.1.6/tanml/checks/correlation.py +61 -0
  27. tanml-0.1.6/tanml/checks/data_quality.py +26 -0
  28. tanml-0.1.6/tanml/checks/eda.py +67 -0
  29. tanml-0.1.6/tanml/checks/explainability/shap_check.py +55 -0
  30. tanml-0.1.6/tanml/checks/input_cluster.py +109 -0
  31. tanml-0.1.6/tanml/checks/logit_stats.py +59 -0
  32. tanml-0.1.6/tanml/checks/model_contents.py +40 -0
  33. tanml-0.1.6/tanml/checks/model_meta.py +50 -0
  34. tanml-0.1.6/tanml/checks/performance.py +90 -0
  35. tanml-0.1.6/tanml/checks/raw_data.py +47 -0
  36. tanml-0.1.6/tanml/checks/rule_engine.py +45 -0
  37. tanml-0.1.6/tanml/checks/stress_test.py +64 -0
  38. tanml-0.1.6/tanml/checks/vif.py +51 -0
  39. tanml-0.1.6/tanml/cli/__init__.py +0 -0
  40. tanml-0.1.6/tanml/cli/arg_parser.py +31 -0
  41. tanml-0.1.6/tanml/cli/init_cmd.py +8 -0
  42. tanml-0.1.6/tanml/cli/main.py +27 -0
  43. tanml-0.1.6/tanml/cli/validate_cmd.py +7 -0
  44. tanml-0.1.6/tanml/config_templates/__init__.py +0 -0
  45. tanml-0.1.6/tanml/config_templates/rules_multiple_models_datasets.yaml +144 -0
  46. tanml-0.1.6/tanml/config_templates/rules_one_dataset_segment_column.yaml +140 -0
  47. tanml-0.1.6/tanml/config_templates/rules_one_model_one_dataset.yaml +143 -0
  48. tanml-0.1.6/tanml/engine/__init__.py +0 -0
  49. tanml-0.1.6/tanml/engine/check_agent_registry.py +42 -0
  50. tanml-0.1.6/tanml/engine/core_engine_agent.py +115 -0
  51. tanml-0.1.6/tanml/engine/segmentation_agent.py +118 -0
  52. tanml-0.1.6/tanml/engine/validation_agent.py +91 -0
  53. tanml-0.1.6/tanml/report/report_builder.py +230 -0
  54. tanml-0.1.6/tanml/report/templates/report_template.docx +0 -0
  55. tanml-0.1.6/tanml/utils/__init__.py +0 -0
  56. tanml-0.1.6/tanml/utils/data_loader.py +17 -0
  57. tanml-0.1.6/tanml/utils/model_loader.py +35 -0
  58. tanml-0.1.6/tanml/utils/r_loader.py +30 -0
  59. tanml-0.1.6/tanml/utils/sas_loader.py +50 -0
  60. tanml-0.1.6/tanml/utils/yaml_generator.py +34 -0
  61. tanml-0.1.6/tanml/utils/yaml_loader.py +5 -0
  62. tanml-0.1.6/tanml/validate.py +209 -0
  63. tanml-0.1.6/tanml.egg-info/PKG-INFO +317 -0
  64. tanml-0.1.6/tanml.egg-info/SOURCES.txt +66 -0
  65. tanml-0.1.6/tanml.egg-info/dependency_links.txt +1 -0
  66. tanml-0.1.6/tanml.egg-info/entry_points.txt +2 -0
  67. tanml-0.1.6/tanml.egg-info/requires.txt +21 -0
  68. tanml-0.1.6/tanml.egg-info/top_level.txt +1 -0
tanml-0.1.6/LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2025 Tanmay Sah and Dolly Sah
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
tanml-0.1.6/MANIFEST.in ADDED
@@ -0,0 +1,6 @@
+ include README.md
+ include LICENSE
+ include pyproject.toml
+
+ include tanml/report/templates/*.docx
+ include tanml/config_templates/*.yaml
tanml-0.1.6/PKG-INFO ADDED
@@ -0,0 +1,317 @@
+ Metadata-Version: 2.4
+ Name: tanml
+ Version: 0.1.6
+ Summary: Automated validation toolkit for tabular ML models in finance and regulated domains.
+ Author: Dolly Sah
+ Author-email: Tanmay Sah <tradertanmay@gmail.com>
+ License: MIT
+ Project-URL: Homepage, https://github.com/tdlabs-ai/tanml
+ Requires-Python: >=3.8
+ Description-Content-Type: text/markdown
+ License-File: LICENSE
+ Requires-Dist: scikit-learn
+ Requires-Dist: pandas
+ Requires-Dist: matplotlib
+ Requires-Dist: seaborn
+ Requires-Dist: shap
+ Requires-Dist: docxtpl
+ Requires-Dist: python-docx
+ Requires-Dist: docxcompose
+ Requires-Dist: PyYAML
+ Requires-Dist: scipy
+ Requires-Dist: statsmodels
+ Requires-Dist: joblib
+ Requires-Dist: tzlocal
+ Requires-Dist: tqdm
+ Requires-Dist: imgkit
+ Requires-Dist: xgboost
+ Provides-Extra: dev
+ Requires-Dist: pytest; extra == "dev"
+ Requires-Dist: black; extra == "dev"
+ Requires-Dist: isort; extra == "dev"
+ Dynamic: license-file
+
+ # TanML: Automated Model Validation Toolkit for Tabular Machine Learning
+ [![Cite this repo](https://img.shields.io/badge/Cite-this_repo-blue)](https://github.com/tdlabs-ai/tanml)
+
+ TanML is a modular, automated model validation toolkit for tabular machine learning workflows. It supports end-to-end validation with just a YAML configuration and a single command, performing checks across data quality, robustness, explainability, and model performance.
+
+ TanML generates structured Word (DOCX) reports suitable for internal model reviews, audit documentation, and stakeholder presentations. It is designed for general-purpose ML validation and works well in domains where interpretability, fairness, and reliability are critical.
+
+ While TanML currently operates as a command-line toolkit, its architecture is designed to evolve into an intelligent validation agent—capable of integrating with AutoML pipelines, CI/CD workflows, and human-in-the-loop validation systems.
+
+ ## Key Features
+
+ * One-command validation using CLI and YAML
+ * Supports models developed in Python (scikit-learn, XGBoost), R, or SAS
+ * Scenario-based flexibility:
+
+   * Scenario A: Single model and dataset
+   * Scenario B: One model per segment with separate datasets
+   * Scenario C: One model and dataset with an internal segmentation column
+ * Comprehensive validation checks:
+
+   * Model performance (e.g., accuracy, AUC, KS)
+   * Data quality diagnostics
+   * Stress testing for input robustness
+   * SHAP-based explainability
+   * Segment-wise validation
+   * Logistic regression coefficient summaries (if applicable)
+   * VIF, correlation, and multicollinearity checks
+   * EDA summaries
+ * Rule-based threshold validation using YAML configuration
+ * Professional report generation in Word (DOCX) format
+ * Easily extensible architecture to add custom checks or outputs
+
+ ## Installation
+
+ TanML can be installed directly from PyPI using pip:
+
+ ```bash
+ pip install tanml
+ ```
+
+ To upgrade to the latest version:
+
+ ```bash
+ pip install --upgrade tanml
+ ```
+
+ After installation, you can verify the CLI is working by running:
+
+ ```bash
+ tanml --help
+ ```
+
+ This will display the list of available commands and options.
+
+ TanML supports Python 3.8 and above. It is recommended to use a virtual environment for clean dependency management:
+
+ ```bash
+ python3 -m venv .venv
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
+ ```
+
+ ## Folder Structure
+
+ A typical TanML project is organized as follows:
+
+ ```
+ TanML/
+ ├── models/ # Trained model files (e.g., .pkl, .sas7bdat)
+ ├── data/
+ │   ├── raw/ # Raw data files (optional)
+ │   └── cleaned/ # Cleaned datasets for validation
+ ├── examples/
+ │   ├── scenario_a/ # Example for single model and dataset
+ │   ├── scenario_b/ # One model per segment
+ │   └── scenario_c/ # Single model with segmentation column
+ ├── reports/
+ │   ├── images/ # SHAP plots, cluster visualizations
+ │   ├── clusters/ # Cluster summary CSVs
+ │   └── *.docx # Final validation reports
+ ├── tanml/ # Core source code
+ │   ├── cli/ # CLI logic
+ │   ├── engine/ # Orchestration and segmentation logic
+ │   ├── checks/ # Validation check classes
+ │   ├── check_runners/ # Check runners linked to the engine
+ │   ├── report/ # Report generation using docxtpl
+ │   └── utils/ # Data/model loaders and helpers
+ ├── tests/ # Unit tests (WIP or planned)
+ ├── setup.py # Package installer for pip
+ ├── pyproject.toml # Modern Python build metadata (PEP 518)
+ ├── MANIFEST.in # Files to include when packaging
+ └── README.md # This documentation
+ ```
+
+ ## How to Use TanML
+
+ To use TanML effectively, follow these three main steps. This process is designed to be simple and intuitive, even for users without a programming background.
+
+ 1. **Initialize the Configuration File:**
+    Start by generating a YAML configuration file that tells TanML how to run the validation. You do this using a command-line instruction where you select one of the three supported validation scenarios:
+
+    * Scenario A: For validating one model using a single dataset
+    * Scenario B: For validating multiple models, each with its own segment-specific dataset
+    * Scenario C: For validating a single model that needs to be run separately on different segments within one dataset
+
+    You can also choose where the generated YAML file should be saved. If you don’t provide a location, it will be saved in your current directory with a default name.
+
+ 2. **Fill in the Configuration File:**
+    Once the YAML file is created, you’ll open it and fill in the necessary details:
+
+    * Where your model file is located (e.g., a `.pkl` file for Python, or equivalent for SAS or R)
+    * Where your cleaned dataset is saved
+    * (Optional) Where your raw dataset is saved, if you want data quality comparisons
+    * Which input features your model expects, and what the target column is
+    * Where you want the final report to be saved (Word `.docx` format)
+
+    This YAML file acts like a blueprint—TanML reads it and follows the instructions you provide to run all relevant validation checks.
+
+ 3. **Run the Validation Process:**
+    After the YAML file is completed, you will run the validation process. This will trigger TanML to:
+
+    * Load your model and data
+    * Perform all configured validation checks (like performance, SHAP explainability, stress testing, etc.)
+    * Automatically generate a professional report with summaries, tables, and visuals
+
+ You can use TanML either through simple command-line instructions or directly in Python by calling its functions. Both methods achieve the same results. The command-line approach is ideal for repeatable, scriptable workflows, while the Python interface is useful for advanced users who want to integrate TanML into larger systems or notebooks.
+
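+ For scripted workflows that stay in Python, one straightforward option is to drive the documented CLI from a small wrapper, as in the sketch below. It relies only on the `tanml validate --rules` command shown in this README; the functions in `tanml.validate` can also be imported directly, but their exact signatures are not documented here, so confirm them against the source before relying on them.
+
+ ```python
+ # Minimal sketch: run a TanML validation from Python by invoking the CLI.
+ # Assumes `tanml` is installed in the active environment and rules.yaml exists.
+ import subprocess
+
+ def run_tanml_validation(rules_path: str) -> None:
+     """Run `tanml validate` for the given rules file; raise if the run fails."""
+     subprocess.run(["tanml", "validate", "--rules", rules_path], check=True)
+
+ if __name__ == "__main__":
+     run_tanml_validation("rules.yaml")
+ ```
+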
+ TanML is controlled entirely through a YAML configuration file and a single CLI command. The configuration specifies the model, data, validation rules, and output paths.
+
+ ### Scenario A: Single Model, Single Dataset
+
+ This is the simplest usage mode. You have one model file and one cleaned dataset.
+
+ **rules.yaml**
+
+ ```yaml
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+
+ paths:
+   model: models/model.pkl
+   cleaned_data: data/cleaned/cleaned.csv
+   raw_data: data/raw/raw.csv
+
+ output:
+   report_path: reports/validation_report.docx
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ This generates a `.docx` report along with SHAP plots and other artifacts.
+
+ ### Scenario B: One Model per Segment
+
+ Use this if you have multiple segments, each with its own cleaned dataset and model.
+
+ **rules.yaml**
+
+ ```yaml
+ segment:
+   runs:
+     segment_A:
+       model: models/model_segment_A.pkl
+       cleaned: data/cleaned/segment_A.csv
+       raw: data/raw/segment_A.csv
+       output_report: reports/report_segment_A.docx
+
+     segment_B:
+       model: models/model_segment_B.pkl
+       cleaned: data/cleaned/segment_B.csv
+       raw: data/raw/segment_B.csv
+       output_report: reports/report_segment_B.docx
+
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ Each segment will be validated independently with its own output report.
+
+ ### Scenario C: One Model with Segmentation Column
+
+ Use this if you have one dataset with a segmentation column (e.g., region, product type).
+
+ **rules.yaml**
+
+ ```yaml
+ segment:
+   column: segment_id  # This column will be used to split the data automatically
+
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+
+ paths:
+   model: models/credit_model.pkl
+   cleaned_data: data/cleaned/combined.csv
+   raw_data: data/raw/combined.csv
+
+ output:
+   report_path: reports/report_{segment}.docx
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ This will automatically split the dataset by the `segment_id` column, apply the same model to each subset, and produce one report per segment.
+
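+ Conceptually, this per-segment split behaves like a pandas `groupby` on the segmentation column. The sketch below only illustrates the idea; it is not TanML's internal implementation.
+
+ ```python
+ # Illustration of what "split by segment_id" means; TanML performs this step
+ # internally when `segment.column` is set in rules.yaml.
+ import pandas as pd
+
+ df = pd.read_csv("data/cleaned/combined.csv")
+ for segment_id, segment_df in df.groupby("segment_id"):
+     # TanML runs the full check suite on each subset and writes one
+     # report per segment (e.g., reports/report_<segment_id>.docx).
+     print(f"segment {segment_id}: {len(segment_df)} rows")
+ ```
+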
+ ## Output Artifacts
+
+ After a successful validation run, TanML generates a set of output files based on your configuration:
+
+ - **Validation Report (.docx):**
+   A professionally formatted Word document containing:
+   - A summary of all validation checks
+   - Tables for performance metrics, data quality, logistic regression coefficients, VIF, and more
+   - SHAP-based feature importance visualizations (if enabled)
+   - Segment-wise validation summaries (for Scenario B and C)
+
+ - **SHAP Visualizations:**
+   Summary bar plots and other SHAP outputs are saved in the `reports/images/` folder.
+
+ - **Input Cluster Coverage Charts and CSVs:**
+   If cluster coverage analysis is enabled, visual and tabular summaries are stored in:
+   - `reports/images/` (cluster bar plots)
+   - `reports/clusters/` (CSV summaries)
+
+ - **Logs and Intermediate Results:**
+   Optional debug or intermediate outputs (e.g., cleaned data snapshots or rule validation results) can be generated depending on configuration or verbosity level.
+
+ By default, all outputs are saved to the paths you define in the YAML configuration. Each segment (in Scenario B or C) will generate its own report file if multiple runs are triggered.
+
+ ## Extending TanML
+
+ TanML is designed with modularity in mind. Advanced users and developers can easily extend its capabilities by adding new validation checks or modifying report components. Here's how extension works (a short sketch of a custom check follows the steps):
+
+ 1. **Add a Custom Check:** Create a new check class in the `tanml/checks/` directory and implement the validation logic based on the `BaseCheck` interface.
+
+ 2. **Create a Check Runner:** Add a corresponding runner in `tanml/check_runners/`. The runner controls how the check is executed and connects it to the engine.
+
+ 3. **Register the Check:** Link your runner in the central registry inside the validation engine. The YAML config will then trigger it automatically based on user-defined rules.
+
+ This modular structure ensures that domain-specific validations (e.g., industry regulations, fairness audits) can be added without modifying core logic.
+
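+ As a rough illustration, a custom check might look like the sketch below. The actual `BaseCheck` interface lives in `tanml/checks/base.py` and may use different method names and return conventions, so treat this as a starting point rather than a drop-in class.
+
+ ```python
+ # Hypothetical custom check; confirm the real BaseCheck API in tanml/checks/base.py.
+ from tanml.checks.base import BaseCheck
+
+ class ClassBalanceCheck(BaseCheck):
+     """Example check: flag datasets whose positive-class rate is below a threshold."""
+
+     def __init__(self, threshold: float = 0.05):
+         self.threshold = threshold
+
+     def run(self, X, y):
+         positive_rate = float((y == 1).mean())
+         return {
+             "name": "class_balance",
+             "positive_rate": positive_rate,
+             "passed": positive_rate >= self.threshold,
+         }
+ ```
+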
+ ## License and Citation
+
+ TanML is open-source under the MIT License.
+ Copyright © 2025 Tanmay Sah and Dolly Sah.
+
+ You are free to use, modify, and distribute this software with appropriate attribution.
+
+ If you use TanML in your work or publication, please cite it as:
+
+ > Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning*. GitHub. https://github.com/tdlabs-ai/tanml
+
+ 📄 A machine-readable citation file (`CITATION.cff`) is included for use with citation tools and GitHub's “Cite this repository” button.
tanml-0.1.6/README.md ADDED
@@ -0,0 +1,284 @@
+ # TanML: Automated Model Validation Toolkit for Tabular Machine Learning
+ [![Cite this repo](https://img.shields.io/badge/Cite-this_repo-blue)](https://github.com/tdlabs-ai/tanml)
+
+ TanML is a modular, automated model validation toolkit for tabular machine learning workflows. It supports end-to-end validation with just a YAML configuration and a single command, performing checks across data quality, robustness, explainability, and model performance.
+
+ TanML generates structured Word (DOCX) reports suitable for internal model reviews, audit documentation, and stakeholder presentations. It is designed for general-purpose ML validation and works well in domains where interpretability, fairness, and reliability are critical.
+
+ While TanML currently operates as a command-line toolkit, its architecture is designed to evolve into an intelligent validation agent—capable of integrating with AutoML pipelines, CI/CD workflows, and human-in-the-loop validation systems.
+
+ ## Key Features
+
+ * One-command validation using CLI and YAML
+ * Supports models developed in Python (scikit-learn, XGBoost), R, or SAS
+ * Scenario-based flexibility:
+
+   * Scenario A: Single model and dataset
+   * Scenario B: One model per segment with separate datasets
+   * Scenario C: One model and dataset with an internal segmentation column
+ * Comprehensive validation checks:
+
+   * Model performance (e.g., accuracy, AUC, KS)
+   * Data quality diagnostics
+   * Stress testing for input robustness
+   * SHAP-based explainability
+   * Segment-wise validation
+   * Logistic regression coefficient summaries (if applicable)
+   * VIF, correlation, and multicollinearity checks
+   * EDA summaries
+ * Rule-based threshold validation using YAML configuration
+ * Professional report generation in Word (DOCX) format
+ * Easily extensible architecture to add custom checks or outputs
+
+ ## Installation
+
+ TanML can be installed directly from PyPI using pip:
+
+ ```bash
+ pip install tanml
+ ```
+
+ To upgrade to the latest version:
+
+ ```bash
+ pip install --upgrade tanml
+ ```
+
+ After installation, you can verify the CLI is working by running:
+
+ ```bash
+ tanml --help
+ ```
+
+ This will display the list of available commands and options.
+
+ TanML supports Python 3.8 and above. It is recommended to use a virtual environment for clean dependency management:
+
+ ```bash
+ python3 -m venv .venv
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
+ ```
+
+ ## Folder Structure
+
+ A typical TanML project is organized as follows:
+
+ ```
+ TanML/
+ ├── models/ # Trained model files (e.g., .pkl, .sas7bdat)
+ ├── data/
+ │   ├── raw/ # Raw data files (optional)
+ │   └── cleaned/ # Cleaned datasets for validation
+ ├── examples/
+ │   ├── scenario_a/ # Example for single model and dataset
+ │   ├── scenario_b/ # One model per segment
+ │   └── scenario_c/ # Single model with segmentation column
+ ├── reports/
+ │   ├── images/ # SHAP plots, cluster visualizations
+ │   ├── clusters/ # Cluster summary CSVs
+ │   └── *.docx # Final validation reports
+ ├── tanml/ # Core source code
+ │   ├── cli/ # CLI logic
+ │   ├── engine/ # Orchestration and segmentation logic
+ │   ├── checks/ # Validation check classes
+ │   ├── check_runners/ # Check runners linked to the engine
+ │   ├── report/ # Report generation using docxtpl
+ │   └── utils/ # Data/model loaders and helpers
+ ├── tests/ # Unit tests (WIP or planned)
+ ├── setup.py # Package installer for pip
+ ├── pyproject.toml # Modern Python build metadata (PEP 518)
+ ├── MANIFEST.in # Files to include when packaging
+ └── README.md # This documentation
+ ```
+
+ ## How to Use TanML
+
+ To use TanML effectively, follow these three main steps. This process is designed to be simple and intuitive, even for users without a programming background.
+
+ 1. **Initialize the Configuration File:**
+    Start by generating a YAML configuration file that tells TanML how to run the validation. You do this using a command-line instruction where you select one of the three supported validation scenarios:
+
+    * Scenario A: For validating one model using a single dataset
+    * Scenario B: For validating multiple models, each with its own segment-specific dataset
+    * Scenario C: For validating a single model that needs to be run separately on different segments within one dataset
+
+    You can also choose where the generated YAML file should be saved. If you don’t provide a location, it will be saved in your current directory with a default name.
+
+ 2. **Fill in the Configuration File:**
+    Once the YAML file is created, you’ll open it and fill in the necessary details:
+
+    * Where your model file is located (e.g., a `.pkl` file for Python, or equivalent for SAS or R)
+    * Where your cleaned dataset is saved
+    * (Optional) Where your raw dataset is saved, if you want data quality comparisons
+    * Which input features your model expects, and what the target column is
+    * Where you want the final report to be saved (Word `.docx` format)
+
+    This YAML file acts like a blueprint—TanML reads it and follows the instructions you provide to run all relevant validation checks.
+
+ 3. **Run the Validation Process:**
+    After the YAML file is completed, you will run the validation process. This will trigger TanML to:
+
+    * Load your model and data
+    * Perform all configured validation checks (like performance, SHAP explainability, stress testing, etc.)
+    * Automatically generate a professional report with summaries, tables, and visuals
+
+ You can use TanML either through simple command-line instructions or directly in Python by calling its functions. Both methods achieve the same results. The command-line approach is ideal for repeatable, scriptable workflows, while the Python interface is useful for advanced users who want to integrate TanML into larger systems or notebooks.
+
+ TanML is controlled entirely through a YAML configuration file and a single CLI command. The configuration specifies the model, data, validation rules, and output paths.
+
+ ### Scenario A: Single Model, Single Dataset
+
+ This is the simplest usage mode. You have one model file and one cleaned dataset.
+
+ **rules.yaml**
+
+ ```yaml
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+
+ paths:
+   model: models/model.pkl
+   cleaned_data: data/cleaned/cleaned.csv
+   raw_data: data/raw/raw.csv
+
+ output:
+   report_path: reports/validation_report.docx
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ This generates a `.docx` report along with SHAP plots and other artifacts.
+
+ ### Scenario B: One Model per Segment
+
+ Use this if you have multiple segments, each with its own cleaned dataset and model.
+
+ **rules.yaml**
+
+ ```yaml
+ segment:
+   runs:
+     segment_A:
+       model: models/model_segment_A.pkl
+       cleaned: data/cleaned/segment_A.csv
+       raw: data/raw/segment_A.csv
+       output_report: reports/report_segment_A.docx
+
+     segment_B:
+       model: models/model_segment_B.pkl
+       cleaned: data/cleaned/segment_B.csv
+       raw: data/raw/segment_B.csv
+       output_report: reports/report_segment_B.docx
+
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ Each segment will be validated independently with its own output report.
+
+ ### Scenario C: One Model with Segmentation Column
+
+ Use this if you have one dataset with a segmentation column (e.g., region, product type).
+
+ **rules.yaml**
+
+ ```yaml
+ segment:
+   column: segment_id  # This column will be used to split the data automatically
+
+ model:
+   features:
+     - age
+     - income
+     - debt_to_income
+     - credit_score
+     - employment_length
+   target: default_flag
+
+ paths:
+   model: models/credit_model.pkl
+   cleaned_data: data/cleaned/combined.csv
+   raw_data: data/raw/combined.csv
+
+ output:
+   report_path: reports/report_{segment}.docx
+ ```
+
+ **Run the validation:**
+
+ ```bash
+ tanml validate --rules rules.yaml
+ ```
+
+ This will automatically split the dataset by the `segment_id` column, apply the same model to each subset, and produce one report per segment.
+
+ ## Output Artifacts
+
+ After a successful validation run, TanML generates a set of output files based on your configuration:
+
+ - **Validation Report (.docx):**
+   A professionally formatted Word document containing:
+   - A summary of all validation checks
+   - Tables for performance metrics, data quality, logistic regression coefficients, VIF, and more
+   - SHAP-based feature importance visualizations (if enabled)
+   - Segment-wise validation summaries (for Scenario B and C)
+
+ - **SHAP Visualizations:**
+   Summary bar plots and other SHAP outputs are saved in the `reports/images/` folder.
+
+ - **Input Cluster Coverage Charts and CSVs:**
+   If cluster coverage analysis is enabled, visual and tabular summaries are stored in:
+   - `reports/images/` (cluster bar plots)
+   - `reports/clusters/` (CSV summaries)
+
+ - **Logs and Intermediate Results:**
+   Optional debug or intermediate outputs (e.g., cleaned data snapshots or rule validation results) can be generated depending on configuration or verbosity level.
+
+ By default, all outputs are saved to the paths you define in the YAML configuration. Each segment (in Scenario B or C) will generate its own report file if multiple runs are triggered.
+
+ ## Extending TanML
+
+ TanML is designed with modularity in mind. Advanced users and developers can easily extend its capabilities by adding new validation checks or modifying report components. Here's how extension works:
+
+ 1. **Add a Custom Check:** Create a new check class in the `tanml/checks/` directory and implement the validation logic based on the `BaseCheck` interface.
+
+ 2. **Create a Check Runner:** Add a corresponding runner in `tanml/check_runners/`. The runner controls how the check is executed and connects it to the engine.
+
+ 3. **Register the Check:** Link your runner in the central registry inside the validation engine. The YAML config will then trigger it automatically based on user-defined rules.
+
+ This modular structure ensures that domain-specific validations (e.g., industry regulations, fairness audits) can be added without modifying core logic.
+
+ ## License and Citation
+
+ TanML is open-source under the MIT License.
+ Copyright © 2025 Tanmay Sah and Dolly Sah.
+
+ You are free to use, modify, and distribute this software with appropriate attribution.
+
+ If you use TanML in your work or publication, please cite it as:
+
+ > Sah, T., & Sah, D. (2025). *TanML: Automated Model Validation Toolkit for Tabular Machine Learning*. GitHub. https://github.com/tdlabs-ai/tanml
+
+ 📄 A machine-readable citation file (`CITATION.cff`) is included for use with citation tools and GitHub's “Cite this repository” button.
tanml-0.1.6/pyproject.toml ADDED
@@ -0,0 +1,54 @@
+ [build-system]
+ requires = ["setuptools>=61.0", "wheel"]
+ build-backend = "setuptools.build_meta"
+
+ [project]
+ name = "tanml"
+ version = "0.1.6"
+ description = "Automated validation toolkit for tabular ML models in finance and regulated domains."
+ readme = "README.md"
+ requires-python = ">=3.8"
+ license = {text = "MIT"}
+
+ authors = [
+     { name = "Tanmay Sah", email = "tradertanmay@gmail.com" },
+     { name = "Dolly Sah" }
+ ]
+
+ dependencies = [
+     "scikit-learn",
+     "pandas",
+     "matplotlib",
+     "seaborn",
+     "shap",
+     "docxtpl",
+     "python-docx",
+     "docxcompose",
+     "PyYAML",
+     "scipy",
+     "statsmodels",
+     "joblib",
+     "tzlocal",
+     "tqdm",
+     "imgkit",
+     "xgboost"
+ ]
+
+ [project.optional-dependencies]
+ dev = ["pytest", "black", "isort"]
+
+ [project.scripts]
+ tanml = "tanml.cli.main:main"
+
+ [project.urls]
+ "Homepage" = "https://github.com/tdlabs-ai/tanml"
+
+ [tool.setuptools]
+ include-package-data = true
+
+ [tool.setuptools.packages.find]
+ include = ["tanml*"]
+
+ [tool.setuptools.package-data]
+ "tanml.report.templates" = ["report_template.docx"]
+ "tanml.config_templates" = ["*.yaml"]
tanml-0.1.6/setup.cfg ADDED
@@ -0,0 +1,4 @@
+ [egg_info]
+ tag_build =
+ tag_date = 0
+
tanml-0.1.6/tanml/__init__.py ADDED
@@ -0,0 +1 @@
+ __version__ = "0.1.1"