model-eval-toolkit 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- model_eval_toolkit-0.1.0/LICENSE +21 -0
- model_eval_toolkit-0.1.0/PKG-INFO +339 -0
- model_eval_toolkit-0.1.0/README.md +281 -0
- model_eval_toolkit-0.1.0/evalreport/__init__.py +28 -0
- model_eval_toolkit-0.1.0/evalreport/__version__.py +2 -0
- model_eval_toolkit-0.1.0/evalreport/classification/__init__.py +4 -0
- model_eval_toolkit-0.1.0/evalreport/classification/report.py +319 -0
- model_eval_toolkit-0.1.0/evalreport/clustering/__init__.py +4 -0
- model_eval_toolkit-0.1.0/evalreport/clustering/report.py +174 -0
- model_eval_toolkit-0.1.0/evalreport/core/base_report.py +479 -0
- model_eval_toolkit-0.1.0/evalreport/core/entrypoints.py +97 -0
- model_eval_toolkit-0.1.0/evalreport/core/task_inference.py +180 -0
- model_eval_toolkit-0.1.0/evalreport/nlp/__init__.py +5 -0
- model_eval_toolkit-0.1.0/evalreport/nlp/text_classification.py +21 -0
- model_eval_toolkit-0.1.0/evalreport/nlp/text_generation.py +202 -0
- model_eval_toolkit-0.1.0/evalreport/ranking/__init__.py +3 -0
- model_eval_toolkit-0.1.0/evalreport/ranking/report.py +274 -0
- model_eval_toolkit-0.1.0/evalreport/regression/__init__.py +4 -0
- model_eval_toolkit-0.1.0/evalreport/regression/report.py +173 -0
- model_eval_toolkit-0.1.0/evalreport/timeseries/__init__.py +4 -0
- model_eval_toolkit-0.1.0/evalreport/timeseries/report.py +211 -0
- model_eval_toolkit-0.1.0/evalreport/vision/__init__.py +6 -0
- model_eval_toolkit-0.1.0/evalreport/vision/detection.py +359 -0
- model_eval_toolkit-0.1.0/evalreport/vision/image_classification.py +25 -0
- model_eval_toolkit-0.1.0/evalreport/vision/segmentation.py +140 -0
- model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/PKG-INFO +339 -0
- model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/SOURCES.txt +45 -0
- model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/dependency_links.txt +1 -0
- model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/requires.txt +16 -0
- model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/top_level.txt +1 -0
- model_eval_toolkit-0.1.0/pyproject.toml +54 -0
- model_eval_toolkit-0.1.0/setup.cfg +4 -0
- model_eval_toolkit-0.1.0/tests/test_base_report.py +24 -0
- model_eval_toolkit-0.1.0/tests/test_classification_comprehensive.py +117 -0
- model_eval_toolkit-0.1.0/tests/test_classification_report.py +32 -0
- model_eval_toolkit-0.1.0/tests/test_clustering_report.py +50 -0
- model_eval_toolkit-0.1.0/tests/test_detection_map.py +55 -0
- model_eval_toolkit-0.1.0/tests/test_generate_report.py +113 -0
- model_eval_toolkit-0.1.0/tests/test_image_classification_report.py +61 -0
- model_eval_toolkit-0.1.0/tests/test_nlp_reports.py +38 -0
- model_eval_toolkit-0.1.0/tests/test_ranking_report.py +37 -0
- model_eval_toolkit-0.1.0/tests/test_regression_comprehensive.py +72 -0
- model_eval_toolkit-0.1.0/tests/test_regression_report.py +28 -0
- model_eval_toolkit-0.1.0/tests/test_report_outputs.py +46 -0
- model_eval_toolkit-0.1.0/tests/test_task_inference.py +83 -0
- model_eval_toolkit-0.1.0/tests/test_timeseries_report.py +44 -0
- model_eval_toolkit-0.1.0/tests/test_vision_reports.py +63 -0
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Raahul Krishna Durairaju
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
|
@@ -0,0 +1,339 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: model-eval-toolkit
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Unified ML evaluation framework for classification, regression, clustering, time series, NLP, CV, and recommendation systems.
|
|
5
|
+
Author-email: Raahul Krishna Durairaju <rahulkrish28@gmail.com>
|
|
6
|
+
License: MIT License
|
|
7
|
+
|
|
8
|
+
Copyright (c) 2026 Raahul Krishna Durairaju
|
|
9
|
+
|
|
10
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
11
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
12
|
+
in the Software without restriction, including without limitation the rights
|
|
13
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
14
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
15
|
+
furnished to do so, subject to the following conditions:
|
|
16
|
+
|
|
17
|
+
The above copyright notice and this permission notice shall be included in all
|
|
18
|
+
copies or substantial portions of the Software.
|
|
19
|
+
|
|
20
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
21
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
22
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
23
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
24
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
25
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
26
|
+
SOFTWARE.
|
|
27
|
+
|
|
28
|
+
Project-URL: Homepage, https://github.com/RAAHUL-tech/model-eval-toolkit
|
|
29
|
+
Project-URL: Documentation, https://github.com/RAAHUL-tech/model-eval-toolkit/tree/main/docs
|
|
30
|
+
Project-URL: Repository, https://github.com/RAAHUL-tech/model-eval-toolkit
|
|
31
|
+
Project-URL: Issues, https://github.com/RAAHUL-tech/model-eval-toolkit/issues
|
|
32
|
+
Keywords: machine-learning,evaluation,metrics,nlp,computer-vision,time-series,ranking
|
|
33
|
+
Classifier: Programming Language :: Python :: 3
|
|
34
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
35
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
36
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
37
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
38
|
+
Classifier: Operating System :: OS Independent
|
|
39
|
+
Classifier: Intended Audience :: Science/Research
|
|
40
|
+
Classifier: Intended Audience :: Developers
|
|
41
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
|
42
|
+
Requires-Python: >=3.9
|
|
43
|
+
Description-Content-Type: text/markdown
|
|
44
|
+
License-File: LICENSE
|
|
45
|
+
Requires-Dist: numpy>=1.23
|
|
46
|
+
Requires-Dist: pandas>=1.5
|
|
47
|
+
Requires-Dist: scikit-learn>=1.2
|
|
48
|
+
Requires-Dist: matplotlib>=3.7
|
|
49
|
+
Requires-Dist: seaborn>=0.12
|
|
50
|
+
Provides-Extra: test
|
|
51
|
+
Requires-Dist: pytest>=7; extra == "test"
|
|
52
|
+
Requires-Dist: coverage[toml]>=7; extra == "test"
|
|
53
|
+
Provides-Extra: pdf
|
|
54
|
+
Requires-Dist: reportlab>=4; extra == "pdf"
|
|
55
|
+
Provides-Extra: nlp
|
|
56
|
+
Provides-Extra: vision
|
|
57
|
+
Dynamic: license-file
|
|
58
|
+
|
|
59
|
+
<p align="center">
|
|
60
|
+
<img src="https://raw.githubusercontent.com/RAAHUL-tech/model-eval-toolkit/main/docs/images/evalreport-logo.png" alt="Model Eval Toolkit" width="260">
|
|
61
|
+
</p>
|
|
62
|
+
|
|
63
|
+
# Model Eval Toolkit
|
|
64
|
+
|
|
65
|
+
**Unified ML evaluation reports** for Python: metrics, plots, auto-insights, and export to **HTML**, **JSON**, **Markdown**, or **PDF**.
|
|
66
|
+
|
|
67
|
+
Model Eval Toolkit provides a single, task-aware evaluation layer to benchmark model quality consistently across ML domains.
|
|
68
|
+
|
|
69
|
+
Import from the **`evalreport`** package:
|
|
70
|
+
|
|
71
|
+
```python
|
|
72
|
+
from evalreport import (
|
|
73
|
+
generate_report,
|
|
74
|
+
ClassificationReport,
|
|
75
|
+
RegressionReport,
|
|
76
|
+
ClusteringReport,
|
|
77
|
+
TimeSeriesReport,
|
|
78
|
+
TextClassificationReport,
|
|
79
|
+
TextGenerationReport,
|
|
80
|
+
SegmentationReport,
|
|
81
|
+
DetectionReport,
|
|
82
|
+
RankingReport,
|
|
83
|
+
__version__,
|
|
84
|
+
)
|
|
85
|
+
```
|
|
86
|
+
|
|
87
|
+
> **Current supported tasks (v0.1):**
|
|
88
|
+
> classification (binary & multiclass), regression, clustering, time series/forecasting,
|
|
89
|
+
> NLP (text classification + text generation), CV (segmentation + detection), and **recommendation / ranking**.
|
|
90
|
+
> The roadmap includes multilabel and richer recsys (e.g. session-based, implicit feedback models).
|
|
91
|
+
|
|
92
|
+
---
|
|
93
|
+
|
|
94
|
+
## Install
|
|
95
|
+
|
|
96
|
+
```bash
|
|
97
|
+
pip install model-eval-toolkit
|
|
98
|
+
```
|
|
99
|
+
|
|
100
|
+
**PDF export** needs ReportLab:
|
|
101
|
+
|
|
102
|
+
```bash
|
|
103
|
+
pip install "model-eval-toolkit[pdf]"
|
|
104
|
+
# or
|
|
105
|
+
pip install reportlab
|
|
106
|
+
```
|
|
107
|
+
|
|
108
|
+
**Requirements:** Python ≥ 3.9, NumPy, pandas, scikit-learn, Matplotlib, Seaborn.
|
|
109
|
+
|
|
110
|
+
Optional task extras (currently dependency-light for NLP/CV):
|
|
111
|
+
|
|
112
|
+
```bash
|
|
113
|
+
pip install "model-eval-toolkit[nlp]"
|
|
114
|
+
pip install "model-eval-toolkit[vision]"
|
|
115
|
+
```
|
|
116
|
+
|
|
117
|
+
---
|
|
118
|
+
|
|
119
|
+
## Quick start
|
|
120
|
+
|
|
121
|
+
### `generate_report` (recommended)
|
|
122
|
+
|
|
123
|
+
```python
|
|
124
|
+
from evalreport import generate_report
|
|
125
|
+
|
|
126
|
+
summary = generate_report(
|
|
127
|
+
task="classification", # or "regression", or "auto"
|
|
128
|
+
y_true=[0, 1, 0, 1, 1],
|
|
129
|
+
y_pred=[0, 1, 1, 1, 1],
|
|
130
|
+
y_prob=[0.1, 0.9, 0.8, 0.7, 0.6], # optional; enables log loss, ROC/PR (binary)
|
|
131
|
+
output_path="my_reports/model_report.html",
|
|
132
|
+
format="html",
|
|
133
|
+
)
|
|
134
|
+
|
|
135
|
+
print(summary["metrics"]["accuracy"])
|
|
136
|
+
```
|
|
137
|
+
|
|
138
|
+
NLP + CV examples:
|
|
139
|
+
|
|
140
|
+
```python
|
|
141
|
+
from evalreport import generate_report
|
|
142
|
+
|
|
143
|
+
# Text generation
|
|
144
|
+
generate_report(
|
|
145
|
+
task="text_generation",
|
|
146
|
+
y_true=["the cat sat on the mat"],
|
|
147
|
+
y_pred=["the cat sat on mat"],
|
|
148
|
+
output_path="reports/text_generation.html",
|
|
149
|
+
)
|
|
150
|
+
|
|
151
|
+
# Image segmentation (binary masks)
|
|
152
|
+
generate_report(
|
|
153
|
+
task="segmentation",
|
|
154
|
+
y_true=[[[0, 0], [1, 1]]],
|
|
155
|
+
y_pred=[[[0, 1], [1, 1]]],
|
|
156
|
+
output_path="reports/segmentation.html",
|
|
157
|
+
)
|
|
158
|
+
|
|
159
|
+
# Object detection (per-image list of box dicts)
|
|
160
|
+
generate_report(
|
|
161
|
+
task="detection",
|
|
162
|
+
y_true=[[{"bbox": [0, 0, 10, 10], "label": "obj"}]],
|
|
163
|
+
y_pred=[[{"bbox": [1, 1, 9, 9], "label": "obj", "score": 0.9}]],
|
|
164
|
+
output_path="reports/detection.html",
|
|
165
|
+
)
|
|
166
|
+
|
|
167
|
+
# Recommendation / ranking (one list per user)
|
|
168
|
+
generate_report(
|
|
169
|
+
task="recommendation", # or "ranking", "recommender"
|
|
170
|
+
y_true=[[10, 20], [30]], # relevant item IDs per user
|
|
171
|
+
y_pred=[[10, 99, 20, 5], [7, 30]], # ranked recommendations per user (best first)
|
|
172
|
+
k_values=(1, 5, 10), # optional cutoffs for P@K, R@K, NDCG@K, Hit@K
|
|
173
|
+
output_path="reports/recommendation.html",
|
|
174
|
+
)
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
- **`task="auto"`** — float targets → regression; integer/string labels → classification.
|
|
178
|
+
- If you **omit `output_path`**, the report is written under **`reports/`** (created if needed), e.g. `reports/classification_report.html` or `reports/regression_report.json` when `format="json"`.
|
|
179
|
+
- **Plots** are saved under **`<report_directory>/evalreport_plots/`** (same folder as your HTML/JSON/PDF file’s parent). So custom `output_path="my_reports/x.html"` → plots in `my_reports/evalreport_plots/`.
|
|
180
|
+
|
|
181
|
+
### Task-specific API
|
|
182
|
+
|
|
183
|
+
Useful when you want full control (e.g. set `output_dir` before `run_all()` so plots land next to a chosen folder):
|
|
184
|
+
|
|
185
|
+
```python
|
|
186
|
+
from pathlib import Path
|
|
187
|
+
from evalreport import ClassificationReport, RegressionReport, RankingReport
|
|
188
|
+
|
|
189
|
+
# Classification (binary or multiclass)
|
|
190
|
+
cls = ClassificationReport(
|
|
191
|
+
y_true=[0, 1, 2, 0],
|
|
192
|
+
y_pred=[0, 2, 2, 0],
|
|
193
|
+
# y_prob: (n_samples, n_classes) for multiclass log loss / AUC
|
|
194
|
+
labels=[0, 1, 2], # optional fixed class order for confusion matrix
|
|
195
|
+
)
|
|
196
|
+
cls.output_dir = Path("reports") # optional; default for plots if set before run_all()
|
|
197
|
+
cls.run_all()
|
|
198
|
+
cls.save("reports/classification_report.html", format="html")
|
|
199
|
+
cls.save("reports/classification_report.json", format="json")
|
|
200
|
+
|
|
201
|
+
# Regression
|
|
202
|
+
reg = RegressionReport(y_true=[1.0, 2.0, 3.0], y_pred=[1.1, 1.9, 3.2])
|
|
203
|
+
reg.output_dir = Path("reports")
|
|
204
|
+
reg.run_all()
|
|
205
|
+
reg.save("reports/regression_report.pdf", format="pdf") # needs reportlab
|
|
206
|
+
|
|
207
|
+
# Recommendation / ranking
|
|
208
|
+
rank = RankingReport(
|
|
209
|
+
relevant=[[1, 2], [3]],
|
|
210
|
+
ranked=[[1, 4, 5], [3, 1, 2]],
|
|
211
|
+
k_values=(1, 5, 10),
|
|
212
|
+
)
|
|
213
|
+
rank.output_dir = Path("reports")
|
|
214
|
+
rank.run_all()
|
|
215
|
+
rank.save("reports/ranking_report.html", format="html")
|
|
216
|
+
```
|
|
217
|
+
|
|
218
|
+
---
|
|
219
|
+
|
|
220
|
+
## What each task includes
|
|
221
|
+
|
|
222
|
+
### Classification
|
|
223
|
+
|
|
224
|
+
| Area | Details |
|
|
225
|
+
|------|--------|
|
|
226
|
+
| **Metrics** | Accuracy; precision / recall / F1 (micro, macro, weighted); MCC; Cohen’s κ; log loss (with probs); ROC-AUC / PR-AUC when applicable; confusion matrix (table). |
|
|
227
|
+
| **Plots** | Confusion matrix heatmap; **binary** ROC & PR curves when `y_prob` is provided. |
|
|
228
|
+
| **Insights** | Class imbalance hint; most common misclassification pair. |
|
|
229
|
+
| **HTML** | Styled layout: each metric with a short explanation, insights, and embedded plot images. |
|
|
230
|
+
|
|
231
|
+
**Probabilities**
|
|
232
|
+
|
|
233
|
+
- Binary: `y_prob` as length-`n` scores for the positive class, or shape `(n, 2)`.
|
|
234
|
+
- Multiclass: `(n_samples, n_classes)` for log loss / multiclass AUC where supported.
|
|
235
|
+
|
|
236
|
+
### Regression
|
|
237
|
+
|
|
238
|
+
| Area | Details |
|
|
239
|
+
|------|--------|
|
|
240
|
+
| **Metrics** | MAE, MSE, RMSE, R², median absolute error, MAPE (where defined), mean error (bias). |
|
|
241
|
+
| **Plots** | Residuals vs predicted, predicted vs actual, residual histogram. |
|
|
242
|
+
| **Insights** | Over/under-prediction bias; heavy-tail error hint. |
|
|
243
|
+
| **HTML** | Same rich layout as classification. |
|
|
244
|
+
|
|
245
|
+
### Clustering
|
|
246
|
+
|
|
247
|
+
| Area | Details |
|
|
248
|
+
|------|--------|
|
|
249
|
+
| **Inputs** | `X` (feature matrix) and `labels` (cluster assignments) |
|
|
250
|
+
| **Metrics** | Silhouette score, Davies–Bouldin index, Calinski–Harabasz score, cluster sizes |
|
|
251
|
+
| **Plots** | Cluster scatter (PCA) and cluster size distribution |
|
|
252
|
+
| **Insights** | Separability + imbalance hints |
|
|
253
|
+
| **HTML** | Styled metrics/insights plus embedded plot images |
|
|
254
|
+
|
|
255
|
+
### Time Series / Forecasting
|
|
256
|
+
|
|
257
|
+
| Area | Details |
|
|
258
|
+
|------|--------|
|
|
259
|
+
| **Inputs** | `y_true`, `y_pred`, and `timestamps` (same length) |
|
|
260
|
+
| **Metrics** | MAE, MSE, RMSE, MAPE, SMAPE, mean forecast error, rolling RMSE summary |
|
|
261
|
+
| **Plots** | Actual vs forecast, residuals over time, rolling RMSE over time |
|
|
262
|
+
| **Insights** | Systematic bias and drift/stability hints via rolling RMSE |
|
|
263
|
+
| **HTML** | Styled metrics/insights plus embedded plot images |
|
|
264
|
+
|
|
265
|
+
### Recommendation / Ranking
|
|
266
|
+
|
|
267
|
+
| Area | Details |
|
|
268
|
+
|------|--------|
|
|
269
|
+
| **Inputs** | `relevant`: ground-truth relevant **item IDs** per user (or query). `ranked`: **ordered** recommended lists per user (same length as `relevant`). |
|
|
270
|
+
| **Metrics** | **MAP** (binary relevance), **Precision@K**, **Recall@K**, **NDCG@K**, **Hit Rate@K** for each K in `k_values` (default `(1, 5, 10)`). |
|
|
271
|
+
| **Plots** | Precision@K curve; mean **cumulative gain** vs rank cutoff. |
|
|
272
|
+
| **Insights** | Drop in precision at larger K; long-tail spread in \#relevant per user; low-MAP hint. |
|
|
273
|
+
| **`generate_report`** | `task="recommendation"` / `"ranking"` / `"recommender"` with `y_true=relevant`, `y_pred=ranked`. |
|
|
274
|
+
|
|
275
|
+
---
|
|
276
|
+
|
|
277
|
+
## Output formats
|
|
278
|
+
|
|
279
|
+
| Format | How | Notes |
|
|
280
|
+
|--------|-----|--------|
|
|
281
|
+
| **HTML** | `format="html"` or `.html` | Metrics + descriptions + insights + plot images. |
|
|
282
|
+
| **JSON** | `format="json"` or `.json` | `metrics`, `insights`, `plots` (paths to PNGs). |
|
|
283
|
+
| **Markdown** | `format="markdown"` or `.md` | Metrics and insights (no embedded images). |
|
|
284
|
+
| **PDF** | `format="pdf"` or `.pdf` | Text summary (metrics + descriptions + insights); install `reportlab`. |
|
|
285
|
+
|
|
286
|
+
---
|
|
287
|
+
|
|
288
|
+
## Development
|
|
289
|
+
|
|
290
|
+
```bash
|
|
291
|
+
git clone https://github.com/RAAHUL-tech/model-eval-toolkit.git
|
|
292
|
+
cd model-eval-toolkit
|
|
293
|
+
python -m venv .venv
|
|
294
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
295
|
+
pip install -e ".[test]"
|
|
296
|
+
pytest -q
|
|
297
|
+
# optional coverage
|
|
298
|
+
pytest --cov=evalreport --cov-report=term-missing
|
|
299
|
+
```
|
|
300
|
+
|
|
301
|
+
Build and check the package:
|
|
302
|
+
|
|
303
|
+
```bash
|
|
304
|
+
pip install build twine
|
|
305
|
+
python -m build
|
|
306
|
+
twine check dist/*
|
|
307
|
+
```
|
|
308
|
+
|
|
309
|
+
### CI and PyPI releases
|
|
310
|
+
|
|
311
|
+
GitHub Actions (`.github/workflows/ci.yml`):
|
|
312
|
+
|
|
313
|
+
- **Pull requests** → runs **tests** only (Python 3.9–3.11).
|
|
314
|
+
- **Push to `main`** (including when a PR is merged) → runs **tests**, then **publishes** to [PyPI](https://pypi.org/project/model-eval-toolkit/) if tests pass.
|
|
315
|
+
|
|
316
|
+
**One-time setup**
|
|
317
|
+
|
|
318
|
+
1. On [pypi.org](https://pypi.org/manage/account/token/), create an **API token** scoped to this project (or your whole account for a first publish).
|
|
319
|
+
2. In the GitHub repo: **Settings → Secrets and variables → Actions → New repository secret**
|
|
320
|
+
- Name: `PYPI_API_TOKEN`
|
|
321
|
+
- Value: the token (often starts with `pypi-`).
|
|
322
|
+
|
|
323
|
+
**Before each release**
|
|
324
|
+
|
|
325
|
+
- Bump `version` in `pyproject.toml`. PyPI rejects re-uploading the same version.
|
|
326
|
+
|
|
327
|
+
Optional: use [Trusted Publishing](https://docs.pypi.org/trusted-publishers/) (OIDC) and drop the token; the workflow already requests `id-token: write` for that path.
|
|
328
|
+
|
|
329
|
+
---
|
|
330
|
+
|
|
331
|
+
## Roadmap
|
|
332
|
+
|
|
333
|
+
Additional task types (clustering, time series, ranking, NLP, CV) and a plugin-style API are planned. Issues and PRs welcome on [GitHub](https://github.com/RAAHUL-tech/model-eval-toolkit).
|
|
334
|
+
|
|
335
|
+
---
|
|
336
|
+
|
|
337
|
+
## License
|
|
338
|
+
|
|
339
|
+
See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,281 @@
|
|
|
1
|
+
<p align="center">
|
|
2
|
+
<img src="https://raw.githubusercontent.com/RAAHUL-tech/model-eval-toolkit/main/docs/images/evalreport-logo.png" alt="Model Eval Toolkit" width="260">
|
|
3
|
+
</p>
|
|
4
|
+
|
|
5
|
+
# Model Eval Toolkit
|
|
6
|
+
|
|
7
|
+
**Unified ML evaluation reports** for Python: metrics, plots, auto-insights, and export to **HTML**, **JSON**, **Markdown**, or **PDF**.
|
|
8
|
+
|
|
9
|
+
Model Eval Toolkit provides a single, task-aware evaluation layer to benchmark model quality consistently across ML domains.
|
|
10
|
+
|
|
11
|
+
Import from the **`evalreport`** package:
|
|
12
|
+
|
|
13
|
+
```python
|
|
14
|
+
from evalreport import (
|
|
15
|
+
generate_report,
|
|
16
|
+
ClassificationReport,
|
|
17
|
+
RegressionReport,
|
|
18
|
+
ClusteringReport,
|
|
19
|
+
TimeSeriesReport,
|
|
20
|
+
TextClassificationReport,
|
|
21
|
+
TextGenerationReport,
|
|
22
|
+
SegmentationReport,
|
|
23
|
+
DetectionReport,
|
|
24
|
+
RankingReport,
|
|
25
|
+
__version__,
|
|
26
|
+
)
|
|
27
|
+
```
|
|
28
|
+
|
|
29
|
+
> **Current supported tasks (v0.1):**
|
|
30
|
+
> classification (binary & multiclass), regression, clustering, time series/forecasting,
|
|
31
|
+
> NLP (text classification + text generation), CV (segmentation + detection), and **recommendation / ranking**.
|
|
32
|
+
> The roadmap includes multilabel and richer recsys (e.g. session-based, implicit feedback models).
|
|
33
|
+
|
|
34
|
+
---
|
|
35
|
+
|
|
36
|
+
## Install
|
|
37
|
+
|
|
38
|
+
```bash
|
|
39
|
+
pip install model-eval-toolkit
|
|
40
|
+
```
|
|
41
|
+
|
|
42
|
+
**PDF export** needs ReportLab:
|
|
43
|
+
|
|
44
|
+
```bash
|
|
45
|
+
pip install "model-eval-toolkit[pdf]"
|
|
46
|
+
# or
|
|
47
|
+
pip install reportlab
|
|
48
|
+
```
|
|
49
|
+
|
|
50
|
+
**Requirements:** Python ≥ 3.9, NumPy, pandas, scikit-learn, Matplotlib, Seaborn.
|
|
51
|
+
|
|
52
|
+
Optional task extras (currently dependency-light for NLP/CV):
|
|
53
|
+
|
|
54
|
+
```bash
|
|
55
|
+
pip install "model-eval-toolkit[nlp]"
|
|
56
|
+
pip install "model-eval-toolkit[vision]"
|
|
57
|
+
```
|
|
58
|
+
|
|
59
|
+
---
|
|
60
|
+
|
|
61
|
+
## Quick start
|
|
62
|
+
|
|
63
|
+
### `generate_report` (recommended)
|
|
64
|
+
|
|
65
|
+
```python
|
|
66
|
+
from evalreport import generate_report
|
|
67
|
+
|
|
68
|
+
summary = generate_report(
|
|
69
|
+
task="classification", # or "regression", or "auto"
|
|
70
|
+
y_true=[0, 1, 0, 1, 1],
|
|
71
|
+
y_pred=[0, 1, 1, 1, 1],
|
|
72
|
+
y_prob=[0.1, 0.9, 0.8, 0.7, 0.6], # optional; enables log loss, ROC/PR (binary)
|
|
73
|
+
output_path="my_reports/model_report.html",
|
|
74
|
+
format="html",
|
|
75
|
+
)
|
|
76
|
+
|
|
77
|
+
print(summary["metrics"]["accuracy"])
|
|
78
|
+
```
|
|
79
|
+
|
|
80
|
+
NLP + CV examples:
|
|
81
|
+
|
|
82
|
+
```python
|
|
83
|
+
from evalreport import generate_report
|
|
84
|
+
|
|
85
|
+
# Text generation
|
|
86
|
+
generate_report(
|
|
87
|
+
task="text_generation",
|
|
88
|
+
y_true=["the cat sat on the mat"],
|
|
89
|
+
y_pred=["the cat sat on mat"],
|
|
90
|
+
output_path="reports/text_generation.html",
|
|
91
|
+
)
|
|
92
|
+
|
|
93
|
+
# Image segmentation (binary masks)
|
|
94
|
+
generate_report(
|
|
95
|
+
task="segmentation",
|
|
96
|
+
y_true=[[[0, 0], [1, 1]]],
|
|
97
|
+
y_pred=[[[0, 1], [1, 1]]],
|
|
98
|
+
output_path="reports/segmentation.html",
|
|
99
|
+
)
|
|
100
|
+
|
|
101
|
+
# Object detection (per-image list of box dicts)
|
|
102
|
+
generate_report(
|
|
103
|
+
task="detection",
|
|
104
|
+
y_true=[[{"bbox": [0, 0, 10, 10], "label": "obj"}]],
|
|
105
|
+
y_pred=[[{"bbox": [1, 1, 9, 9], "label": "obj", "score": 0.9}]],
|
|
106
|
+
output_path="reports/detection.html",
|
|
107
|
+
)
|
|
108
|
+
|
|
109
|
+
# Recommendation / ranking (one list per user)
|
|
110
|
+
generate_report(
|
|
111
|
+
task="recommendation", # or "ranking", "recommender"
|
|
112
|
+
y_true=[[10, 20], [30]], # relevant item IDs per user
|
|
113
|
+
y_pred=[[10, 99, 20, 5], [7, 30]], # ranked recommendations per user (best first)
|
|
114
|
+
k_values=(1, 5, 10), # optional cutoffs for P@K, R@K, NDCG@K, Hit@K
|
|
115
|
+
output_path="reports/recommendation.html",
|
|
116
|
+
)
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
- **`task="auto"`** — float targets → regression; integer/string labels → classification.
|
|
120
|
+
- If you **omit `output_path`**, the report is written under **`reports/`** (created if needed), e.g. `reports/classification_report.html` or `reports/regression_report.json` when `format="json"`.
|
|
121
|
+
- **Plots** are saved under **`<report_directory>/evalreport_plots/`** (same folder as your HTML/JSON/PDF file’s parent). So custom `output_path="my_reports/x.html"` → plots in `my_reports/evalreport_plots/`.
|
|
122
|
+
|
|
123
|
+
### Task-specific API
|
|
124
|
+
|
|
125
|
+
Useful when you want full control (e.g. set `output_dir` before `run_all()` so plots land next to a chosen folder):
|
|
126
|
+
|
|
127
|
+
```python
|
|
128
|
+
from pathlib import Path
|
|
129
|
+
from evalreport import ClassificationReport, RegressionReport, RankingReport
|
|
130
|
+
|
|
131
|
+
# Classification (binary or multiclass)
|
|
132
|
+
cls = ClassificationReport(
|
|
133
|
+
y_true=[0, 1, 2, 0],
|
|
134
|
+
y_pred=[0, 2, 2, 0],
|
|
135
|
+
# y_prob: (n_samples, n_classes) for multiclass log loss / AUC
|
|
136
|
+
labels=[0, 1, 2], # optional fixed class order for confusion matrix
|
|
137
|
+
)
|
|
138
|
+
cls.output_dir = Path("reports") # optional; default for plots if set before run_all()
|
|
139
|
+
cls.run_all()
|
|
140
|
+
cls.save("reports/classification_report.html", format="html")
|
|
141
|
+
cls.save("reports/classification_report.json", format="json")
|
|
142
|
+
|
|
143
|
+
# Regression
|
|
144
|
+
reg = RegressionReport(y_true=[1.0, 2.0, 3.0], y_pred=[1.1, 1.9, 3.2])
|
|
145
|
+
reg.output_dir = Path("reports")
|
|
146
|
+
reg.run_all()
|
|
147
|
+
reg.save("reports/regression_report.pdf", format="pdf") # needs reportlab
|
|
148
|
+
|
|
149
|
+
# Recommendation / ranking
|
|
150
|
+
rank = RankingReport(
|
|
151
|
+
relevant=[[1, 2], [3]],
|
|
152
|
+
ranked=[[1, 4, 5], [3, 1, 2]],
|
|
153
|
+
k_values=(1, 5, 10),
|
|
154
|
+
)
|
|
155
|
+
rank.output_dir = Path("reports")
|
|
156
|
+
rank.run_all()
|
|
157
|
+
rank.save("reports/ranking_report.html", format="html")
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
---
|
|
161
|
+
|
|
162
|
+
## What each task includes
|
|
163
|
+
|
|
164
|
+
### Classification
|
|
165
|
+
|
|
166
|
+
| Area | Details |
|
|
167
|
+
|------|--------|
|
|
168
|
+
| **Metrics** | Accuracy; precision / recall / F1 (micro, macro, weighted); MCC; Cohen’s κ; log loss (with probs); ROC-AUC / PR-AUC when applicable; confusion matrix (table). |
|
|
169
|
+
| **Plots** | Confusion matrix heatmap; **binary** ROC & PR curves when `y_prob` is provided. |
|
|
170
|
+
| **Insights** | Class imbalance hint; most common misclassification pair. |
|
|
171
|
+
| **HTML** | Styled layout: each metric with a short explanation, insights, and embedded plot images. |
|
|
172
|
+
|
|
173
|
+
**Probabilities**
|
|
174
|
+
|
|
175
|
+
- Binary: `y_prob` as length-`n` scores for the positive class, or shape `(n, 2)`.
|
|
176
|
+
- Multiclass: `(n_samples, n_classes)` for log loss / multiclass AUC where supported.
|
|
177
|
+
|
|
178
|
+
### Regression
|
|
179
|
+
|
|
180
|
+
| Area | Details |
|
|
181
|
+
|------|--------|
|
|
182
|
+
| **Metrics** | MAE, MSE, RMSE, R², median absolute error, MAPE (where defined), mean error (bias). |
|
|
183
|
+
| **Plots** | Residuals vs predicted, predicted vs actual, residual histogram. |
|
|
184
|
+
| **Insights** | Over/under-prediction bias; heavy-tail error hint. |
|
|
185
|
+
| **HTML** | Same rich layout as classification. |
|
|
186
|
+
|
|
187
|
+
### Clustering
|
|
188
|
+
|
|
189
|
+
| Area | Details |
|
|
190
|
+
|------|--------|
|
|
191
|
+
| **Inputs** | `X` (feature matrix) and `labels` (cluster assignments) |
|
|
192
|
+
| **Metrics** | Silhouette score, Davies–Bouldin index, Calinski–Harabasz score, cluster sizes |
|
|
193
|
+
| **Plots** | Cluster scatter (PCA) and cluster size distribution |
|
|
194
|
+
| **Insights** | Separability + imbalance hints |
|
|
195
|
+
| **HTML** | Styled metrics/insights plus embedded plot images |
|
|
196
|
+
|
|
197
|
+
### Time Series / Forecasting
|
|
198
|
+
|
|
199
|
+
| Area | Details |
|
|
200
|
+
|------|--------|
|
|
201
|
+
| **Inputs** | `y_true`, `y_pred`, and `timestamps` (same length) |
|
|
202
|
+
| **Metrics** | MAE, MSE, RMSE, MAPE, SMAPE, mean forecast error, rolling RMSE summary |
|
|
203
|
+
| **Plots** | Actual vs forecast, residuals over time, rolling RMSE over time |
|
|
204
|
+
| **Insights** | Systematic bias and drift/stability hints via rolling RMSE |
|
|
205
|
+
| **HTML** | Styled metrics/insights plus embedded plot images |
|
|
206
|
+
|
|
207
|
+
### Recommendation / Ranking
|
|
208
|
+
|
|
209
|
+
| Area | Details |
|
|
210
|
+
|------|--------|
|
|
211
|
+
| **Inputs** | `relevant`: ground-truth relevant **item IDs** per user (or query). `ranked`: **ordered** recommended lists per user (same length as `relevant`). |
|
|
212
|
+
| **Metrics** | **MAP** (binary relevance), **Precision@K**, **Recall@K**, **NDCG@K**, **Hit Rate@K** for each K in `k_values` (default `(1, 5, 10)`). |
|
|
213
|
+
| **Plots** | Precision@K curve; mean **cumulative gain** vs rank cutoff. |
|
|
214
|
+
| **Insights** | Drop in precision at larger K; long-tail spread in \#relevant per user; low-MAP hint. |
|
|
215
|
+
| **`generate_report`** | `task="recommendation"` / `"ranking"` / `"recommender"` with `y_true=relevant`, `y_pred=ranked`. |
|
|
216
|
+
|
|
217
|
+
---
|
|
218
|
+
|
|
219
|
+
## Output formats
|
|
220
|
+
|
|
221
|
+
| Format | How | Notes |
|
|
222
|
+
|--------|-----|--------|
|
|
223
|
+
| **HTML** | `format="html"` or `.html` | Metrics + descriptions + insights + plot images. |
|
|
224
|
+
| **JSON** | `format="json"` or `.json` | `metrics`, `insights`, `plots` (paths to PNGs). |
|
|
225
|
+
| **Markdown** | `format="markdown"` or `.md` | Metrics and insights (no embedded images). |
|
|
226
|
+
| **PDF** | `format="pdf"` or `.pdf` | Text summary (metrics + descriptions + insights); install `reportlab`. |
|
|
227
|
+
|
|
228
|
+
---
|
|
229
|
+
|
|
230
|
+
## Development
|
|
231
|
+
|
|
232
|
+
```bash
|
|
233
|
+
git clone https://github.com/RAAHUL-tech/model-eval-toolkit.git
|
|
234
|
+
cd model-eval-toolkit
|
|
235
|
+
python -m venv .venv
|
|
236
|
+
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
237
|
+
pip install -e ".[test]"
|
|
238
|
+
pytest -q
|
|
239
|
+
# optional coverage
|
|
240
|
+
pytest --cov=evalreport --cov-report=term-missing
|
|
241
|
+
```
|
|
242
|
+
|
|
243
|
+
Build and check the package:
|
|
244
|
+
|
|
245
|
+
```bash
|
|
246
|
+
pip install build twine
|
|
247
|
+
python -m build
|
|
248
|
+
twine check dist/*
|
|
249
|
+
```
|
|
250
|
+
|
|
251
|
+
### CI and PyPI releases
|
|
252
|
+
|
|
253
|
+
GitHub Actions (`.github/workflows/ci.yml`):
|
|
254
|
+
|
|
255
|
+
- **Pull requests** → runs **tests** only (Python 3.9–3.11).
|
|
256
|
+
- **Push to `main`** (including when a PR is merged) → runs **tests**, then **publishes** to [PyPI](https://pypi.org/project/model-eval-toolkit/) if tests pass.
|
|
257
|
+
|
|
258
|
+
**One-time setup**
|
|
259
|
+
|
|
260
|
+
1. On [pypi.org](https://pypi.org/manage/account/token/), create an **API token** scoped to this project (or your whole account for a first publish).
|
|
261
|
+
2. In the GitHub repo: **Settings → Secrets and variables → Actions → New repository secret**
|
|
262
|
+
- Name: `PYPI_API_TOKEN`
|
|
263
|
+
- Value: the token (often starts with `pypi-`).
|
|
264
|
+
|
|
265
|
+
**Before each release**
|
|
266
|
+
|
|
267
|
+
- Bump `version` in `pyproject.toml`. PyPI rejects re-uploading the same version.
|
|
268
|
+
|
|
269
|
+
Optional: use [Trusted Publishing](https://docs.pypi.org/trusted-publishers/) (OIDC) and drop the token; the workflow already requests `id-token: write` for that path.
|
|
270
|
+
|
|
271
|
+
---
|
|
272
|
+
|
|
273
|
+
## Roadmap
|
|
274
|
+
|
|
275
|
+
Additional task types (clustering, time series, ranking, NLP, CV) and a plugin-style API are planned. Issues and PRs welcome on [GitHub](https://github.com/RAAHUL-tech/model-eval-toolkit).
|
|
276
|
+
|
|
277
|
+
---
|
|
278
|
+
|
|
279
|
+
## License
|
|
280
|
+
|
|
281
|
+
See [LICENSE](LICENSE).
|
|
@@ -0,0 +1,28 @@
|
|
|
"""Public API surface of the :mod:`evalreport` package.

Re-exports the ``generate_report`` entry point, the report class for each
supported evaluation task, and the package version string, so users can do
``from evalreport import ClassificationReport`` etc.
"""

# NOTE(review): import order is kept exactly as originally published in case
# any of the submodules depend on sibling modules being imported first —
# confirm before regrouping.
from .core.entrypoints import generate_report
from .classification.report import ClassificationReport
from .regression.report import RegressionReport
from .__version__ import __version__
from .clustering.report import ClusteringReport
from .timeseries.report import TimeSeriesReport
from .nlp.text_classification import TextClassificationReport
from .nlp.text_generation import TextGenerationReport
from .vision.segmentation import SegmentationReport
from .vision.detection import DetectionReport
from .vision.image_classification import ImageClassificationReport
from .ranking.report import RankingReport

# Names exported by ``from evalreport import *``.
__all__ = [
    "generate_report",
    "__version__",
    "ClassificationReport",
    "RegressionReport",
    "ClusteringReport",
    "TimeSeriesReport",
    "TextClassificationReport",
    "TextGenerationReport",
    "SegmentationReport",
    "DetectionReport",
    "ImageClassificationReport",
    "RankingReport",
]