model-eval-toolkit 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. model_eval_toolkit-0.1.0/LICENSE +21 -0
  2. model_eval_toolkit-0.1.0/PKG-INFO +339 -0
  3. model_eval_toolkit-0.1.0/README.md +281 -0
  4. model_eval_toolkit-0.1.0/evalreport/__init__.py +28 -0
  5. model_eval_toolkit-0.1.0/evalreport/__version__.py +2 -0
  6. model_eval_toolkit-0.1.0/evalreport/classification/__init__.py +4 -0
  7. model_eval_toolkit-0.1.0/evalreport/classification/report.py +319 -0
  8. model_eval_toolkit-0.1.0/evalreport/clustering/__init__.py +4 -0
  9. model_eval_toolkit-0.1.0/evalreport/clustering/report.py +174 -0
  10. model_eval_toolkit-0.1.0/evalreport/core/base_report.py +479 -0
  11. model_eval_toolkit-0.1.0/evalreport/core/entrypoints.py +97 -0
  12. model_eval_toolkit-0.1.0/evalreport/core/task_inference.py +180 -0
  13. model_eval_toolkit-0.1.0/evalreport/nlp/__init__.py +5 -0
  14. model_eval_toolkit-0.1.0/evalreport/nlp/text_classification.py +21 -0
  15. model_eval_toolkit-0.1.0/evalreport/nlp/text_generation.py +202 -0
  16. model_eval_toolkit-0.1.0/evalreport/ranking/__init__.py +3 -0
  17. model_eval_toolkit-0.1.0/evalreport/ranking/report.py +274 -0
  18. model_eval_toolkit-0.1.0/evalreport/regression/__init__.py +4 -0
  19. model_eval_toolkit-0.1.0/evalreport/regression/report.py +173 -0
  20. model_eval_toolkit-0.1.0/evalreport/timeseries/__init__.py +4 -0
  21. model_eval_toolkit-0.1.0/evalreport/timeseries/report.py +211 -0
  22. model_eval_toolkit-0.1.0/evalreport/vision/__init__.py +6 -0
  23. model_eval_toolkit-0.1.0/evalreport/vision/detection.py +359 -0
  24. model_eval_toolkit-0.1.0/evalreport/vision/image_classification.py +25 -0
  25. model_eval_toolkit-0.1.0/evalreport/vision/segmentation.py +140 -0
  26. model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/PKG-INFO +339 -0
  27. model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/SOURCES.txt +45 -0
  28. model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/dependency_links.txt +1 -0
  29. model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/requires.txt +16 -0
  30. model_eval_toolkit-0.1.0/model_eval_toolkit.egg-info/top_level.txt +1 -0
  31. model_eval_toolkit-0.1.0/pyproject.toml +54 -0
  32. model_eval_toolkit-0.1.0/setup.cfg +4 -0
  33. model_eval_toolkit-0.1.0/tests/test_base_report.py +24 -0
  34. model_eval_toolkit-0.1.0/tests/test_classification_comprehensive.py +117 -0
  35. model_eval_toolkit-0.1.0/tests/test_classification_report.py +32 -0
  36. model_eval_toolkit-0.1.0/tests/test_clustering_report.py +50 -0
  37. model_eval_toolkit-0.1.0/tests/test_detection_map.py +55 -0
  38. model_eval_toolkit-0.1.0/tests/test_generate_report.py +113 -0
  39. model_eval_toolkit-0.1.0/tests/test_image_classification_report.py +61 -0
  40. model_eval_toolkit-0.1.0/tests/test_nlp_reports.py +38 -0
  41. model_eval_toolkit-0.1.0/tests/test_ranking_report.py +37 -0
  42. model_eval_toolkit-0.1.0/tests/test_regression_comprehensive.py +72 -0
  43. model_eval_toolkit-0.1.0/tests/test_regression_report.py +28 -0
  44. model_eval_toolkit-0.1.0/tests/test_report_outputs.py +46 -0
  45. model_eval_toolkit-0.1.0/tests/test_task_inference.py +83 -0
  46. model_eval_toolkit-0.1.0/tests/test_timeseries_report.py +44 -0
  47. model_eval_toolkit-0.1.0/tests/test_vision_reports.py +63 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Raahul Krishna Durairaju
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,339 @@
1
+ Metadata-Version: 2.4
2
+ Name: model-eval-toolkit
3
+ Version: 0.1.0
4
+ Summary: Unified ML evaluation framework for classification, regression, clustering, time series, NLP, CV, and recommendation systems.
5
+ Author-email: Raahul Krishna Durairaju <rahulkrish28@gmail.com>
6
+ License: MIT License
7
+
8
+ Copyright (c) 2026 Raahul Krishna Durairaju
9
+
10
+ Permission is hereby granted, free of charge, to any person obtaining a copy
11
+ of this software and associated documentation files (the "Software"), to deal
12
+ in the Software without restriction, including without limitation the rights
13
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
14
+ copies of the Software, and to permit persons to whom the Software is
15
+ furnished to do so, subject to the following conditions:
16
+
17
+ The above copyright notice and this permission notice shall be included in all
18
+ copies or substantial portions of the Software.
19
+
20
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
21
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
22
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
23
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
24
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
25
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
26
+ SOFTWARE.
27
+
28
+ Project-URL: Homepage, https://github.com/RAAHUL-tech/model-eval-toolkit
29
+ Project-URL: Documentation, https://github.com/RAAHUL-tech/model-eval-toolkit/tree/main/docs
30
+ Project-URL: Repository, https://github.com/RAAHUL-tech/model-eval-toolkit
31
+ Project-URL: Issues, https://github.com/RAAHUL-tech/model-eval-toolkit/issues
32
+ Keywords: machine-learning,evaluation,metrics,nlp,computer-vision,time-series,ranking
33
+ Classifier: Programming Language :: Python :: 3
34
+ Classifier: Programming Language :: Python :: 3.9
35
+ Classifier: Programming Language :: Python :: 3.10
36
+ Classifier: Programming Language :: Python :: 3.11
37
+ Classifier: License :: OSI Approved :: MIT License
38
+ Classifier: Operating System :: OS Independent
39
+ Classifier: Intended Audience :: Science/Research
40
+ Classifier: Intended Audience :: Developers
41
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
42
+ Requires-Python: >=3.9
43
+ Description-Content-Type: text/markdown
44
+ License-File: LICENSE
45
+ Requires-Dist: numpy>=1.23
46
+ Requires-Dist: pandas>=1.5
47
+ Requires-Dist: scikit-learn>=1.2
48
+ Requires-Dist: matplotlib>=3.7
49
+ Requires-Dist: seaborn>=0.12
50
+ Provides-Extra: test
51
+ Requires-Dist: pytest>=7; extra == "test"
52
+ Requires-Dist: coverage[toml]>=7; extra == "test"
53
+ Provides-Extra: pdf
54
+ Requires-Dist: reportlab>=4; extra == "pdf"
55
+ Provides-Extra: nlp
56
+ Provides-Extra: vision
57
+ Dynamic: license-file
58
+
59
+ <p align="center">
60
+ <img src="https://raw.githubusercontent.com/RAAHUL-tech/model-eval-toolkit/main/docs/images/evalreport-logo.png" alt="Model Eval Toolkit" width="260">
61
+ </p>
62
+
63
+ # Model Eval Toolkit
64
+
65
+ **Unified ML evaluation reports** for Python: metrics, plots, auto-insights, and export to **HTML**, **JSON**, **Markdown**, or **PDF**.
66
+
67
+ Model Eval Toolkit provides a single, task-aware evaluation layer to benchmark model quality consistently across ML domains.
68
+
69
+ Import from the **`evalreport`** package:
70
+
71
+ ```python
72
+ from evalreport import (
73
+ generate_report,
74
+ ClassificationReport,
75
+ RegressionReport,
76
+ ClusteringReport,
77
+ TimeSeriesReport,
78
+ TextClassificationReport,
79
+ TextGenerationReport,
80
+ SegmentationReport,
81
+ DetectionReport,
82
+ RankingReport,
83
+ __version__,
84
+ )
85
+ ```
86
+
87
+ > **Current supported tasks (v0.1):**
88
+ > classification (binary & multiclass), regression, clustering, time series/forecasting,
89
+ > NLP (text classification + text generation), CV (segmentation + detection), and **recommendation / ranking**.
90
+ > The roadmap includes multilabel and richer recsys (e.g. session-based, implicit feedback models).
91
+
92
+ ---
93
+
94
+ ## Install
95
+
96
+ ```bash
97
+ pip install model-eval-toolkit
98
+ ```
99
+
100
+ **PDF export** needs ReportLab:
101
+
102
+ ```bash
103
+ pip install "model-eval-toolkit[pdf]"
104
+ # or
105
+ pip install reportlab
106
+ ```
107
+
108
+ **Requirements:** Python ≥ 3.9, NumPy, pandas, scikit-learn, Matplotlib, Seaborn.
109
+
110
+ Optional task extras (currently dependency-light for NLP/CV):
111
+
112
+ ```bash
113
+ pip install "model-eval-toolkit[nlp]"
114
+ pip install "model-eval-toolkit[vision]"
115
+ ```
116
+
117
+ ---
118
+
119
+ ## Quick start
120
+
121
+ ### `generate_report` (recommended)
122
+
123
+ ```python
124
+ from evalreport import generate_report
125
+
126
+ summary = generate_report(
127
+ task="classification", # or "regression", or "auto"
128
+ y_true=[0, 1, 0, 1, 1],
129
+ y_pred=[0, 1, 1, 1, 1],
130
+ y_prob=[0.1, 0.9, 0.8, 0.7, 0.6], # optional; enables log loss, ROC/PR (binary)
131
+ output_path="my_reports/model_report.html",
132
+ format="html",
133
+ )
134
+
135
+ print(summary["metrics"]["accuracy"])
136
+ ```
137
+
138
+ NLP + CV examples:
139
+
140
+ ```python
141
+ from evalreport import generate_report
142
+
143
+ # Text generation
144
+ generate_report(
145
+ task="text_generation",
146
+ y_true=["the cat sat on the mat"],
147
+ y_pred=["the cat sat on mat"],
148
+ output_path="reports/text_generation.html",
149
+ )
150
+
151
+ # Image segmentation (binary masks)
152
+ generate_report(
153
+ task="segmentation",
154
+ y_true=[[[0, 0], [1, 1]]],
155
+ y_pred=[[[0, 1], [1, 1]]],
156
+ output_path="reports/segmentation.html",
157
+ )
158
+
159
+ # Object detection (per-image list of box dicts)
160
+ generate_report(
161
+ task="detection",
162
+ y_true=[[{"bbox": [0, 0, 10, 10], "label": "obj"}]],
163
+ y_pred=[[{"bbox": [1, 1, 9, 9], "label": "obj", "score": 0.9}]],
164
+ output_path="reports/detection.html",
165
+ )
166
+
167
+ # Recommendation / ranking (one list per user)
168
+ generate_report(
169
+ task="recommendation", # or "ranking", "recommender"
170
+ y_true=[[10, 20], [30]], # relevant item IDs per user
171
+ y_pred=[[10, 99, 20, 5], [7, 30]], # ranked recommendations per user (best first)
172
+ k_values=(1, 5, 10), # optional cutoffs for P@K, R@K, NDCG@K, Hit@K
173
+ output_path="reports/recommendation.html",
174
+ )
175
+ ```
176
+
177
+ - **`task="auto"`** — float targets → regression; integer/string labels → classification.
178
+ - If you **omit `output_path`**, the report is written under **`reports/`** (created if needed), e.g. `reports/classification_report.html` or `reports/regression_report.json` when `format="json"`.
179
+ - **Plots** are saved under **`<report_directory>/evalreport_plots/`** (same folder as your HTML/JSON/PDF file’s parent). So custom `output_path="my_reports/x.html"` → plots in `my_reports/evalreport_plots/`.
180
+
181
+ ### Task-specific API
182
+
183
+ Useful when you want full control (e.g. set `output_dir` before `run_all()` so plots land next to a chosen folder):
184
+
185
+ ```python
186
+ from pathlib import Path
187
+ from evalreport import ClassificationReport, RegressionReport, RankingReport
188
+
189
+ # Classification (binary or multiclass)
190
+ cls = ClassificationReport(
191
+ y_true=[0, 1, 2, 0],
192
+ y_pred=[0, 2, 2, 0],
193
+ # y_prob: (n_samples, n_classes) for multiclass log loss / AUC
194
+ labels=[0, 1, 2], # optional fixed class order for confusion matrix
195
+ )
196
+ cls.output_dir = Path("reports") # optional; default for plots if set before run_all()
197
+ cls.run_all()
198
+ cls.save("reports/classification_report.html", format="html")
199
+ cls.save("reports/classification_report.json", format="json")
200
+
201
+ # Regression
202
+ reg = RegressionReport(y_true=[1.0, 2.0, 3.0], y_pred=[1.1, 1.9, 3.2])
203
+ reg.output_dir = Path("reports")
204
+ reg.run_all()
205
+ reg.save("reports/regression_report.pdf", format="pdf") # needs reportlab
206
+
207
+ # Recommendation / ranking
208
+ rank = RankingReport(
209
+ relevant=[[1, 2], [3]],
210
+ ranked=[[1, 4, 5], [3, 1, 2]],
211
+ k_values=(1, 5, 10),
212
+ )
213
+ rank.output_dir = Path("reports")
214
+ rank.run_all()
215
+ rank.save("reports/ranking_report.html", format="html")
216
+ ```
217
+
218
+ ---
219
+
220
+ ## What each task includes
221
+
222
+ ### Classification
223
+
224
+ | Area | Details |
225
+ |------|--------|
226
+ | **Metrics** | Accuracy; precision / recall / F1 (micro, macro, weighted); MCC; Cohen’s κ; log loss (with probs); ROC-AUC / PR-AUC when applicable; confusion matrix (table). |
227
+ | **Plots** | Confusion matrix heatmap; **binary** ROC & PR curves when `y_prob` is provided. |
228
+ | **Insights** | Class imbalance hint; most common misclassification pair. |
229
+ | **HTML** | Styled layout: each metric with a short explanation, insights, and embedded plot images. |
230
+
231
+ **Probabilities**
232
+
233
+ - Binary: `y_prob` as length-`n` scores for the positive class, or shape `(n, 2)`.
234
+ - Multiclass: `(n_samples, n_classes)` for log loss / multiclass AUC where supported.
235
+
236
+ ### Regression
237
+
238
+ | Area | Details |
239
+ |------|--------|
240
+ | **Metrics** | MAE, MSE, RMSE, R², median absolute error, MAPE (where defined), mean error (bias). |
241
+ | **Plots** | Residuals vs predicted, predicted vs actual, residual histogram. |
242
+ | **Insights** | Over/under-prediction bias; heavy-tail error hint. |
243
+ | **HTML** | Same rich layout as classification. |
244
+
245
+ ### Clustering
246
+
247
+ | Area | Details |
248
+ |------|--------|
249
+ | **Inputs** | `X` (feature matrix) and `labels` (cluster assignments) |
250
+ | **Metrics** | Silhouette score, Davies–Bouldin index, Calinski–Harabasz score, cluster sizes |
251
+ | **Plots** | Cluster scatter (PCA) and cluster size distribution |
252
+ | **Insights** | Separability + imbalance hints |
253
+ | **HTML** | Styled metrics/insights plus embedded plot images |
254
+
255
+ ### Time Series / Forecasting
256
+
257
+ | Area | Details |
258
+ |------|--------|
259
+ | **Inputs** | `y_true`, `y_pred`, and `timestamps` (same length) |
260
+ | **Metrics** | MAE, MSE, RMSE, MAPE, SMAPE, mean forecast error, rolling RMSE summary |
261
+ | **Plots** | Actual vs forecast, residuals over time, rolling RMSE over time |
262
+ | **Insights** | Systematic bias and drift/stability hints via rolling RMSE |
263
+ | **HTML** | Styled metrics/insights plus embedded plot images |
264
+
265
+ ### Recommendation / Ranking
266
+
267
+ | Area | Details |
268
+ |------|--------|
269
+ | **Inputs** | `relevant`: ground-truth relevant **item IDs** per user (or query). `ranked`: **ordered** recommended lists per user (same length as `relevant`). |
270
+ | **Metrics** | **MAP** (binary relevance), **Precision@K**, **Recall@K**, **NDCG@K**, **Hit Rate@K** for each K in `k_values` (default `(1, 5, 10)`). |
271
+ | **Plots** | Precision@K curve; mean **cumulative gain** vs rank cutoff. |
272
+ | **Insights** | Drop in precision at larger K; long-tail spread in \#relevant per user; low-MAP hint. |
273
+ | **`generate_report`** | `task="recommendation"` / `"ranking"` / `"recommender"` with `y_true=relevant`, `y_pred=ranked`. |
274
+
275
+ ---
276
+
277
+ ## Output formats
278
+
279
+ | Format | How | Notes |
280
+ |--------|-----|--------|
281
+ | **HTML** | `format="html"` or `.html` | Metrics + descriptions + insights + plot images. |
282
+ | **JSON** | `format="json"` or `.json` | `metrics`, `insights`, `plots` (paths to PNGs). |
283
+ | **Markdown** | `format="markdown"` or `.md` | Metrics and insights (no embedded images). |
284
+ | **PDF** | `format="pdf"` or `.pdf` | Text summary (metrics + descriptions + insights); install `reportlab`. |
285
+
286
+ ---
287
+
288
+ ## Development
289
+
290
+ ```bash
291
+ git clone https://github.com/RAAHUL-tech/model-eval-toolkit.git
292
+ cd model-eval-toolkit
293
+ python -m venv .venv
294
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
295
+ pip install -e ".[test]"
296
+ pytest -q
297
+ # optional coverage
298
+ pytest --cov=evalreport --cov-report=term-missing
299
+ ```
300
+
301
+ Build and check the package:
302
+
303
+ ```bash
304
+ pip install build twine
305
+ python -m build
306
+ twine check dist/*
307
+ ```
308
+
309
+ ### CI and PyPI releases
310
+
311
+ GitHub Actions (`.github/workflows/ci.yml`):
312
+
313
+ - **Pull requests** → runs **tests** only (Python 3.9–3.11).
314
+ - **Push to `main`** (including when a PR is merged) → runs **tests**, then **publishes** to [PyPI](https://pypi.org/project/model-eval-toolkit/) if tests pass.
315
+
316
+ **One-time setup**
317
+
318
+ 1. On [pypi.org](https://pypi.org/manage/account/token/), create an **API token** scoped to this project (or your whole account for a first publish).
319
+ 2. In the GitHub repo: **Settings → Secrets and variables → Actions → New repository secret**
320
+ - Name: `PYPI_API_TOKEN`
321
+ - Value: the token (often starts with `pypi-`).
322
+
323
+ **Before each release**
324
+
325
+ - Bump `version` in `pyproject.toml`. PyPI rejects re-uploading the same version.
326
+
327
+ Optional: use [Trusted Publishing](https://docs.pypi.org/trusted-publishers/) (OIDC) and drop the token; the workflow already requests `id-token: write` for that path.
328
+
329
+ ---
330
+
331
+ ## Roadmap
332
+
333
+ Additional task types (clustering, time series, ranking, NLP, CV) and a plugin-style API are planned. Issues and PRs welcome on [GitHub](https://github.com/RAAHUL-tech/model-eval-toolkit).
334
+
335
+ ---
336
+
337
+ ## License
338
+
339
+ See [LICENSE](LICENSE).
@@ -0,0 +1,281 @@
1
+ <p align="center">
2
+ <img src="https://raw.githubusercontent.com/RAAHUL-tech/model-eval-toolkit/main/docs/images/evalreport-logo.png" alt="Model Eval Toolkit" width="260">
3
+ </p>
4
+
5
+ # Model Eval Toolkit
6
+
7
+ **Unified ML evaluation reports** for Python: metrics, plots, auto-insights, and export to **HTML**, **JSON**, **Markdown**, or **PDF**.
8
+
9
+ Model Eval Toolkit provides a single, task-aware evaluation layer to benchmark model quality consistently across ML domains.
10
+
11
+ Import from the **`evalreport`** package:
12
+
13
+ ```python
14
+ from evalreport import (
15
+ generate_report,
16
+ ClassificationReport,
17
+ RegressionReport,
18
+ ClusteringReport,
19
+ TimeSeriesReport,
20
+ TextClassificationReport,
21
+ TextGenerationReport,
22
+ SegmentationReport,
23
+ DetectionReport,
24
+ RankingReport,
25
+ __version__,
26
+ )
27
+ ```
28
+
29
+ > **Current supported tasks (v0.1):**
30
+ > classification (binary & multiclass), regression, clustering, time series/forecasting,
31
+ > NLP (text classification + text generation), CV (segmentation + detection), and **recommendation / ranking**.
32
+ > The roadmap includes multilabel and richer recsys (e.g. session-based, implicit feedback models).
33
+
34
+ ---
35
+
36
+ ## Install
37
+
38
+ ```bash
39
+ pip install model-eval-toolkit
40
+ ```
41
+
42
+ **PDF export** needs ReportLab:
43
+
44
+ ```bash
45
+ pip install "model-eval-toolkit[pdf]"
46
+ # or
47
+ pip install reportlab
48
+ ```
49
+
50
+ **Requirements:** Python ≥ 3.9, NumPy, pandas, scikit-learn, Matplotlib, Seaborn.
51
+
52
+ Optional task extras (currently dependency-light for NLP/CV):
53
+
54
+ ```bash
55
+ pip install "model-eval-toolkit[nlp]"
56
+ pip install "model-eval-toolkit[vision]"
57
+ ```
58
+
59
+ ---
60
+
61
+ ## Quick start
62
+
63
+ ### `generate_report` (recommended)
64
+
65
+ ```python
66
+ from evalreport import generate_report
67
+
68
+ summary = generate_report(
69
+ task="classification", # or "regression", or "auto"
70
+ y_true=[0, 1, 0, 1, 1],
71
+ y_pred=[0, 1, 1, 1, 1],
72
+ y_prob=[0.1, 0.9, 0.8, 0.7, 0.6], # optional; enables log loss, ROC/PR (binary)
73
+ output_path="my_reports/model_report.html",
74
+ format="html",
75
+ )
76
+
77
+ print(summary["metrics"]["accuracy"])
78
+ ```
79
+
80
+ NLP + CV examples:
81
+
82
+ ```python
83
+ from evalreport import generate_report
84
+
85
+ # Text generation
86
+ generate_report(
87
+ task="text_generation",
88
+ y_true=["the cat sat on the mat"],
89
+ y_pred=["the cat sat on mat"],
90
+ output_path="reports/text_generation.html",
91
+ )
92
+
93
+ # Image segmentation (binary masks)
94
+ generate_report(
95
+ task="segmentation",
96
+ y_true=[[[0, 0], [1, 1]]],
97
+ y_pred=[[[0, 1], [1, 1]]],
98
+ output_path="reports/segmentation.html",
99
+ )
100
+
101
+ # Object detection (per-image list of box dicts)
102
+ generate_report(
103
+ task="detection",
104
+ y_true=[[{\"bbox\": [0, 0, 10, 10], \"label\": \"obj\"}]],
105
+ y_pred=[[{\"bbox\": [1, 1, 9, 9], \"label\": \"obj\", \"score\": 0.9}]],
106
+ output_path="reports/detection.html",
107
+ )
108
+
109
+ # Recommendation / ranking (one list per user)
110
+ generate_report(
111
+ task="recommendation", # or "ranking", "recommender"
112
+ y_true=[[10, 20], [30]], # relevant item IDs per user
113
+ y_pred=[[10, 99, 20, 5], [7, 30]], # ranked recommendations per user (best first)
114
+ k_values=(1, 5, 10), # optional cutoffs for P@K, R@K, NDCG@K, Hit@K
115
+ output_path="reports/recommendation.html",
116
+ )
117
+ ```
118
+
119
+ - **`task="auto"`** — float targets → regression; integer/string labels → classification.
120
+ - If you **omit `output_path`**, the report is written under **`reports/`** (created if needed), e.g. `reports/classification_report.html` or `reports/regression_report.json` when `format="json"`.
121
+ - **Plots** are saved under **`<report_directory>/evalreport_plots/`** (same folder as your HTML/JSON/PDF file’s parent). So custom `output_path="my_reports/x.html"` → plots in `my_reports/evalreport_plots/`.
122
+
123
+ ### Task-specific API
124
+
125
+ Useful when you want full control (e.g. set `output_dir` before `run_all()` so plots land next to a chosen folder):
126
+
127
+ ```python
128
+ from pathlib import Path
129
+ from evalreport import ClassificationReport, RegressionReport, RankingReport
130
+
131
+ # Classification (binary or multiclass)
132
+ cls = ClassificationReport(
133
+ y_true=[0, 1, 2, 0],
134
+ y_pred=[0, 2, 2, 0],
135
+ # y_prob: (n_samples, n_classes) for multiclass log loss / AUC
136
+ labels=[0, 1, 2], # optional fixed class order for confusion matrix
137
+ )
138
+ cls.output_dir = Path("reports") # optional; default for plots if set before run_all()
139
+ cls.run_all()
140
+ cls.save("reports/classification_report.html", format="html")
141
+ cls.save("reports/classification_report.json", format="json")
142
+
143
+ # Regression
144
+ reg = RegressionReport(y_true=[1.0, 2.0, 3.0], y_pred=[1.1, 1.9, 3.2])
145
+ reg.output_dir = Path("reports")
146
+ reg.run_all()
147
+ reg.save("reports/regression_report.pdf", format="pdf") # needs reportlab
148
+
149
+ # Recommendation / ranking
150
+ rank = RankingReport(
151
+ relevant=[[1, 2], [3]],
152
+ ranked=[[1, 4, 5], [3, 1, 2]],
153
+ k_values=(1, 5, 10),
154
+ )
155
+ rank.output_dir = Path("reports")
156
+ rank.run_all()
157
+ rank.save("reports/ranking_report.html", format="html")
158
+ ```
159
+
160
+ ---
161
+
162
+ ## What each task includes
163
+
164
+ ### Classification
165
+
166
+ | Area | Details |
167
+ |------|--------|
168
+ | **Metrics** | Accuracy; precision / recall / F1 (micro, macro, weighted); MCC; Cohen’s κ; log loss (with probs); ROC-AUC / PR-AUC when applicable; confusion matrix (table). |
169
+ | **Plots** | Confusion matrix heatmap; **binary** ROC & PR curves when `y_prob` is provided. |
170
+ | **Insights** | Class imbalance hint; most common misclassification pair. |
171
+ | **HTML** | Styled layout: each metric with a short explanation, insights, and embedded plot images. |
172
+
173
+ **Probabilities**
174
+
175
+ - Binary: `y_prob` as length-`n` scores for the positive class, or shape `(n, 2)`.
176
+ - Multiclass: `(n_samples, n_classes)` for log loss / multiclass AUC where supported.
177
+
178
+ ### Regression
179
+
180
+ | Area | Details |
181
+ |------|--------|
182
+ | **Metrics** | MAE, MSE, RMSE, R², median absolute error, MAPE (where defined), mean error (bias). |
183
+ | **Plots** | Residuals vs predicted, predicted vs actual, residual histogram. |
184
+ | **Insights** | Over/under-prediction bias; heavy-tail error hint. |
185
+ | **HTML** | Same rich layout as classification. |
186
+
187
+ ### Clustering
188
+
189
+ | Area | Details |
190
+ |------|--------|
191
+ | **Inputs** | `X` (feature matrix) and `labels` (cluster assignments) |
192
+ | **Metrics** | Silhouette score, Davies–Bouldin index, Calinski–Harabasz score, cluster sizes |
193
+ | **Plots** | Cluster scatter (PCA) and cluster size distribution |
194
+ | **Insights** | Separability + imbalance hints |
195
+ | **HTML** | Styled metrics/insights plus embedded plot images |
196
+
197
+ ### Time Series / Forecasting
198
+
199
+ | Area | Details |
200
+ |------|--------|
201
+ | **Inputs** | `y_true`, `y_pred`, and `timestamps` (same length) |
202
+ | **Metrics** | MAE, MSE, RMSE, MAPE, SMAPE, mean forecast error, rolling RMSE summary |
203
+ | **Plots** | Actual vs forecast, residuals over time, rolling RMSE over time |
204
+ | **Insights** | Systematic bias and drift/stability hints via rolling RMSE |
205
+ | **HTML** | Styled metrics/insights plus embedded plot images |
206
+
207
+ ### Recommendation / Ranking
208
+
209
+ | Area | Details |
210
+ |------|--------|
211
+ | **Inputs** | `relevant`: ground-truth relevant **item IDs** per user (or query). `ranked`: **ordered** recommended lists per user (same length as `relevant`). |
212
+ | **Metrics** | **MAP** (binary relevance), **Precision@K**, **Recall@K**, **NDCG@K**, **Hit Rate@K** for each K in `k_values` (default `(1, 5, 10)`). |
213
+ | **Plots** | Precision@K curve; mean **cumulative gain** vs rank cutoff. |
214
+ | **Insights** | Drop in precision at larger K; long-tail spread in \#relevant per user; low-MAP hint. |
215
+ | **`generate_report`** | `task="recommendation"` / `"ranking"` / `"recommender"` with `y_true=relevant`, `y_pred=ranked`. |
216
+
217
+ ---
218
+
219
+ ## Output formats
220
+
221
+ | Format | How | Notes |
222
+ |--------|-----|--------|
223
+ | **HTML** | `format="html"` or `.html` | Metrics + descriptions + insights + plot images. |
224
+ | **JSON** | `format="json"` or `.json` | `metrics`, `insights`, `plots` (paths to PNGs). |
225
+ | **Markdown** | `format="markdown"` or `.md` | Metrics and insights (no embedded images). |
226
+ | **PDF** | `format="pdf"` or `.pdf` | Text summary (metrics + descriptions + insights); install `reportlab`. |
227
+
228
+ ---
229
+
230
+ ## Development
231
+
232
+ ```bash
233
+ git clone https://github.com/RAAHUL-tech/model-eval-toolkit.git
234
+ cd model-eval-toolkit
235
+ python -m venv .venv
236
+ source .venv/bin/activate # Windows: .venv\Scripts\activate
237
+ pip install -e ".[test]"
238
+ pytest -q
239
+ # optional coverage
240
+ pytest --cov=evalreport --cov-report=term-missing
241
+ ```
242
+
243
+ Build and check the package:
244
+
245
+ ```bash
246
+ pip install build twine
247
+ python -m build
248
+ twine check dist/*
249
+ ```
250
+
251
+ ### CI and PyPI releases
252
+
253
+ GitHub Actions (`.github/workflows/ci.yml`):
254
+
255
+ - **Pull requests** → runs **tests** only (Python 3.9–3.11).
256
+ - **Push to `main`** (including when a PR is merged) → runs **tests**, then **publishes** to [PyPI](https://pypi.org/project/model-eval-toolkit/) if tests pass.
257
+
258
+ **One-time setup**
259
+
260
+ 1. On [pypi.org](https://pypi.org/manage/account/token/), create an **API token** scoped to this project (or your whole account for a first publish).
261
+ 2. In the GitHub repo: **Settings → Secrets and variables → Actions → New repository secret**
262
+ - Name: `PYPI_API_TOKEN`
263
+ - Value: the token (often starts with `pypi-`).
264
+
265
+ **Before each release**
266
+
267
+ - Bump `version` in `pyproject.toml`. PyPI rejects re-uploading the same version.
268
+
269
+ Optional: use [Trusted Publishing](https://docs.pypi.org/trusted-publishers/) (OIDC) and drop the token; the workflow already requests `id-token: write` for that path.
270
+
271
+ ---
272
+
273
+ ## Roadmap
274
+
275
+ Additional task types (clustering, time series, ranking, NLP, CV) and a plugin-style API are planned. Issues and PRs welcome on [GitHub](https://github.com/RAAHUL-tech/model-eval-toolkit).
276
+
277
+ ---
278
+
279
+ ## License
280
+
281
+ See [LICENSE](LICENSE).
@@ -0,0 +1,28 @@
1
+ from .core.entrypoints import generate_report
2
+ from .classification.report import ClassificationReport
3
+ from .regression.report import RegressionReport
4
+ from .__version__ import __version__
5
+ from .clustering.report import ClusteringReport
6
+ from .timeseries.report import TimeSeriesReport
7
+ from .nlp.text_classification import TextClassificationReport
8
+ from .nlp.text_generation import TextGenerationReport
9
+ from .vision.segmentation import SegmentationReport
10
+ from .vision.detection import DetectionReport
11
+ from .vision.image_classification import ImageClassificationReport
12
+ from .ranking.report import RankingReport
13
+
14
+ __all__ = [
15
+ "generate_report",
16
+ "__version__",
17
+ "ClassificationReport",
18
+ "RegressionReport",
19
+ "ClusteringReport",
20
+ "TimeSeriesReport",
21
+ "TextClassificationReport",
22
+ "TextGenerationReport",
23
+ "SegmentationReport",
24
+ "DetectionReport",
25
+ "ImageClassificationReport",
26
+ "RankingReport",
27
+ ]
28
+
@@ -0,0 +1,2 @@
1
+ __version__ = "0.1.0"
2
+
@@ -0,0 +1,4 @@
1
+ from .report import ClassificationReport
2
+
3
+ __all__ = ["ClassificationReport"]
4
+