haoline 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haoline/.streamlit/config.toml +10 -0
- haoline/__init__.py +248 -0
- haoline/analyzer.py +935 -0
- haoline/cli.py +2712 -0
- haoline/compare.py +811 -0
- haoline/compare_visualizations.py +1564 -0
- haoline/edge_analysis.py +525 -0
- haoline/eval/__init__.py +131 -0
- haoline/eval/adapters.py +844 -0
- haoline/eval/cli.py +390 -0
- haoline/eval/comparison.py +542 -0
- haoline/eval/deployment.py +633 -0
- haoline/eval/schemas.py +833 -0
- haoline/examples/__init__.py +15 -0
- haoline/examples/basic_inspection.py +74 -0
- haoline/examples/compare_models.py +117 -0
- haoline/examples/hardware_estimation.py +78 -0
- haoline/format_adapters.py +1001 -0
- haoline/formats/__init__.py +123 -0
- haoline/formats/coreml.py +250 -0
- haoline/formats/gguf.py +483 -0
- haoline/formats/openvino.py +255 -0
- haoline/formats/safetensors.py +273 -0
- haoline/formats/tflite.py +369 -0
- haoline/hardware.py +2307 -0
- haoline/hierarchical_graph.py +462 -0
- haoline/html_export.py +1573 -0
- haoline/layer_summary.py +769 -0
- haoline/llm_summarizer.py +465 -0
- haoline/op_icons.py +618 -0
- haoline/operational_profiling.py +1492 -0
- haoline/patterns.py +1116 -0
- haoline/pdf_generator.py +265 -0
- haoline/privacy.py +250 -0
- haoline/pydantic_models.py +241 -0
- haoline/report.py +1923 -0
- haoline/report_sections.py +539 -0
- haoline/risks.py +521 -0
- haoline/schema.py +523 -0
- haoline/streamlit_app.py +2024 -0
- haoline/tests/__init__.py +4 -0
- haoline/tests/conftest.py +123 -0
- haoline/tests/test_analyzer.py +868 -0
- haoline/tests/test_compare_visualizations.py +293 -0
- haoline/tests/test_edge_analysis.py +243 -0
- haoline/tests/test_eval.py +604 -0
- haoline/tests/test_format_adapters.py +460 -0
- haoline/tests/test_hardware.py +237 -0
- haoline/tests/test_hardware_recommender.py +90 -0
- haoline/tests/test_hierarchical_graph.py +326 -0
- haoline/tests/test_html_export.py +180 -0
- haoline/tests/test_layer_summary.py +428 -0
- haoline/tests/test_llm_patterns.py +540 -0
- haoline/tests/test_llm_summarizer.py +339 -0
- haoline/tests/test_patterns.py +774 -0
- haoline/tests/test_pytorch.py +327 -0
- haoline/tests/test_report.py +383 -0
- haoline/tests/test_risks.py +398 -0
- haoline/tests/test_schema.py +417 -0
- haoline/tests/test_tensorflow.py +380 -0
- haoline/tests/test_visualizations.py +316 -0
- haoline/universal_ir.py +856 -0
- haoline/visualizations.py +1086 -0
- haoline/visualize_yolo.py +44 -0
- haoline/web.py +110 -0
- haoline-0.3.0.dist-info/METADATA +471 -0
- haoline-0.3.0.dist-info/RECORD +70 -0
- haoline-0.3.0.dist-info/WHEEL +4 -0
- haoline-0.3.0.dist-info/entry_points.txt +5 -0
- haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/eval/adapters.py
ADDED
@@ -0,0 +1,844 @@
"""
Eval Adapters

Parse evaluation results from external tools into HaoLine's schema.

Supported adapters:
- Ultralytics YOLO (detection): parse_ultralytics_val, load_ultralytics_json
- HuggingFace evaluate (classification/NLP): parse_hf_evaluate, load_hf_evaluate
- lm-eval-harness (LLM benchmarks): parse_lm_eval, load_lm_eval
- timm (image classification): parse_timm_benchmark, load_timm_benchmark
- Generic CSV/JSON: parse_generic_json, parse_generic_csv, load_generic_json, load_generic_csv

Auto-detection: detect_and_parse() tries to identify the format automatically.
"""

from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Any

from .schemas import (
    ClassificationEvalResult,
    DetectionEvalResult,
    EvalMetric,
    EvalResult,
    GenericEvalResult,
    LLMEvalResult,
    NLPEvalResult,
)

# =============================================================================
# Ultralytics YOLO Adapter (Task 12.3.1)
# =============================================================================


def parse_ultralytics_val(
    data: dict[str, Any],
    model_id: str = "",
) -> DetectionEvalResult:
    """
    Parse Ultralytics YOLO validation results.

    Ultralytics outputs validation metrics in various formats. This parser
    handles the JSON output from `yolo val` or results from `model.val()`.

    Expected fields (from results.results_dict or JSON):
    - metrics/mAP50(B): float
    - metrics/mAP50-95(B): float
    - metrics/precision(B): float
    - metrics/recall(B): float
    - fitness: float (optional)

    Args:
        data: Dictionary from YOLO validation output.
        model_id: Model identifier (defaults to extracting from data).

    Returns:
        DetectionEvalResult with parsed metrics.
    """

    # Try different key formats (Ultralytics uses inconsistent naming)
    def get_metric(keys: list[str], default: float = 0.0) -> float:
        for key in keys:
            if key in data:
                val = data[key]
                return float(val) if val is not None else default
            # Check nested metrics dict
            if "metrics" in data and key in data["metrics"]:
                val = data["metrics"][key]
                return float(val) if val is not None else default
        return default

    # Extract metrics with various key formats
    map50 = get_metric(
        [
            "metrics/mAP50(B)",
            "mAP50",
            "map50",
            "mAP@50",
            "box/mAP50",
        ]
    )
    map50_95 = get_metric(
        [
            "metrics/mAP50-95(B)",
            "mAP50-95",
            "map50_95",
            "mAP@50:95",
            "box/mAP50-95",
            "map",
        ]
    )
    precision = get_metric(
        [
            "metrics/precision(B)",
            "precision",
            "box/precision",
            "p",
        ]
    )
    recall = get_metric(
        [
            "metrics/recall(B)",
            "recall",
            "box/recall",
            "r",
        ]
    )

    # Calculate F1 if not provided
    f1 = get_metric(["f1", "box/f1"])
    if f1 == 0.0 and precision > 0 and recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    # Extract model ID
    if not model_id:
        model_id = data.get("model", data.get("name", "unknown"))

    # Extract dataset
    dataset = data.get("data", data.get("dataset", ""))
    if isinstance(dataset, dict):
        dataset = dataset.get("path", dataset.get("name", ""))

    # Per-class metrics if available
    class_metrics: dict[str, dict[str, float]] = {}
    if "per_class" in data:
        for cls_name, cls_data in data["per_class"].items():
            class_metrics[cls_name] = {
                "precision": cls_data.get("precision", 0.0),
                "recall": cls_data.get("recall", 0.0),
                "ap50": cls_data.get("ap50", cls_data.get("mAP50", 0.0)),
            }

    # Build the result
    result = DetectionEvalResult.create(
        model_id=str(model_id),
        dataset=str(dataset),
        map50=map50,
        map50_95=map50_95,
        precision=precision,
        recall=recall,
        f1=f1,
        class_metrics=class_metrics,
    )

    # Add extra metrics from metadata
    speed = data.get("speed", {})
    if speed:
        if "inference" in speed:
            result.metrics.append(
                EvalMetric(
                    name="inference_ms",
                    value=speed["inference"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
        if "preprocess" in speed:
            result.metrics.append(
                EvalMetric(
                    name="preprocess_ms",
                    value=speed["preprocess"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
        if "postprocess" in speed:
            result.metrics.append(
                EvalMetric(
                    name="postprocess_ms",
                    value=speed["postprocess"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )

    # Store raw data in metadata
    result.metadata["raw_ultralytics"] = data

    return result


def load_ultralytics_json(path: Path, model_id: str = "") -> DetectionEvalResult:
    """
    Load Ultralytics validation results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.

    Returns:
        DetectionEvalResult with parsed metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_ultralytics_val(data, model_id)
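

# Reviewer sketch (not part of the packaged wheel): a minimal, self-contained
# way to exercise the adapter above, using the results_dict key names listed
# in its docstring. The metric values and the model name are invented.
def _example_parse_ultralytics() -> DetectionEvalResult:
    sample = {
        "model": "yolov8n",
        "data": "coco128.yaml",
        "metrics/mAP50(B)": 0.62,
        "metrics/mAP50-95(B)": 0.45,
        "metrics/precision(B)": 0.71,
        "metrics/recall(B)": 0.58,
        "speed": {"inference": 4.2, "preprocess": 0.6, "postprocess": 1.1},
    }
    # F1 is derived from precision/recall; the speed entries become EvalMetric rows.
    return parse_ultralytics_val(sample, model_id="yolov8n")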


# =============================================================================
# Generic CSV/JSON Adapter (Task 12.3.5)
# =============================================================================


def parse_generic_json(
    data: dict[str, Any],
    model_id: str = "",
    metric_mapping: dict[str, str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> GenericEvalResult:
    """
    Parse generic JSON evaluation results.

    Extracts numeric fields as metrics. User can provide mapping to rename fields.

    Args:
        data: Dictionary with metric values.
        model_id: Model identifier.
        metric_mapping: Optional dict to rename fields (json_key -> metric_name).
        higher_is_better: Optional dict specifying direction (metric_name -> bool).

    Returns:
        GenericEvalResult with extracted metrics.

    Example:
        >>> data = {"acc": 0.95, "loss": 0.12, "model": "resnet50"}
        >>> result = parse_generic_json(
        ...     data,
        ...     metric_mapping={"acc": "accuracy", "loss": "val_loss"},
        ...     higher_is_better={"accuracy": True, "val_loss": False}
        ... )
    """
    mapping = metric_mapping or {}
    better_map = higher_is_better or {}

    # Extract model_id from data if not provided
    if not model_id:
        model_id = str(data.get("model_id", data.get("model", data.get("name", "unknown"))))

    # Extract dataset
    dataset = str(data.get("dataset", data.get("data", "")))

    # Find all numeric fields
    metrics: dict[str, float] = {}
    for key, value in data.items():
        # Skip non-numeric and metadata fields
        if key in ("model_id", "model", "name", "dataset", "data", "timestamp", "metadata"):
            continue

        if isinstance(value, (int, float)) and not isinstance(value, bool):
            # Apply mapping if provided
            metric_name = mapping.get(key, key)
            metrics[metric_name] = float(value)

    # Build result
    return GenericEvalResult.create(
        model_id=model_id,
        dataset=dataset,
        metrics=metrics,
        higher_is_better=better_map,
    )


def load_generic_json(
    path: Path,
    model_id: str = "",
    metric_mapping: dict[str, str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> GenericEvalResult:
    """
    Load generic evaluation results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.
        metric_mapping: Optional dict to rename fields.
        higher_is_better: Optional dict specifying metric direction.

    Returns:
        GenericEvalResult with extracted metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_generic_json(data, model_id, metric_mapping, higher_is_better)


def parse_generic_csv(
    rows: list[dict[str, str]],
    model_id_column: str = "model",
    metric_columns: list[str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> list[GenericEvalResult]:
    """
    Parse generic CSV evaluation results.

    Each row becomes one EvalResult. Specify which columns are metrics.

    Args:
        rows: List of row dicts (from csv.DictReader).
        model_id_column: Column name containing model identifier.
        metric_columns: List of column names to treat as metrics (None = auto-detect numeric).
        higher_is_better: Dict specifying metric direction.

    Returns:
        List of GenericEvalResult, one per row.

    Example CSV:
        model,accuracy,f1,loss
        resnet50,0.95,0.94,0.12
        mobilenet,0.91,0.90,0.18

        >>> with open("results.csv") as f:
        ...     rows = list(csv.DictReader(f))
        >>> results = parse_generic_csv(rows, metric_columns=["accuracy", "f1", "loss"])
    """
    better_map = higher_is_better or {}
    results = []

    for row in rows:
        model_id = row.get(model_id_column, "unknown")

        # Extract metrics
        metrics: dict[str, float] = {}

        if metric_columns:
            # Use specified columns
            for col in metric_columns:
                if col in row:
                    try:
                        metrics[col] = float(row[col])
                    except (ValueError, TypeError):
                        pass  # Skip non-numeric
        else:
            # Auto-detect numeric columns
            for key, value in row.items():
                if key == model_id_column:
                    continue
                try:
                    metrics[key] = float(value)
                except (ValueError, TypeError):
                    pass  # Skip non-numeric

        result = GenericEvalResult.create(
            model_id=model_id,
            metrics=metrics,
            higher_is_better=better_map,
        )
        results.append(result)

    return results


def load_generic_csv(
    path: Path,
    model_id_column: str = "model",
    metric_columns: list[str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> list[GenericEvalResult]:
    """
    Load generic evaluation results from CSV file.

    Args:
        path: Path to CSV file.
        model_id_column: Column name containing model identifier.
        metric_columns: List of column names to treat as metrics.
        higher_is_better: Dict specifying metric direction.

    Returns:
        List of GenericEvalResult, one per row.
    """
    with open(path, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    return parse_generic_csv(rows, model_id_column, metric_columns, higher_is_better)
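

# Reviewer sketch (not part of the packaged wheel): round-tripping a small
# in-memory CSV through the generic adapter. The rows mirror the example
# table in the parse_generic_csv docstring; no file on disk is needed.
def _example_parse_generic_csv() -> list[GenericEvalResult]:
    import io

    text = "model,accuracy,f1,loss\nresnet50,0.95,0.94,0.12\nmobilenet,0.91,0.90,0.18\n"
    rows = list(csv.DictReader(io.StringIO(text)))
    # Without metric_columns, every numeric column is auto-detected.
    return parse_generic_csv(rows, higher_is_better={"accuracy": True, "loss": False})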


# =============================================================================
# HuggingFace Evaluate Adapter (Task 12.3.2)
# =============================================================================


def parse_hf_evaluate(
    data: dict[str, Any],
    model_id: str = "",
    task_type: str = "classification",
) -> ClassificationEvalResult | NLPEvalResult | GenericEvalResult:
    """
    Parse HuggingFace evaluate library output.

    HuggingFace evaluate returns a dict with metric names as keys.
    Common output formats:
    - Classification: {"accuracy": 0.95, "f1": 0.94, "precision": 0.93, "recall": 0.95}
    - NER: {"precision": 0.9, "recall": 0.88, "f1": 0.89, "accuracy": 0.95}
    - QA: {"exact_match": 80.5, "f1": 85.3}

    Args:
        data: Dictionary from evaluate.compute() or JSON output.
        model_id: Model identifier.
        task_type: One of "classification", "nlp", or "generic".

    Returns:
        Appropriate EvalResult subtype.

    Example:
        >>> import evaluate
        >>> metric = evaluate.load("accuracy")
        >>> result = metric.compute(predictions=[1,1,0], references=[1,0,0])
        >>> eval_result = parse_hf_evaluate(result, model_id="bert-base")
    """
    # Extract model_id from data if not provided
    if not model_id:
        model_id = str(data.get("model", data.get("model_id", "unknown")))

    dataset = str(data.get("dataset", data.get("data", "")))

    # Try to auto-detect task from metric names
    has_exact_match = "exact_match" in data or "em" in data
    has_bleu = "bleu" in data or "sacrebleu" in data
    has_rouge = any(k.startswith("rouge") for k in data.keys())

    if task_type == "nlp" or has_exact_match or has_bleu or has_rouge:
        # NLP task - determine specific task from metrics
        nlp_task = "qa" if has_exact_match else ("translation" if has_bleu else "classification")
        return NLPEvalResult.create(
            model_id=model_id,
            dataset=dataset,
            nlp_task=nlp_task,
            accuracy=data.get("accuracy"),
            f1=data.get("f1", data.get("f1_score")),
            exact_match=data.get("exact_match", data.get("em")),
            bleu=data.get("bleu", data.get("sacrebleu")),
        )

    elif task_type == "classification":
        # Classification task - default to 0.0 if not found
        top1 = data.get("accuracy", data.get("top1", data.get("top_1_accuracy", 0.0)))
        top5 = data.get("top5", data.get("top_5_accuracy", 0.0))
        return ClassificationEvalResult.create(
            model_id=model_id,
            dataset=dataset,
            top1_accuracy=float(top1) if top1 is not None else 0.0,
            top5_accuracy=float(top5) if top5 is not None else 0.0,
        )

    else:
        # Generic fallback
        return parse_generic_json(data, model_id)


def load_hf_evaluate(
    path: Path,
    model_id: str = "",
    task_type: str = "classification",
) -> ClassificationEvalResult | NLPEvalResult | GenericEvalResult:
    """
    Load HuggingFace evaluate results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.
        task_type: One of "classification", "nlp", or "generic".

    Returns:
        Appropriate EvalResult subtype.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_hf_evaluate(data, model_id, task_type)
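

# Reviewer sketch (not part of the packaged wheel): the QA-style dict below
# matches the "QA" shape listed in the parse_hf_evaluate docstring, so the
# adapter should route it to NLPEvalResult with nlp_task="qa". The model name
# and scores are illustrative.
def _example_parse_hf_evaluate() -> ClassificationEvalResult | NLPEvalResult | GenericEvalResult:
    squad_style = {"exact_match": 80.5, "f1": 85.3, "model": "bert-base-uncased"}
    return parse_hf_evaluate(squad_style, task_type="nlp")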


# =============================================================================
# lm-eval-harness Adapter (Task 12.3.3)
# =============================================================================


def parse_lm_eval(
    data: dict[str, Any],
    model_id: str = "",
) -> LLMEvalResult:
    """
    Parse lm-eval-harness (EleutherAI) output.

    lm-eval-harness outputs JSON with results per task/benchmark.
    Format varies but typically:
    {
        "results": {
            "hellaswag": {"acc": 0.7, "acc_norm": 0.75},
            "mmlu": {"acc": 0.65},
            "arc_easy": {"acc": 0.8, "acc_norm": 0.82}
        },
        "config": {"model": "llama-7b", ...}
    }

    Args:
        data: Dictionary from lm-eval JSON output.
        model_id: Model identifier (extracted from config if not provided).

    Returns:
        LLMEvalResult with benchmark scores.

    Example:
        >>> # After running: lm_eval --model hf --model_args ... --output_path results.json
        >>> result = load_lm_eval("results.json")
    """
    # Extract model_id from config
    if not model_id:
        config = data.get("config", {})
        model_id = str(
            config.get("model", config.get("model_args", {}).get("pretrained", "unknown"))
        )

    # Extract results - can be at top level or nested
    results = data.get("results", data)

    # Standard LLM benchmark scores
    def get_task_score(task_name: str, metric: str = "acc_norm") -> float | None:
        """Get score for a task, trying multiple metric names."""
        if task_name not in results:
            # Try case variations
            for key in results:
                if key.lower() == task_name.lower():
                    task_name = key
                    break
            else:
                return None

        task_data = results[task_name]
        if isinstance(task_data, dict):
            # Try acc_norm first, then acc, then the raw value
            for m in [metric, "acc_norm", "acc", "accuracy"]:
                if m in task_data:
                    val = task_data[m]
                    # Handle both raw scores (0-1) and percentages (0-100)
                    return float(val) * 100 if float(val) <= 1 else float(val)
        elif isinstance(task_data, (int, float)):
            val = float(task_data)
            return val * 100 if val <= 1 else val
        return None

    # Extract common benchmarks
    mmlu = get_task_score("mmlu") or get_task_score("mmlu_pro")
    hellaswag = get_task_score("hellaswag")
    truthfulqa = get_task_score("truthfulqa") or get_task_score("truthfulqa_mc")
    arc_easy = get_task_score("arc_easy")
    arc_challenge = get_task_score("arc_challenge")
    winogrande = get_task_score("winogrande")

    # Calculate average if we have benchmarks
    benchmark_scores: dict[str, float] = {}
    for name, score in [
        ("mmlu", mmlu),
        ("hellaswag", hellaswag),
        ("truthfulqa", truthfulqa),
        ("arc_easy", arc_easy),
        ("arc_challenge", arc_challenge),
        ("winogrande", winogrande),
    ]:
        if score is not None:
            benchmark_scores[name] = score

    # Add any other tasks not in our standard list
    for task_name, _task_data in results.items():
        if task_name.lower() not in [
            "mmlu",
            "mmlu_pro",
            "hellaswag",
            "truthfulqa",
            "truthfulqa_mc",
            "arc_easy",
            "arc_challenge",
            "winogrande",
        ]:
            score = get_task_score(task_name)
            if score is not None:
                benchmark_scores[task_name] = score

    # Try to get perplexity if available
    perplexity = None
    if "perplexity" in results:
        perplexity = results["perplexity"]
        if isinstance(perplexity, dict):
            perplexity = perplexity.get("word_perplexity", perplexity.get("perplexity"))

    return LLMEvalResult.create(
        model_id=model_id,
        perplexity=float(perplexity) if perplexity else None,
        mmlu=mmlu,
        hellaswag=hellaswag,
        truthfulqa=truthfulqa,
        benchmark_scores=benchmark_scores,
    )


def load_lm_eval(path: Path, model_id: str = "") -> LLMEvalResult:
    """
    Load lm-eval-harness results from JSON file.

    Args:
        path: Path to JSON file (lm-eval output).
        model_id: Optional model identifier.

    Returns:
        LLMEvalResult with benchmark scores.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_lm_eval(data, model_id)
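

# Reviewer sketch (not part of the packaged wheel): a trimmed-down payload in
# the shape shown in the parse_lm_eval docstring. Scores given as 0-1
# fractions are rescaled to percentages by get_task_score; the numbers and
# the model name are invented.
def _example_parse_lm_eval() -> LLMEvalResult:
    payload = {
        "results": {
            "hellaswag": {"acc": 0.70, "acc_norm": 0.75},
            "mmlu": {"acc": 0.65},
            "winogrande": {"acc": 0.72},
        },
        "config": {"model": "llama-7b"},
    }
    return parse_lm_eval(payload)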


# =============================================================================
# timm Adapter (Task 12.3.4)
# =============================================================================


def parse_timm_benchmark(
    data: dict[str, Any],
    model_id: str = "",
) -> ClassificationEvalResult:
    """
    Parse timm (PyTorch Image Models) benchmark output.

    timm's validate.py outputs JSON/CSV with classification metrics.
    Common fields:
    - top1: top-1 accuracy (%)
    - top5: top-5 accuracy (%)
    - model: model name
    - param_count: number of parameters

    Args:
        data: Dictionary from timm validation output.
        model_id: Model identifier.

    Returns:
        ClassificationEvalResult with accuracy metrics.

    Example:
        >>> # After: python validate.py --data imagenet --model resnet50 --results-file results.json
        >>> result = load_timm_benchmark("results.json")
    """
    # Extract model_id
    if not model_id:
        model_id = str(data.get("model", data.get("arch", "unknown")))

    # Dataset
    dataset = str(data.get("dataset", data.get("data", "imagenet")))

    # Extract accuracy metrics
    def get_accuracy(keys: list[str]) -> float | None:
        for key in keys:
            if key in data:
                val = data[key]
                if val is not None:
                    # timm outputs percentages (0-100)
                    return float(val)
        return None

    top1 = get_accuracy(["top1", "top1_acc", "accuracy", "acc1", "prec1"])
    top5 = get_accuracy(["top5", "top5_acc", "acc5", "prec5"])

    # Create result - default to 0.0 if not found
    result = ClassificationEvalResult.create(
        model_id=model_id,
        dataset=dataset,
        top1_accuracy=float(top1) if top1 is not None else 0.0,
        top5_accuracy=float(top5) if top5 is not None else 0.0,
    )

    # Add extra metrics if available
    if "param_count" in data:
        result.metrics.append(
            EvalMetric(
                name="param_count",
                value=float(data["param_count"]),
                unit="params",
                higher_is_better=False,
                category="size",
            )
        )

    if "img_size" in data:
        result.metrics.append(
            EvalMetric(
                name="img_size",
                value=float(data["img_size"]),
                unit="px",
                higher_is_better=False,
                category="input",
            )
        )

    if "batch_size" in data:
        result.metrics.append(
            EvalMetric(
                name="batch_size",
                value=float(data["batch_size"]),
                unit="",
                higher_is_better=False,
                category="config",
            )
        )

    # Throughput/latency if available
    for key in ["samples_per_sec", "throughput", "samples_sec"]:
        if key in data:
            result.metrics.append(
                EvalMetric(
                    name="throughput",
                    value=float(data[key]),
                    unit="samples/sec",
                    higher_is_better=True,
                    category="speed",
                )
            )
            break

    for key in ["latency_ms", "inference_time"]:
        if key in data:
            result.metrics.append(
                EvalMetric(
                    name="latency_ms",
                    value=float(data[key]),
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
            break

    # Store raw data
    result.metadata["raw_timm"] = data

    return result


def load_timm_benchmark(path: Path, model_id: str = "") -> ClassificationEvalResult:
    """
    Load timm benchmark results from JSON file.

    Args:
        path: Path to JSON file (timm validate.py output).
        model_id: Optional model identifier.

    Returns:
        ClassificationEvalResult with accuracy metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Handle both single result and array of results
    if isinstance(data, list):
        if not data:
            return ClassificationEvalResult.create(
                model_id=model_id or "unknown",
                dataset="",
                top1_accuracy=0.0,
                top5_accuracy=0.0,
            )
        data = data[0]  # Take first result

    return parse_timm_benchmark(data, model_id)
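

# Reviewer sketch (not part of the packaged wheel): the field names follow
# the "Common fields" list in the parse_timm_benchmark docstring; the numbers
# are invented. Accuracies stay as percentages, matching timm's output.
def _example_parse_timm() -> ClassificationEvalResult:
    sample = {"model": "resnet50", "top1": 80.4, "top5": 95.1, "param_count": 25.6}
    # param_count is attached as an extra EvalMetric alongside the accuracies.
    return parse_timm_benchmark(sample)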


# =============================================================================
# Auto-detect Adapter (Task 12.3.6)
# =============================================================================


def detect_and_parse(path: Path, model_id: str = "") -> EvalResult | None:
    """
    Auto-detect file format and parse with appropriate adapter.

    Detection heuristics:
    - Ultralytics: Has mAP50/mAP50-95 fields (YOLO format)
    - lm-eval-harness: Has "results" dict with benchmark names (mmlu, hellaswag, etc.)
    - timm: Has "top1"/"top5" fields (image classification)
    - HuggingFace evaluate: Has standard metric names (accuracy, f1, precision, recall)
    - Generic: Fallback for any JSON/CSV with numeric fields

    Args:
        path: Path to eval results file.
        model_id: Optional model identifier.

    Returns:
        EvalResult or None if format not recognized.

    Example:
        >>> result = detect_and_parse(Path("yolo_val.json"))
        >>> print(result.task_type)  # "detection"
    """
    suffix = path.suffix.lower()

    if suffix == ".json":
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # Handle array of results (take first)
        if isinstance(data, list):
            if not data:
                return None
            data = data[0]

        # Check for Ultralytics signature (YOLO detection)
        ultralytics_keys = ["metrics/mAP50(B)", "box/mAP50", "mAP50", "map50", "mAP50-95"]
        if any(key in data or key in data.get("metrics", {}) for key in ultralytics_keys):
            return parse_ultralytics_val(data, model_id)

        # Check for lm-eval-harness signature (LLM benchmarks)
        lm_eval_tasks = ["mmlu", "hellaswag", "truthfulqa", "arc_easy", "winogrande"]
        if "results" in data:
            results = data["results"]
            if any(
                task in results or task.lower() in [k.lower() for k in results]
                for task in lm_eval_tasks
            ):
                return parse_lm_eval(data, model_id)
        # Also check if tasks are at top level
        if any(task in data or task.lower() in [k.lower() for k in data] for task in lm_eval_tasks):
            return parse_lm_eval(data, model_id)

        # Check for timm signature (image classification)
        timm_keys = ["top1", "top5", "top1_acc", "top5_acc", "prec1", "prec5"]
        if any(key in data for key in timm_keys):
            return parse_timm_benchmark(data, model_id)

        # Check for HuggingFace evaluate signature
        hf_keys = ["accuracy", "f1", "precision", "recall", "exact_match", "bleu", "rouge1"]
        if any(key in data for key in hf_keys):
            # Determine if NLP or classification based on keys
            nlp_keys = ["exact_match", "em", "bleu", "rouge1", "rouge2", "rougeL"]
            task_type = "nlp" if any(k in data for k in nlp_keys) else "classification"
            return parse_hf_evaluate(data, model_id, task_type)

        # Fall back to generic
        return parse_generic_json(data, model_id)

    elif suffix == ".csv":
        results = load_generic_csv(path)
        return results[0] if results else None

    return None
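

# Reviewer sketch (not part of the packaged wheel): writing a temporary JSON
# file and letting detect_and_parse pick the adapter. The timm-style keys
# should hit the "top1"/"top5" heuristic documented above; delete=False keeps
# the file around so it can be reopened by path (cleanup left to the caller).
def _example_detect_and_parse() -> EvalResult | None:
    import tempfile

    with tempfile.NamedTemporaryFile("w", suffix=".json", delete=False) as tmp:
        json.dump({"model": "resnet50", "top1": 80.4, "top5": 95.1}, tmp)
        tmp_path = Path(tmp.name)
    return detect_and_parse(tmp_path, model_id="resnet50")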