haoline 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. haoline/.streamlit/config.toml +10 -0
  2. haoline/__init__.py +248 -0
  3. haoline/analyzer.py +935 -0
  4. haoline/cli.py +2712 -0
  5. haoline/compare.py +811 -0
  6. haoline/compare_visualizations.py +1564 -0
  7. haoline/edge_analysis.py +525 -0
  8. haoline/eval/__init__.py +131 -0
  9. haoline/eval/adapters.py +844 -0
  10. haoline/eval/cli.py +390 -0
  11. haoline/eval/comparison.py +542 -0
  12. haoline/eval/deployment.py +633 -0
  13. haoline/eval/schemas.py +833 -0
  14. haoline/examples/__init__.py +15 -0
  15. haoline/examples/basic_inspection.py +74 -0
  16. haoline/examples/compare_models.py +117 -0
  17. haoline/examples/hardware_estimation.py +78 -0
  18. haoline/format_adapters.py +1001 -0
  19. haoline/formats/__init__.py +123 -0
  20. haoline/formats/coreml.py +250 -0
  21. haoline/formats/gguf.py +483 -0
  22. haoline/formats/openvino.py +255 -0
  23. haoline/formats/safetensors.py +273 -0
  24. haoline/formats/tflite.py +369 -0
  25. haoline/hardware.py +2307 -0
  26. haoline/hierarchical_graph.py +462 -0
  27. haoline/html_export.py +1573 -0
  28. haoline/layer_summary.py +769 -0
  29. haoline/llm_summarizer.py +465 -0
  30. haoline/op_icons.py +618 -0
  31. haoline/operational_profiling.py +1492 -0
  32. haoline/patterns.py +1116 -0
  33. haoline/pdf_generator.py +265 -0
  34. haoline/privacy.py +250 -0
  35. haoline/pydantic_models.py +241 -0
  36. haoline/report.py +1923 -0
  37. haoline/report_sections.py +539 -0
  38. haoline/risks.py +521 -0
  39. haoline/schema.py +523 -0
  40. haoline/streamlit_app.py +2024 -0
  41. haoline/tests/__init__.py +4 -0
  42. haoline/tests/conftest.py +123 -0
  43. haoline/tests/test_analyzer.py +868 -0
  44. haoline/tests/test_compare_visualizations.py +293 -0
  45. haoline/tests/test_edge_analysis.py +243 -0
  46. haoline/tests/test_eval.py +604 -0
  47. haoline/tests/test_format_adapters.py +460 -0
  48. haoline/tests/test_hardware.py +237 -0
  49. haoline/tests/test_hardware_recommender.py +90 -0
  50. haoline/tests/test_hierarchical_graph.py +326 -0
  51. haoline/tests/test_html_export.py +180 -0
  52. haoline/tests/test_layer_summary.py +428 -0
  53. haoline/tests/test_llm_patterns.py +540 -0
  54. haoline/tests/test_llm_summarizer.py +339 -0
  55. haoline/tests/test_patterns.py +774 -0
  56. haoline/tests/test_pytorch.py +327 -0
  57. haoline/tests/test_report.py +383 -0
  58. haoline/tests/test_risks.py +398 -0
  59. haoline/tests/test_schema.py +417 -0
  60. haoline/tests/test_tensorflow.py +380 -0
  61. haoline/tests/test_visualizations.py +316 -0
  62. haoline/universal_ir.py +856 -0
  63. haoline/visualizations.py +1086 -0
  64. haoline/visualize_yolo.py +44 -0
  65. haoline/web.py +110 -0
  66. haoline-0.3.0.dist-info/METADATA +471 -0
  67. haoline-0.3.0.dist-info/RECORD +70 -0
  68. haoline-0.3.0.dist-info/WHEEL +4 -0
  69. haoline-0.3.0.dist-info/entry_points.txt +5 -0
  70. haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,844 @@
"""
Eval Adapters

Parse evaluation results from external tools into HaoLine's schema.

Supported adapters:
- Ultralytics YOLO (detection): parse_ultralytics_val, load_ultralytics_json
- HuggingFace evaluate (classification/NLP): parse_hf_evaluate, load_hf_evaluate
- lm-eval-harness (LLM benchmarks): parse_lm_eval, load_lm_eval
- timm (image classification): parse_timm_benchmark, load_timm_benchmark
- Generic CSV/JSON: parse_generic_json, parse_generic_csv, load_generic_json, load_generic_csv

Auto-detection: detect_and_parse() tries to identify the format automatically.
"""

from __future__ import annotations

import csv
import json
from pathlib import Path
from typing import Any

from .schemas import (
    ClassificationEvalResult,
    DetectionEvalResult,
    EvalMetric,
    EvalResult,
    GenericEvalResult,
    LLMEvalResult,
    NLPEvalResult,
)


# =============================================================================
# Ultralytics YOLO Adapter (Task 12.3.1)
# =============================================================================
def parse_ultralytics_val(
    data: dict[str, Any],
    model_id: str = "",
) -> DetectionEvalResult:
    """
    Parse Ultralytics YOLO validation results.

    Ultralytics outputs validation metrics in various formats. This parser
    handles the JSON output from `yolo val` or results from `model.val()`.

    Expected fields (from results.results_dict or JSON):
    - metrics/mAP50(B): float
    - metrics/mAP50-95(B): float
    - metrics/precision(B): float
    - metrics/recall(B): float
    - fitness: float (optional)

    Args:
        data: Dictionary from YOLO validation output.
        model_id: Model identifier (defaults to extracting from data).

    Returns:
        DetectionEvalResult with parsed metrics.
    """

    # Try different key formats (Ultralytics uses inconsistent naming)
    def get_metric(keys: list[str], default: float = 0.0) -> float:
        for key in keys:
            if key in data:
                val = data[key]
                return float(val) if val is not None else default
            # Check nested metrics dict
            if "metrics" in data and key in data["metrics"]:
                val = data["metrics"][key]
                return float(val) if val is not None else default
        return default

    # Extract metrics with various key formats
    map50 = get_metric(
        [
            "metrics/mAP50(B)",
            "mAP50",
            "map50",
            "mAP@50",
            "box/mAP50",
        ]
    )
    map50_95 = get_metric(
        [
            "metrics/mAP50-95(B)",
            "mAP50-95",
            "map50_95",
            "mAP@50:95",
            "box/mAP50-95",
            "map",
        ]
    )
    precision = get_metric(
        [
            "metrics/precision(B)",
            "precision",
            "box/precision",
            "p",
        ]
    )
    recall = get_metric(
        [
            "metrics/recall(B)",
            "recall",
            "box/recall",
            "r",
        ]
    )

    # Calculate F1 if not provided
    f1 = get_metric(["f1", "box/f1"])
    if f1 == 0.0 and precision > 0 and recall > 0:
        f1 = 2 * (precision * recall) / (precision + recall)

    # Extract model ID
    if not model_id:
        model_id = data.get("model", data.get("name", "unknown"))

    # Extract dataset
    dataset = data.get("data", data.get("dataset", ""))
    if isinstance(dataset, dict):
        dataset = dataset.get("path", dataset.get("name", ""))

    # Per-class metrics if available
    class_metrics: dict[str, dict[str, float]] = {}
    if "per_class" in data:
        for cls_name, cls_data in data["per_class"].items():
            class_metrics[cls_name] = {
                "precision": cls_data.get("precision", 0.0),
                "recall": cls_data.get("recall", 0.0),
                "ap50": cls_data.get("ap50", cls_data.get("mAP50", 0.0)),
            }

    # Build the result
    result = DetectionEvalResult.create(
        model_id=str(model_id),
        dataset=str(dataset),
        map50=map50,
        map50_95=map50_95,
        precision=precision,
        recall=recall,
        f1=f1,
        class_metrics=class_metrics,
    )

    # Add extra metrics from metadata
    speed = data.get("speed", {})
    if speed:
        if "inference" in speed:
            result.metrics.append(
                EvalMetric(
                    name="inference_ms",
                    value=speed["inference"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
        if "preprocess" in speed:
            result.metrics.append(
                EvalMetric(
                    name="preprocess_ms",
                    value=speed["preprocess"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
        if "postprocess" in speed:
            result.metrics.append(
                EvalMetric(
                    name="postprocess_ms",
                    value=speed["postprocess"],
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )

    # Store raw data in metadata
    result.metadata["raw_ultralytics"] = data

    return result


def load_ultralytics_json(path: Path, model_id: str = "") -> DetectionEvalResult:
    """
    Load Ultralytics validation results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.

    Returns:
        DetectionEvalResult with parsed metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_ultralytics_val(data, model_id)
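# Illustrative usage sketch (not part of the original module; the JSON path and
# model name below are hypothetical). Shows how the Ultralytics adapter is
# typically driven from a saved `yolo val` export or an in-memory results dict.
def _example_ultralytics_usage() -> None:  # pragma: no cover - illustration only
    # From a saved JSON export:
    result = load_ultralytics_json(Path("val_results.json"), model_id="yolov8n")
    for metric in result.metrics:
        print(metric.name, metric.value, metric.unit)

    # Or from an in-memory dict such as model.val().results_dict:
    result = parse_ultralytics_val(
        {
            "metrics/mAP50(B)": 0.52,
            "metrics/mAP50-95(B)": 0.37,
            "metrics/precision(B)": 0.61,
            "metrics/recall(B)": 0.48,
        },
        model_id="yolov8n",
    )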
# =============================================================================
# Generic CSV/JSON Adapter (Task 12.3.5)
# =============================================================================


def parse_generic_json(
    data: dict[str, Any],
    model_id: str = "",
    metric_mapping: dict[str, str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> GenericEvalResult:
    """
    Parse generic JSON evaluation results.

    Extracts numeric fields as metrics. User can provide mapping to rename fields.

    Args:
        data: Dictionary with metric values.
        model_id: Model identifier.
        metric_mapping: Optional dict to rename fields (json_key -> metric_name).
        higher_is_better: Optional dict specifying direction (metric_name -> bool).

    Returns:
        GenericEvalResult with extracted metrics.

    Example:
        >>> data = {"acc": 0.95, "loss": 0.12, "model": "resnet50"}
        >>> result = parse_generic_json(
        ...     data,
        ...     metric_mapping={"acc": "accuracy", "loss": "val_loss"},
        ...     higher_is_better={"accuracy": True, "val_loss": False}
        ... )
    """
    mapping = metric_mapping or {}
    better_map = higher_is_better or {}

    # Extract model_id from data if not provided
    if not model_id:
        model_id = str(data.get("model_id", data.get("model", data.get("name", "unknown"))))

    # Extract dataset
    dataset = str(data.get("dataset", data.get("data", "")))

    # Find all numeric fields
    metrics: dict[str, float] = {}
    for key, value in data.items():
        # Skip non-numeric and metadata fields
        if key in ("model_id", "model", "name", "dataset", "data", "timestamp", "metadata"):
            continue

        if isinstance(value, (int, float)) and not isinstance(value, bool):
            # Apply mapping if provided
            metric_name = mapping.get(key, key)
            metrics[metric_name] = float(value)

    # Build result
    return GenericEvalResult.create(
        model_id=model_id,
        dataset=dataset,
        metrics=metrics,
        higher_is_better=better_map,
    )


def load_generic_json(
    path: Path,
    model_id: str = "",
    metric_mapping: dict[str, str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> GenericEvalResult:
    """
    Load generic evaluation results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.
        metric_mapping: Optional dict to rename fields.
        higher_is_better: Optional dict specifying metric direction.

    Returns:
        GenericEvalResult with extracted metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_generic_json(data, model_id, metric_mapping, higher_is_better)


def parse_generic_csv(
    rows: list[dict[str, str]],
    model_id_column: str = "model",
    metric_columns: list[str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> list[GenericEvalResult]:
    """
    Parse generic CSV evaluation results.

    Each row becomes one EvalResult. Specify which columns are metrics.

    Args:
        rows: List of row dicts (from csv.DictReader).
        model_id_column: Column name containing model identifier.
        metric_columns: List of column names to treat as metrics (None = auto-detect numeric).
        higher_is_better: Dict specifying metric direction.

    Returns:
        List of GenericEvalResult, one per row.

    Example CSV:
        model,accuracy,f1,loss
        resnet50,0.95,0.94,0.12
        mobilenet,0.91,0.90,0.18

    >>> with open("results.csv") as f:
    ...     rows = list(csv.DictReader(f))
    >>> results = parse_generic_csv(rows, metric_columns=["accuracy", "f1", "loss"])
    """
    better_map = higher_is_better or {}
    results = []

    for row in rows:
        model_id = row.get(model_id_column, "unknown")

        # Extract metrics
        metrics: dict[str, float] = {}

        if metric_columns:
            # Use specified columns
            for col in metric_columns:
                if col in row:
                    try:
                        metrics[col] = float(row[col])
                    except (ValueError, TypeError):
                        pass  # Skip non-numeric
        else:
            # Auto-detect numeric columns
            for key, value in row.items():
                if key == model_id_column:
                    continue
                try:
                    metrics[key] = float(value)
                except (ValueError, TypeError):
                    pass  # Skip non-numeric

        result = GenericEvalResult.create(
            model_id=model_id,
            metrics=metrics,
            higher_is_better=better_map,
        )
        results.append(result)

    return results


def load_generic_csv(
    path: Path,
    model_id_column: str = "model",
    metric_columns: list[str] | None = None,
    higher_is_better: dict[str, bool] | None = None,
) -> list[GenericEvalResult]:
    """
    Load generic evaluation results from CSV file.

    Args:
        path: Path to CSV file.
        model_id_column: Column name containing model identifier.
        metric_columns: List of column names to treat as metrics.
        higher_is_better: Dict specifying metric direction.

    Returns:
        List of GenericEvalResult, one per row.
    """
    with open(path, encoding="utf-8", newline="") as f:
        reader = csv.DictReader(f)
        rows = list(reader)
    return parse_generic_csv(rows, model_id_column, metric_columns, higher_is_better)
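# Illustrative usage sketch (file names are hypothetical, not from the original
# module). The generic adapters accept any flat JSON dict or CSV table;
# metric_mapping renames fields and higher_is_better records each metric's
# direction for later comparisons.
def _example_generic_usage() -> None:  # pragma: no cover - illustration only
    # One result per JSON file:
    single = load_generic_json(
        Path("eval.json"),
        metric_mapping={"acc": "accuracy"},
        higher_is_better={"accuracy": True, "loss": False},
    )
    print(single.model_id, single.metrics)

    # One result per CSV row:
    per_row = load_generic_csv(Path("sweep.csv"), metric_columns=["accuracy", "f1", "loss"])
    print(len(per_row), "models parsed")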
# =============================================================================
# HuggingFace Evaluate Adapter (Task 12.3.2)
# =============================================================================


def parse_hf_evaluate(
    data: dict[str, Any],
    model_id: str = "",
    task_type: str = "classification",
) -> ClassificationEvalResult | NLPEvalResult | GenericEvalResult:
    """
    Parse HuggingFace evaluate library output.

    HuggingFace evaluate returns a dict with metric names as keys.
    Common output formats:
    - Classification: {"accuracy": 0.95, "f1": 0.94, "precision": 0.93, "recall": 0.95}
    - NER: {"precision": 0.9, "recall": 0.88, "f1": 0.89, "accuracy": 0.95}
    - QA: {"exact_match": 80.5, "f1": 85.3}

    Args:
        data: Dictionary from evaluate.compute() or JSON output.
        model_id: Model identifier.
        task_type: One of "classification", "nlp", or "generic".

    Returns:
        Appropriate EvalResult subtype.

    Example:
        >>> import evaluate
        >>> metric = evaluate.load("accuracy")
        >>> result = metric.compute(predictions=[1,1,0], references=[1,0,0])
        >>> eval_result = parse_hf_evaluate(result, model_id="bert-base")
    """
    # Extract model_id from data if not provided
    if not model_id:
        model_id = str(data.get("model", data.get("model_id", "unknown")))

    dataset = str(data.get("dataset", data.get("data", "")))

    # Try to auto-detect task from metric names
    has_exact_match = "exact_match" in data or "em" in data
    has_bleu = "bleu" in data or "sacrebleu" in data
    has_rouge = any(k.startswith("rouge") for k in data.keys())

    if task_type == "nlp" or has_exact_match or has_bleu or has_rouge:
        # NLP task - determine specific task from metrics
        nlp_task = "qa" if has_exact_match else ("translation" if has_bleu else "classification")
        return NLPEvalResult.create(
            model_id=model_id,
            dataset=dataset,
            nlp_task=nlp_task,
            accuracy=data.get("accuracy"),
            f1=data.get("f1", data.get("f1_score")),
            exact_match=data.get("exact_match", data.get("em")),
            bleu=data.get("bleu", data.get("sacrebleu")),
        )

    elif task_type == "classification":
        # Classification task - default to 0.0 if not found
        top1 = data.get("accuracy", data.get("top1", data.get("top_1_accuracy", 0.0)))
        top5 = data.get("top5", data.get("top_5_accuracy", 0.0))
        return ClassificationEvalResult.create(
            model_id=model_id,
            dataset=dataset,
            top1_accuracy=float(top1) if top1 is not None else 0.0,
            top5_accuracy=float(top5) if top5 is not None else 0.0,
        )

    else:
        # Generic fallback
        return parse_generic_json(data, model_id)


def load_hf_evaluate(
    path: Path,
    model_id: str = "",
    task_type: str = "classification",
) -> ClassificationEvalResult | NLPEvalResult | GenericEvalResult:
    """
    Load HuggingFace evaluate results from JSON file.

    Args:
        path: Path to JSON file.
        model_id: Optional model identifier.
        task_type: One of "classification", "nlp", or "generic".

    Returns:
        Appropriate EvalResult subtype.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_hf_evaluate(data, model_id, task_type)
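# Illustrative routing sketch (metric values are made up). The same parser
# returns an NLPEvalResult when QA/translation-style metrics are present and a
# ClassificationEvalResult otherwise.
def _example_hf_evaluate_usage() -> None:  # pragma: no cover - illustration only
    qa = parse_hf_evaluate({"exact_match": 80.5, "f1": 85.3}, model_id="bert-base", task_type="nlp")
    cls = parse_hf_evaluate({"accuracy": 0.95}, model_id="resnet50", task_type="classification")
    print(type(qa).__name__, type(cls).__name__)  # NLPEvalResult, ClassificationEvalResult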
# =============================================================================
# lm-eval-harness Adapter (Task 12.3.3)
# =============================================================================


def parse_lm_eval(
    data: dict[str, Any],
    model_id: str = "",
) -> LLMEvalResult:
    """
    Parse lm-eval-harness (EleutherAI) output.

    lm-eval-harness outputs JSON with results per task/benchmark.
    Format varies but typically:
        {
            "results": {
                "hellaswag": {"acc": 0.7, "acc_norm": 0.75},
                "mmlu": {"acc": 0.65},
                "arc_easy": {"acc": 0.8, "acc_norm": 0.82}
            },
            "config": {"model": "llama-7b", ...}
        }

    Args:
        data: Dictionary from lm-eval JSON output.
        model_id: Model identifier (extracted from config if not provided).

    Returns:
        LLMEvalResult with benchmark scores.

    Example:
        >>> # After running: lm_eval --model hf --model_args ... --output_path results.json
        >>> result = load_lm_eval("results.json")
    """
    # Extract model_id from config
    if not model_id:
        config = data.get("config", {})
        model_id = str(
            config.get("model", config.get("model_args", {}).get("pretrained", "unknown"))
        )

    # Extract results - can be at top level or nested
    results = data.get("results", data)

    # Standard LLM benchmark scores
    def get_task_score(task_name: str, metric: str = "acc_norm") -> float | None:
        """Get score for a task, trying multiple metric names."""
        if task_name not in results:
            # Try case variations
            for key in results:
                if key.lower() == task_name.lower():
                    task_name = key
                    break
            else:
                return None

        task_data = results[task_name]
        if isinstance(task_data, dict):
            # Try acc_norm first, then acc, then the raw value
            for m in [metric, "acc_norm", "acc", "accuracy"]:
                if m in task_data:
                    val = task_data[m]
                    # Handle both raw scores (0-1) and percentages (0-100)
                    return float(val) * 100 if float(val) <= 1 else float(val)
        elif isinstance(task_data, (int, float)):
            val = float(task_data)
            return val * 100 if val <= 1 else val
        return None

    # Extract common benchmarks
    mmlu = get_task_score("mmlu") or get_task_score("mmlu_pro")
    hellaswag = get_task_score("hellaswag")
    truthfulqa = get_task_score("truthfulqa") or get_task_score("truthfulqa_mc")
    arc_easy = get_task_score("arc_easy")
    arc_challenge = get_task_score("arc_challenge")
    winogrande = get_task_score("winogrande")

    # Calculate average if we have benchmarks
    benchmark_scores: dict[str, float] = {}
    for name, score in [
        ("mmlu", mmlu),
        ("hellaswag", hellaswag),
        ("truthfulqa", truthfulqa),
        ("arc_easy", arc_easy),
        ("arc_challenge", arc_challenge),
        ("winogrande", winogrande),
    ]:
        if score is not None:
            benchmark_scores[name] = score

    # Add any other tasks not in our standard list
    for task_name, _task_data in results.items():
        if task_name.lower() not in [
            "mmlu",
            "mmlu_pro",
            "hellaswag",
            "truthfulqa",
            "truthfulqa_mc",
            "arc_easy",
            "arc_challenge",
            "winogrande",
        ]:
            score = get_task_score(task_name)
            if score is not None:
                benchmark_scores[task_name] = score

    # Try to get perplexity if available
    perplexity = None
    if "perplexity" in results:
        perplexity = results["perplexity"]
        if isinstance(perplexity, dict):
            perplexity = perplexity.get("word_perplexity", perplexity.get("perplexity"))

    return LLMEvalResult.create(
        model_id=model_id,
        perplexity=float(perplexity) if perplexity else None,
        mmlu=mmlu,
        hellaswag=hellaswag,
        truthfulqa=truthfulqa,
        benchmark_scores=benchmark_scores,
    )


def load_lm_eval(path: Path, model_id: str = "") -> LLMEvalResult:
    """
    Load lm-eval-harness results from JSON file.

    Args:
        path: Path to JSON file (lm-eval output).
        model_id: Optional model identifier.

    Returns:
        LLMEvalResult with benchmark scores.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    return parse_lm_eval(data, model_id)
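# Illustrative sketch (the output path is hypothetical). Scores from
# lm-eval-harness are normalized to a 0-100 scale by get_task_score, so
# benchmark_scores can be compared across runs that report either fractions
# or percentages.
def _example_lm_eval_usage() -> None:  # pragma: no cover - illustration only
    result = load_lm_eval(Path("lm_eval_results.json"))
    for task, score in result.benchmark_scores.items():
        print(f"{task}: {score:.1f}")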
# =============================================================================
# timm Adapter (Task 12.3.4)
# =============================================================================


def parse_timm_benchmark(
    data: dict[str, Any],
    model_id: str = "",
) -> ClassificationEvalResult:
    """
    Parse timm (PyTorch Image Models) benchmark output.

    timm's validate.py outputs JSON/CSV with classification metrics.
    Common fields:
    - top1: top-1 accuracy (%)
    - top5: top-5 accuracy (%)
    - model: model name
    - param_count: number of parameters

    Args:
        data: Dictionary from timm validation output.
        model_id: Model identifier.

    Returns:
        ClassificationEvalResult with accuracy metrics.

    Example:
        >>> # After: python validate.py --data imagenet --model resnet50 --results-file results.json
        >>> result = load_timm_benchmark("results.json")
    """
    # Extract model_id
    if not model_id:
        model_id = str(data.get("model", data.get("arch", "unknown")))

    # Dataset
    dataset = str(data.get("dataset", data.get("data", "imagenet")))

    # Extract accuracy metrics
    def get_accuracy(keys: list[str]) -> float | None:
        for key in keys:
            if key in data:
                val = data[key]
                if val is not None:
                    # timm outputs percentages (0-100)
                    return float(val)
        return None

    top1 = get_accuracy(["top1", "top1_acc", "accuracy", "acc1", "prec1"])
    top5 = get_accuracy(["top5", "top5_acc", "acc5", "prec5"])

    # Create result - default to 0.0 if not found
    result = ClassificationEvalResult.create(
        model_id=model_id,
        dataset=dataset,
        top1_accuracy=float(top1) if top1 is not None else 0.0,
        top5_accuracy=float(top5) if top5 is not None else 0.0,
    )

    # Add extra metrics if available
    if "param_count" in data:
        result.metrics.append(
            EvalMetric(
                name="param_count",
                value=float(data["param_count"]),
                unit="params",
                higher_is_better=False,
                category="size",
            )
        )

    if "img_size" in data:
        result.metrics.append(
            EvalMetric(
                name="img_size",
                value=float(data["img_size"]),
                unit="px",
                higher_is_better=False,
                category="input",
            )
        )

    if "batch_size" in data:
        result.metrics.append(
            EvalMetric(
                name="batch_size",
                value=float(data["batch_size"]),
                unit="",
                higher_is_better=False,
                category="config",
            )
        )

    # Throughput/latency if available
    for key in ["samples_per_sec", "throughput", "samples_sec"]:
        if key in data:
            result.metrics.append(
                EvalMetric(
                    name="throughput",
                    value=float(data[key]),
                    unit="samples/sec",
                    higher_is_better=True,
                    category="speed",
                )
            )
            break

    for key in ["latency_ms", "inference_time"]:
        if key in data:
            result.metrics.append(
                EvalMetric(
                    name="latency_ms",
                    value=float(data[key]),
                    unit="ms",
                    higher_is_better=False,
                    category="speed",
                )
            )
            break

    # Store raw data
    result.metadata["raw_timm"] = data

    return result


def load_timm_benchmark(path: Path, model_id: str = "") -> ClassificationEvalResult:
    """
    Load timm benchmark results from JSON file.

    Args:
        path: Path to JSON file (timm validate.py output).
        model_id: Optional model identifier.

    Returns:
        ClassificationEvalResult with accuracy metrics.
    """
    with open(path, encoding="utf-8") as f:
        data = json.load(f)

    # Handle both single result and array of results
    if isinstance(data, list):
        if not data:
            return ClassificationEvalResult.create(
                model_id=model_id or "unknown",
                dataset="",
                top1_accuracy=0.0,
                top5_accuracy=0.0,
            )
        data = data[0]  # Take first result

    return parse_timm_benchmark(data, model_id)
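# Illustrative sketch (the results file is hypothetical). timm's validate.py can
# emit either a single JSON object or a list of results; the loader takes the
# first entry, and extra fields land in result.metrics alongside the accuracies.
def _example_timm_usage() -> None:  # pragma: no cover - illustration only
    result = load_timm_benchmark(Path("timm_results.json"), model_id="resnet50")
    extra = {m.name: m.value for m in result.metrics}
    print(result.model_id, extra.get("throughput"), extra.get("param_count"))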
# =============================================================================
# Auto-detect Adapter (Task 12.3.6)
# =============================================================================


def detect_and_parse(path: Path, model_id: str = "") -> EvalResult | None:
    """
    Auto-detect file format and parse with appropriate adapter.

    Detection heuristics:
    - Ultralytics: Has mAP50/mAP50-95 fields (YOLO format)
    - lm-eval-harness: Has "results" dict with benchmark names (mmlu, hellaswag, etc.)
    - timm: Has "top1"/"top5" fields (image classification)
    - HuggingFace evaluate: Has standard metric names (accuracy, f1, precision, recall)
    - Generic: Fallback for any JSON/CSV with numeric fields

    Args:
        path: Path to eval results file.
        model_id: Optional model identifier.

    Returns:
        EvalResult or None if format not recognized.

    Example:
        >>> result = detect_and_parse(Path("yolo_val.json"))
        >>> print(result.task_type)  # "detection"
    """
    suffix = path.suffix.lower()

    if suffix == ".json":
        with open(path, encoding="utf-8") as f:
            data = json.load(f)

        # Handle array of results (take first)
        if isinstance(data, list):
            if not data:
                return None
            data = data[0]

        # Check for Ultralytics signature (YOLO detection)
        ultralytics_keys = ["metrics/mAP50(B)", "box/mAP50", "mAP50", "map50", "mAP50-95"]
        if any(key in data or key in data.get("metrics", {}) for key in ultralytics_keys):
            return parse_ultralytics_val(data, model_id)

        # Check for lm-eval-harness signature (LLM benchmarks)
        lm_eval_tasks = ["mmlu", "hellaswag", "truthfulqa", "arc_easy", "winogrande"]
        if "results" in data:
            results = data["results"]
            if any(
                task in results or task.lower() in [k.lower() for k in results]
                for task in lm_eval_tasks
            ):
                return parse_lm_eval(data, model_id)
        # Also check if tasks are at top level
        if any(task in data or task.lower() in [k.lower() for k in data] for task in lm_eval_tasks):
            return parse_lm_eval(data, model_id)

        # Check for timm signature (image classification)
        timm_keys = ["top1", "top5", "top1_acc", "top5_acc", "prec1", "prec5"]
        if any(key in data for key in timm_keys):
            return parse_timm_benchmark(data, model_id)

        # Check for HuggingFace evaluate signature
        hf_keys = ["accuracy", "f1", "precision", "recall", "exact_match", "bleu", "rouge1"]
        if any(key in data for key in hf_keys):
            # Determine if NLP or classification based on keys
            nlp_keys = ["exact_match", "em", "bleu", "rouge1", "rouge2", "rougeL"]
            task_type = "nlp" if any(k in data for k in nlp_keys) else "classification"
            return parse_hf_evaluate(data, model_id, task_type)

        # Fall back to generic
        return parse_generic_json(data, model_id)

    elif suffix == ".csv":
        results = load_generic_csv(path)
        return results[0] if results else None

    return None
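# Illustrative sketch (the directory name is hypothetical). detect_and_parse
# lets callers ingest a mixed folder of eval outputs without knowing which tool
# produced each file; unrecognized formats come back as None and can be skipped.
def _example_detect_and_parse_usage() -> None:  # pragma: no cover - illustration only
    results = []
    for path in sorted(Path("eval_outputs").glob("*")):
        if path.suffix.lower() in (".json", ".csv"):
            parsed = detect_and_parse(path)
            if parsed is not None:
                results.append(parsed)
    print(f"parsed {len(results)} eval result file(s)")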