haoline 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haoline/.streamlit/config.toml +10 -0
- haoline/__init__.py +248 -0
- haoline/analyzer.py +935 -0
- haoline/cli.py +2712 -0
- haoline/compare.py +811 -0
- haoline/compare_visualizations.py +1564 -0
- haoline/edge_analysis.py +525 -0
- haoline/eval/__init__.py +131 -0
- haoline/eval/adapters.py +844 -0
- haoline/eval/cli.py +390 -0
- haoline/eval/comparison.py +542 -0
- haoline/eval/deployment.py +633 -0
- haoline/eval/schemas.py +833 -0
- haoline/examples/__init__.py +15 -0
- haoline/examples/basic_inspection.py +74 -0
- haoline/examples/compare_models.py +117 -0
- haoline/examples/hardware_estimation.py +78 -0
- haoline/format_adapters.py +1001 -0
- haoline/formats/__init__.py +123 -0
- haoline/formats/coreml.py +250 -0
- haoline/formats/gguf.py +483 -0
- haoline/formats/openvino.py +255 -0
- haoline/formats/safetensors.py +273 -0
- haoline/formats/tflite.py +369 -0
- haoline/hardware.py +2307 -0
- haoline/hierarchical_graph.py +462 -0
- haoline/html_export.py +1573 -0
- haoline/layer_summary.py +769 -0
- haoline/llm_summarizer.py +465 -0
- haoline/op_icons.py +618 -0
- haoline/operational_profiling.py +1492 -0
- haoline/patterns.py +1116 -0
- haoline/pdf_generator.py +265 -0
- haoline/privacy.py +250 -0
- haoline/pydantic_models.py +241 -0
- haoline/report.py +1923 -0
- haoline/report_sections.py +539 -0
- haoline/risks.py +521 -0
- haoline/schema.py +523 -0
- haoline/streamlit_app.py +2024 -0
- haoline/tests/__init__.py +4 -0
- haoline/tests/conftest.py +123 -0
- haoline/tests/test_analyzer.py +868 -0
- haoline/tests/test_compare_visualizations.py +293 -0
- haoline/tests/test_edge_analysis.py +243 -0
- haoline/tests/test_eval.py +604 -0
- haoline/tests/test_format_adapters.py +460 -0
- haoline/tests/test_hardware.py +237 -0
- haoline/tests/test_hardware_recommender.py +90 -0
- haoline/tests/test_hierarchical_graph.py +326 -0
- haoline/tests/test_html_export.py +180 -0
- haoline/tests/test_layer_summary.py +428 -0
- haoline/tests/test_llm_patterns.py +540 -0
- haoline/tests/test_llm_summarizer.py +339 -0
- haoline/tests/test_patterns.py +774 -0
- haoline/tests/test_pytorch.py +327 -0
- haoline/tests/test_report.py +383 -0
- haoline/tests/test_risks.py +398 -0
- haoline/tests/test_schema.py +417 -0
- haoline/tests/test_tensorflow.py +380 -0
- haoline/tests/test_visualizations.py +316 -0
- haoline/universal_ir.py +856 -0
- haoline/visualizations.py +1086 -0
- haoline/visualize_yolo.py +44 -0
- haoline/web.py +110 -0
- haoline-0.3.0.dist-info/METADATA +471 -0
- haoline-0.3.0.dist-info/RECORD +70 -0
- haoline-0.3.0.dist-info/WHEEL +4 -0
- haoline-0.3.0.dist-info/entry_points.txt +5 -0
- haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/eval/schemas.py
ADDED
@@ -0,0 +1,833 @@
"""
Eval Result Schemas (Pydantic v2)

Task-agnostic and task-specific schemas for importing evaluation results
from external tools like Ultralytics, HuggingFace evaluate, lm-eval, etc.

All schemas use Pydantic for validation, serialization, and JSON Schema generation.
"""

from __future__ import annotations

from datetime import datetime
from enum import Enum
from typing import Annotated, Any

from pydantic import BaseModel, Field


class TaskType(str, Enum):
    """Supported evaluation task types."""

    detection = "detection"
    classification = "classification"
    nlp = "nlp"
    llm = "llm"
    segmentation = "segmentation"
    generic = "generic"


class EvalMetric(BaseModel):
    """A single evaluation metric."""

    name: Annotated[str, Field(description="Metric name, e.g., 'mAP@50', 'top1_accuracy'")]
    value: Annotated[float, Field(description="The metric value")]
    unit: Annotated[str, Field(default="", description="Unit, e.g., '%', 'ms', '' (dimensionless)")]
    higher_is_better: Annotated[
        bool, Field(default=True, description="Whether higher values are better")
    ]
    category: Annotated[
        str, Field(default="", description="Category, e.g., 'accuracy', 'speed', 'size'")
    ]


class EvalResult(BaseModel):
    """
    Base class for evaluation results.

    Task-agnostic fields that all eval results share.
    """

    model_id: Annotated[str, Field(description="Identifier for the model (path, name, or hash)")]
    task_type: Annotated[str, Field(description="Task type: detection, classification, etc.")]
    timestamp: Annotated[str, Field(default="", description="ISO format timestamp of eval run")] = (
        ""
    )
    dataset: Annotated[str, Field(default="", description="Dataset used for evaluation")] = ""
    metrics: Annotated[
        list[EvalMetric], Field(default_factory=list, description="List of evaluation metrics")
    ]
    metadata: Annotated[
        dict[str, Any], Field(default_factory=dict, description="Tool-specific extras")
    ]

    def model_post_init(self, __context: Any) -> None:
        """Set timestamp if not provided."""
        if not self.timestamp:
            object.__setattr__(self, "timestamp", datetime.now().isoformat())

    def get_metric(self, name: str) -> EvalMetric | None:
        """Get a metric by name."""
        for m in self.metrics:
            if m.name == name:
                return m
        return None

    def get_metric_value(self, name: str, default: float = 0.0) -> float:
        """Get a metric value by name, with default."""
        m = self.get_metric(name)
        return m.value if m else default

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON string."""
        result: str = self.model_dump_json(indent=indent)
        return result

    @classmethod
    def from_json(cls, json_str: str) -> EvalResult:
        """Deserialize from JSON string."""
        result: EvalResult = cls.model_validate_json(json_str)
        return result


# =============================================================================
# Task-Specific Schemas
# =============================================================================


class DetectionEvalResult(EvalResult):
    """
    Object detection evaluation results.

    Standard metrics: mAP@50, mAP@50:95, precision, recall, F1 per class.
    Compatible with: Ultralytics YOLO, Detectron2, MMDetection
    """

    task_type: str = "detection"

    # Per-class metrics
    class_metrics: Annotated[
        dict[str, dict[str, float]],
        Field(
            default_factory=dict,
            description="Per-class metrics, e.g., {'person': {'precision': 0.92}}",
        ),
    ]

    # IoU thresholds used
    iou_thresholds: Annotated[
        list[float], Field(default_factory=lambda: [0.5, 0.75], description="IoU thresholds")
    ]

    # Confidence threshold
    confidence_threshold: Annotated[float, Field(default=0.5, description="Confidence threshold")]

    @classmethod
    def create(
        cls,
        model_id: str,
        dataset: str,
        map50: float,
        map50_95: float,
        precision: float,
        recall: float,
        f1: float,
        class_metrics: dict[str, dict[str, float]] | None = None,
        **kwargs: Any,
    ) -> DetectionEvalResult:
        """Convenience constructor with standard detection metrics."""
        metrics = [
            EvalMetric(
                name="mAP@50", value=map50, unit="%", higher_is_better=True, category="accuracy"
            ),
            EvalMetric(
                name="mAP@50:95",
                value=map50_95,
                unit="%",
                higher_is_better=True,
                category="accuracy",
            ),
            EvalMetric(
                name="precision",
                value=precision,
                unit="%",
                higher_is_better=True,
                category="accuracy",
            ),
            EvalMetric(
                name="recall", value=recall, unit="%", higher_is_better=True, category="accuracy"
            ),
            EvalMetric(name="f1", value=f1, unit="%", higher_is_better=True, category="accuracy"),
        ]
        return cls(
            model_id=model_id,
            dataset=dataset,
            metrics=metrics,
            class_metrics=class_metrics or {},
            **kwargs,
        )


class ClassificationEvalResult(EvalResult):
    """
    Image/text classification evaluation results.

    Standard metrics: top-1 accuracy, top-5 accuracy, per-class accuracy.
    Compatible with: timm, torchvision, HuggingFace
    """

    task_type: str = "classification"

    # Per-class accuracy
    class_accuracy: Annotated[
        dict[str, float],
        Field(default_factory=dict, description="Per-class accuracy"),
    ]

    # Confusion matrix (optional)
    confusion_matrix: Annotated[
        list[list[int]] | None,
        Field(default=None, description="Confusion matrix"),
    ]
    class_names: Annotated[
        list[str], Field(default_factory=list, description="Class names for confusion matrix")
    ]

    @classmethod
    def create(
        cls,
        model_id: str,
        dataset: str,
        top1_accuracy: float,
        top5_accuracy: float,
        class_accuracy: dict[str, float] | None = None,
        **kwargs: Any,
    ) -> ClassificationEvalResult:
        """Convenience constructor with standard classification metrics."""
        metrics = [
            EvalMetric(
                name="top1_accuracy",
                value=top1_accuracy,
                unit="%",
                higher_is_better=True,
                category="accuracy",
            ),
            EvalMetric(
                name="top5_accuracy",
                value=top5_accuracy,
                unit="%",
                higher_is_better=True,
                category="accuracy",
            ),
        ]
        return cls(
            model_id=model_id,
            dataset=dataset,
            metrics=metrics,
            class_accuracy=class_accuracy or {},
            **kwargs,
        )


class NLPEvalResult(EvalResult):
    """
    NLP task evaluation results.

    Standard metrics: accuracy, F1, exact match, BLEU, ROUGE.
    Compatible with: HuggingFace evaluate, SacreBLEU
    """

    task_type: str = "nlp"

    # Task subtype
    nlp_task: Annotated[
        str,
        Field(
            default="",
            description="NLP task: classification, ner, qa, translation, summarization",
        ),
    ] = ""

    @classmethod
    def create(
        cls,
        model_id: str,
        dataset: str,
        nlp_task: str,
        accuracy: float | None = None,
        f1: float | None = None,
        exact_match: float | None = None,
        bleu: float | None = None,
        rouge_l: float | None = None,
        **kwargs: Any,
    ) -> NLPEvalResult:
        """Convenience constructor with standard NLP metrics."""
        metrics = []
        if accuracy is not None:
            metrics.append(
                EvalMetric(
                    name="accuracy",
                    value=accuracy,
                    unit="%",
                    higher_is_better=True,
                    category="accuracy",
                )
            )
        if f1 is not None:
            metrics.append(
                EvalMetric(
                    name="f1", value=f1, unit="%", higher_is_better=True, category="accuracy"
                )
            )
        if exact_match is not None:
            metrics.append(
                EvalMetric(
                    name="exact_match",
                    value=exact_match,
                    unit="%",
                    higher_is_better=True,
                    category="accuracy",
                )
            )
        if bleu is not None:
            metrics.append(
                EvalMetric(
                    name="bleu", value=bleu, unit="", higher_is_better=True, category="accuracy"
                )
            )
        if rouge_l is not None:
            metrics.append(
                EvalMetric(
                    name="rouge_l",
                    value=rouge_l,
                    unit="",
                    higher_is_better=True,
                    category="accuracy",
                )
            )

        return cls(
            model_id=model_id,
            dataset=dataset,
            metrics=metrics,
            nlp_task=nlp_task,
            **kwargs,
        )


class LLMEvalResult(EvalResult):
    """
    Large Language Model evaluation results.

    Standard metrics: perplexity, MMLU, HellaSwag, TruthfulQA, etc.
    Compatible with: lm-eval-harness, EleutherAI eval
    """

    task_type: str = "llm"

    # Benchmark scores (0-100 or 0-1 depending on benchmark)
    benchmark_scores: Annotated[
        dict[str, float],
        Field(
            default_factory=dict,
            description="Benchmark scores, e.g., {'mmlu': 0.72, 'hellaswag': 0.81}",
        ),
    ]

    @classmethod
    def create(
        cls,
        model_id: str,
        perplexity: float | None = None,
        mmlu: float | None = None,
        hellaswag: float | None = None,
        truthfulqa: float | None = None,
        arc_challenge: float | None = None,
        winogrande: float | None = None,
        **kwargs: Any,
    ) -> LLMEvalResult:
        """Convenience constructor with standard LLM benchmarks."""
        metrics = []
        benchmark_scores = {}

        if perplexity is not None:
            metrics.append(
                EvalMetric(
                    name="perplexity",
                    value=perplexity,
                    unit="",
                    higher_is_better=False,
                    category="accuracy",
                )
            )

        benchmarks = {
            "mmlu": mmlu,
            "hellaswag": hellaswag,
            "truthfulqa": truthfulqa,
            "arc_challenge": arc_challenge,
            "winogrande": winogrande,
        }

        for name, value in benchmarks.items():
            if value is not None:
                metrics.append(
                    EvalMetric(
                        name=name, value=value, unit="%", higher_is_better=True, category="accuracy"
                    )
                )
                benchmark_scores[name] = value

        return cls(
            model_id=model_id,
            dataset="multiple",
            metrics=metrics,
            benchmark_scores=benchmark_scores,
            **kwargs,
        )


class SegmentationEvalResult(EvalResult):
    """
    Semantic/instance segmentation evaluation results.

    Standard metrics: mIoU, dice coefficient, per-class IoU.
    Compatible with: MMSegmentation, Detectron2
    """

    task_type: str = "segmentation"

    # Per-class IoU
    class_iou: Annotated[
        dict[str, float],
        Field(default_factory=dict, description="Per-class IoU values"),
    ]

    # Segmentation type
    segmentation_type: Annotated[
        str,
        Field(default="semantic", description="Type: semantic, instance, or panoptic"),
    ] = "semantic"

    @classmethod
    def create(
        cls,
        model_id: str,
        dataset: str,
        miou: float,
        dice: float | None = None,
        class_iou: dict[str, float] | None = None,
        segmentation_type: str = "semantic",
        **kwargs: Any,
    ) -> SegmentationEvalResult:
        """Convenience constructor with standard segmentation metrics."""
        metrics = [
            EvalMetric(
                name="mIoU", value=miou, unit="%", higher_is_better=True, category="accuracy"
            ),
        ]
        if dice is not None:
            metrics.append(
                EvalMetric(
                    name="dice", value=dice, unit="%", higher_is_better=True, category="accuracy"
                )
            )

        return cls(
            model_id=model_id,
            dataset=dataset,
            metrics=metrics,
            class_iou=class_iou or {},
            segmentation_type=segmentation_type,
            **kwargs,
        )


class GenericEvalResult(EvalResult):
    """
    Generic evaluation results with user-defined metrics.

    Use this when no task-specific schema fits, or for custom evaluation tasks.
    """

    task_type: str = "generic"

    # User can specify what metrics mean
    metric_definitions: Annotated[
        dict[str, str],
        Field(
            default_factory=dict,
            description="Metric definitions, e.g., {'custom_score': 'Higher is better'}",
        ),
    ]

    @classmethod
    def create(
        cls,
        model_id: str,
        dataset: str = "",
        metrics: dict[str, float] | None = None,
        metric_definitions: dict[str, str] | None = None,
        higher_is_better: dict[str, bool] | None = None,
        **kwargs: Any,
    ) -> GenericEvalResult:
        """Convenience constructor for generic metrics."""
        metric_list = []
        higher_map = higher_is_better or {}

        for name, value in (metrics or {}).items():
            metric_list.append(
                EvalMetric(
                    name=name,
                    value=value,
                    unit="",
                    higher_is_better=higher_map.get(name, True),
                    category="custom",
                )
            )

        return cls(
            model_id=model_id,
            dataset=dataset,
            metrics=metric_list,
            metric_definitions=metric_definitions or {},
            **kwargs,
        )


# =============================================================================
# Combined Report (Architecture + Eval)
# =============================================================================


class CombinedReport(BaseModel):
    """
    Combines architecture analysis with evaluation results.

    Links an InspectionReport (model structure, FLOPs, params) with
    EvalResult (accuracy, speed benchmarks) for unified comparison.
    """

    model_id: Annotated[str, Field(description="Model identifier")]
    model_path: Annotated[str, Field(default="", description="Path to model file")]

    # Architecture analysis (from haoline inspect)
    architecture: Annotated[
        dict[str, Any],
        Field(
            default_factory=dict,
            description="Architecture summary: params_total, flops_total, etc.",
        ),
    ]

    # Evaluation results (from external tools)
    eval_results: Annotated[
        list[EvalResult],
        Field(default_factory=list, description="Evaluation results from external tools"),
    ]

    # Computed summaries
    primary_accuracy_metric: Annotated[
        str, Field(default="", description="Primary accuracy metric name")
    ] = ""
    primary_accuracy_value: Annotated[
        float, Field(default=0.0, description="Primary accuracy metric value")
    ] = 0.0

    # Hardware estimates (from haoline)
    hardware_profile: Annotated[str, Field(default="", description="Hardware profile name")] = ""
    latency_ms: Annotated[float, Field(default=0.0, description="Latency in milliseconds")] = 0.0
    throughput_fps: Annotated[
        float, Field(default=0.0, description="Throughput in frames per second")
    ] = 0.0

    # Deployment cost (if calculated)
    cost_per_day_usd: Annotated[
        float, Field(default=0.0, description="Estimated cost per day in USD")
    ] = 0.0
    cost_per_month_usd: Annotated[
        float, Field(default=0.0, description="Estimated cost per month in USD")
    ] = 0.0

    def add_eval_result(self, result: EvalResult) -> None:
        """Add an evaluation result."""
        self.eval_results.append(result)

    def get_eval_by_task(self, task_type: str) -> EvalResult | None:
        """Get eval result by task type."""
        for r in self.eval_results:
            if r.task_type == task_type:
                return r
        return None

    def get_all_metrics(self) -> list[EvalMetric]:
        """Get all metrics from all eval results."""
        metrics = []
        for r in self.eval_results:
            metrics.extend(r.metrics)
        return metrics

    def to_json(self, indent: int = 2) -> str:
        """Serialize to JSON string."""
        result: str = self.model_dump_json(indent=indent)
        return result

    @classmethod
    def from_inspection_report(
        cls,
        report: Any,  # InspectionReport
        model_path: str = "",
        eval_results: list[EvalResult] | None = None,
    ) -> CombinedReport:
        """
        Create from an InspectionReport.

        Args:
            report: InspectionReport from haoline.
            model_path: Path to the model file.
            eval_results: Optional list of eval results to attach.
        """
        from pathlib import Path

        # Extract key architecture metrics
        mem_bytes = 0
        if report.memory_estimates:
            mem_bytes = (
                report.memory_estimates.model_size_bytes
                + report.memory_estimates.peak_activation_bytes
            )

        arch_summary = {
            "params_total": (report.param_counts.total if report.param_counts else 0),
            "flops_total": (report.flop_counts.total if report.flop_counts else 0),
            "memory_bytes": mem_bytes,
            "model_size_bytes": (
                report.memory_estimates.model_size_bytes if report.memory_estimates else 0
            ),
            "peak_activation_bytes": (
                report.memory_estimates.peak_activation_bytes if report.memory_estimates else 0
            ),
            "architecture_type": report.architecture_type,
            "num_nodes": (report.graph_summary.num_nodes if report.graph_summary else 0),
        }

        # Hardware estimates if available
        hw_profile = ""
        latency = 0.0
        throughput = 0.0
        if report.hardware_estimates:
            hw_profile = report.hardware_profile.name if report.hardware_profile else ""
            latency = getattr(report.hardware_estimates, "latency_ms", 0.0)
            throughput = getattr(report.hardware_estimates, "throughput_samples_per_sec", 0.0)

        # Model ID: use filename stem or path
        model_id = ""
        if model_path:
            model_id = Path(model_path).stem
        elif report.metadata:
            model_id = Path(report.metadata.path).stem if report.metadata.path else ""

        # Set primary accuracy from first eval result
        primary_metric = ""
        primary_value = 0.0
        evals = eval_results or []
        if evals and evals[0].metrics:
            # Use first accuracy-type metric as primary
            for m in evals[0].metrics:
                if m.higher_is_better and m.category in ("accuracy", ""):
                    primary_metric = m.name
                    primary_value = m.value
                    break

        return cls(
            model_id=model_id,
            model_path=model_path or (report.metadata.path if report.metadata else ""),
            architecture=arch_summary,
            eval_results=evals,
            primary_accuracy_metric=primary_metric,
            primary_accuracy_value=primary_value,
            hardware_profile=hw_profile,
            latency_ms=latency,
            throughput_fps=throughput,
        )


# =============================================================================
# Model Linking Utilities (Task 12.4.1)
# =============================================================================


def compute_model_hash(model_path: str, algorithm: str = "sha256") -> str:
    """
    Compute a hash of a model file for unique identification.

    Args:
        model_path: Path to the model file.
        algorithm: Hash algorithm ("sha256", "md5", "sha1").

    Returns:
        Hex digest of the file hash.

    Example:
        >>> hash_id = compute_model_hash("model.onnx")
        >>> print(hash_id[:12])  # First 12 chars as short ID
        'a1b2c3d4e5f6'
    """
    import hashlib
    from pathlib import Path

    path = Path(model_path)
    if not path.exists():
        raise FileNotFoundError(f"Model file not found: {model_path}")

    hash_func = hashlib.new(algorithm)

    # Read in chunks to handle large files
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(8192), b""):
            hash_func.update(chunk)

    return hash_func.hexdigest()


def link_eval_to_model(
    model_path: str,
    eval_result: EvalResult,
    use_hash: bool = False,
) -> EvalResult:
    """
    Link an evaluation result to a model file.

    Updates the eval_result's model_id to match the model file identifier
    (either path or hash).

    Args:
        model_path: Path to the model file.
        eval_result: EvalResult to link.
        use_hash: If True, use file hash as model_id. If False, use filename.

    Returns:
        Updated EvalResult with linked model_id.

    Example:
        >>> eval_result = parse_ultralytics_val(data)
        >>> linked = link_eval_to_model("yolov8n.onnx", eval_result)
        >>> print(linked.model_id)  # 'yolov8n'
    """
    from pathlib import Path

    if use_hash:
        model_id = compute_model_hash(model_path)[:12]  # Short hash
    else:
        model_id = Path(model_path).stem

    # Update the eval result's model_id
    eval_result.model_id = model_id
    eval_result.metadata["linked_model_path"] = model_path

    return eval_result


def create_combined_report(
    model_path: str,
    eval_results: list[EvalResult] | None = None,
    inspection_report: Any = None,  # InspectionReport
    run_inspection: bool = True,
) -> CombinedReport:
    """
    Create a CombinedReport by linking model analysis with eval results.

    If inspection_report is not provided and run_inspection is True,
    runs haoline analysis on the model first.

    Args:
        model_path: Path to the model file.
        eval_results: List of evaluation results to attach.
        inspection_report: Pre-computed InspectionReport (optional).
        run_inspection: Whether to run inspection if not provided.

    Returns:
        CombinedReport combining architecture analysis and eval metrics.

    Example:
        >>> # Import eval, then combine with architecture analysis
        >>> eval_result = load_ultralytics_json("val_results.json")
        >>> combined = create_combined_report("yolov8n.onnx", [eval_result])
        >>> print(combined.architecture["params_total"])
        >>> print(combined.eval_results[0].metrics[0].value)
    """
    from pathlib import Path

    # Run inspection if needed
    if inspection_report is None and run_inspection:
        try:
            from haoline.report import ModelInspector

            inspector = ModelInspector()
            inspection_report = inspector.inspect(Path(model_path))
        except Exception as e:
            # Can't import or run haoline - create minimal combined report
            print(f"Warning: Could not run model inspection: {e}")
            return CombinedReport(
                model_id=Path(model_path).stem,
                model_path=model_path,
                architecture={},
                eval_results=eval_results or [],
            )

    # Link eval results to model
    linked_evals: list[EvalResult] = []
    if eval_results:
        for er in eval_results:
            linked = link_eval_to_model(model_path, er)
            linked_evals.append(linked)

    # Create combined report
    if inspection_report:
        return CombinedReport.from_inspection_report(
            inspection_report,
            model_path=model_path,
            eval_results=linked_evals,
        )
    else:
        return CombinedReport(
            model_id=Path(model_path).stem,
            model_path=model_path,
            architecture={},
            eval_results=linked_evals,
        )


# =============================================================================
# Schema Generation and Validation
# =============================================================================


def get_eval_schema() -> dict[str, Any]:
    """Get JSON Schema for EvalResult."""
    schema: dict[str, Any] = EvalResult.model_json_schema()
    return schema


def get_combined_report_schema() -> dict[str, Any]:
    """Get JSON Schema for CombinedReport."""
    schema: dict[str, Any] = CombinedReport.model_json_schema()
    return schema


def validate_eval_result(data: dict[str, Any]) -> bool:
    """
    Validate eval result data using Pydantic.

    Returns True if valid, False otherwise.
    """
    try:
        EvalResult.model_validate(data)
        return True
    except Exception:
        return False


def is_valid_task_type(task_type: str) -> bool:
    """Check if a task type is valid."""
    return task_type in [t.value for t in TaskType]
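
Usage sketch (illustrative only, not part of the wheel contents): the snippet below exercises the schemas defined in this file. The import path follows the module location shown above; the model path "yolov8n.onnx" and all metric values are made-up placeholders.

from haoline.eval.schemas import (
    DetectionEvalResult,
    link_eval_to_model,
    validate_eval_result,
)

# Build a detection eval result via the convenience constructor
# (placeholder numbers, not real benchmark results).
result = DetectionEvalResult.create(
    model_id="yolov8n",
    dataset="coco128",
    map50=52.3,
    map50_95=37.1,
    precision=64.0,
    recall=58.5,
    f1=61.1,
)

# Re-key the result to a model file; the default uses the filename stem,
# while use_hash=True hashes the file and requires it to exist on disk.
linked = link_eval_to_model("yolov8n.onnx", result)

# Round-trip through JSON and check the payload against the base schema.
payload = linked.to_json()
assert validate_eval_result(linked.model_dump())
print(linked.model_id, linked.get_metric_value("mAP@50"))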