haoline-0.3.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haoline/.streamlit/config.toml +10 -0
- haoline/__init__.py +248 -0
- haoline/analyzer.py +935 -0
- haoline/cli.py +2712 -0
- haoline/compare.py +811 -0
- haoline/compare_visualizations.py +1564 -0
- haoline/edge_analysis.py +525 -0
- haoline/eval/__init__.py +131 -0
- haoline/eval/adapters.py +844 -0
- haoline/eval/cli.py +390 -0
- haoline/eval/comparison.py +542 -0
- haoline/eval/deployment.py +633 -0
- haoline/eval/schemas.py +833 -0
- haoline/examples/__init__.py +15 -0
- haoline/examples/basic_inspection.py +74 -0
- haoline/examples/compare_models.py +117 -0
- haoline/examples/hardware_estimation.py +78 -0
- haoline/format_adapters.py +1001 -0
- haoline/formats/__init__.py +123 -0
- haoline/formats/coreml.py +250 -0
- haoline/formats/gguf.py +483 -0
- haoline/formats/openvino.py +255 -0
- haoline/formats/safetensors.py +273 -0
- haoline/formats/tflite.py +369 -0
- haoline/hardware.py +2307 -0
- haoline/hierarchical_graph.py +462 -0
- haoline/html_export.py +1573 -0
- haoline/layer_summary.py +769 -0
- haoline/llm_summarizer.py +465 -0
- haoline/op_icons.py +618 -0
- haoline/operational_profiling.py +1492 -0
- haoline/patterns.py +1116 -0
- haoline/pdf_generator.py +265 -0
- haoline/privacy.py +250 -0
- haoline/pydantic_models.py +241 -0
- haoline/report.py +1923 -0
- haoline/report_sections.py +539 -0
- haoline/risks.py +521 -0
- haoline/schema.py +523 -0
- haoline/streamlit_app.py +2024 -0
- haoline/tests/__init__.py +4 -0
- haoline/tests/conftest.py +123 -0
- haoline/tests/test_analyzer.py +868 -0
- haoline/tests/test_compare_visualizations.py +293 -0
- haoline/tests/test_edge_analysis.py +243 -0
- haoline/tests/test_eval.py +604 -0
- haoline/tests/test_format_adapters.py +460 -0
- haoline/tests/test_hardware.py +237 -0
- haoline/tests/test_hardware_recommender.py +90 -0
- haoline/tests/test_hierarchical_graph.py +326 -0
- haoline/tests/test_html_export.py +180 -0
- haoline/tests/test_layer_summary.py +428 -0
- haoline/tests/test_llm_patterns.py +540 -0
- haoline/tests/test_llm_summarizer.py +339 -0
- haoline/tests/test_patterns.py +774 -0
- haoline/tests/test_pytorch.py +327 -0
- haoline/tests/test_report.py +383 -0
- haoline/tests/test_risks.py +398 -0
- haoline/tests/test_schema.py +417 -0
- haoline/tests/test_tensorflow.py +380 -0
- haoline/tests/test_visualizations.py +316 -0
- haoline/universal_ir.py +856 -0
- haoline/visualizations.py +1086 -0
- haoline/visualize_yolo.py +44 -0
- haoline/web.py +110 -0
- haoline-0.3.0.dist-info/METADATA +471 -0
- haoline-0.3.0.dist-info/RECORD +70 -0
- haoline-0.3.0.dist-info/WHEEL +4 -0
- haoline-0.3.0.dist-info/entry_points.txt +5 -0
- haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/tests/test_eval.py
@@ -0,0 +1,604 @@
+"""Tests for the eval module: schemas, adapters, and linking utilities."""
+
+from __future__ import annotations
+
+import json
+from pathlib import Path
+
+import pytest
+
+from haoline.eval.comparison import (
+    ModelComparisonRow,
+    ModelComparisonTable,
+    compare_models,
+    generate_eval_metrics_html,
+)
+from haoline.eval.deployment import (
+    DeploymentScenario,
+    DeploymentTarget,
+    calculate_deployment_cost,
+    estimate_latency_from_flops,
+    get_hardware_tier,
+    list_hardware_tiers,
+    select_hardware_tier_for_latency,
+)
+from haoline.eval.schemas import (
+    CombinedReport,
+    DetectionEvalResult,
+    EvalMetric,
+    EvalResult,
+    compute_model_hash,
+    create_combined_report,
+    link_eval_to_model,
+    validate_eval_result,
+)
+
+
+class TestEvalMetric:
+    """Tests for EvalMetric Pydantic model."""
+
+    def test_create_metric(self) -> None:
+        """Test creating an EvalMetric."""
+        metric = EvalMetric(
+            name="accuracy",
+            value=95.5,
+            unit="%",
+            higher_is_better=True,
+            category="accuracy",
+        )
+        assert metric.name == "accuracy"
+        assert metric.value == 95.5
+        assert metric.unit == "%"
+        assert metric.higher_is_better is True
+
+    def test_metric_json_serialization(self) -> None:
+        """Test EvalMetric serialization."""
+        metric = EvalMetric(
+            name="loss",
+            value=0.05,
+            unit="",
+            higher_is_better=False,
+            category="loss",
+        )
+        data = json.loads(metric.model_dump_json())
+        assert data["name"] == "loss"
+        assert data["higher_is_better"] is False
+
+
+class TestEvalResult:
+    """Tests for EvalResult base class."""
+
+    def test_create_eval_result(self) -> None:
+        """Test creating an EvalResult."""
+        result = EvalResult(
+            model_id="test-model",
+            task_type="classification",
+            dataset="imagenet",
+            metrics=[
+                EvalMetric(
+                    name="top1",
+                    value=76.5,
+                    unit="%",
+                    higher_is_better=True,
+                    category="accuracy",
+                )
+            ],
+        )
+        assert result.model_id == "test-model"
+        assert result.task_type == "classification"
+        assert len(result.metrics) == 1
+
+    def test_to_json(self) -> None:
+        """Test JSON serialization."""
+        result = EvalResult(
+            model_id="model",
+            task_type="detection",
+            metrics=[],
+        )
+        json_str = result.to_json()
+        data = json.loads(json_str)
+        assert data["model_id"] == "model"
+        assert data["task_type"] == "detection"
+
+
+class TestDetectionEvalResult:
+    """Tests for detection-specific eval result."""
+
+    def test_create_with_factory(self) -> None:
+        """Test using the create() convenience method."""
+        result = DetectionEvalResult.create(
+            model_id="yolov8n",
+            dataset="coco",
+            map50=0.65,
+            map50_95=0.48,
+            precision=0.72,
+            recall=0.68,
+            f1=0.70,
+        )
+        assert result.model_id == "yolov8n"
+        assert result.dataset == "coco"
+        assert len(result.metrics) == 5
+
+
+class TestLinkingUtilities:
+    """Tests for model-eval linking functions."""
+
+    def test_compute_model_hash(self, tmp_path: Path) -> None:
+        """Test computing file hash."""
+        # Create a temporary file
+        test_file = tmp_path / "model.onnx"
+        test_file.write_bytes(b"fake model content")
+
+        hash_result = compute_model_hash(str(test_file))
+        assert len(hash_result) == 64  # SHA-256 hex length
+        assert hash_result.isalnum()
+
+    def test_compute_model_hash_not_found(self) -> None:
+        """Test hash of non-existent file raises error."""
+        with pytest.raises(FileNotFoundError):
+            compute_model_hash("/nonexistent/path/model.onnx")
+
+    def test_link_eval_to_model(self, tmp_path: Path) -> None:
+        """Test linking eval result to model file."""
+        # Create a temporary model file
+        model_file = tmp_path / "yolov8n.onnx"
+        model_file.write_bytes(b"model content")
+
+        result = EvalResult(
+            model_id="",
+            task_type="detection",
+            metrics=[],
+        )
+
+        linked = link_eval_to_model(str(model_file), result, use_hash=False)
+        assert linked.model_id == "yolov8n"
+        assert "linked_model_path" in linked.metadata
+
+    def test_link_eval_to_model_with_hash(self, tmp_path: Path) -> None:
+        """Test linking with hash-based model ID."""
+        model_file = tmp_path / "model.onnx"
+        model_file.write_bytes(b"unique content")
+
+        result = EvalResult(
+            model_id="",
+            task_type="classification",
+            metrics=[],
+        )
+
+        linked = link_eval_to_model(str(model_file), result, use_hash=True)
+        assert len(linked.model_id) == 12  # Short hash
+
+    def test_create_combined_report_no_inspection(self, tmp_path: Path) -> None:
+        """Test creating combined report without running inspection."""
+        model_file = tmp_path / "model.onnx"
+        model_file.write_bytes(b"model")
+
+        eval_result = DetectionEvalResult.create(
+            model_id="",
+            dataset="coco",
+            map50=0.65,
+            map50_95=0.48,
+            precision=0.72,
+            recall=0.68,
+            f1=0.70,
+        )
+
+        combined = create_combined_report(
+            str(model_file),
+            eval_results=[eval_result],
+            run_inspection=False,
+        )
+
+        assert combined.model_id == "model"
+        assert len(combined.eval_results) == 1
+        assert combined.eval_results[0].model_id == "model"
+
+
+class TestValidation:
+    """Tests for schema validation."""
+
+    def test_validate_valid_eval_result(self) -> None:
+        """Test validation of valid data."""
+        data = {
+            "model_id": "test",
+            "task_type": "classification",
+            "metrics": [],
+        }
+        assert validate_eval_result(data) is True
+
+    def test_validate_invalid_eval_result(self) -> None:
+        """Test validation of invalid data."""
+        data = {"invalid": "data"}
+        assert validate_eval_result(data) is False
+
+
+class TestCombinedReport:
+    """Tests for CombinedReport model."""
+
+    def test_create_combined_report(self) -> None:
+        """Test creating a CombinedReport manually."""
+        combined = CombinedReport(
+            model_id="resnet50",
+            model_path="/path/to/resnet50.onnx",
+            architecture={
+                "params_total": 25_000_000,
+                "flops_total": 4_000_000_000,
+            },
+            eval_results=[],
+        )
+        assert combined.model_id == "resnet50"
+        assert combined.architecture["params_total"] == 25_000_000
+
+    def test_add_eval_result(self) -> None:
+        """Test adding eval results to combined report."""
+        combined = CombinedReport(
+            model_id="model",
+            architecture={},
+        )
+        eval_result = EvalResult(
+            model_id="model",
+            task_type="classification",
+            metrics=[],
+        )
+        combined.add_eval_result(eval_result)
+        assert len(combined.eval_results) == 1
+
+    def test_get_eval_by_task(self) -> None:
+        """Test retrieving eval by task type."""
+        combined = CombinedReport(
+            model_id="model",
+            architecture={},
+            eval_results=[
+                EvalResult(model_id="m", task_type="detection", metrics=[]),
+                EvalResult(model_id="m", task_type="classification", metrics=[]),
+            ],
+        )
+        det = combined.get_eval_by_task("detection")
+        assert det is not None
+        assert det.task_type == "detection"
+
+        missing = combined.get_eval_by_task("segmentation")
+        assert missing is None
+
+
+# =============================================================================
+# Deployment Cost Calculator Tests
+# =============================================================================
+
+
+class TestDeploymentScenario:
+    """Tests for DeploymentScenario dataclass."""
+
+    def test_default_scenario(self) -> None:
+        """Test creating scenario with defaults."""
+        scenario = DeploymentScenario()
+        assert scenario.target_fps == 30.0
+        assert scenario.hours_per_day == 24.0
+        assert scenario.target == DeploymentTarget.CLOUD_GPU
+
+    def test_realtime_video_preset(self) -> None:
+        """Test realtime video preset."""
+        scenario = DeploymentScenario.realtime_video(fps=60.0)
+        assert scenario.target_fps == 60.0
+        assert scenario.max_latency_ms == pytest.approx(1000.0 / 60, rel=0.01)
+        assert scenario.name == "realtime_video"
+
+    def test_edge_device_preset(self) -> None:
+        """Test edge device preset."""
+        scenario = DeploymentScenario.edge_device(fps=10.0)
+        assert scenario.target == DeploymentTarget.EDGE_GPU
+        assert scenario.target_fps == 10.0
+
+    def test_serialization(self) -> None:
+        """Test to_dict and from_dict."""
+        original = DeploymentScenario(
+            target_fps=15.0,
+            hours_per_day=8.0,
+            precision="fp16",
+        )
+        data = original.to_dict()
+        restored = DeploymentScenario.from_dict(data)
+        assert restored.target_fps == 15.0
+        assert restored.hours_per_day == 8.0
+        assert restored.precision == "fp16"
+
+
+class TestHardwareTiers:
+    """Tests for hardware tier lookups."""
+
+    def test_get_hardware_tier(self) -> None:
+        """Test getting a tier by name."""
+        tier = get_hardware_tier("t4")
+        assert tier is not None
+        assert tier.name == "T4"
+        assert tier.cost_per_hour_usd > 0
+
+    def test_get_hardware_tier_case_insensitive(self) -> None:
+        """Test case-insensitive lookup."""
+        tier = get_hardware_tier("A10G")
+        assert tier is not None
+        assert tier.name == "A10G"
+
+    def test_get_unknown_tier(self) -> None:
+        """Test getting non-existent tier returns None."""
+        tier = get_hardware_tier("nonexistent")
+        assert tier is None
+
+    def test_list_hardware_tiers(self) -> None:
+        """Test listing all tiers."""
+        tiers = list_hardware_tiers()
+        assert len(tiers) > 0
+        # Should be sorted by cost
+        costs = [t.cost_per_hour_usd for t in tiers]
+        assert costs == sorted(costs)
+
+    def test_list_hardware_tiers_filtered(self) -> None:
+        """Test filtering by target."""
+        gpu_tiers = list_hardware_tiers(DeploymentTarget.CLOUD_GPU)
+        for tier in gpu_tiers:
+            assert tier.target == DeploymentTarget.CLOUD_GPU
+
+        edge_tiers = list_hardware_tiers(DeploymentTarget.EDGE_GPU)
+        for tier in edge_tiers:
+            assert tier.target == DeploymentTarget.EDGE_GPU
+
+
+class TestCostCalculation:
+    """Tests for deployment cost calculation."""
+
+    def test_estimate_latency_from_flops(self) -> None:
+        """Test latency estimation."""
+        tier = get_hardware_tier("t4")
+        assert tier is not None
+
+        # 1 GFLOP model
+        flops = 1_000_000_000
+        latency = estimate_latency_from_flops(flops, tier, "fp32")
+
+        # Should be a reasonable latency value
+        assert latency > 0
+        assert latency < 10000  # Less than 10 seconds
+
+    def test_select_hardware_for_latency(self) -> None:
+        """Test hardware selection based on latency SLA."""
+        flops = 10_000_000_000  # 10 GFLOP model
+
+        # Strict latency requirement - should pick faster hardware
+        tier = select_hardware_tier_for_latency(
+            flops,
+            target_latency_ms=10.0,
+            precision="fp16",
+        )
+        # May or may not find suitable tier
+        if tier:
+            assert tier.cost_per_hour_usd > 0
+
+    def test_calculate_deployment_cost(self) -> None:
+        """Test full cost calculation."""
+        scenario = DeploymentScenario(
+            target_fps=10.0,
+            hours_per_day=8.0,
+            days_per_month=22,  # Business days
+            target=DeploymentTarget.CLOUD_GPU,
+        )
+
+        flops = 5_000_000_000  # 5 GFLOP model
+        estimate = calculate_deployment_cost(flops, scenario)
+
+        # Check basic fields are populated
+        assert estimate.hardware_tier is not None
+        assert estimate.cost_per_hour_usd >= 0
+        assert estimate.cost_per_day_usd >= 0
+        assert estimate.cost_per_month_usd >= 0
+        assert estimate.estimated_latency_ms > 0
+
+        # Costs should scale correctly
+        assert estimate.cost_per_day_usd == pytest.approx(
+            estimate.cost_per_hour_usd * 8.0, rel=0.01
+        )
+        assert estimate.cost_per_month_usd == pytest.approx(
+            estimate.cost_per_day_usd * 22, rel=0.01
+        )
+
+    def test_cost_estimate_summary(self) -> None:
+        """Test human-readable summary generation."""
+        scenario = DeploymentScenario(
+            target_fps=30.0,
+            hours_per_day=24.0,
+            name="test_scenario",
+        )
+
+        estimate = calculate_deployment_cost(1_000_000_000, scenario)
+        summary = estimate.summary()
+
+        assert "test_scenario" in summary
+        assert "Per hour:" in summary
+        assert "Per month:" in summary
+
+
+# =============================================================================
+# Model Comparison Tests
+# =============================================================================
+
+
+class TestModelComparison:
+    """Tests for multi-model comparison functionality."""
+
+    def test_create_comparison_row(self) -> None:
+        """Test creating a comparison row from combined report."""
+        report = CombinedReport(
+            model_id="yolov8n",
+            model_path="/path/to/yolov8n.onnx",
+            architecture={
+                "params_total": 3_000_000,
+                "flops_total": 8_000_000_000,
+                "model_size_bytes": 12 * 1024 * 1024,
+            },
+            primary_accuracy_metric="mAP@50",
+            primary_accuracy_value=65.0,
+        )
+
+        row = ModelComparisonRow.from_combined_report(report)
+
+        assert row.model_id == "yolov8n"
+        assert row.params_total == 3_000_000
+        assert row.flops_total == 8_000_000_000
+        assert row.model_size_mb == pytest.approx(12.0, rel=0.01)
+        assert row.primary_metric_value == 65.0
+
+    def test_comparison_table(self) -> None:
+        """Test creating and populating a comparison table."""
+        report1 = CombinedReport(
+            model_id="model_a",
+            architecture={"params_total": 1_000_000, "flops_total": 1e9},
+            primary_accuracy_metric="accuracy",
+            primary_accuracy_value=90.0,
+        )
+        report2 = CombinedReport(
+            model_id="model_b",
+            architecture={"params_total": 5_000_000, "flops_total": 5e9},
+            primary_accuracy_metric="accuracy",
+            primary_accuracy_value=95.0,
+        )
+
+        table = ModelComparisonTable(title="Test Comparison")
+        table.add_model(report1)
+        table.add_model(report2)
+
+        assert len(table.rows) == 2
+        assert table.rows[0].model_id == "model_a"
+        assert table.rows[1].model_id == "model_b"
+
+    def test_compare_models_function(self) -> None:
+        """Test the compare_models() convenience function."""
+        reports = [
+            CombinedReport(
+                model_id="small",
+                architecture={"params_total": 1_000_000},
+                primary_accuracy_value=80.0,
+            ),
+            CombinedReport(
+                model_id="medium",
+                architecture={"params_total": 10_000_000},
+                primary_accuracy_value=90.0,
+            ),
+            CombinedReport(
+                model_id="large",
+                architecture={"params_total": 100_000_000},
+                primary_accuracy_value=95.0,
+            ),
+        ]
+
+        table = compare_models(
+            reports,
+            sort_by="primary_metric_value",
+            sort_descending=True,
+        )
+
+        assert len(table.rows) == 3
+        # Should be sorted by accuracy descending
+        assert table.rows[0].model_id == "large"
+        assert table.rows[1].model_id == "medium"
+        assert table.rows[2].model_id == "small"
+
+    def test_table_to_csv(self) -> None:
+        """Test CSV export."""
+        report = CombinedReport(
+            model_id="test_model",
+            architecture={"params_total": 1_000_000},
+        )
+        table = ModelComparisonTable()
+        table.add_model(report)
+
+        csv_output = table.to_csv()
+        assert "model_id" in csv_output
+        assert "test_model" in csv_output
+
+    def test_table_to_json(self) -> None:
+        """Test JSON export."""
+        report = CombinedReport(
+            model_id="test_model",
+            architecture={"params_total": 1_000_000},
+        )
+        table = ModelComparisonTable(title="JSON Test")
+        table.add_model(report)
+
+        json_output = table.to_json()
+        data = json.loads(json_output)
+
+        assert data["title"] == "JSON Test"
+        assert len(data["rows"]) == 1
+        assert data["rows"][0]["model_id"] == "test_model"
+
+    def test_table_to_markdown(self) -> None:
+        """Test Markdown export."""
+        report = CombinedReport(
+            model_id="model_a",
+            architecture={"params_total": 3_000_000, "flops_total": 8e9},
+            primary_accuracy_value=75.5,
+        )
+        table = ModelComparisonTable(title="MD Test")
+        table.add_model(report)
+
+        md_output = table.to_markdown()
+
+        assert "## MD Test" in md_output
+        assert "| Model |" in md_output
+        assert "model_a" in md_output
+        assert "3.0M" in md_output
+        assert "75.5%" in md_output
+
+    def test_table_to_console(self) -> None:
+        """Test console table output."""
+        report = CombinedReport(
+            model_id="console_test",
+            architecture={"params_total": 2_000_000},
+        )
+        table = ModelComparisonTable(title="Console Test")
+        table.add_model(report)
+
+        console_output = table.to_console()
+
+        assert "Console Test" in console_output
+        assert "console_test" in console_output
+
+    def test_generate_eval_metrics_html(self) -> None:
+        """Test HTML generation for eval metrics."""
+        eval_result = EvalResult(
+            model_id="test",
+            task_type="classification",
+            metrics=[
+                EvalMetric(
+                    name="accuracy",
+                    value=95.5,
+                    unit="%",
+                    higher_is_better=True,
+                    category="accuracy",
+                ),
+                EvalMetric(
+                    name="f1",
+                    value=0.93,
+                    unit="",
+                    higher_is_better=True,
+                    category="accuracy",
+                ),
+            ],
+        )
+
+        html = generate_eval_metrics_html([eval_result])
+
+        assert '<section class="eval-metrics">' in html
+        assert "accuracy" in html
+        assert "95.5%" in html
+        assert "classification" in html
+
+    def test_generate_eval_metrics_html_with_cost(self) -> None:
+        """Test HTML generation includes cost estimate."""
+        scenario = DeploymentScenario(target_fps=30.0)
+        cost_estimate = calculate_deployment_cost(1_000_000_000, scenario)
+
+        html = generate_eval_metrics_html([], cost_estimate)
+
+        assert "Deployment Cost Estimate" in html
+        assert "$/Month" in html
+        assert cost_estimate.hardware_tier.name in html