haoline 0.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- haoline/.streamlit/config.toml +10 -0
- haoline/__init__.py +248 -0
- haoline/analyzer.py +935 -0
- haoline/cli.py +2712 -0
- haoline/compare.py +811 -0
- haoline/compare_visualizations.py +1564 -0
- haoline/edge_analysis.py +525 -0
- haoline/eval/__init__.py +131 -0
- haoline/eval/adapters.py +844 -0
- haoline/eval/cli.py +390 -0
- haoline/eval/comparison.py +542 -0
- haoline/eval/deployment.py +633 -0
- haoline/eval/schemas.py +833 -0
- haoline/examples/__init__.py +15 -0
- haoline/examples/basic_inspection.py +74 -0
- haoline/examples/compare_models.py +117 -0
- haoline/examples/hardware_estimation.py +78 -0
- haoline/format_adapters.py +1001 -0
- haoline/formats/__init__.py +123 -0
- haoline/formats/coreml.py +250 -0
- haoline/formats/gguf.py +483 -0
- haoline/formats/openvino.py +255 -0
- haoline/formats/safetensors.py +273 -0
- haoline/formats/tflite.py +369 -0
- haoline/hardware.py +2307 -0
- haoline/hierarchical_graph.py +462 -0
- haoline/html_export.py +1573 -0
- haoline/layer_summary.py +769 -0
- haoline/llm_summarizer.py +465 -0
- haoline/op_icons.py +618 -0
- haoline/operational_profiling.py +1492 -0
- haoline/patterns.py +1116 -0
- haoline/pdf_generator.py +265 -0
- haoline/privacy.py +250 -0
- haoline/pydantic_models.py +241 -0
- haoline/report.py +1923 -0
- haoline/report_sections.py +539 -0
- haoline/risks.py +521 -0
- haoline/schema.py +523 -0
- haoline/streamlit_app.py +2024 -0
- haoline/tests/__init__.py +4 -0
- haoline/tests/conftest.py +123 -0
- haoline/tests/test_analyzer.py +868 -0
- haoline/tests/test_compare_visualizations.py +293 -0
- haoline/tests/test_edge_analysis.py +243 -0
- haoline/tests/test_eval.py +604 -0
- haoline/tests/test_format_adapters.py +460 -0
- haoline/tests/test_hardware.py +237 -0
- haoline/tests/test_hardware_recommender.py +90 -0
- haoline/tests/test_hierarchical_graph.py +326 -0
- haoline/tests/test_html_export.py +180 -0
- haoline/tests/test_layer_summary.py +428 -0
- haoline/tests/test_llm_patterns.py +540 -0
- haoline/tests/test_llm_summarizer.py +339 -0
- haoline/tests/test_patterns.py +774 -0
- haoline/tests/test_pytorch.py +327 -0
- haoline/tests/test_report.py +383 -0
- haoline/tests/test_risks.py +398 -0
- haoline/tests/test_schema.py +417 -0
- haoline/tests/test_tensorflow.py +380 -0
- haoline/tests/test_visualizations.py +316 -0
- haoline/universal_ir.py +856 -0
- haoline/visualizations.py +1086 -0
- haoline/visualize_yolo.py +44 -0
- haoline/web.py +110 -0
- haoline-0.3.0.dist-info/METADATA +471 -0
- haoline-0.3.0.dist-info/RECORD +70 -0
- haoline-0.3.0.dist-info/WHEEL +4 -0
- haoline-0.3.0.dist-info/entry_points.txt +5 -0
- haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,1492 @@
# Copyright (c) 2025 HaoLine Contributors
# SPDX-License-Identifier: MIT

"""
Operational profiling and system requirements analysis.

This module implements:
- Batch size scalability analysis (sweeps)
- System requirements generation (Steam-style min/rec/optimal)
- Resolution impact analysis (future)
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Any

from .hardware import (
    HARDWARE_PROFILES,
    HardwareEstimates,
    HardwareEstimator,
    HardwareProfile,
)


@dataclass
class BatchSweepPoint:
    """Metrics for a single batch size point."""

    batch_size: int
    vram_required_bytes: int
    estimated_latency_ms: float
    throughput_fps: float
    compute_utilization: float
    bottleneck: str
    fits_in_vram: bool


@dataclass
class GPUMetrics:
    """Real-time GPU metrics from pynvml."""

    vram_used_bytes: int
    vram_total_bytes: int
    gpu_utilization_percent: float
    memory_utilization_percent: float
    temperature_c: int
    power_draw_w: float

    def to_dict(self) -> dict[str, Any]:
        return {
            "vram_used_gb": round(self.vram_used_bytes / (1024**3), 3),
            "vram_total_gb": round(self.vram_total_bytes / (1024**3), 1),
            "gpu_utilization_percent": self.gpu_utilization_percent,
            "memory_utilization_percent": self.memory_utilization_percent,
            "temperature_c": self.temperature_c,
            "power_draw_w": self.power_draw_w,
        }


@dataclass
class LayerProfile:
    """Profiling data for a single layer/operator."""

    name: str
    op_type: str
    duration_us: float  # Microseconds
    provider: str  # e.g., "CUDAExecutionProvider"
    input_shapes: list[list[int]]
    output_shapes: list[list[int]]

    @property
    def duration_ms(self) -> float:
        return self.duration_us / 1000.0

    def to_dict(self) -> dict[str, Any]:
        return {
            "name": self.name,
            "op_type": self.op_type,
            "duration_ms": round(self.duration_ms, 3),
            "provider": self.provider,
            "input_shapes": self.input_shapes,
            "output_shapes": self.output_shapes,
        }


@dataclass
class ProfilingResult:
    """Complete profiling results from ONNX Runtime."""

    total_time_ms: float
    layer_profiles: list[LayerProfile]
    gpu_metrics: GPUMetrics | None
    session_options: dict[str, Any]

    def get_slowest_layers(self, top_n: int = 10) -> list[LayerProfile]:
        """Get the N slowest layers by execution time."""
        return sorted(self.layer_profiles, key=lambda x: -x.duration_us)[:top_n]

    def get_time_by_op_type(self) -> dict[str, float]:
        """Aggregate execution time by operator type."""
        time_by_op: dict[str, float] = {}
        for layer in self.layer_profiles:
            time_by_op[layer.op_type] = time_by_op.get(layer.op_type, 0) + layer.duration_ms
        return dict(sorted(time_by_op.items(), key=lambda x: -x[1]))

    def to_dict(self) -> dict[str, Any]:
        return {
            "total_time_ms": round(self.total_time_ms, 3),
            "layer_count": len(self.layer_profiles),
            "slowest_layers": [lp.to_dict() for lp in self.get_slowest_layers()],
            "time_by_op_type": {k: round(v, 3) for k, v in self.get_time_by_op_type().items()},
            "gpu_metrics": self.gpu_metrics.to_dict() if self.gpu_metrics else None,
        }


@dataclass
class BottleneckAnalysis:
    """Analysis of model performance bottlenecks."""

    bottleneck_type: str  # "compute-bound", "memory-bound", "balanced"
    compute_time_ms: float
    memory_time_ms: float  # Estimated memory transfer time
    compute_ratio: float  # Fraction of time spent in compute
    memory_ratio: float  # Fraction of time spent in memory ops
    theoretical_peak_tflops: float
    achieved_tflops: float
    efficiency_percent: float
    recommendations: list[str]

    def to_dict(self) -> dict[str, Any]:
        return {
            "bottleneck_type": self.bottleneck_type,
            "compute_time_ms": round(self.compute_time_ms, 3),
            "memory_time_ms": round(self.memory_time_ms, 3),
            "compute_ratio": round(self.compute_ratio, 2),
            "memory_ratio": round(self.memory_ratio, 2),
            "theoretical_peak_tflops": round(self.theoretical_peak_tflops, 2),
            "achieved_tflops": round(self.achieved_tflops, 4),
            "efficiency_percent": round(self.efficiency_percent, 1),
            "recommendations": self.recommendations,
        }


@dataclass
class ResolutionPoint:
    """Metrics for a single resolution point."""

    resolution: tuple[int, int]
    resolution_str: str  # e.g., "224x224"
    flops: int
    memory_bytes: int
    vram_required_bytes: int
    estimated_latency_ms: float
    throughput_fps: float
    fits_in_vram: bool


@dataclass
class ResolutionSweep:
    """Results of a resolution sweep analysis."""

    resolutions: list[str]  # ["224x224", "384x384", ...]
    flops: list[int]
    memory_gb: list[float]
    latencies: list[float]
    throughputs: list[float]
    vram_usage_gb: list[float]
    optimal_resolution: str
    max_resolution: str  # Largest resolution that fits in VRAM

    def to_dict(self) -> dict[str, Any]:
        return {
            "resolutions": self.resolutions,
            "flops": self.flops,
            "memory_gb": self.memory_gb,
            "latencies": self.latencies,
            "throughputs": self.throughputs,
            "vram_usage_gb": self.vram_usage_gb,
            "optimal_resolution": self.optimal_resolution,
            "max_resolution": self.max_resolution,
        }


@dataclass
class BatchSizeSweep:
    """Results of a batch size sweep analysis."""

    batch_sizes: list[int]
    latencies: list[float]
    throughputs: list[float]
    vram_usage_gb: list[float]
    optimal_batch_size: int

    def to_dict(self) -> dict[str, Any]:
        return {
            "batch_sizes": self.batch_sizes,
            "latencies": self.latencies,
            "throughputs": self.throughputs,
            "vram_usage_gb": self.vram_usage_gb,
            "optimal_batch_size": self.optimal_batch_size,
        }


@dataclass
class SystemRequirements:
    """Recommended hardware tiers for deployment.

    This is a lightweight, report-friendly wrapper around :class:`HardwareEstimates`.
    It deliberately mirrors the older `SystemRequirements` helper in `hardware.py`,
    exposing `minimum_gpu`, `recommended_gpu`, and `optimal_gpu` style attributes so
    existing report/HTML code (and mental model) continue to work.
    """

    # Core estimates for each tier
    minimum: HardwareEstimates | None  # The lowest spec that runs it
    recommended: HardwareEstimates | None  # Good balance of cost/perf
    optimal: HardwareEstimates | None  # Maximum performance

    def to_dict(self) -> dict[str, Any]:
        return {
            "minimum": self.minimum.to_dict() if self.minimum else None,
            "recommended": self.recommended.to_dict() if self.recommended else None,
            "optimal": self.optimal.to_dict() if self.optimal else None,
        }

    # Backwards/HTML-friendly convenience properties ---------------------
    #
    # These keep the `reqs.minimum_gpu.name` / `reqs.minimum_vram_gb` style
    # access patterns working in `report.py` and HTML templates without
    # duplicating all the shape logic here.

    @property
    def minimum_gpu(self) -> HardwareEstimates | None:
        return self.minimum

    @property
    def recommended_gpu(self) -> HardwareEstimates | None:
        return self.recommended

    @property
    def optimal_gpu(self) -> HardwareEstimates | None:
        return self.optimal

    @staticmethod
    def _vram_gb(est: HardwareEstimates | None) -> float | None:
        if not est:
            return None
        return round(est.vram_required_bytes / (1024**3), 2)

    @property
    def minimum_vram_gb(self) -> float | None:
        return self._vram_gb(self.minimum)

    @property
    def recommended_vram_gb(self) -> float | None:
        return self._vram_gb(self.recommended)


class OperationalProfiler:
    """
    Analyzes model operational characteristics.
    """

    def __init__(self, logger: logging.Logger | None = None):
        self.logger = logger or logging.getLogger("haoline.profiler")
        self.hw_estimator = HardwareEstimator(logger=self.logger)

    def _create_input_feed(
        self,
        sess: Any,
        batch_size: int = 1,
        seq_len: int = 128,
    ) -> dict[str, Any]:
        """
        Create input feed dict for all model inputs (Story 9.6).

        Handles multi-input models like BERT, LLMs, and multimodal models.

        Args:
            sess: ONNX Runtime InferenceSession
            batch_size: Batch size for inputs
            seq_len: Sequence length for text inputs (default: 128)

        Returns:
            Dict mapping input names to numpy arrays
        """
        import numpy as np

        input_feed = {}

        for inp in sess.get_inputs():
            name = inp.name
            shape = list(inp.shape)
            dtype_str = inp.type  # e.g., "tensor(float)", "tensor(int64)"

            # Determine numpy dtype from ONNX type
            np_dtype: type[np.generic]
            if "int64" in dtype_str:
                np_dtype = np.int64
                is_text = True
            elif "int32" in dtype_str:
                np_dtype = np.int32
                is_text = True
            elif "float16" in dtype_str:
                np_dtype = np.float16
                is_text = False
            elif "bool" in dtype_str:
                np_dtype = np.bool_
                is_text = False
            else:
                np_dtype = np.float32
                is_text = False

            # Resolve dynamic dimensions
            resolved_shape = []
            for i, dim in enumerate(shape):
                if isinstance(dim, int) and dim > 0:
                    resolved_shape.append(dim)
                elif i == 0:
                    # Batch dimension
                    resolved_shape.append(batch_size)
                elif is_text:
                    # Text models: sequence length
                    resolved_shape.append(seq_len)
                elif len(shape) == 4 and i == 1:
                    # Vision models: channels
                    resolved_shape.append(3)
                else:
                    # Vision models: spatial dims
                    resolved_shape.append(224)

            # Generate appropriate dummy data
            if is_text:
                # Token IDs: random integers in typical vocab range
                # numpy stubs are overly strict about randint dtype
                dummy: np.ndarray = np.random.randint(0, 30000, size=resolved_shape, dtype=np_dtype)  # type: ignore[arg-type]
            elif np_dtype == np.bool_:
                # Boolean masks
                dummy = np.ones(resolved_shape, dtype=np_dtype)
            else:
                # Continuous values (vision, etc.)
                dummy = np.random.randn(*resolved_shape).astype(np_dtype)

            input_feed[name] = dummy

        return input_feed

    def run_batch_sweep(
        self,
        model_params: int,
        model_flops: int,
        peak_activation_bytes: int,
        hardware: HardwareProfile,
        batch_sizes: list[int] | None = None,
        precision: str = "fp16",
    ) -> BatchSizeSweep:
        """
        Analyze performance scaling across batch sizes.

        Args:
            model_params: Total parameters
            model_flops: FLOPs per inference (batch=1)
            peak_activation_bytes: Peak activation memory (batch=1)
            hardware: Target hardware profile
            batch_sizes: List of batch sizes to test (default: powers of 2)
            precision: Precision to simulate ("fp32", "fp16", "int8")

        Returns:
            BatchSizeSweep results
        """
        if batch_sizes is None:
            batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]

        latencies = []
        throughputs = []
        vram_usage = []
        optimal_bs = 1
        max_throughput = 0.0

        for bs in batch_sizes:
            est = self.hw_estimator.estimate(
                model_params=model_params,
                model_flops=model_flops,
                peak_activation_bytes=peak_activation_bytes,
                hardware=hardware,
                batch_size=bs,
                precision=precision,
            )

            # Calculate throughput (inferences per second)
            # If latency is infinite (OOM), throughput is 0
            throughput = 0.0
            latency = float("inf")
            vram_gb = est.vram_required_bytes / (1024**3)

            if est.theoretical_latency_ms > 0 and est.fits_in_vram:
                latency = est.theoretical_latency_ms
                throughput = (1000.0 / latency) * bs

                if throughput > max_throughput:
                    max_throughput = throughput
                    optimal_bs = bs

            latencies.append(latency)
            throughputs.append(throughput)
            vram_usage.append(vram_gb)

        return BatchSizeSweep(
            batch_sizes=batch_sizes,
            latencies=latencies,
            throughputs=throughputs,
            vram_usage_gb=vram_usage,
            optimal_batch_size=optimal_bs,
        )

    def run_batch_sweep_benchmark(
        self,
        model_path: str,
        batch_sizes: list[int] | None = None,
        num_warmup: int = 5,
        num_runs: int = 20,
    ) -> BatchSizeSweep | None:
        """
        Benchmark actual inference performance across batch sizes.

        Uses ONNX Runtime to measure real latency and throughput.
        Requires onnxruntime to be installed.

        Args:
            model_path: Path to ONNX model file
            batch_sizes: List of batch sizes to test (default: powers of 2)
            num_warmup: Number of warmup runs before timing
            num_runs: Number of timed runs per batch size

        Returns:
            BatchSizeSweep with measured (not estimated) metrics
        """
        try:
            import numpy as np
            import onnxruntime as ort
        except ImportError:
            self.logger.warning("onnxruntime not available, falling back to estimates")
            return None

        if batch_sizes is None:
            batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]

        # Create session
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        try:
            sess = ort.InferenceSession(model_path, providers=providers)
        except Exception as e:
            self.logger.error(f"Failed to load model for benchmarking: {e}")
            return None

        active_provider = sess.get_providers()[0]
        self.logger.info(f"Benchmarking with {active_provider}")

        # Get ALL input info (Story 9.6: Multi-input model support)
        all_inputs = sess.get_inputs()
        input_specs = []  # List of (name, shape_template, dtype, is_text)

        for inp in all_inputs:
            name = inp.name
            shape = list(inp.shape)
            dtype_str = inp.type  # e.g., "tensor(float)", "tensor(int64)"

            # Determine numpy dtype
            np_dtype: type[np.generic]
            if "int64" in dtype_str:
                np_dtype = np.int64
                is_text = True  # Likely token IDs
            elif "int32" in dtype_str:
                np_dtype = np.int32
                is_text = True
            elif "float16" in dtype_str:
                np_dtype = np.float16
                is_text = False
            else:
                np_dtype = np.float32
                is_text = False

            # Resolve dynamic dimensions with sensible defaults
            resolved_shape = []
            for i, dim in enumerate(shape):
                if isinstance(dim, int) and dim > 0:
                    resolved_shape.append(dim)
                elif i == 0:
                    resolved_shape.append(1)  # Batch dim, replaced per iteration
                elif is_text:
                    # Text models: sequence length
                    resolved_shape.append(128)  # Default seq_len
                elif len(shape) == 4 and i == 1:
                    resolved_shape.append(3)  # Channels for vision
                else:
                    resolved_shape.append(224)  # Spatial dims for vision

            input_specs.append((name, resolved_shape, np_dtype, is_text))
            self.logger.debug(
                f"  Input '{name}': shape={resolved_shape}, dtype={np_dtype.__name__}"
            )

        self.logger.info(f"Model has {len(input_specs)} input(s)")

        latencies = []
        throughputs = []
        vram_usage = []
        optimal_bs = 1
        max_throughput = 0.0

        for bs in batch_sizes:
            # Create input feed for ALL inputs
            input_feed = {}
            total_bytes = 0

            try:
                for name, shape_template, np_dtype, is_text in input_specs:
                    # Set batch size
                    shape = shape_template.copy()
                    shape[0] = bs

                    # Generate appropriate dummy data
                    if is_text:
                        # Token IDs: random integers in vocab range
                        dummy: np.ndarray = np.random.randint(0, 30000, size=shape, dtype=np_dtype)  # type: ignore[arg-type]
                    else:
                        # Vision/continuous: random floats
                        dummy = np.random.randn(*shape).astype(np_dtype)

                    input_feed[name] = dummy
                    total_bytes += dummy.nbytes

            except Exception as e:
                self.logger.warning(f"Failed to create inputs for batch {bs}: {e}")
                latencies.append(float("inf"))
                throughputs.append(0.0)
                vram_usage.append(0.0)
                continue

            # Warmup
            try:
                for _ in range(num_warmup):
                    sess.run(None, input_feed)
            except Exception as e:
                self.logger.warning(f"Batch {bs} failed (OOM?): {e}")
                latencies.append(float("inf"))
                throughputs.append(0.0)
                vram_usage.append(0.0)
                continue

            # Benchmark
            import time

            run_latencies = []
            for _ in range(num_runs):
                start = time.perf_counter()
                sess.run(None, input_feed)
                end = time.perf_counter()
                run_latencies.append((end - start) * 1000)  # ms

            # Use median latency (more stable than mean)
            run_latencies.sort()
            p50_latency = run_latencies[len(run_latencies) // 2]
            throughput = (bs * 1000.0) / p50_latency

            latencies.append(round(p50_latency, 2))
            throughputs.append(round(throughput, 1))

            # VRAM: try to measure with pynvml, fall back to estimate
            gpu_metrics = self.get_gpu_metrics()
            if gpu_metrics:
                vram_gb = gpu_metrics.vram_used_bytes / (1024**3)
            else:
                # Estimate: total input bytes * 10 for activations
                vram_gb = (total_bytes * 10) / (1024**3)
            vram_usage.append(round(vram_gb, 3))

            if throughput > max_throughput:
                max_throughput = throughput
                optimal_bs = bs

            self.logger.info(
                f"  Batch {bs}: latency={p50_latency:.2f}ms, throughput={throughput:.1f} inf/s"
            )

        return BatchSizeSweep(
            batch_sizes=batch_sizes,
            latencies=latencies,
            throughputs=throughputs,
            vram_usage_gb=vram_usage,
            optimal_batch_size=optimal_bs,
        )

    def run_resolution_sweep(
        self,
        base_flops: int,
        base_activation_bytes: int,
        base_resolution: tuple[int, int],
        model_params: int,
        hardware: HardwareProfile,
        resolutions: list[tuple[int, int]] | None = None,
        batch_size: int = 1,
        precision: str = "fp16",
    ) -> ResolutionSweep:
        """
        Analyze performance scaling across input resolutions.

        For vision models, FLOPs and memory scale approximately quadratically
        with resolution (for most architectures like ResNet, ViT, YOLO).

        Args:
            base_flops: FLOPs at base_resolution
            base_activation_bytes: Activation memory at base_resolution
            base_resolution: The resolution used for base measurements (H, W)
            model_params: Total parameters (doesn't change with resolution)
            hardware: Target hardware profile
            resolutions: List of (H, W) resolutions to test
            batch_size: Batch size for estimates
            precision: Precision ("fp32", "fp16", "int8")

        Returns:
            ResolutionSweep results
        """
        base_h, base_w = base_resolution
        base_pixels = base_h * base_w
        base_aspect = base_w / base_h if base_h > 0 else 1.0

        if resolutions is None:
            # Generate resolutions that:
            # 1. Match the aspect ratio of training data
            # 2. Only go UP TO (not above) the training resolution
            # Running above training resolution typically produces poor results
            resolutions = []

            # Common scale factors (smaller than or equal to 1.0)
            if base_aspect == 1.0:
                # Square aspect ratio
                candidates = [
                    128,
                    160,
                    192,
                    224,
                    256,
                    320,
                    384,
                    416,
                    448,
                    512,
                    640,
                    768,
                    1024,
                ]
                for size in candidates:
                    if size <= base_h:
                        resolutions.append((size, size))
            else:
                # Non-square: generate resolutions matching aspect ratio
                scale_factors = [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
                for scale in scale_factors:
                    h = int(base_h * scale)
                    w = int(base_w * scale)
                    # Round to nearest 32 for GPU efficiency
                    h = max(32, (h // 32) * 32)
                    w = max(32, (w // 32) * 32)
                    if h <= base_h and w <= base_w and (h, w) not in resolutions:
                        resolutions.append((h, w))

            # Always include the base resolution
            if base_resolution not in resolutions:
                resolutions.append(base_resolution)

            # Sort by pixel count
            resolutions.sort(key=lambda r: r[0] * r[1])

        resolution_strs = []
        flops_list = []
        memory_gb_list = []
        latencies = []
        throughputs = []
        vram_usage = []
        optimal_res = f"{base_h}x{base_w}"
        max_res = f"{base_h}x{base_w}"
        max_throughput = 0.0
        max_fitting_pixels = 0

        for h, w in resolutions:
            res_str = f"{h}x{w}"
            resolution_strs.append(res_str)

            # Scale FLOPs and memory quadratically with resolution
            pixels = h * w
            scale_factor = pixels / base_pixels

            scaled_flops = int(base_flops * scale_factor)
            scaled_activation = int(base_activation_bytes * scale_factor)

            flops_list.append(scaled_flops)
            memory_gb_list.append(scaled_activation / (1024**3))

            # Get hardware estimates for this resolution
            est = self.hw_estimator.estimate(
                model_params=model_params,
                model_flops=scaled_flops,
                peak_activation_bytes=scaled_activation,
                hardware=hardware,
                batch_size=batch_size,
                precision=precision,
            )

            vram_gb = est.vram_required_bytes / (1024**3)
            vram_usage.append(vram_gb)

            if est.fits_in_vram and est.theoretical_latency_ms > 0:
                latency = est.theoretical_latency_ms
                throughput = (1000.0 / latency) * batch_size

                latencies.append(latency)
                throughputs.append(throughput)

                # Track max resolution that fits
                if pixels > max_fitting_pixels:
                    max_fitting_pixels = pixels
                    max_res = res_str

                # Track optimal (highest throughput)
                if throughput > max_throughput:
                    max_throughput = throughput
                    optimal_res = res_str
            else:
                latencies.append(float("inf"))
                throughputs.append(0.0)

        return ResolutionSweep(
            resolutions=resolution_strs,
            flops=flops_list,
            memory_gb=memory_gb_list,
            latencies=latencies,
            throughputs=throughputs,
            vram_usage_gb=vram_usage,
            optimal_resolution=optimal_res,
            max_resolution=max_res,
        )

    def recommend_resolution(
        self,
        base_flops: int,
        base_activation_bytes: int,
        base_resolution: tuple[int, int],
        model_params: int,
        hardware: HardwareProfile,
        target_fps: float = 30.0,
        batch_size: int = 1,
        precision: str = "fp16",
    ) -> dict[str, Any]:
        """
        Recommend optimal resolution for target hardware and latency requirements.

        Task 6.8.5: Resolution recommendations for target hardware

        Args:
            base_flops: FLOPs at base_resolution
            base_activation_bytes: Activation memory at base_resolution
            base_resolution: The resolution used for base measurements (H, W)
            model_params: Total parameters
            hardware: Target hardware profile
            target_fps: Desired frames per second (default: 30 fps)
            batch_size: Batch size
            precision: Precision for estimates

        Returns:
            Dict with recommended_resolution, max_resolution, and rationale
        """
        target_latency_ms = 1000.0 / target_fps

        # Run sweep with common resolutions
        sweep = self.run_resolution_sweep(
            base_flops=base_flops,
            base_activation_bytes=base_activation_bytes,
            base_resolution=base_resolution,
            model_params=model_params,
            hardware=hardware,
            batch_size=batch_size,
            precision=precision,
        )

        # Find resolution that meets target FPS
        recommended = None
        recommended_idx = -1
        for i, (res, lat) in enumerate(zip(sweep.resolutions, sweep.latencies, strict=False)):
            if lat != float("inf") and lat <= target_latency_ms:
                recommended = res
                recommended_idx = i

        # Build recommendation rationale
        rationale_parts = []

        if recommended:
            rationale_parts.append(
                f"Resolution **{recommended}** meets {target_fps} FPS target "
                f"({sweep.latencies[recommended_idx]:.1f}ms latency)."
            )
        else:
            # Find closest resolution that fits
            for i, (res, lat) in enumerate(zip(sweep.resolutions, sweep.latencies, strict=False)):
                if lat != float("inf"):
                    recommended = res
                    recommended_idx = i
                    break

            if recommended:
                actual_fps = 1000.0 / sweep.latencies[recommended_idx]
                rationale_parts.append(
                    f"Cannot meet {target_fps} FPS. Best achievable: "
                    f"**{recommended}** at {actual_fps:.1f} FPS."
                )
            else:
                rationale_parts.append("No resolution fits in available VRAM.")

        if sweep.max_resolution and sweep.max_resolution != recommended:
            rationale_parts.append(
                f"Maximum resolution that fits in VRAM: **{sweep.max_resolution}**."
            )

        return {
            "recommended_resolution": recommended,
            "max_resolution": sweep.max_resolution,
            "optimal_resolution": sweep.optimal_resolution,
            "target_fps": target_fps,
            "achievable_fps": (
                1000.0 / sweep.latencies[recommended_idx]
                if recommended and recommended_idx >= 0
                else 0.0
            ),
            "rationale": " ".join(rationale_parts),
            "sweep_results": sweep.to_dict(),
        }

    def determine_system_requirements(
        self,
        model_params: int,
        model_flops: int,
        peak_activation_bytes: int,
        precision: str = "fp16",
        target_fps: float = 30.0,  # For "Recommended" tier
    ) -> SystemRequirements:
        """
        Find suitable hardware tiers ("Steam-style" requirements).

        Strategy:
        - Minimum: Cheapest hardware that fits the model in VRAM (Batch=1)
        - Recommended: Cheapest hardware that hits target_fps (Batch=1) OR fits with good utilization
        - Optimal: Hardware providing highest throughput/lowest latency
        """
        candidates = []

        # Evaluate against all known profiles
        # Filter out mobile/multi-gpu for cleaner list, or keep them?
        # Let's keep single-GPU desktops/servers for simplicity of recommendation
        for name, profile in HARDWARE_PROFILES.items():
            # Skip generic CPU for this analysis unless it's the only option
            if profile.device_type == "cpu":
                continue

            # Skip mobile variants to keep list clean (optional)
            if "mobile" in name:
                continue

            est = self.hw_estimator.estimate(
                model_params=model_params,
                model_flops=model_flops,
                peak_activation_bytes=peak_activation_bytes,
                hardware=profile,
                batch_size=1,
                precision=precision,
            )
            candidates.append((profile, est))

        if not candidates:
            return SystemRequirements(None, None, None)

        # --- Find Minimum ---
        # Sort by VRAM (ascending), then FLOPs (ascending)
        candidates.sort(key=lambda x: (x[0].vram_bytes, x[0].peak_fp16_tflops))

        minimum = None
        for _, est in candidates:
            if est.fits_in_vram:
                minimum = est
                break

        # --- Find Optimal ---
        # Sort by Latency (ascending)
        candidates.sort(key=lambda x: x[1].theoretical_latency_ms)

        optimal = None
        # Filter for ones that fit
        valid_candidates = [x for x in candidates if x[1].fits_in_vram]
        if valid_candidates:
            optimal = valid_candidates[0][1]  # Fastest

        # --- Find Recommended ---
        # Heuristic: Fits VRAM AND (Latency <= 1000/target_fps OR Utilization > 0.5)
        # We want something reasonable, not necessarily the fastest (which is often H100)
        # Let's look for the "cheapest" card that meets a performance bar.

        recommended = None

        # Re-sort by cost proxy (we don't have prices in HardwareProfile, but TFLOPS is a rough proxy)
        valid_candidates.sort(key=lambda x: x[0].peak_fp16_tflops)

        target_latency_ms = 1000.0 / target_fps

        for _, est in valid_candidates:
            if est.theoretical_latency_ms <= target_latency_ms:
                recommended = est
                break

        # If nothing meets strict FPS target, pick the one with decent utilization
        if recommended is None and valid_candidates:
            # Pick median performer? Or just fallback to Minimum if nothing is fast enough?
            # Let's pick the one that is ~4x faster than minimum if possible, or just minimum
            minimum_latency = minimum.theoretical_latency_ms if minimum else float("inf")
            for _, est in valid_candidates:
                if est.theoretical_latency_ms <= minimum_latency / 4.0:
                    recommended = est
                    break

        if recommended is None:
            recommended = minimum  # Fallback

        return SystemRequirements(minimum=minimum, recommended=recommended, optimal=optimal)

    # =========================================================================
    # Story 9.2: GPU Memory Profiling
    # =========================================================================

    def get_gpu_metrics(self, device_index: int = 0) -> GPUMetrics | None:
        """
        Get real-time GPU metrics using pynvml.

        Args:
            device_index: GPU device index (default: 0)

        Returns:
            GPUMetrics with VRAM usage, utilization, temperature, power
            None if pynvml is not available or fails
        """
        try:
            import pynvml

            pynvml.nvmlInit()
            handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)

            # Memory info
            mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)

            # Utilization
            util = pynvml.nvmlDeviceGetUtilizationRates(handle)

            # Temperature
            temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)

            # Power
            try:
                power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
                power_w = power_mw / 1000.0
            except pynvml.NVMLError:
                power_w = 0.0

            pynvml.nvmlShutdown()

            return GPUMetrics(
                vram_used_bytes=mem_info.used,
                vram_total_bytes=mem_info.total,
                gpu_utilization_percent=float(util.gpu),
                memory_utilization_percent=float(util.memory),
                temperature_c=temp,
                power_draw_w=power_w,
            )
        except ImportError:
            self.logger.debug("pynvml not available for GPU metrics")
            return None
        except Exception as e:
            self.logger.warning(f"Failed to get GPU metrics: {e}")
            return None

    def measure_peak_vram(
        self,
        model_path: str,
        batch_size: int = 1,
        num_runs: int = 5,
        device_index: int = 0,
    ) -> dict[str, Any]:
        """
        Measure actual peak VRAM usage during inference.

        Args:
            model_path: Path to ONNX model
            batch_size: Batch size for inference
            num_runs: Number of inference runs
            device_index: GPU device index

        Returns:
            Dict with baseline, peak, and delta VRAM usage
        """
        try:
            import numpy as np
            import onnxruntime as ort
        except ImportError:
            return {"error": "onnxruntime not available"}

        # Get baseline GPU metrics
        baseline_metrics = self.get_gpu_metrics(device_index)
        if baseline_metrics is None:
            return {"error": "pynvml not available for VRAM measurement"}

        baseline_vram = baseline_metrics.vram_used_bytes

        # Create session with CUDA
        try:
            sess = ort.InferenceSession(
                model_path,
                providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
            )
        except Exception as e:
            return {"error": f"Failed to create session: {e}"}

        # Get input info and create dummy input
        input_info = sess.get_inputs()[0]
        input_shape = list(input_info.shape)
        for i, dim in enumerate(input_shape):
            if not isinstance(dim, int) or dim <= 0:
                if i == 0:
                    input_shape[i] = batch_size
                elif i == 1:
                    input_shape[i] = 3
                else:
                    input_shape[i] = 224
        input_shape[0] = batch_size

        dummy_input = np.random.randn(*input_shape).astype(np.float32)

        # Run inference and measure peak VRAM
        peak_vram = baseline_vram
        vram_samples = []

        for _ in range(num_runs):
            sess.run(None, {input_info.name: dummy_input})
            metrics = self.get_gpu_metrics(device_index)
            if metrics:
                vram_samples.append(metrics.vram_used_bytes)
                if metrics.vram_used_bytes > peak_vram:
                    peak_vram = metrics.vram_used_bytes

        delta_vram = peak_vram - baseline_vram

        return {
            "baseline_vram_gb": round(baseline_vram / (1024**3), 3),
            "peak_vram_gb": round(peak_vram / (1024**3), 3),
            "delta_vram_gb": round(delta_vram / (1024**3), 3),
            "model_vram_estimate_gb": round(delta_vram / (1024**3), 3),
            "batch_size": batch_size,
            "samples": len(vram_samples),
        }

    # =========================================================================
    # Story 9.3: Per-Layer Profiling
    # =========================================================================

    def profile_model(
        self,
        model_path: str,
        batch_size: int = 1,
        num_runs: int = 10,
        device_index: int = 0,
    ) -> ProfilingResult | None:
        """
        Profile model execution with ONNX Runtime's built-in profiler.

        Args:
            model_path: Path to ONNX model
            batch_size: Batch size for profiling
            num_runs: Number of profiling runs
            device_index: GPU device index

        Returns:
            ProfilingResult with per-layer timing data
        """
        try:
            import json
            import os
            import tempfile
            import time

            import onnxruntime as ort
        except ImportError:
            self.logger.warning("onnxruntime not available for profiling")
            return None

        # Create session with profiling enabled
        sess_options = ort.SessionOptions()
        sess_options.enable_profiling = True

        # Use temp directory for profile output
        with tempfile.TemporaryDirectory() as tmpdir:
            sess_options.profile_file_prefix = os.path.join(tmpdir, "ort_profile")

            try:
                sess = ort.InferenceSession(
                    model_path,
                    sess_options=sess_options,
                    providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
                )
            except Exception as e:
                self.logger.error(f"Failed to create profiling session: {e}")
                return None

            # Get ALL inputs (Story 9.6: Multi-input model support)
            input_feed = self._create_input_feed(sess, batch_size)

            # Warmup
            for _ in range(3):
                sess.run(None, input_feed)

            # Profile runs
            start = time.perf_counter()
            for _ in range(num_runs):
                sess.run(None, input_feed)
            total_time_ms = ((time.perf_counter() - start) / num_runs) * 1000

            # End profiling and get the file
            profile_file = sess.end_profiling()

            # Parse profile JSON
            layer_profiles = []
            try:
                with open(profile_file, encoding="utf-8") as f:
                    profile_data = json.load(f)

                for event in profile_data:
                    if event.get("cat") == "Node":
                        name = event.get("name", "")
                        args = event.get("args", {})
                        op_type = args.get("op_name", "Unknown")
                        provider = args.get("provider", "Unknown")
                        dur = event.get("dur", 0)  # Duration in microseconds

                        # Parse shapes from args
                        input_shapes: list[list[int]] = []
                        output_shapes: list[list[int]] = []
                        for key, value in args.items():
                            if key.startswith("input_") and "shape" not in key:
                                continue
                            if "shape" in key.lower():
                                try:
                                    if isinstance(value, str):
                                        # Parse shape string like "[1,3,224,224]"
                                        shape = [
                                            int(x)
                                            for x in value.strip("[]").split(",")
                                            if x.strip()
                                        ]
                                        if "output" in key.lower():
                                            output_shapes.append(shape)
                                        else:
                                            input_shapes.append(shape)
                                except (ValueError, AttributeError):
                                    pass

                        layer_profiles.append(
                            LayerProfile(
                                name=name,
                                op_type=op_type,
                                duration_us=dur,
                                provider=provider,
                                input_shapes=input_shapes,
                                output_shapes=output_shapes,
                            )
                        )
            except Exception as e:
                self.logger.warning(f"Failed to parse profile: {e}")

            # Get GPU metrics
            gpu_metrics = self.get_gpu_metrics(device_index)

            return ProfilingResult(
                total_time_ms=total_time_ms,
                layer_profiles=layer_profiles,
                gpu_metrics=gpu_metrics,
                session_options={"batch_size": batch_size, "num_runs": num_runs},
            )

    # =========================================================================
    # Story 9.4: Bottleneck Detection
    # =========================================================================

    def analyze_bottleneck(
        self,
        model_flops: int,
        profiling_result: ProfilingResult | None,
        hardware: HardwareProfile,
        precision: str = "fp16",
    ) -> BottleneckAnalysis:
        """
        Analyze whether model is compute-bound or memory-bound.

        Uses roofline model principles:
        - Compute-bound: Time dominated by FLOP execution
        - Memory-bound: Time dominated by memory bandwidth

        Args:
            model_flops: Total FLOPs per inference
            profiling_result: Results from profile_model()
            hardware: Target hardware profile
            precision: Precision used ("fp32", "fp16", "int8")

        Returns:
            BottleneckAnalysis with classification and recommendations
        """
        # Get peak theoretical compute
        if precision == "fp32":
            peak_tflops = hardware.peak_fp32_tflops or hardware.peak_fp16_tflops / 2
        elif precision == "int8":
            peak_tflops = hardware.peak_int8_tops or hardware.peak_fp16_tflops * 2
        else:
            peak_tflops = hardware.peak_fp16_tflops

        # Actual latency
        if profiling_result:
            actual_latency_ms = profiling_result.total_time_ms
        else:
            # Estimate from theoretical
            actual_latency_ms = (model_flops / (peak_tflops * 1e12)) * 1000

        # Calculate achieved TFLOPs
        achieved_tflops = (model_flops / actual_latency_ms) / 1e9  # TFLOPS

        # Efficiency
        efficiency = (achieved_tflops / peak_tflops) * 100 if peak_tflops > 0 else 0

        # Estimate memory transfer time
        # Rough estimate: assume model params + activations need to be read
        # Memory bandwidth in bytes/s -> convert to bytes/ms
        mem_bandwidth_bytes_per_ms = hardware.memory_bandwidth_bytes_per_s / 1000  # B/s -> B/ms

        # Estimate memory footprint accessed per inference
        # This is a rough estimate - actual depends on caching, batch size, etc.
        bytes_per_param = 2 if precision == "fp16" else 4 if precision == "fp32" else 1
        # Assume we read all params once + some activation memory
        estimated_memory_bytes = model_flops * bytes_per_param / 1000  # Rough

        memory_time_ms = estimated_memory_bytes / mem_bandwidth_bytes_per_ms

        # Compute time (from achieved throughput)
        compute_time_ms = actual_latency_ms - memory_time_ms
        if compute_time_ms < 0:
            compute_time_ms = actual_latency_ms * 0.5  # Fallback

        # Ratios
        total_time = compute_time_ms + memory_time_ms
        compute_ratio = compute_time_ms / total_time if total_time > 0 else 0.5
        memory_ratio = 1.0 - compute_ratio

        # Classification
        if compute_ratio > 0.7:
            bottleneck_type = "compute-bound"
        elif memory_ratio > 0.7:
            bottleneck_type = "memory-bound"
        else:
            bottleneck_type = "balanced"

        # Recommendations based on bottleneck
        recommendations = []

        if bottleneck_type == "compute-bound":
            recommendations.extend(
                [
                    "Use INT8/FP16 quantization to reduce compute requirements",
                    "Consider model pruning to reduce FLOP count",
                    "Use Tensor Cores (if available) for matrix operations",
                    "Increase batch size to improve GPU utilization",
                ]
            )
            if efficiency < 50:
                recommendations.append(
                    f"GPU utilization is low ({efficiency:.0f}%). "
                    "Check for CPU bottlenecks or data loading issues."
                )
        elif bottleneck_type == "memory-bound":
            recommendations.extend(
                [
                    "Use lower precision (FP16/INT8) to reduce memory bandwidth",
                    "Enable operator fusion to reduce memory round-trips",
                    "Consider tensor compression or activation checkpointing",
                    "Use hardware with higher memory bandwidth",
                ]
            )
        else:  # balanced
            recommendations.extend(
                [
                    "Model has balanced compute/memory characteristics",
                    "Both quantization and bandwidth optimization may help",
                    "Profile individual layers to find specific bottlenecks",
                ]
            )

        # Add efficiency-specific recommendations
        if efficiency < 30:
            recommendations.append(
                "Very low GPU efficiency. Consider using TensorRT or "
                "ONNX Runtime optimization passes."
            )

        return BottleneckAnalysis(
            bottleneck_type=bottleneck_type,
            compute_time_ms=compute_time_ms,
            memory_time_ms=memory_time_ms,
            compute_ratio=compute_ratio,
            memory_ratio=memory_ratio,
            theoretical_peak_tflops=peak_tflops,
            achieved_tflops=achieved_tflops,
            efficiency_percent=efficiency,
            recommendations=recommendations,
        )

    # =========================================================================
    # Story 9.5: Resolution Benchmarking
    # =========================================================================

    def benchmark_resolutions(
        self,
        model_path: str,
        resolutions: list[tuple[int, int]] | None = None,
        batch_size: int = 1,
        num_warmup: int = 5,
        num_runs: int = 20,
    ) -> ResolutionSweep | None:
        """
        Benchmark actual inference performance across resolutions.

        Args:
            model_path: Path to ONNX model
            resolutions: List of (H, W) resolutions to test
            batch_size: Batch size for benchmarking
            num_warmup: Warmup runs before timing
            num_runs: Timed runs per resolution

        Returns:
            ResolutionSweep with measured (not estimated) metrics
        """
        try:
            import time

            import numpy as np
            import onnxruntime as ort
        except ImportError:
            self.logger.warning("onnxruntime not available for benchmarking")
            return None

        if resolutions is None:
            # Default resolutions for vision models
            resolutions = [
                (128, 128),
                (224, 224),
                (256, 256),
                (384, 384),
                (512, 512),
                (640, 640),
            ]

        # Create session
        providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        try:
            sess = ort.InferenceSession(model_path, providers=providers)
        except Exception as e:
            self.logger.error(f"Failed to create session: {e}")
            return None

        input_info = sess.get_inputs()[0]
        input_name = input_info.name

        resolution_strs: list[str] = []
        flops_list: list[int] = []
        memory_gb_list: list[float] = []
        latencies: list[float] = []
        throughputs: list[float] = []
        vram_usage: list[float] = []

        max_throughput = 0.0
        optimal_res = ""
        max_res = ""
        max_pixels = 0

        for h, w in resolutions:
            res_str = f"{h}x{w}"
            resolution_strs.append(res_str)

            # Create input with this resolution
            # Assume [N, C, H, W] format
            input_shape = list(input_info.shape)
            for i, dim in enumerate(input_shape):
                if not isinstance(dim, int) or dim <= 0:
                    if i == 0:
                        input_shape[i] = batch_size
                    elif i == 1:
                        input_shape[i] = 3
                    elif i == 2:
                        input_shape[i] = h
                    elif i == 3:
                        input_shape[i] = w

            try:
                dummy_input = np.random.randn(*input_shape).astype(np.float32)
            except Exception as e:
                self.logger.warning(f"Failed to create input for {res_str}: {e}")
                flops_list.append(0)
                memory_gb_list.append(0.0)
                latencies.append(float("inf"))
                throughputs.append(0.0)
                vram_usage.append(0.0)
                continue

            # Estimate FLOPs (scales quadratically with resolution)
            base_flops = 4_000_000_000  # Rough estimate for 224x224
            scale = (h * w) / (224 * 224)
            flops = int(base_flops * scale)
            flops_list.append(flops)

            # Memory estimate
            memory_gb = dummy_input.nbytes / (1024**3)
            memory_gb_list.append(round(memory_gb, 4))

            # Warmup
            try:
                for _ in range(num_warmup):
                    sess.run(None, {input_name: dummy_input})
            except Exception as e:
                self.logger.warning(f"Resolution {res_str} failed (OOM?): {e}")
                latencies.append(float("inf"))
                throughputs.append(0.0)
                vram_usage.append(0.0)
                continue

            # Benchmark
            run_latencies = []
            for _ in range(num_runs):
                start = time.perf_counter()
                sess.run(None, {input_name: dummy_input})
                end = time.perf_counter()
                run_latencies.append((end - start) * 1000)

            run_latencies.sort()
            p50_latency = run_latencies[len(run_latencies) // 2]
            throughput = (batch_size * 1000.0) / p50_latency

            latencies.append(round(p50_latency, 2))
            throughputs.append(round(throughput, 1))

            # VRAM estimate (or measure with pynvml)
            gpu_metrics = self.get_gpu_metrics()
            if gpu_metrics:
                vram_usage.append(round(gpu_metrics.vram_used_bytes / (1024**3), 3))
            else:
                vram_usage.append(round(dummy_input.nbytes * 2 / (1024**3), 3))

            # Track optimal and max
            pixels = h * w
            if pixels > max_pixels:
                max_pixels = pixels
                max_res = res_str

            if throughput > max_throughput:
                max_throughput = throughput
                optimal_res = res_str

            self.logger.info(
                f"  Resolution {res_str}: latency={p50_latency:.2f}ms, "
                f"throughput={throughput:.1f} inf/s"
            )

        return ResolutionSweep(
            resolutions=resolution_strs,
            flops=flops_list,
            memory_gb=memory_gb_list,
            latencies=latencies,
            throughputs=throughputs,
            vram_usage_gb=vram_usage,
            optimal_resolution=optimal_res or resolution_strs[0],
            max_resolution=max_res or resolution_strs[-1],
        )