haoline 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. haoline/.streamlit/config.toml +10 -0
  2. haoline/__init__.py +248 -0
  3. haoline/analyzer.py +935 -0
  4. haoline/cli.py +2712 -0
  5. haoline/compare.py +811 -0
  6. haoline/compare_visualizations.py +1564 -0
  7. haoline/edge_analysis.py +525 -0
  8. haoline/eval/__init__.py +131 -0
  9. haoline/eval/adapters.py +844 -0
  10. haoline/eval/cli.py +390 -0
  11. haoline/eval/comparison.py +542 -0
  12. haoline/eval/deployment.py +633 -0
  13. haoline/eval/schemas.py +833 -0
  14. haoline/examples/__init__.py +15 -0
  15. haoline/examples/basic_inspection.py +74 -0
  16. haoline/examples/compare_models.py +117 -0
  17. haoline/examples/hardware_estimation.py +78 -0
  18. haoline/format_adapters.py +1001 -0
  19. haoline/formats/__init__.py +123 -0
  20. haoline/formats/coreml.py +250 -0
  21. haoline/formats/gguf.py +483 -0
  22. haoline/formats/openvino.py +255 -0
  23. haoline/formats/safetensors.py +273 -0
  24. haoline/formats/tflite.py +369 -0
  25. haoline/hardware.py +2307 -0
  26. haoline/hierarchical_graph.py +462 -0
  27. haoline/html_export.py +1573 -0
  28. haoline/layer_summary.py +769 -0
  29. haoline/llm_summarizer.py +465 -0
  30. haoline/op_icons.py +618 -0
  31. haoline/operational_profiling.py +1492 -0
  32. haoline/patterns.py +1116 -0
  33. haoline/pdf_generator.py +265 -0
  34. haoline/privacy.py +250 -0
  35. haoline/pydantic_models.py +241 -0
  36. haoline/report.py +1923 -0
  37. haoline/report_sections.py +539 -0
  38. haoline/risks.py +521 -0
  39. haoline/schema.py +523 -0
  40. haoline/streamlit_app.py +2024 -0
  41. haoline/tests/__init__.py +4 -0
  42. haoline/tests/conftest.py +123 -0
  43. haoline/tests/test_analyzer.py +868 -0
  44. haoline/tests/test_compare_visualizations.py +293 -0
  45. haoline/tests/test_edge_analysis.py +243 -0
  46. haoline/tests/test_eval.py +604 -0
  47. haoline/tests/test_format_adapters.py +460 -0
  48. haoline/tests/test_hardware.py +237 -0
  49. haoline/tests/test_hardware_recommender.py +90 -0
  50. haoline/tests/test_hierarchical_graph.py +326 -0
  51. haoline/tests/test_html_export.py +180 -0
  52. haoline/tests/test_layer_summary.py +428 -0
  53. haoline/tests/test_llm_patterns.py +540 -0
  54. haoline/tests/test_llm_summarizer.py +339 -0
  55. haoline/tests/test_patterns.py +774 -0
  56. haoline/tests/test_pytorch.py +327 -0
  57. haoline/tests/test_report.py +383 -0
  58. haoline/tests/test_risks.py +398 -0
  59. haoline/tests/test_schema.py +417 -0
  60. haoline/tests/test_tensorflow.py +380 -0
  61. haoline/tests/test_visualizations.py +316 -0
  62. haoline/universal_ir.py +856 -0
  63. haoline/visualizations.py +1086 -0
  64. haoline/visualize_yolo.py +44 -0
  65. haoline/web.py +110 -0
  66. haoline-0.3.0.dist-info/METADATA +471 -0
  67. haoline-0.3.0.dist-info/RECORD +70 -0
  68. haoline-0.3.0.dist-info/WHEEL +4 -0
  69. haoline-0.3.0.dist-info/entry_points.txt +5 -0
  70. haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
@@ -0,0 +1,1492 @@
1
+ # Copyright (c) 2025 HaoLine Contributors
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Operational profiling and system requirements analysis.
6
+
7
+ This module implements:
8
+ - Batch size scalability analysis (sweeps)
9
+ - System requirements generation (Steam-style min/rec/optimal)
10
+ - Resolution impact analysis (sweeps, recommendations, and real benchmarks)
+ - GPU memory profiling, per-layer profiling, and bottleneck detection
11
+ """
12
+
13
+ from __future__ import annotations
14
+
15
+ import logging
16
+ from dataclasses import dataclass
17
+ from typing import Any
18
+
19
+ from .hardware import (
20
+ HARDWARE_PROFILES,
21
+ HardwareEstimates,
22
+ HardwareEstimator,
23
+ HardwareProfile,
24
+ )
25
+
26
+
27
+ @dataclass
28
+ class BatchSweepPoint:
29
+ """Metrics for a single batch size point."""
30
+
31
+ batch_size: int
32
+ vram_required_bytes: int
33
+ estimated_latency_ms: float
34
+ throughput_fps: float
35
+ compute_utilization: float
36
+ bottleneck: str
37
+ fits_in_vram: bool
38
+
39
+
40
+ @dataclass
41
+ class GPUMetrics:
42
+ """Real-time GPU metrics from pynvml."""
43
+
44
+ vram_used_bytes: int
45
+ vram_total_bytes: int
46
+ gpu_utilization_percent: float
47
+ memory_utilization_percent: float
48
+ temperature_c: int
49
+ power_draw_w: float
50
+
51
+ def to_dict(self) -> dict[str, Any]:
52
+ return {
53
+ "vram_used_gb": round(self.vram_used_bytes / (1024**3), 3),
54
+ "vram_total_gb": round(self.vram_total_bytes / (1024**3), 1),
55
+ "gpu_utilization_percent": self.gpu_utilization_percent,
56
+ "memory_utilization_percent": self.memory_utilization_percent,
57
+ "temperature_c": self.temperature_c,
58
+ "power_draw_w": self.power_draw_w,
59
+ }
60
+
61
+
62
+ @dataclass
63
+ class LayerProfile:
64
+ """Profiling data for a single layer/operator."""
65
+
66
+ name: str
67
+ op_type: str
68
+ duration_us: float # Microseconds
69
+ provider: str # e.g., "CUDAExecutionProvider"
70
+ input_shapes: list[list[int]]
71
+ output_shapes: list[list[int]]
72
+
73
+ @property
74
+ def duration_ms(self) -> float:
75
+ return self.duration_us / 1000.0
76
+
77
+ def to_dict(self) -> dict[str, Any]:
78
+ return {
79
+ "name": self.name,
80
+ "op_type": self.op_type,
81
+ "duration_ms": round(self.duration_ms, 3),
82
+ "provider": self.provider,
83
+ "input_shapes": self.input_shapes,
84
+ "output_shapes": self.output_shapes,
85
+ }
86
+
87
+
88
+ @dataclass
89
+ class ProfilingResult:
90
+ """Complete profiling results from ONNX Runtime."""
91
+
92
+ total_time_ms: float
93
+ layer_profiles: list[LayerProfile]
94
+ gpu_metrics: GPUMetrics | None
95
+ session_options: dict[str, Any]
96
+
97
+ def get_slowest_layers(self, top_n: int = 10) -> list[LayerProfile]:
98
+ """Get the N slowest layers by execution time."""
99
+ return sorted(self.layer_profiles, key=lambda x: -x.duration_us)[:top_n]
100
+
101
+ def get_time_by_op_type(self) -> dict[str, float]:
102
+ """Aggregate execution time by operator type."""
103
+ time_by_op: dict[str, float] = {}
104
+ for layer in self.layer_profiles:
105
+ time_by_op[layer.op_type] = time_by_op.get(layer.op_type, 0) + layer.duration_ms
106
+ return dict(sorted(time_by_op.items(), key=lambda x: -x[1]))
107
+
108
+ def to_dict(self) -> dict[str, Any]:
109
+ return {
110
+ "total_time_ms": round(self.total_time_ms, 3),
111
+ "layer_count": len(self.layer_profiles),
112
+ "slowest_layers": [lp.to_dict() for lp in self.get_slowest_layers()],
113
+ "time_by_op_type": {k: round(v, 3) for k, v in self.get_time_by_op_type().items()},
114
+ "gpu_metrics": self.gpu_metrics.to_dict() if self.gpu_metrics else None,
115
+ }
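For illustration, a minimal sketch of how the `LayerProfile`/`ProfilingResult` dataclasses above can be used on their own (import path assumed to mirror the file layout, `haoline.operational_profiling`; the layer names and timings are made up):

```python
from haoline.operational_profiling import LayerProfile, ProfilingResult

# Two hand-built layer records (values are illustrative only).
conv = LayerProfile(
    name="/backbone/conv1/Conv",
    op_type="Conv",
    duration_us=850.0,
    provider="CUDAExecutionProvider",
    input_shapes=[[1, 3, 224, 224]],
    output_shapes=[[1, 64, 112, 112]],
)
gemm = LayerProfile(
    name="/head/fc/Gemm",
    op_type="Gemm",
    duration_us=120.0,
    provider="CUDAExecutionProvider",
    input_shapes=[[1, 2048]],
    output_shapes=[[1, 1000]],
)

result = ProfilingResult(
    total_time_ms=1.2,
    layer_profiles=[conv, gemm],
    gpu_metrics=None,
    session_options={"batch_size": 1},
)

print(result.get_slowest_layers(top_n=1)[0].name)  # "/backbone/conv1/Conv"
print(result.get_time_by_op_type())                # {"Conv": 0.85, "Gemm": 0.12}
```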
116
+
117
+
118
+ @dataclass
119
+ class BottleneckAnalysis:
120
+ """Analysis of model performance bottlenecks."""
121
+
122
+ bottleneck_type: str # "compute-bound", "memory-bound", "balanced"
123
+ compute_time_ms: float
124
+ memory_time_ms: float # Estimated memory transfer time
125
+ compute_ratio: float # Fraction of time spent in compute
126
+ memory_ratio: float # Fraction of time spent in memory ops
127
+ theoretical_peak_tflops: float
128
+ achieved_tflops: float
129
+ efficiency_percent: float
130
+ recommendations: list[str]
131
+
132
+ def to_dict(self) -> dict[str, Any]:
133
+ return {
134
+ "bottleneck_type": self.bottleneck_type,
135
+ "compute_time_ms": round(self.compute_time_ms, 3),
136
+ "memory_time_ms": round(self.memory_time_ms, 3),
137
+ "compute_ratio": round(self.compute_ratio, 2),
138
+ "memory_ratio": round(self.memory_ratio, 2),
139
+ "theoretical_peak_tflops": round(self.theoretical_peak_tflops, 2),
140
+ "achieved_tflops": round(self.achieved_tflops, 4),
141
+ "efficiency_percent": round(self.efficiency_percent, 1),
142
+ "recommendations": self.recommendations,
143
+ }
144
+
145
+
146
+ @dataclass
147
+ class ResolutionPoint:
148
+ """Metrics for a single resolution point."""
149
+
150
+ resolution: tuple[int, int]
151
+ resolution_str: str # e.g., "224x224"
152
+ flops: int
153
+ memory_bytes: int
154
+ vram_required_bytes: int
155
+ estimated_latency_ms: float
156
+ throughput_fps: float
157
+ fits_in_vram: bool
158
+
159
+
160
+ @dataclass
161
+ class ResolutionSweep:
162
+ """Results of a resolution sweep analysis."""
163
+
164
+ resolutions: list[str] # ["224x224", "384x384", ...]
165
+ flops: list[int]
166
+ memory_gb: list[float]
167
+ latencies: list[float]
168
+ throughputs: list[float]
169
+ vram_usage_gb: list[float]
170
+ optimal_resolution: str
171
+ max_resolution: str # Largest resolution that fits in VRAM
172
+
173
+ def to_dict(self) -> dict[str, Any]:
174
+ return {
175
+ "resolutions": self.resolutions,
176
+ "flops": self.flops,
177
+ "memory_gb": self.memory_gb,
178
+ "latencies": self.latencies,
179
+ "throughputs": self.throughputs,
180
+ "vram_usage_gb": self.vram_usage_gb,
181
+ "optimal_resolution": self.optimal_resolution,
182
+ "max_resolution": self.max_resolution,
183
+ }
184
+
185
+
186
+ @dataclass
187
+ class BatchSizeSweep:
188
+ """Results of a batch size sweep analysis."""
189
+
190
+ batch_sizes: list[int]
191
+ latencies: list[float]
192
+ throughputs: list[float]
193
+ vram_usage_gb: list[float]
194
+ optimal_batch_size: int
195
+
196
+ def to_dict(self) -> dict[str, Any]:
197
+ return {
198
+ "batch_sizes": self.batch_sizes,
199
+ "latencies": self.latencies,
200
+ "throughputs": self.throughputs,
201
+ "vram_usage_gb": self.vram_usage_gb,
202
+ "optimal_batch_size": self.optimal_batch_size,
203
+ }
204
+
205
+
206
+ @dataclass
207
+ class SystemRequirements:
208
+ """Recommended hardware tiers for deployment.
209
+
210
+ This is a lightweight, report-friendly wrapper around :class:`HardwareEstimates`.
211
+ It deliberately mirrors the older `SystemRequirements` helper in `hardware.py`,
212
+ exposing `minimum_gpu`, `recommended_gpu`, and `optimal_gpu` style attributes so
213
+ existing report/HTML code (and the accompanying mental model) continues to work.
214
+ """
215
+
216
+ # Core estimates for each tier
217
+ minimum: HardwareEstimates | None # The lowest spec that runs it
218
+ recommended: HardwareEstimates | None # Good balance of cost/perf
219
+ optimal: HardwareEstimates | None # Maximum performance
220
+
221
+ def to_dict(self) -> dict[str, Any]:
222
+ return {
223
+ "minimum": self.minimum.to_dict() if self.minimum else None,
224
+ "recommended": self.recommended.to_dict() if self.recommended else None,
225
+ "optimal": self.optimal.to_dict() if self.optimal else None,
226
+ }
227
+
228
+ # Backwards/HTML-friendly convenience properties ---------------------
229
+ #
230
+ # These keep the `reqs.minimum_gpu.name` / `reqs.minimum_vram_gb` style
231
+ # access patterns working in `report.py` and HTML templates without
232
+ # duplicating all the shape logic here.
233
+
234
+ @property
235
+ def minimum_gpu(self) -> HardwareEstimates | None:
236
+ return self.minimum
237
+
238
+ @property
239
+ def recommended_gpu(self) -> HardwareEstimates | None:
240
+ return self.recommended
241
+
242
+ @property
243
+ def optimal_gpu(self) -> HardwareEstimates | None:
244
+ return self.optimal
245
+
246
+ @staticmethod
247
+ def _vram_gb(est: HardwareEstimates | None) -> float | None:
248
+ if not est:
249
+ return None
250
+ return round(est.vram_required_bytes / (1024**3), 2)
251
+
252
+ @property
253
+ def minimum_vram_gb(self) -> float | None:
254
+ return self._vram_gb(self.minimum)
255
+
256
+ @property
257
+ def recommended_vram_gb(self) -> float | None:
258
+ return self._vram_gb(self.recommended)
259
+
260
+
261
+ class OperationalProfiler:
262
+ """
263
+ Analyzes model operational characteristics: batch and resolution scaling, system requirement tiers, GPU metrics, per-layer profiling, and bottleneck detection.
264
+ """
265
+
266
+ def __init__(self, logger: logging.Logger | None = None):
267
+ self.logger = logger or logging.getLogger("haoline.profiler")
268
+ self.hw_estimator = HardwareEstimator(logger=self.logger)
269
+
270
+ def _create_input_feed(
271
+ self,
272
+ sess: Any,
273
+ batch_size: int = 1,
274
+ seq_len: int = 128,
275
+ ) -> dict[str, Any]:
276
+ """
277
+ Create input feed dict for all model inputs (Story 9.6).
278
+
279
+ Handles multi-input models like BERT, LLMs, and multimodal models.
280
+
281
+ Args:
282
+ sess: ONNX Runtime InferenceSession
283
+ batch_size: Batch size for inputs
284
+ seq_len: Sequence length for text inputs (default: 128)
285
+
286
+ Returns:
287
+ Dict mapping input names to numpy arrays
288
+ """
289
+ import numpy as np
290
+
291
+ input_feed = {}
292
+
293
+ for inp in sess.get_inputs():
294
+ name = inp.name
295
+ shape = list(inp.shape)
296
+ dtype_str = inp.type # e.g., "tensor(float)", "tensor(int64)"
297
+
298
+ # Determine numpy dtype from ONNX type
299
+ np_dtype: type[np.generic]
300
+ if "int64" in dtype_str:
301
+ np_dtype = np.int64
302
+ is_text = True
303
+ elif "int32" in dtype_str:
304
+ np_dtype = np.int32
305
+ is_text = True
306
+ elif "float16" in dtype_str:
307
+ np_dtype = np.float16
308
+ is_text = False
309
+ elif "bool" in dtype_str:
310
+ np_dtype = np.bool_
311
+ is_text = False
312
+ else:
313
+ np_dtype = np.float32
314
+ is_text = False
315
+
316
+ # Resolve dynamic dimensions
317
+ resolved_shape = []
318
+ for i, dim in enumerate(shape):
319
+ if isinstance(dim, int) and dim > 0:
320
+ resolved_shape.append(dim)
321
+ elif i == 0:
322
+ # Batch dimension
323
+ resolved_shape.append(batch_size)
324
+ elif is_text:
325
+ # Text models: sequence length
326
+ resolved_shape.append(seq_len)
327
+ elif len(shape) == 4 and i == 1:
328
+ # Vision models: channels
329
+ resolved_shape.append(3)
330
+ else:
331
+ # Vision models: spatial dims
332
+ resolved_shape.append(224)
333
+
334
+ # Generate appropriate dummy data
335
+ if is_text:
336
+ # Token IDs: random integers in typical vocab range
337
+ # numpy stubs are overly strict about randint dtype
338
+ dummy: np.ndarray = np.random.randint(0, 30000, size=resolved_shape, dtype=np_dtype) # type: ignore[arg-type]
339
+ elif np_dtype == np.bool_:
340
+ # Boolean masks
341
+ dummy = np.ones(resolved_shape, dtype=np_dtype)
342
+ else:
343
+ # Continuous values (vision, etc.)
344
+ dummy = np.random.randn(*resolved_shape).astype(np_dtype)
345
+
346
+ input_feed[name] = dummy
347
+
348
+ return input_feed
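A quick sketch of what `_create_input_feed` produces; `"model.onnx"` is a placeholder path, and calling the underscore-prefixed helper directly is for illustration only (it is normally invoked internally by `profile_model`):

```python
import onnxruntime as ort

from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
sess = ort.InferenceSession("model.onnx")  # placeholder: any ONNX model on disk

# Dynamic batch/sequence/spatial dims are filled in with batch_size, seq_len, or 224/3.
feed = profiler._create_input_feed(sess, batch_size=2, seq_len=64)
for name, arr in feed.items():
    print(name, arr.shape, arr.dtype)
```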
349
+
350
+ def run_batch_sweep(
351
+ self,
352
+ model_params: int,
353
+ model_flops: int,
354
+ peak_activation_bytes: int,
355
+ hardware: HardwareProfile,
356
+ batch_sizes: list[int] | None = None,
357
+ precision: str = "fp16",
358
+ ) -> BatchSizeSweep:
359
+ """
360
+ Analyze performance scaling across batch sizes.
361
+
362
+ Args:
363
+ model_params: Total parameters
364
+ model_flops: FLOPs per inference (batch=1)
365
+ peak_activation_bytes: Peak activation memory (batch=1)
366
+ hardware: Target hardware profile
367
+ batch_sizes: List of batch sizes to test (default: powers of 2)
368
+ precision: Precision to simulate ("fp32", "fp16", "int8")
369
+
370
+ Returns:
371
+ BatchSizeSweep results
372
+ """
373
+ if batch_sizes is None:
374
+ batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
375
+
376
+ latencies = []
377
+ throughputs = []
378
+ vram_usage = []
379
+ optimal_bs = 1
380
+ max_throughput = 0.0
381
+
382
+ for bs in batch_sizes:
383
+ est = self.hw_estimator.estimate(
384
+ model_params=model_params,
385
+ model_flops=model_flops,
386
+ peak_activation_bytes=peak_activation_bytes,
387
+ hardware=hardware,
388
+ batch_size=bs,
389
+ precision=precision,
390
+ )
391
+
392
+ # Calculate throughput (inferences per second)
393
+ # If latency is infinite (OOM), throughput is 0
394
+ throughput = 0.0
395
+ latency = float("inf")
396
+ vram_gb = est.vram_required_bytes / (1024**3)
397
+
398
+ if est.theoretical_latency_ms > 0 and est.fits_in_vram:
399
+ latency = est.theoretical_latency_ms
400
+ throughput = (1000.0 / latency) * bs
401
+
402
+ if throughput > max_throughput:
403
+ max_throughput = throughput
404
+ optimal_bs = bs
405
+
406
+ latencies.append(latency)
407
+ throughputs.append(throughput)
408
+ vram_usage.append(vram_gb)
409
+
410
+ return BatchSizeSweep(
411
+ batch_sizes=batch_sizes,
412
+ latencies=latencies,
413
+ throughputs=throughputs,
414
+ vram_usage_gb=vram_usage,
415
+ optimal_batch_size=optimal_bs,
416
+ )
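As a usage sketch of the analytical sweep above: the model numbers below are illustrative (roughly ResNet-50-sized), and the hardware profile is just the first non-CPU entry in `HARDWARE_PROFILES` (assuming at least one GPU profile is registered):

```python
from haoline.hardware import HARDWARE_PROFILES
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
gpu_profiles = {k: p for k, p in HARDWARE_PROFILES.items() if p.device_type != "cpu"}
name, profile = next(iter(gpu_profiles.items()))

sweep = profiler.run_batch_sweep(
    model_params=25_600_000,               # illustrative parameter count
    model_flops=4_100_000_000,             # illustrative FLOPs per inference (batch=1)
    peak_activation_bytes=100 * 1024**2,   # illustrative peak activation memory
    hardware=profile,
    precision="fp16",
)

print(f"{name}: optimal batch size = {sweep.optimal_batch_size}")
for bs, lat, fps in zip(sweep.batch_sizes, sweep.latencies, sweep.throughputs):
    print(f"  batch={bs:>3}  latency={lat:.2f} ms  throughput={fps:.1f} inf/s")
```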
417
+
418
+ def run_batch_sweep_benchmark(
419
+ self,
420
+ model_path: str,
421
+ batch_sizes: list[int] | None = None,
422
+ num_warmup: int = 5,
423
+ num_runs: int = 20,
424
+ ) -> BatchSizeSweep | None:
425
+ """
426
+ Benchmark actual inference performance across batch sizes.
427
+
428
+ Uses ONNX Runtime to measure real latency and throughput.
429
+ Requires onnxruntime to be installed.
430
+
431
+ Args:
432
+ model_path: Path to ONNX model file
433
+ batch_sizes: List of batch sizes to test (default: powers of 2)
434
+ num_warmup: Number of warmup runs before timing
435
+ num_runs: Number of timed runs per batch size
436
+
437
+ Returns:
438
+ BatchSizeSweep with measured (not estimated) metrics
439
+ """
440
+ try:
441
+ import numpy as np
442
+ import onnxruntime as ort
443
+ except ImportError:
444
+ self.logger.warning("onnxruntime not available, falling back to estimates")
445
+ return None
446
+
447
+ if batch_sizes is None:
448
+ batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
449
+
450
+ # Create session
451
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
452
+ try:
453
+ sess = ort.InferenceSession(model_path, providers=providers)
454
+ except Exception as e:
455
+ self.logger.error(f"Failed to load model for benchmarking: {e}")
456
+ return None
457
+
458
+ active_provider = sess.get_providers()[0]
459
+ self.logger.info(f"Benchmarking with {active_provider}")
460
+
461
+ # Get ALL input info (Story 9.6: Multi-input model support)
462
+ all_inputs = sess.get_inputs()
463
+ input_specs = [] # List of (name, shape_template, dtype, is_text)
464
+
465
+ for inp in all_inputs:
466
+ name = inp.name
467
+ shape = list(inp.shape)
468
+ dtype_str = inp.type # e.g., "tensor(float)", "tensor(int64)"
469
+
470
+ # Determine numpy dtype
471
+ np_dtype: type[np.generic]
472
+ if "int64" in dtype_str:
473
+ np_dtype = np.int64
474
+ is_text = True # Likely token IDs
475
+ elif "int32" in dtype_str:
476
+ np_dtype = np.int32
477
+ is_text = True
478
+ elif "float16" in dtype_str:
479
+ np_dtype = np.float16
480
+ is_text = False
481
+ else:
482
+ np_dtype = np.float32
483
+ is_text = False
484
+
485
+ # Resolve dynamic dimensions with sensible defaults
486
+ resolved_shape = []
487
+ for i, dim in enumerate(shape):
488
+ if isinstance(dim, int) and dim > 0:
489
+ resolved_shape.append(dim)
490
+ elif i == 0:
491
+ resolved_shape.append(1) # Batch dim, replaced per iteration
492
+ elif is_text:
493
+ # Text models: sequence length
494
+ resolved_shape.append(128) # Default seq_len
495
+ elif len(shape) == 4 and i == 1:
496
+ resolved_shape.append(3) # Channels for vision
497
+ else:
498
+ resolved_shape.append(224) # Spatial dims for vision
499
+
500
+ input_specs.append((name, resolved_shape, np_dtype, is_text))
501
+ self.logger.debug(
502
+ f" Input '{name}': shape={resolved_shape}, dtype={np_dtype.__name__}"
503
+ )
504
+
505
+ self.logger.info(f"Model has {len(input_specs)} input(s)")
506
+
507
+ latencies = []
508
+ throughputs = []
509
+ vram_usage = []
510
+ optimal_bs = 1
511
+ max_throughput = 0.0
512
+
513
+ for bs in batch_sizes:
514
+ # Create input feed for ALL inputs
515
+ input_feed = {}
516
+ total_bytes = 0
517
+
518
+ try:
519
+ for name, shape_template, np_dtype, is_text in input_specs:
520
+ # Set batch size
521
+ shape = shape_template.copy()
522
+ shape[0] = bs
523
+
524
+ # Generate appropriate dummy data
525
+ if is_text:
526
+ # Token IDs: random integers in vocab range
527
+ dummy: np.ndarray = np.random.randint(0, 30000, size=shape, dtype=np_dtype) # type: ignore[arg-type]
528
+ else:
529
+ # Vision/continuous: random floats
530
+ dummy = np.random.randn(*shape).astype(np_dtype)
531
+
532
+ input_feed[name] = dummy
533
+ total_bytes += dummy.nbytes
534
+
535
+ except Exception as e:
536
+ self.logger.warning(f"Failed to create inputs for batch {bs}: {e}")
537
+ latencies.append(float("inf"))
538
+ throughputs.append(0.0)
539
+ vram_usage.append(0.0)
540
+ continue
541
+
542
+ # Warmup
543
+ try:
544
+ for _ in range(num_warmup):
545
+ sess.run(None, input_feed)
546
+ except Exception as e:
547
+ self.logger.warning(f"Batch {bs} failed (OOM?): {e}")
548
+ latencies.append(float("inf"))
549
+ throughputs.append(0.0)
550
+ vram_usage.append(0.0)
551
+ continue
552
+
553
+ # Benchmark
554
+ import time
555
+
556
+ run_latencies = []
557
+ for _ in range(num_runs):
558
+ start = time.perf_counter()
559
+ sess.run(None, input_feed)
560
+ end = time.perf_counter()
561
+ run_latencies.append((end - start) * 1000) # ms
562
+
563
+ # Use median latency (more stable than mean)
564
+ run_latencies.sort()
565
+ p50_latency = run_latencies[len(run_latencies) // 2]
566
+ throughput = (bs * 1000.0) / p50_latency
567
+
568
+ latencies.append(round(p50_latency, 2))
569
+ throughputs.append(round(throughput, 1))
570
+
571
+ # VRAM: try to measure with pynvml, fall back to estimate
572
+ gpu_metrics = self.get_gpu_metrics()
573
+ if gpu_metrics:
574
+ vram_gb = gpu_metrics.vram_used_bytes / (1024**3)
575
+ else:
576
+ # Estimate: total input bytes * 10 for activations
577
+ vram_gb = (total_bytes * 10) / (1024**3)
578
+ vram_usage.append(round(vram_gb, 3))
579
+
580
+ if throughput > max_throughput:
581
+ max_throughput = throughput
582
+ optimal_bs = bs
583
+
584
+ self.logger.info(
585
+ f" Batch {bs}: latency={p50_latency:.2f}ms, throughput={throughput:.1f} inf/s"
586
+ )
587
+
588
+ return BatchSizeSweep(
589
+ batch_sizes=batch_sizes,
590
+ latencies=latencies,
591
+ throughputs=throughputs,
592
+ vram_usage_gb=vram_usage,
593
+ optimal_batch_size=optimal_bs,
594
+ )
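A minimal sketch of the measured (as opposed to estimated) sweep; `"model.onnx"` is a placeholder path, and the method returns `None` when onnxruntime is missing or the model fails to load:

```python
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
sweep = profiler.run_batch_sweep_benchmark(
    "model.onnx",              # placeholder: path to a real ONNX model
    batch_sizes=[1, 2, 4, 8],
    num_warmup=3,
    num_runs=10,
)

if sweep is None:
    print("onnxruntime unavailable or model failed to load; no measurements")
else:
    print(sweep.to_dict())     # median latencies, throughputs, VRAM samples per batch size
```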
595
+
596
+ def run_resolution_sweep(
597
+ self,
598
+ base_flops: int,
599
+ base_activation_bytes: int,
600
+ base_resolution: tuple[int, int],
601
+ model_params: int,
602
+ hardware: HardwareProfile,
603
+ resolutions: list[tuple[int, int]] | None = None,
604
+ batch_size: int = 1,
605
+ precision: str = "fp16",
606
+ ) -> ResolutionSweep:
607
+ """
608
+ Analyze performance scaling across input resolutions.
609
+
610
+ For vision models, FLOPs and memory scale approximately quadratically
611
+ with resolution (for most architectures like ResNet, ViT, YOLO).
612
+
613
+ Args:
614
+ base_flops: FLOPs at base_resolution
615
+ base_activation_bytes: Activation memory at base_resolution
616
+ base_resolution: The resolution used for base measurements (H, W)
617
+ model_params: Total parameters (doesn't change with resolution)
618
+ hardware: Target hardware profile
619
+ resolutions: List of (H, W) resolutions to test
620
+ batch_size: Batch size for estimates
621
+ precision: Precision ("fp32", "fp16", "int8")
622
+
623
+ Returns:
624
+ ResolutionSweep results
625
+ """
626
+ base_h, base_w = base_resolution
627
+ base_pixels = base_h * base_w
628
+ base_aspect = base_w / base_h if base_h > 0 else 1.0
629
+
630
+ if resolutions is None:
631
+ # Generate resolutions that:
632
+ # 1. Match the aspect ratio of training data
633
+ # 2. Only go UP TO (not above) the training resolution
634
+ # Running above training resolution typically produces poor results
635
+ resolutions = []
636
+
637
+ # Candidate sizes (square case) / scale factors (non-square case), never above the base resolution
638
+ if base_aspect == 1.0:
639
+ # Square aspect ratio
640
+ candidates = [
641
+ 128,
642
+ 160,
643
+ 192,
644
+ 224,
645
+ 256,
646
+ 320,
647
+ 384,
648
+ 416,
649
+ 448,
650
+ 512,
651
+ 640,
652
+ 768,
653
+ 1024,
654
+ ]
655
+ for size in candidates:
656
+ if size <= base_h:
657
+ resolutions.append((size, size))
658
+ else:
659
+ # Non-square: generate resolutions matching aspect ratio
660
+ scale_factors = [0.25, 0.375, 0.5, 0.625, 0.75, 0.875, 1.0]
661
+ for scale in scale_factors:
662
+ h = int(base_h * scale)
663
+ w = int(base_w * scale)
664
+ # Round to nearest 32 for GPU efficiency
665
+ h = max(32, (h // 32) * 32)
666
+ w = max(32, (w // 32) * 32)
667
+ if h <= base_h and w <= base_w and (h, w) not in resolutions:
668
+ resolutions.append((h, w))
669
+
670
+ # Always include the base resolution
671
+ if base_resolution not in resolutions:
672
+ resolutions.append(base_resolution)
673
+
674
+ # Sort by pixel count
675
+ resolutions.sort(key=lambda r: r[0] * r[1])
676
+
677
+ resolution_strs = []
678
+ flops_list = []
679
+ memory_gb_list = []
680
+ latencies = []
681
+ throughputs = []
682
+ vram_usage = []
683
+ optimal_res = f"{base_h}x{base_w}"
684
+ max_res = f"{base_h}x{base_w}"
685
+ max_throughput = 0.0
686
+ max_fitting_pixels = 0
687
+
688
+ for h, w in resolutions:
689
+ res_str = f"{h}x{w}"
690
+ resolution_strs.append(res_str)
691
+
692
+ # Scale FLOPs and memory quadratically with resolution
693
+ pixels = h * w
694
+ scale_factor = pixels / base_pixels
695
+
696
+ scaled_flops = int(base_flops * scale_factor)
697
+ scaled_activation = int(base_activation_bytes * scale_factor)
698
+
699
+ flops_list.append(scaled_flops)
700
+ memory_gb_list.append(scaled_activation / (1024**3))
701
+
702
+ # Get hardware estimates for this resolution
703
+ est = self.hw_estimator.estimate(
704
+ model_params=model_params,
705
+ model_flops=scaled_flops,
706
+ peak_activation_bytes=scaled_activation,
707
+ hardware=hardware,
708
+ batch_size=batch_size,
709
+ precision=precision,
710
+ )
711
+
712
+ vram_gb = est.vram_required_bytes / (1024**3)
713
+ vram_usage.append(vram_gb)
714
+
715
+ if est.fits_in_vram and est.theoretical_latency_ms > 0:
716
+ latency = est.theoretical_latency_ms
717
+ throughput = (1000.0 / latency) * batch_size
718
+
719
+ latencies.append(latency)
720
+ throughputs.append(throughput)
721
+
722
+ # Track max resolution that fits
723
+ if pixels > max_fitting_pixels:
724
+ max_fitting_pixels = pixels
725
+ max_res = res_str
726
+
727
+ # Track optimal (highest throughput)
728
+ if throughput > max_throughput:
729
+ max_throughput = throughput
730
+ optimal_res = res_str
731
+ else:
732
+ latencies.append(float("inf"))
733
+ throughputs.append(0.0)
734
+
735
+ return ResolutionSweep(
736
+ resolutions=resolution_strs,
737
+ flops=flops_list,
738
+ memory_gb=memory_gb_list,
739
+ latencies=latencies,
740
+ throughputs=throughputs,
741
+ vram_usage_gb=vram_usage,
742
+ optimal_resolution=optimal_res,
743
+ max_resolution=max_res,
744
+ )
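For illustration, driving `run_resolution_sweep` for a 640x640 detector-style model; the FLOP/memory/parameter numbers are made up, and the hardware profile is again the first non-CPU entry in `HARDWARE_PROFILES`:

```python
from haoline.hardware import HARDWARE_PROFILES
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
gpu_profiles = {k: p for k, p in HARDWARE_PROFILES.items() if p.device_type != "cpu"}
profile = next(iter(gpu_profiles.values()))

sweep = profiler.run_resolution_sweep(
    base_flops=8_200_000_000,              # illustrative FLOPs at 640x640
    base_activation_bytes=300 * 1024**2,   # illustrative activation memory at 640x640
    base_resolution=(640, 640),
    model_params=11_000_000,               # illustrative parameter count
    hardware=profile,
    precision="fp16",
)

print("optimal:", sweep.optimal_resolution, "| largest that fits:", sweep.max_resolution)
```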
745
+
746
+ def recommend_resolution(
747
+ self,
748
+ base_flops: int,
749
+ base_activation_bytes: int,
750
+ base_resolution: tuple[int, int],
751
+ model_params: int,
752
+ hardware: HardwareProfile,
753
+ target_fps: float = 30.0,
754
+ batch_size: int = 1,
755
+ precision: str = "fp16",
756
+ ) -> dict[str, Any]:
757
+ """
758
+ Recommend optimal resolution for target hardware and latency requirements.
759
+
760
+ Task 6.8.5: Resolution recommendations for target hardware
761
+
762
+ Args:
763
+ base_flops: FLOPs at base_resolution
764
+ base_activation_bytes: Activation memory at base_resolution
765
+ base_resolution: The resolution used for base measurements (H, W)
766
+ model_params: Total parameters
767
+ hardware: Target hardware profile
768
+ target_fps: Desired frames per second (default: 30 fps)
769
+ batch_size: Batch size
770
+ precision: Precision for estimates
771
+
772
+ Returns:
773
+ Dict with recommended_resolution, max_resolution, and rationale
774
+ """
775
+ target_latency_ms = 1000.0 / target_fps
776
+
777
+ # Run sweep with common resolutions
778
+ sweep = self.run_resolution_sweep(
779
+ base_flops=base_flops,
780
+ base_activation_bytes=base_activation_bytes,
781
+ base_resolution=base_resolution,
782
+ model_params=model_params,
783
+ hardware=hardware,
784
+ batch_size=batch_size,
785
+ precision=precision,
786
+ )
787
+
788
+ # Find resolution that meets target FPS
789
+ recommended = None
790
+ recommended_idx = -1
791
+ for i, (res, lat) in enumerate(zip(sweep.resolutions, sweep.latencies, strict=False)):
792
+ if lat != float("inf") and lat <= target_latency_ms:
793
+ recommended = res
794
+ recommended_idx = i
795
+
796
+ # Build recommendation rationale
797
+ rationale_parts = []
798
+
799
+ if recommended:
800
+ rationale_parts.append(
801
+ f"Resolution **{recommended}** meets {target_fps} FPS target "
802
+ f"({sweep.latencies[recommended_idx]:.1f}ms latency)."
803
+ )
804
+ else:
805
+ # Find closest resolution that fits
806
+ for i, (res, lat) in enumerate(zip(sweep.resolutions, sweep.latencies, strict=False)):
807
+ if lat != float("inf"):
808
+ recommended = res
809
+ recommended_idx = i
810
+ break
811
+
812
+ if recommended:
813
+ actual_fps = 1000.0 / sweep.latencies[recommended_idx]
814
+ rationale_parts.append(
815
+ f"Cannot meet {target_fps} FPS. Best achievable: "
816
+ f"**{recommended}** at {actual_fps:.1f} FPS."
817
+ )
818
+ else:
819
+ rationale_parts.append("No resolution fits in available VRAM.")
820
+
821
+ if sweep.max_resolution and sweep.max_resolution != recommended:
822
+ rationale_parts.append(
823
+ f"Maximum resolution that fits in VRAM: **{sweep.max_resolution}**."
824
+ )
825
+
826
+ return {
827
+ "recommended_resolution": recommended,
828
+ "max_resolution": sweep.max_resolution,
829
+ "optimal_resolution": sweep.optimal_resolution,
830
+ "target_fps": target_fps,
831
+ "achievable_fps": (
832
+ 1000.0 / sweep.latencies[recommended_idx]
833
+ if recommended and recommended_idx >= 0
834
+ else 0.0
835
+ ),
836
+ "rationale": " ".join(rationale_parts),
837
+ "sweep_results": sweep.to_dict(),
838
+ }
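And the higher-level recommendation wrapper, under the same illustrative assumptions (made-up model numbers, first non-CPU profile from `HARDWARE_PROFILES`):

```python
from haoline.hardware import HARDWARE_PROFILES
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
gpu_profiles = {k: p for k, p in HARDWARE_PROFILES.items() if p.device_type != "cpu"}
profile = next(iter(gpu_profiles.values()))

rec = profiler.recommend_resolution(
    base_flops=8_200_000_000,              # illustrative
    base_activation_bytes=300 * 1024**2,   # illustrative
    base_resolution=(640, 640),
    model_params=11_000_000,               # illustrative
    hardware=profile,
    target_fps=60.0,
)

print(rec["recommended_resolution"], f'{rec["achievable_fps"]:.1f} FPS')
print(rec["rationale"])
```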
839
+
840
+ def determine_system_requirements(
841
+ self,
842
+ model_params: int,
843
+ model_flops: int,
844
+ peak_activation_bytes: int,
845
+ precision: str = "fp16",
846
+ target_fps: float = 30.0, # For "Recommended" tier
847
+ ) -> SystemRequirements:
848
+ """
849
+ Find suitable hardware tiers ("Steam-style" requirements).
850
+
851
+ Strategy:
852
+ - Minimum: Cheapest hardware that fits the model in VRAM (Batch=1)
853
+ - Recommended: Cheapest hardware that hits target_fps (Batch=1); failing that, one roughly 4x faster than Minimum, else Minimum
854
+ - Optimal: Hardware providing highest throughput/lowest latency
855
+ """
856
+ candidates = []
857
+
858
+ # Evaluate against all known profiles
859
+ # Filter out mobile/multi-gpu for cleaner list, or keep them?
860
+ # Let's keep single-GPU desktops/servers for simplicity of recommendation
861
+ for name, profile in HARDWARE_PROFILES.items():
862
+ # Skip generic CPU for this analysis unless it's the only option
863
+ if profile.device_type == "cpu":
864
+ continue
865
+
866
+ # Skip mobile variants to keep list clean (optional)
867
+ if "mobile" in name:
868
+ continue
869
+
870
+ est = self.hw_estimator.estimate(
871
+ model_params=model_params,
872
+ model_flops=model_flops,
873
+ peak_activation_bytes=peak_activation_bytes,
874
+ hardware=profile,
875
+ batch_size=1,
876
+ precision=precision,
877
+ )
878
+ candidates.append((profile, est))
879
+
880
+ if not candidates:
881
+ return SystemRequirements(None, None, None)
882
+
883
+ # --- Find Minimum ---
884
+ # Sort by VRAM (ascending), then FLOPs (ascending)
885
+ candidates.sort(key=lambda x: (x[0].vram_bytes, x[0].peak_fp16_tflops))
886
+
887
+ minimum = None
888
+ for _, est in candidates:
889
+ if est.fits_in_vram:
890
+ minimum = est
891
+ break
892
+
893
+ # --- Find Optimal ---
894
+ # Sort by Latency (ascending)
895
+ candidates.sort(key=lambda x: x[1].theoretical_latency_ms)
896
+
897
+ optimal = None
898
+ # Filter for ones that fit
899
+ valid_candidates = [x for x in candidates if x[1].fits_in_vram]
900
+ if valid_candidates:
901
+ optimal = valid_candidates[0][1] # Fastest
902
+
903
+ # --- Find Recommended ---
904
+ # Heuristic: Fits VRAM AND (Latency <= 1000/target_fps OR Utilization > 0.5)
905
+ # We want something reasonable, not necessarily the fastest (which is often H100)
906
+ # Let's look for the "cheapest" card that meets a performance bar.
907
+
908
+ recommended = None
909
+
910
+ # Re-sort by cost proxy (we don't have prices in HardwareProfile, but TFLOPS is a rough proxy)
911
+ valid_candidates.sort(key=lambda x: x[0].peak_fp16_tflops)
912
+
913
+ target_latency_ms = 1000.0 / target_fps
914
+
915
+ for _, est in valid_candidates:
916
+ if est.theoretical_latency_ms <= target_latency_ms:
917
+ recommended = est
918
+ break
919
+
920
+ # If nothing meets strict FPS target, pick the one with decent utilization
921
+ if recommended is None and valid_candidates:
922
+ # Pick median performer? Or just fallback to Minimum if nothing is fast enough?
923
+ # Let's pick the one that is ~4x faster than minimum if possible, or just minimum
924
+ minimum_latency = minimum.theoretical_latency_ms if minimum else float("inf")
925
+ for _, est in valid_candidates:
926
+ if est.theoretical_latency_ms <= minimum_latency / 4.0:
927
+ recommended = est
928
+ break
929
+
930
+ if recommended is None:
931
+ recommended = minimum # Fallback
932
+
933
+ return SystemRequirements(minimum=minimum, recommended=recommended, optimal=optimal)
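As a usage sketch of the tiering logic above (the model numbers are illustrative, roughly a 7B-parameter model at fp16); the `minimum_gpu` / `*_vram_gb` accessors are the compatibility properties defined on `SystemRequirements` earlier in this file:

```python
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
reqs = profiler.determine_system_requirements(
    model_params=7_000_000_000,         # illustrative parameter count
    model_flops=14_000_000_000_000,     # illustrative FLOPs per forward pass
    peak_activation_bytes=2 * 1024**3,  # illustrative peak activation memory
    precision="fp16",
    target_fps=30.0,
)

print("minimum VRAM (GB):    ", reqs.minimum_vram_gb)
print("recommended VRAM (GB):", reqs.recommended_vram_gb)
if reqs.optimal_gpu is not None:
    print("optimal tier:", reqs.optimal_gpu.to_dict())
```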
934
+
935
+ # =========================================================================
936
+ # Story 9.2: GPU Memory Profiling
937
+ # =========================================================================
938
+
939
+ def get_gpu_metrics(self, device_index: int = 0) -> GPUMetrics | None:
940
+ """
941
+ Get real-time GPU metrics using pynvml.
942
+
943
+ Args:
944
+ device_index: GPU device index (default: 0)
945
+
946
+ Returns:
947
+ GPUMetrics with VRAM usage, utilization, temperature, power
948
+ None if pynvml is not available or fails
949
+ """
950
+ try:
951
+ import pynvml
952
+
953
+ pynvml.nvmlInit()
954
+ handle = pynvml.nvmlDeviceGetHandleByIndex(device_index)
955
+
956
+ # Memory info
957
+ mem_info = pynvml.nvmlDeviceGetMemoryInfo(handle)
958
+
959
+ # Utilization
960
+ util = pynvml.nvmlDeviceGetUtilizationRates(handle)
961
+
962
+ # Temperature
963
+ temp = pynvml.nvmlDeviceGetTemperature(handle, pynvml.NVML_TEMPERATURE_GPU)
964
+
965
+ # Power
966
+ try:
967
+ power_mw = pynvml.nvmlDeviceGetPowerUsage(handle)
968
+ power_w = power_mw / 1000.0
969
+ except pynvml.NVMLError:
970
+ power_w = 0.0
971
+
972
+ pynvml.nvmlShutdown()
973
+
974
+ return GPUMetrics(
975
+ vram_used_bytes=mem_info.used,
976
+ vram_total_bytes=mem_info.total,
977
+ gpu_utilization_percent=float(util.gpu),
978
+ memory_utilization_percent=float(util.memory),
979
+ temperature_c=temp,
980
+ power_draw_w=power_w,
981
+ )
982
+ except ImportError:
983
+ self.logger.debug("pynvml not available for GPU metrics")
984
+ return None
985
+ except Exception as e:
986
+ self.logger.warning(f"Failed to get GPU metrics: {e}")
987
+ return None
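A short sketch of polling live GPU metrics; this degrades gracefully to `None` on machines without an NVIDIA GPU or without `pynvml` installed:

```python
from haoline.operational_profiling import OperationalProfiler

metrics = OperationalProfiler().get_gpu_metrics(device_index=0)
if metrics is None:
    print("pynvml unavailable or no NVIDIA GPU detected")
else:
    # VRAM usage, GPU/memory utilization, temperature, and power draw
    print(metrics.to_dict())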
988
+
989
+ def measure_peak_vram(
990
+ self,
991
+ model_path: str,
992
+ batch_size: int = 1,
993
+ num_runs: int = 5,
994
+ device_index: int = 0,
995
+ ) -> dict[str, Any]:
996
+ """
997
+ Measure actual peak VRAM usage during inference.
998
+
999
+ Args:
1000
+ model_path: Path to ONNX model
1001
+ batch_size: Batch size for inference
1002
+ num_runs: Number of inference runs
1003
+ device_index: GPU device index
1004
+
1005
+ Returns:
1006
+ Dict with baseline, peak, and delta VRAM usage
1007
+ """
1008
+ try:
1009
+ import numpy as np
1010
+ import onnxruntime as ort
1011
+ except ImportError:
1012
+ return {"error": "onnxruntime not available"}
1013
+
1014
+ # Get baseline GPU metrics
1015
+ baseline_metrics = self.get_gpu_metrics(device_index)
1016
+ if baseline_metrics is None:
1017
+ return {"error": "pynvml not available for VRAM measurement"}
1018
+
1019
+ baseline_vram = baseline_metrics.vram_used_bytes
1020
+
1021
+ # Create session with CUDA
1022
+ try:
1023
+ sess = ort.InferenceSession(
1024
+ model_path,
1025
+ providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
1026
+ )
1027
+ except Exception as e:
1028
+ return {"error": f"Failed to create session: {e}"}
1029
+
1030
+ # Get input info and create dummy input
1031
+ input_info = sess.get_inputs()[0]
1032
+ input_shape = list(input_info.shape)
1033
+ for i, dim in enumerate(input_shape):
1034
+ if not isinstance(dim, int) or dim <= 0:
1035
+ if i == 0:
1036
+ input_shape[i] = batch_size
1037
+ elif i == 1:
1038
+ input_shape[i] = 3
1039
+ else:
1040
+ input_shape[i] = 224
1041
+ input_shape[0] = batch_size
1042
+
1043
+ dummy_input = np.random.randn(*input_shape).astype(np.float32)
1044
+
1045
+ # Run inference and measure peak VRAM
1046
+ peak_vram = baseline_vram
1047
+ vram_samples = []
1048
+
1049
+ for _ in range(num_runs):
1050
+ sess.run(None, {input_info.name: dummy_input})
1051
+ metrics = self.get_gpu_metrics(device_index)
1052
+ if metrics:
1053
+ vram_samples.append(metrics.vram_used_bytes)
1054
+ if metrics.vram_used_bytes > peak_vram:
1055
+ peak_vram = metrics.vram_used_bytes
1056
+
1057
+ delta_vram = peak_vram - baseline_vram
1058
+
1059
+ return {
1060
+ "baseline_vram_gb": round(baseline_vram / (1024**3), 3),
1061
+ "peak_vram_gb": round(peak_vram / (1024**3), 3),
1062
+ "delta_vram_gb": round(delta_vram / (1024**3), 3),
1063
+ "model_vram_estimate_gb": round(delta_vram / (1024**3), 3),
1064
+ "batch_size": batch_size,
1065
+ "samples": len(vram_samples),
1066
+ }
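For illustration, measuring peak VRAM for a model on disk; `"model.onnx"` is a placeholder path, and the returned dict contains an `"error"` key when onnxruntime or pynvml is unavailable:

```python
from haoline.operational_profiling import OperationalProfiler

result = OperationalProfiler().measure_peak_vram("model.onnx", batch_size=4, num_runs=5)
print(result)  # baseline/peak/delta VRAM in GB, or {"error": ...}
```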
1067
+
1068
+ # =========================================================================
1069
+ # Story 9.3: Per-Layer Profiling
1070
+ # =========================================================================
1071
+
1072
+ def profile_model(
1073
+ self,
1074
+ model_path: str,
1075
+ batch_size: int = 1,
1076
+ num_runs: int = 10,
1077
+ device_index: int = 0,
1078
+ ) -> ProfilingResult | None:
1079
+ """
1080
+ Profile model execution with ONNX Runtime's built-in profiler.
1081
+
1082
+ Args:
1083
+ model_path: Path to ONNX model
1084
+ batch_size: Batch size for profiling
1085
+ num_runs: Number of profiling runs
1086
+ device_index: GPU device index
1087
+
1088
+ Returns:
1089
+ ProfilingResult with per-layer timing data
1090
+ """
1091
+ try:
1092
+ import json
1093
+ import os
1094
+ import tempfile
1095
+ import time
1096
+
1097
+ import onnxruntime as ort
1098
+ except ImportError:
1099
+ self.logger.warning("onnxruntime not available for profiling")
1100
+ return None
1101
+
1102
+ # Create session with profiling enabled
1103
+ sess_options = ort.SessionOptions()
1104
+ sess_options.enable_profiling = True
1105
+
1106
+ # Use temp directory for profile output
1107
+ with tempfile.TemporaryDirectory() as tmpdir:
1108
+ sess_options.profile_file_prefix = os.path.join(tmpdir, "ort_profile")
1109
+
1110
+ try:
1111
+ sess = ort.InferenceSession(
1112
+ model_path,
1113
+ sess_options=sess_options,
1114
+ providers=["CUDAExecutionProvider", "CPUExecutionProvider"],
1115
+ )
1116
+ except Exception as e:
1117
+ self.logger.error(f"Failed to create profiling session: {e}")
1118
+ return None
1119
+
1120
+ # Get ALL inputs (Story 9.6: Multi-input model support)
1121
+ input_feed = self._create_input_feed(sess, batch_size)
1122
+
1123
+ # Warmup
1124
+ for _ in range(3):
1125
+ sess.run(None, input_feed)
1126
+
1127
+ # Profile runs
1128
+ start = time.perf_counter()
1129
+ for _ in range(num_runs):
1130
+ sess.run(None, input_feed)
1131
+ total_time_ms = ((time.perf_counter() - start) / num_runs) * 1000
1132
+
1133
+ # End profiling and get the file
1134
+ profile_file = sess.end_profiling()
1135
+
1136
+ # Parse profile JSON
1137
+ layer_profiles = []
1138
+ try:
1139
+ with open(profile_file, encoding="utf-8") as f:
1140
+ profile_data = json.load(f)
1141
+
1142
+ for event in profile_data:
1143
+ if event.get("cat") == "Node":
1144
+ name = event.get("name", "")
1145
+ args = event.get("args", {})
1146
+ op_type = args.get("op_name", "Unknown")
1147
+ provider = args.get("provider", "Unknown")
1148
+ dur = event.get("dur", 0) # Duration in microseconds
1149
+
1150
+ # Parse shapes from args
1151
+ input_shapes: list[list[int]] = []
1152
+ output_shapes: list[list[int]] = []
1153
+ for key, value in args.items():
1154
+ if key.startswith("input_") and "shape" not in key:
1155
+ continue
1156
+ if "shape" in key.lower():
1157
+ try:
1158
+ if isinstance(value, str):
1159
+ # Parse shape string like "[1,3,224,224]"
1160
+ shape = [
1161
+ int(x)
1162
+ for x in value.strip("[]").split(",")
1163
+ if x.strip()
1164
+ ]
1165
+ if "output" in key.lower():
1166
+ output_shapes.append(shape)
1167
+ else:
1168
+ input_shapes.append(shape)
1169
+ except (ValueError, AttributeError):
1170
+ pass
1171
+
1172
+ layer_profiles.append(
1173
+ LayerProfile(
1174
+ name=name,
1175
+ op_type=op_type,
1176
+ duration_us=dur,
1177
+ provider=provider,
1178
+ input_shapes=input_shapes,
1179
+ output_shapes=output_shapes,
1180
+ )
1181
+ )
1182
+ except Exception as e:
1183
+ self.logger.warning(f"Failed to parse profile: {e}")
1184
+
1185
+ # Get GPU metrics
1186
+ gpu_metrics = self.get_gpu_metrics(device_index)
1187
+
1188
+ return ProfilingResult(
1189
+ total_time_ms=total_time_ms,
1190
+ layer_profiles=layer_profiles,
1191
+ gpu_metrics=gpu_metrics,
1192
+ session_options={"batch_size": batch_size, "num_runs": num_runs},
1193
+ )
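A minimal sketch of per-layer profiling with the method above (placeholder model path; returns `None` if onnxruntime is missing or the session cannot be created):

```python
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
result = profiler.profile_model("model.onnx", batch_size=1, num_runs=10)

if result is not None:
    print(f"avg run: {result.total_time_ms:.2f} ms over {len(result.layer_profiles)} profiled nodes")
    for lp in result.get_slowest_layers(top_n=5):
        print(f"  {lp.op_type:<12} {lp.name:<40} {lp.duration_ms:.3f} ms ({lp.provider})")
    # Aggregate time per operator type, sorted from slowest to fastest
    print(result.get_time_by_op_type())
```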
1194
+
1195
+ # =========================================================================
1196
+ # Story 9.4: Bottleneck Detection
1197
+ # =========================================================================
1198
+
1199
+ def analyze_bottleneck(
1200
+ self,
1201
+ model_flops: int,
1202
+ profiling_result: ProfilingResult | None,
1203
+ hardware: HardwareProfile,
1204
+ precision: str = "fp16",
1205
+ ) -> BottleneckAnalysis:
1206
+ """
1207
+ Analyze whether model is compute-bound or memory-bound.
1208
+
1209
+ Uses roofline model principles:
1210
+ - Compute-bound: Time dominated by FLOP execution
1211
+ - Memory-bound: Time dominated by memory bandwidth
1212
+
1213
+ Args:
1214
+ model_flops: Total FLOPs per inference
1215
+ profiling_result: Results from profile_model()
1216
+ hardware: Target hardware profile
1217
+ precision: Precision used ("fp32", "fp16", "int8")
1218
+
1219
+ Returns:
1220
+ BottleneckAnalysis with classification and recommendations
1221
+ """
1222
+ # Get peak theoretical compute
1223
+ if precision == "fp32":
1224
+ peak_tflops = hardware.peak_fp32_tflops or hardware.peak_fp16_tflops / 2
1225
+ elif precision == "int8":
1226
+ peak_tflops = hardware.peak_int8_tops or hardware.peak_fp16_tflops * 2
1227
+ else:
1228
+ peak_tflops = hardware.peak_fp16_tflops
1229
+
1230
+ # Actual latency
1231
+ if profiling_result:
1232
+ actual_latency_ms = profiling_result.total_time_ms
1233
+ else:
1234
+ # Estimate from theoretical
1235
+ actual_latency_ms = (model_flops / (peak_tflops * 1e12)) * 1000
1236
+
1237
+ # Calculate achieved TFLOPs
1238
+ achieved_tflops = (model_flops / actual_latency_ms) / 1e9 # TFLOPS
1239
+
1240
+ # Efficiency
1241
+ efficiency = (achieved_tflops / peak_tflops) * 100 if peak_tflops > 0 else 0
1242
+
1243
+ # Estimate memory transfer time
1244
+ # Rough estimate: assume model params + activations need to be read
1245
+ # Memory bandwidth in bytes/s -> convert to bytes/ms
1246
+ mem_bandwidth_bytes_per_ms = hardware.memory_bandwidth_bytes_per_s / 1000 # B/s -> B/ms
1247
+
1248
+ # Estimate memory footprint accessed per inference
1249
+ # This is a rough estimate - actual depends on caching, batch size, etc.
1250
+ bytes_per_param = 2 if precision == "fp16" else 4 if precision == "fp32" else 1
1251
+ # Crude proxy: assume roughly one element of memory traffic per 1,000 FLOPs
1252
+ estimated_memory_bytes = model_flops * bytes_per_param / 1000 # Rough
1253
+
1254
+ memory_time_ms = estimated_memory_bytes / mem_bandwidth_bytes_per_ms
1255
+
1256
+ # Treat compute time as the residual after the estimated memory time
1257
+ compute_time_ms = actual_latency_ms - memory_time_ms
1258
+ if compute_time_ms < 0:
1259
+ compute_time_ms = actual_latency_ms * 0.5 # Fallback
1260
+
1261
+ # Ratios
1262
+ total_time = compute_time_ms + memory_time_ms
1263
+ compute_ratio = compute_time_ms / total_time if total_time > 0 else 0.5
1264
+ memory_ratio = 1.0 - compute_ratio
1265
+
1266
+ # Classification
1267
+ if compute_ratio > 0.7:
1268
+ bottleneck_type = "compute-bound"
1269
+ elif memory_ratio > 0.7:
1270
+ bottleneck_type = "memory-bound"
1271
+ else:
1272
+ bottleneck_type = "balanced"
1273
+
1274
+ # Recommendations based on bottleneck
1275
+ recommendations = []
1276
+
1277
+ if bottleneck_type == "compute-bound":
1278
+ recommendations.extend(
1279
+ [
1280
+ "Use INT8/FP16 quantization to reduce compute requirements",
1281
+ "Consider model pruning to reduce FLOP count",
1282
+ "Use Tensor Cores (if available) for matrix operations",
1283
+ "Increase batch size to improve GPU utilization",
1284
+ ]
1285
+ )
1286
+ if efficiency < 50:
1287
+ recommendations.append(
1288
+ f"GPU utilization is low ({efficiency:.0f}%). "
1289
+ "Check for CPU bottlenecks or data loading issues."
1290
+ )
1291
+ elif bottleneck_type == "memory-bound":
1292
+ recommendations.extend(
1293
+ [
1294
+ "Use lower precision (FP16/INT8) to reduce memory bandwidth",
1295
+ "Enable operator fusion to reduce memory round-trips",
1296
+ "Consider tensor compression or activation checkpointing",
1297
+ "Use hardware with higher memory bandwidth",
1298
+ ]
1299
+ )
1300
+ else: # balanced
1301
+ recommendations.extend(
1302
+ [
1303
+ "Model has balanced compute/memory characteristics",
1304
+ "Both quantization and bandwidth optimization may help",
1305
+ "Profile individual layers to find specific bottlenecks",
1306
+ ]
1307
+ )
1308
+
1309
+ # Add efficiency-specific recommendations
1310
+ if efficiency < 30:
1311
+ recommendations.append(
1312
+ "Very low GPU efficiency. Consider using TensorRT or "
1313
+ "ONNX Runtime optimization passes."
1314
+ )
1315
+
1316
+ return BottleneckAnalysis(
1317
+ bottleneck_type=bottleneck_type,
1318
+ compute_time_ms=compute_time_ms,
1319
+ memory_time_ms=memory_time_ms,
1320
+ compute_ratio=compute_ratio,
1321
+ memory_ratio=memory_ratio,
1322
+ theoretical_peak_tflops=peak_tflops,
1323
+ achieved_tflops=achieved_tflops,
1324
+ efficiency_percent=efficiency,
1325
+ recommendations=recommendations,
1326
+ )
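A usage sketch of the bottleneck classifier; the FLOP count is illustrative and the hardware profile is the first non-CPU entry in `HARDWARE_PROFILES`. Note that with `profiling_result=None` the latency is derived from theoretical peak compute, so the numbers are purely analytical; pass a result from `profile_model()` for measured behaviour:

```python
from haoline.hardware import HARDWARE_PROFILES
from haoline.operational_profiling import OperationalProfiler

profiler = OperationalProfiler()
gpu_profiles = {k: p for k, p in HARDWARE_PROFILES.items() if p.device_type != "cpu"}
name, hw = next(iter(gpu_profiles.items()))

analysis = profiler.analyze_bottleneck(
    model_flops=4_100_000_000,   # illustrative FLOPs per inference
    profiling_result=None,       # no measurements: falls back to theoretical latency
    hardware=hw,
    precision="fp16",
)

print(name, analysis.bottleneck_type, f"{analysis.efficiency_percent:.0f}% efficiency")
for tip in analysis.recommendations:
    print(" -", tip)
```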
1327
+
1328
+ # =========================================================================
1329
+ # Story 9.5: Resolution Benchmarking
1330
+ # =========================================================================
1331
+
1332
+ def benchmark_resolutions(
1333
+ self,
1334
+ model_path: str,
1335
+ resolutions: list[tuple[int, int]] | None = None,
1336
+ batch_size: int = 1,
1337
+ num_warmup: int = 5,
1338
+ num_runs: int = 20,
1339
+ ) -> ResolutionSweep | None:
1340
+ """
1341
+ Benchmark actual inference performance across resolutions.
1342
+
1343
+ Args:
1344
+ model_path: Path to ONNX model
1345
+ resolutions: List of (H, W) resolutions to test
1346
+ batch_size: Batch size for benchmarking
1347
+ num_warmup: Warmup runs before timing
1348
+ num_runs: Timed runs per resolution
1349
+
1350
+ Returns:
1351
+ ResolutionSweep with measured (not estimated) metrics
1352
+ """
1353
+ try:
1354
+ import time
1355
+
1356
+ import numpy as np
1357
+ import onnxruntime as ort
1358
+ except ImportError:
1359
+ self.logger.warning("onnxruntime not available for benchmarking")
1360
+ return None
1361
+
1362
+ if resolutions is None:
1363
+ # Default resolutions for vision models
1364
+ resolutions = [
1365
+ (128, 128),
1366
+ (224, 224),
1367
+ (256, 256),
1368
+ (384, 384),
1369
+ (512, 512),
1370
+ (640, 640),
1371
+ ]
1372
+
1373
+ # Create session
1374
+ providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
1375
+ try:
1376
+ sess = ort.InferenceSession(model_path, providers=providers)
1377
+ except Exception as e:
1378
+ self.logger.error(f"Failed to create session: {e}")
1379
+ return None
1380
+
1381
+ input_info = sess.get_inputs()[0]
1382
+ input_name = input_info.name
1383
+
1384
+ resolution_strs: list[str] = []
1385
+ flops_list: list[int] = []
1386
+ memory_gb_list: list[float] = []
1387
+ latencies: list[float] = []
1388
+ throughputs: list[float] = []
1389
+ vram_usage: list[float] = []
1390
+
1391
+ max_throughput = 0.0
1392
+ optimal_res = ""
1393
+ max_res = ""
1394
+ max_pixels = 0
1395
+
1396
+ for h, w in resolutions:
1397
+ res_str = f"{h}x{w}"
1398
+ resolution_strs.append(res_str)
1399
+
1400
+ # Create input with this resolution
1401
+ # Assume [N, C, H, W] format
1402
+ input_shape = list(input_info.shape)
1403
+ for i, dim in enumerate(input_shape):
1404
+ if not isinstance(dim, int) or dim <= 0:
1405
+ if i == 0:
1406
+ input_shape[i] = batch_size
1407
+ elif i == 1:
1408
+ input_shape[i] = 3
1409
+ elif i == 2:
1410
+ input_shape[i] = h
1411
+ elif i == 3:
1412
+ input_shape[i] = w
1413
+
1414
+ try:
1415
+ dummy_input = np.random.randn(*input_shape).astype(np.float32)
1416
+ except Exception as e:
1417
+ self.logger.warning(f"Failed to create input for {res_str}: {e}")
1418
+ flops_list.append(0)
1419
+ memory_gb_list.append(0.0)
1420
+ latencies.append(float("inf"))
1421
+ throughputs.append(0.0)
1422
+ vram_usage.append(0.0)
1423
+ continue
1424
+
1425
+ # Estimate FLOPs (scales quadratically with resolution)
1426
+ base_flops = 4_000_000_000 # Rough estimate for 224x224
1427
+ scale = (h * w) / (224 * 224)
1428
+ flops = int(base_flops * scale)
1429
+ flops_list.append(flops)
1430
+
1431
+ # Memory estimate
1432
+ memory_gb = dummy_input.nbytes / (1024**3)
1433
+ memory_gb_list.append(round(memory_gb, 4))
1434
+
1435
+ # Warmup
1436
+ try:
1437
+ for _ in range(num_warmup):
1438
+ sess.run(None, {input_name: dummy_input})
1439
+ except Exception as e:
1440
+ self.logger.warning(f"Resolution {res_str} failed (OOM?): {e}")
1441
+ latencies.append(float("inf"))
1442
+ throughputs.append(0.0)
1443
+ vram_usage.append(0.0)
1444
+ continue
1445
+
1446
+ # Benchmark
1447
+ run_latencies = []
1448
+ for _ in range(num_runs):
1449
+ start = time.perf_counter()
1450
+ sess.run(None, {input_name: dummy_input})
1451
+ end = time.perf_counter()
1452
+ run_latencies.append((end - start) * 1000)
1453
+
1454
+ run_latencies.sort()
1455
+ p50_latency = run_latencies[len(run_latencies) // 2]
1456
+ throughput = (batch_size * 1000.0) / p50_latency
1457
+
1458
+ latencies.append(round(p50_latency, 2))
1459
+ throughputs.append(round(throughput, 1))
1460
+
1461
+ # VRAM estimate (or measure with pynvml)
1462
+ gpu_metrics = self.get_gpu_metrics()
1463
+ if gpu_metrics:
1464
+ vram_usage.append(round(gpu_metrics.vram_used_bytes / (1024**3), 3))
1465
+ else:
1466
+ vram_usage.append(round(dummy_input.nbytes * 2 / (1024**3), 3))
1467
+
1468
+ # Track optimal and max
1469
+ pixels = h * w
1470
+ if pixels > max_pixels:
1471
+ max_pixels = pixels
1472
+ max_res = res_str
1473
+
1474
+ if throughput > max_throughput:
1475
+ max_throughput = throughput
1476
+ optimal_res = res_str
1477
+
1478
+ self.logger.info(
1479
+ f" Resolution {res_str}: latency={p50_latency:.2f}ms, "
1480
+ f"throughput={throughput:.1f} inf/s"
1481
+ )
1482
+
1483
+ return ResolutionSweep(
1484
+ resolutions=resolution_strs,
1485
+ flops=flops_list,
1486
+ memory_gb=memory_gb_list,
1487
+ latencies=latencies,
1488
+ throughputs=throughputs,
1489
+ vram_usage_gb=vram_usage,
1490
+ optimal_resolution=optimal_res or resolution_strs[0],
1491
+ max_resolution=max_res or resolution_strs[-1],
1492
+ )
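Finally, a sketch of the measured resolution benchmark above; `"model.onnx"` is a placeholder for an NCHW vision model with dynamic spatial dimensions, and the method returns `None` when onnxruntime is unavailable:

```python
from haoline.operational_profiling import OperationalProfiler

sweep = OperationalProfiler().benchmark_resolutions(
    "model.onnx",                                  # placeholder ONNX vision model
    resolutions=[(224, 224), (384, 384), (640, 640)],
    num_warmup=3,
    num_runs=10,
)

if sweep is not None:
    print(sweep.to_dict())  # measured latencies, throughputs, and VRAM per resolution
```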