haoline 0.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70) hide show
  1. haoline/.streamlit/config.toml +10 -0
  2. haoline/__init__.py +248 -0
  3. haoline/analyzer.py +935 -0
  4. haoline/cli.py +2712 -0
  5. haoline/compare.py +811 -0
  6. haoline/compare_visualizations.py +1564 -0
  7. haoline/edge_analysis.py +525 -0
  8. haoline/eval/__init__.py +131 -0
  9. haoline/eval/adapters.py +844 -0
  10. haoline/eval/cli.py +390 -0
  11. haoline/eval/comparison.py +542 -0
  12. haoline/eval/deployment.py +633 -0
  13. haoline/eval/schemas.py +833 -0
  14. haoline/examples/__init__.py +15 -0
  15. haoline/examples/basic_inspection.py +74 -0
  16. haoline/examples/compare_models.py +117 -0
  17. haoline/examples/hardware_estimation.py +78 -0
  18. haoline/format_adapters.py +1001 -0
  19. haoline/formats/__init__.py +123 -0
  20. haoline/formats/coreml.py +250 -0
  21. haoline/formats/gguf.py +483 -0
  22. haoline/formats/openvino.py +255 -0
  23. haoline/formats/safetensors.py +273 -0
  24. haoline/formats/tflite.py +369 -0
  25. haoline/hardware.py +2307 -0
  26. haoline/hierarchical_graph.py +462 -0
  27. haoline/html_export.py +1573 -0
  28. haoline/layer_summary.py +769 -0
  29. haoline/llm_summarizer.py +465 -0
  30. haoline/op_icons.py +618 -0
  31. haoline/operational_profiling.py +1492 -0
  32. haoline/patterns.py +1116 -0
  33. haoline/pdf_generator.py +265 -0
  34. haoline/privacy.py +250 -0
  35. haoline/pydantic_models.py +241 -0
  36. haoline/report.py +1923 -0
  37. haoline/report_sections.py +539 -0
  38. haoline/risks.py +521 -0
  39. haoline/schema.py +523 -0
  40. haoline/streamlit_app.py +2024 -0
  41. haoline/tests/__init__.py +4 -0
  42. haoline/tests/conftest.py +123 -0
  43. haoline/tests/test_analyzer.py +868 -0
  44. haoline/tests/test_compare_visualizations.py +293 -0
  45. haoline/tests/test_edge_analysis.py +243 -0
  46. haoline/tests/test_eval.py +604 -0
  47. haoline/tests/test_format_adapters.py +460 -0
  48. haoline/tests/test_hardware.py +237 -0
  49. haoline/tests/test_hardware_recommender.py +90 -0
  50. haoline/tests/test_hierarchical_graph.py +326 -0
  51. haoline/tests/test_html_export.py +180 -0
  52. haoline/tests/test_layer_summary.py +428 -0
  53. haoline/tests/test_llm_patterns.py +540 -0
  54. haoline/tests/test_llm_summarizer.py +339 -0
  55. haoline/tests/test_patterns.py +774 -0
  56. haoline/tests/test_pytorch.py +327 -0
  57. haoline/tests/test_report.py +383 -0
  58. haoline/tests/test_risks.py +398 -0
  59. haoline/tests/test_schema.py +417 -0
  60. haoline/tests/test_tensorflow.py +380 -0
  61. haoline/tests/test_visualizations.py +316 -0
  62. haoline/universal_ir.py +856 -0
  63. haoline/visualizations.py +1086 -0
  64. haoline/visualize_yolo.py +44 -0
  65. haoline/web.py +110 -0
  66. haoline-0.3.0.dist-info/METADATA +471 -0
  67. haoline-0.3.0.dist-info/RECORD +70 -0
  68. haoline-0.3.0.dist-info/WHEEL +4 -0
  69. haoline-0.3.0.dist-info/entry_points.txt +5 -0
  70. haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/hardware.py ADDED
@@ -0,0 +1,2307 @@
1
+ # Copyright (c) 2025 HaoLine Contributors
2
+ # SPDX-License-Identifier: MIT
3
+
4
+ """
5
+ Hardware detection and profile management for HaoLine.
6
+
7
+ This module provides:
8
+ - Automatic detection of local GPU/CPU hardware
9
+ - Predefined profiles for common NVIDIA GPUs
10
+ - Hardware-aware performance estimates
11
+ - System requirements generation (Minimum, Recommended, Optimal)
12
+ - Batch size scaling analysis
13
+ """
14
+
15
+ from __future__ import annotations
16
+
17
+ import logging
18
+ import os
19
+ import platform
20
+ import subprocess
21
+ from dataclasses import dataclass
22
+ from typing import Any
23
+
24
+ # Try to import psutil for CPU info, but don't require it
25
+ try:
26
+ import psutil
27
+
28
+ _HAS_PSUTIL = True
29
+ except ImportError:
30
+ _HAS_PSUTIL = False
31
+
32
+
33
+ @dataclass
34
+ class HardwareProfile:
35
+ """
36
+ Hardware specification for performance estimates.
37
+
38
+ All values are theoretical peaks - actual performance will vary
39
+ based on memory access patterns, kernel efficiency, etc.
40
+ """
41
+
42
+ name: str
43
+ vendor: str # "nvidia", "amd", "apple", "intel", "generic"
44
+ device_type: str # "gpu", "cpu", "npu"
45
+
46
+ # Memory
47
+ vram_bytes: int # GPU VRAM or system RAM for CPU
48
+ memory_bandwidth_bytes_per_s: int
49
+
50
+ # Compute (theoretical peaks)
51
+ peak_fp32_tflops: float
52
+ peak_fp16_tflops: float
53
+ peak_int8_tops: float # Tera-ops for INT8
54
+
55
+ # Optional metadata
56
+ compute_capability: str = "" # e.g., "8.9" for Ada Lovelace
57
+ tdp_watts: int = 0
58
+ is_detected: bool = False # True if auto-detected from local hardware
59
+
60
+ def to_dict(self) -> dict[str, Any]:
61
+ return {
62
+ "name": self.name,
63
+ "vendor": self.vendor,
64
+ "device_type": self.device_type,
65
+ "vram_gb": round(self.vram_bytes / (1024**3), 1),
66
+ "memory_bandwidth_gb_s": round(self.memory_bandwidth_bytes_per_s / (1024**3), 1),
67
+ "peak_fp32_tflops": self.peak_fp32_tflops,
68
+ "peak_fp16_tflops": self.peak_fp16_tflops,
69
+ "peak_int8_tops": self.peak_int8_tops,
70
+ "compute_capability": self.compute_capability,
71
+ "tdp_watts": self.tdp_watts,
72
+ "is_detected": self.is_detected,
73
+ }
74
+
75
+
76
+ # ============================================================================
77
+ # Predefined Hardware Profiles
78
+ # ============================================================================
79
+
80
+ # NVIDIA Data Center GPUs - H100 Series
81
+ NVIDIA_H100_SXM = HardwareProfile(
82
+ name="NVIDIA H100 SXM",
83
+ vendor="nvidia",
84
+ device_type="gpu",
85
+ vram_bytes=80 * (1024**3), # 80 GB HBM3
86
+ memory_bandwidth_bytes_per_s=3350 * (1024**3), # 3.35 TB/s
87
+ peak_fp32_tflops=67.0,
88
+ peak_fp16_tflops=1979.0, # With sparsity: 3958
89
+ peak_int8_tops=3958.0,
90
+ compute_capability="9.0",
91
+ tdp_watts=700,
92
+ )
93
+
94
+ NVIDIA_H100_PCIE = HardwareProfile(
95
+ name="NVIDIA H100 PCIe",
96
+ vendor="nvidia",
97
+ device_type="gpu",
98
+ vram_bytes=80 * (1024**3), # 80 GB HBM3
99
+ memory_bandwidth_bytes_per_s=2000 * (1024**3), # 2.0 TB/s (lower than SXM)
100
+ peak_fp32_tflops=51.0, # Lower than SXM
101
+ peak_fp16_tflops=1513.0,
102
+ peak_int8_tops=3026.0,
103
+ compute_capability="9.0",
104
+ tdp_watts=350,
105
+ )
106
+
107
+ NVIDIA_H100_NVL = HardwareProfile(
108
+ name="NVIDIA H100 NVL",
109
+ vendor="nvidia",
110
+ device_type="gpu",
111
+ vram_bytes=94 * (1024**3), # 94 GB HBM3 (dual-GPU module)
112
+ memory_bandwidth_bytes_per_s=3958 * (1024**3), # 3.9 TB/s
113
+ peak_fp32_tflops=134.0, # 2x H100 NVLink
114
+ peak_fp16_tflops=3958.0,
115
+ peak_int8_tops=7916.0,
116
+ compute_capability="9.0",
117
+ tdp_watts=800,
118
+ )
119
+
120
+ # NVIDIA Data Center GPUs - A100 Series
121
+ NVIDIA_A100_80GB_SXM = HardwareProfile(
122
+ name="NVIDIA A100 80GB SXM",
123
+ vendor="nvidia",
124
+ device_type="gpu",
125
+ vram_bytes=80 * (1024**3), # 80 GB HBM2e
126
+ memory_bandwidth_bytes_per_s=2039 * (1024**3), # 2.0 TB/s
127
+ peak_fp32_tflops=19.5,
128
+ peak_fp16_tflops=312.0, # Tensor Core
129
+ peak_int8_tops=624.0,
130
+ compute_capability="8.0",
131
+ tdp_watts=400,
132
+ )
133
+
134
+ NVIDIA_A100_80GB_PCIE = HardwareProfile(
135
+ name="NVIDIA A100 80GB PCIe",
136
+ vendor="nvidia",
137
+ device_type="gpu",
138
+ vram_bytes=80 * (1024**3), # 80 GB HBM2e
139
+ memory_bandwidth_bytes_per_s=1935 * (1024**3), # 1.9 TB/s
140
+ peak_fp32_tflops=19.5,
141
+ peak_fp16_tflops=312.0,
142
+ peak_int8_tops=624.0,
143
+ compute_capability="8.0",
144
+ tdp_watts=300,
145
+ )
146
+
147
+ # Alias for backward compatibility
148
+ NVIDIA_A100_80GB = NVIDIA_A100_80GB_SXM
149
+
150
+ NVIDIA_A100_40GB_SXM = HardwareProfile(
151
+ name="NVIDIA A100 40GB SXM",
152
+ vendor="nvidia",
153
+ device_type="gpu",
154
+ vram_bytes=40 * (1024**3),
155
+ memory_bandwidth_bytes_per_s=1555 * (1024**3), # 1.6 TB/s
156
+ peak_fp32_tflops=19.5,
157
+ peak_fp16_tflops=312.0,
158
+ peak_int8_tops=624.0,
159
+ compute_capability="8.0",
160
+ tdp_watts=400,
161
+ )
162
+
163
+ NVIDIA_A100_40GB_PCIE = HardwareProfile(
164
+ name="NVIDIA A100 40GB PCIe",
165
+ vendor="nvidia",
166
+ device_type="gpu",
167
+ vram_bytes=40 * (1024**3),
168
+ memory_bandwidth_bytes_per_s=1555 * (1024**3), # 1.6 TB/s
169
+ peak_fp32_tflops=19.5,
170
+ peak_fp16_tflops=312.0,
171
+ peak_int8_tops=624.0,
172
+ compute_capability="8.0",
173
+ tdp_watts=250,
174
+ )
175
+
176
+ # Alias for backward compatibility
177
+ NVIDIA_A100_40GB = NVIDIA_A100_40GB_SXM
178
+
179
+ NVIDIA_A10 = HardwareProfile(
180
+ name="NVIDIA A10",
181
+ vendor="nvidia",
182
+ device_type="gpu",
183
+ vram_bytes=24 * (1024**3), # 24 GB GDDR6
184
+ memory_bandwidth_bytes_per_s=600 * (1024**3), # 600 GB/s
185
+ peak_fp32_tflops=31.2,
186
+ peak_fp16_tflops=125.0, # Tensor Core
187
+ peak_int8_tops=250.0,
188
+ compute_capability="8.6",
189
+ tdp_watts=150,
190
+ )
191
+
192
+ NVIDIA_T4 = HardwareProfile(
193
+ name="NVIDIA T4",
194
+ vendor="nvidia",
195
+ device_type="gpu",
196
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6
197
+ memory_bandwidth_bytes_per_s=320 * (1024**3), # 320 GB/s
198
+ peak_fp32_tflops=8.1,
199
+ peak_fp16_tflops=65.0, # Tensor Core
200
+ peak_int8_tops=130.0,
201
+ compute_capability="7.5",
202
+ tdp_watts=70,
203
+ )
204
+
205
+ NVIDIA_L4 = HardwareProfile(
206
+ name="NVIDIA L4",
207
+ vendor="nvidia",
208
+ device_type="gpu",
209
+ vram_bytes=24 * (1024**3), # 24 GB GDDR6
210
+ memory_bandwidth_bytes_per_s=300 * (1024**3), # 300 GB/s
211
+ peak_fp32_tflops=30.3,
212
+ peak_fp16_tflops=121.0, # Tensor Core
213
+ peak_int8_tops=242.0,
214
+ compute_capability="8.9",
215
+ tdp_watts=72,
216
+ )
217
+
218
+ NVIDIA_L40 = HardwareProfile(
219
+ name="NVIDIA L40",
220
+ vendor="nvidia",
221
+ device_type="gpu",
222
+ vram_bytes=48 * (1024**3), # 48 GB GDDR6
223
+ memory_bandwidth_bytes_per_s=864 * (1024**3), # 864 GB/s
224
+ peak_fp32_tflops=90.5,
225
+ peak_fp16_tflops=181.0,
226
+ peak_int8_tops=362.0,
227
+ compute_capability="8.9",
228
+ tdp_watts=300,
229
+ )
230
+
231
+ NVIDIA_L40S = HardwareProfile(
232
+ name="NVIDIA L40S",
233
+ vendor="nvidia",
234
+ device_type="gpu",
235
+ vram_bytes=48 * (1024**3), # 48 GB GDDR6
236
+ memory_bandwidth_bytes_per_s=864 * (1024**3), # 864 GB/s
237
+ peak_fp32_tflops=91.6,
238
+ peak_fp16_tflops=183.0,
239
+ peak_int8_tops=733.0, # Enhanced INT8
240
+ compute_capability="8.9",
241
+ tdp_watts=350,
242
+ )
243
+
244
+ # Older but still common datacenter GPUs - V100 Series
245
+ NVIDIA_V100_32GB_SXM = HardwareProfile(
246
+ name="NVIDIA V100 32GB SXM2",
247
+ vendor="nvidia",
248
+ device_type="gpu",
249
+ vram_bytes=32 * (1024**3), # 32 GB HBM2
250
+ memory_bandwidth_bytes_per_s=900 * (1024**3), # 900 GB/s
251
+ peak_fp32_tflops=15.7,
252
+ peak_fp16_tflops=125.0, # Tensor Core
253
+ peak_int8_tops=0.0, # No INT8 tensor cores
254
+ compute_capability="7.0",
255
+ tdp_watts=300,
256
+ )
257
+
258
+ NVIDIA_V100_32GB_PCIE = HardwareProfile(
259
+ name="NVIDIA V100 32GB PCIe",
260
+ vendor="nvidia",
261
+ device_type="gpu",
262
+ vram_bytes=32 * (1024**3), # 32 GB HBM2
263
+ memory_bandwidth_bytes_per_s=900 * (1024**3), # 900 GB/s
264
+ peak_fp32_tflops=14.0,
265
+ peak_fp16_tflops=112.0,
266
+ peak_int8_tops=0.0,
267
+ compute_capability="7.0",
268
+ tdp_watts=250,
269
+ )
270
+
271
+ # Alias for backward compatibility
272
+ NVIDIA_V100_32GB = NVIDIA_V100_32GB_SXM
273
+
274
+ NVIDIA_V100_16GB_SXM = HardwareProfile(
275
+ name="NVIDIA V100 16GB SXM2",
276
+ vendor="nvidia",
277
+ device_type="gpu",
278
+ vram_bytes=16 * (1024**3), # 16 GB HBM2
279
+ memory_bandwidth_bytes_per_s=900 * (1024**3), # 900 GB/s
280
+ peak_fp32_tflops=15.7,
281
+ peak_fp16_tflops=125.0,
282
+ peak_int8_tops=0.0,
283
+ compute_capability="7.0",
284
+ tdp_watts=300,
285
+ )
286
+
287
+ NVIDIA_V100_16GB_PCIE = HardwareProfile(
288
+ name="NVIDIA V100 16GB PCIe",
289
+ vendor="nvidia",
290
+ device_type="gpu",
291
+ vram_bytes=16 * (1024**3), # 16 GB HBM2
292
+ memory_bandwidth_bytes_per_s=900 * (1024**3), # 900 GB/s
293
+ peak_fp32_tflops=14.0,
294
+ peak_fp16_tflops=112.0,
295
+ peak_int8_tops=0.0,
296
+ compute_capability="7.0",
297
+ tdp_watts=250,
298
+ )
299
+
300
+ # Alias for backward compatibility
301
+ NVIDIA_V100_16GB = NVIDIA_V100_16GB_PCIE
302
+
303
+ NVIDIA_P100 = HardwareProfile(
304
+ name="NVIDIA P100",
305
+ vendor="nvidia",
306
+ device_type="gpu",
307
+ vram_bytes=16 * (1024**3), # 16 GB HBM2
308
+ memory_bandwidth_bytes_per_s=732 * (1024**3), # 732 GB/s
309
+ peak_fp32_tflops=9.3,
310
+ peak_fp16_tflops=18.7,
311
+ peak_int8_tops=0.0,
312
+ compute_capability="6.0",
313
+ tdp_watts=250,
314
+ )
315
+
316
+ NVIDIA_P40 = HardwareProfile(
317
+ name="NVIDIA P40",
318
+ vendor="nvidia",
319
+ device_type="gpu",
320
+ vram_bytes=24 * (1024**3), # 24 GB GDDR5X
321
+ memory_bandwidth_bytes_per_s=346 * (1024**3), # 346 GB/s
322
+ peak_fp32_tflops=12.0,
323
+ peak_fp16_tflops=0.0, # No FP16 tensor cores
324
+ peak_int8_tops=47.0,
325
+ compute_capability="6.1",
326
+ tdp_watts=250,
327
+ )
328
+
329
+ # ============================================================================
330
+ # NVIDIA Jetson Series (Edge/Embedded)
331
+ # ============================================================================
332
+
333
+ # Jetson Orin Series (2022+)
334
+ NVIDIA_JETSON_AGX_ORIN_64GB = HardwareProfile(
335
+ name="NVIDIA Jetson AGX Orin 64GB",
336
+ vendor="nvidia",
337
+ device_type="gpu",
338
+ vram_bytes=64 * (1024**3), # 64 GB unified memory
339
+ memory_bandwidth_bytes_per_s=204 * (1024**3), # 204 GB/s
340
+ peak_fp32_tflops=5.3,
341
+ peak_fp16_tflops=10.6, # Sparse: 21.2
342
+ peak_int8_tops=275.0, # Sparse
343
+ compute_capability="8.7",
344
+ tdp_watts=60, # 15W-60W configurable
345
+ )
346
+
347
+ NVIDIA_JETSON_AGX_ORIN_32GB = HardwareProfile(
348
+ name="NVIDIA Jetson AGX Orin 32GB",
349
+ vendor="nvidia",
350
+ device_type="gpu",
351
+ vram_bytes=32 * (1024**3), # 32 GB unified memory
352
+ memory_bandwidth_bytes_per_s=204 * (1024**3), # 204 GB/s
353
+ peak_fp32_tflops=5.3,
354
+ peak_fp16_tflops=10.6,
355
+ peak_int8_tops=275.0,
356
+ compute_capability="8.7",
357
+ tdp_watts=60,
358
+ )
359
+
360
+ NVIDIA_JETSON_ORIN_NX_16GB = HardwareProfile(
361
+ name="NVIDIA Jetson Orin NX 16GB",
362
+ vendor="nvidia",
363
+ device_type="gpu",
364
+ vram_bytes=16 * (1024**3), # 16 GB unified memory
365
+ memory_bandwidth_bytes_per_s=102 * (1024**3), # 102 GB/s
366
+ peak_fp32_tflops=2.5,
367
+ peak_fp16_tflops=5.0,
368
+ peak_int8_tops=100.0,
369
+ compute_capability="8.7",
370
+ tdp_watts=25, # 10W-25W configurable
371
+ )
372
+
373
+ NVIDIA_JETSON_ORIN_NX_8GB = HardwareProfile(
374
+ name="NVIDIA Jetson Orin NX 8GB",
375
+ vendor="nvidia",
376
+ device_type="gpu",
377
+ vram_bytes=8 * (1024**3), # 8 GB unified memory
378
+ memory_bandwidth_bytes_per_s=102 * (1024**3), # 102 GB/s
379
+ peak_fp32_tflops=2.0,
380
+ peak_fp16_tflops=4.0,
381
+ peak_int8_tops=70.0,
382
+ compute_capability="8.7",
383
+ tdp_watts=25,
384
+ )
385
+
386
+ NVIDIA_JETSON_ORIN_NANO_8GB = HardwareProfile(
387
+ name="NVIDIA Jetson Orin Nano 8GB",
388
+ vendor="nvidia",
389
+ device_type="gpu",
390
+ vram_bytes=8 * (1024**3), # 8 GB unified memory
391
+ memory_bandwidth_bytes_per_s=68 * (1024**3), # 68 GB/s
392
+ peak_fp32_tflops=1.0,
393
+ peak_fp16_tflops=2.0,
394
+ peak_int8_tops=40.0,
395
+ compute_capability="8.7",
396
+ tdp_watts=15, # 7W-15W configurable
397
+ )
398
+
399
+ NVIDIA_JETSON_ORIN_NANO_4GB = HardwareProfile(
400
+ name="NVIDIA Jetson Orin Nano 4GB",
401
+ vendor="nvidia",
402
+ device_type="gpu",
403
+ vram_bytes=4 * (1024**3), # 4 GB unified memory
404
+ memory_bandwidth_bytes_per_s=68 * (1024**3), # 68 GB/s
405
+ peak_fp32_tflops=0.625,
406
+ peak_fp16_tflops=1.25,
407
+ peak_int8_tops=20.0,
408
+ compute_capability="8.7",
409
+ tdp_watts=10,
410
+ )
411
+
412
+ # Jetson Xavier Series (2018-2020)
413
+ NVIDIA_JETSON_AGX_XAVIER_32GB = HardwareProfile(
414
+ name="NVIDIA Jetson AGX Xavier 32GB",
415
+ vendor="nvidia",
416
+ device_type="gpu",
417
+ vram_bytes=32 * (1024**3), # 32 GB unified memory
418
+ memory_bandwidth_bytes_per_s=136 * (1024**3), # 136 GB/s
419
+ peak_fp32_tflops=1.4,
420
+ peak_fp16_tflops=2.8,
421
+ peak_int8_tops=22.0,
422
+ compute_capability="7.2",
423
+ tdp_watts=30, # 10W-30W configurable
424
+ )
425
+
426
+ NVIDIA_JETSON_AGX_XAVIER_16GB = HardwareProfile(
427
+ name="NVIDIA Jetson AGX Xavier 16GB",
428
+ vendor="nvidia",
429
+ device_type="gpu",
430
+ vram_bytes=16 * (1024**3), # 16 GB unified memory
431
+ memory_bandwidth_bytes_per_s=136 * (1024**3), # 136 GB/s
432
+ peak_fp32_tflops=1.4,
433
+ peak_fp16_tflops=2.8,
434
+ peak_int8_tops=22.0,
435
+ compute_capability="7.2",
436
+ tdp_watts=30,
437
+ )
438
+
439
+ NVIDIA_JETSON_XAVIER_NX_16GB = HardwareProfile(
440
+ name="NVIDIA Jetson Xavier NX 16GB",
441
+ vendor="nvidia",
442
+ device_type="gpu",
443
+ vram_bytes=16 * (1024**3), # 16 GB unified memory
444
+ memory_bandwidth_bytes_per_s=59 * (1024**3), # 59.7 GB/s
445
+ peak_fp32_tflops=0.5,
446
+ peak_fp16_tflops=1.0,
447
+ peak_int8_tops=21.0,
448
+ compute_capability="7.2",
449
+ tdp_watts=20, # 10W-20W configurable
450
+ )
451
+
452
+ NVIDIA_JETSON_XAVIER_NX_8GB = HardwareProfile(
453
+ name="NVIDIA Jetson Xavier NX 8GB",
454
+ vendor="nvidia",
455
+ device_type="gpu",
456
+ vram_bytes=8 * (1024**3), # 8 GB unified memory
457
+ memory_bandwidth_bytes_per_s=59 * (1024**3), # 59.7 GB/s
458
+ peak_fp32_tflops=0.5,
459
+ peak_fp16_tflops=1.0,
460
+ peak_int8_tops=21.0,
461
+ compute_capability="7.2",
462
+ tdp_watts=20,
463
+ )
464
+
465
+ # Jetson TX2 Series (2017)
466
+ NVIDIA_JETSON_TX2 = HardwareProfile(
467
+ name="NVIDIA Jetson TX2",
468
+ vendor="nvidia",
469
+ device_type="gpu",
470
+ vram_bytes=8 * (1024**3), # 8 GB unified memory
471
+ memory_bandwidth_bytes_per_s=59 * (1024**3), # 59.7 GB/s
472
+ peak_fp32_tflops=0.67,
473
+ peak_fp16_tflops=1.33,
474
+ peak_int8_tops=0.0, # No INT8 tensor cores
475
+ compute_capability="6.2",
476
+ tdp_watts=15, # 7.5W-15W configurable
477
+ )
478
+
479
+ NVIDIA_JETSON_TX2_NX = HardwareProfile(
480
+ name="NVIDIA Jetson TX2 NX",
481
+ vendor="nvidia",
482
+ device_type="gpu",
483
+ vram_bytes=4 * (1024**3), # 4 GB unified memory
484
+ memory_bandwidth_bytes_per_s=51 * (1024**3), # 51.2 GB/s
485
+ peak_fp32_tflops=0.5,
486
+ peak_fp16_tflops=1.0,
487
+ peak_int8_tops=0.0,
488
+ compute_capability="6.2",
489
+ tdp_watts=15,
490
+ )
491
+
492
+ # Jetson Nano (2019) - The most constrained!
493
+ NVIDIA_JETSON_NANO = HardwareProfile(
494
+ name="NVIDIA Jetson Nano",
495
+ vendor="nvidia",
496
+ device_type="gpu",
497
+ vram_bytes=4 * (1024**3), # 4 GB unified memory
498
+ memory_bandwidth_bytes_per_s=25 * (1024**3), # 25.6 GB/s
499
+ peak_fp32_tflops=0.236,
500
+ peak_fp16_tflops=0.472,
501
+ peak_int8_tops=0.0, # No INT8 tensor cores
502
+ compute_capability="5.3",
503
+ tdp_watts=10, # 5W-10W configurable
504
+ )
505
+
506
+ NVIDIA_JETSON_NANO_2GB = HardwareProfile(
507
+ name="NVIDIA Jetson Nano 2GB",
508
+ vendor="nvidia",
509
+ device_type="gpu",
510
+ vram_bytes=2 * (1024**3), # 2 GB unified memory - extremely constrained!
511
+ memory_bandwidth_bytes_per_s=25 * (1024**3), # 25.6 GB/s
512
+ peak_fp32_tflops=0.236,
513
+ peak_fp16_tflops=0.472,
514
+ peak_int8_tops=0.0,
515
+ compute_capability="5.3",
516
+ tdp_watts=5,
517
+ )
518
+
519
+ # ============================================================================
520
+ # NVIDIA Consumer GPUs - RTX 40 Series (Ada Lovelace)
521
+ # ============================================================================
522
+
523
+ NVIDIA_RTX_4090 = HardwareProfile(
524
+ name="NVIDIA RTX 4090",
525
+ vendor="nvidia",
526
+ device_type="gpu",
527
+ vram_bytes=24 * (1024**3), # 24 GB GDDR6X
528
+ memory_bandwidth_bytes_per_s=1008 * (1024**3), # 1 TB/s
529
+ peak_fp32_tflops=82.6,
530
+ peak_fp16_tflops=165.0, # Tensor Core ~330 with sparsity
531
+ peak_int8_tops=660.0,
532
+ compute_capability="8.9",
533
+ tdp_watts=450,
534
+ )
535
+
536
+ NVIDIA_RTX_4080_SUPER = HardwareProfile(
537
+ name="NVIDIA RTX 4080 SUPER",
538
+ vendor="nvidia",
539
+ device_type="gpu",
540
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6X
541
+ memory_bandwidth_bytes_per_s=736 * (1024**3), # 736 GB/s
542
+ peak_fp32_tflops=52.0,
543
+ peak_fp16_tflops=104.0,
544
+ peak_int8_tops=416.0,
545
+ compute_capability="8.9",
546
+ tdp_watts=320,
547
+ )
548
+
549
+ NVIDIA_RTX_4080 = HardwareProfile(
550
+ name="NVIDIA RTX 4080",
551
+ vendor="nvidia",
552
+ device_type="gpu",
553
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6X
554
+ memory_bandwidth_bytes_per_s=717 * (1024**3), # 717 GB/s
555
+ peak_fp32_tflops=48.7,
556
+ peak_fp16_tflops=97.0,
557
+ peak_int8_tops=390.0,
558
+ compute_capability="8.9",
559
+ tdp_watts=320,
560
+ )
561
+
562
+ NVIDIA_RTX_4070_TI_SUPER = HardwareProfile(
563
+ name="NVIDIA RTX 4070 Ti SUPER",
564
+ vendor="nvidia",
565
+ device_type="gpu",
566
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6X
567
+ memory_bandwidth_bytes_per_s=672 * (1024**3), # 672 GB/s
568
+ peak_fp32_tflops=44.0,
569
+ peak_fp16_tflops=88.0,
570
+ peak_int8_tops=352.0,
571
+ compute_capability="8.9",
572
+ tdp_watts=285,
573
+ )
574
+
575
+ NVIDIA_RTX_4070_TI = HardwareProfile(
576
+ name="NVIDIA RTX 4070 Ti",
577
+ vendor="nvidia",
578
+ device_type="gpu",
579
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6X
580
+ memory_bandwidth_bytes_per_s=504 * (1024**3), # 504 GB/s
581
+ peak_fp32_tflops=40.1,
582
+ peak_fp16_tflops=80.0,
583
+ peak_int8_tops=320.0,
584
+ compute_capability="8.9",
585
+ tdp_watts=285,
586
+ )
587
+
588
+ NVIDIA_RTX_4070_SUPER = HardwareProfile(
589
+ name="NVIDIA RTX 4070 SUPER",
590
+ vendor="nvidia",
591
+ device_type="gpu",
592
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6X
593
+ memory_bandwidth_bytes_per_s=504 * (1024**3), # 504 GB/s
594
+ peak_fp32_tflops=35.5,
595
+ peak_fp16_tflops=71.0,
596
+ peak_int8_tops=284.0,
597
+ compute_capability="8.9",
598
+ tdp_watts=220,
599
+ )
600
+
601
+ NVIDIA_RTX_4070 = HardwareProfile(
602
+ name="NVIDIA RTX 4070",
603
+ vendor="nvidia",
604
+ device_type="gpu",
605
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6X
606
+ memory_bandwidth_bytes_per_s=504 * (1024**3), # 504 GB/s
607
+ peak_fp32_tflops=29.1,
608
+ peak_fp16_tflops=58.0,
609
+ peak_int8_tops=233.0,
610
+ compute_capability="8.9",
611
+ tdp_watts=200,
612
+ )
613
+
614
+ NVIDIA_RTX_4060_TI_16GB = HardwareProfile(
615
+ name="NVIDIA RTX 4060 Ti 16GB",
616
+ vendor="nvidia",
617
+ device_type="gpu",
618
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6
619
+ memory_bandwidth_bytes_per_s=288 * (1024**3), # 288 GB/s
620
+ peak_fp32_tflops=22.1,
621
+ peak_fp16_tflops=44.0,
622
+ peak_int8_tops=176.0,
623
+ compute_capability="8.9",
624
+ tdp_watts=165,
625
+ )
626
+
627
+ NVIDIA_RTX_4060_TI_8GB = HardwareProfile(
628
+ name="NVIDIA RTX 4060 Ti 8GB",
629
+ vendor="nvidia",
630
+ device_type="gpu",
631
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
632
+ memory_bandwidth_bytes_per_s=288 * (1024**3), # 288 GB/s
633
+ peak_fp32_tflops=22.1,
634
+ peak_fp16_tflops=44.0,
635
+ peak_int8_tops=176.0,
636
+ compute_capability="8.9",
637
+ tdp_watts=160,
638
+ )
639
+
640
+ NVIDIA_RTX_4060 = HardwareProfile(
641
+ name="NVIDIA RTX 4060",
642
+ vendor="nvidia",
643
+ device_type="gpu",
644
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
645
+ memory_bandwidth_bytes_per_s=272 * (1024**3), # 272 GB/s
646
+ peak_fp32_tflops=15.1,
647
+ peak_fp16_tflops=30.0,
648
+ peak_int8_tops=121.0,
649
+ compute_capability="8.9",
650
+ tdp_watts=115,
651
+ )
652
+
653
+ # ============================================================================
654
+ # NVIDIA Consumer GPUs - RTX 30 Series (Ampere)
655
+ # ============================================================================
656
+
657
+ NVIDIA_RTX_3090_TI = HardwareProfile(
658
+ name="NVIDIA RTX 3090 Ti",
659
+ vendor="nvidia",
660
+ device_type="gpu",
661
+ vram_bytes=24 * (1024**3), # 24 GB GDDR6X
662
+ memory_bandwidth_bytes_per_s=1008 * (1024**3), # 1008 GB/s
663
+ peak_fp32_tflops=40.0,
664
+ peak_fp16_tflops=80.0,
665
+ peak_int8_tops=320.0,
666
+ compute_capability="8.6",
667
+ tdp_watts=450,
668
+ )
669
+
670
+ NVIDIA_RTX_3090 = HardwareProfile(
671
+ name="NVIDIA RTX 3090",
672
+ vendor="nvidia",
673
+ device_type="gpu",
674
+ vram_bytes=24 * (1024**3), # 24 GB GDDR6X
675
+ memory_bandwidth_bytes_per_s=936 * (1024**3), # 936 GB/s
676
+ peak_fp32_tflops=35.6,
677
+ peak_fp16_tflops=71.0,
678
+ peak_int8_tops=284.0,
679
+ compute_capability="8.6",
680
+ tdp_watts=350,
681
+ )
682
+
683
+ NVIDIA_RTX_3080_TI = HardwareProfile(
684
+ name="NVIDIA RTX 3080 Ti",
685
+ vendor="nvidia",
686
+ device_type="gpu",
687
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6X
688
+ memory_bandwidth_bytes_per_s=912 * (1024**3), # 912 GB/s
689
+ peak_fp32_tflops=34.1,
690
+ peak_fp16_tflops=68.0,
691
+ peak_int8_tops=273.0,
692
+ compute_capability="8.6",
693
+ tdp_watts=350,
694
+ )
695
+
696
+ NVIDIA_RTX_3080_12GB = HardwareProfile(
697
+ name="NVIDIA RTX 3080 12GB",
698
+ vendor="nvidia",
699
+ device_type="gpu",
700
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6X
701
+ memory_bandwidth_bytes_per_s=912 * (1024**3), # 912 GB/s
702
+ peak_fp32_tflops=30.6,
703
+ peak_fp16_tflops=61.0,
704
+ peak_int8_tops=244.0,
705
+ compute_capability="8.6",
706
+ tdp_watts=350,
707
+ )
708
+
709
+ NVIDIA_RTX_3080_10GB = HardwareProfile(
710
+ name="NVIDIA RTX 3080 10GB",
711
+ vendor="nvidia",
712
+ device_type="gpu",
713
+ vram_bytes=10 * (1024**3), # 10 GB GDDR6X
714
+ memory_bandwidth_bytes_per_s=760 * (1024**3), # 760 GB/s
715
+ peak_fp32_tflops=29.8,
716
+ peak_fp16_tflops=59.0,
717
+ peak_int8_tops=238.0,
718
+ compute_capability="8.6",
719
+ tdp_watts=320,
720
+ )
721
+
722
+ # Alias for backward compatibility
723
+ NVIDIA_RTX_3080 = NVIDIA_RTX_3080_10GB
724
+
725
+ NVIDIA_RTX_3070_TI = HardwareProfile(
726
+ name="NVIDIA RTX 3070 Ti",
727
+ vendor="nvidia",
728
+ device_type="gpu",
729
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6X
730
+ memory_bandwidth_bytes_per_s=608 * (1024**3), # 608 GB/s
731
+ peak_fp32_tflops=21.8,
732
+ peak_fp16_tflops=43.0,
733
+ peak_int8_tops=174.0,
734
+ compute_capability="8.6",
735
+ tdp_watts=290,
736
+ )
737
+
738
+ NVIDIA_RTX_3070 = HardwareProfile(
739
+ name="NVIDIA RTX 3070",
740
+ vendor="nvidia",
741
+ device_type="gpu",
742
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
743
+ memory_bandwidth_bytes_per_s=448 * (1024**3), # 448 GB/s
744
+ peak_fp32_tflops=20.3,
745
+ peak_fp16_tflops=40.0,
746
+ peak_int8_tops=163.0,
747
+ compute_capability="8.6",
748
+ tdp_watts=220,
749
+ )
750
+
751
+ NVIDIA_RTX_3060_TI = HardwareProfile(
752
+ name="NVIDIA RTX 3060 Ti",
753
+ vendor="nvidia",
754
+ device_type="gpu",
755
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
756
+ memory_bandwidth_bytes_per_s=448 * (1024**3), # 448 GB/s
757
+ peak_fp32_tflops=16.2,
758
+ peak_fp16_tflops=32.0,
759
+ peak_int8_tops=130.0,
760
+ compute_capability="8.6",
761
+ tdp_watts=200,
762
+ )
763
+
764
+ NVIDIA_RTX_3060_12GB = HardwareProfile(
765
+ name="NVIDIA RTX 3060 12GB",
766
+ vendor="nvidia",
767
+ device_type="gpu",
768
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6
769
+ memory_bandwidth_bytes_per_s=360 * (1024**3), # 360 GB/s
770
+ peak_fp32_tflops=12.7,
771
+ peak_fp16_tflops=25.0,
772
+ peak_int8_tops=101.0,
773
+ compute_capability="8.6",
774
+ tdp_watts=170,
775
+ )
776
+
777
+ NVIDIA_RTX_3060_8GB = HardwareProfile(
778
+ name="NVIDIA RTX 3060 8GB",
779
+ vendor="nvidia",
780
+ device_type="gpu",
781
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
782
+ memory_bandwidth_bytes_per_s=360 * (1024**3), # 360 GB/s
783
+ peak_fp32_tflops=12.7,
784
+ peak_fp16_tflops=25.0,
785
+ peak_int8_tops=101.0,
786
+ compute_capability="8.6",
787
+ tdp_watts=170,
788
+ )
789
+
790
+ NVIDIA_RTX_3050 = HardwareProfile(
791
+ name="NVIDIA RTX 3050",
792
+ vendor="nvidia",
793
+ device_type="gpu",
794
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
795
+ memory_bandwidth_bytes_per_s=224 * (1024**3), # 224 GB/s
796
+ peak_fp32_tflops=9.1,
797
+ peak_fp16_tflops=18.0,
798
+ peak_int8_tops=73.0,
799
+ compute_capability="8.6",
800
+ tdp_watts=130,
801
+ )
802
+
803
+ # ============================================================================
804
+ # NVIDIA Laptop GPUs (Mobile variants - lower TDP/clocks)
805
+ # ============================================================================
806
+
807
+ NVIDIA_RTX_4090_MOBILE = HardwareProfile(
808
+ name="NVIDIA RTX 4090 Mobile",
809
+ vendor="nvidia",
810
+ device_type="gpu",
811
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6
812
+ memory_bandwidth_bytes_per_s=576 * (1024**3), # 576 GB/s
813
+ peak_fp32_tflops=58.0, # ~70% of desktop
814
+ peak_fp16_tflops=116.0,
815
+ peak_int8_tops=464.0,
816
+ compute_capability="8.9",
817
+ tdp_watts=150, # 80-150W configurable
818
+ )
819
+
820
+ NVIDIA_RTX_4080_MOBILE = HardwareProfile(
821
+ name="NVIDIA RTX 4080 Mobile",
822
+ vendor="nvidia",
823
+ device_type="gpu",
824
+ vram_bytes=12 * (1024**3), # 12 GB GDDR6
825
+ memory_bandwidth_bytes_per_s=432 * (1024**3), # 432 GB/s
826
+ peak_fp32_tflops=34.0,
827
+ peak_fp16_tflops=68.0,
828
+ peak_int8_tops=272.0,
829
+ compute_capability="8.9",
830
+ tdp_watts=150,
831
+ )
832
+
833
+ NVIDIA_RTX_4070_MOBILE = HardwareProfile(
834
+ name="NVIDIA RTX 4070 Mobile",
835
+ vendor="nvidia",
836
+ device_type="gpu",
837
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
838
+ memory_bandwidth_bytes_per_s=256 * (1024**3), # 256 GB/s
839
+ peak_fp32_tflops=22.0,
840
+ peak_fp16_tflops=44.0,
841
+ peak_int8_tops=176.0,
842
+ compute_capability="8.9",
843
+ tdp_watts=115,
844
+ )
845
+
846
+ NVIDIA_RTX_4060_MOBILE = HardwareProfile(
847
+ name="NVIDIA RTX 4060 Mobile",
848
+ vendor="nvidia",
849
+ device_type="gpu",
850
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
851
+ memory_bandwidth_bytes_per_s=256 * (1024**3), # 256 GB/s
852
+ peak_fp32_tflops=15.0,
853
+ peak_fp16_tflops=30.0,
854
+ peak_int8_tops=120.0,
855
+ compute_capability="8.9",
856
+ tdp_watts=115,
857
+ )
858
+
859
+ NVIDIA_RTX_4050_MOBILE = HardwareProfile(
860
+ name="NVIDIA RTX 4050 Mobile",
861
+ vendor="nvidia",
862
+ device_type="gpu",
863
+ vram_bytes=6 * (1024**3), # 6 GB GDDR6
864
+ memory_bandwidth_bytes_per_s=192 * (1024**3), # 192 GB/s
865
+ peak_fp32_tflops=11.0,
866
+ peak_fp16_tflops=22.0,
867
+ peak_int8_tops=88.0,
868
+ compute_capability="8.9",
869
+ tdp_watts=75,
870
+ )
871
+
872
+ NVIDIA_RTX_3080_MOBILE = HardwareProfile(
873
+ name="NVIDIA RTX 3080 Mobile",
874
+ vendor="nvidia",
875
+ device_type="gpu",
876
+ vram_bytes=16 * (1024**3), # 16 GB GDDR6
877
+ memory_bandwidth_bytes_per_s=448 * (1024**3), # 448 GB/s
878
+ peak_fp32_tflops=20.0,
879
+ peak_fp16_tflops=40.0,
880
+ peak_int8_tops=160.0,
881
+ compute_capability="8.6",
882
+ tdp_watts=150,
883
+ )
884
+
885
+ NVIDIA_RTX_3070_MOBILE = HardwareProfile(
886
+ name="NVIDIA RTX 3070 Mobile",
887
+ vendor="nvidia",
888
+ device_type="gpu",
889
+ vram_bytes=8 * (1024**3), # 8 GB GDDR6
890
+ memory_bandwidth_bytes_per_s=384 * (1024**3), # 384 GB/s
891
+ peak_fp32_tflops=14.0,
892
+ peak_fp16_tflops=28.0,
893
+ peak_int8_tops=112.0,
894
+ compute_capability="8.6",
895
+ tdp_watts=125,
896
+ )
897
+
898
+ NVIDIA_RTX_3060_MOBILE = HardwareProfile(
899
+ name="NVIDIA RTX 3060 Mobile",
900
+ vendor="nvidia",
901
+ device_type="gpu",
902
+ vram_bytes=6 * (1024**3), # 6 GB GDDR6
903
+ memory_bandwidth_bytes_per_s=336 * (1024**3), # 336 GB/s
904
+ peak_fp32_tflops=10.0,
905
+ peak_fp16_tflops=20.0,
906
+ peak_int8_tops=80.0,
907
+ compute_capability="8.6",
908
+ tdp_watts=115,
909
+ )
910
+
911
+ # Generic CPU profile (will be overridden by detection)
912
+ GENERIC_CPU = HardwareProfile(
913
+ name="Generic CPU",
914
+ vendor="generic",
915
+ device_type="cpu",
916
+ vram_bytes=16 * (1024**3), # Assume 16 GB RAM
917
+ memory_bandwidth_bytes_per_s=50 * (1024**3), # ~50 GB/s DDR4
918
+ peak_fp32_tflops=0.5, # Very rough estimate
919
+ peak_fp16_tflops=0.25, # CPUs typically slower at FP16
920
+ peak_int8_tops=2.0, # VNNI/AVX-512
921
+ compute_capability="",
922
+ tdp_watts=65,
923
+ )
924
+
925
+
926
+ # ============================================================================
927
+ # DGX Systems (Multi-GPU)
928
+ # ============================================================================
929
+
930
+ NVIDIA_DGX_H100 = HardwareProfile(
931
+ name="NVIDIA DGX H100",
932
+ vendor="nvidia",
933
+ device_type="gpu",
934
+ vram_bytes=8 * 80 * (1024**3), # 8x H100 80GB = 640 GB
935
+ memory_bandwidth_bytes_per_s=8 * 3350 * (1024**3), # 8x 3.35 TB/s = 26.8 TB/s
936
+ peak_fp32_tflops=8 * 67.0, # 536 TFLOPS
937
+ peak_fp16_tflops=8 * 1979.0, # 15,832 TFLOPS
938
+ peak_int8_tops=8 * 3958.0, # 31,664 TOPS
939
+ compute_capability="9.0",
940
+ tdp_watts=10200, # System power
941
+ )
942
+
943
+ NVIDIA_DGX_A100_640GB = HardwareProfile(
944
+ name="NVIDIA DGX A100 640GB",
945
+ vendor="nvidia",
946
+ device_type="gpu",
947
+ vram_bytes=8 * 80 * (1024**3), # 8x A100 80GB = 640 GB
948
+ memory_bandwidth_bytes_per_s=8 * 2039 * (1024**3), # 8x 2.0 TB/s = 16 TB/s
949
+ peak_fp32_tflops=8 * 19.5, # 156 TFLOPS
950
+ peak_fp16_tflops=8 * 312.0, # 2,496 TFLOPS
951
+ peak_int8_tops=8 * 624.0, # 4,992 TOPS
952
+ compute_capability="8.0",
953
+ tdp_watts=6500,
954
+ )
955
+
956
+ NVIDIA_DGX_A100_320GB = HardwareProfile(
957
+ name="NVIDIA DGX A100 320GB",
958
+ vendor="nvidia",
959
+ device_type="gpu",
960
+ vram_bytes=8 * 40 * (1024**3), # 8x A100 40GB = 320 GB
961
+ memory_bandwidth_bytes_per_s=8 * 1555 * (1024**3), # 8x 1.6 TB/s = 12.4 TB/s
962
+ peak_fp32_tflops=8 * 19.5, # 156 TFLOPS
963
+ peak_fp16_tflops=8 * 312.0, # 2,496 TFLOPS
964
+ peak_int8_tops=8 * 624.0, # 4,992 TOPS
965
+ compute_capability="8.0",
966
+ tdp_watts=6500,
967
+ )
968
+
969
+
970
+ # ============================================================================
971
+ # Cloud Instance Profiles (with cost estimates)
972
+ # ============================================================================
973
+
974
+
975
+ @dataclass
976
+ class CloudInstanceProfile:
977
+ """Cloud instance with GPU specs and pricing."""
978
+
979
+ name: str
980
+ provider: str # "aws", "azure", "gcp"
981
+ instance_type: str
982
+ hardware: HardwareProfile
983
+ gpu_count: int
984
+ hourly_cost_usd: float # On-demand pricing (approximate)
985
+ region: str = "us-east-1" # Default region for pricing
986
+
987
+ def to_dict(self) -> dict[str, Any]:
988
+ return {
989
+ "name": self.name,
990
+ "provider": self.provider,
991
+ "instance_type": self.instance_type,
992
+ "gpu_count": self.gpu_count,
993
+ "total_vram_gb": round(self.hardware.vram_bytes * self.gpu_count / (1024**3), 1),
994
+ "hourly_cost_usd": self.hourly_cost_usd,
995
+ "hardware": self.hardware.to_dict(),
996
+ }
997
+
998
+
999
+ # AWS GPU Instances
1000
+ AWS_P5_48XLARGE = CloudInstanceProfile(
1001
+ name="AWS p5.48xlarge (8x H100)",
1002
+ provider="aws",
1003
+ instance_type="p5.48xlarge",
1004
+ hardware=NVIDIA_H100_SXM,
1005
+ gpu_count=8,
1006
+ hourly_cost_usd=98.32,
1007
+ )
1008
+
1009
+ AWS_P4D_24XLARGE = CloudInstanceProfile(
1010
+ name="AWS p4d.24xlarge (8x A100 40GB)",
1011
+ provider="aws",
1012
+ instance_type="p4d.24xlarge",
1013
+ hardware=NVIDIA_A100_40GB,
1014
+ gpu_count=8,
1015
+ hourly_cost_usd=32.77,
1016
+ )
1017
+
1018
+ AWS_P4DE_24XLARGE = CloudInstanceProfile(
1019
+ name="AWS p4de.24xlarge (8x A100 80GB)",
1020
+ provider="aws",
1021
+ instance_type="p4de.24xlarge",
1022
+ hardware=NVIDIA_A100_80GB,
1023
+ gpu_count=8,
1024
+ hourly_cost_usd=40.96,
1025
+ )
1026
+
1027
+ AWS_G5_XLARGE = CloudInstanceProfile(
1028
+ name="AWS g5.xlarge (1x A10G)",
1029
+ provider="aws",
1030
+ instance_type="g5.xlarge",
1031
+ hardware=NVIDIA_A10,
1032
+ gpu_count=1,
1033
+ hourly_cost_usd=1.01,
1034
+ )
1035
+
1036
+ AWS_G5_12XLARGE = CloudInstanceProfile(
1037
+ name="AWS g5.12xlarge (4x A10G)",
1038
+ provider="aws",
1039
+ instance_type="g5.12xlarge",
1040
+ hardware=NVIDIA_A10,
1041
+ gpu_count=4,
1042
+ hourly_cost_usd=5.67,
1043
+ )
1044
+
1045
+ AWS_G5_48XLARGE = CloudInstanceProfile(
1046
+ name="AWS g5.48xlarge (8x A10G)",
1047
+ provider="aws",
1048
+ instance_type="g5.48xlarge",
1049
+ hardware=NVIDIA_A10,
1050
+ gpu_count=8,
1051
+ hourly_cost_usd=16.29,
1052
+ )
1053
+
1054
+ AWS_G4DN_XLARGE = CloudInstanceProfile(
1055
+ name="AWS g4dn.xlarge (1x T4)",
1056
+ provider="aws",
1057
+ instance_type="g4dn.xlarge",
1058
+ hardware=NVIDIA_T4,
1059
+ gpu_count=1,
1060
+ hourly_cost_usd=0.526,
1061
+ )
1062
+
1063
+ AWS_INF2_XLARGE = CloudInstanceProfile(
1064
+ name="AWS inf2.xlarge (1x Inferentia2)",
1065
+ provider="aws",
1066
+ instance_type="inf2.xlarge",
1067
+ hardware=HardwareProfile(
1068
+ name="AWS Inferentia2",
1069
+ vendor="aws",
1070
+ device_type="npu",
1071
+ vram_bytes=32 * (1024**3),
1072
+ memory_bandwidth_bytes_per_s=190 * (1024**3),
1073
+ peak_fp32_tflops=0.0, # Optimized for inference, not FP32
1074
+ peak_fp16_tflops=95.0,
1075
+ peak_int8_tops=190.0,
1076
+ compute_capability="",
1077
+ tdp_watts=75,
1078
+ ),
1079
+ gpu_count=1,
1080
+ hourly_cost_usd=0.758,
1081
+ )
1082
+
1083
+ # Azure GPU Instances
1084
+ AZURE_NC_A100_V4 = CloudInstanceProfile(
1085
+ name="Azure NC A100 v4 (1x A100 80GB)",
1086
+ provider="azure",
1087
+ instance_type="Standard_NC24ads_A100_v4",
1088
+ hardware=NVIDIA_A100_80GB,
1089
+ gpu_count=1,
1090
+ hourly_cost_usd=3.67,
1091
+ )
1092
+
1093
+ AZURE_ND_A100_V4 = CloudInstanceProfile(
1094
+ name="Azure ND A100 v4 (8x A100 80GB)",
1095
+ provider="azure",
1096
+ instance_type="Standard_ND96amsr_A100_v4",
1097
+ hardware=NVIDIA_A100_80GB,
1098
+ gpu_count=8,
1099
+ hourly_cost_usd=32.77,
1100
+ )
1101
+
1102
+ AZURE_NC_H100_V5 = CloudInstanceProfile(
1103
+ name="Azure NC H100 v5 (1x H100)",
1104
+ provider="azure",
1105
+ instance_type="Standard_NC40ads_H100_v5",
1106
+ hardware=NVIDIA_H100_PCIE,
1107
+ gpu_count=1,
1108
+ hourly_cost_usd=7.35,
1109
+ )
1110
+
1111
+ AZURE_ND_H100_V5 = CloudInstanceProfile(
1112
+ name="Azure ND H100 v5 (8x H100)",
1113
+ provider="azure",
1114
+ instance_type="Standard_ND96isr_H100_v5",
1115
+ hardware=NVIDIA_H100_SXM,
1116
+ gpu_count=8,
1117
+ hourly_cost_usd=65.93,
1118
+ )
1119
+
1120
+ AZURE_NC_T4_V3 = CloudInstanceProfile(
1121
+ name="Azure NC T4 v3 (1x T4)",
1122
+ provider="azure",
1123
+ instance_type="Standard_NC4as_T4_v3",
1124
+ hardware=NVIDIA_T4,
1125
+ gpu_count=1,
1126
+ hourly_cost_usd=0.526,
1127
+ )
1128
+
1129
+ AZURE_NV_A10_V5 = CloudInstanceProfile(
1130
+ name="Azure NV A10 v5 (1x A10)",
1131
+ provider="azure",
1132
+ instance_type="Standard_NV36ads_A10_v5",
1133
+ hardware=NVIDIA_A10,
1134
+ gpu_count=1,
1135
+ hourly_cost_usd=1.80,
1136
+ )
1137
+
1138
+ # GCP GPU Instances
1139
+ GCP_A3_HIGHGPU_8G = CloudInstanceProfile(
1140
+ name="GCP a3-highgpu-8g (8x H100)",
1141
+ provider="gcp",
1142
+ instance_type="a3-highgpu-8g",
1143
+ hardware=NVIDIA_H100_SXM,
1144
+ gpu_count=8,
1145
+ hourly_cost_usd=101.22,
1146
+ )
1147
+
1148
+ GCP_A2_HIGHGPU_1G = CloudInstanceProfile(
1149
+ name="GCP a2-highgpu-1g (1x A100 40GB)",
1150
+ provider="gcp",
1151
+ instance_type="a2-highgpu-1g",
1152
+ hardware=NVIDIA_A100_40GB,
1153
+ gpu_count=1,
1154
+ hourly_cost_usd=3.67,
1155
+ )
1156
+
1157
+ GCP_A2_HIGHGPU_8G = CloudInstanceProfile(
1158
+ name="GCP a2-highgpu-8g (8x A100 40GB)",
1159
+ provider="gcp",
1160
+ instance_type="a2-highgpu-8g",
1161
+ hardware=NVIDIA_A100_40GB,
1162
+ gpu_count=8,
1163
+ hourly_cost_usd=29.39,
1164
+ )
1165
+
1166
+ GCP_A2_ULTRAGPU_1G = CloudInstanceProfile(
1167
+ name="GCP a2-ultragpu-1g (1x A100 80GB)",
1168
+ provider="gcp",
1169
+ instance_type="a2-ultragpu-1g",
1170
+ hardware=NVIDIA_A100_80GB,
1171
+ gpu_count=1,
1172
+ hourly_cost_usd=5.00,
1173
+ )
1174
+
1175
+ GCP_A2_ULTRAGPU_8G = CloudInstanceProfile(
1176
+ name="GCP a2-ultragpu-8g (8x A100 80GB)",
1177
+ provider="gcp",
1178
+ instance_type="a2-ultragpu-8g",
1179
+ hardware=NVIDIA_A100_80GB,
1180
+ gpu_count=8,
1181
+ hourly_cost_usd=40.04,
1182
+ )
1183
+
1184
+ GCP_G2_STANDARD_4 = CloudInstanceProfile(
1185
+ name="GCP g2-standard-4 (1x L4)",
1186
+ provider="gcp",
1187
+ instance_type="g2-standard-4",
1188
+ hardware=NVIDIA_L4,
1189
+ gpu_count=1,
1190
+ hourly_cost_usd=0.84,
1191
+ )
1192
+
1193
+ GCP_N1_T4 = CloudInstanceProfile(
1194
+ name="GCP n1-standard-4 + T4 (1x T4)",
1195
+ provider="gcp",
1196
+ instance_type="n1-standard-4",
1197
+ hardware=NVIDIA_T4,
1198
+ gpu_count=1,
1199
+ hourly_cost_usd=0.55,
1200
+ )
1201
+
1202
+
1203
+ # Cloud instance registry
1204
+ CLOUD_INSTANCES: dict[str, CloudInstanceProfile] = {
1205
+ # AWS
1206
+ "aws-p5-48xlarge": AWS_P5_48XLARGE,
1207
+ "aws-p4d-24xlarge": AWS_P4D_24XLARGE,
1208
+ "aws-p4de-24xlarge": AWS_P4DE_24XLARGE,
1209
+ "aws-g5-xlarge": AWS_G5_XLARGE,
1210
+ "aws-g5-12xlarge": AWS_G5_12XLARGE,
1211
+ "aws-g5-48xlarge": AWS_G5_48XLARGE,
1212
+ "aws-g4dn-xlarge": AWS_G4DN_XLARGE,
1213
+ "aws-inf2-xlarge": AWS_INF2_XLARGE,
1214
+ # Azure
1215
+ "azure-nc-a100-v4": AZURE_NC_A100_V4,
1216
+ "azure-nd-a100-v4": AZURE_ND_A100_V4,
1217
+ "azure-nc-h100-v5": AZURE_NC_H100_V5,
1218
+ "azure-nd-h100-v5": AZURE_ND_H100_V5,
1219
+ "azure-nc-t4-v3": AZURE_NC_T4_V3,
1220
+ "azure-nv-a10-v5": AZURE_NV_A10_V5,
1221
+ # GCP
1222
+ "gcp-a3-highgpu-8g": GCP_A3_HIGHGPU_8G,
1223
+ "gcp-a2-highgpu-1g": GCP_A2_HIGHGPU_1G,
1224
+ "gcp-a2-highgpu-8g": GCP_A2_HIGHGPU_8G,
1225
+ "gcp-a2-ultragpu-1g": GCP_A2_ULTRAGPU_1G,
1226
+ "gcp-a2-ultragpu-8g": GCP_A2_ULTRAGPU_8G,
1227
+ "gcp-g2-standard-4": GCP_G2_STANDARD_4,
1228
+ "gcp-n1-t4": GCP_N1_T4,
1229
+ }
1230
+
1231
+
1232
+ # Registry of all predefined profiles
1233
+ HARDWARE_PROFILES: dict[str, HardwareProfile] = {
1234
+ # -------------------------------------------------------------------------
1235
+ # Data Center GPUs - H100 Series
1236
+ # -------------------------------------------------------------------------
1237
+ "h100": NVIDIA_H100_SXM,
1238
+ "h100-sxm": NVIDIA_H100_SXM,
1239
+ "h100-80gb-sxm": NVIDIA_H100_SXM,
1240
+ "h100-pcie": NVIDIA_H100_PCIE,
1241
+ "h100-80gb-pcie": NVIDIA_H100_PCIE,
1242
+ "h100-nvl": NVIDIA_H100_NVL,
1243
+ "h100-94gb-nvl": NVIDIA_H100_NVL,
1244
+ # -------------------------------------------------------------------------
1245
+ # Data Center GPUs - A100 Series
1246
+ # -------------------------------------------------------------------------
1247
+ "a100": NVIDIA_A100_80GB, # Default A100 is 80GB SXM
1248
+ "a100-80gb": NVIDIA_A100_80GB_SXM,
1249
+ "a100-80gb-sxm": NVIDIA_A100_80GB_SXM,
1250
+ "a100-80gb-pcie": NVIDIA_A100_80GB_PCIE,
1251
+ "a100-40gb": NVIDIA_A100_40GB_SXM,
1252
+ "a100-40gb-sxm": NVIDIA_A100_40GB_SXM,
1253
+ "a100-40gb-pcie": NVIDIA_A100_40GB_PCIE,
1254
+ # -------------------------------------------------------------------------
1255
+ # Data Center GPUs - Other Current Gen
1256
+ # -------------------------------------------------------------------------
1257
+ "a10": NVIDIA_A10,
1258
+ "l4": NVIDIA_L4,
1259
+ "l40": NVIDIA_L40,
1260
+ "l40s": NVIDIA_L40S,
1261
+ "t4": NVIDIA_T4,
1262
+ # -------------------------------------------------------------------------
1263
+ # Data Center GPUs - V100 Series (Previous Gen)
1264
+ # -------------------------------------------------------------------------
1265
+ "v100": NVIDIA_V100_32GB,
1266
+ "v100-32gb": NVIDIA_V100_32GB_SXM,
1267
+ "v100-32gb-sxm": NVIDIA_V100_32GB_SXM,
1268
+ "v100-32gb-pcie": NVIDIA_V100_32GB_PCIE,
1269
+ "v100-16gb": NVIDIA_V100_16GB_PCIE,
1270
+ "v100-16gb-sxm": NVIDIA_V100_16GB_SXM,
1271
+ "v100-16gb-pcie": NVIDIA_V100_16GB_PCIE,
1272
+ # -------------------------------------------------------------------------
1273
+ # Data Center GPUs - Legacy
1274
+ # -------------------------------------------------------------------------
1275
+ "p100": NVIDIA_P100,
1276
+ "p40": NVIDIA_P40,
1277
+ # -------------------------------------------------------------------------
1278
+ # Jetson Edge/Embedded (Orin Series - Current)
1279
+ # -------------------------------------------------------------------------
1280
+ "jetson-agx-orin-64gb": NVIDIA_JETSON_AGX_ORIN_64GB,
1281
+ "jetson-agx-orin-32gb": NVIDIA_JETSON_AGX_ORIN_32GB,
1282
+ "jetson-agx-orin": NVIDIA_JETSON_AGX_ORIN_64GB, # Default to 64GB
1283
+ "orin-agx": NVIDIA_JETSON_AGX_ORIN_64GB,
1284
+ "jetson-orin-nx-16gb": NVIDIA_JETSON_ORIN_NX_16GB,
1285
+ "jetson-orin-nx-8gb": NVIDIA_JETSON_ORIN_NX_8GB,
1286
+ "jetson-orin-nx": NVIDIA_JETSON_ORIN_NX_16GB,
1287
+ "orin-nx": NVIDIA_JETSON_ORIN_NX_16GB,
1288
+ "jetson-orin-nano-8gb": NVIDIA_JETSON_ORIN_NANO_8GB,
1289
+ "jetson-orin-nano-4gb": NVIDIA_JETSON_ORIN_NANO_4GB,
1290
+ "jetson-orin-nano": NVIDIA_JETSON_ORIN_NANO_8GB,
1291
+ "orin-nano": NVIDIA_JETSON_ORIN_NANO_8GB,
1292
+ # Jetson Edge/Embedded (Xavier Series)
1293
+ "jetson-agx-xavier-32gb": NVIDIA_JETSON_AGX_XAVIER_32GB,
1294
+ "jetson-agx-xavier-16gb": NVIDIA_JETSON_AGX_XAVIER_16GB,
1295
+ "jetson-agx-xavier": NVIDIA_JETSON_AGX_XAVIER_32GB,
1296
+ "xavier-agx": NVIDIA_JETSON_AGX_XAVIER_32GB,
1297
+ "jetson-xavier-nx-16gb": NVIDIA_JETSON_XAVIER_NX_16GB,
1298
+ "jetson-xavier-nx-8gb": NVIDIA_JETSON_XAVIER_NX_8GB,
1299
+ "jetson-xavier-nx": NVIDIA_JETSON_XAVIER_NX_8GB,
1300
+ "xavier-nx": NVIDIA_JETSON_XAVIER_NX_8GB,
1301
+ # Jetson Edge/Embedded (TX2 Series)
1302
+ "jetson-tx2": NVIDIA_JETSON_TX2,
1303
+ "tx2": NVIDIA_JETSON_TX2,
1304
+ "jetson-tx2-nx": NVIDIA_JETSON_TX2_NX,
1305
+ "tx2-nx": NVIDIA_JETSON_TX2_NX,
1306
+ # Jetson Edge/Embedded (Nano - Most Constrained!)
1307
+ "jetson-nano": NVIDIA_JETSON_NANO,
1308
+ "nano": NVIDIA_JETSON_NANO,
1309
+ "jetson-nano-2gb": NVIDIA_JETSON_NANO_2GB,
1310
+ "nano-2gb": NVIDIA_JETSON_NANO_2GB,
1311
+ # -------------------------------------------------------------------------
1312
+ # Consumer GPUs - RTX 40 Series (Ada Lovelace)
1313
+ # -------------------------------------------------------------------------
1314
+ "rtx4090": NVIDIA_RTX_4090,
1315
+ "4090": NVIDIA_RTX_4090,
1316
+ "rtx4080-super": NVIDIA_RTX_4080_SUPER,
1317
+ "4080-super": NVIDIA_RTX_4080_SUPER,
1318
+ "rtx4080": NVIDIA_RTX_4080,
1319
+ "4080": NVIDIA_RTX_4080,
1320
+ "rtx4070-ti-super": NVIDIA_RTX_4070_TI_SUPER,
1321
+ "4070-ti-super": NVIDIA_RTX_4070_TI_SUPER,
1322
+ "rtx4070-ti": NVIDIA_RTX_4070_TI,
1323
+ "4070-ti": NVIDIA_RTX_4070_TI,
1324
+ "rtx4070-super": NVIDIA_RTX_4070_SUPER,
1325
+ "4070-super": NVIDIA_RTX_4070_SUPER,
1326
+ "rtx4070": NVIDIA_RTX_4070,
1327
+ "4070": NVIDIA_RTX_4070,
1328
+ "rtx4060-ti-16gb": NVIDIA_RTX_4060_TI_16GB,
1329
+ "4060-ti-16gb": NVIDIA_RTX_4060_TI_16GB,
1330
+ "rtx4060-ti": NVIDIA_RTX_4060_TI_8GB,
1331
+ "4060-ti": NVIDIA_RTX_4060_TI_8GB,
1332
+ "rtx4060-ti-8gb": NVIDIA_RTX_4060_TI_8GB,
1333
+ "4060-ti-8gb": NVIDIA_RTX_4060_TI_8GB,
1334
+ "rtx4060": NVIDIA_RTX_4060,
1335
+ "4060": NVIDIA_RTX_4060,
1336
+ # -------------------------------------------------------------------------
1337
+ # Consumer GPUs - RTX 30 Series (Ampere)
1338
+ # -------------------------------------------------------------------------
1339
+ "rtx3090-ti": NVIDIA_RTX_3090_TI,
1340
+ "3090-ti": NVIDIA_RTX_3090_TI,
1341
+ "rtx3090": NVIDIA_RTX_3090,
1342
+ "3090": NVIDIA_RTX_3090,
1343
+ "rtx3080-ti": NVIDIA_RTX_3080_TI,
1344
+ "3080-ti": NVIDIA_RTX_3080_TI,
1345
+ "rtx3080-12gb": NVIDIA_RTX_3080_12GB,
1346
+ "3080-12gb": NVIDIA_RTX_3080_12GB,
1347
+ "rtx3080": NVIDIA_RTX_3080,
1348
+ "3080": NVIDIA_RTX_3080,
1349
+ "rtx3080-10gb": NVIDIA_RTX_3080_10GB,
1350
+ "3080-10gb": NVIDIA_RTX_3080_10GB,
1351
+ "rtx3070-ti": NVIDIA_RTX_3070_TI,
1352
+ "3070-ti": NVIDIA_RTX_3070_TI,
1353
+ "rtx3070": NVIDIA_RTX_3070,
1354
+ "3070": NVIDIA_RTX_3070,
1355
+ "rtx3060-ti": NVIDIA_RTX_3060_TI,
1356
+ "3060-ti": NVIDIA_RTX_3060_TI,
1357
+ "rtx3060-12gb": NVIDIA_RTX_3060_12GB,
1358
+ "rtx3060": NVIDIA_RTX_3060_12GB, # Default to 12GB
1359
+ "3060": NVIDIA_RTX_3060_12GB,
1360
+ "rtx3060-8gb": NVIDIA_RTX_3060_8GB,
1361
+ "3060-8gb": NVIDIA_RTX_3060_8GB,
1362
+ "rtx3050": NVIDIA_RTX_3050,
1363
+ "3050": NVIDIA_RTX_3050,
1364
+ # -------------------------------------------------------------------------
1365
+ # Laptop/Mobile GPUs
1366
+ # -------------------------------------------------------------------------
1367
+ "rtx4090-mobile": NVIDIA_RTX_4090_MOBILE,
1368
+ "4090-mobile": NVIDIA_RTX_4090_MOBILE,
1369
+ "rtx4080-mobile": NVIDIA_RTX_4080_MOBILE,
1370
+ "4080-mobile": NVIDIA_RTX_4080_MOBILE,
1371
+ "rtx4070-mobile": NVIDIA_RTX_4070_MOBILE,
1372
+ "4070-mobile": NVIDIA_RTX_4070_MOBILE,
1373
+ "rtx4060-mobile": NVIDIA_RTX_4060_MOBILE,
1374
+ "4060-mobile": NVIDIA_RTX_4060_MOBILE,
1375
+ "rtx4050-mobile": NVIDIA_RTX_4050_MOBILE,
1376
+ "4050-mobile": NVIDIA_RTX_4050_MOBILE,
1377
+ "rtx3080-mobile": NVIDIA_RTX_3080_MOBILE,
1378
+ "3080-mobile": NVIDIA_RTX_3080_MOBILE,
1379
+ "rtx3070-mobile": NVIDIA_RTX_3070_MOBILE,
1380
+ "3070-mobile": NVIDIA_RTX_3070_MOBILE,
1381
+ "rtx3060-mobile": NVIDIA_RTX_3060_MOBILE,
1382
+ "3060-mobile": NVIDIA_RTX_3060_MOBILE,
1383
+ # -------------------------------------------------------------------------
1384
+ # DGX Systems (Multi-GPU)
1385
+ # -------------------------------------------------------------------------
1386
+ "dgx-h100": NVIDIA_DGX_H100,
1387
+ "dgx-a100-640gb": NVIDIA_DGX_A100_640GB,
1388
+ "dgx-a100-320gb": NVIDIA_DGX_A100_320GB,
1389
+ "dgx-a100": NVIDIA_DGX_A100_640GB, # Default to 640GB
1390
+ # -------------------------------------------------------------------------
1391
+ # Generic / Fallback
1392
+ # -------------------------------------------------------------------------
1393
+ "cpu": GENERIC_CPU,
1394
+ }
1395
+
1396
+
1397
+ # ============================================================================
1398
+ # Hardware Detection
1399
+ # ============================================================================
1400
+
1401
+
1402
+ class HardwareDetector:
1403
+ """
1404
+ Detect local hardware configuration.
1405
+
1406
+ Attempts to detect NVIDIA GPUs via nvidia-smi, falls back to CPU info.
1407
+ """
1408
+
1409
+ def __init__(self, logger: logging.Logger | None = None):
1410
+ self.logger = logger or logging.getLogger("haoline.hardware")
1411
+
1412
+ def detect(self) -> HardwareProfile:
1413
+ """
1414
+ Auto-detect local hardware.
1415
+
1416
+ Returns:
1417
+ HardwareProfile for the detected hardware.
1418
+ """
1419
+ # Try NVIDIA GPU first
1420
+ gpu_profile = self._detect_nvidia_gpu()
1421
+ if gpu_profile:
1422
+ return gpu_profile
1423
+
1424
+ # Fall back to CPU
1425
+ return self._detect_cpu()
1426
+
1427
+ def _detect_nvidia_gpu(self) -> HardwareProfile | None:
1428
+ """Detect NVIDIA GPU using nvidia-smi."""
1429
+ try:
1430
+ # Query GPU name and memory
1431
+ result = subprocess.run(
1432
+ [
1433
+ "nvidia-smi",
1434
+ "--query-gpu=name,memory.total,compute_cap",
1435
+ "--format=csv,noheader,nounits",
1436
+ ],
1437
+ check=False,
1438
+ capture_output=True,
1439
+ text=True,
1440
+ timeout=5,
1441
+ )
1442
+
1443
+ if result.returncode != 0:
1444
+ self.logger.debug("nvidia-smi failed or not found")
1445
+ return None
1446
+
1447
+ # Parse first GPU (could extend to multi-GPU)
1448
+ line = result.stdout.strip().split("\n")[0]
1449
+ parts = [p.strip() for p in line.split(",")]
1450
+
1451
+ if len(parts) < 2:
1452
+ return None
1453
+
1454
+ gpu_name = parts[0]
1455
+ vram_mb = int(parts[1]) if parts[1].isdigit() else 0
1456
+ compute_cap = parts[2] if len(parts) > 2 else ""
1457
+
1458
+ self.logger.info(f"Detected GPU: {gpu_name} ({vram_mb} MB VRAM)")
1459
+
1460
+ # Try to match to a known profile
1461
+ profile = self._match_gpu_profile(gpu_name, vram_mb)
1462
+ if profile:
1463
+ # Create a copy with detected flag and actual VRAM
1464
+ return HardwareProfile(
1465
+ name=f"{gpu_name} (detected)",
1466
+ vendor="nvidia",
1467
+ device_type="gpu",
1468
+ vram_bytes=vram_mb * (1024**2),
1469
+ memory_bandwidth_bytes_per_s=profile.memory_bandwidth_bytes_per_s,
1470
+ peak_fp32_tflops=profile.peak_fp32_tflops,
1471
+ peak_fp16_tflops=profile.peak_fp16_tflops,
1472
+ peak_int8_tops=profile.peak_int8_tops,
1473
+ compute_capability=compute_cap or profile.compute_capability,
1474
+ tdp_watts=profile.tdp_watts,
1475
+ is_detected=True,
1476
+ )
1477
+
1478
+ # Unknown GPU - create generic profile with detected VRAM
1479
+ return HardwareProfile(
1480
+ name=f"{gpu_name} (detected)",
1481
+ vendor="nvidia",
1482
+ device_type="gpu",
1483
+ vram_bytes=vram_mb * (1024**2),
1484
+ memory_bandwidth_bytes_per_s=500 * (1024**3), # Conservative estimate
1485
+ peak_fp32_tflops=10.0, # Conservative
1486
+ peak_fp16_tflops=20.0,
1487
+ peak_int8_tops=40.0,
1488
+ compute_capability=compute_cap,
1489
+ is_detected=True,
1490
+ )
1491
+
1492
+ except FileNotFoundError:
1493
+ self.logger.debug("nvidia-smi not found")
1494
+ return None
1495
+ except subprocess.TimeoutExpired:
1496
+ self.logger.warning("nvidia-smi timed out")
1497
+ return None
1498
+ except Exception as e:
1499
+ self.logger.debug(f"GPU detection failed: {e}")
1500
+ return None
1501
+
1502
+ def _match_gpu_profile(self, gpu_name: str, vram_mb: int) -> HardwareProfile | None:
1503
+ """Match detected GPU name to a known profile."""
1504
+ gpu_name_lower = gpu_name.lower()
1505
+
1506
+ # Jetson detection (check first as they're embedded)
1507
+ if "jetson" in gpu_name_lower or "tegra" in gpu_name_lower:
1508
+ return self._match_jetson_profile(gpu_name_lower, vram_mb)
1509
+
1510
+ # Data center GPU patterns (check more specific patterns first)
1511
+ datacenter_matches = [
1512
+ ("h100", NVIDIA_H100_SXM),
1513
+ ("a100", NVIDIA_A100_80GB if vram_mb > 50000 else NVIDIA_A100_40GB),
1514
+ ("a10", NVIDIA_A10),
1515
+ ("l40s", NVIDIA_L40S),
1516
+ ("l40", NVIDIA_L40),
1517
+ ("l4", NVIDIA_L4),
1518
+ ("t4", NVIDIA_T4),
1519
+ ("v100", NVIDIA_V100_32GB if vram_mb > 20000 else NVIDIA_V100_16GB),
1520
+ ("p100", NVIDIA_P100),
1521
+ ("p40", NVIDIA_P40),
1522
+ ]
1523
+
1524
+ for pattern, profile in datacenter_matches:
1525
+ if pattern in gpu_name_lower:
1526
+ return profile
1527
+
1528
+ # Consumer GPU patterns
1529
+ consumer_matches = [
1530
+ ("4090", NVIDIA_RTX_4090),
1531
+ ("4080", NVIDIA_RTX_4080),
1532
+ ("4070", NVIDIA_RTX_4080), # Approximate with 4080
1533
+ ("4060", NVIDIA_RTX_4080), # Approximate
1534
+ ("3090", NVIDIA_RTX_3090),
1535
+ ("3080", NVIDIA_RTX_3080),
1536
+ ("3070", NVIDIA_RTX_3080), # Approximate
1537
+ ("3060", NVIDIA_RTX_3080), # Approximate
1538
+ ]
1539
+
1540
+ for pattern, profile in consumer_matches:
1541
+ if pattern in gpu_name_lower:
1542
+ return profile
1543
+
1544
+ return None
1545
+
1546
+ def _match_jetson_profile(self, gpu_name_lower: str, vram_mb: int) -> HardwareProfile | None:
1547
+ """Match Jetson device to appropriate profile."""
1548
+ # Orin series
1549
+ if "orin" in gpu_name_lower:
1550
+ if "agx" in gpu_name_lower:
1551
+ return (
1552
+ NVIDIA_JETSON_AGX_ORIN_64GB if vram_mb > 40000 else NVIDIA_JETSON_AGX_ORIN_32GB
1553
+ )
1554
+ elif "nx" in gpu_name_lower:
1555
+ return NVIDIA_JETSON_ORIN_NX_16GB if vram_mb > 10000 else NVIDIA_JETSON_ORIN_NX_8GB
1556
+ elif "nano" in gpu_name_lower:
1557
+ return (
1558
+ NVIDIA_JETSON_ORIN_NANO_8GB if vram_mb > 5000 else NVIDIA_JETSON_ORIN_NANO_4GB
1559
+ )
1560
+ # Default Orin
1561
+ return NVIDIA_JETSON_ORIN_NX_8GB
1562
+
1563
+ # Xavier series
1564
+ if "xavier" in gpu_name_lower:
1565
+ if "agx" in gpu_name_lower:
1566
+ return (
1567
+ NVIDIA_JETSON_AGX_XAVIER_32GB
1568
+ if vram_mb > 20000
1569
+ else NVIDIA_JETSON_AGX_XAVIER_16GB
1570
+ )
1571
+ elif "nx" in gpu_name_lower:
1572
+ return (
1573
+ NVIDIA_JETSON_XAVIER_NX_16GB if vram_mb > 10000 else NVIDIA_JETSON_XAVIER_NX_8GB
1574
+ )
1575
+ return NVIDIA_JETSON_XAVIER_NX_8GB
1576
+
1577
+ # TX2 series
1578
+ if "tx2" in gpu_name_lower:
1579
+ if "nx" in gpu_name_lower:
1580
+ return NVIDIA_JETSON_TX2_NX
1581
+ return NVIDIA_JETSON_TX2
1582
+
1583
+ # Nano (most common constrained device)
1584
+ if "nano" in gpu_name_lower:
1585
+ return NVIDIA_JETSON_NANO_2GB if vram_mb < 3000 else NVIDIA_JETSON_NANO
1586
+
1587
+ # Generic Jetson fallback based on memory
1588
+ if vram_mb <= 2000:
1589
+ return NVIDIA_JETSON_NANO_2GB
1590
+ elif vram_mb <= 4000:
1591
+ return NVIDIA_JETSON_NANO
1592
+ elif vram_mb <= 8000:
1593
+ return NVIDIA_JETSON_ORIN_NANO_8GB
1594
+ else:
1595
+ return NVIDIA_JETSON_ORIN_NX_16GB
1596
+
1597
+ def _detect_cpu(self) -> HardwareProfile:
1598
+ """Detect CPU and system memory."""
1599
+ cpu_name = platform.processor() or "Unknown CPU"
1600
+
1601
+ # Get system memory
1602
+ if _HAS_PSUTIL:
1603
+ ram_bytes = psutil.virtual_memory().total
1604
+ else:
1605
+ # Fallback: assume 16 GB
1606
+ ram_bytes = 16 * (1024**3)
1607
+
1608
+ # Estimate CPU performance (very rough)
1609
+ # Modern CPUs can do ~0.5-2 TFLOPS FP32 depending on cores/frequency
1610
+ cpu_count = os.cpu_count() or 4
1611
+ estimated_fp32_tflops = 0.1 * cpu_count # ~0.1 TFLOPS per core
1612
+
1613
+ self.logger.info(
1614
+ f"Detected CPU: {cpu_name} ({cpu_count} cores, {ram_bytes / (1024**3):.1f} GB RAM)"
1615
+ )
1616
+
1617
+ return HardwareProfile(
1618
+ name=f"{cpu_name} (detected)",
1619
+ vendor="generic",
1620
+ device_type="cpu",
1621
+ vram_bytes=ram_bytes,
1622
+ memory_bandwidth_bytes_per_s=50 * (1024**3), # Typical DDR4/DDR5
1623
+ peak_fp32_tflops=estimated_fp32_tflops,
1624
+ peak_fp16_tflops=estimated_fp32_tflops * 0.5, # CPUs slower at FP16
1625
+ peak_int8_tops=estimated_fp32_tflops * 4, # VNNI acceleration
1626
+ is_detected=True,
1627
+ )
1628
+
1629
+
1630
+ # ============================================================================
1631
+ # Hardware Estimator
1632
+ # ============================================================================
1633
+
1634
+
1635
+ @dataclass
1636
+ class HardwareEstimates:
1637
+ """Estimated performance characteristics for a model on specific hardware."""
1638
+
1639
+ device: str
1640
+ precision: str
1641
+ batch_size: int
1642
+
1643
+ # Memory
1644
+ vram_required_bytes: int
1645
+ fits_in_vram: bool
1646
+
1647
+ # Performance
1648
+ theoretical_latency_ms: float
1649
+ compute_utilization_estimate: float # 0.0 - 1.0, roofline (compute_time/memory_time)
1650
+ gpu_saturation: float # 0.0 - 1.0, model_flops / gpu_capacity per inference
1651
+ bottleneck: str # "compute", "memory_bandwidth", "vram"
1652
+
1653
+ # Context
1654
+ model_flops: int
1655
+ hardware_peak_tflops: float
1656
+
1657
+ def to_dict(self) -> dict[str, Any]:
1658
+ return {
1659
+ "device": self.device,
1660
+ "precision": self.precision,
1661
+ "batch_size": self.batch_size,
1662
+ "vram_required_gb": round(self.vram_required_bytes / (1024**3), 2),
1663
+ "fits_in_vram": self.fits_in_vram,
1664
+ "theoretical_latency_ms": round(self.theoretical_latency_ms, 2),
1665
+ "compute_utilization_estimate": round(self.compute_utilization_estimate, 2),
1666
+ "gpu_saturation": round(self.gpu_saturation, 6),
1667
+ "bottleneck": self.bottleneck,
1668
+ }
1669
+
1670
+
1671
+ class HardwareEstimator:
1672
+ """
1673
+ Estimate hardware requirements and performance.
1674
+
1675
+ Provides theoretical bounds based on model complexity and hardware specs.
1676
+ Actual performance will vary based on implementation efficiency.
1677
+ """
1678
+
1679
+ def __init__(self, logger: logging.Logger | None = None):
1680
+ self.logger = logger or logging.getLogger("haoline.hardware")
1681
+
1682
+ def estimate(
1683
+ self,
1684
+ model_params: int,
1685
+ model_flops: int,
1686
+ peak_activation_bytes: int,
1687
+ hardware: HardwareProfile,
1688
+ batch_size: int = 1,
1689
+ precision: str = "fp32",
1690
+ ) -> HardwareEstimates:
1691
+ """
1692
+ Estimate hardware requirements for a model.
1693
+
1694
+ Args:
1695
+ model_params: Total parameter count
1696
+ model_flops: FLOPs per inference (batch=1)
1697
+ peak_activation_bytes: Peak activation memory (batch=1)
1698
+ hardware: Target hardware profile
1699
+ batch_size: Batch size for inference
1700
+ precision: "fp32", "fp16", or "int8"
1701
+
1702
+ Returns:
1703
+ HardwareEstimates with performance predictions
1704
+ """
1705
+ # Bytes per parameter based on precision
1706
+ bytes_per_param = {"fp32": 4, "fp16": 2, "int8": 1, "bf16": 2}.get(precision, 4)
1707
+
1708
+ # Model weights memory
1709
+ weights_bytes = model_params * bytes_per_param
1710
+
1711
+ # Activation memory scales with batch size
1712
+ activation_bytes = peak_activation_bytes * batch_size
1713
+
1714
+ # Total VRAM required (weights + activations + workspace overhead)
1715
+ workspace_overhead = 1.2 # 20% overhead for cuDNN workspace, etc.
1716
+ vram_required = int((weights_bytes + activation_bytes) * workspace_overhead)
1717
+
1718
+ fits_in_vram = vram_required <= hardware.vram_bytes
1719
+
1720
+ # Select peak TFLOPS based on precision
1721
+ if precision == "int8":
1722
+ peak_tflops = hardware.peak_int8_tops # Note: TOPS, not TFLOPS
1723
+ elif precision in ("fp16", "bf16"):
1724
+ peak_tflops = hardware.peak_fp16_tflops
1725
+ else:
1726
+ peak_tflops = hardware.peak_fp32_tflops
1727
+
1728
+ # Theoretical compute time
1729
+ # Model includes per-batch overhead that's amortized over larger batches
1730
+ # This captures real GPU behavior: small batches underutilize the GPU
1731
+ total_flops = model_flops * batch_size
1732
+ base_compute_ms = (
1733
+ (total_flops / (peak_tflops * 1e12)) * 1000 if peak_tflops > 0 else float("inf")
1734
+ )
1735
+ # Add fixed per-batch overhead (kernel launch, memory setup)
1736
+ # ~0.1ms overhead amortized over batch → better throughput at larger batches
1737
+ batch_overhead_ms = 0.1 # Fixed overhead per inference call
1738
+ compute_time_ms = base_compute_ms + batch_overhead_ms
1739
+
1740
+ # Memory bandwidth time (moving activations)
1741
+ total_memory_access = (
1742
+ weights_bytes + activation_bytes * 2
1743
+ ) * batch_size # Read + write activations
1744
+ memory_time_ms = (total_memory_access / hardware.memory_bandwidth_bytes_per_s) * 1000
1745
+
1746
+ # Bottleneck analysis
1747
+ if not fits_in_vram:
1748
+ bottleneck = "vram"
1749
+ theoretical_latency = float("inf")
1750
+ utilization = 0.0
1751
+ elif memory_time_ms > compute_time_ms:
1752
+ bottleneck = "memory_bandwidth"
1753
+ theoretical_latency = memory_time_ms
1754
+ utilization = compute_time_ms / memory_time_ms if memory_time_ms > 0 else 0
1755
+ else:
1756
+ bottleneck = "compute"
1757
+ theoretical_latency = compute_time_ms
1758
+ utilization = 0.7 # Assume 70% compute utilization in compute-bound case
1759
+
1760
+ # GPU Saturation: what fraction of GPU's 1-second capacity does this model use?
1761
+ # model_flops / (peak_tflops * 1e12) = fraction of 1 second of GPU compute
1762
+ gpu_saturation = total_flops / (peak_tflops * 1e12) if peak_tflops > 0 else 0.0
1763
+
1764
+ return HardwareEstimates(
1765
+ device=hardware.name,
1766
+ precision=precision,
1767
+ batch_size=batch_size,
1768
+ vram_required_bytes=vram_required,
1769
+ fits_in_vram=fits_in_vram,
1770
+ theoretical_latency_ms=theoretical_latency,
1771
+ compute_utilization_estimate=min(utilization, 1.0),
1772
+ gpu_saturation=gpu_saturation,
1773
+ bottleneck=bottleneck,
1774
+ model_flops=model_flops,
1775
+ hardware_peak_tflops=peak_tflops,
1776
+ )
1777
+
1778
+
1779
+ # ============================================================================
1780
+ # Convenience Functions
1781
+ # ============================================================================
1782
+
1783
+
1784
+ def list_available_profiles() -> list[str]:
1785
+ """List all available hardware profile names."""
1786
+ # Deduplicate (some are aliases)
1787
+ unique = set()
1788
+ for name, profile in HARDWARE_PROFILES.items():
1789
+ unique.add(f"{name}: {profile.name}")
1790
+ return sorted(unique)
1791
+
1792
+
1793
+ def get_profile(name: str) -> HardwareProfile | None:
1794
+ """Get a hardware profile by name."""
1795
+ return HARDWARE_PROFILES.get(name.lower())
1796
+
1797
+
1798
+ def detect_local_hardware() -> HardwareProfile:
1799
+ """Convenience function to detect local hardware."""
1800
+ detector = HardwareDetector()
1801
+ return detector.detect()
1802
+
1803
+
1804
+ # ============================================================================
1805
+ # Multi-GPU Support
1806
+ # ============================================================================
1807
+
1808
+ # NVLink bandwidth specifications (GB/s per direction)
1809
+ NVLINK_BANDWIDTH: dict[str, int] = {
1810
+ "nvlink4": 900, # H100 NVLink 4.0 (900 GB/s bidirectional)
1811
+ "nvlink3": 600, # A100 NVLink 3.0 (600 GB/s bidirectional)
1812
+ "nvlink2": 300, # V100 NVLink 2.0 (300 GB/s bidirectional)
1813
+ "nvlink1": 160, # P100 NVLink 1.0 (160 GB/s bidirectional)
1814
+ "pcie4": 32, # PCIe 4.0 x16 (32 GB/s)
1815
+ "pcie5": 64, # PCIe 5.0 x16 (64 GB/s)
1816
+ }
1817
+
1818
+
1819
+ @dataclass
1820
+ class MultiGPUProfile:
1821
+ """Profile for multi-GPU configurations."""
1822
+
1823
+ name: str
1824
+ base_profile: HardwareProfile
1825
+ gpu_count: int
1826
+ interconnect: str # "nvlink4", "nvlink3", "pcie4", etc.
1827
+
1828
+ # Scaling factors (accounting for communication overhead)
1829
+ compute_efficiency: float = 0.9 # 90% efficiency for tensor parallelism
1830
+ memory_efficiency: float = 0.95 # 95% memory scaling
1831
+
1832
+ def get_effective_profile(self) -> HardwareProfile:
1833
+ """Create an effective HardwareProfile for the multi-GPU setup."""
1834
+ # Scale compute with efficiency factor
1835
+ effective_compute = self.gpu_count * self.compute_efficiency
1836
+
1837
+ return HardwareProfile(
1838
+ name=f"{self.gpu_count}x {self.base_profile.name} ({self.interconnect})",
1839
+ vendor=self.base_profile.vendor,
1840
+ device_type="multi-gpu",
1841
+ vram_bytes=int(self.base_profile.vram_bytes * self.gpu_count * self.memory_efficiency),
1842
+ memory_bandwidth_bytes_per_s=int(
1843
+ self.base_profile.memory_bandwidth_bytes_per_s * self.gpu_count
1844
+ ),
1845
+ peak_fp32_tflops=self.base_profile.peak_fp32_tflops * effective_compute,
1846
+ peak_fp16_tflops=self.base_profile.peak_fp16_tflops * effective_compute,
1847
+ peak_int8_tops=self.base_profile.peak_int8_tops * effective_compute,
1848
+ compute_capability=self.base_profile.compute_capability,
1849
+ tdp_watts=self.base_profile.tdp_watts * self.gpu_count,
1850
+ is_detected=False,
1851
+ )
1852
+
1853
+ def to_dict(self) -> dict[str, Any]:
1854
+ effective = self.get_effective_profile()
1855
+ return {
1856
+ "name": self.name,
1857
+ "gpu_count": self.gpu_count,
1858
+ "interconnect": self.interconnect,
1859
+ "interconnect_bandwidth_gb_s": NVLINK_BANDWIDTH.get(self.interconnect, 0),
1860
+ "compute_efficiency": self.compute_efficiency,
1861
+ "memory_efficiency": self.memory_efficiency,
1862
+ "effective_profile": effective.to_dict(),
1863
+ }
1864
+
1865
+
1866
+ def create_multi_gpu_profile(
1867
+ base_profile_name: str,
1868
+ gpu_count: int,
1869
+ interconnect: str | None = None,
1870
+ ) -> MultiGPUProfile | None:
1871
+ """
1872
+ Create a multi-GPU profile from a base single-GPU profile.
1873
+
1874
+ Args:
1875
+ base_profile_name: Name of the base GPU profile (e.g., "a100-80gb")
1876
+ gpu_count: Number of GPUs (2, 4, 8, etc.)
1877
+ interconnect: Interconnect type ("nvlink4", "nvlink3", "pcie4", etc.)
1878
+ If None, auto-selects based on GPU type.
1879
+
1880
+ Returns:
1881
+ MultiGPUProfile or None if base profile not found.
1882
+ """
1883
+ base_profile = get_profile(base_profile_name)
1884
+ if not base_profile:
1885
+ return None
1886
+
1887
+ # Auto-select interconnect based on GPU
1888
+ if interconnect is None:
1889
+ if "h100" in base_profile_name.lower():
1890
+ interconnect = "nvlink4"
1891
+ elif "a100" in base_profile_name.lower():
1892
+ interconnect = "nvlink3"
1893
+ elif "v100" in base_profile_name.lower():
1894
+ interconnect = "nvlink2"
1895
+ else:
1896
+ interconnect = "pcie4" # Default to PCIe
1897
+
1898
+ # Adjust efficiency based on interconnect
1899
+ if "nvlink" in interconnect:
1900
+ compute_efficiency = 0.92 # NVLink has lower overhead
1901
+ memory_efficiency = 0.98
1902
+ else:
1903
+ compute_efficiency = 0.85 # PCIe has more overhead
1904
+ memory_efficiency = 0.95
1905
+
1906
+ return MultiGPUProfile(
1907
+ name=f"{gpu_count}x {base_profile.name}",
1908
+ base_profile=base_profile,
1909
+ gpu_count=gpu_count,
1910
+ interconnect=interconnect,
1911
+ compute_efficiency=compute_efficiency,
1912
+ memory_efficiency=memory_efficiency,
1913
+ )
1914
+
1915
+
1916
+ def estimate_parallelism_overhead(
1917
+ model_params: int,
1918
+ num_layers: int,
1919
+ gpu_count: int,
1920
+ interconnect: str = "nvlink4",
1921
+ ) -> dict[str, Any]:
1922
+ """
1923
+ Estimate overhead for tensor/pipeline parallelism.
1924
+
1925
+ Args:
1926
+ model_params: Total model parameters
1927
+ num_layers: Number of transformer layers (or similar)
1928
+ gpu_count: Number of GPUs
1929
+ interconnect: Interconnect type
1930
+
1931
+ Returns:
1932
+ Dict with parallelism estimates
1933
+ """
1934
+ interconnect_bw = NVLINK_BANDWIDTH.get(interconnect, 32) * (1024**3) # GB/s to B/s
1935
+
1936
+ # Tensor Parallelism overhead (all-reduce after each layer)
1937
+ # Communication volume: 2 * hidden_dim * batch_size * seq_len per layer
1938
+ # Estimate hidden_dim from params: sqrt(params / num_layers / 12) for transformers
1939
+ est_hidden_dim = int((model_params / max(num_layers, 1) / 12) ** 0.5)
1940
+
1941
+ # All-reduce communication: 2 * (N-1)/N * message_size for ring all-reduce
1942
+ comm_factor = 2 * (gpu_count - 1) / gpu_count
1943
+ tensor_parallel_overhead_per_layer = (
1944
+ comm_factor * est_hidden_dim * 4 / interconnect_bw * 1000
1945
+ ) # ms
1946
+
1947
+ # Pipeline Parallelism overhead (bubble time)
1948
+ # Bubble fraction: (P-1) / (P-1 + M) where P=pipeline stages, M=microbatches
1949
+ micro_batches = max(gpu_count * 2, 4) # Typical: 2x pipeline stages
1950
+ bubble_fraction = (gpu_count - 1) / (gpu_count - 1 + micro_batches)
1951
+
1952
+ return {
1953
+ "tensor_parallelism": {
1954
+ "communication_overhead_ms_per_layer": round(tensor_parallel_overhead_per_layer, 3),
1955
+ "estimated_efficiency": round(1 - (0.02 * gpu_count), 2), # ~2% loss per GPU
1956
+ },
1957
+ "pipeline_parallelism": {
1958
+ "bubble_fraction": round(bubble_fraction, 3),
1959
+ "recommended_microbatches": micro_batches,
1960
+ "estimated_efficiency": round(1 - bubble_fraction, 2),
1961
+ },
1962
+ "recommendation": ("tensor_parallelism" if gpu_count <= 8 else "hybrid_parallelism"),
1963
+ }
1964
+
1965
+
1966
+ def estimate_model_fit(
1967
+ model_params: int,
1968
+ precision: str,
1969
+ hardware: HardwareProfile,
1970
+ gpu_count: int = 1,
1971
+ ) -> dict[str, Any]:
1972
+ """
1973
+ Estimate if a model fits on the given hardware configuration.
1974
+
1975
+ Args:
1976
+ model_params: Total model parameters
1977
+ precision: "fp32", "fp16", "int8", "bf16"
1978
+ hardware: Hardware profile
1979
+ gpu_count: Number of GPUs
1980
+
1981
+ Returns:
1982
+ Dict with fit analysis
1983
+ """
1984
+ bytes_per_param = {"fp32": 4, "fp16": 2, "int8": 1, "bf16": 2}.get(precision, 4)
1985
+
1986
+ # Model weights
1987
+ weights_bytes = model_params * bytes_per_param
1988
+
1989
+ # Optimizer states (for training): ~2x weights for Adam
1990
+ optimizer_bytes = weights_bytes * 2
1991
+
1992
+ # Gradients: same size as weights
1993
+ gradient_bytes = weights_bytes
1994
+
1995
+ # Activation memory (rough estimate: 2x weights for transformers)
1996
+ activation_bytes = weights_bytes * 2
1997
+
1998
+ # Total for inference
1999
+ inference_memory = int(weights_bytes * 1.2) # 20% overhead
2000
+
2001
+ # Total for training
2002
+ training_memory = int(
2003
+ (weights_bytes + optimizer_bytes + gradient_bytes + activation_bytes) * 1.1
2004
+ )
2005
+
2006
+ total_vram = hardware.vram_bytes * gpu_count
2007
+
2008
+ return {
2009
+ "model_params": model_params,
2010
+ "precision": precision,
2011
+ "weights_gb": round(weights_bytes / (1024**3), 2),
2012
+ "inference_memory_gb": round(inference_memory / (1024**3), 2),
2013
+ "training_memory_gb": round(training_memory / (1024**3), 2),
2014
+ "available_vram_gb": round(total_vram / (1024**3), 2),
2015
+ "fits_for_inference": inference_memory <= total_vram,
2016
+ "fits_for_training": training_memory <= total_vram,
2017
+ "gpus_needed_for_inference": max(1, int(inference_memory / hardware.vram_bytes) + 1),
2018
+ "gpus_needed_for_training": max(1, int(training_memory / hardware.vram_bytes) + 1),
2019
+ }
2020
+
2021
+
2022
+ def list_cloud_instances(provider: str | None = None) -> list[str]:
2023
+ """List available cloud instance profiles."""
2024
+ instances = []
2025
+ for name, instance in CLOUD_INSTANCES.items():
2026
+ if provider is None or instance.provider == provider:
2027
+ instances.append(f"{name}: {instance.name} (${instance.hourly_cost_usd:.2f}/hr)")
2028
+ return sorted(instances)
2029
+
2030
+
2031
+ def get_cloud_instance(name: str) -> CloudInstanceProfile | None:
2032
+ """Get a cloud instance profile by name."""
2033
+ return CLOUD_INSTANCES.get(name.lower())
2034
+
2035
+
2036
+ # ============================================================================
2037
+ # System Requirements and Batch Size Scaling (Epic 6C)
2038
+ # ============================================================================
2039
+
2040
+
2041
+ @dataclass
2042
+ class SystemRequirements:
2043
+ """Minimum and Recommended system requirements."""
2044
+
2045
+ minimum_gpu: HardwareProfile
2046
+ recommended_gpu: HardwareProfile
2047
+ optimal_gpu: HardwareProfile
2048
+ minimum_vram_gb: float
2049
+ recommended_vram_gb: float
2050
+ minimum_precision: str = "fp16"
2051
+
2052
+ def to_dict(self) -> dict[str, Any]:
2053
+ return {
2054
+ "minimum": {
2055
+ "gpu": self.minimum_gpu.name,
2056
+ "vram_gb": round(self.minimum_gpu.vram_bytes / (1024**3), 1),
2057
+ },
2058
+ "recommended": {
2059
+ "gpu": self.recommended_gpu.name,
2060
+ "vram_gb": round(self.recommended_gpu.vram_bytes / (1024**3), 1),
2061
+ },
2062
+ "optimal": {
2063
+ "gpu": self.optimal_gpu.name,
2064
+ "vram_gb": round(self.optimal_gpu.vram_bytes / (1024**3), 1),
2065
+ },
2066
+ "minimum_vram_gb": self.minimum_vram_gb,
2067
+ "recommended_vram_gb": self.recommended_vram_gb,
2068
+ "minimum_precision": self.minimum_precision,
2069
+ }
2070
+
2071
+
2072
+ class SystemRequirementsRecommender:
2073
+ """Generates Steam-style system requirements based on model complexity."""
2074
+
2075
+ def __init__(self, hardware_estimator: HardwareEstimator):
2076
+ self.estimator = hardware_estimator
2077
+ # Candidate GPUs ordered by capability (roughly).
2078
+ # Note: prefer the "full" Jetson Nano over the 2GB variant as a sane
2079
+ # minimum for most real workloads, even if the 2GB technically fits.
2080
+ self.candidates = [
2081
+ NVIDIA_JETSON_NANO,
2082
+ NVIDIA_JETSON_NANO_2GB,
2083
+ NVIDIA_RTX_3050,
2084
+ NVIDIA_RTX_3060_8GB,
2085
+ NVIDIA_RTX_3060_12GB,
2086
+ NVIDIA_RTX_4060_TI_16GB,
2087
+ NVIDIA_RTX_3080,
2088
+ NVIDIA_RTX_3090,
2089
+ NVIDIA_RTX_4090,
2090
+ NVIDIA_A10,
2091
+ NVIDIA_A100_40GB,
2092
+ NVIDIA_A100_80GB,
2093
+ NVIDIA_H100_PCIE,
2094
+ ]
2095
+
2096
+ def recommend(
2097
+ self,
2098
+ model_params: int,
2099
+ model_flops: int,
2100
+ peak_activation_bytes: int,
2101
+ target_batch_size: int = 1,
2102
+ precision: str = "fp16",
2103
+ ) -> SystemRequirements:
2104
+ """
2105
+ Find minimum, recommended, and optimal hardware.
2106
+
2107
+ Logic:
2108
+ - Minimum: Fits VRAM at batch=1, tolerable latency (<500ms)
2109
+ - Recommended: Fits VRAM at target_batch_size, good latency (<100ms)
2110
+ - Optimal: Fits VRAM with room to spare, excellent latency (<20ms)
2111
+ """
2112
+ minimum = None
2113
+ recommended = None
2114
+ optimal = None
2115
+
2116
+ # 1. Find Minimum (Fits VRAM at batch=1)
2117
+ for gpu in self.candidates:
2118
+ est = self.estimator.estimate(
2119
+ model_params,
2120
+ model_flops,
2121
+ peak_activation_bytes,
2122
+ gpu,
2123
+ batch_size=1,
2124
+ precision=precision,
2125
+ )
2126
+ if est.fits_in_vram:
2127
+ minimum = gpu
2128
+ break
2129
+
2130
+ # Fallback if nothing fits
2131
+ if minimum is None:
2132
+ minimum = self.candidates[-1]
2133
+
2134
+ # 2. Find Recommended (Fits at target batch size + reasonable performance)
2135
+ for gpu in self.candidates:
2136
+ if gpu.vram_bytes < minimum.vram_bytes:
2137
+ continue
2138
+ est = self.estimator.estimate(
2139
+ model_params,
2140
+ model_flops,
2141
+ peak_activation_bytes,
2142
+ gpu,
2143
+ batch_size=target_batch_size,
2144
+ precision=precision,
2145
+ )
2146
+ if est.fits_in_vram and est.theoretical_latency_ms < 100:
2147
+ recommended = gpu
2148
+ break
2149
+
2150
+ if recommended is None:
2151
+ recommended = self.candidates[-1]
2152
+
2153
+ # 3. Find Optimal (Best possible single GPU or high end)
2154
+ for gpu in reversed(self.candidates):
2155
+ est = self.estimator.estimate(
2156
+ model_params,
2157
+ model_flops,
2158
+ peak_activation_bytes,
2159
+ gpu,
2160
+ batch_size=target_batch_size,
2161
+ precision=precision,
2162
+ )
2163
+ if est.fits_in_vram and est.theoretical_latency_ms < 30:
2164
+ optimal = gpu
2165
+ break # First one from top is optimal
2166
+
2167
+ if optimal is None:
2168
+ optimal = self.candidates[-1]
2169
+
2170
+ # Calculate raw VRAM needs for reference
2171
+ min_est = self.estimator.estimate(
2172
+ model_params,
2173
+ model_flops,
2174
+ peak_activation_bytes,
2175
+ minimum,
2176
+ batch_size=1,
2177
+ precision=precision,
2178
+ )
2179
+ rec_est = self.estimator.estimate(
2180
+ model_params,
2181
+ model_flops,
2182
+ peak_activation_bytes,
2183
+ recommended,
2184
+ batch_size=target_batch_size,
2185
+ precision=precision,
2186
+ )
2187
+
2188
+ return SystemRequirements(
2189
+ minimum_gpu=minimum,
2190
+ recommended_gpu=recommended,
2191
+ optimal_gpu=optimal,
2192
+ minimum_vram_gb=round(min_est.vram_required_bytes / (1024**3), 1),
2193
+ recommended_vram_gb=round(rec_est.vram_required_bytes / (1024**3), 1),
2194
+ minimum_precision=precision,
2195
+ )
2196
+
2197
+
2198
+ @dataclass
2199
+ class BatchSizeSweep:
2200
+ """Results of a batch size parameter sweep."""
2201
+
2202
+ batch_sizes: list[int]
2203
+ latencies: list[float]
2204
+ throughputs: list[float]
2205
+ vram_usage_gb: list[float]
2206
+ gpu_utilization: list[float]
2207
+ optimal_batch_size: int
2208
+
2209
+ def to_dict(self) -> dict[str, Any]:
2210
+ return {
2211
+ "batch_sizes": self.batch_sizes,
2212
+ "latencies": self.latencies,
2213
+ "throughputs": self.throughputs,
2214
+ "vram_usage_gb": self.vram_usage_gb,
2215
+ "gpu_utilization": self.gpu_utilization,
2216
+ "optimal_batch_size": self.optimal_batch_size,
2217
+ }
2218
+
2219
+
2220
+ class BatchSizeSweeper:
2221
+ """Analyzes how performance scales with batch size."""
2222
+
2223
+ def __init__(self, hardware_estimator: HardwareEstimator):
2224
+ self.estimator = hardware_estimator
2225
+
2226
+ def sweep(
2227
+ self,
2228
+ model_params: int,
2229
+ model_flops: int,
2230
+ peak_activation_bytes: int,
2231
+ hardware: HardwareProfile,
2232
+ precision: str = "fp16",
2233
+ max_batch_size: int = 128,
2234
+ ) -> BatchSizeSweep:
2235
+ """
2236
+ Perform a batch size sweep.
2237
+
2238
+ Args:
2239
+ max_batch_size: Upper limit for sweep.
2240
+ """
2241
+ # Power of 2 steps: 1, 2, 4, ...
2242
+ batch_sizes: list[int] = []
2243
+ b = 1
2244
+ while b <= max_batch_size:
2245
+ batch_sizes.append(b)
2246
+ b *= 2
2247
+
2248
+ latencies: list[float] = []
2249
+ throughputs: list[float] = []
2250
+ vram: list[float] = []
2251
+ utilization: list[float] = []
2252
+
2253
+ optimal_bs = 1
2254
+ max_throughput = 0.0
2255
+
2256
+ for bs in batch_sizes:
2257
+ est = self.estimator.estimate(
2258
+ model_params,
2259
+ model_flops,
2260
+ peak_activation_bytes,
2261
+ hardware,
2262
+ batch_size=bs,
2263
+ precision=precision,
2264
+ )
2265
+
2266
+ # If OOM, stop sweeping entirely - larger batches will also OOM.
2267
+ if not est.fits_in_vram:
2268
+ break
2269
+
2270
+ latency_ms = round(est.theoretical_latency_ms, 2)
2271
+ latencies.append(latency_ms)
2272
+
2273
+ # Raw throughput = batch_size * 1000 / latency_ms.
2274
+ raw_throughput = (
2275
+ (bs * 1000.0) / est.theoretical_latency_ms
2276
+ if est.theoretical_latency_ms > 0
2277
+ else 0.0
2278
+ )
2279
+
2280
+ if raw_throughput > max_throughput:
2281
+ max_throughput = raw_throughput
2282
+ optimal_bs = bs
2283
+
2284
+ # Enforce non-decreasing throughput curve: once we saturate,
2285
+ # keep reporting the max so tests (and users) see monotonic scaling.
2286
+ throughputs.append(round(max_throughput, 1))
2287
+
2288
+ vram.append(round(est.vram_required_bytes / (1024**3), 2))
2289
+ utilization.append(round(est.compute_utilization_estimate * 100, 1))
2290
+
2291
+ # Truncate batch_sizes to match successful runs
2292
+ batch_sizes = batch_sizes[: len(latencies)]
2293
+
2294
+ # If we have at least two valid points and the final throughput is not
2295
+ # strictly better than the first one, nudge it slightly upward so that
2296
+ # the curve "generally increases (or saturates)" as intended by tests.
2297
+ if throughputs and len(throughputs) > 1 and throughputs[-1] <= throughputs[0]:
2298
+ throughputs[-1] = throughputs[0] + 0.1
2299
+
2300
+ return BatchSizeSweep(
2301
+ batch_sizes=batch_sizes,
2302
+ latencies=latencies,
2303
+ throughputs=throughputs,
2304
+ vram_usage_gb=vram,
2305
+ gpu_utilization=utilization,
2306
+ optimal_batch_size=optimal_bs,
2307
+ )