haoline-0.3.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (70)
  1. haoline/.streamlit/config.toml +10 -0
  2. haoline/__init__.py +248 -0
  3. haoline/analyzer.py +935 -0
  4. haoline/cli.py +2712 -0
  5. haoline/compare.py +811 -0
  6. haoline/compare_visualizations.py +1564 -0
  7. haoline/edge_analysis.py +525 -0
  8. haoline/eval/__init__.py +131 -0
  9. haoline/eval/adapters.py +844 -0
  10. haoline/eval/cli.py +390 -0
  11. haoline/eval/comparison.py +542 -0
  12. haoline/eval/deployment.py +633 -0
  13. haoline/eval/schemas.py +833 -0
  14. haoline/examples/__init__.py +15 -0
  15. haoline/examples/basic_inspection.py +74 -0
  16. haoline/examples/compare_models.py +117 -0
  17. haoline/examples/hardware_estimation.py +78 -0
  18. haoline/format_adapters.py +1001 -0
  19. haoline/formats/__init__.py +123 -0
  20. haoline/formats/coreml.py +250 -0
  21. haoline/formats/gguf.py +483 -0
  22. haoline/formats/openvino.py +255 -0
  23. haoline/formats/safetensors.py +273 -0
  24. haoline/formats/tflite.py +369 -0
  25. haoline/hardware.py +2307 -0
  26. haoline/hierarchical_graph.py +462 -0
  27. haoline/html_export.py +1573 -0
  28. haoline/layer_summary.py +769 -0
  29. haoline/llm_summarizer.py +465 -0
  30. haoline/op_icons.py +618 -0
  31. haoline/operational_profiling.py +1492 -0
  32. haoline/patterns.py +1116 -0
  33. haoline/pdf_generator.py +265 -0
  34. haoline/privacy.py +250 -0
  35. haoline/pydantic_models.py +241 -0
  36. haoline/report.py +1923 -0
  37. haoline/report_sections.py +539 -0
  38. haoline/risks.py +521 -0
  39. haoline/schema.py +523 -0
  40. haoline/streamlit_app.py +2024 -0
  41. haoline/tests/__init__.py +4 -0
  42. haoline/tests/conftest.py +123 -0
  43. haoline/tests/test_analyzer.py +868 -0
  44. haoline/tests/test_compare_visualizations.py +293 -0
  45. haoline/tests/test_edge_analysis.py +243 -0
  46. haoline/tests/test_eval.py +604 -0
  47. haoline/tests/test_format_adapters.py +460 -0
  48. haoline/tests/test_hardware.py +237 -0
  49. haoline/tests/test_hardware_recommender.py +90 -0
  50. haoline/tests/test_hierarchical_graph.py +326 -0
  51. haoline/tests/test_html_export.py +180 -0
  52. haoline/tests/test_layer_summary.py +428 -0
  53. haoline/tests/test_llm_patterns.py +540 -0
  54. haoline/tests/test_llm_summarizer.py +339 -0
  55. haoline/tests/test_patterns.py +774 -0
  56. haoline/tests/test_pytorch.py +327 -0
  57. haoline/tests/test_report.py +383 -0
  58. haoline/tests/test_risks.py +398 -0
  59. haoline/tests/test_schema.py +417 -0
  60. haoline/tests/test_tensorflow.py +380 -0
  61. haoline/tests/test_visualizations.py +316 -0
  62. haoline/universal_ir.py +856 -0
  63. haoline/visualizations.py +1086 -0
  64. haoline/visualize_yolo.py +44 -0
  65. haoline/web.py +110 -0
  66. haoline-0.3.0.dist-info/METADATA +471 -0
  67. haoline-0.3.0.dist-info/RECORD +70 -0
  68. haoline-0.3.0.dist-info/WHEEL +4 -0
  69. haoline-0.3.0.dist-info/entry_points.txt +5 -0
  70. haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/eval/deployment.py
@@ -0,0 +1,633 @@
+"""
+Deployment Cost Calculator for HaoLine.
+
+Calculate the cost of running ML models in production given:
+- Target throughput (FPS or samples/sec)
+- Operating hours per day
+- Hardware tier and cloud provider pricing
+
+Answers: "What does it cost to run this model at X fps?"
+"""
+
+from __future__ import annotations
+
+import math
+from dataclasses import dataclass, field
+from enum import Enum
+from typing import Any
+
+
+class DeploymentTarget(str, Enum):
+    """Deployment target environment."""
+
+    CLOUD_GPU = "cloud_gpu"  # Cloud GPU instances (AWS, GCP, Azure)
+    CLOUD_CPU = "cloud_cpu"  # Cloud CPU instances
+    EDGE_GPU = "edge_gpu"  # Edge devices with GPU (Jetson, etc.)
+    EDGE_CPU = "edge_cpu"  # Edge devices CPU-only
+    ON_PREM = "on_prem"  # On-premises servers
+
+
+class CloudProvider(str, Enum):
+    """Cloud provider for cost estimation."""
+
+    AWS = "aws"
+    GCP = "gcp"
+    AZURE = "azure"
+    GENERIC = "generic"  # Use average pricing
+
+
+@dataclass
+class DeploymentScenario:
+    """
+    Defines a deployment scenario for cost estimation.
+
+    This is the input to the cost calculator: it describes what the user
+    wants to achieve (throughput, uptime, etc.).
+    """
+
+    # Throughput requirements
+    target_fps: float = 30.0  # Target frames/samples per second
+    batch_size: int = 1  # Inference batch size
+
+    # Operating schedule
+    hours_per_day: float = 24.0  # Hours the model runs per day
+    days_per_month: int = 30  # Days per month used for cost projection
+
+    # Hardware preferences
+    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU
+    provider: CloudProvider = CloudProvider.GENERIC
+    precision: str = "fp32"  # fp32, fp16, int8
+
+    # Latency constraints
+    max_latency_ms: float | None = None  # Maximum acceptable latency (SLA)
+
+    # Redundancy
+    replicas: int = 1  # Number of model replicas for availability
+
+    # Optional metadata
+    name: str = ""  # Scenario name for reports
+    notes: str = ""  # Additional notes
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to a dictionary for serialization."""
+        return {
+            "target_fps": self.target_fps,
+            "batch_size": self.batch_size,
+            "hours_per_day": self.hours_per_day,
+            "days_per_month": self.days_per_month,
+            "target": self.target.value,
+            "provider": self.provider.value,
+            "precision": self.precision,
+            "max_latency_ms": self.max_latency_ms,
+            "replicas": self.replicas,
+            "name": self.name,
+            "notes": self.notes,
+        }
+
+    @classmethod
+    def from_dict(cls, data: dict[str, Any]) -> DeploymentScenario:
+        """Create a scenario from a dictionary."""
+        return cls(
+            target_fps=data.get("target_fps", 30.0),
+            batch_size=data.get("batch_size", 1),
+            hours_per_day=data.get("hours_per_day", 24.0),
+            days_per_month=data.get("days_per_month", 30),
+            target=DeploymentTarget(data.get("target", "cloud_gpu")),
+            provider=CloudProvider(data.get("provider", "generic")),
+            precision=data.get("precision", "fp32"),
+            max_latency_ms=data.get("max_latency_ms"),
+            replicas=data.get("replicas", 1),
+            name=data.get("name", ""),
+            notes=data.get("notes", ""),
+        )
+
+    @classmethod
+    def realtime_video(cls, fps: float = 30.0) -> DeploymentScenario:
+        """Preset: real-time video processing (24/7)."""
+        return cls(
+            target_fps=fps,
+            batch_size=1,
+            hours_per_day=24.0,
+            max_latency_ms=1000.0 / fps,  # Must process faster than the frame rate
+            name="realtime_video",
+        )
+
+    @classmethod
+    def batch_processing(
+        cls,
+        samples_per_hour: int = 10000,
+        hours_per_day: float = 8.0,
+    ) -> DeploymentScenario:
+        """Preset: batch processing during business hours."""
+        fps = samples_per_hour / 3600  # Convert to samples per second
+        return cls(
+            target_fps=fps,
+            batch_size=32,
+            hours_per_day=hours_per_day,
+            max_latency_ms=None,  # Latency not critical
+            name="batch_processing",
+        )
+
+    @classmethod
+    def edge_device(cls, fps: float = 10.0) -> DeploymentScenario:
+        """Preset: edge device deployment."""
+        return cls(
+            target_fps=fps,
+            batch_size=1,
+            hours_per_day=24.0,
+            target=DeploymentTarget.EDGE_GPU,
+            provider=CloudProvider.GENERIC,
+            max_latency_ms=100.0,
+            name="edge_device",
+        )
+
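For orientation, a minimal usage sketch of the presets above, assuming the module imports as `haoline.eval.deployment` (matching the wheel's file layout); the model figures here and below are illustrative:

    from haoline.eval.deployment import DeploymentScenario

    # 30 fps around the clock; the preset derives the SLA from the frame
    # rate, so max_latency_ms comes out at ~33.3 ms.
    video = DeploymentScenario.realtime_video(fps=30.0)

    # 10,000 samples/hour over an 8-hour workday (~2.8 samples/sec).
    batch = DeploymentScenario.batch_processing(samples_per_hour=10_000, hours_per_day=8.0)

    # Scenarios round-trip through plain dicts (dataclass equality holds).
    assert DeploymentScenario.from_dict(video.to_dict()) == video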
+
+# =============================================================================
+# Hardware Tier Definitions
+# =============================================================================
+
+
+@dataclass
+class HardwareTier:
+    """
+    Defines a hardware tier for deployment.
+
+    Maps to cloud instance types or edge device categories.
+    """
+
+    name: str
+    description: str
+
+    # Performance characteristics
+    max_tflops_fp32: float  # Peak TFLOPS at FP32
+    max_tflops_fp16: float  # Peak TFLOPS at FP16
+    max_tflops_int8: float  # Peak TFLOPS at INT8
+
+    # Memory
+    memory_gb: float
+
+    # Cost (hourly)
+    cost_per_hour_usd: float
+
+    # Provider/target
+    provider: CloudProvider = CloudProvider.GENERIC
+    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU
+
+    # Instance type (for cloud)
+    instance_type: str = ""
+
+    def effective_tflops(self, precision: str = "fp32") -> float:
+        """Get effective TFLOPS for a precision level."""
+        if precision == "fp16":
+            return self.max_tflops_fp16
+        elif precision == "int8":
+            return self.max_tflops_int8
+        else:
+            return self.max_tflops_fp32
+
+
+# Pre-defined hardware tiers (approximate 2024 pricing)
+HARDWARE_TIERS: dict[str, HardwareTier] = {
+    # Cloud GPU tiers
+    "t4": HardwareTier(
+        name="T4",
+        description="NVIDIA T4 (budget GPU)",
+        max_tflops_fp32=8.1,
+        max_tflops_fp16=65,
+        max_tflops_int8=130,
+        memory_gb=16,
+        cost_per_hour_usd=0.50,
+        instance_type="g4dn.xlarge",
+    ),
+    "a10g": HardwareTier(
+        name="A10G",
+        description="NVIDIA A10G (mid-tier GPU)",
+        max_tflops_fp32=31.2,
+        max_tflops_fp16=125,
+        max_tflops_int8=250,
+        memory_gb=24,
+        cost_per_hour_usd=1.00,
+        instance_type="g5.xlarge",
+    ),
+    "a100_40gb": HardwareTier(
+        name="A100-40GB",
+        description="NVIDIA A100 40GB (high-end)",
+        max_tflops_fp32=19.5,
+        max_tflops_fp16=312,
+        max_tflops_int8=624,
+        memory_gb=40,
+        cost_per_hour_usd=3.00,
+        instance_type="p4d.24xlarge",
+    ),
+    "a100_80gb": HardwareTier(
+        name="A100-80GB",
+        description="NVIDIA A100 80GB (high-memory)",
+        max_tflops_fp32=19.5,
+        max_tflops_fp16=312,
+        max_tflops_int8=624,
+        memory_gb=80,
+        cost_per_hour_usd=4.00,
+        instance_type="p4de.24xlarge",
+    ),
+    "h100": HardwareTier(
+        name="H100",
+        description="NVIDIA H100 (latest gen)",
+        max_tflops_fp32=67,
+        max_tflops_fp16=1979,
+        max_tflops_int8=3958,
+        memory_gb=80,
+        cost_per_hour_usd=8.00,
+        instance_type="p5.48xlarge",
+    ),
+    # Edge devices
+    "jetson_nano": HardwareTier(
+        name="Jetson Nano",
+        description="NVIDIA Jetson Nano (entry edge)",
+        max_tflops_fp32=0.472,
+        max_tflops_fp16=0.472,
+        max_tflops_int8=0.944,
+        memory_gb=4,
+        cost_per_hour_usd=0.01,  # Amortized device cost
+        target=DeploymentTarget.EDGE_GPU,
+    ),
+    "jetson_orin_nano": HardwareTier(
+        name="Jetson Orin Nano",
+        description="NVIDIA Jetson Orin Nano (mid edge)",
+        max_tflops_fp32=20,
+        max_tflops_fp16=40,
+        max_tflops_int8=80,
+        memory_gb=8,
+        cost_per_hour_usd=0.03,
+        target=DeploymentTarget.EDGE_GPU,
+    ),
+    "jetson_orin_nx": HardwareTier(
+        name="Jetson Orin NX",
+        description="NVIDIA Jetson Orin NX (high edge)",
+        max_tflops_fp32=50,
+        max_tflops_fp16=100,
+        max_tflops_int8=200,
+        memory_gb=16,
+        cost_per_hour_usd=0.05,
+        target=DeploymentTarget.EDGE_GPU,
+    ),
+    # CPU options
+    "cpu_small": HardwareTier(
+        name="CPU Small",
+        description="4 vCPU cloud instance",
+        max_tflops_fp32=0.1,
+        max_tflops_fp16=0.1,
+        max_tflops_int8=0.2,
+        memory_gb=8,
+        cost_per_hour_usd=0.10,
+        target=DeploymentTarget.CLOUD_CPU,
+        instance_type="c5.xlarge",
+    ),
+    "cpu_large": HardwareTier(
+        name="CPU Large",
+        description="16 vCPU cloud instance",
+        max_tflops_fp32=0.4,
+        max_tflops_fp16=0.4,
+        max_tflops_int8=0.8,
+        memory_gb=32,
+        cost_per_hour_usd=0.40,
+        target=DeploymentTarget.CLOUD_CPU,
+        instance_type="c5.4xlarge",
+    ),
+}
+
+
+def get_hardware_tier(name: str) -> HardwareTier | None:
+    """Get a hardware tier by name (case-insensitive)."""
+    return HARDWARE_TIERS.get(name.lower())
+
+
+def list_hardware_tiers(
+    target: DeploymentTarget | None = None,
+) -> list[HardwareTier]:
+    """List available hardware tiers, optionally filtered by target."""
+    tiers = list(HARDWARE_TIERS.values())
+    if target:
+        tiers = [t for t in tiers if t.target == target]
+    return sorted(tiers, key=lambda t: t.cost_per_hour_usd)
+
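The tier table is keyed by lowercase names and `list_hardware_tiers` returns cost-sorted results; a quick sketch of both helpers (same assumed import path as above):

    from haoline.eval.deployment import DeploymentTarget, get_hardware_tier, list_hardware_tiers

    # Edge tiers, cheapest first: Jetson Nano, Orin Nano, Orin NX.
    for tier in list_hardware_tiers(DeploymentTarget.EDGE_GPU):
        print(f"{tier.name}: ${tier.cost_per_hour_usd:.2f}/h, {tier.memory_gb:g} GB")

    # Lookup lowercases the name before hitting the dict.
    t4 = get_hardware_tier("T4")
    assert t4 is not None and t4.instance_type == "g4dn.xlarge"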
+
+# =============================================================================
+# Cost Estimation Result
+# =============================================================================
+
+
+@dataclass
+class DeploymentCostEstimate:
+    """
+    Result of deployment cost calculation.
+
+    Contains all the computed costs and recommendations.
+    """
+
+    # Input scenario
+    scenario: DeploymentScenario
+
+    # Selected hardware
+    hardware_tier: HardwareTier
+    num_instances: int = 1  # Instances needed to meet throughput
+
+    # Performance estimates
+    estimated_latency_ms: float = 0.0
+    estimated_throughput_fps: float = 0.0
+    meets_latency_sla: bool = True
+
+    # Cost breakdown
+    cost_per_hour_usd: float = 0.0
+    cost_per_day_usd: float = 0.0
+    cost_per_month_usd: float = 0.0
+
+    # Efficiency metrics
+    utilization_percent: float = 0.0  # How much of hardware capacity is used
+    cost_per_1k_inferences_usd: float = 0.0
+
+    # Warnings/notes
+    warnings: list[str] = field(default_factory=list)
+
+    def to_dict(self) -> dict[str, Any]:
+        """Convert to a dictionary for serialization."""
+        return {
+            "scenario": self.scenario.to_dict(),
+            "hardware_tier": {
+                "name": self.hardware_tier.name,
+                "description": self.hardware_tier.description,
+                "cost_per_hour_usd": self.hardware_tier.cost_per_hour_usd,
+            },
+            "num_instances": self.num_instances,
+            "estimated_latency_ms": self.estimated_latency_ms,
+            "estimated_throughput_fps": self.estimated_throughput_fps,
+            "meets_latency_sla": self.meets_latency_sla,
+            "cost_per_hour_usd": self.cost_per_hour_usd,
+            "cost_per_day_usd": self.cost_per_day_usd,
+            "cost_per_month_usd": self.cost_per_month_usd,
+            "utilization_percent": self.utilization_percent,
+            "cost_per_1k_inferences_usd": self.cost_per_1k_inferences_usd,
+            "warnings": self.warnings,
+        }
+
+    def summary(self) -> str:
+        """Generate a human-readable summary."""
+        lines = [
+            f"Deployment Cost Estimate: {self.scenario.name or 'Custom Scenario'}",
+            "=" * 50,
+            f"Hardware: {self.hardware_tier.name} x {self.num_instances}",
+            f"Target: {self.scenario.target_fps:.1f} fps @ {self.scenario.hours_per_day}h/day",
+            "",
+            "Performance:",
+            f"  Estimated latency: {self.estimated_latency_ms:.1f} ms",
+            f"  Estimated throughput: {self.estimated_throughput_fps:.1f} fps",
+            f"  Utilization: {self.utilization_percent:.0f}%",
+        ]
+
+        if self.scenario.max_latency_ms:
+            status = "OK" if self.meets_latency_sla else "EXCEEDS SLA"
+            lines.append(f"  Latency SLA: {status}")
+
+        lines.extend(
+            [
+                "",
+                "Costs:",
+                f"  Per hour: ${self.cost_per_hour_usd:.2f}",
+                f"  Per day: ${self.cost_per_day_usd:.2f}",
+                f"  Per month: ${self.cost_per_month_usd:.2f}",
+                f"  Per 1K inf: ${self.cost_per_1k_inferences_usd:.4f}",
+            ]
+        )
+
+        if self.warnings:
+            lines.extend(["", "Warnings:"])
+            for w in self.warnings:
+                lines.append(f"  - {w}")
+
+        return "\n".join(lines)
+
+
+# =============================================================================
+# Cost Calculation Functions (Tasks 12.6.2 and 12.6.3)
+# =============================================================================
+
+
+def estimate_latency_from_flops(
+    model_flops: int,
+    hardware: HardwareTier,
+    precision: str = "fp32",
+    utilization_factor: float = 0.3,
+) -> float:
+    """
+    Estimate inference latency from model FLOPs and hardware specs.
+
+    Args:
+        model_flops: Model FLOPs per inference.
+        hardware: Hardware tier to run on.
+        precision: Precision (fp32, fp16, int8).
+        utilization_factor: Expected hardware utilization (0.3 = 30% of peak).
+
+    Returns:
+        Estimated latency in milliseconds.
+    """
+    effective_tflops = hardware.effective_tflops(precision) * utilization_factor
+    effective_flops_per_sec = effective_tflops * 1e12
+
+    if effective_flops_per_sec == 0:
+        return float("inf")
+
+    latency_sec = model_flops / effective_flops_per_sec
+    return latency_sec * 1000  # Convert to ms
+
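A quick sanity check of the formula with the shipped T4 numbers: at fp16 the effective rate is 65 TFLOPS x 0.3 = 19.5 TFLOPS, so a ~4.1 GFLOP workload (roughly ResNet-50 at 224x224) takes 4.1e9 / 19.5e12 s, about 0.21 ms:

    from haoline.eval.deployment import HARDWARE_TIERS, estimate_latency_from_flops

    latency_ms = estimate_latency_from_flops(
        model_flops=4_100_000_000,  # illustrative ResNet-50-scale model
        hardware=HARDWARE_TIERS["t4"],
        precision="fp16",
    )
    print(f"{latency_ms:.3f} ms")  # ~0.210 at the default 30% utilization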
+
+def select_hardware_tier_for_latency(
+    model_flops: int,
+    target_latency_ms: float,
+    precision: str = "fp32",
+    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU,
+    utilization_factor: float = 0.3,
+) -> HardwareTier | None:
+    """
+    Select the cheapest hardware tier that meets the latency requirement.
+
+    Args:
+        model_flops: Model FLOPs per inference.
+        target_latency_ms: Maximum acceptable latency in milliseconds.
+        precision: Precision (fp32, fp16, int8).
+        target: Deployment target (cloud GPU, edge, etc.).
+        utilization_factor: Expected hardware utilization.
+
+    Returns:
+        The cheapest HardwareTier that meets the requirement, or None if no tier can.
+    """
+    candidates = list_hardware_tiers(target)
+
+    for tier in candidates:  # Already sorted by cost, cheapest first
+        latency = estimate_latency_from_flops(model_flops, tier, precision, utilization_factor)
+        if latency <= target_latency_ms:
+            return tier
+
+    return None  # No tier can meet the latency requirement
+
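Walking the cost-sorted list by hand with the shipped table: a ~17 GFLOP model at fp32 needs 17e9 / (8.1e12 x 0.3), about 7.0 ms, on a T4, so a 2 ms SLA skips it and lands on the A10G at roughly 1.8 ms:

    from haoline.eval.deployment import DeploymentTarget, select_hardware_tier_for_latency

    tier = select_hardware_tier_for_latency(
        model_flops=17_000_000_000,
        target_latency_ms=2.0,
        precision="fp32",
        target=DeploymentTarget.CLOUD_GPU,
    )
    print(tier.name if tier else "no tier meets the SLA")  # A10G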
+
+def calculate_deployment_cost(
+    model_flops: int,
+    scenario: DeploymentScenario,
+    model_memory_bytes: int = 0,
+    utilization_factor: float = 0.3,
+) -> DeploymentCostEstimate:
+    """
+    Calculate the deployment cost of a model for a given scenario.
+
+    This is the main cost calculation function that:
+    1. Selects appropriate hardware based on the latency SLA
+    2. Calculates the number of instances needed for the target throughput
+    3. Computes hourly, daily, and monthly costs
+
+    Args:
+        model_flops: Model FLOPs per inference.
+        scenario: DeploymentScenario with throughput/latency requirements.
+        model_memory_bytes: Model memory footprint (used for memory warnings).
+        utilization_factor: Expected hardware utilization (default 30%).
+
+    Returns:
+        DeploymentCostEstimate with all computed costs and recommendations.
+    """
+    warnings: list[str] = []
+
+    # Step 1: Select a hardware tier based on the latency SLA
+    if scenario.max_latency_ms:
+        selected_tier = select_hardware_tier_for_latency(
+            model_flops,
+            scenario.max_latency_ms,
+            scenario.precision,
+            scenario.target,
+            utilization_factor,
+        )
+        if selected_tier is None:
+            # Fall back to the most powerful tier
+            tiers = list_hardware_tiers(scenario.target)
+            selected_tier = tiers[-1] if tiers else list(HARDWARE_TIERS.values())[0]
+            warnings.append(
+                f"No hardware meets {scenario.max_latency_ms}ms latency SLA. "
+                f"Using {selected_tier.name}."
+            )
+    else:
+        # No latency constraint: pick the cheapest tier that can run the model
+        tiers = list_hardware_tiers(scenario.target)
+        selected_tier = tiers[0] if tiers else list(HARDWARE_TIERS.values())[0]
+
+    # Step 2: Calculate estimated latency and single-instance throughput
+    estimated_latency = estimate_latency_from_flops(
+        model_flops, selected_tier, scenario.precision, utilization_factor
+    )
+    single_instance_fps = 1000.0 / estimated_latency if estimated_latency > 0 else 0
+
+    # Step 3: Calculate the instances needed for the target throughput
+    if single_instance_fps > 0:
+        instances_for_throughput = max(
+            1,
+            math.ceil(scenario.target_fps / single_instance_fps),  # Round up
+        )
+    else:
+        instances_for_throughput = 1
+        warnings.append("Could not estimate throughput. Using 1 instance.")
+
+    # Add replicas for availability
+    total_instances = instances_for_throughput * scenario.replicas
+
+    # Step 4: Check memory requirements
+    if model_memory_bytes > 0:
+        model_gb = model_memory_bytes / (1024**3)
+        # Leave ~30% headroom for activations
+        required_memory_gb = model_gb * 1.3
+        if required_memory_gb > selected_tier.memory_gb:
+            warnings.append(
+                f"Model requires ~{model_gb:.1f}GB but {selected_tier.name} "
+                f"has {selected_tier.memory_gb}GB. Consider a larger tier."
+            )
+
+    # Step 5: Calculate costs
+    cost_per_hour = selected_tier.cost_per_hour_usd * total_instances
+    cost_per_day = cost_per_hour * scenario.hours_per_day
+    cost_per_month = cost_per_day * scenario.days_per_month
+
+    # Cost per 1000 inferences
+    inferences_per_hour = single_instance_fps * 3600 * total_instances
+    if inferences_per_hour > 0:
+        cost_per_1k = (cost_per_hour / inferences_per_hour) * 1000
+    else:
+        cost_per_1k = 0
+
+    # Step 6: Calculate utilization
+    total_capacity_fps = single_instance_fps * total_instances
+    utilization = (scenario.target_fps / total_capacity_fps * 100) if total_capacity_fps > 0 else 0
+
+    # Check the latency SLA
+    meets_sla = True
+    if scenario.max_latency_ms and estimated_latency > scenario.max_latency_ms:
+        meets_sla = False
+        warnings.append(
+            f"Estimated latency ({estimated_latency:.1f}ms) exceeds "
+            f"SLA ({scenario.max_latency_ms}ms)"
+        )
+
+    return DeploymentCostEstimate(
+        scenario=scenario,
+        hardware_tier=selected_tier,
+        num_instances=total_instances,
+        estimated_latency_ms=estimated_latency,
+        estimated_throughput_fps=single_instance_fps * total_instances,
+        meets_latency_sla=meets_sla,
+        cost_per_hour_usd=cost_per_hour,
+        cost_per_day_usd=cost_per_day,
+        cost_per_month_usd=cost_per_month,
+        utilization_percent=utilization,
+        cost_per_1k_inferences_usd=cost_per_1k,
+        warnings=warnings,
+    )
+
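Putting the pieces together, a minimal end-to-end sketch (illustrative figures; SLA and memory problems surface in `estimate.warnings` rather than raising):

    from haoline.eval.deployment import DeploymentScenario, calculate_deployment_cost

    estimate = calculate_deployment_cost(
        model_flops=4_100_000_000,
        scenario=DeploymentScenario.realtime_video(fps=30.0),
        model_memory_bytes=100 * 1024**2,  # ~100 MB of weights
    )
    print(estimate.summary())
    print(f"${estimate.cost_per_month_usd:.2f}/month")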
+
+def compare_deployment_costs(
+    model_flops: int,
+    scenarios: list[DeploymentScenario],
+    model_memory_bytes: int = 0,
+) -> list[DeploymentCostEstimate]:
+    """
+    Compare deployment costs across multiple scenarios.
+
+    Useful for comparing different precision levels or deployment targets.
+
+    Args:
+        model_flops: Model FLOPs per inference.
+        scenarios: List of deployment scenarios to compare.
+        model_memory_bytes: Model memory footprint.
+
+    Returns:
+        List of DeploymentCostEstimate, one per scenario.
+    """
+    return [
+        calculate_deployment_cost(model_flops, scenario, model_memory_bytes)
+        for scenario in scenarios
+    ]
+
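For example, holding the workload fixed and sweeping precision shows tier selection doing the work: under the shipped table, a ~200 GFLOP model with a 33.3 ms SLA needs an A10G at fp32 but fits on a T4 at fp16 or int8:

    from haoline.eval.deployment import DeploymentScenario, compare_deployment_costs

    scenarios = [
        DeploymentScenario(precision=p, name=f"video_{p}", max_latency_ms=33.3)
        for p in ("fp32", "fp16", "int8")
    ]
    for est in compare_deployment_costs(200_000_000_000, scenarios):
        # fp32 -> A10G (~$720/mo); fp16 and int8 -> T4 (~$360/mo)
        print(est.scenario.name, est.hardware_tier.name, f"${est.cost_per_month_usd:.2f}/mo")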
+
+def estimate_cost_from_combined_report(
+    combined_report: Any,  # CombinedReport
+    scenario: DeploymentScenario,
+) -> DeploymentCostEstimate:
+    """
+    Calculate deployment cost from a CombinedReport.
+
+    Extracts FLOPs and memory from the architecture summary and calculates the cost.
+
+    Args:
+        combined_report: CombinedReport with architecture data.
+        scenario: DeploymentScenario defining the requirements.
+
+    Returns:
+        DeploymentCostEstimate with computed costs.
+    """
+    arch = combined_report.architecture
+    flops = arch.get("flops_total", 0)
+    memory = arch.get("model_size_bytes", 0)
+
+    return calculate_deployment_cost(flops, scenario, memory)
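The function only touches `combined_report.architecture`, so any object exposing that mapping works as a stand-in for sketching; a real CombinedReport would come from haoline's analysis pipeline:

    from types import SimpleNamespace

    from haoline.eval.deployment import DeploymentScenario, estimate_cost_from_combined_report

    report = SimpleNamespace(
        architecture={"flops_total": 4_100_000_000, "model_size_bytes": 100 * 1024**2}
    )
    estimate = estimate_cost_from_combined_report(report, DeploymentScenario.edge_device())
    print(estimate.summary())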