haoline-0.3.0-py3-none-any.whl
- haoline/.streamlit/config.toml +10 -0
- haoline/__init__.py +248 -0
- haoline/analyzer.py +935 -0
- haoline/cli.py +2712 -0
- haoline/compare.py +811 -0
- haoline/compare_visualizations.py +1564 -0
- haoline/edge_analysis.py +525 -0
- haoline/eval/__init__.py +131 -0
- haoline/eval/adapters.py +844 -0
- haoline/eval/cli.py +390 -0
- haoline/eval/comparison.py +542 -0
- haoline/eval/deployment.py +633 -0
- haoline/eval/schemas.py +833 -0
- haoline/examples/__init__.py +15 -0
- haoline/examples/basic_inspection.py +74 -0
- haoline/examples/compare_models.py +117 -0
- haoline/examples/hardware_estimation.py +78 -0
- haoline/format_adapters.py +1001 -0
- haoline/formats/__init__.py +123 -0
- haoline/formats/coreml.py +250 -0
- haoline/formats/gguf.py +483 -0
- haoline/formats/openvino.py +255 -0
- haoline/formats/safetensors.py +273 -0
- haoline/formats/tflite.py +369 -0
- haoline/hardware.py +2307 -0
- haoline/hierarchical_graph.py +462 -0
- haoline/html_export.py +1573 -0
- haoline/layer_summary.py +769 -0
- haoline/llm_summarizer.py +465 -0
- haoline/op_icons.py +618 -0
- haoline/operational_profiling.py +1492 -0
- haoline/patterns.py +1116 -0
- haoline/pdf_generator.py +265 -0
- haoline/privacy.py +250 -0
- haoline/pydantic_models.py +241 -0
- haoline/report.py +1923 -0
- haoline/report_sections.py +539 -0
- haoline/risks.py +521 -0
- haoline/schema.py +523 -0
- haoline/streamlit_app.py +2024 -0
- haoline/tests/__init__.py +4 -0
- haoline/tests/conftest.py +123 -0
- haoline/tests/test_analyzer.py +868 -0
- haoline/tests/test_compare_visualizations.py +293 -0
- haoline/tests/test_edge_analysis.py +243 -0
- haoline/tests/test_eval.py +604 -0
- haoline/tests/test_format_adapters.py +460 -0
- haoline/tests/test_hardware.py +237 -0
- haoline/tests/test_hardware_recommender.py +90 -0
- haoline/tests/test_hierarchical_graph.py +326 -0
- haoline/tests/test_html_export.py +180 -0
- haoline/tests/test_layer_summary.py +428 -0
- haoline/tests/test_llm_patterns.py +540 -0
- haoline/tests/test_llm_summarizer.py +339 -0
- haoline/tests/test_patterns.py +774 -0
- haoline/tests/test_pytorch.py +327 -0
- haoline/tests/test_report.py +383 -0
- haoline/tests/test_risks.py +398 -0
- haoline/tests/test_schema.py +417 -0
- haoline/tests/test_tensorflow.py +380 -0
- haoline/tests/test_visualizations.py +316 -0
- haoline/universal_ir.py +856 -0
- haoline/visualizations.py +1086 -0
- haoline/visualize_yolo.py +44 -0
- haoline/web.py +110 -0
- haoline-0.3.0.dist-info/METADATA +471 -0
- haoline-0.3.0.dist-info/RECORD +70 -0
- haoline-0.3.0.dist-info/WHEEL +4 -0
- haoline-0.3.0.dist-info/entry_points.txt +5 -0
- haoline-0.3.0.dist-info/licenses/LICENSE +22 -0
haoline/eval/deployment.py (new file)
@@ -0,0 +1,633 @@
"""
Deployment Cost Calculator for HaoLine.

Calculate the cost of running ML models in production given:
- Target throughput (FPS or samples/sec)
- Operating hours per day
- Hardware tier and cloud provider pricing

Answers: "What does it cost to run this model at X fps?"
"""

from __future__ import annotations

import math
from dataclasses import dataclass, field
from enum import Enum
from typing import Any

class DeploymentTarget(str, Enum):
    """Deployment target environment."""

    CLOUD_GPU = "cloud_gpu"  # Cloud GPU instances (AWS, GCP, Azure)
    CLOUD_CPU = "cloud_cpu"  # Cloud CPU instances
    EDGE_GPU = "edge_gpu"  # Edge devices with GPU (Jetson, etc.)
    EDGE_CPU = "edge_cpu"  # Edge devices CPU-only
    ON_PREM = "on_prem"  # On-premises servers


class CloudProvider(str, Enum):
    """Cloud provider for cost estimation."""

    AWS = "aws"
    GCP = "gcp"
    AZURE = "azure"
    GENERIC = "generic"  # Use average pricing

@dataclass
class DeploymentScenario:
    """
    Defines a deployment scenario for cost estimation.

    This is the input to the cost calculator: it describes what the user
    wants to achieve (throughput, uptime, etc.).
    """

    # Throughput requirements
    target_fps: float = 30.0  # Target frames/samples per second
    batch_size: int = 1  # Inference batch size

    # Operating schedule
    hours_per_day: float = 24.0  # Hours the model runs per day
    days_per_month: int = 30  # Days per month to calculate costs

    # Hardware preferences
    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU
    provider: CloudProvider = CloudProvider.GENERIC
    precision: str = "fp32"  # fp32, fp16, int8

    # Latency constraints
    max_latency_ms: float | None = None  # Maximum acceptable latency (SLA)

    # Redundancy
    replicas: int = 1  # Number of model replicas for availability

    # Optional metadata
    name: str = ""  # Scenario name for reports
    notes: str = ""  # Additional notes

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "target_fps": self.target_fps,
            "batch_size": self.batch_size,
            "hours_per_day": self.hours_per_day,
            "days_per_month": self.days_per_month,
            "target": self.target.value,
            "provider": self.provider.value,
            "precision": self.precision,
            "max_latency_ms": self.max_latency_ms,
            "replicas": self.replicas,
            "name": self.name,
            "notes": self.notes,
        }

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> DeploymentScenario:
        """Create from dictionary."""
        return cls(
            target_fps=data.get("target_fps", 30.0),
            batch_size=data.get("batch_size", 1),
            hours_per_day=data.get("hours_per_day", 24.0),
            days_per_month=data.get("days_per_month", 30),
            target=DeploymentTarget(data.get("target", "cloud_gpu")),
            provider=CloudProvider(data.get("provider", "generic")),
            precision=data.get("precision", "fp32"),
            max_latency_ms=data.get("max_latency_ms"),
            replicas=data.get("replicas", 1),
            name=data.get("name", ""),
            notes=data.get("notes", ""),
        )

    @classmethod
    def realtime_video(cls, fps: float = 30.0) -> DeploymentScenario:
        """Preset: real-time video processing (24/7)."""
        return cls(
            target_fps=fps,
            batch_size=1,
            hours_per_day=24.0,
            max_latency_ms=1000.0 / fps,  # Must finish within one frame interval
            name="realtime_video",
        )

    @classmethod
    def batch_processing(
        cls,
        samples_per_hour: int = 10000,
        hours_per_day: float = 8.0,
    ) -> DeploymentScenario:
        """Preset: batch processing during business hours."""
        fps = samples_per_hour / 3600  # Convert to per-second
        return cls(
            target_fps=fps,
            batch_size=32,
            hours_per_day=hours_per_day,
            max_latency_ms=None,  # Latency not critical
            name="batch_processing",
        )

    @classmethod
    def edge_device(cls, fps: float = 10.0) -> DeploymentScenario:
        """Preset: edge device deployment."""
        return cls(
            target_fps=fps,
            batch_size=1,
            hours_per_day=24.0,
            target=DeploymentTarget.EDGE_GPU,
            provider=CloudProvider.GENERIC,
            max_latency_ms=100.0,
            name="edge_device",
        )

# =============================================================================
# Hardware Tier Definitions
# =============================================================================


@dataclass
class HardwareTier:
    """
    Defines a hardware tier for deployment.

    Maps to cloud instance types or edge device categories.
    """

    name: str
    description: str

    # Performance characteristics
    max_tflops_fp32: float  # Peak TFLOPS at FP32
    max_tflops_fp16: float  # Peak TFLOPS at FP16
    max_tflops_int8: float  # Peak TFLOPS at INT8

    # Memory
    memory_gb: float

    # Cost (hourly)
    cost_per_hour_usd: float

    # Provider/target
    provider: CloudProvider = CloudProvider.GENERIC
    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU

    # Instance type (for cloud)
    instance_type: str = ""

    def effective_tflops(self, precision: str = "fp32") -> float:
        """Get effective TFLOPS for a precision level."""
        if precision == "fp16":
            return self.max_tflops_fp16
        elif precision == "int8":
            return self.max_tflops_int8
        else:
            return self.max_tflops_fp32


# Pre-defined hardware tiers (approximate 2024 pricing)
HARDWARE_TIERS: dict[str, HardwareTier] = {
    # Cloud GPU tiers
    "t4": HardwareTier(
        name="T4",
        description="NVIDIA T4 (budget GPU)",
        max_tflops_fp32=8.1,
        max_tflops_fp16=65,
        max_tflops_int8=130,
        memory_gb=16,
        cost_per_hour_usd=0.50,
        instance_type="g4dn.xlarge",
    ),
    "a10g": HardwareTier(
        name="A10G",
        description="NVIDIA A10G (mid-tier GPU)",
        max_tflops_fp32=31.2,
        max_tflops_fp16=125,
        max_tflops_int8=250,
        memory_gb=24,
        cost_per_hour_usd=1.00,
        instance_type="g5.xlarge",
    ),
    "a100_40gb": HardwareTier(
        name="A100-40GB",
        description="NVIDIA A100 40GB (high-end)",
        max_tflops_fp32=19.5,
        max_tflops_fp16=312,
        max_tflops_int8=624,
        memory_gb=40,
        cost_per_hour_usd=3.00,
        instance_type="p4d.24xlarge",
    ),
    "a100_80gb": HardwareTier(
        name="A100-80GB",
        description="NVIDIA A100 80GB (high-memory)",
        max_tflops_fp32=19.5,
        max_tflops_fp16=312,
        max_tflops_int8=624,
        memory_gb=80,
        cost_per_hour_usd=4.00,
        instance_type="p4de.24xlarge",
    ),
    "h100": HardwareTier(
        name="H100",
        description="NVIDIA H100 (latest gen)",
        max_tflops_fp32=67,
        max_tflops_fp16=1979,
        max_tflops_int8=3958,
        memory_gb=80,
        cost_per_hour_usd=8.00,
        instance_type="p5.48xlarge",
    ),
    # Edge devices
    "jetson_nano": HardwareTier(
        name="Jetson Nano",
        description="NVIDIA Jetson Nano (entry edge)",
        max_tflops_fp32=0.472,
        max_tflops_fp16=0.472,
        max_tflops_int8=0.944,
        memory_gb=4,
        cost_per_hour_usd=0.01,  # Amortized device cost
        target=DeploymentTarget.EDGE_GPU,
    ),
    "jetson_orin_nano": HardwareTier(
        name="Jetson Orin Nano",
        description="NVIDIA Jetson Orin Nano (mid edge)",
        max_tflops_fp32=20,
        max_tflops_fp16=40,
        max_tflops_int8=80,
        memory_gb=8,
        cost_per_hour_usd=0.03,
        target=DeploymentTarget.EDGE_GPU,
    ),
    "jetson_orin_nx": HardwareTier(
        name="Jetson Orin NX",
        description="NVIDIA Jetson Orin NX (high edge)",
        max_tflops_fp32=50,
        max_tflops_fp16=100,
        max_tflops_int8=200,
        memory_gb=16,
        cost_per_hour_usd=0.05,
        target=DeploymentTarget.EDGE_GPU,
    ),
    # CPU options
    "cpu_small": HardwareTier(
        name="CPU Small",
        description="4 vCPU cloud instance",
        max_tflops_fp32=0.1,
        max_tflops_fp16=0.1,
        max_tflops_int8=0.2,
        memory_gb=8,
        cost_per_hour_usd=0.10,
        target=DeploymentTarget.CLOUD_CPU,
        instance_type="c5.xlarge",
    ),
    "cpu_large": HardwareTier(
        name="CPU Large",
        description="16 vCPU cloud instance",
        max_tflops_fp32=0.4,
        max_tflops_fp16=0.4,
        max_tflops_int8=0.8,
        memory_gb=32,
        cost_per_hour_usd=0.40,
        target=DeploymentTarget.CLOUD_CPU,
        instance_type="c5.4xlarge",
    ),
}


def get_hardware_tier(name: str) -> HardwareTier | None:
    """Get a hardware tier by name."""
    return HARDWARE_TIERS.get(name.lower())


def list_hardware_tiers(
    target: DeploymentTarget | None = None,
) -> list[HardwareTier]:
    """List available hardware tiers, optionally filtered by target."""
    tiers = list(HARDWARE_TIERS.values())
    if target:
        tiers = [t for t in tiers if t.target == target]
    return sorted(tiers, key=lambda t: t.cost_per_hour_usd)

# =============================================================================
# Cost Estimation Result
# =============================================================================


@dataclass
class DeploymentCostEstimate:
    """
    Result of deployment cost calculation.

    Contains all the computed costs and recommendations.
    """

    # Input scenario
    scenario: DeploymentScenario

    # Selected hardware
    hardware_tier: HardwareTier
    num_instances: int = 1  # Instances needed to meet throughput

    # Performance estimates
    estimated_latency_ms: float = 0.0
    estimated_throughput_fps: float = 0.0
    meets_latency_sla: bool = True

    # Cost breakdown
    cost_per_hour_usd: float = 0.0
    cost_per_day_usd: float = 0.0
    cost_per_month_usd: float = 0.0

    # Efficiency metrics
    utilization_percent: float = 0.0  # How much of hardware capacity is used
    cost_per_1k_inferences_usd: float = 0.0

    # Warnings/notes
    warnings: list[str] = field(default_factory=list)

    def to_dict(self) -> dict[str, Any]:
        """Convert to dictionary for serialization."""
        return {
            "scenario": self.scenario.to_dict(),
            "hardware_tier": {
                "name": self.hardware_tier.name,
                "description": self.hardware_tier.description,
                "cost_per_hour_usd": self.hardware_tier.cost_per_hour_usd,
            },
            "num_instances": self.num_instances,
            "estimated_latency_ms": self.estimated_latency_ms,
            "estimated_throughput_fps": self.estimated_throughput_fps,
            "meets_latency_sla": self.meets_latency_sla,
            "cost_per_hour_usd": self.cost_per_hour_usd,
            "cost_per_day_usd": self.cost_per_day_usd,
            "cost_per_month_usd": self.cost_per_month_usd,
            "utilization_percent": self.utilization_percent,
            "cost_per_1k_inferences_usd": self.cost_per_1k_inferences_usd,
            "warnings": self.warnings,
        }

    def summary(self) -> str:
        """Generate a human-readable summary."""
        lines = [
            f"Deployment Cost Estimate: {self.scenario.name or 'Custom Scenario'}",
            f"{'=' * 50}",
            f"Hardware: {self.hardware_tier.name} x {self.num_instances}",
            f"Target: {self.scenario.target_fps:.1f} fps @ {self.scenario.hours_per_day}h/day",
            "",
            "Performance:",
            f"  Estimated latency: {self.estimated_latency_ms:.1f} ms",
            f"  Estimated throughput: {self.estimated_throughput_fps:.1f} fps",
            f"  Utilization: {self.utilization_percent:.0f}%",
        ]

        if self.scenario.max_latency_ms:
            status = "OK" if self.meets_latency_sla else "EXCEEDS SLA"
            lines.append(f"  Latency SLA: {status}")

        lines.extend(
            [
                "",
                "Costs:",
                f"  Per hour: ${self.cost_per_hour_usd:.2f}",
                f"  Per day: ${self.cost_per_day_usd:.2f}",
                f"  Per month: ${self.cost_per_month_usd:.2f}",
                f"  Per 1K inf: ${self.cost_per_1k_inferences_usd:.4f}",
            ]
        )

        if self.warnings:
            lines.extend(["", "Warnings:"])
            for w in self.warnings:
                lines.append(f"  - {w}")

        return "\n".join(lines)

# =============================================================================
# Cost Calculation Functions (Tasks 12.6.2 and 12.6.3)
# =============================================================================


def estimate_latency_from_flops(
    model_flops: int,
    hardware: HardwareTier,
    precision: str = "fp32",
    utilization_factor: float = 0.3,
) -> float:
    """
    Estimate inference latency from model FLOPs and hardware specs.

    Args:
        model_flops: Model FLOPs per inference.
        hardware: Hardware tier to run on.
        precision: Precision (fp32, fp16, int8).
        utilization_factor: Expected hardware utilization (0.3 = 30% of peak).

    Returns:
        Estimated latency in milliseconds.
    """
    effective_tflops = hardware.effective_tflops(precision) * utilization_factor
    effective_flops_per_sec = effective_tflops * 1e12

    if effective_flops_per_sec == 0:
        return float("inf")

    latency_sec = model_flops / effective_flops_per_sec
    return latency_sec * 1000  # Convert to ms

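# Worked example (hypothetical model size, not a package benchmark): a model
# costing ~4.1 GFLOPs per inference on the T4 tier at fp16 gets an effective
# budget of 65 TFLOPS * 0.3 = 19.5 TFLOPS, so the estimate is
# 4.1e9 / 1.95e13 s = ~0.21 ms per inference.
_lat_ms = estimate_latency_from_flops(4_100_000_000, HARDWARE_TIERS["t4"], "fp16")
assert abs(_lat_ms - 0.21) < 0.01
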
def select_hardware_tier_for_latency(
    model_flops: int,
    target_latency_ms: float,
    precision: str = "fp32",
    target: DeploymentTarget = DeploymentTarget.CLOUD_GPU,
    utilization_factor: float = 0.3,
) -> HardwareTier | None:
    """
    Select the cheapest hardware tier that meets latency requirements.

    Args:
        model_flops: Model FLOPs per inference.
        target_latency_ms: Maximum acceptable latency.
        precision: Precision (fp32, fp16, int8).
        target: Deployment target (cloud GPU, edge, etc.).
        utilization_factor: Expected hardware utilization.

    Returns:
        Cheapest HardwareTier that meets requirements, or None if none can.
    """
    candidates = list_hardware_tiers(target)

    for tier in candidates:  # Already sorted by cost
        latency = estimate_latency_from_flops(model_flops, tier, precision, utilization_factor)
        if latency <= target_latency_ms:
            return tier

    return None  # No tier can meet the latency requirement

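# Usage sketch (hypothetical numbers): because candidates are sorted by price,
# the first tier under the SLA is also the cheapest. For the same ~4.1 GFLOP
# model with a 10 ms budget at fp32, the T4 qualifies first:
# 4.1e9 / (8.1e12 * 0.3) s = ~1.7 ms <= 10 ms.
_tier = select_hardware_tier_for_latency(4_100_000_000, target_latency_ms=10.0)
assert _tier is not None and _tier.name == "T4"
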
def calculate_deployment_cost(
    model_flops: int,
    scenario: DeploymentScenario,
    model_memory_bytes: int = 0,
    utilization_factor: float = 0.3,
) -> DeploymentCostEstimate:
    """
    Calculate deployment cost for a model given a scenario.

    This is the main cost calculation function. It:
    1. Selects appropriate hardware based on the latency SLA
    2. Calculates the number of instances needed for throughput
    3. Computes hourly, daily, and monthly costs

    Args:
        model_flops: Model FLOPs per inference.
        scenario: DeploymentScenario with throughput/latency requirements.
        model_memory_bytes: Model memory footprint (for memory-based selection).
        utilization_factor: Expected hardware utilization (default 30%).

    Returns:
        DeploymentCostEstimate with all computed costs and recommendations.
    """
    warnings: list[str] = []

    # Step 1: Select hardware tier based on latency SLA
    if scenario.max_latency_ms:
        selected_tier = select_hardware_tier_for_latency(
            model_flops,
            scenario.max_latency_ms,
            scenario.precision,
            scenario.target,
            utilization_factor,
        )
        if selected_tier is None:
            # Fall back to the most capable (most expensive) tier
            tiers = list_hardware_tiers(scenario.target)
            selected_tier = tiers[-1] if tiers else list(HARDWARE_TIERS.values())[0]
            warnings.append(
                f"No hardware meets {scenario.max_latency_ms}ms latency SLA. "
                f"Using {selected_tier.name}."
            )
    else:
        # No latency constraint - pick the cheapest tier that can run the model
        tiers = list_hardware_tiers(scenario.target)
        selected_tier = tiers[0] if tiers else list(HARDWARE_TIERS.values())[0]

    # Step 2: Calculate estimated latency and throughput
    estimated_latency = estimate_latency_from_flops(
        model_flops, selected_tier, scenario.precision, utilization_factor
    )
    single_instance_fps = 1000.0 / estimated_latency if estimated_latency > 0 else 0

    # Step 3: Calculate instances needed for target throughput
    if single_instance_fps > 0:
        instances_for_throughput = max(
            1,
            math.ceil(scenario.target_fps / single_instance_fps),  # Round up
        )
    else:
        instances_for_throughput = 1
        warnings.append("Could not estimate throughput. Using 1 instance.")

    # Add replicas for availability
    total_instances = instances_for_throughput * scenario.replicas

    # Step 4: Check memory requirements
    if model_memory_bytes > 0:
        model_gb = model_memory_bytes / (1024**3)
        # Leave ~30% headroom for activations
        required_memory_gb = model_gb * 1.3
        if required_memory_gb > selected_tier.memory_gb:
            warnings.append(
                f"Model requires ~{model_gb:.1f}GB but {selected_tier.name} "
                f"has {selected_tier.memory_gb}GB. Consider a larger tier."
            )

    # Step 5: Calculate costs
    cost_per_hour = selected_tier.cost_per_hour_usd * total_instances
    cost_per_day = cost_per_hour * scenario.hours_per_day
    cost_per_month = cost_per_day * scenario.days_per_month

    # Cost per 1000 inferences
    inferences_per_hour = single_instance_fps * 3600 * total_instances
    if inferences_per_hour > 0:
        cost_per_1k = (cost_per_hour / inferences_per_hour) * 1000
    else:
        cost_per_1k = 0

    # Step 6: Calculate utilization
    total_capacity_fps = single_instance_fps * total_instances
    utilization = (scenario.target_fps / total_capacity_fps * 100) if total_capacity_fps > 0 else 0

    # Check latency SLA
    meets_sla = True
    if scenario.max_latency_ms and estimated_latency > scenario.max_latency_ms:
        meets_sla = False
        warnings.append(
            f"Estimated latency ({estimated_latency:.1f}ms) exceeds "
            f"SLA ({scenario.max_latency_ms}ms)"
        )

    return DeploymentCostEstimate(
        scenario=scenario,
        hardware_tier=selected_tier,
        num_instances=total_instances,
        estimated_latency_ms=estimated_latency,
        estimated_throughput_fps=single_instance_fps * total_instances,
        meets_latency_sla=meets_sla,
        cost_per_hour_usd=cost_per_hour,
        cost_per_day_usd=cost_per_day,
        cost_per_month_usd=cost_per_month,
        utilization_percent=utilization,
        cost_per_1k_inferences_usd=cost_per_1k,
        warnings=warnings,
    )

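# Worked sketch (hypothetical inputs, illustrative only): a ~4.1 GFLOP model
# at the defaults (30 fps, 24/7, cloud GPU, fp32, no latency SLA) selects the
# cheapest tier, the T4 at $0.50/h. One instance covers 30 fps easily, so the
# month costs 0.50 * 24 * 30 = $360.
_est = calculate_deployment_cost(4_100_000_000, DeploymentScenario(name="demo"))
assert _est.hardware_tier.name == "T4" and _est.num_instances == 1
assert round(_est.cost_per_month_usd) == 360
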
def compare_deployment_costs(
    model_flops: int,
    scenarios: list[DeploymentScenario],
    model_memory_bytes: int = 0,
) -> list[DeploymentCostEstimate]:
    """
    Compare deployment costs across multiple scenarios.

    Useful for comparing different precision levels or deployment targets.

    Args:
        model_flops: Model FLOPs per inference.
        scenarios: List of deployment scenarios to compare.
        model_memory_bytes: Model memory footprint.

    Returns:
        List of DeploymentCostEstimate, one per scenario.
    """
    return [
        calculate_deployment_cost(model_flops, scenario, model_memory_bytes)
        for scenario in scenarios
    ]

def estimate_cost_from_combined_report(
    combined_report: Any,  # CombinedReport
    scenario: DeploymentScenario,
) -> DeploymentCostEstimate:
    """
    Calculate deployment cost from a CombinedReport.

    Extracts FLOPs and memory from the architecture summary and calculates cost.

    Args:
        combined_report: CombinedReport with architecture data.
        scenario: DeploymentScenario defining requirements.

    Returns:
        DeploymentCostEstimate with computed costs.
    """
    arch = combined_report.architecture
    flops = arch.get("flops_total", 0)
    memory = arch.get("model_size_bytes", 0)

    return calculate_deployment_cost(flops, scenario, memory)
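
# End-to-end sketch (hypothetical numbers): compare the same model across
# precisions. Lower precision raises effective TFLOPS, so the per-inference
# cost drops even though the hourly instance price is unchanged.
_runs = compare_deployment_costs(
    4_100_000_000,
    [
        DeploymentScenario(precision="fp32", name="fp32"),
        DeploymentScenario(precision="int8", name="int8"),
    ],
)
assert _runs[0].cost_per_1k_inferences_usd > _runs[1].cost_per_1k_inferences_usd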