@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
@@ -0,0 +1,77 @@
1
+ id: tensorrt-llm-performance
2
+ name: TensorRT-LLM Maximum Performance
3
+ description: Deploy models with TensorRT-LLM for maximum inference performance on NVIDIA GPUs
4
+ category: runtime_optimization
5
+ confidence: 0.88
6
+ success_count: 789
7
+ verified_environments: 38
8
+ contributors:
9
+ - tensorrt_specialist
10
+ - nvidia_partner
11
+ last_updated: "2025-01-03"
12
+
13
+ environment_match:
14
+ gpu_vendor: nvidia
15
+ performance_requirement: maximum
16
+ model_support: transformer
17
+
18
+ optimization:
19
+ technique: tensorrt_llm_deployment
20
+ expected_throughput_improvement: "3-5x"
21
+ expected_latency_improvement: "50-70%"
22
+ effort_estimate: "2-3 weeks"
23
+ risk_level: medium
24
+
25
+ economics:
26
+ projected_improvement:
27
+ throughput_multiplier: 4
28
+ cost_per_token_reduction: 0.75
29
+ implementation_cost:
30
+ engineering_hours: 120
31
+ total_cost: 24000
32
+
33
+ implementation:
34
+ prerequisites:
35
+ - requirement: "NVIDIA GPU (A100/H100 recommended)"
36
+ - requirement: "TensorRT-LLM installation"
37
+ validation_command: "python -c 'import tensorrt_llm'"
38
+ - requirement: "Sufficient disk space for compiled engines"
39
+ automated_steps:
40
+ - step_id: engine_build
41
+ name: TensorRT Engine Build
42
+ executable: true
43
+ commands:
44
+ - "python scripts/convert_to_trt.py --model ./model --dtype fp16"
45
+ - "python scripts/build_trt_engine.py --max-batch-size 64 --max-input-len 2048"
46
+ validation:
47
+ command: "python scripts/validate_engine.py"
48
+ success_criteria: "engine_valid AND accuracy > 0.99"
49
+ - step_id: deployment
50
+ name: TensorRT Deployment
51
+ executable: true
52
+ commands:
53
+ - "python scripts/deploy_trt_server.py --engine ./engine"
54
+ - "python scripts/configure_inflight_batching.py"
55
+ validation:
56
+ command: "python scripts/benchmark_trt.py"
57
+ success_criteria: "throughput > baseline * 3"
58
+ rollback_command: "python scripts/fallback_to_pytorch.py"
59
+
60
+ monitoring:
61
+ key_metrics:
62
+ - metric: throughput_tokens_per_second
63
+ target: ">5000"
64
+ alert_threshold: "<3000"
65
+ - metric: gpu_memory_utilization
66
+ target: "70-85%"
67
+ alert_threshold: ">95%"
68
+ rollback_triggers:
69
+ - condition: "engine_error_rate > 0.01 for 5 minutes"
70
+ action: automatic_rollback
71
+
72
+ results:
73
+ recent_implementations:
74
+ - environment: high_volume_api
75
+ baseline_throughput: 800
76
+ optimized_throughput: 3500
77
+ improvement_factor: 4.4
@@ -0,0 +1,93 @@
1
+ id: vllm-high-throughput-optimization
2
+ name: vLLM Continuous Batching for High-Volume Production
3
+ description: Optimize vLLM deployment for maximum throughput in high-traffic scenarios
4
+ category: batching_optimization
5
+ confidence: 0.91
6
+ success_count: 1923
7
+ verified_environments: 67
8
+ contributors:
9
+ - scaling_team
10
+ - vllm_expert
11
+ - production_engineer
12
+ last_updated: "2025-01-14"
13
+
14
+ environment_match:
15
+ runtime: vllm
16
+ monthly_requests: ">1M"
17
+ current_batch_size: "<8"
18
+ gpu_utilization: "<70%"
19
+ latency_requirements: flexible
20
+
21
+ optimization:
22
+ technique: continuous_batching
23
+ expected_throughput_improvement: "3-5x"
24
+ expected_cost_reduction: "60-75%"
25
+ effort_estimate: "1-2 weeks"
26
+ risk_level: low
27
+
28
+ economics:
29
+ baseline_calculation:
30
+ current_throughput_factor: 1.0
31
+ projected_improvement:
32
+ new_throughput_factor: 4.0
33
+ gpu_reduction_factor: 0.25
34
+ implementation_cost:
35
+ engineering_hours: 80
36
+ total_cost: 16000
37
+
38
+ implementation:
39
+ prerequisites:
40
+ - requirement: "vLLM 0.2.7+"
41
+ validation_command: "python -c 'import vllm; print(vllm.__version__)'"
42
+ - requirement: "CUDA 11.8+"
43
+ validation_command: "nvcc --version | grep -oE 'release [0-9]+[.][0-9]+' | awk -F'[ .]' '{ok=($2>11 || ($2==11 && $3>=8))} END{exit !ok}'"
44
+ - requirement: "16GB+ GPU memory"
45
+ validation_command: "nvidia-smi --query-gpu=memory.total --format=csv,noheader | awk '{if($1<16000) exit 1}'"
46
+ automated_steps:
47
+ - step_id: batch_configuration
48
+ name: Optimal Batch Configuration
49
+ executable: true
50
+ commands:
51
+ - "python scripts/configure_vllm.py --max-num-batched-tokens 8192 --max-num-seqs 32"
52
+ - "python scripts/start_vllm_server.py --model meta-llama/Llama-2-7b-hf --gpu-memory-utilization 0.85"
53
+ validation:
54
+ command: "python scripts/test_batch_performance.py --target-batch-size 16"
55
+ success_criteria: "average_batch_size > 12"
56
+ rollback_command: "python scripts/revert_vllm_config.py"
57
+ - step_id: memory_optimization
58
+ name: Memory Optimization
59
+ executable: true
60
+ commands:
61
+ - "python scripts/enable_prefix_caching.py"
62
+ - "python scripts/configure_swap_space.py --swap-size 4GB"
63
+ validation:
64
+ command: "python scripts/check_memory_efficiency.py"
65
+ success_criteria: "memory_utilization > 0.8 AND memory_utilization < 0.9"
66
+ rollback_command: "python scripts/disable_optimizations.py"
67
+
68
+ monitoring:
69
+ key_metrics:
70
+ - metric: average_batch_size
71
+ target: ">16"
72
+ alert_threshold: "<12"
73
+ - metric: throughput_tokens_per_second
74
+ target: ">3000"
75
+ alert_threshold: "<2000"
76
+ - metric: gpu_memory_utilization
77
+ target: "0.8-0.85"
78
+ alert_threshold: ">0.9"
79
+ rollback_triggers:
80
+ - condition: "average_batch_size < 8 for 20 minutes"
81
+ action: automatic_rollback
82
+ - condition: "gpu_memory_utilization > 0.95 for 10 minutes"
83
+ action: automatic_rollback
84
+ - condition: "throughput_degradation > 30% for 15 minutes"
85
+ action: alert_and_investigation
86
+
87
+ results:
88
+ recent_implementations:
89
+ - environment: video_streaming_recommendations
90
+ baseline_throughput: 800
91
+ optimized_throughput: 3200
92
+ throughput_improvement: 4.0
93
+ implementation_days: 8
@@ -0,0 +1,78 @@
1
+ id: vllm-migration-memory-bound
2
+ name: vLLM Migration from Memory-Bound Workloads
3
+ description: Migrate from traditional serving to vLLM for memory-bound inference workloads
4
+ category: runtime_optimization
5
+ confidence: 0.90
6
+ success_count: 1123
7
+ verified_environments: 52
8
+ contributors:
9
+ - vllm_specialist
10
+ - migration_engineer
11
+ last_updated: "2025-01-10"
12
+
13
+ environment_match:
14
+ current_runtime:
15
+ - huggingface
16
+ - pytorch
17
+ memory_bound: true
18
+ batch_size: "<4"
19
+
20
+ optimization:
21
+ technique: vllm_migration
22
+ expected_throughput_improvement: "3-6x"
23
+ expected_cost_reduction: "60-80%"
24
+ effort_estimate: "1-2 weeks"
25
+ risk_level: low
26
+
27
+ economics:
28
+ projected_improvement:
29
+ throughput_multiplier: 4.5
30
+ cost_reduction_percent: 70
31
+ implementation_cost:
32
+ engineering_hours: 60
33
+ total_cost: 12000
34
+
35
+ implementation:
36
+ prerequisites:
37
+ - requirement: "vLLM compatible model"
38
+ validation_command: "python scripts/check_vllm_compatibility.py --model ./model"
39
+ - requirement: "GPU with 16GB+ memory"
40
+ automated_steps:
41
+ - step_id: compatibility_check
42
+ name: Compatibility Verification
43
+ executable: true
44
+ commands:
45
+ - "python scripts/verify_model_format.py"
46
+ - "python scripts/test_vllm_loading.py"
47
+ validation:
48
+ command: "python scripts/validate_loading.py"
49
+ success_criteria: "model_loads_successfully"
50
+ - step_id: migration
51
+ name: vLLM Migration
52
+ executable: true
53
+ commands:
54
+ - "python scripts/setup_vllm_server.py --model ./model --tensor-parallel-size 1"
55
+ - "python scripts/configure_batching.py --max-tokens 8192"
56
+ validation:
57
+ command: "python scripts/benchmark_vllm.py"
58
+ success_criteria: "throughput > baseline * 3"
59
+ rollback_command: "python scripts/revert_to_original.py"
60
+
61
+ monitoring:
62
+ key_metrics:
63
+ - metric: throughput_rps
64
+ target: ">baseline * 3"
65
+ alert_threshold: "<baseline * 2"
66
+ - metric: latency_p99
67
+ target: "<baseline * 1.2"
68
+ alert_threshold: ">baseline * 2"
69
+ rollback_triggers:
70
+ - condition: "throughput < baseline for 15 minutes"
71
+ action: automatic_rollback
72
+
73
+ results:
74
+ recent_implementations:
75
+ - environment: api_inference_service
76
+ baseline_throughput: 50
77
+ optimized_throughput: 220
78
+ improvement_factor: 4.4
@@ -0,0 +1,32 @@
1
+ # Based on: https://www.kalmantic.com/posts/gpt5-model-selection-economics-extraction-tasks
2
+ # "Why Premium Models Waste Money on Extraction Tasks"
3
+
4
+ id: overpowered-extraction
5
+ name: Overpowered Model for Simple Tasks
6
+ version: "1.0"
7
+ category: cost
8
+ severity: warning
9
+ layer: model
10
+
11
+ match:
12
+ scope: callsite
13
+ conditions:
14
+ - field: model
15
+ op: in
16
+ value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3.5-sonnet"]
17
+ - field: avg_tokens
18
+ op: lt
19
+ value: 100
20
+
21
+ output:
22
+ headline: "Using {{model}} for {{avg_tokens}}-token outputs"
23
+ evidence: "{{location}}: Consider gpt-4o-mini or claude-3-haiku for simple extraction tasks"
24
+
25
+ defaults:
26
+ small_output_threshold: 100
27
+ premium_models:
28
+ - gpt-4o
29
+ - gpt-4
30
+ - gpt-4-turbo
31
+ - claude-3-opus
32
+ - claude-3.5-sonnet
@@ -0,0 +1,31 @@
1
+ id: overpowered-model
2
+ name: Overpowered Model Detection
3
+ version: "1.0"
4
+ category: waste
5
+ severity: info
6
+ layer: model
7
+
8
+ source:
9
+ url: https://openai.com/pricing
10
+ title: "Model Pricing and Capability Tiers"
11
+
12
+ match:
13
+ scope: callsite
14
+ conditions:
15
+ - field: model
16
+ op: in
17
+ value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3-opus-20240229"]
18
+ - field: usage.avg_output_tokens
19
+ op: lt
20
+ value: 100
21
+ - field: usage.calls
22
+ op: gt
23
+ value: 100
24
+
25
+ output:
26
+ headline: "{{model}} used for short outputs (avg {{avg_tokens}} tokens)"
27
+ evidence: "Premium models have minimum cost overhead regardless of output length"
28
+
29
+ defaults:
30
+ output_threshold: 100
31
+ calls_threshold: 100
@@ -0,0 +1,24 @@
1
+ # Based on: https://www.kalmantic.com/posts/system-prompt-optimization-stop-paying-redundant-instructions
2
+ # "Stop Paying 40x More for Redundant AI Instructions"
3
+
4
+ id: prompt-bloat
5
+ name: Prompt Bloat Detection
6
+ version: "1.0"
7
+ category: cost
8
+ severity: warning
9
+ layer: model
10
+
11
+ match:
12
+ scope: callsite
13
+ conditions:
14
+ - field: usage.tokens_in
15
+ op: ratio_gt
16
+ compare_to: usage.tokens_out
17
+ value: 20
18
+
19
+ output:
20
+ headline: "{{ratio}}x more input than output tokens"
21
+ evidence: "{{location}}: {{tokens_in}} tokens in → {{tokens_out}} tokens out. Consider prompt optimization."
22
+
23
+ defaults:
24
+ input_output_ratio_threshold: 20
@@ -0,0 +1,28 @@
1
+ # Based on: https://www.kalmantic.com/posts/ai-retry-logic-error-handling-multiplies-costs
2
+ # "How Bad Error Handling Turns $10 Failures into $1000 Bills"
3
+
4
+ id: retry-explosion
5
+ name: Retry Storm Detection
6
+ version: "1.0"
7
+ category: cost
8
+ severity: critical
9
+ layer: api
10
+
11
+ match:
12
+ scope: callsite
13
+ conditions:
14
+ - field: usage.calls
15
+ op: gt
16
+ value: 10
17
+ - field: usage.latency_p99
18
+ op: ratio_gt
19
+ compare_to: usage.latency_p50
20
+ value: 5
21
+
22
+ output:
23
+ headline: "Possible retry storm at {{location}}"
24
+ evidence: "{{calls}} calls with p99/p50 ratio of {{ratio}}x - check retry logic"
25
+
26
+ defaults:
27
+ min_calls: 10
28
+ latency_ratio_threshold: 5
@@ -0,0 +1,113 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/insight.schema.json",
4
+ "title": "PeakInfer Insight Template",
5
+ "description": "Schema for insight detection templates",
6
+ "type": "object",
7
+ "required": ["id", "version", "name", "category", "severity", "match", "output"],
8
+ "properties": {
9
+ "id": {
10
+ "type": "string",
11
+ "pattern": "^[a-z][a-z0-9-]*$",
12
+ "description": "Unique identifier (kebab-case)"
13
+ },
14
+ "version": {
15
+ "type": "string",
16
+ "pattern": "^\\d+\\.\\d+$",
17
+ "description": "Template version in major.minor form (e.g. 1.0); not full three-part semver"
18
+ },
19
+ "name": {
20
+ "type": "string",
21
+ "description": "Human-readable name"
22
+ },
23
+ "description": {
24
+ "type": "string",
25
+ "description": "Detailed description of what this insight detects"
26
+ },
27
+ "source": {
28
+ "type": "object",
29
+ "properties": {
30
+ "url": { "type": "string", "format": "uri" },
31
+ "title": { "type": "string" }
32
+ },
33
+ "description": "Attribution to blog post, research, etc."
34
+ },
35
+ "category": {
36
+ "type": "string",
37
+ "enum": ["cost", "drift", "performance", "waste", "reliability", "latency", "throughput"],
38
+ "description": "Primary category for grouping"
39
+ },
40
+ "severity": {
41
+ "type": "string",
42
+ "enum": ["critical", "warning", "info"],
43
+ "description": "Impact severity"
44
+ },
45
+ "tags": {
46
+ "type": "array",
47
+ "items": { "type": "string" },
48
+ "description": "Searchable tags"
49
+ },
50
+ "match": {
51
+ "type": "object",
52
+ "required": ["scope", "conditions"],
53
+ "properties": {
54
+ "scope": {
55
+ "type": "string",
56
+ "enum": ["callsite", "joined", "global", "envelope"],
57
+ "description": "What data context to evaluate against"
58
+ },
59
+ "conditions": {
60
+ "type": "array",
61
+ "items": {
62
+ "type": "object",
63
+ "required": ["field", "op"],
64
+ "properties": {
65
+ "field": { "type": "string" },
66
+ "op": {
67
+ "type": "string",
68
+ "enum": ["eq", "neq", "gt", "lt", "gte", "lte", "exists", "in", "ratio_gt", "ratio_lt", "has_pattern"]
69
+ },
70
+ "value": {},
71
+ "compare_to": { "type": "string" },
72
+ "pattern": { "type": "string" },
73
+ "count_gt": { "type": "number" }
74
+ }
75
+ }
76
+ }
77
+ }
78
+ },
79
+ "output": {
80
+ "type": "object",
81
+ "required": ["headline", "evidence"],
82
+ "properties": {
83
+ "headline": {
84
+ "type": "string",
85
+ "description": "Short summary with {{variables}}"
86
+ },
87
+ "evidence": {
88
+ "type": "string",
89
+ "description": "Supporting details with {{variables}}"
90
+ }
91
+ }
92
+ },
93
+ "recommends": {
94
+ "type": "array",
95
+ "items": {
96
+ "type": "object",
97
+ "properties": {
98
+ "optimization": { "type": "string" },
99
+ "relevance": { "type": "number", "minimum": 0, "maximum": 1 },
100
+ "reason": { "type": "string" }
101
+ }
102
+ },
103
+ "description": "Links to optimization templates"
104
+ },
105
+ "defaults": {
106
+ "type": "object",
107
+ "description": "Default threshold values"
108
+ },
109
+ "author": { "type": "string" },
110
+ "created": { "type": "string", "format": "date" },
111
+ "updated": { "type": "string", "format": "date" }
112
+ }
113
+ }
@@ -0,0 +1,180 @@
1
+ {
2
+ "$schema": "http://json-schema.org/draft-07/schema#",
3
+ "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/optimization.schema.json",
4
+ "title": "PeakInfer Optimization Template",
5
+ "description": "Schema for optimization recommendation templates",
6
+ "type": "object",
7
+ "required": ["id", "name", "description", "category", "optimization", "implementation"],
8
+ "properties": {
9
+ "id": {
10
+ "type": "string",
11
+ "pattern": "^[a-z][a-z0-9-]*$",
12
+ "description": "Unique identifier (kebab-case)"
13
+ },
14
+ "name": {
15
+ "type": "string",
16
+ "description": "Human-readable name"
17
+ },
18
+ "description": {
19
+ "type": "string",
20
+ "description": "Detailed description of the optimization"
21
+ },
22
+ "source": {
23
+ "type": "object",
24
+ "properties": {
25
+ "url": { "type": "string", "format": "uri" },
26
+ "title": { "type": "string" },
27
+ "authors": {
28
+ "type": "array",
29
+ "items": { "type": "string" }
30
+ }
31
+ },
32
+ "description": "Attribution to research paper, blog post, etc."
33
+ },
34
+ "category": {
35
+ "type": "string",
36
+ "enum": [
37
+ "api_optimization",
38
+ "memory_optimization",
39
+ "latency_optimization",
40
+ "cost_optimization",
41
+ "reliability_optimization",
42
+ "throughput_optimization",
43
+ "serving_optimization"
44
+ ],
45
+ "description": "Primary optimization category"
46
+ },
47
+ "confidence": {
48
+ "type": "number",
49
+ "minimum": 0,
50
+ "maximum": 1,
51
+ "description": "Confidence score based on verified implementations"
52
+ },
53
+ "success_count": {
54
+ "type": "integer",
55
+ "minimum": 0,
56
+ "description": "Number of successful implementations"
57
+ },
58
+ "verified_environments": {
59
+ "type": "integer",
60
+ "minimum": 0,
61
+ "description": "Number of verified deployment environments"
62
+ },
63
+ "contributors": {
64
+ "type": "array",
65
+ "items": { "type": "string" },
66
+ "description": "Contributors to this optimization"
67
+ },
68
+ "last_updated": {
69
+ "type": "string",
70
+ "format": "date",
71
+ "description": "Last update date"
72
+ },
73
+ "environment_match": {
74
+ "type": "object",
75
+ "description": "Conditions for when this optimization applies",
76
+ "properties": {
77
+ "model_size": {
78
+ "type": "array",
79
+ "items": { "type": "string" }
80
+ },
81
+ "memory_pressure": { "type": "string" },
82
+ "quality_tolerance": { "type": "string" },
83
+ "deployment": {
84
+ "type": "array",
85
+ "items": { "type": "string" }
86
+ }
87
+ }
88
+ },
89
+ "optimization": {
90
+ "type": "object",
91
+ "required": ["technique"],
92
+ "properties": {
93
+ "technique": { "type": "string" },
94
+ "expected_memory_reduction": { "type": "string" },
95
+ "expected_quality_retention": { "type": "string" },
96
+ "expected_latency_improvement": { "type": "string" },
97
+ "expected_cost_reduction": { "type": "string" },
98
+ "effort_estimate": { "type": "string" },
99
+ "risk_level": {
100
+ "type": "string",
101
+ "enum": ["low", "medium", "high"]
102
+ }
103
+ }
104
+ },
105
+ "economics": {
106
+ "type": "object",
107
+ "properties": {
108
+ "baseline_calculation": { "type": "object" },
109
+ "projected_improvement": { "type": "object" },
110
+ "implementation_cost": { "type": "object" }
111
+ }
112
+ },
113
+ "implementation": {
114
+ "type": "object",
115
+ "properties": {
116
+ "prerequisites": {
117
+ "type": "array",
118
+ "items": {
119
+ "type": "object",
120
+ "properties": {
121
+ "requirement": { "type": "string" },
122
+ "validation_command": { "type": "string" }
123
+ }
124
+ }
125
+ },
126
+ "automated_steps": {
127
+ "type": "array",
128
+ "items": {
129
+ "type": "object",
130
+ "properties": {
131
+ "step_id": { "type": "string" },
132
+ "name": { "type": "string" },
133
+ "executable": { "type": "boolean" },
134
+ "commands": {
135
+ "type": "array",
136
+ "items": { "type": "string" }
137
+ },
138
+ "validation": { "type": "object" }
139
+ }
140
+ }
141
+ }
142
+ }
143
+ },
144
+ "monitoring": {
145
+ "type": "object",
146
+ "properties": {
147
+ "key_metrics": {
148
+ "type": "array",
149
+ "items": {
150
+ "type": "object",
151
+ "properties": {
152
+ "metric": { "type": "string" },
153
+ "target": { "type": "string" },
154
+ "alert_threshold": { "type": "string" }
155
+ }
156
+ }
157
+ },
158
+ "rollback_triggers": {
159
+ "type": "array",
160
+ "items": {
161
+ "type": "object",
162
+ "properties": {
163
+ "condition": { "type": "string" },
164
+ "action": { "type": "string" }
165
+ }
166
+ }
167
+ }
168
+ }
169
+ },
170
+ "results": {
171
+ "type": "object",
172
+ "properties": {
173
+ "recent_implementations": {
174
+ "type": "array",
175
+ "items": { "type": "object" }
176
+ }
177
+ }
178
+ }
179
+ }
180
+ }
@@ -0,0 +1,30 @@
1
+ id: streaming-drift  # Insight template: flags streaming call sites whose p99 latency is far above p50
2
+ name: Streaming Drift Detection
3
+ version: "1.0"
4
+ category: latency
5
+ severity: critical
6
+ layer: application
7
+
8
+ source:
9
+ url: https://anthropic.com/research/streaming-tokens
10
+ title: "Token Streaming for Real-Time Applications"
11
+
12
+ match:  # conditions are evaluated in the callsite scope; presumably all must hold — confirm with the rule engine
13
+ scope: callsite
14
+ conditions:
15
+ - field: patterns.streaming
16
+ op: eq
17
+ value: true
18
+ - field: usage  # requires usage data to be present; presumably guards the latency fields read below
19
+ op: exists
20
+ - field: usage.latency_p99
21
+ op: ratio_gt  # presumably fires when latency_p99 / latency_p50 exceeds `value` — confirm against the evaluator
22
+ compare_to: usage.latency_p50
23
+ value: 5  # NOTE(review): trigger ratio is 5, but the evidence text cites 2x and defaults.threshold is 2 — confirm intent
24
+
25
+ output:
26
+ headline: "Streaming enabled but responses arrive in bursts"
27
+ evidence: "p99/p50 ratio is {{ratio}}x — true streaming would be under 2x"  # {{ratio}} is a template variable
28
+
29
+ defaults:
30
+ threshold: 2  # NOTE(review): no condition above references this (ratio_gt value is hard-coded) — verify how it is used