@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
@@ -0,0 +1,74 @@
1
+ id: quality-monitoring
2
+ name: Quality Preservation Monitoring
3
+ description: Monitor and maintain model quality during optimization deployments
4
+ category: monitoring
5
+ confidence: 0.93
6
+ success_count: 1890
7
+ verified_environments: 87
8
+ contributors:
9
+ - ml_quality_engineer
10
+ - monitoring_specialist
11
+ last_updated: "2024-12-23"
12
+
13
+ environment_match:
14
+ optimization_deployed: true
15
+ quality_requirements: high
16
+ production: true
17
+
18
+ optimization:
19
+ technique: quality_monitoring
20
+ expected_quality_retention: ">99%"
21
+ effort_estimate: "1-2 weeks"
22
+ risk_level: low
23
+
24
+ economics:
25
+ baseline_calculation:
26
+ quality_incident_cost: 10000
27
+ projected_improvement:
28
+ incident_prevention_rate: 0.9
29
+ implementation_cost:
30
+ engineering_hours: 60
31
+ total_cost: 12000
32
+
33
+ implementation:
34
+ prerequisites:
35
+ - requirement: "Ground truth data access"
36
+ - requirement: "Evaluation pipeline"
37
+ automated_steps:
38
+ - step_id: evaluation_setup
39
+ name: Evaluation Pipeline Setup
40
+ executable: true
41
+ commands:
42
+ - "python scripts/setup_evaluation_pipeline.py"
43
+ - "python scripts/configure_quality_metrics.py"
44
+ validation:
45
+ command: "python scripts/verify_evaluation.py"
46
+ success_criteria: "pipeline_functional"
47
+ - step_id: monitoring
48
+ name: Quality Monitoring
49
+ executable: true
50
+ commands:
51
+ - "python scripts/enable_continuous_evaluation.py --sample-rate 0.01"
52
+ - "python scripts/setup_quality_alerts.py"
53
+ validation:
54
+ command: "python scripts/test_quality_detection.py"
55
+ success_criteria: "detection_accuracy > 0.95"
56
+
57
+ monitoring:
58
+ key_metrics:
59
+ - metric: model_accuracy
60
+ target: ">baseline * 0.99"
61
+ alert_threshold: "<baseline * 0.95"
62
+ - metric: quality_drift_score
63
+ target: "<0.05"
64
+ alert_threshold: ">0.1"
65
+ rollback_triggers:
66
+ - condition: "model_accuracy < baseline * 0.93 for 10 minutes"
67
+ action: automatic_rollback
68
+
69
+ results:
70
+ recent_implementations:
71
+ - environment: classification_service
72
+ quality_incidents_before: 5
73
+ quality_incidents_after: 0
74
+ detection_time_reduction_percent: 85
@@ -0,0 +1,74 @@
1
+ id: realtime-budget-controls
2
+ name: Real-time Budget Controls
3
+ description: Implement real-time cost controls to prevent budget overruns
4
+ category: cost_optimization
5
+ confidence: 0.95
6
+ success_count: 2567
7
+ verified_environments: 124
8
+ contributors:
9
+ - finops_engineer
10
+ - platform_architect
11
+ last_updated: "2024-12-27"
12
+
13
+ environment_match:
14
+ monthly_budget: ">$10K"
15
+ budget_overrun_risk: high
16
+ cost_visibility: low
17
+
18
+ optimization:
19
+ technique: realtime_budget_enforcement
20
+ expected_cost_reduction: "10-30%"
21
+ effort_estimate: "1-2 weeks"
22
+ risk_level: low
23
+
24
+ economics:
25
+ baseline_calculation:
26
+ monthly_overrun_risk_percent: 20
27
+ projected_improvement:
28
+ budget_adherence: 0.99
29
+ implementation_cost:
30
+ engineering_hours: 60
31
+ total_cost: 12000
32
+
33
+ implementation:
34
+ prerequisites:
35
+ - requirement: "Cost tracking API access"
36
+ - requirement: "Alerting infrastructure"
37
+ automated_steps:
38
+ - step_id: tracking_setup
39
+ name: Cost Tracking Setup
40
+ executable: true
41
+ commands:
42
+ - "python scripts/setup_cost_tracking.py --granularity hourly"
43
+ - "python scripts/configure_cost_alerts.py --thresholds 50,75,90,100"
44
+ validation:
45
+ command: "python scripts/verify_tracking.py"
46
+ success_criteria: "tracking_active"
47
+ - step_id: controls_setup
48
+ name: Budget Controls
49
+ executable: true
50
+ commands:
51
+ - "python scripts/implement_rate_limiting.py --daily-limit auto"
52
+ - "python scripts/add_circuit_breaker.py --budget-threshold 95"
53
+ validation:
54
+ command: "python scripts/test_budget_controls.py"
55
+ success_criteria: "controls_functional"
56
+
57
+ monitoring:
58
+ key_metrics:
59
+ - metric: budget_utilization
60
+ target: "80-95%"
61
+ alert_threshold: ">100%"
62
+ - metric: cost_prediction_accuracy
63
+ target: ">0.9"
64
+ alert_threshold: "<0.7"
65
+ rollback_triggers:
66
+ - condition: "false_positive_rate > 10% for controls"
67
+ action: alert_and_tuning
68
+
69
+ results:
70
+ recent_implementations:
71
+ - environment: saas_platform
72
+ monthly_budget: 50000
73
+ previous_overruns: 3
74
+ post_implementation_overruns: 0
@@ -0,0 +1,74 @@
1
+ id: realtime-latency-optimization
2
+ name: Real-time Latency Optimization
3
+ description: Optimize inference for real-time applications with strict latency requirements
4
+ category: application_optimization
5
+ confidence: 0.90
6
+ success_count: 1456
7
+ verified_environments: 67
8
+ contributors:
9
+ - latency_specialist
10
+ - realtime_engineer
11
+ last_updated: "2025-01-05"
12
+
13
+ environment_match:
14
+ latency_requirement: "<50ms"
15
+ use_case:
16
+ - chat
17
+ - autocomplete
18
+ - real_time_translation
19
+
20
+ optimization:
21
+ technique: latency_optimization
22
+ expected_latency_improvement: "50-70%"
23
+ effort_estimate: "2-3 weeks"
24
+ risk_level: medium
25
+
26
+ economics:
27
+ implementation_cost:
28
+ engineering_hours: 120
29
+ total_cost: 24000
30
+
31
+ implementation:
32
+ prerequisites:
33
+ - requirement: "Profiling tools available"
34
+ - requirement: "Quantization support"
35
+ automated_steps:
36
+ - step_id: latency_profiling
37
+ name: Latency Profiling
38
+ executable: true
39
+ commands:
40
+ - "python scripts/profile_inference_latency.py"
41
+ - "python scripts/identify_latency_bottlenecks.py"
42
+ validation:
43
+ command: "python scripts/validate_profile.py"
44
+ success_criteria: "bottlenecks_identified"
45
+ - step_id: optimization_application
46
+ name: Apply Latency Optimizations
47
+ executable: true
48
+ commands:
49
+ - "python scripts/enable_speculative_decoding.py"
50
+ - "python scripts/optimize_batch_size.py --target-latency 40"
51
+ - "python scripts/enable_kv_cache_quantization.py"
52
+ validation:
53
+ command: "python scripts/benchmark_latency.py"
54
+ success_criteria: "p95_latency < 50ms"
55
+ rollback_command: "python scripts/revert_latency_config.py"
56
+
57
+ monitoring:
58
+ key_metrics:
59
+ - metric: latency_p50
60
+ target: "<30ms"
61
+ alert_threshold: ">40ms"
62
+ - metric: latency_p99
63
+ target: "<50ms"
64
+ alert_threshold: ">75ms"
65
+ rollback_triggers:
66
+ - condition: "latency_p99 > 100ms for 5 minutes"
67
+ action: automatic_rollback
68
+
69
+ results:
70
+ recent_implementations:
71
+ - environment: chatbot_api
72
+ baseline_p95_ms: 120
73
+ optimized_p95_ms: 42
74
+ latency_reduction_percent: 65
@@ -0,0 +1,78 @@
1
+ id: sglang-concurrency-optimization
2
+ name: SGLang High-Concurrency Optimization
3
+ description: Optimize SGLang deployment for high-concurrency structured generation workloads
4
+ category: runtime_optimization
5
+ confidence: 0.87
6
+ success_count: 567
7
+ verified_environments: 29
8
+ contributors:
9
+ - sglang_specialist
10
+ - concurrency_engineer
11
+ last_updated: "2025-01-02"
12
+
13
+ environment_match:
14
+ use_case: structured_generation
15
+ concurrency: ">100"
16
+ output_format:
17
+ - json
18
+ - structured
19
+
20
+ optimization:
21
+ technique: sglang_optimization
22
+ expected_throughput_improvement: "2-4x"
23
+ expected_cost_reduction: "50-70%"
24
+ effort_estimate: "1-2 weeks"
25
+ risk_level: low
26
+
27
+ economics:
28
+ projected_improvement:
29
+ throughput_multiplier: 3
30
+ cost_reduction_percent: 65
31
+ implementation_cost:
32
+ engineering_hours: 60
33
+ total_cost: 12000
34
+
35
+ implementation:
36
+ prerequisites:
37
+ - requirement: "SGLang installation"
38
+ validation_command: "python -c 'import sglang'"
39
+ - requirement: "Structured output requirements"
40
+ automated_steps:
41
+ - step_id: sglang_setup
42
+ name: SGLang Server Setup
43
+ executable: true
44
+ commands:
45
+ - "python scripts/setup_sglang.py --model ./model"
46
+ - "python scripts/configure_radix_cache.py"
47
+ validation:
48
+ command: "python scripts/test_sglang_server.py"
49
+ success_criteria: "server_healthy"
50
+ - step_id: concurrency_tuning
51
+ name: Concurrency Tuning
52
+ executable: true
53
+ commands:
54
+ - "python scripts/tune_concurrency.py --target-concurrent 200"
55
+ - "python scripts/enable_prefix_sharing.py"
56
+ validation:
57
+ command: "python scripts/benchmark_concurrency.py"
58
+ success_criteria: "concurrent_requests > 150"
59
+ rollback_command: "python scripts/revert_sglang_config.py"
60
+
61
+ monitoring:
62
+ key_metrics:
63
+ - metric: concurrent_requests
64
+ target: ">150"
65
+ alert_threshold: "<100"
66
+ - metric: structured_output_accuracy
67
+ target: ">0.99"
68
+ alert_threshold: "<0.95"
69
+ rollback_triggers:
70
+ - condition: "structured_output_accuracy < 0.9 for 5 minutes"
71
+ action: automatic_rollback
72
+
73
+ results:
74
+ recent_implementations:
75
+ - environment: api_generation_service
76
+ baseline_concurrency: 50
77
+ optimized_concurrency: 180
78
+ improvement_factor: 3.6
@@ -0,0 +1,96 @@
1
+ id: smart-model-routing
2
+ name: Intelligent Model Routing for Cost-Optimized Task Execution
3
+ description: Route different task types to appropriately-sized models instead of using premium models for everything
4
+ category: application_optimization
5
+ confidence: 0.92
6
+ success_count: 1567
7
+ verified_environments: 78
8
+ contributors:
9
+ - app_architect
10
+ - cost_optimizer
11
+ - routing_specialist
12
+ last_updated: "2025-01-16"
13
+
14
+ environment_match:
15
+ task_variety: mixed
16
+ model_usage: single_premium_model
17
+ monthly_api_cost: ">$20K"
18
+ task_complexity: variable
19
+
20
+ optimization:
21
+ technique: smart_model_routing
22
+ expected_cost_reduction: "60-80%"
23
+ expected_quality_retention: ">95%"
24
+ effort_estimate: "2-3 weeks"
25
+ risk_level: low
26
+
27
+ economics:
28
+ baseline_calculation:
29
+ premium_model_cost_per_token: 0.03
30
+ current_avg_tokens_per_task: 200
31
+ projected_improvement:
32
+ extraction_cost_per_token: 0.003
33
+ qa_cost_per_token: 0.01
34
+ generation_cost_per_token: 0.03
35
+ implementation_cost:
36
+ engineering_hours: 160
37
+ total_cost: 32000
38
+
39
+ implementation:
40
+ prerequisites:
41
+ - requirement: "Task classification capability"
42
+ validation_command: "python scripts/test_classifier.py --accuracy-threshold 0.95"
43
+ - requirement: "Multiple model access"
44
+ validation_command: "python scripts/test_model_access.py --models claude-haiku,gpt-4o-mini,gpt-4o"
45
+ - requirement: "Request routing infrastructure"
46
+ validation_command: "python scripts/test_routing.py"
47
+ automated_steps:
48
+ - step_id: task_classification_setup
49
+ name: Task Classification
50
+ executable: true
51
+ commands:
52
+ - "python scripts/setup_task_classifier.py --tasks extraction,qa,summarization,generation"
53
+ - "python scripts/train_routing_model.py --training-data task_examples.json --accuracy-target 0.95"
54
+ validation:
55
+ command: "python scripts/validate_classifier.py --test-data validation_tasks.json"
56
+ success_criteria: "accuracy > 0.95 AND precision > 0.93 AND recall > 0.93"
57
+ rollback_command: "python scripts/disable_classification.py"
58
+ - step_id: routing_configuration
59
+ name: Model Routing Logic
60
+ executable: true
61
+ commands:
62
+ - "python scripts/configure_model_routing.py --extraction claude-haiku --qa gpt-4o-mini --generation gpt-4o"
63
+ - "python scripts/implement_fallback_logic.py --quality-threshold 0.9 --fallback-model gpt-4o"
64
+ validation:
65
+ command: "python scripts/test_routing_logic.py --sample-tasks 100"
66
+ success_criteria: "routing_accuracy > 0.95 AND fallback_rate < 0.1"
67
+ rollback_command: "python scripts/revert_to_single_model.py"
68
+
69
+ monitoring:
70
+ key_metrics:
71
+ - metric: routing_accuracy
72
+ target: ">0.95"
73
+ alert_threshold: "<0.93"
74
+ - metric: cost_per_task
75
+ target: "<baseline * 0.4"
76
+ alert_threshold: ">baseline * 0.6"
77
+ - metric: task_quality_score
78
+ target: ">0.95"
79
+ alert_threshold: "<0.93"
80
+ - metric: fallback_rate
81
+ target: "<0.1"
82
+ alert_threshold: ">0.15"
83
+ rollback_triggers:
84
+ - condition: "routing_accuracy < 0.9 for 30 minutes"
85
+ action: automatic_rollback
86
+ - condition: "task_quality_score < 0.9 for 3 consecutive measurements"
87
+ action: automatic_rollback
88
+
89
+ results:
90
+ recent_implementations:
91
+ - environment: document_processing_saas
92
+ baseline_monthly_cost: 45000
93
+ optimized_monthly_cost: 12000
94
+ cost_reduction_percent: 73.3
95
+ quality_retention: 97.1
96
+ implementation_days: 16
@@ -0,0 +1,167 @@
1
+ id: streaming-batch-selection
2
+ name: Streaming vs Batch Pattern Selection Framework
3
+ description: Choose optimal API pattern based on latency, cost, and UX requirements
4
+ category: api_optimization
5
+ confidence: 0.90
6
+ success_count: 1456
7
+ verified_environments: 78
8
+ contributors:
9
+ - inference_squeeze
10
+ - ux_engineer
11
+ - api_architect
12
+ last_updated: "2025-01-20"
13
+ source: "Inference Squeeze Chapter 3 - Request Patterns"
14
+
15
+ environment_match:
16
+ application_type: mixed
17
+ latency_requirements: variable
18
+ user_experience: critical
19
+ cost_sensitivity: high
20
+
21
+ optimization:
22
+ technique: pattern_optimization
23
+ expected_ux_improvement: "30-50%"
24
+ expected_cost_optimization: "20-40%"
25
+ effort_estimate: "1 week"
26
+ risk_level: low
27
+
28
+ decision_framework:
29
+ use_streaming_when:
30
+ - "User-facing interactive applications"
31
+ - "First-token latency matters more than total latency"
32
+ - "Long responses (>500 tokens)"
33
+ - "User expects real-time feedback"
34
+ use_batch_when:
35
+ - "Background processing"
36
+ - "API offers batch pricing discount (OpenAI: 50%)"
37
+ - "Latency tolerance >24 hours"
38
+ - "High volume, consistent workloads"
39
+ use_sync_when:
40
+ - "Simple queries, short responses"
41
+ - "Strict latency SLAs"
42
+ - "Integration constraints require sync"
43
+
44
+ pattern_comparison:
45
+ streaming:
46
+ first_token_latency: "200-500ms"
47
+ total_latency: variable
48
+ cost_modifier: "1.0x"
49
+ ux_benefit: "High - perceived responsiveness"
50
+ implementation: "WebSocket or SSE"
51
+ synchronous:
52
+ first_token_latency: "N/A"
53
+ total_latency: "500-5000ms"
54
+ cost_modifier: "1.0x"
55
+ ux_benefit: "Medium - simple integration"
56
+ implementation: "REST API"
57
+ batch:
58
+ first_token_latency: "N/A"
59
+ total_latency: "minutes to 24 hours"
60
+ cost_modifier: "0.5x (OpenAI)"
61
+ ux_benefit: "Low - async only"
62
+ implementation: "Job queue + polling"
63
+
64
+ economics:
65
+ baseline_calculation:
66
+ monthly_requests: 100000
67
+ avg_cost_per_request: 0.05
68
+ monthly_cost: 5000
69
+ projected_improvement:
70
+ batch_eligible_percentage: 0.40
71
+ batch_discount: 0.50
72
+ streaming_improvement: 0.0
73
+ new_monthly_cost: 4000
74
+ monthly_savings: 1000
75
+ implementation_cost:
76
+ engineering_hours: 40
77
+ total_cost: 8000
78
+
79
+ implementation:
80
+ prerequisites:
81
+ - requirement: "WebSocket/SSE infrastructure"
82
+ validation: "Can handle streaming connections"
83
+ - requirement: "Job queue system"
84
+ validation: "Can process async batch jobs"
85
+ automated_steps:
86
+ - step_id: endpoint_audit
87
+ name: Audit Endpoint Requirements
88
+ executable: true
89
+ commands:
90
+ - "List all LLM-calling endpoints"
91
+ - "Categorize by latency requirement"
92
+ - "Identify batch-eligible workloads"
93
+ validation:
94
+ command: "Endpoint audit complete"
95
+ success_criteria: "all_endpoints_categorized"
96
+ rollback_command: "Continue with current patterns"
97
+ - step_id: pattern_assignment
98
+ name: Assign Patterns to Endpoints
99
+ executable: true
100
+ matrix:
101
+ real_time_chat: streaming
102
+ document_processing: batch
103
+ search_results: sync
104
+ bulk_analysis: batch
105
+ code_completion: streaming
106
+ content_moderation: sync_or_batch
107
+ validation:
108
+ command: "Validate pattern assignments"
109
+ success_criteria: "patterns_assigned AND no_conflicts"
110
+ rollback_command: "Revert to uniform pattern"
111
+ - step_id: streaming_implementation
112
+ name: Implement Streaming for Interactive Endpoints
113
+ executable: true
114
+ commands:
115
+ - "Add SSE/WebSocket support for chat endpoints"
116
+ - "Implement token-by-token rendering"
117
+ - "Handle connection lifecycle"
118
+ validation:
119
+ command: "Test streaming endpoints"
120
+ success_criteria: "first_token_latency < 500ms"
121
+ rollback_command: "Disable streaming"
122
+ - step_id: batch_implementation
123
+ name: Implement Batch for Background Workloads
124
+ executable: true
125
+ commands:
126
+ - "Queue async workloads for batch API"
127
+ - "Implement job status tracking"
128
+ - "Handle batch result retrieval"
129
+ validation:
130
+ command: "Test batch processing"
131
+ success_criteria: "batch_cost_savings > 40%"
132
+ rollback_command: "Revert to sync processing"
133
+
134
+ monitoring:
135
+ key_metrics:
136
+ - metric: first_token_latency_p50
137
+ target: "<300ms"
138
+ alert_threshold: ">500ms"
139
+ - metric: batch_utilization_rate
140
+ target: ">60%"
141
+ alert_threshold: "<40%"
142
+ - metric: pattern_cost_efficiency
143
+ target: ">0.7"
144
+ alert_threshold: "<0.5"
145
+ - metric: streaming_connection_success_rate
146
+ target: ">99%"
147
+ alert_threshold: "<95%"
148
+ rollback_triggers:
149
+ - condition: "first_token_latency_p50 > 1000ms for 15 minutes"
150
+ action: investigate_streaming_issues
151
+ - condition: "batch_utilization_rate < 30% for 1 hour"
152
+ action: review_batch_eligibility
153
+
154
+ results:
155
+ case_study:
156
+ environment: AI writing assistant
157
+ before:
158
+ all_sync: true
159
+ avg_time_to_first_word: "2.3s"
160
+ monthly_cost: 45000
161
+ after:
162
+ streaming_for_interactive: true
163
+ batch_for_background: true
164
+ avg_time_to_first_word: "0.4s"
165
+ monthly_cost: 31500
166
+ cost_reduction: "30%"
167
+ ux_improvement: "83% faster first response"
@@ -0,0 +1,75 @@
1
+ id: system-prompt-optimization
2
+ name: Redundant System Prompt Optimization
3
+ description: Reduce system prompt token costs through caching and optimization
4
+ category: cost_optimization
5
+ confidence: 0.91
6
+ success_count: 1890
7
+ verified_environments: 89
8
+ contributors:
9
+ - prompt_engineer
10
+ - token_optimizer
11
+ last_updated: "2024-12-30"
12
+
13
+ environment_match:
14
+ system_prompt_length: ">500 tokens"
15
+ request_volume: ">10K/day"
16
+ system_prompt_repetition: high
17
+
18
+ optimization:
19
+ technique: system_prompt_caching
20
+ expected_cost_reduction: "15-30%"
21
+ effort_estimate: "1-2 days"
22
+ risk_level: low
23
+
24
+ economics:
25
+ baseline_calculation:
26
+ system_tokens_per_request: 800
27
+ daily_requests: 50000
28
+ projected_improvement:
29
+ cached_token_savings_percent: 25
30
+ implementation_cost:
31
+ engineering_hours: 12
32
+ total_cost: 2400
33
+
34
+ implementation:
35
+ prerequisites:
36
+ - requirement: "Prompt caching support (Anthropic/OpenAI)"
37
+ - requirement: "Stable system prompt"
38
+ automated_steps:
39
+ - step_id: prompt_analysis
40
+ name: System Prompt Analysis
41
+ executable: true
42
+ commands:
43
+ - "python scripts/analyze_system_prompts.py"
44
+ - "python scripts/identify_cacheable_content.py"
45
+ validation:
46
+ command: "python scripts/validate_analysis.py"
47
+ success_criteria: "cacheable_tokens > 400"
48
+ - step_id: caching_setup
49
+ name: Prompt Caching Setup
50
+ executable: true
51
+ commands:
52
+ - "python scripts/enable_prompt_caching.py"
53
+ - "python scripts/optimize_prompt_structure.py"
54
+ validation:
55
+ command: "python scripts/verify_caching.py"
56
+ success_criteria: "cache_hit_rate > 0.9"
57
+
58
+ monitoring:
59
+ key_metrics:
60
+ - metric: cache_hit_rate
61
+ target: ">0.95"
62
+ alert_threshold: "<0.8"
63
+ - metric: input_token_cost
64
+ target: "<baseline * 0.75"
65
+ alert_threshold: ">baseline * 0.9"
66
+ rollback_triggers:
67
+ - condition: "cache_hit_rate < 0.5 for 10 minutes"
68
+ action: alert_and_investigation
69
+
70
+ results:
71
+ recent_implementations:
72
+ - environment: chatbot_service
73
+ baseline_token_cost_daily: 500
74
+ optimized_token_cost_daily: 375
75
+ cost_reduction_percent: 25