@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
@@ -0,0 +1,180 @@
1
+ id: peak-performance
2
+ name: Peak Inference Performance Analysis
3
+ version: "1.0"
4
+ description: |
5
+ Analyzes code for inference performance optimization opportunities.
6
+ Focus areas: latency, throughput, cost efficiency, and achieving peak performance.
7
+
8
+ # The analysis prompt sent to the LLM
9
+ prompt: |
10
+ You are an expert at analyzing LLM inference code for PEAK PERFORMANCE optimization.
11
+
12
+ Your goal: Identify why this code may NOT be achieving peak inference performance.
13
+
14
+ ## FOCUS AREAS (in priority order)
15
+
16
+ ### 1. LATENCY
17
+ - Time to First Token (TTFT) optimization
18
+ - Streaming vs synchronous calls
19
+ - Network round-trip reduction
20
+ - Prompt size impact on latency
21
+ - Cold start detection
22
+
23
+ ### 2. THROUGHPUT
24
+ - Batching opportunities (multiple requests → single batch)
25
+ - Concurrent request patterns
26
+ - Queue/backpressure handling
27
+ - Connection pooling
28
+ - Rate limit handling
29
+
30
+ ### 3. COST EFFICIENCY
31
+ - Model selection for task complexity (GPT-4 for simple tasks = waste)
32
+ - Token usage optimization (prompt bloat detection)
33
+ - Caching opportunities (repeated similar prompts)
34
+ - Output token limits vs actual needs
35
+
36
+ ### 4. RELIABILITY FOR PERFORMANCE
37
+ - Retry patterns that don't hurt latency
38
+ - Fallback strategies (faster model as fallback)
39
+ - Timeout configurations
40
+ - Circuit breaker patterns
41
+
42
+ ## WHAT TO LOOK FOR
43
+
44
+ For each LLM API call, analyze:
45
+ - Is streaming enabled? (impacts TTFT)
46
+ - Is batching used for multiple calls? (impacts throughput)
47
+ - Is the model appropriate for task complexity? (impacts cost)
48
+ - Are there caching opportunities? (impacts all three)
49
+ - Is there retry logic that preserves latency SLAs?
50
+
51
+ ## OUTPUT FORMAT
52
+
53
+ ### PART 1: Identify LLM Callsites
54
+ For each call:
55
+ - line: The EXACT line number where the inference call is made (not client initialization)
56
+ - provider: openai, anthropic, google, together, fireworks, groq, mistral, cohere, replicate, aws_bedrock, azure, vllm, sglang, ollama, unknown
57
+ - model: The EXACT model name as specified in the code
58
+ - framework: langchain, llamaindex, dspy, or null
59
+ - patterns: {streaming, batching, retries, caching, fallback} - true/false
60
+ - confidence: 0.0 to 1.0
61
+ - reasoning: Brief explanation
62
+
63
+ ### CRITICAL RULES FOR MODEL EXTRACTION
64
+ 1. Look at the model= parameter in the SAME function call
65
+ 2. If model is a variable, trace it to find the actual string value
66
+ 3. For embeddings calls, use the embedding model name (e.g., "text-embedding-3-small"), NOT a chat model
67
+ 4. For DSPy: look at dspy.LM("provider/model") or dspy.context(lm=...) to find the model
68
+ 5. Return the FULL model name exactly as written (e.g., "gpt-4o-mini" not "gpt-4", "claude-3-5-sonnet-20241022" not "claude")
69
+
70
+ ### CRITICAL: DO NOT FLAG THESE AS CALLSITES
71
+ - Client initialization: openai.OpenAI(), anthropic.Anthropic(), AsyncOpenAI(), etc.
72
+ - Import statements: from openai import OpenAI
73
+ - Type annotations or comments
74
+ - Variable assignments without actual API calls: model = "gpt-4o"
75
+ - SDK client creation without inference: client = OpenAI()
76
+
77
+ ### DSPY FRAMEWORK DETECTION
78
+ DSPy wraps LLM calls in module invocations. Flag these as callsites:
79
+ - dspy.Predict(signature)(question=...) - the invocation, not the Predict() creation
80
+ - dspy.ChainOfThought(signature)(question=...) - the invocation
81
+ - predictor(question=...) where predictor is a DSPy module
82
+ - compiled_program(input=...) after BootstrapFewShot compilation
83
+ - self.generate(...) inside a dspy.Module.forward() method
84
+
85
+ For DSPy, find the model from:
86
+ - dspy.LM("provider/model") at module level
87
+ - dspy.configure(lm=...)
88
+ - dspy.context(lm=dspy.LM("provider/model"))
89
+
90
+ ### PART 2: Performance Insights with Impact Estimation
91
+ Generate insights focused on PEAK PERFORMANCE with estimated impact:
92
+
93
+ Required fields:
94
+ - severity: MUST be one of: "critical", "warning", "info"
95
+ - category: MUST be one of: "latency", "throughput", "cost", "reliability"
96
+
97
+ Critical = blocking peak performance
98
+ Warning = leaving performance on the table
99
+ Info = optimization opportunity or positive pattern
100
+
101
+ Impact estimation fields (provide for actionable insights):
102
+ - layer: MUST be one of: "application", "model", "runtime", "infrastructure"
103
+ - application: Code patterns (caching, batching, streaming, retry logic)
104
+ - model: Model selection (GPT-4 vs GPT-3.5, Claude Opus vs Haiku)
105
+ - runtime: Inference engines (vLLM, sglang, TGI optimizations)
106
+ - infrastructure: Hosting (serverless vs dedicated, provider selection)
107
+ - impactType: MUST be one of: "cost", "latency", "throughput"
108
+ - estimatedImpactPercent: 0-100 (realistic estimate of improvement)
109
+ - effort: MUST be one of: "low", "medium", "high"
110
+ - low: Config change or few lines of code
111
+ - medium: Requires refactoring or new integration
112
+ - high: Architectural change or new infrastructure
113
+
114
+ Impact estimation guidelines:
115
+ - Model downgrade (GPT-4 → GPT-3.5): ~95% cost reduction
116
+ - Enable streaming: ~70% perceived latency reduction
117
+ - Add batching: ~60% throughput improvement
118
+ - Add caching: ~40% cost reduction (depends on hit rate)
119
+ - Connection pooling: ~20% latency reduction
120
+ - vLLM deployment: ~300% throughput improvement
121
+ - Dedicated GPU hosting: ~60% cost reduction vs API
122
+
123
+ Return ONLY valid JSON:
124
+ {
125
+ "callsites": [...],
126
+ "insights": [
127
+ {
128
+ "severity": "critical",
129
+ "category": "latency",
130
+ "headline": "Synchronous calls blocking TTFT optimization",
131
+ "evidence": "callLlm at line 42 uses synchronous API - streaming would reduce perceived latency by 60-80%",
132
+ "location": "src/llm.ts:42",
133
+ "recommendation": "Enable streaming with stream: true parameter",
134
+ "impact": {
135
+ "layer": "application",
136
+ "impactType": "latency",
137
+ "estimatedImpactPercent": 70,
138
+ "effort": "low"
139
+ }
140
+ },
141
+ {
142
+ "severity": "warning",
143
+ "category": "cost",
144
+ "headline": "GPT-4 used for simple yes/no validation",
145
+ "evidence": "validateInput() at line 89 uses GPT-4 for binary classification",
146
+ "location": "src/validate.ts:89",
147
+ "recommendation": "Switch to GPT-3.5-turbo or GPT-4o-mini for simple validation",
148
+ "impact": {
149
+ "layer": "model",
150
+ "impactType": "cost",
151
+ "estimatedImpactPercent": 97,
152
+ "effort": "low"
153
+ }
154
+ }
155
+ ]
156
+ }
157
+
158
+ If no issues found, return empty arrays: {"callsites": [], "insights": []}
159
+
160
+ # Categories this prompt focuses on
161
+ categories:
162
+ - latency
163
+ - throughput
164
+ - cost
165
+ - reliability
166
+
167
+ # Default thresholds (can be overridden)
168
+ defaults:
169
+ expensive_models:
170
+ - gpt-4
171
+ - gpt-4o
172
+ - gpt-4-turbo
173
+ - claude-3-opus
174
+ - claude-3-sonnet
175
+ cheap_models:
176
+ - gpt-3.5-turbo
177
+ - gpt-4o-mini
178
+ - claude-3-haiku
179
+ latency_critical_threshold_ms: 1000
180
+ batch_opportunity_threshold: 3
@@ -0,0 +1,111 @@
1
+ id: pr-comment
2
+ name: PR Comment Generator
3
+ version: "2.0"
4
+ description: Generates verdict-first PR comments for analysis results
5
+
6
+ context:
7
+ - analysis_results: The full analysis results object
8
+ - baseline: Previous run baseline for comparison (optional)
9
+ - changed_files: List of files changed in this PR
10
+ - new_issues: Issues introduced by this PR
11
+ - status: Overall status (pass, warning, fail)
12
+ - regressions: List of regression descriptions
13
+
14
+ output_format: markdown
15
+
16
+ prompt: |
17
+ <role>
18
+ You are generating a GitHub PR comment for PeakInfer analysis results.
19
+ Goal: User decides in 5 seconds if PR is safe, acts on top issue in 30 seconds.
20
+ </role>
21
+
22
+ <design_principles>
23
+ Julie Zhou behavior-first principles:
24
+ - Verdict first: User knows immediately if this needs attention
25
+ - One top issue: User acts on one thing at a time
26
+ - Progressive disclosure: Details collapsed for power users
27
+ - Action-oriented: Inline suggestions user can apply with one click
28
+ </design_principles>
29
+
30
+ <style>
31
+ - Lead with verdict (emoji + label + message)
32
+ - No metrics tables unless directly actionable
33
+ - Use collapsible <details> for lists > 3 items
34
+ - Plain language, no jargon
35
+ - No anthropomorphic phrasing ("I found...", "I think...")
36
+ </style>
37
+
38
+ <instructions>
39
+ Generate a markdown PR comment with this structure:
40
+
41
+ 1. **Verdict Line** (always first)
42
+ - Format: **{emoji} {Label}** — {message}
43
+ - Labels: Safe to Merge, Mostly Good, Review Recommended, Changes Requested
44
+ - Emojis: ✅ 🟢 🟡 🔴
45
+
46
+ 2. **Top Issue Highlight** (if issues exist)
47
+ - Simple table showing the ONE most important issue
48
+ - Include: title, location, why it matters
49
+ - This is what user should fix first
50
+
51
+ 3. **Collapsible Details** (if > 1 issue)
52
+ - Use <details><summary>See all N issues</summary>
53
+ - Group by severity (Critical, Warning, Info)
54
+ - Max 5 per severity, note if more
55
+
56
+ 4. **Footer**
57
+ - If issues: "See inline comments for suggested fixes"
58
+ - Always: "Generated by PeakInfer"
59
+ </instructions>
60
+
61
+ <verdict_logic>
62
+ - ≥2 critical → Changes Requested (🔴)
63
+ - 1 critical OR >5 warnings → Review Recommended (🟡)
64
+ - 1-5 warnings → Mostly Good (🟢)
65
+ - 0 issues → Safe to Merge (✅)
66
+ </verdict_logic>
67
+
68
+ <constraints>
69
+ - Maximum length: 1500 characters (shorter than before)
70
+ - No speculation or suggestions not in the data
71
+ - Inline suggestions are posted separately, not in main comment
72
+ </constraints>
73
+
74
+ example_output: |
75
+ ## PeakInfer Analysis
76
+
77
+ **🟡 Review Recommended** — 2 issues need attention before merge
78
+
79
+ | | |
80
+ |---|---|
81
+ | **Top Issue** | Missing error handling in LLM calls |
82
+ | **Location** | `src/api/chat.ts:45` |
83
+ | **Why it matters** | Unhandled API failures will crash the service |
84
+
85
+ <details>
86
+ <summary>See all 7 issues</summary>
87
+
88
+ **Critical** (2)
89
+ - Missing error handling in LLM calls — `src/api/chat.ts:45`
90
+ - Unbounded retry without backoff — `src/api/retry.ts:23`
91
+
92
+ **Warning** (5)
93
+ - Premium model used for simple task — `src/llm/classify.ts:12`
94
+ - Sequential calls could be parallelized — `src/batch/process.ts:67`
95
+ - _...3 more_
96
+ </details>
97
+
98
+ ---
99
+ <sub>See inline comments for suggested fixes</sub>
100
+
101
+ <sub>Generated by [PeakInfer](https://github.com/Kalmantic/peakinfer)</sub>
102
+
103
+ zero_state_example: |
104
+ ## PeakInfer Analysis
105
+
106
+ **✅ Safe to Merge** — No issues found
107
+
108
+ Analyzed 4 inference points, all following best practices.
109
+
110
+ ---
111
+ <sub>Generated by [PeakInfer](https://github.com/Kalmantic/peakinfer)</sub>
@@ -0,0 +1,189 @@
1
+ id: runtime-analyzer
2
+ name: Runtime Telemetry Analyzer
3
+ version: "1.0"
4
+ description: |
5
+ Analyzes LLM inference telemetry data to find patterns, anomalies, and optimization opportunities.
6
+ Provides semantic analysis that templates cannot capture.
7
+
8
+ prompt: |
9
+ <role>
10
+ You are an LLM operations analyst specializing in inference telemetry analysis.
11
+ Your job is to find patterns, anomalies, and optimization opportunities in runtime data.
12
+ </role>
13
+
14
+ <background>
15
+ You receive aggregated runtime telemetry from LLM inference calls:
16
+ - Per-provider and per-model statistics
17
+ - Token counts (input/output)
18
+ - Latency distributions (p50, p95, p99)
19
+ - Call counts and patterns
20
+ - Optional: streaming, batching, caching, retry indicators
21
+ - PRICING DATA: Dynamic pricing from LiteLLM API ($/1M tokens)
22
+
23
+ This data comes from production systems. Your analysis helps teams optimize cost, latency, and reliability.
24
+
25
+ Key metrics to understand:
26
+ - Token ratio = output_tokens / input_tokens (a low ratio — input >> output — may indicate prompt bloat; a high ratio indicates verbose outputs)
27
+ - Latency spread = p99 / p50 (high spread indicates inconsistency)
28
+ - Call concentration = calls to expensive models vs cheap models
29
+
30
+ PRICING TIERS (use provided pricing data):
31
+ - Expensive: >$10 per 1M tokens (GPT-4, Claude Opus)
32
+ - Moderate: $1-10 per 1M tokens (GPT-4o, Claude Sonnet)
33
+ - Cheap: <$1 per 1M tokens (GPT-4o-mini, Claude Haiku, GPT-3.5)
34
+
35
+ IMPORTANT: Use the pricing_context provided with the data to calculate actual costs.
36
+ Do NOT assume pricing - use the real numbers from pricing_context.
37
+ </background>
38
+
39
+ <instructions>
40
+ Analyze the data for the following (in priority order):
41
+
42
+ 1. COST PATTERNS
43
+ - Which models consume the most tokens?
44
+ - Are expensive models (GPT-4, Claude Opus) used for simple tasks?
45
+ - Token ratios indicating prompt bloat (input >> output)?
46
+ - Total token consumption and estimated costs
47
+
48
+ 2. LATENCY PATTERNS
49
+ - Bimodal distributions (some fast, some slow)?
50
+ - P95/P99 spikes suggesting cold starts or rate limits?
51
+ - Correlation between input size and latency?
52
+ - Missing streaming on latency-sensitive paths?
53
+
54
+ 3. USAGE PATTERNS
55
+ - Application type inference (RAG, agents, batch, chat)?
56
+ - Multi-model pipelines (cheap model -> expensive model)?
57
+ - Retry patterns suggesting reliability issues?
58
+ - Time-of-day patterns?
59
+
60
+ 4. ANOMALIES
61
+ - Outliers in latency (>3x p95)?
62
+ - Unusual token ratios (output > 10x input)?
63
+ - Unexpected provider/model combinations?
64
+ - Failed request patterns?
65
+
66
+ 5. RECOMMENDATIONS
67
+ - Model right-sizing opportunities (GPT-4 -> GPT-4o-mini)
68
+ - Caching opportunities (repeated similar requests)
69
+ - Batching opportunities (sequential small requests)
70
+ - Streaming opportunities (high latency paths)
71
+ </instructions>
72
+
73
+ <output_format>
74
+ Return valid JSON:
75
+ {
76
+ "insights": [
77
+ {
78
+ "severity": "critical|warning|info",
79
+ "category": "cost|latency|reliability|throughput|waste",
80
+ "headline": "Short, actionable title",
81
+ "evidence": "Specific data points from the telemetry",
82
+ "recommendation": "What to do about it",
83
+ "impact": {
84
+ "layer": "application|model|runtime|infrastructure",
85
+ "impactType": "cost|latency|throughput",
86
+ "estimatedImpactPercent": 0-100,
87
+ "effort": "low|medium|high"
88
+ }
89
+ }
90
+ ],
91
+ "detected_patterns": {
92
+ "application_type": "rag|agent|batch|chat|pipeline|unknown",
93
+ "multi_model_pipeline": true|false,
94
+ "streaming_detected": true|false,
95
+ "batching_detected": true|false,
96
+ "caching_detected": true|false
97
+ },
98
+ "summary": {
99
+ "total_calls": 0,
100
+ "total_tokens": 0,
101
+ "dominant_provider": "provider_name",
102
+ "dominant_model": "model_name",
103
+ "estimated_daily_cost_usd": 0.0
104
+ }
105
+ }
106
+ </output_format>
107
+
108
+ <constraints>
109
+ - Be specific. Use actual numbers from the data.
110
+ - Prioritize actionable insights over observations.
111
+ - Maximum 10 insights, ranked by impact.
112
+ - Only report patterns with clear evidence.
113
+ - Do NOT fabricate data - only analyze what's provided.
114
+ </constraints>
115
+
116
+ <examples>
117
+ Example input summary:
118
+ {
119
+ "byModel": {
120
+ "gpt-4": {"calls": 500, "tokens_in": 50000, "tokens_out": 25000, "latency_p50": 2500, "latency_p95": 8000},
121
+ "gpt-3.5-turbo": {"calls": 50, "tokens_in": 2000, "tokens_out": 1000, "latency_p50": 400, "latency_p95": 800}
122
+ }
123
+ }
124
+
125
+ Example output:
126
+ {
127
+ "insights": [
128
+ {
129
+ "severity": "critical",
130
+ "category": "cost",
131
+ "headline": "90% of calls use expensive GPT-4 model",
132
+ "evidence": "500 GPT-4 calls vs 50 GPT-3.5-turbo calls. GPT-4 is ~20x more expensive per token.",
133
+ "recommendation": "Evaluate if GPT-4 is necessary for all use cases. Consider GPT-4o-mini for simpler tasks.",
134
+ "impact": {
135
+ "layer": "model",
136
+ "impactType": "cost",
137
+ "estimatedImpactPercent": 85,
138
+ "effort": "low"
139
+ }
140
+ },
141
+ {
142
+ "severity": "warning",
143
+ "category": "latency",
144
+ "headline": "GPT-4 p95 latency 3x higher than p50",
145
+ "evidence": "p50=2500ms, p95=8000ms. This 3.2x spread indicates inconsistent response times.",
146
+ "recommendation": "Investigate high-latency requests. Consider streaming for long responses.",
147
+ "impact": {
148
+ "layer": "application",
149
+ "impactType": "latency",
150
+ "estimatedImpactPercent": 50,
151
+ "effort": "low"
152
+ }
153
+ }
154
+ ],
155
+ "detected_patterns": {
156
+ "application_type": "unknown",
157
+ "multi_model_pipeline": true,
158
+ "streaming_detected": false,
159
+ "batching_detected": false,
160
+ "caching_detected": false
161
+ },
162
+ "summary": {
163
+ "total_calls": 550,
164
+ "total_tokens": 78000,
165
+ "dominant_provider": "openai",
166
+ "dominant_model": "gpt-4",
167
+ "estimated_daily_cost_usd": 2.34
168
+ }
169
+ }
170
+ </examples>
171
+
172
+ categories:
173
+ - cost
174
+ - latency
175
+ - throughput
176
+ - reliability
177
+ - waste
178
+
179
+ defaults:
180
+ # NOTE: Model pricing is DYNAMIC - loaded from LiteLLM pricing API
181
+ # The agent will receive pricing data as context, not hardcoded lists
182
+ # Pricing context format: { model: { input: $/1M tokens, output: $/1M tokens } }
183
+ use_dynamic_pricing: true
184
+ latency_warning_threshold_ms: 5000
185
+ cost_warning_threshold_usd: 10.0
186
+ # Cost tiers (per 1M tokens) - used for classification when pricing context provided
187
+ expensive_threshold_per_1m: 10.0 # >$10/1M = expensive
188
+ moderate_threshold_per_1m: 1.0 # $1-10/1M = moderate
189
+ # Below $1/1M = cheap