@peakinfer/cli 1.0.133
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +8 -0
- package/.env.example +6 -0
- package/.github/workflows/peakinfer.yml +64 -0
- package/CHANGELOG.md +31 -0
- package/LICENSE +190 -0
- package/README.md +335 -0
- package/data/inferencemax.json +274 -0
- package/dist/agent-analyzer.d.ts +45 -0
- package/dist/agent-analyzer.d.ts.map +1 -0
- package/dist/agent-analyzer.js +374 -0
- package/dist/agent-analyzer.js.map +1 -0
- package/dist/agent.d.ts +76 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +965 -0
- package/dist/agent.js.map +1 -0
- package/dist/agents/correlation-analyzer.d.ts +34 -0
- package/dist/agents/correlation-analyzer.d.ts.map +1 -0
- package/dist/agents/correlation-analyzer.js +261 -0
- package/dist/agents/correlation-analyzer.js.map +1 -0
- package/dist/agents/index.d.ts +91 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +111 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/runtime-analyzer.d.ts +38 -0
- package/dist/agents/runtime-analyzer.d.ts.map +1 -0
- package/dist/agents/runtime-analyzer.js +244 -0
- package/dist/agents/runtime-analyzer.js.map +1 -0
- package/dist/analysis-types.d.ts +500 -0
- package/dist/analysis-types.d.ts.map +1 -0
- package/dist/analysis-types.js +11 -0
- package/dist/analysis-types.js.map +1 -0
- package/dist/analytics.d.ts +25 -0
- package/dist/analytics.d.ts.map +1 -0
- package/dist/analytics.js +94 -0
- package/dist/analytics.js.map +1 -0
- package/dist/analyzer.d.ts +48 -0
- package/dist/analyzer.d.ts.map +1 -0
- package/dist/analyzer.js +547 -0
- package/dist/analyzer.js.map +1 -0
- package/dist/artifacts.d.ts +44 -0
- package/dist/artifacts.d.ts.map +1 -0
- package/dist/artifacts.js +165 -0
- package/dist/artifacts.js.map +1 -0
- package/dist/benchmarks/index.d.ts +88 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +205 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +427 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/ci.d.ts +19 -0
- package/dist/commands/ci.d.ts.map +1 -0
- package/dist/commands/ci.js +253 -0
- package/dist/commands/ci.js.map +1 -0
- package/dist/commands/config.d.ts +16 -0
- package/dist/commands/config.d.ts.map +1 -0
- package/dist/commands/config.js +249 -0
- package/dist/commands/config.js.map +1 -0
- package/dist/commands/demo.d.ts +15 -0
- package/dist/commands/demo.d.ts.map +1 -0
- package/dist/commands/demo.js +106 -0
- package/dist/commands/demo.js.map +1 -0
- package/dist/commands/export.d.ts +14 -0
- package/dist/commands/export.d.ts.map +1 -0
- package/dist/commands/export.js +209 -0
- package/dist/commands/export.js.map +1 -0
- package/dist/commands/history.d.ts +15 -0
- package/dist/commands/history.d.ts.map +1 -0
- package/dist/commands/history.js +389 -0
- package/dist/commands/history.js.map +1 -0
- package/dist/commands/template.d.ts +14 -0
- package/dist/commands/template.d.ts.map +1 -0
- package/dist/commands/template.js +341 -0
- package/dist/commands/template.js.map +1 -0
- package/dist/commands/validate-map.d.ts +12 -0
- package/dist/commands/validate-map.d.ts.map +1 -0
- package/dist/commands/validate-map.js +274 -0
- package/dist/commands/validate-map.js.map +1 -0
- package/dist/commands/whatif.d.ts +17 -0
- package/dist/commands/whatif.d.ts.map +1 -0
- package/dist/commands/whatif.js +206 -0
- package/dist/commands/whatif.js.map +1 -0
- package/dist/comparison.d.ts +38 -0
- package/dist/comparison.d.ts.map +1 -0
- package/dist/comparison.js +223 -0
- package/dist/comparison.js.map +1 -0
- package/dist/config.d.ts +42 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +158 -0
- package/dist/config.js.map +1 -0
- package/dist/connectors/helicone.d.ts +9 -0
- package/dist/connectors/helicone.d.ts.map +1 -0
- package/dist/connectors/helicone.js +106 -0
- package/dist/connectors/helicone.js.map +1 -0
- package/dist/connectors/index.d.ts +37 -0
- package/dist/connectors/index.d.ts.map +1 -0
- package/dist/connectors/index.js +65 -0
- package/dist/connectors/index.js.map +1 -0
- package/dist/connectors/langsmith.d.ts +9 -0
- package/dist/connectors/langsmith.d.ts.map +1 -0
- package/dist/connectors/langsmith.js +122 -0
- package/dist/connectors/langsmith.js.map +1 -0
- package/dist/connectors/types.d.ts +83 -0
- package/dist/connectors/types.d.ts.map +1 -0
- package/dist/connectors/types.js +98 -0
- package/dist/connectors/types.js.map +1 -0
- package/dist/cost-estimator.d.ts +46 -0
- package/dist/cost-estimator.d.ts.map +1 -0
- package/dist/cost-estimator.js +104 -0
- package/dist/cost-estimator.js.map +1 -0
- package/dist/costs.d.ts +57 -0
- package/dist/costs.d.ts.map +1 -0
- package/dist/costs.js +251 -0
- package/dist/costs.js.map +1 -0
- package/dist/counterfactuals.d.ts +29 -0
- package/dist/counterfactuals.d.ts.map +1 -0
- package/dist/counterfactuals.js +448 -0
- package/dist/counterfactuals.js.map +1 -0
- package/dist/enhancement-prompts.d.ts +41 -0
- package/dist/enhancement-prompts.d.ts.map +1 -0
- package/dist/enhancement-prompts.js +88 -0
- package/dist/enhancement-prompts.js.map +1 -0
- package/dist/envelopes.d.ts +20 -0
- package/dist/envelopes.d.ts.map +1 -0
- package/dist/envelopes.js +790 -0
- package/dist/envelopes.js.map +1 -0
- package/dist/format-normalizer.d.ts +71 -0
- package/dist/format-normalizer.d.ts.map +1 -0
- package/dist/format-normalizer.js +1331 -0
- package/dist/format-normalizer.js.map +1 -0
- package/dist/history.d.ts +79 -0
- package/dist/history.d.ts.map +1 -0
- package/dist/history.js +313 -0
- package/dist/history.js.map +1 -0
- package/dist/html.d.ts +11 -0
- package/dist/html.d.ts.map +1 -0
- package/dist/html.js +463 -0
- package/dist/html.js.map +1 -0
- package/dist/impact.d.ts +42 -0
- package/dist/impact.d.ts.map +1 -0
- package/dist/impact.js +443 -0
- package/dist/impact.js.map +1 -0
- package/dist/index.d.ts +26 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +34 -0
- package/dist/index.js.map +1 -0
- package/dist/insights.d.ts +5 -0
- package/dist/insights.d.ts.map +1 -0
- package/dist/insights.js +271 -0
- package/dist/insights.js.map +1 -0
- package/dist/joiner.d.ts +9 -0
- package/dist/joiner.d.ts.map +1 -0
- package/dist/joiner.js +247 -0
- package/dist/joiner.js.map +1 -0
- package/dist/orchestrator.d.ts +34 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +827 -0
- package/dist/orchestrator.js.map +1 -0
- package/dist/pdf.d.ts +26 -0
- package/dist/pdf.d.ts.map +1 -0
- package/dist/pdf.js +84 -0
- package/dist/pdf.js.map +1 -0
- package/dist/prediction.d.ts +33 -0
- package/dist/prediction.d.ts.map +1 -0
- package/dist/prediction.js +316 -0
- package/dist/prediction.js.map +1 -0
- package/dist/prompts/loader.d.ts +38 -0
- package/dist/prompts/loader.d.ts.map +1 -0
- package/dist/prompts/loader.js +60 -0
- package/dist/prompts/loader.js.map +1 -0
- package/dist/renderer.d.ts +64 -0
- package/dist/renderer.d.ts.map +1 -0
- package/dist/renderer.js +923 -0
- package/dist/renderer.js.map +1 -0
- package/dist/runid.d.ts +57 -0
- package/dist/runid.d.ts.map +1 -0
- package/dist/runid.js +199 -0
- package/dist/runid.js.map +1 -0
- package/dist/runtime.d.ts +29 -0
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +366 -0
- package/dist/runtime.js.map +1 -0
- package/dist/scanner.d.ts +11 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +426 -0
- package/dist/scanner.js.map +1 -0
- package/dist/templates.d.ts +120 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +429 -0
- package/dist/templates.js.map +1 -0
- package/dist/tools/index.d.ts +153 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +177 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/types.d.ts +3647 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +703 -0
- package/dist/types.js.map +1 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +23 -0
- package/dist/version.js.map +1 -0
- package/docs/demo-guide.md +423 -0
- package/docs/events-format.md +295 -0
- package/docs/inferencemap-spec.md +344 -0
- package/docs/migration-v2.md +293 -0
- package/fixtures/demo/precomputed.json +142 -0
- package/fixtures/demo-project/README.md +52 -0
- package/fixtures/demo-project/ai-service.ts +65 -0
- package/fixtures/demo-project/sample-events.jsonl +15 -0
- package/fixtures/demo-project/src/ai-service.ts +128 -0
- package/fixtures/demo-project/src/llm-client.ts +155 -0
- package/package.json +65 -0
- package/prompts/agent-analyzer.yaml +47 -0
- package/prompts/ci-gate.yaml +98 -0
- package/prompts/correlation-analyzer.yaml +178 -0
- package/prompts/format-normalizer.yaml +46 -0
- package/prompts/peak-performance.yaml +180 -0
- package/prompts/pr-comment.yaml +111 -0
- package/prompts/runtime-analyzer.yaml +189 -0
- package/prompts/unified-analyzer.yaml +241 -0
- package/schemas/inference-map.v0.1.json +215 -0
- package/scripts/benchmark.ts +394 -0
- package/scripts/demo-v1.5.sh +158 -0
- package/scripts/sync-from-site.sh +197 -0
- package/scripts/validate-sync.sh +178 -0
- package/src/agent-analyzer.ts +481 -0
- package/src/agent.ts +1232 -0
- package/src/agents/correlation-analyzer.ts +353 -0
- package/src/agents/index.ts +235 -0
- package/src/agents/runtime-analyzer.ts +343 -0
- package/src/analysis-types.ts +558 -0
- package/src/analytics.ts +100 -0
- package/src/analyzer.ts +692 -0
- package/src/artifacts.ts +218 -0
- package/src/benchmarks/index.ts +309 -0
- package/src/cli.ts +503 -0
- package/src/commands/ci.ts +336 -0
- package/src/commands/config.ts +288 -0
- package/src/commands/demo.ts +175 -0
- package/src/commands/export.ts +297 -0
- package/src/commands/history.ts +425 -0
- package/src/commands/template.ts +385 -0
- package/src/commands/validate-map.ts +324 -0
- package/src/commands/whatif.ts +272 -0
- package/src/comparison.ts +283 -0
- package/src/config.ts +188 -0
- package/src/connectors/helicone.ts +164 -0
- package/src/connectors/index.ts +93 -0
- package/src/connectors/langsmith.ts +179 -0
- package/src/connectors/types.ts +180 -0
- package/src/cost-estimator.ts +146 -0
- package/src/costs.ts +347 -0
- package/src/counterfactuals.ts +516 -0
- package/src/enhancement-prompts.ts +118 -0
- package/src/envelopes.ts +814 -0
- package/src/format-normalizer.ts +1486 -0
- package/src/history.ts +400 -0
- package/src/html.ts +512 -0
- package/src/impact.ts +522 -0
- package/src/index.ts +83 -0
- package/src/insights.ts +341 -0
- package/src/joiner.ts +289 -0
- package/src/orchestrator.ts +1015 -0
- package/src/pdf.ts +110 -0
- package/src/prediction.ts +392 -0
- package/src/prompts/loader.ts +88 -0
- package/src/renderer.ts +1045 -0
- package/src/runid.ts +261 -0
- package/src/runtime.ts +450 -0
- package/src/scanner.ts +508 -0
- package/src/templates.ts +561 -0
- package/src/tools/index.ts +214 -0
- package/src/types.ts +873 -0
- package/src/version.ts +24 -0
- package/templates/context-accumulation.yaml +23 -0
- package/templates/cost-concentration.yaml +20 -0
- package/templates/dead-code.yaml +20 -0
- package/templates/latency-explainer.yaml +23 -0
- package/templates/optimizations/ab-testing-framework.yaml +74 -0
- package/templates/optimizations/api-gateway-optimization.yaml +81 -0
- package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
- package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
- package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
- package/templates/optimizations/comprehensive-apm.yaml +76 -0
- package/templates/optimizations/context-window-optimization.yaml +91 -0
- package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
- package/templates/optimizations/distributed-training-optimization.yaml +77 -0
- package/templates/optimizations/document-analysis-edge.yaml +77 -0
- package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
- package/templates/optimizations/domain-specific-distillation.yaml +78 -0
- package/templates/optimizations/error-handling-optimization.yaml +76 -0
- package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
- package/templates/optimizations/long-context-memory-management.yaml +78 -0
- package/templates/optimizations/max-tokens-optimization.yaml +76 -0
- package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
- package/templates/optimizations/multi-framework-resilience.yaml +75 -0
- package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
- package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
- package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
- package/templates/optimizations/quality-monitoring.yaml +74 -0
- package/templates/optimizations/realtime-budget-controls.yaml +74 -0
- package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
- package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
- package/templates/optimizations/smart-model-routing.yaml +96 -0
- package/templates/optimizations/streaming-batch-selection.yaml +167 -0
- package/templates/optimizations/system-prompt-optimization.yaml +75 -0
- package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
- package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
- package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
- package/templates/overpowered-extraction.yaml +32 -0
- package/templates/overpowered-model.yaml +31 -0
- package/templates/prompt-bloat.yaml +24 -0
- package/templates/retry-explosion.yaml +28 -0
- package/templates/schema/insight.schema.json +113 -0
- package/templates/schema/optimization.schema.json +180 -0
- package/templates/streaming-drift.yaml +30 -0
- package/templates/throughput-gap.yaml +21 -0
- package/templates/token-underutilization.yaml +28 -0
- package/templates/untested-fallback.yaml +21 -0
- package/tests/accuracy/drift-detection.test.ts +184 -0
- package/tests/accuracy/false-positives.test.ts +166 -0
- package/tests/accuracy/templates.test.ts +205 -0
- package/tests/action/commands.test.ts +125 -0
- package/tests/action/comments.test.ts +347 -0
- package/tests/cli.test.ts +203 -0
- package/tests/comparison.test.ts +309 -0
- package/tests/correlation-analyzer.test.ts +534 -0
- package/tests/counterfactuals.test.ts +347 -0
- package/tests/fixtures/events/missing-id.jsonl +1 -0
- package/tests/fixtures/events/missing-input.jsonl +1 -0
- package/tests/fixtures/events/missing-latency.jsonl +1 -0
- package/tests/fixtures/events/missing-model.jsonl +1 -0
- package/tests/fixtures/events/missing-output.jsonl +1 -0
- package/tests/fixtures/events/missing-provider.jsonl +1 -0
- package/tests/fixtures/events/missing-ts.jsonl +1 -0
- package/tests/fixtures/events/valid.csv +3 -0
- package/tests/fixtures/events/valid.json +1 -0
- package/tests/fixtures/events/valid.jsonl +2 -0
- package/tests/fixtures/events/with-callsite.jsonl +1 -0
- package/tests/fixtures/events/with-intent.jsonl +1 -0
- package/tests/fixtures/events/wrong-type.jsonl +1 -0
- package/tests/fixtures/repos/empty/.gitkeep +0 -0
- package/tests/fixtures/repos/hybrid-router/router.py +35 -0
- package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
- package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
- package/tests/fixtures/repos/saas-openai/client.py +26 -0
- package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
- package/tests/github-action.test.ts +292 -0
- package/tests/insights.test.ts +878 -0
- package/tests/joiner.test.ts +168 -0
- package/tests/performance/action-latency.test.ts +132 -0
- package/tests/performance/benchmark.test.ts +189 -0
- package/tests/performance/cli-latency.test.ts +102 -0
- package/tests/pr-comment.test.ts +313 -0
- package/tests/prediction.test.ts +296 -0
- package/tests/runtime-analyzer.test.ts +375 -0
- package/tests/runtime.test.ts +205 -0
- package/tests/scanner.test.ts +122 -0
- package/tests/template-conformance.test.ts +526 -0
- package/tests/unit/cost-calculator.test.ts +303 -0
- package/tests/unit/credits.test.ts +180 -0
- package/tests/unit/inference-map.test.ts +276 -0
- package/tests/unit/schema.test.ts +300 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +14 -0
|
@@ -0,0 +1,180 @@
|
|
|
1
|
+
id: peak-performance
|
|
2
|
+
name: Peak Inference Performance Analysis
|
|
3
|
+
version: "1.0"
|
|
4
|
+
description: |
|
|
5
|
+
Analyzes code for inference performance optimization opportunities.
|
|
6
|
+
Focus areas: latency, throughput, cost efficiency, and achieving peak performance.
|
|
7
|
+
|
|
8
|
+
# The analysis prompt sent to the LLM
|
|
9
|
+
prompt: |
|
|
10
|
+
You are an expert at analyzing LLM inference code for PEAK PERFORMANCE optimization.
|
|
11
|
+
|
|
12
|
+
Your goal: Identify why this code may NOT be achieving peak inference performance.
|
|
13
|
+
|
|
14
|
+
## FOCUS AREAS (in priority order)
|
|
15
|
+
|
|
16
|
+
### 1. LATENCY
|
|
17
|
+
- Time to First Token (TTFT) optimization
|
|
18
|
+
- Streaming vs synchronous calls
|
|
19
|
+
- Network round-trip reduction
|
|
20
|
+
- Prompt size impact on latency
|
|
21
|
+
- Cold start detection
|
|
22
|
+
|
|
23
|
+
### 2. THROUGHPUT
|
|
24
|
+
- Batching opportunities (multiple requests → single batch)
|
|
25
|
+
- Concurrent request patterns
|
|
26
|
+
- Queue/backpressure handling
|
|
27
|
+
- Connection pooling
|
|
28
|
+
- Rate limit handling
|
|
29
|
+
|
|
30
|
+
### 3. COST EFFICIENCY
|
|
31
|
+
- Model selection for task complexity (GPT-4 for simple tasks = waste)
|
|
32
|
+
- Token usage optimization (prompt bloat detection)
|
|
33
|
+
- Caching opportunities (repeated similar prompts)
|
|
34
|
+
- Output token limits vs actual needs
|
|
35
|
+
|
|
36
|
+
### 4. RELIABILITY FOR PERFORMANCE
|
|
37
|
+
- Retry patterns that don't hurt latency
|
|
38
|
+
- Fallback strategies (faster model as fallback)
|
|
39
|
+
- Timeout configurations
|
|
40
|
+
- Circuit breaker patterns
|
|
41
|
+
|
|
42
|
+
## WHAT TO LOOK FOR
|
|
43
|
+
|
|
44
|
+
For each LLM API call, analyze:
|
|
45
|
+
- Is streaming enabled? (impacts TTFT)
|
|
46
|
+
- Is batching used for multiple calls? (impacts throughput)
|
|
47
|
+
- Is the model appropriate for task complexity? (impacts cost)
|
|
48
|
+
- Are there caching opportunities? (impacts all three)
|
|
49
|
+
- Is there retry logic that preserves latency SLAs?
|
|
50
|
+
|
|
51
|
+
## OUTPUT FORMAT
|
|
52
|
+
|
|
53
|
+
### PART 1: Identify LLM Callsites
|
|
54
|
+
For each call:
|
|
55
|
+
- line: The EXACT line number where the inference call is made (not client initialization)
|
|
56
|
+
- provider: openai, anthropic, google, together, fireworks, groq, mistral, cohere, replicate, aws_bedrock, azure, vllm, sglang, ollama, unknown
|
|
57
|
+
- model: The EXACT model name as specified in the code
|
|
58
|
+
- framework: langchain, llamaindex, dspy, or null
|
|
59
|
+
- patterns: {streaming, batching, retries, caching, fallback} - true/false
|
|
60
|
+
- confidence: 0.0 to 1.0
|
|
61
|
+
- reasoning: Brief explanation
|
|
62
|
+
|
|
63
|
+
### CRITICAL RULES FOR MODEL EXTRACTION
|
|
64
|
+
1. Look at the model= parameter in the SAME function call
|
|
65
|
+
2. If model is a variable, trace it to find the actual string value
|
|
66
|
+
3. For embeddings calls, use the embedding model name (e.g., "text-embedding-3-small"), NOT a chat model
|
|
67
|
+
4. For DSPy: look at dspy.LM("provider/model") or dspy.context(lm=...) to find the model
|
|
68
|
+
5. Return the FULL model name exactly as written (e.g., "gpt-4o-mini" not "gpt-4", "claude-3-5-sonnet-20241022" not "claude")
|
|
69
|
+
|
|
70
|
+
### CRITICAL: DO NOT FLAG THESE AS CALLSITES
|
|
71
|
+
- Client initialization: openai.OpenAI(), anthropic.Anthropic(), AsyncOpenAI(), etc.
|
|
72
|
+
- Import statements: from openai import OpenAI
|
|
73
|
+
- Type annotations or comments
|
|
74
|
+
- Variable assignments without actual API calls: model = "gpt-4o"
|
|
75
|
+
- SDK client creation without inference: client = OpenAI()
|
|
76
|
+
|
|
77
|
+
### DSPY FRAMEWORK DETECTION
|
|
78
|
+
DSPy wraps LLM calls in module invocations. Flag these as callsites:
|
|
79
|
+
- dspy.Predict(signature)(question=...) - the invocation, not the Predict() creation
|
|
80
|
+
- dspy.ChainOfThought(signature)(question=...) - the invocation
|
|
81
|
+
- predictor(question=...) where predictor is a DSPy module
|
|
82
|
+
- compiled_program(input=...) after BootstrapFewShot compilation
|
|
83
|
+
- self.generate(...) inside a dspy.Module.forward() method
|
|
84
|
+
|
|
85
|
+
For DSPy, find the model from:
|
|
86
|
+
- dspy.LM("provider/model") at module level
|
|
87
|
+
- dspy.configure(lm=...)
|
|
88
|
+
- dspy.context(lm=dspy.LM("provider/model"))
|
|
89
|
+
|
|
90
|
+
### PART 2: Performance Insights with Impact Estimation
|
|
91
|
+
Generate insights focused on PEAK PERFORMANCE with estimated impact:
|
|
92
|
+
|
|
93
|
+
Required fields:
|
|
94
|
+
- severity: MUST be one of: "critical", "warning", "info"
|
|
95
|
+
- category: MUST be one of: "latency", "throughput", "cost", "reliability"
|
|
96
|
+
|
|
97
|
+
Critical = blocking peak performance
|
|
98
|
+
Warning = leaving performance on the table
|
|
99
|
+
Info = optimization opportunity or positive pattern
|
|
100
|
+
|
|
101
|
+
Impact estimation fields (provide for actionable insights):
|
|
102
|
+
- layer: MUST be one of: "application", "model", "runtime", "infrastructure"
|
|
103
|
+
- application: Code patterns (caching, batching, streaming, retry logic)
|
|
104
|
+
- model: Model selection (GPT-4 vs GPT-3.5, Claude Opus vs Haiku)
|
|
105
|
+
- runtime: Inference engines (vLLM, sglang, TGI optimizations)
|
|
106
|
+
- infrastructure: Hosting (serverless vs dedicated, provider selection)
|
|
107
|
+
- impactType: MUST be one of: "cost", "latency", "throughput"
|
|
108
|
+
- estimatedImpactPercent: 0-100 (realistic estimate of improvement; cap at 100 when the expected gain is larger)
|
|
109
|
+
- effort: MUST be one of: "low", "medium", "high"
|
|
110
|
+
- low: Config change or few lines of code
|
|
111
|
+
- medium: Requires refactoring or new integration
|
|
112
|
+
- high: Architectural change or new infrastructure
|
|
113
|
+
|
|
114
|
+
Impact estimation guidelines:
|
|
115
|
+
- Model downgrade (GPT-4 → GPT-3.5): ~95% cost reduction
|
|
116
|
+
- Enable streaming: ~70% perceived latency reduction
|
|
117
|
+
- Add batching: ~60% throughput improvement
|
|
118
|
+
- Add caching: ~40% cost reduction (depends on hit rate)
|
|
119
|
+
- Connection pooling: ~20% latency reduction
|
|
120
|
+
- vLLM deployment: ~300% throughput improvement
|
|
121
|
+
- Dedicated GPU hosting: ~60% cost reduction vs API
|
|
122
|
+
|
|
123
|
+
Return ONLY valid JSON:
|
|
124
|
+
{
|
|
125
|
+
"callsites": [...],
|
|
126
|
+
"insights": [
|
|
127
|
+
{
|
|
128
|
+
"severity": "critical",
|
|
129
|
+
"category": "latency",
|
|
130
|
+
"headline": "Synchronous calls blocking TTFT optimization",
|
|
131
|
+
"evidence": "callLlm at line 42 uses synchronous API - streaming would reduce perceived latency by 60-80%",
|
|
132
|
+
"location": "src/llm.ts:42",
|
|
133
|
+
"recommendation": "Enable streaming with stream: true parameter",
|
|
134
|
+
"impact": {
|
|
135
|
+
"layer": "application",
|
|
136
|
+
"impactType": "latency",
|
|
137
|
+
"estimatedImpactPercent": 70,
|
|
138
|
+
"effort": "low"
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"severity": "warning",
|
|
143
|
+
"category": "cost",
|
|
144
|
+
"headline": "GPT-4 used for simple yes/no validation",
|
|
145
|
+
"evidence": "validateInput() at line 89 uses GPT-4 for binary classification",
|
|
146
|
+
"location": "src/validate.ts:89",
|
|
147
|
+
"recommendation": "Switch to GPT-3.5-turbo or GPT-4o-mini for simple validation",
|
|
148
|
+
"impact": {
|
|
149
|
+
"layer": "model",
|
|
150
|
+
"impactType": "cost",
|
|
151
|
+
"estimatedImpactPercent": 95,
|
|
152
|
+
"effort": "low"
|
|
153
|
+
}
|
|
154
|
+
}
|
|
155
|
+
]
|
|
156
|
+
}
|
|
157
|
+
|
|
158
|
+
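If no LLM callsites are found, return empty arrays: {"callsites": [], "insights": []}. If callsites exist but no issues are found, still list the callsites and return an empty "insights" array.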
|
|
159
|
+
|
|
160
|
+
# Categories this prompt focuses on
|
|
161
|
+
categories:
|
|
162
|
+
- latency
|
|
163
|
+
- throughput
|
|
164
|
+
- cost
|
|
165
|
+
- reliability
|
|
166
|
+
|
|
167
|
+
# Default thresholds (can be overridden)
|
|
168
|
+
defaults:
|
|
169
|
+
expensive_models:
|
|
170
|
+
- gpt-4
|
|
171
|
+
- gpt-4o
|
|
172
|
+
- gpt-4-turbo
|
|
173
|
+
- claude-3-opus
|
|
174
|
+
- claude-3-sonnet
|
|
175
|
+
cheap_models:
|
|
176
|
+
- gpt-3.5-turbo
|
|
177
|
+
- gpt-4o-mini
|
|
178
|
+
- claude-3-haiku
|
|
179
|
+
latency_critical_threshold_ms: 1000
|
|
180
|
+
batch_opportunity_threshold: 3
|
|
@@ -0,0 +1,111 @@
|
|
|
1
|
+
id: pr-comment
|
|
2
|
+
name: PR Comment Generator
|
|
3
|
+
version: "2.0"
|
|
4
|
+
description: Generates verdict-first PR comments for analysis results
|
|
5
|
+
|
|
6
|
+
context:
|
|
7
|
+
- analysis_results: The full analysis results object
|
|
8
|
+
- baseline: Previous run baseline for comparison (optional)
|
|
9
|
+
- changed_files: List of files changed in this PR
|
|
10
|
+
- new_issues: Issues introduced by this PR
|
|
11
|
+
- status: Overall status (pass, warning, fail)
|
|
12
|
+
- regressions: List of regression descriptions
|
|
13
|
+
|
|
14
|
+
output_format: markdown
|
|
15
|
+
|
|
16
|
+
prompt: |
|
|
17
|
+
<role>
|
|
18
|
+
You are generating a GitHub PR comment for PeakInfer analysis results.
|
|
19
|
+
Goal: User decides in 5 seconds if PR is safe, acts on top issue in 30 seconds.
|
|
20
|
+
</role>
|
|
21
|
+
|
|
22
|
+
<design_principles>
|
|
23
|
+
Julie Zhuo behavior-first principles:
|
|
24
|
+
- Verdict first: User knows immediately if this needs attention
|
|
25
|
+
- One top issue: User acts on one thing at a time
|
|
26
|
+
- Progressive disclosure: Details collapsed for power users
|
|
27
|
+
- Action-oriented: Inline suggestions user can apply with one click
|
|
28
|
+
</design_principles>
|
|
29
|
+
|
|
30
|
+
<style>
|
|
31
|
+
- Lead with verdict (emoji + label + message)
|
|
32
|
+
- No metrics tables unless directly actionable
|
|
33
|
+
- Use collapsible <details> for lists > 3 items
|
|
34
|
+
- Plain language, no jargon
|
|
35
|
+
- No anthropomorphic phrasing ("I found...", "I think...")
|
|
36
|
+
</style>
|
|
37
|
+
|
|
38
|
+
<instructions>
|
|
39
|
+
Generate a markdown PR comment with this structure:
|
|
40
|
+
|
|
41
|
+
1. **Verdict Line** (always first)
|
|
42
|
+
- Format: **{emoji} {Label}** — {message}
|
|
43
|
+
- Labels: Safe to Merge, Mostly Good, Review Recommended, Changes Requested
|
|
44
|
+
- Emojis: ✅ 🟢 🟡 🔴
|
|
45
|
+
|
|
46
|
+
2. **Top Issue Highlight** (if issues exist)
|
|
47
|
+
- Simple table showing the ONE most important issue
|
|
48
|
+
- Include: title, location, why it matters
|
|
49
|
+
- This is what user should fix first
|
|
50
|
+
|
|
51
|
+
3. **Collapsible Details** (if > 1 issue)
|
|
52
|
+
- Use <details><summary>See all N issues</summary>
|
|
53
|
+
- Group by severity (Critical, Warning, Info)
|
|
54
|
+
- Max 5 per severity, note if more
|
|
55
|
+
|
|
56
|
+
4. **Footer**
|
|
57
|
+
- If issues: "See inline comments for suggested fixes"
|
|
58
|
+
- Always: "Generated by PeakInfer"
|
|
59
|
+
</instructions>
|
|
60
|
+
|
|
61
|
+
<verdict_logic>
|
|
62
|
+
- ≥2 critical → Changes Requested (🔴)
|
|
63
|
+
- 1 critical OR >5 warnings → Review Recommended (🟡)
|
|
64
|
+
- 1-5 warnings → Mostly Good (🟢)
|
|
65
|
+
- 0 issues → Safe to Merge (✅)
|
|
66
|
+
</verdict_logic>
|
|
67
|
+
|
|
68
|
+
<constraints>
|
|
69
|
+
- Maximum length: 1500 characters
|
|
70
|
+
- No speculation or suggestions not in the data
|
|
71
|
+
- Inline suggestions are posted separately, not in main comment
|
|
72
|
+
</constraints>
|
|
73
|
+
|
|
74
|
+
example_output: |
|
|
75
|
+
## PeakInfer Analysis
|
|
76
|
+
|
|
77
|
+
**🔴 Changes Requested** — 2 critical issues need attention before merge
|
|
78
|
+
|
|
79
|
+
| | |
|
|
80
|
+
|---|---|
|
|
81
|
+
| **Top Issue** | Missing error handling in LLM calls |
|
|
82
|
+
| **Location** | `src/api/chat.ts:45` |
|
|
83
|
+
| **Why it matters** | Unhandled API failures will crash the service |
|
|
84
|
+
|
|
85
|
+
<details>
|
|
86
|
+
<summary>See all 7 issues</summary>
|
|
87
|
+
|
|
88
|
+
**Critical** (2)
|
|
89
|
+
- Missing error handling in LLM calls — `src/api/chat.ts:45`
|
|
90
|
+
- Unbounded retry without backoff — `src/api/retry.ts:23`
|
|
91
|
+
|
|
92
|
+
**Warning** (5)
|
|
93
|
+
- Premium model used for simple task — `src/llm/classify.ts:12`
|
|
94
|
+
- Sequential calls could be parallelized — `src/batch/process.ts:67`
|
|
95
|
+
- _...3 more_
|
|
96
|
+
</details>
|
|
97
|
+
|
|
98
|
+
---
|
|
99
|
+
<sub>See inline comments for suggested fixes</sub>
|
|
100
|
+
|
|
101
|
+
<sub>Generated by [PeakInfer](https://github.com/Kalmantic/peakinfer)</sub>
|
|
102
|
+
|
|
103
|
+
zero_state_example: |
|
|
104
|
+
## PeakInfer Analysis
|
|
105
|
+
|
|
106
|
+
**✅ Safe to Merge** — No issues found
|
|
107
|
+
|
|
108
|
+
Analyzed 4 inference points, all following best practices.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
<sub>Generated by [PeakInfer](https://github.com/Kalmantic/peakinfer)</sub>
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
id: runtime-analyzer
|
|
2
|
+
name: Runtime Telemetry Analyzer
|
|
3
|
+
version: "1.0"
|
|
4
|
+
description: |
|
|
5
|
+
Analyzes LLM inference telemetry data to find patterns, anomalies, and optimization opportunities.
|
|
6
|
+
Provides semantic analysis that templates cannot capture.
|
|
7
|
+
|
|
8
|
+
prompt: |
|
|
9
|
+
<role>
|
|
10
|
+
You are an LLM operations analyst specializing in inference telemetry analysis.
|
|
11
|
+
Your job is to find patterns, anomalies, and optimization opportunities in runtime data.
|
|
12
|
+
</role>
|
|
13
|
+
|
|
14
|
+
<background>
|
|
15
|
+
You receive aggregated runtime telemetry from LLM inference calls:
|
|
16
|
+
- Per-provider and per-model statistics
|
|
17
|
+
- Token counts (input/output)
|
|
18
|
+
- Latency distributions (p50, p95, p99)
|
|
19
|
+
- Call counts and patterns
|
|
20
|
+
- Optional: streaming, batching, caching, retry indicators
|
|
21
|
+
- PRICING DATA: Dynamic pricing from LiteLLM API ($/1M tokens)
|
|
22
|
+
|
|
23
|
+
This data comes from production systems. Your analysis helps teams optimize cost, latency, and reliability.
|
|
24
|
+
|
|
25
|
+
Key metrics to understand:
|
|
26
|
+
- Token ratio = output_tokens / input_tokens (low ratio may indicate verbose prompts; high ratio may indicate verbose outputs)
|
|
27
|
+
- Latency spread = p99 / p50 (high spread indicates inconsistency)
|
|
28
|
+
- Call concentration = calls to expensive models vs cheap models
|
|
29
|
+
|
|
30
|
+
PRICING TIERS (use provided pricing data):
|
|
31
|
+
- Expensive: >$10 per 1M tokens (GPT-4, Claude Opus)
|
|
32
|
+
- Moderate: $1-10 per 1M tokens (GPT-4o, Claude Sonnet)
|
|
33
|
+
- Cheap: <$1 per 1M tokens (GPT-4o-mini, Claude Haiku, GPT-3.5)
|
|
34
|
+
|
|
35
|
+
IMPORTANT: Use the pricing_context provided with the data to calculate actual costs.
|
|
36
|
+
Do NOT assume pricing - use the real numbers from pricing_context.
|
|
37
|
+
</background>
|
|
38
|
+
|
|
39
|
+
<instructions>
|
|
40
|
+
Analyze the data for the following (in priority order):
|
|
41
|
+
|
|
42
|
+
1. COST PATTERNS
|
|
43
|
+
- Which models consume the most tokens?
|
|
44
|
+
- Are expensive models (GPT-4, Claude Opus) used for simple tasks?
|
|
45
|
+
- Token ratios indicating prompt bloat (input >> output)?
|
|
46
|
+
- Total token consumption and estimated costs
|
|
47
|
+
|
|
48
|
+
2. LATENCY PATTERNS
|
|
49
|
+
- Bimodal distributions (some fast, some slow)?
|
|
50
|
+
- P95/P99 spikes suggesting cold starts or rate limits?
|
|
51
|
+
- Correlation between input size and latency?
|
|
52
|
+
- Missing streaming on latency-sensitive paths?
|
|
53
|
+
|
|
54
|
+
3. USAGE PATTERNS
|
|
55
|
+
- Application type inference (RAG, agents, batch, chat)?
|
|
56
|
+
- Multi-model pipelines (cheap model -> expensive model)?
|
|
57
|
+
- Retry patterns suggesting reliability issues?
|
|
58
|
+
- Time-of-day patterns?
|
|
59
|
+
|
|
60
|
+
4. ANOMALIES
|
|
61
|
+
- Outliers in latency (>3x p95)?
|
|
62
|
+
- Unusual token ratios (output > 10x input)?
|
|
63
|
+
- Unexpected provider/model combinations?
|
|
64
|
+
- Failed request patterns?
|
|
65
|
+
|
|
66
|
+
5. RECOMMENDATIONS
|
|
67
|
+
- Model right-sizing opportunities (GPT-4 -> GPT-4o-mini)
|
|
68
|
+
- Caching opportunities (repeated similar requests)
|
|
69
|
+
- Batching opportunities (sequential small requests)
|
|
70
|
+
- Streaming opportunities (high latency paths)
|
|
71
|
+
</instructions>
|
|
72
|
+
|
|
73
|
+
<output_format>
|
|
74
|
+
Return valid JSON:
|
|
75
|
+
{
|
|
76
|
+
"insights": [
|
|
77
|
+
{
|
|
78
|
+
"severity": "critical|warning|info",
|
|
79
|
+
"category": "cost|latency|reliability|throughput|waste",
|
|
80
|
+
"headline": "Short, actionable title",
|
|
81
|
+
"evidence": "Specific data points from the telemetry",
|
|
82
|
+
"recommendation": "What to do about it",
|
|
83
|
+
"impact": {
|
|
84
|
+
"layer": "application|model|runtime|infrastructure",
|
|
85
|
+
"impactType": "cost|latency|throughput",
|
|
86
|
+
"estimatedImpactPercent": 0-100,
|
|
87
|
+
"effort": "low|medium|high"
|
|
88
|
+
}
|
|
89
|
+
}
|
|
90
|
+
],
|
|
91
|
+
"detected_patterns": {
|
|
92
|
+
"application_type": "rag|agent|batch|chat|pipeline|unknown",
|
|
93
|
+
"multi_model_pipeline": true|false,
|
|
94
|
+
"streaming_detected": true|false,
|
|
95
|
+
"batching_detected": true|false,
|
|
96
|
+
"caching_detected": true|false
|
|
97
|
+
},
|
|
98
|
+
"summary": {
|
|
99
|
+
"total_calls": 0,
|
|
100
|
+
"total_tokens": 0,
|
|
101
|
+
"dominant_provider": "provider_name",
|
|
102
|
+
"dominant_model": "model_name",
|
|
103
|
+
"estimated_daily_cost_usd": 0.0
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
</output_format>
|
|
107
|
+
|
|
108
|
+
<constraints>
|
|
109
|
+
- Be specific. Use actual numbers from the data.
|
|
110
|
+
- Prioritize actionable insights over observations.
|
|
111
|
+
- Maximum 10 insights, ranked by impact.
|
|
112
|
+
- Only report patterns with clear evidence.
|
|
113
|
+
- Do NOT fabricate data - only analyze what's provided.
|
|
114
|
+
</constraints>
|
|
115
|
+
|
|
116
|
+
<examples>
|
|
117
|
+
Example input summary:
|
|
118
|
+
{
|
|
119
|
+
"byModel": {
|
|
120
|
+
"gpt-4": {"calls": 500, "tokens_in": 50000, "tokens_out": 25000, "latency_p50": 2500, "latency_p95": 8000},
|
|
121
|
+
"gpt-3.5-turbo": {"calls": 50, "tokens_in": 2000, "tokens_out": 1000, "latency_p50": 400, "latency_p95": 800}
|
|
122
|
+
}
|
|
123
|
+
}
|
|
124
|
+
|
|
125
|
+
Example output:
|
|
126
|
+
{
|
|
127
|
+
"insights": [
|
|
128
|
+
{
|
|
129
|
+
"severity": "critical",
|
|
130
|
+
"category": "cost",
|
|
131
|
+
"headline": "90% of calls use expensive GPT-4 model",
|
|
132
|
+
"evidence": "500 GPT-4 calls vs 50 GPT-3.5-turbo calls. GPT-4 is ~20x more expensive per token.",
|
|
133
|
+
"recommendation": "Evaluate if GPT-4 is necessary for all use cases. Consider GPT-4o-mini for simpler tasks.",
|
|
134
|
+
"impact": {
|
|
135
|
+
"layer": "model",
|
|
136
|
+
"impactType": "cost",
|
|
137
|
+
"estimatedImpactPercent": 85,
|
|
138
|
+
"effort": "low"
|
|
139
|
+
}
|
|
140
|
+
},
|
|
141
|
+
{
|
|
142
|
+
"severity": "warning",
|
|
143
|
+
"category": "latency",
|
|
144
|
+
"headline": "GPT-4 p95 latency 3x higher than p50",
|
|
145
|
+
"evidence": "p50=2500ms, p95=8000ms. This 3.2x p95/p50 ratio indicates inconsistent response times.",
|
|
146
|
+
"recommendation": "Investigate high-latency requests. Consider streaming for long responses.",
|
|
147
|
+
"impact": {
|
|
148
|
+
"layer": "application",
|
|
149
|
+
"impactType": "latency",
|
|
150
|
+
"estimatedImpactPercent": 50,
|
|
151
|
+
"effort": "low"
|
|
152
|
+
}
|
|
153
|
+
}
|
|
154
|
+
],
|
|
155
|
+
"detected_patterns": {
|
|
156
|
+
"application_type": "unknown",
|
|
157
|
+
"multi_model_pipeline": true,
|
|
158
|
+
"streaming_detected": false,
|
|
159
|
+
"batching_detected": false,
|
|
160
|
+
"caching_detected": false
|
|
161
|
+
},
|
|
162
|
+
"summary": {
|
|
163
|
+
"total_calls": 550,
|
|
164
|
+
"total_tokens": 78000,
|
|
165
|
+
"dominant_provider": "openai",
|
|
166
|
+
"dominant_model": "gpt-4",
|
|
167
|
+
"estimated_daily_cost_usd": 2.34
|
|
168
|
+
}
|
|
169
|
+
}
|
|
170
|
+
</examples>
|
|
171
|
+
|
|
172
|
+
categories:
|
|
173
|
+
- cost
|
|
174
|
+
- latency
|
|
175
|
+
- throughput
|
|
176
|
+
- reliability
|
|
177
|
+
- waste
|
|
178
|
+
|
|
179
|
+
defaults:
|
|
180
|
+
# NOTE: Model pricing is DYNAMIC - loaded from LiteLLM pricing API
|
|
181
|
+
# The agent will receive pricing data as context, not hardcoded lists
|
|
182
|
+
# Pricing context format: { model: { input: $/1M tokens, output: $/1M tokens } }
|
|
183
|
+
use_dynamic_pricing: true
|
|
184
|
+
latency_warning_threshold_ms: 5000
|
|
185
|
+
cost_warning_threshold_usd: 10.0
|
|
186
|
+
# Cost tiers (per 1M tokens) - used for classification when pricing context provided
|
|
187
|
+
expensive_threshold_per_1m: 10.0 # >$10/1M = expensive
|
|
188
|
+
moderate_threshold_per_1m: 1.0 # $1-10/1M = moderate
|
|
189
|
+
# Below $1/1M = cheap
|