@peakinfer/cli 1.0.133
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +8 -0
- package/.env.example +6 -0
- package/.github/workflows/peakinfer.yml +64 -0
- package/CHANGELOG.md +31 -0
- package/LICENSE +190 -0
- package/README.md +335 -0
- package/data/inferencemax.json +274 -0
- package/dist/agent-analyzer.d.ts +45 -0
- package/dist/agent-analyzer.d.ts.map +1 -0
- package/dist/agent-analyzer.js +374 -0
- package/dist/agent-analyzer.js.map +1 -0
- package/dist/agent.d.ts +76 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +965 -0
- package/dist/agent.js.map +1 -0
- package/dist/agents/correlation-analyzer.d.ts +34 -0
- package/dist/agents/correlation-analyzer.d.ts.map +1 -0
- package/dist/agents/correlation-analyzer.js +261 -0
- package/dist/agents/correlation-analyzer.js.map +1 -0
- package/dist/agents/index.d.ts +91 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +111 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/runtime-analyzer.d.ts +38 -0
- package/dist/agents/runtime-analyzer.d.ts.map +1 -0
- package/dist/agents/runtime-analyzer.js +244 -0
- package/dist/agents/runtime-analyzer.js.map +1 -0
- package/dist/analysis-types.d.ts +500 -0
- package/dist/analysis-types.d.ts.map +1 -0
- package/dist/analysis-types.js +11 -0
- package/dist/analysis-types.js.map +1 -0
- package/dist/analytics.d.ts +25 -0
- package/dist/analytics.d.ts.map +1 -0
- package/dist/analytics.js +94 -0
- package/dist/analytics.js.map +1 -0
- package/dist/analyzer.d.ts +48 -0
- package/dist/analyzer.d.ts.map +1 -0
- package/dist/analyzer.js +547 -0
- package/dist/analyzer.js.map +1 -0
- package/dist/artifacts.d.ts +44 -0
- package/dist/artifacts.d.ts.map +1 -0
- package/dist/artifacts.js +165 -0
- package/dist/artifacts.js.map +1 -0
- package/dist/benchmarks/index.d.ts +88 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +205 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +427 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/ci.d.ts +19 -0
- package/dist/commands/ci.d.ts.map +1 -0
- package/dist/commands/ci.js +253 -0
- package/dist/commands/ci.js.map +1 -0
- package/dist/commands/config.d.ts +16 -0
- package/dist/commands/config.d.ts.map +1 -0
- package/dist/commands/config.js +249 -0
- package/dist/commands/config.js.map +1 -0
- package/dist/commands/demo.d.ts +15 -0
- package/dist/commands/demo.d.ts.map +1 -0
- package/dist/commands/demo.js +106 -0
- package/dist/commands/demo.js.map +1 -0
- package/dist/commands/export.d.ts +14 -0
- package/dist/commands/export.d.ts.map +1 -0
- package/dist/commands/export.js +209 -0
- package/dist/commands/export.js.map +1 -0
- package/dist/commands/history.d.ts +15 -0
- package/dist/commands/history.d.ts.map +1 -0
- package/dist/commands/history.js +389 -0
- package/dist/commands/history.js.map +1 -0
- package/dist/commands/template.d.ts +14 -0
- package/dist/commands/template.d.ts.map +1 -0
- package/dist/commands/template.js +341 -0
- package/dist/commands/template.js.map +1 -0
- package/dist/commands/validate-map.d.ts +12 -0
- package/dist/commands/validate-map.d.ts.map +1 -0
- package/dist/commands/validate-map.js +274 -0
- package/dist/commands/validate-map.js.map +1 -0
- package/dist/commands/whatif.d.ts +17 -0
- package/dist/commands/whatif.d.ts.map +1 -0
- package/dist/commands/whatif.js +206 -0
- package/dist/commands/whatif.js.map +1 -0
- package/dist/comparison.d.ts +38 -0
- package/dist/comparison.d.ts.map +1 -0
- package/dist/comparison.js +223 -0
- package/dist/comparison.js.map +1 -0
- package/dist/config.d.ts +42 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +158 -0
- package/dist/config.js.map +1 -0
- package/dist/connectors/helicone.d.ts +9 -0
- package/dist/connectors/helicone.d.ts.map +1 -0
- package/dist/connectors/helicone.js +106 -0
- package/dist/connectors/helicone.js.map +1 -0
- package/dist/connectors/index.d.ts +37 -0
- package/dist/connectors/index.d.ts.map +1 -0
- package/dist/connectors/index.js +65 -0
- package/dist/connectors/index.js.map +1 -0
- package/dist/connectors/langsmith.d.ts +9 -0
- package/dist/connectors/langsmith.d.ts.map +1 -0
- package/dist/connectors/langsmith.js +122 -0
- package/dist/connectors/langsmith.js.map +1 -0
- package/dist/connectors/types.d.ts +83 -0
- package/dist/connectors/types.d.ts.map +1 -0
- package/dist/connectors/types.js +98 -0
- package/dist/connectors/types.js.map +1 -0
- package/dist/cost-estimator.d.ts +46 -0
- package/dist/cost-estimator.d.ts.map +1 -0
- package/dist/cost-estimator.js +104 -0
- package/dist/cost-estimator.js.map +1 -0
- package/dist/costs.d.ts +57 -0
- package/dist/costs.d.ts.map +1 -0
- package/dist/costs.js +251 -0
- package/dist/costs.js.map +1 -0
- package/dist/counterfactuals.d.ts +29 -0
- package/dist/counterfactuals.d.ts.map +1 -0
- package/dist/counterfactuals.js +448 -0
- package/dist/counterfactuals.js.map +1 -0
- package/dist/enhancement-prompts.d.ts +41 -0
- package/dist/enhancement-prompts.d.ts.map +1 -0
- package/dist/enhancement-prompts.js +88 -0
- package/dist/enhancement-prompts.js.map +1 -0
- package/dist/envelopes.d.ts +20 -0
- package/dist/envelopes.d.ts.map +1 -0
- package/dist/envelopes.js +790 -0
- package/dist/envelopes.js.map +1 -0
- package/dist/format-normalizer.d.ts +71 -0
- package/dist/format-normalizer.d.ts.map +1 -0
- package/dist/format-normalizer.js +1331 -0
- package/dist/format-normalizer.js.map +1 -0
- package/dist/history.d.ts +79 -0
- package/dist/history.d.ts.map +1 -0
- package/dist/history.js +313 -0
- package/dist/history.js.map +1 -0
- package/dist/html.d.ts +11 -0
- package/dist/html.d.ts.map +1 -0
- package/dist/html.js +463 -0
- package/dist/html.js.map +1 -0
- package/dist/impact.d.ts +42 -0
- package/dist/impact.d.ts.map +1 -0
- package/dist/impact.js +443 -0
- package/dist/impact.js.map +1 -0
- package/dist/index.d.ts +26 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +34 -0
- package/dist/index.js.map +1 -0
- package/dist/insights.d.ts +5 -0
- package/dist/insights.d.ts.map +1 -0
- package/dist/insights.js +271 -0
- package/dist/insights.js.map +1 -0
- package/dist/joiner.d.ts +9 -0
- package/dist/joiner.d.ts.map +1 -0
- package/dist/joiner.js +247 -0
- package/dist/joiner.js.map +1 -0
- package/dist/orchestrator.d.ts +34 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +827 -0
- package/dist/orchestrator.js.map +1 -0
- package/dist/pdf.d.ts +26 -0
- package/dist/pdf.d.ts.map +1 -0
- package/dist/pdf.js +84 -0
- package/dist/pdf.js.map +1 -0
- package/dist/prediction.d.ts +33 -0
- package/dist/prediction.d.ts.map +1 -0
- package/dist/prediction.js +316 -0
- package/dist/prediction.js.map +1 -0
- package/dist/prompts/loader.d.ts +38 -0
- package/dist/prompts/loader.d.ts.map +1 -0
- package/dist/prompts/loader.js +60 -0
- package/dist/prompts/loader.js.map +1 -0
- package/dist/renderer.d.ts +64 -0
- package/dist/renderer.d.ts.map +1 -0
- package/dist/renderer.js +923 -0
- package/dist/renderer.js.map +1 -0
- package/dist/runid.d.ts +57 -0
- package/dist/runid.d.ts.map +1 -0
- package/dist/runid.js +199 -0
- package/dist/runid.js.map +1 -0
- package/dist/runtime.d.ts +29 -0
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +366 -0
- package/dist/runtime.js.map +1 -0
- package/dist/scanner.d.ts +11 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +426 -0
- package/dist/scanner.js.map +1 -0
- package/dist/templates.d.ts +120 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +429 -0
- package/dist/templates.js.map +1 -0
- package/dist/tools/index.d.ts +153 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +177 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/types.d.ts +3647 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +703 -0
- package/dist/types.js.map +1 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +23 -0
- package/dist/version.js.map +1 -0
- package/docs/demo-guide.md +423 -0
- package/docs/events-format.md +295 -0
- package/docs/inferencemap-spec.md +344 -0
- package/docs/migration-v2.md +293 -0
- package/fixtures/demo/precomputed.json +142 -0
- package/fixtures/demo-project/README.md +52 -0
- package/fixtures/demo-project/ai-service.ts +65 -0
- package/fixtures/demo-project/sample-events.jsonl +15 -0
- package/fixtures/demo-project/src/ai-service.ts +128 -0
- package/fixtures/demo-project/src/llm-client.ts +155 -0
- package/package.json +65 -0
- package/prompts/agent-analyzer.yaml +47 -0
- package/prompts/ci-gate.yaml +98 -0
- package/prompts/correlation-analyzer.yaml +178 -0
- package/prompts/format-normalizer.yaml +46 -0
- package/prompts/peak-performance.yaml +180 -0
- package/prompts/pr-comment.yaml +111 -0
- package/prompts/runtime-analyzer.yaml +189 -0
- package/prompts/unified-analyzer.yaml +241 -0
- package/schemas/inference-map.v0.1.json +215 -0
- package/scripts/benchmark.ts +394 -0
- package/scripts/demo-v1.5.sh +158 -0
- package/scripts/sync-from-site.sh +197 -0
- package/scripts/validate-sync.sh +178 -0
- package/src/agent-analyzer.ts +481 -0
- package/src/agent.ts +1232 -0
- package/src/agents/correlation-analyzer.ts +353 -0
- package/src/agents/index.ts +235 -0
- package/src/agents/runtime-analyzer.ts +343 -0
- package/src/analysis-types.ts +558 -0
- package/src/analytics.ts +100 -0
- package/src/analyzer.ts +692 -0
- package/src/artifacts.ts +218 -0
- package/src/benchmarks/index.ts +309 -0
- package/src/cli.ts +503 -0
- package/src/commands/ci.ts +336 -0
- package/src/commands/config.ts +288 -0
- package/src/commands/demo.ts +175 -0
- package/src/commands/export.ts +297 -0
- package/src/commands/history.ts +425 -0
- package/src/commands/template.ts +385 -0
- package/src/commands/validate-map.ts +324 -0
- package/src/commands/whatif.ts +272 -0
- package/src/comparison.ts +283 -0
- package/src/config.ts +188 -0
- package/src/connectors/helicone.ts +164 -0
- package/src/connectors/index.ts +93 -0
- package/src/connectors/langsmith.ts +179 -0
- package/src/connectors/types.ts +180 -0
- package/src/cost-estimator.ts +146 -0
- package/src/costs.ts +347 -0
- package/src/counterfactuals.ts +516 -0
- package/src/enhancement-prompts.ts +118 -0
- package/src/envelopes.ts +814 -0
- package/src/format-normalizer.ts +1486 -0
- package/src/history.ts +400 -0
- package/src/html.ts +512 -0
- package/src/impact.ts +522 -0
- package/src/index.ts +83 -0
- package/src/insights.ts +341 -0
- package/src/joiner.ts +289 -0
- package/src/orchestrator.ts +1015 -0
- package/src/pdf.ts +110 -0
- package/src/prediction.ts +392 -0
- package/src/prompts/loader.ts +88 -0
- package/src/renderer.ts +1045 -0
- package/src/runid.ts +261 -0
- package/src/runtime.ts +450 -0
- package/src/scanner.ts +508 -0
- package/src/templates.ts +561 -0
- package/src/tools/index.ts +214 -0
- package/src/types.ts +873 -0
- package/src/version.ts +24 -0
- package/templates/context-accumulation.yaml +23 -0
- package/templates/cost-concentration.yaml +20 -0
- package/templates/dead-code.yaml +20 -0
- package/templates/latency-explainer.yaml +23 -0
- package/templates/optimizations/ab-testing-framework.yaml +74 -0
- package/templates/optimizations/api-gateway-optimization.yaml +81 -0
- package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
- package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
- package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
- package/templates/optimizations/comprehensive-apm.yaml +76 -0
- package/templates/optimizations/context-window-optimization.yaml +91 -0
- package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
- package/templates/optimizations/distributed-training-optimization.yaml +77 -0
- package/templates/optimizations/document-analysis-edge.yaml +77 -0
- package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
- package/templates/optimizations/domain-specific-distillation.yaml +78 -0
- package/templates/optimizations/error-handling-optimization.yaml +76 -0
- package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
- package/templates/optimizations/long-context-memory-management.yaml +78 -0
- package/templates/optimizations/max-tokens-optimization.yaml +76 -0
- package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
- package/templates/optimizations/multi-framework-resilience.yaml +75 -0
- package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
- package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
- package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
- package/templates/optimizations/quality-monitoring.yaml +74 -0
- package/templates/optimizations/realtime-budget-controls.yaml +74 -0
- package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
- package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
- package/templates/optimizations/smart-model-routing.yaml +96 -0
- package/templates/optimizations/streaming-batch-selection.yaml +167 -0
- package/templates/optimizations/system-prompt-optimization.yaml +75 -0
- package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
- package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
- package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
- package/templates/overpowered-extraction.yaml +32 -0
- package/templates/overpowered-model.yaml +31 -0
- package/templates/prompt-bloat.yaml +24 -0
- package/templates/retry-explosion.yaml +28 -0
- package/templates/schema/insight.schema.json +113 -0
- package/templates/schema/optimization.schema.json +180 -0
- package/templates/streaming-drift.yaml +30 -0
- package/templates/throughput-gap.yaml +21 -0
- package/templates/token-underutilization.yaml +28 -0
- package/templates/untested-fallback.yaml +21 -0
- package/tests/accuracy/drift-detection.test.ts +184 -0
- package/tests/accuracy/false-positives.test.ts +166 -0
- package/tests/accuracy/templates.test.ts +205 -0
- package/tests/action/commands.test.ts +125 -0
- package/tests/action/comments.test.ts +347 -0
- package/tests/cli.test.ts +203 -0
- package/tests/comparison.test.ts +309 -0
- package/tests/correlation-analyzer.test.ts +534 -0
- package/tests/counterfactuals.test.ts +347 -0
- package/tests/fixtures/events/missing-id.jsonl +1 -0
- package/tests/fixtures/events/missing-input.jsonl +1 -0
- package/tests/fixtures/events/missing-latency.jsonl +1 -0
- package/tests/fixtures/events/missing-model.jsonl +1 -0
- package/tests/fixtures/events/missing-output.jsonl +1 -0
- package/tests/fixtures/events/missing-provider.jsonl +1 -0
- package/tests/fixtures/events/missing-ts.jsonl +1 -0
- package/tests/fixtures/events/valid.csv +3 -0
- package/tests/fixtures/events/valid.json +1 -0
- package/tests/fixtures/events/valid.jsonl +2 -0
- package/tests/fixtures/events/with-callsite.jsonl +1 -0
- package/tests/fixtures/events/with-intent.jsonl +1 -0
- package/tests/fixtures/events/wrong-type.jsonl +1 -0
- package/tests/fixtures/repos/empty/.gitkeep +0 -0
- package/tests/fixtures/repos/hybrid-router/router.py +35 -0
- package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
- package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
- package/tests/fixtures/repos/saas-openai/client.py +26 -0
- package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
- package/tests/github-action.test.ts +292 -0
- package/tests/insights.test.ts +878 -0
- package/tests/joiner.test.ts +168 -0
- package/tests/performance/action-latency.test.ts +132 -0
- package/tests/performance/benchmark.test.ts +189 -0
- package/tests/performance/cli-latency.test.ts +102 -0
- package/tests/pr-comment.test.ts +313 -0
- package/tests/prediction.test.ts +296 -0
- package/tests/runtime-analyzer.test.ts +375 -0
- package/tests/runtime.test.ts +205 -0
- package/tests/scanner.test.ts +122 -0
- package/tests/template-conformance.test.ts +526 -0
- package/tests/unit/cost-calculator.test.ts +303 -0
- package/tests/unit/credits.test.ts +180 -0
- package/tests/unit/inference-map.test.ts +276 -0
- package/tests/unit/schema.test.ts +300 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +14 -0
package/templates/optimizations/tensorrt-llm-performance.yaml
@@ -0,0 +1,77 @@
+id: tensorrt-llm-performance
+name: TensorRT-LLM Maximum Performance
+description: Deploy models with TensorRT-LLM for maximum inference performance on NVIDIA GPUs
+category: runtime_optimization
+confidence: 0.88
+success_count: 789
+verified_environments: 38
+contributors:
+  - tensorrt_specialist
+  - nvidia_partner
+last_updated: "2025-01-03"
+
+environment_match:
+  gpu_vendor: nvidia
+  performance_requirement: maximum
+  model_support: transformer
+
+optimization:
+  technique: tensorrt_llm_deployment
+  expected_throughput_improvement: "3-5x"
+  expected_latency_improvement: "50-70%"
+  effort_estimate: "2-3 weeks"
+  risk_level: medium
+
+economics:
+  projected_improvement:
+    throughput_multiplier: 4
+    cost_per_token_reduction: 0.75
+  implementation_cost:
+    engineering_hours: 120
+    total_cost: 24000
+
+implementation:
+  prerequisites:
+    - requirement: "NVIDIA GPU (A100/H100 recommended)"
+    - requirement: "TensorRT-LLM installation"
+      validation_command: "python -c 'import tensorrt_llm'"
+    - requirement: "Sufficient disk space for compiled engines"
+  automated_steps:
+    - step_id: engine_build
+      name: TensorRT Engine Build
+      executable: true
+      commands:
+        - "python scripts/convert_to_trt.py --model ./model --dtype fp16"
+        - "python scripts/build_trt_engine.py --max-batch-size 64 --max-input-len 2048"
+      validation:
+        command: "python scripts/validate_engine.py"
+        success_criteria: "engine_valid AND accuracy > 0.99"
+    - step_id: deployment
+      name: TensorRT Deployment
+      executable: true
+      commands:
+        - "python scripts/deploy_trt_server.py --engine ./engine"
+        - "python scripts/configure_inflight_batching.py"
+      validation:
+        command: "python scripts/benchmark_trt.py"
+        success_criteria: "throughput > baseline * 3"
+      rollback_command: "python scripts/fallback_to_pytorch.py"
+
+monitoring:
+  key_metrics:
+    - metric: throughput_tokens_per_second
+      target: ">5000"
+      alert_threshold: "<3000"
+    - metric: gpu_memory_utilization
+      target: "70-85%"
+      alert_threshold: ">95%"
+  rollback_triggers:
+    - condition: "engine_error_rate > 0.01 for 5 minutes"
+      action: automatic_rollback
+
+results:
+  recent_implementations:
+    - environment: high_volume_api
+      baseline_throughput: 800
+      optimized_throughput: 3500
+      improvement_factor: 4.4
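A quick consistency check on the economics block above, as an illustration only (the helper below is not part of the package): at a fixed GPU cost per hour, cost per token scales inversely with throughput, so the 4x throughput_multiplier implies the stated cost_per_token_reduction of 0.75, and the high_volume_api result of 800 to 3500 tokens/s gives the reported improvement_factor of roughly 4.4.

```typescript
// Illustration only (not part of the package).
// At fixed GPU $/hour, cost per token scales as 1 / (tokens per hour),
// so a 4x throughput multiplier removes 1 - 1/4 = 0.75 of the per-token cost.
function costPerTokenReduction(throughputMultiplier: number): number {
  return 1 - 1 / throughputMultiplier;
}

const reduction = costPerTokenReduction(4); // 0.75, matching cost_per_token_reduction
const improvementFactor = 3500 / 800;       // ~4.4, matching the high_volume_api result
console.log({ reduction, improvementFactor });
```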
package/templates/optimizations/vllm-high-throughput-optimization.yaml
@@ -0,0 +1,93 @@
+id: vllm-high-throughput-optimization
+name: vLLM Continuous Batching for High-Volume Production
+description: Optimize vLLM deployment for maximum throughput in high-traffic scenarios
+category: batching_optimization
+confidence: 0.91
+success_count: 1923
+verified_environments: 67
+contributors:
+  - scaling_team
+  - vllm_expert
+  - production_engineer
+last_updated: "2025-01-14"
+
+environment_match:
+  runtime: vllm
+  monthly_requests: ">1M"
+  current_batch_size: "<8"
+  gpu_utilization: "<70%"
+  latency_requirements: flexible
+
+optimization:
+  technique: continuous_batching
+  expected_throughput_improvement: "3-5x"
+  expected_cost_reduction: "60-75%"
+  effort_estimate: "1-2 weeks"
+  risk_level: low
+
+economics:
+  baseline_calculation:
+    current_throughput_factor: 1.0
+  projected_improvement:
+    new_throughput_factor: 4.0
+    gpu_reduction_factor: 0.25
+  implementation_cost:
+    engineering_hours: 80
+    total_cost: 16000
+
+implementation:
+  prerequisites:
+    - requirement: "vLLM 0.2.7+"
+      validation_command: "python -c 'import vllm; print(vllm.__version__)'"
+    - requirement: "CUDA 11.8+"
+      validation_command: "nvcc --version | grep 'release 11.8'"
+    - requirement: "16GB+ GPU memory"
+      validation_command: "nvidia-smi --query-gpu=memory.total --format=csv,noheader | awk '{if($1<16000) exit 1}'"
+  automated_steps:
+    - step_id: batch_configuration
+      name: Optimal Batch Configuration
+      executable: true
+      commands:
+        - "python scripts/configure_vllm.py --max-num-batched-tokens 8192 --max-num-seqs 32"
+        - "python scripts/start_vllm_server.py --model meta-llama/Llama-2-7b-hf --gpu-memory-utilization 0.85"
+      validation:
+        command: "python scripts/test_batch_performance.py --target-batch-size 16"
+        success_criteria: "average_batch_size > 12"
+      rollback_command: "python scripts/revert_vllm_config.py"
+    - step_id: memory_optimization
+      name: Memory Optimization
+      executable: true
+      commands:
+        - "python scripts/enable_prefix_caching.py"
+        - "python scripts/configure_swap_space.py --swap-size 4GB"
+      validation:
+        command: "python scripts/check_memory_efficiency.py"
+        success_criteria: "memory_utilization > 0.8 AND memory_utilization < 0.9"
+      rollback_command: "python scripts/disable_optimizations.py"
+
+monitoring:
+  key_metrics:
+    - metric: average_batch_size
+      target: ">16"
+      alert_threshold: "<12"
+    - metric: throughput_tokens_per_second
+      target: ">3000"
+      alert_threshold: "<2000"
+    - metric: gpu_memory_utilization
+      target: "0.8-0.85"
+      alert_threshold: ">0.9"
+  rollback_triggers:
+    - condition: "average_batch_size < 8 for 20 minutes"
+      action: automatic_rollback
+    - condition: "gpu_memory_utilization > 0.95 for 10 minutes"
+      action: automatic_rollback
+    - condition: "throughput_degradation > 30% for 15 minutes"
+      action: alert_and_investigation
+
+results:
+  recent_implementations:
+    - environment: video_streaming_recommendations
+      baseline_throughput: 800
+      optimized_throughput: 3200
+      throughput_improvement: 4.0
+      implementation_days: 8
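For readers wondering how string thresholds such as ">1M", "<8", or "<70%" in environment_match might be compared against observed metrics, here is a minimal sketch. The matchesThreshold helper and the sample metric values are assumptions for illustration; the package's actual matcher may parse these strings differently.

```typescript
// Illustration only: parse a ">N" / "<N" threshold string (optionally with M/K/% suffix)
// and compare it against an observed numeric metric.
function matchesThreshold(observed: number, threshold: string): boolean {
  const m = threshold.match(/^([<>])\s*([\d.]+)\s*([MK%]?)$/i);
  if (!m) return false;
  const suffix = m[3].toUpperCase();
  const scale = suffix === "M" ? 1e6 : suffix === "K" ? 1e3 : 1;
  const limit = parseFloat(m[2]) * scale;
  return m[1] === ">" ? observed > limit : observed < limit;
}

// Example profile values (assumed, not from the package):
console.log(matchesThreshold(2_400_000, ">1M")); // monthly_requests   -> true
console.log(matchesThreshold(5, "<8"));          // current_batch_size -> true
console.log(matchesThreshold(55, "<70%"));       // gpu_utilization    -> true
```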
package/templates/optimizations/vllm-migration-memory-bound.yaml
@@ -0,0 +1,78 @@
+id: vllm-migration-memory-bound
+name: vLLM Migration from Memory-Bound Workloads
+description: Migrate from traditional serving to vLLM for memory-bound inference workloads
+category: runtime_optimization
+confidence: 0.90
+success_count: 1123
+verified_environments: 52
+contributors:
+  - vllm_specialist
+  - migration_engineer
+last_updated: "2025-01-10"
+
+environment_match:
+  current_runtime:
+    - huggingface
+    - pytorch
+  memory_bound: true
+  batch_size: "<4"
+
+optimization:
+  technique: vllm_migration
+  expected_throughput_improvement: "3-6x"
+  expected_cost_reduction: "60-80%"
+  effort_estimate: "1-2 weeks"
+  risk_level: low
+
+economics:
+  projected_improvement:
+    throughput_multiplier: 4.5
+    cost_reduction_percent: 70
+  implementation_cost:
+    engineering_hours: 60
+    total_cost: 12000
+
+implementation:
+  prerequisites:
+    - requirement: "vLLM compatible model"
+      validation_command: "python scripts/check_vllm_compatibility.py --model ./model"
+    - requirement: "GPU with 16GB+ memory"
+  automated_steps:
+    - step_id: compatibility_check
+      name: Compatibility Verification
+      executable: true
+      commands:
+        - "python scripts/verify_model_format.py"
+        - "python scripts/test_vllm_loading.py"
+      validation:
+        command: "python scripts/validate_loading.py"
+        success_criteria: "model_loads_successfully"
+    - step_id: migration
+      name: vLLM Migration
+      executable: true
+      commands:
+        - "python scripts/setup_vllm_server.py --model ./model --tensor-parallel-size 1"
+        - "python scripts/configure_batching.py --max-tokens 8192"
+      validation:
+        command: "python scripts/benchmark_vllm.py"
+        success_criteria: "throughput > baseline * 3"
+      rollback_command: "python scripts/revert_to_original.py"
+
+monitoring:
+  key_metrics:
+    - metric: throughput_rps
+      target: ">baseline * 3"
+      alert_threshold: "<baseline * 2"
+    - metric: latency_p99
+      target: "<baseline * 1.2"
+      alert_threshold: ">baseline * 2"
+  rollback_triggers:
+    - condition: "throughput < baseline for 15 minutes"
+      action: automatic_rollback
+
+results:
+  recent_implementations:
+    - environment: api_inference_service
+      baseline_throughput: 50
+      optimized_throughput: 220
+      improvement_factor: 4.4
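Rough payback arithmetic for the economics block above, as a hedged sketch: only total_cost (12000) and cost_reduction_percent (70) come from the template; the $10,000/month inference spend is an assumed figure for illustration.

```typescript
// Illustration only: months until implementation cost is recovered by monthly savings.
function paybackMonths(monthlySpend: number, costReductionPct: number, implementationCost: number): number {
  const monthlySavings = monthlySpend * (costReductionPct / 100);
  return implementationCost / monthlySavings;
}

// total_cost: 12000, cost_reduction_percent: 70, assumed $10k/month inference spend:
console.log(paybackMonths(10_000, 70, 12_000).toFixed(1)); // ~1.7 months
```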
package/templates/overpowered-extraction.yaml
@@ -0,0 +1,32 @@
+# Based on: https://www.kalmantic.com/posts/gpt5-model-selection-economics-extraction-tasks
+# "Why Premium Models Waste Money on Extraction Tasks"
+
+id: overpowered-extraction
+name: Overpowered Model for Simple Tasks
+version: "1.0"
+category: cost
+severity: warning
+layer: model
+
+match:
+  scope: callsite
+  conditions:
+    - field: model
+      op: in
+      value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3.5-sonnet"]
+    - field: avg_tokens
+      op: lt
+      value: 100
+
+output:
+  headline: "Using {{model}} for {{avg_tokens}}-token outputs"
+  evidence: "{{location}}: Consider gpt-4o-mini or claude-3-haiku for simple extraction tasks"
+
+defaults:
+  small_output_threshold: 100
+  premium_models:
+    - gpt-4o
+    - gpt-4
+    - gpt-4-turbo
+    - claude-3-opus
+    - claude-3.5-sonnet
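As a sketch of how the `in` and `lt` conditions above could be evaluated against a single callsite's aggregated usage. The evaluator and the sample callsite are illustrative assumptions, not the package's implementation; the field names mirror the template.

```typescript
// Illustration only: a hand-rolled check equivalent to this template's two conditions.
interface Callsite {
  model: string;
  avg_tokens: number;
  location: string;
}

function overpoweredExtraction(site: Callsite): boolean {
  const premiumModels = ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3.5-sonnet"];
  return premiumModels.includes(site.model) && site.avg_tokens < 100; // op: in, op: lt (value: 100)
}

const site: Callsite = { model: "gpt-4o", avg_tokens: 42, location: "src/extract.ts:17" }; // assumed example
if (overpoweredExtraction(site)) {
  console.log(`Using ${site.model} for ${site.avg_tokens}-token outputs`); // the template's headline
}
```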
package/templates/overpowered-model.yaml
@@ -0,0 +1,31 @@
+id: overpowered-model
+name: Overpowered Model Detection
+version: "1.0"
+category: waste
+severity: info
+layer: model
+
+source:
+  url: https://openai.com/pricing
+  title: "Model Pricing and Capability Tiers"
+
+match:
+  scope: callsite
+  conditions:
+    - field: model
+      op: in
+      value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3-opus-20240229"]
+    - field: usage.avg_output_tokens
+      op: lt
+      value: 100
+    - field: usage.calls
+      op: gt
+      value: 100
+
+output:
+  headline: "{{model}} used for short outputs (avg {{avg_tokens}} tokens)"
+  evidence: "Premium models have minimum cost overhead regardless of output length"
+
+defaults:
+  output_threshold: 100
+  calls_threshold: 100
package/templates/prompt-bloat.yaml
@@ -0,0 +1,24 @@
+# Based on: https://www.kalmantic.com/posts/system-prompt-optimization-stop-paying-redundant-instructions
+# "Stop Paying 40x More for Redundant AI Instructions"
+
+id: prompt-bloat
+name: Prompt Bloat Detection
+version: "1.0"
+category: cost
+severity: warning
+layer: model
+
+match:
+  scope: callsite
+  conditions:
+    - field: usage.tokens_in
+      op: ratio_gt
+      compare_to: usage.tokens_out
+      value: 20
+
+output:
+  headline: "{{ratio}}x more input than output tokens"
+  evidence: "{{location}}: {{tokens_in}} tokens in → {{tokens_out}} tokens out. Consider prompt optimization."
+
+defaults:
+  input_output_ratio_threshold: 20
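The ratio_gt condition and the {{ratio}} placeholder fit together roughly as follows; this is a minimal sketch in which the sample token counts and the string interpolation are assumptions, not the package's renderer.

```typescript
// Illustration only: ratio_gt compares one field to another via their ratio.
const usage = { tokens_in: 5200, tokens_out: 180 }; // assumed per-callsite aggregates

const ratio = usage.tokens_in / usage.tokens_out;   // ~28.9
const triggered = ratio > 20;                       // ratio_gt with value: 20

const headline = "{{ratio}}x more input than output tokens"
  .replace("{{ratio}}", ratio.toFixed(1));
if (triggered) console.log(headline); // "28.9x more input than output tokens"
```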
package/templates/retry-explosion.yaml
@@ -0,0 +1,28 @@
+# Based on: https://www.kalmantic.com/posts/ai-retry-logic-error-handling-multiplies-costs
+# "How Bad Error Handling Turns $10 Failures into $1000 Bills"
+
+id: retry-explosion
+name: Retry Storm Detection
+version: "1.0"
+category: cost
+severity: critical
+layer: api
+
+match:
+  scope: callsite
+  conditions:
+    - field: usage.calls
+      op: gt
+      value: 10
+    - field: usage.latency_p99
+      op: ratio_gt
+      compare_to: usage.latency_p50
+      value: 5
+
+output:
+  headline: "Possible retry storm at {{location}}"
+  evidence: "{{calls}} calls with p99/p50 ratio of {{ratio}}x - check retry logic"
+
+defaults:
+  min_calls: 10
+  latency_ratio_threshold: 5
package/templates/schema/insight.schema.json
@@ -0,0 +1,113 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/insight.schema.json",
+  "title": "PeakInfer Insight Template",
+  "description": "Schema for insight detection templates",
+  "type": "object",
+  "required": ["id", "version", "name", "category", "severity", "match", "output"],
+  "properties": {
+    "id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9-]*$",
+      "description": "Unique identifier (kebab-case)"
+    },
+    "version": {
+      "type": "string",
+      "pattern": "^\\d+\\.\\d+$",
+      "description": "Template version (semver major.minor)"
+    },
+    "name": {
+      "type": "string",
+      "description": "Human-readable name"
+    },
+    "description": {
+      "type": "string",
+      "description": "Detailed description of what this insight detects"
+    },
+    "source": {
+      "type": "object",
+      "properties": {
+        "url": { "type": "string", "format": "uri" },
+        "title": { "type": "string" }
+      },
+      "description": "Attribution to blog post, research, etc."
+    },
+    "category": {
+      "type": "string",
+      "enum": ["cost", "drift", "performance", "waste", "reliability", "latency", "throughput"],
+      "description": "Primary category for grouping"
+    },
+    "severity": {
+      "type": "string",
+      "enum": ["critical", "warning", "info"],
+      "description": "Impact severity"
+    },
+    "tags": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Searchable tags"
+    },
+    "match": {
+      "type": "object",
+      "required": ["scope", "conditions"],
+      "properties": {
+        "scope": {
+          "type": "string",
+          "enum": ["callsite", "joined", "global", "envelope"],
+          "description": "What data context to evaluate against"
+        },
+        "conditions": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "required": ["field", "op"],
+            "properties": {
+              "field": { "type": "string" },
+              "op": {
+                "type": "string",
+                "enum": ["eq", "neq", "gt", "lt", "gte", "lte", "exists", "in", "ratio_gt", "ratio_lt", "has_pattern"]
+              },
+              "value": {},
+              "compare_to": { "type": "string" },
+              "pattern": { "type": "string" },
+              "count_gt": { "type": "number" }
+            }
+          }
+        }
+      }
+    },
+    "output": {
+      "type": "object",
+      "required": ["headline", "evidence"],
+      "properties": {
+        "headline": {
+          "type": "string",
+          "description": "Short summary with {{variables}}"
+        },
+        "evidence": {
+          "type": "string",
+          "description": "Supporting details with {{variables}}"
+        }
+      }
+    },
+    "recommends": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "optimization": { "type": "string" },
+          "relevance": { "type": "number", "minimum": 0, "maximum": 1 },
+          "reason": { "type": "string" }
+        }
+      },
+      "description": "Links to optimization templates"
+    },
+    "defaults": {
+      "type": "object",
+      "description": "Default threshold values"
+    },
+    "author": { "type": "string" },
+    "created": { "type": "string", "format": "date" },
+    "updated": { "type": "string", "format": "date" }
+  }
+}
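A template file can be checked against this schema with off-the-shelf validators; below is a minimal sketch using ajv, ajv-formats, and js-yaml. The package may wire validation up differently in its own commands; the file paths are taken from this package's layout.

```typescript
// Illustration only: validate a shipped insight template against insight.schema.json.
import Ajv from "ajv";
import addFormats from "ajv-formats";
import { readFileSync } from "node:fs";
import { load } from "js-yaml";

const ajv = new Ajv({ allErrors: true });
addFormats(ajv); // needed for "format": "uri" and "format": "date"

const schema = JSON.parse(readFileSync("templates/schema/insight.schema.json", "utf8"));
const template = load(readFileSync("templates/prompt-bloat.yaml", "utf8"));

const validate = ajv.compile(schema);
if (!validate(template)) {
  console.error(validate.errors);
} else {
  console.log("template conforms to insight.schema.json");
}
```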
package/templates/schema/optimization.schema.json
@@ -0,0 +1,180 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/optimization.schema.json",
+  "title": "PeakInfer Optimization Template",
+  "description": "Schema for optimization recommendation templates",
+  "type": "object",
+  "required": ["id", "name", "description", "category", "optimization", "implementation"],
+  "properties": {
+    "id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9-]*$",
+      "description": "Unique identifier (kebab-case)"
+    },
+    "name": {
+      "type": "string",
+      "description": "Human-readable name"
+    },
+    "description": {
+      "type": "string",
+      "description": "Detailed description of the optimization"
+    },
+    "source": {
+      "type": "object",
+      "properties": {
+        "url": { "type": "string", "format": "uri" },
+        "title": { "type": "string" },
+        "authors": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      },
+      "description": "Attribution to research paper, blog post, etc."
+    },
+    "category": {
+      "type": "string",
+      "enum": [
+        "api_optimization",
+        "memory_optimization",
+        "latency_optimization",
+        "cost_optimization",
+        "reliability_optimization",
+        "throughput_optimization",
+        "serving_optimization"
+      ],
+      "description": "Primary optimization category"
+    },
+    "confidence": {
+      "type": "number",
+      "minimum": 0,
+      "maximum": 1,
+      "description": "Confidence score based on verified implementations"
+    },
+    "success_count": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of successful implementations"
+    },
+    "verified_environments": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of verified deployment environments"
+    },
+    "contributors": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Contributors to this optimization"
+    },
+    "last_updated": {
+      "type": "string",
+      "format": "date",
+      "description": "Last update date"
+    },
+    "environment_match": {
+      "type": "object",
+      "description": "Conditions for when this optimization applies",
+      "properties": {
+        "model_size": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "memory_pressure": { "type": "string" },
+        "quality_tolerance": { "type": "string" },
+        "deployment": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      }
+    },
+    "optimization": {
+      "type": "object",
+      "required": ["technique"],
+      "properties": {
+        "technique": { "type": "string" },
+        "expected_memory_reduction": { "type": "string" },
+        "expected_quality_retention": { "type": "string" },
+        "expected_latency_improvement": { "type": "string" },
+        "expected_cost_reduction": { "type": "string" },
+        "effort_estimate": { "type": "string" },
+        "risk_level": {
+          "type": "string",
+          "enum": ["low", "medium", "high"]
+        }
+      }
+    },
+    "economics": {
+      "type": "object",
+      "properties": {
+        "baseline_calculation": { "type": "object" },
+        "projected_improvement": { "type": "object" },
+        "implementation_cost": { "type": "object" }
+      }
+    },
+    "implementation": {
+      "type": "object",
+      "properties": {
+        "prerequisites": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "requirement": { "type": "string" },
+              "validation_command": { "type": "string" }
+            }
+          }
+        },
+        "automated_steps": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "step_id": { "type": "string" },
+              "name": { "type": "string" },
+              "executable": { "type": "boolean" },
+              "commands": {
+                "type": "array",
+                "items": { "type": "string" }
+              },
+              "validation": { "type": "object" }
+            }
+          }
+        }
+      }
+    },
+    "monitoring": {
+      "type": "object",
+      "properties": {
+        "key_metrics": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "metric": { "type": "string" },
+              "target": { "type": "string" },
+              "alert_threshold": { "type": "string" }
+            }
+          }
+        },
+        "rollback_triggers": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "condition": { "type": "string" },
+              "action": { "type": "string" }
+            }
+          }
+        }
+      }
+    },
+    "results": {
+      "type": "object",
+      "properties": {
+        "recent_implementations": {
+          "type": "array",
+          "items": { "type": "object" }
+        }
+      }
+    }
+  }
+}
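For consumers typing these optimization templates in TypeScript, a hand-written mirror of the schema's top-level shape might look like the sketch below. It is not generated from the schema; the category enum is copied from it, and nested objects are left loose on purpose.

```typescript
// Illustration only: a loose TypeScript shape for optimization templates.
interface OptimizationTemplate {
  id: string;                        // kebab-case, required
  name: string;
  description: string;
  category:
    | "api_optimization" | "memory_optimization" | "latency_optimization"
    | "cost_optimization" | "reliability_optimization"
    | "throughput_optimization" | "serving_optimization";
  confidence?: number;               // 0..1
  success_count?: number;
  verified_environments?: number;
  contributors?: string[];
  last_updated?: string;             // ISO date
  environment_match?: Record<string, unknown>;
  optimization: { technique: string; risk_level?: "low" | "medium" | "high"; [key: string]: unknown };
  economics?: Record<string, unknown>;
  implementation: Record<string, unknown>;
  monitoring?: Record<string, unknown>;
  results?: Record<string, unknown>;
}
```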
package/templates/streaming-drift.yaml
@@ -0,0 +1,30 @@
+id: streaming-drift
+name: Streaming Drift Detection
+version: "1.0"
+category: latency
+severity: critical
+layer: application
+
+source:
+  url: https://anthropic.com/research/streaming-tokens
+  title: "Token Streaming for Real-Time Applications"
+
+match:
+  scope: callsite
+  conditions:
+    - field: patterns.streaming
+      op: eq
+      value: true
+    - field: usage
+      op: exists
+    - field: usage.latency_p99
+      op: ratio_gt
+      compare_to: usage.latency_p50
+      value: 5
+
+output:
+  headline: "Streaming enabled but responses arrive in bursts"
+  evidence: "p99/p50 ratio is {{ratio}}x — true streaming would be under 2x"
+
+defaults:
+  threshold: 2
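The p99/p50 ratio that this template and retry-explosion key on can be computed from raw per-request latencies; below is a minimal sketch with assumed sample values. The package's own aggregation may use a different percentile method.

```typescript
// Illustration only: nearest-rank percentiles over a sorted latency sample.
function percentile(sortedMs: number[], p: number): number {
  const idx = Math.min(sortedMs.length - 1, Math.ceil((p / 100) * sortedMs.length) - 1);
  return sortedMs[Math.max(0, idx)];
}

const latencies = [120, 130, 135, 140, 150, 160, 170, 900, 2400, 3100].sort((a, b) => a - b);
const p50 = percentile(latencies, 50); // 150
const p99 = percentile(latencies, 99); // 3100
console.log(`p99/p50 = ${(p99 / p50).toFixed(1)}x`); // 20.7x, well above the ratio_gt value of 5
```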