npm - @peakinfer/cli - Versions diffs - 1.0.133 - Mend

@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (367) hide show

package/.claude/settings.local.json +8 -0
package/.env.example +6 -0
package/.github/workflows/peakinfer.yml +64 -0
package/CHANGELOG.md +31 -0
package/LICENSE +190 -0
package/README.md +335 -0
package/data/inferencemax.json +274 -0
package/dist/agent-analyzer.d.ts +45 -0
package/dist/agent-analyzer.d.ts.map +1 -0
package/dist/agent-analyzer.js +374 -0
package/dist/agent-analyzer.js.map +1 -0
package/dist/agent.d.ts +76 -0
package/dist/agent.d.ts.map +1 -0
package/dist/agent.js +965 -0
package/dist/agent.js.map +1 -0
package/dist/agents/correlation-analyzer.d.ts +34 -0
package/dist/agents/correlation-analyzer.d.ts.map +1 -0
package/dist/agents/correlation-analyzer.js +261 -0
package/dist/agents/correlation-analyzer.js.map +1 -0
package/dist/agents/index.d.ts +91 -0
package/dist/agents/index.d.ts.map +1 -0
package/dist/agents/index.js +111 -0
package/dist/agents/index.js.map +1 -0
package/dist/agents/runtime-analyzer.d.ts +38 -0
package/dist/agents/runtime-analyzer.d.ts.map +1 -0
package/dist/agents/runtime-analyzer.js +244 -0
package/dist/agents/runtime-analyzer.js.map +1 -0
package/dist/analysis-types.d.ts +500 -0
package/dist/analysis-types.d.ts.map +1 -0
package/dist/analysis-types.js +11 -0
package/dist/analysis-types.js.map +1 -0
package/dist/analytics.d.ts +25 -0
package/dist/analytics.d.ts.map +1 -0
package/dist/analytics.js +94 -0
package/dist/analytics.js.map +1 -0
package/dist/analyzer.d.ts +48 -0
package/dist/analyzer.d.ts.map +1 -0
package/dist/analyzer.js +547 -0
package/dist/analyzer.js.map +1 -0
package/dist/artifacts.d.ts +44 -0
package/dist/artifacts.d.ts.map +1 -0
package/dist/artifacts.js +165 -0
package/dist/artifacts.js.map +1 -0
package/dist/benchmarks/index.d.ts +88 -0
package/dist/benchmarks/index.d.ts.map +1 -0
package/dist/benchmarks/index.js +205 -0
package/dist/benchmarks/index.js.map +1 -0
package/dist/cli.d.ts +3 -0
package/dist/cli.d.ts.map +1 -0
package/dist/cli.js +427 -0
package/dist/cli.js.map +1 -0
package/dist/commands/ci.d.ts +19 -0
package/dist/commands/ci.d.ts.map +1 -0
package/dist/commands/ci.js +253 -0
package/dist/commands/ci.js.map +1 -0
package/dist/commands/config.d.ts +16 -0
package/dist/commands/config.d.ts.map +1 -0
package/dist/commands/config.js +249 -0
package/dist/commands/config.js.map +1 -0
package/dist/commands/demo.d.ts +15 -0
package/dist/commands/demo.d.ts.map +1 -0
package/dist/commands/demo.js +106 -0
package/dist/commands/demo.js.map +1 -0
package/dist/commands/export.d.ts +14 -0
package/dist/commands/export.d.ts.map +1 -0
package/dist/commands/export.js +209 -0
package/dist/commands/export.js.map +1 -0
package/dist/commands/history.d.ts +15 -0
package/dist/commands/history.d.ts.map +1 -0
package/dist/commands/history.js +389 -0
package/dist/commands/history.js.map +1 -0
package/dist/commands/template.d.ts +14 -0
package/dist/commands/template.d.ts.map +1 -0
package/dist/commands/template.js +341 -0
package/dist/commands/template.js.map +1 -0
package/dist/commands/validate-map.d.ts +12 -0
package/dist/commands/validate-map.d.ts.map +1 -0
package/dist/commands/validate-map.js +274 -0
package/dist/commands/validate-map.js.map +1 -0
package/dist/commands/whatif.d.ts +17 -0
package/dist/commands/whatif.d.ts.map +1 -0
package/dist/commands/whatif.js +206 -0
package/dist/commands/whatif.js.map +1 -0
package/dist/comparison.d.ts +38 -0
package/dist/comparison.d.ts.map +1 -0
package/dist/comparison.js +223 -0
package/dist/comparison.js.map +1 -0
package/dist/config.d.ts +42 -0
package/dist/config.d.ts.map +1 -0
package/dist/config.js +158 -0
package/dist/config.js.map +1 -0
package/dist/connectors/helicone.d.ts +9 -0
package/dist/connectors/helicone.d.ts.map +1 -0
package/dist/connectors/helicone.js +106 -0
package/dist/connectors/helicone.js.map +1 -0
package/dist/connectors/index.d.ts +37 -0
package/dist/connectors/index.d.ts.map +1 -0
package/dist/connectors/index.js +65 -0
package/dist/connectors/index.js.map +1 -0
package/dist/connectors/langsmith.d.ts +9 -0
package/dist/connectors/langsmith.d.ts.map +1 -0
package/dist/connectors/langsmith.js +122 -0
package/dist/connectors/langsmith.js.map +1 -0
package/dist/connectors/types.d.ts +83 -0
package/dist/connectors/types.d.ts.map +1 -0
package/dist/connectors/types.js +98 -0
package/dist/connectors/types.js.map +1 -0
package/dist/cost-estimator.d.ts +46 -0
package/dist/cost-estimator.d.ts.map +1 -0
package/dist/cost-estimator.js +104 -0
package/dist/cost-estimator.js.map +1 -0
package/dist/costs.d.ts +57 -0
package/dist/costs.d.ts.map +1 -0
package/dist/costs.js +251 -0
package/dist/costs.js.map +1 -0
package/dist/counterfactuals.d.ts +29 -0
package/dist/counterfactuals.d.ts.map +1 -0
package/dist/counterfactuals.js +448 -0
package/dist/counterfactuals.js.map +1 -0
package/dist/enhancement-prompts.d.ts +41 -0
package/dist/enhancement-prompts.d.ts.map +1 -0
package/dist/enhancement-prompts.js +88 -0
package/dist/enhancement-prompts.js.map +1 -0
package/dist/envelopes.d.ts +20 -0
package/dist/envelopes.d.ts.map +1 -0
package/dist/envelopes.js +790 -0
package/dist/envelopes.js.map +1 -0
package/dist/format-normalizer.d.ts +71 -0
package/dist/format-normalizer.d.ts.map +1 -0
package/dist/format-normalizer.js +1331 -0
package/dist/format-normalizer.js.map +1 -0
package/dist/history.d.ts +79 -0
package/dist/history.d.ts.map +1 -0
package/dist/history.js +313 -0
package/dist/history.js.map +1 -0
package/dist/html.d.ts +11 -0
package/dist/html.d.ts.map +1 -0
package/dist/html.js +463 -0
package/dist/html.js.map +1 -0
package/dist/impact.d.ts +42 -0
package/dist/impact.d.ts.map +1 -0
package/dist/impact.js +443 -0
package/dist/impact.js.map +1 -0
package/dist/index.d.ts +26 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +34 -0
package/dist/index.js.map +1 -0
package/dist/insights.d.ts +5 -0
package/dist/insights.d.ts.map +1 -0
package/dist/insights.js +271 -0
package/dist/insights.js.map +1 -0
package/dist/joiner.d.ts +9 -0
package/dist/joiner.d.ts.map +1 -0
package/dist/joiner.js +247 -0
package/dist/joiner.js.map +1 -0
package/dist/orchestrator.d.ts +34 -0
package/dist/orchestrator.d.ts.map +1 -0
package/dist/orchestrator.js +827 -0
package/dist/orchestrator.js.map +1 -0
package/dist/pdf.d.ts +26 -0
package/dist/pdf.d.ts.map +1 -0
package/dist/pdf.js +84 -0
package/dist/pdf.js.map +1 -0
package/dist/prediction.d.ts +33 -0
package/dist/prediction.d.ts.map +1 -0
package/dist/prediction.js +316 -0
package/dist/prediction.js.map +1 -0
package/dist/prompts/loader.d.ts +38 -0
package/dist/prompts/loader.d.ts.map +1 -0
package/dist/prompts/loader.js +60 -0
package/dist/prompts/loader.js.map +1 -0
package/dist/renderer.d.ts +64 -0
package/dist/renderer.d.ts.map +1 -0
package/dist/renderer.js +923 -0
package/dist/renderer.js.map +1 -0
package/dist/runid.d.ts +57 -0
package/dist/runid.d.ts.map +1 -0
package/dist/runid.js +199 -0
package/dist/runid.js.map +1 -0
package/dist/runtime.d.ts +29 -0
package/dist/runtime.d.ts.map +1 -0
package/dist/runtime.js +366 -0
package/dist/runtime.js.map +1 -0
package/dist/scanner.d.ts +11 -0
package/dist/scanner.d.ts.map +1 -0
package/dist/scanner.js +426 -0
package/dist/scanner.js.map +1 -0
package/dist/templates.d.ts +120 -0
package/dist/templates.d.ts.map +1 -0
package/dist/templates.js +429 -0
package/dist/templates.js.map +1 -0
package/dist/tools/index.d.ts +153 -0
package/dist/tools/index.d.ts.map +1 -0
package/dist/tools/index.js +177 -0
package/dist/tools/index.js.map +1 -0
package/dist/types.d.ts +3647 -0
package/dist/types.d.ts.map +1 -0
package/dist/types.js +703 -0
package/dist/types.js.map +1 -0
package/dist/version.d.ts +7 -0
package/dist/version.d.ts.map +1 -0
package/dist/version.js +23 -0
package/dist/version.js.map +1 -0
package/docs/demo-guide.md +423 -0
package/docs/events-format.md +295 -0
package/docs/inferencemap-spec.md +344 -0
package/docs/migration-v2.md +293 -0
package/fixtures/demo/precomputed.json +142 -0
package/fixtures/demo-project/README.md +52 -0
package/fixtures/demo-project/ai-service.ts +65 -0
package/fixtures/demo-project/sample-events.jsonl +15 -0
package/fixtures/demo-project/src/ai-service.ts +128 -0
package/fixtures/demo-project/src/llm-client.ts +155 -0
package/package.json +65 -0
package/prompts/agent-analyzer.yaml +47 -0
package/prompts/ci-gate.yaml +98 -0
package/prompts/correlation-analyzer.yaml +178 -0
package/prompts/format-normalizer.yaml +46 -0
package/prompts/peak-performance.yaml +180 -0
package/prompts/pr-comment.yaml +111 -0
package/prompts/runtime-analyzer.yaml +189 -0
package/prompts/unified-analyzer.yaml +241 -0
package/schemas/inference-map.v0.1.json +215 -0
package/scripts/benchmark.ts +394 -0
package/scripts/demo-v1.5.sh +158 -0
package/scripts/sync-from-site.sh +197 -0
package/scripts/validate-sync.sh +178 -0
package/src/agent-analyzer.ts +481 -0
package/src/agent.ts +1232 -0
package/src/agents/correlation-analyzer.ts +353 -0
package/src/agents/index.ts +235 -0
package/src/agents/runtime-analyzer.ts +343 -0
package/src/analysis-types.ts +558 -0
package/src/analytics.ts +100 -0
package/src/analyzer.ts +692 -0
package/src/artifacts.ts +218 -0
package/src/benchmarks/index.ts +309 -0
package/src/cli.ts +503 -0
package/src/commands/ci.ts +336 -0
package/src/commands/config.ts +288 -0
package/src/commands/demo.ts +175 -0
package/src/commands/export.ts +297 -0
package/src/commands/history.ts +425 -0
package/src/commands/template.ts +385 -0
package/src/commands/validate-map.ts +324 -0
package/src/commands/whatif.ts +272 -0
package/src/comparison.ts +283 -0
package/src/config.ts +188 -0
package/src/connectors/helicone.ts +164 -0
package/src/connectors/index.ts +93 -0
package/src/connectors/langsmith.ts +179 -0
package/src/connectors/types.ts +180 -0
package/src/cost-estimator.ts +146 -0
package/src/costs.ts +347 -0
package/src/counterfactuals.ts +516 -0
package/src/enhancement-prompts.ts +118 -0
package/src/envelopes.ts +814 -0
package/src/format-normalizer.ts +1486 -0
package/src/history.ts +400 -0
package/src/html.ts +512 -0
package/src/impact.ts +522 -0
package/src/index.ts +83 -0
package/src/insights.ts +341 -0
package/src/joiner.ts +289 -0
package/src/orchestrator.ts +1015 -0
package/src/pdf.ts +110 -0
package/src/prediction.ts +392 -0
package/src/prompts/loader.ts +88 -0
package/src/renderer.ts +1045 -0
package/src/runid.ts +261 -0
package/src/runtime.ts +450 -0
package/src/scanner.ts +508 -0
package/src/templates.ts +561 -0
package/src/tools/index.ts +214 -0
package/src/types.ts +873 -0
package/src/version.ts +24 -0
package/templates/context-accumulation.yaml +23 -0
package/templates/cost-concentration.yaml +20 -0
package/templates/dead-code.yaml +20 -0
package/templates/latency-explainer.yaml +23 -0
package/templates/optimizations/ab-testing-framework.yaml +74 -0
package/templates/optimizations/api-gateway-optimization.yaml +81 -0
package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
package/templates/optimizations/comprehensive-apm.yaml +76 -0
package/templates/optimizations/context-window-optimization.yaml +91 -0
package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
package/templates/optimizations/distributed-training-optimization.yaml +77 -0
package/templates/optimizations/document-analysis-edge.yaml +77 -0
package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
package/templates/optimizations/domain-specific-distillation.yaml +78 -0
package/templates/optimizations/error-handling-optimization.yaml +76 -0
package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
package/templates/optimizations/long-context-memory-management.yaml +78 -0
package/templates/optimizations/max-tokens-optimization.yaml +76 -0
package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
package/templates/optimizations/multi-framework-resilience.yaml +75 -0
package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
package/templates/optimizations/quality-monitoring.yaml +74 -0
package/templates/optimizations/realtime-budget-controls.yaml +74 -0
package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
package/templates/optimizations/smart-model-routing.yaml +96 -0
package/templates/optimizations/streaming-batch-selection.yaml +167 -0
package/templates/optimizations/system-prompt-optimization.yaml +75 -0
package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
package/templates/overpowered-extraction.yaml +32 -0
package/templates/overpowered-model.yaml +31 -0
package/templates/prompt-bloat.yaml +24 -0
package/templates/retry-explosion.yaml +28 -0
package/templates/schema/insight.schema.json +113 -0
package/templates/schema/optimization.schema.json +180 -0
package/templates/streaming-drift.yaml +30 -0
package/templates/throughput-gap.yaml +21 -0
package/templates/token-underutilization.yaml +28 -0
package/templates/untested-fallback.yaml +21 -0
package/tests/accuracy/drift-detection.test.ts +184 -0
package/tests/accuracy/false-positives.test.ts +166 -0
package/tests/accuracy/templates.test.ts +205 -0
package/tests/action/commands.test.ts +125 -0
package/tests/action/comments.test.ts +347 -0
package/tests/cli.test.ts +203 -0
package/tests/comparison.test.ts +309 -0
package/tests/correlation-analyzer.test.ts +534 -0
package/tests/counterfactuals.test.ts +347 -0
package/tests/fixtures/events/missing-id.jsonl +1 -0
package/tests/fixtures/events/missing-input.jsonl +1 -0
package/tests/fixtures/events/missing-latency.jsonl +1 -0
package/tests/fixtures/events/missing-model.jsonl +1 -0
package/tests/fixtures/events/missing-output.jsonl +1 -0
package/tests/fixtures/events/missing-provider.jsonl +1 -0
package/tests/fixtures/events/missing-ts.jsonl +1 -0
package/tests/fixtures/events/valid.csv +3 -0
package/tests/fixtures/events/valid.json +1 -0
package/tests/fixtures/events/valid.jsonl +2 -0
package/tests/fixtures/events/with-callsite.jsonl +1 -0
package/tests/fixtures/events/with-intent.jsonl +1 -0
package/tests/fixtures/events/wrong-type.jsonl +1 -0
package/tests/fixtures/repos/empty/.gitkeep +0 -0
package/tests/fixtures/repos/hybrid-router/router.py +35 -0
package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
package/tests/fixtures/repos/saas-openai/client.py +26 -0
package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
package/tests/github-action.test.ts +292 -0
package/tests/insights.test.ts +878 -0
package/tests/joiner.test.ts +168 -0
package/tests/performance/action-latency.test.ts +132 -0
package/tests/performance/benchmark.test.ts +189 -0
package/tests/performance/cli-latency.test.ts +102 -0
package/tests/pr-comment.test.ts +313 -0
package/tests/prediction.test.ts +296 -0
package/tests/runtime-analyzer.test.ts +375 -0
package/tests/runtime.test.ts +205 -0
package/tests/scanner.test.ts +122 -0
package/tests/template-conformance.test.ts +526 -0
package/tests/unit/cost-calculator.test.ts +303 -0
package/tests/unit/credits.test.ts +180 -0
package/tests/unit/inference-map.test.ts +276 -0
package/tests/unit/schema.test.ts +300 -0
package/tsconfig.json +20 -0
package/vitest.config.ts +14 -0

package/templates/optimizations/tensorrt-llm-performance.yaml ADDED Viewed

@@ -0,0 +1,77 @@
+id: tensorrt-llm-performance
+name: TensorRT-LLM Maximum Performance
+description: Deploy models with TensorRT-LLM for maximum inference performance on NVIDIA GPUs
+category: runtime_optimization
+confidence: 0.88
+success_count: 789
+verified_environments: 38
+contributors:
+  - tensorrt_specialist
+  - nvidia_partner
+last_updated: "2025-01-03"
+environment_match:
+  gpu_vendor: nvidia
+  performance_requirement: maximum
+  model_support: transformer
+optimization:
+  technique: tensorrt_llm_deployment
+  expected_throughput_improvement: "3-5x"
+  expected_latency_improvement: "50-70%"
+  effort_estimate: "2-3 weeks"
+  risk_level: medium
+economics:
+  projected_improvement:
+    throughput_multiplier: 4
+    cost_per_token_reduction: 0.75
+  implementation_cost:
+    engineering_hours: 120
+    total_cost: 24000
+implementation:
+  prerequisites:
+    - requirement: "NVIDIA GPU (A100/H100 recommended)"
+    - requirement: "TensorRT-LLM installation"
+      validation_command: "python -c 'import tensorrt_llm'"
+    - requirement: "Sufficient disk space for compiled engines"
+  automated_steps:
+    - step_id: engine_build
+      name: TensorRT Engine Build
+      executable: true
+      commands:
+        - "python scripts/convert_to_trt.py --model ./model --dtype fp16"
+        - "python scripts/build_trt_engine.py --max-batch-size 64 --max-input-len 2048"
+      validation:
+        command: "python scripts/validate_engine.py"
+        success_criteria: "engine_valid AND accuracy > 0.99"
+    - step_id: deployment
+      name: TensorRT Deployment
+      executable: true
+      commands:
+        - "python scripts/deploy_trt_server.py --engine ./engine"
+        - "python scripts/configure_inflight_batching.py"
+      validation:
+        command: "python scripts/benchmark_trt.py"
+        success_criteria: "throughput > baseline * 3"
+        rollback_command: "python scripts/fallback_to_pytorch.py"
+monitoring:
+  key_metrics:
+    - metric: throughput_tokens_per_second
+      target: ">5000"
+      alert_threshold: "<3000"
+    - metric: gpu_memory_utilization
+      target: "70-85%"
+      alert_threshold: ">95%"
+  rollback_triggers:
+    - condition: "engine_error_rate > 0.01 for 5 minutes"
+      action: automatic_rollback
+results:
+  recent_implementations:
+    - environment: high_volume_api
+      baseline_throughput: 800
+      optimized_throughput: 3500
+      improvement_factor: 4.4

package/templates/optimizations/vllm-high-throughput-optimization.yaml ADDED Viewed

@@ -0,0 +1,93 @@
+id: vllm-high-throughput-optimization
+name: vLLM Continuous Batching for High-Volume Production
+description: Optimize vLLM deployment for maximum throughput in high-traffic scenarios
+category: batching_optimization
+confidence: 0.91
+success_count: 1923
+verified_environments: 67
+contributors:
+  - scaling_team
+  - vllm_expert
+  - production_engineer
+last_updated: "2025-01-14"
+environment_match:
+  runtime: vllm
+  monthly_requests: ">1M"
+  current_batch_size: "<8"
+  gpu_utilization: "<70%"
+  latency_requirements: flexible
+optimization:
+  technique: continuous_batching
+  expected_throughput_improvement: "3-5x"
+  expected_cost_reduction: "60-75%"
+  effort_estimate: "1-2 weeks"
+  risk_level: low
+economics:
+  baseline_calculation:
+    current_throughput_factor: 1.0
+  projected_improvement:
+    new_throughput_factor: 4.0
+    gpu_reduction_factor: 0.25
+  implementation_cost:
+    engineering_hours: 80
+    total_cost: 16000
+implementation:
+  prerequisites:
+    - requirement: "vLLM 0.2.7+"
+      validation_command: "python -c 'import vllm; print(vllm.__version__)'"
+    - requirement: "CUDA 11.8+"
+      validation_command: "nvcc --version | grep -E 'release (11[.]8|1[2-9][.])'"
+    - requirement: "16GB+ GPU memory"
+      validation_command: "nvidia-smi --query-gpu=memory.total --format=csv,noheader | awk '{if($1<16000) exit 1}'"
+  automated_steps:
+    - step_id: batch_configuration
+      name: Optimal Batch Configuration
+      executable: true
+      commands:
+        - "python scripts/configure_vllm.py --max-num-batched-tokens 8192 --max-num-seqs 32"
+        - "python scripts/start_vllm_server.py --model meta-llama/Llama-2-7b-hf --gpu-memory-utilization 0.85"
+      validation:
+        command: "python scripts/test_batch_performance.py --target-batch-size 16"
+        success_criteria: "average_batch_size > 12"
+        rollback_command: "python scripts/revert_vllm_config.py"
+    - step_id: memory_optimization
+      name: Memory Optimization
+      executable: true
+      commands:
+        - "python scripts/enable_prefix_caching.py"
+        - "python scripts/configure_swap_space.py --swap-size 4GB"
+      validation:
+        command: "python scripts/check_memory_efficiency.py"
+        success_criteria: "memory_utilization > 0.8 AND memory_utilization < 0.9"
+        rollback_command: "python scripts/disable_optimizations.py"
+monitoring:
+  key_metrics:
+    - metric: average_batch_size
+      target: ">16"
+      alert_threshold: "<12"
+    - metric: throughput_tokens_per_second
+      target: ">3000"
+      alert_threshold: "<2000"
+    - metric: gpu_memory_utilization
+      target: "0.8-0.85"
+      alert_threshold: ">0.9"
+  rollback_triggers:
+    - condition: "average_batch_size < 8 for 20 minutes"
+      action: automatic_rollback
+    - condition: "gpu_memory_utilization > 0.95 for 10 minutes"
+      action: automatic_rollback
+    - condition: "throughput_degradation > 30% for 15 minutes"
+      action: alert_and_investigation
+results:
+  recent_implementations:
+    - environment: video_streaming_recommendations
+      baseline_throughput: 800
+      optimized_throughput: 3200
+      throughput_improvement: 4.0
+      implementation_days: 8

package/templates/optimizations/vllm-migration-memory-bound.yaml ADDED Viewed

@@ -0,0 +1,78 @@
+id: vllm-migration-memory-bound
+name: vLLM Migration for Memory-Bound Workloads
+description: Migrate from traditional serving to vLLM for memory-bound inference workloads
+category: runtime_optimization
+confidence: 0.90
+success_count: 1123
+verified_environments: 52
+contributors:
+  - vllm_specialist
+  - migration_engineer
+last_updated: "2025-01-10"
+environment_match:
+  current_runtime:
+    - huggingface
+    - pytorch
+  memory_bound: true
+  batch_size: "<4"
+optimization:
+  technique: vllm_migration
+  expected_throughput_improvement: "3-6x"
+  expected_cost_reduction: "60-80%"
+  effort_estimate: "1-2 weeks"
+  risk_level: low
+economics:
+  projected_improvement:
+    throughput_multiplier: 4.5
+    cost_reduction_percent: 70
+  implementation_cost:
+    engineering_hours: 60
+    total_cost: 12000
+implementation:
+  prerequisites:
+    - requirement: "vLLM compatible model"
+      validation_command: "python scripts/check_vllm_compatibility.py --model ./model"
+    - requirement: "GPU with 16GB+ memory"
+  automated_steps:
+    - step_id: compatibility_check
+      name: Compatibility Verification
+      executable: true
+      commands:
+        - "python scripts/verify_model_format.py"
+        - "python scripts/test_vllm_loading.py"
+      validation:
+        command: "python scripts/validate_loading.py"
+        success_criteria: "model_loads_successfully"
+    - step_id: migration
+      name: vLLM Migration
+      executable: true
+      commands:
+        - "python scripts/setup_vllm_server.py --model ./model --tensor-parallel-size 1"
+        - "python scripts/configure_batching.py --max-tokens 8192"
+      validation:
+        command: "python scripts/benchmark_vllm.py"
+        success_criteria: "throughput > baseline * 3"
+        rollback_command: "python scripts/revert_to_original.py"
+monitoring:
+  key_metrics:
+    - metric: throughput_rps
+      target: ">baseline * 3"
+      alert_threshold: "<baseline * 2"
+    - metric: latency_p99
+      target: "<baseline * 1.2"
+      alert_threshold: ">baseline * 2"
+  rollback_triggers:
+    - condition: "throughput < baseline for 15 minutes"
+      action: automatic_rollback
+results:
+  recent_implementations:
+    - environment: api_inference_service
+      baseline_throughput: 50
+      optimized_throughput: 220
+      improvement_factor: 4.4

package/templates/overpowered-extraction.yaml ADDED Viewed

@@ -0,0 +1,32 @@
+# Based on: https://www.kalmantic.com/posts/gpt5-model-selection-economics-extraction-tasks
+# "Why Premium Models Waste Money on Extraction Tasks"
+id: overpowered-extraction
+name: Overpowered Model for Simple Tasks
+version: "1.0"
+category: cost
+severity: warning
+layer: model
+match:
+  scope: callsite
+  conditions:
+    - field: model
+      op: in
+      value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3.5-sonnet"]
+    - field: avg_tokens
+      op: lt
+      value: 100
+output:
+  headline: "Using {{model}} for {{avg_tokens}}-token outputs"
+  evidence: "{{location}}: Consider gpt-4o-mini or claude-3-haiku for simple extraction tasks"
+defaults:
+  small_output_threshold: 100
+  premium_models:
+    - gpt-4o
+    - gpt-4
+    - gpt-4-turbo
+    - claude-3-opus
+    - claude-3.5-sonnet

package/templates/overpowered-model.yaml ADDED Viewed

@@ -0,0 +1,31 @@
+id: overpowered-model
+name: Overpowered Model Detection
+version: "1.0"
+category: waste
+severity: info
+layer: model
+source:
+  url: https://openai.com/pricing
+  title: "Model Pricing and Capability Tiers"
+match:
+  scope: callsite
+  conditions:
+    - field: model
+      op: in
+      value: ["gpt-4o", "gpt-4", "gpt-4-turbo", "claude-3-opus", "claude-3-opus-20240229"]
+    - field: usage.avg_output_tokens
+      op: lt
+      value: 100
+    - field: usage.calls
+      op: gt
+      value: 100
+output:
+  headline: "{{model}} used for short outputs (avg {{avg_tokens}} tokens)"
+  evidence: "Premium models have minimum cost overhead regardless of output length"
+defaults:
+  output_threshold: 100
+  calls_threshold: 100

package/templates/prompt-bloat.yaml ADDED Viewed

@@ -0,0 +1,24 @@
+# Based on: https://www.kalmantic.com/posts/system-prompt-optimization-stop-paying-redundant-instructions
+# "Stop Paying 40x More for Redundant AI Instructions"
+id: prompt-bloat
+name: Prompt Bloat Detection
+version: "1.0"
+category: cost
+severity: warning
+layer: model
+match:
+  scope: callsite
+  conditions:
+    - field: usage.tokens_in
+      op: ratio_gt
+      compare_to: usage.tokens_out
+      value: 20
+output:
+  headline: "{{ratio}}x more input than output tokens"
+  evidence: "{{location}}: {{tokens_in}} tokens in → {{tokens_out}} tokens out. Consider prompt optimization."
+defaults:
+  input_output_ratio_threshold: 20

package/templates/retry-explosion.yaml ADDED Viewed

@@ -0,0 +1,28 @@
+# Based on: https://www.kalmantic.com/posts/ai-retry-logic-error-handling-multiplies-costs
+# "How Bad Error Handling Turns $10 Failures into $1000 Bills"
+id: retry-explosion
+name: Retry Storm Detection
+version: "1.0"
+category: cost
+severity: critical
+layer: api
+match:
+  scope: callsite
+  conditions:
+    - field: usage.calls
+      op: gt
+      value: 10
+    - field: usage.latency_p99
+      op: ratio_gt
+      compare_to: usage.latency_p50
+      value: 5
+output:
+  headline: "Possible retry storm at {{location}}"
+  evidence: "{{calls}} calls with p99/p50 ratio of {{ratio}}x - check retry logic"
+defaults:
+  min_calls: 10
+  latency_ratio_threshold: 5

package/templates/schema/insight.schema.json ADDED Viewed

@@ -0,0 +1,113 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/insight.schema.json",
+  "title": "PeakInfer Insight Template",
+  "description": "Schema for insight detection templates",
+  "type": "object",
+  "required": ["id", "version", "name", "category", "severity", "match", "output"],
+  "properties": {
+    "id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9-]*$",
+      "description": "Unique identifier (kebab-case)"
+    },
+    "version": {
+      "type": "string",
+      "pattern": "^\\d+\\.\\d+$",
+      "description": "Template version as major.minor (two numeric components, e.g. 1.0)"
+    },
+    "name": {
+      "type": "string",
+      "description": "Human-readable name"
+    },
+    "description": {
+      "type": "string",
+      "description": "Detailed description of what this insight detects"
+    },
+    "source": {
+      "type": "object",
+      "properties": {
+        "url": { "type": "string", "format": "uri" },
+        "title": { "type": "string" }
+      },
+      "description": "Attribution to blog post, research, etc."
+    },
+    "category": {
+      "type": "string",
+      "enum": ["cost", "drift", "performance", "waste", "reliability", "latency", "throughput"],
+      "description": "Primary category for grouping"
+    },
+    "severity": {
+      "type": "string",
+      "enum": ["critical", "warning", "info"],
+      "description": "Impact severity"
+    },
+    "tags": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Searchable tags"
+    },
+    "match": {
+      "type": "object",
+      "required": ["scope", "conditions"],
+      "properties": {
+        "scope": {
+          "type": "string",
+          "enum": ["callsite", "joined", "global", "envelope"],
+          "description": "What data context to evaluate against"
+        },
+        "conditions": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "required": ["field", "op"],
+            "properties": {
+              "field": { "type": "string" },
+              "op": {
+                "type": "string",
+                "enum": ["eq", "neq", "gt", "lt", "gte", "lte", "exists", "in", "ratio_gt", "ratio_lt", "has_pattern"]
+              },
+              "value": {},
+              "compare_to": { "type": "string" },
+              "pattern": { "type": "string" },
+              "count_gt": { "type": "number" }
+            }
+          }
+        }
+      }
+    },
+    "output": {
+      "type": "object",
+      "required": ["headline", "evidence"],
+      "properties": {
+        "headline": {
+          "type": "string",
+          "description": "Short summary with {{variables}}"
+        },
+        "evidence": {
+          "type": "string",
+          "description": "Supporting details with {{variables}}"
+        }
+      }
+    },
+    "recommends": {
+      "type": "array",
+      "items": {
+        "type": "object",
+        "properties": {
+          "optimization": { "type": "string" },
+          "relevance": { "type": "number", "minimum": 0, "maximum": 1 },
+          "reason": { "type": "string" }
+        }
+      },
+      "description": "Links to optimization templates"
+    },
+    "defaults": {
+      "type": "object",
+      "description": "Default threshold values"
+    },
+    "author": { "type": "string" },
+    "created": { "type": "string", "format": "date" },
+    "updated": { "type": "string", "format": "date" }
+  }
+}

package/templates/schema/optimization.schema.json ADDED Viewed

@@ -0,0 +1,180 @@
+{
+  "$schema": "http://json-schema.org/draft-07/schema#",
+  "$id": "https://github.com/Kalmantic/peakinfer_templates/schema/optimization.schema.json",
+  "title": "PeakInfer Optimization Template",
+  "description": "Schema for optimization recommendation templates",
+  "type": "object",
+  "required": ["id", "name", "description", "category", "optimization", "implementation"],
+  "properties": {
+    "id": {
+      "type": "string",
+      "pattern": "^[a-z][a-z0-9-]*$",
+      "description": "Unique identifier (kebab-case)"
+    },
+    "name": {
+      "type": "string",
+      "description": "Human-readable name"
+    },
+    "description": {
+      "type": "string",
+      "description": "Detailed description of the optimization"
+    },
+    "source": {
+      "type": "object",
+      "properties": {
+        "url": { "type": "string", "format": "uri" },
+        "title": { "type": "string" },
+        "authors": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      },
+      "description": "Attribution to research paper, blog post, etc."
+    },
+    "category": {
+      "type": "string",
+      "enum": [
+        "api_optimization",
+        "memory_optimization",
+        "latency_optimization",
+        "cost_optimization",
+        "reliability_optimization",
+        "throughput_optimization",
+        "serving_optimization"
+      ],
+      "description": "Primary optimization category"
+    },
+    "confidence": {
+      "type": "number",
+      "minimum": 0,
+      "maximum": 1,
+      "description": "Confidence score based on verified implementations"
+    },
+    "success_count": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of successful implementations"
+    },
+    "verified_environments": {
+      "type": "integer",
+      "minimum": 0,
+      "description": "Number of verified deployment environments"
+    },
+    "contributors": {
+      "type": "array",
+      "items": { "type": "string" },
+      "description": "Contributors to this optimization"
+    },
+    "last_updated": {
+      "type": "string",
+      "format": "date",
+      "description": "Last update date"
+    },
+    "environment_match": {
+      "type": "object",
+      "description": "Conditions for when this optimization applies",
+      "properties": {
+        "model_size": {
+          "type": "array",
+          "items": { "type": "string" }
+        },
+        "memory_pressure": { "type": "string" },
+        "quality_tolerance": { "type": "string" },
+        "deployment": {
+          "type": "array",
+          "items": { "type": "string" }
+        }
+      }
+    },
+    "optimization": {
+      "type": "object",
+      "required": ["technique"],
+      "properties": {
+        "technique": { "type": "string" },
+        "expected_memory_reduction": { "type": "string" },
+        "expected_quality_retention": { "type": "string" },
+        "expected_latency_improvement": { "type": "string" },
+        "expected_cost_reduction": { "type": "string" },
+        "effort_estimate": { "type": "string" },
+        "risk_level": {
+          "type": "string",
+          "enum": ["low", "medium", "high"]
+        }
+      }
+    },
+    "economics": {
+      "type": "object",
+      "properties": {
+        "baseline_calculation": { "type": "object" },
+        "projected_improvement": { "type": "object" },
+        "implementation_cost": { "type": "object" }
+      }
+    },
+    "implementation": {
+      "type": "object",
+      "properties": {
+        "prerequisites": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "requirement": { "type": "string" },
+              "validation_command": { "type": "string" }
+            }
+          }
+        },
+        "automated_steps": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "step_id": { "type": "string" },
+              "name": { "type": "string" },
+              "executable": { "type": "boolean" },
+              "commands": {
+                "type": "array",
+                "items": { "type": "string" }
+              },
+              "validation": { "type": "object" }
+            }
+          }
+        }
+      }
+    },
+    "monitoring": {
+      "type": "object",
+      "properties": {
+        "key_metrics": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "metric": { "type": "string" },
+              "target": { "type": "string" },
+              "alert_threshold": { "type": "string" }
+            }
+          }
+        },
+        "rollback_triggers": {
+          "type": "array",
+          "items": {
+            "type": "object",
+            "properties": {
+              "condition": { "type": "string" },
+              "action": { "type": "string" }
+            }
+          }
+        }
+      }
+    },
+    "results": {
+      "type": "object",
+      "properties": {
+        "recent_implementations": {
+          "type": "array",
+          "items": { "type": "object" }
+        }
+      }
+    }
+  }
+}

package/templates/streaming-drift.yaml ADDED Viewed

@@ -0,0 +1,30 @@
+id: streaming-drift  # same stem as the file name, streaming-drift.yaml
+name: Streaming Drift Detection
+version: "1.0"  # quoted so it loads as the string "1.0", not the float 1.0
+category: latency
+severity: critical
+layer: application
+source:  # presumably attribution for the technique — confirm against the template schema
+  url: https://anthropic.com/research/streaming-tokens  # NOTE(review): confirm this link resolves
+  title: "Token Streaming for Real-Time Applications"
+match:
+  scope: callsite  # data context the conditions are evaluated against
+  conditions:  # presumably all must hold for the template to fire — confirm in the matcher
+    - field: patterns.streaming  # streaming flag must equal true
+      op: eq
+      value: true
+    - field: usage  # presumably requires usage data to be present
+      op: exists
+    - field: usage.latency_p99  # presumably fires when p99 / p50 exceeds value — confirm ratio_gt semantics
+      op: ratio_gt
+      compare_to: usage.latency_p50
+      value: 5  # NOTE(review): evidence text says 2x and defaults.threshold is 2 — confirm 5 is intended
+output:
+  headline: "Streaming enabled but responses arrive in bursts"  # short summary line for the finding
+  evidence: "p99/p50 ratio is {{ratio}}x — true streaming would be under 2x"  # {{ratio}} is a template placeholder; the 2x figure is literal text — TODO confirm it should track defaults.threshold
+defaults:
+  threshold: 2