@peakinfer/cli 1.0.133
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +8 -0
- package/.env.example +6 -0
- package/.github/workflows/peakinfer.yml +64 -0
- package/CHANGELOG.md +31 -0
- package/LICENSE +190 -0
- package/README.md +335 -0
- package/data/inferencemax.json +274 -0
- package/dist/agent-analyzer.d.ts +45 -0
- package/dist/agent-analyzer.d.ts.map +1 -0
- package/dist/agent-analyzer.js +374 -0
- package/dist/agent-analyzer.js.map +1 -0
- package/dist/agent.d.ts +76 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +965 -0
- package/dist/agent.js.map +1 -0
- package/dist/agents/correlation-analyzer.d.ts +34 -0
- package/dist/agents/correlation-analyzer.d.ts.map +1 -0
- package/dist/agents/correlation-analyzer.js +261 -0
- package/dist/agents/correlation-analyzer.js.map +1 -0
- package/dist/agents/index.d.ts +91 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +111 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/runtime-analyzer.d.ts +38 -0
- package/dist/agents/runtime-analyzer.d.ts.map +1 -0
- package/dist/agents/runtime-analyzer.js +244 -0
- package/dist/agents/runtime-analyzer.js.map +1 -0
- package/dist/analysis-types.d.ts +500 -0
- package/dist/analysis-types.d.ts.map +1 -0
- package/dist/analysis-types.js +11 -0
- package/dist/analysis-types.js.map +1 -0
- package/dist/analytics.d.ts +25 -0
- package/dist/analytics.d.ts.map +1 -0
- package/dist/analytics.js +94 -0
- package/dist/analytics.js.map +1 -0
- package/dist/analyzer.d.ts +48 -0
- package/dist/analyzer.d.ts.map +1 -0
- package/dist/analyzer.js +547 -0
- package/dist/analyzer.js.map +1 -0
- package/dist/artifacts.d.ts +44 -0
- package/dist/artifacts.d.ts.map +1 -0
- package/dist/artifacts.js +165 -0
- package/dist/artifacts.js.map +1 -0
- package/dist/benchmarks/index.d.ts +88 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +205 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +427 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/ci.d.ts +19 -0
- package/dist/commands/ci.d.ts.map +1 -0
- package/dist/commands/ci.js +253 -0
- package/dist/commands/ci.js.map +1 -0
- package/dist/commands/config.d.ts +16 -0
- package/dist/commands/config.d.ts.map +1 -0
- package/dist/commands/config.js +249 -0
- package/dist/commands/config.js.map +1 -0
- package/dist/commands/demo.d.ts +15 -0
- package/dist/commands/demo.d.ts.map +1 -0
- package/dist/commands/demo.js +106 -0
- package/dist/commands/demo.js.map +1 -0
- package/dist/commands/export.d.ts +14 -0
- package/dist/commands/export.d.ts.map +1 -0
- package/dist/commands/export.js +209 -0
- package/dist/commands/export.js.map +1 -0
- package/dist/commands/history.d.ts +15 -0
- package/dist/commands/history.d.ts.map +1 -0
- package/dist/commands/history.js +389 -0
- package/dist/commands/history.js.map +1 -0
- package/dist/commands/template.d.ts +14 -0
- package/dist/commands/template.d.ts.map +1 -0
- package/dist/commands/template.js +341 -0
- package/dist/commands/template.js.map +1 -0
- package/dist/commands/validate-map.d.ts +12 -0
- package/dist/commands/validate-map.d.ts.map +1 -0
- package/dist/commands/validate-map.js +274 -0
- package/dist/commands/validate-map.js.map +1 -0
- package/dist/commands/whatif.d.ts +17 -0
- package/dist/commands/whatif.d.ts.map +1 -0
- package/dist/commands/whatif.js +206 -0
- package/dist/commands/whatif.js.map +1 -0
- package/dist/comparison.d.ts +38 -0
- package/dist/comparison.d.ts.map +1 -0
- package/dist/comparison.js +223 -0
- package/dist/comparison.js.map +1 -0
- package/dist/config.d.ts +42 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +158 -0
- package/dist/config.js.map +1 -0
- package/dist/connectors/helicone.d.ts +9 -0
- package/dist/connectors/helicone.d.ts.map +1 -0
- package/dist/connectors/helicone.js +106 -0
- package/dist/connectors/helicone.js.map +1 -0
- package/dist/connectors/index.d.ts +37 -0
- package/dist/connectors/index.d.ts.map +1 -0
- package/dist/connectors/index.js +65 -0
- package/dist/connectors/index.js.map +1 -0
- package/dist/connectors/langsmith.d.ts +9 -0
- package/dist/connectors/langsmith.d.ts.map +1 -0
- package/dist/connectors/langsmith.js +122 -0
- package/dist/connectors/langsmith.js.map +1 -0
- package/dist/connectors/types.d.ts +83 -0
- package/dist/connectors/types.d.ts.map +1 -0
- package/dist/connectors/types.js +98 -0
- package/dist/connectors/types.js.map +1 -0
- package/dist/cost-estimator.d.ts +46 -0
- package/dist/cost-estimator.d.ts.map +1 -0
- package/dist/cost-estimator.js +104 -0
- package/dist/cost-estimator.js.map +1 -0
- package/dist/costs.d.ts +57 -0
- package/dist/costs.d.ts.map +1 -0
- package/dist/costs.js +251 -0
- package/dist/costs.js.map +1 -0
- package/dist/counterfactuals.d.ts +29 -0
- package/dist/counterfactuals.d.ts.map +1 -0
- package/dist/counterfactuals.js +448 -0
- package/dist/counterfactuals.js.map +1 -0
- package/dist/enhancement-prompts.d.ts +41 -0
- package/dist/enhancement-prompts.d.ts.map +1 -0
- package/dist/enhancement-prompts.js +88 -0
- package/dist/enhancement-prompts.js.map +1 -0
- package/dist/envelopes.d.ts +20 -0
- package/dist/envelopes.d.ts.map +1 -0
- package/dist/envelopes.js +790 -0
- package/dist/envelopes.js.map +1 -0
- package/dist/format-normalizer.d.ts +71 -0
- package/dist/format-normalizer.d.ts.map +1 -0
- package/dist/format-normalizer.js +1331 -0
- package/dist/format-normalizer.js.map +1 -0
- package/dist/history.d.ts +79 -0
- package/dist/history.d.ts.map +1 -0
- package/dist/history.js +313 -0
- package/dist/history.js.map +1 -0
- package/dist/html.d.ts +11 -0
- package/dist/html.d.ts.map +1 -0
- package/dist/html.js +463 -0
- package/dist/html.js.map +1 -0
- package/dist/impact.d.ts +42 -0
- package/dist/impact.d.ts.map +1 -0
- package/dist/impact.js +443 -0
- package/dist/impact.js.map +1 -0
- package/dist/index.d.ts +26 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +34 -0
- package/dist/index.js.map +1 -0
- package/dist/insights.d.ts +5 -0
- package/dist/insights.d.ts.map +1 -0
- package/dist/insights.js +271 -0
- package/dist/insights.js.map +1 -0
- package/dist/joiner.d.ts +9 -0
- package/dist/joiner.d.ts.map +1 -0
- package/dist/joiner.js +247 -0
- package/dist/joiner.js.map +1 -0
- package/dist/orchestrator.d.ts +34 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +827 -0
- package/dist/orchestrator.js.map +1 -0
- package/dist/pdf.d.ts +26 -0
- package/dist/pdf.d.ts.map +1 -0
- package/dist/pdf.js +84 -0
- package/dist/pdf.js.map +1 -0
- package/dist/prediction.d.ts +33 -0
- package/dist/prediction.d.ts.map +1 -0
- package/dist/prediction.js +316 -0
- package/dist/prediction.js.map +1 -0
- package/dist/prompts/loader.d.ts +38 -0
- package/dist/prompts/loader.d.ts.map +1 -0
- package/dist/prompts/loader.js +60 -0
- package/dist/prompts/loader.js.map +1 -0
- package/dist/renderer.d.ts +64 -0
- package/dist/renderer.d.ts.map +1 -0
- package/dist/renderer.js +923 -0
- package/dist/renderer.js.map +1 -0
- package/dist/runid.d.ts +57 -0
- package/dist/runid.d.ts.map +1 -0
- package/dist/runid.js +199 -0
- package/dist/runid.js.map +1 -0
- package/dist/runtime.d.ts +29 -0
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +366 -0
- package/dist/runtime.js.map +1 -0
- package/dist/scanner.d.ts +11 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +426 -0
- package/dist/scanner.js.map +1 -0
- package/dist/templates.d.ts +120 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +429 -0
- package/dist/templates.js.map +1 -0
- package/dist/tools/index.d.ts +153 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +177 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/types.d.ts +3647 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +703 -0
- package/dist/types.js.map +1 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +23 -0
- package/dist/version.js.map +1 -0
- package/docs/demo-guide.md +423 -0
- package/docs/events-format.md +295 -0
- package/docs/inferencemap-spec.md +344 -0
- package/docs/migration-v2.md +293 -0
- package/fixtures/demo/precomputed.json +142 -0
- package/fixtures/demo-project/README.md +52 -0
- package/fixtures/demo-project/ai-service.ts +65 -0
- package/fixtures/demo-project/sample-events.jsonl +15 -0
- package/fixtures/demo-project/src/ai-service.ts +128 -0
- package/fixtures/demo-project/src/llm-client.ts +155 -0
- package/package.json +65 -0
- package/prompts/agent-analyzer.yaml +47 -0
- package/prompts/ci-gate.yaml +98 -0
- package/prompts/correlation-analyzer.yaml +178 -0
- package/prompts/format-normalizer.yaml +46 -0
- package/prompts/peak-performance.yaml +180 -0
- package/prompts/pr-comment.yaml +111 -0
- package/prompts/runtime-analyzer.yaml +189 -0
- package/prompts/unified-analyzer.yaml +241 -0
- package/schemas/inference-map.v0.1.json +215 -0
- package/scripts/benchmark.ts +394 -0
- package/scripts/demo-v1.5.sh +158 -0
- package/scripts/sync-from-site.sh +197 -0
- package/scripts/validate-sync.sh +178 -0
- package/src/agent-analyzer.ts +481 -0
- package/src/agent.ts +1232 -0
- package/src/agents/correlation-analyzer.ts +353 -0
- package/src/agents/index.ts +235 -0
- package/src/agents/runtime-analyzer.ts +343 -0
- package/src/analysis-types.ts +558 -0
- package/src/analytics.ts +100 -0
- package/src/analyzer.ts +692 -0
- package/src/artifacts.ts +218 -0
- package/src/benchmarks/index.ts +309 -0
- package/src/cli.ts +503 -0
- package/src/commands/ci.ts +336 -0
- package/src/commands/config.ts +288 -0
- package/src/commands/demo.ts +175 -0
- package/src/commands/export.ts +297 -0
- package/src/commands/history.ts +425 -0
- package/src/commands/template.ts +385 -0
- package/src/commands/validate-map.ts +324 -0
- package/src/commands/whatif.ts +272 -0
- package/src/comparison.ts +283 -0
- package/src/config.ts +188 -0
- package/src/connectors/helicone.ts +164 -0
- package/src/connectors/index.ts +93 -0
- package/src/connectors/langsmith.ts +179 -0
- package/src/connectors/types.ts +180 -0
- package/src/cost-estimator.ts +146 -0
- package/src/costs.ts +347 -0
- package/src/counterfactuals.ts +516 -0
- package/src/enhancement-prompts.ts +118 -0
- package/src/envelopes.ts +814 -0
- package/src/format-normalizer.ts +1486 -0
- package/src/history.ts +400 -0
- package/src/html.ts +512 -0
- package/src/impact.ts +522 -0
- package/src/index.ts +83 -0
- package/src/insights.ts +341 -0
- package/src/joiner.ts +289 -0
- package/src/orchestrator.ts +1015 -0
- package/src/pdf.ts +110 -0
- package/src/prediction.ts +392 -0
- package/src/prompts/loader.ts +88 -0
- package/src/renderer.ts +1045 -0
- package/src/runid.ts +261 -0
- package/src/runtime.ts +450 -0
- package/src/scanner.ts +508 -0
- package/src/templates.ts +561 -0
- package/src/tools/index.ts +214 -0
- package/src/types.ts +873 -0
- package/src/version.ts +24 -0
- package/templates/context-accumulation.yaml +23 -0
- package/templates/cost-concentration.yaml +20 -0
- package/templates/dead-code.yaml +20 -0
- package/templates/latency-explainer.yaml +23 -0
- package/templates/optimizations/ab-testing-framework.yaml +74 -0
- package/templates/optimizations/api-gateway-optimization.yaml +81 -0
- package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
- package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
- package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
- package/templates/optimizations/comprehensive-apm.yaml +76 -0
- package/templates/optimizations/context-window-optimization.yaml +91 -0
- package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
- package/templates/optimizations/distributed-training-optimization.yaml +77 -0
- package/templates/optimizations/document-analysis-edge.yaml +77 -0
- package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
- package/templates/optimizations/domain-specific-distillation.yaml +78 -0
- package/templates/optimizations/error-handling-optimization.yaml +76 -0
- package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
- package/templates/optimizations/long-context-memory-management.yaml +78 -0
- package/templates/optimizations/max-tokens-optimization.yaml +76 -0
- package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
- package/templates/optimizations/multi-framework-resilience.yaml +75 -0
- package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
- package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
- package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
- package/templates/optimizations/quality-monitoring.yaml +74 -0
- package/templates/optimizations/realtime-budget-controls.yaml +74 -0
- package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
- package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
- package/templates/optimizations/smart-model-routing.yaml +96 -0
- package/templates/optimizations/streaming-batch-selection.yaml +167 -0
- package/templates/optimizations/system-prompt-optimization.yaml +75 -0
- package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
- package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
- package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
- package/templates/overpowered-extraction.yaml +32 -0
- package/templates/overpowered-model.yaml +31 -0
- package/templates/prompt-bloat.yaml +24 -0
- package/templates/retry-explosion.yaml +28 -0
- package/templates/schema/insight.schema.json +113 -0
- package/templates/schema/optimization.schema.json +180 -0
- package/templates/streaming-drift.yaml +30 -0
- package/templates/throughput-gap.yaml +21 -0
- package/templates/token-underutilization.yaml +28 -0
- package/templates/untested-fallback.yaml +21 -0
- package/tests/accuracy/drift-detection.test.ts +184 -0
- package/tests/accuracy/false-positives.test.ts +166 -0
- package/tests/accuracy/templates.test.ts +205 -0
- package/tests/action/commands.test.ts +125 -0
- package/tests/action/comments.test.ts +347 -0
- package/tests/cli.test.ts +203 -0
- package/tests/comparison.test.ts +309 -0
- package/tests/correlation-analyzer.test.ts +534 -0
- package/tests/counterfactuals.test.ts +347 -0
- package/tests/fixtures/events/missing-id.jsonl +1 -0
- package/tests/fixtures/events/missing-input.jsonl +1 -0
- package/tests/fixtures/events/missing-latency.jsonl +1 -0
- package/tests/fixtures/events/missing-model.jsonl +1 -0
- package/tests/fixtures/events/missing-output.jsonl +1 -0
- package/tests/fixtures/events/missing-provider.jsonl +1 -0
- package/tests/fixtures/events/missing-ts.jsonl +1 -0
- package/tests/fixtures/events/valid.csv +3 -0
- package/tests/fixtures/events/valid.json +1 -0
- package/tests/fixtures/events/valid.jsonl +2 -0
- package/tests/fixtures/events/with-callsite.jsonl +1 -0
- package/tests/fixtures/events/with-intent.jsonl +1 -0
- package/tests/fixtures/events/wrong-type.jsonl +1 -0
- package/tests/fixtures/repos/empty/.gitkeep +0 -0
- package/tests/fixtures/repos/hybrid-router/router.py +35 -0
- package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
- package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
- package/tests/fixtures/repos/saas-openai/client.py +26 -0
- package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
- package/tests/github-action.test.ts +292 -0
- package/tests/insights.test.ts +878 -0
- package/tests/joiner.test.ts +168 -0
- package/tests/performance/action-latency.test.ts +132 -0
- package/tests/performance/benchmark.test.ts +189 -0
- package/tests/performance/cli-latency.test.ts +102 -0
- package/tests/pr-comment.test.ts +313 -0
- package/tests/prediction.test.ts +296 -0
- package/tests/runtime-analyzer.test.ts +375 -0
- package/tests/runtime.test.ts +205 -0
- package/tests/scanner.test.ts +122 -0
- package/tests/template-conformance.test.ts +526 -0
- package/tests/unit/cost-calculator.test.ts +303 -0
- package/tests/unit/credits.test.ts +180 -0
- package/tests/unit/inference-map.test.ts +276 -0
- package/tests/unit/schema.test.ts +300 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +14 -0
|
@@ -0,0 +1,78 @@
|
|
|
1
|
+
id: long-context-memory-management
|
|
2
|
+
name: Long Context Memory Management
|
|
3
|
+
description: Optimize memory usage for long-context inference with KV cache management
|
|
4
|
+
category: memory_optimization
|
|
5
|
+
confidence: 0.88
|
|
6
|
+
success_count: 892
|
|
7
|
+
verified_environments: 41
|
|
8
|
+
contributors:
|
|
9
|
+
- memory_specialist
|
|
10
|
+
- llm_engineer
|
|
11
|
+
last_updated: "2025-01-07"
|
|
12
|
+
|
|
13
|
+
environment_match:
|
|
14
|
+
context_length: ">16K tokens"
|
|
15
|
+
memory_pressure: high
|
|
16
|
+
use_case:
|
|
17
|
+
- document_qa
|
|
18
|
+
- long_form_generation
|
|
19
|
+
|
|
20
|
+
optimization:
|
|
21
|
+
technique: kv_cache_optimization
|
|
22
|
+
expected_memory_reduction: "40-60%"
|
|
23
|
+
expected_throughput_improvement: "2-3x"
|
|
24
|
+
effort_estimate: "2-3 weeks"
|
|
25
|
+
risk_level: medium
|
|
26
|
+
|
|
27
|
+
economics:
|
|
28
|
+
projected_improvement:
|
|
29
|
+
memory_reduction_percent: 50
|
|
30
|
+
batch_size_increase: 2
|
|
31
|
+
implementation_cost:
|
|
32
|
+
engineering_hours: 100
|
|
33
|
+
total_cost: 20000
|
|
34
|
+
|
|
35
|
+
implementation:
|
|
36
|
+
prerequisites:
|
|
37
|
+
- requirement: "PagedAttention support"
|
|
38
|
+
validation_command: "python scripts/check_paged_attention.py"
|
|
39
|
+
- requirement: "Sufficient swap space"
|
|
40
|
+
automated_steps:
|
|
41
|
+
- step_id: kv_cache_analysis
|
|
42
|
+
name: KV Cache Analysis
|
|
43
|
+
executable: true
|
|
44
|
+
commands:
|
|
45
|
+
- "python scripts/analyze_kv_cache_usage.py"
|
|
46
|
+
- "python scripts/identify_cache_patterns.py"
|
|
47
|
+
validation:
|
|
48
|
+
command: "python scripts/validate_analysis.py"
|
|
49
|
+
success_criteria: "analysis_complete"
|
|
50
|
+
- step_id: cache_optimization
|
|
51
|
+
name: KV Cache Optimization
|
|
52
|
+
executable: true
|
|
53
|
+
commands:
|
|
54
|
+
- "python scripts/enable_paged_attention.py"
|
|
55
|
+
- "python scripts/configure_cache_offloading.py --swap-size 8GB"
|
|
56
|
+
validation:
|
|
57
|
+
command: "python scripts/benchmark_memory.py"
|
|
58
|
+
success_criteria: "memory_reduction > 0.4"
|
|
59
|
+
rollback_command: "python scripts/disable_cache_optimization.py"
|
|
60
|
+
|
|
61
|
+
monitoring:
|
|
62
|
+
key_metrics:
|
|
63
|
+
- metric: kv_cache_memory_gb
|
|
64
|
+
target: "<baseline * 0.6"
|
|
65
|
+
alert_threshold: ">baseline * 0.8"
|
|
66
|
+
- metric: cache_hit_rate
|
|
67
|
+
target: ">0.9"
|
|
68
|
+
alert_threshold: "<0.7"
|
|
69
|
+
rollback_triggers:
|
|
70
|
+
- condition: "cache_hit_rate < 0.5 for 10 minutes"
|
|
71
|
+
action: automatic_rollback
|
|
72
|
+
|
|
73
|
+
results:
|
|
74
|
+
recent_implementations:
|
|
75
|
+
- environment: legal_document_analysis
|
|
76
|
+
baseline_memory_gb: 48
|
|
77
|
+
optimized_memory_gb: 22
|
|
78
|
+
memory_reduction_percent: 54.2
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
id: max-tokens-optimization
|
|
2
|
+
name: Max Tokens Configuration Optimization
|
|
3
|
+
description: Optimize max_tokens settings to reduce wasted output token capacity
|
|
4
|
+
category: cost_optimization
|
|
5
|
+
confidence: 0.93
|
|
6
|
+
success_count: 2345
|
|
7
|
+
verified_environments: 112
|
|
8
|
+
contributors:
|
|
9
|
+
- token_optimizer
|
|
10
|
+
- cost_analyst
|
|
11
|
+
last_updated: "2024-12-31"
|
|
12
|
+
|
|
13
|
+
environment_match:
|
|
14
|
+
max_tokens_setting: ">1000"
|
|
15
|
+
avg_output_tokens: "<max_tokens * 0.3"
|
|
16
|
+
monthly_cost: ">$5K"
|
|
17
|
+
|
|
18
|
+
optimization:
|
|
19
|
+
technique: max_tokens_right_sizing
|
|
20
|
+
expected_cost_reduction: "20-40%"
|
|
21
|
+
effort_estimate: "1-3 days"
|
|
22
|
+
risk_level: low
|
|
23
|
+
|
|
24
|
+
economics:
|
|
25
|
+
baseline_calculation:
|
|
26
|
+
wasted_token_capacity_percent: 70
|
|
27
|
+
projected_improvement:
|
|
28
|
+
optimized_waste_percent: 20
|
|
29
|
+
cost_reduction_percent: 30
|
|
30
|
+
implementation_cost:
|
|
31
|
+
engineering_hours: 16
|
|
32
|
+
total_cost: 3200
|
|
33
|
+
|
|
34
|
+
implementation:
|
|
35
|
+
prerequisites:
|
|
36
|
+
- requirement: "Output length analytics"
|
|
37
|
+
- requirement: "API configuration access"
|
|
38
|
+
automated_steps:
|
|
39
|
+
- step_id: analysis
|
|
40
|
+
name: Output Length Analysis
|
|
41
|
+
executable: true
|
|
42
|
+
commands:
|
|
43
|
+
- "python scripts/analyze_output_lengths.py --logs ./request_logs"
|
|
44
|
+
- "python scripts/calculate_optimal_max_tokens.py"
|
|
45
|
+
validation:
|
|
46
|
+
command: "python scripts/validate_analysis.py"
|
|
47
|
+
success_criteria: "analysis_complete"
|
|
48
|
+
- step_id: configuration
|
|
49
|
+
name: Max Tokens Configuration
|
|
50
|
+
executable: true
|
|
51
|
+
commands:
|
|
52
|
+
- "python scripts/configure_dynamic_max_tokens.py --percentile 95"
|
|
53
|
+
- "python scripts/add_overflow_handling.py"
|
|
54
|
+
validation:
|
|
55
|
+
command: "python scripts/test_configuration.py"
|
|
56
|
+
success_criteria: "truncation_rate < 0.01"
|
|
57
|
+
rollback_command: "python scripts/revert_max_tokens.py"
|
|
58
|
+
|
|
59
|
+
monitoring:
|
|
60
|
+
key_metrics:
|
|
61
|
+
- metric: truncation_rate
|
|
62
|
+
target: "<0.01"
|
|
63
|
+
alert_threshold: ">0.05"
|
|
64
|
+
- metric: token_efficiency
|
|
65
|
+
target: ">0.8"
|
|
66
|
+
alert_threshold: "<0.5"
|
|
67
|
+
rollback_triggers:
|
|
68
|
+
- condition: "truncation_rate > 0.1 for 5 minutes"
|
|
69
|
+
action: automatic_rollback
|
|
70
|
+
|
|
71
|
+
results:
|
|
72
|
+
recent_implementations:
|
|
73
|
+
- environment: content_generation
|
|
74
|
+
baseline_max_tokens: 2000
|
|
75
|
+
optimized_max_tokens: 650
|
|
76
|
+
cost_reduction_percent: 28
|
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
id: memory-bandwidth-optimization
|
|
2
|
+
name: Memory Bandwidth Optimization for Large Models
|
|
3
|
+
description: Optimize memory access patterns for memory-bound large model inference
|
|
4
|
+
category: memory_optimization
|
|
5
|
+
confidence: 0.87
|
|
6
|
+
success_count: 987
|
|
7
|
+
verified_environments: 43
|
|
8
|
+
contributors:
|
|
9
|
+
- gpu_specialist
|
|
10
|
+
- memory_optimizer
|
|
11
|
+
last_updated: "2025-01-11"
|
|
12
|
+
|
|
13
|
+
environment_match:
|
|
14
|
+
model_size: ">13B"
|
|
15
|
+
gpu_memory_utilization: ">80%"
|
|
16
|
+
compute_utilization: "<50%"
|
|
17
|
+
|
|
18
|
+
optimization:
|
|
19
|
+
technique: memory_bandwidth_optimization
|
|
20
|
+
expected_throughput_improvement: "2-3x"
|
|
21
|
+
expected_latency_improvement: "30-50%"
|
|
22
|
+
effort_estimate: "2-3 weeks"
|
|
23
|
+
risk_level: medium
|
|
24
|
+
|
|
25
|
+
economics:
|
|
26
|
+
implementation_cost:
|
|
27
|
+
engineering_hours: 120
|
|
28
|
+
total_cost: 24000
|
|
29
|
+
|
|
30
|
+
implementation:
|
|
31
|
+
prerequisites:
|
|
32
|
+
- requirement: "CUDA profiler access"
|
|
33
|
+
validation_command: "which nvprof || which nsys"
|
|
34
|
+
- requirement: "Model profiling capability"
|
|
35
|
+
automated_steps:
|
|
36
|
+
- step_id: profiling
|
|
37
|
+
name: Memory Access Profiling
|
|
38
|
+
executable: true
|
|
39
|
+
commands:
|
|
40
|
+
- "python scripts/profile_memory_access.py --model ./model"
|
|
41
|
+
- "python scripts/identify_bottlenecks.py"
|
|
42
|
+
validation:
|
|
43
|
+
command: "python scripts/validate_profile.py"
|
|
44
|
+
success_criteria: "profile_complete"
|
|
45
|
+
- step_id: optimization
|
|
46
|
+
name: Apply Memory Optimizations
|
|
47
|
+
executable: true
|
|
48
|
+
commands:
|
|
49
|
+
- "python scripts/optimize_memory_layout.py"
|
|
50
|
+
- "python scripts/enable_flash_attention.py"
|
|
51
|
+
validation:
|
|
52
|
+
command: "python scripts/benchmark_memory.py"
|
|
53
|
+
success_criteria: "bandwidth_improvement > 1.5"
|
|
54
|
+
rollback_command: "python scripts/revert_memory_config.py"
|
|
55
|
+
|
|
56
|
+
monitoring:
|
|
57
|
+
key_metrics:
|
|
58
|
+
- metric: memory_bandwidth_utilization
|
|
59
|
+
target: ">70%"
|
|
60
|
+
alert_threshold: "<50%"
|
|
61
|
+
- metric: inference_latency
|
|
62
|
+
target: "<baseline * 0.7"
|
|
63
|
+
alert_threshold: ">baseline"
|
|
64
|
+
rollback_triggers:
|
|
65
|
+
- condition: "latency > baseline * 1.2 for 10 minutes"
|
|
66
|
+
action: automatic_rollback
|
|
67
|
+
|
|
68
|
+
results:
|
|
69
|
+
recent_implementations:
|
|
70
|
+
- environment: large_model_inference
|
|
71
|
+
baseline_latency_ms: 450
|
|
72
|
+
optimized_latency_ms: 280
|
|
73
|
+
improvement_percent: 37.8
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
id: multi-framework-resilience
|
|
2
|
+
name: Multi-Framework Resilience Architecture
|
|
3
|
+
description: Build resilient inference architecture with multiple framework fallbacks
|
|
4
|
+
category: application_optimization
|
|
5
|
+
confidence: 0.86
|
|
6
|
+
success_count: 456
|
|
7
|
+
verified_environments: 28
|
|
8
|
+
contributors:
|
|
9
|
+
- reliability_engineer
|
|
10
|
+
- platform_architect
|
|
11
|
+
last_updated: "2025-01-08"
|
|
12
|
+
|
|
13
|
+
environment_match:
|
|
14
|
+
availability_requirement: ">99.9%"
|
|
15
|
+
single_framework: true
|
|
16
|
+
traffic: ">100K requests/day"
|
|
17
|
+
|
|
18
|
+
optimization:
|
|
19
|
+
technique: multi_framework_resilience
|
|
20
|
+
expected_cost_reduction: "10-20%"
|
|
21
|
+
effort_estimate: "3-4 weeks"
|
|
22
|
+
risk_level: medium
|
|
23
|
+
|
|
24
|
+
economics:
|
|
25
|
+
projected_improvement:
|
|
26
|
+
availability_improvement: 0.999
|
|
27
|
+
downtime_cost_savings_monthly: 15000
|
|
28
|
+
implementation_cost:
|
|
29
|
+
engineering_hours: 240
|
|
30
|
+
total_cost: 48000
|
|
31
|
+
|
|
32
|
+
implementation:
|
|
33
|
+
prerequisites:
|
|
34
|
+
- requirement: "Multiple inference backends available"
|
|
35
|
+
- requirement: "Health check infrastructure"
|
|
36
|
+
- requirement: "Load balancer with health-aware routing"
|
|
37
|
+
automated_steps:
|
|
38
|
+
- step_id: backend_setup
|
|
39
|
+
name: Setup Multiple Backends
|
|
40
|
+
executable: true
|
|
41
|
+
commands:
|
|
42
|
+
- "python scripts/setup_vllm_backend.py"
|
|
43
|
+
- "python scripts/setup_tgi_backend.py"
|
|
44
|
+
- "python scripts/setup_onnx_backend.py"
|
|
45
|
+
validation:
|
|
46
|
+
command: "python scripts/verify_all_backends.py"
|
|
47
|
+
success_criteria: "all_backends_healthy"
|
|
48
|
+
- step_id: routing_setup
|
|
49
|
+
name: Health-Aware Routing
|
|
50
|
+
executable: true
|
|
51
|
+
commands:
|
|
52
|
+
- "python scripts/configure_health_checks.py --interval 5s"
|
|
53
|
+
- "python scripts/setup_failover_routing.py"
|
|
54
|
+
validation:
|
|
55
|
+
command: "python scripts/test_failover.py"
|
|
56
|
+
success_criteria: "failover_time < 5s"
|
|
57
|
+
|
|
58
|
+
monitoring:
|
|
59
|
+
key_metrics:
|
|
60
|
+
- metric: availability
|
|
61
|
+
target: ">99.9%"
|
|
62
|
+
alert_threshold: "<99.5%"
|
|
63
|
+
- metric: failover_time
|
|
64
|
+
target: "<5s"
|
|
65
|
+
alert_threshold: ">15s"
|
|
66
|
+
rollback_triggers:
|
|
67
|
+
- condition: "availability < 99% for 5 minutes"
|
|
68
|
+
action: alert_and_investigation
|
|
69
|
+
|
|
70
|
+
results:
|
|
71
|
+
recent_implementations:
|
|
72
|
+
- environment: critical_api_service
|
|
73
|
+
baseline_availability: 99.5
|
|
74
|
+
optimized_availability: 99.95
|
|
75
|
+
monthly_downtime_reduction_hours: 3.5
|
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
id: multi-tenant-optimization
|
|
2
|
+
name: Multi-Tenant Cost Allocation
|
|
3
|
+
description: Optimize multi-tenant AI deployments with fair cost allocation and isolation
|
|
4
|
+
category: cost_optimization
|
|
5
|
+
confidence: 0.88
|
|
6
|
+
success_count: 678
|
|
7
|
+
verified_environments: 35
|
|
8
|
+
contributors:
|
|
9
|
+
- multi_tenant_architect
|
|
10
|
+
- cost_analyst
|
|
11
|
+
last_updated: "2024-12-26"
|
|
12
|
+
|
|
13
|
+
environment_match:
|
|
14
|
+
deployment_type: multi_tenant
|
|
15
|
+
tenant_count: ">10"
|
|
16
|
+
cost_attribution_requirement: high
|
|
17
|
+
|
|
18
|
+
optimization:
|
|
19
|
+
technique: tenant_cost_optimization
|
|
20
|
+
expected_cost_reduction: "20-40%"
|
|
21
|
+
effort_estimate: "3-4 weeks"
|
|
22
|
+
risk_level: medium
|
|
23
|
+
|
|
24
|
+
economics:
|
|
25
|
+
baseline_calculation:
|
|
26
|
+
shared_resource_waste_percent: 30
|
|
27
|
+
projected_improvement:
|
|
28
|
+
optimized_utilization: 0.85
|
|
29
|
+
cost_reduction_percent: 30
|
|
30
|
+
implementation_cost:
|
|
31
|
+
engineering_hours: 180
|
|
32
|
+
total_cost: 36000
|
|
33
|
+
|
|
34
|
+
implementation:
|
|
35
|
+
prerequisites:
|
|
36
|
+
- requirement: "Tenant identification in requests"
|
|
37
|
+
- requirement: "Per-tenant metrics capability"
|
|
38
|
+
automated_steps:
|
|
39
|
+
- step_id: attribution_setup
|
|
40
|
+
name: Cost Attribution Setup
|
|
41
|
+
executable: true
|
|
42
|
+
commands:
|
|
43
|
+
- "python scripts/setup_tenant_tracking.py"
|
|
44
|
+
- "python scripts/configure_cost_allocation.py"
|
|
45
|
+
validation:
|
|
46
|
+
command: "python scripts/verify_attribution.py"
|
|
47
|
+
success_criteria: "attribution_accuracy > 0.98"
|
|
48
|
+
- step_id: optimization
|
|
49
|
+
name: Tenant Optimization
|
|
50
|
+
executable: true
|
|
51
|
+
commands:
|
|
52
|
+
- "python scripts/implement_tenant_quotas.py"
|
|
53
|
+
- "python scripts/enable_tenant_autoscaling.py"
|
|
54
|
+
validation:
|
|
55
|
+
command: "python scripts/test_tenant_isolation.py"
|
|
56
|
+
success_criteria: "isolation_verified"
|
|
57
|
+
|
|
58
|
+
monitoring:
|
|
59
|
+
key_metrics:
|
|
60
|
+
- metric: tenant_attribution_accuracy
|
|
61
|
+
target: ">0.99"
|
|
62
|
+
alert_threshold: "<0.95"
|
|
63
|
+
- metric: noisy_neighbor_incidents
|
|
64
|
+
target: "<1/week"
|
|
65
|
+
alert_threshold: ">5/day"
|
|
66
|
+
rollback_triggers:
|
|
67
|
+
- condition: "tenant_isolation_breach detected"
|
|
68
|
+
action: automatic_rollback
|
|
69
|
+
|
|
70
|
+
results:
|
|
71
|
+
recent_implementations:
|
|
72
|
+
- environment: saas_ai_platform
|
|
73
|
+
tenant_count: 50
|
|
74
|
+
baseline_cost_per_tenant: 800
|
|
75
|
+
optimized_cost_per_tenant: 560
|
|
@@ -0,0 +1,143 @@
|
|
|
1
|
+
# Template identity, track record, and provenance.
id: prompt-caching-optimization
name: 'Prompt Caching for Repetitive Workloads'
description: 'Reduce API costs by 50-90% through intelligent prompt caching'
category: api_optimization
confidence: 0.92
success_count: 1678
verified_environments: 89
contributors: [inference_squeeze, cache_engineer, api_optimizer]
# Quoted so the date stays a string instead of being parsed as a timestamp.
last_updated: '2025-01-20'
source: 'Inference Squeeze Chapter 4 - Prompt Optimization'
|
|
14
|
+
|
|
15
|
+
# Environment traits this template is matched against.
environment_match:
  system_prompt_usage: static_repetitive
  prompt_patterns: templated
  request_volume: '>10K/day'  # quoted: a leading '>' is a YAML indicator
  cache_infrastructure: available
|
|
20
|
+
|
|
21
|
+
# Headline expectations for the technique: savings, latency, effort, and risk.
optimization:
  technique: prompt_caching
  expected_cost_reduction: '50-90%'
  expected_latency_improvement: '40-60%'
  effort_estimate: '1 week'
  risk_level: low
|
|
27
|
+
|
|
28
|
+
# Worked cost example for caching a static system prompt.
# Every derived figure below can be recomputed from the inputs listed next to
# it; the formulas are spelled out so the numbers stay auditable.
economics:
  baseline_calculation:
    system_prompt_tokens: 1500
    daily_requests: 50000
    # 1500 tokens x 50000 requests
    daily_system_prompt_tokens: 75000000
    cost_per_input_token: 0.000015
    # 75000000 x 0.000015
    daily_system_prompt_cost: 1125
  projected_improvement:
    cache_hit_rate: 0.95
    # One tenth of cost_per_input_token.
    cached_token_cost: 0.0000015
    # Hits are billed at cached_token_cost, misses at cost_per_input_token:
    #   0.95 x 75000000 x 0.0000015 = 106.875
    #   0.05 x 75000000 x 0.000015  =  56.25
    # Total 163.125, rounded to 163. The previous value (118) left most of the
    # cache-miss cost out and overstated the saving.
    new_daily_cost: 163
    # (1125 - 163) x 30 days
    monthly_savings: 28860
  implementation_cost:
    engineering_hours: 40
    total_cost: 8000  # 8000 / 40 h works out to 200 per engineering hour
|
|
43
|
+
|
|
44
|
+
# Caching options, grouped by where the cache lives: inside the model
# provider's API, or in the calling application.
caching_strategies:
  provider_native:
    anthropic:
      feature: 'Prompt Caching'
      discount: '90% on cached tokens'
      cache_duration: '5 minutes'
      min_tokens: 1024
    openai:
      feature: 'Automatic caching on gpt-4o, o1'
      discount: '50% on cached tokens'
      cache_duration: '5-10 minutes'
      min_tokens: 1024
  application_level:
    semantic_cache:
      description: 'Cache responses for semantically similar queries'
      similarity_threshold: 0.95
      storage: 'Vector database (Pinecone, Weaviate)'
      ttl: '24 hours'
    exact_match:
      description: 'Cache exact query-response pairs'
      storage: 'Redis, Memcached'
      ttl: '1-24 hours'
|
|
66
|
+
|
|
67
|
+
# Rollout plan for prompt caching.
#
# Every `commands`, `validation.command`, and `rollback_command` entry in this
# template is a prose instruction for an operator, not a shell command.
# The steps are therefore marked `executable: false` so that nothing tries to
# run the text through a shell. Other templates in this package set
# `executable: true` only on steps whose commands are real invocations
# (e.g. "python scripts/...").
# NOTE(review): confirm the consumer does not hide non-executable steps.
# NOTE(review): nesting was reconstructed from a flattened diff view;
# `rollback_command` is assumed to be a step-level key. Confirm against the
# packaged template.
implementation:
  prerequisites:
    - requirement: "Static system prompts"
      validation: "System prompts don't change per-request"
    - requirement: "Sufficient prompt length"
      validation: "System prompt >= 1024 tokens for provider caching"
  automated_steps:
    # Step 1: find out how much of the prompt is cacheable.
    - step_id: prompt_analysis
      name: Analyze Prompt Patterns
      executable: false
      commands:
        - "Identify static vs dynamic prompt components"
        - "Measure system prompt token counts"
        - "Calculate cache hit potential"
      validation:
        command: "Prompt analysis complete"
        success_criteria: "static_components_identified AND cache_potential > 50%"
      rollback_command: "Skip caching optimization"
    # Step 2: turn on the provider's own prompt cache.
    - step_id: provider_native_caching
      name: Enable Provider Native Caching
      executable: false
      commands:
        - "Enable Anthropic prompt caching (if using Claude)"
        - "Structure prompts with static prefix >= 1024 tokens"
        - "Verify cache headers in responses"
      validation:
        command: "Check cache hit rate in API responses"
        success_criteria: "cache_hit_rate > 80%"
      rollback_command: "Disable prompt caching"
    # Step 3: add an application-side cache (the first command is marked
    # optional in its own text).
    - step_id: application_cache
      name: Implement Application-Level Cache
      executable: false
      commands:
        - "Deploy semantic similarity cache (optional)"
        - "Configure embedding model for queries"
        - "Set similarity thresholds and TTL"
      validation:
        command: "Test cache hit rates"
        success_criteria: "semantic_cache_hit_rate > 30%"
      rollback_command: "Disable application cache"
    # Step 4: make cache behaviour observable.
    - step_id: monitoring_setup
      name: Cache Monitoring
      executable: false
      commands:
        - "Track cache hit rates by endpoint"
        - "Monitor cache staleness"
        - "Alert on cache performance degradation"
      validation:
        command: "Verify monitoring dashboards"
        success_criteria: "metrics_visible AND alerts_configured"
      rollback_command: "Continue without monitoring"
|
|
118
|
+
|
|
119
|
+
# Post-rollout monitoring: each metric has a target and an alert threshold;
# rollback_triggers pairs a sustained condition with the action to take.
monitoring:
  key_metrics:
    - metric: cache_hit_rate
      target: '>90%'
      alert_threshold: '<70%'
    - metric: cache_cost_savings
      target: '>60%'
      alert_threshold: '<40%'
    - metric: cache_staleness_rate
      target: '<5%'
      alert_threshold: '>15%'
  rollback_triggers:
    - condition: 'cache_hit_rate < 50% for 1 hour'
      action: investigate_cache_invalidation
    - condition: 'cache_staleness_rate > 20% for 30 minutes'
      action: reduce_cache_ttl
|
|
135
|
+
|
|
136
|
+
# A single reported case study for this template.
results:
  case_study:
    environment: 'Legal document analysis'
    system_prompt_tokens: 2200
    daily_requests: 75000
    # 2200 x 75000 tokens at 0.000015 per token
    baseline_daily_cost: 2475
    optimized_daily_cost: 371
    # 371 / 2475 is roughly 0.15 of the baseline
    cost_reduction_percent: 85
|
|
@@ -0,0 +1,109 @@
|
|
|
1
|
+
# Template identity and track record.
id: pytorch-to-onnx-migration
name: 'PyTorch to ONNX Runtime Production Migration'
description: 'Migrate development PyTorch models to optimized ONNX Runtime for 50-70% cost reduction'
category: runtime_optimization
confidence: 0.94
success_count: 2847
verified_environments: 89
contributors: [production_ai_team, ml_ops_specialist, inference_optimizer]
# Quoted so the date stays a string instead of being parsed as a timestamp.
last_updated: '2025-01-15'
|
|
13
|
+
|
|
14
|
+
# Environment traits this template is matched against.
environment_match:
  runtime: pytorch
  deployment_stage: [development, staging]
  # Thresholds are quoted: a leading '<' would otherwise read oddly and the
  # values are comparison expressions, not numbers.
  gpu_utilization: '<60%'
  batch_size: '<4'
  model_types: [transformer, cnn, rnn]
|
|
25
|
+
|
|
26
|
+
# Headline expectations for the technique: savings, latency, effort, and risk.
optimization:
  technique: runtime_migration
  expected_cost_reduction: '50-70%'
  expected_latency_improvement: '40-60%'
  effort_estimate: '2-3 weeks'
  risk_level: low
|
|
32
|
+
|
|
33
|
+
# Per-token cost before and after the migration, plus the engineering cost.
economics:
  baseline_calculation:
    current_cost_per_token: 0.004
  # NOTE(review): the other templates in this package name the equivalent key
  # `projected_improvement`; confirm consumers accept `projected_savings`.
  projected_savings:
    new_cost_per_token: 0.0015
    monthly_savings_percent: 62.5  # 1 - 0.0015 / 0.004
  implementation_cost:
    engineering_hours: 240
    hourly_rate: 200
    total_cost: 48000  # 240 h x 200
|
|
43
|
+
|
|
44
|
+
# Rollout plan: prerequisite checks, then three scripted steps. Each step has a
# validation command, a pass condition, and a rollback command.
# NOTE(review): nesting was reconstructed from a flattened diff view;
# `rollback_command` is assumed to be a step-level key. Confirm against the
# packaged template.
implementation:
  prerequisites:
    - requirement: "Python 3.8+"
      # grep exits non-zero when the reported version is not 3.8, 3.9 or 3.1x.
      validation_command: "python --version | grep -E '3\\.[8-9]|3\\.1[0-9]'"
    - requirement: "ONNX 1.14+"
      # Still prints the installed version, but now also exits non-zero when
      # major.minor is below 1.14. Previously any installed version passed.
      validation_command: >-
        python -c 'import onnx;
        v = tuple(int(p) for p in onnx.__version__.split(".")[:2]);
        print(onnx.__version__);
        raise SystemExit(0 if v >= (1, 14) else 1)'
    - requirement: "onnxruntime-gpu 1.16+"
      # Same pattern as the ONNX check, with a 1.16 minimum.
      validation_command: >-
        python -c 'import onnxruntime;
        v = tuple(int(p) for p in onnxruntime.__version__.split(".")[:2]);
        print(onnxruntime.__version__);
        raise SystemExit(0 if v >= (1, 16) else 1)'
  automated_steps:
    # Step 1: export the PyTorch model, then run symbolic shape inference.
    - step_id: model_export
      name: Model Export
      executable: true
      commands:
        - "python scripts/export_to_onnx.py --model-path ./pytorch_model --output ./model.onnx"
        - "python -m onnxruntime.tools.symbolic_shape_infer --input model.onnx --output model_opt.onnx"
      validation:
        command: "python scripts/validate_onnx.py --model model_opt.onnx"
        success_criteria: "exit_code == 0"
      rollback_command: "rm -f model_opt.onnx"
    # Step 2: install the pinned runtime and start a server on port 8001.
    - step_id: runtime_setup
      name: Runtime Setup
      executable: true
      commands:
        - "pip install onnxruntime-gpu==1.16.0"
        - "python scripts/setup_onnx_server.py --model model_opt.onnx --port 8001"
      validation:
        command: "curl -f http://localhost:8001/health"
        success_criteria: "http_status == 200"
      rollback_command: "pkill -f onnx_server"
    # Step 3: benchmark the two endpoints against each other and compare
    # outputs.
    - step_id: performance_validation
      name: Performance Validation
      executable: true
      commands:
        - "python scripts/benchmark_comparison.py --pytorch-endpoint localhost:8000 --onnx-endpoint localhost:8001"
      validation:
        command: "python scripts/validate_outputs.py --tolerance 1e-5"
        success_criteria: "accuracy_match > 0.995"
      rollback_command: "python scripts/rollback_to_pytorch.py"
|
|
82
|
+
|
|
83
|
+
# Post-rollout monitoring: each metric has a target and an alert threshold;
# rollback_triggers pairs a sustained condition with the action to take.
monitoring:
  key_metrics:
    - metric: cost_per_token
      target: '<0.002'
      alert_threshold: '>0.0025'
    - metric: latency_p95
      target: '<200ms'
      alert_threshold: '>250ms'
    - metric: accuracy_score
      target: '>0.995'
      alert_threshold: '<0.99'
  rollback_triggers:
    # Cost and accuracy regressions roll back automatically.
    - condition: 'cost_per_token > baseline * 1.1 for 30 minutes'
      action: automatic_rollback
    - condition: 'accuracy_score < 0.99 for 3 consecutive validations'
      action: automatic_rollback
    # A latency regression only alerts and waits for a manual review.
    - condition: 'latency_p95 > baseline * 2.0 for 15 minutes'
      action: alert_and_manual_review
|
|
101
|
+
|
|
102
|
+
# Reported outcomes from earlier rollouts of this template.
results:
  recent_implementations:
    - environment: healthcare_document_processing
      baseline_monthly_cost: 36000
      optimized_monthly_cost: 13500
      cost_reduction_percent: 62.5  # 1 - 13500 / 36000
      implementation_days: 14
      # NOTE(review): unit is not stated here (presumably percentage points
      # of accuracy); confirm.
      quality_impact: -0.6
|