@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
package/src/version.ts ADDED
@@ -0,0 +1,24 @@
1
+ /**
2
+ * Single source of truth for version
3
+ * Auto-bumped on each build via npm version patch
4
+ */
5
+
6
+ import { readFileSync } from 'fs';
7
+ import { join, dirname } from 'path';
8
+ import { fileURLToPath } from 'url';
9
+
10
+ // Read version from package.json at runtime
11
+ function getPackageVersion(): string {
12
+ try {
13
+ // Handle both development (src/) and production (dist/) paths
14
+ const __dirname = dirname(fileURLToPath(import.meta.url));
15
+ const packagePath = join(__dirname, '..', 'package.json');
16
+ const pkg = JSON.parse(readFileSync(packagePath, 'utf-8'));
17
+ return pkg.version || '1.0.0';
18
+ } catch {
19
+ return '1.0.0';
20
+ }
21
+ }
22
+
23
+ export const VERSION = getPackageVersion();
24
+ export const VERSION_DISPLAY = `PeakInfer v${VERSION}`;
package/templates/context-accumulation.yaml ADDED
@@ -0,0 +1,23 @@
1
+ # Based on: https://www.kalmantic.com/posts/conversation-history-costs-context-windows-drain-budgets
2
+ # "Why Context Windows Drain AI Budgets 10x Faster"
3
+
4
+ id: context-accumulation
5
+ name: Context Window Bloat Detection
6
+ version: "1.0"
7
+ category: cost
8
+ severity: warning
9
+ layer: model
10
+
11
+ match:
12
+ scope: callsite
13
+ conditions:
14
+ - field: usage.tokens_in
15
+ op: gt
16
+ value: 50000
17
+
18
+ output:
19
+ headline: "High context usage at {{location}}"
20
+ evidence: "Averaging {{avg_tokens_in}} input tokens per call. Consider sliding window or summarization."
21
+
22
+ defaults:
23
+ high_context_threshold: 50000
package/templates/cost-concentration.yaml ADDED
@@ -0,0 +1,20 @@
1
+ id: cost-concentration
2
+ name: Cost Concentration
3
+ version: "1.0"
4
+ category: cost
5
+ severity: warning
6
+ layer: application
7
+
8
+ match:
9
+ scope: global
10
+ conditions:
11
+ - field: top_callsite_cost_percent
12
+ op: gt
13
+ value: 50
14
+
15
+ output:
16
+ headline: "{{percent}}% of inference cost from one callsite"
17
+ evidence: "{{model}} at {{location}}"
18
+
19
+ defaults:
20
+ threshold_percent: 50
package/templates/dead-code.yaml ADDED
@@ -0,0 +1,20 @@
1
+ id: dead-code
2
+ name: Dead Code Detection
3
+ version: "1.0"
4
+ category: drift
5
+ severity: warning
6
+ layer: application
7
+
8
+ match:
9
+ scope: joined
10
+ conditions:
11
+ - field: codeOnly.length
12
+ op: gt
13
+ value: 0
14
+
15
+ output:
16
+ headline: "{{count}} callsites in code with no runtime events"
17
+ evidence: "{{locations}}"
18
+
19
+ defaults:
20
+ min_count: 1
package/templates/latency-explainer.yaml ADDED
@@ -0,0 +1,23 @@
1
+ id: latency-explainer
2
+ name: Latency Explainer
3
+ version: "1.0"
4
+ category: latency
5
+ severity: warning
6
+ layer: application
7
+
8
+ match:
9
+ scope: callsite
10
+ conditions:
11
+ - field: usage.latency_p95
12
+ op: gt
13
+ value: 3000
14
+ - field: patterns.streaming
15
+ op: neq
16
+ value: true
17
+
18
+ output:
19
+ headline: "High tail latency: {{p95}}ms at p95"
20
+ evidence: "No streaming enabled; full response wait contributes to latency"
21
+
22
+ defaults:
23
+ latency_threshold_ms: 3000
package/templates/optimizations/ab-testing-framework.yaml ADDED
@@ -0,0 +1,74 @@
1
+ id: ab-testing-framework
2
+ name: A/B Testing for Optimization Validation
3
+ description: Implement A/B testing framework to validate optimization effectiveness
4
+ category: monitoring
5
+ confidence: 0.91
6
+ success_count: 1234
7
+ verified_environments: 61
8
+ contributors:
9
+ - experimentation_engineer
10
+ - data_scientist
11
+ last_updated: "2024-12-22"
12
+
13
+ environment_match:
14
+ traffic_volume: ">10K/day"
15
+ optimization_candidates: ">1"
16
+ statistical_rigor_requirement: high
17
+
18
+ optimization:
19
+ technique: ab_testing_framework
20
+ expected_cost_reduction: "5-15%"
21
+ effort_estimate: "2-3 weeks"
22
+ risk_level: low
23
+
24
+ economics:
25
+ baseline_calculation:
26
+ failed_optimization_cost: 20000
27
+ projected_improvement:
28
+ optimization_success_rate: 0.85
29
+ implementation_cost:
30
+ engineering_hours: 100
31
+ total_cost: 20000
32
+
33
+ implementation:
34
+ prerequisites:
35
+ - requirement: "Traffic splitting capability"
36
+ - requirement: "Statistical analysis tools"
37
+ automated_steps:
38
+ - step_id: framework_setup
39
+ name: A/B Framework Setup
40
+ executable: true
41
+ commands:
42
+ - "python scripts/setup_ab_framework.py"
43
+ - "python scripts/configure_traffic_splitting.py"
44
+ validation:
45
+ command: "python scripts/verify_framework.py"
46
+ success_criteria: "framework_functional"
47
+ - step_id: analysis
48
+ name: Statistical Analysis
49
+ executable: true
50
+ commands:
51
+ - "python scripts/configure_metrics.py --primary cost,latency,quality"
52
+ - "python scripts/setup_significance_testing.py --confidence 0.95"
53
+ validation:
54
+ command: "python scripts/test_analysis.py"
55
+ success_criteria: "analysis_accurate"
56
+
57
+ monitoring:
58
+ key_metrics:
59
+ - metric: experiment_validity
60
+ target: ">0.95"
61
+ alert_threshold: "<0.8"
62
+ - metric: sample_ratio_mismatch
63
+ target: "<0.01"
64
+ alert_threshold: ">0.05"
65
+ rollback_triggers:
66
+ - condition: "sample_ratio_mismatch > 0.1"
67
+ action: pause_experiment
68
+
69
+ results:
70
+ recent_implementations:
71
+ - environment: optimization_validation
72
+ experiments_run: 15
73
+ successful_optimizations: 12
74
+ prevented_regressions: 3
package/templates/optimizations/api-gateway-optimization.yaml ADDED
@@ -0,0 +1,81 @@
1
+ id: api-gateway-optimization
2
+ name: API Gateway Traffic Optimization
3
+ description: Optimize API gateway for AI inference traffic patterns
4
+ category: application_optimization
5
+ confidence: 0.90
6
+ success_count: 1234
7
+ verified_environments: 58
8
+ contributors:
9
+ - gateway_specialist
10
+ - traffic_engineer
11
+ last_updated: "2024-12-25"
12
+
13
+ environment_match:
14
+ gateway_type:
15
+ - kong
16
+ - nginx
17
+ - envoy
18
+ daily_requests: ">100K"
19
+ inference_traffic_percent: ">50%"
20
+
21
+ optimization:
22
+ technique: gateway_optimization
23
+ expected_latency_improvement: "20-40%"
24
+ expected_cost_reduction: "15-25%"
25
+ effort_estimate: "1-2 weeks"
26
+ risk_level: low
27
+
28
+ economics:
29
+ baseline_calculation:
30
+ gateway_overhead_ms: 50
31
+ projected_improvement:
32
+ optimized_overhead_ms: 20
33
+ latency_reduction_percent: 60
34
+ implementation_cost:
35
+ engineering_hours: 60
36
+ total_cost: 12000
37
+
38
+ implementation:
39
+ prerequisites:
40
+ - requirement: "Gateway configuration access"
41
+ - requirement: "Traffic analytics"
42
+ automated_steps:
43
+ - step_id: traffic_analysis
44
+ name: Traffic Pattern Analysis
45
+ executable: true
46
+ commands:
47
+ - "python scripts/analyze_gateway_traffic.py"
48
+ - "python scripts/identify_optimization_opportunities.py"
49
+ validation:
50
+ command: "python scripts/validate_analysis.py"
51
+ success_criteria: "patterns_identified"
52
+ - step_id: optimization
53
+ name: Gateway Optimization
54
+ executable: true
55
+ commands:
56
+ - "python scripts/configure_connection_pooling.py"
57
+ - "python scripts/enable_request_coalescing.py"
58
+ - "python scripts/optimize_routing.py"
59
+ validation:
60
+ command: "python scripts/benchmark_gateway.py"
61
+ success_criteria: "latency_reduction > 0.2"
62
+ rollback_command: "python scripts/revert_gateway_config.py"
63
+
64
+ monitoring:
65
+ key_metrics:
66
+ - metric: gateway_latency_p95
67
+ target: "<30ms"
68
+ alert_threshold: ">50ms"
69
+ - metric: connection_pool_utilization
70
+ target: "60-80%"
71
+ alert_threshold: ">95%"
72
+ rollback_triggers:
73
+ - condition: "error_rate > 1% for 5 minutes"
74
+ action: automatic_rollback
75
+
76
+ results:
77
+ recent_implementations:
78
+ - environment: ai_api_gateway
79
+ baseline_latency_ms: 55
80
+ optimized_latency_ms: 22
81
+ latency_reduction_percent: 60
package/templates/optimizations/api-model-routing-strategy.yaml ADDED
@@ -0,0 +1,126 @@
1
+ id: api-model-routing-strategy
2
+ name: Multi-Provider Model Routing for Cost-Quality Optimization
3
+ description: Route API requests to appropriate model tiers based on task complexity, achieving 60-80% cost reduction
4
+ category: api_optimization
5
+ confidence: 0.93
6
+ success_count: 2341
7
+ verified_environments: 112
8
+ contributors:
9
+ - inference_squeeze
10
+ - api_optimizer
11
+ - cost_engineer
12
+ last_updated: "2025-01-20"
13
+ source: "Inference Squeeze Chapter 4 - Smart Model Routing"
14
+
15
+ environment_match:
16
+ api_usage: multi_provider
17
+ monthly_api_cost: ">$5K"
18
+ task_complexity: variable
19
+ quality_tolerance: task_dependent
20
+
21
+ optimization:
22
+ technique: tiered_model_routing
23
+ expected_cost_reduction: "60-80%"
24
+ expected_quality_impact: "<2%"
25
+ effort_estimate: "1-2 weeks"
26
+ risk_level: low
27
+
28
+ economics:
29
+ baseline_calculation:
30
+ monthly_api_calls: 500000
31
+ premium_model_cost_per_call: 0.03
32
+ current_monthly_cost: 15000
33
+ projected_improvement:
34
+ task_distribution:
35
+ extraction: 0.40
36
+ qa: 0.35
37
+ generation: 0.25
38
+ tier_costs:
39
+ extraction: 0.003
40
+ qa: 0.01
41
+ generation: 0.03
42
+ new_avg_cost: 0.0117
43
+ monthly_savings: 9150
44
+ implementation_cost:
45
+ engineering_hours: 80
46
+ total_cost: 16000
47
+
48
+ implementation:
49
+ prerequisites:
50
+ - requirement: "Task classification capability"
51
+ validation: "Ability to categorize requests by complexity"
52
+ - requirement: "Multi-provider API access"
53
+ validation: "API keys for OpenAI, Anthropic, or alternatives"
54
+ automated_steps:
55
+ - step_id: task_analysis
56
+ name: Analyze Task Distribution
57
+ executable: true
58
+ commands:
59
+ - "Audit last 30 days of API calls by task type"
60
+ - "Categorize: extraction, QA, summarization, generation, reasoning"
61
+ - "Measure quality requirements per task type"
62
+ validation:
63
+ command: "Task distribution documented with quality thresholds"
64
+ success_criteria: "task_types_identified AND quality_thresholds_set"
65
+ rollback_command: "Revert to single-model configuration"
66
+ - step_id: routing_rules
67
+ name: Define Routing Rules
68
+ executable: true
69
+ commands:
70
+ - "Map task types to model tiers"
71
+ - "Define fallback logic for edge cases"
72
+ - "Set quality thresholds for tier escalation"
73
+ routing_matrix:
74
+ extraction:
75
+ primary: "claude-3-haiku / gpt-4o-mini"
76
+ cost_per_1k_tokens: 0.25
77
+ qa:
78
+ primary: "claude-3-5-sonnet / gpt-4o-mini"
79
+ cost_per_1k_tokens: 3.00
80
+ generation:
81
+ primary: "claude-3-5-sonnet / gpt-4o"
82
+ cost_per_1k_tokens: 15.00
83
+ reasoning:
84
+ primary: "claude-opus-4 / o1-preview"
85
+ cost_per_1k_tokens: 75.00
86
+ validation:
87
+ command: "Routing matrix validated"
88
+ success_criteria: "all_task_types_mapped AND fallbacks_defined"
89
+ rollback_command: "Disable routing, use premium model"
90
+ - step_id: implementation
91
+ name: Implement Routing Layer
92
+ executable: true
93
+ commands:
94
+ - "Add task classifier to API gateway"
95
+ - "Implement model selector based on classification"
96
+ - "Add quality monitoring and fallback triggers"
97
+ validation:
98
+ command: "Test routing with sample requests"
99
+ success_criteria: "routing_accuracy > 0.93 AND quality_maintained"
100
+ rollback_command: "Revert to single-model configuration"
101
+
102
+ monitoring:
103
+ key_metrics:
104
+ - metric: cost_per_successful_task
105
+ target: "<baseline * 0.4"
106
+ alert_threshold: ">baseline * 0.6"
107
+ - metric: task_quality_score
108
+ target: ">0.95"
109
+ alert_threshold: "<0.92"
110
+ - metric: routing_accuracy
111
+ target: ">0.93"
112
+ alert_threshold: "<0.88"
113
+ rollback_triggers:
114
+ - condition: "quality_score < 0.90 for 1 hour"
115
+ action: escalate_to_premium_model
116
+ - condition: "routing_accuracy < 0.85 for 30 minutes"
117
+ action: disable_routing_use_premium
118
+
119
+ results:
120
+ case_study:
121
+ environment: Legal document processing SaaS
122
+ baseline_monthly_cost: 47000
123
+ optimized_monthly_cost: 14100
124
+ cost_reduction_percent: 70
125
+ quality_retention: 97.2
126
+ implementation_days: 12
package/templates/optimizations/auto-scaling-optimization.yaml ADDED
@@ -0,0 +1,85 @@
1
+ id: auto-scaling-optimization
2
+ name: Auto-scaling Optimization
3
+ description: Optimize auto-scaling for AI inference workloads with predictive scaling
4
+ category: scaling
5
+ confidence: 0.89
6
+ success_count: 1456
7
+ verified_environments: 68
8
+ contributors:
9
+ - scaling_engineer
10
+ - infrastructure_architect
11
+ last_updated: "2024-12-21"
12
+
13
+ environment_match:
14
+ traffic_pattern: variable
15
+ current_scaling: reactive
16
+ cost_sensitivity: high
17
+
18
+ optimization:
19
+ technique: predictive_autoscaling
20
+ expected_cost_reduction: "20-40%"
21
+ expected_latency_improvement: "30-50%"
22
+ effort_estimate: "2-3 weeks"
23
+ risk_level: medium
24
+
25
+ economics:
26
+ baseline_calculation:
27
+ overprovisioning_percent: 40
28
+ projected_improvement:
29
+ optimized_utilization: 0.75
30
+ cost_reduction_percent: 30
31
+ implementation_cost:
32
+ engineering_hours: 120
33
+ total_cost: 24000
34
+
35
+ implementation:
36
+ prerequisites:
37
+ - requirement: "Auto-scaling infrastructure"
38
+ - requirement: "Historical traffic data"
39
+ - requirement: "Kubernetes or cloud autoscaler"
40
+ automated_steps:
41
+ - step_id: traffic_analysis
42
+ name: Traffic Pattern Analysis
43
+ executable: true
44
+ commands:
45
+ - "python scripts/analyze_traffic_patterns.py --history 30d"
46
+ - "python scripts/build_prediction_model.py"
47
+ validation:
48
+ command: "python scripts/validate_predictions.py"
49
+ success_criteria: "prediction_accuracy > 0.85"
50
+ - step_id: scaling_config
51
+ name: Scaling Configuration
52
+ executable: true
53
+ commands:
54
+ - "python scripts/configure_predictive_scaling.py"
55
+ - "python scripts/set_scaling_bounds.py --min 2 --max 50"
56
+ - "python scripts/enable_scale_down_delay.py --delay 300"
57
+ validation:
58
+ command: "python scripts/test_scaling.py"
59
+ success_criteria: "scaling_responsive"
60
+ rollback_command: "python scripts/revert_scaling_config.py"
61
+
62
+ monitoring:
63
+ key_metrics:
64
+ - metric: prediction_accuracy
65
+ target: ">0.85"
66
+ alert_threshold: "<0.7"
67
+ - metric: resource_utilization
68
+ target: "60-80%"
69
+ alert_threshold: "<40% OR >90%"
70
+ - metric: cold_start_rate
71
+ target: "<0.01"
72
+ alert_threshold: ">0.05"
73
+ rollback_triggers:
74
+ - condition: "cold_start_rate > 0.1 for 10 minutes"
75
+ action: automatic_rollback
76
+ - condition: "utilization < 30% for 30 minutes"
77
+ action: alert_and_investigation
78
+
79
+ results:
80
+ recent_implementations:
81
+ - environment: inference_cluster
82
+ baseline_overprovisioning: 45
83
+ optimized_utilization: 72
84
+ cost_reduction_percent: 35
85
+ latency_improvement_percent: 40
@@ -0,0 +1,142 @@
1
+ id: batch-utilization-diagnostic  # template metadata runs from id through source
2
+ name: API Batch Utilization Diagnostic and Optimization
3
+ description: Identify batch optimization opportunities in API workloads - 5-minute diagnostic that reveals 50-70% cost savings  # the 50-70% figure matches optimization.expected_cost_reduction
4
+ category: api_optimization
5
+ confidence: 0.91
6
+ success_count: 1876
7
+ verified_environments: 89
8
+ contributors:
9
+ - inference_squeeze
10
+ - batch_expert
11
+ - api_architect
12
+ last_updated: "2025-01-20"  # quoted so YAML loads it as a string rather than a date
13
+ source: "Inference Squeeze Chapter 3 - Batch Utilization Diagnostic"
14
+
15
+ environment_match:  # NOTE(review): presumably the conditions under which this template applies - confirm against the consumer
16
+ api_pattern: real_time_or_mixed
17
+ request_pattern: individual_calls
18
+ latency_tolerance: variable
19
+ monthly_requests: ">50K"  # quoted because a plain YAML scalar cannot begin with ">"
20
+
21
+ optimization:  # the technique plus its expected cost, latency, effort, and risk figures
22
+ technique: request_batching
23
+ expected_cost_reduction: "50-70%"
24
+ expected_latency_impact: "50-200ms additional wait"  # NOTE(review): the queue_implementation step configures batch collection windows of 50-500ms - confirm the 200ms upper bound
25
+ effort_estimate: "3-5 days"  # economics.implementation_cost lists 24 engineering_hours, i.e. three 8-hour days
26
+ risk_level: low
27
+
28
+ economics:  # worked example: baseline cost, projected cost after batching, implementation cost
29
+ baseline_calculation:
30
+ individual_requests_per_day: 10000
31
+ avg_tokens_per_request: 500
32
+ cost_per_token: 0.00003
33
+ daily_cost: 150  # 10000 requests * 500 tokens * 0.00003 per token
34
+ projected_improvement:
35
+ batch_discount: 0.50
36
+ new_daily_cost: 75  # daily_cost 150 * (1 - batch_discount 0.50)
37
+ monthly_savings: 2250  # (150 - 75) per day * 30 days
38
+ implementation_cost:
39
+ engineering_hours: 24
40
+ total_cost: 4800  # 4800 / 24 engineering_hours works out to 200 per hour; about 2.1 months of the 2250 monthly_savings
41
+
42
+ diagnostic:  # four questions: steps 1-3 list low/high batch signals, step 4 maps current batch size to potential reduction
43
+ five_minute_test:
44
+ step_1:
45
+ question: "What percentage of requests require <1s response?"
46
+ low_batch_signal: ">80% require immediate response"
47
+ high_batch_signal: "<50% require immediate response"  # NOTE(review): no signal is given for the 50-80% range
48
+ step_2:
49
+ question: "Can requests be queued for 5-60 seconds?"
50
+ low_batch_signal: "No, real-time required"
51
+ high_batch_signal: "Yes, async processing acceptable"
52
+ step_3:
53
+ question: "Are requests independent (no dependencies)?"
54
+ low_batch_signal: "No, requests depend on each other"
55
+ high_batch_signal: "Yes, can process in any order"
56
+ step_4:
57
+ question: "What is your current batch size?"
58
+ interpretation:
59
+ batch_1: "6-8x cost reduction possible"  # NOTE(review): a 6-8x reduction is roughly 83-88%, above the 50-70% in optimization.expected_cost_reduction
60
+ batch_2_4: "3-4x cost reduction possible"
61
+ batch_8_16: "1.5-2x cost reduction possible"
62
+ batch_32_plus: "Already optimized"  # NOTE(review): batch sizes 5-7 and 17-31 fall between the listed buckets
63
+
64
+ implementation:  # prerequisites (each with a validation note), then the automated_steps list
65
+ prerequisites:
66
+ - requirement: "Request queue infrastructure"
67
+ validation: "Redis, SQS, or equivalent available"
68
+ - requirement: "Async processing capability"
69
+ validation: "Can handle delayed responses"
70
+ automated_steps:
71
+ - step_id: workload_analysis  # analysis step whose commands are written instructions, not shell commands
72
+ name: Analyze Request Patterns
73
+ executable: false  # none of the commands or the validation command below is a runnable command line
74
+ commands:
75
+ - "Audit latency requirements per endpoint"
76
+ - "Identify async-eligible workloads"
77
+ - "Calculate potential batch sizes by endpoint"
78
+ validation:
79
+ command: "Request patterns documented"
80
+ success_criteria: "latency_requirements_mapped AND async_eligible_identified"
81
+ rollback_command: "Continue with individual requests"
82
+ - step_id: queue_implementation  # queuing step whose commands are written instructions, not shell commands
83
+ name: Implement Request Queuing
84
+ executable: false  # none of the commands or the validation command below is a runnable command line
85
+ commands:
86
+ - "Add message queue (Redis, SQS, etc.)"
87
+ - "Configure batch collection windows (50-500ms)"
88
+ - "Implement batch processor with OpenAI Batch API or similar"
89
+ validation:
90
+ command: "Test batch processing"
91
+ success_criteria: "batch_size > 8 AND latency_within_tolerance"
92
+ rollback_command: "Disable queuing"
93
+ - step_id: latency_tiers  # defines four latency tiers, each with max_wait, batch_size, and use_case
94
+ name: Define Latency Tiers
95
+ executable: false  # this step has no commands list, only the tiers table and a descriptive validation
96
+ tiers:
97
+ real_time:
98
+ max_wait: "0ms"
99
+ batch_size: 1
100
+ use_case: "Interactive chat, autocomplete"
101
+ near_real_time:
102
+ max_wait: "500ms"
103
+ batch_size: 8
104
+ use_case: "Document processing, search"
105
+ batch:
106
+ max_wait: "5s"
107
+ batch_size: 32
108
+ use_case: "Bulk analysis, reporting"
109
+ async:
110
+ max_wait: "24h"
111
+ batch_size: 1000
112
+ use_case: "Batch processing, training data"
113
+ validation:
114
+ command: "Validate tier assignments"
115
+ success_criteria: "all_endpoints_tiered"
116
+ rollback_command: "Revert to real-time only"
117
+
118
+ monitoring:  # key_metrics with target/alert bounds, then rollback_triggers
119
+ key_metrics:
120
+ - metric: avg_batch_size
121
+ target: ">16"
122
+ alert_threshold: "<8"
123
+ - metric: batch_api_utilization
124
+ target: ">60%"
125
+ alert_threshold: "<40%"
126
+ - metric: queue_wait_time_p95
127
+ target: "<configured_max"  # NOTE(review): configured_max is not defined in this template; presumably a latency tier's max_wait - confirm
128
+ alert_threshold: ">configured_max * 1.5"
129
+ rollback_triggers:
130
+ - condition: "avg_batch_size < 4 for 1 hour"  # 4 is half the <8 alert_threshold listed for avg_batch_size
131
+ action: review_batch_configuration
132
+ - condition: "queue_wait_time_p95 > SLA for 15 minutes"  # NOTE(review): SLA is not defined in this template - confirm what value it resolves to
133
+ action: reduce_batch_window
134
+
135
+ results:  # a single case study with per-item costs and the derived savings
136
+ case_study:
137
+ environment: E-commerce product description generation
138
+ baseline_cost_per_item: 0.12
139
+ optimized_cost_per_item: 0.04
140
+ cost_reduction_percent: 67  # (0.12 - 0.04) / 0.12 = 66.7%, rounded
141
+ items_processed_daily: 50000
142
+ monthly_savings: 120000  # (0.12 - 0.04) * 50000 items per day * 30 days
@@ -0,0 +1,76 @@
1
+ id: comprehensive-apm  # template metadata runs from id through last_updated
2
+ name: Comprehensive Application Performance Monitoring
3
+ description: Implement end-to-end APM for AI inference workloads
4
+ category: monitoring
5
+ confidence: 0.92
6
+ success_count: 1567
7
+ verified_environments: 72
8
+ contributors:
9
+ - observability_engineer
10
+ - sre_specialist
11
+ last_updated: "2024-12-24"  # quoted so YAML loads it as a string rather than a date
12
+
13
+ environment_match:  # NOTE(review): presumably the conditions under which this template applies - confirm against the consumer
14
+ observability_maturity: low
15
+ production_traffic: true
16
+ incident_frequency: ">2/month"  # quoted because a plain YAML scalar cannot begin with ">"
17
+
18
+ optimization:  # the technique plus its expected cost reduction, effort, and risk
19
+ technique: comprehensive_apm
20
+ expected_cost_reduction: "10-20%"
21
+ effort_estimate: "2-3 weeks"  # economics.implementation_cost lists 120 engineering_hours, i.e. three 40-hour weeks
22
+ risk_level: low
23
+
24
+ economics:  # baseline incident cost, projected savings, and implementation cost
25
+ baseline_calculation:
26
+ incident_cost_monthly: 15000
27
+ projected_improvement:
28
+ incident_reduction_percent: 50
29
+ monthly_savings: 7500  # incident_cost_monthly 15000 * 50% incident reduction
30
+ implementation_cost:
31
+ engineering_hours: 120
32
+ total_cost: 24000  # 24000 / 120 engineering_hours works out to 200 per hour; 3.2 months of the 7500 monthly_savings
33
+
34
+ implementation:  # prerequisites, then the automated_steps list; NOTE(review): neither step lists a rollback_command - confirm that is intended
35
+ prerequisites:
36
+ - requirement: "APM tool access (Datadog/NewRelic/etc)"
37
+ - requirement: "Instrumentation capability"
38
+ automated_steps:
39
+ - step_id: instrumentation
40
+ name: Instrumentation Setup
41
+ executable: true
42
+ commands:
43
+ - "python scripts/setup_apm_agent.py"
44
+ - "python scripts/instrument_inference_calls.py"
45
+ - "python scripts/configure_custom_metrics.py"
46
+ validation:
47
+ command: "python scripts/verify_instrumentation.py"
48
+ success_criteria: "coverage > 0.95"  # same 0.95 bar as the instrumentation_coverage target under monitoring.key_metrics
49
+ - step_id: dashboards
50
+ name: Dashboard Configuration
51
+ executable: true
52
+ commands:
53
+ - "python scripts/create_inference_dashboard.py"
54
+ - "python scripts/setup_alerts.py"
55
+ validation:
56
+ command: "python scripts/test_alerting.py"
57
+ success_criteria: "alerts_functional"
58
+
59
+ monitoring:  # key_metrics with target/alert bounds, then rollback_triggers
60
+ key_metrics:
61
+ - metric: instrumentation_coverage
62
+ target: ">0.95"
63
+ alert_threshold: "<0.8"
64
+ - metric: metric_cardinality
65
+ target: "<100K"
66
+ alert_threshold: ">500K"
67
+ rollback_triggers:
68
+ - condition: "APM overhead > 5% latency impact"  # NOTE(review): no entry in key_metrics measures APM latency overhead - confirm how this condition is evaluated
69
+ action: reduce_sampling
70
+
71
+ results:  # figures listed for one implementation (environment: production_api)
72
+ recent_implementations:
73
+ - environment: production_api
74
+ baseline_mttr_hours: 4
75
+ optimized_mttr_hours: 1.5
76
+ improvement_percent: 62.5  # (4 - 1.5) / 4 * 100