@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367)
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
@@ -0,0 +1,1331 @@
1
+ /**
2
+ * Format Normalizer - Agent-based runtime event format detection and normalization.
3
+ *
4
+ * This module implements PRD §6.4: Enable PeakInfer to ingest runtime data from any
5
+ * observability system, logging framework, or custom format without requiring users
6
+ * to transform their data first.
7
+ *
8
+ * Design Principles (Julie Zhou aligned):
9
+ * - Behavior First: Detect formats automatically, fallback gracefully
10
+ * - Clarity Over Cleverness: Clear confidence scores, no silent assumptions
11
+ * - State Completeness: Handle all format states (known, agent-required, unknown)
12
+ */
13
+ import { query } from '@anthropic-ai/claude-agent-sdk';
14
+ import { loadPrompt } from './templates.js';
15
/**
 * Concatenate the text of every assistant message in a Claude Agent SDK
 * message list.
 *
 * Only messages whose `type` is `'assistant'` are read, and within them only
 * content blocks whose `type` is `'text'`. Block texts are appended in order
 * with no separator. Everything else (system/user/result messages, tool-use
 * blocks, ...) is ignored.
 *
 * Malformed input is skipped rather than raising or corrupting the output:
 * a missing/non-iterable `messages` or `content`, null entries, and text
 * blocks whose `text` is not a string all contribute nothing.
 *
 * @param {Iterable<object>|null|undefined} messages - Messages collected from the SDK.
 * @returns {string} The concatenated assistant text ('' when there is none).
 */
function extractTextFromMessages(messages) {
    // Accept any iterable, as the original for...of did, but do not throw on null/undefined.
    if (messages == null || typeof messages[Symbol.iterator] !== 'function') {
        return '';
    }
    let text = '';
    for (const msg of messages) {
        if (msg?.type !== 'assistant') {
            continue;
        }
        const content = msg.message?.content;
        if (content == null || typeof content[Symbol.iterator] !== 'function') {
            continue;
        }
        for (const block of content) {
            // Guard on string text so a block without `text` cannot append "undefined".
            if (block?.type === 'text' && typeof block.text === 'string') {
                text += block.text;
            }
        }
    }
    return text;
}
31
+ // =============================================================================
32
+ // CONSTANTS
33
+ // =============================================================================
34
+ const SAMPLE_LINES = 20; // Number of lines to sample for detection
35
+ const MIN_CONFIDENCE_THRESHOLD = 0.7; // Minimum confidence for auto-acceptance; every FORMAT_SIGNATURES confidence below (0.85-0.95) is above it
36
+ const LLM_MODEL = 'claude-sonnet-4-20250514'; // Pinned Claude model id; presumably passed to the agent query() for unrecognised formats (usage is outside this view)
37
+ // Required fields for InferenceEvent. Each format in PREDEFINED_MAPPINGS supplies exactly one mapping per name listed here.
38
+ const REQUIRED_FIELDS = ['id', 'ts', 'provider', 'model', 'input_tokens', 'output_tokens', 'latency_ms'];
39
+ // =============================================================================
40
+ // FORMAT SIGNATURES
41
+ // =============================================================================
42
+ /**
43
+ * Known format signatures for heuristic detection, keyed by format name.
+ * Each entry carries `patterns` (regexes, presumably tested against the raw
+ * sample text; the consumer is outside this view), `structuralCheck(data)`
+ * (a boolean predicate over the parsed value) and `confidence` (presumably
+ * the score reported when the signature matches).
44
+ * NOTE(review): the patterns are NOT mutually exclusive. The otel regexes are
+ * case-insensitive, so /traceId/i and /spanId/i also match the Jaeger
+ * spellings traceID/spanID, and the zipkin /"traceId"/ pattern matches OTLP
+ * JSON keys. Telling these formats apart therefore rests on structuralCheck;
+ * confirm how the detector combines the two.
45
+ */
46
+ const FORMAT_SIGNATURES = {
47
+ otel: {
48
+ patterns: [
49
+ /resourceSpans/i,
50
+ /scopeSpans/i,
51
+ /traceId/i, // case-insensitive: also matches the Jaeger spelling traceID
52
+ /spanId/i, // case-insensitive: also matches the Jaeger spelling spanID
53
+ ],
54
+ structuralCheck: (data) => {
55
+ if (typeof data !== 'object' || data === null)
56
+ return false;
57
+ const obj = data;
58
+ return 'resourceSpans' in obj || 'resource_spans' in obj; // NOTE(review): snake_case key is accepted here, but PREDEFINED_MAPPINGS.otel only reads $.resourceSpans paths
59
+ },
60
+ confidence: 0.95,
61
+ },
62
+ jaeger: {
63
+ patterns: [
64
+ /traceID/, // case-sensitive: Jaeger spells it with a capital ID (OTLP uses traceId)
65
+ /spanID/, // case-sensitive: Jaeger spells it with a capital ID (OTLP uses spanId)
66
+ /operationName/,
67
+ /jaeger/i,
68
+ /"processes"/, // Jaeger-specific field
69
+ ],
70
+ structuralCheck: (data) => {
71
+ if (typeof data !== 'object' || data === null)
72
+ return false;
73
+ const obj = data;
74
+ // Jaeger format: { data: [{ traceID: ..., processes: ... }] }
75
+ if ('data' in obj && Array.isArray(obj.data)) {
76
+ const firstTrace = obj.data[0]; // only the first trace is inspected; an empty data[] yields false below
77
+ // Must have traceID (capital ID) to distinguish from OTEL
78
+ return firstTrace?.traceID !== undefined && firstTrace?.processes !== undefined;
79
+ }
80
+ return false;
81
+ },
82
+ confidence: 0.95,
83
+ },
84
+ zipkin: {
85
+ patterns: [
86
+ /"traceId"/,
87
+ /"parentId"/,
88
+ /"localEndpoint"/,
89
+ /zipkin/i,
90
+ ],
91
+ structuralCheck: (data) => {
92
+ if (!Array.isArray(data)) // only a top-level array of spans is recognised
93
+ return false;
94
+ const first = data[0]; // only the first span is inspected; an empty array yields false below
95
+ return first?.traceId !== undefined && first?.localEndpoint !== undefined; // both keys must be present on that span
96
+ },
97
+ confidence: 0.95,
98
+ },
99
+ langsmith: {
100
+ patterns: [
101
+ /run_id/,
102
+ /run_type/,
103
+ /langsmith/i,
104
+ /langchain/i,
105
+ ],
106
+ structuralCheck: (data) => {
107
+ if (typeof data !== 'object' || data === null)
108
+ return false;
109
+ const obj = data;
110
+ return 'run_id' in obj || 'runs' in obj; // NOTE(review): a 'runs' wrapper passes here, but PREDEFINED_MAPPINGS.langsmith reads top-level run_id/start_time; confirm unwrapping happens upstream
111
+ },
112
+ confidence: 0.90,
113
+ },
114
+ litellm: {
115
+ patterns: [
116
+ /litellm/i,
117
+ /call_type/,
118
+ /api_base/,
119
+ /response_time_ms/,
120
+ ],
121
+ structuralCheck: (data) => {
122
+ if (typeof data !== 'object' || data === null)
123
+ return false;
124
+ const obj = data;
125
+ // LiteLLM logs have call_type OR api_base OR response_time_ms fields (any single one is enough)
126
+ return 'call_type' in obj || 'api_base' in obj || 'response_time_ms' in obj;
127
+ },
128
+ confidence: 0.90,
129
+ },
130
+ helicone: {
131
+ patterns: [
132
+ /helicone/i, // subsumes the three more specific patterns below
133
+ /helicone_request_id/,
134
+ /helicone_response_id/,
135
+ /helicone_properties/,
136
+ ],
137
+ structuralCheck: (data) => {
138
+ if (typeof data !== 'object' || data === null)
139
+ return false;
140
+ const obj = data;
141
+ // Helicone-specific fields: any one helicone_-prefixed key, or a bare 'helicone' key, is enough
142
+ return 'helicone_request_id' in obj || 'helicone_response_id' in obj ||
143
+ 'helicone_properties' in obj || 'helicone' in obj;
144
+ },
145
+ confidence: 0.85,
146
+ },
147
+ };
148
+ // =============================================================================
149
+ // PREDEFINED FIELD MAPPINGS
150
+ // =============================================================================
151
+ /**
152
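+ * Predefined field mappings for known formats, keyed by the same format names as FORMAT_SIGNATURES.
153
+ * Each entry maps one InferenceEvent field (`target`) to a `source_path`, read via `extraction_type` ('jsonpath', 'direct' or 'computed') and post-processed by a named `transform`; `confidence` (0.8 to 1.0 in this table) and `evidence` record how trustworthy the mapping is and why.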
154
+ */
155
+ const PREDEFINED_MAPPINGS = {
156
+ otel: [ // OTLP/JSON: every path walks resourceSpans -> scopeSpans -> spans; LLM values come from the span attributes list
157
+ {
158
+ target: 'id',
159
+ source_path: '$.resourceSpans[*].scopeSpans[*].spans[*].spanId',
160
+ extraction_type: 'jsonpath',
161
+ transform: 'none',
162
+ confidence: 1.0,
163
+ evidence: 'OTLP span ID field',
164
+ },
165
+ {
166
+ target: 'ts',
167
+ source_path: '$.resourceSpans[*].scopeSpans[*].spans[*].startTimeUnixNano',
168
+ extraction_type: 'jsonpath',
169
+ transform: 'unix_nano_to_iso',
170
+ confidence: 1.0,
171
+ evidence: 'OTLP start time in nanoseconds',
172
+ },
173
+ {
174
+ target: 'provider',
175
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.provider')].value.stringValue",
176
+ extraction_type: 'jsonpath',
177
+ transform: 'provider_normalize',
178
+ confidence: 0.9,
179
+ evidence: 'LLM semantic convention attribute', // NOTE(review): the keys here are llm.*; the OpenTelemetry GenAI conventions use gen_ai.* names, so confirm which instrumentation this targets
180
+ },
181
+ {
182
+ target: 'model',
183
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.model')].value.stringValue",
184
+ extraction_type: 'jsonpath',
185
+ transform: 'none',
186
+ confidence: 0.9,
187
+ evidence: 'LLM semantic convention attribute',
188
+ },
189
+ {
190
+ target: 'input_tokens',
191
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.prompt')].value.intValue",
192
+ extraction_type: 'jsonpath',
193
+ transform: 'parse_int', // presumably because OTLP/JSON carries intValue as a string; TODO confirm
194
+ confidence: 0.9,
195
+ evidence: 'LLM semantic convention attribute',
196
+ },
197
+ {
198
+ target: 'output_tokens',
199
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.completion')].value.intValue",
200
+ extraction_type: 'jsonpath',
201
+ transform: 'parse_int',
202
+ confidence: 0.9,
203
+ evidence: 'LLM semantic convention attribute',
204
+ },
205
+ {
206
+ target: 'latency_ms',
207
+ source_path: '(endTimeUnixNano - startTimeUnixNano) / 1000000', // an expression, not a path: nanosecond delta divided by 1e6 gives milliseconds
208
+ extraction_type: 'computed', // the only 'computed' extraction in this table
209
+ transform: 'none',
210
+ confidence: 1.0,
211
+ evidence: 'Computed from OTLP start/end timestamps',
212
+ },
213
+ ],
214
+ jaeger: [ // Jaeger JSON export: paths walk data[*].spans[*]; LLM values come from the span tags list (key/value pairs)
215
+ {
216
+ target: 'id',
217
+ source_path: '$.data[*].spans[*].spanID',
218
+ extraction_type: 'jsonpath',
219
+ transform: 'none',
220
+ confidence: 1.0,
221
+ evidence: 'Jaeger span ID',
222
+ },
223
+ {
224
+ target: 'ts',
225
+ source_path: '$.data[*].spans[*].startTime',
226
+ extraction_type: 'jsonpath',
227
+ transform: 'unix_ms_to_iso', // NOTE(review): the evidence below says microseconds, but this is the millisecond transform; timestamps would be off by 1000x unless compensated elsewhere
228
+ confidence: 1.0,
229
+ evidence: 'Jaeger start time in microseconds',
230
+ },
231
+ {
232
+ target: 'provider',
233
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.provider')].value",
234
+ extraction_type: 'jsonpath',
235
+ transform: 'provider_normalize',
236
+ confidence: 0.85,
237
+ evidence: 'Tag-based provider extraction',
238
+ },
239
+ {
240
+ target: 'model',
241
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.model')].value",
242
+ extraction_type: 'jsonpath',
243
+ transform: 'none',
244
+ confidence: 0.85,
245
+ evidence: 'Tag-based model extraction',
246
+ },
247
+ {
248
+ target: 'input_tokens',
249
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.input_tokens')].value",
250
+ extraction_type: 'jsonpath',
251
+ transform: 'parse_int',
252
+ confidence: 0.85,
253
+ evidence: 'Tag-based token extraction',
254
+ },
255
+ {
256
+ target: 'output_tokens',
257
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.output_tokens')].value",
258
+ extraction_type: 'jsonpath',
259
+ transform: 'parse_int',
260
+ confidence: 0.85,
261
+ evidence: 'Tag-based token extraction',
262
+ },
263
+ {
264
+ target: 'latency_ms',
265
+ source_path: '$.data[*].spans[*].duration',
266
+ extraction_type: 'jsonpath',
267
+ transform: 'none', // NOTE(review): Jaeger duration is in microseconds and no conversion is applied here; confirm the us -> ms division happens elsewhere, otherwise latency_ms is 1000x too large
268
+ confidence: 1.0,
269
+ evidence: 'Jaeger duration field (microseconds -> ms)',
270
+ },
271
+ ],
272
+ zipkin: [ // Zipkin JSON: the root is an array of spans; LLM values are read from each span's tags map by key
273
+ {
274
+ target: 'id',
275
+ source_path: '$[*].id',
276
+ extraction_type: 'jsonpath',
277
+ transform: 'none',
278
+ confidence: 1.0,
279
+ evidence: 'Zipkin span ID',
280
+ },
281
+ {
282
+ target: 'ts',
283
+ source_path: '$[*].timestamp',
284
+ extraction_type: 'jsonpath',
285
+ transform: 'unix_ms_to_iso', // NOTE(review): the evidence below says microseconds, but this is the millisecond transform; timestamps would be off by 1000x unless compensated elsewhere
286
+ confidence: 1.0,
287
+ evidence: 'Zipkin timestamp in microseconds',
288
+ },
289
+ {
290
+ target: 'provider',
291
+ source_path: "$[*].tags['llm.provider']",
292
+ extraction_type: 'jsonpath',
293
+ transform: 'provider_normalize',
294
+ confidence: 0.85,
295
+ evidence: 'Tag-based provider extraction',
296
+ },
297
+ {
298
+ target: 'model',
299
+ source_path: "$[*].tags['llm.model']",
300
+ extraction_type: 'jsonpath',
301
+ transform: 'none',
302
+ confidence: 0.85,
303
+ evidence: 'Tag-based model extraction',
304
+ },
305
+ {
306
+ target: 'input_tokens',
307
+ source_path: "$[*].tags['llm.input_tokens']",
308
+ extraction_type: 'jsonpath',
309
+ transform: 'parse_int',
310
+ confidence: 0.85,
311
+ evidence: 'Tag-based token extraction',
312
+ },
313
+ {
314
+ target: 'output_tokens',
315
+ source_path: "$[*].tags['llm.output_tokens']",
316
+ extraction_type: 'jsonpath',
317
+ transform: 'parse_int',
318
+ confidence: 0.85,
319
+ evidence: 'Tag-based token extraction',
320
+ },
321
+ {
322
+ target: 'latency_ms',
323
+ source_path: '$[*].duration',
324
+ extraction_type: 'jsonpath',
325
+ transform: 'none', // NOTE(review): Zipkin duration is in microseconds and no conversion is applied here; confirm the us -> ms division happens elsewhere, otherwise latency_ms is 1000x too large
326
+ confidence: 1.0,
327
+ evidence: 'Zipkin duration field (microseconds -> ms)',
328
+ },
329
+ ],
330
+ langsmith: [ // paths are relative to a single run object (no $ root, no wildcards)
331
+ {
332
+ target: 'id',
333
+ source_path: 'run_id',
334
+ extraction_type: 'direct',
335
+ transform: 'none',
336
+ confidence: 0.95,
337
+ evidence: 'LangSmith run ID',
338
+ },
339
+ {
340
+ target: 'ts',
341
+ source_path: 'start_time',
342
+ extraction_type: 'direct',
343
+ transform: 'none', // Already ISO format
344
+ confidence: 0.95,
345
+ evidence: 'LangSmith start timestamp',
346
+ },
347
+ {
348
+ target: 'provider',
349
+ source_path: 'extra.invocation_params.model_provider',
350
+ extraction_type: 'jsonpath', // NOTE(review): dotted path without the $. root used by the otel/jaeger/zipkin entries; confirm the path evaluator accepts both forms
351
+ transform: 'provider_normalize',
352
+ confidence: 0.8,
353
+ evidence: 'LangSmith invocation params provider',
354
+ },
355
+ {
356
+ target: 'model',
357
+ source_path: 'extra.invocation_params.model',
358
+ extraction_type: 'jsonpath',
359
+ transform: 'none',
360
+ confidence: 0.85,
361
+ evidence: 'LangSmith invocation params model',
362
+ },
363
+ {
364
+ target: 'input_tokens',
365
+ source_path: 'token_usage.prompt_tokens',
366
+ extraction_type: 'jsonpath',
367
+ transform: 'parse_int',
368
+ confidence: 0.9,
369
+ evidence: 'LangSmith token usage prompt_tokens',
370
+ },
371
+ {
372
+ target: 'output_tokens',
373
+ source_path: 'token_usage.completion_tokens',
374
+ extraction_type: 'jsonpath',
375
+ transform: 'parse_int',
376
+ confidence: 0.9,
377
+ evidence: 'LangSmith token usage completion_tokens',
378
+ },
379
+ {
380
+ target: 'latency_ms',
381
+ source_path: 'latency',
382
+ extraction_type: 'direct',
383
+ transform: 'duration_to_ms', // unit of the source latency value is not established here; TODO confirm what duration_to_ms expects
384
+ confidence: 0.9,
385
+ evidence: 'LangSmith latency field',
386
+ },
387
+ ],
388
+ litellm: [ // paths are relative to a single LiteLLM log record
389
+ {
390
+ target: 'id',
391
+ source_path: 'id',
392
+ extraction_type: 'direct',
393
+ transform: 'none',
394
+ confidence: 0.95,
395
+ evidence: 'LiteLLM request ID',
396
+ },
397
+ {
398
+ target: 'ts',
399
+ source_path: 'startTime',
400
+ extraction_type: 'direct',
401
+ transform: 'unix_ms_to_iso', // assumes startTime is epoch milliseconds; TODO confirm against real LiteLLM log payloads
402
+ confidence: 0.9,
403
+ evidence: 'LiteLLM start timestamp',
404
+ },
405
+ {
406
+ target: 'provider',
407
+ source_path: 'model', // same source field as the 'model' mapping below
408
+ extraction_type: 'direct',
409
+ transform: 'provider_normalize', // provider is derived from the model string; LiteLLM uses model format like "openai/gpt-4"
410
+ confidence: 0.85,
411
+ evidence: 'LiteLLM model field (provider/model format)',
412
+ },
413
+ {
414
+ target: 'model',
415
+ source_path: 'model',
416
+ extraction_type: 'direct',
417
+ transform: 'none', // NOTE(review): no transform, so a "provider/" prefix (if present) stays in model; confirm this is intended
418
+ confidence: 0.95,
419
+ evidence: 'LiteLLM model field',
420
+ },
421
+ {
422
+ target: 'input_tokens',
423
+ source_path: 'usage.prompt_tokens',
424
+ extraction_type: 'jsonpath',
425
+ transform: 'parse_int',
426
+ confidence: 0.95,
427
+ evidence: 'LiteLLM usage prompt_tokens',
428
+ },
429
+ {
430
+ target: 'output_tokens',
431
+ source_path: 'usage.completion_tokens',
432
+ extraction_type: 'jsonpath',
433
+ transform: 'parse_int',
434
+ confidence: 0.95,
435
+ evidence: 'LiteLLM usage completion_tokens',
436
+ },
437
+ {
438
+ target: 'latency_ms',
439
+ source_path: 'response_time_ms',
440
+ extraction_type: 'direct',
441
+ transform: 'none',
442
+ confidence: 1.0,
443
+ evidence: 'LiteLLM response_time_ms field',
444
+ },
445
+ ],
446
+ helicone: [
447
+ {
448
+ target: 'id',
449
+ source_path: 'helicone_request_id',
450
+ extraction_type: 'direct',
451
+ transform: 'none',
452
+ confidence: 0.95,
453
+ evidence: 'Helicone request ID',
454
+ },
455
+ {
456
+ target: 'ts',
457
+ source_path: 'created_at',
458
+ extraction_type: 'direct',
459
+ transform: 'none', // Helicone uses ISO format
460
+ confidence: 0.9,
461
+ evidence: 'Helicone created_at timestamp',
462
+ },
463
+ {
464
+ target: 'provider',
465
+ source_path: 'provider',
466
+ extraction_type: 'direct',
467
+ transform: 'provider_normalize',
468
+ confidence: 0.9,
469
+ evidence: 'Helicone provider field',
470
+ },
471
+ {
472
+ target: 'model',
473
+ source_path: 'model',
474
+ extraction_type: 'direct',
475
+ transform: 'none',
476
+ confidence: 0.95,
477
+ evidence: 'Helicone model field',
478
+ },
479
+ {
480
+ target: 'input_tokens',
481
+ source_path: 'prompt_tokens',
482
+ extraction_type: 'direct',
483
+ transform: 'parse_int',
484
+ confidence: 0.9,
485
+ evidence: 'Helicone prompt_tokens',
486
+ },
487
+ {
488
+ target: 'output_tokens',
489
+ source_path: 'completion_tokens',
490
+ extraction_type: 'direct',
491
+ transform: 'parse_int',
492
+ confidence: 0.9,
493
+ evidence: 'Helicone completion_tokens',
494
+ },
495
+ {
496
+ target: 'latency_ms',
497
+ source_path: 'latency_ms',
498
+ extraction_type: 'direct',
499
+ transform: 'none',
500
+ confidence: 1.0,
501
+ evidence: 'Helicone latency_ms field',
502
+ },
503
+ ],
504
+ };
505
+ // =============================================================================
506
+ // FORMAT DETECTION
507
+ // =============================================================================
508
/**
 * Detect the format type of a runtime events file.
 *
 * Detection strategy (content-based only):
 * 1. Parse the whole content as JSON once; a JSON array whose first element
 *    carries every REQUIRED_FIELDS key is a native `json_array`.
 * 2. If the sampled lines are each valid JSON, treat the content as JSONL and
 *    classify it from its first non-empty line (native schema, known vendor
 *    signature, or custom schema).
 * 3. If the content is not valid JSON at all, delegate to
 *    detectNonJSONFormat (CSV / TSV / text logs).
 * 4. Otherwise match the parsed document against FORMAT_SIGNATURES, falling
 *    back to `custom_json`.
 *
 * @param {string} content - Raw file content.
 * @param {string} [filename] - Forwarded to detectNonJSONFormat; not used here.
 * @returns {{format_type: string, confidence: number, evidence: string,
 *            sample_size: number, requires_agent: boolean}} Detection result.
 */
export function detectFormat(content, filename) {
    const lines = content.trim().split('\n').slice(0, SAMPLE_LINES);
    // The `in` operator throws a TypeError on primitives, so guard before using it.
    const isRecord = (value) => typeof value === 'object' && value !== null;
    const hasEventSchema = (value) => isRecord(value) && REQUIRED_FIELDS.every(f => f in value);
    // Parse the whole content exactly once. A flag is needed because `null`
    // is itself a valid parse result and cannot signal failure.
    let parsedData;
    let isWholeJSON = true;
    try {
        parsedData = JSON.parse(content);
    }
    catch {
        isWholeJSON = false;
    }
    // Checked before JSONL so that a single-line JSON array is never mistaken for JSONL.
    if (isWholeJSON && Array.isArray(parsedData) && parsedData.length > 0 && hasEventSchema(parsedData[0])) {
        return {
            format_type: 'json_array',
            confidence: 1.0,
            evidence: 'JSON array with InferenceEvent schema',
            sample_size: Math.min(parsedData.length, SAMPLE_LINES),
            requires_agent: false,
        };
    }
    // JSONL: multi-line content where every sampled line is valid JSON.
    if (lines.length > 1) {
        const isJSONL = lines.every(line => {
            const trimmed = line.trim();
            if (!trimmed)
                return true; // Empty lines are ok
            try {
                JSON.parse(trimmed);
                return true;
            }
            catch {
                return false;
            }
        });
        const firstLine = isJSONL ? lines.find(l => l.trim()) : undefined;
        if (firstLine) {
            try {
                const firstEvent = JSON.parse(firstLine);
                if (hasEventSchema(firstEvent)) {
                    return {
                        format_type: 'jsonl',
                        confidence: 1.0,
                        evidence: 'JSONL with InferenceEvent schema (all required fields present)',
                        sample_size: lines.length,
                        requires_agent: false,
                    };
                }
                // Lines holding bare primitives are not event records; let them
                // fall through to the remaining detection steps.
                if (isRecord(firstEvent)) {
                    const jsonStr = JSON.stringify(firstEvent);
                    for (const [formatType, signature] of Object.entries(FORMAT_SIGNATURES)) {
                        // Only signatures with a passing structuralCheck may claim the format.
                        if (!signature.structuralCheck || !signature.structuralCheck(firstEvent))
                            continue;
                        const patternMatches = signature.patterns.filter(p => p.test(jsonStr)).length;
                        const patternRatio = patternMatches / signature.patterns.length;
                        const confidence = Math.max(0.8, patternRatio) * signature.confidence;
                        return {
                            format_type: formatType,
                            confidence,
                            evidence: `JSONL with ${formatType} format (structural match, ${patternMatches}/${signature.patterns.length} patterns)`,
                            sample_size: lines.length,
                            requires_agent: true,
                        };
                    }
                    // JSONL but unknown schema - mark as custom_json requiring agent
                    return {
                        format_type: 'custom_json',
                        confidence: 0.7,
                        evidence: 'JSONL with custom schema - requires field mapping',
                        sample_size: lines.length,
                        requires_agent: true,
                    };
                }
            }
            catch {
                // A failing structuralCheck must not abort detection; continue below.
            }
        }
    }
    if (!isWholeJSON) {
        // Not valid JSON, check for CSV/TSV or text logs
        return detectNonJSONFormat(content, lines, filename);
    }
    // Whole-document JSON: require a structural match for reliable identification.
    const contentStr = JSON.stringify(parsedData);
    const sampleSize = Array.isArray(parsedData) ? parsedData.length : 1;
    for (const [formatType, signature] of Object.entries(FORMAT_SIGNATURES)) {
        if (!signature.structuralCheck || !signature.structuralCheck(parsedData))
            continue;
        const patternMatches = signature.patterns.filter(p => p.test(contentStr)).length;
        const patternRatio = patternMatches / signature.patterns.length;
        const confidence = Math.max(0.8, patternRatio) * signature.confidence;
        return {
            format_type: formatType,
            confidence,
            evidence: `Matched ${formatType} format (structural match, ${patternMatches}/${signature.patterns.length} patterns)`,
            sample_size: sampleSize,
            requires_agent: true, // Known formats still need agent for field mapping
        };
    }
    // Unknown JSON structure - requires agent normalization
    return {
        format_type: 'custom_json',
        confidence: 0.5,
        evidence: 'Valid JSON but unknown schema - requires agent normalization',
        sample_size: sampleSize,
        requires_agent: true,
    };
}
662
/**
 * Classify content that is not valid JSON as CSV, TSV, structured text logs,
 * or unknown, looking only at the sampled lines.
 *
 * @param {string} content - Raw file content (not inspected here).
 * @param {string[]} lines - Sampled lines; the first one is treated as a header.
 * @param {string} [filename] - Accepted for signature parity; not inspected.
 * @returns {{format_type: string, confidence: number, evidence: string,
 *            sample_size: number, requires_agent: boolean}} Detection result.
 */
function detectNonJSONFormat(content, lines, filename) {
    // Every outcome shares the same shape; only these four values vary.
    const describe = (format_type, confidence, evidence, requires_agent) => ({
        format_type,
        confidence,
        evidence,
        sample_size: lines.length,
        requires_agent,
    });
    const header = lines[0];
    // Comma wins over tab: any comma in the header line means CSV.
    if (header.includes(',')) {
        const columns = header.split(',').map(col => col.trim().toLowerCase());
        const llmKeywords = ['provider', 'model', 'latency', 'tokens'];
        const looksLikeLLMData = llmKeywords.some(keyword => columns.some(col => col.includes(keyword)));
        return looksLikeLLMData
            ? describe('csv', 0.9, 'CSV with LLM-related headers detected', false)
            : describe('csv', 0.7, 'CSV format detected but headers may need mapping', true);
    }
    if (header.includes('\t')) {
        return describe('tsv', 0.8, 'Tab-separated values detected', true);
    }
    // Structured text logs: count how many distinct log traits appear on any sampled line.
    const logTraits = [
        /^\d{4}-\d{2}-\d{2}/, // ISO date prefix
        /^\[\d+\]/, // Timestamp prefix
        /level=(info|warn|error|debug)/i,
        /provider=\w+/,
        /model=\w+/,
    ];
    let logMatchCount = 0;
    for (const trait of logTraits) {
        if (lines.some(line => trait.test(line))) {
            logMatchCount += 1;
        }
    }
    if (logMatchCount >= 2) {
        return describe('custom_text', 0.6, `Structured text logs detected (${logMatchCount} patterns matched)`, true);
    }
    return describe('unknown', 0.3, 'Could not determine format - manual field mapping may be required', true);
}
724
+ // =============================================================================
725
+ // AGENT-BASED NORMALIZATION
726
+ // =============================================================================
727
/**
 * Return the system prompt used for agent-based normalization.
 *
 * The prompt is looked up through loadPrompt('format-normalizer'); when that
 * returns nothing, a hardcoded fallback prompt is used instead.
 *
 * @returns {string} Prompt text.
 */
function getNormalizationPrompt() {
    const loaded = loadPrompt('format-normalizer');
    // Fallback to hardcoded prompt if YAML not available
    return loaded ? loaded.prompt : `You are an expert at parsing log formats and trace data. Analyze the following sample data and determine field mappings to the InferenceEvent schema.

The target InferenceEvent schema requires these fields:
- id (string): Unique event identifier
- ts (string): ISO 8601 timestamp
- provider (string): LLM provider (openai, anthropic, google, etc.)
- model (string): Model name (gpt-4o, claude-3-5-sonnet, etc.)
- input_tokens (number): Input/prompt token count
- output_tokens (number): Output/completion token count
- latency_ms (number): Request latency in milliseconds

Optional fields:
- streaming (boolean), ttft_ms (number), batch_size (number), cached (boolean), retry_count (number)

For each target field, provide:
1. The source path/expression to extract the value
2. The extraction type (direct, jsonpath, regex, computed)
3. Any transform needed (unix_ms_to_iso, unix_nano_to_iso, parse_int, etc.)
4. Your confidence (0.0-1.0) in this mapping
5. Evidence explaining why you chose this mapping

If a field cannot be mapped, indicate it as unmappable with confidence 0.

Respond in JSON format:
{
"format_type": "detected format name",
"mappings": [
{
"target": "field_name",
"source_path": "path or expression",
"extraction_type": "direct|jsonpath|regex|computed",
"transform": "none|unix_ms_to_iso|parse_int|...",
"confidence": 0.9,
"evidence": "explanation"
}
],
"unmapped_fields": ["fields that could not be mapped"],
"warnings": ["any issues or caveats"]
}`;
}
774
/**
 * Use the LLM agent to derive field mappings for a format that could not be
 * handled directly.
 *
 * Never throws: when ANTHROPIC_API_KEY is unset, the agent response contains
 * no JSON object, or any step fails, the result of createFallbackResult is
 * returned with an explanatory warning.
 *
 * @param {string} content - Raw file content; only the first SAMPLE_LINES lines are sent.
 * @param {object} detection - Result of detectFormat.
 * @param {object} [options]
 * @param {object} [options.codebase_context] - Scan result summarised into the prompt.
 * @param {string} [options.format_hint] - User-supplied format name.
 * @param {object} [options.field_hints] - User-supplied field mappings (serialised into the prompt).
 * @param {boolean} [options.lenient] - Suppress the low-confidence warning.
 * @returns {Promise<object>} Normalization result (detection, mappings, unmapped_fields, warnings, audit).
 */
export async function normalizeWithAgent(content, detection, options = {}) {
    // Check for API key
    const apiKey = process.env.ANTHROPIC_API_KEY;
    if (!apiKey) {
        return createFallbackResult(detection, 'No ANTHROPIC_API_KEY - agent normalization unavailable');
    }
    // Sample content for the agent
    const sampleLines = content.trim().split('\n').slice(0, SAMPLE_LINES);
    const sampleContent = sampleLines.join('\n');
    // Build context prompt
    let contextPrompt = '';
    if (options.codebase_context) {
        const scanResult = options.codebase_context;
        contextPrompt = `\n\nCodebase context available:
- ${scanResult.files.length} files scanned
- Languages: ${scanResult.summary.languages.join(', ')}
- ${scanResult.summary.totalCandidates} potential inference points detected

This may help identify logging patterns and field names used in the application.`;
    }
    // User hints
    let hintsPrompt = '';
    if (options.format_hint) {
        hintsPrompt += `\nUser hint: Format appears to be "${options.format_hint}"`;
    }
    if (options.field_hints) {
        hintsPrompt += `\nUser-provided field mappings: ${JSON.stringify(options.field_hints)}`;
    }
    try {
        // Use Claude Agent SDK query() function
        const agentQuery = query({
            prompt: `${getNormalizationPrompt()}${contextPrompt}${hintsPrompt}

Detected format: ${detection.format_type} (confidence: ${detection.confidence})

Sample data:
\`\`\`
${sampleContent}
\`\`\``,
            options: {
                model: LLM_MODEL,
                tools: [],
                permissionMode: 'plan',
                cwd: process.cwd(),
            },
        });
        // Collect all messages from the async generator
        const messages = [];
        for await (const message of agentQuery) {
            messages.push(message);
        }
        // Parse LLM response: take the outermost {...} span of the reply text.
        const responseText = extractTextFromMessages(messages);
        const jsonMatch = responseText.match(/\{[\s\S]*\}/);
        if (!jsonMatch) {
            return createFallbackResult(detection, 'Could not parse agent response');
        }
        const agentResult = JSON.parse(jsonMatch[0]);
        // The reply is untrusted model output: tolerate missing/ill-typed lists
        // and coerce confidences so the average below is always a real number.
        const rawMappings = Array.isArray(agentResult.mappings) ? agentResult.mappings : [];
        const mappings = rawMappings.map(m => {
            const confidence = Number(m.confidence);
            return {
                target: m.target,
                source_path: m.source_path,
                extraction_type: m.extraction_type,
                transform: (m.transform || 'none'),
                confidence: Number.isFinite(confidence) ? confidence : 0,
                evidence: m.evidence,
            };
        });
        let warnings = [];
        if (Array.isArray(agentResult.warnings)) {
            warnings = [...agentResult.warnings];
        }
        else if (agentResult.warnings) {
            warnings = [String(agentResult.warnings)];
        }
        if (mappings.length === 0) {
            // Avoid 0/0 = NaN, which would silently skip the threshold check.
            warnings.push('Agent returned no field mappings');
        }
        else {
            // Check confidence threshold
            const avgConfidence = mappings.reduce((sum, m) => sum + m.confidence, 0) / mappings.length;
            if (avgConfidence < MIN_CONFIDENCE_THRESHOLD && !options.lenient) {
                warnings.push(`Average mapping confidence (${avgConfidence.toFixed(2)}) is below threshold (${MIN_CONFIDENCE_THRESHOLD}). ` +
                    `Use --lenient flag to accept low-confidence mappings.`);
            }
        }
        return {
            detection: {
                ...detection,
                format_type: agentResult.format_type || detection.format_type,
            },
            mappings,
            unmapped_fields: agentResult.unmapped_fields || [],
            warnings,
            audit: {
                normalized_at: new Date().toISOString(),
                agent_used: true,
                codebase_context_used: !!options.codebase_context,
                llm_model: LLM_MODEL,
            },
        };
    }
    catch (error) {
        return createFallbackResult(detection, `Agent normalization failed: ${error instanceof Error ? error.message : String(error)}`);
    }
}
871
/**
 * Create a fallback normalization result when the agent is unavailable or failed.
 *
 * Uses the predefined mappings for the detected format when there are any;
 * otherwise every required field is reported as unmapped.
 *
 * @param {object} detection - Result of detectFormat (returned as-is).
 * @param {string} warning - Reason the fallback was needed.
 * @returns {object} Normalization result with `audit.agent_used === false`.
 */
function createFallbackResult(detection, warning) {
    const predefinedMappings = PREDEFINED_MAPPINGS[detection.format_type];
    // Array check also rejects inherited Object.prototype members (e.g. a
    // format_type of "constructor"), which are not mapping lists.
    const hasPredefined = Array.isArray(predefinedMappings);
    return {
        detection,
        // Copy the module-level arrays so callers mutating the result cannot
        // corrupt the shared constants.
        mappings: hasPredefined ? [...predefinedMappings] : [],
        unmapped_fields: hasPredefined ? [] : [...REQUIRED_FIELDS],
        warnings: [warning],
        audit: {
            normalized_at: new Date().toISOString(),
            agent_used: false,
            codebase_context_used: false,
        },
    };
}
889
+ // =============================================================================
890
+ // FIELD EXTRACTION
891
+ // =============================================================================
892
/**
 * Apply a named transformation to an extracted value.
 *
 * `null`/`undefined` pass through untouched, as does any value whose
 * transform name is not recognised.
 *
 * @param {*} value - Extracted raw value.
 * @param {string} transform - One of: none, unix_ms_to_iso, unix_s_to_iso,
 *   unix_nano_to_iso, duration_to_ms, parse_int, parse_float, lowercase,
 *   provider_normalize.
 * @returns {*} Transformed value. Numeric transforms may yield NaN for
 *   unparseable input.
 * @throws {RangeError} From the *_to_iso transforms when the value is not a valid time.
 */
function applyTransform(value, transform) {
    if (value === null || value === undefined)
        return value;
    switch (transform) {
        case 'none':
            return value;
        case 'unix_ms_to_iso':
            return new Date(Number(value)).toISOString();
        case 'unix_s_to_iso':
            return new Date(Number(value) * 1000).toISOString();
        case 'unix_nano_to_iso':
            return new Date(Number(value) / 1_000_000).toISOString();
        case 'duration_to_ms': {
            // Accepts "<number>" (already ms), "<number>ms", "<number>s", "<number>m".
            const str = String(value);
            const match = str.match(/^([\d.]+)(ms|s|m)?$/);
            if (match) {
                const num = parseFloat(match[1]);
                const unit = match[2] || 'ms';
                switch (unit) {
                    case 's': return num * 1000;
                    case 'm': return num * 60000;
                    default: return num;
                }
            }
            return parseFloat(str);
        }
        case 'parse_int':
            return parseInt(String(value), 10);
        case 'parse_float':
            return parseFloat(String(value));
        case 'lowercase':
            return String(value).toLowerCase();
        case 'provider_normalize': {
            const str = String(value).toLowerCase();
            // Ordered keyword -> canonical name table. Hosting platforms come
            // first so "azure_openai" and "bedrock/anthropic.claude-v2" resolve
            // to the platform rather than to the model vendor they mention.
            const providerKeywords = [
                ['azure', 'azure_openai'],
                ['bedrock', 'bedrock'],
                ['openai', 'openai'],
                ['anthropic', 'anthropic'],
                ['google', 'google'],
                ['together', 'together'],
                ['groq', 'groq'],
            ];
            const hit = providerKeywords.find(([keyword]) => str.includes(keyword));
            return hit ? hit[1] : str;
        }
        default:
            return value;
    }
}
950
/**
 * Read a nested value out of an object by path.
 *
 * The path may start with "$." and uses dots for properties and "[n]" for
 * array indices (e.g. "$.usage.items[0].count"). Walking through a missing,
 * null, or non-object intermediate yields `undefined`.
 *
 * @param {*} obj - Root value to read from.
 * @param {string} path - Dot / bracket path.
 * @returns {*} The value found at the path, or `undefined`.
 */
function extractValue(obj, path) {
    const relativePath = path.startsWith('$.') ? path.slice(2) : path;
    // The capture group makes split() emit bracket indices as their own segments;
    // filter(Boolean) drops the empty and undefined pieces split() leaves behind.
    const segments = relativePath.split(/\.|\[(\d+)\]/).filter(Boolean);
    return segments.reduce((node, key) => {
        const canDescend = node !== null && node !== undefined && typeof node === 'object';
        return canDescend ? node[key] : undefined;
    }, obj);
}
972
/**
 * Extract InferenceEvents from raw content using the field mappings of a
 * normalization result.
 *
 * Records are obtained according to `normalization.detection.format_type`:
 * - `jsonl`: one JSON value per non-blank line.
 * - `json_array`: the parsed array (a lone object is treated as one record).
 * - `csv` / `tsv`: first line is the header; blank rows are skipped.
 * - anything else: parsed as a JSON document and flattened via
 *   flattenComplexFormat; if the content is not a single JSON document it is
 *   retried as JSONL and each line is flattened separately.
 *
 * @param {string} content - Raw file content.
 * @param {object} normalization - Normalization result providing `detection` and `mappings`.
 * @returns {{events: object[], errors: string[]}} Events that have every
 *   REQUIRED_FIELDS key, plus one error per rejected record (or a single
 *   parse error).
 */
export function extractEvents(content, normalization) {
    const events = [];
    const errors = [];
    // Blank lines are legal in JSONL as far as format detection is concerned, so skip them.
    const parseJsonLines = (text) => text
        .split('\n')
        .map(line => line.trim())
        .filter(line => line !== '')
        .map(line => JSON.parse(line));
    // Parse content based on format type
    let records;
    try {
        const formatType = normalization.detection.format_type;
        if (formatType === 'jsonl') {
            records = parseJsonLines(content);
        }
        else if (formatType === 'json_array') {
            const parsed = JSON.parse(content);
            // A non-array would make the record loop a silent no-op.
            records = Array.isArray(parsed) ? parsed : [parsed];
        }
        else if (formatType === 'csv' || formatType === 'tsv') {
            const delimiter = formatType === 'csv' ? ',' : '\t';
            const lines = content.trim().split('\n');
            const headers = lines[0].split(delimiter).map(h => h.trim());
            records = lines
                .slice(1)
                // A blank row would otherwise become a record of empty strings.
                .filter(line => line.trim() !== '')
                .map(line => {
                const values = line.split(delimiter);
                const obj = {};
                headers.forEach((h, i) => { obj[h] = values[i]?.trim() || ''; });
                return obj;
            });
        }
        else {
            // For complex formats (OTEL, Jaeger, etc.), parse and flatten
            let data;
            let wholeError = null;
            try {
                data = JSON.parse(content);
            }
            catch (error) {
                wholeError = error;
            }
            if (wholeError === null) {
                records = flattenComplexFormat(data, formatType);
            }
            else {
                // Vendor and custom formats are frequently shipped as JSONL,
                // which is not one JSON document. Retry line by line and report
                // the original error if that fails too.
                let perLine;
                try {
                    perLine = parseJsonLines(content);
                }
                catch {
                    throw wholeError;
                }
                records = perLine.flatMap(item => flattenComplexFormat(item, formatType));
            }
        }
    }
    catch (error) {
        errors.push(`Failed to parse content: ${error instanceof Error ? error.message : String(error)}`);
        return { events, errors };
    }
    // Extract events using mappings
    for (let i = 0; i < records.length; i++) {
        const record = records[i];
        const event = {};
        for (const mapping of normalization.mappings) {
            try {
                let value;
                if (mapping.extraction_type === 'computed') {
                    // Handle computed fields (e.g., latency = end - start)
                    value = computeValue(record, mapping.source_path);
                }
                else if (mapping.extraction_type === 'constant') {
                    value = mapping.source_path;
                }
                else {
                    value = extractValue(record, mapping.source_path);
                }
                if (value !== undefined && value !== null) {
                    event[mapping.target] = applyTransform(value, mapping.transform);
                }
            }
            catch {
                // Deliberate best effort: a field that cannot be extracted is
                // left unset and surfaces below as a missing required field.
            }
        }
        // Validate required fields
        const missingFields = REQUIRED_FIELDS.filter(f => !(f in event));
        if (missingFields.length === 0) {
            events.push(event);
        }
        else {
            errors.push(`Record ${i + 1}: Missing required fields: ${missingFields.join(', ')}`);
        }
    }
    return { events, errors };
}
1046
/**
 * Turn a parsed document of a nested trace format into a flat list of records.
 *
 * @param {*} data - Parsed JSON document.
 * @param {string} formatType - Detected format name.
 * @returns {Array} Spans for otel / jaeger / zipkin; for any other format the
 *   data itself when it is an array, otherwise the data wrapped in an array.
 */
function flattenComplexFormat(data, formatType) {
    const dataIsArray = Array.isArray(data);
    switch (formatType) {
        case 'otel':
            return flattenOTEL(data);
        case 'jaeger':
            return flattenJaeger(data);
        case 'zipkin':
            // Zipkin exports are already a flat array of spans; anything else is unusable.
            return dataIsArray ? data : [];
        default:
            // Unknown formats: accept arrays as-is, wrap a single value.
            return dataIsArray ? data : [data];
    }
}
1065
/**
 * Collect every span of an OTEL trace document into one flat array.
 *
 * Walks `resourceSpans[].scopeSpans[].spans[]`; any level that is absent is
 * treated as empty.
 *
 * @param {object} data - Parsed OTEL document.
 * @returns {Array} All spans, in document order.
 */
function flattenOTEL(data) {
    const collected = [];
    const resourceGroups = data.resourceSpans || [];
    for (const resourceGroup of resourceGroups) {
        const scopeGroups = resourceGroup.scopeSpans || [];
        for (const scopeGroup of scopeGroups) {
            const scopedSpans = scopeGroup.spans || [];
            for (const item of scopedSpans) {
                collected.push(item);
            }
        }
    }
    return collected;
}
1080
/**
 * Collect every span of a Jaeger trace document into one flat array.
 *
 * Walks `data[].spans[]`; a missing level is treated as empty.
 *
 * @param {object} data - Parsed Jaeger document.
 * @returns {Array} All spans, in document order.
 */
function flattenJaeger(data) {
    const collected = [];
    const traces = data.data || [];
    for (const traceEntry of traces) {
        const traceSpans = traceEntry.spans || [];
        for (const item of traceSpans) {
            collected.push(item);
        }
    }
    return collected;
}
1093
/**
 * Compute a derived numeric value from a small expression over a record.
 *
 * Supported expressions:
 * - "(endField - startField) / divisor", e.g.
 *   "(endTimeUnixNano - startTimeUnixNano) / 1000000"
 * - "$.field" - the field converted to a number
 *
 * @param {object} record - Source record; fields are read by top-level name.
 * @param {string} expression - Expression in one of the supported forms.
 * @returns {number|undefined} The computed value, or `undefined` when the
 *   expression is unsupported or the result is not a finite number.
 */
function computeValue(record, expression) {
    // NaN / Infinity must not be returned: callers only skip null/undefined,
    // so a non-finite number would be stored as if it were real data.
    const finiteOrUndefined = (n) => (Number.isFinite(n) ? n : undefined);
    const difference = expression.match(/\((\w+)\s*-\s*(\w+)\)\s*\/\s*(\d+)/);
    if (difference) {
        const [, endField, startField, divisor] = difference;
        const endValue = Number(record[endField]);
        const startValue = Number(record[startField]);
        if (!Number.isNaN(endValue) && !Number.isNaN(startValue)) {
            return finiteOrUndefined((endValue - startValue) / Number(divisor));
        }
    }
    // Try direct field access
    const fieldMatch = expression.match(/^\$\.(\w+)$/);
    if (fieldMatch) {
        return finiteOrUndefined(Number(record[fieldMatch[1]]));
    }
    return undefined;
}
1115
+ // =============================================================================
1116
+ // PUBLIC API
1117
+ // =============================================================================
1118
/**
 * Main entry point: detect the format of runtime event data and normalize it
 * into InferenceEvents.
 *
 * Pipeline:
 * 1. Detect the format from content (a user `format_hint` overrides the
 *    detected type).
 * 2. `jsonl` / `json_array` are parsed and validated directly, without
 *    field mappings.
 * 3. Other formats use PREDEFINED_MAPPINGS when detection does not require
 *    the agent (or is at least 0.95 confident); otherwise normalizeWithAgent
 *    produces the mappings.
 * 4. Events are extracted with extractEvents and extraction errors are
 *    summarised into `normalization.warnings`.
 *
 * @param {string} content - Raw file content.
 * @param {object} [options] - `format_hint` plus the options accepted by normalizeWithAgent.
 * @returns {Promise<{events: object[], normalization: object, errors: string[]}>}
 */
export async function normalizeRuntimeEvents(content, options = {}) {
    // Step 1: Detect format
    const detection = detectFormat(content, options.format_hint?.toString());
    // Override with user hint if provided
    if (options.format_hint) {
        detection.format_type = options.format_hint;
        detection.evidence = `User-specified format: ${options.format_hint}`;
    }
    // Step 2: For direct-parse formats, parse directly without field mappings
    if (detection.format_type === 'jsonl' || detection.format_type === 'json_array') {
        const events = [];
        const errors = [];
        // validateAndConvertEvent already prefixes some messages with
        // "Record N: "; strip that so every message carries exactly one prefix.
        const labelError = (label, position, e) => {
            const message = e instanceof Error ? e.message : String(e);
            const ownPrefix = `Record ${position}: `;
            const detail = message.startsWith(ownPrefix) ? message.slice(ownPrefix.length) : message;
            return `${label} ${position}: ${detail}`;
        };
        try {
            if (detection.format_type === 'jsonl') {
                // JSONL: one JSON object per line
                const lines = content.trim().split('\n');
                for (let i = 0; i < lines.length; i++) {
                    const line = lines[i].trim();
                    if (!line)
                        continue;
                    try {
                        const data = JSON.parse(line);
                        events.push(validateAndConvertEvent(data, i + 1));
                    }
                    catch (e) {
                        errors.push(labelError('Line', i + 1, e));
                    }
                }
            }
            else {
                // JSON array
                const data = JSON.parse(content);
                if (Array.isArray(data)) {
                    for (let i = 0; i < data.length; i++) {
                        try {
                            events.push(validateAndConvertEvent(data[i], i + 1));
                        }
                        catch (e) {
                            errors.push(labelError('Record', i + 1, e));
                        }
                    }
                }
                else {
                    // Reachable via format_hint: report it instead of
                    // returning zero events with no explanation.
                    errors.push(`Parse error: Expected a JSON array of events, got ${data === null ? 'null' : typeof data}`);
                }
            }
        }
        catch (e) {
            errors.push(`Parse error: ${e instanceof Error ? e.message : String(e)}`);
        }
        const normalization = {
            detection,
            mappings: [],
            unmapped_fields: [],
            warnings: errors.length > 0 ? [`${errors.length} records had parsing issues`] : [],
            audit: {
                normalized_at: new Date().toISOString(),
                agent_used: false,
                codebase_context_used: false,
            },
        };
        return { events, normalization, errors };
    }
    // Step 3: Get or generate field mappings for complex formats
    let normalization;
    if (!detection.requires_agent || detection.confidence >= 0.95) {
        // Use predefined mappings for known complex formats
        const predefinedMappings = PREDEFINED_MAPPINGS[detection.format_type];
        normalization = {
            detection,
            mappings: predefinedMappings || [],
            unmapped_fields: [],
            warnings: [],
            audit: {
                normalized_at: new Date().toISOString(),
                agent_used: false,
                codebase_context_used: false,
            },
        };
    }
    else {
        // Agent normalization required
        normalization = await normalizeWithAgent(content, detection, options);
    }
    // Step 4: Extract events using field mappings
    const { events, errors } = extractEvents(content, normalization);
    // Add extraction errors to warnings: list them when few, summarise when many.
    if (errors.length > 0 && errors.length <= 5) {
        normalization.warnings.push(...errors);
    }
    else if (errors.length > 5) {
        normalization.warnings.push(`${errors.length} records failed extraction (first: ${errors[0]})`);
    }
    return { events, normalization, errors };
}
1219
/**
 * Validate raw data against the InferenceEvent shape and return a clean event.
 * Used for direct-parse formats (JSONL, JSON array).
 *
 * Required fields must be present with the right primitive type. Optional
 * fields are copied only when they have the expected type and are otherwise
 * set to `undefined`; unknown fields are dropped.
 *
 * @param {*} data - Parsed record.
 * @param {number} recordNum - 1-based record number used in error messages.
 * @returns {object} InferenceEvent with required and optional keys.
 * @throws {Error} When `data` is not an object, or a required field is
 *   missing or has the wrong type (message prefixed with "Record N: ").
 */
function validateAndConvertEvent(data, recordNum) {
    if (typeof data !== 'object' || data === null) {
        // typeof null is "object", which would make the message self-contradictory.
        throw new Error(`Expected object, got ${data === null ? 'null' : typeof data}`);
    }
    const obj = data;
    const requiredTypes = [
        ['id', 'string'],
        ['ts', 'string'],
        ['provider', 'string'],
        ['model', 'string'],
        ['input_tokens', 'number'],
        ['output_tokens', 'number'],
        ['latency_ms', 'number'],
    ];
    const errors = [];
    for (const [field, expected] of requiredTypes) {
        const value = obj[field];
        if (value === undefined || value === null) {
            errors.push(`Missing '${field}'`);
        }
        else if (typeof value !== expected) {
            // Present but unusable: say so, rather than claiming it is missing.
            errors.push(`Invalid '${field}' (expected ${expected}, got ${typeof value})`);
        }
    }
    if (errors.length > 0) {
        throw new Error(`Record ${recordNum}: ${errors.join(', ')}`);
    }
    // Returns the field only when it has the expected primitive type.
    const optional = (field, expected) => (typeof obj[field] === expected ? obj[field] : undefined);
    return {
        id: obj.id,
        ts: obj.ts,
        provider: obj.provider,
        model: obj.model,
        input_tokens: obj.input_tokens,
        output_tokens: obj.output_tokens,
        latency_ms: obj.latency_ms,
        // Optional fields
        intent: optional('intent', 'string'),
        callsite_id: optional('callsite_id', 'string'),
        streaming: optional('streaming', 'boolean'),
        ttft_ms: optional('ttft_ms', 'number'),
        batch_size: optional('batch_size', 'number'),
        batch_id: optional('batch_id', 'string'),
        cached: optional('cached', 'boolean'),
        retry_count: optional('retry_count', 'number'),
        fallback_used: optional('fallback_used', 'boolean'),
        original_model: optional('original_model', 'string'),
    };
}
1268
/**
 * Get predefined mappings for a known format type.
 *
 * @param {string} formatType - Format name, e.g. "litellm" or "helicone".
 * @returns {Array|undefined} The predefined field mappings, or `undefined`
 *   when none are defined for that format.
 */
export function getPredefinedMappings(formatType) {
    // Own-property check: a plain lookup would return inherited
    // Object.prototype members for names such as "constructor" or "toString".
    return Object.prototype.hasOwnProperty.call(PREDEFINED_MAPPINGS, formatType)
        ? PREDEFINED_MAPPINGS[formatType]
        : undefined;
}
1274
/**
 * Tell whether a format type needs agent normalization.
 *
 * @param {string} formatType - Format name.
 * @returns {boolean} `false` for jsonl, json_array, csv and tsv; `true` for
 *   every other format.
 */
export function requiresAgentNormalization(formatType) {
    switch (formatType) {
        case 'jsonl':
        case 'json_array':
        case 'csv':
        case 'tsv':
            return false;
        default:
            return true;
    }
}
1280
/**
 * Parse and validate a field mapping string.
 * Format: "target=source,target2=source2,..."
 * Example: "model=llm_model,latency_ms=duration_ms"
 *
 * Malformed pairs are reported in `errors` and make the result invalid.
 * Unknown target names and unmapped critical fields (`model`, `latency_ms`)
 * only produce `warnings`. An empty or blank string is valid and yields no
 * mappings and no warnings.
 *
 * @param {string} mappingStr - Comma-separated `target=source` pairs.
 * @returns {{valid: boolean, mappings: {target: string, source: string}[],
 *            errors: string[], warnings: string[]}} Validation result.
 */
export function validateFieldMappings(mappingStr) {
    const result = {
        valid: true,
        mappings: [],
        errors: [],
        warnings: [],
    };
    if (!mappingStr || mappingStr.trim() === '') {
        return result; // Empty mapping is valid (no custom mappings)
    }
    // Known InferenceEvent fields, including the optional ones accepted when
    // events are parsed directly (intent, ttft_ms, original_model). Built once
    // rather than per pair.
    const validTargets = new Set([
        'id', 'ts', 'provider', 'model', 'input_tokens', 'output_tokens',
        'latency_ms', 'intent', 'callsite_id', 'streaming', 'ttft_ms', 'cached',
        'batch_id', 'batch_size', 'retry_count', 'fallback_used', 'original_model',
        'error_code', 'error_message',
    ]);
    // Parse mapping string
    const pairs = mappingStr.split(',').map(p => p.trim()).filter(p => p);
    for (const pair of pairs) {
        const parts = pair.split('=');
        if (parts.length !== 2) {
            result.errors.push(`Invalid mapping format: "${pair}" (expected target=source)`);
            result.valid = false;
            continue;
        }
        const [target, source] = parts.map(p => p.trim());
        if (!target || !source) {
            result.errors.push(`Empty target or source in mapping: "${pair}"`);
            result.valid = false;
            continue;
        }
        if (!validTargets.has(target)) {
            result.warnings.push(`Unknown target field "${target}" - may not be used`);
        }
        result.mappings.push({ target, source });
    }
    // Check for required fields coverage
    const mappedTargets = new Set(result.mappings.map(m => m.target));
    const criticalFields = ['model', 'latency_ms'];
    for (const field of criticalFields) {
        if (!mappedTargets.has(field)) {
            result.warnings.push(`Consider mapping "${field}" for better analysis`);
        }
    }
    return result;
}
1331
+ //# sourceMappingURL=format-normalizer.js.map