@peakinfer/cli 1.0.133

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (367) hide show
  1. package/.claude/settings.local.json +8 -0
  2. package/.env.example +6 -0
  3. package/.github/workflows/peakinfer.yml +64 -0
  4. package/CHANGELOG.md +31 -0
  5. package/LICENSE +190 -0
  6. package/README.md +335 -0
  7. package/data/inferencemax.json +274 -0
  8. package/dist/agent-analyzer.d.ts +45 -0
  9. package/dist/agent-analyzer.d.ts.map +1 -0
  10. package/dist/agent-analyzer.js +374 -0
  11. package/dist/agent-analyzer.js.map +1 -0
  12. package/dist/agent.d.ts +76 -0
  13. package/dist/agent.d.ts.map +1 -0
  14. package/dist/agent.js +965 -0
  15. package/dist/agent.js.map +1 -0
  16. package/dist/agents/correlation-analyzer.d.ts +34 -0
  17. package/dist/agents/correlation-analyzer.d.ts.map +1 -0
  18. package/dist/agents/correlation-analyzer.js +261 -0
  19. package/dist/agents/correlation-analyzer.js.map +1 -0
  20. package/dist/agents/index.d.ts +91 -0
  21. package/dist/agents/index.d.ts.map +1 -0
  22. package/dist/agents/index.js +111 -0
  23. package/dist/agents/index.js.map +1 -0
  24. package/dist/agents/runtime-analyzer.d.ts +38 -0
  25. package/dist/agents/runtime-analyzer.d.ts.map +1 -0
  26. package/dist/agents/runtime-analyzer.js +244 -0
  27. package/dist/agents/runtime-analyzer.js.map +1 -0
  28. package/dist/analysis-types.d.ts +500 -0
  29. package/dist/analysis-types.d.ts.map +1 -0
  30. package/dist/analysis-types.js +11 -0
  31. package/dist/analysis-types.js.map +1 -0
  32. package/dist/analytics.d.ts +25 -0
  33. package/dist/analytics.d.ts.map +1 -0
  34. package/dist/analytics.js +94 -0
  35. package/dist/analytics.js.map +1 -0
  36. package/dist/analyzer.d.ts +48 -0
  37. package/dist/analyzer.d.ts.map +1 -0
  38. package/dist/analyzer.js +547 -0
  39. package/dist/analyzer.js.map +1 -0
  40. package/dist/artifacts.d.ts +44 -0
  41. package/dist/artifacts.d.ts.map +1 -0
  42. package/dist/artifacts.js +165 -0
  43. package/dist/artifacts.js.map +1 -0
  44. package/dist/benchmarks/index.d.ts +88 -0
  45. package/dist/benchmarks/index.d.ts.map +1 -0
  46. package/dist/benchmarks/index.js +205 -0
  47. package/dist/benchmarks/index.js.map +1 -0
  48. package/dist/cli.d.ts +3 -0
  49. package/dist/cli.d.ts.map +1 -0
  50. package/dist/cli.js +427 -0
  51. package/dist/cli.js.map +1 -0
  52. package/dist/commands/ci.d.ts +19 -0
  53. package/dist/commands/ci.d.ts.map +1 -0
  54. package/dist/commands/ci.js +253 -0
  55. package/dist/commands/ci.js.map +1 -0
  56. package/dist/commands/config.d.ts +16 -0
  57. package/dist/commands/config.d.ts.map +1 -0
  58. package/dist/commands/config.js +249 -0
  59. package/dist/commands/config.js.map +1 -0
  60. package/dist/commands/demo.d.ts +15 -0
  61. package/dist/commands/demo.d.ts.map +1 -0
  62. package/dist/commands/demo.js +106 -0
  63. package/dist/commands/demo.js.map +1 -0
  64. package/dist/commands/export.d.ts +14 -0
  65. package/dist/commands/export.d.ts.map +1 -0
  66. package/dist/commands/export.js +209 -0
  67. package/dist/commands/export.js.map +1 -0
  68. package/dist/commands/history.d.ts +15 -0
  69. package/dist/commands/history.d.ts.map +1 -0
  70. package/dist/commands/history.js +389 -0
  71. package/dist/commands/history.js.map +1 -0
  72. package/dist/commands/template.d.ts +14 -0
  73. package/dist/commands/template.d.ts.map +1 -0
  74. package/dist/commands/template.js +341 -0
  75. package/dist/commands/template.js.map +1 -0
  76. package/dist/commands/validate-map.d.ts +12 -0
  77. package/dist/commands/validate-map.d.ts.map +1 -0
  78. package/dist/commands/validate-map.js +274 -0
  79. package/dist/commands/validate-map.js.map +1 -0
  80. package/dist/commands/whatif.d.ts +17 -0
  81. package/dist/commands/whatif.d.ts.map +1 -0
  82. package/dist/commands/whatif.js +206 -0
  83. package/dist/commands/whatif.js.map +1 -0
  84. package/dist/comparison.d.ts +38 -0
  85. package/dist/comparison.d.ts.map +1 -0
  86. package/dist/comparison.js +223 -0
  87. package/dist/comparison.js.map +1 -0
  88. package/dist/config.d.ts +42 -0
  89. package/dist/config.d.ts.map +1 -0
  90. package/dist/config.js +158 -0
  91. package/dist/config.js.map +1 -0
  92. package/dist/connectors/helicone.d.ts +9 -0
  93. package/dist/connectors/helicone.d.ts.map +1 -0
  94. package/dist/connectors/helicone.js +106 -0
  95. package/dist/connectors/helicone.js.map +1 -0
  96. package/dist/connectors/index.d.ts +37 -0
  97. package/dist/connectors/index.d.ts.map +1 -0
  98. package/dist/connectors/index.js +65 -0
  99. package/dist/connectors/index.js.map +1 -0
  100. package/dist/connectors/langsmith.d.ts +9 -0
  101. package/dist/connectors/langsmith.d.ts.map +1 -0
  102. package/dist/connectors/langsmith.js +122 -0
  103. package/dist/connectors/langsmith.js.map +1 -0
  104. package/dist/connectors/types.d.ts +83 -0
  105. package/dist/connectors/types.d.ts.map +1 -0
  106. package/dist/connectors/types.js +98 -0
  107. package/dist/connectors/types.js.map +1 -0
  108. package/dist/cost-estimator.d.ts +46 -0
  109. package/dist/cost-estimator.d.ts.map +1 -0
  110. package/dist/cost-estimator.js +104 -0
  111. package/dist/cost-estimator.js.map +1 -0
  112. package/dist/costs.d.ts +57 -0
  113. package/dist/costs.d.ts.map +1 -0
  114. package/dist/costs.js +251 -0
  115. package/dist/costs.js.map +1 -0
  116. package/dist/counterfactuals.d.ts +29 -0
  117. package/dist/counterfactuals.d.ts.map +1 -0
  118. package/dist/counterfactuals.js +448 -0
  119. package/dist/counterfactuals.js.map +1 -0
  120. package/dist/enhancement-prompts.d.ts +41 -0
  121. package/dist/enhancement-prompts.d.ts.map +1 -0
  122. package/dist/enhancement-prompts.js +88 -0
  123. package/dist/enhancement-prompts.js.map +1 -0
  124. package/dist/envelopes.d.ts +20 -0
  125. package/dist/envelopes.d.ts.map +1 -0
  126. package/dist/envelopes.js +790 -0
  127. package/dist/envelopes.js.map +1 -0
  128. package/dist/format-normalizer.d.ts +71 -0
  129. package/dist/format-normalizer.d.ts.map +1 -0
  130. package/dist/format-normalizer.js +1331 -0
  131. package/dist/format-normalizer.js.map +1 -0
  132. package/dist/history.d.ts +79 -0
  133. package/dist/history.d.ts.map +1 -0
  134. package/dist/history.js +313 -0
  135. package/dist/history.js.map +1 -0
  136. package/dist/html.d.ts +11 -0
  137. package/dist/html.d.ts.map +1 -0
  138. package/dist/html.js +463 -0
  139. package/dist/html.js.map +1 -0
  140. package/dist/impact.d.ts +42 -0
  141. package/dist/impact.d.ts.map +1 -0
  142. package/dist/impact.js +443 -0
  143. package/dist/impact.js.map +1 -0
  144. package/dist/index.d.ts +26 -0
  145. package/dist/index.d.ts.map +1 -0
  146. package/dist/index.js +34 -0
  147. package/dist/index.js.map +1 -0
  148. package/dist/insights.d.ts +5 -0
  149. package/dist/insights.d.ts.map +1 -0
  150. package/dist/insights.js +271 -0
  151. package/dist/insights.js.map +1 -0
  152. package/dist/joiner.d.ts +9 -0
  153. package/dist/joiner.d.ts.map +1 -0
  154. package/dist/joiner.js +247 -0
  155. package/dist/joiner.js.map +1 -0
  156. package/dist/orchestrator.d.ts +34 -0
  157. package/dist/orchestrator.d.ts.map +1 -0
  158. package/dist/orchestrator.js +827 -0
  159. package/dist/orchestrator.js.map +1 -0
  160. package/dist/pdf.d.ts +26 -0
  161. package/dist/pdf.d.ts.map +1 -0
  162. package/dist/pdf.js +84 -0
  163. package/dist/pdf.js.map +1 -0
  164. package/dist/prediction.d.ts +33 -0
  165. package/dist/prediction.d.ts.map +1 -0
  166. package/dist/prediction.js +316 -0
  167. package/dist/prediction.js.map +1 -0
  168. package/dist/prompts/loader.d.ts +38 -0
  169. package/dist/prompts/loader.d.ts.map +1 -0
  170. package/dist/prompts/loader.js +60 -0
  171. package/dist/prompts/loader.js.map +1 -0
  172. package/dist/renderer.d.ts +64 -0
  173. package/dist/renderer.d.ts.map +1 -0
  174. package/dist/renderer.js +923 -0
  175. package/dist/renderer.js.map +1 -0
  176. package/dist/runid.d.ts +57 -0
  177. package/dist/runid.d.ts.map +1 -0
  178. package/dist/runid.js +199 -0
  179. package/dist/runid.js.map +1 -0
  180. package/dist/runtime.d.ts +29 -0
  181. package/dist/runtime.d.ts.map +1 -0
  182. package/dist/runtime.js +366 -0
  183. package/dist/runtime.js.map +1 -0
  184. package/dist/scanner.d.ts +11 -0
  185. package/dist/scanner.d.ts.map +1 -0
  186. package/dist/scanner.js +426 -0
  187. package/dist/scanner.js.map +1 -0
  188. package/dist/templates.d.ts +120 -0
  189. package/dist/templates.d.ts.map +1 -0
  190. package/dist/templates.js +429 -0
  191. package/dist/templates.js.map +1 -0
  192. package/dist/tools/index.d.ts +153 -0
  193. package/dist/tools/index.d.ts.map +1 -0
  194. package/dist/tools/index.js +177 -0
  195. package/dist/tools/index.js.map +1 -0
  196. package/dist/types.d.ts +3647 -0
  197. package/dist/types.d.ts.map +1 -0
  198. package/dist/types.js +703 -0
  199. package/dist/types.js.map +1 -0
  200. package/dist/version.d.ts +7 -0
  201. package/dist/version.d.ts.map +1 -0
  202. package/dist/version.js +23 -0
  203. package/dist/version.js.map +1 -0
  204. package/docs/demo-guide.md +423 -0
  205. package/docs/events-format.md +295 -0
  206. package/docs/inferencemap-spec.md +344 -0
  207. package/docs/migration-v2.md +293 -0
  208. package/fixtures/demo/precomputed.json +142 -0
  209. package/fixtures/demo-project/README.md +52 -0
  210. package/fixtures/demo-project/ai-service.ts +65 -0
  211. package/fixtures/demo-project/sample-events.jsonl +15 -0
  212. package/fixtures/demo-project/src/ai-service.ts +128 -0
  213. package/fixtures/demo-project/src/llm-client.ts +155 -0
  214. package/package.json +65 -0
  215. package/prompts/agent-analyzer.yaml +47 -0
  216. package/prompts/ci-gate.yaml +98 -0
  217. package/prompts/correlation-analyzer.yaml +178 -0
  218. package/prompts/format-normalizer.yaml +46 -0
  219. package/prompts/peak-performance.yaml +180 -0
  220. package/prompts/pr-comment.yaml +111 -0
  221. package/prompts/runtime-analyzer.yaml +189 -0
  222. package/prompts/unified-analyzer.yaml +241 -0
  223. package/schemas/inference-map.v0.1.json +215 -0
  224. package/scripts/benchmark.ts +394 -0
  225. package/scripts/demo-v1.5.sh +158 -0
  226. package/scripts/sync-from-site.sh +197 -0
  227. package/scripts/validate-sync.sh +178 -0
  228. package/src/agent-analyzer.ts +481 -0
  229. package/src/agent.ts +1232 -0
  230. package/src/agents/correlation-analyzer.ts +353 -0
  231. package/src/agents/index.ts +235 -0
  232. package/src/agents/runtime-analyzer.ts +343 -0
  233. package/src/analysis-types.ts +558 -0
  234. package/src/analytics.ts +100 -0
  235. package/src/analyzer.ts +692 -0
  236. package/src/artifacts.ts +218 -0
  237. package/src/benchmarks/index.ts +309 -0
  238. package/src/cli.ts +503 -0
  239. package/src/commands/ci.ts +336 -0
  240. package/src/commands/config.ts +288 -0
  241. package/src/commands/demo.ts +175 -0
  242. package/src/commands/export.ts +297 -0
  243. package/src/commands/history.ts +425 -0
  244. package/src/commands/template.ts +385 -0
  245. package/src/commands/validate-map.ts +324 -0
  246. package/src/commands/whatif.ts +272 -0
  247. package/src/comparison.ts +283 -0
  248. package/src/config.ts +188 -0
  249. package/src/connectors/helicone.ts +164 -0
  250. package/src/connectors/index.ts +93 -0
  251. package/src/connectors/langsmith.ts +179 -0
  252. package/src/connectors/types.ts +180 -0
  253. package/src/cost-estimator.ts +146 -0
  254. package/src/costs.ts +347 -0
  255. package/src/counterfactuals.ts +516 -0
  256. package/src/enhancement-prompts.ts +118 -0
  257. package/src/envelopes.ts +814 -0
  258. package/src/format-normalizer.ts +1486 -0
  259. package/src/history.ts +400 -0
  260. package/src/html.ts +512 -0
  261. package/src/impact.ts +522 -0
  262. package/src/index.ts +83 -0
  263. package/src/insights.ts +341 -0
  264. package/src/joiner.ts +289 -0
  265. package/src/orchestrator.ts +1015 -0
  266. package/src/pdf.ts +110 -0
  267. package/src/prediction.ts +392 -0
  268. package/src/prompts/loader.ts +88 -0
  269. package/src/renderer.ts +1045 -0
  270. package/src/runid.ts +261 -0
  271. package/src/runtime.ts +450 -0
  272. package/src/scanner.ts +508 -0
  273. package/src/templates.ts +561 -0
  274. package/src/tools/index.ts +214 -0
  275. package/src/types.ts +873 -0
  276. package/src/version.ts +24 -0
  277. package/templates/context-accumulation.yaml +23 -0
  278. package/templates/cost-concentration.yaml +20 -0
  279. package/templates/dead-code.yaml +20 -0
  280. package/templates/latency-explainer.yaml +23 -0
  281. package/templates/optimizations/ab-testing-framework.yaml +74 -0
  282. package/templates/optimizations/api-gateway-optimization.yaml +81 -0
  283. package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
  284. package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
  285. package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
  286. package/templates/optimizations/comprehensive-apm.yaml +76 -0
  287. package/templates/optimizations/context-window-optimization.yaml +91 -0
  288. package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
  289. package/templates/optimizations/distributed-training-optimization.yaml +77 -0
  290. package/templates/optimizations/document-analysis-edge.yaml +77 -0
  291. package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
  292. package/templates/optimizations/domain-specific-distillation.yaml +78 -0
  293. package/templates/optimizations/error-handling-optimization.yaml +76 -0
  294. package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
  295. package/templates/optimizations/long-context-memory-management.yaml +78 -0
  296. package/templates/optimizations/max-tokens-optimization.yaml +76 -0
  297. package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
  298. package/templates/optimizations/multi-framework-resilience.yaml +75 -0
  299. package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
  300. package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
  301. package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
  302. package/templates/optimizations/quality-monitoring.yaml +74 -0
  303. package/templates/optimizations/realtime-budget-controls.yaml +74 -0
  304. package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
  305. package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
  306. package/templates/optimizations/smart-model-routing.yaml +96 -0
  307. package/templates/optimizations/streaming-batch-selection.yaml +167 -0
  308. package/templates/optimizations/system-prompt-optimization.yaml +75 -0
  309. package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
  310. package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
  311. package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
  312. package/templates/overpowered-extraction.yaml +32 -0
  313. package/templates/overpowered-model.yaml +31 -0
  314. package/templates/prompt-bloat.yaml +24 -0
  315. package/templates/retry-explosion.yaml +28 -0
  316. package/templates/schema/insight.schema.json +113 -0
  317. package/templates/schema/optimization.schema.json +180 -0
  318. package/templates/streaming-drift.yaml +30 -0
  319. package/templates/throughput-gap.yaml +21 -0
  320. package/templates/token-underutilization.yaml +28 -0
  321. package/templates/untested-fallback.yaml +21 -0
  322. package/tests/accuracy/drift-detection.test.ts +184 -0
  323. package/tests/accuracy/false-positives.test.ts +166 -0
  324. package/tests/accuracy/templates.test.ts +205 -0
  325. package/tests/action/commands.test.ts +125 -0
  326. package/tests/action/comments.test.ts +347 -0
  327. package/tests/cli.test.ts +203 -0
  328. package/tests/comparison.test.ts +309 -0
  329. package/tests/correlation-analyzer.test.ts +534 -0
  330. package/tests/counterfactuals.test.ts +347 -0
  331. package/tests/fixtures/events/missing-id.jsonl +1 -0
  332. package/tests/fixtures/events/missing-input.jsonl +1 -0
  333. package/tests/fixtures/events/missing-latency.jsonl +1 -0
  334. package/tests/fixtures/events/missing-model.jsonl +1 -0
  335. package/tests/fixtures/events/missing-output.jsonl +1 -0
  336. package/tests/fixtures/events/missing-provider.jsonl +1 -0
  337. package/tests/fixtures/events/missing-ts.jsonl +1 -0
  338. package/tests/fixtures/events/valid.csv +3 -0
  339. package/tests/fixtures/events/valid.json +1 -0
  340. package/tests/fixtures/events/valid.jsonl +2 -0
  341. package/tests/fixtures/events/with-callsite.jsonl +1 -0
  342. package/tests/fixtures/events/with-intent.jsonl +1 -0
  343. package/tests/fixtures/events/wrong-type.jsonl +1 -0
  344. package/tests/fixtures/repos/empty/.gitkeep +0 -0
  345. package/tests/fixtures/repos/hybrid-router/router.py +35 -0
  346. package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
  347. package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
  348. package/tests/fixtures/repos/saas-openai/client.py +26 -0
  349. package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
  350. package/tests/github-action.test.ts +292 -0
  351. package/tests/insights.test.ts +878 -0
  352. package/tests/joiner.test.ts +168 -0
  353. package/tests/performance/action-latency.test.ts +132 -0
  354. package/tests/performance/benchmark.test.ts +189 -0
  355. package/tests/performance/cli-latency.test.ts +102 -0
  356. package/tests/pr-comment.test.ts +313 -0
  357. package/tests/prediction.test.ts +296 -0
  358. package/tests/runtime-analyzer.test.ts +375 -0
  359. package/tests/runtime.test.ts +205 -0
  360. package/tests/scanner.test.ts +122 -0
  361. package/tests/template-conformance.test.ts +526 -0
  362. package/tests/unit/cost-calculator.test.ts +303 -0
  363. package/tests/unit/credits.test.ts +180 -0
  364. package/tests/unit/inference-map.test.ts +276 -0
  365. package/tests/unit/schema.test.ts +300 -0
  366. package/tsconfig.json +20 -0
  367. package/vitest.config.ts +14 -0
@@ -0,0 +1,1486 @@
1
+ /**
2
+ * Format Normalizer - Agent-based runtime event format detection and normalization.
3
+ *
4
+ * This module implements PRD §6.4: Enable PeakInfer to ingest runtime data from any
5
+ * observability system, logging framework, or custom format without requiring users
6
+ * to transform their data first.
7
+ *
8
+ * Design Principles (Julie Zhou aligned):
9
+ * - Behavior First: Detect formats automatically, fallback gracefully
10
+ * - Clarity Over Cleverness: Clear confidence scores, no silent assumptions
11
+ * - State Completeness: Handle all format states (known, agent-required, unknown)
12
+ */
13
+
14
+ import { query } from '@anthropic-ai/claude-agent-sdk';
15
+ import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk';
16
+ import type {
17
+ FormatType,
18
+ FieldMapping,
19
+ FormatDetectionResult,
20
+ NormalizationResult,
21
+ NormalizationOptions,
22
+ InferenceEvent,
23
+ ScanResult,
24
+ } from './types.js';
25
+ import { loadPrompt } from './templates.js';
26
+
/**
 * Concatenates the text of every `text` content block carried by the
 * assistant messages in `messages`, in encounter order and with no separator.
 *
 * Messages whose `type` is not `'assistant'`, or whose `message.content` is
 * missing/falsy, contribute nothing; so do content blocks whose `type` is not
 * `'text'`.
 *
 * @param messages - Messages received from the Claude Agent SDK.
 * @returns The joined text, or an empty string when no text block was found.
 */
function extractTextFromMessages(messages: SDKMessage[]): string {
  let collected = '';

  for (const entry of messages) {
    // Only assistant turns carry model output.
    if (entry.type !== 'assistant') continue;

    const blocks = entry.message?.content;
    if (!blocks) continue;

    for (const part of blocks) {
      if (part.type !== 'text') continue;
      collected += part.text;
    }
  }

  return collected;
}
43
+
44
+ // =============================================================================
45
+ // CONSTANTS
46
+ // =============================================================================
47
+
/** Number of lines to sample for detection. */
const SAMPLE_LINES = 20;

/** Minimum confidence for auto-acceptance. */
const MIN_CONFIDENCE_THRESHOLD = 0.7;

/**
 * Model identifier string.
 * NOTE(review): presumably the model requested through the Agent SDK — the
 * call site is outside this view, confirm there.
 */
const LLM_MODEL = 'claude-sonnet-4-20250514';

/** Required fields for InferenceEvent. */
const REQUIRED_FIELDS = [
  'id',
  'ts',
  'provider',
  'model',
  'input_tokens',
  'output_tokens',
  'latency_ms',
];
54
+
55
+ // =============================================================================
56
+ // FORMAT SIGNATURES
57
+ // =============================================================================
58
+
/**
 * Signatures of the known runtime-event formats, keyed by format name.
 *
 * Each entry holds:
 * - `patterns`: regular expressions naming tokens characteristic of the format
 *   (NOTE(review): presumably matched against raw sample text — the consumer
 *   is outside this view).
 * - `structuralCheck`: optional predicate over an already-parsed value that
 *   reports whether its top-level shape looks like the format.
 * - `confidence`: score attached to the format (how it is combined with the
 *   checks above is not visible here).
 *
 * Entry order and pattern order are kept stable in case a consumer iterates
 * them.
 */
const FORMAT_SIGNATURES: Record<string, {
  patterns: RegExp[];
  structuralCheck?: (data: unknown) => boolean;
  confidence: number;
}> = {
  otel: {
    patterns: [/resourceSpans/i, /scopeSpans/i, /traceId/i, /spanId/i],
    // Accepts both the camelCase and the snake_case spelling of the root key.
    structuralCheck: (data) =>
      typeof data === 'object' &&
      data !== null &&
      ('resourceSpans' in data || 'resource_spans' in data),
    confidence: 0.95,
  },

  jaeger: {
    patterns: [
      /traceID/, // Jaeger uses capital ID (vs OTEL's traceId)
      /spanID/, // Jaeger uses capital ID (vs OTEL's spanId)
      /operationName/,
      /jaeger/i,
      /"processes"/, // Jaeger-specific field
    ],
    // Expected shape: { data: [{ traceID: ..., processes: ... }] }
    structuralCheck: (data) => {
      if (typeof data !== 'object' || data === null || !('data' in data)) {
        return false;
      }
      const traces = (data as { data: unknown }).data;
      if (!Array.isArray(traces)) {
        return false;
      }
      const head = traces[0] as Record<string, unknown> | null | undefined;
      // traceID (capital ID) is what distinguishes this from OTEL.
      return head?.traceID !== undefined && head?.processes !== undefined;
    },
    confidence: 0.95,
  },

  zipkin: {
    patterns: [/"traceId"/, /"parentId"/, /"localEndpoint"/, /zipkin/i],
    // A top-level array whose first element has traceId and localEndpoint.
    structuralCheck: (data) => {
      if (!Array.isArray(data)) {
        return false;
      }
      const head = data[0] as Record<string, unknown> | null | undefined;
      return head?.traceId !== undefined && head?.localEndpoint !== undefined;
    },
    confidence: 0.95,
  },

  langsmith: {
    patterns: [/run_id/, /run_type/, /langsmith/i, /langchain/i],
    structuralCheck: (data) =>
      typeof data === 'object' &&
      data !== null &&
      ('run_id' in data || 'runs' in data),
    confidence: 0.90,
  },

  litellm: {
    patterns: [/litellm/i, /call_type/, /api_base/, /response_time_ms/],
    // LiteLLM logs have call_type OR api_base OR response_time_ms fields
    structuralCheck: (data) =>
      typeof data === 'object' &&
      data !== null &&
      ('call_type' in data || 'api_base' in data || 'response_time_ms' in data),
    confidence: 0.90,
  },

  helicone: {
    patterns: [
      /helicone/i,
      /helicone_request_id/,
      /helicone_response_id/,
      /helicone_properties/,
    ],
    // Helicone-specific fields - must have helicone_ prefixed fields
    structuralCheck: (data) =>
      typeof data === 'object' &&
      data !== null &&
      ('helicone_request_id' in data ||
        'helicone_response_id' in data ||
        'helicone_properties' in data ||
        'helicone' in data),
    confidence: 0.85,
  },
};
163
+
164
+ // =============================================================================
165
+ // PREDEFINED FIELD MAPPINGS
166
+ // =============================================================================
167
+
168
+ /**
169
+ * Predefined field mappings for known formats.
170
+ * These are high-confidence mappings based on format specifications.
171
+ */
172
+ const PREDEFINED_MAPPINGS: Record<string, FieldMapping[]> = {
173
+ otel: [
174
+ {
175
+ target: 'id',
176
+ source_path: '$.resourceSpans[*].scopeSpans[*].spans[*].spanId',
177
+ extraction_type: 'jsonpath',
178
+ transform: 'none',
179
+ confidence: 1.0,
180
+ evidence: 'OTLP span ID field',
181
+ },
182
+ {
183
+ target: 'ts',
184
+ source_path: '$.resourceSpans[*].scopeSpans[*].spans[*].startTimeUnixNano',
185
+ extraction_type: 'jsonpath',
186
+ transform: 'unix_nano_to_iso',
187
+ confidence: 1.0,
188
+ evidence: 'OTLP start time in nanoseconds',
189
+ },
190
+ {
191
+ target: 'provider',
192
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.provider')].value.stringValue",
193
+ extraction_type: 'jsonpath',
194
+ transform: 'provider_normalize',
195
+ confidence: 0.9,
196
+ evidence: 'LLM semantic convention attribute',
197
+ },
198
+ {
199
+ target: 'model',
200
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.model')].value.stringValue",
201
+ extraction_type: 'jsonpath',
202
+ transform: 'none',
203
+ confidence: 0.9,
204
+ evidence: 'LLM semantic convention attribute',
205
+ },
206
+ {
207
+ target: 'input_tokens',
208
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.prompt')].value.intValue",
209
+ extraction_type: 'jsonpath',
210
+ transform: 'parse_int',
211
+ confidence: 0.9,
212
+ evidence: 'LLM semantic convention attribute',
213
+ },
214
+ {
215
+ target: 'output_tokens',
216
+ source_path: "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.completion')].value.intValue",
217
+ extraction_type: 'jsonpath',
218
+ transform: 'parse_int',
219
+ confidence: 0.9,
220
+ evidence: 'LLM semantic convention attribute',
221
+ },
222
+ {
223
+ target: 'latency_ms',
224
+ source_path: '(endTimeUnixNano - startTimeUnixNano) / 1000000',
225
+ extraction_type: 'computed',
226
+ transform: 'none',
227
+ confidence: 1.0,
228
+ evidence: 'Computed from OTLP start/end timestamps',
229
+ },
230
+ ],
231
+ jaeger: [
232
+ {
233
+ target: 'id',
234
+ source_path: '$.data[*].spans[*].spanID',
235
+ extraction_type: 'jsonpath',
236
+ transform: 'none',
237
+ confidence: 1.0,
238
+ evidence: 'Jaeger span ID',
239
+ },
240
+ {
241
+ target: 'ts',
242
+ source_path: '$.data[*].spans[*].startTime',
243
+ extraction_type: 'jsonpath',
244
+ transform: 'unix_ms_to_iso',
245
+ confidence: 1.0,
246
+ evidence: 'Jaeger start time in microseconds',
247
+ },
248
+ {
249
+ target: 'provider',
250
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.provider')].value",
251
+ extraction_type: 'jsonpath',
252
+ transform: 'provider_normalize',
253
+ confidence: 0.85,
254
+ evidence: 'Tag-based provider extraction',
255
+ },
256
+ {
257
+ target: 'model',
258
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.model')].value",
259
+ extraction_type: 'jsonpath',
260
+ transform: 'none',
261
+ confidence: 0.85,
262
+ evidence: 'Tag-based model extraction',
263
+ },
264
+ {
265
+ target: 'input_tokens',
266
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.input_tokens')].value",
267
+ extraction_type: 'jsonpath',
268
+ transform: 'parse_int',
269
+ confidence: 0.85,
270
+ evidence: 'Tag-based token extraction',
271
+ },
272
+ {
273
+ target: 'output_tokens',
274
+ source_path: "$.data[*].spans[*].tags[?(@.key=='llm.output_tokens')].value",
275
+ extraction_type: 'jsonpath',
276
+ transform: 'parse_int',
277
+ confidence: 0.85,
278
+ evidence: 'Tag-based token extraction',
279
+ },
280
+ {
281
+ target: 'latency_ms',
282
+ source_path: '$.data[*].spans[*].duration',
283
+ extraction_type: 'jsonpath',
284
+ transform: 'none', // Jaeger duration is already in microseconds, convert to ms
285
+ confidence: 1.0,
286
+ evidence: 'Jaeger duration field (microseconds -> ms)',
287
+ },
288
+ ],
289
+ zipkin: [
290
+ {
291
+ target: 'id',
292
+ source_path: '$[*].id',
293
+ extraction_type: 'jsonpath',
294
+ transform: 'none',
295
+ confidence: 1.0,
296
+ evidence: 'Zipkin span ID',
297
+ },
298
+ {
299
+ target: 'ts',
300
+ source_path: '$[*].timestamp',
301
+ extraction_type: 'jsonpath',
302
+ transform: 'unix_ms_to_iso',
303
+ confidence: 1.0,
304
+ evidence: 'Zipkin timestamp in microseconds',
305
+ },
306
+ {
307
+ target: 'provider',
308
+ source_path: "$[*].tags['llm.provider']",
309
+ extraction_type: 'jsonpath',
310
+ transform: 'provider_normalize',
311
+ confidence: 0.85,
312
+ evidence: 'Tag-based provider extraction',
313
+ },
314
+ {
315
+ target: 'model',
316
+ source_path: "$[*].tags['llm.model']",
317
+ extraction_type: 'jsonpath',
318
+ transform: 'none',
319
+ confidence: 0.85,
320
+ evidence: 'Tag-based model extraction',
321
+ },
322
+ {
323
+ target: 'input_tokens',
324
+ source_path: "$[*].tags['llm.input_tokens']",
325
+ extraction_type: 'jsonpath',
326
+ transform: 'parse_int',
327
+ confidence: 0.85,
328
+ evidence: 'Tag-based token extraction',
329
+ },
330
+ {
331
+ target: 'output_tokens',
332
+ source_path: "$[*].tags['llm.output_tokens']",
333
+ extraction_type: 'jsonpath',
334
+ transform: 'parse_int',
335
+ confidence: 0.85,
336
+ evidence: 'Tag-based token extraction',
337
+ },
338
+ {
339
+ target: 'latency_ms',
340
+ source_path: '$[*].duration',
341
+ extraction_type: 'jsonpath',
342
+ transform: 'none', // Zipkin duration is in microseconds
343
+ confidence: 1.0,
344
+ evidence: 'Zipkin duration field (microseconds -> ms)',
345
+ },
346
+ ],
347
+ langsmith: [
348
+ {
349
+ target: 'id',
350
+ source_path: 'run_id',
351
+ extraction_type: 'direct',
352
+ transform: 'none',
353
+ confidence: 0.95,
354
+ evidence: 'LangSmith run ID',
355
+ },
356
+ {
357
+ target: 'ts',
358
+ source_path: 'start_time',
359
+ extraction_type: 'direct',
360
+ transform: 'none', // Already ISO format
361
+ confidence: 0.95,
362
+ evidence: 'LangSmith start timestamp',
363
+ },
364
+ {
365
+ target: 'provider',
366
+ source_path: 'extra.invocation_params.model_provider',
367
+ extraction_type: 'jsonpath',
368
+ transform: 'provider_normalize',
369
+ confidence: 0.8,
370
+ evidence: 'LangSmith invocation params provider',
371
+ },
372
+ {
373
+ target: 'model',
374
+ source_path: 'extra.invocation_params.model',
375
+ extraction_type: 'jsonpath',
376
+ transform: 'none',
377
+ confidence: 0.85,
378
+ evidence: 'LangSmith invocation params model',
379
+ },
380
+ {
381
+ target: 'input_tokens',
382
+ source_path: 'token_usage.prompt_tokens',
383
+ extraction_type: 'jsonpath',
384
+ transform: 'parse_int',
385
+ confidence: 0.9,
386
+ evidence: 'LangSmith token usage prompt_tokens',
387
+ },
388
+ {
389
+ target: 'output_tokens',
390
+ source_path: 'token_usage.completion_tokens',
391
+ extraction_type: 'jsonpath',
392
+ transform: 'parse_int',
393
+ confidence: 0.9,
394
+ evidence: 'LangSmith token usage completion_tokens',
395
+ },
396
+ {
397
+ target: 'latency_ms',
398
+ source_path: 'latency',
399
+ extraction_type: 'direct',
400
+ transform: 'duration_to_ms',
401
+ confidence: 0.9,
402
+ evidence: 'LangSmith latency field',
403
+ },
404
+ ],
405
+ litellm: [
406
+ {
407
+ target: 'id',
408
+ source_path: 'id',
409
+ extraction_type: 'direct',
410
+ transform: 'none',
411
+ confidence: 0.95,
412
+ evidence: 'LiteLLM request ID',
413
+ },
414
+ {
415
+ target: 'ts',
416
+ source_path: 'startTime',
417
+ extraction_type: 'direct',
418
+ transform: 'unix_ms_to_iso',
419
+ confidence: 0.9,
420
+ evidence: 'LiteLLM start timestamp',
421
+ },
422
+ {
423
+ target: 'provider',
424
+ source_path: 'model',
425
+ extraction_type: 'direct',
426
+ transform: 'provider_normalize', // LiteLLM uses model format like "openai/gpt-4"
427
+ confidence: 0.85,
428
+ evidence: 'LiteLLM model field (provider/model format)',
429
+ },
430
+ {
431
+ target: 'model',
432
+ source_path: 'model',
433
+ extraction_type: 'direct',
434
+ transform: 'none',
435
+ confidence: 0.95,
436
+ evidence: 'LiteLLM model field',
437
+ },
438
+ {
439
+ target: 'input_tokens',
440
+ source_path: 'usage.prompt_tokens',
441
+ extraction_type: 'jsonpath',
442
+ transform: 'parse_int',
443
+ confidence: 0.95,
444
+ evidence: 'LiteLLM usage prompt_tokens',
445
+ },
446
+ {
447
+ target: 'output_tokens',
448
+ source_path: 'usage.completion_tokens',
449
+ extraction_type: 'jsonpath',
450
+ transform: 'parse_int',
451
+ confidence: 0.95,
452
+ evidence: 'LiteLLM usage completion_tokens',
453
+ },
454
+ {
455
+ target: 'latency_ms',
456
+ source_path: 'response_time_ms',
457
+ extraction_type: 'direct',
458
+ transform: 'none',
459
+ confidence: 1.0,
460
+ evidence: 'LiteLLM response_time_ms field',
461
+ },
462
+ ],
463
+ helicone: [
464
+ {
465
+ target: 'id',
466
+ source_path: 'helicone_request_id',
467
+ extraction_type: 'direct',
468
+ transform: 'none',
469
+ confidence: 0.95,
470
+ evidence: 'Helicone request ID',
471
+ },
472
+ {
473
+ target: 'ts',
474
+ source_path: 'created_at',
475
+ extraction_type: 'direct',
476
+ transform: 'none', // Helicone uses ISO format
477
+ confidence: 0.9,
478
+ evidence: 'Helicone created_at timestamp',
479
+ },
480
+ {
481
+ target: 'provider',
482
+ source_path: 'provider',
483
+ extraction_type: 'direct',
484
+ transform: 'provider_normalize',
485
+ confidence: 0.9,
486
+ evidence: 'Helicone provider field',
487
+ },
488
+ {
489
+ target: 'model',
490
+ source_path: 'model',
491
+ extraction_type: 'direct',
492
+ transform: 'none',
493
+ confidence: 0.95,
494
+ evidence: 'Helicone model field',
495
+ },
496
+ {
497
+ target: 'input_tokens',
498
+ source_path: 'prompt_tokens',
499
+ extraction_type: 'direct',
500
+ transform: 'parse_int',
501
+ confidence: 0.9,
502
+ evidence: 'Helicone prompt_tokens',
503
+ },
504
+ {
505
+ target: 'output_tokens',
506
+ source_path: 'completion_tokens',
507
+ extraction_type: 'direct',
508
+ transform: 'parse_int',
509
+ confidence: 0.9,
510
+ evidence: 'Helicone completion_tokens',
511
+ },
512
+ {
513
+ target: 'latency_ms',
514
+ source_path: 'latency_ms',
515
+ extraction_type: 'direct',
516
+ transform: 'none',
517
+ confidence: 1.0,
518
+ evidence: 'Helicone latency_ms field',
519
+ },
520
+ ],
521
+ };
522
+
523
+ // =============================================================================
524
+ // FORMAT DETECTION
525
+ // =============================================================================
526
+
527
/** True when `value` is a non-null object, i.e. safe to probe with the `in` operator. */
function isProbeableObject(value: unknown): value is Record<string, unknown> {
  return typeof value === 'object' && value !== null;
}

/** True when `value` is an object that carries every key listed in REQUIRED_FIELDS. */
function carriesRequiredFields(value: unknown): boolean {
  return isProbeableObject(value) && REQUIRED_FIELDS.every(field => field in value);
}

/**
 * Return the first FORMAT_SIGNATURES entry whose `structuralCheck` accepts
 * `candidate`. Signatures that define no `structuralCheck` are skipped.
 */
function findSignatureMatch(
  candidate: unknown,
  serialized: string,
): { formatType: FormatType; confidence: number; patternMatches: number; patternCount: number } | undefined {
  for (const [formatType, signature] of Object.entries(FORMAT_SIGNATURES)) {
    if (!signature.structuralCheck || !signature.structuralCheck(candidate)) continue;

    const patternMatches = signature.patterns.filter(p => p.test(serialized)).length;
    const patternCount = signature.patterns.length;
    return {
      formatType: formatType as FormatType,
      // A structural match guarantees at least 80% of the signature's base confidence.
      confidence: Math.max(0.8, patternMatches / patternCount) * signature.confidence,
      patternMatches,
      patternCount,
    };
  }
  return undefined;
}

/**
 * Detect the format type of a runtime events file.
 *
 * Checks run in this order and the first hit wins:
 * 1. The whole content is a JSON array whose first element has every required
 *    field -> `json_array`.
 * 2. The content has several lines and every sampled line is valid JSON
 *    (JSONL): native schema -> `jsonl`, known signature -> that format,
 *    otherwise `custom_json`.
 * 3. The whole content is valid JSON: known signature -> that format,
 *    otherwise `custom_json`.
 * 4. Anything else is handed to `detectNonJSONFormat`.
 *
 * @param content  Raw file content.
 * @param filename Optional file name; only forwarded to `detectNonJSONFormat`.
 * @returns The detected format with confidence, evidence, and sample size.
 */
export function detectFormat(
  content: string,
  filename?: string,
): FormatDetectionResult {
  const lines = content.trim().split('\n').slice(0, SAMPLE_LINES);

  // Parse the whole document once. A flag is needed because `null` is itself
  // a valid JSON value and cannot double as the "parse failed" marker.
  let parsed: unknown;
  let parsedOk = true;
  try {
    parsed = JSON.parse(content);
  } catch {
    parsedOk = false;
  }

  // Single-line JSON arrays must be recognised before the JSONL heuristic.
  // carriesRequiredFields tolerates primitive/null elements, which would make
  // a bare `in` check throw.
  if (parsedOk && Array.isArray(parsed) && parsed.length > 0 && carriesRequiredFields(parsed[0])) {
    return {
      format_type: 'json_array',
      confidence: 1.0,
      evidence: 'JSON array with InferenceEvent schema',
      sample_size: Math.min(parsed.length, SAMPLE_LINES),
      requires_agent: false,
    };
  }

  // JSONL: several lines, each one valid JSON (blank lines are tolerated).
  if (lines.length > 1) {
    const everyLineIsJSON = lines.every(line => {
      const trimmed = line.trim();
      if (!trimmed) return true;
      try {
        JSON.parse(trimmed);
        return true;
      } catch {
        return false;
      }
    });

    const firstLine = everyLineIsJSON ? lines.find(l => l.trim()) : undefined;
    if (firstLine) {
      try {
        const firstEvent: unknown = JSON.parse(firstLine);

        // Lines holding primitives are not event records; fall through to the
        // remaining detection steps.
        if (isProbeableObject(firstEvent)) {
          if (carriesRequiredFields(firstEvent)) {
            return {
              format_type: 'jsonl',
              confidence: 1.0,
              evidence: 'JSONL with InferenceEvent schema (all required fields present)',
              sample_size: lines.length,
              requires_agent: false,
            };
          }

          const match = findSignatureMatch(firstEvent, JSON.stringify(firstEvent));
          if (match) {
            return {
              format_type: match.formatType,
              confidence: match.confidence,
              evidence: `JSONL with ${match.formatType} format (structural match, ${match.patternMatches}/${match.patternCount} patterns)`,
              sample_size: lines.length,
              requires_agent: true,
            };
          }

          // JSONL but unknown schema - mark as custom_json requiring agent
          return {
            format_type: 'custom_json',
            confidence: 0.7,
            evidence: 'JSONL with custom schema - requires field mapping',
            sample_size: lines.length,
            requires_agent: true,
          };
        }
      } catch {
        // A failing structural check is not fatal; continue with the other detection methods.
      }
    }
  }

  if (!parsedOk) {
    // Not valid JSON, check for CSV/TSV or text logs
    return detectNonJSONFormat(content, lines, filename);
  }

  const sampleSize = Array.isArray(parsed) ? parsed.length : 1;
  const match = findSignatureMatch(parsed, JSON.stringify(parsed));
  if (match) {
    return {
      format_type: match.formatType,
      confidence: match.confidence,
      evidence: `Matched ${match.formatType} format (structural match, ${match.patternMatches}/${match.patternCount} patterns)`,
      sample_size: sampleSize,
      requires_agent: true, // Known formats still need agent for field mapping
    };
  }

  // Unknown JSON structure - requires agent normalization
  return {
    format_type: 'custom_json',
    confidence: 0.5,
    evidence: 'Valid JSON but unknown schema - requires agent normalization',
    sample_size: sampleSize,
    requires_agent: true,
  };
}
696
+
697
/**
 * Classify content that failed JSON parsing as CSV, TSV, structured text
 * logs, or unknown, using only the sampled lines.
 *
 * @param content  Full raw content (not inspected here).
 * @param lines    Sampled lines; the first one decides the CSV/TSV checks.
 * @param filename Optional file name (not inspected here).
 */
function detectNonJSONFormat(
  content: string,
  lines: string[],
  filename?: string,
): FormatDetectionResult {
  const sampleSize = lines.length;
  const header = lines[0];

  // Comma in the first line => treat as CSV.
  if (header.includes(',')) {
    const columns = header.split(',').map(column => column.trim().toLowerCase());

    let llmHeaderFound = false;
    for (const keyword of ['provider', 'model', 'latency', 'tokens']) {
      if (columns.some(column => column.includes(keyword))) {
        llmHeaderFound = true;
        break;
      }
    }

    return llmHeaderFound
      ? {
          format_type: 'csv',
          confidence: 0.9,
          evidence: 'CSV with LLM-related headers detected',
          sample_size: sampleSize,
          requires_agent: false,
        }
      : {
          format_type: 'csv',
          confidence: 0.7,
          evidence: 'CSV format detected but headers may need mapping',
          sample_size: sampleSize,
          requires_agent: true,
        };
  }

  // Tab in the first line => TSV.
  if (header.includes('\t')) {
    return {
      format_type: 'tsv',
      confidence: 0.8,
      evidence: 'Tab-separated values detected',
      sample_size: sampleSize,
      requires_agent: true,
    };
  }

  // Structured text logs: count how many distinct patterns occur on any sampled line.
  const logPatterns: RegExp[] = [
    /^\d{4}-\d{2}-\d{2}/, // ISO date prefix
    /^\[\d+\]/, // Timestamp prefix
    /level=(info|warn|error|debug)/i,
    /provider=\w+/,
    /model=\w+/,
  ];

  let logMatchCount = 0;
  for (const pattern of logPatterns) {
    if (lines.some(line => pattern.test(line))) {
      logMatchCount += 1;
    }
  }

  if (logMatchCount >= 2) {
    return {
      format_type: 'custom_text',
      confidence: 0.6,
      evidence: `Structured text logs detected (${logMatchCount} patterns matched)`,
      sample_size: sampleSize,
      requires_agent: true,
    };
  }

  return {
    format_type: 'unknown',
    confidence: 0.3,
    evidence: 'Could not determine format - manual field mapping may be required',
    sample_size: sampleSize,
    requires_agent: true,
  };
}
774
+
775
+ // =============================================================================
776
+ // AGENT-BASED NORMALIZATION
777
+ // =============================================================================
778
+
779
/**
 * Return the instruction prompt for agent-based normalization.
 *
 * The prompt registered as 'format-normalizer' is used when `loadPrompt`
 * returns one; otherwise the built-in text below is returned.
 */
function getNormalizationPrompt(): string {
  const builtInPrompt = `You are an expert at parsing log formats and trace data. Analyze the following sample data and determine field mappings to the InferenceEvent schema.

The target InferenceEvent schema requires these fields:
- id (string): Unique event identifier
- ts (string): ISO 8601 timestamp
- provider (string): LLM provider (openai, anthropic, google, etc.)
- model (string): Model name (gpt-4o, claude-3-5-sonnet, etc.)
- input_tokens (number): Input/prompt token count
- output_tokens (number): Output/completion token count
- latency_ms (number): Request latency in milliseconds

Optional fields:
- streaming (boolean), ttft_ms (number), batch_size (number), cached (boolean), retry_count (number)

For each target field, provide:
1. The source path/expression to extract the value
2. The extraction type (direct, jsonpath, regex, computed)
3. Any transform needed (unix_ms_to_iso, unix_nano_to_iso, parse_int, etc.)
4. Your confidence (0.0-1.0) in this mapping
5. Evidence explaining why you chose this mapping

If a field cannot be mapped, indicate it as unmappable with confidence 0.

Respond in JSON format:
{
"format_type": "detected format name",
"mappings": [
{
"target": "field_name",
"source_path": "path or expression",
"extraction_type": "direct|jsonpath|regex|computed",
"transform": "none|unix_ms_to_iso|parse_int|...",
"confidence": 0.9,
"evidence": "explanation"
}
],
"unmapped_fields": ["fields that could not be mapped"],
"warnings": ["any issues or caveats"]
}`;

  const loaded = loadPrompt('format-normalizer');
  return loaded ? loaded.prompt : builtInPrompt;
}
826
+
827
/**
 * Use the LLM agent to derive field mappings for a format that cannot be
 * parsed directly.
 *
 * The first SAMPLE_LINES lines of `content` are sent to the agent together
 * with the normalization prompt, optional codebase context, and user hints.
 * The agent's JSON answer is validated entry by entry before it is turned
 * into FieldMapping objects.
 *
 * Never throws: a missing ANTHROPIC_API_KEY, an unparseable answer, or any
 * agent failure yields `createFallbackResult(...)` with an explanatory warning.
 *
 * @param content   Raw runtime-events content.
 * @param detection Result of the preceding format detection.
 * @param options   Optional hints, codebase context, and the `lenient` flag.
 */
export async function normalizeWithAgent(
  content: string,
  detection: FormatDetectionResult,
  options: NormalizationOptions = {},
): Promise<NormalizationResult> {
  // Check for API key
  if (!process.env.ANTHROPIC_API_KEY) {
    return createFallbackResult(detection, 'No ANTHROPIC_API_KEY - agent normalization unavailable');
  }

  // Sample content for the agent
  const sampleContent = content.trim().split('\n').slice(0, SAMPLE_LINES).join('\n');

  // Build context prompt
  let contextPrompt = '';
  if (options.codebase_context) {
    const scanResult = options.codebase_context as ScanResult;
    contextPrompt = `\n\nCodebase context available:
- ${scanResult.files.length} files scanned
- Languages: ${scanResult.summary.languages.join(', ')}
- ${scanResult.summary.totalCandidates} potential inference points detected

This may help identify logging patterns and field names used in the application.`;
  }

  // User hints
  let hintsPrompt = '';
  if (options.format_hint) {
    hintsPrompt += `\nUser hint: Format appears to be "${options.format_hint}"`;
  }
  if (options.field_hints) {
    hintsPrompt += `\nUser-provided field mappings: ${JSON.stringify(options.field_hints)}`;
  }

  try {
    // Use Claude Agent SDK query() function
    const agentQuery = query({
      prompt: `${getNormalizationPrompt()}${contextPrompt}${hintsPrompt}

Detected format: ${detection.format_type} (confidence: ${detection.confidence})

Sample data:
\`\`\`
${sampleContent}
\`\`\``,
      options: {
        model: LLM_MODEL,
        tools: [],
        permissionMode: 'plan',
        cwd: process.cwd(),
      },
    });

    // Collect all messages from the async generator
    const messages: SDKMessage[] = [];
    for await (const message of agentQuery) {
      messages.push(message);
    }

    // Parse LLM response: take the outermost {...} span of the reply text.
    const responseText = extractTextFromMessages(messages);
    const jsonMatch = responseText.match(/\{[\s\S]*\}/);

    if (!jsonMatch) {
      return createFallbackResult(detection, 'Could not parse agent response');
    }

    // The answer is untrusted model output, so every field is checked before use.
    const agentResult = JSON.parse(jsonMatch[0]) as {
      format_type?: unknown;
      mappings?: unknown;
      unmapped_fields?: unknown;
      warnings?: unknown;
    };

    const onlyStrings = (value: unknown): string[] =>
      Array.isArray(value) ? value.filter((item): item is string => typeof item === 'string') : [];

    const warnings = onlyStrings(agentResult.warnings);
    const rawMappings: unknown[] = Array.isArray(agentResult.mappings) ? agentResult.mappings : [];

    const mappings: FieldMapping[] = [];
    let malformedEntries = 0;
    for (const entry of rawMappings) {
      if (typeof entry !== 'object' || entry === null) {
        malformedEntries += 1;
        continue;
      }
      const m = entry as Record<string, unknown>;
      if (typeof m.target !== 'string' || typeof m.source_path !== 'string') {
        malformedEntries += 1;
        continue;
      }
      mappings.push({
        target: m.target,
        source_path: m.source_path,
        extraction_type: m.extraction_type as FieldMapping['extraction_type'],
        transform: (m.transform || 'none') as FieldMapping['transform'],
        // A missing or non-numeric confidence counts as no confidence at all.
        confidence: typeof m.confidence === 'number' && Number.isFinite(m.confidence) ? m.confidence : 0,
        evidence: typeof m.evidence === 'string' ? m.evidence : undefined,
      });
    }

    if (malformedEntries > 0) {
      warnings.push(`Ignored ${malformedEntries} malformed mapping entries in agent response`);
    }

    // Check confidence threshold. An empty mapping list is reported explicitly
    // instead of dividing by zero (NaN would silently skip the warning).
    if (mappings.length === 0) {
      warnings.push('Agent returned no usable field mappings');
    } else {
      const avgConfidence = mappings.reduce((sum, m) => sum + m.confidence, 0) / mappings.length;
      if (avgConfidence < MIN_CONFIDENCE_THRESHOLD && !options.lenient) {
        warnings.push(
          `Average mapping confidence (${avgConfidence.toFixed(2)}) is below threshold (${MIN_CONFIDENCE_THRESHOLD}). ` +
          `Use --lenient flag to accept low-confidence mappings.`
        );
      }
    }

    const agentFormat =
      typeof agentResult.format_type === 'string' && agentResult.format_type
        ? (agentResult.format_type as FormatType)
        : detection.format_type;

    return {
      detection: {
        ...detection,
        format_type: agentFormat,
      },
      mappings,
      unmapped_fields: onlyStrings(agentResult.unmapped_fields),
      warnings,
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: true,
        codebase_context_used: !!options.codebase_context,
        llm_model: LLM_MODEL,
      },
    };
  } catch (error) {
    return createFallbackResult(
      detection,
      `Agent normalization failed: ${error instanceof Error ? error.message : String(error)}`
    );
  }
}
956
+
957
/**
 * Create a fallback normalization result when the agent is unavailable or failed.
 *
 * Uses the predefined mappings for the detected format when there are any;
 * otherwise every required field is reported as unmapped.
 *
 * @param detection Format detection the result is based on.
 * @param warning   Reason the fallback was needed; becomes the only warning.
 */
function createFallbackResult(
  detection: FormatDetectionResult,
  warning: string,
): NormalizationResult {
  // Use predefined mappings if available
  const predefinedMappings = PREDEFINED_MAPPINGS[detection.format_type];

  return {
    detection,
    // Copies keep callers that mutate the result from altering the shared
    // module-level tables.
    mappings: predefinedMappings ? [...predefinedMappings] : [],
    unmapped_fields: predefinedMappings ? [] : [...REQUIRED_FIELDS],
    warnings: [warning],
    audit: {
      normalized_at: new Date().toISOString(),
      agent_used: false,
      codebase_context_used: false,
    },
  };
}
979
+
980
+ // =============================================================================
981
+ // FIELD EXTRACTION
982
+ // =============================================================================
983
+
984
/** Milliseconds per duration unit accepted by the `duration_to_ms` transform. */
const DURATION_UNIT_TO_MS = new Map<string, number>([
  ['ns', 1e-6],
  ['us', 1e-3],
  ['ms', 1],
  ['s', 1000],
  ['sec', 1000],
  ['m', 60000],
  ['min', 60000],
  ['h', 3600000],
]);

/**
 * Apply a transformation to an extracted value.
 *
 * `null` and `undefined` are returned untouched, and an unrecognised
 * transform returns the value unchanged.
 *
 * @param value     Raw value read from a source record.
 * @param transform Name of the conversion to apply.
 * @returns The converted value.
 * @throws RangeError from `toISOString()` when a `unix_*_to_iso` transform
 *         receives a value that does not form a valid date.
 */
function applyTransform(value: unknown, transform: FieldMapping['transform']): unknown {
  if (value === null || value === undefined) return value;

  switch (transform) {
    case 'none':
      return value;

    case 'unix_ms_to_iso':
      return new Date(Number(value)).toISOString();

    case 'unix_s_to_iso':
      return new Date(Number(value) * 1000).toISOString();

    case 'unix_nano_to_iso':
      return new Date(Number(value) / 1_000_000).toISOString();

    case 'duration_to_ms': {
      // Accepts "<number>[ ][unit]"; a bare number is already milliseconds.
      const text = String(value).trim();
      const parts = /^([\d.]+)\s*([a-z]+)?$/i.exec(text);
      if (parts) {
        const factor = DURATION_UNIT_TO_MS.get((parts[2] ?? 'ms').toLowerCase());
        if (factor !== undefined) {
          return parseFloat(parts[1]) * factor;
        }
      }
      // Unknown shape or unit: best-effort numeric prefix, as before.
      return parseFloat(text);
    }

    case 'parse_int':
      return parseInt(String(value), 10);

    case 'parse_float':
      return parseFloat(String(value));

    case 'lowercase':
      return String(value).toLowerCase();

    case 'provider_normalize': {
      const str = String(value).toLowerCase();
      // Normalize common provider variations. Azure is tested before OpenAI
      // because names such as "azure_openai" contain both substrings.
      if (str.includes('azure')) return 'azure_openai';
      if (str.includes('openai')) return 'openai';
      if (str.includes('anthropic')) return 'anthropic';
      if (str.includes('google')) return 'google';
      if (str.includes('bedrock')) return 'bedrock';
      if (str.includes('together')) return 'together';
      if (str.includes('groq')) return 'groq';
      return str;
    }

    default:
      return value;
  }
}
1044
+
1045
/**
 * Extract a value from an object using a simple path.
 *
 * Supports dot notation (`a.b`), numeric array access (`a[0]`), and an
 * optional leading `$.`. Only own properties are followed, so inherited
 * members such as `constructor` never leak out as values. When the nested
 * walk finds nothing, the whole path is tried once as a literal top-level
 * key, which covers flat records whose keys themselves contain dots.
 *
 * @param obj  Record to read from.
 * @param path Path expression.
 * @returns The value at `path`, or `undefined` when it cannot be resolved.
 */
function extractValue(obj: unknown, path: string): unknown {
  const normalized = path.startsWith('$.') ? path.slice(2) : path;
  const hasOwn = (target: object, key: string): boolean =>
    Object.prototype.hasOwnProperty.call(target, key);

  // The capture group yields array indices; filter(Boolean) drops the empty
  // strings and undefined captures that split() leaves behind.
  const segments = normalized.split(/\.|\[(\d+)\]/).filter(Boolean);

  let current: unknown = obj;
  for (const segment of segments) {
    if (typeof current !== 'object' || current === null || !hasOwn(current, segment)) {
      current = undefined;
      break;
    }
    current = (current as Record<string, unknown>)[segment];
  }

  if (
    current === undefined &&
    segments.length > 1 &&
    typeof obj === 'object' &&
    obj !== null &&
    hasOwn(obj, normalized)
  ) {
    return (obj as Record<string, unknown>)[normalized];
  }

  return current;
}
1069
+
1070
/** Parse newline-delimited JSON, ignoring blank lines and trailing carriage returns. */
function parseJsonLines(content: string): unknown[] {
  return content
    .split('\n')
    .map(line => line.trim())
    .filter(line => line.length > 0)
    .map(line => JSON.parse(line) as unknown);
}

/**
 * Turn raw content into a list of records according to the detected format.
 * Throws when the content cannot be parsed for that format.
 */
function parseRecordsForExtraction(content: string, formatType: FormatType): unknown[] {
  if (formatType === 'jsonl') {
    return parseJsonLines(content);
  }

  if (formatType === 'json_array') {
    const parsed: unknown = JSON.parse(content);
    // A lone object is treated as a one-element array instead of being dropped.
    return Array.isArray(parsed) ? parsed : [parsed];
  }

  if (formatType === 'csv' || formatType === 'tsv') {
    // Plain delimiter split: quoted fields containing the delimiter are not supported.
    const delimiter = formatType === 'csv' ? ',' : '\t';
    const lines = content.trim().split('\n');
    const headers = lines[0].split(delimiter).map(h => h.trim());
    return lines.slice(1).map(line => {
      const values = line.split(delimiter);
      const obj: Record<string, string> = {};
      headers.forEach((h, i) => { obj[h] = values[i]?.trim() || ''; });
      return obj;
    });
  }

  // For complex formats (OTEL, Jaeger, etc.), parse and flatten
  let data: unknown;
  try {
    data = JSON.parse(content);
  } catch (wholeDocumentError) {
    // Format detection also reports newline-delimited JSON under non-'jsonl'
    // format types (known signatures and custom schemas), so retry line by
    // line before giving up.
    try {
      return parseJsonLines(content);
    } catch {
      throw wholeDocumentError;
    }
  }
  return flattenComplexFormat(data, formatType);
}

/**
 * Extract InferenceEvents from normalized data using field mappings.
 *
 * Each record is run through every mapping; a mapping that throws, or that
 * produces `undefined`, `null`, or `NaN`, leaves its target field unset.
 * Records missing any required field are reported in `errors` instead of
 * being returned as events.
 *
 * @param content       Raw runtime-events content.
 * @param normalization Detection result plus the field mappings to apply.
 * @returns Extracted events and one error message per rejected record (or a
 *          single message when the content cannot be parsed at all).
 */
export function extractEvents(
  content: string,
  normalization: NormalizationResult,
): { events: InferenceEvent[]; errors: string[] } {
  const events: InferenceEvent[] = [];
  const errors: string[] = [];

  // Parse content based on format type
  let records: unknown[];
  try {
    records = parseRecordsForExtraction(content, normalization.detection.format_type);
  } catch (error) {
    errors.push(`Failed to parse content: ${error instanceof Error ? error.message : String(error)}`);
    return { events, errors };
  }

  // Extract events using mappings
  records.forEach((record, index) => {
    const event: Record<string, unknown> = {};

    for (const mapping of normalization.mappings) {
      try {
        let value: unknown;

        if (mapping.extraction_type === 'computed') {
          // Handle computed fields (e.g., latency = end - start)
          value = computeValue(record as Record<string, unknown>, mapping.source_path);
        } else if (mapping.extraction_type === 'constant') {
          value = mapping.source_path;
        } else {
          value = extractValue(record, mapping.source_path);
        }

        if (value === undefined || value === null) continue;

        const transformed = applyTransform(value, mapping.transform);
        const unusable =
          transformed === undefined ||
          transformed === null ||
          (typeof transformed === 'number' && Number.isNaN(transformed));
        // Unusable results stay unset so the required-field check reports them.
        if (!unusable) {
          event[mapping.target] = transformed;
        }
      } catch {
        // Best effort: a failing mapping only leaves this field unset for this record.
      }
    }

    // Validate required fields
    const missingFields = REQUIRED_FIELDS.filter(f => !(f in event));
    if (missingFields.length === 0) {
      events.push(event as unknown as InferenceEvent);
    } else {
      errors.push(`Record ${index + 1}: Missing required fields: ${missingFields.join(', ')}`);
    }
  });

  return { events, errors };
}
1148
+
1149
/**
 * Reduce a parsed trace document to a flat list of records.
 *
 * OTEL and Jaeger documents are unwrapped by their dedicated helpers, Zipkin
 * documents are expected to be an array of spans already, and any other
 * format is returned as-is when it is an array or wrapped in one otherwise.
 */
function flattenComplexFormat(data: unknown, formatType: FormatType): unknown[] {
  switch (formatType) {
    case 'otel':
      return flattenOTEL(data);

    case 'jaeger':
      return flattenJaeger(data);

    case 'zipkin':
      // A non-array Zipkin document yields no records.
      return Array.isArray(data) ? data : [];

    default:
      // Unknown formats: pass arrays through, wrap a single value.
      return Array.isArray(data) ? data : [data];
  }
}
1166
+
1167
/**
 * Flatten OTEL traces into individual spans.
 *
 * Walks `resourceSpans[].scopeSpans[].spans[]` and also the older
 * `instrumentationLibrarySpans` grouping key. Missing, null, or non-array
 * containers are treated as empty instead of raising.
 *
 * @param data Parsed OTEL JSON document.
 * @returns Every span found, in document order.
 */
function flattenOTEL(data: unknown): unknown[] {
  const listOf = (value: unknown): unknown[] => (Array.isArray(value) ? value : []);
  const field = (holder: unknown, key: string): unknown =>
    typeof holder === 'object' && holder !== null
      ? (holder as Record<string, unknown>)[key]
      : undefined;

  const spans: unknown[] = [];
  for (const resourceSpan of listOf(field(data, 'resourceSpans'))) {
    const groups = [
      ...listOf(field(resourceSpan, 'scopeSpans')),
      ...listOf(field(resourceSpan, 'instrumentationLibrarySpans')),
    ];
    for (const group of groups) {
      for (const span of listOf(field(group, 'spans'))) {
        spans.push(span);
      }
    }
  }

  return spans;
}
1190
+
1191
/**
 * Flatten Jaeger traces into individual spans.
 *
 * Walks `data[].spans[]`. Missing, null, or non-array containers are treated
 * as empty instead of raising.
 *
 * @param data Parsed Jaeger JSON document.
 * @returns Every span found, in document order.
 */
function flattenJaeger(data: unknown): unknown[] {
  const listOf = (value: unknown): unknown[] => (Array.isArray(value) ? value : []);
  const field = (holder: unknown, key: string): unknown =>
    typeof holder === 'object' && holder !== null
      ? (holder as Record<string, unknown>)[key]
      : undefined;

  const spans: unknown[] = [];
  for (const trace of listOf(field(data, 'data'))) {
    for (const span of listOf(field(trace, 'spans'))) {
      spans.push(span);
    }
  }

  return spans;
}
1210
+
1211
/**
 * Compute a derived numeric value from a small expression.
 *
 * Supported forms:
 * - `(endField - startField) / divisor`, e.g.
 *   `(endTimeUnixNano - startTimeUnixNano) / 1000000`; the `/ divisor` part
 *   is optional.
 * - `$.field`, the numeric value of a single top-level field.
 *
 * @param record     Source record; field names refer to its top-level keys.
 * @param expression Expression to evaluate.
 * @returns The computed number, or `undefined` when the expression is not
 *          recognised, an operand is missing or non-numeric, or the divisor
 *          is zero.
 */
function computeValue(record: Record<string, unknown>, expression: string): number | undefined {
  if (typeof record !== 'object' || record === null) return undefined;

  // Only real numbers and non-blank numeric strings count as operands;
  // Number() would otherwise turn null and '' into 0.
  const operand = (raw: unknown): number | undefined => {
    if (typeof raw === 'string' && raw.trim() === '') return undefined;
    if (typeof raw !== 'number' && typeof raw !== 'string') return undefined;
    const parsed = Number(raw);
    return Number.isNaN(parsed) ? undefined : parsed;
  };

  const difference = expression.match(/\((\w+)\s*-\s*(\w+)\)(?:\s*\/\s*(\d+))?/);
  if (difference) {
    const [, endField, startField, divisorText] = difference;
    const endValue = operand(record[endField]);
    const startValue = operand(record[startField]);
    const divisor = divisorText === undefined ? 1 : Number(divisorText);

    if (endValue !== undefined && startValue !== undefined && divisor !== 0) {
      return (endValue - startValue) / divisor;
    }
  }

  // Try direct field access
  const fieldMatch = expression.match(/^\$\.(\w+)$/);
  if (fieldMatch) {
    return operand(record[fieldMatch[1]]);
  }

  return undefined;
}
1237
+
1238
+ // =============================================================================
1239
+ // PUBLIC API
1240
+ // =============================================================================
1241
+
1242
/**
 * Format "<label> <n>: <reason>" for a rejected record without repeating a
 * "Record <n>: " prefix that the reason already carries.
 */
function describeRecordFailure(label: string, position: number, failure: unknown): string {
  const reason = failure instanceof Error ? failure.message : String(failure);
  const duplicatePrefix = `Record ${position}: `;
  const detail = reason.startsWith(duplicatePrefix) ? reason.slice(duplicatePrefix.length) : reason;
  return `${label} ${position}: ${detail}`;
}

/**
 * Main entry point: Detect format and normalize runtime events.
 *
 * Pipeline:
 * 1. Detect the format type from the content; a user `format_hint` overrides
 *    the detected type.
 * 2. Direct-parse formats (`jsonl`, `json_array`) are validated record by
 *    record without field mappings.
 * 3. Other formats use predefined mappings when detection permits it and
 *    mappings exist for the format; otherwise the agent is asked (which
 *    itself falls back gracefully when no API key is configured).
 * 4. Events are extracted with the chosen mappings.
 *
 * @param content Raw runtime-events content.
 * @param options Optional hints and context forwarded to the agent.
 * @returns Extracted events, the normalization record, and per-record errors.
 */
export async function normalizeRuntimeEvents(
  content: string,
  options: NormalizationOptions = {},
): Promise<{
  events: InferenceEvent[];
  normalization: NormalizationResult;
  errors: string[];
}> {
  // Step 1: Detect format
  const detection = detectFormat(content, options.format_hint?.toString());

  // Override with user hint if provided
  if (options.format_hint) {
    detection.format_type = options.format_hint;
    detection.evidence = `User-specified format: ${options.format_hint}`;
  }

  // Step 2: For direct-parse formats, parse directly without field mappings
  if (detection.format_type === 'jsonl' || detection.format_type === 'json_array') {
    const events: InferenceEvent[] = [];
    const errors: string[] = [];

    if (detection.format_type === 'jsonl') {
      // JSONL: one JSON object per line; blank lines are skipped
      content.trim().split('\n').forEach((rawLine, index) => {
        const line = rawLine.trim();
        if (!line) return;
        try {
          events.push(validateAndConvertEvent(JSON.parse(line), index + 1));
        } catch (e) {
          errors.push(describeRecordFailure('Line', index + 1, e));
        }
      });
    } else {
      // JSON array
      try {
        const data: unknown = JSON.parse(content);
        if (Array.isArray(data)) {
          data.forEach((item, index) => {
            try {
              events.push(validateAndConvertEvent(item, index + 1));
            } catch (e) {
              errors.push(describeRecordFailure('Record', index + 1, e));
            }
          });
        } else {
          // Possible when the user hint forces json_array onto other content.
          errors.push(
            `Parse error: expected a top-level JSON array, got ${data === null ? 'null' : typeof data}`
          );
        }
      } catch (e) {
        errors.push(`Parse error: ${e instanceof Error ? e.message : String(e)}`);
      }
    }

    const normalization: NormalizationResult = {
      detection,
      mappings: [],
      unmapped_fields: [],
      warnings: errors.length > 0 ? [`${errors.length} records had parsing issues`] : [],
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: false,
        codebase_context_used: false,
      },
    };

    return { events, normalization, errors };
  }

  // Step 3: Get or generate field mappings for complex formats
  let normalization: NormalizationResult;

  const predefinedMappings = PREDEFINED_MAPPINGS[detection.format_type];
  const agentNotNeeded = !detection.requires_agent || detection.confidence >= 0.95;

  if (agentNotNeeded && predefinedMappings && predefinedMappings.length > 0) {
    // Use predefined mappings for known complex formats
    normalization = {
      detection,
      mappings: [...predefinedMappings],
      unmapped_fields: [],
      warnings: [],
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: false,
        codebase_context_used: false,
      },
    };
  } else {
    // Agent normalization required. This also covers formats with no
    // predefined mappings: an empty mapping list could never produce an event.
    normalization = await normalizeWithAgent(content, detection, options);
  }

  // Step 4: Extract events using field mappings
  const { events, errors } = extractEvents(content, normalization);

  // Add extraction errors to warnings: verbatim up to five, summarised beyond that
  if (errors.length > 0 && errors.length <= 5) {
    normalization.warnings.push(...errors);
  } else if (errors.length > 5) {
    normalization.warnings.push(`${errors.length} records failed extraction (first: ${errors[0]})`);
  }

  return { events, normalization, errors };
}
1353
+
1354
/**
 * Validate and convert raw data to InferenceEvent.
 * Used for direct-parse formats (JSONL, JSON array).
 *
 * Required fields must be present with the expected primitive type. Optional
 * fields are copied only when they have the expected type and are otherwise
 * left `undefined`.
 *
 * @param data      Parsed record.
 * @param recordNum 1-based record position, used in error messages.
 * @returns The validated event.
 * @throws Error when `data` is not a plain object, or when any required field
 *         is missing or has the wrong type (all problems listed in one message).
 */
function validateAndConvertEvent(data: unknown, recordNum: number): InferenceEvent {
  if (typeof data !== 'object' || data === null || Array.isArray(data)) {
    // typeof reports 'object' for both null and arrays, so name them explicitly.
    const actual = data === null ? 'null' : Array.isArray(data) ? 'array' : typeof data;
    throw new Error(`Expected object, got ${actual}`);
  }

  const obj = data as Record<string, unknown>;

  // Required fields and the primitive type each must have
  const requiredTypes: Array<[string, 'string' | 'number']> = [
    ['id', 'string'],
    ['ts', 'string'],
    ['provider', 'string'],
    ['model', 'string'],
    ['input_tokens', 'number'],
    ['output_tokens', 'number'],
    ['latency_ms', 'number'],
  ];

  const errors: string[] = [];
  for (const [field, expectedType] of requiredTypes) {
    const value = obj[field];
    if (value === undefined || value === null) {
      errors.push(`Missing '${field}'`);
    } else if (typeof value !== expectedType) {
      errors.push(`Invalid '${field}' (expected ${expectedType}, got ${typeof value})`);
    }
  }

  if (errors.length > 0) {
    throw new Error(`Record ${recordNum}: ${errors.join(', ')}`);
  }

  return {
    id: obj.id as string,
    ts: obj.ts as string,
    provider: obj.provider as InferenceEvent['provider'],
    model: obj.model as string,
    input_tokens: obj.input_tokens as number,
    output_tokens: obj.output_tokens as number,
    latency_ms: obj.latency_ms as number,
    // Optional fields
    intent: typeof obj.intent === 'string' ? obj.intent : undefined,
    callsite_id: typeof obj.callsite_id === 'string' ? obj.callsite_id : undefined,
    streaming: typeof obj.streaming === 'boolean' ? obj.streaming : undefined,
    ttft_ms: typeof obj.ttft_ms === 'number' ? obj.ttft_ms : undefined,
    batch_size: typeof obj.batch_size === 'number' ? obj.batch_size : undefined,
    batch_id: typeof obj.batch_id === 'string' ? obj.batch_id : undefined,
    cached: typeof obj.cached === 'boolean' ? obj.cached : undefined,
    retry_count: typeof obj.retry_count === 'number' ? obj.retry_count : undefined,
    fallback_used: typeof obj.fallback_used === 'boolean' ? obj.fallback_used : undefined,
    original_model: typeof obj.original_model === 'string' ? obj.original_model : undefined,
  };
}
1400
+
1401
/**
 * Look up the predefined field mappings registered for a format type.
 *
 * @param formatType - Format type used as the key into PREDEFINED_MAPPINGS.
 * @returns The mappings stored under that key, or `undefined` when the table
 *   has no entry for it.
 */
export function getPredefinedMappings(formatType: FormatType): FieldMapping[] | undefined {
  const predefined = PREDEFINED_MAPPINGS[formatType];
  return predefined;
}
1407
+
1408
/**
 * Report whether a format type requires agent normalization.
 *
 * @param formatType - Format type to classify.
 * @returns `false` for 'jsonl', 'json_array', 'csv' and 'tsv'; `true` for
 *   every other value.
 */
export function requiresAgentNormalization(formatType: FormatType): boolean {
  // Widen to string so the comparison does not depend on the members of FormatType.
  const name: string = formatType;
  switch (name) {
    case 'jsonl':
    case 'json_array':
    case 'csv':
    case 'tsv':
      return false;
    default:
      return true;
  }
}
1414
+
1415
+ // =============================================================================
1416
+ // FIELD MAPPING VALIDATION
1417
+ // =============================================================================
1418
+
1419
/**
 * Outcome of parsing a "target=source,..." field-mapping string.
 */
export interface MappingValidationResult {
  /** False once at least one pair has been rejected as malformed. */
  valid: boolean;
  /** Accepted pairs, in the order they appeared in the input. */
  mappings: { target: string; source: string }[];
  /** One message per rejected pair. */
  errors: string[];
  /** Non-fatal notes; these do not affect `valid`. */
  warnings: string[];
}
1425
+
1426
/**
 * Parse and validate a field mapping string.
 * Format: "target=source,target2=source2,..."
 * Example: "model=llm_model,latency_ms=duration_ms"
 *
 * @param mappingStr - Raw mapping string; `undefined`, empty, or
 *   whitespace-only means "no custom mappings" and is treated as valid.
 * @returns The parsed pairs plus any errors and warnings. `valid` is false
 *   when at least one pair is malformed (not exactly one '=', or an empty
 *   side). Unknown target names and unmapped critical fields only produce
 *   warnings.
 */
export function validateFieldMappings(mappingStr: string | undefined): MappingValidationResult {
  const result: MappingValidationResult = {
    valid: true,
    mappings: [],
    errors: [],
    warnings: [],
  };

  if (!mappingStr || mappingStr.trim() === '') {
    return result; // Empty mapping is valid (no custom mappings)
  }

  // Known InferenceEvent fields. Built once rather than per pair, and includes
  // 'intent', 'ttft_ms' and 'original_model', which are read as event fields
  // elsewhere in this file, so mapping them does not raise a false warning.
  const validTargets = new Set<string>([
    'id', 'ts', 'provider', 'model', 'input_tokens', 'output_tokens',
    'latency_ms', 'intent', 'callsite_id', 'streaming', 'ttft_ms', 'cached',
    'batch_id', 'batch_size', 'retry_count', 'fallback_used', 'original_model',
    'error_code', 'error_message',
  ]);

  // Split into trimmed, non-empty "target=source" pairs.
  const pairs = mappingStr.split(',').map(p => p.trim()).filter(p => p);

  for (const pair of pairs) {
    const parts = pair.split('=');
    if (parts.length !== 2) {
      result.errors.push(`Invalid mapping format: "${pair}" (expected target=source)`);
      result.valid = false;
      continue;
    }

    const [target, source] = parts.map(p => p.trim());

    if (!target || !source) {
      result.errors.push(`Empty target or source in mapping: "${pair}"`);
      result.valid = false;
      continue;
    }

    // An unknown target is kept (warning only), not rejected.
    if (!validTargets.has(target)) {
      result.warnings.push(`Unknown target field "${target}" - may not be used`);
    }

    result.mappings.push({ target, source });
  }

  // Nudge the user towards mapping the fields the analysis relies on most.
  const mappedTargets = new Set(result.mappings.map(m => m.target));
  const criticalFields = ['model', 'latency_ms'];
  for (const field of criticalFields) {
    if (!mappedTargets.has(field)) {
      result.warnings.push(`Consider mapping "${field}" for better analysis`);
    }
  }

  return result;
}