@peakinfer/cli 1.0.133
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +8 -0
- package/.env.example +6 -0
- package/.github/workflows/peakinfer.yml +64 -0
- package/CHANGELOG.md +31 -0
- package/LICENSE +190 -0
- package/README.md +335 -0
- package/data/inferencemax.json +274 -0
- package/dist/agent-analyzer.d.ts +45 -0
- package/dist/agent-analyzer.d.ts.map +1 -0
- package/dist/agent-analyzer.js +374 -0
- package/dist/agent-analyzer.js.map +1 -0
- package/dist/agent.d.ts +76 -0
- package/dist/agent.d.ts.map +1 -0
- package/dist/agent.js +965 -0
- package/dist/agent.js.map +1 -0
- package/dist/agents/correlation-analyzer.d.ts +34 -0
- package/dist/agents/correlation-analyzer.d.ts.map +1 -0
- package/dist/agents/correlation-analyzer.js +261 -0
- package/dist/agents/correlation-analyzer.js.map +1 -0
- package/dist/agents/index.d.ts +91 -0
- package/dist/agents/index.d.ts.map +1 -0
- package/dist/agents/index.js +111 -0
- package/dist/agents/index.js.map +1 -0
- package/dist/agents/runtime-analyzer.d.ts +38 -0
- package/dist/agents/runtime-analyzer.d.ts.map +1 -0
- package/dist/agents/runtime-analyzer.js +244 -0
- package/dist/agents/runtime-analyzer.js.map +1 -0
- package/dist/analysis-types.d.ts +500 -0
- package/dist/analysis-types.d.ts.map +1 -0
- package/dist/analysis-types.js +11 -0
- package/dist/analysis-types.js.map +1 -0
- package/dist/analytics.d.ts +25 -0
- package/dist/analytics.d.ts.map +1 -0
- package/dist/analytics.js +94 -0
- package/dist/analytics.js.map +1 -0
- package/dist/analyzer.d.ts +48 -0
- package/dist/analyzer.d.ts.map +1 -0
- package/dist/analyzer.js +547 -0
- package/dist/analyzer.js.map +1 -0
- package/dist/artifacts.d.ts +44 -0
- package/dist/artifacts.d.ts.map +1 -0
- package/dist/artifacts.js +165 -0
- package/dist/artifacts.js.map +1 -0
- package/dist/benchmarks/index.d.ts +88 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +205 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/cli.d.ts +3 -0
- package/dist/cli.d.ts.map +1 -0
- package/dist/cli.js +427 -0
- package/dist/cli.js.map +1 -0
- package/dist/commands/ci.d.ts +19 -0
- package/dist/commands/ci.d.ts.map +1 -0
- package/dist/commands/ci.js +253 -0
- package/dist/commands/ci.js.map +1 -0
- package/dist/commands/config.d.ts +16 -0
- package/dist/commands/config.d.ts.map +1 -0
- package/dist/commands/config.js +249 -0
- package/dist/commands/config.js.map +1 -0
- package/dist/commands/demo.d.ts +15 -0
- package/dist/commands/demo.d.ts.map +1 -0
- package/dist/commands/demo.js +106 -0
- package/dist/commands/demo.js.map +1 -0
- package/dist/commands/export.d.ts +14 -0
- package/dist/commands/export.d.ts.map +1 -0
- package/dist/commands/export.js +209 -0
- package/dist/commands/export.js.map +1 -0
- package/dist/commands/history.d.ts +15 -0
- package/dist/commands/history.d.ts.map +1 -0
- package/dist/commands/history.js +389 -0
- package/dist/commands/history.js.map +1 -0
- package/dist/commands/template.d.ts +14 -0
- package/dist/commands/template.d.ts.map +1 -0
- package/dist/commands/template.js +341 -0
- package/dist/commands/template.js.map +1 -0
- package/dist/commands/validate-map.d.ts +12 -0
- package/dist/commands/validate-map.d.ts.map +1 -0
- package/dist/commands/validate-map.js +274 -0
- package/dist/commands/validate-map.js.map +1 -0
- package/dist/commands/whatif.d.ts +17 -0
- package/dist/commands/whatif.d.ts.map +1 -0
- package/dist/commands/whatif.js +206 -0
- package/dist/commands/whatif.js.map +1 -0
- package/dist/comparison.d.ts +38 -0
- package/dist/comparison.d.ts.map +1 -0
- package/dist/comparison.js +223 -0
- package/dist/comparison.js.map +1 -0
- package/dist/config.d.ts +42 -0
- package/dist/config.d.ts.map +1 -0
- package/dist/config.js +158 -0
- package/dist/config.js.map +1 -0
- package/dist/connectors/helicone.d.ts +9 -0
- package/dist/connectors/helicone.d.ts.map +1 -0
- package/dist/connectors/helicone.js +106 -0
- package/dist/connectors/helicone.js.map +1 -0
- package/dist/connectors/index.d.ts +37 -0
- package/dist/connectors/index.d.ts.map +1 -0
- package/dist/connectors/index.js +65 -0
- package/dist/connectors/index.js.map +1 -0
- package/dist/connectors/langsmith.d.ts +9 -0
- package/dist/connectors/langsmith.d.ts.map +1 -0
- package/dist/connectors/langsmith.js +122 -0
- package/dist/connectors/langsmith.js.map +1 -0
- package/dist/connectors/types.d.ts +83 -0
- package/dist/connectors/types.d.ts.map +1 -0
- package/dist/connectors/types.js +98 -0
- package/dist/connectors/types.js.map +1 -0
- package/dist/cost-estimator.d.ts +46 -0
- package/dist/cost-estimator.d.ts.map +1 -0
- package/dist/cost-estimator.js +104 -0
- package/dist/cost-estimator.js.map +1 -0
- package/dist/costs.d.ts +57 -0
- package/dist/costs.d.ts.map +1 -0
- package/dist/costs.js +251 -0
- package/dist/costs.js.map +1 -0
- package/dist/counterfactuals.d.ts +29 -0
- package/dist/counterfactuals.d.ts.map +1 -0
- package/dist/counterfactuals.js +448 -0
- package/dist/counterfactuals.js.map +1 -0
- package/dist/enhancement-prompts.d.ts +41 -0
- package/dist/enhancement-prompts.d.ts.map +1 -0
- package/dist/enhancement-prompts.js +88 -0
- package/dist/enhancement-prompts.js.map +1 -0
- package/dist/envelopes.d.ts +20 -0
- package/dist/envelopes.d.ts.map +1 -0
- package/dist/envelopes.js +790 -0
- package/dist/envelopes.js.map +1 -0
- package/dist/format-normalizer.d.ts +71 -0
- package/dist/format-normalizer.d.ts.map +1 -0
- package/dist/format-normalizer.js +1331 -0
- package/dist/format-normalizer.js.map +1 -0
- package/dist/history.d.ts +79 -0
- package/dist/history.d.ts.map +1 -0
- package/dist/history.js +313 -0
- package/dist/history.js.map +1 -0
- package/dist/html.d.ts +11 -0
- package/dist/html.d.ts.map +1 -0
- package/dist/html.js +463 -0
- package/dist/html.js.map +1 -0
- package/dist/impact.d.ts +42 -0
- package/dist/impact.d.ts.map +1 -0
- package/dist/impact.js +443 -0
- package/dist/impact.js.map +1 -0
- package/dist/index.d.ts +26 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +34 -0
- package/dist/index.js.map +1 -0
- package/dist/insights.d.ts +5 -0
- package/dist/insights.d.ts.map +1 -0
- package/dist/insights.js +271 -0
- package/dist/insights.js.map +1 -0
- package/dist/joiner.d.ts +9 -0
- package/dist/joiner.d.ts.map +1 -0
- package/dist/joiner.js +247 -0
- package/dist/joiner.js.map +1 -0
- package/dist/orchestrator.d.ts +34 -0
- package/dist/orchestrator.d.ts.map +1 -0
- package/dist/orchestrator.js +827 -0
- package/dist/orchestrator.js.map +1 -0
- package/dist/pdf.d.ts +26 -0
- package/dist/pdf.d.ts.map +1 -0
- package/dist/pdf.js +84 -0
- package/dist/pdf.js.map +1 -0
- package/dist/prediction.d.ts +33 -0
- package/dist/prediction.d.ts.map +1 -0
- package/dist/prediction.js +316 -0
- package/dist/prediction.js.map +1 -0
- package/dist/prompts/loader.d.ts +38 -0
- package/dist/prompts/loader.d.ts.map +1 -0
- package/dist/prompts/loader.js +60 -0
- package/dist/prompts/loader.js.map +1 -0
- package/dist/renderer.d.ts +64 -0
- package/dist/renderer.d.ts.map +1 -0
- package/dist/renderer.js +923 -0
- package/dist/renderer.js.map +1 -0
- package/dist/runid.d.ts +57 -0
- package/dist/runid.d.ts.map +1 -0
- package/dist/runid.js +199 -0
- package/dist/runid.js.map +1 -0
- package/dist/runtime.d.ts +29 -0
- package/dist/runtime.d.ts.map +1 -0
- package/dist/runtime.js +366 -0
- package/dist/runtime.js.map +1 -0
- package/dist/scanner.d.ts +11 -0
- package/dist/scanner.d.ts.map +1 -0
- package/dist/scanner.js +426 -0
- package/dist/scanner.js.map +1 -0
- package/dist/templates.d.ts +120 -0
- package/dist/templates.d.ts.map +1 -0
- package/dist/templates.js +429 -0
- package/dist/templates.js.map +1 -0
- package/dist/tools/index.d.ts +153 -0
- package/dist/tools/index.d.ts.map +1 -0
- package/dist/tools/index.js +177 -0
- package/dist/tools/index.js.map +1 -0
- package/dist/types.d.ts +3647 -0
- package/dist/types.d.ts.map +1 -0
- package/dist/types.js +703 -0
- package/dist/types.js.map +1 -0
- package/dist/version.d.ts +7 -0
- package/dist/version.d.ts.map +1 -0
- package/dist/version.js +23 -0
- package/dist/version.js.map +1 -0
- package/docs/demo-guide.md +423 -0
- package/docs/events-format.md +295 -0
- package/docs/inferencemap-spec.md +344 -0
- package/docs/migration-v2.md +293 -0
- package/fixtures/demo/precomputed.json +142 -0
- package/fixtures/demo-project/README.md +52 -0
- package/fixtures/demo-project/ai-service.ts +65 -0
- package/fixtures/demo-project/sample-events.jsonl +15 -0
- package/fixtures/demo-project/src/ai-service.ts +128 -0
- package/fixtures/demo-project/src/llm-client.ts +155 -0
- package/package.json +65 -0
- package/prompts/agent-analyzer.yaml +47 -0
- package/prompts/ci-gate.yaml +98 -0
- package/prompts/correlation-analyzer.yaml +178 -0
- package/prompts/format-normalizer.yaml +46 -0
- package/prompts/peak-performance.yaml +180 -0
- package/prompts/pr-comment.yaml +111 -0
- package/prompts/runtime-analyzer.yaml +189 -0
- package/prompts/unified-analyzer.yaml +241 -0
- package/schemas/inference-map.v0.1.json +215 -0
- package/scripts/benchmark.ts +394 -0
- package/scripts/demo-v1.5.sh +158 -0
- package/scripts/sync-from-site.sh +197 -0
- package/scripts/validate-sync.sh +178 -0
- package/src/agent-analyzer.ts +481 -0
- package/src/agent.ts +1232 -0
- package/src/agents/correlation-analyzer.ts +353 -0
- package/src/agents/index.ts +235 -0
- package/src/agents/runtime-analyzer.ts +343 -0
- package/src/analysis-types.ts +558 -0
- package/src/analytics.ts +100 -0
- package/src/analyzer.ts +692 -0
- package/src/artifacts.ts +218 -0
- package/src/benchmarks/index.ts +309 -0
- package/src/cli.ts +503 -0
- package/src/commands/ci.ts +336 -0
- package/src/commands/config.ts +288 -0
- package/src/commands/demo.ts +175 -0
- package/src/commands/export.ts +297 -0
- package/src/commands/history.ts +425 -0
- package/src/commands/template.ts +385 -0
- package/src/commands/validate-map.ts +324 -0
- package/src/commands/whatif.ts +272 -0
- package/src/comparison.ts +283 -0
- package/src/config.ts +188 -0
- package/src/connectors/helicone.ts +164 -0
- package/src/connectors/index.ts +93 -0
- package/src/connectors/langsmith.ts +179 -0
- package/src/connectors/types.ts +180 -0
- package/src/cost-estimator.ts +146 -0
- package/src/costs.ts +347 -0
- package/src/counterfactuals.ts +516 -0
- package/src/enhancement-prompts.ts +118 -0
- package/src/envelopes.ts +814 -0
- package/src/format-normalizer.ts +1486 -0
- package/src/history.ts +400 -0
- package/src/html.ts +512 -0
- package/src/impact.ts +522 -0
- package/src/index.ts +83 -0
- package/src/insights.ts +341 -0
- package/src/joiner.ts +289 -0
- package/src/orchestrator.ts +1015 -0
- package/src/pdf.ts +110 -0
- package/src/prediction.ts +392 -0
- package/src/prompts/loader.ts +88 -0
- package/src/renderer.ts +1045 -0
- package/src/runid.ts +261 -0
- package/src/runtime.ts +450 -0
- package/src/scanner.ts +508 -0
- package/src/templates.ts +561 -0
- package/src/tools/index.ts +214 -0
- package/src/types.ts +873 -0
- package/src/version.ts +24 -0
- package/templates/context-accumulation.yaml +23 -0
- package/templates/cost-concentration.yaml +20 -0
- package/templates/dead-code.yaml +20 -0
- package/templates/latency-explainer.yaml +23 -0
- package/templates/optimizations/ab-testing-framework.yaml +74 -0
- package/templates/optimizations/api-gateway-optimization.yaml +81 -0
- package/templates/optimizations/api-model-routing-strategy.yaml +126 -0
- package/templates/optimizations/auto-scaling-optimization.yaml +85 -0
- package/templates/optimizations/batch-utilization-diagnostic.yaml +142 -0
- package/templates/optimizations/comprehensive-apm.yaml +76 -0
- package/templates/optimizations/context-window-optimization.yaml +91 -0
- package/templates/optimizations/cost-sensitive-batch-processing.yaml +77 -0
- package/templates/optimizations/distributed-training-optimization.yaml +77 -0
- package/templates/optimizations/document-analysis-edge.yaml +77 -0
- package/templates/optimizations/document-pipeline-optimization.yaml +78 -0
- package/templates/optimizations/domain-specific-distillation.yaml +78 -0
- package/templates/optimizations/error-handling-optimization.yaml +76 -0
- package/templates/optimizations/gptq-4bit-quantization.yaml +96 -0
- package/templates/optimizations/long-context-memory-management.yaml +78 -0
- package/templates/optimizations/max-tokens-optimization.yaml +76 -0
- package/templates/optimizations/memory-bandwidth-optimization.yaml +73 -0
- package/templates/optimizations/multi-framework-resilience.yaml +75 -0
- package/templates/optimizations/multi-tenant-optimization.yaml +75 -0
- package/templates/optimizations/prompt-caching-optimization.yaml +143 -0
- package/templates/optimizations/pytorch-to-onnx-migration.yaml +109 -0
- package/templates/optimizations/quality-monitoring.yaml +74 -0
- package/templates/optimizations/realtime-budget-controls.yaml +74 -0
- package/templates/optimizations/realtime-latency-optimization.yaml +74 -0
- package/templates/optimizations/sglang-concurrency-optimization.yaml +78 -0
- package/templates/optimizations/smart-model-routing.yaml +96 -0
- package/templates/optimizations/streaming-batch-selection.yaml +167 -0
- package/templates/optimizations/system-prompt-optimization.yaml +75 -0
- package/templates/optimizations/tensorrt-llm-performance.yaml +77 -0
- package/templates/optimizations/vllm-high-throughput-optimization.yaml +93 -0
- package/templates/optimizations/vllm-migration-memory-bound.yaml +78 -0
- package/templates/overpowered-extraction.yaml +32 -0
- package/templates/overpowered-model.yaml +31 -0
- package/templates/prompt-bloat.yaml +24 -0
- package/templates/retry-explosion.yaml +28 -0
- package/templates/schema/insight.schema.json +113 -0
- package/templates/schema/optimization.schema.json +180 -0
- package/templates/streaming-drift.yaml +30 -0
- package/templates/throughput-gap.yaml +21 -0
- package/templates/token-underutilization.yaml +28 -0
- package/templates/untested-fallback.yaml +21 -0
- package/tests/accuracy/drift-detection.test.ts +184 -0
- package/tests/accuracy/false-positives.test.ts +166 -0
- package/tests/accuracy/templates.test.ts +205 -0
- package/tests/action/commands.test.ts +125 -0
- package/tests/action/comments.test.ts +347 -0
- package/tests/cli.test.ts +203 -0
- package/tests/comparison.test.ts +309 -0
- package/tests/correlation-analyzer.test.ts +534 -0
- package/tests/counterfactuals.test.ts +347 -0
- package/tests/fixtures/events/missing-id.jsonl +1 -0
- package/tests/fixtures/events/missing-input.jsonl +1 -0
- package/tests/fixtures/events/missing-latency.jsonl +1 -0
- package/tests/fixtures/events/missing-model.jsonl +1 -0
- package/tests/fixtures/events/missing-output.jsonl +1 -0
- package/tests/fixtures/events/missing-provider.jsonl +1 -0
- package/tests/fixtures/events/missing-ts.jsonl +1 -0
- package/tests/fixtures/events/valid.csv +3 -0
- package/tests/fixtures/events/valid.json +1 -0
- package/tests/fixtures/events/valid.jsonl +2 -0
- package/tests/fixtures/events/with-callsite.jsonl +1 -0
- package/tests/fixtures/events/with-intent.jsonl +1 -0
- package/tests/fixtures/events/wrong-type.jsonl +1 -0
- package/tests/fixtures/repos/empty/.gitkeep +0 -0
- package/tests/fixtures/repos/hybrid-router/router.py +35 -0
- package/tests/fixtures/repos/saas-anthropic/agent.ts +27 -0
- package/tests/fixtures/repos/saas-openai/assistant.js +33 -0
- package/tests/fixtures/repos/saas-openai/client.py +26 -0
- package/tests/fixtures/repos/self-hosted-vllm/inference.py +22 -0
- package/tests/github-action.test.ts +292 -0
- package/tests/insights.test.ts +878 -0
- package/tests/joiner.test.ts +168 -0
- package/tests/performance/action-latency.test.ts +132 -0
- package/tests/performance/benchmark.test.ts +189 -0
- package/tests/performance/cli-latency.test.ts +102 -0
- package/tests/pr-comment.test.ts +313 -0
- package/tests/prediction.test.ts +296 -0
- package/tests/runtime-analyzer.test.ts +375 -0
- package/tests/runtime.test.ts +205 -0
- package/tests/scanner.test.ts +122 -0
- package/tests/template-conformance.test.ts +526 -0
- package/tests/unit/cost-calculator.test.ts +303 -0
- package/tests/unit/credits.test.ts +180 -0
- package/tests/unit/inference-map.test.ts +276 -0
- package/tests/unit/schema.test.ts +300 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +14 -0
|
@@ -0,0 +1,1486 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Format Normalizer - Agent-based runtime event format detection and normalization.
|
|
3
|
+
*
|
|
4
|
+
* This module implements PRD §6.4: Enable PeakInfer to ingest runtime data from any
|
|
5
|
+
* observability system, logging framework, or custom format without requiring users
|
|
6
|
+
* to transform their data first.
|
|
7
|
+
*
|
|
8
|
+
* Design Principles (Julie Zhou aligned):
|
|
9
|
+
* - Behavior First: Detect formats automatically, fallback gracefully
|
|
10
|
+
* - Clarity Over Cleverness: Clear confidence scores, no silent assumptions
|
|
11
|
+
* - State Completeness: Handle all format states (known, agent-required, unknown)
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { query } from '@anthropic-ai/claude-agent-sdk';
|
|
15
|
+
import type { SDKMessage } from '@anthropic-ai/claude-agent-sdk';
|
|
16
|
+
import type {
|
|
17
|
+
FormatType,
|
|
18
|
+
FieldMapping,
|
|
19
|
+
FormatDetectionResult,
|
|
20
|
+
NormalizationResult,
|
|
21
|
+
NormalizationOptions,
|
|
22
|
+
InferenceEvent,
|
|
23
|
+
ScanResult,
|
|
24
|
+
} from './types.js';
|
|
25
|
+
import { loadPrompt } from './templates.js';
|
|
26
|
+
|
|
27
|
+
/**
 * Concatenate the text blocks found in assistant messages.
 *
 * Only messages whose `type` is `'assistant'` and that carry a truthy
 * `message.content` are inspected; within those, only blocks whose `type`
 * is `'text'` contribute. Pieces are joined in encounter order with no
 * separator.
 *
 * @param messages - Messages returned by the Claude Agent SDK.
 * @returns The concatenated text, or an empty string when nothing matched.
 */
function extractTextFromMessages(messages: SDKMessage[]): string {
  let combined = '';

  for (const entry of messages) {
    // Skip everything that is not an assistant turn.
    if (entry.type !== 'assistant') {
      continue;
    }

    const blocks = entry.message?.content;
    if (!blocks) {
      continue;
    }

    for (const part of blocks) {
      if (part.type === 'text') {
        combined += part.text;
      }
    }
  }

  return combined;
}
|
|
43
|
+
|
|
44
|
+
// =============================================================================
|
|
45
|
+
// CONSTANTS
|
|
46
|
+
// =============================================================================
|
|
47
|
+
|
|
48
|
+
/** Number of lines to sample for detection. */
const SAMPLE_LINES = 20;

/** Minimum confidence for auto-acceptance. */
const MIN_CONFIDENCE_THRESHOLD = 0.7;

/** Model identifier string (its consumer is outside this section of the file). */
const LLM_MODEL = 'claude-sonnet-4-20250514';

/** Required fields for InferenceEvent. */
const REQUIRED_FIELDS = [
  'id',
  'ts',
  'provider',
  'model',
  'input_tokens',
  'output_tokens',
  'latency_ms',
];
|
|
54
|
+
|
|
55
|
+
// =============================================================================
|
|
56
|
+
// FORMAT SIGNATURES
|
|
57
|
+
// =============================================================================
|
|
58
|
+
|
|
59
|
+
/**
 * Report whether `value` is a non-null object exposing at least one of `keys`.
 * Uses the `in` operator, so inherited properties count as well.
 */
function signatureHasAnyKey(value: unknown, keys: readonly string[]): boolean {
  if (typeof value !== 'object' || value === null) {
    return false;
  }
  return keys.some((key) => key in value);
}

/**
 * Known format signatures for heuristic detection.
 *
 * Each entry, keyed by format name, carries:
 * - `patterns`: regular expressions associated with the format,
 * - `structuralCheck`: an optional predicate over already-parsed data,
 * - `confidence`: the confidence value attached to the signature.
 */
const FORMAT_SIGNATURES: Record<string, {
  patterns: RegExp[];
  structuralCheck?: (data: unknown) => boolean;
  confidence: number;
}> = {
  otel: {
    patterns: [/resourceSpans/i, /scopeSpans/i, /traceId/i, /spanId/i],
    // Accepts both the camelCase and snake_case spelling of the root key.
    structuralCheck: (data) => signatureHasAnyKey(data, ['resourceSpans', 'resource_spans']),
    confidence: 0.95,
  },

  jaeger: {
    patterns: [
      /traceID/, // Jaeger uses capital ID (vs OTEL's traceId)
      /spanID/, // Jaeger uses capital ID (vs OTEL's spanId)
      /operationName/,
      /jaeger/i,
      /"processes"/, // Jaeger-specific field
    ],
    // Jaeger format: { data: [{ traceID: ..., processes: ... }] }
    structuralCheck: (data) => {
      if (typeof data !== 'object' || data === null) {
        return false;
      }
      const traces = (data as { data?: unknown }).data;
      if (!Array.isArray(traces)) {
        return false;
      }
      // Only the first trace is inspected; it must have traceID (capital ID)
      // to distinguish it from OTEL, plus the processes field.
      const head = traces[0] as Record<string, unknown> | null | undefined;
      return head?.traceID !== undefined && head?.processes !== undefined;
    },
    confidence: 0.95,
  },

  zipkin: {
    patterns: [/"traceId"/, /"parentId"/, /"localEndpoint"/, /zipkin/i],
    // Expects a top-level array; only its first element is inspected.
    structuralCheck: (data) => {
      if (!Array.isArray(data)) {
        return false;
      }
      const head = data[0] as Record<string, unknown> | null | undefined;
      return head?.traceId !== undefined && head?.localEndpoint !== undefined;
    },
    confidence: 0.95,
  },

  langsmith: {
    patterns: [/run_id/, /run_type/, /langsmith/i, /langchain/i],
    structuralCheck: (data) => signatureHasAnyKey(data, ['run_id', 'runs']),
    confidence: 0.9,
  },

  litellm: {
    patterns: [/litellm/i, /call_type/, /api_base/, /response_time_ms/],
    // LiteLLM logs have call_type OR api_base OR response_time_ms fields.
    structuralCheck: (data) =>
      signatureHasAnyKey(data, ['call_type', 'api_base', 'response_time_ms']),
    confidence: 0.9,
  },

  helicone: {
    patterns: [/helicone/i, /helicone_request_id/, /helicone_response_id/, /helicone_properties/],
    // Helicone-specific fields: a helicone_-prefixed key or a bare `helicone` key.
    structuralCheck: (data) =>
      signatureHasAnyKey(data, [
        'helicone_request_id',
        'helicone_response_id',
        'helicone_properties',
        'helicone',
      ]),
    confidence: 0.85,
  },
};
|
|
163
|
+
|
|
164
|
+
// =============================================================================
|
|
165
|
+
// PREDEFINED FIELD MAPPINGS
|
|
166
|
+
// =============================================================================
|
|
167
|
+
|
|
168
|
+
/**
 * Build one FieldMapping record. Arguments are positional and follow the
 * key order of the resulting object.
 */
function defineMapping(
  target: FieldMapping['target'],
  source_path: FieldMapping['source_path'],
  extraction_type: FieldMapping['extraction_type'],
  transform: FieldMapping['transform'],
  confidence: FieldMapping['confidence'],
  evidence: FieldMapping['evidence'],
): FieldMapping {
  return { target, source_path, extraction_type, transform, confidence, evidence };
}

/**
 * Predefined field mappings for known formats.
 * These are high-confidence mappings based on format specifications.
 *
 * Every format lists seven mappings in the same order:
 * id, ts, provider, model, input_tokens, output_tokens, latency_ms.
 */
const PREDEFINED_MAPPINGS: Record<string, FieldMapping[]> = {
  otel: [
    defineMapping('id', '$.resourceSpans[*].scopeSpans[*].spans[*].spanId', 'jsonpath', 'none', 1.0, 'OTLP span ID field'),
    defineMapping('ts', '$.resourceSpans[*].scopeSpans[*].spans[*].startTimeUnixNano', 'jsonpath', 'unix_nano_to_iso', 1.0, 'OTLP start time in nanoseconds'),
    defineMapping('provider', "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.provider')].value.stringValue", 'jsonpath', 'provider_normalize', 0.9, 'LLM semantic convention attribute'),
    defineMapping('model', "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.model')].value.stringValue", 'jsonpath', 'none', 0.9, 'LLM semantic convention attribute'),
    defineMapping('input_tokens', "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.prompt')].value.intValue", 'jsonpath', 'parse_int', 0.9, 'LLM semantic convention attribute'),
    defineMapping('output_tokens', "$.resourceSpans[*].scopeSpans[*].spans[*].attributes[?(@.key=='llm.token_count.completion')].value.intValue", 'jsonpath', 'parse_int', 0.9, 'LLM semantic convention attribute'),
    defineMapping('latency_ms', '(endTimeUnixNano - startTimeUnixNano) / 1000000', 'computed', 'none', 1.0, 'Computed from OTLP start/end timestamps'),
  ],

  jaeger: [
    defineMapping('id', '$.data[*].spans[*].spanID', 'jsonpath', 'none', 1.0, 'Jaeger span ID'),
    // NOTE(review): the evidence text says microseconds, but the transform is
    // named unix_ms_to_iso — confirm the transform copes with microsecond input.
    defineMapping('ts', '$.data[*].spans[*].startTime', 'jsonpath', 'unix_ms_to_iso', 1.0, 'Jaeger start time in microseconds'),
    defineMapping('provider', "$.data[*].spans[*].tags[?(@.key=='llm.provider')].value", 'jsonpath', 'provider_normalize', 0.85, 'Tag-based provider extraction'),
    defineMapping('model', "$.data[*].spans[*].tags[?(@.key=='llm.model')].value", 'jsonpath', 'none', 0.85, 'Tag-based model extraction'),
    defineMapping('input_tokens', "$.data[*].spans[*].tags[?(@.key=='llm.input_tokens')].value", 'jsonpath', 'parse_int', 0.85, 'Tag-based token extraction'),
    defineMapping('output_tokens', "$.data[*].spans[*].tags[?(@.key=='llm.output_tokens')].value", 'jsonpath', 'parse_int', 0.85, 'Tag-based token extraction'),
    // Jaeger duration is in microseconds.
    // NOTE(review): the evidence promises a microseconds -> ms conversion, yet
    // the transform is 'none' — confirm the conversion happens elsewhere.
    defineMapping('latency_ms', '$.data[*].spans[*].duration', 'jsonpath', 'none', 1.0, 'Jaeger duration field (microseconds -> ms)'),
  ],

  zipkin: [
    defineMapping('id', '$[*].id', 'jsonpath', 'none', 1.0, 'Zipkin span ID'),
    // NOTE(review): the evidence text says microseconds, but the transform is
    // named unix_ms_to_iso — confirm the transform copes with microsecond input.
    defineMapping('ts', '$[*].timestamp', 'jsonpath', 'unix_ms_to_iso', 1.0, 'Zipkin timestamp in microseconds'),
    defineMapping('provider', "$[*].tags['llm.provider']", 'jsonpath', 'provider_normalize', 0.85, 'Tag-based provider extraction'),
    defineMapping('model', "$[*].tags['llm.model']", 'jsonpath', 'none', 0.85, 'Tag-based model extraction'),
    defineMapping('input_tokens', "$[*].tags['llm.input_tokens']", 'jsonpath', 'parse_int', 0.85, 'Tag-based token extraction'),
    defineMapping('output_tokens', "$[*].tags['llm.output_tokens']", 'jsonpath', 'parse_int', 0.85, 'Tag-based token extraction'),
    // Zipkin duration is in microseconds.
    // NOTE(review): the evidence promises a microseconds -> ms conversion, yet
    // the transform is 'none' — confirm the conversion happens elsewhere.
    defineMapping('latency_ms', '$[*].duration', 'jsonpath', 'none', 1.0, 'Zipkin duration field (microseconds -> ms)'),
  ],

  langsmith: [
    defineMapping('id', 'run_id', 'direct', 'none', 0.95, 'LangSmith run ID'),
    // Already ISO format, so no transform.
    defineMapping('ts', 'start_time', 'direct', 'none', 0.95, 'LangSmith start timestamp'),
    defineMapping('provider', 'extra.invocation_params.model_provider', 'jsonpath', 'provider_normalize', 0.8, 'LangSmith invocation params provider'),
    defineMapping('model', 'extra.invocation_params.model', 'jsonpath', 'none', 0.85, 'LangSmith invocation params model'),
    defineMapping('input_tokens', 'token_usage.prompt_tokens', 'jsonpath', 'parse_int', 0.9, 'LangSmith token usage prompt_tokens'),
    defineMapping('output_tokens', 'token_usage.completion_tokens', 'jsonpath', 'parse_int', 0.9, 'LangSmith token usage completion_tokens'),
    defineMapping('latency_ms', 'latency', 'direct', 'duration_to_ms', 0.9, 'LangSmith latency field'),
  ],

  litellm: [
    defineMapping('id', 'id', 'direct', 'none', 0.95, 'LiteLLM request ID'),
    defineMapping('ts', 'startTime', 'direct', 'unix_ms_to_iso', 0.9, 'LiteLLM start timestamp'),
    // LiteLLM uses a model format like "openai/gpt-4", so the provider is
    // derived from the same `model` field as the model mapping below.
    defineMapping('provider', 'model', 'direct', 'provider_normalize', 0.85, 'LiteLLM model field (provider/model format)'),
    defineMapping('model', 'model', 'direct', 'none', 0.95, 'LiteLLM model field'),
    defineMapping('input_tokens', 'usage.prompt_tokens', 'jsonpath', 'parse_int', 0.95, 'LiteLLM usage prompt_tokens'),
    defineMapping('output_tokens', 'usage.completion_tokens', 'jsonpath', 'parse_int', 0.95, 'LiteLLM usage completion_tokens'),
    defineMapping('latency_ms', 'response_time_ms', 'direct', 'none', 1.0, 'LiteLLM response_time_ms field'),
  ],

  helicone: [
    defineMapping('id', 'helicone_request_id', 'direct', 'none', 0.95, 'Helicone request ID'),
    // Helicone uses ISO format, so no transform.
    defineMapping('ts', 'created_at', 'direct', 'none', 0.9, 'Helicone created_at timestamp'),
    defineMapping('provider', 'provider', 'direct', 'provider_normalize', 0.9, 'Helicone provider field'),
    defineMapping('model', 'model', 'direct', 'none', 0.95, 'Helicone model field'),
    defineMapping('input_tokens', 'prompt_tokens', 'direct', 'parse_int', 0.9, 'Helicone prompt_tokens'),
    defineMapping('output_tokens', 'completion_tokens', 'direct', 'parse_int', 0.9, 'Helicone completion_tokens'),
    defineMapping('latency_ms', 'latency_ms', 'direct', 'none', 1.0, 'Helicone latency_ms field'),
  ],
};
|
|
522
|
+
|
|
523
|
+
// =============================================================================
|
|
524
|
+
// FORMAT DETECTION
|
|
525
|
+
// =============================================================================
|
|
526
|
+
|
|
527
|
+
/**
 * Detect the format type of a runtime events file.
 *
 * Detection strategy (first match wins):
 *   1. Whole document parses as a non-empty JSON array whose first element
 *      exposes every InferenceEvent required field -> `json_array`.
 *   2. Multi-line content where every sampled line is valid JSON is treated
 *      as JSONL: native `jsonl`, a known signature, or `custom_json`.
 *   3. Any other valid JSON document: known signature, else `custom_json`.
 *   4. Content that is not valid JSON is delegated to detectNonJSONFormat.
 *
 * @param content  Raw file content.
 * @param filename Optional file name. It is only forwarded to
 *                 detectNonJSONFormat; no extension heuristics are applied here.
 * @returns The detected format, a confidence score, human-readable evidence,
 *          the number of sampled records and whether agent mapping is needed.
 */
export function detectFormat(
  content: string,
  filename?: string,
): FormatDetectionResult {
  const lines = content.trim().split('\n').slice(0, SAMPLE_LINES);

  // True when `value` is a non-null object exposing every required field.
  // The object guard matters: `'id' in 1` and `'id' in null` throw TypeError,
  // which previously escaped for inputs such as `[1, 2, 3]` or `[null]`.
  const hasEventSchema = (value: unknown): boolean => {
    if (typeof value !== 'object' || value === null) return false;
    const record = value as Record<string, unknown>;
    return REQUIRED_FIELDS.every(f => f in record);
  };

  // Return the first known signature whose structuralCheck accepts `candidate`.
  // Signatures without a structuralCheck are never matched.
  const matchKnownSignature = (candidate: unknown, serialized: string) => {
    for (const [formatType, signature] of Object.entries(FORMAT_SIGNATURES)) {
      if (!signature.structuralCheck) continue;
      if (!signature.structuralCheck(candidate)) continue;

      const patternMatches = signature.patterns.filter(p => p.test(serialized)).length;
      const patternRatio = patternMatches / signature.patterns.length;

      return {
        formatType: formatType as FormatType,
        // A structural match guarantees at least 80% of the signature confidence.
        confidence: Math.max(0.8, patternRatio) * signature.confidence,
        patternSummary: `${patternMatches}/${signature.patterns.length} patterns`,
      };
    }
    return null;
  };

  // Parse the whole document exactly once. Failure is tracked separately
  // because a successful parse can legitimately produce `null`.
  let parsedAsWhole: unknown = null;
  let isValidJSON = true;
  try {
    parsedAsWhole = JSON.parse(content);
  } catch {
    isValidJSON = false;
  }

  // JSON array with InferenceEvent schema. Checked before JSONL so that a
  // single-line JSON array is not mistaken for one JSONL record.
  if (
    Array.isArray(parsedAsWhole) &&
    parsedAsWhole.length > 0 &&
    hasEventSchema(parsedAsWhole[0])
  ) {
    return {
      format_type: 'json_array',
      confidence: 1.0,
      evidence: 'JSON array with InferenceEvent schema',
      sample_size: Math.min(parsedAsWhole.length, SAMPLE_LINES),
      requires_agent: false,
    };
  }

  // JSONL: multi-line content where each sampled line is valid JSON.
  if (lines.length > 1) {
    const isJSONL = lines.every(line => {
      const trimmed = line.trim();
      if (!trimmed) return true; // Empty lines are ok
      try {
        JSON.parse(trimmed);
        return true;
      } catch {
        return false;
      }
    });

    if (isJSONL) {
      const firstLine = lines.find(l => l.trim());
      if (firstLine) {
        try {
          const firstEvent: unknown = JSON.parse(firstLine);

          // Scalar or null lines are not event records; fall through to the
          // remaining detection steps instead of classifying them as JSONL.
          if (typeof firstEvent === 'object' && firstEvent !== null) {
            if (hasEventSchema(firstEvent)) {
              return {
                format_type: 'jsonl',
                confidence: 1.0,
                evidence: 'JSONL with InferenceEvent schema (all required fields present)',
                sample_size: lines.length,
                requires_agent: false,
              };
            }

            const known = matchKnownSignature(firstEvent, JSON.stringify(firstEvent));
            if (known) {
              return {
                format_type: known.formatType,
                confidence: known.confidence,
                evidence: `JSONL with ${known.formatType} format (structural match, ${known.patternSummary})`,
                sample_size: lines.length,
                requires_agent: true,
              };
            }

            // JSONL but unknown schema - mark as custom_json requiring agent
            return {
              format_type: 'custom_json',
              confidence: 0.7,
              evidence: 'JSONL with custom schema - requires field mapping',
              sample_size: lines.length,
              requires_agent: true,
            };
          }
        } catch {
          // A throwing structuralCheck is treated as "no match";
          // continue to the other detection methods.
        }
      }
    }
  }

  // Not valid JSON as a whole: check for CSV/TSV or text logs.
  if (!isValidJSON) {
    return detectNonJSONFormat(content, lines, filename);
  }

  const sampleSize = Array.isArray(parsedAsWhole) ? parsedAsWhole.length : 1;

  // Known format signatures; a structural match is required.
  const known = matchKnownSignature(parsedAsWhole, JSON.stringify(parsedAsWhole));
  if (known) {
    return {
      format_type: known.formatType,
      confidence: known.confidence,
      evidence: `Matched ${known.formatType} format (structural match, ${known.patternSummary})`,
      sample_size: sampleSize,
      requires_agent: true, // Known formats still need agent for field mapping
    };
  }

  // Unknown JSON structure - requires agent normalization
  return {
    format_type: 'custom_json',
    confidence: 0.5,
    evidence: 'Valid JSON but unknown schema - requires agent normalization',
    sample_size: sampleSize,
    requires_agent: true,
  };
}
|
|
696
|
+
|
|
697
|
+
/**
 * Classify content that is not valid JSON as CSV, TSV, structured text logs
 * or unknown, looking only at the sampled lines.
 *
 * @param content  Full raw content (not inspected by this function).
 * @param lines    Sampled lines; the first one is treated as the header row.
 * @param filename Optional file name (not inspected by this function).
 * @returns Detection result whose sample_size is the number of sampled lines.
 */
function detectNonJSONFormat(
  content: string,
  lines: string[],
  filename?: string,
): FormatDetectionResult {
  const [headerLine] = lines;
  const sampled = lines.length;

  // Comma in the header row: CSV. The comma test runs before the tab test.
  if (headerLine.includes(',')) {
    const columns = headerLine.split(',').map(col => col.trim().toLowerCase());
    const llmKeywords = ['provider', 'model', 'latency', 'tokens'];
    const looksLikeLLMExport = columns.some(col =>
      llmKeywords.some(keyword => col.includes(keyword))
    );

    if (!looksLikeLLMExport) {
      return {
        format_type: 'csv',
        confidence: 0.7,
        evidence: 'CSV format detected but headers may need mapping',
        sample_size: sampled,
        requires_agent: true,
      };
    }

    return {
      format_type: 'csv',
      confidence: 0.9,
      evidence: 'CSV with LLM-related headers detected',
      sample_size: sampled,
      requires_agent: false,
    };
  }

  // Tab in the header row: TSV.
  if (headerLine.includes('\t')) {
    return {
      format_type: 'tsv',
      confidence: 0.8,
      evidence: 'Tab-separated values detected',
      sample_size: sampled,
      requires_agent: true,
    };
  }

  // Structured text logs: count how many distinct patterns occur on any line.
  const logPatterns = [
    /^\d{4}-\d{2}-\d{2}/, // ISO date prefix
    /^\[\d+\]/, // Timestamp prefix
    /level=(info|warn|error|debug)/i,
    /provider=\w+/,
    /model=\w+/,
  ];

  let logMatchCount = 0;
  for (const pattern of logPatterns) {
    if (lines.some(line => pattern.test(line))) {
      logMatchCount += 1;
    }
  }

  if (logMatchCount < 2) {
    return {
      format_type: 'unknown',
      confidence: 0.3,
      evidence: 'Could not determine format - manual field mapping may be required',
      sample_size: sampled,
      requires_agent: true,
    };
  }

  return {
    format_type: 'custom_text',
    confidence: 0.6,
    evidence: `Structured text logs detected (${logMatchCount} patterns matched)`,
    sample_size: sampled,
    requires_agent: true,
  };
}
|
|
774
|
+
|
|
775
|
+
// =============================================================================
|
|
776
|
+
// AGENT-BASED NORMALIZATION
|
|
777
|
+
// =============================================================================
|
|
778
|
+
|
|
779
|
+
// Resolve the normalization prompt: use the `prompt` text of the
// 'format-normalizer' prompt returned by loadPrompt (YAML) when present,
// otherwise fall back to the hardcoded text below.
function getNormalizationPrompt(): string {
  const loaded = loadPrompt('format-normalizer');

  if (!loaded) {
    // Hardcoded fallback used when the YAML prompt is not available.
    return `You are an expert at parsing log formats and trace data. Analyze the following sample data and determine field mappings to the InferenceEvent schema.

The target InferenceEvent schema requires these fields:
- id (string): Unique event identifier
- ts (string): ISO 8601 timestamp
- provider (string): LLM provider (openai, anthropic, google, etc.)
- model (string): Model name (gpt-4o, claude-3-5-sonnet, etc.)
- input_tokens (number): Input/prompt token count
- output_tokens (number): Output/completion token count
- latency_ms (number): Request latency in milliseconds

Optional fields:
- streaming (boolean), ttft_ms (number), batch_size (number), cached (boolean), retry_count (number)

For each target field, provide:
1. The source path/expression to extract the value
2. The extraction type (direct, jsonpath, regex, computed)
3. Any transform needed (unix_ms_to_iso, unix_nano_to_iso, parse_int, etc.)
4. Your confidence (0.0-1.0) in this mapping
5. Evidence explaining why you chose this mapping

If a field cannot be mapped, indicate it as unmappable with confidence 0.

Respond in JSON format:
{
"format_type": "detected format name",
"mappings": [
{
"target": "field_name",
"source_path": "path or expression",
"extraction_type": "direct|jsonpath|regex|computed",
"transform": "none|unix_ms_to_iso|parse_int|...",
"confidence": 0.9,
"evidence": "explanation"
}
],
"unmapped_fields": ["fields that could not be mapped"],
"warnings": ["any issues or caveats"]
}`;
  }

  return loaded.prompt;
}
|
|
826
|
+
|
|
827
|
+
/**
 * Use LLM agent to normalize an unknown format.
 *
 * Sends the normalization prompt, optional codebase context, optional user
 * hints and the first SAMPLE_LINES lines of `content` to the agent, then
 * parses the first `{...}` JSON object found in the reply into field mappings.
 *
 * Never rejects: a missing ANTHROPIC_API_KEY, an unparseable reply or any
 * thrown error yields createFallbackResult(...) with an explanatory warning.
 *
 * @param content   Raw events content; only a leading sample is sent.
 * @param detection Result of format detection, echoed into the prompt.
 * @param options   Hints, codebase context and the `lenient` switch.
 * @returns Normalization result carrying the agent's mappings and warnings.
 */
export async function normalizeWithAgent(
  content: string,
  detection: FormatDetectionResult,
  options: NormalizationOptions = {},
): Promise<NormalizationResult> {
  // Check for API key
  const apiKey = process.env.ANTHROPIC_API_KEY;
  if (!apiKey) {
    return createFallbackResult(detection, 'No ANTHROPIC_API_KEY - agent normalization unavailable');
  }

  // Sample content for the agent
  const sampleLines = content.trim().split('\n').slice(0, SAMPLE_LINES);
  const sampleContent = sampleLines.join('\n');

  // Build context prompt
  let contextPrompt = '';
  if (options.codebase_context) {
    const scanResult = options.codebase_context as ScanResult;
    contextPrompt = `\n\nCodebase context available:
- ${scanResult.files.length} files scanned
- Languages: ${scanResult.summary.languages.join(', ')}
- ${scanResult.summary.totalCandidates} potential inference points detected

This may help identify logging patterns and field names used in the application.`;
  }

  // User hints
  let hintsPrompt = '';
  if (options.format_hint) {
    hintsPrompt += `\nUser hint: Format appears to be "${options.format_hint}"`;
  }
  if (options.field_hints) {
    hintsPrompt += `\nUser-provided field mappings: ${JSON.stringify(options.field_hints)}`;
  }

  try {
    // Use Claude Agent SDK query() function
    const agentQuery = query({
      prompt: `${getNormalizationPrompt()}${contextPrompt}${hintsPrompt}

Detected format: ${detection.format_type} (confidence: ${detection.confidence})

Sample data:
\`\`\`
${sampleContent}
\`\`\``,
      options: {
        model: LLM_MODEL,
        tools: [],
        permissionMode: 'plan',
        cwd: process.cwd(),
      },
    });

    // Collect all messages from the async generator
    const messages: SDKMessage[] = [];
    for await (const message of agentQuery) {
      messages.push(message);
    }

    // Parse LLM response: take the outermost {...} span of the reply text.
    const responseText = extractTextFromMessages(messages);
    const jsonMatch = responseText.match(/\{[\s\S]*\}/);

    if (!jsonMatch) {
      return createFallbackResult(detection, 'Could not parse agent response');
    }

    const agentResult = JSON.parse(jsonMatch[0]) as {
      format_type?: string;
      mappings?: Array<{
        target: string;
        source_path: string;
        extraction_type: string;
        transform?: string;
        confidence: number;
        evidence?: string;
      }>;
      unmapped_fields?: string[];
      warnings?: string[];
    };

    // Validate and build result. The reply is untrusted model output, so a
    // confidence that is not a finite number is coerced to 0 instead of
    // corrupting the average below.
    const mappings: FieldMapping[] = (agentResult.mappings || []).map(m => {
      const confidence = Number(m.confidence);
      return {
        target: m.target,
        source_path: m.source_path,
        extraction_type: m.extraction_type as FieldMapping['extraction_type'],
        transform: (m.transform || 'none') as FieldMapping['transform'],
        confidence: Number.isFinite(confidence) ? confidence : 0,
        evidence: m.evidence,
      };
    });

    // Check confidence threshold. An empty mapping list counts as zero
    // confidence; dividing by its length would give NaN and hide the warning.
    const avgConfidence = mappings.length > 0
      ? mappings.reduce((sum, m) => sum + m.confidence, 0) / mappings.length
      : 0;

    // Copy into fresh arrays: callers push onto `warnings`, so a non-array
    // value from the model must not leak through.
    const warnings: string[] = Array.isArray(agentResult.warnings)
      ? [...agentResult.warnings]
      : [];
    const unmappedFields: string[] = Array.isArray(agentResult.unmapped_fields)
      ? [...agentResult.unmapped_fields]
      : [];

    if (avgConfidence < MIN_CONFIDENCE_THRESHOLD && !options.lenient) {
      warnings.push(
        `Average mapping confidence (${avgConfidence.toFixed(2)}) is below threshold (${MIN_CONFIDENCE_THRESHOLD}). ` +
        `Use --lenient flag to accept low-confidence mappings.`
      );
    }

    return {
      detection: {
        ...detection,
        // Prefer the agent's format name; keep the detected one when absent.
        format_type: (agentResult.format_type as FormatType) || detection.format_type,
      },
      mappings,
      unmapped_fields: unmappedFields,
      warnings,
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: true,
        codebase_context_used: !!options.codebase_context,
        llm_model: LLM_MODEL,
      },
    };
  } catch (error) {
    return createFallbackResult(
      detection,
      `Agent normalization failed: ${error instanceof Error ? error.message : String(error)}`
    );
  }
}
|
|
956
|
+
|
|
957
|
+
/**
 * Build a normalization result without involving the agent.
 *
 * When PREDEFINED_MAPPINGS has an entry for the detected format, that entry
 * is used and nothing is reported as unmapped; otherwise the mappings are
 * empty and every required field is reported as unmapped.
 *
 * @param detection Detection result to embed unchanged.
 * @param warning   Single warning explaining why the fallback was used.
 */
function createFallbackResult(
  detection: FormatDetectionResult,
  warning: string,
): NormalizationResult {
  const knownMappings = PREDEFINED_MAPPINGS[detection.format_type];
  const audit = {
    normalized_at: new Date().toISOString(),
    agent_used: false,
    codebase_context_used: false,
  };

  if (knownMappings) {
    return {
      detection,
      mappings: knownMappings,
      unmapped_fields: [],
      warnings: [warning],
      audit,
    };
  }

  return {
    detection,
    mappings: [],
    unmapped_fields: REQUIRED_FIELDS,
    warnings: [warning],
    audit,
  };
}
|
|
979
|
+
|
|
980
|
+
// =============================================================================
|
|
981
|
+
// FIELD EXTRACTION
|
|
982
|
+
// =============================================================================
|
|
983
|
+
|
|
984
|
+
/**
 * Apply a transformation to an extracted value.
 *
 * `null` and `undefined` pass through untouched, as does any value whose
 * transform is 'none' or unrecognised.
 *
 * @param value     Raw extracted value.
 * @param transform Name of the transform to apply.
 * @returns The transformed value. Numeric transforms may return NaN for
 *          unparseable input.
 * @throws RangeError from Date#toISOString when a timestamp transform is
 *         given a value that does not convert to a valid date.
 */
function applyTransform(value: unknown, transform: FieldMapping['transform']): unknown {
  if (value === null || value === undefined) return value;

  switch (transform) {
    case 'none':
      return value;

    case 'unix_ms_to_iso':
      return new Date(Number(value)).toISOString();

    case 'unix_s_to_iso':
      return new Date(Number(value) * 1000).toISOString();

    case 'unix_nano_to_iso':
      return new Date(Number(value) / 1_000_000).toISOString();

    case 'duration_to_ms': {
      // Accepts "<number>", "<number>ms", "<number>s" or "<number>m";
      // a bare number is interpreted as milliseconds.
      const str = String(value);
      const match = str.match(/^([\d.]+)(ms|s|m)?$/);
      if (match) {
        const num = parseFloat(match[1]);
        const unit = match[2] || 'ms';
        switch (unit) {
          case 's': return num * 1000;
          case 'm': return num * 60000;
          default: return num;
        }
      }
      return parseFloat(str);
    }

    case 'parse_int':
      return parseInt(String(value), 10);

    case 'parse_float':
      return parseFloat(String(value));

    case 'lowercase':
      return String(value).toLowerCase();

    case 'provider_normalize': {
      const str = String(value).toLowerCase();
      // Ordered [needle, canonical name] pairs; the first needle found wins.
      // 'azure' must precede 'openai': names such as "azure_openai" or
      // "Azure OpenAI" contain both and would otherwise collapse to 'openai',
      // which also made the canonical 'azure_openai' non-idempotent.
      const providers: Array<[string, string]> = [
        ['azure', 'azure_openai'],
        ['openai', 'openai'],
        ['anthropic', 'anthropic'],
        ['google', 'google'],
        ['bedrock', 'bedrock'],
        ['together', 'together'],
        ['groq', 'groq'],
      ];
      for (const [needle, canonical] of providers) {
        if (str.includes(needle)) return canonical;
      }
      return str;
    }

    default:
      return value;
  }
}
|
|
1044
|
+
|
|
1045
|
+
/**
 * Resolve a simple path against a value.
 *
 * The path may start with `$.` and is made of dot-separated keys and numeric
 * `[n]` indexes, e.g. `$.a.b[0].c`. An empty path returns `obj` itself.
 *
 * @param obj  Value to read from.
 * @param path Path expression.
 * @returns The value at the path, or undefined as soon as a segment has to
 *          be read from null, undefined or a non-object.
 */
function extractValue(obj: unknown, path: string): unknown {
  const barePath = path.startsWith('$.') ? path.slice(2) : path;

  // The capture group makes split() emit the index digits as their own
  // segment; filter(Boolean) drops the empty and undefined pieces.
  const segments = barePath.split(/\.|\[(\d+)\]/).filter(Boolean);

  let cursor: unknown = obj;
  for (let i = 0; i < segments.length; i += 1) {
    const canDescend =
      cursor !== null && cursor !== undefined && typeof cursor === 'object';
    if (!canDescend) return undefined;

    cursor = (cursor as Record<string, unknown>)[segments[i]];
  }

  return cursor;
}
|
|
1069
|
+
|
|
1070
|
+
/**
 * Extract InferenceEvents from raw content using field mappings.
 *
 * Parsing depends on `normalization.detection.format_type`:
 *   - `jsonl`: one JSON value per non-blank line.
 *   - `json_array`: the whole document must be a JSON array.
 *   - `csv` / `tsv`: first line is the header row; cells are split on the
 *     delimiter with no quoting support.
 *   - anything else: the whole document is parsed as JSON and flattened with
 *     flattenComplexFormat. If that parse fails, the content is retried as
 *     line-delimited JSON, because detection also assigns vendor and
 *     `custom_json` formats to JSONL input.
 *
 * @param content       Raw events content.
 * @param normalization Detection result plus the field mappings to apply.
 * @returns Events that have every required field, plus one error string per
 *          rejected record (or a single parse error when nothing is parsed).
 */
export function extractEvents(
  content: string,
  normalization: NormalizationResult,
): { events: InferenceEvent[]; errors: string[] } {
  const events: InferenceEvent[] = [];
  const errors: string[] = [];

  // Parse newline-delimited JSON, skipping blank lines (detection and the
  // direct JSONL path both tolerate them).
  const parseJsonLines = (): unknown[] =>
    content
      .split('\n')
      .map(line => line.trim())
      .filter(line => line.length > 0)
      .map(line => JSON.parse(line));

  // Parse content based on format type
  let records: unknown[];

  try {
    const formatType = normalization.detection.format_type;

    if (formatType === 'jsonl') {
      records = parseJsonLines();
    } else if (formatType === 'json_array') {
      const parsed: unknown = JSON.parse(content);
      if (!Array.isArray(parsed)) {
        // Previously a non-array document produced no events and no error.
        throw new Error('Expected a JSON array');
      }
      records = parsed;
    } else if (formatType === 'csv' || formatType === 'tsv') {
      const delimiter = formatType === 'csv' ? ',' : '\t';
      const lines = content.trim().split('\n');
      const headers = lines[0].split(delimiter).map(h => h.trim());
      records = lines
        .slice(1)
        .filter(line => line.trim().length > 0) // ignore blank rows
        .map(line => {
          const values = line.split(delimiter);
          const obj: Record<string, string> = {};
          headers.forEach((h, i) => { obj[h] = values[i]?.trim() || ''; });
          return obj;
        });
    } else {
      // For complex formats (OTEL, Jaeger, etc.), parse and flatten
      let document: unknown;
      let wholeParseError: unknown;
      let parsedWhole = true;
      try {
        document = JSON.parse(content);
      } catch (e) {
        parsedWhole = false;
        wholeParseError = e;
      }

      if (parsedWhole) {
        records = flattenComplexFormat(document, formatType);
      } else {
        // Retry as line-delimited JSON. If that also fails, or yields
        // nothing, report the original whole-document error.
        let perLine: unknown[];
        try {
          perLine = parseJsonLines();
        } catch {
          throw wholeParseError;
        }
        if (perLine.length === 0) {
          throw wholeParseError;
        }

        if (formatType === 'otel' || formatType === 'jaeger') {
          // Each line is its own export document: flatten them one by one.
          records = [];
          for (const doc of perLine) {
            for (const span of flattenComplexFormat(doc, formatType)) {
              records.push(span);
            }
          }
        } else {
          records = flattenComplexFormat(perLine, formatType);
        }
      }
    }
  } catch (error) {
    errors.push(`Failed to parse content: ${error instanceof Error ? error.message : String(error)}`);
    return { events, errors };
  }

  // Extract events using mappings
  for (let i = 0; i < records.length; i++) {
    const record = records[i];
    const event: Partial<InferenceEvent> = {};

    for (const mapping of normalization.mappings) {
      try {
        let value: unknown;

        if (mapping.extraction_type === 'computed') {
          // Handle computed fields (e.g., latency = end - start)
          value = computeValue(record as Record<string, unknown>, mapping.source_path);
        } else if (mapping.extraction_type === 'constant') {
          value = mapping.source_path;
        } else {
          value = extractValue(record, mapping.source_path);
        }

        if (value !== undefined && value !== null) {
          const transformed = applyTransform(value, mapping.transform);
          // A NaN result means the source value could not be interpreted;
          // leave the field unset so required-field validation reports it.
          const isUnusable = typeof transformed === 'number' && Number.isNaN(transformed);
          if (!isUnusable) {
            (event as Record<string, unknown>)[mapping.target] = transformed;
          }
        }
      } catch {
        // Best effort: skip this field for this record. A missing required
        // field is reported by the validation below.
      }
    }

    // Validate required fields
    const missingFields = REQUIRED_FIELDS.filter(f => !(f in event));
    if (missingFields.length === 0) {
      events.push(event as InferenceEvent);
    } else {
      errors.push(`Record ${i + 1}: Missing required fields: ${missingFields.join(', ')}`);
    }
  }

  return { events, errors };
}
|
|
1148
|
+
|
|
1149
|
+
/**
 * Turn a parsed trace document into a flat list of records.
 *
 * - `otel` and `jaeger` are delegated to their dedicated flatteners.
 * - `zipkin` data is returned as-is when it is an array, otherwise `[]`.
 * - Any other format: arrays are returned as-is, anything else is wrapped
 *   in a one-element array.
 *
 * @param data       Parsed document.
 * @param formatType Detected format of the document.
 */
function flattenComplexFormat(data: unknown, formatType: FormatType): unknown[] {
  switch (formatType) {
    case 'otel':
      return flattenOTEL(data);

    case 'jaeger':
      return flattenJaeger(data);

    case 'zipkin':
      // Zipkin is already an array of spans
      return Array.isArray(data) ? data : [];

    default:
      // For unknown formats, try to handle arrays or wrap single object
      return Array.isArray(data) ? data : [data];
  }
}
|
|
1166
|
+
|
|
1167
|
+
/**
 * Collect every span found under `resourceSpans[].scopeSpans[].spans[]`,
 * in document order. Missing levels contribute no spans.
 *
 * @param data Parsed OTEL document.
 * @returns Flat list of span objects (the same object references).
 */
function flattenOTEL(data: unknown): unknown[] {
  const { resourceSpans } = data as {
    resourceSpans?: Array<{
      scopeSpans?: Array<{
        spans?: unknown[];
      }>;
    }>;
  };

  const collected: unknown[] = [];

  for (const { scopeSpans } of resourceSpans || []) {
    for (const { spans } of scopeSpans || []) {
      for (const span of spans || []) {
        collected.push(span);
      }
    }
  }

  return collected;
}
|
|
1190
|
+
|
|
1191
|
+
/**
 * Collect every span found under `data[].spans[]`, in document order.
 * Traces without a `spans` list contribute nothing.
 *
 * @param data Parsed Jaeger document.
 * @returns Flat list of span objects (the same object references).
 */
function flattenJaeger(data: unknown): unknown[] {
  const { data: traces } = data as {
    data?: Array<{
      spans?: unknown[];
    }>;
  };

  const collected: unknown[] = [];

  for (const { spans } of traces || []) {
    for (const span of spans || []) {
      collected.push(span);
    }
  }

  return collected;
}
|
|
1210
|
+
|
|
1211
|
+
/**
 * Compute a derived numeric value from a small expression.
 *
 * Two expression shapes are understood:
 *   - `(endField - startField) / divisor`, e.g.
 *     "(endTimeUnixNano - startTimeUnixNano) / 1000000"
 *   - `$.field`, a direct numeric read of a top-level field
 *
 * @param record     Record whose top-level fields the expression refers to.
 * @param expression Expression to evaluate.
 * @returns The computed number, or undefined when the expression is not
 *          understood, an operand is missing or non-numeric, or the result
 *          is not finite (for example a zero divisor).
 */
function computeValue(record: Record<string, unknown>, expression: string): number | undefined {
  // Read a field as a number. Missing, null and empty-string values are
  // "absent" rather than 0 (Number(null) and Number('') are both 0).
  const readNumber = (field: string): number | undefined => {
    const raw = record[field];
    if (raw === undefined || raw === null || raw === '') return undefined;
    const num = Number(raw);
    return Number.isNaN(num) ? undefined : num;
  };

  // Simple expression parser for common patterns
  const diffMatch = expression.match(/\((\w+)\s*-\s*(\w+)\)\s*\/\s*(\d+)/);
  if (diffMatch) {
    const [, endField, startField, divisor] = diffMatch;
    const endValue = readNumber(endField);
    const startValue = readNumber(startField);

    if (endValue !== undefined && startValue !== undefined) {
      const result = (endValue - startValue) / Number(divisor);
      // Guards against a zero divisor and infinite operands.
      return Number.isFinite(result) ? result : undefined;
    }
  }

  // Try direct field access. This used to return NaN for a missing or
  // non-numeric field, which callers then stored as a real value.
  const fieldMatch = expression.match(/^\$\.(\w+)$/);
  if (fieldMatch) {
    return readNumber(fieldMatch[1]);
  }

  return undefined;
}
|
|
1237
|
+
|
|
1238
|
+
// =============================================================================
|
|
1239
|
+
// PUBLIC API
|
|
1240
|
+
// =============================================================================
|
|
1241
|
+
|
|
1242
|
+
/**
 * Main entry point: Detect format and normalize runtime events.
 *
 * Pipeline:
 *   1. Detect the format from content; `options.format_hint`, when given,
 *      overrides the detected format type.
 *   2. Direct-parse formats (`jsonl`, `json_array`) are validated record by
 *      record with validateAndConvertEvent; no field mappings are involved.
 *   3. Other formats use PREDEFINED_MAPPINGS when detection does not require
 *      the agent (or is at least 0.95 confident) and mappings exist for the
 *      format; otherwise normalizeWithAgent produces the mappings.
 *   4. Events are extracted with those mappings; extraction errors are also
 *      summarised into `normalization.warnings`.
 *
 * @param content Raw events content.
 * @param options Normalization options (hints, codebase context, leniency).
 * @returns Valid events, the normalization record and per-record errors.
 */
export async function normalizeRuntimeEvents(
  content: string,
  options: NormalizationOptions = {},
): Promise<{
  events: InferenceEvent[];
  normalization: NormalizationResult;
  errors: string[];
}> {
  // Step 1: Detect format
  const detection = detectFormat(content, options.format_hint?.toString());

  // Override with user hint if provided
  if (options.format_hint) {
    detection.format_type = options.format_hint;
    detection.evidence = `User-specified format: ${options.format_hint}`;
  }

  // Step 2: For direct-parse formats, parse directly without field mappings
  if (detection.format_type === 'jsonl' || detection.format_type === 'json_array') {
    const events: InferenceEvent[] = [];
    const errors: string[] = [];

    try {
      if (detection.format_type === 'jsonl') {
        // JSONL: one JSON object per line
        const lines = content.trim().split('\n');
        for (let i = 0; i < lines.length; i++) {
          const line = lines[i].trim();
          if (!line) continue;
          try {
            const data = JSON.parse(line);
            events.push(validateAndConvertEvent(data, i + 1));
          } catch (e) {
            errors.push(`Line ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
          }
        }
      } else {
        // JSON array
        const data = JSON.parse(content);
        if (Array.isArray(data)) {
          for (let i = 0; i < data.length; i++) {
            try {
              events.push(validateAndConvertEvent(data[i], i + 1));
            } catch (e) {
              errors.push(`Record ${i + 1}: ${e instanceof Error ? e.message : String(e)}`);
            }
          }
        } else {
          // Reachable when a format hint says json_array but the document is
          // not an array; report it instead of returning nothing silently.
          errors.push('Parse error: Expected a JSON array of events');
        }
      }
    } catch (e) {
      errors.push(`Parse error: ${e instanceof Error ? e.message : String(e)}`);
    }

    const normalization: NormalizationResult = {
      detection,
      mappings: [],
      unmapped_fields: [],
      warnings: errors.length > 0 ? [`${errors.length} records had parsing issues`] : [],
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: false,
        codebase_context_used: false,
      },
    };

    return { events, normalization, errors };
  }

  // Step 3: Get or generate field mappings for complex formats
  let normalization: NormalizationResult;

  const predefinedMappings = PREDEFINED_MAPPINGS[detection.format_type];
  const hasPredefinedMappings = !!predefinedMappings && predefinedMappings.length > 0;
  const agentNotNeeded = !detection.requires_agent || detection.confidence >= 0.95;

  if (agentNotNeeded && hasPredefinedMappings) {
    // Use predefined mappings for known complex formats
    normalization = {
      detection,
      mappings: predefinedMappings,
      unmapped_fields: [],
      warnings: [],
      audit: {
        normalized_at: new Date().toISOString(),
        agent_used: false,
        codebase_context_used: false,
      },
    };
  } else {
    // Agent normalization required. This also covers formats with no
    // predefined mappings: extracting with an empty mapping list would
    // reject every record. normalizeWithAgent itself falls back gracefully
    // when no API key is configured.
    normalization = await normalizeWithAgent(content, detection, options);
  }

  // Step 4: Extract events using field mappings
  const { events, errors } = extractEvents(content, normalization);

  // Add extraction errors to warnings: list up to five, summarise beyond that.
  if (errors.length > 0 && errors.length <= 5) {
    normalization.warnings.push(...errors);
  } else if (errors.length > 5) {
    normalization.warnings.push(`${errors.length} records failed extraction (first: ${errors[0]})`);
  }

  return { events, normalization, errors };
}
|
|
1353
|
+
|
|
1354
|
+
/**
 * Validate a raw parsed record and convert it to an InferenceEvent.
 * Used for direct-parse formats (JSONL, JSON array).
 *
 * @param data - Raw value for a single record (typically the result of JSON.parse).
 * @param recordNum - Record number embedded in the missing-field error message.
 * @returns An InferenceEvent with the required fields copied over; optional
 *   fields are kept only when they have the expected primitive type and are
 *   otherwise set to undefined.
 * @throws Error when `data` is not a non-null, non-array object, or when any
 *   required field is absent or has the wrong primitive type.
 */
function validateAndConvertEvent(data: unknown, recordNum: number): InferenceEvent {
  if (typeof data !== 'object' || data === null || Array.isArray(data)) {
    // typeof reports 'object' for both null and arrays; name them explicitly
    // so the message never reads "Expected object, got object".
    let actual: string = typeof data;
    if (data === null) {
      actual = 'null';
    } else if (Array.isArray(data)) {
      actual = 'array';
    }
    throw new Error(`Expected object, got ${actual}`);
  }

  const obj = data as Record<string, unknown>;
  const errors: string[] = [];

  // Required fields, checked in a fixed order so the joined message is stable.
  const requiredStrings = ['id', 'ts', 'provider', 'model'];
  const requiredNumbers = ['input_tokens', 'output_tokens', 'latency_ms'];

  for (const key of requiredStrings) {
    if (typeof obj[key] !== 'string') errors.push(`Missing '${key}'`);
  }
  for (const key of requiredNumbers) {
    if (typeof obj[key] !== 'number') errors.push(`Missing '${key}'`);
  }

  if (errors.length > 0) {
    throw new Error(`Record ${recordNum}: ${errors.join(', ')}`);
  }

  // Optional fields: a value of the wrong type is dropped rather than rejected.
  const optString = (value: unknown): string | undefined =>
    typeof value === 'string' ? value : undefined;
  const optNumber = (value: unknown): number | undefined =>
    typeof value === 'number' ? value : undefined;
  const optBoolean = (value: unknown): boolean | undefined =>
    typeof value === 'boolean' ? value : undefined;

  return {
    id: obj.id as string,
    ts: obj.ts as string,
    provider: obj.provider as InferenceEvent['provider'],
    model: obj.model as string,
    input_tokens: obj.input_tokens as number,
    output_tokens: obj.output_tokens as number,
    latency_ms: obj.latency_ms as number,
    // Optional fields
    intent: optString(obj.intent),
    callsite_id: optString(obj.callsite_id),
    streaming: optBoolean(obj.streaming),
    ttft_ms: optNumber(obj.ttft_ms),
    batch_size: optNumber(obj.batch_size),
    batch_id: optString(obj.batch_id),
    cached: optBoolean(obj.cached),
    retry_count: optNumber(obj.retry_count),
    fallback_used: optBoolean(obj.fallback_used),
    original_model: optString(obj.original_model),
  };
}
|
|
1400
|
+
|
|
1401
|
+
/**
 * Look up the predefined field mappings registered for a format type.
 *
 * @param formatType - Format type used as the key into PREDEFINED_MAPPINGS.
 * @returns The mappings stored under that key, or undefined when there is no entry.
 */
export function getPredefinedMappings(formatType: FormatType): FieldMapping[] | undefined {
  const mappingsForFormat = PREDEFINED_MAPPINGS[formatType];
  return mappingsForFormat;
}
|
|
1407
|
+
|
|
1408
|
+
/**
 * Report whether a format type needs agent normalization.
 *
 * @param formatType - Format type to classify.
 * @returns false for 'jsonl', 'json_array', 'csv' and 'tsv'; true for every other value.
 */
export function requiresAgentNormalization(formatType: FormatType): boolean {
  switch (formatType as string) {
    case 'jsonl':
    case 'json_array':
    case 'csv':
    case 'tsv':
      return false;
    default:
      return true;
  }
}
|
|
1414
|
+
|
|
1415
|
+
// =============================================================================
|
|
1416
|
+
// FIELD MAPPING VALIDATION
|
|
1417
|
+
// =============================================================================
|
|
1418
|
+
|
|
1419
|
+
/** Result returned by validateFieldMappings for a field mapping string. */
export interface MappingValidationResult {
  /** Starts true; set to false when a pair is malformed or has an empty side. */
  valid: boolean;
  /** Accepted target/source pairs, in the order they appeared in the input. */
  mappings: { target: string; source: string }[];
  /** One message per rejected pair. */
  errors: string[];
  /** Non-fatal notes, such as unknown targets or unmapped critical fields. */
  warnings: string[];
}
|
|
1425
|
+
|
|
1426
|
+
/**
 * Parse and validate a field mapping string.
 * Format: "target=source,target2=source2,..."
 * Example: "model=llm_model,latency_ms=duration_ms"
 *
 * @param mappingStr - Comma-separated target=source pairs; undefined, empty or
 *   whitespace-only input is treated as "no custom mappings" and is valid.
 * @returns A result whose `mappings` holds every well-formed pair, `errors`
 *   holds one message per rejected pair (and `valid` is false if any exist),
 *   and `warnings` flags unknown targets and unmapped critical fields.
 */
export function validateFieldMappings(mappingStr: string | undefined): MappingValidationResult {
  const result: MappingValidationResult = {
    valid: true,
    mappings: [],
    errors: [],
    warnings: [],
  };

  if (!mappingStr || mappingStr.trim() === '') {
    return result; // Empty mapping is valid (no custom mappings)
  }

  // Known InferenceEvent fields. Built once rather than per pair, and includes
  // 'intent', 'ttft_ms' and 'original_model', which validateAndConvertEvent
  // populates, so mapping them no longer triggers an "unknown target" warning.
  const validTargets = new Set([
    'id', 'ts', 'provider', 'model', 'input_tokens', 'output_tokens',
    'latency_ms', 'intent', 'callsite_id', 'streaming', 'ttft_ms', 'cached',
    'batch_id', 'batch_size', 'retry_count', 'fallback_used', 'original_model',
    'error_code', 'error_message',
  ]);

  // Parse mapping string, ignoring empty segments such as a trailing comma.
  const pairs = mappingStr.split(',').map(p => p.trim()).filter(p => p);

  for (const pair of pairs) {
    const parts = pair.split('=');
    if (parts.length !== 2) {
      result.errors.push(`Invalid mapping format: "${pair}" (expected target=source)`);
      result.valid = false;
      continue;
    }

    const [target, source] = parts.map(p => p.trim());

    if (!target || !source) {
      result.errors.push(`Empty target or source in mapping: "${pair}"`);
      result.valid = false;
      continue;
    }

    // An unknown target is only a warning: the pair is still recorded.
    if (!validTargets.has(target)) {
      result.warnings.push(`Unknown target field "${target}" - may not be used`);
    }

    result.mappings.push({ target, source });
  }

  // Check for required fields coverage
  const mappedTargets = new Set(result.mappings.map(m => m.target));
  const criticalFields = ['model', 'latency_ms'];
  for (const field of criticalFields) {
    if (!mappedTargets.has(field)) {
      result.warnings.push(`Consider mapping "${field}" for better analysis`);
    }
  }

  return result;
}
|