@mcoda/mswarm 0.1.56 → 0.1.60
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +20 -1
- package/dist/codali-executor.d.ts +266 -0
- package/dist/codali-executor.d.ts.map +1 -0
- package/dist/codali-executor.js +227 -0
- package/dist/codali-executor.js.map +1 -0
- package/dist/runtime.d.ts +47 -1
- package/dist/runtime.d.ts.map +1 -1
- package/dist/runtime.js +248 -30
- package/dist/runtime.js.map +1 -1
- package/dist/server.d.ts.map +1 -1
- package/dist/server.js +83 -3
- package/dist/server.js.map +1 -1
- package/dist/vendor/codali/agents/AgentProtocol.d.ts +287 -0
- package/dist/vendor/codali/agents/AgentProtocol.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentProtocol.js +365 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts +23 -0
- package/dist/vendor/codali/agents/AgentResolver.d.ts.map +1 -0
- package/dist/vendor/codali/agents/AgentResolver.js +77 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts +23 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.d.ts.map +1 -0
- package/dist/vendor/codali/agents/PhaseAgentSelector.js +287 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts +37 -0
- package/dist/vendor/codali/cli/EvalCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/EvalCommand.js +333 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts +22 -0
- package/dist/vendor/codali/cli/FeedbackCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/FeedbackCommand.js +163 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts +78 -0
- package/dist/vendor/codali/cli/RunCommand.d.ts.map +1 -0
- package/dist/vendor/codali/cli/RunCommand.js +2261 -0
- package/dist/vendor/codali/cli.d.ts +3 -0
- package/dist/vendor/codali/cli.d.ts.map +1 -0
- package/dist/vendor/codali/cli.js +109 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts +107 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ArchitectPlanner.js +1726 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts +25 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderOutputParser.js +164 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts +76 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/BuilderRunner.js +1159 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts +91 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextAssembler.js +4547 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts +19 -0
- package/dist/vendor/codali/cognitive/ContextBudget.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextBudget.js +35 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts +30 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextFileLoader.js +307 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts +47 -0
- package/dist/vendor/codali/cognitive/ContextManager.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextManager.js +272 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts +18 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextRedactor.js +53 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts +22 -0
- package/dist/vendor/codali/cognitive/ContextSelector.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSelector.js +431 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts +8 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSerializer.js +882 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts +27 -0
- package/dist/vendor/codali/cognitive/ContextStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextStore.js +79 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ContextSummarizer.js +45 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts +31 -0
- package/dist/vendor/codali/cognitive/CostEstimator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CostEstimator.js +66 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts +32 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/CriticEvaluator.js +297 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts +9 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/EvidenceGate.js +75 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts +12 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenExampleIndexer.js +34 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts +33 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/GoldenSetStore.js +159 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts +7 -0
- package/dist/vendor/codali/cognitive/IntentSignals.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/IntentSignals.js +285 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts +100 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/LearningGovernance.js +276 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts +64 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/MemoryWriteback.js +287 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts +49 -0
- package/dist/vendor/codali/cognitive/PatchApplier.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchApplier.js +199 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts +35 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchInterpreter.js +100 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts +7 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PatchOutputNormalizer.js +59 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts +17 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PostMortemAnalyzer.js +131 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts +3 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/PreferenceExtraction.js +85 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts +15 -0
- package/dist/vendor/codali/cognitive/Prompts.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Prompts.js +326 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts +16 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ProviderRouting.js +24 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts +12 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/QueryExtraction.js +262 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts +13 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/RunHistoryIndexer.js +125 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts +92 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/SmartPipeline.js +4804 -0
- package/dist/vendor/codali/cognitive/Types.d.ts +474 -0
- package/dist/vendor/codali/cognitive/Types.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/Types.js +7 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts +57 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.d.ts.map +1 -0
- package/dist/vendor/codali/cognitive/ValidationRunner.js +515 -0
- package/dist/vendor/codali/config/Config.d.ts +249 -0
- package/dist/vendor/codali/config/Config.d.ts.map +1 -0
- package/dist/vendor/codali/config/Config.js +200 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts +56 -0
- package/dist/vendor/codali/config/ConfigLoader.d.ts.map +1 -0
- package/dist/vendor/codali/config/ConfigLoader.js +1246 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts +113 -0
- package/dist/vendor/codali/docdex/DocdexClient.d.ts.map +1 -0
- package/dist/vendor/codali/docdex/DocdexClient.js +524 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts +35 -0
- package/dist/vendor/codali/eval/EvalRunner.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalRunner.js +38 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts +81 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.d.ts.map +1 -0
- package/dist/vendor/codali/eval/EvalTaskExecutor.js +371 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts +31 -0
- package/dist/vendor/codali/eval/GateEvaluator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/GateEvaluator.js +134 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts +28 -0
- package/dist/vendor/codali/eval/MetricTypes.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricTypes.js +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts +4 -0
- package/dist/vendor/codali/eval/MetricsAggregator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/MetricsAggregator.js +97 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts +29 -0
- package/dist/vendor/codali/eval/RegressionComparator.d.ts.map +1 -0
- package/dist/vendor/codali/eval/RegressionComparator.js +155 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts +52 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportInputAdapter.js +229 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts +32 -0
- package/dist/vendor/codali/eval/ReportSerializer.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportSerializer.js +33 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts +18 -0
- package/dist/vendor/codali/eval/ReportStore.d.ts.map +1 -0
- package/dist/vendor/codali/eval/ReportStore.js +96 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts +12 -0
- package/dist/vendor/codali/eval/SuiteLoader.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteLoader.js +51 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts +56 -0
- package/dist/vendor/codali/eval/SuiteSchema.d.ts.map +1 -0
- package/dist/vendor/codali/eval/SuiteSchema.js +357 -0
- package/dist/vendor/codali/index.d.ts +11 -0
- package/dist/vendor/codali/index.d.ts.map +1 -0
- package/dist/vendor/codali/index.js +5 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/CodexCliProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/CodexCliProvider.js +282 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OllamaRemoteProvider.js +300 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts +8 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.d.ts.map +1 -0
- package/dist/vendor/codali/providers/OpenAiCompatibleProvider.js +192 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts +12 -0
- package/dist/vendor/codali/providers/ProviderRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderRegistry.js +28 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts +81 -0
- package/dist/vendor/codali/providers/ProviderTypes.d.ts.map +1 -0
- package/dist/vendor/codali/providers/ProviderTypes.js +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts +183 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/CodaliRuntime.js +1363 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts +39 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/DeepInvestigationErrors.js +57 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts +27 -0
- package/dist/vendor/codali/runtime/RunContext.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunContext.js +51 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts +48 -0
- package/dist/vendor/codali/runtime/RunLogQuery.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogQuery.js +36 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts +19 -0
- package/dist/vendor/codali/runtime/RunLogReader.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogReader.js +361 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts +71 -0
- package/dist/vendor/codali/runtime/RunLogger.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunLogger.js +100 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts +117 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/RunTelemetryTypes.js +299 -0
- package/dist/vendor/codali/runtime/Runner.d.ts +66 -0
- package/dist/vendor/codali/runtime/Runner.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/Runner.js +215 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts +3 -0
- package/dist/vendor/codali/runtime/StoragePaths.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/StoragePaths.js +19 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts +30 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.d.ts.map +1 -0
- package/dist/vendor/codali/runtime/WorkspaceLock.js +141 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts +14 -0
- package/dist/vendor/codali/session/InstructionLoader.d.ts.map +1 -0
- package/dist/vendor/codali/session/InstructionLoader.js +107 -0
- package/dist/vendor/codali/session/SessionStore.d.ts +81 -0
- package/dist/vendor/codali/session/SessionStore.d.ts.map +1 -0
- package/dist/vendor/codali/session/SessionStore.js +244 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts +68 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.d.ts.map +1 -0
- package/dist/vendor/codali/subagents/SubagentOrchestrator.js +150 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts +9 -0
- package/dist/vendor/codali/tools/ToolRegistry.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolRegistry.js +263 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts +66 -0
- package/dist/vendor/codali/tools/ToolTypes.d.ts.map +1 -0
- package/dist/vendor/codali/tools/ToolTypes.js +32 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts +3 -0
- package/dist/vendor/codali/tools/diff/DiffTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/diff/DiffTool.js +34 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts +4 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/docdex/DocdexTools.js +453 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts +3 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.d.ts.map +1 -0
- package/dist/vendor/codali/tools/filesystem/FileTools.js +141 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts +3 -0
- package/dist/vendor/codali/tools/search/SearchTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/search/SearchTool.js +46 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts +3 -0
- package/dist/vendor/codali/tools/shell/ShellTool.d.ts.map +1 -0
- package/dist/vendor/codali/tools/shell/ShellTool.js +104 -0
- package/package.json +5 -3
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
/**
 * A ratio metric together with the counts it was derived from.
 *
 * The aggregator in this package fills it so that `value` equals
 * `numerator / denominator`, or `null` when `denominator` is zero.
 */
export interface RateMetric {
    /** Count of present samples that were truthy. */
    numerator: number;
    /** Count of samples that were present (not missing). */
    denominator: number;
    /** Count of samples that carried no value and were left out of the ratio. */
    missing: number;
    /** The computed ratio, or `null` when there was nothing to divide by. */
    value: number | null;
}
|
|
7
|
+
/**
 * Median and 95th-percentile summary of a numeric sample set.
 *
 * Both statistics are `null` when no usable sample was available.
 */
export interface PercentileMetric {
    /** Number of samples that contributed to the statistics. */
    sample_size: number;
    /** Number of samples that carried no value and were excluded. */
    missing: number;
    /** 50th percentile of the contributing samples, or `null` if there were none. */
    median: number | null;
    /** 95th percentile of the contributing samples, or `null` if there were none. */
    p95: number | null;
}
|
|
13
|
+
/**
 * Aggregated metrics for one evaluation run.
 *
 * Field descriptions below reflect how `aggregateMetrics` in this package
 * populates the record.
 */
export interface EvalMetrics {
    /** Format version of this record; always the literal `1`. */
    schema_version: 1;
    /** Timestamp string; the aggregator writes `new Date().toISOString()`. */
    generated_at: string;
    /** Task total; the aggregator copies it from the run summary's `total`. */
    task_count: number;
    /** Rate built from each task result's `task_passed` flag. */
    m001_task_success_rate: RateMetric;
    /** Rate built from `first_pass`; a task that did not pass counts as a failure. */
    m002_first_pass_success_rate: RateMetric;
    /** Rate built from `patch_apply_success`. */
    m003_patch_apply_success_rate: RateMetric;
    /** Rate built from `verification_passed`. */
    m004_verification_pass_rate: RateMetric;
    /** Rate built from `hallucination_detected`. */
    m005_hallucination_rate: RateMetric;
    /** Rate built from `scope_violation_detected`. */
    m006_scope_violation_rate: RateMetric;
    /** Percentiles of `latency_ms` over all task results. */
    m007_latency_ms: PercentileMetric;
    /** Percentiles of `tokens_used` over the task results that passed. */
    m008_success_tokens: PercentileMetric;
    /** Percentiles of `cost_usd` over the task results that passed. */
    m008_success_cost_usd: PercentileMetric;
}
|
|
27
|
+
/**
 * Flat identifier for a single comparable metric value.
 *
 * Rate metrics use the bare `EvalMetrics` field name; percentile metrics
 * append `.median` or `.p95` to the field name.
 */
export type EvalMetricKey =
    | "m001_task_success_rate"
    | "m002_first_pass_success_rate"
    | "m003_patch_apply_success_rate"
    | "m004_verification_pass_rate"
    | "m005_hallucination_rate"
    | "m006_scope_violation_rate"
    | "m007_latency_ms.median"
    | "m007_latency_ms.p95"
    | "m008_success_tokens.median"
    | "m008_success_tokens.p95"
    | "m008_success_cost_usd.median"
    | "m008_success_cost_usd.p95";
|
|
28
|
+
//# sourceMappingURL=MetricTypes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MetricTypes.d.ts","sourceRoot":"","sources":["../../src/eval/MetricTypes.ts"],"names":[],"mappings":"AAAA,MAAM,WAAW,UAAU;IACzB,SAAS,EAAE,MAAM,CAAC;IAClB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;CACtB;AAED,MAAM,WAAW,gBAAgB;IAC/B,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,GAAG,EAAE,MAAM,GAAG,IAAI,CAAC;CACpB;AAED,MAAM,WAAW,WAAW;IAC1B,cAAc,EAAE,CAAC,CAAC;IAClB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,sBAAsB,EAAE,UAAU,CAAC;IACnC,4BAA4B,EAAE,UAAU,CAAC;IACzC,6BAA6B,EAAE,UAAU,CAAC;IAC1C,2BAA2B,EAAE,UAAU,CAAC;IACxC,uBAAuB,EAAE,UAAU,CAAC;IACpC,yBAAyB,EAAE,UAAU,CAAC;IACtC,eAAe,EAAE,gBAAgB,CAAC;IAClC,mBAAmB,EAAE,gBAAgB,CAAC;IACtC,qBAAqB,EAAE,gBAAgB,CAAC;CACzC;AAED,MAAM,MAAM,aAAa,GACrB,wBAAwB,GACxB,8BAA8B,GAC9B,+BAA+B,GAC/B,6BAA6B,GAC7B,yBAAyB,GACzB,2BAA2B,GAC3B,wBAAwB,GACxB,qBAAqB,GACrB,4BAA4B,GAC5B,yBAAyB,GACzB,8BAA8B,GAC9B,2BAA2B,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"MetricsAggregator.d.ts","sourceRoot":"","sources":["../../src/eval/MetricsAggregator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,WAAW,EAAgC,MAAM,kBAAkB,CAAC;AAClF,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AA0ErD,eAAO,MAAM,gBAAgB,GAAI,KAAK,aAAa,KAAG,WA0BrD,CAAC"}
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
/**
 * Reduce a list of tri-state samples to a rate.
 *
 * @param samples Array whose entries are truthy (success), falsy (failure),
 *   or `null`/`undefined` (no data for that task).
 * @returns `{ numerator, denominator, missing, value }` where `numerator`
 *   counts truthy samples, `denominator` counts samples that were present,
 *   `missing` counts absent samples, and `value` is
 *   `numerator / denominator` (`null` when nothing was present).
 */
const computeRate = (samples) => {
    let numerator = 0;
    let denominator = 0;
    let missing = 0;
    for (const sample of samples) {
        // `== null` matches both null and undefined, so a field that is
        // absent from a task result is reported as missing data instead of
        // being counted as a failed attempt.
        if (sample == null) {
            missing += 1;
            continue;
        }
        denominator += 1;
        if (sample)
            numerator += 1;
    }
    return {
        numerator,
        denominator,
        missing,
        // Avoid 0/0: an empty denominator yields "no value", not NaN.
        value: denominator > 0 ? numerator / denominator : null,
    };
};
|
|
21
|
+
/**
 * Percentile of a numeric list using linear interpolation between the two
 * nearest ranks.
 *
 * @param values Numbers to summarise; the array is copied, never mutated.
 * @param fraction Position in the sorted list as a fraction (0.5 = median,
 *   0.95 = 95th percentile).
 * @returns The interpolated value, or `null` for an empty list.
 */
const percentile = (values, fraction) => {
    if (values.length === 0)
        return null;
    // Sort a copy numerically; the default sort would compare as strings.
    const sorted = [...values].sort((left, right) => left - right);
    if (sorted.length === 1)
        return sorted[0];
    // Fractional rank in [0, length - 1] for fractions in [0, 1].
    const index = (sorted.length - 1) * fraction;
    const lower = Math.floor(index);
    const upper = Math.ceil(index);
    const lowerValue = sorted[lower];
    const upperValue = sorted[upper];
    // The rank landed exactly on an element: nothing to interpolate.
    if (lower === upper)
        return lowerValue;
    const weight = index - lower;
    return lowerValue + (upperValue - lowerValue) * weight;
};
/**
 * Summarise numeric samples as median and p95.
 *
 * Only finite numbers contribute. Every other entry (`null`, `undefined`,
 * `NaN`, infinities, non-numbers) is counted in `missing`, because such
 * values would make the sort comparator return `NaN` and produce an
 * arbitrary ordering.
 *
 * @param samples Array of numbers and/or absent entries.
 * @returns `{ sample_size, missing, median, p95 }`; both statistics are
 *   `null` when no finite sample exists.
 */
const computePercentiles = (samples) => {
    const values = samples.filter((entry) => typeof entry === "number" && Number.isFinite(entry));
    return {
        sample_size: values.length,
        missing: samples.length - values.length,
        median: percentile(values, 0.5),
        p95: percentile(values, 0.95),
    };
};
|
|
46
|
+
/**
 * First-pass sample for one task result.
 *
 * A task that did not pass is always a failure (`false`); otherwise the
 * result's own `first_pass` value is used as-is, including `null`.
 */
const toFirstPassSample = (result) => (result.task_passed ? result.first_pass : false);
|
|
53
|
+
/**
 * Patch-apply sample for one task result.
 *
 * A non-null `patch_apply_success` is returned unchanged. When it is null,
 * the sample is `false` if any assertion result carries the code
 * `assert_expect_patch_apply`, and `null` otherwise.
 */
const toPatchApplySample = (result) => {
    const recorded = result.patch_apply_success;
    if (recorded !== null) {
        return recorded;
    }
    const patchWasExpected = result.assertion_results.some(({ code }) => code === "assert_expect_patch_apply");
    return patchWasExpected ? false : null;
};
|
|
61
|
+
/** Verification sample for one task result: its `verification_passed` value, passed through unchanged. */
const toVerificationSample = (result) => {
    return result.verification_passed;
};
|
|
62
|
+
/**
 * Hallucination sample for one task result: `null` when the result's
 * `hallucination_detected` is null, otherwise that value.
 */
const toHallucinationSample = (result) => {
    const flag = result.hallucination_detected;
    return flag === null ? null : flag;
};
|
|
67
|
+
/**
 * Scope-violation sample for one task result: `null` when the result's
 * `scope_violation_detected` is null, otherwise that value.
 */
const toScopeViolationSample = (result) => {
    const flag = result.scope_violation_detected;
    return flag === null ? null : flag;
};
|
|
72
|
+
/**
 * Build the metrics record for an evaluation run.
 *
 * Rate metrics are computed over every entry in `run.task_results`; the
 * token and cost percentiles only consider results whose `task_passed` is
 * truthy. `task_count` is taken from `run.summary.total` and `generated_at`
 * is the current time as an ISO string.
 *
 * @param run Object exposing `task_results` (array) and `summary.total`.
 * @returns The aggregated metrics with `schema_version` 1.
 */
export const aggregateMetrics = (run) => {
    const results = run.task_results;
    // One sample per task result for each rate metric.
    const passFlags = results.map((entry) => entry.task_passed);
    const firstPassSamples = results.map((entry) => toFirstPassSample(entry));
    const patchSamples = results.map((entry) => toPatchApplySample(entry));
    const verificationSamples = results.map((entry) => toVerificationSample(entry));
    const hallucinationSamples = results.map((entry) => toHallucinationSample(entry));
    const scopeSamples = results.map((entry) => toScopeViolationSample(entry));
    const latencySamples = results.map((entry) => entry.latency_ms);
    // Token and cost figures are reported for successful tasks only.
    const passed = results.filter((entry) => entry.task_passed);
    const tokenSamples = passed.map((entry) => entry.tokens_used);
    const costSamples = passed.map((entry) => entry.cost_usd);
    const generatedAt = new Date().toISOString();
    return {
        schema_version: 1,
        generated_at: generatedAt,
        task_count: run.summary.total,
        m001_task_success_rate: computeRate(passFlags),
        m002_first_pass_success_rate: computeRate(firstPassSamples),
        m003_patch_apply_success_rate: computeRate(patchSamples),
        m004_verification_pass_rate: computeRate(verificationSamples),
        m005_hallucination_rate: computeRate(hallucinationSamples),
        m006_scope_violation_rate: computeRate(scopeSamples),
        m007_latency_ms: computePercentiles(latencySamples),
        m008_success_tokens: computePercentiles(tokenSamples),
        m008_success_cost_usd: computePercentiles(costSamples),
    };
};
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { EvalMetricKey, EvalMetrics } from "./MetricTypes.js";
|
|
2
|
+
/**
 * Comparison of one metric value between a baseline and the current run.
 *
 * Descriptions reflect how the comparator in this package fills the record.
 */
export interface EvalMetricDelta {
    /** Which metric value this entry describes. */
    key: EvalMetricKey;
    /** Unit the metric value is expressed in. */
    unit: "ratio" | "ms" | "tokens" | "usd";
    /** `true` when a larger value is the desirable direction for this metric. */
    higher_is_better: boolean;
    /** Baseline value, or `null` when unavailable. */
    baseline: number | null;
    /** Current value, or `null` when unavailable. */
    current: number | null;
    /** `current - baseline` (0 when within tolerance); `null` when either side is `null`. */
    delta: number | null;
    /** Sign of the change; `"unknown"` when no delta could be computed. */
    direction: "up" | "down" | "flat" | "unknown";
    /** `true` when the value moved in the undesirable direction beyond tolerance. */
    regression: boolean;
    /** `true` when the value moved in the desirable direction beyond tolerance. */
    improved: boolean;
}
|
|
13
|
+
/**
 * Result of comparing the current metrics against an optional baseline.
 *
 * Descriptions reflect how `compareAgainstBaseline` fills the record.
 */
export interface EvalRegressionComparison {
    /** Format version of this record; always the literal `1`. */
    schema_version: 1;
    /** `"baseline_missing"` when no baseline metrics were supplied, otherwise `"compared"`. */
    status: "baseline_missing" | "compared";
    /** Baseline report identifier, echoed from the caller's parameters. */
    baseline_report_id?: string;
    /** Baseline creation timestamp, echoed from the caller's parameters. */
    baseline_created_at?: string;
    /** One entry per compared metric value. */
    deltas: EvalMetricDelta[];
    /** Number of entries in `deltas` flagged as regressions. */
    regression_count: number;
    /** Number of entries in `deltas` flagged as improvements. */
    improved_count: number;
    /** Entries that are neither regressions nor improvements (includes `"unknown"` ones). */
    unchanged_count: number;
}
|
|
23
|
+
/**
 * Compare the current metrics with a baseline, metric by metric.
 *
 * @param params.current Metrics of the run under evaluation.
 * @param params.baseline Metrics to compare against; when omitted the result
 *   has status `"baseline_missing"`.
 * @param params.baseline_report_id Copied into the result unchanged.
 * @param params.baseline_created_at Copied into the result unchanged.
 * @returns The per-metric deltas plus regression/improved/unchanged counts.
 */
export declare const compareAgainstBaseline: (params: {
    current: EvalMetrics;
    baseline?: EvalMetrics;
    baseline_report_id?: string;
    baseline_created_at?: string;
}) => EvalRegressionComparison;
|
|
29
|
+
//# sourceMappingURL=RegressionComparator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"RegressionComparator.d.ts","sourceRoot":"","sources":["../../src/eval/RegressionComparator.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AAEnE,MAAM,WAAW,eAAe;IAC9B,GAAG,EAAE,aAAa,CAAC;IACnB,IAAI,EAAE,OAAO,GAAG,IAAI,GAAG,QAAQ,GAAG,KAAK,CAAC;IACxC,gBAAgB,EAAE,OAAO,CAAC;IAC1B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,SAAS,EAAE,IAAI,GAAG,MAAM,GAAG,MAAM,GAAG,SAAS,CAAC;IAC9C,UAAU,EAAE,OAAO,CAAC;IACpB,QAAQ,EAAE,OAAO,CAAC;CACnB;AAED,MAAM,WAAW,wBAAwB;IACvC,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,kBAAkB,GAAG,UAAU,CAAC;IACxC,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,mBAAmB,CAAC,EAAE,MAAM,CAAC;IAC7B,MAAM,EAAE,eAAe,EAAE,CAAC;IAC1B,gBAAgB,EAAE,MAAM,CAAC;IACzB,cAAc,EAAE,MAAM,CAAC;IACvB,eAAe,EAAE,MAAM,CAAC;CACzB;AA0HD,eAAO,MAAM,sBAAsB,GAAI,QAAQ;IAC7C,OAAO,EAAE,WAAW,CAAC;IACrB,QAAQ,CAAC,EAAE,WAAW,CAAC;IACvB,kBAAkB,CAAC,EAAE,MAAM,CAAC;IAC5B,mBAAmB,CAAC,EAAE,MAAM,CAAC;CAC9B,KAAG,wBAiDH,CAAC"}
|
|
@@ -0,0 +1,155 @@
|
|
|
1
|
+
// Absolute tolerance for metric comparison: a delta whose magnitude is at or
// below this is reported as "flat" and is neither a regression nor an improvement.
const EPSILON = 1e-12;
|
|
2
|
+
const flattenMetrics = (metrics) => [
|
|
3
|
+
{
|
|
4
|
+
key: "m001_task_success_rate",
|
|
5
|
+
value: metrics.m001_task_success_rate.value,
|
|
6
|
+
unit: "ratio",
|
|
7
|
+
higher_is_better: true,
|
|
8
|
+
},
|
|
9
|
+
{
|
|
10
|
+
key: "m002_first_pass_success_rate",
|
|
11
|
+
value: metrics.m002_first_pass_success_rate.value,
|
|
12
|
+
unit: "ratio",
|
|
13
|
+
higher_is_better: true,
|
|
14
|
+
},
|
|
15
|
+
{
|
|
16
|
+
key: "m003_patch_apply_success_rate",
|
|
17
|
+
value: metrics.m003_patch_apply_success_rate.value,
|
|
18
|
+
unit: "ratio",
|
|
19
|
+
higher_is_better: true,
|
|
20
|
+
},
|
|
21
|
+
{
|
|
22
|
+
key: "m004_verification_pass_rate",
|
|
23
|
+
value: metrics.m004_verification_pass_rate.value,
|
|
24
|
+
unit: "ratio",
|
|
25
|
+
higher_is_better: true,
|
|
26
|
+
},
|
|
27
|
+
{
|
|
28
|
+
key: "m005_hallucination_rate",
|
|
29
|
+
value: metrics.m005_hallucination_rate.value,
|
|
30
|
+
unit: "ratio",
|
|
31
|
+
higher_is_better: false,
|
|
32
|
+
},
|
|
33
|
+
{
|
|
34
|
+
key: "m006_scope_violation_rate",
|
|
35
|
+
value: metrics.m006_scope_violation_rate.value,
|
|
36
|
+
unit: "ratio",
|
|
37
|
+
higher_is_better: false,
|
|
38
|
+
},
|
|
39
|
+
{
|
|
40
|
+
key: "m007_latency_ms.median",
|
|
41
|
+
value: metrics.m007_latency_ms.median,
|
|
42
|
+
unit: "ms",
|
|
43
|
+
higher_is_better: false,
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
key: "m007_latency_ms.p95",
|
|
47
|
+
value: metrics.m007_latency_ms.p95,
|
|
48
|
+
unit: "ms",
|
|
49
|
+
higher_is_better: false,
|
|
50
|
+
},
|
|
51
|
+
{
|
|
52
|
+
key: "m008_success_tokens.median",
|
|
53
|
+
value: metrics.m008_success_tokens.median,
|
|
54
|
+
unit: "tokens",
|
|
55
|
+
higher_is_better: false,
|
|
56
|
+
},
|
|
57
|
+
{
|
|
58
|
+
key: "m008_success_tokens.p95",
|
|
59
|
+
value: metrics.m008_success_tokens.p95,
|
|
60
|
+
unit: "tokens",
|
|
61
|
+
higher_is_better: false,
|
|
62
|
+
},
|
|
63
|
+
{
|
|
64
|
+
key: "m008_success_cost_usd.median",
|
|
65
|
+
value: metrics.m008_success_cost_usd.median,
|
|
66
|
+
unit: "usd",
|
|
67
|
+
higher_is_better: false,
|
|
68
|
+
},
|
|
69
|
+
{
|
|
70
|
+
key: "m008_success_cost_usd.p95",
|
|
71
|
+
value: metrics.m008_success_cost_usd.p95,
|
|
72
|
+
unit: "usd",
|
|
73
|
+
higher_is_better: false,
|
|
74
|
+
},
|
|
75
|
+
];
|
|
76
|
+
/**
 * Compare one flattened metric entry with its baseline counterpart.
 *
 * When either value is `null` no delta is computed: `delta` is `null`,
 * `direction` is `"unknown"`, and both flags are `false`. Otherwise the delta
 * is `current - baseline`, snapped to 0 when its magnitude is within
 * `EPSILON`, and the regression/improved flags depend on
 * `higher_is_better`.
 *
 * @param current Flattened entry of the run under evaluation.
 * @param baseline Flattened entry with the same key from the baseline.
 * @returns A delta record describing the change.
 */
const compareMetric = (current, baseline) => {
    const { key, unit, higher_is_better } = current;
    // Fields common to both outcomes, in the emitted property order.
    const shared = {
        key,
        unit,
        higher_is_better,
        baseline: baseline.value,
        current: current.value,
    };
    if (current.value === null || baseline.value === null) {
        return { ...shared, delta: null, direction: "unknown", regression: false, improved: false };
    }
    const change = current.value - baseline.value;
    const withinTolerance = Math.abs(change) <= EPSILON;
    const rose = change > EPSILON;
    const fell = change < -EPSILON;
    let direction;
    if (withinTolerance) {
        direction = "flat";
    }
    else {
        direction = change > 0 ? "up" : "down";
    }
    return {
        ...shared,
        delta: withinTolerance ? 0 : change,
        direction,
        // A drop hurts a higher-is-better metric; a rise hurts the others.
        regression: higher_is_better ? fell : rose,
        improved: higher_is_better ? rose : fell,
    };
};
|
|
106
|
+
/**
 * Compare the current metrics against a baseline, metric by metric.
 *
 * Without a baseline the result has status `"baseline_missing"`: every
 * current metric is listed with direction `"unknown"` and all of them count
 * as unchanged. With a baseline, each current entry that has a baseline entry
 * under the same key is compared, and regressions and improvements are
 * tallied; everything else in `deltas` counts as unchanged.
 *
 * @param params `{ current, baseline?, baseline_report_id?, baseline_created_at? }`.
 * @returns The comparison record with `schema_version` 1.
 */
export const compareAgainstBaseline = (params) => {
    const currentEntries = flattenMetrics(params.current);
    if (!params.baseline) {
        const placeholders = currentEntries.map(({ key, unit, higher_is_better, value }) => ({
            key,
            unit,
            higher_is_better,
            baseline: null,
            current: value,
            delta: null,
            direction: "unknown",
            regression: false,
            improved: false,
        }));
        return {
            schema_version: 1,
            status: "baseline_missing",
            baseline_report_id: params.baseline_report_id,
            baseline_created_at: params.baseline_created_at,
            deltas: placeholders,
            regression_count: 0,
            improved_count: 0,
            unchanged_count: currentEntries.length,
        };
    }
    // Index the baseline entries by key for direct lookup.
    const baselineByKey = new Map();
    flattenMetrics(params.baseline).forEach((entry) => {
        baselineByKey.set(entry.key, entry);
    });
    const deltas = [];
    let regressionCount = 0;
    let improvedCount = 0;
    for (const entry of currentEntries) {
        const counterpart = baselineByKey.get(entry.key);
        if (counterpart) {
            const compared = compareMetric(entry, counterpart);
            if (compared.regression)
                regressionCount += 1;
            if (compared.improved)
                improvedCount += 1;
            deltas.push(compared);
        }
    }
    return {
        schema_version: 1,
        status: "compared",
        baseline_report_id: params.baseline_report_id,
        baseline_created_at: params.baseline_created_at,
        deltas,
        regression_count: regressionCount,
        improved_count: improvedCount,
        unchanged_count: deltas.length - regressionCount - improvedCount,
    };
};
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
import type { VerificationOutcome } from "../cognitive/Types.js";
|
|
2
|
+
/** Allowed values for `NormalizedRunRecord.final_status`. */
export type NormalizedRunStatus = "pass" | "fail" | "degraded" | "unknown";
|
|
3
|
+
/** Allowed values for `NormalizedPhaseOutcome.status`. */
export type NormalizedPhaseStatus = "available" | "missing" | "degraded";
|
|
4
|
+
/**
 * Element type of `NormalizedRunRecord.artifact_references`.
 *
 * NOTE(review): field meanings are set by the adapter implementation, which
 * is not part of this declaration file; confirm there before relying on them.
 */
export interface NormalizedArtifactReference {
    phase: string;
    kind: string;
    /** Either `"present"` or `"missing"`. */
    status: "present" | "missing";
    /** Nullable. */
    path: string | null;
    /** Nullable. */
    reason_code: string | null;
}
|
|
11
|
+
/**
 * Element type of `NormalizedRunRecord.phase_outcomes`.
 *
 * Every field other than `phase` and `status` is nullable.
 * NOTE(review): field meanings are set by the adapter implementation, which
 * is not part of this declaration file; confirm there before relying on them.
 */
export interface NormalizedPhaseOutcome {
    phase: string;
    status: NormalizedPhaseStatus;
    duration_ms: number | null;
    provider: string | null;
    model: string | null;
    input_tokens: number | null;
    output_tokens: number | null;
    total_tokens: number | null;
    cost_usd: number | null;
    cost_source: string | null;
    missing_usage_reason: string | null;
    missing_cost_reason: string | null;
}
|
|
25
|
+
/**
 * Report-ready view of a single run, as returned by
 * `adaptRunSummaryForReport`. Scalar fields are null when the source data
 * did not provide a usable value.
 */
export interface NormalizedRunRecord {
    /** Version of this record layout. */
    schema_version: 1;
    /** Run identifier, or null. */
    run_id: string | null;
    /** Task identifier, or null. */
    task_id: string | null;
    /** Run fingerprint, or null. */
    fingerprint: string | null;
    /** Run duration in milliseconds, or null. */
    duration_ms: number | null;
    /** Overall run status. */
    final_status: NormalizedRunStatus;
    /** Failure classification, or null. */
    failure_class: string | null;
    /** Reason codes attached to the final disposition. */
    reason_codes: string[];
    /** Whether the run is retryable, or null when not stated. */
    retryable: boolean | null;
    /** Verification outcome, or null when none is known. */
    verification_outcome: VerificationOutcome | null;
    /** Files touched by the run. */
    touched_files: string[];
    /** Normalized artifact references. */
    artifact_references: NormalizedArtifactReference[];
    /** Identifiers of artifacts reported as missing. */
    missing_artifacts: string[];
    /** Per-phase status and telemetry. */
    phase_outcomes: NormalizedPhaseOutcome[];
    /** Total token usage for the run, or null. */
    usage_tokens_total: number | null;
    /** Total cost in USD, or null. */
    cost_usd: number | null;
    /** Markers naming pieces of data that were absent from the input. */
    missing_data_markers: string[];
}
|
|
44
|
+
/**
 * Input accepted by `adaptRunSummaryForReport`. Every field is optional.
 */
export interface AdaptRunSummaryInput {
    /** Raw run summary; typed as unknown because its shape is not guaranteed. */
    runSummary?: unknown;
    /** Run identifier supplied by the caller. */
    runId?: string;
    /** Task identifier supplied by the caller. */
    taskId?: string;
    /** Verification outcome supplied by the caller. */
    verificationOutcome?: VerificationOutcome | null;
    /** Touched files supplied by the caller. */
    touchedFiles?: string[];
}
|
|
51
|
+
/**
 * Converts an optional, loosely-typed run summary into a
 * {@link NormalizedRunRecord}.
 *
 * @param input - Optional adapter input; may be omitted entirely.
 * @returns The normalized run record.
 */
export declare const adaptRunSummaryForReport: (
    input?: AdaptRunSummaryInput,
) => NormalizedRunRecord;
|
|
52
|
+
//# sourceMappingURL=ReportInputAdapter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ReportInputAdapter.d.ts","sourceRoot":"","sources":["../../src/eval/ReportInputAdapter.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,uBAAuB,CAAC;AAEjE,MAAM,MAAM,mBAAmB,GAAG,MAAM,GAAG,MAAM,GAAG,UAAU,GAAG,SAAS,CAAC;AAC3E,MAAM,MAAM,qBAAqB,GAAG,WAAW,GAAG,SAAS,GAAG,UAAU,CAAC;AAEzE,MAAM,WAAW,2BAA2B;IAC1C,KAAK,EAAE,MAAM,CAAC;IACd,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,SAAS,GAAG,SAAS,CAAC;IAC9B,IAAI,EAAE,MAAM,GAAG,IAAI,CAAC;IACpB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;CAC5B;AAED,MAAM,WAAW,sBAAsB;IACrC,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,qBAAqB,CAAC;IAC9B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,KAAK,EAAE,MAAM,GAAG,IAAI,CAAC;IACrB,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,GAAG,IAAI,CAAC;IAC5B,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,oBAAoB,EAAE,MAAM,GAAG,IAAI,CAAC;IACpC,mBAAmB,EAAE,MAAM,GAAG,IAAI,CAAC;CACpC;AAED,MAAM,WAAW,mBAAmB;IAClC,cAAc,EAAE,CAAC,CAAC;IAClB,MAAM,EAAE,MAAM,GAAG,IAAI,CAAC;IACtB,OAAO,EAAE,MAAM,GAAG,IAAI,CAAC;IACvB,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,WAAW,EAAE,MAAM,GAAG,IAAI,CAAC;IAC3B,YAAY,EAAE,mBAAmB,CAAC;IAClC,aAAa,EAAE,MAAM,GAAG,IAAI,CAAC;IAC7B,YAAY,EAAE,MAAM,EAAE,CAAC;IACvB,SAAS,EAAE,OAAO,GAAG,IAAI,CAAC;IAC1B,oBAAoB,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,aAAa,EAAE,MAAM,EAAE,CAAC;IACxB,mBAAmB,EAAE,2BAA2B,EAAE,CAAC;IACnD,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,cAAc,EAAE,sBAAsB,EAAE,CAAC;IACzC,kBAAkB,EAAE,MAAM,GAAG,IAAI,CAAC;IAClC,QAAQ,EAAE,MAAM,GAAG,IAAI,CAAC;IACxB,oBAAoB,EAAE,MAAM,EAAE,CAAC;CAChC;AAED,MAAM,WAAW,oBAAoB;IACnC,UAAU,CAAC,EAAE,OAAO,CAAC;IACrB,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;IAChB,mBAAmB,CAAC,EAAE,mBAAmB,GAAG,IAAI,CAAC;IACjD,YAAY,CAAC,EAAE,MAAM,EAAE,CAAC;CACzB;AAmMD,eAAO,MAAM,wBAAwB,GAAI,QAAO,oBAAyB,KAAG,mBAsE3E,CAAC"}
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
/**
 * Passes a value through only when it is a non-null, non-array object.
 *
 * @param value - Arbitrary input.
 * @returns The same value when it is object-like, otherwise `undefined`.
 */
const asRecord = (value) => {
    const isObjectLike = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    return isObjectLike ? value : undefined;
};
|
|
6
|
+
/**
 * Returns the trimmed form of a string value.
 *
 * @param value - Arbitrary input.
 * @returns The trimmed string, or `undefined` when the input is not a string
 *   or is empty after trimming.
 */
const asString = (value) => {
    if (typeof value === "string") {
        const text = value.trim();
        if (text.length > 0) {
            return text;
        }
    }
    return undefined;
};
|
|
12
|
+
/**
 * Passes a value through only when it is a real boolean.
 *
 * @param value - Arbitrary input.
 * @returns The boolean itself, or `null` for every other input.
 */
const asBoolean = (value) => (typeof value === "boolean" ? value : null);
|
|
19
|
+
/**
 * Passes a value through only when it is a finite number.
 *
 * @param value - Arbitrary input.
 * @returns The number itself, or `null` for non-numbers, NaN and infinities.
 */
const asNumber = (value) => {
    const isFiniteNumber = typeof value === "number" && Number.isFinite(value);
    return isFiniteNumber ? value : null;
};
|
|
24
|
+
/**
 * Builds a sorted list of distinct, trimmed, non-empty strings.
 *
 * @param value - Arbitrary input; anything that is not an array yields `[]`.
 * @returns A new array holding each distinct trimmed string once, ordered
 *   with `localeCompare`. Non-string entries and blank strings are dropped.
 */
const uniqueSortedStrings = (value) => {
    if (!Array.isArray(value)) {
        return [];
    }
    const cleaned = [];
    for (const item of value) {
        if (typeof item !== "string") {
            continue;
        }
        const text = item.trim();
        if (text.length > 0) {
            cleaned.push(text);
        }
    }
    const distinct = [...new Set(cleaned)];
    distinct.sort((a, b) => a.localeCompare(b));
    return distinct;
};
|
|
32
|
+
/**
 * Maps a raw status value onto the run status vocabulary.
 *
 * @param value - Arbitrary input.
 * @returns "pass", "fail" or "degraded" when the input is exactly one of
 *   those strings; "unknown" otherwise.
 */
const normalizeRunStatus = (value) => {
    switch (value) {
        case "pass":
        case "fail":
        case "degraded":
            return value;
        default:
            return "unknown";
    }
};
|
|
37
|
+
/**
 * Maps a raw status value onto the phase status vocabulary.
 *
 * @param value - Arbitrary input.
 * @returns "available", "missing" or "degraded" when the input is exactly one
 *   of those strings; "missing" for anything else.
 */
const normalizePhaseStatus = (value) => {
    switch (value) {
        case "available":
        case "missing":
        case "degraded":
            return value;
        default:
            return "missing";
    }
};
|
|
42
|
+
/**
 * Accepts only the three recognized verification outcome strings.
 *
 * @param value - Arbitrary input.
 * @returns The input when it is "verified_passed", "verified_failed" or
 *   "unverified_with_reason"; `null` otherwise.
 */
const normalizeVerificationOutcome = (value) => {
    const recognized = ["verified_passed", "verified_failed", "unverified_with_reason"];
    return recognized.includes(value) ? value : null;
};
|
|
50
|
+
/**
 * Converts a raw list of artifact references into normalized entries.
 *
 * Entries that are not plain objects, or that lack a non-blank `phase` or
 * `kind`, are skipped. `status` is "missing" only when the raw value is
 * exactly "missing"; everything else is treated as "present".
 *
 * @param value - Arbitrary input; anything that is not an array yields `[]`.
 * @returns Normalized references ordered by their `phase:kind:path` key.
 */
const normalizeArtifactReferences = (value) => {
    if (!Array.isArray(value)) {
        return [];
    }
    // Composite key used for ordering; a null path sorts as the empty string.
    const sortKey = (ref) => `${ref.phase}:${ref.kind}:${ref.path ?? ""}`;
    const references = [];
    for (const candidate of value) {
        const record = asRecord(candidate);
        if (!record) {
            continue;
        }
        const phase = asString(record.phase);
        const kind = asString(record.kind);
        if (!phase || !kind) {
            continue;
        }
        const status = record.status === "missing" ? "missing" : "present";
        const path = asString(record.path) ?? null;
        const reasonCode = asString(record.reason_code) ?? null;
        references.push({ phase, kind, status, path, reason_code: reasonCode });
    }
    return references.sort((a, b) => sortKey(a).localeCompare(sortKey(b)));
};
|
|
72
|
+
/**
 * Converts a raw list of per-phase telemetry records into normalized entries.
 *
 * Entries that are not plain objects or have no non-blank `phase` are
 * skipped. When `usage.total_tokens` is absent, the total is derived from
 * whichever of the input/output counts is present (the other counts as 0);
 * if neither is present the total stays null.
 *
 * @param value - Arbitrary input; anything that is not an array yields `[]`.
 * @returns Normalized telemetry entries ordered by phase name.
 */
const normalizePhaseTelemetry = (value) => {
    if (!Array.isArray(value)) {
        return [];
    }
    const entries = [];
    for (const candidate of value) {
        const record = asRecord(candidate);
        if (!record) {
            continue;
        }
        const phase = asString(record.phase);
        if (!phase) {
            continue;
        }
        const usage = asRecord(record.usage);
        const cost = asRecord(record.cost);
        const inputTokens = asNumber(usage?.input_tokens);
        const outputTokens = asNumber(usage?.output_tokens);
        const reportedTotal = asNumber(usage?.total_tokens);
        const hasPartialUsage = inputTokens !== null || outputTokens !== null;
        const derivedTotal = hasPartialUsage
            ? (inputTokens ?? 0) + (outputTokens ?? 0)
            : null;
        const durationMs = asNumber(record.duration_ms);
        const provider = asString(record.provider) ?? null;
        const model = asString(record.model) ?? null;
        const costUsd = asNumber(cost?.usd);
        const costSource = asString(cost?.source) ?? null;
        const missingUsageReason = asString(record.missing_usage_reason) ?? null;
        const missingCostReason = asString(record.missing_cost_reason) ?? null;
        entries.push({
            phase,
            duration_ms: durationMs,
            provider,
            model,
            input_tokens: inputTokens,
            output_tokens: outputTokens,
            total_tokens: reportedTotal ?? derivedTotal,
            cost_usd: costUsd,
            cost_source: costSource,
            missing_usage_reason: missingUsageReason,
            missing_cost_reason: missingCostReason,
        });
    }
    return entries.sort((a, b) => a.phase.localeCompare(b.phase));
};
|
|
107
|
+
/**
 * Translates a quality-dimension key into the phase name used in summaries.
 *
 * @param key - Quality-dimension key.
 * @returns "plan", "retrieve", "act" or "verify" for the keys "plan",
 *   "retrieval", "patch" and "verification"; `undefined` for any other key.
 */
const phaseKeyToSummaryPhase = (key) => {
    // A Map (not an object literal) so inherited names such as "constructor"
    // cannot match.
    const phaseByKey = new Map([
        ["plan", "plan"],
        ["retrieval", "retrieve"],
        ["patch", "act"],
        ["verification", "verify"],
    ]);
    return phaseByKey.get(key);
};
|
|
118
|
+
/**
 * Combines per-phase status (from `runSummary.quality_dimensions`) with
 * per-phase telemetry into one outcome entry per phase.
 *
 * - Quality keys that do not map to a known phase are ignored.
 * - A phase that appears only in telemetry gets status "missing".
 * - When no phase is known at all, "plan", "retrieve", "act" and "verify"
 *   are emitted with status "missing".
 * - If telemetry holds several entries for one phase, the last one is used.
 *
 * @param runSummary - Run summary record (may be an empty object).
 * @param telemetry - Normalized telemetry entries.
 * @returns Outcome entries ordered by phase name.
 */
const buildPhaseOutcomes = (runSummary, telemetry) => {
    const statusByPhase = new Map();
    const quality = asRecord(runSummary.quality_dimensions);
    for (const [key, rawValue] of Object.entries(quality ?? {})) {
        const phase = phaseKeyToSummaryPhase(key);
        if (phase) {
            statusByPhase.set(phase, normalizePhaseStatus(rawValue));
        }
    }
    const telemetryByPhase = new Map();
    for (const entry of telemetry) {
        if (!statusByPhase.has(entry.phase)) {
            statusByPhase.set(entry.phase, "missing");
        }
        telemetryByPhase.set(entry.phase, entry);
    }
    if (statusByPhase.size === 0) {
        ["plan", "retrieve", "act", "verify"].forEach((phase) => {
            statusByPhase.set(phase, "missing");
        });
    }
    const outcomes = Array.from(statusByPhase.entries(), ([phase, status]) => {
        const source = telemetryByPhase.get(phase);
        // Reads one telemetry field, falling back to null when absent.
        const field = (name) => source?.[name] ?? null;
        return {
            phase,
            status,
            duration_ms: field("duration_ms"),
            provider: field("provider"),
            model: field("model"),
            input_tokens: field("input_tokens"),
            output_tokens: field("output_tokens"),
            total_tokens: field("total_tokens"),
            cost_usd: field("cost_usd"),
            cost_source: field("cost_source"),
            missing_usage_reason: field("missing_usage_reason"),
            missing_cost_reason: field("missing_cost_reason"),
        };
    });
    return outcomes.sort((a, b) => a.phase.localeCompare(b.phase));
};
|
|
163
|
+
/**
 * Adds up the entries of a list while ignoring `null` entries.
 *
 * @param values - List of numbers and/or nulls.
 * @returns The sum of the non-null entries, or `null` when there are none.
 */
const sumNullable = (values) => {
    let total = 0;
    let counted = 0;
    for (const value of values) {
        if (value === null) {
            continue;
        }
        total += value;
        counted += 1;
    }
    return counted === 0 ? null : total;
};
|
|
169
|
+
/**
 * Converts a loosely-typed run summary into the normalized record consumed by
 * reports.
 *
 * Every field is read defensively: values of the wrong type are treated as
 * absent. For each important piece of data that could not be found, a marker
 * string is added to `missing_data_markers`.
 *
 * Precedence rules:
 * - Identifiers: `run_id`/`runId` (and `task_id`/`taskId`) from the summary
 *   win over `input.runId`/`input.taskId`.
 * - Tokens: the summary's top-level `usage` wins over the sum of per-phase
 *   totals.
 * - Cost: the summary's `actualCost` wins over the sum of per-phase costs.
 * - Verification: `input.verificationOutcome` wins over
 *   `runSummary.verification.outcome`.
 * - Missing artifacts: `runSummary.missing_artifacts` wins over the list
 *   derived from artifact references whose status is "missing".
 *
 * @param input - Optional adapter input; defaults to an empty object.
 * @returns The normalized run record.
 */
export const adaptRunSummaryForReport = (input = {}) => {
    const runSummary = asRecord(input.runSummary);
    const finalDisposition = asRecord(runSummary?.final_disposition);
    const artifactReferences = normalizeArtifactReferences(runSummary?.artifact_references);
    const phaseTelemetry = normalizePhaseTelemetry(runSummary?.phase_telemetry);
    const phaseOutcomes = buildPhaseOutcomes(runSummary ?? {}, phaseTelemetry);

    // Token usage: explicit total, else input + output, else per-phase sum.
    const topLevelUsage = asRecord(runSummary?.usage);
    const topLevelInputTokens = asNumber(topLevelUsage?.inputTokens);
    const topLevelOutputTokens = asNumber(topLevelUsage?.outputTokens);
    const topLevelTotalTokens = asNumber(topLevelUsage?.totalTokens)
        ?? (topLevelInputTokens !== null || topLevelOutputTokens !== null
            ? (topLevelInputTokens ?? 0) + (topLevelOutputTokens ?? 0)
            : null);
    const usageTokensTotal = topLevelTotalTokens
        ?? sumNullable(phaseOutcomes.map((phase) => phase.total_tokens));

    // Cost: explicit run-level figure, else per-phase sum.
    const topLevelCost = asNumber(runSummary?.actualCost);
    const phaseCost = sumNullable(phaseOutcomes.map((phase) => phase.cost_usd));
    const costUsd = topLevelCost ?? phaseCost;

    const missingArtifacts = uniqueSortedStrings(runSummary?.missing_artifacts
        ?? artifactReferences
            .filter((entry) => entry.status === "missing")
            .map((entry) => `${entry.phase}:${entry.kind}`));

    const verificationRecord = asRecord(runSummary?.verification);
    const verificationOutcome = normalizeVerificationOutcome(input.verificationOutcome)
        ?? normalizeVerificationOutcome(verificationRecord?.outcome);

    const runId = asString(runSummary?.run_id ?? runSummary?.runId ?? input.runId) ?? null;

    const markers = new Set();
    if (!runSummary)
        markers.add("run_summary_missing");
    if (runId === null)
        markers.add("run_id_missing");
    if (!finalDisposition)
        markers.add("final_disposition_missing");
    if (!phaseTelemetry.length)
        markers.add("phase_telemetry_missing");
    if (verificationOutcome === null)
        markers.add("verification_outcome_missing");
    if (usageTokensTotal === null)
        markers.add("usage_tokens_missing");
    if (costUsd === null)
        markers.add("cost_missing");

    // Both sources go through the same normalization. Previously the
    // caller-supplied list was merged raw, so a non-string entry could throw
    // inside the localeCompare sort (or leak into the output), and padded or
    // blank names were neither deduplicated nor dropped.
    const touchedFiles = uniqueSortedStrings([
        ...uniqueSortedStrings(runSummary?.touchedFiles),
        ...uniqueSortedStrings(input.touchedFiles),
    ]);

    return {
        schema_version: 1,
        run_id: runId,
        task_id: asString(runSummary?.task_id ?? runSummary?.taskId ?? input.taskId) ?? null,
        fingerprint: asString(runSummary?.fingerprint) ?? null,
        duration_ms: asNumber(runSummary?.durationMs),
        final_status: normalizeRunStatus(finalDisposition?.status),
        failure_class: asString(finalDisposition?.failure_class ?? finalDisposition?.failureClass) ?? null,
        reason_codes: uniqueSortedStrings(finalDisposition?.reason_codes ?? finalDisposition?.reasons),
        retryable: asBoolean(finalDisposition?.retryable),
        verification_outcome: verificationOutcome,
        touched_files: touchedFiles,
        artifact_references: artifactReferences,
        missing_artifacts: missingArtifacts,
        phase_outcomes: phaseOutcomes,
        usage_tokens_total: usageTokensTotal,
        cost_usd: costUsd,
        missing_data_markers: Array.from(markers).sort((left, right) => left.localeCompare(right)),
    };
};
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { EvalGateResult } from "./GateEvaluator.js";
|
|
2
|
+
import type { EvalMetrics } from "./MetricTypes.js";
|
|
3
|
+
import type { EvalRegressionComparison } from "./RegressionComparator.js";
|
|
4
|
+
import type { EvalRunResult } from "./EvalRunner.js";
|
|
5
|
+
/**
 * Shape of an evaluation report as handled by `serializeEvalReport` and
 * `parseEvalReport`.
 */
export interface EvalReport {
    /** Version of this report layout. */
    schema_version: 1;
    /** Identifier of the report. */
    report_id: string;
    /** Creation timestamp as a string. NOTE(review): format not visible here. */
    created_at: string;
    /** Identification of the evaluated suite. */
    suite: {
        suite_id: string;
        suite_name: string;
        suite_path: string;
        suite_fingerprint: string;
        task_count: number;
    };
    /** Headline counts and pass/fail flags. */
    summary: {
        exit_code: number;
        passed: boolean;
        gate_passed: boolean;
        task_total: number;
        task_passed: number;
        task_failed: number;
        execution_errors: number;
    };
    /** Result of the evaluation run. */
    run: EvalRunResult;
    /** Metrics attached to the report. */
    metrics: EvalMetrics;
    /** Regression comparison attached to the report. */
    regression: EvalRegressionComparison;
    /** Gate evaluation result. */
    gates: EvalGateResult;
}
|
|
30
|
+
/**
 * Serializes an evaluation report to a string.
 *
 * @param report - Report to serialize.
 * @param pretty - Optional flag; presumably toggles human-readable
 *   formatting — TODO confirm against the implementation.
 * @returns The serialized report.
 */
export declare const serializeEvalReport: (
    report: EvalReport,
    pretty?: boolean,
) => string;

/**
 * Parses serialized report content back into an {@link EvalReport}.
 *
 * @param content - Serialized report text.
 * @returns The parsed report.
 */
export declare const parseEvalReport: (
    content: string,
) => EvalReport;
|
|
32
|
+
//# sourceMappingURL=ReportSerializer.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"ReportSerializer.d.ts","sourceRoot":"","sources":["../../src/eval/ReportSerializer.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACzD,OAAO,KAAK,EAAE,WAAW,EAAE,MAAM,kBAAkB,CAAC;AACpD,OAAO,KAAK,EAAE,wBAAwB,EAAE,MAAM,2BAA2B,CAAC;AAC1E,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,iBAAiB,CAAC;AAErD,MAAM,WAAW,UAAU;IACzB,cAAc,EAAE,CAAC,CAAC;IAClB,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE;QACL,QAAQ,EAAE,MAAM,CAAC;QACjB,UAAU,EAAE,MAAM,CAAC;QACnB,UAAU,EAAE,MAAM,CAAC;QACnB,iBAAiB,EAAE,MAAM,CAAC;QAC1B,UAAU,EAAE,MAAM,CAAC;KACpB,CAAC;IACF,OAAO,EAAE;QACP,SAAS,EAAE,MAAM,CAAC;QAClB,MAAM,EAAE,OAAO,CAAC;QAChB,WAAW,EAAE,OAAO,CAAC;QACrB,UAAU,EAAE,MAAM,CAAC;QACnB,WAAW,EAAE,MAAM,CAAC;QACpB,WAAW,EAAE,MAAM,CAAC;QACpB,gBAAgB,EAAE,MAAM,CAAC;KAC1B,CAAC;IACF,GAAG,EAAE,aAAa,CAAC;IACnB,OAAO,EAAE,WAAW,CAAC;IACrB,UAAU,EAAE,wBAAwB,CAAC;IACrC,KAAK,EAAE,cAAc,CAAC;CACvB;AAOD,eAAO,MAAM,mBAAmB,GAAI,QAAQ,UAAU,EAAE,gBAAa,KAAG,MAKvE,CAAC;AAEF,eAAO,MAAM,eAAe,GAAI,SAAS,MAAM,KAAG,UAkBjD,CAAC"}
|