model-test-bench 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +201 -0
- package/README.md +389 -0
- package/dist/bin/mtb.d.ts +3 -0
- package/dist/bin/mtb.d.ts.map +1 -0
- package/dist/bin/mtb.js +148 -0
- package/dist/bin/mtb.js.map +1 -0
- package/dist/server/index.d.ts +13 -0
- package/dist/server/index.d.ts.map +1 -0
- package/dist/server/index.js +72 -0
- package/dist/server/index.js.map +1 -0
- package/dist/server/interfaces/evaluator.d.ts +15 -0
- package/dist/server/interfaces/evaluator.d.ts.map +1 -0
- package/dist/server/interfaces/evaluator.js +2 -0
- package/dist/server/interfaces/evaluator.js.map +1 -0
- package/dist/server/interfaces/logger.d.ts +9 -0
- package/dist/server/interfaces/logger.d.ts.map +1 -0
- package/dist/server/interfaces/logger.js +2 -0
- package/dist/server/interfaces/logger.js.map +1 -0
- package/dist/server/interfaces/runner.d.ts +9 -0
- package/dist/server/interfaces/runner.d.ts.map +1 -0
- package/dist/server/interfaces/runner.js +2 -0
- package/dist/server/interfaces/runner.js.map +1 -0
- package/dist/server/interfaces/storage.d.ts +36 -0
- package/dist/server/interfaces/storage.d.ts.map +1 -0
- package/dist/server/interfaces/storage.js +2 -0
- package/dist/server/interfaces/storage.js.map +1 -0
- package/dist/server/routes/eval-queue.d.ts +23 -0
- package/dist/server/routes/eval-queue.d.ts.map +1 -0
- package/dist/server/routes/eval-queue.js +45 -0
- package/dist/server/routes/eval-queue.js.map +1 -0
- package/dist/server/routes/evaluations.d.ts +8 -0
- package/dist/server/routes/evaluations.d.ts.map +1 -0
- package/dist/server/routes/evaluations.js +221 -0
- package/dist/server/routes/evaluations.js.map +1 -0
- package/dist/server/routes/providers.d.ts +5 -0
- package/dist/server/routes/providers.d.ts.map +1 -0
- package/dist/server/routes/providers.js +179 -0
- package/dist/server/routes/providers.js.map +1 -0
- package/dist/server/routes/run-queue.d.ts +17 -0
- package/dist/server/routes/run-queue.d.ts.map +1 -0
- package/dist/server/routes/run-queue.js +34 -0
- package/dist/server/routes/run-queue.js.map +1 -0
- package/dist/server/routes/run-sse.d.ts +18 -0
- package/dist/server/routes/run-sse.d.ts.map +1 -0
- package/dist/server/routes/run-sse.js +57 -0
- package/dist/server/routes/run-sse.js.map +1 -0
- package/dist/server/routes/runs.d.ts +9 -0
- package/dist/server/routes/runs.d.ts.map +1 -0
- package/dist/server/routes/runs.js +380 -0
- package/dist/server/routes/runs.js.map +1 -0
- package/dist/server/routes/scenarios.d.ts +5 -0
- package/dist/server/routes/scenarios.d.ts.map +1 -0
- package/dist/server/routes/scenarios.js +181 -0
- package/dist/server/routes/scenarios.js.map +1 -0
- package/dist/server/services/eval-helpers.d.ts +22 -0
- package/dist/server/services/eval-helpers.d.ts.map +1 -0
- package/dist/server/services/eval-helpers.js +75 -0
- package/dist/server/services/eval-helpers.js.map +1 -0
- package/dist/server/services/eval-parsers-debate-impl.d.ts +11 -0
- package/dist/server/services/eval-parsers-debate-impl.d.ts.map +1 -0
- package/dist/server/services/eval-parsers-debate-impl.js +133 -0
- package/dist/server/services/eval-parsers-debate-impl.js.map +1 -0
- package/dist/server/services/eval-parsers.d.ts +24 -0
- package/dist/server/services/eval-parsers.d.ts.map +1 -0
- package/dist/server/services/eval-parsers.js +153 -0
- package/dist/server/services/eval-parsers.js.map +1 -0
- package/dist/server/services/eval-prompts.d.ts +9 -0
- package/dist/server/services/eval-prompts.d.ts.map +1 -0
- package/dist/server/services/eval-prompts.js +164 -0
- package/dist/server/services/eval-prompts.js.map +1 -0
- package/dist/server/services/evaluator.d.ts +10 -0
- package/dist/server/services/evaluator.d.ts.map +1 -0
- package/dist/server/services/evaluator.js +143 -0
- package/dist/server/services/evaluator.js.map +1 -0
- package/dist/server/services/fs-adapter.d.ts +20 -0
- package/dist/server/services/fs-adapter.d.ts.map +1 -0
- package/dist/server/services/fs-adapter.js +13 -0
- package/dist/server/services/fs-adapter.js.map +1 -0
- package/dist/server/services/instruction-parser.d.ts +26 -0
- package/dist/server/services/instruction-parser.d.ts.map +1 -0
- package/dist/server/services/instruction-parser.js +121 -0
- package/dist/server/services/instruction-parser.js.map +1 -0
- package/dist/server/services/log-rotator.d.ts +20 -0
- package/dist/server/services/log-rotator.d.ts.map +1 -0
- package/dist/server/services/log-rotator.js +60 -0
- package/dist/server/services/log-rotator.js.map +1 -0
- package/dist/server/services/logger.d.ts +15 -0
- package/dist/server/services/logger.d.ts.map +1 -0
- package/dist/server/services/logger.js +69 -0
- package/dist/server/services/logger.js.map +1 -0
- package/dist/server/services/model-factory.d.ts +10 -0
- package/dist/server/services/model-factory.d.ts.map +1 -0
- package/dist/server/services/model-factory.js +33 -0
- package/dist/server/services/model-factory.js.map +1 -0
- package/dist/server/services/runner.d.ts +9 -0
- package/dist/server/services/runner.d.ts.map +1 -0
- package/dist/server/services/runner.js +99 -0
- package/dist/server/services/runner.js.map +1 -0
- package/dist/server/services/seeder.d.ts +5 -0
- package/dist/server/services/seeder.d.ts.map +1 -0
- package/dist/server/services/seeder.js +79 -0
- package/dist/server/services/seeder.js.map +1 -0
- package/dist/server/services/storage-test-helpers.d.ts +15 -0
- package/dist/server/services/storage-test-helpers.d.ts.map +1 -0
- package/dist/server/services/storage-test-helpers.js +151 -0
- package/dist/server/services/storage-test-helpers.js.map +1 -0
- package/dist/server/services/storage.d.ts +35 -0
- package/dist/server/services/storage.d.ts.map +1 -0
- package/dist/server/services/storage.js +219 -0
- package/dist/server/services/storage.js.map +1 -0
- package/dist/server/services/tools.d.ts +6 -0
- package/dist/server/services/tools.d.ts.map +1 -0
- package/dist/server/services/tools.js +94 -0
- package/dist/server/services/tools.js.map +1 -0
- package/dist/server/services/transcript-formatter.d.ts +18 -0
- package/dist/server/services/transcript-formatter.d.ts.map +1 -0
- package/dist/server/services/transcript-formatter.js +227 -0
- package/dist/server/services/transcript-formatter.js.map +1 -0
- package/dist/server/services/update-checker.d.ts +3 -0
- package/dist/server/services/update-checker.d.ts.map +1 -0
- package/dist/server/services/update-checker.js +34 -0
- package/dist/server/services/update-checker.js.map +1 -0
- package/dist/server/types/evaluation.d.ts +94 -0
- package/dist/server/types/evaluation.d.ts.map +1 -0
- package/dist/server/types/evaluation.js +5 -0
- package/dist/server/types/evaluation.js.map +1 -0
- package/dist/server/types/index.d.ts +5 -0
- package/dist/server/types/index.d.ts.map +1 -0
- package/dist/server/types/index.js +5 -0
- package/dist/server/types/index.js.map +1 -0
- package/dist/server/types/provider.d.ts +23 -0
- package/dist/server/types/provider.d.ts.map +1 -0
- package/dist/server/types/provider.js +5 -0
- package/dist/server/types/provider.js.map +1 -0
- package/dist/server/types/run.d.ts +31 -0
- package/dist/server/types/run.d.ts.map +1 -0
- package/dist/server/types/run.js +5 -0
- package/dist/server/types/run.js.map +1 -0
- package/dist/server/types/scenario.d.ts +19 -0
- package/dist/server/types/scenario.d.ts.map +1 -0
- package/dist/server/types/scenario.js +5 -0
- package/dist/server/types/scenario.js.map +1 -0
- package/dist/src/server/index.d.ts +13 -0
- package/dist/src/server/index.d.ts.map +1 -0
- package/dist/src/server/index.js +72 -0
- package/dist/src/server/index.js.map +1 -0
- package/dist/src/server/interfaces/evaluator.d.ts +15 -0
- package/dist/src/server/interfaces/evaluator.d.ts.map +1 -0
- package/dist/src/server/interfaces/evaluator.js +2 -0
- package/dist/src/server/interfaces/evaluator.js.map +1 -0
- package/dist/src/server/interfaces/logger.d.ts +9 -0
- package/dist/src/server/interfaces/logger.d.ts.map +1 -0
- package/dist/src/server/interfaces/logger.js +2 -0
- package/dist/src/server/interfaces/logger.js.map +1 -0
- package/dist/src/server/interfaces/runner.d.ts +9 -0
- package/dist/src/server/interfaces/runner.d.ts.map +1 -0
- package/dist/src/server/interfaces/runner.js +2 -0
- package/dist/src/server/interfaces/runner.js.map +1 -0
- package/dist/src/server/interfaces/storage.d.ts +36 -0
- package/dist/src/server/interfaces/storage.d.ts.map +1 -0
- package/dist/src/server/interfaces/storage.js +2 -0
- package/dist/src/server/interfaces/storage.js.map +1 -0
- package/dist/src/server/routes/eval-queue.d.ts +23 -0
- package/dist/src/server/routes/eval-queue.d.ts.map +1 -0
- package/dist/src/server/routes/eval-queue.js +45 -0
- package/dist/src/server/routes/eval-queue.js.map +1 -0
- package/dist/src/server/routes/evaluations.d.ts +8 -0
- package/dist/src/server/routes/evaluations.d.ts.map +1 -0
- package/dist/src/server/routes/evaluations.js +221 -0
- package/dist/src/server/routes/evaluations.js.map +1 -0
- package/dist/src/server/routes/providers.d.ts +5 -0
- package/dist/src/server/routes/providers.d.ts.map +1 -0
- package/dist/src/server/routes/providers.js +179 -0
- package/dist/src/server/routes/providers.js.map +1 -0
- package/dist/src/server/routes/run-queue.d.ts +17 -0
- package/dist/src/server/routes/run-queue.d.ts.map +1 -0
- package/dist/src/server/routes/run-queue.js +34 -0
- package/dist/src/server/routes/run-queue.js.map +1 -0
- package/dist/src/server/routes/run-sse.d.ts +18 -0
- package/dist/src/server/routes/run-sse.d.ts.map +1 -0
- package/dist/src/server/routes/run-sse.js +57 -0
- package/dist/src/server/routes/run-sse.js.map +1 -0
- package/dist/src/server/routes/runs.d.ts +9 -0
- package/dist/src/server/routes/runs.d.ts.map +1 -0
- package/dist/src/server/routes/runs.js +380 -0
- package/dist/src/server/routes/runs.js.map +1 -0
- package/dist/src/server/routes/scenarios.d.ts +5 -0
- package/dist/src/server/routes/scenarios.d.ts.map +1 -0
- package/dist/src/server/routes/scenarios.js +181 -0
- package/dist/src/server/routes/scenarios.js.map +1 -0
- package/dist/src/server/services/eval-helpers.d.ts +22 -0
- package/dist/src/server/services/eval-helpers.d.ts.map +1 -0
- package/dist/src/server/services/eval-helpers.js +75 -0
- package/dist/src/server/services/eval-helpers.js.map +1 -0
- package/dist/src/server/services/eval-parsers-debate-impl.d.ts +11 -0
- package/dist/src/server/services/eval-parsers-debate-impl.d.ts.map +1 -0
- package/dist/src/server/services/eval-parsers-debate-impl.js +133 -0
- package/dist/src/server/services/eval-parsers-debate-impl.js.map +1 -0
- package/dist/src/server/services/eval-parsers.d.ts +24 -0
- package/dist/src/server/services/eval-parsers.d.ts.map +1 -0
- package/dist/src/server/services/eval-parsers.js +153 -0
- package/dist/src/server/services/eval-parsers.js.map +1 -0
- package/dist/src/server/services/eval-prompts.d.ts +9 -0
- package/dist/src/server/services/eval-prompts.d.ts.map +1 -0
- package/dist/src/server/services/eval-prompts.js +164 -0
- package/dist/src/server/services/eval-prompts.js.map +1 -0
- package/dist/src/server/services/evaluator.d.ts +10 -0
- package/dist/src/server/services/evaluator.d.ts.map +1 -0
- package/dist/src/server/services/evaluator.js +143 -0
- package/dist/src/server/services/evaluator.js.map +1 -0
- package/dist/src/server/services/fs-adapter.d.ts +20 -0
- package/dist/src/server/services/fs-adapter.d.ts.map +1 -0
- package/dist/src/server/services/fs-adapter.js +13 -0
- package/dist/src/server/services/fs-adapter.js.map +1 -0
- package/dist/src/server/services/instruction-parser.d.ts +26 -0
- package/dist/src/server/services/instruction-parser.d.ts.map +1 -0
- package/dist/src/server/services/instruction-parser.js +121 -0
- package/dist/src/server/services/instruction-parser.js.map +1 -0
- package/dist/src/server/services/log-rotator.d.ts +20 -0
- package/dist/src/server/services/log-rotator.d.ts.map +1 -0
- package/dist/src/server/services/log-rotator.js +60 -0
- package/dist/src/server/services/log-rotator.js.map +1 -0
- package/dist/src/server/services/logger.d.ts +15 -0
- package/dist/src/server/services/logger.d.ts.map +1 -0
- package/dist/src/server/services/logger.js +69 -0
- package/dist/src/server/services/logger.js.map +1 -0
- package/dist/src/server/services/model-factory.d.ts +10 -0
- package/dist/src/server/services/model-factory.d.ts.map +1 -0
- package/dist/src/server/services/model-factory.js +33 -0
- package/dist/src/server/services/model-factory.js.map +1 -0
- package/dist/src/server/services/runner.d.ts +9 -0
- package/dist/src/server/services/runner.d.ts.map +1 -0
- package/dist/src/server/services/runner.js +99 -0
- package/dist/src/server/services/runner.js.map +1 -0
- package/dist/src/server/services/seeder.d.ts +5 -0
- package/dist/src/server/services/seeder.d.ts.map +1 -0
- package/dist/src/server/services/seeder.js +79 -0
- package/dist/src/server/services/seeder.js.map +1 -0
- package/dist/src/server/services/storage.d.ts +35 -0
- package/dist/src/server/services/storage.d.ts.map +1 -0
- package/dist/src/server/services/storage.js +219 -0
- package/dist/src/server/services/storage.js.map +1 -0
- package/dist/src/server/services/tools.d.ts +6 -0
- package/dist/src/server/services/tools.d.ts.map +1 -0
- package/dist/src/server/services/tools.js +94 -0
- package/dist/src/server/services/tools.js.map +1 -0
- package/dist/src/server/services/transcript-formatter.d.ts +18 -0
- package/dist/src/server/services/transcript-formatter.d.ts.map +1 -0
- package/dist/src/server/services/transcript-formatter.js +227 -0
- package/dist/src/server/services/transcript-formatter.js.map +1 -0
- package/dist/src/server/services/update-checker.d.ts +3 -0
- package/dist/src/server/services/update-checker.d.ts.map +1 -0
- package/dist/src/server/services/update-checker.js +34 -0
- package/dist/src/server/services/update-checker.js.map +1 -0
- package/dist/src/server/types/evaluation.d.ts +94 -0
- package/dist/src/server/types/evaluation.d.ts.map +1 -0
- package/dist/src/server/types/evaluation.js +5 -0
- package/dist/src/server/types/evaluation.js.map +1 -0
- package/dist/src/server/types/index.d.ts +5 -0
- package/dist/src/server/types/index.d.ts.map +1 -0
- package/dist/src/server/types/index.js +5 -0
- package/dist/src/server/types/index.js.map +1 -0
- package/dist/src/server/types/provider.d.ts +23 -0
- package/dist/src/server/types/provider.d.ts.map +1 -0
- package/dist/src/server/types/provider.js +5 -0
- package/dist/src/server/types/provider.js.map +1 -0
- package/dist/src/server/types/run.d.ts +31 -0
- package/dist/src/server/types/run.d.ts.map +1 -0
- package/dist/src/server/types/run.js +5 -0
- package/dist/src/server/types/run.js.map +1 -0
- package/dist/src/server/types/scenario.d.ts +19 -0
- package/dist/src/server/types/scenario.d.ts.map +1 -0
- package/dist/src/server/types/scenario.js +5 -0
- package/dist/src/server/types/scenario.js.map +1 -0
- package/dist/web/assets/index-AJu1Yn5F.js +70 -0
- package/dist/web/assets/index-C_ioEISr.css +1 -0
- package/dist/web/index.html +15 -0
- package/docs/schemas/provider-api.example.json +12 -0
- package/docs/schemas/provider-openai.example.json +11 -0
- package/docs/schemas/scenario-baseline.example.json +24 -0
- package/docs/schemas/scenario-carwash-baseline.example.json +22 -0
- package/docs/schemas/scenario-carwash-with-system-prompt.example.json +24 -0
- package/docs/schemas/scenario-golden-rules-baseline.example.json +24 -0
- package/docs/schemas/scenario-golden-rules-with-system-prompt.example.json +28 -0
- package/docs/schemas/scenario-negative-analysis-baseline.example.json +23 -0
- package/docs/schemas/scenario-negative-analysis-with-system-prompt.example.json +25 -0
- package/docs/schemas/scenario-with-system-prompt.example.json +25 -0
- package/package.json +97 -0
|
@@ -0,0 +1,75 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Evaluation Helpers — result aggregation extracted from the orchestrator
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
import { toInstructionCompliance } from './eval-parsers.js';
|
|
5
|
+
/** Check whether evaluator scores have converged (within 1 point per dimension). */
|
|
6
|
+
export function checkConsensus(accumulators) {
|
|
7
|
+
if (accumulators.length < 2)
|
|
8
|
+
return true;
|
|
9
|
+
// Require ALL evaluators to have parsed scores before declaring consensus.
|
|
10
|
+
// If any evaluator's score result is missing, consensus is false.
|
|
11
|
+
const allScores = accumulators.map((a) => a.scoreResult.scores ?? {});
|
|
12
|
+
if (allScores.some((s) => Object.keys(s).length === 0))
|
|
13
|
+
return false;
|
|
14
|
+
const dimensions = new Set(allScores.flatMap((s) => Object.keys(s)));
|
|
15
|
+
for (const dim of dimensions) {
|
|
16
|
+
const vals = allScores.map((s) => s[dim]).filter((v) => v !== undefined);
|
|
17
|
+
if (vals.length < 2)
|
|
18
|
+
continue;
|
|
19
|
+
const range = Math.max(...vals) - Math.min(...vals);
|
|
20
|
+
if (range > 1)
|
|
21
|
+
return false;
|
|
22
|
+
}
|
|
23
|
+
return true;
|
|
24
|
+
}
|
|
25
|
+
/** Aggregate answer closeness from all evaluators into a comparison. */
|
|
26
|
+
export function buildAnswerComparison(accumulators) {
|
|
27
|
+
const closenessValues = accumulators
|
|
28
|
+
.map((a) => a.scoreResult.overallCloseness)
|
|
29
|
+
.filter((v) => v !== undefined && v > 0);
|
|
30
|
+
const avgCloseness = closenessValues.length > 0
|
|
31
|
+
? closenessValues.reduce((a, b) => a + b, 0) / closenessValues.length
|
|
32
|
+
: 0;
|
|
33
|
+
const summaries = accumulators
|
|
34
|
+
.map((a) => a.scoreResult.summary)
|
|
35
|
+
.filter((s) => !!s);
|
|
36
|
+
return {
|
|
37
|
+
matches: avgCloseness >= 0.7,
|
|
38
|
+
explanation: summaries[0] ?? 'No explanation available',
|
|
39
|
+
similarity: avgCloseness,
|
|
40
|
+
};
|
|
41
|
+
}
|
|
42
|
+
/**
 * Report, for each of the scenario's critical requirements, whether any
 * evaluator flagged it as missed.
 *
 * A requirement counts as met unless its exact text appears in at least one
 * evaluator's `scoreResult.missedCritical` list.
 *
 * @param accumulators per-evaluator results; each exposes `scoreResult`
 * @param scenario object exposing `criticalRequirements`
 * @returns one `{ requirement, met, evidence }` entry per requirement, in order
 */
export function buildCriticalResults(accumulators, scenario) {
    const missedLists = accumulators.flatMap(({ scoreResult }) => scoreResult.missedCritical ?? []);
    const flagged = new Set(missedLists);
    return scenario.criticalRequirements.map((requirement) => {
        if (flagged.has(requirement)) {
            return { requirement, met: false, evidence: 'Flagged as missed by evaluator' };
        }
        return { requirement, met: true, evidence: 'Not flagged' };
    });
}
|
|
51
|
+
/**
 * Fold every evaluator's compliance result into a single report.
 *
 * The `followed`, `violated` and `notApplicable` lists are each merged as a
 * de-duplicated union. `overallCompliance` is the mean of the values that are
 * defined, or 0 when no evaluator supplied one. The merged data is passed
 * through `toInstructionCompliance` before being returned.
 *
 * @param accumulators per-evaluator results; each exposes `complianceResult`
 * @returns whatever `toInstructionCompliance` produces for the merged data
 */
export function mergeCompliance(accumulators) {
    const followedSet = new Set();
    const violatedSet = new Set();
    const notApplicableSet = new Set();
    // Adds every item of an optional list to the target set.
    const absorb = (target, items) => {
        (items ?? []).forEach((item) => target.add(item));
    };

    let ratingTotal = 0;
    let ratingCount = 0;
    for (const entry of accumulators) {
        const compliance = entry.complianceResult;
        absorb(followedSet, compliance.followed);
        absorb(violatedSet, compliance.violated);
        absorb(notApplicableSet, compliance.notApplicable);
        if (compliance.overallCompliance !== undefined) {
            ratingTotal += compliance.overallCompliance;
            ratingCount += 1;
        }
    }

    const overallCompliance = ratingCount > 0 ? ratingTotal / ratingCount : 0;
    return toInstructionCompliance({
        followed: Array.from(followedSet),
        violated: Array.from(violatedSet),
        notApplicable: Array.from(notApplicableSet),
        overallCompliance,
    });
}
|
|
75
|
+
//# sourceMappingURL=eval-helpers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-helpers.js","sourceRoot":"","sources":["../../../../src/server/services/eval-helpers.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,0EAA0E;AAC1E,8EAA8E;AAQ9E,OAAO,EAAE,uBAAuB,EAAE,MAAM,mBAAmB,CAAC;AAe5D,oFAAoF;AACpF,MAAM,UAAU,cAAc,CAAC,YAA6C;IAC1E,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC;QAAE,OAAO,IAAI,CAAC;IACzC,2EAA2E;IAC3E,kEAAkE;IAClE,MAAM,SAAS,GAAG,YAAY,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC;IACtE,IAAI,SAAS,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC;QAAE,OAAO,KAAK,CAAC;IAErE,MAAM,UAAU,GAAG,IAAI,GAAG,CAAC,SAAS,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IACrE,KAAK,MAAM,GAAG,IAAI,UAAU,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC;QACtF,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;YAAE,SAAS;QAC9B,MAAM,KAAK,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,IAAI,CAAC,CAAC;QACpD,IAAI,KAAK,GAAG,CAAC;YAAE,OAAO,KAAK,CAAC;IAC9B,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,wEAAwE;AACxE,MAAM,UAAU,qBAAqB,CACnC,YAA6C;IAE7C,MAAM,eAAe,GAAG,YAAY;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,gBAAgB,CAAC;SAC1C,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,KAAK,SAAS,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;IACxD,MAAM,YAAY,GAAG,eAAe,CAAC,MAAM,GAAG,CAAC;QAC7C,CAAC,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,eAAe,CAAC,MAAM;QACrE,CAAC,CAAC,CAAC,CAAC;IACN,MAAM,SAAS,GAAG,YAAY;SAC3B,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,WAAW,CAAC,OAAO,CAAC;SACjC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAEnC,OAAO;QACL,OAAO,EAAE,YAAY,IAAI,GAAG;QAC5B,WAAW,EAAE,SAAS,CAAC,CAAC,CAAC,IAAI,0BAA0B;QACvD,UAAU,EAAE,YAAY;KACzB,CAAC;AACJ,CAAC;AAED,+EAA+E;AAC/E,MAAM,UAAU,oBAAoB,CAClC,YAA6C,EAC7C,QAAkB;IAElB,MAAM,SAAS,GAAG,IAAI,GAAG,CACvB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAA
E,CAAC,CAAC,CAAC,WAAW,CAAC,cAAc,IAAI,EAAE,CAAC,CAChE,CAAC;IACF,OAAO,QAAQ,CAAC,oBAAoB,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACjD,WAAW,EAAE,GAAG;QAChB,GAAG,EAAE,CAAC,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC;QACxB,QAAQ,EAAE,SAAS,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,gCAAgC,CAAC,CAAC,CAAC,aAAa;KAChF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,yEAAyE;AACzE,MAAM,UAAU,eAAe,CAC7B,YAA6C;IAE7C,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,MAAM,QAAQ,GAAG,IAAI,GAAG,EAAU,CAAC;IACnC,MAAM,aAAa,GAAG,IAAI,GAAG,EAAU,CAAC;IACxC,IAAI,aAAa,GAAG,CAAC,CAAC;IACtB,IAAI,eAAe,GAAG,CAAC,CAAC;IAExB,KAAK,MAAM,GAAG,IAAI,YAAY,EAAE,CAAC;QAC/B,MAAM,CAAC,GAAG,GAAG,CAAC,gBAAgB,CAAC;QAC/B,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC,QAAQ,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QACnD,CAAC,CAAC,CAAC,aAAa,IAAI,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,aAAa,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC7D,IAAI,CAAC,CAAC,iBAAiB,KAAK,SAAS,EAAE,CAAC;YACtC,aAAa,IAAI,CAAC,CAAC,iBAAiB,CAAC;YACrC,eAAe,EAAE,CAAC;QACpB,CAAC;IACH,CAAC;IAED,OAAO,uBAAuB,CAAC;QAC7B,QAAQ,EAAE,CAAC,GAAG,QAAQ,CAAC;QACvB,QAAQ,EAAE,CAAC,GAAG,QAAQ,CAAC;QACvB,aAAa,EAAE,CAAC,GAAG,aAAa,CAAC;QACjC,iBAAiB,EAAE,eAAe,GAAG,CAAC,CAAC,CAAC,CAAC,aAAa,GAAG,eAAe,CAAC,CAAC,CAAC,CAAC;KAC7E,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
import type { EvaluationSynthesis } from '../types/evaluation.js';
|
|
2
|
+
/** The three verdict strings the debate parser recognises. */
export type Verdict = 'AGREE' | 'DISAGREE' | 'PARTIAL';

/** Fields extracted from a debate reply by `parseDebateResponse`. */
export interface DebateParseResult {
    /** Verdict stated in the reply. */
    readonly verdict: Verdict;
    /** Scores keyed by dimension name. */
    readonly updatedScores: Readonly<Record<string, number>>;
    /** Critique strings taken from the reply. */
    readonly critiques: ReadonlyArray<string>;
    /** Free-text reasoning taken from the reply. */
    readonly reasoning: string;
}

/**
 * Parse a synthesis reply.
 * @param response raw reply text
 * @returns the synthesis fields that could be extracted (any may be absent)
 */
export declare function parseSynthesisResponse(
    response: string,
): Partial<EvaluationSynthesis>;

/**
 * Parse a debate reply.
 * @param response raw reply text
 * @returns the debate fields that could be extracted (any may be absent)
 */
export declare function parseDebateResponse(
    response: string,
): Partial<DebateParseResult>;
|
|
11
|
+
//# sourceMappingURL=eval-parsers-debate-impl.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-parsers-debate-impl.d.ts","sourceRoot":"","sources":["../../../../src/server/services/eval-parsers-debate-impl.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,mBAAmB,EAAE,MAAM,wBAAwB,CAAC;AAMlE,MAAM,MAAM,OAAO,GAAG,OAAO,GAAG,UAAU,GAAG,SAAS,CAAC;AAEvD,MAAM,WAAW,iBAAiB;IAChC,QAAQ,CAAC,OAAO,EAAE,OAAO,CAAC;IAC1B,QAAQ,CAAC,aAAa,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IACzD,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,SAAS,EAAE,MAAM,CAAC;CAC5B;AAMD,wBAAgB,sBAAsB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,CAAC,CAWrF;AAMD,wBAAgB,mBAAmB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,iBAAiB,CAAC,CAahF"}
|
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Debate & Synthesis parsers — extracted from eval-parsers.ts
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// Synthesis response
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
/**
 * Parse a synthesis reply into its component fields.
 *
 * When JSON can be recovered from the reply, each field is sanitised with the
 * module's helpers (`validScores`, `clampScore`, `clamp01`, `toStringArray`).
 * Otherwise the reply is handed to the plain-text fallback parser.
 *
 * @param response raw reply text
 * @returns an object with `dimensionScores`, `weightedTotal`, `confidence`
 *          and `dissenting` (JSON path), or the text-fallback result
 */
export function parseSynthesisResponse(response) {
    const payload = tryParseJson(response);
    if (!payload) {
        return parseSynthesisFromText(response);
    }
    const dimensionScores = validScores(payload.dimensionScores);
    const weightedTotal = clampScore(payload.weightedTotal);
    const confidence = clamp01(payload.confidence);
    const dissenting = toStringArray(payload.dissenting);
    return { dimensionScores, weightedTotal, confidence, dissenting };
}
|
|
19
|
+
// ---------------------------------------------------------------------------
|
|
20
|
+
// Debate verdict parsing
|
|
21
|
+
// ---------------------------------------------------------------------------
|
|
22
|
+
/**
 * Parse a debate reply into verdict, updated scores, critiques and reasoning.
 *
 * When JSON can be recovered from the reply, each field is sanitised with the
 * module's helpers; `reasoning` is kept only if it is a string. Otherwise the
 * verdict and scores are extracted from text patterns by the fallback parser.
 *
 * @param response raw reply text
 * @returns the extracted debate fields
 */
export function parseDebateResponse(response) {
    const payload = tryParseJson(response);
    if (!payload) {
        // No usable JSON: fall back to pattern matching on the raw text.
        return parseDebateFromText(response);
    }
    const { verdict, updatedScores, critiques, reasoning } = payload;
    let reasoningText;
    if (typeof reasoning === 'string') {
        reasoningText = reasoning;
    }
    return {
        verdict: parseVerdict(verdict),
        updatedScores: validScores(updatedScores),
        critiques: toStringArray(critiques),
        reasoning: reasoningText,
    };
}
|
|
35
|
+
// ---------------------------------------------------------------------------
|
|
36
|
+
// Internal: Text fallback parsers
|
|
37
|
+
// ---------------------------------------------------------------------------
|
|
38
|
+
/**
 * Text fallback for debate replies from which no JSON could be recovered.
 *
 * Looks for a `VERDICT: AGREE|DISAGREE|PARTIAL` marker (case-insensitive),
 * defaulting to 'PARTIAL' when absent, and collects `Name: <number>` or
 * `Name: <number>/10` pairs as scores. Values above 10 are ignored.
 *
 * @param text raw reply text
 * @returns `{ verdict, updatedScores }`; `updatedScores` is `undefined` when
 *          no score pair was found
 */
function parseDebateFromText(text) {
    const verdictMatch = text.match(/VERDICT\s*:\s*(AGREE|DISAGREE|PARTIAL)/i);
    // The capture group only admits the three valid verdicts, so upper-casing
    // is all the normalisation that is needed.
    const verdict = verdictMatch ? verdictMatch[1].toUpperCase() : 'PARTIAL';

    const scores = {};
    // A dimension name may contain spaces or tabs but must stay on one line.
    // Allowing any whitespace here let a name swallow the preceding line,
    // e.g. "Summary\nAccuracy: 8" produced the key "Summary\nAccuracy".
    const scorePattern = /(\w[\w \t]*?):\s*(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/g;
    for (const match of text.matchAll(scorePattern)) {
        const dim = match[1].trim();
        // Skip a "VERDICT: <n>" pair; the verdict was handled above.
        if (/^verdict$/i.test(dim)) {
            continue;
        }
        const val = parseFloat(match[2]);
        if (!isNaN(val) && val <= 10) {
            scores[dim] = val;
        }
    }

    return {
        verdict,
        updatedScores: Object.keys(scores).length > 0 ? scores : undefined,
    };
}
|
|
60
|
+
/**
 * Text fallback for synthesis replies: pulls a number that follows
 * "weighted total", "weighted average" or "weighted score" (case-insensitive,
 * optional colon) and clamps it with `clampScore`.
 *
 * @param text raw reply text
 * @returns `{ weightedTotal }`, where the value is `undefined` if no such
 *          number was found
 */
function parseSynthesisFromText(text) {
    const found = text.match(/weighted\s*(?:total|average|score)\s*:?\s*(\d+(?:\.\d+)?)/i);
    let weightedTotal;
    if (found) {
        weightedTotal = clampScore(parseFloat(found[1]));
    }
    return { weightedTotal };
}
|
|
66
|
+
// ---------------------------------------------------------------------------
|
|
67
|
+
// Internal: JSON parsing
|
|
68
|
+
// ---------------------------------------------------------------------------
|
|
69
|
+
/**
 * Best-effort extraction of a JSON object from a reply.
 *
 * Candidates are tried in order:
 *   1. the whole text,
 *   2. the contents of the first ``` / ```json fenced block,
 *   3. the span from the first `{` to the last `}`.
 *
 * Only plain objects are accepted. The callers in this module read named
 * fields off the result, so a reply that is valid JSON but not an object
 * (`8`, `"AGREE"`, `true`, an array) must not count as parsed: it would
 * silently yield all-default fields and skip the text fallback parsers.
 *
 * @param text raw reply text
 * @returns the parsed object, or `undefined` when no candidate yields one
 */
function tryParseJson(text) {
    const isPlainObject = (value) => value !== null && typeof value === 'object' && !Array.isArray(value);
    // Parses one candidate; malformed JSON and non-object values both give undefined.
    const attempt = (candidate) => {
        try {
            const value = JSON.parse(candidate);
            return isPlainObject(value) ? value : undefined;
        }
        catch {
            return undefined;
        }
    };

    const whole = attempt(text);
    if (whole !== undefined) {
        return whole;
    }

    const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) {
        const fromFence = attempt(fenced[1]);
        if (fromFence !== undefined) {
            return fromFence;
        }
    }

    const braced = text.match(/\{[\s\S]*\}/);
    if (braced) {
        const fromBraces = attempt(braced[0]);
        if (fromBraces !== undefined) {
            return fromBraces;
        }
    }

    return undefined;
}
|
|
96
|
+
// ---------------------------------------------------------------------------
|
|
97
|
+
// Internal: Utilities
|
|
98
|
+
// ---------------------------------------------------------------------------
|
|
99
|
+
/**
 * Keep only the numeric entries of a score map, clamping each with `clampScore`.
 *
 * @param scores candidate score map (any value is tolerated)
 * @returns a new object holding the clamped numeric entries; `{}` when the
 *          input is falsy or not an object
 */
function validScores(scores) {
    const usable = Boolean(scores) && typeof scores === 'object';
    if (!usable) {
        return {};
    }
    const cleaned = {};
    Object.entries(scores).forEach(([name, raw]) => {
        const numeric = typeof raw === 'number' && !isNaN(raw);
        if (numeric) {
            cleaned[name] = clampScore(raw);
        }
    });
    return cleaned;
}
|
|
109
|
+
/**
 * Clamp a value into the range [0, 1].
 *
 * @param val value to clamp
 * @returns 0 when `val` is `undefined` or NaN, otherwise `val` limited to [0, 1]
 */
function clamp01(val) {
    const unusable = val === undefined || isNaN(val);
    if (unusable) {
        return 0;
    }
    const capped = Math.min(1, val);
    return Math.max(0, capped);
}
|
|
114
|
+
/**
 * Clamp a value into the range [0, 10].
 *
 * @param val value to clamp
 * @returns 0 when `val` is `undefined` or NaN, otherwise `val` limited to [0, 10]
 */
function clampScore(val) {
    const unusable = val === undefined || isNaN(val);
    if (unusable) {
        return 0;
    }
    const capped = Math.min(10, val);
    return Math.max(0, capped);
}
|
|
119
|
+
/**
 * Return the string elements of an array, dropping everything else.
 *
 * @param arr candidate array (any value is tolerated)
 * @returns a new array of the string elements, in order; `[]` when `arr` is
 *          not an array
 */
function toStringArray(arr) {
    const strings = [];
    if (Array.isArray(arr)) {
        for (const item of arr) {
            if (typeof item === 'string') {
                strings.push(item);
            }
        }
    }
    return strings;
}
|
|
124
|
+
/**
 * Normalise a verdict value.
 *
 * @param val candidate verdict (any value is tolerated)
 * @returns the upper-cased verdict when `val` is a string equal, ignoring
 *          case, to AGREE, DISAGREE or PARTIAL; otherwise 'PARTIAL'
 */
function parseVerdict(val) {
    if (typeof val !== 'string') {
        return 'PARTIAL';
    }
    const normalized = val.toUpperCase();
    switch (normalized) {
        case 'AGREE':
        case 'DISAGREE':
        case 'PARTIAL':
            return normalized;
        default:
            return 'PARTIAL';
    }
}
|
|
133
|
+
//# sourceMappingURL=eval-parsers-debate-impl.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-parsers-debate-impl.js","sourceRoot":"","sources":["../../../../src/server/services/eval-parsers-debate-impl.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,8DAA8D;AAC9D,8EAA8E;AAiB9E,8EAA8E;AAC9E,qBAAqB;AACrB,8EAA8E;AAE9E,MAAM,UAAU,sBAAsB,CAAC,QAAgB;IACrD,MAAM,MAAM,GAAG,YAAY,CAAuB,QAAQ,CAAC,CAAC;IAC5D,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,eAAe,EAAE,WAAW,CAAC,MAAM,CAAC,eAAe,CAAC;YACpD,aAAa,EAAE,UAAU,CAAC,MAAM,CAAC,aAAa,CAAC;YAC/C,UAAU,EAAE,OAAO,CAAC,MAAM,CAAC,UAAU,CAAC;YACtC,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,UAAU,CAAC;SAC7C,CAAC;IACJ,CAAC;IACD,OAAO,sBAAsB,CAAC,QAAQ,CAAC,CAAC;AAC1C,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,MAAM,UAAU,mBAAmB,CAAC,QAAgB;IAClD,MAAM,MAAM,GAAG,YAAY,CAAoB,QAAQ,CAAC,CAAC;IACzD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,OAAO,EAAE,YAAY,CAAC,MAAM,CAAC,OAAO,CAAC;YACrC,aAAa,EAAE,WAAW,CAAC,MAAM,CAAC,aAAa,CAAC;YAChD,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,SAAS,CAAC;YAC1C,SAAS,EAAE,OAAO,MAAM,CAAC,SAAS,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,SAAS,CAAC,CAAC,CAAC,SAAS;SAC/E,CAAC;IACJ,CAAC;IAED,0DAA0D;IAC1D,OAAO,mBAAmB,CAAC,QAAQ,CAAC,CAAC;AACvC,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,SAAS,mBAAmB,CAAC,IAAY;IACvC,kEAAkE;IAClE,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,yCAAyC,CAAC,CAAC;IAC3E,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,CAAC,YAAY,CAAC,YAAY,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAoB,CAAC;IAEpF,sEAAsE;IACtE,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,iDAAiD,CAAC;IACvE,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,0CAA0C;QAC1C,IAAI,YAAY,CAAC,IAAI,CAAC,GAAG,CAAC;YAAE,SAAS;QACrC,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,EAAE;YAAE,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IAClD,CAAC;IAED,OAAO;QACL,OAAO;QACP,aAAa,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS;KACnE,CAAC;AACJ,CAAC;AAED,SAAS,sBAAsB,CAAC,IAAY;IAC1C,MAAM,UAAU,GAAG,IAAI,CAAC,KAA
K,CAC3B,4DAA4D,CAC7D,CAAC;IACF,OAAO;QACL,aAAa,EAAE,UAAU,CAAC,CAAC,CAAC,UAAU,CAAC,UAAU,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,SAAS;KAC9E,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,SAAS,YAAY,CAAI,IAAY;IACnC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAM,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,eAAe;IACjB,CAAC;IACD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC7D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAM,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;IAC7C,IAAI,UAAU,EAAE,CAAC;QACf,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAM,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAoBD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,SAAS,WAAW,CAClB,MAA0C;IAE1C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACrD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5C,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,OAAO,CAAC,GAAuB;IACtC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,GAAuB;IACzC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,aAAa,CAAC,GAAY;IACjC,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACnC,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;AAC/D,CAAC;AAED,SAAS,YAAY,CAAC,GAAY;IAChC,IAAI,OAAO,GAAG,KAAK,QAAQ,EAAE,CAAC;QAC5B,MAAM,KAAK,GAAG,GAAG,CAAC,WAAW,EAAE,CAAC;QAChC,IAAI,KAAK,KAAK,OAAO,IAAI,KAAK,KAAK,UAAU,IAAI,KAAK,KAAK,SAAS,EAAE,CAAC;YACrE,OAAO,KAAK,CAAC;QAC
f,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { IndividualEvaluation, InstructionCompliance } from '../types/evaluation.js';
|
|
2
|
+
export { parseSynthesisResponse, parseDebateResponse } from './eval-parsers-debate-impl.js';
|
|
3
|
+
export type { Verdict, DebateParseResult } from './eval-parsers-debate-impl.js';
|
|
4
|
+
/**
 * Structured data recovered from an evaluator's scoring response.
 * The parser returns this shape wrapped in `Partial`, so any field may be absent.
 */
export interface ScoreParseResult {
    /** Score per dimension, keyed by dimension name (the scoring prompt asks for 0-10). */
    readonly scores: { readonly [dimension: string]: number };
    /** Closeness of the agent's answer to the expected answer (the prompt asks for 0.0-1.0). */
    readonly overallCloseness: number;
    /** Critical requirements the evaluator reported as missed. */
    readonly missedCritical: ReadonlyArray<string>;
    /** Strengths listed by the evaluator. */
    readonly strengths: ReadonlyArray<string>;
    /** Weaknesses listed by the evaluator. */
    readonly weaknesses: ReadonlyArray<string>;
    /** Brief overall assessment text. */
    readonly summary: string;
}
|
|
12
|
+
/**
 * Parse an evaluator's scoring response into structured fields.
 * The implementation first tries to obtain a JSON value from the response and,
 * when none can be parsed, falls back to scanning the text for "name: number" scores.
 * @param response Raw text returned by the evaluator.
 * @returns The fields that could be recovered; on the text fallback only `scores` is populated.
 */
export declare function parseScoreResponse(response: string): Partial<ScoreParseResult>;
|
|
13
|
+
/**
 * Structured data recovered from an evaluator's instruction-compliance response.
 * The parser returns this shape wrapped in `Partial`, so any field may be absent.
 */
export interface ComplianceParseResult {
    /** Instructions the evaluator marked as followed. */
    readonly followed: ReadonlyArray<string>;
    /** Instructions the evaluator marked as violated. */
    readonly violated: ReadonlyArray<string>;
    /** Instructions with any other status (the prompt offers "not_applicable"). */
    readonly notApplicable: ReadonlyArray<string>;
    /** Overall compliance rating (the prompt asks for 0.0-1.0). */
    readonly overallCompliance: number;
}
|
|
19
|
+
/**
 * Parse an evaluator's instruction-compliance response.
 * The implementation first tries to obtain a JSON value from the response and sorts its
 * `results` entries by status; when no JSON can be parsed it falls back to a keyword scan.
 * @param response Raw text returned by the evaluator.
 * @returns The compliance fields that could be recovered.
 */
export declare function parseComplianceResponse(response: string): Partial<ComplianceParseResult>;
|
|
20
|
+
/**
 * Convert parsed compliance into our InstructionCompliance type.
 * Missing (null/undefined) list fields become empty arrays and a missing
 * `overallCompliance` becomes 0.
 */
|
|
21
|
+
export declare function toInstructionCompliance(result: Partial<ComplianceParseResult>): InstructionCompliance;
|
|
22
|
+
/**
 * Convert score parse result into IndividualEvaluation entries.
 * One entry is produced per key of `scores`, carrying `role` as the evaluator role,
 * the score clamped to the 0-10 range, and the matching `reasoningMap` text
 * (an empty string when the dimension has no reasoning).
 */
|
|
23
|
+
export declare function toIndividualEvaluations(scores: Readonly<Record<string, number>>, role: string, reasoningMap: Readonly<Record<string, string>>): IndividualEvaluation[];
|
|
24
|
+
//# sourceMappingURL=eval-parsers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-parsers.d.ts","sourceRoot":"","sources":["../../../../src/server/services/eval-parsers.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EACV,oBAAoB,EACpB,qBAAqB,EACtB,MAAM,wBAAwB,CAAC;AAGhC,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAC5F,YAAY,EAAE,OAAO,EAAE,iBAAiB,EAAE,MAAM,+BAA+B,CAAC;AAMhF,MAAM,WAAW,gBAAgB;IAC/B,QAAQ,CAAC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC;IAClD,QAAQ,CAAC,gBAAgB,EAAE,MAAM,CAAC;IAClC,QAAQ,CAAC,cAAc,EAAE,SAAS,MAAM,EAAE,CAAC;IAC3C,QAAQ,CAAC,SAAS,EAAE,SAAS,MAAM,EAAE,CAAC;IACtC,QAAQ,CAAC,UAAU,EAAE,SAAS,MAAM,EAAE,CAAC;IACvC,QAAQ,CAAC,OAAO,EAAE,MAAM,CAAC;CAC1B;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,gBAAgB,CAAC,CAa9E;AAMD,MAAM,WAAW,qBAAqB;IACpC,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,QAAQ,EAAE,SAAS,MAAM,EAAE,CAAC;IACrC,QAAQ,CAAC,aAAa,EAAE,SAAS,MAAM,EAAE,CAAC;IAC1C,QAAQ,CAAC,iBAAiB,EAAE,MAAM,CAAC;CACpC;AAED,wBAAgB,uBAAuB,CAAC,QAAQ,EAAE,MAAM,GAAG,OAAO,CAAC,qBAAqB,CAAC,CAMxF;AAED,qEAAqE;AACrE,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,OAAO,CAAC,qBAAqB,CAAC,GACrC,qBAAqB,CAOvB;AAMD,oEAAoE;AACpE,wBAAgB,uBAAuB,CACrC,MAAM,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,EACxC,IAAI,EAAE,MAAM,EACZ,YAAY,EAAE,QAAQ,CAAC,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC,GAC7C,oBAAoB,EAAE,CAOxB"}
|
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Evaluation Response Parsers — extract structured data from LLM responses
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// Re-export debate/synthesis parsers so existing imports continue to work
|
|
5
|
+
export { parseSynthesisResponse, parseDebateResponse } from './eval-parsers-debate-impl.js';
|
|
6
|
+
/**
 * Turn an evaluator's scoring response into structured fields.
 *
 * A truthy JSON value recovered from the response is normalised field by field;
 * otherwise the response is scanned as plain text for "name: number" scores.
 *
 * @param response Raw text returned by the evaluator.
 * @returns The recovered score fields.
 */
export function parseScoreResponse(response) {
    const payload = tryParseJson(response);
    if (!payload) {
        // Nothing usable was parsed as JSON: fall back to the text heuristics.
        return parseScoreFromText(response);
    }
    const normalised = {
        scores: validScores(payload.scores),
        overallCloseness: clamp01(payload.overallCloseness),
        missedCritical: toStringArray(payload.missedCritical),
        strengths: toStringArray(payload.strengths),
        weaknesses: toStringArray(payload.weaknesses),
        // Key is always present; its value stays undefined unless a string was supplied.
        summary: undefined,
    };
    if (typeof payload.summary === 'string') {
        normalised.summary = payload.summary;
    }
    return normalised;
}
|
|
20
|
+
/**
 * Turn an evaluator's compliance response into structured fields.
 *
 * @param response Raw text returned by the evaluator.
 * @returns The categorised JSON payload when a truthy JSON value could be
 *   recovered, otherwise the result of the keyword-based text fallback.
 */
export function parseComplianceResponse(response) {
    const payload = tryParseJson(response);
    return payload
        ? categorizeComplianceResults(payload)
        : parseComplianceFromText(response);
}
|
|
27
|
+
/**
 * Build a complete compliance record from a possibly partial parse result.
 *
 * @param result Parsed compliance data; any field may be null or undefined.
 * @returns An object with `followed`, `violated` and `notApplicable` defaulting
 *   to fresh empty arrays and `overallCompliance` defaulting to 0.
 */
export function toInstructionCompliance(result) {
    const { followed, violated, notApplicable, overallCompliance } = result;
    // `!= null` matches both null and undefined, mirroring nullish coalescing.
    return {
        followed: followed != null ? followed : [],
        violated: violated != null ? violated : [],
        notApplicable: notApplicable != null ? notApplicable : [],
        overallCompliance: overallCompliance != null ? overallCompliance : 0,
    };
}
|
|
36
|
+
// ---------------------------------------------------------------------------
|
|
37
|
+
// Helpers
|
|
38
|
+
// ---------------------------------------------------------------------------
|
|
39
|
+
/**
 * Expand a score map into one evaluation entry per dimension.
 *
 * @param scores Score per dimension name.
 * @param role Evaluator role recorded on every entry.
 * @param reasoningMap Reasoning text per dimension name.
 * @returns Entries in the key order of `scores`; each score is clamped to 0-10
 *   and a missing reasoning becomes an empty string.
 */
export function toIndividualEvaluations(scores, role, reasoningMap) {
    const evaluations = [];
    for (const [dimension, rawScore] of Object.entries(scores)) {
        const score = clampScore(rawScore);
        const reasoning = reasoningMap[dimension];
        evaluations.push({
            evaluatorRole: role,
            dimension,
            score,
            reasoning: reasoning != null ? reasoning : '',
        });
    }
    return evaluations;
}
|
|
48
|
+
// ---------------------------------------------------------------------------
|
|
49
|
+
// Internal: JSON parsing
|
|
50
|
+
// ---------------------------------------------------------------------------
|
|
51
|
+
/**
 * Best-effort extraction of a JSON value from free-form LLM output.
 *
 * Strategies are tried in order and the first successful parse wins:
 *   1. the whole text;
 *   2. the contents of the first fenced code block (``` or ```json);
 *   3. the span from the first "{" to the last "}";
 *   4. each balanced "{ ... }" span, scanning left to right.
 *
 * Step 4 recovers an object when the surrounding prose contains further braces,
 * which makes the greedy span in step 3 unparseable.
 *
 * @param text Raw response text.
 * @returns The parsed JSON value (whatever JSON.parse produced), or undefined
 *   when no strategy yields valid JSON.
 */
function tryParseJson(text) {
    // Wrap JSON.parse so that a legitimately parsed falsy value (null, 0, "")
    // can be told apart from a parse failure.
    const attempt = (candidate) => {
        try {
            return { ok: true, value: JSON.parse(candidate) };
        }
        catch {
            return { ok: false, value: undefined };
        }
    };
    const whole = attempt(text);
    if (whole.ok)
        return whole.value;
    const fenced = text.match(/```(?:json)?\s*([\s\S]*?)```/);
    if (fenced) {
        const fromFence = attempt(fenced[1]);
        if (fromFence.ok)
            return fromFence.value;
    }
    const greedy = text.match(/\{[\s\S]*\}/);
    if (greedy) {
        const fromGreedy = attempt(greedy[0]);
        if (fromGreedy.ok)
            return fromGreedy.value;
    }
    for (const candidate of balancedObjectSpans(text)) {
        const fromSpan = attempt(candidate);
        if (fromSpan.ok)
            return fromSpan.value;
    }
    return undefined;
}
/**
 * Yield substrings of `text` that start at a "{" and end at its matching "}",
 * ignoring braces that appear inside double-quoted strings.
 * Only the first 64 opening braces are considered so the scan stays bounded.
 */
function* balancedObjectSpans(text) {
    const maxStarts = 64;
    let starts = 0;
    for (let start = text.indexOf('{'); start !== -1 && starts < maxStarts; start = text.indexOf('{', start + 1)) {
        starts += 1;
        let depth = 0;
        let inString = false;
        let escaped = false;
        for (let i = start; i < text.length; i += 1) {
            const ch = text[i];
            if (inString) {
                if (escaped)
                    escaped = false;
                else if (ch === '\\')
                    escaped = true;
                else if (ch === '"')
                    inString = false;
                continue;
            }
            if (ch === '"') {
                inString = true;
            }
            else if (ch === '{') {
                depth += 1;
            }
            else if (ch === '}') {
                depth -= 1;
                if (depth === 0) {
                    yield text.slice(start, i + 1);
                    break;
                }
            }
        }
    }
}
|
|
78
|
+
// ---------------------------------------------------------------------------
|
|
79
|
+
// Internal: Text fallback parsers
|
|
80
|
+
// ---------------------------------------------------------------------------
|
|
81
|
+
/**
 * Text fallback for score extraction: collects "name: number" pairs,
 * optionally followed by "/10", from a free-form response.
 *
 * A dimension name must sit on a single line. Allowing line breaks inside the
 * name made it absorb preceding lines (e.g. "Overall notes\nAccuracy").
 * Values greater than 10 are ignored.
 *
 * @param text Raw response text.
 * @returns `{ scores }` where `scores` maps dimension name to value, or is
 *   undefined when no score was found.
 */
function parseScoreFromText(text) {
    const scores = {};
    // Name: a word character followed by word characters or horizontal whitespace
    // ([^\S\r\n] is "whitespace except CR/LF"), matched lazily up to the colon.
    const scorePattern = /(\w(?:\w|[^\S\r\n])*?):\s*(\d+(?:\.\d+)?)\s*(?:\/\s*10)?/g;
    for (const match of text.matchAll(scorePattern)) {
        const dim = match[1].trim();
        const val = parseFloat(match[2]);
        if (!isNaN(val) && val <= 10)
            scores[dim] = val;
    }
    return { scores: Object.keys(scores).length > 0 ? scores : undefined };
}
|
|
93
|
+
/**
 * Text fallback for compliance parsing: a coarse keyword scan used when the
 * response contains no parseable JSON.
 *
 * "compliant" only counts as evidence of compliance when it is not part of
 * "non-compliant" / "non compliant" / "noncompliant". Previously a response
 * saying only "non-compliant" was recorded as both followed and violated.
 * Other negations (e.g. "not followed") are not detected by this heuristic.
 *
 * @param text Raw response text.
 * @returns Placeholder entries in `followed` and/or `violated`, an empty
 *   `notApplicable`, and an undefined `overallCompliance`.
 */
function parseComplianceFromText(text) {
    const followed = [];
    const violated = [];
    if (/followed|(?<!non[-\s]?)compliant/i.test(text))
        followed.push('(extracted from text)');
    if (/violated|non[-\s]?compliant/i.test(text))
        violated.push('(extracted from text)');
    return { followed, violated, notApplicable: [], overallCompliance: undefined };
}
|
|
102
|
+
// ---------------------------------------------------------------------------
|
|
103
|
+
// Internal: Utilities
|
|
104
|
+
// ---------------------------------------------------------------------------
|
|
105
|
+
/**
 * Keep only the numeric, non-NaN entries of a score map, clamping each to 0-10.
 *
 * @param scores Candidate score map taken from parsed JSON.
 * @returns A new object; empty when `scores` is falsy or not an object.
 */
function validScores(scores) {
    const cleaned = {};
    if (scores && typeof scores === 'object') {
        Object.entries(scores).forEach(([name, value]) => {
            const usable = typeof value === 'number' && !isNaN(value);
            if (usable) {
                cleaned[name] = clampScore(value);
            }
        });
    }
    return cleaned;
}
|
|
115
|
+
/**
 * Clamp a value to the inclusive range [0, 1].
 *
 * @param val Value to clamp.
 * @returns 0 when `val` is undefined or NaN-like (per the global isNaN),
 *   otherwise `val` limited to 0..1.
 */
function clamp01(val) {
    const unusable = val === undefined || isNaN(val);
    return unusable ? 0 : Math.min(1, Math.max(0, val));
}
|
|
120
|
+
/**
 * Clamp a value to the inclusive score range [0, 10].
 *
 * @param val Value to clamp.
 * @returns 0 when `val` is undefined or NaN-like (per the global isNaN),
 *   otherwise `val` limited to 0..10.
 */
function clampScore(val) {
    const unusable = val === undefined || isNaN(val);
    return unusable ? 0 : Math.min(10, Math.max(0, val));
}
|
|
125
|
+
/**
 * Collect the string elements of an array.
 *
 * @param arr Any value.
 * @returns A new array holding only the string elements of `arr`, in order;
 *   an empty array when `arr` is not an array.
 */
function toStringArray(arr) {
    const strings = [];
    if (Array.isArray(arr)) {
        for (const item of arr) {
            if (typeof item === 'string') {
                strings.push(item);
            }
        }
    }
    return strings;
}
|
|
130
|
+
/**
 * Sort the `results` entries of a parsed compliance payload into buckets.
 *
 * The payload comes from model output, so entries are validated defensively:
 * a non-object entry (including null), a non-string `instruction`, or a
 * non-string `status` no longer throws. The instruction text falls back to
 * "(unknown)" and an unrecognised status lands in `notApplicable`.
 * Status comparison ignores case and surrounding whitespace.
 *
 * @param parsed Parsed JSON payload; `results` is used only when it is an array.
 * @returns `followed`, `violated` and `notApplicable` instruction lists plus
 *   `overallCompliance` clamped to 0..1.
 */
function categorizeComplianceResults(parsed) {
    const followed = [];
    const violated = [];
    const notApplicable = [];
    const entries = Array.isArray(parsed.results) ? parsed.results : [];
    for (const entry of entries) {
        // Treat anything that is not a plain record as an entry with no fields.
        const record = entry !== null && typeof entry === 'object' ? entry : {};
        const text = typeof record.instruction === 'string' ? record.instruction : '(unknown)';
        const status = typeof record.status === 'string' ? record.status.trim().toLowerCase() : '';
        if (status === 'followed')
            followed.push(text);
        else if (status === 'violated')
            violated.push(text);
        else
            notApplicable.push(text);
    }
    return {
        followed,
        violated,
        notApplicable,
        overallCompliance: clamp01(parsed.overallCompliance),
    };
}
|
|
153
|
+
//# sourceMappingURL=eval-parsers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-parsers.js","sourceRoot":"","sources":["../../../../src/server/services/eval-parsers.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,2EAA2E;AAC3E,8EAA8E;AAO9E,0EAA0E;AAC1E,OAAO,EAAE,sBAAsB,EAAE,mBAAmB,EAAE,MAAM,+BAA+B,CAAC;AAgB5F,MAAM,UAAU,kBAAkB,CAAC,QAAgB;IACjD,MAAM,MAAM,GAAG,YAAY,CAAmB,QAAQ,CAAC,CAAC;IACxD,IAAI,MAAM,EAAE,CAAC;QACX,OAAO;YACL,MAAM,EAAE,WAAW,CAAC,MAAM,CAAC,MAAM,CAAC;YAClC,gBAAgB,EAAE,OAAO,CAAC,MAAM,CAAC,gBAAgB,CAAC;YAClD,cAAc,EAAE,aAAa,CAAC,MAAM,CAAC,cAAc,CAAC;YACpD,SAAS,EAAE,aAAa,CAAC,MAAM,CAAC,SAAS,CAAC;YAC1C,UAAU,EAAE,aAAa,CAAC,MAAM,CAAC,UAAU,CAAC;YAC5C,OAAO,EAAE,OAAO,MAAM,CAAC,OAAO,KAAK,QAAQ,CAAC,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,CAAC,CAAC,SAAS;SACzE,CAAC;IACJ,CAAC;IACD,OAAO,kBAAkB,CAAC,QAAQ,CAAC,CAAC;AACtC,CAAC;AAaD,MAAM,UAAU,uBAAuB,CAAC,QAAgB;IACtD,MAAM,MAAM,GAAG,YAAY,CAAwB,QAAQ,CAAC,CAAC;IAC7D,IAAI,MAAM,EAAE,CAAC;QACX,OAAO,2BAA2B,CAAC,MAAM,CAAC,CAAC;IAC7C,CAAC;IACD,OAAO,uBAAuB,CAAC,QAAQ,CAAC,CAAC;AAC3C,CAAC;AAED,qEAAqE;AACrE,MAAM,UAAU,uBAAuB,CACrC,MAAsC;IAEtC,OAAO;QACL,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE;QAC/B,QAAQ,EAAE,MAAM,CAAC,QAAQ,IAAI,EAAE;QAC/B,aAAa,EAAE,MAAM,CAAC,aAAa,IAAI,EAAE;QACzC,iBAAiB,EAAE,MAAM,CAAC,iBAAiB,IAAI,CAAC;KACjD,CAAC;AACJ,CAAC;AAED,8EAA8E;AAC9E,UAAU;AACV,8EAA8E;AAE9E,oEAAoE;AACpE,MAAM,UAAU,uBAAuB,CACrC,MAAwC,EACxC,IAAY,EACZ,YAA8C;IAE9C,OAAO,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,SAAS,EAAE,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;QACzD,aAAa,EAAE,IAAI;QACnB,SAAS;QACT,KAAK,EAAE,UAAU,CAAC,KAAK,CAAC;QACxB,SAAS,EAAE,YAAY,CAAC,SAAS,CAAC,IAAI,EAAE;KACzC,CAAC,CAAC,CAAC;AACN,CAAC;AAED,8EAA8E;AAC9E,yBAAyB;AACzB,8EAA8E;AAE9E,SAAS,YAAY,CAAI,IAAY;IACnC,IAAI,CAAC;QACH,OAAO,IAAI,CAAC,KAAK,CAAC,IAAI,CAAM,CAAC;IAC/B,CAAC;IAAC,MAAM,CAAC;QACP,eAAe;IACjB,CAAC;IACD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,8BAA8B,CAAC,CAAC;IAC7D,IAAI,SAAS,EAAE,CAAC;QACd,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,SAAS,CAAC,CAAC,CAAC,CAAM,CAAC;QACvC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,MAAM,UAAU,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CA
AC,CAAC;IAC7C,IAAI,UAAU,EAAE,CAAC;QACf,IAAI,CAAC;YACH,OAAO,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,CAAC,CAAC,CAAM,CAAC;QACxC,CAAC;QAAC,MAAM,CAAC;YACP,eAAe;QACjB,CAAC;IACH,CAAC;IACD,OAAO,SAAS,CAAC;AACnB,CAAC;AAED,8EAA8E;AAC9E,kCAAkC;AAClC,8EAA8E;AAE9E,SAAS,kBAAkB,CAAC,IAAY;IACtC,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,MAAM,YAAY,GAAG,iDAAiD,CAAC;IACvE,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,YAAY,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAClD,MAAM,GAAG,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC5B,MAAM,GAAG,GAAG,UAAU,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;QACjC,IAAI,CAAC,KAAK,CAAC,GAAG,CAAC,IAAI,GAAG,IAAI,EAAE;YAAE,MAAM,CAAC,GAAG,CAAC,GAAG,GAAG,CAAC;IAClD,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,SAAS,EAAE,CAAC;AACzE,CAAC;AAED,SAAS,uBAAuB,CAAC,IAAY;IAC3C,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,IAAI,qBAAqB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,QAAQ,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;IAC7E,IAAI,yBAAyB,CAAC,IAAI,CAAC,IAAI,CAAC;QAAE,QAAQ,CAAC,IAAI,CAAC,uBAAuB,CAAC,CAAC;IACjF,OAAO,EAAE,QAAQ,EAAE,QAAQ,EAAE,aAAa,EAAE,EAAE,EAAE,iBAAiB,EAAE,SAAS,EAAE,CAAC;AACjF,CAAC;AAoBD,8EAA8E;AAC9E,sBAAsB;AACtB,8EAA8E;AAE9E,SAAS,WAAW,CAClB,MAA0C;IAE1C,IAAI,CAAC,MAAM,IAAI,OAAO,MAAM,KAAK,QAAQ;QAAE,OAAO,EAAE,CAAC;IACrD,MAAM,MAAM,GAA2B,EAAE,CAAC;IAC1C,KAAK,MAAM,CAAC,CAAC,EAAE,CAAC,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,CAAC;QAC5C,IAAI,OAAO,CAAC,KAAK,QAAQ,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC;YAAE,MAAM,CAAC,CAAC,CAAC,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IACpE,CAAC;IACD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED,SAAS,OAAO,CAAC,GAAuB;IACtC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC,CAAC;AACvC,CAAC;AAED,SAAS,UAAU,CAAC,GAAuB;IACzC,IAAI,GAAG,KAAK,SAAS,IAAI,KAAK,CAAC,GAAG,CAAC;QAAE,OAAO,CAAC,CAAC;IAC9C,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,EAAE,IAAI,CAAC,GAAG,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC;AACxC,CAAC;AAED,SAAS,aAAa,CAAC,GAAY;IACjC,IAAI,CAAC,KAA
K,CAAC,OAAO,CAAC,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACnC,OAAO,GAAG,CAAC,MAAM,CAAC,CAAC,CAAC,EAAe,EAAE,CAAC,OAAO,CAAC,KAAK,QAAQ,CAAC,CAAC;AAC/D,CAAC;AAED,SAAS,2BAA2B,CAClC,MAA6B;IAE7B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,aAAa,GAAa,EAAE,CAAC;IAEnC,IAAI,KAAK,CAAC,OAAO,CAAC,MAAM,CAAC,OAAO,CAAC,EAAE,CAAC;QAClC,KAAK,MAAM,CAAC,IAAI,MAAM,CAAC,OAAO,EAAE,CAAC;YAC/B,MAAM,IAAI,GAAG,CAAC,CAAC,WAAW,IAAI,WAAW,CAAC;YAC1C,MAAM,MAAM,GAAG,CAAC,CAAC,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,WAAW,EAAE,CAAC;YAC9C,IAAI,MAAM,KAAK,UAAU;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;iBAC1C,IAAI,MAAM,KAAK,UAAU;gBAAE,QAAQ,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;;gBAC/C,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChC,CAAC;IACH,CAAC;IAED,OAAO;QACL,QAAQ;QACR,QAAQ;QACR,aAAa;QACb,iBAAiB,EAAE,OAAO,CAAC,MAAM,CAAC,iBAAiB,CAAC;KACrD,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import type { Scenario, Provider } from '../types/index.js';
|
|
2
|
+
import type { IndividualEvaluation } from '../types/evaluation.js';
|
|
3
|
+
import type { TranscriptSummary } from './transcript-formatter.js';
|
|
4
|
+
import type { InstructionBlock } from './instruction-parser.js';
|
|
5
|
+
/**
 * Build the scoring prompt for one evaluator.
 * The prompt text contains the scenario's task prompt, expected answer, critical
 * requirements, grading guidelines and scoring dimensions, a behavior summary taken
 * from `summary`, the full `transcript`, and a request for a JSON reply with
 * `scores`, `overallCloseness`, `missedCritical`, `strengths`, `weaknesses` and `summary`.
 */
export declare function buildScorePrompt(transcript: string, scenario: Scenario, summary: TranscriptSummary): string;
|
|
6
|
+
/**
 * Build the instruction-compliance prompt.
 * The prompt lists `instructions` as a numbered list (each prefixed with its source),
 * includes the `transcript`, and asks for a JSON reply classifying every instruction as
 * followed, violated or not_applicable, plus an overall compliance rating.
 * The current implementation does not read `scenario`.
 */
export declare function buildCompliancePrompt(transcript: string, scenario: Scenario, instructions: readonly InstructionBlock[]): string;
|
|
7
|
+
/**
 * Build the prompt for one debate round.
 * The prompt states the round number, shows the evaluator's own previous assessment
 * followed by the other evaluators' assessments (numbered from 1), and asks for a JSON
 * reply with `verdict`, `updatedScores`, `critiques` and `reasoning`.
 */
export declare function buildDebatePrompt(myPreviousAssessment: string, otherAssessments: readonly string[], roundNumber: number): string;
|
|
8
|
+
/**
 * Build the final synthesis prompt.
 * The prompt contains the scenario name and task prompt, the scoring dimensions with
 * weights, the provider's name and model, one line per individual evaluation, and a
 * request for a JSON reply with `dimensionScores`, `weightedTotal`, `confidence`
 * and `dissenting`.
 */
export declare function buildSynthesisPrompt(allEvaluations: readonly IndividualEvaluation[], scenario: Scenario, provider: Provider): string;
|
|
9
|
+
//# sourceMappingURL=eval-prompts.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-prompts.d.ts","sourceRoot":"","sources":["../../../../src/server/services/eval-prompts.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAAE,QAAQ,EAAoB,QAAQ,EAAE,MAAM,mBAAmB,CAAC;AAC9E,OAAO,KAAK,EAAE,oBAAoB,EAAE,MAAM,wBAAwB,CAAC;AACnE,OAAO,KAAK,EAAE,iBAAiB,EAAE,MAAM,2BAA2B,CAAC;AACnE,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,yBAAyB,CAAC;AAMhE,wBAAgB,gBAAgB,CAC9B,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,QAAQ,EAClB,OAAO,EAAE,iBAAiB,GACzB,MAAM,CA+CR;AAMD,wBAAgB,qBAAqB,CACnC,UAAU,EAAE,MAAM,EAClB,QAAQ,EAAE,QAAQ,EAClB,YAAY,EAAE,SAAS,gBAAgB,EAAE,GACxC,MAAM,CA4BR;AAMD,wBAAgB,iBAAiB,CAC/B,oBAAoB,EAAE,MAAM,EAC5B,gBAAgB,EAAE,SAAS,MAAM,EAAE,EACnC,WAAW,EAAE,MAAM,GAClB,MAAM,CAyBR;AAMD,wBAAgB,oBAAoB,CAClC,cAAc,EAAE,SAAS,oBAAoB,EAAE,EAC/C,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,QAAQ,GACjB,MAAM,CAkCR"}
|
|
@@ -0,0 +1,164 @@
|
|
|
1
|
+
// ---------------------------------------------------------------------------
|
|
2
|
+
// Evaluation Prompt Builders
|
|
3
|
+
// ---------------------------------------------------------------------------
|
|
4
|
+
// ---------------------------------------------------------------------------
|
|
5
|
+
// Query 1: Score + answer comparison prompt
|
|
6
|
+
// ---------------------------------------------------------------------------
|
|
7
|
+
/**
 * Assemble the scoring prompt for an evaluator.
 *
 * @param transcript Full transcript text, embedded verbatim.
 * @param scenario Scenario supplying the task prompt, expected answer, critical
 *   requirements, grading guidelines and scoring dimensions.
 * @param summary Transcript summary supplying the tool-call sequence, files
 *   read/modified, command failures and the clarifying-questions flag.
 * @returns The prompt text, ending with the JSON structure the evaluator must return.
 */
export function buildScorePrompt(transcript, scenario, summary) {
    const dimensions = formatDimensions(scenario.scoringDimensions);
    let toolSequence = 'No tool calls recorded.';
    if (summary.toolCallSequence.length > 0) {
        toolSequence = `Tool call sequence: ${summary.toolCallSequence.join(' → ')}`;
    }
    // Comma-separated list, or the word "none" when the list is empty.
    const listOrNone = (items) => (items.length > 0 ? items.join(', ') : 'none');
    return `You are an expert evaluator assessing an AI agent's performance on a task.

## Task Description
${scenario.prompt}

## Expected Answer
${scenario.expectedAnswer}

## Critical Requirements
${formatCriticalRequirements(scenario.criticalRequirements)}

## Grading Guidelines
${scenario.gradingGuidelines || 'No specific grading guidelines provided.'}

## Scoring Dimensions
${dimensions}

## Agent Behavior Summary
${toolSequence}
Files read: ${listOrNone(summary.filesRead)}
Files modified: ${listOrNone(summary.filesModified)}
Command failures: ${summary.commandFailures.length}
Asked clarifying questions: ${summary.askedClarifyingQuestions ? 'yes' : 'no'}

## Full Transcript
${transcript}

## Instructions
Evaluate the agent's output. For each scoring dimension, provide a score from 0-10.
Also assess how closely the agent's final answer matches the expected answer (0.0-1.0).
Identify any critical requirements that were missed, as well as strengths and weaknesses.

Respond with valid JSON matching this structure:
{
"scores": { "<dimension_name>": <0-10>, ... },
"overallCloseness": <0.0-1.0>,
"missedCritical": ["<requirement that was not met>", ...],
"strengths": ["<strength>", ...],
"weaknesses": ["<weakness>", ...],
"summary": "<brief overall assessment>"
}`;
}
|
|
54
|
+
// ---------------------------------------------------------------------------
|
|
55
|
+
// Query 2: Instruction compliance prompt
|
|
56
|
+
// ---------------------------------------------------------------------------
|
|
57
|
+
/**
 * Assemble the instruction-compliance prompt.
 *
 * @param transcript Agent transcript text, embedded verbatim.
 * @param scenario Not read by this function.
 * @param instructions Instruction blocks; each is listed as "N. [source] text".
 * @returns The prompt text, ending with the JSON structure the evaluator must return.
 */
export function buildCompliancePrompt(transcript, scenario, instructions) {
    const numbered = [];
    instructions.forEach((block, index) => {
        numbered.push(`${index + 1}. [${block.source}] ${block.text}`);
    });
    const instructionList = numbered.join('\n');
    return `You are an expert evaluator checking whether an AI agent followed its configured instructions.

## Instructions to Check
${instructionList || 'No instructions configured.'}

## Agent Transcript
${transcript}

## Instructions
For each instruction listed above, determine if the agent:
- "followed" it (clear evidence of compliance)
- "violated" it (clear evidence of non-compliance)
- "not_applicable" (instruction was not relevant to this task)

Also rate overall compliance from 0.0 to 1.0.

Respond with valid JSON:
{
"results": [
{ "instruction": "<instruction text>", "status": "followed|violated|not_applicable", "evidence": "<brief evidence>" }
],
"overallCompliance": <0.0-1.0>
}`;
}
|
|
85
|
+
// ---------------------------------------------------------------------------
|
|
86
|
+
// Multi-round debate prompt
|
|
87
|
+
// ---------------------------------------------------------------------------
|
|
88
|
+
/**
 * Assemble the prompt for one round of the multi-evaluator debate.
 *
 * @param myPreviousAssessment This evaluator's previous assessment, embedded verbatim.
 * @param otherAssessments Assessments from the other evaluators; each is placed
 *   under a "### Other Evaluator N" heading, numbered from 1.
 * @param roundNumber Round number shown in the opening sentence.
 * @returns The prompt text, ending with the JSON structure the evaluator must return.
 */
export function buildDebatePrompt(myPreviousAssessment, otherAssessments, roundNumber) {
    const sections = [];
    otherAssessments.forEach((assessment, index) => {
        sections.push(`### Other Evaluator ${index + 1}\n${assessment}`);
    });
    const othersFormatted = sections.join('\n\n');
    return `You are participating in round ${roundNumber} of a multi-evaluator review.

## Your Previous Assessment
${myPreviousAssessment}

## Other Evaluators' Assessments
${othersFormatted}

## Instructions
Review the other evaluators' assessments and compare them with your own.
Determine if you agree, partially agree, or disagree with the emerging consensus.
Provide your updated scores if you have changed your mind, and explain why.

Respond with valid JSON:
{
"verdict": "AGREE|DISAGREE|PARTIAL",
"updatedScores": { "<dimension_name>": <0-10>, ... },
"critiques": ["<specific point of disagreement>", ...],
"reasoning": "<why you agree/disagree>"
}`;
}
|
|
113
|
+
// ---------------------------------------------------------------------------
|
|
114
|
+
// Synthesis prompt (final aggregation)
|
|
115
|
+
// ---------------------------------------------------------------------------
|
|
116
|
+
/**
 * Assemble the final synthesis prompt that aggregates all individual evaluations.
 *
 * @param allEvaluations Individual evaluations; each is rendered as
 *   "[role] dimension: score/10 — reasoning" on its own line.
 * @param scenario Scenario supplying the name, task prompt and scoring dimensions.
 * @param provider Provider whose name and model are shown in the prompt.
 * @returns The prompt text, ending with the JSON structure the synthesizer must return.
 */
export function buildSynthesisPrompt(allEvaluations, scenario, provider) {
    const lines = [];
    for (const evaluation of allEvaluations) {
        lines.push(`[${evaluation.evaluatorRole}] ${evaluation.dimension}: ${evaluation.score}/10 — ${evaluation.reasoning}`);
    }
    const evalSummaries = lines.join('\n');
    const dimensions = formatDimensions(scenario.scoringDimensions);
    return `You are the final synthesizer for a multi-evaluator assessment.

## Scenario
${scenario.name}: ${scenario.prompt}

## Scoring Dimensions & Weights
${dimensions}

## Provider
Name: ${provider.name}
Model: ${provider.model}

## All Individual Evaluations
${evalSummaries}

## Instructions
Synthesize all evaluations into final scores. Weight each dimension according to the scoring
dimensions defined above. Identify areas of evaluator consensus and disagreement.
Provide a confidence level (0.0-1.0) based on evaluator agreement.

Respond with valid JSON:
{
"dimensionScores": { "<dimension_name>": <0-10>, ... },
"weightedTotal": <weighted average 0-10>,
"confidence": <0.0-1.0>,
"dissenting": ["<areas where evaluators disagreed>", ...]
}`;
}
|
|
149
|
+
// ---------------------------------------------------------------------------
|
|
150
|
+
// Internal helpers
|
|
151
|
+
// ---------------------------------------------------------------------------
|
|
152
|
+
/**
 * Render scoring dimensions as a bulleted list for inclusion in a prompt.
 *
 * @param dims Dimensions, each with `name`, `weight` and `description`.
 * @returns One "- name (weight: w): description" line per dimension joined by
 *   newlines, or a fixed placeholder sentence when the list is empty.
 */
function formatDimensions(dims) {
    if (dims.length === 0) {
        return 'No dimensions defined. Use general quality assessment.';
    }
    const bullets = [];
    for (const dim of dims) {
        bullets.push(`- ${dim.name} (weight: ${dim.weight}): ${dim.description}`);
    }
    return bullets.join('\n');
}
|
|
159
|
+
/**
 * Render critical requirements as a numbered list for inclusion in a prompt.
 *
 * @param reqs Requirement texts.
 * @returns "N. text" lines numbered from 1 and joined by newlines, or
 *   "None specified." when the list is empty.
 */
function formatCriticalRequirements(reqs) {
    if (reqs.length === 0) {
        return 'None specified.';
    }
    const numbered = [];
    reqs.forEach((requirement, index) => {
        numbered.push(`${index + 1}. ${requirement}`);
    });
    return numbered.join('\n');
}
|
|
164
|
+
//# sourceMappingURL=eval-prompts.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"eval-prompts.js","sourceRoot":"","sources":["../../../../src/server/services/eval-prompts.ts"],"names":[],"mappings":"AAAA,8EAA8E;AAC9E,6BAA6B;AAC7B,8EAA8E;AAO9E,8EAA8E;AAC9E,4CAA4C;AAC5C,8EAA8E;AAE9E,MAAM,UAAU,gBAAgB,CAC9B,UAAkB,EAClB,QAAkB,EAClB,OAA0B;IAE1B,MAAM,UAAU,GAAG,gBAAgB,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAChE,MAAM,YAAY,GAAG,OAAO,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC;QACtD,CAAC,CAAC,uBAAuB,OAAO,CAAC,gBAAgB,CAAC,IAAI,CAAC,KAAK,CAAC,EAAE;QAC/D,CAAC,CAAC,yBAAyB,CAAC;IAE9B,OAAO;;;EAGP,QAAQ,CAAC,MAAM;;;EAGf,QAAQ,CAAC,cAAc;;;EAGvB,0BAA0B,CAAC,QAAQ,CAAC,oBAAoB,CAAC;;;EAGzD,QAAQ,CAAC,iBAAiB,IAAI,0CAA0C;;;EAGxE,UAAU;;;EAGV,YAAY;cACA,OAAO,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;kBAChE,OAAO,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,MAAM;oBAC1E,OAAO,CAAC,eAAe,CAAC,MAAM;8BACpB,OAAO,CAAC,wBAAwB,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,IAAI;;;EAG3E,UAAU;;;;;;;;;;;;;;;EAeV,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,yCAAyC;AACzC,8EAA8E;AAE9E,MAAM,UAAU,qBAAqB,CACnC,UAAkB,EAClB,QAAkB,EAClB,YAAyC;IAEzC,MAAM,eAAe,GAAG,YAAY;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,CAAC,CAAC,MAAM,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;SAClD,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;;EAGP,eAAe,IAAI,6BAA6B;;;EAGhD,UAAU;;;;;;;;;;;;;;;;EAgBV,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,4BAA4B;AAC5B,8EAA8E;AAE9E,MAAM,UAAU,iBAAiB,CAC/B,oBAA4B,EAC5B,gBAAmC,EACnC,WAAmB;IAEnB,MAAM,eAAe,GAAG,gBAAgB;SACrC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,uBAAuB,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SACnD,IAAI,CAAC,MAAM,CAAC,CAAC;IAEhB,OAAO,kCAAkC,WAAW;;;EAGpD,oBAAoB;;;EAGpB,eAAe;;;;;;;;;;;;;EAaf,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,uCAAuC;AACvC,8EAA8E;AAE9E,MAAM,UAAU,oBAAoB,CAClC,cAA+C,EAC/C,QAAkB,EAClB,QAAkB;IAElB,MAAM,aAAa,GAAG,cAAc;SACjC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,CAAC,aAAa,KAAK,CAAC,CAAC,SAAS,KAAK,CAAC,CAAC,KAAK,SAAS,CAAC,CAAC,SAAS,EAAE,CAAC;SACjF,IAAI,CAAC,IAAI,CAAC,CAA
C;IAEd,MAAM,UAAU,GAAG,gBAAgB,CAAC,QAAQ,CAAC,iBAAiB,CAAC,CAAC;IAEhE,OAAO;;;EAGP,QAAQ,CAAC,IAAI,KAAK,QAAQ,CAAC,MAAM;;;EAGjC,UAAU;;;QAGJ,QAAQ,CAAC,IAAI;SACZ,QAAQ,CAAC,KAAK;;;EAGrB,aAAa;;;;;;;;;;;;;EAab,CAAC;AACH,CAAC;AAED,8EAA8E;AAC9E,mBAAmB;AACnB,8EAA8E;AAE9E,SAAS,gBAAgB,CAAC,IAAiC;IACzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,wDAAwD,CAAC;IACvF,OAAO,IAAI;SACR,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,KAAK,CAAC,CAAC,IAAI,aAAa,CAAC,CAAC,MAAM,MAAM,CAAC,CAAC,WAAW,EAAE,CAAC;SACjE,IAAI,CAAC,IAAI,CAAC,CAAC;AAChB,CAAC;AAED,SAAS,0BAA0B,CAAC,IAAuB;IACzD,IAAI,IAAI,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,iBAAiB,CAAC;IAChD,OAAO,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AACzD,CAAC"}
|
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
import type { IEvaluator, EvaluationCallbacks } from '../interfaces/evaluator.js';
|
|
2
|
+
import type { Run, Scenario, Provider, Evaluation, EvaluationRequest } from '../types/index.js';
|
|
3
|
+
/**
 * Evaluator class exported by this module; declared as implementing IEvaluator.
 * NOTE(review): the private member names suggest a first round, debate rounds and a
 * synthesis step, but the implementation is not visible in this declaration — confirm
 * against evaluator.js before relying on that.
 */
export declare class EvaluationOrchestrator implements IEvaluator {
|
|
4
|
+
/**
 * Evaluate one run of a scenario for a provider.
 * Resolves to an Evaluation; `callbacks` is an EvaluationCallbacks object
 * (presumably used for progress reporting — confirm against the interface).
 */
evaluateRun(run: Run, scenario: Scenario, provider: Provider, request: EvaluationRequest, callbacks: EvaluationCallbacks): Promise<Evaluation>;
|
|
5
|
+
private runRound1;
|
|
6
|
+
private runDebateRound;
|
|
7
|
+
private runSynthesis;
|
|
8
|
+
private runQuery;
|
|
9
|
+
}
|
|
10
|
+
//# sourceMappingURL=evaluator.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../../../src/server/services/evaluator.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAAE,UAAU,EAAE,mBAAmB,EAAmB,MAAM,4BAA4B,CAAC;AACnG,OAAO,KAAK,EACV,GAAG,EACH,QAAQ,EACR,QAAQ,EACR,UAAU,EACV,iBAAiB,EAMlB,MAAM,mBAAmB,CAAC;AA6B3B,qBAAa,sBAAuB,YAAW,UAAU;IACjD,WAAW,CACf,GAAG,EAAE,GAAG,EACR,QAAQ,EAAE,QAAQ,EAClB,QAAQ,EAAE,QAAQ,EAClB,OAAO,EAAE,iBAAiB,EAC1B,SAAS,EAAE,mBAAmB,GAC7B,OAAO,CAAC,UAAU,CAAC;YA0ER,SAAS;YA6BT,cAAc;YAyBd,YAAY;YAiBZ,QAAQ;CA6BvB"}
|