@plune-ai/cairn 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +674 -0
- package/README.md +181 -0
- package/dist/agent/graph.d.ts +538 -0
- package/dist/agent/graph.d.ts.map +1 -0
- package/dist/agent/graph.js +0 -0
- package/dist/agent/graph.js.map +1 -0
- package/dist/agent/index.d.ts +83 -0
- package/dist/agent/index.d.ts.map +1 -0
- package/dist/agent/index.js +407 -0
- package/dist/agent/index.js.map +1 -0
- package/dist/analyze/index.d.ts +22 -0
- package/dist/analyze/index.d.ts.map +1 -0
- package/dist/analyze/index.js +39 -0
- package/dist/analyze/index.js.map +1 -0
- package/dist/artifacts/index.d.ts +29 -0
- package/dist/artifacts/index.d.ts.map +1 -0
- package/dist/artifacts/index.js +68 -0
- package/dist/artifacts/index.js.map +1 -0
- package/dist/artifacts/report.d.ts +21 -0
- package/dist/artifacts/report.d.ts.map +1 -0
- package/dist/artifacts/report.js +56 -0
- package/dist/artifacts/report.js.map +1 -0
- package/dist/artifacts/testcase-md.d.ts +37 -0
- package/dist/artifacts/testcase-md.d.ts.map +1 -0
- package/dist/artifacts/testcase-md.js +91 -0
- package/dist/artifacts/testcase-md.js.map +1 -0
- package/dist/browser/backends/playwright-cli.d.ts +23 -0
- package/dist/browser/backends/playwright-cli.d.ts.map +1 -0
- package/dist/browser/backends/playwright-cli.js +85 -0
- package/dist/browser/backends/playwright-cli.js.map +1 -0
- package/dist/browser/backends/playwright-lib.d.ts +32 -0
- package/dist/browser/backends/playwright-lib.d.ts.map +1 -0
- package/dist/browser/backends/playwright-lib.js +157 -0
- package/dist/browser/backends/playwright-lib.js.map +1 -0
- package/dist/browser/gateway.d.ts +33 -0
- package/dist/browser/gateway.d.ts.map +1 -0
- package/dist/browser/gateway.js +2 -0
- package/dist/browser/gateway.js.map +1 -0
- package/dist/browser/index.d.ts +15 -0
- package/dist/browser/index.d.ts.map +1 -0
- package/dist/browser/index.js +59 -0
- package/dist/browser/index.js.map +1 -0
- package/dist/browser/types.d.ts +99 -0
- package/dist/browser/types.d.ts.map +1 -0
- package/dist/browser/types.js +6 -0
- package/dist/browser/types.js.map +1 -0
- package/dist/checklist/index.d.ts +24 -0
- package/dist/checklist/index.d.ts.map +1 -0
- package/dist/checklist/index.js +65 -0
- package/dist/checklist/index.js.map +1 -0
- package/dist/cli/branding.d.ts +14 -0
- package/dist/cli/branding.d.ts.map +1 -0
- package/dist/cli/branding.js +14 -0
- package/dist/cli/branding.js.map +1 -0
- package/dist/cli/index.d.ts +12 -0
- package/dist/cli/index.d.ts.map +1 -0
- package/dist/cli/index.js +322 -0
- package/dist/cli/index.js.map +1 -0
- package/dist/cli/lex-bot.d.ts +3 -0
- package/dist/cli/lex-bot.d.ts.map +1 -0
- package/dist/cli/lex-bot.js +11 -0
- package/dist/cli/lex-bot.js.map +1 -0
- package/dist/codegen/index.d.ts +36 -0
- package/dist/codegen/index.d.ts.map +1 -0
- package/dist/codegen/index.js +63 -0
- package/dist/codegen/index.js.map +1 -0
- package/dist/codegen/schema.d.ts +14 -0
- package/dist/codegen/schema.d.ts.map +1 -0
- package/dist/codegen/schema.js +9 -0
- package/dist/codegen/schema.js.map +1 -0
- package/dist/config/env.d.ts +18 -0
- package/dist/config/env.d.ts.map +1 -0
- package/dist/config/env.js +42 -0
- package/dist/config/env.js.map +1 -0
- package/dist/config/index.d.ts +11 -0
- package/dist/config/index.d.ts.map +1 -0
- package/dist/config/index.js +74 -0
- package/dist/config/index.js.map +1 -0
- package/dist/config/profiles.d.ts +7 -0
- package/dist/config/profiles.d.ts.map +1 -0
- package/dist/config/profiles.js +28 -0
- package/dist/config/profiles.js.map +1 -0
- package/dist/config/schema.d.ts +91 -0
- package/dist/config/schema.d.ts.map +1 -0
- package/dist/config/schema.js +20 -0
- package/dist/config/schema.js.map +1 -0
- package/dist/design/index.d.ts +36 -0
- package/dist/design/index.d.ts.map +1 -0
- package/dist/design/index.js +35 -0
- package/dist/design/index.js.map +1 -0
- package/dist/design/schema.d.ts +109 -0
- package/dist/design/schema.d.ts.map +1 -0
- package/dist/design/schema.js +35 -0
- package/dist/design/schema.js.map +1 -0
- package/dist/eval/collect.d.ts +18 -0
- package/dist/eval/collect.d.ts.map +1 -0
- package/dist/eval/collect.js +53 -0
- package/dist/eval/collect.js.map +1 -0
- package/dist/eval/experiment.d.ts +49 -0
- package/dist/eval/experiment.d.ts.map +1 -0
- package/dist/eval/experiment.js +66 -0
- package/dist/eval/experiment.js.map +1 -0
- package/dist/eval/judge.d.ts +30 -0
- package/dist/eval/judge.d.ts.map +1 -0
- package/dist/eval/judge.js +47 -0
- package/dist/eval/judge.js.map +1 -0
- package/dist/eval/pilot.d.ts +21 -0
- package/dist/eval/pilot.d.ts.map +1 -0
- package/dist/eval/pilot.js +24 -0
- package/dist/eval/pilot.js.map +1 -0
- package/dist/eval/scorers.d.ts +23 -0
- package/dist/eval/scorers.d.ts.map +1 -0
- package/dist/eval/scorers.js +38 -0
- package/dist/eval/scorers.js.map +1 -0
- package/dist/index.d.ts +21 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +13 -0
- package/dist/index.js.map +1 -0
- package/dist/knowledge/index.d.ts +7 -0
- package/dist/knowledge/index.d.ts.map +1 -0
- package/dist/knowledge/index.js +39 -0
- package/dist/knowledge/index.js.map +1 -0
- package/dist/llm/factory.d.ts +31 -0
- package/dist/llm/factory.d.ts.map +1 -0
- package/dist/llm/factory.js +48 -0
- package/dist/llm/factory.js.map +1 -0
- package/dist/llm/index.d.ts +5 -0
- package/dist/llm/index.d.ts.map +1 -0
- package/dist/llm/index.js +3 -0
- package/dist/llm/index.js.map +1 -0
- package/dist/llm/structured.d.ts +30 -0
- package/dist/llm/structured.d.ts.map +1 -0
- package/dist/llm/structured.js +58 -0
- package/dist/llm/structured.js.map +1 -0
- package/dist/llm/vision.d.ts +16 -0
- package/dist/llm/vision.d.ts.map +1 -0
- package/dist/llm/vision.js +4 -0
- package/dist/llm/vision.js.map +1 -0
- package/dist/observe/index.d.ts +21 -0
- package/dist/observe/index.d.ts.map +1 -0
- package/dist/observe/index.js +18 -0
- package/dist/observe/index.js.map +1 -0
- package/dist/observe/parse-aria.d.ts +8 -0
- package/dist/observe/parse-aria.d.ts.map +1 -0
- package/dist/observe/parse-aria.js +71 -0
- package/dist/observe/parse-aria.js.map +1 -0
- package/dist/probe/index.d.ts +19 -0
- package/dist/probe/index.d.ts.map +1 -0
- package/dist/probe/index.js +38 -0
- package/dist/probe/index.js.map +1 -0
- package/dist/promote/index.d.ts +6 -0
- package/dist/promote/index.d.ts.map +1 -0
- package/dist/promote/index.js +4 -0
- package/dist/promote/index.js.map +1 -0
- package/dist/promote/promote-case.d.ts +12 -0
- package/dist/promote/promote-case.d.ts.map +1 -0
- package/dist/promote/promote-case.js +103 -0
- package/dist/promote/promote-case.js.map +1 -0
- package/dist/promote/selectors.d.ts +29 -0
- package/dist/promote/selectors.d.ts.map +1 -0
- package/dist/promote/selectors.js +58 -0
- package/dist/promote/selectors.js.map +1 -0
- package/dist/prompts/index.d.ts +32 -0
- package/dist/prompts/index.d.ts.map +1 -0
- package/dist/prompts/index.js +55 -0
- package/dist/prompts/index.js.map +1 -0
- package/dist/prompts/local/identify-elements.d.ts +6 -0
- package/dist/prompts/local/identify-elements.d.ts.map +1 -0
- package/dist/prompts/local/identify-elements.js +21 -0
- package/dist/prompts/local/identify-elements.js.map +1 -0
- package/dist/prompts/local/index.d.ts +3 -0
- package/dist/prompts/local/index.d.ts.map +1 -0
- package/dist/prompts/local/index.js +18 -0
- package/dist/prompts/local/index.js.map +1 -0
- package/dist/prompts/local/judge-checklist-coverage.d.ts +3 -0
- package/dist/prompts/local/judge-checklist-coverage.d.ts.map +1 -0
- package/dist/prompts/local/judge-checklist-coverage.js +11 -0
- package/dist/prompts/local/judge-checklist-coverage.js.map +1 -0
- package/dist/prompts/local/judge-test-cases.d.ts +3 -0
- package/dist/prompts/local/judge-test-cases.d.ts.map +1 -0
- package/dist/prompts/local/judge-test-cases.js +11 -0
- package/dist/prompts/local/judge-test-cases.js.map +1 -0
- package/dist/prompts/local/pilot-review.d.ts +3 -0
- package/dist/prompts/local/pilot-review.d.ts.map +1 -0
- package/dist/prompts/local/pilot-review.js +13 -0
- package/dist/prompts/local/pilot-review.js.map +1 -0
- package/dist/prompts/local/qa-manual-test-designer.d.ts +7 -0
- package/dist/prompts/local/qa-manual-test-designer.d.ts.map +1 -0
- package/dist/prompts/local/qa-manual-test-designer.js +13 -0
- package/dist/prompts/local/qa-manual-test-designer.js.map +1 -0
- package/dist/prompts/local/qa-playwright-ts-writer.d.ts +6 -0
- package/dist/prompts/local/qa-playwright-ts-writer.d.ts.map +1 -0
- package/dist/prompts/local/qa-playwright-ts-writer.js +40 -0
- package/dist/prompts/local/qa-playwright-ts-writer.js.map +1 -0
- package/dist/prompts/local/qa-testcase-from-ui.d.ts +6 -0
- package/dist/prompts/local/qa-testcase-from-ui.d.ts.map +1 -0
- package/dist/prompts/local/qa-testcase-from-ui.js +52 -0
- package/dist/prompts/local/qa-testcase-from-ui.js.map +1 -0
- package/dist/session/index.d.ts +27 -0
- package/dist/session/index.d.ts.map +1 -0
- package/dist/session/index.js +74 -0
- package/dist/session/index.js.map +1 -0
- package/dist/telemetry/index.d.ts +21 -0
- package/dist/telemetry/index.d.ts.map +1 -0
- package/dist/telemetry/index.js +26 -0
- package/dist/telemetry/index.js.map +1 -0
- package/dist/tui/App.d.ts +6 -0
- package/dist/tui/App.d.ts.map +1 -0
- package/dist/tui/App.js +61 -0
- package/dist/tui/App.js.map +1 -0
- package/dist/tui/components/error-boundary.d.ts +17 -0
- package/dist/tui/components/error-boundary.d.ts.map +1 -0
- package/dist/tui/components/error-boundary.js +20 -0
- package/dist/tui/components/error-boundary.js.map +1 -0
- package/dist/tui/components/field.d.ts +10 -0
- package/dist/tui/components/field.d.ts.map +1 -0
- package/dist/tui/components/field.js +8 -0
- package/dist/tui/components/field.js.map +1 -0
- package/dist/tui/components/help.d.ts +5 -0
- package/dist/tui/components/help.d.ts.map +1 -0
- package/dist/tui/components/help.js +7 -0
- package/dist/tui/components/help.js.map +1 -0
- package/dist/tui/components/log-pane.d.ts +6 -0
- package/dist/tui/components/log-pane.d.ts.map +1 -0
- package/dist/tui/components/log-pane.js +10 -0
- package/dist/tui/components/log-pane.js.map +1 -0
- package/dist/tui/components/node-checklist.d.ts +6 -0
- package/dist/tui/components/node-checklist.d.ts.map +1 -0
- package/dist/tui/components/node-checklist.js +9 -0
- package/dist/tui/components/node-checklist.js.map +1 -0
- package/dist/tui/components/pilot-badge.d.ts +6 -0
- package/dist/tui/components/pilot-badge.d.ts.map +1 -0
- package/dist/tui/components/pilot-badge.js +12 -0
- package/dist/tui/components/pilot-badge.js.map +1 -0
- package/dist/tui/components/scores-table.d.ts +6 -0
- package/dist/tui/components/scores-table.d.ts.map +1 -0
- package/dist/tui/components/scores-table.js +9 -0
- package/dist/tui/components/scores-table.js.map +1 -0
- package/dist/tui/components/scrollable-text.d.ts +6 -0
- package/dist/tui/components/scrollable-text.d.ts.map +1 -0
- package/dist/tui/components/scrollable-text.js +22 -0
- package/dist/tui/components/scrollable-text.js.map +1 -0
- package/dist/tui/components/session-picker.d.ts +5 -0
- package/dist/tui/components/session-picker.d.ts.map +1 -0
- package/dist/tui/components/session-picker.js +16 -0
- package/dist/tui/components/session-picker.js.map +1 -0
- package/dist/tui/components/test-case-list.d.ts +7 -0
- package/dist/tui/components/test-case-list.d.ts.map +1 -0
- package/dist/tui/components/test-case-list.js +10 -0
- package/dist/tui/components/test-case-list.js.map +1 -0
- package/dist/tui/hooks/use-run-artifacts.d.ts +14 -0
- package/dist/tui/hooks/use-run-artifacts.d.ts.map +1 -0
- package/dist/tui/hooks/use-run-artifacts.js +37 -0
- package/dist/tui/hooks/use-run-artifacts.js.map +1 -0
- package/dist/tui/hooks/use-runner.d.ts +25 -0
- package/dist/tui/hooks/use-runner.d.ts.map +1 -0
- package/dist/tui/hooks/use-runner.js +116 -0
- package/dist/tui/hooks/use-runner.js.map +1 -0
- package/dist/tui/hooks/use-runs.d.ts +14 -0
- package/dist/tui/hooks/use-runs.d.ts.map +1 -0
- package/dist/tui/hooks/use-runs.js +57 -0
- package/dist/tui/hooks/use-runs.js.map +1 -0
- package/dist/tui/hooks/use-sessions.d.ts +10 -0
- package/dist/tui/hooks/use-sessions.d.ts.map +1 -0
- package/dist/tui/hooks/use-sessions.js +32 -0
- package/dist/tui/hooks/use-sessions.js.map +1 -0
- package/dist/tui/hooks/use-stdout-dimensions.d.ts +3 -0
- package/dist/tui/hooks/use-stdout-dimensions.d.ts.map +1 -0
- package/dist/tui/hooks/use-stdout-dimensions.js +18 -0
- package/dist/tui/hooks/use-stdout-dimensions.js.map +1 -0
- package/dist/tui/index.d.ts +7 -0
- package/dist/tui/index.d.ts.map +1 -0
- package/dist/tui/index.js +13 -0
- package/dist/tui/index.js.map +1 -0
- package/dist/tui/router-context.d.ts +12 -0
- package/dist/tui/router-context.d.ts.map +1 -0
- package/dist/tui/router-context.js +14 -0
- package/dist/tui/router-context.js.map +1 -0
- package/dist/tui/router.d.ts +44 -0
- package/dist/tui/router.d.ts.map +1 -0
- package/dist/tui/router.js +22 -0
- package/dist/tui/router.js.map +1 -0
- package/dist/tui/screens/form-screen.d.ts +7 -0
- package/dist/tui/screens/form-screen.d.ts.map +1 -0
- package/dist/tui/screens/form-screen.js +104 -0
- package/dist/tui/screens/form-screen.js.map +1 -0
- package/dist/tui/screens/launcher-screen.d.ts +2 -0
- package/dist/tui/screens/launcher-screen.d.ts.map +1 -0
- package/dist/tui/screens/launcher-screen.js +28 -0
- package/dist/tui/screens/launcher-screen.js.map +1 -0
- package/dist/tui/screens/run-dashboard-screen.d.ts +7 -0
- package/dist/tui/screens/run-dashboard-screen.d.ts.map +1 -0
- package/dist/tui/screens/run-dashboard-screen.js +32 -0
- package/dist/tui/screens/run-dashboard-screen.js.map +1 -0
- package/dist/tui/screens/run-detail-screen.d.ts +5 -0
- package/dist/tui/screens/run-detail-screen.d.ts.map +1 -0
- package/dist/tui/screens/run-detail-screen.js +67 -0
- package/dist/tui/screens/run-detail-screen.js.map +1 -0
- package/dist/tui/screens/runs-list-screen.d.ts +2 -0
- package/dist/tui/screens/runs-list-screen.d.ts.map +1 -0
- package/dist/tui/screens/runs-list-screen.js +22 -0
- package/dist/tui/screens/runs-list-screen.js.map +1 -0
- package/dist/tui/screens/summary-screen.d.ts +6 -0
- package/dist/tui/screens/summary-screen.d.ts.map +1 -0
- package/dist/tui/screens/summary-screen.js +53 -0
- package/dist/tui/screens/summary-screen.js.map +1 -0
- package/dist/tui/theme.d.ts +31 -0
- package/dist/tui/theme.d.ts.map +1 -0
- package/dist/tui/theme.js +70 -0
- package/dist/tui/theme.js.map +1 -0
- package/dist/tui/types.d.ts +38 -0
- package/dist/tui/types.d.ts.map +1 -0
- package/dist/tui/types.js +2 -0
- package/dist/tui/types.js.map +1 -0
- package/dist/validate/index.d.ts +30 -0
- package/dist/validate/index.d.ts.map +1 -0
- package/dist/validate/index.js +39 -0
- package/dist/validate/index.js.map +1 -0
- package/dist/validate/runner.d.ts +15 -0
- package/dist/validate/runner.d.ts.map +1 -0
- package/dist/validate/runner.js +74 -0
- package/dist/validate/runner.js.map +1 -0
- package/package.json +97 -0
|
@@ -0,0 +1,66 @@
|
|
|
1
|
+
import { designTestCases } from "../design/index.js";
|
|
2
|
+
import { deterministicScores } from "./scorers.js";
|
|
3
|
+
import { judgeTestCases } from "./judge.js";
|
|
4
|
+
/**
 * Run every dataset item through the design stage once per prompt variant, average each
 * metric per variant, and — only when exactly two variants are given — compare them
 * (first = baseline, second = candidate) to produce a verdict (B2 self-improvement).
 *
 * @param items Dataset items; each provides `study`, `pageSemantics` and optionally `verified`.
 * @param variants Prompt variants; each provides `label` and `prompts`.
 * @param deps `designInvoke` (required by the design stage) and optional `judgeInvoke`.
 * @param opts Optional verdict settings: `target` (default "grounding"), `threshold`
 *   (default 0.05), `tolerance` (default 0.02) and `guardrails` (default: the four built-in metrics).
 * @returns `{ perVariant, verdict }`; `verdict` is `undefined` unless there are exactly two variants.
 */
export async function runExperiment(items, variants, deps, opts = {}) {
    // Arithmetic mean, summed left to right starting from 0.
    const average = (samples) => {
        let total = 0;
        for (const sample of samples) {
            total += sample;
        }
        return total / samples.length;
    };
    const perVariant = [];
    for (const variant of variants) {
        // Metric name -> every value observed for that metric across the items.
        const samplesByMetric = {};
        for (const item of items) {
            let verified = item.verified;
            if (verified === undefined || verified === null) {
                // No verification data on the item: treat every studied element as verified once.
                verified = item.study.elements.map((element) => ({ ...element, count: 1, verified: true }));
            }
            const designInput = {
                study: item.study,
                pageSemantics: item.pageSemantics,
                elements: verified.filter((entry) => entry.verified),
            };
            const testCases = await designTestCases(designInput, {
                invoke: deps.designInvoke,
                prompts: variant.prompts,
            });
            const scores = deterministicScores({ study: item.study, verified, testCases });
            if (deps.judgeInvoke) {
                // The LLM judge is optional; its scores are appended to the deterministic ones.
                const judged = await judgeTestCases(testCases, item.pageSemantics, deps.judgeInvoke, variant.prompts);
                scores.push(...judged);
            }
            for (const score of scores) {
                const bucket = samplesByMetric[score.name] ?? (samplesByMetric[score.name] = []);
                bucket.push(score.value);
            }
        }
        const meanScores = Object.fromEntries(Object.entries(samplesByMetric).map(([metric, samples]) => [metric, average(samples)]));
        perVariant.push({ label: variant.label, meanScores, itemCount: items.length });
    }
    // A verdict is a baseline-vs-candidate comparison, so it needs exactly two variants.
    if (perVariant.length !== 2) {
        return { perVariant, verdict: undefined };
    }
    const [baseline, candidate] = perVariant;
    const target = opts.target ?? "grounding";
    const threshold = opts.threshold ?? 0.05;
    const tolerance = opts.tolerance ?? 0.02;
    const guardrails = opts.guardrails ?? [
        "grounding",
        "test_case_quality",
        "methodology_adherence",
        "locator_quality",
    ];
    // A target metric missing from a variant counts as 0.
    const delta = (candidate.meanScores[target] ?? 0) - (baseline.meanScores[target] ?? 0);
    // A guardrail regresses when the candidate falls more than `tolerance` below the baseline.
    // The target itself and metrics missing on either side are never reported.
    const guardrailRegressions = guardrails.filter((metric) => {
        const before = baseline.meanScores[metric];
        const after = candidate.meanScores[metric];
        return metric !== target && before !== undefined && after !== undefined && after < before - tolerance;
    });
    const improved = guardrailRegressions.length === 0 && delta >= threshold;
    const verdict = {
        target,
        baseline: baseline.label,
        candidate: candidate.label,
        delta,
        improved,
        guardrailRegressions,
    };
    return { perVariant, verdict };
}
|
|
66
|
+
//# sourceMappingURL=experiment.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"experiment.js","sourceRoot":"","sources":["../../src/eval/experiment.ts"],"names":[],"mappings":"AAIA,OAAO,EAAE,eAAe,EAAE,MAAM,oBAAoB,CAAC;AACrD,OAAO,EAAE,mBAAmB,EAAE,MAAM,cAAc,CAAC;AACnD,OAAO,EAAE,cAAc,EAAE,MAAM,YAAY,CAAC;AAgD5C;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CACjC,KAAoB,EACpB,QAAmB,EACnB,IAAoB,EACpB,OAA0B,EAAE;IAE5B,MAAM,UAAU,GAAoB,EAAE,CAAC;IAEvC,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,MAAM,SAAS,GAA6B,EAAE,CAAC;QAC/C,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;YACzB,MAAM,QAAQ,GACZ,IAAI,CAAC,QAAQ,IAAI,IAAI,CAAC,KAAK,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,KAAK,EAAE,CAAC,EAAE,QAAQ,EAAE,IAAI,EAAE,CAAC,CAAC,CAAC;YACxF,MAAM,SAAS,GAAG,MAAM,eAAe,CACrC;gBACE,KAAK,EAAE,IAAI,CAAC,KAAK;gBACjB,aAAa,EAAE,IAAI,CAAC,aAAa;gBACjC,QAAQ,EAAE,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC;aAC7C,EACD,EAAE,MAAM,EAAE,IAAI,CAAC,YAAY,EAAE,OAAO,EAAE,OAAO,CAAC,OAAO,EAAE,CACxD,CAAC;YACF,MAAM,MAAM,GAAG,mBAAmB,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,QAAQ,EAAE,SAAS,EAAE,CAAC,CAAC;YAC/E,IAAI,IAAI,CAAC,WAAW,EAAE,CAAC;gBACrB,MAAM,CAAC,IAAI,CACT,GAAG,CAAC,MAAM,cAAc,CAAC,SAAS,EAAE,IAAI,CAAC,aAAa,EAAE,IAAI,CAAC,WAAW,EAAE,OAAO,CAAC,OAAO,CAAC,CAAC,CAC5F,CAAC;YACJ,CAAC;YACD,KAAK,MAAM,CAAC,IAAI,MAAM;gBAAE,CAAC,SAAS,CAAC,CAAC,CAAC,IAAI,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC;QACnE,CAAC;QAED,MAAM,UAAU,GAA2B,EAAE,CAAC;QAC9C,KAAK,MAAM,CAAC,IAAI,EAAE,IAAI,CAAC,IAAI,MAAM,CAAC,OAAO,CAAC,SAAS,CAAC,EAAE,CAAC;YACrD,UAAU,CAAC,IAAI,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC,GAAG,CAAC,EAAE,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC;QACnE,CAAC;QACD,UAAU,CAAC,IAAI,CAAC,EAAE,KAAK,EAAE,OAAO,CAAC,KAAK,EAAE,UAAU,EAAE,SAAS,EAAE,KAAK,CAAC,MAAM,EAAE,CAAC,CAAC;IACjF,CAAC;IAED,IAAI,OAAsC,CAAC;IAC3C,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,MAAM,IAAI,GAAG,UAAU,CAAC,CAAC,CAAC,CAAC;IAC3B,IAAI,UAAU,CAAC,MAAM,KAAK,CAAC,IAAI,IAAI,IAAI,IAAI,EAAE,CAAC;QAC5C,MAAM,MAAM,GAAG,IAAI,CAAC,MAAM,IAAI,WAAW,CAAC;QAC1C,MAAM,SAAS,GAAG
,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC;QACzC,MAAM,SAAS,GAAG,IAAI,CAAC,SAAS,IAAI,IAAI,CAAC;QACzC,MAAM,UAAU,GAAG,IAAI,CAAC,UAAU,IAAI;YACpC,WAAW;YACX,mBAAmB;YACnB,uBAAuB;YACvB,iBAAiB;SAClB,CAAC;QACF,MAAM,KAAK,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC;QAC9E,MAAM,oBAAoB,GAAG,UAAU,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE;YACnD,IAAI,CAAC,KAAK,MAAM;gBAAE,OAAO,KAAK,CAAC;YAC/B,MAAM,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC7B,MAAM,CAAC,GAAG,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,CAAC;YAC7B,IAAI,CAAC,KAAK,SAAS,IAAI,CAAC,KAAK,SAAS;gBAAE,OAAO,KAAK,CAAC;YACrD,OAAO,CAAC,GAAG,CAAC,GAAG,SAAS,CAAC;QAC3B,CAAC,CAAC,CAAC;QACH,OAAO,GAAG;YACR,MAAM;YACN,QAAQ,EAAE,IAAI,CAAC,KAAK;YACpB,SAAS,EAAE,IAAI,CAAC,KAAK;YACrB,KAAK;YACL,QAAQ,EAAE,KAAK,IAAI,SAAS,IAAI,oBAAoB,CAAC,MAAM,KAAK,CAAC;YACjE,oBAAoB;SACrB,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,UAAU,EAAE,OAAO,EAAE,CAAC;AACjC,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { StructuredInvoke } from "../llm/structured.js";
|
|
3
|
+
import type { PromptRegistry } from "../prompts/index.js";
|
|
4
|
+
import type { TestCase } from "../design/index.js";
|
|
5
|
+
import type { Score } from "./scorers.js";
|
|
6
|
+
/** Zod schema of the structured reply returned by the test-case judge. */
export declare const JudgeSchema: z.ZodObject<{
|
|
7
|
+
/** Judge score for the overall quality of the test cases. */
test_case_quality: z.ZodNumber;
|
|
8
|
+
/** Judge score for how well the cases follow the test-design methodology. */
methodology_adherence: z.ZodNumber;
|
|
9
|
+
/** Free-text remark from the judge. */
comment: z.ZodString;
|
|
10
|
+
}, z.core.$strip>;
|
|
11
|
+
/**
 * LLM-as-judge (SDK-side, cheap tier) — subjective evaluation of cases (ADR-0006).
 * The prompt is versioned in the registry ("judge-test-cases"), like the methodology prompts.
 *
 * @param testCases Test cases to be evaluated.
 * @param pageSemantics Page description passed to the judge prompt.
 * @param invoke Structured-output invoker used to query the judge.
 * @param prompts Registry from which the judge prompt is obtained.
 * @returns The scores reported by the judge.
 */
|
|
15
|
+
export declare function judgeTestCases(testCases: TestCase[], pageSemantics: string, invoke: StructuredInvoke, prompts: PromptRegistry): Promise<Score[]>;
|
|
16
|
+
/** Zod schema of the structured reply returned by the checklist-coverage judge. */
export declare const ChecklistCoverageSchema: z.ZodObject<{
|
|
17
|
+
/** Numeric coverage score reported by the judge. */
coverage: z.ZodNumber;
|
|
18
|
+
/** Uncovered checklist entries as strings; the field has a default, so the judge may omit it. */
uncovered: z.ZodDefault<z.ZodArray<z.ZodString>>;
|
|
19
|
+
}, z.core.$strip>;
|
|
20
|
+
/**
 * Semantic coverage of the checklist by the cases (LLM judge) — understands meaning regardless of LANGUAGE.
 * The prompt is versioned in the registry ("judge-checklist-coverage").
 *
 * @param checklistItems Checklist entries; only their `text` is required.
 * @param testCases Test cases whose coverage of the checklist is judged.
 * @param invoke Structured-output invoker used to query the judge.
 * @param prompts Registry from which the judge prompt is obtained.
 * @returns A numeric coverage `value` and a textual `comment`.
 */
|
|
24
|
+
export declare function judgeChecklistCoverage(checklistItems: {
|
|
25
|
+
text: string;
|
|
26
|
+
}[], testCases: TestCase[], invoke: StructuredInvoke, prompts: PromptRegistry): Promise<{
|
|
27
|
+
value: number;
|
|
28
|
+
comment: string;
|
|
29
|
+
}>;
|
|
30
|
+
//# sourceMappingURL=judge.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.d.ts","sourceRoot":"","sources":["../../src/eval/judge.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,KAAK,EAAE,KAAK,EAAE,MAAM,cAAc,CAAC;AAE1C,eAAO,MAAM,WAAW;;;;iBAItB,CAAC;AAEH;;;GAGG;AACH,wBAAsB,cAAc,CAClC,SAAS,EAAE,QAAQ,EAAE,EACrB,aAAa,EAAE,MAAM,EACrB,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,cAAc,GACtB,OAAO,CAAC,KAAK,EAAE,CAAC,CAWlB;AAED,eAAO,MAAM,uBAAuB;;;iBAGlC,CAAC;AAEH;;;GAGG;AACH,wBAAsB,sBAAsB,CAC1C,cAAc,EAAE;IAAE,IAAI,EAAE,MAAM,CAAA;CAAE,EAAE,EAClC,SAAS,EAAE,QAAQ,EAAE,EACrB,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,cAAc,GACtB,OAAO,CAAC;IAAE,KAAK,EAAE,MAAM,CAAC;IAAC,OAAO,EAAE,MAAM,CAAA;CAAE,CAAC,CAY7C"}
|
|
@@ -0,0 +1,47 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { HumanMessage } from "@langchain/core/messages";
|
|
3
|
+
/**
 * Structured reply requested from the test-case judge: two scores, each limited to the
 * closed range [0, 1], and a free-text comment.
 */
export const JudgeSchema = (() => {
    // Both scores carry the same bounds, so they are built by one factory.
    const unitScore = () => z.number().min(0).max(1);
    return z.object({
        test_case_quality: unitScore(),
        methodology_adherence: unitScore(),
        comment: z.string(),
    });
})();
|
|
8
|
+
/**
 * LLM-as-judge (SDK-side, cheap tier): subjective scoring of designed test cases (ADR-0006).
 * The prompt text comes from the registry entry "judge-test-cases".
 *
 * @param testCases Cases to score; each is rendered as "- [technique] title: steps ⇒ expected".
 * @param pageSemantics Page description substituted into the prompt.
 * @param invoke Structured invoker, called with `JudgeSchema` and a single human message.
 * @param prompts Registry queried through `getPrompt`.
 * @returns `[]` when there are no test cases (the judge is not called); otherwise two scores:
 *   "test_case_quality" (carrying the judge's comment) and "methodology_adherence".
 */
export async function judgeTestCases(testCases, pageSemantics, invoke, prompts) {
    if (testCases.length === 0) {
        // Nothing to judge, so skip the model call entirely.
        return [];
    }
    const caseLines = testCases.map((tc) => {
        const steps = tc.steps.join("; ");
        return `- [${tc.technique}] ${tc.title}: ${steps} ⇒ ${tc.expected}`;
    });
    const cases = caseLines.join("\n");
    const prompt = await prompts.getPrompt("judge-test-cases", { pageSemantics, cases });
    const message = new HumanMessage(prompt.text);
    const { test_case_quality, methodology_adherence, comment } = await invoke(JudgeSchema, [message]);
    const qualityScore = { name: "test_case_quality", value: test_case_quality, comment };
    const methodologyScore = { name: "methodology_adherence", value: methodology_adherence };
    return [qualityScore, methodologyScore];
}
|
|
25
|
+
/**
 * Structured reply requested from the checklist-coverage judge: a coverage score in the
 * closed range [0, 1] and a list of uncovered entries that defaults to an empty array.
 */
export const ChecklistCoverageSchema = (() => {
    const coverage = z.number().min(0).max(1);
    const uncovered = z.array(z.string()).default([]);
    return z.object({ coverage, uncovered });
})();
|
|
29
|
+
/**
 * Semantic coverage of a checklist by the test cases, as estimated by an LLM judge
 * (meaning-based, so independent of the language the texts are written in).
 * The prompt text comes from the registry entry "judge-checklist-coverage".
 *
 * @param checklistItems Checklist entries; rendered as a 1-based numbered list of their `text`.
 * @param testCases Cases rendered as "- title: steps ⇒ expected".
 * @param invoke Structured invoker, called with `ChecklistCoverageSchema` and a single human message.
 * @param prompts Registry queried through `getPrompt`.
 * @returns `{ value: 0, comment: "" }` for an empty checklist (the judge is not called);
 *   otherwise the judge's coverage plus either the uncovered entries or "full coverage".
 */
export async function judgeChecklistCoverage(checklistItems, testCases, invoke, prompts) {
    if (checklistItems.length === 0) {
        // No checklist means nothing to cover; skip the model call.
        return { value: 0, comment: "" };
    }
    const numberedItems = checklistItems.map((entry, position) => `${position + 1}. ${entry.text}`);
    const caseLines = testCases.map((tc) => {
        const steps = tc.steps.join("; ");
        return `- ${tc.title}: ${steps} ⇒ ${tc.expected}`;
    });
    const items = numberedItems.join("\n");
    const cases = caseLines.join("\n");
    const prompt = await prompts.getPrompt("judge-checklist-coverage", { items, cases });
    const message = new HumanMessage(prompt.text);
    const { coverage, uncovered } = await invoke(ChecklistCoverageSchema, [message]);
    let comment = "full coverage";
    if (uncovered.length > 0) {
        comment = `uncovered: ${uncovered.join("; ")}`;
    }
    return { value: coverage, comment };
}
|
|
47
|
+
//# sourceMappingURL=judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"judge.js","sourceRoot":"","sources":["../../src/eval/judge.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AAMxD,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,iBAAiB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC3C,qBAAqB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAC/C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;CACpB,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAClC,SAAqB,EACrB,aAAqB,EACrB,MAAwB,EACxB,OAAuB;IAEvB,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACtC,MAAM,KAAK,GAAG,SAAS;SACpB,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,SAAS,KAAK,EAAE,CAAC,KAAK,KAAK,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC;SACvF,IAAI,CAAC,IAAI,CAAC,CAAC;IACd,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,kBAAkB,EAAE,EAAE,aAAa,EAAE,KAAK,EAAE,CAAC,CAAC;IACrF,MAAM,CAAC,GAAG,MAAM,MAAM,CAAC,WAAW,EAAE,CAAC,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACrE,OAAO;QACL,EAAE,IAAI,EAAE,mBAAmB,EAAE,KAAK,EAAE,CAAC,CAAC,iBAAiB,EAAE,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE;QAC7E,EAAE,IAAI,EAAE,uBAAuB,EAAE,KAAK,EAAE,CAAC,CAAC,qBAAqB,EAAE;KAClE,CAAC;AACJ,CAAC;AAED,MAAM,CAAC,MAAM,uBAAuB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC9C,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IAClC,SAAS,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,EAAE,CAAC;CAC3C,CAAC,CAAC;AAEH;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,sBAAsB,CAC1C,cAAkC,EAClC,SAAqB,EACrB,MAAwB,EACxB,OAAuB;IAEvB,IAAI,cAAc,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,KAAK,EAAE,CAAC,EAAE,OAAO,EAAE,EAAE,EAAE,CAAC;IAClE,MAAM,KAAK,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC7E,MAAM,KAAK,GAAG,SAAS;SACpB,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,KAAK,EAAE,CAAC,KAAK,KAAK,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE,CAAC;SACrE,IAAI,CAAC,IAAI,CAAC,CAAC;IACd,MAAM,
MAAM,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,0BAA0B,EAAE,EAAE,KAAK,EAAE,KAAK,EAAE,CAAC,CAAC;IACrF,MAAM,CAAC,GAAG,MAAM,MAAM,CAAC,uBAAuB,EAAE,CAAC,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;IACjF,OAAO;QACL,KAAK,EAAE,CAAC,CAAC,QAAQ;QACjB,OAAO,EAAE,CAAC,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,cAAc,CAAC,CAAC,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC,CAAC,eAAe;KAC3F,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import type { StructuredInvoke } from "../llm/structured.js";
|
|
3
|
+
import type { PromptRegistry } from "../prompts/index.js";
|
|
4
|
+
import type { TestCase } from "../design/index.js";
|
|
5
|
+
import type { ValidationReport } from "../validate/index.js";
|
|
6
|
+
/** Zod schema of the structured reply returned by the pilot supervisor. */
export declare const PilotSchema: z.ZodObject<{
|
|
7
|
+
/** Holistic run verdict: one of "pass", "needs-work" or "fail". */
verdict: z.ZodEnum<{
|
|
8
|
+
pass: "pass";
|
|
9
|
+
"needs-work": "needs-work";
|
|
10
|
+
fail: "fail";
|
|
11
|
+
}>;
|
|
12
|
+
/** Textual justification of the verdict. */
reason: z.ZodString;
|
|
13
|
+
/** Textual guidance accompanying the verdict. */
guidance: z.ZodString;
|
|
14
|
+
}, z.core.$strip>;
|
|
15
|
+
/** Value type inferred from `PilotSchema`: a verdict together with its reason and guidance. */
export type PilotVerdict = z.infer<typeof PilotSchema>;
|
|
16
|
+
/**
 * Pilot supervisor (idea from explorbot): a holistic run verdict (pass/needs-work/fail) + guidance.
 * Complements the per-metric judge with a single decision on "whether the run is good enough".
 *
 * @param pageSemantics Page description passed to the review prompt.
 * @param validation Validation report of the run, or `undefined` when no validation was performed.
 * @param testCases Test cases produced by the run.
 * @param invoke Structured-output invoker used to query the reviewer.
 * @param prompts Registry from which the review prompt is obtained.
 * @returns The reviewer's verdict.
 */
|
|
20
|
+
export declare function pilotReview(pageSemantics: string, validation: ValidationReport | undefined, testCases: TestCase[], invoke: StructuredInvoke, prompts: PromptRegistry): Promise<PilotVerdict>;
|
|
21
|
+
//# sourceMappingURL=pilot.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pilot.d.ts","sourceRoot":"","sources":["../../src/eval/pilot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAE7D,eAAO,MAAM,WAAW;;;;;;;;iBAItB,CAAC;AACH,MAAM,MAAM,YAAY,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,WAAW,CAAC,CAAC;AAEvD;;;GAGG;AACH,wBAAsB,WAAW,CAC/B,aAAa,EAAE,MAAM,EACrB,UAAU,EAAE,gBAAgB,GAAG,SAAS,EACxC,SAAS,EAAE,QAAQ,EAAE,EACrB,MAAM,EAAE,gBAAgB,EACxB,OAAO,EAAE,cAAc,GACtB,OAAO,CAAC,YAAY,CAAC,CAWvB"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { z } from "zod";
|
|
2
|
+
import { HumanMessage } from "@langchain/core/messages";
|
|
3
|
+
/**
 * Structured reply requested from the pilot supervisor: a verdict restricted to
 * "pass" / "needs-work" / "fail", plus a textual reason and textual guidance.
 */
export const PilotSchema = (() => {
    const verdict = z.enum(["pass", "needs-work", "fail"]);
    const reason = z.string();
    const guidance = z.string();
    return z.object({ verdict, reason, guidance });
})();
|
|
8
|
+
/**
|
|
9
|
+
* Pilot supervisor (idea from explorbot): a holistic run verdict (pass/needs-work/fail) + guidance.
|
|
10
|
+
* Complements the per-metric judge with a single decision on "whether the run is good enough".
|
|
11
|
+
*/
|
|
12
|
+
export async function pilotReview(pageSemantics, validation, testCases, invoke, prompts) {
|
|
13
|
+
const validationText = validation
|
|
14
|
+
? `${Math.round(validation.greenRatio * 100)}% green (${validation.results.length} tests, flaky: ${validation.flakyCount})`
|
|
15
|
+
: "(no validation — case design only)";
|
|
16
|
+
const cases = testCases.map((tc) => `- [${tc.type}/${tc.priority}] ${tc.title}`).join("\n");
|
|
17
|
+
const prompt = await prompts.getPrompt("pilot-review", {
|
|
18
|
+
pageSemantics,
|
|
19
|
+
validation: validationText,
|
|
20
|
+
cases,
|
|
21
|
+
});
|
|
22
|
+
return invoke(PilotSchema, [new HumanMessage(prompt.text)]);
|
|
23
|
+
}
|
|
24
|
+
//# sourceMappingURL=pilot.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"pilot.js","sourceRoot":"","sources":["../../src/eval/pilot.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AACxB,OAAO,EAAE,YAAY,EAAE,MAAM,0BAA0B,CAAC;AAMxD,MAAM,CAAC,MAAM,WAAW,GAAG,CAAC,CAAC,MAAM,CAAC;IAClC,OAAO,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,YAAY,EAAE,MAAM,CAAC,CAAC;IAC/C,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE;CACrB,CAAC,CAAC;AAGH;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,WAAW,CAC/B,aAAqB,EACrB,UAAwC,EACxC,SAAqB,EACrB,MAAwB,EACxB,OAAuB;IAEvB,MAAM,cAAc,GAAG,UAAU;QAC/B,CAAC,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,UAAU,CAAC,UAAU,GAAG,GAAG,CAAC,YAAY,UAAU,CAAC,OAAO,CAAC,MAAM,kBAAkB,UAAU,CAAC,UAAU,GAAG;QAC3H,CAAC,CAAC,oCAAoC,CAAC;IACzC,MAAM,KAAK,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,EAAE,EAAE,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,IAAI,EAAE,CAAC,QAAQ,KAAK,EAAE,CAAC,KAAK,EAAE,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC5F,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,SAAS,CAAC,cAAc,EAAE;QACrD,aAAa;QACb,UAAU,EAAE,cAAc;QAC1B,KAAK;KACN,CAAC,CAAC;IACH,OAAO,MAAM,CAAC,WAAW,EAAE,CAAC,IAAI,YAAY,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC;AAC9D,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type { PageStudy } from "../observe/index.js";
|
|
2
|
+
import type { TestCase } from "../design/index.js";
|
|
3
|
+
import type { GeneratedSuite } from "../codegen/index.js";
|
|
4
|
+
import type { ValidationReport } from "../validate/index.js";
|
|
5
|
+
import type { VerifiedElement } from "../browser/types.js";
|
|
6
|
+
/** One named numeric metric reported for a run. */
export interface Score {
    /** Metric identifier (the deterministic scorers emit e.g. "runs_green", "grounding"). */
    name: string;
    /** Metric value; the deterministic scorers emit ratios here. */
    value: number;
    /** Optional free-text note accompanying the value. */
    comment?: string;
}
|
|
11
|
+
/** Run data handed to `deterministicScores`. */
export interface ScoreInput {
    /** Page study of the run. */
    study: PageStudy;
    /** Verification results for page elements; feeds "verified_ratio" and "grounding". */
    verified: VerifiedElement[];
    /** Designed test cases; their `elementRefs` feed "grounding". */
    testCases: TestCase[];
    /** Generated code, when code generation ran; feeds "locator_quality". */
    suite?: GeneratedSuite;
    /** Validation report, when the suite was executed; feeds "runs_green" and "flaky_ratio". */
    validation?: ValidationReport;
}
|
|
18
|
+
/**
|
|
19
|
+
* Deterministic (objective) scorers — computed from run data without an LLM or network.
|
|
20
|
+
* Foundation of the self-improvement loop: a measurable signal for every run (ADR-0006).
|
|
21
|
+
*/
|
|
22
|
+
// Emits, in this order and only when the relevant input is present/non-empty:
// "runs_green", "flaky_ratio", "verified_ratio", "grounding", "locator_quality".
export declare function deterministicScores(input: ScoreInput): Score[];
|
|
23
|
+
//# sourceMappingURL=scorers.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scorers.d.ts","sourceRoot":"","sources":["../../src/eval/scorers.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,qBAAqB,CAAC;AACrD,OAAO,KAAK,EAAE,QAAQ,EAAE,MAAM,oBAAoB,CAAC;AACnD,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,qBAAqB,CAAC;AAC1D,OAAO,KAAK,EAAE,gBAAgB,EAAE,MAAM,sBAAsB,CAAC;AAC7D,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,qBAAqB,CAAC;AAE3D,MAAM,WAAW,KAAK;IACpB,IAAI,EAAE,MAAM,CAAC;IACb,KAAK,EAAE,MAAM,CAAC;IACd,OAAO,CAAC,EAAE,MAAM,CAAC;CAClB;AAED,MAAM,WAAW,UAAU;IACzB,KAAK,EAAE,SAAS,CAAC;IACjB,QAAQ,EAAE,eAAe,EAAE,CAAC;IAC5B,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,KAAK,CAAC,EAAE,cAAc,CAAC;IACvB,UAAU,CAAC,EAAE,gBAAgB,CAAC;CAC/B;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,UAAU,GAAG,KAAK,EAAE,CAsC9D"}
|
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Deterministic (objective) scorers — computed from run data without an LLM or network.
|
|
3
|
+
* Foundation of the self-improvement loop: a measurable signal for every run (ADR-0006).
|
|
4
|
+
*/
|
|
5
|
+
export function deterministicScores(input) {
|
|
6
|
+
const scores = [];
|
|
7
|
+
if (input.validation) {
|
|
8
|
+
scores.push({ name: "runs_green", value: input.validation.greenRatio });
|
|
9
|
+
if (input.validation.results.length > 0) {
|
|
10
|
+
scores.push({
|
|
11
|
+
name: "flaky_ratio",
|
|
12
|
+
value: input.validation.flakyCount / input.validation.results.length,
|
|
13
|
+
});
|
|
14
|
+
}
|
|
15
|
+
}
|
|
16
|
+
if (input.verified.length > 0) {
|
|
17
|
+
const ok = input.verified.filter((v) => v.verified).length;
|
|
18
|
+
scores.push({ name: "verified_ratio", value: ok / input.verified.length });
|
|
19
|
+
}
|
|
20
|
+
// grounding: share of cases whose elementRefs all point to REAL elements (count≥1),
|
|
21
|
+
// including duplicated ones (.first()). verified_ratio separately measures uniqueness (count===1).
|
|
22
|
+
const realRefs = new Set(input.verified.filter((v) => v.count >= 1).map((v) => v.ref));
|
|
23
|
+
if (input.testCases.length > 0) {
|
|
24
|
+
const grounded = input.testCases.filter((tc) => tc.elementRefs.length > 0 && tc.elementRefs.every((r) => realRefs.has(r))).length;
|
|
25
|
+
scores.push({ name: "grounding", value: grounded / input.testCases.length });
|
|
26
|
+
}
|
|
27
|
+
// locator_quality: share of user-facing locators vs CSS/testid in the generated code.
|
|
28
|
+
if (input.suite) {
|
|
29
|
+
const code = input.suite.files.map((f) => f.content).join("\n");
|
|
30
|
+
const userFacing = (code.match(/getBy(Role|Label|Text|Placeholder|AltText|Title)/g) ?? []).length;
|
|
31
|
+
const fragile = (code.match(/\.locator\(|getByTestId|page\.\$/g) ?? []).length;
|
|
32
|
+
const total = userFacing + fragile;
|
|
33
|
+
if (total > 0)
|
|
34
|
+
scores.push({ name: "locator_quality", value: userFacing / total });
|
|
35
|
+
}
|
|
36
|
+
return scores;
|
|
37
|
+
}
|
|
38
|
+
//# sourceMappingURL=scorers.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"scorers.js","sourceRoot":"","sources":["../../src/eval/scorers.ts"],"names":[],"mappings":"AAoBA;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAiB;IACnD,MAAM,MAAM,GAAY,EAAE,CAAC;IAE3B,IAAI,KAAK,CAAC,UAAU,EAAE,CAAC;QACrB,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,YAAY,EAAE,KAAK,EAAE,KAAK,CAAC,UAAU,CAAC,UAAU,EAAE,CAAC,CAAC;QACxE,IAAI,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC;gBACV,IAAI,EAAE,aAAa;gBACnB,KAAK,EAAE,KAAK,CAAC,UAAU,CAAC,UAAU,GAAG,KAAK,CAAC,UAAU,CAAC,OAAO,CAAC,MAAM;aACrE,CAAC,CAAC;QACL,CAAC;IACH,CAAC;IAED,IAAI,KAAK,CAAC,QAAQ,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC9B,MAAM,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,MAAM,CAAC;QAC3D,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,gBAAgB,EAAE,KAAK,EAAE,EAAE,GAAG,KAAK,CAAC,QAAQ,CAAC,MAAM,EAAE,CAAC,CAAC;IAC7E,CAAC;IAED,oFAAoF;IACpF,mGAAmG;IACnG,MAAM,QAAQ,GAAG,IAAI,GAAG,CAAC,KAAK,CAAC,QAAQ,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,KAAK,IAAI,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC;IACvF,IAAI,KAAK,CAAC,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAC/B,MAAM,QAAQ,GAAG,KAAK,CAAC,SAAS,CAAC,MAAM,CACrC,CAAC,EAAE,EAAE,EAAE,CAAC,EAAE,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,IAAI,EAAE,CAAC,WAAW,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,QAAQ,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAClF,CAAC,MAAM,CAAC;QACT,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,WAAW,EAAE,KAAK,EAAE,QAAQ,GAAG,KAAK,CAAC,SAAS,CAAC,MAAM,EAAE,CAAC,CAAC;IAC/E,CAAC;IAED,sFAAsF;IACtF,IAAI,KAAK,CAAC,KAAK,EAAE,CAAC;QAChB,MAAM,IAAI,GAAG,KAAK,CAAC,KAAK,CAAC,KAAK,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QAChE,MAAM,UAAU,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,mDAAmD,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAClG,MAAM,OAAO,GAAG,CAAC,IAAI,CAAC,KAAK,CAAC,mCAAmC,CAAC,IAAI,EAAE,CAAC,CAAC,MAAM,CAAC;QAC/E,MAAM,KAAK,GAAG,UAAU,GAAG,OAAO,CAAC;QACnC,IAAI,KAAK,GAAG,CAAC;YAAE,MAAM,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,iBAAiB,EAAE,KAAK,EAAE,UAAU,GAAG,KAAK,EAAE,CAAC,CAAC;IACrF,CAAC;IA
ED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
package/dist/index.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* QA Explorer Bot — public library API (Sprint 6: frozen for embedding).
|
|
3
|
+
*
|
|
4
|
+
* Example: import { runDesign, runAutomate, runExploration } from "@plune-ai/cairn";
|
|
5
|
+
* Three entry points: explore (everything), design (cases only), automate (code from cases).
|
|
6
|
+
*/
|
|
7
|
+
/** Name constant of the library, typed as the string literal "@plune-ai/cairn". */
export declare const BOT_NAME: "@plune-ai/cairn";
|
|
8
|
+
/**
 * Version constant of the library, typed as a string literal.
 * NOTE(review): the published package version is reported as 0.2.1 while this reads "0.1.0" —
 * confirm whether the constant is meant to track package.json.
 */
export declare const BOT_VERSION: "0.1.0";
|
|
9
|
+
export { runExploration, runDesign, runAutomate, buildExploreGraph, ExploreState } from "./agent/index.js";
|
|
10
|
+
export type { ExploreInput, ExploreResult, DesignResult, AutomateResult, ExploreDeps, } from "./agent/index.js";
|
|
11
|
+
export { loadConfig } from "./config/index.js";
|
|
12
|
+
export type { AppConfig } from "./config/index.js";
|
|
13
|
+
export type { TestCase, DesignedCase } from "./design/index.js";
|
|
14
|
+
export type { PageStudy } from "./observe/index.js";
|
|
15
|
+
export type { GeneratedSuite } from "./codegen/index.js";
|
|
16
|
+
export type { Score } from "./eval/scorers.js";
|
|
17
|
+
export type { PilotVerdict } from "./eval/pilot.js";
|
|
18
|
+
export type { ValidationReport } from "./validate/index.js";
|
|
19
|
+
export type { ChecklistItem } from "./checklist/index.js";
|
|
20
|
+
export type { ElementRef, VerifiedElement, StorageState, BackendKind } from "./browser/index.js";
|
|
21
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,eAAO,MAAM,QAAQ,EAAG,iBAA0B,CAAC;AACnD,eAAO,MAAM,WAAW,EAAG,OAAgB,CAAC;AAG5C,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,WAAW,EAAE,iBAAiB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAC3G,YAAY,EACV,YAAY,EACZ,aAAa,EACb,YAAY,EACZ,cAAc,EACd,WAAW,GACZ,MAAM,kBAAkB,CAAC;AAG1B,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAC/C,YAAY,EAAE,SAAS,EAAE,MAAM,mBAAmB,CAAC;AAGnD,YAAY,EAAE,QAAQ,EAAE,YAAY,EAAE,MAAM,mBAAmB,CAAC;AAChE,YAAY,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AACpD,YAAY,EAAE,cAAc,EAAE,MAAM,oBAAoB,CAAC;AACzD,YAAY,EAAE,KAAK,EAAE,MAAM,mBAAmB,CAAC;AAC/C,YAAY,EAAE,YAAY,EAAE,MAAM,iBAAiB,CAAC;AACpD,YAAY,EAAE,gBAAgB,EAAE,MAAM,qBAAqB,CAAC;AAC5D,YAAY,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AAC1D,YAAY,EAAE,UAAU,EAAE,eAAe,EAAE,YAAY,EAAE,WAAW,EAAE,MAAM,oBAAoB,CAAC"}
|
package/dist/index.js
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* QA Explorer Bot — public library API (Sprint 6: frozen for embedding).
|
|
3
|
+
*
|
|
4
|
+
* Example: import { runDesign, runAutomate, runExploration } from "@plune-ai/cairn";
|
|
5
|
+
* Three entry points: explore (everything), design (cases only), automate (code from cases).
|
|
6
|
+
*/
|
|
7
|
+
/** Name constant of the library. */
export const BOT_NAME = "@plune-ai/cairn";
|
|
8
|
+
/**
 * Version constant of the library.
 * NOTE(review): the published package version is reported as 0.2.1 while this reads "0.1.0" —
 * confirm whether the constant is meant to track package.json.
 */
export const BOT_VERSION = "0.1.0";
|
|
9
|
+
// Entry points.
|
|
10
|
+
export { runExploration, runDesign, runAutomate, buildExploreGraph, ExploreState } from "./agent/index.js";
|
|
11
|
+
// Config.
|
|
12
|
+
export { loadConfig } from "./config/index.js";
|
|
13
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,MAAM,CAAC,MAAM,QAAQ,GAAG,iBAA0B,CAAC;AACnD,MAAM,CAAC,MAAM,WAAW,GAAG,OAAgB,CAAC;AAE5C,gBAAgB;AAChB,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,WAAW,EAAE,iBAAiB,EAAE,YAAY,EAAE,MAAM,kBAAkB,CAAC;AAS3G,UAAU;AACV,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC"}
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Load domain knowledge (.md with credentials/validation rules/notes), URL-matched (idea from explorbot).
|
|
3
|
+
* A file without `url:` is global (always applied); with `url:` only when the pattern is contained in the page URL.
|
|
4
|
+
* Injected into the design prompt → the bot knows facts not visible in the snapshot (reduces the oracle gap).
|
|
5
|
+
*/
|
|
6
|
+
// dir: directory whose `*.md` files are read (top level only).
// url: page URL; a file with a `url:` frontmatter pattern applies only if `url` contains that pattern.
// Resolves to the matching file bodies (frontmatter removed) joined by a blank line,
// or to "" when the directory cannot be read.
export declare function loadKnowledge(dir: string, url: string): Promise<string>;
|
|
7
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/knowledge/index.ts"],"names":[],"mappings":"AAeA;;;;GAIG;AACH,wBAAsB,aAAa,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,EAAE,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,CAiB7E"}
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
import { readdir, readFile } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
/**
 * URL pattern from the frontmatter (the `url: ...` line), if present.
 *
 * Only a `---` delimited block at the very start of the text counts as frontmatter.
 * A value wrapped in one matching pair of single or double quotes (the usual YAML way of
 * writing scalars such as `url: "/login"`) is returned without the quotes, so that it can
 * be compared against a page URL.
 *
 * @param raw full text of a knowledge file
 * @returns the pattern (possibly an empty string when `url:` has no value), or `undefined`
 *          when there is no frontmatter or no `url:` line in it
 */
function frontmatterUrl(raw) {
    const fm = raw.match(/^---\r?\n([\s\S]*?)\r?\n---/);
    if (!fm?.[1])
        return undefined;
    const line = fm[1].split(/\r?\n/).find((l) => /^url:/.test(l.trim()));
    if (!line)
        return undefined;
    const value = line.replace(/^\s*url:\s*/, "").trim();
    // Unwrap one pair of matching quotes; an unbalanced quote is left untouched.
    const quoted = value.match(/^(["'])(.*)\1$/);
    return quoted ? quoted[2] : value;
}
|
|
11
|
+
/**
 * Body of a knowledge file: the text with a leading `---` frontmatter block removed
 * (when there is one) and surrounding whitespace trimmed.
 */
function stripFrontmatter(raw) {
    const header = /^---\r?\n[\s\S]*?\r?\n---\r?\n?/.exec(raw);
    // The pattern is anchored at the start, so a match is always a prefix of `raw`.
    const body = header === null ? raw : raw.slice(header[0].length);
    return body.trim();
}
|
|
14
|
+
/**
 * Load domain knowledge (.md files with credentials / validation rules / notes), matched by
 * URL (idea from explorbot).
 *
 * A file without a `url:` frontmatter entry is global and always applied; a file with one is
 * applied only when the pattern is contained in the page URL. The result is injected into
 * the design prompt, so the bot knows facts that are not visible in the snapshot (reduces
 * the oracle gap).
 *
 * Files are processed in sorted file-name order: directory listing order is not specified,
 * and the concatenated text should be the same on every platform.
 *
 * @param dir directory to scan (top level only) for `*.md` files
 * @param url URL of the page under study
 * @returns the applicable file bodies joined by a blank line; "" when the directory cannot
 *          be listed or nothing applies
 */
export async function loadKnowledge(dir, url) {
    let files = [];
    try {
        files = (await readdir(dir)).filter((f) => f.endsWith(".md"));
    }
    catch {
        // Best effort by design: a missing or unreadable directory means "no knowledge".
        return "";
    }
    files.sort();
    // The reads are independent; Promise.all keeps the results in (sorted) input order.
    const raws = await Promise.all(files.map((f) => readFile(join(dir, f), "utf8")));
    const parts = [];
    for (const raw of raws) {
        const pattern = frontmatterUrl(raw);
        // A file scoped to a URL pattern that this page does not match is skipped.
        if (pattern && !url.includes(pattern))
            continue;
        const body = stripFrontmatter(raw);
        if (body.length > 0)
            parts.push(body);
    }
    return parts.join("\n\n");
}
|
|
39
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/knowledge/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,QAAQ,EAAE,MAAM,kBAAkB,CAAC;AACrD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,sEAAsE;AACtE,SAAS,cAAc,CAAC,GAAW;IACjC,MAAM,EAAE,GAAG,GAAG,CAAC,KAAK,CAAC,6BAA6B,CAAC,CAAC;IACpD,IAAI,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC;QAAE,OAAO,SAAS,CAAC;IAC/B,MAAM,IAAI,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC;IACtE,OAAO,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC,OAAO,CAAC,aAAa,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,SAAS,CAAC;AACnE,CAAC;AAED,SAAS,gBAAgB,CAAC,GAAW;IACnC,OAAO,GAAG,CAAC,OAAO,CAAC,iCAAiC,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;AACnE,CAAC;AAED;;;;GAIG;AACH,MAAM,CAAC,KAAK,UAAU,aAAa,CAAC,GAAW,EAAE,GAAW;IAC1D,IAAI,KAAK,GAAa,EAAE,CAAC;IACzB,IAAI,CAAC;QACH,KAAK,GAAG,CAAC,MAAM,OAAO,CAAC,GAAG,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC;IAChE,CAAC;IAAC,MAAM,CAAC;QACP,OAAO,EAAE,CAAC;IACZ,CAAC;IACD,MAAM,KAAK,GAAa,EAAE,CAAC;IAC3B,KAAK,MAAM,CAAC,IAAI,KAAK,EAAE,CAAC;QACtB,MAAM,GAAG,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,GAAG,EAAE,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;QACjD,MAAM,OAAO,GAAG,cAAc,CAAC,GAAG,CAAC,CAAC;QACpC,IAAI,CAAC,OAAO,IAAI,GAAG,CAAC,QAAQ,CAAC,OAAO,CAAC,EAAE,CAAC;YACtC,MAAM,IAAI,GAAG,gBAAgB,CAAC,GAAG,CAAC,CAAC;YACnC,IAAI,IAAI,CAAC,MAAM,GAAG,CAAC;gBAAE,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;QACxC,CAAC;IACH,CAAC;IACD,OAAO,KAAK,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC5B,CAAC"}
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
|
|
2
|
+
import type { ModelTier } from "../config/index.js";
|
|
3
|
+
/** OpenRouter — OpenAI-compatible API; we connect via the ChatOpenAI baseURL (ADR-0002). */
|
|
4
|
+
export declare const OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1";
|
|
5
|
+
/** API keys for the supported providers; `resolveModelSpec` throws when the one it needs is absent. */
export interface ProviderKeys {
    /** Required for tiers whose provider is "anthropic". */
    anthropicApiKey?: string;
    /** Required for tiers served through OpenRouter. */
    openrouterApiKey?: string;
}
|
|
9
|
+
/**
 * Resolved model specification (a pure result, no side effects).
 * A union discriminated by `provider`; only the "openrouter" variant carries a `baseURL`.
 */
export type ModelSpec =
    | {
        provider: "anthropic";
        /** Model identifier, copied from the tier. */
        model: string;
        /** The Anthropic API key. */
        apiKey: string;
        /** Sampling temperature, when the tier sets one. */
        temperature?: number;
        /** Vision capability flag, copied from the tier. */
        supportsVision: boolean;
    }
    | {
        provider: "openrouter";
        /** Model identifier, copied from the tier. */
        model: string;
        /** The OpenRouter API key. */
        apiKey: string;
        /** Endpoint of the OpenAI-compatible API. */
        baseURL: string;
        /** Sampling temperature, when the tier sets one. */
        temperature?: number;
        /** Vision capability flag, copied from the tier. */
        supportsVision: boolean;
    };
|
|
24
|
+
/**
|
|
25
|
+
* Pure function: tier + keys → ModelSpec. Throws if the required provider's key is missing.
|
|
26
|
+
* Separated from instantiation so it is fully testable without the SDK.
|
|
27
|
+
*/
|
|
28
|
+
// tier.provider === "anthropic" needs keys.anthropicApiKey; every other provider value takes the
// OpenRouter path and needs keys.openrouterApiKey. Throws an Error when the needed key is missing.
export declare function resolveModelSpec(tier: ModelTier, keys: ProviderKeys): ModelSpec;
|
|
29
|
+
/** Provider-agnostic factory: returns a LangChain `BaseChatModel` for the tier. */
|
|
30
|
+
// Resolves the spec via resolveModelSpec (so it throws on a missing key), then returns a ChatAnthropic
// for "anthropic" or a ChatOpenAI configured with the OpenRouter baseURL otherwise.
export declare function makeModel(tier: ModelTier, keys: ProviderKeys): BaseChatModel;
|
|
31
|
+
//# sourceMappingURL=factory.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"factory.d.ts","sourceRoot":"","sources":["../../src/llm/factory.ts"],"names":[],"mappings":"AAEA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,6CAA6C,CAAC;AACjF,OAAO,KAAK,EAAE,SAAS,EAAE,MAAM,oBAAoB,CAAC;AAEpD,4FAA4F;AAC5F,eAAO,MAAM,mBAAmB,iCAAiC,CAAC;AAElE,MAAM,WAAW,YAAY;IAC3B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,gBAAgB,CAAC,EAAE,MAAM,CAAC;CAC3B;AAED,mEAAmE;AACnE,MAAM,MAAM,SAAS,GACjB;IACE,QAAQ,EAAE,WAAW,CAAC;IACtB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,OAAO,CAAC;CACzB,GACD;IACE,QAAQ,EAAE,YAAY,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,OAAO,EAAE,MAAM,CAAC;IAChB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,cAAc,EAAE,OAAO,CAAC;CACzB,CAAC;AAEN;;;GAGG;AACH,wBAAgB,gBAAgB,CAAC,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,GAAG,SAAS,CAwB/E;AAED,mFAAmF;AACnF,wBAAgB,SAAS,CAAC,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,GAAG,aAAa,CAa5E"}
|
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
import { ChatAnthropic } from "@langchain/anthropic";
|
|
2
|
+
import { ChatOpenAI } from "@langchain/openai";
|
|
3
|
+
/** OpenRouter — OpenAI-compatible API; we connect via the ChatOpenAI baseURL (ADR-0002). */
|
|
4
|
+
export const OPENROUTER_BASE_URL = "https://openrouter.ai/api/v1";
|
|
5
|
+
/**
|
|
6
|
+
* Pure function: tier + keys → ModelSpec. Throws if the required provider's key is missing.
|
|
7
|
+
* Separated from instantiation so it is fully testable without the SDK.
|
|
8
|
+
*/
|
|
9
|
+
export function resolveModelSpec(tier, keys) {
|
|
10
|
+
if (tier.provider === "anthropic") {
|
|
11
|
+
if (!keys.anthropicApiKey) {
|
|
12
|
+
throw new Error(`Tier '${tier.model}' requires Anthropic, but ANTHROPIC_API_KEY is not set.`);
|
|
13
|
+
}
|
|
14
|
+
return {
|
|
15
|
+
provider: "anthropic",
|
|
16
|
+
model: tier.model,
|
|
17
|
+
apiKey: keys.anthropicApiKey,
|
|
18
|
+
temperature: tier.temperature,
|
|
19
|
+
supportsVision: tier.supportsVision,
|
|
20
|
+
};
|
|
21
|
+
}
|
|
22
|
+
if (!keys.openrouterApiKey) {
|
|
23
|
+
throw new Error(`Tier '${tier.model}' requires OpenRouter, but OPENROUTER_API_KEY is not set.`);
|
|
24
|
+
}
|
|
25
|
+
return {
|
|
26
|
+
provider: "openrouter",
|
|
27
|
+
model: tier.model,
|
|
28
|
+
apiKey: keys.openrouterApiKey,
|
|
29
|
+
baseURL: OPENROUTER_BASE_URL,
|
|
30
|
+
temperature: tier.temperature,
|
|
31
|
+
supportsVision: tier.supportsVision,
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
/**
 * Provider-agnostic factory: builds the LangChain chat model for a tier.
 *
 * @param tier model tier to instantiate
 * @param keys available provider API keys
 * @returns a `ChatAnthropic` for Anthropic specs, otherwise a `ChatOpenAI` pointed at the
 *          spec's `baseURL`
 * @throws Error (from `resolveModelSpec`) when the required key is missing
 */
export function makeModel(tier, keys) {
    const spec = resolveModelSpec(tier, keys);
    // Forward `temperature` only when the tier sets it, so the SDK default applies otherwise.
    const sampling = spec.temperature === undefined ? {} : { temperature: spec.temperature };
    switch (spec.provider) {
        case "anthropic":
            return new ChatAnthropic({ model: spec.model, apiKey: spec.apiKey, ...sampling });
        default:
            return new ChatOpenAI({
                model: spec.model,
                apiKey: spec.apiKey,
                configuration: { baseURL: spec.baseURL },
                ...sampling,
            });
    }
}
|
|
48
|
+
//# sourceMappingURL=factory.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"factory.js","sourceRoot":"","sources":["../../src/llm/factory.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,aAAa,EAAE,MAAM,sBAAsB,CAAC;AACrD,OAAO,EAAE,UAAU,EAAE,MAAM,mBAAmB,CAAC;AAI/C,4FAA4F;AAC5F,MAAM,CAAC,MAAM,mBAAmB,GAAG,8BAA8B,CAAC;AAyBlE;;;GAGG;AACH,MAAM,UAAU,gBAAgB,CAAC,IAAe,EAAE,IAAkB;IAClE,IAAI,IAAI,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;QAClC,IAAI,CAAC,IAAI,CAAC,eAAe,EAAE,CAAC;YAC1B,MAAM,IAAI,KAAK,CAAC,SAAS,IAAI,CAAC,KAAK,yDAAyD,CAAC,CAAC;QAChG,CAAC;QACD,OAAO;YACL,QAAQ,EAAE,WAAW;YACrB,KAAK,EAAE,IAAI,CAAC,KAAK;YACjB,MAAM,EAAE,IAAI,CAAC,eAAe;YAC5B,WAAW,EAAE,IAAI,CAAC,WAAW;YAC7B,cAAc,EAAE,IAAI,CAAC,cAAc;SACpC,CAAC;IACJ,CAAC;IACD,IAAI,CAAC,IAAI,CAAC,gBAAgB,EAAE,CAAC;QAC3B,MAAM,IAAI,KAAK,CAAC,SAAS,IAAI,CAAC,KAAK,2DAA2D,CAAC,CAAC;IAClG,CAAC;IACD,OAAO;QACL,QAAQ,EAAE,YAAY;QACtB,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,MAAM,EAAE,IAAI,CAAC,gBAAgB;QAC7B,OAAO,EAAE,mBAAmB;QAC5B,WAAW,EAAE,IAAI,CAAC,WAAW;QAC7B,cAAc,EAAE,IAAI,CAAC,cAAc;KACpC,CAAC;AACJ,CAAC;AAED,mFAAmF;AACnF,MAAM,UAAU,SAAS,CAAC,IAAe,EAAE,IAAkB;IAC3D,MAAM,IAAI,GAAG,gBAAgB,CAAC,IAAI,EAAE,IAAI,CAAC,CAAC;IAC1C,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,KAAK,SAAS,CAAC,CAAC,CAAC,EAAE,WAAW,EAAE,IAAI,CAAC,WAAW,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC;IAE5F,IAAI,IAAI,CAAC,QAAQ,KAAK,WAAW,EAAE,CAAC;QAClC,OAAO,IAAI,aAAa,CAAC,EAAE,KAAK,EAAE,IAAI,CAAC,KAAK,EAAE,MAAM,EAAE,IAAI,CAAC,MAAM,EAAE,GAAG,WAAW,EAAE,CAAC,CAAC;IACvF,CAAC;IACD,OAAO,IAAI,UAAU,CAAC;QACpB,KAAK,EAAE,IAAI,CAAC,KAAK;QACjB,MAAM,EAAE,IAAI,CAAC,MAAM;QACnB,aAAa,EAAE,EAAE,OAAO,EAAE,IAAI,CAAC,OAAO,EAAE;QACxC,GAAG,WAAW;KACf,CAAC,CAAC;AACL,CAAC"}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
export { OPENROUTER_BASE_URL, resolveModelSpec, makeModel } from "./factory.js";
|
|
2
|
+
export type { ModelSpec, ProviderKeys } from "./factory.js";
|
|
3
|
+
export { imageBlock } from "./vision.js";
|
|
4
|
+
export type { ImageBlock } from "./vision.js";
|
|
5
|
+
//# sourceMappingURL=index.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../../src/llm/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAChF,YAAY,EAAE,SAAS,EAAE,YAAY,EAAE,MAAM,cAAc,CAAC;AAC5D,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC;AACzC,YAAY,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/llm/index.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,mBAAmB,EAAE,gBAAgB,EAAE,SAAS,EAAE,MAAM,cAAc,CAAC;AAEhF,OAAO,EAAE,UAAU,EAAE,MAAM,aAAa,CAAC"}
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
import type { BaseChatModel } from "@langchain/core/language_models/chat_models";
|
|
2
|
+
import type { BaseMessageLike } from "@langchain/core/messages";
|
|
3
|
+
import type { ZodType } from "zod";
|
|
4
|
+
/**
|
|
5
|
+
* Structured-call seam: (schema, messages) → typed result.
|
|
6
|
+
* Decouples domain logic (design/analyze) from the concrete model — a fake is injected in tests.
|
|
7
|
+
*/
|
|
8
|
+
// T is the result type described by `schema`; the returned promise resolves to a value of that type.
export type StructuredInvoke = <T>(schema: ZodType<T>, messages: BaseMessageLike[]) => Promise<T>;
|
|
9
|
+
/** Real implementation on top of LangChain `withStructuredOutput`. */
|
|
10
|
+
// Each call of the returned function runs model.withStructuredOutput(schema) and then invoke(messages).
export declare function structuredInvoker(model: BaseChatModel): StructuredInvoke;
|
|
11
|
+
/** Tuning knobs for `retryInvoke`. */
export interface RetryOptions {
    /** Number of retries after the first attempt (default 3). */
    retries?: number;
    /** Delay before the first retry in milliseconds (default 800); doubled on each further retry. */
    baseDelayMs?: number;
}
|
|
15
|
+
/**
|
|
16
|
+
* Retry wrapper (Sprint 6 robustness): retries the call on TRANSIENT errors (429/5xx/overloaded)
|
|
17
|
+
* with exponential backoff. Non-transient errors (schema validation, 4xx) are thrown immediately.
|
|
18
|
+
*/
|
|
19
|
+
// Transience is decided from the error message text. Defaults: 3 retries, 800 ms base delay,
// with the delay doubling on every retry.
export declare function retryInvoke(inner: StructuredInvoke, opts?: RetryOptions): StructuredInvoke;
|
|
20
|
+
/**
 * Cost guardrail (Sprint 6): an LLM-call counter shared across one run, as a safeguard
 * against runaway cost.
 */
export declare class CallBudget {
    /** Maximum number of calls allowed. */
    private readonly max;
    /** Number of `charge()` calls so far. */
    private n;
    /** @param max maximum number of calls allowed */
    constructor(max: number);
    /** Counts one call; throws an Error once the count exceeds the maximum. */
    charge(): void;
    /** Number of `charge()` calls so far, including any that threw. */
    get spent(): number;
}
|
|
28
|
+
/** Cost-guardrail wrapper: counts the call against the shared CallBudget (throws when exceeded). */
|
|
29
|
+
// budget.charge() runs before `inner`; when it throws, `inner` is not called and the promise rejects.
export declare function cappedInvoke(inner: StructuredInvoke, budget: CallBudget): StructuredInvoke;
|
|
30
|
+
//# sourceMappingURL=structured.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured.d.ts","sourceRoot":"","sources":["../../src/llm/structured.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EAAE,aAAa,EAAE,MAAM,6CAA6C,CAAC;AACjF,OAAO,KAAK,EAAE,eAAe,EAAE,MAAM,0BAA0B,CAAC;AAChE,OAAO,KAAK,EAAE,OAAO,EAAE,MAAM,KAAK,CAAC;AAEnC;;;GAGG;AACH,MAAM,MAAM,gBAAgB,GAAG,CAAC,CAAC,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC,CAAC,EAAE,QAAQ,EAAE,eAAe,EAAE,KAAK,OAAO,CAAC,CAAC,CAAC,CAAC;AAElG,sEAAsE;AACtE,wBAAgB,iBAAiB,CAAC,KAAK,EAAE,aAAa,GAAG,gBAAgB,CAKxE;AAMD,MAAM,WAAW,YAAY;IAC3B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED;;;GAGG;AACH,wBAAgB,WAAW,CAAC,KAAK,EAAE,gBAAgB,EAAE,IAAI,GAAE,YAAiB,GAAG,gBAAgB,CAiB9F;AAED,qGAAqG;AACrG,qBAAa,UAAU;IAET,OAAO,CAAC,QAAQ,CAAC,GAAG;IADhC,OAAO,CAAC,CAAC,CAAK;gBACe,GAAG,EAAE,MAAM;IACxC,MAAM,IAAI,IAAI;IAQd,IAAI,KAAK,IAAI,MAAM,CAElB;CACF;AAED,oGAAoG;AACpG,wBAAgB,YAAY,CAAC,KAAK,EAAE,gBAAgB,EAAE,MAAM,EAAE,UAAU,GAAG,gBAAgB,CAK1F"}
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
/** Real implementation on top of LangChain `withStructuredOutput`. */
|
|
2
|
+
export function structuredInvoker(model) {
|
|
3
|
+
return async (schema, messages) => {
|
|
4
|
+
const structured = model.withStructuredOutput(schema);
|
|
5
|
+
return (await structured.invoke(messages));
|
|
6
|
+
};
|
|
7
|
+
}
|
|
8
|
+
/** Transient provider errors worth retrying (rate limit / overload / 5xx / timeout). */
|
|
9
|
+
const TRANSIENT = /\b429\b|rate.?limit|overloaded|temporarily|timeout|ETIMEDOUT|ECONNRESET|\b50\d\b|service unavailable/i;
|
|
10
|
+
/**
|
|
11
|
+
* Retry wrapper (Sprint 6 robustness): retries the call on TRANSIENT errors (429/5xx/overloaded)
|
|
12
|
+
* with exponential backoff. Non-transient errors (schema validation, 4xx) are thrown immediately.
|
|
13
|
+
*/
|
|
14
|
+
export function retryInvoke(inner, opts = {}) {
|
|
15
|
+
const retries = opts.retries ?? 3;
|
|
16
|
+
const baseDelayMs = opts.baseDelayMs ?? 800;
|
|
17
|
+
return async (schema, messages) => {
|
|
18
|
+
let lastErr;
|
|
19
|
+
for (let attempt = 0; attempt <= retries; attempt++) {
|
|
20
|
+
try {
|
|
21
|
+
return await inner(schema, messages);
|
|
22
|
+
}
|
|
23
|
+
catch (e) {
|
|
24
|
+
lastErr = e;
|
|
25
|
+
const msg = e instanceof Error ? e.message : String(e);
|
|
26
|
+
if (attempt === retries || !TRANSIENT.test(msg))
|
|
27
|
+
throw e;
|
|
28
|
+
await new Promise((r) => setTimeout(r, baseDelayMs * 2 ** attempt));
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
throw lastErr;
|
|
32
|
+
};
|
|
33
|
+
}
|
|
34
|
+
/** Cost-guardrail (Sprint 6): shared LLM-call counter per run — a safeguard against runaway cost. */
|
|
35
|
+
export class CallBudget {
|
|
36
|
+
max;
|
|
37
|
+
n = 0;
|
|
38
|
+
constructor(max) {
|
|
39
|
+
this.max = max;
|
|
40
|
+
}
|
|
41
|
+
charge() {
|
|
42
|
+
this.n += 1;
|
|
43
|
+
if (this.n > this.max) {
|
|
44
|
+
throw new Error(`LLM-call limit per run exceeded (${this.max}) — cost-guardrail. Increase maxLlmCalls or check the loop.`);
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
get spent() {
|
|
48
|
+
return this.n;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
/** Cost-guardrail wrapper: counts the call against the shared CallBudget (throws when exceeded). */
|
|
52
|
+
export function cappedInvoke(inner, budget) {
|
|
53
|
+
return async (schema, messages) => {
|
|
54
|
+
budget.charge();
|
|
55
|
+
return inner(schema, messages);
|
|
56
|
+
};
|
|
57
|
+
}
|
|
58
|
+
//# sourceMappingURL=structured.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured.js","sourceRoot":"","sources":["../../src/llm/structured.ts"],"names":[],"mappings":"AAUA,sEAAsE;AACtE,MAAM,UAAU,iBAAiB,CAAC,KAAoB;IACpD,OAAO,KAAK,EAAK,MAAkB,EAAE,QAA2B,EAAc,EAAE;QAC9E,MAAM,UAAU,GAAG,KAAK,CAAC,oBAAoB,CAAC,MAAM,CAAC,CAAC;QACtD,OAAO,CAAC,MAAM,UAAU,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAM,CAAC;IAClD,CAAC,CAAC;AACJ,CAAC;AAED,wFAAwF;AACxF,MAAM,SAAS,GACb,uGAAuG,CAAC;AAO1G;;;GAGG;AACH,MAAM,UAAU,WAAW,CAAC,KAAuB,EAAE,OAAqB,EAAE;IAC1E,MAAM,OAAO,GAAG,IAAI,CAAC,OAAO,IAAI,CAAC,CAAC;IAClC,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,IAAI,GAAG,CAAC;IAC5C,OAAO,KAAK,EAAK,MAAkB,EAAE,QAA2B,EAAc,EAAE;QAC9E,IAAI,OAAgB,CAAC;QACrB,KAAK,IAAI,OAAO,GAAG,CAAC,EAAE,OAAO,IAAI,OAAO,EAAE,OAAO,EAAE,EAAE,CAAC;YACpD,IAAI,CAAC;gBACH,OAAO,MAAM,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;YACvC,CAAC;YAAC,OAAO,CAAC,EAAE,CAAC;gBACX,OAAO,GAAG,CAAC,CAAC;gBACZ,MAAM,GAAG,GAAG,CAAC,YAAY,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC;gBACvD,IAAI,OAAO,KAAK,OAAO,IAAI,CAAC,SAAS,CAAC,IAAI,CAAC,GAAG,CAAC;oBAAE,MAAM,CAAC,CAAC;gBACzD,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,WAAW,GAAG,CAAC,IAAI,OAAO,CAAC,CAAC,CAAC;YACtE,CAAC;QACH,CAAC;QACD,MAAM,OAAO,CAAC;IAChB,CAAC,CAAC;AACJ,CAAC;AAED,qGAAqG;AACrG,MAAM,OAAO,UAAU;IAEQ;IADrB,CAAC,GAAG,CAAC,CAAC;IACd,YAA6B,GAAW;QAAX,QAAG,GAAH,GAAG,CAAQ;IAAG,CAAC;IAC5C,MAAM;QACJ,IAAI,CAAC,CAAC,IAAI,CAAC,CAAC;QACZ,IAAI,IAAI,CAAC,CAAC,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;YACtB,MAAM,IAAI,KAAK,CACb,oCAAoC,IAAI,CAAC,GAAG,6DAA6D,CAC1G,CAAC;QACJ,CAAC;IACH,CAAC;IACD,IAAI,KAAK;QACP,OAAO,IAAI,CAAC,CAAC,CAAC;IAChB,CAAC;CACF;AAED,oGAAoG;AACpG,MAAM,UAAU,YAAY,CAAC,KAAuB,EAAE,MAAkB;IACtE,OAAO,KAAK,EAAK,MAAkB,EAAE,QAA2B,EAAc,EAAE;QAC9E,MAAM,CAAC,MAAM,EAAE,CAAC;QAChB,OAAO,KAAK,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;IACjC,CAAC,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Cross-provider image content block for HumanMessage.
|
|
3
|
+
*
|
|
4
|
+
* The `image_url` form with a data-URL is accepted by both ChatAnthropic (via LangChain coercion)
|
|
5
|
+
* and ChatOpenAI/OpenRouter. The exact working form for Anthropic Opus is confirmed by Spike S2 —
|
|
6
|
+
* if needed, a per-provider branch will live here.
|
|
7
|
+
*/
|
|
8
|
+
export interface ImageBlock {
    /** Discriminator of the content block; always "image_url". */
    type: "image_url";
    /** Wrapper object holding the image location. */
    image_url: {
        /** URL of the image (a data-URL in the form described for this module). */
        url: string;
    };
    /** Any further keys are allowed. */
    [key: string]: unknown;
}
|
|
15
|
+
// NOTE(review): implementation not visible here — presumably wraps base64 data (`dataB64`) and a MIME
// type (`mediaType`, optional) into the data-URL `ImageBlock` form; confirm the default media type.
export declare function imageBlock(dataB64: string, mediaType?: string): ImageBlock;
|
|
16
|
+
//# sourceMappingURL=vision.d.ts.map
|