@justyork/repo-mind 0.4.0 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +5 -2
- package/dist/ab-demo/compute-pass.d.ts +24 -0
- package/dist/ab-demo/compute-pass.js +80 -0
- package/dist/ab-demo/live-answer.d.ts +6 -0
- package/dist/ab-demo/live-answer.js +89 -0
- package/dist/ab-demo/live-eval.d.ts +19 -0
- package/dist/ab-demo/live-eval.js +192 -0
- package/dist/ab-demo/paths.d.ts +1 -0
- package/dist/ab-demo/paths.js +3 -0
- package/dist/ab-demo/record-transcript.d.ts +9 -0
- package/dist/ab-demo/record-transcript.js +97 -0
- package/dist/ab-demo/types.d.ts +57 -0
- package/dist/ab-demo/validate-questions.d.ts +5 -0
- package/dist/ab-demo/validate-questions.js +32 -0
- package/dist/agent-write-gate.d.ts +7 -0
- package/dist/agent-write-gate.js +33 -0
- package/dist/cli.js +102 -0
- package/dist/commands/ab-eval.d.ts +11 -0
- package/dist/commands/ab-eval.js +103 -0
- package/dist/commands/publish.d.ts +11 -0
- package/dist/commands/publish.js +91 -0
- package/dist/git/git-exec.d.ts +19 -0
- package/dist/git/git-exec.js +89 -0
- package/dist/git/publish-pr.d.ts +33 -0
- package/dist/git/publish-pr.js +121 -0
- package/dist/index/slug.d.ts +2 -0
- package/dist/index/slug.js +15 -0
- package/dist/mcp/server.js +40 -0
- package/dist/publish/batch-publish.d.ts +29 -0
- package/dist/publish/batch-publish.js +106 -0
- package/dist/tools/create-draft.d.ts +32 -0
- package/dist/tools/create-draft.js +163 -0
- package/package.json +7 -1
- package/ui/dist/assets/{arc-C6B0IXf5.js → arc-DufwQU06.js} +1 -1
- package/ui/dist/assets/{architectureDiagram-3BPJPVTR-BwcC0zwn.js → architectureDiagram-3BPJPVTR-CI2fjjSo.js} +1 -1
- package/ui/dist/assets/{blockDiagram-GPEHLZMM-DIhdWMA6.js → blockDiagram-GPEHLZMM-DVBIs-Z6.js} +1 -1
- package/ui/dist/assets/{c4Diagram-AAUBKEIU-DAe6bsUB.js → c4Diagram-AAUBKEIU-rhAX_HrB.js} +1 -1
- package/ui/dist/assets/channel-Q_-BqwUT.js +1 -0
- package/ui/dist/assets/{chunk-2J33WTMH-Cy3md3pb.js → chunk-2J33WTMH-CxWSQi5S.js} +1 -1
- package/ui/dist/assets/{chunk-4BX2VUAB-CzL8eUDD.js → chunk-4BX2VUAB-BEM-vVbU.js} +1 -1
- package/ui/dist/assets/{chunk-55IACEB6-CbthXzDA.js → chunk-55IACEB6-TcrLDHYx.js} +1 -1
- package/ui/dist/assets/{chunk-727SXJPM-B7ZW-l9j.js → chunk-727SXJPM-D8g4OcIW.js} +1 -1
- package/ui/dist/assets/{chunk-AQP2D5EJ-Vaux-7Ld.js → chunk-AQP2D5EJ-CSvrR8rQ.js} +1 -1
- package/ui/dist/assets/{chunk-FMBD7UC4-C6P2YSWU.js → chunk-FMBD7UC4-C14JNc29.js} +1 -1
- package/ui/dist/assets/{chunk-ND2GUHAM-gSNqdDAn.js → chunk-ND2GUHAM-Cnb0VFKm.js} +1 -1
- package/ui/dist/assets/{chunk-QZHKN3VN-DPPMoZqF.js → chunk-QZHKN3VN-D2pqmAXN.js} +1 -1
- package/ui/dist/assets/classDiagram-4FO5ZUOK-CPrZx4R4.js +1 -0
- package/ui/dist/assets/classDiagram-v2-Q7XG4LA2-CPrZx4R4.js +1 -0
- package/ui/dist/assets/{cose-bilkent-S5V4N54A-Ca4nXCOA.js → cose-bilkent-S5V4N54A-Cli2Sbvt.js} +1 -1
- package/ui/dist/assets/{dagre-BM42HDAG-C-4Y4Jyn.js → dagre-BM42HDAG-gUsVuy4s.js} +1 -1
- package/ui/dist/assets/{diagram-2AECGRRQ-CNv_WVZk.js → diagram-2AECGRRQ-DTKi_1b6.js} +1 -1
- package/ui/dist/assets/{diagram-5GNKFQAL-CNwYGvCw.js → diagram-5GNKFQAL-8Bj0k_-k.js} +1 -1
- package/ui/dist/assets/{diagram-KO2AKTUF-CEULTL49.js → diagram-KO2AKTUF-CKOX2fpv.js} +1 -1
- package/ui/dist/assets/{diagram-LMA3HP47-sGQpPW5i.js → diagram-LMA3HP47-B12JvpF3.js} +1 -1
- package/ui/dist/assets/{diagram-OG6HWLK6-BBEtEBkh.js → diagram-OG6HWLK6-DKeTI9IH.js} +1 -1
- package/ui/dist/assets/editor-DZbRkvnr.js +211 -0
- package/ui/dist/assets/{erDiagram-TEJ5UH35-B4mEdVPJ.js → erDiagram-TEJ5UH35-oDb31edH.js} +1 -1
- package/ui/dist/assets/{flowDiagram-I6XJVG4X-7-OngwLn.js → flowDiagram-I6XJVG4X-SPf3KLS-.js} +1 -1
- package/ui/dist/assets/{ganttDiagram-6RSMTGT7-Bq3z_Nvr.js → ganttDiagram-6RSMTGT7-D8oPEcrt.js} +1 -1
- package/ui/dist/assets/{gitGraphDiagram-PVQCEYII-BjMdjnGR.js → gitGraphDiagram-PVQCEYII-eI9Yp7d7.js} +1 -1
- package/ui/dist/assets/{graph-CwHQTpjf.js → graph-DqCH9VKs.js} +1 -1
- package/ui/dist/assets/{infoDiagram-5YYISTIA-ByORmROU.js → infoDiagram-5YYISTIA-CQ6dtyj3.js} +1 -1
- package/ui/dist/assets/{ishikawaDiagram-YF4QCWOH-B7yRHlcY.js → ishikawaDiagram-YF4QCWOH-BHnOZ8-9.js} +1 -1
- package/ui/dist/assets/{journeyDiagram-JHISSGLW-J5mLr9YH.js → journeyDiagram-JHISSGLW-0t985reH.js} +1 -1
- package/ui/dist/assets/{kanban-definition-UN3LZRKU-B3GyGD8X.js → kanban-definition-UN3LZRKU-vPlhgwPS.js} +1 -1
- package/ui/dist/assets/main-Io6jccwi.js +232 -0
- package/ui/dist/assets/{mermaid.core-C57NVZG0.js → mermaid.core-C2wn2wM5.js} +4 -4
- package/ui/dist/assets/{mindmap-definition-RKZ34NQL-CkcXt9tU.js → mindmap-definition-RKZ34NQL-COKv8bQN.js} +1 -1
- package/ui/dist/assets/{pieDiagram-4H26LBE5-Y7B3kCyQ.js → pieDiagram-4H26LBE5-CgVeawD6.js} +1 -1
- package/ui/dist/assets/{quadrantDiagram-W4KKPZXB-CXKUOw1G.js → quadrantDiagram-W4KKPZXB-Bwpr8PxN.js} +1 -1
- package/ui/dist/assets/{requirementDiagram-4Y6WPE33-DBmmmiXs.js → requirementDiagram-4Y6WPE33-Cc8HFBxl.js} +1 -1
- package/ui/dist/assets/{sankeyDiagram-5OEKKPKP-BNADiaE4.js → sankeyDiagram-5OEKKPKP-B_1nSAK5.js} +1 -1
- package/ui/dist/assets/{sequenceDiagram-3UESZ5HK-CctElOC5.js → sequenceDiagram-3UESZ5HK-hsYrUJza.js} +1 -1
- package/ui/dist/assets/{stateDiagram-AJRCARHV-D40_7g7Y.js → stateDiagram-AJRCARHV-BMf4FL5O.js} +1 -1
- package/ui/dist/assets/stateDiagram-v2-BHNVJYJU-BpaPdu-O.js +1 -0
- package/ui/dist/assets/{theme-DxqwV6dp.js → theme-Dl8H1UuW.js} +1 -1
- package/ui/dist/assets/theme-PE9lQ4bz.css +1 -0
- package/ui/dist/assets/{timeline-definition-PNZ67QCA-DMVhUcek.js → timeline-definition-PNZ67QCA-Cf3u1h2_.js} +1 -1
- package/ui/dist/assets/{vennDiagram-CIIHVFJN-Ci0RqPuV.js → vennDiagram-CIIHVFJN-2uKO3x22.js} +1 -1
- package/ui/dist/assets/visual-editor-D1uYYGOW.js +47 -0
- package/ui/dist/assets/{wardley-L42UT6IY-Chn0BKir.js → wardley-L42UT6IY-BjLaoVtm.js} +1 -1
- package/ui/dist/assets/{wardleyDiagram-YWT4CUSO-BLdXlrC5.js → wardleyDiagram-YWT4CUSO-DlrUKeJk.js} +1 -1
- package/ui/dist/assets/{xychartDiagram-2RQKCTM6-C4gOVVe1.js → xychartDiagram-2RQKCTM6-Bzbb3cqD.js} +1 -1
- package/ui/dist/graph.html +3 -3
- package/ui/dist/index.html +4 -4
- package/ui/dist/assets/channel-C8Q7-wTd.js +0 -1
- package/ui/dist/assets/classDiagram-4FO5ZUOK-pYCdSDhT.js +0 -1
- package/ui/dist/assets/classDiagram-v2-Q7XG4LA2-pYCdSDhT.js +0 -1
- package/ui/dist/assets/editor-BcAuZsp8.js +0 -211
- package/ui/dist/assets/main-Du7fLv5Y.js +0 -244
- package/ui/dist/assets/stateDiagram-v2-BHNVJYJU-DSc6L6rE.js +0 -1
- package/ui/dist/assets/theme-BJgORXba.css +0 -1
- package/ui/dist/assets/visual-editor-UWcMGp6p.js +0 -39
package/README.md
CHANGED
|
@@ -4,7 +4,7 @@
|
|
|
4
4
|
|
|
5
5
|
## Status
|
|
6
6
|
|
|
7
|
-
**v0.
|
|
7
|
+
**v0.6.1** — live A/B eval (`repo-mind ab-eval`) for kill-switch proof on real `docs/`. Also: `create_draft` MCP, `publish --pr` (0.6.0). Prior: WYSIWYG editor, keyboard nav, domain labels.
|
|
8
8
|
|
|
9
9
|
| Artifact | Location |
|
|
10
10
|
|----------|----------|
|
|
@@ -46,9 +46,11 @@ Then ask your agent a project question — it should call `search_docs` / `get_d
|
|
|
46
46
|
| `repo-mind prepare` | Add frontmatter to legacy markdown (`--all` for batch) |
|
|
47
47
|
| `repo-mind sync-links` | Convert markdown links to wikilinks; sync `related:` |
|
|
48
48
|
| `repo-mind export` | Write `agents.md` to repo root |
|
|
49
|
+
| `repo-mind publish` | Publish active drafts to `docs/`; `--pr` opens a GitHub pull request |
|
|
49
50
|
| `repo-mind mcp` | Start the MCP stdio server |
|
|
50
51
|
| `repo-mind ui` | Confluence-style workspace over `docs/` (127.0.0.1:3847) |
|
|
51
52
|
| `npm run ab-demo` | Validate A/B demo fixture (repo checkout only) |
|
|
53
|
+
| `repo-mind ab-eval` | Live A/B eval on project `docs/` (skyforge dogfood gate) |
|
|
52
54
|
|
|
53
55
|
## Cursor skill
|
|
54
56
|
|
|
@@ -80,6 +82,7 @@ Binds **127.0.0.1** only. MCP reads published files in `docs/` only (not SQLite
|
|
|
80
82
|
- `get_doc` — fetch one doc by slug
|
|
81
83
|
- `get_glossary_term` — resolve glossary entries
|
|
82
84
|
- `explore_graph` — BFS over `related:` links
|
|
85
|
+
- `create_draft` — create a SQLite draft for human review (gated on kill-switch pass)
|
|
83
86
|
|
|
84
87
|
## CI
|
|
85
88
|
|
|
@@ -103,7 +106,7 @@ This repo is developed with [gstack](https://github.com/garrytan/gstack). Projec
|
|
|
103
106
|
|
|
104
107
|
## Roadmap
|
|
105
108
|
|
|
106
|
-
See [`docs/product/wiki/roadmap.md`](docs/product/wiki/roadmap.md) for v4.0–v4.2 phases.
|
|
109
|
+
See [`docs/product/wiki/roadmap.md`](docs/product/wiki/roadmap.md) for v4.0–v4.2 phases. v4.2 agent write shipped in 0.6.0 (gated); next: v4.0 prove closure (dogfood + hallucination scoring).
|
|
107
110
|
|
|
108
111
|
## License
|
|
109
112
|
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import type { AbArmSummary, AbQuestionComparison, HumanScoreEntry } from './types.js';
|
|
2
|
+
/** Collects one arm's per-question token counts from the comparison rows and reports them together with their median. */
export declare function summarizeArmTokens(arm: 'baseline' | 'repomind', comparisons: AbQuestionComparison[]): AbArmSummary;
|
|
3
|
+
/**
 * Evaluates the token gate. The gate passes when RepoMind's median token count is
 * strictly lower than the baseline's and RepoMind is the per-question token winner
 * on at least `passThreshold` questions (two thirds of the comparisons, rounded up).
 */
export declare function computeTokenPass(comparisons: AbQuestionComparison[], baseline: AbArmSummary, repomind: AbArmSummary): {
|
|
4
|
+
tokenPass: boolean;
|
|
5
|
+
repomindWins: number;
|
|
6
|
+
passThreshold: number;
|
|
7
|
+
};
|
|
8
|
+
/** Question categories used to group human hallucination scores. */
export type EvalCategory = 'factual' | 'synthesis' | 'glossary-adr';
|
|
9
|
+
/**
 * Maps question tags to a category: `synthesis` / `open-question` tags win first,
 * then `glossary` / `adr`; anything else (including no tags) is `factual`.
 */
export declare function categoryFromTags(tags: string[] | undefined): EvalCategory;
|
|
10
|
+
/**
 * Per-category outcome of the hallucination gate, as produced by `computeCategoryWins`.
 * The hallucination totals are sums of the human scores for the category's questions;
 * the medians are taken over the token counts of the matching comparison rows.
 * `repomindWins` is true when RepoMind's hallucination total is strictly lower and its
 * median token count is not higher than the baseline's.
 */
export interface CategoryWin {
|
|
11
|
+
category: EvalCategory;
|
|
12
|
+
questionIds: string[];
|
|
13
|
+
baselineHallucinationTotal: number;
|
|
14
|
+
repomindHallucinationTotal: number;
|
|
15
|
+
baselineMedianTokens: number;
|
|
16
|
+
repomindMedianTokens: number;
|
|
17
|
+
repomindWins: boolean;
|
|
18
|
+
}
|
|
19
|
+
/**
 * Groups human scores by question category and decides each category's winner.
 * RepoMind wins a category when its hallucination total is lower and its median
 * tokens are lower or equal.
 *
 * @param scores Human hallucination scores, one entry per scored question.
 * @param comparisons Token comparison rows; matched to scores by `questionId`.
 * @param questionsById Lookup used to read each question's tags; a question that is
 *   missing from the map is categorised as if it had no tags.
 */
|
|
20
|
+
export declare function computeCategoryWins(scores: HumanScoreEntry[], comparisons: AbQuestionComparison[], questionsById: Map<string, {
|
|
21
|
+
tags?: string[];
|
|
22
|
+
}>): CategoryWin[];
|
|
23
|
+
/** Returns true when RepoMind wins at least two of the supplied categories. */
export declare function computeHallucinationPass(categoryWins: CategoryWin[]): boolean;
|
|
24
|
+
/**
 * Combines the two gates. When `hallucinationPass` is `null` (not scored yet) the
 * result is `null` if the token gate passed and `false` if it failed; otherwise the
 * result is the logical AND of both gates.
 */
export declare function mergePassFlags(tokenPass: boolean, hallucinationPass: boolean | null): boolean | null;
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
/**
 * Median of a list of numbers, rounded to an integer for even-length lists.
 *
 * @param values Numbers to summarise; the input array is not mutated.
 * @returns 0 for an empty list, the middle element for an odd-length list, and the
 *   rounded mean of the two middle elements otherwise.
 */
function median(values) {
  const count = values.length;
  if (count === 0) {
    return 0;
  }
  // Sort a copy numerically so the caller's ordering is left untouched.
  const ascending = values.slice().sort((left, right) => left - right);
  const upper = Math.floor(count / 2);
  return count % 2 === 1
    ? ascending[upper]
    : Math.round((ascending[upper - 1] + ascending[upper]) / 2);
}
|
|
12
|
+
/**
 * Summarises the token usage of one arm across all comparison rows.
 *
 * @param arm Which arm to read: `'baseline'` selects `row.baseline.tokens`, any other
 *   value selects `row.repomind.tokens`.
 * @param comparisons Per-question comparison rows.
 * @returns The arm name, the median of its token counts and the per-question counts
 *   in row order.
 */
export function summarizeArmTokens(arm, comparisons) {
  const tokensOf = arm === 'baseline'
    ? (row) => row.baseline.tokens
    : (row) => row.repomind.tokens;
  const perQuestionTokens = comparisons.map((row) => tokensOf(row));
  return {
    arm,
    medianTokens: median(perQuestionTokens),
    perQuestionTokens,
  };
}
|
|
20
|
+
/**
 * Evaluates the token gate of the A/B eval.
 *
 * @param comparisons Per-question rows; only `tokenWinner` is read.
 * @param baseline Baseline arm summary; only `medianTokens` is read.
 * @param repomind RepoMind arm summary; only `medianTokens` is read.
 * @returns `repomindWins` (rows whose winner is `'repomind'`), `passThreshold`
 *   (two thirds of the row count, rounded up) and `tokenPass`, which requires both a
 *   strictly lower RepoMind median and at least `passThreshold` wins.
 */
export function computeTokenPass(comparisons, baseline, repomind) {
  let repomindWins = 0;
  for (const row of comparisons) {
    if (row.tokenWinner === 'repomind') {
      repomindWins += 1;
    }
  }
  const passThreshold = Math.ceil((comparisons.length * 2) / 3);
  const medianIsLower = repomind.medianTokens < baseline.medianTokens;
  const tokenPass = medianIsLower && repomindWins >= passThreshold;
  return { tokenPass, repomindWins, passThreshold };
}
|
|
27
|
+
/**
 * Derives the eval category of a question from its tags.
 *
 * @param tags Question tags, possibly missing or empty.
 * @returns `'synthesis'` when a `synthesis` or `open-question` tag is present,
 *   otherwise `'glossary-adr'` when a `glossary` or `adr` tag is present, otherwise
 *   `'factual'` (also the result for missing or empty tags).
 */
export function categoryFromTags(tags) {
  // Synthesis tags are checked first, so they take precedence over glossary/adr.
  const hasAny = (...wanted) => Boolean(tags) && tags.some((tag) => wanted.includes(tag));
  if (hasAny('synthesis', 'open-question')) {
    return 'synthesis';
  }
  if (hasAny('glossary', 'adr')) {
    return 'glossary-adr';
  }
  return 'factual';
}
|
|
39
|
+
/**
 * Groups human scores by question category and decides, per category, whether
 * RepoMind wins. RepoMind wins a category when its hallucination total is lower and
 * its median tokens are lower or equal.
 *
 * @param scores Human score entries; `questionId`, `baseline` and `repomind` are read.
 * @param comparisons Token comparison rows, matched to scores by `questionId`.
 * @param questionsById Lookup for each question's tags; a question missing from the
 *   map is categorised as if it had no tags.
 * @returns One entry per category that has at least one score, in the order the
 *   categories were first encountered.
 */
export function computeCategoryWins(scores, comparisons, questionsById) {
  // Bucket the scores by the category derived from the question's tags.
  const byCategory = new Map();
  for (const score of scores) {
    const category = categoryFromTags(questionsById.get(score.questionId)?.tags);
    const bucket = byCategory.get(category);
    if (bucket) {
      bucket.push(score);
    } else {
      byCategory.set(category, [score]);
    }
  }
  const results = [];
  for (const [category, categoryScores] of byCategory) {
    const questionIds = categoryScores.map((s) => s.questionId);
    // A Set gives constant-time membership checks instead of rescanning the id list
    // for every comparison row.
    const idLookup = new Set(questionIds);
    let baselineHallucinationTotal = 0;
    let repomindHallucinationTotal = 0;
    for (const entry of categoryScores) {
      baselineHallucinationTotal += entry.baseline;
      repomindHallucinationTotal += entry.repomind;
    }
    const compRows = comparisons.filter((c) => idLookup.has(c.questionId));
    // Each median is computed once and reused for both the verdict and the report.
    const baselineMedianTokens = median(compRows.map((c) => c.baseline.tokens));
    const repomindMedianTokens = median(compRows.map((c) => c.repomind.tokens));
    const repomindWins = repomindHallucinationTotal < baselineHallucinationTotal &&
      repomindMedianTokens <= baselineMedianTokens;
    results.push({
      category,
      questionIds,
      baselineHallucinationTotal,
      repomindHallucinationTotal,
      baselineMedianTokens,
      repomindMedianTokens,
      repomindWins,
    });
  }
  return results;
}
|
|
71
|
+
/**
 * Evaluates the hallucination gate.
 *
 * @param categoryWins Per-category outcomes; only `repomindWins` is read.
 * @returns true when at least two categories are present and RepoMind wins at least
 *   two of them.
 */
export function computeHallucinationPass(categoryWins) {
  if (categoryWins.length < 2) {
    return false;
  }
  let won = 0;
  for (const row of categoryWins) {
    if (row.repomindWins) {
      won += 1;
    }
  }
  return won >= 2;
}
|
|
75
|
+
/**
 * Merges the token gate and the hallucination gate into the overall verdict.
 *
 * @param tokenPass Result of the token gate.
 * @param hallucinationPass Result of the hallucination gate, or `null` when it has
 *   not been scored.
 * @returns With an unscored hallucination gate: `null` if the token gate passed,
 *   `false` if it failed. Otherwise the logical AND of both gates.
 */
export function mergePassFlags(tokenPass, hallucinationPass) {
  const hallucinationScored = hallucinationPass !== null;
  if (hallucinationScored) {
    return tokenPass && hallucinationPass;
  }
  // Unscored: a failed token gate is already final, a passed one is still pending.
  if (tokenPass) {
    return null;
  }
  return false;
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
import type { DocIndex } from '../index/doc-index.js';
|
|
2
|
+
import type { AbQuestion, LiveArmAnswer } from './types.js';
|
|
3
|
+
/**
 * Build a grounded answer excerpt from baseline grep-then-read retrieval: up to five
 * docs whose title, tags or body contain at least one prompt term, falling back to the
 * first five docs of the index when nothing matches.
 */
|
|
4
|
+
export declare function buildBaselineAnswer(index: DocIndex, question: AbQuestion): LiveArmAnswer;
|
|
5
|
+
/**
 * Build a grounded answer excerpt from RepoMind search_docs + get_doc retrieval: the
 * prompt is used as the search query and the top three hits are fetched in full.
 */
|
|
6
|
+
export declare function buildRepomindAnswer(index: DocIndex, question: AbQuestion): LiveArmAnswer;
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
import { getDoc } from '../tools/get-doc.js';
|
|
2
|
+
import { searchDocs } from '../tools/search-docs.js';
|
|
3
|
+
import { runArmBaseline } from './arm-baseline.js';
|
|
4
|
+
import { runArmRepomind } from './arm-repomind.js';
|
|
5
|
+
import { BASELINE_CLAUDE_SNIPPET, mcpToolSchemaTokenEstimate, } from './session-overhead.js';
|
|
6
|
+
import { estimateTokens } from './estimate-tokens.js';
|
|
7
|
+
// Default maximum length (in UTF-16 code units) of a document excerpt in an answer.
const SNIPPET_CHARS = 1200;
|
|
8
|
+
// Number of top search hits the RepoMind arm fetches in full with getDoc.
const GET_DOC_LIMIT = 3;
|
|
9
|
+
/**
 * Trims a document body and shortens it to at most `maxChars` UTF-16 code units,
 * appending an ellipsis when text was cut.
 *
 * @param body Raw document body.
 * @param maxChars Maximum excerpt length before the ellipsis; defaults to SNIPPET_CHARS.
 * @returns The trimmed body when it fits, otherwise its prefix followed by `…`.
 */
function excerpt(body, maxChars = SNIPPET_CHARS) {
  const trimmed = body.trim();
  if (trimmed.length <= maxChars) {
    return trimmed;
  }
  let end = maxChars;
  // If the cut would land between the two halves of a surrogate pair (e.g. an emoji),
  // drop the dangling high surrogate instead of emitting a lone, invalid code unit.
  const lastUnit = trimmed.charCodeAt(end - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    end -= 1;
  }
  return `${trimmed.slice(0, end)}…`;
}
|
|
16
|
+
/**
 * Splits a question prompt into lower-cased search terms.
 *
 * Terms are maximal runs of Unicode letters, combining marks and digits, so accented
 * and non-Latin words stay intact instead of being broken apart at every non-ASCII
 * character. For ASCII-only prompts the result is the same as splitting on
 * `[^a-z0-9]+`.
 *
 * @param prompt Question text.
 * @returns Terms of at least three UTF-16 code units, in prompt order (duplicates kept).
 */
function queryTerms(prompt) {
  return prompt
    .toLowerCase()
    .split(/[^\p{L}\p{M}\p{N}]+/u)
    .filter((term) => term.length >= 3);
}
|
|
22
|
+
/**
 * Checks whether a document mentions any of the given terms.
 *
 * @param doc Document with `title`, `tags` (array) and `body`.
 * @param terms Search terms; they are compared against lower-cased text, so they are
 *   expected to be lower-case already.
 * @returns true when `terms` is empty, or when at least one term occurs as a substring
 *   of the document's title, tags or body.
 */
function docMatchesTerms(doc, terms) {
  if (terms.length === 0) {
    return true;
  }
  const searchable = `${doc.title}\n${doc.tags.join(' ')}\n${doc.body}`.toLowerCase();
  for (const term of terms) {
    if (searchable.includes(term)) {
      return true;
    }
  }
  return false;
}
|
|
29
|
+
/**
 * Build a grounded answer excerpt from baseline grep-then-read retrieval.
 *
 * Docs whose title, tags or body contain at least one prompt term are kept (at most
 * five). When nothing matches, the first five docs of the index are used and the
 * strategy is reported as `read-all`. The token count is taken from `runArmBaseline`,
 * which receives a per-question share of the session overhead (the baseline snippet
 * plus the doc listing, divided by the number of docs).
 *
 * @param index Document index; `refresh()` is called to obtain the docs.
 * @param question Question with `id` and `prompt`.
 * @returns The baseline arm's answer record for the question.
 */
export function buildBaselineAnswer(index, question) {
  const docs = index.refresh();
  const terms = queryTerms(question.prompt);
  const grepHits = docs.filter((doc) => docMatchesTerms(doc, terms));
  const nothingMatched = grepHits.length === 0;
  const strategy = nothingMatched ? 'read-all' : 'grep-then-read';
  const matched = (nothingMatched ? docs : grepHits).slice(0, 5);
  const listing = docs.map((doc) => doc.relativePath).join('\n');
  const sessionOverhead = estimateTokens(BASELINE_CLAUDE_SNIPPET) + estimateTokens(listing);
  const perQuestionOverhead = Math.ceil(sessionOverhead / Math.max(docs.length, 1));
  const simulated = runArmBaseline(index, question, perQuestionOverhead);
  const sections = matched.map((doc) => `## ${doc.title || doc.slug} (${doc.slug})\n\n${excerpt(doc.body)}`);
  return {
    arm: 'baseline',
    questionId: question.id,
    answer: sections.join('\n\n---\n\n'),
    retrievedSlugs: matched.map((doc) => doc.slug),
    tokens: simulated.tokens,
    filesRead: matched.length,
    strategy,
    searchHits: matched.length,
    docsFetched: matched.length,
  };
}
|
|
62
|
+
/**
 * Build a grounded answer excerpt from RepoMind search_docs + get_doc retrieval.
 *
 * The prompt is used as the search query; the top GET_DOC_LIMIT hits are fetched with
 * `getDoc` and every doc that was found with a non-empty body contributes an excerpt.
 * The token count is taken from `runArmRepomind`, which receives a per-question share
 * of the MCP tool-schema overhead (divided by the number of docs).
 *
 * @param index Document index; `refresh()` is called to obtain the docs.
 * @param question Question with `id` and `prompt`.
 * @returns The RepoMind arm's answer record. `retrievedSlugs` and `filesRead` cover
 *   every slug that was requested, `docsFetched` only those that produced an excerpt.
 */
export function buildRepomindAnswer(index, question) {
  const docs = index.refresh();
  const sessionOverhead = mcpToolSchemaTokenEstimate();
  const perQuestionOverhead = Math.ceil(sessionOverhead / Math.max(docs.length, 1));
  const simulated = runArmRepomind(index, question, perQuestionOverhead);
  const hits = searchDocs(index, { query: question.prompt });
  const slugsToFetch = hits.slice(0, GET_DOC_LIMIT).map((hit) => hit.slug);
  const sections = slugsToFetch
    .map((slug) => getDoc(index, slug))
    .filter((doc) => doc.found && doc.body)
    .map((doc) => {
      // Prefer the frontmatter title; fall back to the slug when it is not a string.
      const heading = typeof doc.frontmatter?.title === 'string' ? doc.frontmatter.title : doc.slug;
      return `## ${heading} (${doc.slug})\n\n${excerpt(doc.body)}`;
    });
  const answer = sections.length > 0
    ? sections.join('\n\n---\n\n')
    : '_No matching documents retrieved._';
  return {
    arm: 'repomind',
    questionId: question.id,
    answer,
    retrievedSlugs: slugsToFetch,
    tokens: simulated.tokens,
    filesRead: slugsToFetch.length,
    strategy: 'search-then-get_doc',
    searchHits: hits.length,
    docsFetched: sections.length,
  };
}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
import { DocIndex } from '../index/doc-index.js';
|
|
2
|
+
import type { HumanScoreEntry, LiveEvalResult, LiveQuestionResult } from './types.js';
|
|
3
|
+
import type { AbQuestion } from './types.js';
|
|
4
|
+
/**
 * Options for a live A/B eval run.
 * - `cwd`: corpus root; resolved to an absolute path and used to build the doc index.
 * - `questionsFile`: path handed to the question loader.
 * - `outputPath`: where the JSON result is written; the blind review pack path is
 *   derived from it.
 * - `dryRun`: stop after validating the questions against the corpus.
 * - `baselineTranscript` / `repomindTranscript`: optional transcript files whose summed
 *   token usage replaces the simulated token count of the respective arm.
 */
export interface RunLiveEvalOptions {
|
|
5
|
+
cwd: string;
|
|
6
|
+
questionsFile: string;
|
|
7
|
+
outputPath: string;
|
|
8
|
+
dryRun?: boolean;
|
|
9
|
+
baselineTranscript?: string;
|
|
10
|
+
repomindTranscript?: string;
|
|
11
|
+
}
|
|
12
|
+
/**
 * Builds both arms' answers for every question, applies transcript token counts when
 * transcripts are given, and records the per-question token winner and blind labels.
 */
export declare function runLiveEvalQuestions(index: DocIndex, questions: AbQuestion[], options: Pick<RunLiveEvalOptions, 'baselineTranscript' | 'repomindTranscript'>): LiveQuestionResult[];
|
|
13
|
+
/**
 * Writes the blind review pack (markdown) to `outputPath`, creating parent directories
 * as needed. Each question lists its prompt followed by "Answer A" and "Answer B",
 * assigned to the arms according to the row's blind labels.
 */
export declare function exportBlindPack(live: LiveQuestionResult[], outputPath: string, meta: {
|
|
14
|
+
corpusCwd: string;
|
|
15
|
+
runAt: string;
|
|
16
|
+
}): void;
|
|
17
|
+
/**
 * Assembles the eval result: token summaries and the token gate are always computed;
 * category wins and the hallucination gate only when non-empty `humanScores` are given.
 */
export declare function buildLiveEvalResult(options: RunLiveEvalOptions, questionsFile: AbQuestion[], questionsVersion: number, live: LiveQuestionResult[], humanScores?: HumanScoreEntry[] | null): LiveEvalResult;
|
|
18
|
+
/**
 * Runs the live eval end to end and writes the JSON result plus the blind review pack.
 * Returns a process exit code: 1 when a question references a missing anchor slug,
 * 0 otherwise (including dry runs).
 */
export declare function runLiveEval(options: RunLiveEvalOptions): number;
|
|
19
|
+
/**
 * Re-evaluates a stored live eval result with human scores and writes it to
 * `outputPath` (or back to `resultsPath`). The original `runAt` and `blindPackPath`
 * are preserved. Throws when the file is not a live eval output.
 */
export declare function mergeHumanScores(resultsPath: string, scores: HumanScoreEntry[], outputPath?: string): LiveEvalResult;
|
|
@@ -0,0 +1,192 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import { DocIndex } from '../index/doc-index.js';
|
|
4
|
+
import { computeCategoryWins, computeHallucinationPass, computeTokenPass, mergePassFlags, summarizeArmTokens, } from './compute-pass.js';
|
|
5
|
+
import { loadQuestions } from './load-questions.js';
|
|
6
|
+
import { buildBaselineAnswer, buildRepomindAnswer } from './live-answer.js';
|
|
7
|
+
import { parseTranscriptTokensSync } from './record-transcript.js';
|
|
8
|
+
import { validateQuestionsAgainstIndex } from './validate-questions.js';
|
|
9
|
+
/**
 * Names the arm that used fewer tokens for a question.
 *
 * @param baseline Baseline token count.
 * @param repomind RepoMind token count.
 * @returns `'repomind'` or `'baseline'` for the strictly smaller count, `'tie'` otherwise.
 */
function tokenWinner(baseline, repomind) {
  return repomind < baseline
    ? 'repomind'
    : repomind > baseline
      ? 'baseline'
      : 'tie';
}
|
|
18
|
+
/**
 * Chooses which arm is shown as "A" and which as "B" in the blind review pack.
 *
 * @param seed Integer seed; callers pass the question's position.
 * @returns Baseline is "A" for even seeds and "B" for odd seeds; RepoMind gets the
 *   other letter.
 */
function assignBlindLabels(seed) {
  const baselineFirst = seed % 2 === 0;
  const [baseline, repomind] = baselineFirst ? ['A', 'B'] : ['B', 'A'];
  return { baseline, repomind };
}
|
|
23
|
+
/**
 * Replaces an answer's simulated token count with the total parsed from a transcript,
 * when one is supplied and yields usable numbers. Mutates `answer` in place.
 *
 * @param answer Arm answer record; `tokens` may be overwritten and `transcriptSource`
 *   is always set to `'transcript'` or `'simulated'`.
 * @param transcriptPath Optional transcript file path; falsy means "no transcript".
 */
function applyTranscriptTokens(answer, transcriptPath) {
  const usage = transcriptPath ? parseTranscriptTokensSync(transcriptPath) : null;
  // Only trust the transcript when it was readable and reported a positive total.
  const measured = usage !== null && usage.source === 'transcript' && usage.totalTokens > 0;
  if (measured) {
    answer.tokens = usage.totalTokens;
  }
  answer.transcriptSource = measured ? 'transcript' : 'simulated';
}
|
|
37
|
+
/**
 * Reduces a live question result to the token-comparison row used by the pass
 * computations, dropping answer text and other fields that are not copied here.
 *
 * @param row Live result for one question.
 * @returns A comparison row carrying the identifying fields, the per-arm token
 *   metrics and the token winner.
 */
function toComparison(row) {
  const { questionId, prompt, anchorSlugs, baseline: baselineArm, repomind: repomindArm } = row;
  const baseline = {
    arm: 'baseline',
    questionId,
    tokens: baselineArm.tokens,
    filesRead: baselineArm.filesRead,
    strategy: baselineArm.strategy,
  };
  const repomind = {
    arm: 'repomind',
    questionId,
    tokens: repomindArm.tokens,
    searchHits: repomindArm.searchHits,
    docsFetched: repomindArm.docsFetched,
  };
  return { questionId, prompt, anchorSlugs, baseline, repomind, tokenWinner: row.tokenWinner };
}
|
|
59
|
+
/**
 * Runs both arms for every question.
 *
 * For each question the baseline and RepoMind answers are built, transcript token
 * counts are applied when transcripts are configured, and the token winner and the
 * blind labels (alternating with the question's position) are recorded.
 *
 * @param index Document index shared by both arms.
 * @param questions Questions to evaluate, in output order.
 * @param options Optional `baselineTranscript` / `repomindTranscript` paths.
 * @returns One live result per question.
 */
export function runLiveEvalQuestions(index, questions, options) {
  const evaluate = (question, position) => {
    const baseline = buildBaselineAnswer(index, question);
    const repomind = buildRepomindAnswer(index, question);
    // Transcript counts, when available, override the simulated counts before the
    // winner is decided.
    applyTranscriptTokens(baseline, options.baselineTranscript);
    applyTranscriptTokens(repomind, options.repomindTranscript);
    return {
      questionId: question.id,
      prompt: question.prompt,
      anchorSlugs: question.anchorSlugs,
      tags: question.tags,
      baseline,
      repomind,
      tokenWinner: tokenWinner(baseline.tokens, repomind.tokens),
      blindLabels: assignBlindLabels(position),
    };
  };
  return questions.map((question, position) => evaluate(question, position));
}
|
|
78
|
+
/**
 * Writes the blind review pack as markdown.
 *
 * The pack starts with a header (generation time, corpus path, scoring instructions)
 * followed by one section per question containing the prompt, "Answer A" and
 * "Answer B". The row's `blindLabels` decide which arm appears under which letter;
 * rows without labels show the baseline as A.
 *
 * @param live Live results, one per question.
 * @param outputPath Destination file; parent directories are created if missing.
 * @param meta `runAt` and `corpusCwd` values printed in the header.
 */
export function exportBlindPack(live, outputPath, meta) {
  const header = [
    '# RepoMind live eval — blind review pack',
    '',
    `Generated: ${meta.runAt}`,
    `Corpus: ${meta.corpusCwd}`,
    '',
    'Score each answer 0–3 using `ab-demo/score-hallucination.md`.',
    'Do not look up slugs until scoring is complete.',
    '',
  ];
  const sections = live.flatMap((row) => {
    const labels = row.blindLabels ?? { baseline: 'A', repomind: 'B' };
    const answerA = labels.baseline === 'A' ? row.baseline.answer : row.repomind.answer;
    const answerB = labels.baseline === 'B' ? row.baseline.answer : row.repomind.answer;
    return [
      `## ${row.questionId}`, '', `**Prompt:** ${row.prompt}`, '',
      `### Answer A`, '', answerA, '',
      `### Answer B`, '', answerB, '',
      '---', '',
    ];
  });
  fs.mkdirSync(path.dirname(outputPath), { recursive: true });
  fs.writeFileSync(outputPath, [...header, ...sections].join('\n'), 'utf8');
}
|
|
101
|
+
/**
 * Assembles the live eval result from the per-question outcomes.
 *
 * The token summaries and the token gate are always computed. Category wins and the
 * hallucination gate are computed only when `humanScores` contains at least one
 * entry; otherwise `hallucinationPass` stays `null` and `categoryWins` is undefined.
 *
 * @param options Run options; `cwd` (resolved to an absolute path) and `questionsFile`
 *   are copied into the result.
 * @param questionsFile The loaded questions (used to look up tags by question id).
 * @param questionsVersion Version of the questions payload, copied into the result.
 * @param live Per-question live results.
 * @param humanScores Human hallucination scores, or `null` when not scored yet.
 * @returns The result object, stamped with the current time as `runAt`.
 */
export function buildLiveEvalResult(options, questionsFile, questionsVersion, live, humanScores = null) {
  const comparisons = live.map(toComparison);
  const baseline = summarizeArmTokens('baseline', comparisons);
  const repomind = summarizeArmTokens('repomind', comparisons);
  const { tokenPass, repomindWins, passThreshold } = computeTokenPass(comparisons, baseline, repomind);
  const questionsById = new Map(questionsFile.map((q) => [q.id, q]));
  let hallucinationPass = null;
  let categoryWins;
  if (humanScores && humanScores.length > 0) {
    const wins = computeCategoryWins(humanScores, comparisons, questionsById);
    // The stored rows omit the question ids and median token fields.
    categoryWins = wins.map((row) => ({
      category: row.category,
      repomindWins: row.repomindWins,
      baselineHallucinationTotal: row.baselineHallucinationTotal,
      repomindHallucinationTotal: row.repomindHallucinationTotal,
    }));
    hallucinationPass = computeHallucinationPass(wins);
  }
  const pass = mergePassFlags(tokenPass, hallucinationPass);
  return {
    runAt: new Date().toISOString(),
    evalKind: 'live',
    corpusCwd: path.resolve(options.cwd),
    questionsFile: options.questionsFile,
    questionsVersion,
    comparisons,
    live,
    baseline,
    repomind,
    repomindTokenWins: repomindWins,
    passThreshold,
    tokenPass,
    humanScores,
    hallucinationPass,
    categoryWins,
    pass,
    // Keyed on whether the hallucination gate was actually computed, so an empty
    // score list is not reported as an evaluated gate.
    note: hallucinationPass === null
      ? 'Token comparison complete. Add humanScores and re-run with --record-scores to compute hallucinationPass.'
      : 'Token and hallucination gates evaluated.',
  };
}
|
|
142
|
+
/**
 * Runs the live A/B eval end to end.
 *
 * Steps: load the questions, validate their anchor slugs against the corpus index,
 * run both arms for every question (unless `dryRun`), then write the JSON result to
 * `options.outputPath` and the blind review pack next to it. Progress and the token
 * gate summary are printed to the console.
 *
 * @param options Run options (see `RunLiveEvalOptions`).
 * @returns A process exit code: 1 when any question references a missing anchor slug,
 *   0 otherwise (including dry runs).
 */
export function runLiveEval(options) {
  const cwd = path.resolve(options.cwd);
  const questionsPayload = loadQuestions(options.questionsFile);
  const index = new DocIndex(cwd);
  const report = validateQuestionsAgainstIndex(index, questionsPayload.questions, cwd);
  if (report.missingAnchors.length > 0) {
    for (const missing of report.missingAnchors) {
      console.error(`missing anchor: question=${missing.questionId} slug=${missing.slug}`);
    }
    return 1;
  }
  console.log(`corpus ok: ${report.docCount} docs, ${report.questionCount} questions (${cwd})`);
  if (options.dryRun) {
    console.log('dry-run complete — eval arms not executed');
    return 0;
  }
  const live = runLiveEvalQuestions(index, questionsPayload.questions, options);
  const result = buildLiveEvalResult(options, questionsPayload.questions, questionsPayload.version, live, null);
  // Derive the blind-pack path from the result path. When the result path has no
  // `.json` suffix a plain replace would return it unchanged, and the JSON write
  // below would overwrite the blind pack; append the suffix in that case instead.
  const jsonSuffix = /\.json$/i;
  const blindPath = jsonSuffix.test(options.outputPath)
    ? options.outputPath.replace(jsonSuffix, '-blind.md')
    : `${options.outputPath}-blind.md`;
  exportBlindPack(live, blindPath, { corpusCwd: cwd, runAt: result.runAt });
  result.blindPackPath = blindPath;
  fs.mkdirSync(path.dirname(options.outputPath), { recursive: true });
  fs.writeFileSync(options.outputPath, `${JSON.stringify(result, null, 2)}\n`, 'utf8');
  console.log(`wrote ${options.outputPath}`);
  console.log(`wrote ${blindPath}`);
  console.log(`tokenPass: ${result.tokenPass} (repomind wins ${result.repomindTokenWins}/${result.comparisons.length}, median ${result.repomind.medianTokens} vs ${result.baseline.medianTokens})`);
  console.log('Next: blind-score answers, then `repo-mind ab-eval --record-scores <results.json>`');
  return 0;
}
|
|
171
|
+
/**
 * Re-evaluates a stored live eval result with human hallucination scores.
 *
 * The questions are reconstructed from the stored `live` rows, the result is rebuilt
 * with the scores, and the original `runAt` and `blindPackPath` are carried over
 * before the merged result is written.
 *
 * @param resultsPath Path of the JSON file written by a live eval run.
 * @param scores Human score entries to merge.
 * @param outputPath Optional destination; defaults to overwriting `resultsPath`.
 * @returns The merged result.
 * @throws Error when the file's content is not a live eval output (not an object,
 *   `evalKind` is not `'live'`, or `live` is not an array).
 */
export function mergeHumanScores(resultsPath, scores, outputPath) {
  const raw = JSON.parse(fs.readFileSync(resultsPath, 'utf8'));
  // Reject anything whose `live` is not an array up front, so malformed files produce
  // the descriptive error instead of a TypeError from `.map`.
  const isLiveResult = raw !== null &&
    typeof raw === 'object' &&
    raw.evalKind === 'live' &&
    Array.isArray(raw.live);
  if (!isLiveResult) {
    throw new Error('results file is not a live eval output');
  }
  const dest = outputPath ?? resultsPath;
  const questions = raw.live.map((row) => ({
    id: row.questionId,
    prompt: row.prompt,
    anchorSlugs: row.anchorSlugs,
    tags: row.tags,
  }));
  const merged = buildLiveEvalResult({
    cwd: raw.corpusCwd,
    questionsFile: raw.questionsFile,
    outputPath: dest,
  }, questions, raw.questionsVersion, raw.live, scores);
  // Keep the identity of the original run rather than the re-evaluation time.
  merged.blindPackPath = raw.blindPackPath;
  merged.runAt = raw.runAt;
  fs.writeFileSync(dest, `${JSON.stringify(merged, null, 2)}\n`, 'utf8');
  return merged;
}
|
package/dist/ab-demo/paths.d.ts
CHANGED
|
@@ -3,3 +3,4 @@ export declare function corpusPath(abDemoRoot: string): string;
|
|
|
3
3
|
export declare function questionsPath(abDemoRoot: string): string;
|
|
4
4
|
export declare function resultsDir(abDemoRoot: string): string;
|
|
5
5
|
/** Path of the hallucination scoring rubric (`score-hallucination.md`) inside the A/B demo root. */
export declare function scoreRubricPath(abDemoRoot: string): string;
|
|
6
|
+
/** Path of the `skyforge-questions.json` question set inside the A/B demo root. */
export declare function skyforgeQuestionsPath(abDemoRoot: string): string;
|
package/dist/ab-demo/paths.js
CHANGED
|
@@ -29,3 +29,6 @@ export function resultsDir(abDemoRoot) {
|
|
|
29
29
|
/**
 * Locates the hallucination scoring rubric.
 *
 * @param abDemoRoot Root directory of the A/B demo.
 * @returns `abDemoRoot` joined with `score-hallucination.md`.
 */
export function scoreRubricPath(abDemoRoot) {
  const rubricFile = 'score-hallucination.md';
  return path.join(abDemoRoot, rubricFile);
}
|
|
32
|
+
/**
 * Locates the skyforge question set.
 *
 * @param abDemoRoot Root directory of the A/B demo.
 * @returns `abDemoRoot` joined with `skyforge-questions.json`.
 */
export function skyforgeQuestionsPath(abDemoRoot) {
  const questionsFile = 'skyforge-questions.json';
  return path.join(abDemoRoot, questionsFile);
}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
 * Token usage summed over a transcript file. `totalTokens` is the sum of input and
 * output tokens. `source` is `'unavailable'` when the file does not exist or no usage
 * was found in it.
 */
export interface TranscriptTokenUsage {
|
|
2
|
+
inputTokens: number;
|
|
3
|
+
outputTokens: number;
|
|
4
|
+
totalTokens: number;
|
|
5
|
+
source: 'transcript' | 'unavailable';
|
|
6
|
+
}
|
|
7
|
+
/**
 * Sum token usage from agent transcript JSONL (best-effort). The file is streamed line
 * by line; lines that are not valid JSON are skipped. Resolves with zero counts and
 * `source: 'unavailable'` when the file is missing or contains no usage.
 */
|
|
8
|
+
export declare function parseTranscriptTokens(filePath: string): Promise<TranscriptTokenUsage>;
|
|
9
|
+
/** Synchronous variant of `parseTranscriptTokens`: reads the whole file, then scans it line by line. */
export declare function parseTranscriptTokensSync(filePath: string): TranscriptTokenUsage;
|
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import readline from 'node:readline';
|
|
3
|
+
/**
 * Adds one usage record to the running totals. Mutates `target`.
 *
 * Both naming schemes are understood: `input_tokens` / `output_tokens` take
 * precedence, `prompt_tokens` / `completion_tokens` are the fallback. A field that is
 * not a number counts as 0.
 *
 * @param target Accumulator with `inputTokens`, `outputTokens` and `totalTokens`.
 * @param usage Usage record read from a transcript entry.
 */
function addUsage(target, usage) {
  const firstNumber = (...candidates) => candidates.find((value) => typeof value === 'number') ?? 0;
  const input = firstNumber(usage.input_tokens, usage.prompt_tokens);
  const output = firstNumber(usage.output_tokens, usage.completion_tokens);
  target.inputTokens += input;
  target.outputTokens += output;
  target.totalTokens += input + output;
}
|
|
18
|
+
/**
 * Adds every usage record found on a parsed transcript entry to the running totals.
 *
 * Three places are checked, in this order: `value.usage`, `value.message.usage` and
 * `value.metadata.usage`. Each one that is an object is added, so an entry carrying
 * usage in more than one place contributes more than once.
 *
 * @param value Parsed JSON value of one transcript line; non-objects are ignored.
 * @param target Accumulator passed on to `addUsage`.
 */
function extractUsageFromObject(value, target) {
  if (!value || typeof value !== 'object') {
    return;
  }
  const holders = [value, value.message, value.metadata];
  for (const holder of holders) {
    const holderIsObject = Boolean(holder) && typeof holder === 'object';
    if (holderIsObject && holder.usage && typeof holder.usage === 'object') {
      addUsage(target, holder.usage);
    }
  }
}
|
|
39
|
+
/** Sum token usage from agent transcript JSONL (best-effort). */
|
|
40
|
+
export async function parseTranscriptTokens(filePath) {
|
|
41
|
+
if (!fs.existsSync(filePath)) {
|
|
42
|
+
return { inputTokens: 0, outputTokens: 0, totalTokens: 0, source: 'unavailable' };
|
|
43
|
+
}
|
|
44
|
+
const usage = {
|
|
45
|
+
inputTokens: 0,
|
|
46
|
+
outputTokens: 0,
|
|
47
|
+
totalTokens: 0,
|
|
48
|
+
source: 'transcript',
|
|
49
|
+
};
|
|
50
|
+
const stream = fs.createReadStream(filePath, { encoding: 'utf8' });
|
|
51
|
+
const rl = readline.createInterface({ input: stream, crlfDelay: Infinity });
|
|
52
|
+
for await (const line of rl) {
|
|
53
|
+
const trimmed = line.trim();
|
|
54
|
+
if (!trimmed) {
|
|
55
|
+
continue;
|
|
56
|
+
}
|
|
57
|
+
try {
|
|
58
|
+
const parsed = JSON.parse(trimmed);
|
|
59
|
+
extractUsageFromObject(parsed, usage);
|
|
60
|
+
}
|
|
61
|
+
catch {
|
|
62
|
+
// skip non-JSON lines
|
|
63
|
+
}
|
|
64
|
+
}
|
|
65
|
+
if (usage.totalTokens === 0) {
|
|
66
|
+
usage.source = 'unavailable';
|
|
67
|
+
}
|
|
68
|
+
return usage;
|
|
69
|
+
}
|
|
70
|
+
export function parseTranscriptTokensSync(filePath) {
|
|
71
|
+
if (!fs.existsSync(filePath)) {
|
|
72
|
+
return { inputTokens: 0, outputTokens: 0, totalTokens: 0, source: 'unavailable' };
|
|
73
|
+
}
|
|
74
|
+
const usage = {
|
|
75
|
+
inputTokens: 0,
|
|
76
|
+
outputTokens: 0,
|
|
77
|
+
totalTokens: 0,
|
|
78
|
+
source: 'transcript',
|
|
79
|
+
};
|
|
80
|
+
const content = fs.readFileSync(filePath, 'utf8');
|
|
81
|
+
for (const line of content.split('\n')) {
|
|
82
|
+
const trimmed = line.trim();
|
|
83
|
+
if (!trimmed) {
|
|
84
|
+
continue;
|
|
85
|
+
}
|
|
86
|
+
try {
|
|
87
|
+
extractUsageFromObject(JSON.parse(trimmed), usage);
|
|
88
|
+
}
|
|
89
|
+
catch {
|
|
90
|
+
// skip
|
|
91
|
+
}
|
|
92
|
+
}
|
|
93
|
+
if (usage.totalTokens === 0) {
|
|
94
|
+
usage.source = 'unavailable';
|
|
95
|
+
}
|
|
96
|
+
return usage;
|
|
97
|
+
}
|
package/dist/ab-demo/types.d.ts
CHANGED
|
@@ -63,3 +63,60 @@ export interface AbRunResult {
|
|
|
63
63
|
arms: AbArmsRunResult;
|
|
64
64
|
pass: boolean | null;
|
|
65
65
|
}
|
|
66
|
+
/**
 * One score entry for a question, with a numeric value per arm.
 *
 * NOTE(review): the scale and direction of the scores are not defined in
 * this declaration — confirm against the code that produces and reads them.
 */
export interface HumanScoreEntry {
    /** Identifier of the question this entry belongs to. */
    questionId: string;
    /** Score recorded for the `baseline` arm. */
    baseline: number;
    /** Score recorded for the `repomind` arm. */
    repomind: number;
    /** Optional reviewer label. */
    reviewer?: string;
    /** Optional free-text notes. */
    notes?: string;
}
|
|
73
|
+
/**
 * Answer produced by a single arm for a single question, together with the
 * counters recorded for it.
 *
 * NOTE(review): how each counter is measured is not visible in this
 * declaration — confirm against the live-eval implementation.
 */
export interface LiveArmAnswer {
    /** Which arm produced the answer. */
    arm: 'baseline' | 'repomind';
    /** Identifier of the question that was answered. */
    questionId: string;
    /** Answer text. */
    answer: string;
    /** Slugs recorded as retrieved for this answer. */
    retrievedSlugs: string[];
    /** Token count recorded for this answer. */
    tokens: number;
    /** Number of files recorded as read. */
    filesRead: number;
    /** Label of the strategy used. */
    strategy: string;
    /** Number of search hits recorded. */
    searchHits: number;
    /** Number of docs recorded as fetched. */
    docsFetched: number;
    /** Optional marker saying whether the figures are `'transcript'` or `'simulated'`. */
    transcriptSource?: 'transcript' | 'simulated';
}
|
|
85
|
+
/**
 * Outcome for one question: the answers from both arms plus the token
 * comparison between them.
 */
export interface LiveQuestionResult {
    /** Identifier of the question. */
    questionId: string;
    /** Prompt text of the question. */
    prompt: string;
    /** Anchor slugs attached to the question. */
    anchorSlugs: string[];
    /** Optional tags attached to the question. */
    tags?: string[];
    /** Answer from the `baseline` arm. */
    baseline: LiveArmAnswer;
    /** Answer from the `repomind` arm. */
    repomind: LiveArmAnswer;
    /** Which arm won the token comparison, or `'tie'`. */
    tokenWinner: 'baseline' | 'repomind' | 'tie';
    /** Optional mapping from each arm to the label `'A'` or `'B'`. */
    blindLabels?: {
        baseline: 'A' | 'B';
        repomind: 'A' | 'B';
    };
}
|
|
98
|
+
/**
 * Result document of a live evaluation run.
 *
 * NOTE(review): the rules that derive the pass flags from the scores are not
 * visible in this declaration — confirm against the live-eval implementation.
 */
export interface LiveEvalResult {
    /** When the run took place, as a string. */
    runAt: string;
    /** Discriminator; always `'live'` for this result shape. */
    evalKind: 'live';
    /** Working directory of the corpus used for the run. */
    corpusCwd: string;
    /** Questions file used for the run. */
    questionsFile: string;
    /** Version number of the questions file. */
    questionsVersion: number;
    /** Per-question comparisons. */
    comparisons: AbQuestionComparison[];
    /** Per-question live results. */
    live: LiveQuestionResult[];
    /** Summary for the `baseline` arm. */
    baseline: AbArmSummary;
    /** Summary for the `repomind` arm. */
    repomind: AbArmSummary;
    /** Count of token wins for the `repomind` arm. */
    repomindTokenWins: number;
    /** Threshold used for the pass decision. */
    passThreshold: number;
    /** Whether the token criterion passed. */
    tokenPass: boolean;
    /** Human score entries, or `null` when there are none. */
    humanScores: HumanScoreEntry[] | null;
    /** Hallucination criterion outcome, or `null` when not determined. */
    hallucinationPass: boolean | null;
    /** Optional per-category breakdown. */
    categoryWins?: {
        category: string;
        repomindWins: boolean;
        baselineHallucinationTotal: number;
        repomindHallucinationTotal: number;
    }[];
    /** Overall outcome, or `null` when not determined. */
    pass: boolean | null;
    /** Optional path of the blind pack. */
    blindPackPath?: string;
    /** Free-text note attached to the result. */
    note: string;
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import type { DocIndex } from '../index/doc-index.js';
|
|
2
|
+
import type { AbDryRunReport, AbQuestion } from './types.js';
|
|
3
|
+
/**
 * Check that every anchor slug referenced by the questions exists in the index.
 *
 * @param index Document index; its knowledge root is read and it is refreshed.
 * @param questions Questions whose `anchorSlugs` are checked.
 * @param label Name used in the error message when the index has no knowledge root.
 * @returns Report with the corpus path, document count, question count and
 *   every anchor slug that has no matching document.
 * @throws Error when the index reports no knowledge root.
 */
export declare function validateQuestionsAgainstIndex(
    index: DocIndex,
    questions: AbQuestion[],
    label: string,
): AbDryRunReport;
|
|
4
|
+
/**
 * Default location of the Skyforge questions file.
 *
 * @param abDemoRoot Directory that the file name is joined onto.
 * @returns `abDemoRoot` joined with `skyforge-questions.json`.
 */
export declare function defaultSkyforgeQuestionsPath(
    abDemoRoot: string,
): string;
|
|
5
|
+
/**
 * Ensure that a questions file is present on disk.
 *
 * @param filePath Path to check.
 * @throws Error with the message `questions file not found: <filePath>` when
 *   nothing exists at the path.
 */
export declare function assertQuestionsFile(
    filePath: string,
): void;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import fs from 'node:fs';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
export function validateQuestionsAgainstIndex(index, questions, label) {
|
|
4
|
+
const knowledgeRoot = index.getKnowledgeRoot();
|
|
5
|
+
if (!knowledgeRoot) {
|
|
6
|
+
throw new Error(`no docs/ found in ${label}`);
|
|
7
|
+
}
|
|
8
|
+
const docs = index.refresh();
|
|
9
|
+
const slugSet = new Set(docs.map((doc) => doc.slug));
|
|
10
|
+
const missingAnchors = [];
|
|
11
|
+
for (const question of questions) {
|
|
12
|
+
for (const slug of question.anchorSlugs) {
|
|
13
|
+
if (!slugSet.has(slug)) {
|
|
14
|
+
missingAnchors.push({ questionId: question.id, slug });
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
return {
|
|
19
|
+
corpusPath: knowledgeRoot,
|
|
20
|
+
docCount: docs.length,
|
|
21
|
+
questionCount: questions.length,
|
|
22
|
+
missingAnchors,
|
|
23
|
+
};
|
|
24
|
+
}
|
|
25
|
+
/**
 * Default location of the Skyforge questions file under an A/B demo root.
 *
 * @param abDemoRoot Directory that the file name is joined onto.
 * @returns `abDemoRoot` combined with `skyforge-questions.json` via `path.join`.
 */
export function defaultSkyforgeQuestionsPath(abDemoRoot) {
    const questionsFileName = 'skyforge-questions.json';
    return path.join(abDemoRoot, questionsFileName);
}
|
|
28
|
+
/**
 * Ensure that something exists at the given questions-file path.
 *
 * Only existence is checked (`fs.existsSync`); the content is not opened or
 * validated.
 *
 * @param filePath Path to check.
 * @throws Error `questions file not found: <filePath>` when nothing exists there.
 */
export function assertQuestionsFile(filePath) {
    const present = fs.existsSync(filePath);
    if (present) {
        return;
    }
    throw new Error(`questions file not found: ${filePath}`);
}
|