pcl-mcp 0.2.4 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +137 -28
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts +30 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js +50 -0
- package/dist/benchmarks/evaluators/context-retrieval-quality.js.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts +32 -0
- package/dist/benchmarks/evaluators/ir-metrics.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/ir-metrics.js +98 -0
- package/dist/benchmarks/evaluators/ir-metrics.js.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts +34 -0
- package/dist/benchmarks/evaluators/structured-judge.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/structured-judge.js +153 -0
- package/dist/benchmarks/evaluators/structured-judge.js.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts +9 -0
- package/dist/benchmarks/evaluators/token-counter.d.ts.map +1 -0
- package/dist/benchmarks/evaluators/token-counter.js +24 -0
- package/dist/benchmarks/evaluators/token-counter.js.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts +2 -0
- package/dist/benchmarks/generators/generate-corpus.d.ts.map +1 -0
- package/dist/benchmarks/generators/generate-corpus.js +243 -0
- package/dist/benchmarks/generators/generate-corpus.js.map +1 -0
- package/dist/benchmarks/lib/harness.d.ts +23 -0
- package/dist/benchmarks/lib/harness.d.ts.map +1 -0
- package/dist/benchmarks/lib/harness.js +44 -0
- package/dist/benchmarks/lib/harness.js.map +1 -0
- package/dist/benchmarks/lib/types.d.ts +79 -0
- package/dist/benchmarks/lib/types.d.ts.map +1 -0
- package/dist/benchmarks/lib/types.js +2 -0
- package/dist/benchmarks/lib/types.js.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts +2 -0
- package/dist/benchmarks/reporters/markdown-reporter.d.ts.map +1 -0
- package/dist/benchmarks/reporters/markdown-reporter.js +80 -0
- package/dist/benchmarks/reporters/markdown-reporter.js.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ablation.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ablation.js +49 -0
- package/dist/benchmarks/runners/bench-ablation.js.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-ai-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-ai-quality.js +297 -0
- package/dist/benchmarks/runners/bench-ai-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts +2 -0
- package/dist/benchmarks/runners/bench-interactive-eval.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js +119 -0
- package/dist/benchmarks/runners/bench-interactive-eval.js.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts +2 -0
- package/dist/benchmarks/runners/bench-performance.bench.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-performance.bench.js +50 -0
- package/dist/benchmarks/runners/bench-performance.bench.js.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts +2 -0
- package/dist/benchmarks/runners/bench-search-quality.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-search-quality.js +70 -0
- package/dist/benchmarks/runners/bench-search-quality.js.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts +2 -0
- package/dist/benchmarks/runners/bench-token-efficiency.d.ts.map +1 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js +89 -0
- package/dist/benchmarks/runners/bench-token-efficiency.js.map +1 -0
- package/dist/benchmarks/runners/diag.d.ts +2 -0
- package/dist/benchmarks/runners/diag.d.ts.map +1 -0
- package/dist/benchmarks/runners/diag.js +30 -0
- package/dist/benchmarks/runners/diag.js.map +1 -0
- package/dist/benchmarks/vitest.config.bench.d.ts +3 -0
- package/dist/benchmarks/vitest.config.bench.d.ts.map +1 -0
- package/dist/benchmarks/vitest.config.bench.js +14 -0
- package/dist/benchmarks/vitest.config.bench.js.map +1 -0
- package/dist/bin/pcl.js +36 -23
- package/dist/bin/pcl.js.map +1 -1
- package/dist/src/db.d.ts +2 -1
- package/dist/src/db.d.ts.map +1 -1
- package/dist/src/db.js +25 -21
- package/dist/src/db.js.map +1 -1
- package/dist/src/embeddings.d.ts +1 -1
- package/dist/src/embeddings.js +2 -2
- package/dist/src/embeddings.js.map +1 -1
- package/dist/src/indexer.d.ts +1 -1
- package/dist/src/indexer.d.ts.map +1 -1
- package/dist/src/indexer.js +6 -2
- package/dist/src/indexer.js.map +1 -1
- package/dist/src/search.d.ts.map +1 -1
- package/dist/src/search.js +138 -26
- package/dist/src/search.js.map +1 -1
- package/dist/src/server.js +6 -0
- package/dist/src/server.js.map +1 -1
- package/dist/src/types.d.ts +1 -0
- package/dist/src/types.d.ts.map +1 -1
- package/dist/tests/db.test.d.ts +2 -0
- package/dist/tests/db.test.d.ts.map +1 -0
- package/dist/tests/db.test.js +459 -0
- package/dist/tests/db.test.js.map +1 -0
- package/dist/tests/embeddings.test.d.ts +2 -0
- package/dist/tests/embeddings.test.d.ts.map +1 -0
- package/dist/tests/embeddings.test.js +165 -0
- package/dist/tests/embeddings.test.js.map +1 -0
- package/dist/tests/helpers/test-harness.d.ts +26 -0
- package/dist/tests/helpers/test-harness.d.ts.map +1 -0
- package/dist/tests/helpers/test-harness.js +80 -0
- package/dist/tests/helpers/test-harness.js.map +1 -0
- package/dist/tests/indexer.test.d.ts +2 -0
- package/dist/tests/indexer.test.d.ts.map +1 -0
- package/dist/tests/indexer.test.js +299 -0
- package/dist/tests/indexer.test.js.map +1 -0
- package/dist/tests/schemas.test.d.ts +2 -0
- package/dist/tests/schemas.test.d.ts.map +1 -0
- package/dist/tests/schemas.test.js +378 -0
- package/dist/tests/schemas.test.js.map +1 -0
- package/dist/tests/search.test.d.ts +2 -0
- package/dist/tests/search.test.d.ts.map +1 -0
- package/dist/tests/search.test.js +129 -0
- package/dist/tests/search.test.js.map +1 -0
- package/dist/tests/tools.test.d.ts +2 -0
- package/dist/tests/tools.test.d.ts.map +1 -0
- package/dist/tests/tools.test.js +232 -0
- package/dist/tests/tools.test.js.map +1 -0
- package/package.json +14 -2
|
@@ -0,0 +1,153 @@
|
|
|
1
|
+
// Model identifier passed as `model` on the judge request issued by structuredJudge.
const JUDGE_MODEL = "claude-sonnet-4-20250514";
|
|
2
|
+
/**
 * Build structured yes/no questions from task criteria and context.
 *
 * Each evaluation criterion yields one question: its explicit `prompt` when
 * present, otherwise a generated question based on `criterion` (worded as
 * "contain or implement" when the criterion carries a `pattern`). Each
 * forbidden pattern yields one inverted question in which YES means the
 * pattern is absent.
 *
 * @param {object} task - Benchmark task. `evaluationCriteria` and
 *   `forbiddenPatterns` are treated as empty lists when missing.
 * @param {string} contextDocs - Not used by this function; kept so existing
 *   call sites keep working.
 * @returns {string[]} Criterion questions in order, then forbidden-pattern checks.
 */
function buildQuestions(task, contextDocs) {
    const criterionQuestions = (task.evaluationCriteria ?? []).map((criterion) => {
        if (criterion.prompt) {
            return criterion.prompt;
        }
        if (criterion.pattern) {
            return `Does the code contain or implement: ${criterion.criterion}?`;
        }
        return `Does the code satisfy: ${criterion.criterion}?`;
    });
    // Forbidden patterns are phrased so that YES is always the passing answer.
    const forbiddenQuestions = (task.forbiddenPatterns ?? []).map((pattern) => `Does the code AVOID the forbidden pattern: ${pattern}? (YES means it correctly avoids it)`);
    return [...criterionQuestions, ...forbiddenQuestions];
}
|
|
24
|
+
/** Verdicts the judge may return; any other value is scored as "NO". */
const JUDGE_VERDICTS = new Set(["YES", "NO", "PARTIAL"]);
/**
 * Extract the judge's answer array from its raw reply text.
 *
 * Tries the whole reply first, then the widest `[...]` span inside it (covers
 * replies wrapped in prose, code fences, or an enclosing object). Only an
 * actual array is accepted, so replies such as `null` or `{}` cannot leak
 * through and break the per-question lookup.
 *
 * @param {string} text - Raw text returned by the judge model.
 * @returns {Array} Parsed answers, or an empty array when nothing usable is found.
 */
function parseJudgeAnswers(text) {
    const candidates = [text];
    const bracketed = text.match(/\[[\s\S]*\]/);
    if (bracketed) {
        candidates.push(bracketed[0]);
    }
    for (const candidate of candidates) {
        try {
            const parsed = JSON.parse(candidate);
            if (Array.isArray(parsed)) {
                return parsed;
            }
        }
        catch {
            // Not valid JSON on its own; fall through to the next candidate.
        }
    }
    return [];
}
/**
 * Evaluate an AI-generated output against structured criteria.
 *
 * Sends one request to the judge model containing the task description, up to
 * 8000 characters of context, up to 12000 characters of output, and one
 * question per criterion. YES counts 1, PARTIAL counts 0.5, and anything
 * else (including missing or malformed answers) counts 0.
 *
 * @param {object} client - Client exposing `messages.create(request)`.
 * @param {object} task - Benchmark task (`id`, `description`, criteria).
 * @param {string} output - The agent output under evaluation.
 * @param {string} contextDocs - Product context shown to the judge.
 * @returns {Promise<{taskId: *, totalQuestions: number, yesCount: number, score: number, details: Array}>}
 *   `score` is on a 0-10 scale rounded to one decimal; `yesCount` includes
 *   the half points awarded for PARTIAL answers.
 */
export async function structuredJudge(client, task, output, contextDocs) {
    const questions = buildQuestions(task, contextDocs);
    const questionsBlock = questions.map((q, i) => `Q${i + 1}: ${q}`).join("\n");
    // Coerce so a missing context/output cannot crash the truncation below.
    const contextText = String(contextDocs ?? "");
    const outputText = String(output ?? "");
    const prompt = `You are evaluating an AI coding agent's output against specific product requirements.

TASK DESCRIPTION:
${task.description}

RELEVANT PRODUCT CONTEXT:
${contextText.slice(0, 8000)}

AI AGENT'S CODE OUTPUT:
${outputText.slice(0, 12000)}

EVALUATION QUESTIONS:
${questionsBlock}

For each question, answer with:
- "YES" if the code clearly satisfies the criterion
- "NO" if the code clearly fails the criterion
- "PARTIAL" if partially satisfied

Return a JSON array with one object per question:
[{"answer": "YES"|"NO"|"PARTIAL", "reasoning": "brief explanation"}]

Return ONLY the JSON array, no other text.`;
    const response = await client.messages.create({
        model: JUDGE_MODEL,
        max_tokens: 2048,
        temperature: 0,
        system: "You are a precise code evaluation judge. Return only valid JSON arrays.",
        messages: [{ role: "user", content: prompt }],
    });
    // Use the first text block wherever it sits, not only content[0].
    const textBlock = response.content?.find((block) => block?.type === "text");
    const replyText = typeof textBlock?.text === "string" ? textBlock.text : "[]";
    const answers = parseJudgeAnswers(replyText);
    const details = questions.map((question, i) => {
        const raw = answers[i];
        // A non-string answer (e.g. `true`) is treated as NO instead of throwing.
        const verdict = typeof raw?.answer === "string" ? raw.answer.trim().toUpperCase() : "NO";
        return {
            criterion: question,
            answer: JUDGE_VERDICTS.has(verdict) ? verdict : "NO",
            reasoning: raw?.reasoning ?? "No response from judge",
        };
    });
    let yesCount = 0;
    for (const detail of details) {
        if (detail.answer === "YES") {
            yesCount += 1;
        }
        else if (detail.answer === "PARTIAL") {
            yesCount += 0.5;
        }
    }
    const totalQuestions = details.length;
    const score = totalQuestions > 0 ? (yesCount / totalQuestions) * 10 : 0;
    return {
        taskId: task.id,
        totalQuestions,
        yesCount,
        score: Math.round(score * 10) / 10,
        details,
    };
}
|
|
110
|
+
/** Fence language tags whose contents are parsed; an empty tag is also accepted. */
const TS_FENCE_LANGUAGES = new Set(["", "typescript", "ts", "tsx", "javascript", "js", "jsx"]);
/**
 * Load the `typescript` package synchronously, or return null if unavailable.
 *
 * This file is an ES module, where a bare `require` does not exist, so a
 * CommonJS-style loader is obtained through `node:module` when needed.
 */
function loadTypeScript() {
    try {
        const nodeRequire = typeof require === "function"
            ? require
            : process.getBuiltinModule?.("node:module")?.createRequire(import.meta.url);
        return nodeRequire ? nodeRequire("typescript") : null;
    }
    catch {
        // TypeScript is an optional dependency here; callers fall back to permissive mode.
        return null;
    }
}
/**
 * Check if generated code contains valid TypeScript/JSX syntax.
 *
 * Fenced code blocks are extracted from `output`; blocks tagged with a
 * TypeScript/JavaScript language (or untagged) are parsed as TSX and their
 * parse diagnostics are counted. Blocks in other languages are ignored.
 * Only syntax is checked, not types.
 *
 * @param {string} output - Text that may contain fenced code blocks.
 * @returns {{valid: boolean, errorCount: number, codeBlockCount: number}}
 *   `valid` is true when no syntax errors were found. It is also true when
 *   there are no checkable blocks or the `typescript` package cannot be
 *   loaded (permissive fallback).
 */
export function checkTypeScriptSyntax(output) {
    // Match every fence pair in order so opening/closing fences cannot be mis-paired,
    // then keep only the languages we can parse.
    const fencePattern = /```([^\n`]*)\n([\s\S]*?)```/g;
    const blocks = [];
    for (const [, info, code] of output.matchAll(fencePattern)) {
        const language = (info.trim().split(/\s+/)[0] ?? "").toLowerCase();
        if (TS_FENCE_LANGUAGES.has(language) && code.trim()) {
            blocks.push(code);
        }
    }
    if (blocks.length === 0) {
        return { valid: true, errorCount: 0, codeBlockCount: 0 };
    }
    const ts = loadTypeScript();
    if (!ts) {
        // TypeScript not available: fall back to permissive (assume valid).
        return { valid: true, errorCount: 0, codeBlockCount: blocks.length };
    }
    let totalErrors = 0;
    for (const block of blocks) {
        // Parse as TSX to handle both TypeScript and JSX syntax.
        const sourceFile = ts.createSourceFile("check.tsx", block, ts.ScriptTarget.Latest, false, ts.ScriptKind.TSX);
        // parseDiagnostics is an internal field holding syntax-level diagnostics only.
        totalErrors += sourceFile.parseDiagnostics?.length ?? 0;
    }
    return {
        valid: totalErrors === 0,
        errorCount: totalErrors,
        codeBlockCount: blocks.length,
    };
}
|
|
153
|
+
//# sourceMappingURL=structured-judge.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"structured-judge.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/structured-judge.ts"],"names":[],"mappings":"AASA,MAAM,WAAW,GAAG,0BAA0B,CAAC;AAc/C;;GAEG;AACH,SAAS,cAAc,CACrB,IAAgB,EAChB,WAAmB;IAEnB,MAAM,SAAS,GAAa,EAAE,CAAC;IAE/B,KAAK,MAAM,SAAS,IAAI,IAAI,CAAC,kBAAkB,EAAE,CAAC;QAChD,IAAI,SAAS,CAAC,MAAM,EAAE,CAAC;YACrB,SAAS,CAAC,IAAI,CAAC,SAAS,CAAC,MAAM,CAAC,CAAC;QACnC,CAAC;aAAM,IAAI,SAAS,CAAC,OAAO,EAAE,CAAC;YAC7B,SAAS,CAAC,IAAI,CACZ,uCAAuC,SAAS,CAAC,SAAS,GAAG,CAC9D,CAAC;QACJ,CAAC;aAAM,CAAC;YACN,SAAS,CAAC,IAAI,CAAC,0BAA0B,SAAS,CAAC,SAAS,GAAG,CAAC,CAAC;QACnE,CAAC;IACH,CAAC;IAED,+BAA+B;IAC/B,KAAK,MAAM,OAAO,IAAI,IAAI,CAAC,iBAAiB,EAAE,CAAC;QAC7C,SAAS,CAAC,IAAI,CACZ,8CAA8C,OAAO,sCAAsC,CAC5F,CAAC;IACJ,CAAC;IAED,OAAO,SAAS,CAAC;AACnB,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,eAAe,CACnC,MAAiB,EACjB,IAAgB,EAChB,MAAc,EACd,WAAmB;IAEnB,MAAM,SAAS,GAAG,cAAc,CAAC,IAAI,EAAE,WAAW,CAAC,CAAC;IAEpD,MAAM,cAAc,GAAG,SAAS;SAC7B,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SAChC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,MAAM,MAAM,GAAG;;;EAGf,IAAI,CAAC,WAAW;;;EAGhB,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,IAAI,CAAC;;;EAG1B,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,KAAK,CAAC;;;EAGtB,cAAc;;;;;;;;;;2CAU2B,CAAC;IAE1C,MAAM,QAAQ,GAAG,MAAM,MAAM,CAAC,QAAQ,CAAC,MAAM,CAAC;QAC5C,KAAK,EAAE,WAAW;QAClB,UAAU,EAAE,IAAI;QAChB,WAAW,EAAE,CAAC;QACd,MAAM,EACJ,yEAAyE;QAC3E,QAAQ,EAAE,CAAC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,EAAE,CAAC;KAC9C,CAAC,CAAC;IAEH,MAAM,IAAI,GACR,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,EAAE,IAAI,KAAK,MAAM,CAAC,CAAC,CAAC,QAAQ,CAAC,OAAO,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,CAAC,IAAI,CAAC;IAEzE,IAAI,OAGF,CAAC;IAEH,IAAI,CAAC;QACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,IAAI,CAGvB,CAAC;IACL,CAAC;IAAC,MAAM,CAAC;QACP,0CAA0C;QAC1C,MAAM,KAAK,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,CAAC,CAAC;QACxC,IAAI,KAAK,EAAE,CAAC;YACV,IAAI,CAAC;gBACH,OAAO,GAAG,IAAI,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,CAAC,CAG3B,CAAC;YACL,CAAC;YAAC,MAAM,CAAC;gBACP,OAAO,GAAG,EAAE,CAAC;YACf,CAAC;QACH,CAAC;aAAM,CAAC;YACN,OAAO,GAAG,
EAAE,CAAC;QACf,CAAC;IACH,CAAC;IAED,MAAM,OAAO,GAAG,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE;QACrC,MAAM,CAAC,GAAG,OAAO,CAAC,CAAC,CAAC,CAAC;QACrB,MAAM,MAAM,GAAG,CAAC,CAAC,EAAE,MAAM,EAAE,WAAW,EAAE,IAAI,IAAI,CAGnC,CAAC;QACd,OAAO;YACL,SAAS,EAAE,CAAC;YACZ,MAAM,EAAE,MAAM,KAAK,KAAK,IAAI,MAAM,KAAK,IAAI,IAAI,MAAM,KAAK,SAAS;gBACjE,CAAC,CAAC,MAAM;gBACR,CAAC,CAAE,IAAc;YACnB,SAAS,EAAE,CAAC,EAAE,SAAS,IAAI,wBAAwB;SACpD,CAAC;IACJ,CAAC,CAAC,CAAC;IAEH,MAAM,QAAQ,GAAG,OAAO,CAAC,MAAM,CAAC,CAAC,GAAG,EAAE,CAAC,EAAE,EAAE;QACzC,IAAI,CAAC,CAAC,MAAM,KAAK,KAAK;YAAE,OAAO,GAAG,GAAG,CAAC,CAAC;QACvC,IAAI,CAAC,CAAC,MAAM,KAAK,SAAS;YAAE,OAAO,GAAG,GAAG,GAAG,CAAC;QAC7C,OAAO,GAAG,CAAC;IACb,CAAC,EAAE,CAAC,CAAC,CAAC;IAEN,MAAM,cAAc,GAAG,OAAO,CAAC,MAAM,CAAC;IACtC,MAAM,KAAK,GACT,cAAc,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,QAAQ,GAAG,cAAc,CAAC,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;IAE5D,OAAO;QACL,MAAM,EAAE,IAAI,CAAC,EAAE;QACf,cAAc;QACd,QAAQ;QACR,KAAK,EAAE,IAAI,CAAC,KAAK,CAAC,KAAK,GAAG,EAAE,CAAC,GAAG,EAAE;QAClC,OAAO;KACR,CAAC;AACJ,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,qBAAqB,CACnC,MAAc;IAEd,6BAA6B;IAC7B,MAAM,cAAc,GAAG,oDAAoD,CAAC;IAC5E,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,KAAK,CAAC;IACV,OAAO,CAAC,KAAK,GAAG,cAAc,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACtD,IAAI,KAAK,CAAC,CAAC,CAAC,EAAE,IAAI,EAAE;YAAE,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED,IAAI,MAAM,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QACxB,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,CAAC,EAAE,CAAC;IAC3D,CAAC;IAED,IAAI,WAAW,GAAG,CAAC,CAAC;IAEpB,0EAA0E;IAC1E,IAAI,EAA2C,CAAC;IAChD,IAAI,CAAC;QACH,iEAAiE;QACjE,EAAE,GAAG,OAAO,CAAC,YAAY,CAAgC,CAAC;IAC5D,CAAC;IAAC,MAAM,CAAC;QACP,oEAAoE;QACpE,OAAO,EAAE,KAAK,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,EAAE,cAAc,EAAE,MAAM,CAAC,MAAM,EAAE,CAAC;IACvE,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,wDAAwD;QACxD,MAAM,UAAU,GAAG,EAAE,CAAC,gBAAgB,CACpC,WAAW,EACX,KAAK,EACL,EAAE,CAAC,YAAY,CAAC,MAAM,EACtB,KAAK,EACL,EAAE,CAAC,UAAU,CAAC,GAAG,CAClB,CAAC;QAEF,iEAAiE;QACjE,0DAA0D;QAC1D,MAAM,WAAW,GACf,CAAE,UAA0D;a
ACzD,gBAAgB,EAAE,MAAM,CAAC,IAAI,CAAC,CAAC;QACpC,WAAW,IAAI,WAAW,CAAC;IAC7B,CAAC;IAED,OAAO;QACL,KAAK,EAAE,WAAW,KAAK,CAAC;QACxB,UAAU,EAAE,WAAW;QACvB,cAAc,EAAE,MAAM,CAAC,MAAM;KAC9B,CAAC;AACJ,CAAC"}
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Count tokens in a string using the cl100k_base encoding (the GPT-4 tokenizer; only an approximation for Claude models).
|
|
3
|
+
*/
|
|
4
|
+
export declare function countTokens(text: string): number;
|
|
5
|
+
/**
|
|
6
|
+
* Dispose the encoder to free memory.
|
|
7
|
+
*/
|
|
8
|
+
export declare function disposeEncoder(): void;
|
|
9
|
+
//# sourceMappingURL=token-counter.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-counter.d.ts","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAWA;;GAEG;AACH,wBAAgB,WAAW,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,CAEhD;AAED;;GAEG;AACH,wBAAgB,cAAc,IAAI,IAAI,CAKrC"}
|
|
@@ -0,0 +1,24 @@
|
|
|
1
|
+
import { get_encoding } from "tiktoken";
|
|
2
|
+
// Shared encoder instance; null until first requested and again after disposal.
let _encoder = null;
/**
 * Return the shared cl100k_base encoder, creating it on first use.
 */
function getEncoder() {
    _encoder ??= get_encoding("cl100k_base");
    return _encoder;
}
|
|
9
|
+
/**
 * Count the tokens in a string using the cl100k_base encoding.
 *
 * cl100k_base is the GPT-4 encoding, so counts for Claude models are an
 * approximation rather than an exact figure.
 *
 * @param {string} text - Text to tokenize.
 * @returns {number} Number of tokens produced by the encoder.
 */
export function countTokens(text) {
    const tokens = getEncoder().encode(text);
    return tokens.length;
}
|
|
15
|
+
/**
 * Release the shared encoder, if one was created, and reset it so the next
 * token count builds a fresh one.
 */
export function disposeEncoder() {
    if (!_encoder) {
        return;
    }
    _encoder.free();
    _encoder = null;
}
|
|
24
|
+
//# sourceMappingURL=token-counter.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"token-counter.js","sourceRoot":"","sources":["../../../benchmarks/evaluators/token-counter.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,YAAY,EAAiB,MAAM,UAAU,CAAC;AAEvD,IAAI,QAAQ,GAAoB,IAAI,CAAC;AAErC,SAAS,UAAU;IACjB,IAAI,CAAC,QAAQ,EAAE,CAAC;QACd,QAAQ,GAAG,YAAY,CAAC,aAAa,CAAC,CAAC;IACzC,CAAC;IACD,OAAO,QAAQ,CAAC;AAClB,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,WAAW,CAAC,IAAY;IACtC,OAAO,UAAU,EAAE,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,MAAM,CAAC;AAC1C,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,cAAc;IAC5B,IAAI,QAAQ,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,EAAE,CAAC;QAChB,QAAQ,GAAG,IAAI,CAAC;IAClB,CAAC;AACH,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate-corpus.d.ts","sourceRoot":"","sources":["../../../benchmarks/generators/generate-corpus.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,243 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Corpus generator — expands corpus-small into medium (50), large (100), xlarge (500).
|
|
3
|
+
* Uses deterministic seeding for reproducibility.
|
|
4
|
+
*/
|
|
5
|
+
import { mkdir, writeFile, cp } from "node:fs/promises";
|
|
6
|
+
import { join } from "node:path";
|
|
7
|
+
// `fixtures` directory one level above this module's directory: corpus-small is read from it
// and the generated corpora are written into it.
const FIXTURES_DIR = join(import.meta.dirname, "..", "fixtures");
|
|
8
|
+
/**
 * Create a seeded pseudo-random generator (mulberry32).
 *
 * @param {number} seed - Initial state; coerced to a 32-bit integer.
 * @returns {() => number} Function returning the next value in [0, 1).
 */
function createRNG(seed) {
    let state = seed | 0;
    return function next() {
        state = (state + 0x6d2b79f5) | 0;
        const mixed = Math.imul(state ^ (state >>> 15), state | 1);
        const scrambled = (mixed + Math.imul(mixed ^ (mixed >>> 7), mixed | 61)) ^ mixed;
        return ((scrambled ^ (scrambled >>> 14)) >>> 0) / 4294967296;
    };
}
|
|
18
|
+
// Persona seeds; generated personas cycle through this list by index.
const PERSONA_NAMES = [
    { name: "Maya", role: "UX Researcher", goal: "Validate designs with real users" },
    { name: "Jordan", role: "DevOps Engineer", goal: "Keep infrastructure reliable" },
    { name: "Priya", role: "Product Owner", goal: "Maximize business value per sprint" },
    { name: "Marcus", role: "QA Lead", goal: "Catch bugs before they reach production" },
    { name: "Elena", role: "Data Analyst", goal: "Turn data into actionable insights" },
    { name: "Tomás", role: "Frontend Developer", goal: "Build pixel-perfect responsive UIs" },
    { name: "Kenji", role: "Backend Architect", goal: "Design scalable API systems" },
    { name: "Fatima", role: "Scrum Master", goal: "Remove blockers for the team" },
    { name: "Oscar", role: "Security Engineer", goal: "Protect user data and prevent breaches" },
    { name: "Li Wei", role: "Mobile Developer", goal: "Deliver smooth native app experiences" },
];
// Feature titles used for generated specs.
const SPEC_TOPICS = [
    "Time Tracking Widget",
    "Invoice Generator",
    "Client Portal",
    "File Attachments",
    "Task Dependencies",
    "Gantt Chart View",
    "Resource Allocation",
    "Budget Tracker",
    "Custom Fields",
    "Recurring Tasks",
    "Email Integration",
    "Calendar Sync",
    "Milestone Tracking",
    "Approval Workflows",
    "Template Library",
    "API Webhooks",
    "Bulk Operations",
    "Dark Mode",
    "Offline Support",
    "Export to PDF",
    "Team Chat",
    "Activity Feed",
    "Custom Reports",
    "Sprint Planning",
    "Workload View",
    "Time Zone Support",
    "Multi-language",
    "Search & Filter",
];
// Titles used for generated architecture decision records.
const DECISION_TOPICS = [
    "Use Tailwind CSS for styling",
    "Adopt Zod for runtime validation",
    "Choose PostgreSQL over MongoDB",
    "Use React Server Components",
    "Implement RBAC with Supabase RLS",
    "Deploy on Vercel Edge",
    "Use Resend for transactional emails",
    "Adopt pnpm over npm",
    "Use Vitest for testing",
    "Implement feature flags with PostHog",
];
// Domain rule titles together with the `critical` flag written to their front matter.
const DOMAIN_RULES = [
    { title: "Password Policy", critical: false },
    { title: "File Upload Limits", critical: false },
    { title: "Workspace Naming Rules", critical: false },
    { title: "API Versioning Policy", critical: false },
    { title: "Accessibility Requirements", critical: true },
    { title: "Internationalization Rules", critical: false },
    { title: "Error Handling Standards", critical: false },
    { title: "Logging and Monitoring Policy", critical: true },
    { title: "Third-Party Integration Rules", critical: false },
    { title: "Performance Budgets", critical: false },
];
|
|
63
|
+
/**
 * Render a persona markdown document (front matter plus body).
 *
 * @param {number} index - Selects the persona seed, wrapping around PERSONA_NAMES.
 * @param {() => number} rng - Random source; called twice (tech level, then years).
 * @returns {string} The persona document text.
 */
function generatePersona(index, rng) {
    const { name, role, goal } = PERSONA_NAMES[index % PERSONA_NAMES.length];
    const id = name.toLowerCase().replace(/\s+/g, "-");
    const techLevels = ["beginner", "intermediate", "advanced"];
    // Draw order matters for reproducibility: tech level first, then years.
    const techLevel = techLevels[Math.floor(rng() * 3)];
    const years = Math.floor(rng() * 10 + 2);
    const lowerGoal = goal.toLowerCase();
    return `---
id: "${id}"
name: "${name}"
role: "${role}"
tech_level: "${techLevel}"
primary_goal: "${goal}"
jobs_to_be_done:
- "Complete daily work efficiently"
- "Collaborate with team members"
- "Track progress on deliverables"
anti_patterns:
- "Avoids tools that require excessive configuration"
- "Ignores features hidden behind multiple clicks"
channels:
- "desktop"
- "email"
---

## Background

${name} is a ${role} with ${years} years of experience. Their primary focus is to ${lowerGoal}. They work in a fast-paced environment where context switching is common and tools need to be intuitive and fast.

## Key Behaviors

${name} typically starts the day by reviewing their task list and prioritizing based on urgency and impact. They prefer tools that surface the most important information first without requiring manual sorting or filtering. Keyboard shortcuts are appreciated but not required.

## Pain Points

The biggest frustration for ${name} is when tools slow down their workflow rather than accelerating it. They have abandoned previous tools that required too much upfront configuration or that cluttered the interface with features they did not use. Simplicity and speed are more important than feature completeness.
`;
}
|
|
98
|
+
/**
 * Render a feature spec markdown document (front matter plus body).
 *
 * @param {number} index - Selects the topic, wrapping around SPEC_TOPICS.
 * @param {() => number} rng - Random source; called once to pick the status.
 * @returns {string} The spec document text.
 */
function generateSpec(index, rng) {
    const topic = SPEC_TOPICS[index % SPEC_TOPICS.length];
    const lowerTopic = topic.toLowerCase();
    const id = lowerTopic.replace(/\s+/g, "-");
    const statuses = ["draft", "in-progress", "approved", "implemented"];
    const status = statuses[Math.floor(rng() * statuses.length)];
    return `---
id: "${id}"
title: "${topic}"
status: "${status}"
acceptance_criteria:
- "Feature must be fully functional on desktop and mobile viewports"
- "Loading time must not exceed 2 seconds on a standard connection"
- "All user inputs must be validated both client-side and server-side"
- "Feature must include proper error states and empty states"
out_of_scope:
- "Offline support for this feature"
- "Integration with third-party services"
---

## Overview

The ${topic} feature enables users to manage their ${lowerTopic} workflow directly within TaskPilot. This reduces context switching and keeps all project-related information in one place.

## Requirements

The feature must integrate seamlessly with the existing dashboard and project views. Users should be able to access it from the project navigation sidebar. The interface must follow existing design patterns including the card-based layout, consistent button styles, and the standard form validation approach.

## Technical Considerations

Implementation should use Next.js Server Components for initial data loading and Client Components only for interactive elements. Data should be stored in the existing Supabase PostgreSQL database with appropriate RLS policies. All database queries should be optimized with proper indexes to maintain the 2-second loading time requirement.

## User Stories

As a project manager, I want to use ${lowerTopic} so that I can track progress more effectively. As a developer, I want the ${lowerTopic} interface to be keyboard-navigable so that I can work efficiently without switching to the mouse.
`;
}
|
|
134
|
+
/**
 * Render an architecture decision record as markdown.
 *
 * The id is `adr-NNN` with NNN = index + 2, zero-padded to three digits. The
 * date month is index + 1, capped at 9.
 *
 * @param {number} index - Selects the topic (wrapping DECISION_TOPICS) and drives id/date.
 * @param {() => number} _rng - Unused; accepted for signature parity with the other generators.
 * @returns {string} The decision document text.
 */
function generateDecision(index, _rng) {
    const topic = DECISION_TOPICS[index % DECISION_TOPICS.length];
    const lowerTopic = topic.toLowerCase();
    const sequence = String(index + 2).padStart(3, "0");
    const id = `adr-${sequence}`;
    const month = Math.min(index + 1, 9);
    return `---
id: "${id}"
title: "${topic}"
status: "accepted"
date: "2025-0${month}-15"
context: "The team needed to decide on the approach for ${lowerTopic} to ensure consistency and maintainability across the codebase."
decision: "We decided to ${lowerTopic} based on team expertise, ecosystem maturity, and alignment with our existing architecture."
consequences:
- "All new code must follow this decision"
- "Existing code should be migrated incrementally"
- "Documentation must be updated to reflect this choice"
alternatives_rejected:
- "The alternative approach was considered but rejected due to higher complexity"
---

## Context

The team evaluated multiple options for ${lowerTopic}. The primary factors in the decision were developer experience, performance characteristics, and long-term maintainability.

## Decision Details

After evaluating the options, we chose to ${lowerTopic}. This aligns with our existing technology choices and reduces the learning curve for new team members. The decision was made based on a proof-of-concept implementation that demonstrated the viability of this approach.

## Migration Plan

Existing code that does not follow this decision will be migrated as part of regular maintenance work. There is no urgent timeline for migration, but all new code must adhere to this decision immediately.
`;
}
|
|
165
|
+
/**
 * Render a domain rule markdown document.
 *
 * @param {number} index - Selects the rule, wrapping around DOMAIN_RULES.
 * @param {() => number} _rng - Unused; accepted for signature parity with the other generators.
 * @returns {string} The domain rule document text.
 */
function generateDomainRule(index, _rng) {
    const { title, critical } = DOMAIN_RULES[index % DOMAIN_RULES.length];
    const lowerTitle = title.toLowerCase();
    const id = lowerTitle.replace(/\s+/g, "-");
    return `---
id: "${id}"
critical: ${critical}
title: "${title}"
---

## ${title}

These rules define the standards for ${lowerTitle} in TaskPilot. All engineers must follow these rules when working on related features.

### Requirements

1. All implementations must be reviewed against these standards before merging
2. Automated checks should be added where feasible to enforce compliance
3. Exceptions require documented justification and team lead approval

### Rationale

These rules exist to ensure consistency, maintainability, and quality across the product. They were established based on industry best practices and lessons learned from previous projects.
`;
}
|
|
189
|
+
/**
 * Make a slug unique across repeated passes over a fixed-size name pool.
 * The first pass keeps the bare slug; later passes get `-2`, `-3`, ...
 */
function uniqueSlug(baseSlug, index, poolSize) {
    const pass = Math.floor(index / poolSize);
    return pass === 0 ? baseSlug : `${baseSlug}-${pass + 1}`;
}
/**
 * Rewrite the front-matter `id` of a generated document so it matches its
 * (possibly suffixed) file name. Returns the content unchanged when the id
 * did not change.
 */
function withDocumentId(content, baseId, newId) {
    if (baseId === newId) {
        return content;
    }
    // Function replacer avoids `$` patterns in the id being interpreted.
    return content.replace(`id: "${baseId}"`, () => `id: "${newId}"`);
}
/**
 * Generate a corpus of roughly `targetSize` documents in `outputDir`.
 *
 * Copies corpus-small as the base (counted as 10 documents) and then writes
 * generated personas, specs, decisions and domain rules on top of it using a
 * fixed seed, so repeated runs produce the same content.
 *
 * When more documents are requested than a name pool holds, wrapped entries
 * get a numeric suffix in both file name and id. Without it they would reuse
 * the same file name and overwrite each other, leaving the corpus far
 * smaller than requested.
 *
 * @param {number} targetSize - Desired total number of documents.
 * @param {string} outputDir - Directory that receives the corpus.
 * @returns {Promise<void>}
 */
async function generateCorpus(targetSize, outputDir) {
    const rng = createRNG(42); // deterministic seed
    // Start by copying corpus-small as the base
    await cp(join(FIXTURES_DIR, "corpus-small"), outputDir, { recursive: true });
    // Base: 1 product + 2 personas + 2 journeys + 2 specs + 1 decision + 2 domain = 10
    const remaining = targetSize - 10;
    if (remaining <= 0) {
        return;
    }
    // Distribution: 20% personas, 30% specs, 20% decisions; domain rules take
    // the remainder (no additional journeys are generated).
    const personaCount = Math.floor(remaining * 0.2);
    const specCount = Math.floor(remaining * 0.3);
    const decisionCount = Math.floor(remaining * 0.2);
    const domainCount = remaining - personaCount - specCount - decisionCount;
    const slugify = (label) => label.toLowerCase().replace(/\s+/g, "-");
    // Writes `count` documents into `subdir`, cycling through `labels` for names.
    const writePooled = async (subdir, labels, generate, count) => {
        await mkdir(join(outputDir, subdir), { recursive: true });
        for (let i = 0; i < count; i++) {
            const baseSlug = slugify(labels[i % labels.length]);
            const slug = uniqueSlug(baseSlug, i, labels.length);
            const content = withDocumentId(generate(i, rng), baseSlug, slug);
            await writeFile(join(outputDir, subdir, `${slug}.md`), content);
        }
    };
    // Sections run sequentially in this order so the shared RNG sequence stays stable.
    await writePooled("personas", PERSONA_NAMES.map((p) => p.name), generatePersona, personaCount);
    await writePooled("specs", SPEC_TOPICS, generateSpec, specCount);
    // Decision ids are already unique per index (adr-002, adr-003, ...).
    await mkdir(join(outputDir, "decisions"), { recursive: true });
    for (let i = 0; i < decisionCount; i++) {
        const content = generateDecision(i, rng);
        await writeFile(join(outputDir, "decisions", `adr-${String(i + 2).padStart(3, "0")}.md`), content);
    }
    await writePooled("domain", DOMAIN_RULES.map((r) => r.title), generateDomainRule, domainCount);
}
|
|
227
|
+
/**
 * Script entry point: for each named corpus, ensure its directory exists under
 * FIXTURES_DIR and fill it via generateCorpus, logging progress along the way.
 * Corpora are produced one after another, in declaration order.
 */
async function main() {
    // corpus directory name -> target number of files
    const corpusSizes = {
        "corpus-medium": 50,
        "corpus-large": 100,
        "corpus-xlarge": 500,
    };
    for (const [corpusName, fileCount] of Object.entries(corpusSizes)) {
        const targetDir = join(FIXTURES_DIR, corpusName);
        console.log(`Generating ${corpusName} (${fileCount} files)...`);
        await mkdir(targetDir, { recursive: true });
        await generateCorpus(fileCount, targetDir);
        console.log(` → ${corpusName} created at ${targetDir}`);
    }
    console.log("\nDone! Generated corpora for benchmarking.");
}
|
|
242
|
+
// Run the generator. On failure, log the error and mark the process as failed
// so that shells, npm scripts and CI see a non-zero exit status.
main().catch((err) => {
    console.error(err);
    process.exitCode = 1;
});
|
|
243
|
+
//# sourceMappingURL=generate-corpus.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"generate-corpus.js","sourceRoot":"","sources":["../../../benchmarks/generators/generate-corpus.ts"],"names":[],"mappings":"AAAA;;;GAGG;AACH,OAAO,EAAE,KAAK,EAAE,SAAS,EAAE,EAAE,EAAE,MAAM,kBAAkB,CAAC;AACxD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AAEjC,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;AAEjE,kCAAkC;AAClC,SAAS,SAAS,CAAC,IAAY;IAC7B,OAAO,GAAG,EAAE;QACV,IAAI,IAAI,CAAC,CAAC;QACV,IAAI,GAAG,CAAC,IAAI,GAAG,UAAU,CAAC,GAAG,CAAC,CAAC;QAC/B,IAAI,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,GAAG,CAAC,IAAI,KAAK,EAAE,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,CAAC;QAClD,CAAC,GAAG,CAAC,CAAC,GAAG,IAAI,CAAC,IAAI,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,CAAC,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC;QAC/C,OAAO,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,GAAG,UAAU,CAAC;IAC/C,CAAC,CAAC;AACJ,CAAC;AAED,MAAM,aAAa,GAAG;IACpB,EAAE,IAAI,EAAE,MAAM,EAAE,IAAI,EAAE,eAAe,EAAE,IAAI,EAAE,kCAAkC,EAAE;IACjF,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,iBAAiB,EAAE,IAAI,EAAE,8BAA8B,EAAE;IACjF,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,eAAe,EAAE,IAAI,EAAE,oCAAoC,EAAE;IACpF,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,SAAS,EAAE,IAAI,EAAE,yCAAyC,EAAE;IACpF,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,cAAc,EAAE,IAAI,EAAE,oCAAoC,EAAE;IACnF,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,oBAAoB,EAAE,IAAI,EAAE,oCAAoC,EAAE;IACzF,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,mBAAmB,EAAE,IAAI,EAAE,6BAA6B,EAAE;IACjF,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,cAAc,EAAE,IAAI,EAAE,8BAA8B,EAAE;IAC9E,EAAE,IAAI,EAAE,OAAO,EAAE,IAAI,EAAE,mBAAmB,EAAE,IAAI,EAAE,wCAAwC,EAAE;IAC5F,EAAE,IAAI,EAAE,QAAQ,EAAE,IAAI,EAAE,kBAAkB,EAAE,IAAI,EAAE,uCAAuC,EAAE;CAC5F,CAAC;AAEF,MAAM,WAAW,GAAG;IAClB,sBAAsB,EAAE,mBAAmB,EAAE,eAAe,EAAE,kBAAkB;IAChF,mBAAmB,EAAE,kBAAkB,EAAE,qBAAqB,EAAE,gBAAgB;IAChF,eAAe,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,eAAe;IACxE,oBAAoB,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,cAAc;IAC9E,iBAAiB,EAAE,WAAW,EAAE,iBAAiB,EAAE,eAAe;IAClE,WAAW,EAAE,eAAe,EAAE,gBAAgB,EAAE,iBAAiB;IACjE,eAAe,EAAE,mBAAmB,EAAE,gBAAgB,EAAE,iBAAiB;CAC1E,CAAC;AAEF,MAAM,eAAe,GAAG;IACtB,8BAA8B;I
AC9B,kCAAkC;IAClC,gCAAgC;IAChC,6BAA6B;IAC7B,kCAAkC;IAClC,uBAAuB;IACvB,qCAAqC;IACrC,qBAAqB;IACrB,wBAAwB;IACxB,sCAAsC;CACvC,CAAC;AAEF,MAAM,YAAY,GAAG;IACnB,EAAE,KAAK,EAAE,iBAAiB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC7C,EAAE,KAAK,EAAE,oBAAoB,EAAE,QAAQ,EAAE,KAAK,EAAE;IAChD,EAAE,KAAK,EAAE,wBAAwB,EAAE,QAAQ,EAAE,KAAK,EAAE;IACpD,EAAE,KAAK,EAAE,uBAAuB,EAAE,QAAQ,EAAE,KAAK,EAAE;IACnD,EAAE,KAAK,EAAE,4BAA4B,EAAE,QAAQ,EAAE,IAAI,EAAE;IACvD,EAAE,KAAK,EAAE,4BAA4B,EAAE,QAAQ,EAAE,KAAK,EAAE;IACxD,EAAE,KAAK,EAAE,0BAA0B,EAAE,QAAQ,EAAE,KAAK,EAAE;IACtD,EAAE,KAAK,EAAE,+BAA+B,EAAE,QAAQ,EAAE,IAAI,EAAE;IAC1D,EAAE,KAAK,EAAE,+BAA+B,EAAE,QAAQ,EAAE,KAAK,EAAE;IAC3D,EAAE,KAAK,EAAE,qBAAqB,EAAE,QAAQ,EAAE,KAAK,EAAE;CAClD,CAAC;AAEF,SAAS,eAAe,CAAC,KAAa,EAAE,GAAiB;IACvD,MAAM,CAAC,GAAG,aAAa,CAAC,KAAK,GAAG,aAAa,CAAC,MAAM,CAAE,CAAC;IACvD,MAAM,EAAE,GAAG,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACrD,MAAM,SAAS,GAAG,CAAC,UAAU,EAAE,cAAc,EAAE,UAAU,CAAC,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,CAAC,CAAC,CAAC,CAAC;IAElF,OAAO;OACF,EAAE;SACA,CAAC,CAAC,IAAI;SACN,CAAC,CAAC,IAAI;eACA,SAAS;iBACP,CAAC,CAAC,IAAI;;;;;;;;;;;;;;;EAerB,CAAC,CAAC,IAAI,SAAS,CAAC,CAAC,IAAI,SAAS,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,EAAE,GAAG,CAAC,CAAC,mDAAmD,CAAC,CAAC,IAAI,CAAC,WAAW,EAAE;;;;EAI/H,CAAC,CAAC,IAAI;;;;8BAIsB,CAAC,CAAC,IAAI;CACnC,CAAC;AACF,CAAC;AAED,SAAS,YAAY,CAAC,KAAa,EAAE,GAAiB;IACpD,MAAM,KAAK,GAAG,WAAW,CAAC,KAAK,GAAG,WAAW,CAAC,MAAM,CAAE,CAAC;IACvD,MAAM,EAAE,GAAG,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IACpD,MAAM,QAAQ,GAAG,CAAC,OAAO,EAAE,aAAa,EAAE,UAAU,EAAE,aAAa,CAAC,CAAC;IACrE,MAAM,MAAM,GAAG,QAAQ,CAAC,IAAI,CAAC,KAAK,CAAC,GAAG,EAAE,GAAG,QAAQ,CAAC,MAAM,CAAC,CAAE,CAAC;IAE9D,OAAO;OACF,EAAE;UACC,KAAK;WACJ,MAAM;;;;;;;;;;;;;MAaX,KAAK,0CAA0C,KAAK,CAAC,WAAW,EAAE;;;;;;;;;;;;sCAYlC,KAAK,CAAC,WAAW,EAAE,8EAA8E,KAAK,CAAC,WAAW,EAAE;CACzJ,CAAC;AACF,CAAC;AAED,SAAS,gBAAgB,CAAC,KAAa,EAAE,IAAkB;IACzD,MAAM,KAAK,GAAG,eAAe,CAAC,KAAK,GAAG,eAAe,CAAC,MAAM,CAAE,CAAC;IAC/D,MAAM,EAAE,GAAG,OAAO,MAAM,CAAC,KAAK,GAAG,CAAC,CAAC,CAA
C,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,EAAE,CAAC;IAEvD,OAAO;OACF,EAAE;UACC,KAAK;;eAEA,IAAI,CAAC,GAAG,CAAC,KAAK,GAAG,CAAC,EAAE,CAAC,CAAC;0DACqB,KAAK,CAAC,WAAW,EAAE;2BAClD,KAAK,CAAC,WAAW,EAAE;;;;;;;;;;;0CAWJ,KAAK,CAAC,WAAW,EAAE;;;;4CAIjB,KAAK,CAAC,WAAW,EAAE;;;;;CAK9D,CAAC;AACF,CAAC;AAED,SAAS,kBAAkB,CAAC,KAAa,EAAE,IAAkB;IAC3D,MAAM,IAAI,GAAG,YAAY,CAAC,KAAK,GAAG,YAAY,CAAC,MAAM,CAAE,CAAC;IACxD,MAAM,EAAE,GAAG,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;IAEzD,OAAO;OACF,EAAE;YACG,IAAI,CAAC,QAAQ;UACf,IAAI,CAAC,KAAK;;;KAGf,IAAI,CAAC,KAAK;;uCAEwB,IAAI,CAAC,KAAK,CAAC,WAAW,EAAE;;;;;;;;;;;CAW9D,CAAC;AACF,CAAC;AAED,KAAK,UAAU,cAAc,CAAC,UAAkB,EAAE,SAAiB;IACjE,MAAM,GAAG,GAAG,SAAS,CAAC,EAAE,CAAC,CAAC,CAAC,qBAAqB;IAEhD,4CAA4C;IAC5C,MAAM,EAAE,CAAC,IAAI,CAAC,YAAY,EAAE,cAAc,CAAC,EAAE,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAE7E,8CAA8C;IAC9C,mFAAmF;IACnF,MAAM,SAAS,GAAG,UAAU,GAAG,EAAE,CAAC;IAClC,IAAI,SAAS,IAAI,CAAC;QAAE,OAAO;IAE3B,iFAAiF;IACjF,MAAM,YAAY,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC;IACjD,MAAM,SAAS,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC;IAC9C,MAAM,aAAa,GAAG,IAAI,CAAC,KAAK,CAAC,SAAS,GAAG,GAAG,CAAC,CAAC;IAClD,MAAM,WAAW,GAAG,SAAS,GAAG,YAAY,GAAG,SAAS,GAAG,aAAa,CAAC;IAEzE,oBAAoB;IACpB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,YAAY,EAAE,CAAC,EAAE,EAAE,CAAC;QACtC,MAAM,OAAO,GAAG,eAAe,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACxC,MAAM,IAAI,GAAG,aAAa,CAAC,CAAC,GAAG,aAAa,CAAC,MAAM,CAAE,CAAC,IAAI,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC9F,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,UAAU,EAAE,GAAG,IAAI,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;IACtE,CAAC;IAED,iBAAiB;IACjB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,SAAS,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,MAAM,OAAO,GAAG,YAAY,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACrC,MAAM,IAAI,GAAG,WAAW,CAAC,CAAC,GAAG,WAAW,CAAC,MAAM,CAAE,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QACrF,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,OAAO,EAAE,GAAG,IAAI,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;IACnE,CAAC;IAED,qBAAqB;IACrB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GA
AG,aAAa,EAAE,CAAC,EAAE,EAAE,CAAC;QACvC,MAAM,OAAO,GAAG,gBAAgB,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QACzC,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,WAAW,EAAE,OAAO,MAAM,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,QAAQ,CAAC,CAAC,EAAE,GAAG,CAAC,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;IACrG,CAAC;IAED,wBAAwB;IACxB,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,WAAW,EAAE,CAAC,EAAE,EAAE,CAAC;QACrC,MAAM,OAAO,GAAG,kBAAkB,CAAC,CAAC,EAAE,GAAG,CAAC,CAAC;QAC3C,MAAM,IAAI,GAAG,YAAY,CAAC,CAAC,GAAG,YAAY,CAAC,MAAM,CAAE,CAAC,KAAK,CAAC,WAAW,EAAE,CAAC,OAAO,CAAC,MAAM,EAAE,GAAG,CAAC,CAAC;QAC7F,MAAM,SAAS,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,EAAE,GAAG,IAAI,KAAK,CAAC,EAAE,OAAO,CAAC,CAAC;IACpE,CAAC;AACH,CAAC;AAED,KAAK,UAAU,IAAI;IACjB,MAAM,KAAK,GAAG;QACZ,EAAE,IAAI,EAAE,eAAe,EAAE,IAAI,EAAE,EAAE,EAAE;QACnC,EAAE,IAAI,EAAE,cAAc,EAAE,IAAI,EAAE,GAAG,EAAE;QACnC,EAAE,IAAI,EAAE,eAAe,EAAE,IAAI,EAAE,GAAG,EAAE;KACrC,CAAC;IAEF,KAAK,MAAM,EAAE,IAAI,EAAE,IAAI,EAAE,IAAI,KAAK,EAAE,CAAC;QACnC,MAAM,SAAS,GAAG,IAAI,CAAC,YAAY,EAAE,IAAI,CAAC,CAAC;QAC3C,OAAO,CAAC,GAAG,CAAC,cAAc,IAAI,KAAK,IAAI,YAAY,CAAC,CAAC;QACrD,MAAM,KAAK,CAAC,SAAS,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;QAC5C,MAAM,cAAc,CAAC,IAAI,EAAE,SAAS,CAAC,CAAC;QACtC,OAAO,CAAC,GAAG,CAAC,OAAO,IAAI,eAAe,SAAS,EAAE,CAAC,CAAC;IACrD,CAAC;IAED,OAAO,CAAC,GAAG,CAAC,6CAA6C,CAAC,CAAC;AAC7D,CAAC;AAED,IAAI,EAAE,CAAC,KAAK,CAAC,OAAO,CAAC,KAAK,CAAC,CAAC"}
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
import type Database from "better-sqlite3";
|
|
2
|
+
declare const FIXTURES_DIR: string;
|
|
3
|
+
/**
 * Handle returned by `setup`: the opened database, the directory the corpus
 * was copied into, and a callback that tears the environment down.
 */
export interface BenchHarness {
    /** Open better-sqlite3 database handle. */
    db: Database.Database;
    /** Path of the directory holding the copied corpus. */
    productDir: string;
    /** Asynchronous teardown callback; resolves once teardown has finished. */
    cleanup: () => Promise<void>;
}
|
|
8
|
+
/**
 * Create a fresh benchmark environment:
 * 1. Copy corpus to a temp directory
 * 2. Open a fresh SQLite DB
 * 3. Run fullIndex to populate it
 *
 * @param corpus Name of the fixture corpus directory to copy. Besides the
 *   bundled "corpus-small", the names written by the corpus generator
 *   ("corpus-medium", "corpus-large", "corpus-xlarge") are accepted.
 * @param options Optional settings; `skipIndex: true` skips step 3.
 * @returns The harness for the newly created environment.
 */
export declare function setup(corpus?: "corpus-small" | "corpus-medium" | "corpus-large" | "corpus-xlarge", options?: {
    skipIndex?: boolean;
}): Promise<BenchHarness>;
|
|
17
|
+
/**
 * Prime the embedding pipeline so that the one-off model load (about 23MB,
 * triggered by the first call) does not land inside a timed benchmark.
 * Intended to be called once before measurements start.
 */
export declare function warmupEmbeddings(): Promise<void>;
|
|
22
|
+
export { FIXTURES_DIR };
|
|
23
|
+
//# sourceMappingURL=harness.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"harness.d.ts","sourceRoot":"","sources":["../../../benchmarks/lib/harness.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,QAAQ,MAAM,gBAAgB,CAAC;AAE3C,QAAA,MAAM,YAAY,QAA8C,CAAC;AAEjE,MAAM,WAAW,YAAY;IAC3B,EAAE,EAAE,QAAQ,CAAC,QAAQ,CAAC;IACtB,UAAU,EAAE,MAAM,CAAC;IACnB,OAAO,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;CAC9B;AAED;;;;;GAKG;AACH,wBAAsB,KAAK,CACzB,MAAM,GAAE,cAA+B,EACvC,OAAO,GAAE;IAAE,SAAS,CAAC,EAAE,OAAO,CAAA;CAAO,GACpC,OAAO,CAAC,YAAY,CAAC,CAyBvB;AAED;;;GAGG;AACH,wBAAsB,gBAAgB,IAAI,OAAO,CAAC,IAAI,CAAC,CAGtD;AAED,OAAO,EAAE,YAAY,EAAE,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
import { mkdtemp, cp, rm } from "node:fs/promises";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
import { tmpdir } from "node:os";
|
|
4
|
+
import { openDB, closeDB } from "../../src/db.js";
|
|
5
|
+
import { fullIndex } from "../../src/indexer.js";
|
|
6
|
+
const FIXTURES_DIR = join(import.meta.dirname, "..", "fixtures");
|
|
7
|
+
/**
 * Create a fresh benchmark environment:
 * 1. Copy corpus to a temp directory
 * 2. Open a fresh SQLite DB
 * 3. Run fullIndex to populate it
 *
 * If copying, opening the DB or indexing fails, the temp directory is removed
 * and the DB is closed before the error is rethrown, so a failed setup does
 * not leave anything behind.
 *
 * @param {string} [corpus="corpus-small"] - Name of the corpus directory under FIXTURES_DIR.
 * @param {{ skipIndex?: boolean }} [options={}] - Set `skipIndex` to skip step 3.
 * @returns {Promise<{ db: object, productDir: string, cleanup: () => Promise<void> }>}
 *   The DB handle, the copied corpus directory, and a teardown callback that
 *   closes the DB and deletes the temp directory.
 */
export async function setup(corpus = "corpus-small", options = {}) {
    // Ensure any previous DB singleton is closed
    closeDB();
    // Create temp directory; everything below lives inside it
    const tmpDir = await mkdtemp(join(tmpdir(), "pcl-bench-"));
    const cleanup = async () => {
        closeDB();
        await rm(tmpDir, { recursive: true, force: true });
    };
    try {
        // Copy corpus
        const productDir = join(tmpDir, "product");
        await cp(join(FIXTURES_DIR, corpus), productDir, { recursive: true });
        // Open fresh DB
        const db = openDB(productDir);
        // Index corpus unless skipped
        if (!options.skipIndex) {
            await fullIndex(db, productDir);
        }
        return { db, productDir, cleanup };
    }
    catch (err) {
        // Best-effort teardown: a cleanup failure is deliberately ignored so
        // that it cannot mask the error that caused setup to fail.
        await cleanup().catch(() => { });
        throw err;
    }
}
|
|
35
|
+
/**
 * Prime the embedding pipeline by embedding a throwaway string, so the
 * one-off model load (about 23MB, triggered by the first call) happens
 * before any timed benchmark. Call once up front.
 *
 * @returns {Promise<void>}
 */
export async function warmupEmbeddings() {
    // Loaded lazily so importing this module does not pull in the embeddings code.
    const embeddingsModule = await import("../../src/embeddings.js");
    const embed = embeddingsModule.embedText;
    await embed("warmup");
}
|
|
43
|
+
export { FIXTURES_DIR };
|
|
44
|
+
//# sourceMappingURL=harness.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"harness.js","sourceRoot":"","sources":["../../../benchmarks/lib/harness.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,OAAO,EAAE,EAAE,EAAE,EAAE,EAAE,MAAM,kBAAkB,CAAC;AACnD,OAAO,EAAE,IAAI,EAAE,MAAM,WAAW,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,MAAM,SAAS,CAAC;AACjC,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,iBAAiB,CAAC;AAClD,OAAO,EAAE,SAAS,EAAE,MAAM,sBAAsB,CAAC;AAGjD,MAAM,YAAY,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,OAAO,EAAE,IAAI,EAAE,UAAU,CAAC,CAAC;AAQjE;;;;;GAKG;AACH,MAAM,CAAC,KAAK,UAAU,KAAK,CACzB,SAAyB,cAAc,EACvC,UAAmC,EAAE;IAErC,6CAA6C;IAC7C,OAAO,EAAE,CAAC;IAEV,wCAAwC;IACxC,MAAM,MAAM,GAAG,MAAM,OAAO,CAAC,IAAI,CAAC,MAAM,EAAE,EAAE,YAAY,CAAC,CAAC,CAAC;IAC3D,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IAC3C,MAAM,EAAE,CAAC,IAAI,CAAC,YAAY,EAAE,MAAM,CAAC,EAAE,UAAU,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAEtE,gBAAgB;IAChB,MAAM,EAAE,GAAG,MAAM,CAAC,UAAU,CAAC,CAAC;IAE9B,8BAA8B;IAC9B,IAAI,CAAC,OAAO,CAAC,SAAS,EAAE,CAAC;QACvB,MAAM,SAAS,CAAC,EAAE,EAAE,UAAU,CAAC,CAAC;IAClC,CAAC;IAED,OAAO;QACL,EAAE;QACF,UAAU;QACV,OAAO,EAAE,KAAK,IAAI,EAAE;YAClB,OAAO,EAAE,CAAC;YACV,MAAM,EAAE,CAAC,MAAM,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;QACrD,CAAC;KACF,CAAC;AACJ,CAAC;AAED;;;GAGG;AACH,MAAM,CAAC,KAAK,UAAU,gBAAgB;IACpC,MAAM,EAAE,SAAS,EAAE,GAAG,MAAM,MAAM,CAAC,yBAAyB,CAAC,CAAC;IAC9D,MAAM,SAAS,CAAC,QAAQ,CAAC,CAAC;AAC5B,CAAC;AAED,OAAO,EAAE,YAAY,EAAE,CAAC"}
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
/** Relevance labels for the documents judged against a single query. */
export interface RelevanceJudgment {
    /** Query text. */
    query: string;
    /** Category the query belongs to. */
    queryType: "exact_term" | "conceptual" | "multi_hop";
    /** One entry per judged document; `relevance` is a grade from 0 to 3. */
    judgments: {
        docId: string;
        relevance: 0 | 1 | 2 | 3;
    }[];
}
|
|
9
|
+
/** Container for the relevance judgments of a set of queries. */
export interface RelevanceGroundTruth {
    queries: Array<RelevanceJudgment>;
}
|
|
12
|
+
/** A single weighted criterion, checked either by regex or by an LLM judge. */
export interface EvaluationCriterion {
    /** Text of the criterion. */
    criterion: string;
    /** Numeric weight of this criterion. */
    weight: number;
    /** How the criterion is evaluated. */
    type: "regex" | "llm_judge";
    /** Optional; presumably used when `type` is "regex" (confirm against the evaluator). */
    pattern?: string;
    /** Optional; presumably used when `type` is "llm_judge" (confirm against the evaluator). */
    prompt?: string;
}
|
|
19
|
+
/** Definition of one coding task together with the criteria used to evaluate it. */
export interface CodingTask {
    /** Task identifier. */
    id: string;
    /** Category the task belongs to. */
    category: "spec_compliance" | "business_rule" | "persona_alignment" | "architecture" | "journey_correctness";
    /** Task description. */
    description: string;
    /** List of required context entries (exact format not visible here). */
    requiredContext: Array<string>;
    /** Patterns listed as forbidden for this task. */
    forbiddenPatterns: Array<string>;
    /** Patterns listed as required for this task. */
    requiredPatterns: Array<string>;
    /** Criteria the task is evaluated against. */
    evaluationCriteria: Array<EvaluationCriterion>;
}
|
|
28
|
+
/** Container for a set of coding tasks. */
export interface TaskGroundTruth {
    tasks: Array<CodingTask>;
}
|
|
31
|
+
/**
 * Summary statistics for one named benchmark.
 * Units of the numeric fields are not stated in this declaration.
 */
export interface PerformanceResult {
    /** Benchmark name. */
    name: string;
    mean: number;
    median: number;
    /** Presumably the 95th percentile (confirm against the producer). */
    p95: number;
    /** Iteration count. */
    iterations: number;
}
|
|
38
|
+
/** Search-quality metrics for one search mode, optionally for a single query type. */
export interface SearchQualityResult {
    /** Search mode the metrics belong to. */
    mode: "hybrid" | "semantic" | "keyword";
    /** Optional query type label. */
    queryType?: string;
    precisionAt1: number;
    precisionAt3: number;
    precisionAt5: number;
    recallAt5: number;
    /** Presumably mean reciprocal rank (confirm against the metric code). */
    mrr: number;
    /** Presumably NDCG at rank 5 (confirm against the metric code). */
    ndcgAt5: number;
}
|
|
48
|
+
/**
 * Token-count comparison for one corpus size: a PCL session start versus
 * pasting everything. How `savingsPercent` and `ratio` are derived is not
 * visible in this declaration.
 */
export interface TokenEfficiencyResult {
    corpusSize: number;
    pclSessionStartTokens: number;
    pasteAllTokens: number;
    savingsPercent: number;
    ratio: number;
}
|
|
55
|
+
/**
 * Scores and token counts for one task under three conditions:
 * no context, paste-all, and PCL.
 */
export interface AIQualityResult {
    /** Identifier of the task. */
    taskId: string;
    /** Task category label. */
    category: string;
    noContextScore: number;
    pasteAllScore: number;
    pclScore: number;
    noContextTokens: number;
    pasteAllTokens: number;
    pclTokens: number;
}
|
|
65
|
+
/** Search-quality metrics recorded for one named configuration. */
export interface AblationResult {
    /** Configuration label. */
    configuration: string;
    /** Metrics recorded for that configuration. */
    searchQuality: SearchQualityResult;
}
|
|
69
|
+
/**
 * Top-level benchmark report: identifying metadata plus one optional
 * result list per benchmark category.
 */
export interface BenchmarkReport {
    timestamp: string;
    version: string;
    commit: string;
    performance?: Array<PerformanceResult>;
    searchQuality?: Array<SearchQualityResult>;
    tokenEfficiency?: Array<TokenEfficiencyResult>;
    aiQuality?: Array<AIQualityResult>;
    ablation?: Array<AblationResult>;
}
|
|
79
|
+
//# sourceMappingURL=types.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.d.ts","sourceRoot":"","sources":["../../../benchmarks/lib/types.ts"],"names":[],"mappings":"AAIA,MAAM,WAAW,iBAAiB;IAChC,KAAK,EAAE,MAAM,CAAC;IACd,SAAS,EAAE,YAAY,GAAG,YAAY,GAAG,WAAW,CAAC;IACrD,SAAS,EAAE,KAAK,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,SAAS,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;KAC1B,CAAC,CAAC;CACJ;AAED,MAAM,WAAW,oBAAoB;IACnC,OAAO,EAAE,iBAAiB,EAAE,CAAC;CAC9B;AAED,MAAM,WAAW,mBAAmB;IAClC,SAAS,EAAE,MAAM,CAAC;IAClB,MAAM,EAAE,MAAM,CAAC;IACf,IAAI,EAAE,OAAO,GAAG,WAAW,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,UAAU;IACzB,EAAE,EAAE,MAAM,CAAC;IACX,QAAQ,EAAE,iBAAiB,GAAG,eAAe,GAAG,mBAAmB,GAAG,cAAc,GAAG,qBAAqB,CAAC;IAC7G,WAAW,EAAE,MAAM,CAAC;IACpB,eAAe,EAAE,MAAM,EAAE,CAAC;IAC1B,iBAAiB,EAAE,MAAM,EAAE,CAAC;IAC5B,gBAAgB,EAAE,MAAM,EAAE,CAAC;IAC3B,kBAAkB,EAAE,mBAAmB,EAAE,CAAC;CAC3C;AAED,MAAM,WAAW,eAAe;IAC9B,KAAK,EAAE,UAAU,EAAE,CAAC;CACrB;AAID,MAAM,WAAW,iBAAiB;IAChC,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,MAAM,EAAE,MAAM,CAAC;IACf,GAAG,EAAE,MAAM,CAAC;IACZ,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,mBAAmB;IAClC,IAAI,EAAE,QAAQ,GAAG,UAAU,GAAG,SAAS,CAAC;IACxC,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,YAAY,EAAE,MAAM,CAAC;IACrB,SAAS,EAAE,MAAM,CAAC;IAClB,GAAG,EAAE,MAAM,CAAC;IACZ,OAAO,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,qBAAqB;IACpC,UAAU,EAAE,MAAM,CAAC;IACnB,qBAAqB,EAAE,MAAM,CAAC;IAC9B,cAAc,EAAE,MAAM,CAAC;IACvB,cAAc,EAAE,MAAM,CAAC;IACvB,KAAK,EAAE,MAAM,CAAC;CACf;AAED,MAAM,WAAW,eAAe;IAC9B,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,cAAc,EAAE,MAAM,CAAC;IACvB,aAAa,EAAE,MAAM,CAAC;IACtB,QAAQ,EAAE,MAAM,CAAC;IACjB,eAAe,EAAE,MAAM,CAAC;IACxB,cAAc,EAAE,MAAM,CAAC;IACvB,SAAS,EAAE,MAAM,CAAC;CACnB;AAED,MAAM,WAAW,cAAc;IAC7B,aAAa,EAAE,MAAM,CAAC;IACtB,aAAa,EAAE,mBAAmB,CAAC;CACpC;AAED,MAAM,WAAW,eAAe;IAC9B,SAAS,EAAE,MAAM,CAAC;IAClB,OAAO,EAAE,MAAM,CAAC;IAChB,MAAM,EAAE,MAAM,CAAC;IACf,WAAW,CAAC,EAAE,iBAAiB,EAAE,CAAC;IAClC,aAAa,CAAC,EAAE,mBAAmB,EAAE,CAAC;IACtC,eAAe,CAAC,EAAE,qBAAq
B,EAAE,CAAC;IAC1C,SAAS,CAAC,EAAE,eAAe,EAAE,CAAC;IAC9B,QAAQ,CAAC,EAAE,cAAc,EAAE,CAAC;CAC7B"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"types.js","sourceRoot":"","sources":["../../../benchmarks/lib/types.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"markdown-reporter.d.ts","sourceRoot":"","sources":["../../../benchmarks/reporters/markdown-reporter.ts"],"names":[],"mappings":""}
|