aspectcode 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/ui/theme.d.ts +3 -3
- package/dist/ui/theme.js +3 -3
- package/dist/ui/theme.js.map +1 -1
- package/node_modules/@aspectcode/evaluator/dist/diagnosis.d.ts +33 -0
- package/node_modules/@aspectcode/evaluator/dist/diagnosis.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/diagnosis.js +172 -0
- package/node_modules/@aspectcode/evaluator/dist/diagnosis.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/aider.d.ts +23 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/aider.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/aider.js +125 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/aider.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/claudeCode.d.ts +17 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/claudeCode.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/claudeCode.js +223 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/claudeCode.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/cline.d.ts +17 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/cline.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/cline.js +179 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/cline.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/common.d.ts +39 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/common.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/common.js +170 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/common.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/export.d.ts +23 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/export.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/export.js +98 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/export.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/index.d.ts +29 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/index.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/index.js +94 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/index.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/vscodeDb.d.ts +37 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/vscodeDb.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/vscodeDb.js +473 -0
- package/node_modules/@aspectcode/evaluator/dist/harvest/vscodeDb.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/index.d.ts +48 -0
- package/node_modules/@aspectcode/evaluator/dist/index.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/index.js +75 -0
- package/node_modules/@aspectcode/evaluator/dist/index.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/probes.d.ts +38 -0
- package/node_modules/@aspectcode/evaluator/dist/probes.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/probes.js +229 -0
- package/node_modules/@aspectcode/evaluator/dist/probes.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/runner.d.ts +34 -0
- package/node_modules/@aspectcode/evaluator/dist/runner.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/runner.js +183 -0
- package/node_modules/@aspectcode/evaluator/dist/runner.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/types.d.ts +176 -0
- package/node_modules/@aspectcode/evaluator/dist/types.d.ts.map +1 -0
- package/node_modules/@aspectcode/evaluator/dist/types.js +9 -0
- package/node_modules/@aspectcode/evaluator/dist/types.js.map +1 -0
- package/node_modules/@aspectcode/evaluator/package.json +41 -0
- package/package.json +2 -1
|
@@ -0,0 +1,38 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Probe generator — creates scoped micro-tests from KB content.
|
|
3
|
+
*
|
|
4
|
+
* Probes are derived from:
|
|
5
|
+
* 1. KB structure (hubs, entry points, naming conventions, integrations)
|
|
6
|
+
* 2. KB diff (changed areas and their 1-hop dependents)
|
|
7
|
+
* 3. Harvested prompts (real user interactions that reveal problem areas)
|
|
8
|
+
*
|
|
9
|
+
* Each probe is a self-contained scenario that can be "run" by sending it
|
|
10
|
+
* to an LLM with AGENTS.md as context and evaluating the response.
|
|
11
|
+
*/
|
|
12
|
+
import type { Probe, ProbeGeneratorOptions } from './types';
|
|
13
|
+
/**
 * Extract a section from KB text.
 *
 * Yields the trimmed text that follows the first occurrence of `heading`,
 * up to the next `---` separator line or the end of `kb`. Yields `''` when
 * `heading` does not occur in `kb`.
 */
declare function extractSection(
    kb: string,
    heading: string,
): string;
|
|
15
|
+
/**
 * Parse the rows of the "High-Risk Architectural Hubs" table.
 *
 * Rows have the shape `| path | in | out |`; only rows whose last two
 * columns are integers are returned.
 */
declare function parseHubs(architecture: string): {
    file: string;
    inDegree: number;
    outDegree: number;
}[];
|
|
21
|
+
/**
 * Parse the "Entry Points" table of the architecture section.
 *
 * Each returned item pairs a file path with the kind of entry point it is.
 */
declare function parseEntryPoints(architecture: string): {
    file: string;
    kind: string;
}[];
|
|
26
|
+
/**
 * Parse naming conventions from the map section.
 *
 * Returns the text of each bullet (`- ` or `* `) under the "Conventions"
 * heading, with the bullet marker removed.
 */
declare function parseConventions(mapSection: string): Array<string>;
|
|
28
|
+
/**
 * Collect the distinct paths mentioned in a diff string.
 *
 * Covers `--- a/…` / `+++ b/…` file headers as well as quoted specifiers on
 * added or removed import/require lines.
 */
declare function parseDiffFiles(diff: string): Array<string>;
|
|
30
|
+
/**
 * Generate probes from the KB content, an optional KB diff and optional
 * harvested prompts.
 *
 * Probes derived from the diff are placed ahead of the structural probes
 * (hubs, entry points, conventions). The result is de-duplicated by probe id
 * and capped at `options.maxProbes` (10 when omitted).
 */
export declare function generateProbes(
    options: ProbeGeneratorOptions,
): Array<Probe>;
|
|
37
|
+
export { extractSection, parseHubs, parseEntryPoints, parseConventions, parseDiffFiles };
|
|
38
|
+
//# sourceMappingURL=probes.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"probes.d.ts","sourceRoot":"","sources":["../src/probes.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;GAUG;AAEH,OAAO,KAAK,EACV,KAAK,EAEL,qBAAqB,EAEtB,MAAM,SAAS,CAAC;AAIjB,wDAAwD;AACxD,iBAAS,cAAc,CAAC,EAAE,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,GAAG,MAAM,CAO3D;AAED,2EAA2E;AAC3E,iBAAS,SAAS,CAAC,YAAY,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,QAAQ,EAAE,MAAM,CAAC;IAAC,SAAS,EAAE,MAAM,CAAA;CAAE,CAAC,CAarG;AAED,sDAAsD;AACtD,iBAAS,gBAAgB,CAAC,YAAY,EAAE,MAAM,GAAG,KAAK,CAAC;IAAE,IAAI,EAAE,MAAM,CAAC;IAAC,IAAI,EAAE,MAAM,CAAA;CAAE,CAAC,CAarF;AAED,qDAAqD;AACrD,iBAAS,gBAAgB,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,EAAE,CAOtD;AAaD,mDAAmD;AACnD,iBAAS,cAAc,CAAC,IAAI,EAAE,MAAM,GAAG,MAAM,EAAE,CAa9C;AAsGD;;;;;GAKG;AACH,wBAAgB,cAAc,CAAC,OAAO,EAAE,qBAAqB,GAAG,KAAK,EAAE,CA4CtE;AAGD,OAAO,EAAE,cAAc,EAAE,SAAS,EAAE,gBAAgB,EAAE,gBAAgB,EAAE,cAAc,EAAE,CAAC"}
|
|
@@ -0,0 +1,229 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Probe generator — creates scoped micro-tests from KB content.
|
|
4
|
+
*
|
|
5
|
+
* Probes are derived from:
|
|
6
|
+
* 1. KB structure (hubs, entry points, naming conventions, integrations)
|
|
7
|
+
* 2. KB diff (changed areas and their 1-hop dependents)
|
|
8
|
+
* 3. Harvested prompts (real user interactions that reveal problem areas)
|
|
9
|
+
*
|
|
10
|
+
* Each probe is a self-contained scenario that can be "run" by sending it
|
|
11
|
+
* to an LLM with AGENTS.md as context and evaluating the response.
|
|
12
|
+
*/
|
|
13
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
14
|
+
exports.generateProbes = generateProbes;
|
|
15
|
+
exports.extractSection = extractSection;
|
|
16
|
+
exports.parseHubs = parseHubs;
|
|
17
|
+
exports.parseEntryPoints = parseEntryPoints;
|
|
18
|
+
exports.parseConventions = parseConventions;
|
|
19
|
+
exports.parseDiffFiles = parseDiffFiles;
|
|
20
|
+
// ── KB section parsers ──────────────────────────────────────
|
|
21
|
+
/**
 * Return the trimmed text that follows the first occurrence of `heading`.
 *
 * `heading` is located with a plain substring search. The section ends at
 * the next `\n---\n` separator after the heading, or at the end of `kb`
 * when there is none.
 *
 * @param kb - Full knowledge-base text.
 * @param heading - Literal heading text to look for.
 * @returns The trimmed section body, or `''` when `heading` is not found.
 */
function extractSection(kb, heading) {
    const headingAt = kb.indexOf(heading);
    if (headingAt < 0) {
        return '';
    }
    const bodyStart = headingAt + heading.length;
    const separatorAt = kb.indexOf('\n---\n', bodyStart);
    // Without a separator after the heading the section runs to the end of the text.
    const body = separatorAt > 0 ? kb.slice(bodyStart, separatorAt) : kb.slice(bodyStart);
    return body.trim();
}
|
|
31
|
+
/**
 * Parse the "High-Risk Architectural Hubs" table.
 *
 * Looks up the sub-section with that heading and reads every row of the
 * form `| path | in | out |`, where `path` may be wrapped in backticks and
 * the other two columns are integers. Rows without two integer columns
 * (such as a header or separator row) do not match the pattern.
 *
 * @param architecture - Text that contains the hubs sub-section.
 * @returns One `{ file, inDegree, outDegree }` object per matching row.
 */
function parseHubs(architecture) {
    const hubTable = extractSubSection(architecture, 'High-Risk Architectural Hubs');
    const rowPattern = /\|\s*`?([^`|]+?)`?\s*\|\s*(\d+)\s*\|\s*(\d+)\s*\|/g;
    return Array.from(hubTable.matchAll(rowPattern), ([, file, dependents, dependencies]) => ({
        file: file.trim(),
        inDegree: parseInt(dependents, 10),
        outDegree: parseInt(dependencies, 10),
    }));
}
|
|
46
|
+
/**
 * Parse the "Entry Points" table.
 *
 * Reads the first two columns of every `| file | kind |` row in the
 * sub-section. Rows whose first column contains `---` (table separator)
 * or whose second column is exactly `Kind` (table header) are skipped.
 *
 * @param architecture - Text that contains the entry-points sub-section.
 * @returns One `{ file, kind }` object per accepted row.
 */
function parseEntryPoints(architecture) {
    const table = extractSubSection(architecture, 'Entry Points');
    const rowPattern = /\|\s*`?([^`|]+?)`?\s*\|\s*([^|]+?)\s*\|/g;
    const found = [];
    for (const row of table.matchAll(rowPattern)) {
        const file = row[1].trim();
        const kind = row[2].trim();
        const isSeparatorRow = file.includes('---');
        const isHeaderRow = kind === 'Kind';
        if (!file || isSeparatorRow || isHeaderRow) {
            continue;
        }
        found.push({ file, kind });
    }
    return found;
}
|
|
61
|
+
/**
 * Parse the bullet list under the "Conventions" heading.
 *
 * Only lines that start with `- ` or `* ` count as bullets; the marker is
 * stripped and empty items are dropped.
 *
 * @param mapSection - Text that contains the conventions sub-section.
 * @returns The bullet texts in document order.
 */
function parseConventions(mapSection) {
    const conventions = [];
    for (const line of extractSubSection(mapSection, 'Conventions').split('\n')) {
        const isBullet = line.startsWith('- ') || line.startsWith('* ');
        if (!isBullet) {
            continue;
        }
        const text = line.replace(/^[-*]\s*/, '').trim();
        if (text) {
            conventions.push(text);
        }
    }
    return conventions;
}
|
|
70
|
+
/**
 * Extract the body of a markdown sub-section.
 *
 * The heading is matched case-insensitively as one or more `#` characters,
 * optional whitespace, then the literal `heading` text (regex
 * metacharacters in `heading` are escaped). The body ends at the next line
 * that starts with a level 1-3 heading (`#`, `##` or `###` followed by
 * whitespace), or at the end of `section`. Deeper headings (`####` and
 * beyond) do not end the body.
 *
 * @param section - Text to search in.
 * @param heading - Heading text without the leading `#` characters.
 * @returns The trimmed body, or `''` when the heading is missing or the
 *   sub-section is empty.
 */
function extractSubSection(section, heading) {
    const escapedHeading = heading.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    const headingMatch = new RegExp(`#+\\s*${escapedHeading}`, 'i').exec(section);
    if (!headingMatch) {
        return '';
    }
    const start = headingMatch.index + headingMatch[0].length;
    const nextHeading = section.slice(start).search(/\n#{1,3}\s/);
    // search() reports "no match" as -1. Index 0 is a real match: the next
    // heading starts on the very next line, so this sub-section is empty and
    // must not swallow the following section.
    return nextHeading >= 0
        ? section.slice(start, start + nextHeading).trim()
        : section.slice(start).trim();
}
|
|
81
|
+
/**
 * Collect the distinct paths mentioned in a unified diff.
 *
 * Two kinds of mentions are gathered, in this order:
 *  1. file headers (`--- a/<path>` and `+++ b/<path>`);
 *  2. the quoted specifier on added or removed lines that begin with
 *     `import`, `from` or `require`.
 *
 * @param diff - Diff text.
 * @returns Unique paths in first-seen order.
 */
function parseDiffFiles(diff) {
    const headerPattern = /^[+-]{3}\s+[ab]\/(.+)$/gm;
    const importPattern = /(?:^[+-]\s*(?:import|from|require)\s*.*?['"])([^'"]+)['"]/gm;
    const seen = new Set();
    // Headers are scanned first so they come ahead of import specifiers.
    for (const pattern of [headerPattern, importPattern]) {
        for (const [, captured] of diff.matchAll(pattern)) {
            seen.add(captured);
        }
    }
    return Array.from(seen);
}
|
|
96
|
+
// ── Probe generators by category ────────────────────────────
|
|
97
|
+
/**
 * Build one "hub-safety" probe for each of the first five hubs.
 *
 * Each probe asks for a new export in the hub file and expects the answer
 * to treat the file as high-risk because of its dependents.
 *
 * @param hubs - Hub descriptors with `file` and `inDegree`.
 * @returns At most five probe objects.
 */
function generateHubProbes(hubs) {
    const probes = [];
    for (const { file, inDegree } of hubs.slice(0, 5)) {
        const expectedBehaviors = [
            `Acknowledges that ${file} is a high-risk hub with many dependents`,
            'Warns about or checks for breaking changes to existing exports',
            'Suggests verifying or updating downstream consumers',
            'Makes a minimal, backwards-compatible change',
        ];
        probes.push({
            id: `hub-safety-${sanitizeId(file)}`,
            description: `Test that the AI handles ${file} carefully (${inDegree} dependents)`,
            category: 'hub-safety',
            contextFiles: [file],
            task: `I need to modify \`${file}\` to add a new exported function. This file has ${inDegree} files that depend on it. Write the code change and explain what else needs updating.`,
            expectedBehaviors,
        });
    }
    return probes;
}
|
|
112
|
+
/**
 * Build one "entry-point" probe for each of the first three entry points.
 *
 * Each probe asks where a new route/command of the entry point's kind
 * should go and which existing patterns to follow.
 *
 * @param entries - Entry-point descriptors with `file` and `kind`.
 * @returns At most three probe objects.
 */
function generateEntryPointProbes(entries) {
    const probes = [];
    for (const { file, kind } of entries.slice(0, 3)) {
        const kindLower = kind.toLowerCase();
        probes.push({
            id: `entry-point-${sanitizeId(file)}`,
            description: `Test that the AI handles ${kind} entry point ${file} correctly`,
            category: 'entry-point',
            contextFiles: [file],
            task: `I want to add a new ${kindLower} route/command in \`${file}\`. Where should I add it and what patterns should I follow?`,
            expectedBehaviors: [
                `References the existing patterns in ${file}`,
                `Follows the ${kindLower} conventions used in the project`,
                'Suggests appropriate error handling consistent with existing handlers',
                'Places the new code in the correct location within the file',
            ],
        });
    }
    return probes;
}
|
|
127
|
+
/**
 * Build the single "naming" probe from the project's conventions.
 *
 * Only the first five conventions are used: they are quoted in the task
 * and each becomes one expected behaviour.
 *
 * @param conventions - Convention texts.
 * @returns An empty array when there are no conventions, otherwise an array
 *   holding exactly one probe.
 */
function generateNamingProbes(conventions) {
    if (conventions.length === 0) {
        return [];
    }
    const selected = conventions.slice(0, 5);
    const expectedBehaviors = [];
    for (const convention of selected) {
        expectedBehaviors.push(`Follows convention: ${convention}`);
    }
    const probe = {
        id: 'naming-conventions',
        description: "Test that the AI follows the project's naming conventions",
        category: 'naming',
        contextFiles: [],
        task: `I need to create a new utility module with a helper function and a class. What should I name the file, function, and class? The project has these conventions: ${selected.join('; ')}`,
        expectedBehaviors,
    };
    return [probe];
}
|
|
140
|
+
/**
 * Build one "architecture" probe for each of the first three changed files.
 *
 * Each probe asks what must be known about the file and its dependencies
 * before extending it.
 *
 * @param diffFiles - Paths taken from a diff.
 * @returns At most three probe objects (none for an empty list).
 */
function generateDiffProbes(diffFiles) {
    const probes = [];
    if (diffFiles.length === 0) {
        return probes;
    }
    for (const file of diffFiles.slice(0, 3)) {
        probes.push({
            id: `diff-area-${sanitizeId(file)}`,
            description: `Test AI awareness of recently changed file ${file}`,
            category: 'architecture',
            contextFiles: [file],
            task: `I'm working on \`${file}\` which was recently modified. I need to add a related feature. What do I need to know about this file and its dependencies before making changes?`,
            expectedBehaviors: [
                `Identifies the role/purpose of ${file} in the project`,
                'Notes any imports/exports that constrain changes',
                'Suggests checking dependent files',
                'Follows the existing code style in the file',
            ],
        });
    }
    return probes;
}
|
|
157
|
+
/**
 * Turn harvested user prompts into "harvested" probes.
 *
 * Keeps the first three prompts that reference at least one file and
 * replays each prompt's text as the probe task.
 * NOTE(review): "first" means first in the given array; whether callers
 * pass prompts most-recent-first is not visible here.
 *
 * @param prompts - Harvested prompts with `filesReferenced`, `source` and
 *   `userPrompt`.
 * @returns At most three probe objects.
 */
function generateHarvestedProbes(prompts) {
    const referencesFiles = (prompt) => prompt.filesReferenced.length > 0;
    const chosen = prompts.filter(referencesFiles).slice(0, 3);
    const probes = [];
    for (const [position, prompt] of chosen.entries()) {
        const { filesReferenced, source, userPrompt } = prompt;
        probes.push({
            id: `harvested-${position}-${sanitizeId(filesReferenced[0] ?? 'general')}`,
            description: `Probe from real ${source} interaction involving ${filesReferenced.join(', ')}`,
            category: 'harvested',
            contextFiles: filesReferenced,
            task: userPrompt,
            expectedBehaviors: [
                "Produces a response consistent with the project's conventions",
                'References the correct files and their roles',
                'Does not hallucinate non-existent APIs or patterns',
                `Handles the task at least as well as the original ${source} response`,
            ],
        });
    }
    return probes;
}
|
|
176
|
+
// ── Helpers ─────────────────────────────────────────────────
|
|
177
|
+
/**
 * Turn a file path into a short slug for use inside a probe id.
 *
 * Steps, in order: path separators (`/` and `\`) become `-`; the text from
 * the last `.` to the end is removed; every character other than ASCII
 * letters, digits and `-` is dropped; the result is lower-cased and cut to
 * 40 characters.
 *
 * The trailing-dot rule is applied after separators are replaced, so a dot
 * inside a directory name of an extension-less path also truncates the
 * slug (`src/v1.2/foo` becomes `src-v1`).
 *
 * @param path - File path.
 * @returns The slug.
 */
function sanitizeId(path) {
    const dashed = path.replace(/[/\\]/g, '-');
    const withoutExtension = dashed.replace(/\.[^.]+$/, '');
    const slug = withoutExtension.replace(/[^a-zA-Z0-9-]/g, '').toLowerCase();
    return slug.slice(0, 40);
}
|
|
185
|
+
// ── Public API ──────────────────────────────────────────────
|
|
186
|
+
/**
 * Generate probes from the KB content, an optional KB diff and optional
 * harvested prompts.
 *
 * Candidate order is: diff probes (only when `kbDiff` is given), hub
 * probes, entry-point probes, the naming probe, then harvested probes.
 * Candidates are de-duplicated by `id` (first one wins) and the list is
 * capped at `maxProbes` (default 10). The structural probes are generated
 * whether or not a diff is supplied; a diff only adds higher-priority
 * probes in front of them.
 *
 * @param options - `kb`, plus optional `kbDiff`, `harvestedPrompts` and
 *   `maxProbes`.
 * @returns The selected probes in priority order.
 */
function generateProbes(options) {
    const { kb, kbDiff, harvestedPrompts, maxProbes = 10 } = options;
    const hubsOnly = extractSection(kb, '## High-Risk Architectural Hubs');
    const fullArch = extractSection(kb, '# Architecture') || extractSection(kb, '## High-Risk');
    const mapSection = extractSection(kb, '# Map') || extractSection(kb, '## Data Models');
    const architectureText = fullArch || hubsOnly;
    const hubProbes = generateHubProbes(parseHubs(architectureText));
    const entryProbes = generateEntryPointProbes(parseEntryPoints(architectureText));
    const namingProbes = generateNamingProbes(parseConventions(mapSection));
    // Diff-scoped probes have the highest priority, so they lead the list.
    const diffProbes = kbDiff ? generateDiffProbes(parseDiffFiles(kbDiff)) : [];
    const harvestedProbes = harvestedPrompts && harvestedPrompts.length > 0
        ? generateHarvestedProbes(harvestedPrompts)
        : [];
    const candidates = [
        ...diffProbes,
        ...hubProbes,
        ...entryProbes,
        ...namingProbes,
        ...harvestedProbes,
    ];
    // A Map keeps insertion order, which gives "first occurrence wins".
    const selected = new Map();
    for (const probe of candidates) {
        if (!(selected.size < maxProbes)) {
            break;
        }
        if (!selected.has(probe.id)) {
            selected.set(probe.id, probe);
        }
    }
    return [...selected.values()];
}
|
|
229
|
+
//# sourceMappingURL=probes.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"probes.js","sourceRoot":"","sources":["../src/probes.ts"],"names":[],"mappings":";AAAA;;;;;;;;;;GAUG;;AAoMH,wCA4CC;AAGQ,wCAAc;AAAE,8BAAS;AAAE,4CAAgB;AAAE,4CAAgB;AAAE,wCAAc;AA1OtF,+DAA+D;AAE/D,wDAAwD;AACxD,SAAS,cAAc,CAAC,EAAU,EAAE,OAAe;IACjD,MAAM,GAAG,GAAG,EAAE,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;IAChC,IAAI,GAAG,GAAG,CAAC;QAAE,OAAO,EAAE,CAAC;IACvB,MAAM,KAAK,GAAG,GAAG,GAAG,OAAO,CAAC,MAAM,CAAC;IACnC,iDAAiD;IACjD,MAAM,MAAM,GAAG,EAAE,CAAC,OAAO,CAAC,SAAS,EAAE,KAAK,CAAC,CAAC;IAC5C,OAAO,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,EAAE,MAAM,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,EAAE,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;AAC9E,CAAC;AAED,2EAA2E;AAC3E,SAAS,SAAS,CAAC,YAAoB;IACrC,MAAM,IAAI,GAAiE,EAAE,CAAC;IAC9E,MAAM,UAAU,GAAG,oDAAoD,CAAC;IACxE,MAAM,OAAO,GAAG,iBAAiB,CAAC,YAAY,EAAE,8BAA8B,CAAC,CAAC;IAChF,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,UAAU,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QACnD,IAAI,CAAC,IAAI,CAAC;YACR,IAAI,EAAE,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE;YACrB,QAAQ,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;YAChC,SAAS,EAAE,QAAQ,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC;SAClC,CAAC,CAAC;IACL,CAAC;IACD,OAAO,IAAI,CAAC;AACd,CAAC;AAED,sDAAsD;AACtD,SAAS,gBAAgB,CAAC,YAAoB;IAC5C,MAAM,OAAO,GAA0C,EAAE,CAAC;IAC1D,MAAM,OAAO,GAAG,iBAAiB,CAAC,YAAY,EAAE,cAAc,CAAC,CAAC;IAChE,MAAM,KAAK,GAAG,0CAA0C,CAAC;IACzD,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC9C,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7B,MAAM,IAAI,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC;QAC7B,IAAI,IAAI,IAAI,CAAC,IAAI,CAAC,QAAQ,CAAC,KAAK,CAAC,IAAI,IAAI,KAAK,MAAM,EAAE,CAAC;YACrD,OAAO,CAAC,IAAI,CAAC,EAAE,IAAI,EAAE,IAAI,EAAE,CAAC,CAAC;QAC/B,CAAC;IACH,CAAC;IACD,OAAO,OAAO,CAAC;AACjB,CAAC;AAED,qDAAqD;AACrD,SAAS,gBAAgB,CAAC,UAAkB;IAC1C,MAAM,OAAO,GAAG,iBAAiB,CAAC,UAAU,EAAE,aAAa,CAAC,CAAC;IAC7D,OAAO,OAAO;SACX,KAAK,CAAC,IAAI,CAAC;SACX,MAAM,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,UAAU,CAAC,IAAI,CAAC,IAAI,IAAI,CAAC,UAAU,C
AAC,IAAI,CAAC,CAAC;SAChE,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,IAAI,CAAC,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC,CAAC,IAAI,EAAE,CAAC;SAClD,MAAM,CAAC,OAAO,CAAC,CAAC;AACrB,CAAC;AAED,gEAAgE;AAChE,SAAS,iBAAiB,CAAC,OAAe,EAAE,OAAe;IACzD,MAAM,KAAK,GAAG,IAAI,MAAM,CAAC,SAAS,OAAO,CAAC,OAAO,CAAC,qBAAqB,EAAE,MAAM,CAAC,EAAE,EAAE,GAAG,CAAC,CAAC;IACzF,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,OAAO,CAAC,CAAC;IAClC,IAAI,CAAC,KAAK;QAAE,OAAO,EAAE,CAAC;IACtB,MAAM,KAAK,GAAG,KAAK,CAAC,KAAK,GAAG,KAAK,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;IAC5C,4CAA4C;IAC5C,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,MAAM,CAAC,YAAY,CAAC,CAAC;IAC9D,OAAO,WAAW,GAAG,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,EAAE,KAAK,GAAG,WAAW,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,OAAO,CAAC,KAAK,CAAC,KAAK,CAAC,CAAC,IAAI,EAAE,CAAC;AAC1G,CAAC;AAED,mDAAmD;AACnD,SAAS,cAAc,CAAC,IAAY;IAClC,MAAM,KAAK,GAAG,IAAI,GAAG,EAAU,CAAC;IAChC,MAAM,KAAK,GAAG,0BAA0B,CAAC;IACzC,IAAI,KAA6B,CAAC;IAClC,OAAO,CAAC,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC3C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IACD,4EAA4E;IAC5E,MAAM,SAAS,GAAG,6DAA6D,CAAC;IAChF,OAAO,CAAC,KAAK,GAAG,SAAS,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC,KAAK,IAAI,EAAE,CAAC;QAC/C,KAAK,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,CAAC;IACtB,CAAC;IACD,OAAO,CAAC,GAAG,KAAK,CAAC,CAAC;AACpB,CAAC;AAED,+DAA+D;AAE/D,SAAS,iBAAiB,CAAC,IAAkE;IAC3F,OAAO,IAAI,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,GAAG,EAAE,EAAE,CAAC,CAAC;QACpC,EAAE,EAAE,cAAc,UAAU,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE;QACxC,WAAW,EAAE,4BAA4B,GAAG,CAAC,IAAI,eAAe,GAAG,CAAC,QAAQ,cAAc;QAC1F,QAAQ,EAAE,YAA6B;QACvC,YAAY,EAAE,CAAC,GAAG,CAAC,IAAI,CAAC;QACxB,IAAI,EAAE,sBAAsB,GAAG,CAAC,IAAI,oDAAoD,GAAG,CAAC,QAAQ,uFAAuF;QAC3L,iBAAiB,EAAE;YACjB,qBAAqB,GAAG,CAAC,IAAI,0CAA0C;YACvE,gEAAgE;YAChE,qDAAqD;YACrD,8CAA8C;SAC/C;KACF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,wBAAwB,CAAC,OAA8C;IAC9E,OAAO,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,EAAE,CAAC,CAAC;QACzC,EAAE,EAAE,eAAe,UAAU,CAAC,KAAK,CAAC,IAAI,CAAC,EAAE;QAC3C,WAAW,EAAE,
4BAA4B,KAAK,CAAC,IAAI,gBAAgB,KAAK,CAAC,IAAI,YAAY;QACzF,QAAQ,EAAE,aAA8B;QACxC,YAAY,EAAE,CAAC,KAAK,CAAC,IAAI,CAAC;QAC1B,IAAI,EAAE,uBAAuB,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,uBAAuB,KAAK,CAAC,IAAI,8DAA8D;QACpJ,iBAAiB,EAAE;YACjB,uCAAuC,KAAK,CAAC,IAAI,EAAE;YACnD,eAAe,KAAK,CAAC,IAAI,CAAC,WAAW,EAAE,kCAAkC;YACzE,uEAAuE;YACvE,6DAA6D;SAC9D;KACF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,oBAAoB,CAAC,WAAqB;IACjD,IAAI,WAAW,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACxC,MAAM,cAAc,GAAG,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAC1D,OAAO,CAAC;YACN,EAAE,EAAE,oBAAoB;YACxB,WAAW,EAAE,4DAA4D;YACzE,QAAQ,EAAE,QAAyB;YACnC,YAAY,EAAE,EAAE;YAChB,IAAI,EAAE,kKAAkK,cAAc,EAAE;YACxL,iBAAiB,EAAE,WAAW,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CACnD,uBAAuB,CAAC,EAAE,CAC3B;SACF,CAAC,CAAC;AACL,CAAC;AAED,SAAS,kBAAkB,CAAC,SAAmB;IAC7C,IAAI,SAAS,CAAC,MAAM,KAAK,CAAC;QAAE,OAAO,EAAE,CAAC;IACtC,OAAO,SAAS,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,EAAE,CAAC,CAAC;QAC1C,EAAE,EAAE,aAAa,UAAU,CAAC,IAAI,CAAC,EAAE;QACnC,WAAW,EAAE,8CAA8C,IAAI,EAAE;QACjE,QAAQ,EAAE,cAA+B;QACzC,YAAY,EAAE,CAAC,IAAI,CAAC;QACpB,IAAI,EAAE,oBAAoB,IAAI,qJAAqJ;QACnL,iBAAiB,EAAE;YACjB,kCAAkC,IAAI,iBAAiB;YACvD,kDAAkD;YAClD,mCAAmC;YACnC,6CAA6C;SAC9C;KACF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,SAAS,uBAAuB,CAAC,OAA0B;IACzD,6DAA6D;IAC7D,MAAM,SAAS,GAAG,OAAO;SACtB,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,eAAe,CAAC,MAAM,GAAG,CAAC,CAAC;SAC3C,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAEf,OAAO,SAAS,CAAC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,CAAC;QAC9B,EAAE,EAAE,aAAa,CAAC,IAAI,UAAU,CAAC,CAAC,CAAC,eAAe,CAAC,CAAC,CAAC,IAAI,SAAS,CAAC,EAAE;QACrE,WAAW,EAAE,mBAAmB,CAAC,CAAC,MAAM,0BAA0B,CAAC,CAAC,eAAe,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE;QAChG,QAAQ,EAAE,WAA4B;QACtC,YAAY,EAAE,CAAC,CAAC,eAAe;QAC/B,IAAI,EAAE,CAAC,CAAC,UAAU;QAClB,iBAAiB,EAAE;YACjB,gEAAgE;YAChE,8CAA8C;YAC9C,oDAAoD;YACpD,qDAAqD,CAAC,CAAC,MAAM,WAAW;SACzE;KACF,CAAC,CAAC,CAAC;AACN,CAAC;AAED,+DAA+D;AAE/D,SAAS,UAAU,CAAC,IAAY;IAC9B,OAAO,IAAI;SACR,OAAO,CAAC,QA
AQ,EAAE,GAAG,CAAC;SACtB,OAAO,CAAC,UAAU,EAAE,EAAE,CAAC;SACvB,OAAO,CAAC,gBAAgB,EAAE,EAAE,CAAC;SAC7B,WAAW,EAAE;SACb,KAAK,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC;AAClB,CAAC;AAED,+DAA+D;AAE/D;;;;;GAKG;AACH,SAAgB,cAAc,CAAC,OAA8B;IAC3D,MAAM,EAAE,EAAE,EAAE,MAAM,EAAE,gBAAgB,EAAE,SAAS,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC;IAEjE,MAAM,YAAY,GAAG,cAAc,CAAC,EAAE,EAAE,iCAAiC,CAAC,CAAC;IAC3E,MAAM,QAAQ,GAAG,cAAc,CAAC,EAAE,EAAE,gBAAgB,CAAC,IAAI,cAAc,CAAC,EAAE,EAAE,cAAc,CAAC,CAAC;IAC5F,MAAM,UAAU,GAAG,cAAc,CAAC,EAAE,EAAE,OAAO,CAAC,IAAI,cAAc,CAAC,EAAE,EAAE,gBAAgB,CAAC,CAAC;IACvF,MAAM,MAAM,GAAY,EAAE,CAAC;IAE3B,uBAAuB;IACvB,MAAM,IAAI,GAAG,SAAS,CAAC,QAAQ,IAAI,YAAY,CAAC,CAAC;IACjD,MAAM,CAAC,IAAI,CAAC,GAAG,iBAAiB,CAAC,IAAI,CAAC,CAAC,CAAC;IAExC,wBAAwB;IACxB,MAAM,OAAO,GAAG,gBAAgB,CAAC,QAAQ,IAAI,YAAY,CAAC,CAAC;IAC3D,MAAM,CAAC,IAAI,CAAC,GAAG,wBAAwB,CAAC,OAAO,CAAC,CAAC,CAAC;IAElD,8BAA8B;IAC9B,MAAM,WAAW,GAAG,gBAAgB,CAAC,UAAU,CAAC,CAAC;IACjD,MAAM,CAAC,IAAI,CAAC,GAAG,oBAAoB,CAAC,WAAW,CAAC,CAAC,CAAC;IAElD,qDAAqD;IACrD,IAAI,MAAM,EAAE,CAAC;QACX,MAAM,SAAS,GAAG,cAAc,CAAC,MAAM,CAAC,CAAC;QACzC,MAAM,UAAU,GAAG,kBAAkB,CAAC,SAAS,CAAC,CAAC;QACjD,qDAAqD;QACrD,MAAM,CAAC,OAAO,CAAC,GAAG,UAAU,CAAC,CAAC;IAChC,CAAC;IAED,6BAA6B;IAC7B,IAAI,gBAAgB,IAAI,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACpD,MAAM,CAAC,IAAI,CAAC,GAAG,uBAAuB,CAAC,gBAAgB,CAAC,CAAC,CAAC;IAC5D,CAAC;IAED,yCAAyC;IACzC,MAAM,IAAI,GAAG,IAAI,GAAG,EAAU,CAAC;IAC/B,MAAM,MAAM,GAAY,EAAE,CAAC;IAC3B,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,CAAC,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,IAAI,MAAM,CAAC,MAAM,GAAG,SAAS,EAAE,CAAC;YACrD,IAAI,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC,CAAC;YACnB,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC"}
|
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Probe runner — simulates AI responses to probes using AGENTS.md as context.
|
|
3
|
+
*
|
|
4
|
+
* For each probe, constructs a chat where:
|
|
5
|
+
* - System prompt = current AGENTS.md + relevant file contents
|
|
6
|
+
* - User prompt = the probe's task
|
|
7
|
+
* Then sends it to the LLM and collects the response.
|
|
8
|
+
*/
|
|
9
|
+
import type { LlmProvider, OptLogger } from '@aspectcode/optimizer';
|
|
10
|
+
import type { Probe, ProbeResult, BehaviorResult } from './types';
|
|
11
|
+
/**
 * Build the system prompt for a probe run.
 *
 * The prompt carries the AGENTS.md instructions and, when `fileContents`
 * is supplied, the contents of the probe's context files.
 */
declare function buildProbeSystemPrompt(
    agentsContent: string,
    probe: Probe,
    fileContents?: ReadonlyMap<string, string>,
): string;
|
|
16
|
+
/**
 * Build the evaluation prompt that asks for a PASS/FAIL verdict on each of
 * the probe's expected behaviours for the given response.
 */
declare function buildBehaviorEvalPrompt(
    probe: Probe,
    response: string,
): string;
|
|
21
|
+
/**
 * Parse the structured behaviour evaluation response.
 *
 * `results` holds one entry per expected behaviour, in the same order;
 * `allPassed` is true only when every entry passed.
 */
declare function parseBehaviorEval(
    response: string,
    expectedBehaviors: Array<string>,
): {
    results: Array<BehaviorResult>;
    allPassed: boolean;
};
|
|
26
|
+
/**
 * Run all probes against the current AGENTS.md.
 *
 * Probes are run one after another (to respect rate limits) and a result
 * is returned for every probe.
 */
export declare function runProbes(
    agentsContent: string,
    probes: Array<Probe>,
    provider: LlmProvider,
    fileContents?: ReadonlyMap<string, string>,
    log?: OptLogger,
    signal?: AbortSignal,
): Promise<Array<ProbeResult>>;
|
|
33
|
+
export { buildProbeSystemPrompt, buildBehaviorEvalPrompt, parseBehaviorEval };
|
|
34
|
+
//# sourceMappingURL=runner.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,KAAK,EAAE,WAAW,EAAe,SAAS,EAAE,MAAM,uBAAuB,CAAC;AACjF,OAAO,KAAK,EAAE,KAAK,EAAE,WAAW,EAAE,cAAc,EAAE,MAAM,SAAS,CAAC;AAKlE;;;GAGG;AACH,iBAAS,sBAAsB,CAC7B,aAAa,EAAE,MAAM,EACrB,KAAK,EAAE,KAAK,EACZ,YAAY,CAAC,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,GACzC,MAAM,CAqBR;AAED;;;GAGG;AACH,iBAAS,uBAAuB,CAC9B,KAAK,EAAE,KAAK,EACZ,QAAQ,EAAE,MAAM,GACf,MAAM,CA2BR;AAED,0DAA0D;AAC1D,iBAAS,iBAAiB,CACxB,QAAQ,EAAE,MAAM,EAChB,iBAAiB,EAAE,MAAM,EAAE,GAC1B;IAAE,OAAO,EAAE,cAAc,EAAE,CAAC;IAAC,SAAS,EAAE,OAAO,CAAA;CAAE,CAkBnD;AAkGD;;;;;GAKG;AACH,wBAAsB,SAAS,CAC7B,aAAa,EAAE,MAAM,EACrB,MAAM,EAAE,KAAK,EAAE,EACf,QAAQ,EAAE,WAAW,EACrB,YAAY,CAAC,EAAE,WAAW,CAAC,MAAM,EAAE,MAAM,CAAC,EAC1C,GAAG,CAAC,EAAE,SAAS,EACf,MAAM,CAAC,EAAE,WAAW,GACnB,OAAO,CAAC,WAAW,EAAE,CAAC,CAoBxB;AAGD,OAAO,EAAE,sBAAsB,EAAE,uBAAuB,EAAE,iBAAiB,EAAE,CAAC"}
|
|
@@ -0,0 +1,183 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
|
|
3
|
+
* Probe runner — simulates AI responses to probes using AGENTS.md as context.
|
|
4
|
+
*
|
|
5
|
+
* For each probe, constructs a chat where:
|
|
6
|
+
* - System prompt = current AGENTS.md + relevant file contents
|
|
7
|
+
* - User prompt = the probe's task
|
|
8
|
+
* Then sends it to the LLM and collects the response.
|
|
9
|
+
*/
|
|
10
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
11
|
+
exports.runProbes = runProbes;
|
|
12
|
+
exports.buildProbeSystemPrompt = buildProbeSystemPrompt;
|
|
13
|
+
exports.buildBehaviorEvalPrompt = buildBehaviorEvalPrompt;
|
|
14
|
+
exports.parseBehaviorEval = parseBehaviorEval;
|
|
15
|
+
/** Upper bound on the total characters of file content added to one probe's prompt. */
const MAX_CONTEXT_CHARS = 20000;
/**
 * Build the system prompt for a probe run.
 *
 * The prompt always starts with the assistant preamble followed by
 * `agentsContent`. When `fileContents` is given, the probe's context files
 * are appended under a "Relevant Files" heading, each in its own fenced
 * block. Files that are missing from the map or empty are skipped. Files
 * are added in `probe.contextFiles` order until the next one would push the
 * running total past `MAX_CONTEXT_CHARS`; that file and all later ones are
 * left out.
 *
 * @param agentsContent - AGENTS.md text.
 * @param probe - Probe whose `contextFiles` select the files to include.
 * @param fileContents - Optional map from file path to file text.
 * @returns The assembled system prompt.
 */
function buildProbeSystemPrompt(agentsContent, probe, fileContents) {
    const header = `You are an AI coding assistant. Follow these project instructions:\n\n${agentsContent}`;
    if (!fileContents || !(probe.contextFiles.length > 0)) {
        return header;
    }
    const sections = [];
    let usedChars = 0;
    for (const filePath of probe.contextFiles) {
        const content = fileContents.get(filePath);
        if (!content) {
            continue;
        }
        const nextTotal = usedChars + content.length;
        if (nextTotal > MAX_CONTEXT_CHARS) {
            // Budget exhausted: stop here, later files are not considered.
            break;
        }
        sections.push([`### ${filePath}`, '```', content, '```'].join('\n'));
        usedChars = nextTotal;
    }
    return sections.length > 0
        ? `${header}\n\n## Relevant Files\n\n${sections.join('\n\n')}`
        : header;
}
|
|
41
|
+
/**
 * Build the prompt that asks an evaluator to grade a probe response.
 *
 * The prompt lists the task, the numbered expected behaviours and the
 * response under separate headings, then instructs the evaluator to answer
 * with one `BEHAVIOR_<n>: PASS|FAIL — <explanation>` line per behaviour
 * followed by an `OVERALL: PASS|FAIL` line.
 *
 * @param probe - Probe providing `task` and `expectedBehaviors`.
 * @param response - The response being graded.
 * @returns The evaluation prompt.
 */
function buildBehaviorEvalPrompt(probe, response) {
    const numbered = [];
    probe.expectedBehaviors.forEach((behavior, index) => {
        numbered.push(`${index + 1}. ${behavior}`);
    });
    const promptLines = [
        "You are evaluating an AI coding assistant's response to a specific task.",
        '',
        '## Task Given',
        `${probe.task}`,
        '',
        '## Expected Behaviours',
        'The response should exhibit ALL of these behaviours:',
        numbered.join('\n'),
        '',
        '## AI Response',
        `${response}`,
        '',
        '## Instructions',
        'For EACH expected behaviour, determine if the response exhibits it.',
        'Respond in EXACTLY this format (one line per behaviour):',
        '',
        'BEHAVIOR_1: PASS|FAIL — <brief explanation>',
        'BEHAVIOR_2: PASS|FAIL — <brief explanation>',
        '...',
        '',
        'Then a final line:',
        'OVERALL: PASS|FAIL',
    ];
    return promptLines.join('\n');
}
|
|
72
|
+
/**
 * Parse the structured behaviour evaluation response.
 *
 * For behaviour number `n` (1-based) the first line containing
 * `BEHAVIOR_<n>: PASS` or `BEHAVIOR_<n>: FAIL` (case-insensitive) is used.
 * The verdict may be followed by a dash (`—`, `–` or `-`) and an
 * explanation, or it may end the line. A bare verdict is accepted so that
 * an evaluator that omits the explanation is not counted as a failure.
 * Any other trailing text (for example an echoed `PASS|FAIL` template
 * line) is not treated as a verdict.
 *
 * @param response - Raw evaluator output.
 * @param expectedBehaviors - Behaviours that were graded, in prompt order.
 * @returns `results` with one entry per behaviour (unparseable entries are
 *   marked as failed) and `allPassed`, which is true when every entry
 *   passed (vacuously true for an empty behaviour list).
 */
function parseBehaviorEval(response, expectedBehaviors) {
    const lines = response.split('\n');
    const results = expectedBehaviors.map((behavior, index) => {
        // First alternative: "<dash> explanation". Second: verdict ends the line.
        const pattern = new RegExp(`BEHAVIOR_${index + 1}:\\s*(PASS|FAIL)(?:\\s*[—–-]\\s*(.*)|\\s*$)`, 'i');
        const parsed = lines.map((line) => pattern.exec(line)).find((found) => found !== null);
        if (!parsed) {
            return {
                behavior,
                passed: false,
                explanation: 'Could not parse evaluation result',
            };
        }
        return {
            behavior,
            passed: parsed[1].toUpperCase() === 'PASS',
            // Group 2 is undefined only when the verdict had no explanation part.
            explanation: parsed[2] === undefined ? 'No explanation provided' : parsed[2].trim(),
        };
    });
    const allPassed = results.every((result) => result.passed);
    return { results, allPassed };
}
|
|
89
|
+
/**
 * Execute one probe in two LLM calls: first obtain a simulated assistant
 * response to the probe task, then have the provider grade that response
 * against the probe's expected behaviours.
 *
 * Provider errors and cancellation are reported as a failed result rather
 * than thrown.
 */
async function runSingleProbe(probe, agentsContent, provider, fileContents, log, signal) {
    // Every non-success outcome has the same shape; only the captured
    // response text and the single shortcoming message vary.
    const failed = (responseText, reason) => ({
        probeId: probe.id,
        passed: false,
        response: responseText,
        shortcomings: [reason],
        behaviorResults: [],
    });
    const errorText = (err) => (err instanceof Error ? err.message : String(err));
    if (signal?.aborted) {
        return failed('', 'Cancelled');
    }
    // Step 1: simulate the assistant's answer with AGENTS.md as the system prompt.
    log?.debug(`Running probe: ${probe.id}`);
    const systemPrompt = buildProbeSystemPrompt(agentsContent, probe, fileContents);
    let response;
    try {
        response = await provider.chat([
            { role: 'system', content: systemPrompt },
            { role: 'user', content: probe.task },
        ]);
    }
    catch (err) {
        const msg = errorText(err);
        log?.warn(`Probe ${probe.id} simulation failed: ${msg}`);
        return failed('', `LLM error during simulation: ${msg}`);
    }
    // Cancellation may have arrived while the simulation call was in flight.
    if (signal?.aborted) {
        return failed(response, 'Cancelled during evaluation');
    }
    // Step 2: grade the simulated answer against the expected behaviours.
    log?.debug(`Evaluating probe: ${probe.id}`);
    const evalPrompt = buildBehaviorEvalPrompt(probe, response);
    let evalResponse;
    try {
        evalResponse = await provider.chat([{ role: 'user', content: evalPrompt }]);
    }
    catch (err) {
        const msg = errorText(err);
        log?.warn(`Probe ${probe.id} evaluation failed: ${msg}`);
        return failed(response, `LLM error during evaluation: ${msg}`);
    }
    const verdict = parseBehaviorEval(evalResponse, probe.expectedBehaviors);
    // Each failed behaviour becomes one "behaviour: explanation" shortcoming.
    const shortcomings = [];
    for (const entry of verdict.results) {
        if (!entry.passed) {
            shortcomings.push(`${entry.behavior}: ${entry.explanation}`);
        }
    }
    return {
        probeId: probe.id,
        passed: verdict.allPassed,
        response,
        shortcomings,
        behaviorResults: verdict.results,
    };
}
|
|
166
|
+
/**
 * Evaluate every probe against the current AGENTS.md content.
 *
 * Probes are awaited one at a time (to respect rate limits). If the abort
 * signal is set before a probe starts, the loop stops and the results
 * gathered so far are returned.
 */
async function runProbes(agentsContent, probes, provider, fileContents, log, signal) {
    const collected = [];
    for (const probe of probes) {
        if (signal?.aborted) {
            break;
        }
        const outcome = await runSingleProbe(probe, agentsContent, provider, fileContents, log, signal);
        collected.push(outcome);
        // One status line per probe: a check mark on pass, a cross on failure.
        const mark = outcome.passed ? '✔' : '✖';
        log?.info(` ${mark} ${probe.id}`);
    }
    return collected;
}
|
|
183
|
+
//# sourceMappingURL=runner.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../src/runner.ts"],"names":[],"mappings":";AAAA;;;;;;;GAOG;;AAyMH,8BA2BC;AAGQ,wDAAsB;AAAE,0DAAuB;AAAE,8CAAiB;AAlO3E,4DAA4D;AAC5D,MAAM,iBAAiB,GAAG,KAAM,CAAC;AAEjC;;;GAGG;AACH,SAAS,sBAAsB,CAC7B,aAAqB,EACrB,KAAY,EACZ,YAA0C;IAE1C,IAAI,MAAM,GAAG,yEAAyE,aAAa,EAAE,CAAC;IAEtG,IAAI,YAAY,IAAI,KAAK,CAAC,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QAClD,IAAI,YAAY,GAAG,CAAC,CAAC;QACrB,MAAM,YAAY,GAAa,EAAE,CAAC;QAElC,KAAK,MAAM,QAAQ,IAAI,KAAK,CAAC,YAAY,EAAE,CAAC;YAC1C,MAAM,OAAO,GAAG,YAAY,CAAC,GAAG,CAAC,QAAQ,CAAC,CAAC;YAC3C,IAAI,CAAC,OAAO;gBAAE,SAAS;YACvB,IAAI,YAAY,GAAG,OAAO,CAAC,MAAM,GAAG,iBAAiB;gBAAE,MAAM;YAC7D,YAAY,CAAC,IAAI,CAAC,OAAO,QAAQ,aAAa,OAAO,UAAU,CAAC,CAAC;YACjE,YAAY,IAAI,OAAO,CAAC,MAAM,CAAC;QACjC,CAAC;QAED,IAAI,YAAY,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAC5B,MAAM,IAAI,4BAA4B,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC;QACpE,CAAC;IACH,CAAC;IAED,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;;GAGG;AACH,SAAS,uBAAuB,CAC9B,KAAY,EACZ,QAAgB;IAEhB,MAAM,SAAS,GAAG,KAAK,CAAC,iBAAiB;SACtC,GAAG,CAAC,CAAC,CAAC,EAAE,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,GAAG,CAAC,KAAK,CAAC,EAAE,CAAC;SAC/B,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;;EAGP,KAAK,CAAC,IAAI;;;;EAIV,SAAS;;;EAGT,QAAQ;;;;;;;;;;;mBAWS,CAAC;AACpB,CAAC;AAED,0DAA0D;AAC1D,SAAS,iBAAiB,CACxB,QAAgB,EAChB,iBAA2B;IAE3B,MAAM,OAAO,GAAqB,EAAE,CAAC;IACrC,MAAM,KAAK,GAAG,QAAQ,CAAC,KAAK,CAAC,IAAI,CAAC,CAAC;IAEnC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,iBAAiB,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAClD,MAAM,OAAO,GAAG,IAAI,MAAM,CAAC,YAAY,CAAC,GAAG,CAAC,kCAAkC,EAAE,GAAG,CAAC,CAAC;QACrF,MAAM,KAAK,GAAG,KAAK,CAAC,IAAI,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QACjD,MAAM,MAAM,GAAG,KAAK,CAAC,CAAC,CAAC,OAAO,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,IAAI,CAAC;QAElD,OAAO,CAAC,IAAI,CAAC;YACX,QAAQ,EAAE,iBAAiB,CAAC,CAAC,CAAC;YAC9B,MAAM,EAAE,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,WAAW,EAAE,KAAK,MAAM,CAAC,CAAC,CAAC,KAAK;YAC3D,WAAW,EAAE,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,IAAI,EAAE,CAAC,CAAC,CAAC,mCAAmC;SAC7E,CAAC
,CAAC;IACL,CAAC;IAED,MAAM,SAAS,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC;IACjD,OAAO,EAAE,OAAO,EAAE,SAAS,EAAE,CAAC;AAChC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,cAAc,CAC3B,KAAY,EACZ,aAAqB,EACrB,QAAqB,EACrB,YAA0C,EAC1C,GAAe,EACf,MAAoB;IAEpB,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,EAAE;YACjB,MAAM,EAAE,KAAK;YACb,QAAQ,EAAE,EAAE;YACZ,YAAY,EAAE,CAAC,WAAW,CAAC;YAC3B,eAAe,EAAE,EAAE;SACpB,CAAC;IACJ,CAAC;IAED,8DAA8D;IAC9D,GAAG,EAAE,KAAK,CAAC,kBAAkB,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IAEzC,MAAM,YAAY,GAAG,sBAAsB,CAAC,aAAa,EAAE,KAAK,EAAE,YAAY,CAAC,CAAC;IAChF,MAAM,WAAW,GAAkB;QACjC,EAAE,IAAI,EAAE,QAAQ,EAAE,OAAO,EAAE,YAAY,EAAE;QACzC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,KAAK,CAAC,IAAI,EAAE;KACtC,CAAC;IAEF,IAAI,QAAgB,CAAC;IACrB,IAAI,CAAC;QACH,QAAQ,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,WAAW,CAAC,CAAC;IAC9C,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7D,GAAG,EAAE,IAAI,CAAC,SAAS,KAAK,CAAC,EAAE,uBAAuB,GAAG,EAAE,CAAC,CAAC;QACzD,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,EAAE;YACjB,MAAM,EAAE,KAAK;YACb,QAAQ,EAAE,EAAE;YACZ,YAAY,EAAE,CAAC,gCAAgC,GAAG,EAAE,CAAC;YACrD,eAAe,EAAE,EAAE;SACpB,CAAC;IACJ,CAAC;IAED,IAAI,MAAM,EAAE,OAAO,EAAE,CAAC;QACpB,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,EAAE;YACjB,MAAM,EAAE,KAAK;YACb,QAAQ;YACR,YAAY,EAAE,CAAC,6BAA6B,CAAC;YAC7C,eAAe,EAAE,EAAE;SACpB,CAAC;IACJ,CAAC;IAED,4DAA4D;IAC5D,GAAG,EAAE,KAAK,CAAC,qBAAqB,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IAE5C,MAAM,UAAU,GAAG,uBAAuB,CAAC,KAAK,EAAE,QAAQ,CAAC,CAAC;IAC5D,MAAM,YAAY,GAAkB;QAClC,EAAE,IAAI,EAAE,MAAM,EAAE,OAAO,EAAE,UAAU,EAAE;KACtC,CAAC;IAEF,IAAI,YAAoB,CAAC;IACzB,IAAI,CAAC;QACH,YAAY,GAAG,MAAM,QAAQ,CAAC,IAAI,CAAC,YAAY,CAAC,CAAC;IACnD,CAAC;IAAC,OAAO,GAAG,EAAE,CAAC;QACb,MAAM,GAAG,GAAG,GAAG,YAAY,KAAK,CAAC,CAAC,CAAC,GAAG,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,GAAG,CAAC,CAAC;QAC7D,GAAG,EAAE,IAAI,CAAC,SAAS,KAAK,CAAC,EAAE,uBAAuB,GAAG,EAAE,CAAC,CAAC;QACzD,OAAO;YACL,OAAO,EAAE,KAAK,CAAC,EAAE;YACjB,MAAM,EAAE,KAAK;YACb,QAAQ;YACR,YAAY,EAAE,CA
AC,gCAAgC,GAAG,EAAE,CAAC;YACrD,eAAe,EAAE,EAAE;SACpB,CAAC;IACJ,CAAC;IAED,MAAM,EAAE,OAAO,EAAE,eAAe,EAAE,SAAS,EAAE,GAAG,iBAAiB,CAC/D,YAAY,EACZ,KAAK,CAAC,iBAAiB,CACxB,CAAC;IAEF,MAAM,YAAY,GAAG,eAAe;SACjC,MAAM,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,CAAC,CAAC,CAAC,MAAM,CAAC;SACxB,GAAG,CAAC,CAAC,CAAC,EAAE,EAAE,CAAC,GAAG,CAAC,CAAC,QAAQ,KAAK,CAAC,CAAC,WAAW,EAAE,CAAC,CAAC;IAEjD,OAAO;QACL,OAAO,EAAE,KAAK,CAAC,EAAE;QACjB,MAAM,EAAE,SAAS;QACjB,QAAQ;QACR,YAAY;QACZ,eAAe;KAChB,CAAC;AACJ,CAAC;AAED;;;;;GAKG;AACI,KAAK,UAAU,SAAS,CAC7B,aAAqB,EACrB,MAAe,EACf,QAAqB,EACrB,YAA0C,EAC1C,GAAe,EACf,MAAoB;IAEpB,MAAM,OAAO,GAAkB,EAAE,CAAC;IAElC,KAAK,MAAM,KAAK,IAAI,MAAM,EAAE,CAAC;QAC3B,IAAI,MAAM,EAAE,OAAO;YAAE,MAAM;QAE3B,MAAM,MAAM,GAAG,MAAM,cAAc,CACjC,KAAK,EACL,aAAa,EACb,QAAQ,EACR,YAAY,EACZ,GAAG,EACH,MAAM,CACP,CAAC;QACF,OAAO,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;QAErB,GAAG,EAAE,IAAI,CAAC,KAAK,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,KAAK,CAAC,EAAE,EAAE,CAAC,CAAC;IAC1D,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|