universal-agent-memory 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
- package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/naive-agent.js +144 -0
- package/dist/benchmarks/agents/naive-agent.js.map +1 -0
- package/dist/benchmarks/agents/uam-agent.d.ts +167 -0
- package/dist/benchmarks/agents/uam-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/uam-agent.js +386 -0
- package/dist/benchmarks/agents/uam-agent.js.map +1 -0
- package/dist/benchmarks/benchmark.d.ts +328 -0
- package/dist/benchmarks/benchmark.d.ts.map +1 -0
- package/dist/benchmarks/benchmark.js +104 -0
- package/dist/benchmarks/benchmark.js.map +1 -0
- package/dist/benchmarks/execution-verifier.d.ts +41 -0
- package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
- package/dist/benchmarks/execution-verifier.js +301 -0
- package/dist/benchmarks/execution-verifier.js.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.js +260 -0
- package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
- package/dist/benchmarks/improved-benchmark.d.ts +88 -0
- package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
- package/dist/benchmarks/improved-benchmark.js +533 -0
- package/dist/benchmarks/improved-benchmark.js.map +1 -0
- package/dist/benchmarks/index.d.ts +10 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +10 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
- package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
- package/dist/benchmarks/multi-turn-agent.js +235 -0
- package/dist/benchmarks/multi-turn-agent.js.map +1 -0
- package/dist/benchmarks/runner.d.ts +2 -0
- package/dist/benchmarks/runner.d.ts.map +1 -0
- package/dist/benchmarks/runner.js +2 -0
- package/dist/benchmarks/runner.js.map +1 -0
- package/dist/benchmarks/tasks.d.ts +19 -0
- package/dist/benchmarks/tasks.d.ts.map +1 -0
- package/dist/benchmarks/tasks.js +371 -0
- package/dist/benchmarks/tasks.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -1
- package/dist/memory/backends/qdrant-cloud.js +6 -4
- package/dist/memory/backends/qdrant-cloud.js.map +1 -1
- package/dist/memory/dynamic-retrieval.d.ts +26 -0
- package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
- package/dist/memory/dynamic-retrieval.js +378 -0
- package/dist/memory/dynamic-retrieval.js.map +1 -0
- package/dist/memory/embeddings.d.ts +82 -0
- package/dist/memory/embeddings.d.ts.map +1 -0
- package/dist/memory/embeddings.js +297 -0
- package/dist/memory/embeddings.js.map +1 -0
- package/dist/memory/task-classifier.d.ts +33 -0
- package/dist/memory/task-classifier.d.ts.map +1 -0
- package/dist/memory/task-classifier.js +277 -0
- package/dist/memory/task-classifier.js.map +1 -0
- package/dist/utils/rate-limiter.d.ts +62 -0
- package/dist/utils/rate-limiter.d.ts.map +1 -0
- package/dist/utils/rate-limiter.js +150 -0
- package/dist/utils/rate-limiter.js.map +1 -0
- package/dist/utils/validate-json.d.ts +52 -0
- package/dist/utils/validate-json.d.ts.map +1 -0
- package/dist/utils/validate-json.js +99 -0
- package/dist/utils/validate-json.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1,301 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Execution Verification System for UAM Benchmarks
|
|
3
|
+
*
|
|
4
|
+
* Provides real code execution and verification instead of just pattern matching.
|
|
5
|
+
* Runs generated code in isolated environments and validates output.
|
|
6
|
+
*/
|
|
7
|
+
import { execSync, spawn } from 'child_process';
|
|
8
|
+
import { writeFileSync, existsSync, mkdirSync, rmSync } from 'fs';
|
|
9
|
+
import { join } from 'path';
|
|
10
|
+
import { randomUUID } from 'crypto';
|
|
11
|
+
const SANDBOX_DIR = '/tmp/uam-sandbox';
|
|
12
|
+
/**
 * Allocate a fresh working directory for one verification run.
 *
 * The directory is placed under SANDBOX_DIR and named after the first eight
 * characters of a random UUID. SANDBOX_DIR itself is created first when it
 * does not exist yet.
 *
 * @returns {string} Path of the newly created sandbox directory.
 */
function createSandbox() {
    const rootMissing = !existsSync(SANDBOX_DIR);
    if (rootMissing) {
        mkdirSync(SANDBOX_DIR, { recursive: true });
    }
    const shortId = randomUUID().slice(0, 8);
    const dir = join(SANDBOX_DIR, shortId);
    mkdirSync(dir, { recursive: true });
    return dir;
}
|
|
24
|
+
/**
 * Best-effort removal of a sandbox directory and everything inside it.
 *
 * Any error raised by the removal is swallowed, so this function never throws.
 *
 * @param {string} sandboxPath - Directory to delete recursively.
 * @returns {void}
 */
function cleanupSandbox(sandboxPath) {
    const removalOptions = { recursive: true, force: true };
    try {
        rmSync(sandboxPath, removalOptions);
    }
    catch (_cleanupError) {
        // Deliberately ignored: a failed cleanup is not reported to the caller.
    }
}
|
|
35
|
+
/**
 * Verify generated code, executing it when the task configuration asks for that.
 *
 * When `config.requiresExecution` is falsy the code is only checked against
 * `config.expectedPatterns` and that pattern result is returned directly.
 * Otherwise the code is written into a fresh sandbox directory, optional setup
 * commands are run, TypeScript sources are compiled with `npx tsc`, each test
 * case is executed and its trimmed stdout is compared with the trimmed
 * expected output, and finally the expected patterns (if any) are checked.
 * The sandbox directory is removed before returning.
 *
 * @param {string} code - Source text to verify.
 * @param {object} config - Verification settings: `language`,
 *   `requiresExecution`, `testCases`, and optionally `expectedPatterns`,
 *   `setupCommands` and `timeout` (per-test timeout in milliseconds).
 * @returns {Promise<object>} Result object with `success`,
 *   `executionSucceeded`, `testsRun`, `testsPassed`, `errors`, `output` and
 *   `executionTimeMs`.
 */
export async function verifyCodeExecution(code, config) {
    const startTime = Date.now();
    const result = {
        success: false,
        executionSucceeded: false,
        testsRun: 0,
        testsPassed: 0,
        errors: [],
        output: '',
        executionTimeMs: 0,
    };
    // If execution not required, just do pattern matching
    if (!config.requiresExecution) {
        return verifyPatterns(code, config.expectedPatterns || []);
    }
    const sandboxPath = createSandbox();
    try {
        // Write code to file
        const filename = getFilename(config.language);
        const filePath = join(sandboxPath, filename);
        writeFileSync(filePath, code, 'utf-8');
        // Run setup commands if any; a failing command is recorded and the run continues
        if (config.setupCommands) {
            for (const cmd of config.setupCommands) {
                try {
                    execSync(cmd, {
                        cwd: sandboxPath,
                        timeout: 30000,
                        stdio: 'pipe',
                    });
                }
                catch {
                    result.errors.push(`Setup failed: ${cmd}`);
                }
            }
        }
        // Compile if TypeScript
        if (config.language === 'typescript') {
            try {
                execSync(`npx tsc ${filename} --outDir . --esModuleInterop --skipLibCheck 2>&1`, {
                    cwd: sandboxPath,
                    timeout: 30000,
                    encoding: 'utf-8',
                });
            }
            catch (error) {
                const errMsg = error instanceof Error ? error.message : String(error);
                // The command redirects stderr to stdout, so the compiler
                // diagnostics are on error.stdout rather than in error.message.
                const compilerOutput = error && typeof error.stdout === 'string' ? error.stdout.trim() : '';
                result.errors.push(compilerOutput
                    ? `TypeScript compilation failed: ${errMsg}\n${compilerOutput}`
                    : `TypeScript compilation failed: ${errMsg}`);
                // Cleanup and timing are handled by the finally block below.
                return result;
            }
        }
        result.executionSucceeded = true;
        // Run test cases
        for (const testCase of config.testCases) {
            result.testsRun++;
            try {
                const output = await runTestCase(sandboxPath, config.language, testCase, config.timeout);
                const normalizedOutput = output.trim();
                const normalizedExpected = testCase.expectedOutput.trim();
                if (normalizedOutput === normalizedExpected) {
                    result.testsPassed++;
                }
                else {
                    result.errors.push(`Test "${testCase.description || 'unnamed'}": Expected "${normalizedExpected}", got "${normalizedOutput}"`);
                }
                result.output += output + '\n';
            }
            catch (error) {
                const errMsg = error instanceof Error ? error.message : String(error);
                result.errors.push(`Test execution error: ${errMsg}`);
            }
        }
        // Verify patterns if specified; their errors only count when the pattern check as a whole fails
        if (config.expectedPatterns && config.expectedPatterns.length > 0) {
            const patternResult = verifyPatterns(code, config.expectedPatterns);
            if (!patternResult.success) {
                result.errors.push(...patternResult.errors);
            }
        }
        result.success = result.testsPassed === result.testsRun && result.errors.length === 0;
    }
    catch (error) {
        const errMsg = error instanceof Error ? error.message : String(error);
        result.errors.push(`Verification error: ${errMsg}`);
    }
    finally {
        // Single cleanup and timing point for every exit path of the try block.
        cleanupSandbox(sandboxPath);
        result.executionTimeMs = Date.now() - startTime;
    }
    return result;
}
|
|
131
|
+
/**
 * Run a single test case inside the sandbox and capture its stdout.
 *
 * The program is started through `bash -c` with `testCase.input` appended to
 * the interpreter command line. TypeScript runs the compiled `solution.js`.
 *
 * NOTE(review): `testCase.input` is interpolated into a shell command without
 * escaping. That looks safe only while test cases come from trusted
 * configuration — confirm before feeding externally supplied inputs.
 *
 * @param {string} sandboxPath - Working directory containing the solution file.
 * @param {string} language - One of 'typescript', 'javascript', 'python', 'shell'.
 * @param {object} testCase - Test case; only `input` is read here.
 * @param {number} [timeout=10000] - Maximum run time in milliseconds.
 * @returns {Promise<string>} Resolves with stdout when the process exits with
 *   code 0. Rejects on an unsupported language, a non-zero exit code (the
 *   message includes stderr), a timeout, or a spawn error.
 */
async function runTestCase(sandboxPath, language, testCase, timeout = 10000) {
    const commands = {
        'typescript': `node ${getFilename('javascript')}`,
        'javascript': `node ${getFilename('javascript')}`,
        'python': `python3 ${getFilename('python')}`,
        'shell': `bash ${getFilename('shell')}`,
    };
    // Fail early with a clear message instead of running "undefined <input>" in bash.
    if (!Object.prototype.hasOwnProperty.call(commands, language)) {
        throw new Error(`Unsupported language: ${language}`);
    }
    const command = `${commands[language]} ${testCase.input}`;
    return new Promise((resolve, reject) => {
        const proc = spawn('bash', ['-c', command], {
            cwd: sandboxPath,
            timeout,
            stdio: ['pipe', 'pipe', 'pipe'],
        });
        let stdout = '';
        let stderr = '';
        proc.stdout.on('data', (data) => { stdout += data.toString(); });
        proc.stderr.on('data', (data) => { stderr += data.toString(); });
        // Explicit timer so that a timeout is reported with a dedicated message.
        const timer = setTimeout(() => {
            proc.kill();
            reject(new Error('Execution timeout'));
        }, timeout);
        proc.on('close', (code) => {
            clearTimeout(timer);
            if (code === 0) {
                resolve(stdout);
            }
            else {
                reject(new Error(`Exit code ${code}: ${stderr}`));
            }
        });
        proc.on('error', (error) => {
            clearTimeout(timer);
            reject(error);
        });
    });
}
|
|
171
|
+
/**
 * Verify code contains expected patterns (fallback verification).
 *
 * Each pattern is a plain substring that is searched case-insensitively in
 * the code. The check succeeds when at least 70% of the patterns are found.
 * An empty pattern list succeeds vacuously.
 *
 * @param {string} code - Source text to inspect.
 * @param {string[]} patterns - Substrings expected to appear in the code.
 * @returns {object} Result object: `testsRun` is the number of patterns,
 *   `testsPassed` the number found, and `errors` holds one
 *   `Missing pattern: "<pattern>"` entry per pattern that was not found.
 */
function verifyPatterns(code, patterns) {
    const result = {
        success: false,
        executionSucceeded: true,
        testsRun: patterns.length,
        testsPassed: 0,
        errors: [],
        output: '',
        executionTimeMs: 0,
    };
    const normalizedCode = code.toLowerCase();
    for (const pattern of patterns) {
        if (normalizedCode.includes(pattern.toLowerCase())) {
            result.testsPassed++;
        }
        else {
            result.errors.push(`Missing pattern: "${pattern}"`);
        }
    }
    // Success if at least 70% of patterns match. With no patterns the ratio
    // would be 0/0 (NaN) and compare as a failure, so treat that case as a pass.
    const matchRatio = result.testsRun === 0 ? 1 : result.testsPassed / result.testsRun;
    result.success = matchRatio >= 0.7;
    return result;
}
|
|
198
|
+
/**
 * Map a language identifier to the file name used for the solution inside
 * the sandbox.
 *
 * @param {string} language - 'typescript', 'javascript', 'python' or 'shell'.
 * @returns {string|undefined} The solution file name; `undefined` when the
 *   language has no entry in the table.
 */
function getFilename(language) {
    const filenameByLanguage = {
        typescript: 'solution.ts',
        javascript: 'solution.js',
        python: 'solution.py',
        shell: 'solution.sh',
    };
    const filename = filenameByLanguage[language];
    return filename;
}
|
|
210
|
+
/**
 * Enhanced task verification configurations for benchmark tasks.
 *
 * Keyed by task id. Each entry names the language of the generated code,
 * whether the code must actually be executed, the test cases to run when it
 * is, and the substrings expected to appear in the generated code.
 */
export const TASK_VERIFICATION_CONFIGS = {
    'task-001-code-generation': {
        language: 'typescript',
        requiresExecution: true,
        testCases: [
            { input: '', expectedOutput: '0', description: 'empty array returns 0' },
        ],
        expectedPatterns: ['function calculateAverage', 'number[]', ': number'],
        setupCommands: [
            // Append a driver call to the generated solution so that running it
            // prints the result. The generated code is already in solution.ts at
            // this point and must not be overwritten, otherwise the test would
            // exercise a canned reference implementation instead of the submission.
            "printf '\\nconsole.log(calculateAverage([]));\\n' >> solution.ts",
        ],
    },
    'task-002-bug-fix': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['i < nums.length', 'function sumPositive', 'return sum'],
    },
    'task-003-pattern-application': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: [
            'class ConfigManager',
            'private constructor',
            'static getInstance',
            'private static instance',
            'Map',
        ],
    },
    'task-004-refactoring': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['interface', 'class', 'implements', 'process'],
    },
    'task-005-memory-context': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['async', 'zod', 'AppError', '@param', 'validateAndParseJSON'],
    },
    'task-006-complex-algorithm': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['function findShortestPath', 'Map<string', 'distance', 'path', 'while'],
    },
    'task-007-multi-step-task': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: [
            'interface RateLimiterConfig',
            'class RateLimiter',
            'isAllowed',
            'getRemainingRequests',
            'reset',
            'Map',
            'export',
        ],
    },
    'task-008-error-handling': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['async function fetchWithRetry', 'retry', 'backoff', 'catch', 'throw'],
    },
};
|
|
282
|
+
/**
 * Verify the code generated for a benchmark task.
 *
 * Tasks that have an entry in TASK_VERIFICATION_CONFIGS are handed to
 * verifyCodeExecution with that entry. For any other task id the only check
 * is a length heuristic: the response passes when it is longer than 50
 * characters.
 *
 * @param {string} taskId - Benchmark task identifier.
 * @param {string} generatedCode - Code produced for the task.
 * @returns {Promise<object>} Verification result object.
 */
export async function verifyBenchmarkTask(taskId, generatedCode) {
    const taskConfig = TASK_VERIFICATION_CONFIGS[taskId];
    if (taskConfig) {
        return verifyCodeExecution(generatedCode, taskConfig);
    }
    // Unknown task: fall back to the minimum-length heuristic.
    const longEnough = generatedCode.length > 50;
    return {
        success: longEnough,
        executionSucceeded: true,
        testsRun: 1,
        testsPassed: longEnough ? 1 : 0,
        errors: longEnough ? [] : ['Response too short'],
        output: '',
        executionTimeMs: 0,
    };
}
|
|
301
|
+
//# sourceMappingURL=execution-verifier.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"execution-verifier.js","sourceRoot":"","sources":["../../src/benchmarks/execution-verifier.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,IAAI,CAAC;AAClE,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AA2BpC,MAAM,WAAW,GAAG,kBAAkB,CAAC;AAEvC;;GAEG;AACH,SAAS,aAAa;IACpB,MAAM,SAAS,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;IAEjD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC7B,SAAS,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,SAAS,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5C,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,WAAmB;IACzC,IAAI,CAAC;QACH,MAAM,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,IAAY,EACZ,MAA8B;IAE9B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAuB;QACjC,OAAO,EAAE,KAAK;QACd,kBAAkB,EAAE,KAAK;QACzB,QAAQ,EAAE,CAAC;QACX,WAAW,EAAE,CAAC;QACd,MAAM,EAAE,EAAE;QACV,MAAM,EAAE,EAAE;QACV,eAAe,EAAE,CAAC;KACnB,CAAC;IAEF,sDAAsD;IACtD,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,CAAC;QAC9B,OAAO,cAAc,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,IAAI,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,WAAW,GAAG,aAAa,EAAE,CAAC;IAEpC,IAAI,CAAC;QACH,qBAAqB;QACrB,MAAM,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAC7C,aAAa,CAAC,QAAQ,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;QAEvC,4BAA4B;QAC5B,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACzB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;gBACvC,IAAI,CAAC;oBACH,QAAQ,CAAC,GAAG,EAAE;wBACZ,GAAG,EAAE,WAAW;wBAChB,OAAO,EAAE,KAAK;wBACd,KAAK,EAAE,MAAM;qBACd,CAAC,CAAC;gBACL,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,iBAAiB,GAAG,EAAE,CAAC,CAAC;gBAC7C,CAAC;YACH,CAAC;QACH,CAAC;QAED,wBAAwB;QACxB,IAAI,MAAM,CAAC,QAAQ,KAAK,YAAY,EA
AE,CAAC;YACrC,IAAI,CAAC;gBACH,QAAQ,CAAC,WAAW,QAAQ,mDAAmD,EAAE;oBAC/E,GAAG,EAAE,WAAW;oBAChB,OAAO,EAAE,KAAK;oBACd,QAAQ,EAAE,OAAO;iBAClB,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;gBAC/D,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBAChD,cAAc,CAAC,WAAW,CAAC,CAAC;gBAC5B,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAED,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAC;QAEjC,iBAAiB;QACjB,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;YACxC,MAAM,CAAC,QAAQ,EAAE,CAAC;YAElB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,WAAW,EAAE,MAAM,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;gBACzF,MAAM,gBAAgB,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;gBACvC,MAAM,kBAAkB,GAAG,QAAQ,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;gBAE1D,IAAI,gBAAgB,KAAK,kBAAkB,EAAE,CAAC;oBAC5C,MAAM,CAAC,WAAW,EAAE,CAAC;gBACvB,CAAC;qBAAM,CAAC;oBACN,MAAM,CAAC,MAAM,CAAC,IAAI,CAChB,SAAS,QAAQ,CAAC,WAAW,IAAI,SAAS,gBAAgB,kBAAkB,WAAW,gBAAgB,GAAG,CAC3G,CAAC;gBACJ,CAAC;gBAED,MAAM,CAAC,MAAM,IAAI,MAAM,GAAG,IAAI,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,yBAAyB,MAAM,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,IAAI,MAAM,CAAC,gBAAgB,IAAI,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClE,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACpE,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;gBAC3B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;QAED,MAAM,CAAC,OAAO,GAAG,MAAM,CAAC,WAAW,KAAK,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC;IAExF,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,uBAAuB,MAAM,EAAE,CAAC,CAAC;IACtD,CAAC;YAAS,CAAC;QACT,cAAc,CAAC,WAAW,CAAC,CAAC;IAC9B,CAAC;IAED,MAA
M,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAChD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,WAAW,CACxB,WAAmB,EACnB,QAA4C,EAC5C,QAAkB,EAClB,UAAkB,KAAK;IAEvB,MAAM,QAAQ,GAA2B;QACvC,YAAY,EAAE,QAAQ,WAAW,CAAC,YAAY,CAAC,EAAE;QACjD,YAAY,EAAE,QAAQ,WAAW,CAAC,YAAY,CAAC,EAAE;QACjD,QAAQ,EAAE,WAAW,WAAW,CAAC,QAAQ,CAAC,EAAE;QAC5C,OAAO,EAAE,QAAQ,WAAW,CAAC,OAAO,CAAC,EAAE;KACxC,CAAC;IAEF,MAAM,OAAO,GAAG,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;IAE1D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;YAC1C,GAAG,EAAE,WAAW;YAChB,OAAO;YACP,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;SAChC,CAAC,CAAC;QAEH,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,MAAM,GAAG,EAAE,CAAC;QAEhB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACjE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAEjE,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;YAC5B,IAAI,CAAC,IAAI,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,KAAK,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACzC,CAAC,EAAE,OAAO,CAAC,CAAC;QAEZ,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC;YACpD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YACzB,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,MAAM,CAAC,KAAK,CAAC,CAAC;QAChB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,QAAkB;IACtD,MAAM,MAAM,GAAuB;QACjC,OAAO,EAAE,KAAK;QACd,kBAAkB,EAAE,IAAI;QACxB,QAAQ,EAAE,QAAQ,CAAC,MAAM;QACzB,WAAW,EAAE,CAAC;QACd,MAAM,EAAE,EAAE;QACV,MAAM,EAAE,EAAE;QACV,eAAe,EAAE,CAAC;KACnB,CAAC;IAEF,MAAM,cAAc,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IAE1C,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,cAAc,CAAC,QAAQ,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YACnD,MAAM,CAAC
,WAAW,EAAE,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,qBAAqB,OAAO,GAAG,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;IAED,sEAAsE;IACtE,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC;IACxD,MAAM,CAAC,OAAO,GAAG,UAAU,IAAI,GAAG,CAAC;IAEnC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,QAA4C;IAC/D,MAAM,UAAU,GAA2B;QACzC,YAAY,EAAE,aAAa;QAC3B,YAAY,EAAE,aAAa;QAC3B,QAAQ,EAAE,aAAa;QACvB,OAAO,EAAE,aAAa;KACvB,CAAC;IACF,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,yBAAyB,GAA2C;IAC/E,0BAA0B,EAAE;QAC1B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,IAAI;QACvB,SAAS,EAAE;YACT,EAAE,KAAK,EAAE,EAAE,EAAE,cAAc,EAAE,GAAG,EAAE,WAAW,EAAE,uBAAuB,EAAE;SACzE;QACD,gBAAgB,EAAE,CAAC,2BAA2B,EAAE,UAAU,EAAE,UAAU,CAAC;QACvE,aAAa,EAAE;YACb,kLAAkL;SACnL;KACF;IAED,kBAAkB,EAAE;QAClB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,iBAAiB,EAAE,sBAAsB,EAAE,YAAY,CAAC;KAC5E;IAED,8BAA8B,EAAE;QAC9B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE;YAChB,qBAAqB;YACrB,qBAAqB;YACrB,oBAAoB;YACpB,yBAAyB;YACzB,KAAK;SACN;KACF;IAED,sBAAsB,EAAE;QACtB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,WAAW,EAAE,OAAO,EAAE,YAAY,EAAE,SAAS,CAAC;KAClE;IAED,yBAAyB,EAAE;QACzB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,sBAAsB,CAAC;KACjF;IAED,4BAA4B,EAAE;QAC5B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,2BAA2B,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC;KAC3F;IAED,0BAA0B,EAAE;QAC1B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE;YAChB,6BAA6B;YAC7B,mBAAmB;YACnB,WAAW;YACX,sBAAsB;YACtB,OAAO;YACP,KAAK;YACL,QAAQ;SACT;KACF;IAED,yBAAyB,EAAE;QACzB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,+BAA+B,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,MAAc,EACd,aAAqB;IAErB,MAAM,MAAM,GAAG
,yBAAyB,CAAC,MAAM,CAAC,CAAC;IAEjD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,yCAAyC;QACzC,OAAO;YACL,OAAO,EAAE,aAAa,CAAC,MAAM,GAAG,EAAE;YAClC,kBAAkB,EAAE,IAAI;YACxB,QAAQ,EAAE,CAAC;YACX,WAAW,EAAE,aAAa,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9C,MAAM,EAAE,aAAa,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,EAAE;YAChE,MAAM,EAAE,EAAE;YACV,eAAe,EAAE,CAAC;SACnB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;AACpD,CAAC"}
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hierarchical Prompting System for UAM
|
|
3
|
+
*
|
|
4
|
+
* Based on Droid's #1 Terminal-Bench strategy:
|
|
5
|
+
* 1. Tool Descriptions: High-level capabilities
|
|
6
|
+
* 2. System Prompts: Behavioral guidelines
|
|
7
|
+
* 3. System Notifications: Time-sensitive context (at END for recency bias)
|
|
8
|
+
*/
|
|
9
|
+
import { type TaskClassification } from '../memory/task-classifier.js';
|
|
10
|
+
/**
 * Named parts of a hierarchical prompt, one string per layer.
 *
 * NOTE(review): buildHierarchicalPrompt is declared to return a single string
 * rather than this shape — confirm where this interface is consumed.
 */
export interface HierarchicalPrompt {
    /** Tool descriptions layer (high-level capabilities). */
    toolDescriptions: string;
    /** System prompt layer (behavioral guidelines). */
    systemPrompt: string;
    /** The task instruction itself. */
    taskPrompt: string;
    /** Memory context included with the prompt. */
    memoryContext: string;
    /** System notification layer (time-sensitive context, placed at the end). */
    systemNotification: string;
}
|
|
17
|
+
/**
 * Build a hierarchical prompt optimized for agentic models
 *
 * @param taskInstruction - The task text to include in the prompt.
 * @param classification - Task classification used to pick category-specific sections.
 * @param memoryContext - Memory context text to include.
 * @param options - Optional time-sensitive details for the system notification:
 *   `timeRemaining`, `attemptNumber` and `previousErrors`.
 * @returns The assembled prompt as a single string.
 */
export declare function buildHierarchicalPrompt(taskInstruction: string, classification: TaskClassification, memoryContext: string, options?: {
    timeRemaining?: number;
    attemptNumber?: number;
    previousErrors?: string[];
}): string;
|
|
25
|
+
/**
|
|
26
|
+
* Build environment bootstrap prompt (gather system info)
|
|
27
|
+
*/
|
|
28
|
+
export declare function buildEnvironmentBootstrap(): string;
|
|
29
|
+
/**
|
|
30
|
+
* Build planning prompt
|
|
31
|
+
*/
|
|
32
|
+
export declare function buildPlanningPrompt(_task: string, steps: string[]): string;
|
|
33
|
+
/**
|
|
34
|
+
* Update planning prompt with progress
|
|
35
|
+
*/
|
|
36
|
+
export declare function updatePlanningPrompt(steps: string[], completedSteps: number, currentStepInProgress: boolean): string;
|
|
37
|
+
//# sourceMappingURL=hierarchical-prompting.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hierarchical-prompting.d.ts","sourceRoot":"","sources":["../../src/benchmarks/hierarchical-prompting.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,KAAK,kBAAkB,EAAE,MAAM,8BAA8B,CAAC;AAEvE,MAAM,WAAW,kBAAkB;IACjC,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,kBAAkB,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,eAAe,EAAE,MAAM,EACvB,cAAc,EAAE,kBAAkB,EAClC,aAAa,EAAE,MAAM,EACrB,OAAO,GAAE;IACP,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB,GACL,MAAM,CAqBR;AA+LD;;GAEG;AACH,wBAAgB,yBAAyB,IAAI,MAAM,CA2BlD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,CAgB1E;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,KAAK,EAAE,MAAM,EAAE,EACf,cAAc,EAAE,MAAM,EACtB,qBAAqB,EAAE,OAAO,GAC7B,MAAM,CAmBR"}
|
|
@@ -0,0 +1,260 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Hierarchical Prompting System for UAM
|
|
3
|
+
*
|
|
4
|
+
* Based on Droid's #1 Terminal-Bench strategy:
|
|
5
|
+
* 1. Tool Descriptions: High-level capabilities
|
|
6
|
+
* 2. System Prompts: Behavioral guidelines
|
|
7
|
+
* 3. System Notifications: Time-sensitive context (at END for recency bias)
|
|
8
|
+
*/
|
|
9
|
+
/**
 * Assemble the full prompt from its layers.
 *
 * Layers are emitted in a fixed order and separated by a blank line:
 * tool descriptions, system prompt, memory context (only when non-empty),
 * the task itself, and last the system notification so that it is the most
 * recent text in the prompt.
 *
 * @param {string} taskInstruction - Task text placed under "## Task".
 * @param {object} classification - Task classification passed to each layer builder.
 * @param {string} memoryContext - Text placed under "## Memory Context"; skipped when falsy.
 * @param {object} [options={}] - Passed through to the system notification builder.
 * @returns {string} The layers joined with blank lines.
 */
export function buildHierarchicalPrompt(taskInstruction, classification, memoryContext, options = {}) {
    // Layers 1 and 2: capabilities, then behavioral guidelines.
    const layers = [
        getToolDescriptions(classification),
        getSystemPrompt(classification),
    ];
    // Layer 3: relevant knowledge, only when there is any.
    if (memoryContext) {
        layers.push(`## Memory Context\n\n${memoryContext}`);
    }
    // Layers 4 and 5: the task, then the notification at the very end (recency bias).
    layers.push(`## Task\n\n${taskInstruction}`, getSystemNotification(classification, options));
    return layers.join('\n\n');
}
|
|
28
|
+
/**
 * Build the "Available Capabilities" section of the prompt.
 *
 * The common capability list is always included; bullets specific to
 * `classification.category` are appended when that category has an entry.
 *
 * @param {object} classification - Task classification; only `category` is read.
 * @returns {string} Markdown text of the capabilities section.
 */
function getToolDescriptions(classification) {
    const extrasByCategory = {
        sysadmin: `
- **System Administration**: Configure services, manage processes, networking
- **Package Management**: Install/update packages via apt, yum, pip, npm
- **Service Control**: systemctl, journalctl for service management`,
        security: `
- **Security Analysis**: Identify vulnerabilities, audit code
- **Cryptography**: Hash, encrypt, decrypt, certificate management
- **Secret Management**: Handle credentials securely`,
        'ml-training': `
- **ML Frameworks**: PyTorch, TensorFlow, scikit-learn, transformers
- **Data Processing**: pandas, numpy, dataset handling
- **GPU Operations**: CUDA, model training, inference`,
        debugging: `
- **Debugging Tools**: Stack traces, logging, profiling
- **Version Management**: git, conda, pip, dependency resolution
- **Error Analysis**: Identify root causes, propose fixes`,
        coding: `
- **Code Quality**: Linting, formatting, type checking
- **Design Patterns**: Implement standard patterns correctly
- **Testing**: Write and run tests, verify behavior`,
        testing: `
- **Test Frameworks**: vitest, jest, pytest, mocha
- **Coverage Analysis**: Measure and improve test coverage
- **Mocking**: Create mocks, stubs, spies for isolation`,
    };
    const commonCapabilities = `## Available Capabilities

You have access to these capabilities:
- **File Operations**: Read, write, create, and modify files
- **Shell Execution**: Run commands in bash/shell
- **Code Generation**: Write code in multiple languages
- **Analysis**: Understand and analyze code, logs, and data`;
    const extras = extrasByCategory[classification.category];
    if (!extras) {
        return commonCapabilities + '';
    }
    return commonCapabilities + extras;
}
|
|
67
|
+
/**
 * Build the "Guidelines" section of the prompt.
 *
 * The core principles are always included; a guideline list specific to
 * `classification.category` is appended when that category has an entry.
 *
 * @param {object} classification - Task classification; only `category` is read.
 * @returns {string} Markdown text of the guidelines section.
 */
function getSystemPrompt(classification) {
    const guidelinesByCategory = {
        sysadmin: `
### System Administration Guidelines
- Use modern commands: \`ip\` over \`ifconfig\`, \`ss\` over \`netstat\`
- Check service status with \`systemctl status\` before changes
- Backup configs before modifying: \`cp file file.bak\`
- Use \`journalctl -u service\` for service logs
- Parallel builds: \`make -j$(nproc)\``,
        security: `
### Security Guidelines
- NEVER log sensitive data (passwords, tokens, keys)
- Use parameterized queries, never string concatenation
- Validate ALL user input before processing
- Research CVE details before attempting exploits
- Use secure defaults (HTTPS, strong hashing)`,
        'ml-training': `
### ML Training Guidelines
- Start with smaller models for faster iteration
- Cache datasets to avoid repeated downloads
- Use \`CUDA_VISIBLE_DEVICES\` for GPU selection
- Monitor memory usage during training
- Save checkpoints periodically`,
        debugging: `
### Debugging Guidelines
- Reproduce the error before attempting fixes
- Check logs and stack traces carefully
- Use \`pip check\` / \`conda list\` for dependency issues
- Use \`git reflog\` to recover lost work
- Add verbose flags (-v, --debug) for more info`,
        coding: `
### Coding Guidelines
- Follow existing code style and patterns
- Write self-documenting code with clear names
- Include JSDoc/docstrings for public APIs
- Handle errors explicitly with try/catch
- Export types alongside implementations`,
        testing: `
### Testing Guidelines
- Test edge cases: empty, null, undefined
- Use mocks for external dependencies
- One assertion per test when possible
- Name tests descriptively: "should X when Y"
- Run tests before committing`,
    };
    const corePrinciples = `## Guidelines

### Core Principles
1. **State assumptions explicitly** before writing code
2. **Handle edge cases** - empty inputs, null values, errors
3. **Verify your solution** works before reporting success
4. **Follow existing patterns** in the codebase`;
    const categorySection = guidelinesByCategory[classification.category];
    if (!categorySection) {
        return corePrinciples + '';
    }
    return corePrinciples + categorySection;
}
|
|
124
|
+
/**
 * Get system notification (time-sensitive, at END for recency bias).
 *
 * Builds the closing "critical reminders" section of a prompt. Depending on
 * the options it adds a time warning, a retry notice with up to three
 * previous errors, category-specific final checks, and a fixed
 * "before submitting" checklist.
 *
 * @param {{category: string}} classification - Task classification; only
 *   `category` is read here.
 * @param {{timeRemaining?: number, attemptNumber?: number, previousErrors?: string[]}} [options]
 *   - `timeRemaining`: milliseconds left; a warning is added when below 60000.
 *   - `attemptNumber`: 1-based attempt counter; a notice is added when > 1.
 *   - `previousErrors`: errors from earlier attempts (first three are listed).
 * @returns {string} The notification lines joined with newlines.
 */
function getSystemNotification(classification, options = {}) {
    const notifications = ['## ⚠️ CRITICAL REMINDERS'];
    const { timeRemaining, attemptNumber, previousErrors } = options;
    // Time warning if relevant
    if (timeRemaining !== undefined && timeRemaining < 60000) {
        // Clamp at zero so an overrun deadline never renders as "-5s remaining".
        const secondsLeft = Math.max(0, Math.round(timeRemaining / 1000));
        notifications.push(`\n**TIME WARNING**: Only ${secondsLeft}s remaining!`);
        notifications.push('- Focus on completing the core requirement');
        notifications.push('- Skip optional optimizations');
    }
    // Attempt warning
    if (attemptNumber && attemptNumber > 1) {
        notifications.push(`\n**ATTEMPT ${attemptNumber}**: Previous attempts failed.`);
        if (previousErrors && previousErrors.length > 0) {
            notifications.push('\n**Previous errors to fix:**');
            // Only the first three errors are shown to keep the reminder short.
            for (const error of previousErrors.slice(0, 3)) {
                notifications.push(`- ${error}`);
            }
        }
    }
    // Category-specific critical reminders
    const categoryReminders = {
        'sysadmin': [
            'Verify service is running after changes',
            'Check firewall rules if network issues',
        ],
        'security': [
            'Never expose secrets in output or logs',
            'Sanitize all external input',
        ],
        'ml-training': [
            'Check GPU memory before large models',
            'Verify dataset paths exist',
        ],
        'debugging': [
            'Identify root cause, not just symptoms',
            'Test fix actually resolves the issue',
        ],
        'coding': [
            'Return ONLY the code requested',
            'Include all necessary imports',
        ],
        'testing': [
            'Ensure tests actually run assertions',
            'Mock external dependencies',
        ],
    };
    // Own-property check: a category such as 'constructor' must not resolve
    // to an inherited Object.prototype member (which is not iterable).
    const category = classification.category;
    const reminders = Object.prototype.hasOwnProperty.call(categoryReminders, category)
        ? categoryReminders[category]
        : [];
    if (reminders.length > 0) {
        notifications.push('\n**Final checks:**');
        for (const reminder of reminders) {
            notifications.push(`- ${reminder}`);
        }
    }
    // Universal final reminder
    notifications.push('\n**Before submitting:**');
    notifications.push('- Verify solution compiles/runs');
    notifications.push('- Check all requirements are met');
    notifications.push('- Handle edge cases explicitly');
    return notifications.join('\n');
}
|
|
186
|
+
/**
 * Build environment bootstrap prompt (gather system info).
 *
 * Produces a fixed markdown section containing a bash snippet that prints
 * system, OS, tool, disk, memory, working-directory and git information,
 * followed by a short list of what to do with that information.
 *
 * @returns {string} The markdown prompt text, terminated by a newline.
 */
export function buildEnvironmentBootstrap() {
    const fence = '```';
    // Shell commands shown inside the fenced block, grouped by topic.
    const discoveryScript = [
        '# System info',
        'echo "=== SYSTEM ===" && uname -a',
        'echo "=== OS ===" && cat /etc/os-release 2>/dev/null | head -5',
        '',
        '# Available tools',
        'echo "=== TOOLS ===" && which python python3 pip pip3 npm node go cargo 2>/dev/null',
        '',
        '# Resources',
        'echo "=== DISK ===" && df -h / 2>/dev/null',
        'echo "=== MEM ===" && free -h 2>/dev/null',
        '',
        '# Current context',
        'echo "=== CWD ===" && pwd && ls -la',
        'echo "=== GIT ===" && git status 2>/dev/null | head -5',
    ];
    const sections = [
        '## Environment Discovery',
        '',
        'Run these commands to understand the environment:',
        '',
        `${fence}bash`,
        ...discoveryScript,
        fence,
        '',
        'Use this information to:',
        "1. Choose appropriate tools (use what's available)",
        '2. Check resource constraints',
        '3. Understand the current state',
        // Trailing empty entry yields the final newline.
        '',
    ];
    return sections.join('\n');
}
|
|
217
|
+
/**
 * Build planning prompt.
 *
 * Renders the steps as a numbered checklist in which the first step is
 * marked as current (`[>]`) and every other step as pending (`[ ]`),
 * followed by fixed instructions on how to work through the plan.
 *
 * @param {string} _task - Unused; kept for interface compatibility.
 * @param {string[]} steps - Ordered step descriptions.
 * @returns {string} The markdown plan, terminated by a newline.
 */
export function buildPlanningPrompt(_task, steps) {
    const formatStep = (description, index) => {
        const marker = index > 0 ? '[ ]' : '[>]';
        return `${marker} ${index + 1}. ${description}`;
    };
    const checklist = steps.map(formatStep).join('\n');
    const sections = [
        '## Execution Plan',
        '',
        checklist,
        '',
        '**Instructions:**',
        '- Complete steps in order',
        '- Mark each step done after completion',
        '- If a step fails, debug before continuing',
        '- Update plan if new steps are discovered',
        // Trailing empty entry yields the final newline.
        '',
    ];
    return sections.join('\n');
}
|
|
236
|
+
/**
 * Update planning prompt with progress.
 *
 * Renders the steps as a numbered checklist: completed steps are marked
 * `[x]`, the step right after the completed ones is marked `[>]` when
 * `currentStepInProgress` is truthy, and all remaining steps are `[ ]`.
 * A status line with the completed/total count follows.
 *
 * @param {string[]} steps - Ordered step descriptions.
 * @param {number} completedSteps - Number of leading steps already done.
 *   Normalised to an integer within `[0, steps.length]` so the status line
 *   can never read e.g. "5/3" or "undefined/3".
 * @param {boolean} [currentStepInProgress] - Whether the first unfinished
 *   step should be flagged as in progress.
 * @returns {string} The markdown progress report, terminated by a newline.
 */
export function updatePlanningPrompt(steps, completedSteps, currentStepInProgress) {
    // Coerce and clamp the counter; non-numeric input counts as zero done.
    const requested = Number(completedSteps);
    const doneCount = Number.isNaN(requested)
        ? 0
        : Math.min(Math.max(Math.trunc(requested), 0), steps.length);
    const plan = steps.map((step, i) => {
        let status;
        if (i < doneCount) {
            status = '[x]';
        }
        else if (i === doneCount && currentStepInProgress) {
            status = '[>]';
        }
        else {
            status = '[ ]';
        }
        return `${status} ${i + 1}. ${step}`;
    }).join('\n');
    return `## Progress Update

${plan}

**Status:** ${doneCount}/${steps.length} steps completed
`;
}
|
|
260
|
+
//# sourceMappingURL=hierarchical-prompting.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"hierarchical-prompting.js","sourceRoot":"","sources":["../../src/benchmarks/hierarchical-prompting.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAYH;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,eAAuB,EACvB,cAAkC,EAClC,aAAqB,EACrB,UAII,EAAE;IAEN,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,uDAAuD;IACvD,QAAQ,CAAC,IAAI,CAAC,mBAAmB,CAAC,cAAc,CAAC,CAAC,CAAC;IAEnD,iDAAiD;IACjD,QAAQ,CAAC,IAAI,CAAC,eAAe,CAAC,cAAc,CAAC,CAAC,CAAC;IAE/C,+CAA+C;IAC/C,IAAI,aAAa,EAAE,CAAC;QAClB,QAAQ,CAAC,IAAI,CAAC,wBAAwB,aAAa,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,uBAAuB;IACvB,QAAQ,CAAC,IAAI,CAAC,cAAc,eAAe,EAAE,CAAC,CAAC;IAE/C,oEAAoE;IACpE,QAAQ,CAAC,IAAI,CAAC,qBAAqB,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;IAE9D,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,cAAkC;IAC7D,MAAM,SAAS,GAAG;;;;;;4DAMwC,CAAC;IAE3D,MAAM,aAAa,GAA2B;QAC5C,UAAU,EAAE;;;oEAGoD;QAEhE,UAAU,EAAE;;;qDAGqC;QAEjD,aAAa,EAAE;;;sDAGmC;QAElD,WAAW,EAAE;;;0DAGyC;QAEtD,QAAQ,EAAE;;;oDAGsC;QAEhD,SAAS,EAAE;;;wDAGyC;KACrD,CAAC;IAEF,OAAO,SAAS,GAAG,CAAC,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;AACpE,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,cAAkC;IACzD,MAAM,UAAU,GAAG;;;;;;gDAM2B,CAAC;IAE/C,MAAM,kBAAkB,GAA2B;QACjD,UAAU,EAAE;;;;;;uCAMuB;QAEnC,UAAU,EAAE;;;;;;8CAM8B;QAE1C,aAAa,EAAE;;;;;;gCAMa;QAE5B,WAAW,EAAE;;;;;;gDAM+B;QAE5C,QAAQ,EAAE;;;;;;yCAM2B;QAErC,SAAS,EAAE;;;;;;8BAMe;KAC3B,CAAC;IAEF,OAAO,UAAU,GAAG,CAAC,kBAAkB,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;AAC1E,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAC5B,cAAkC,EAClC,OAIC;IAED,MAAM,aAAa,GAAa,CAAC,0BAA0B,CAAC,CAAC;IAE7D,2BAA2B;IAC3B,IAAI,OAAO,CAAC,aAAa,KAAK,SAAS,IAAI,OAAO,CAAC,aAAa,GAAG,KAAK,EAAE,CAAC;QACzE,aAAa,CAAC,IAAI,CAAC,4BAA4B,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;QACvG,aAAa,CAAC,IAAI,CAAC,4CAA4C,CAAC,CAAC;QACjE,aAAa,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;IACtD,CAAC;IAED,kBAAkB;IAClB,IAAI,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,aAAa,GAAG,CAAC,EAAE,CAAC;QACvD,aAAa,CAAC,IAAI,CAAC,eAAe,OAAO,CAAC,aAAa,+BAA+B,CAAC,CAAC;QAExF,IAAI,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,cAAc,CAA
C,MAAM,GAAG,CAAC,EAAE,CAAC;YAChE,aAAa,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;YACpD,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,aAAa,CAAC,IAAI,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;IACH,CAAC;IAED,uCAAuC;IACvC,MAAM,iBAAiB,GAA6B;QAClD,UAAU,EAAE;YACV,yCAAyC;YACzC,wCAAwC;SACzC;QACD,UAAU,EAAE;YACV,wCAAwC;YACxC,6BAA6B;SAC9B;QACD,aAAa,EAAE;YACb,sCAAsC;YACtC,4BAA4B;SAC7B;QACD,WAAW,EAAE;YACX,wCAAwC;YACxC,sCAAsC;SACvC;QACD,QAAQ,EAAE;YACR,gCAAgC;YAChC,+BAA+B;SAChC;QACD,SAAS,EAAE;YACT,sCAAsC;YACtC,4BAA4B;SAC7B;KACF,CAAC;IAEF,MAAM,SAAS,GAAG,iBAAiB,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;IACnE,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,aAAa,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAC1C,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,aAAa,CAAC,IAAI,CAAC,KAAK,QAAQ,EAAE,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,aAAa,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IAC/C,aAAa,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IACtD,aAAa,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC;IACvD,aAAa,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;IAErD,OAAO,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,yBAAyB;IACvC,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;CAyBR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa,EAAE,KAAe;IAChE,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACjC,MAAM,MAAM,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC;QACvC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;EAEP,IAAI;;;;;;;CAOL,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAClC,KAAe,EACf,cAAsB,EACtB,qBAA8B;IAE9B,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACjC,IAAI,MAAc,CAAC;QACnB,IAAI,CAAC,GAAG,cAAc,EAAE,CAAC;YACvB,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,IAAI,CAAC,KAAK,cAAc,IAAI,qBAAqB,EAAE,CAAC;YACzD,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;QACD,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;
EAEP,IAAI;;cAEQ,cAAc,IAAI,KAAK,CAAC,MAAM;CAC3C,CAAC;AACF,CAAC"}
|