universal-agent-memory 0.6.1 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (71) hide show
  1. package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
  2. package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
  3. package/dist/benchmarks/agents/naive-agent.js +144 -0
  4. package/dist/benchmarks/agents/naive-agent.js.map +1 -0
  5. package/dist/benchmarks/agents/uam-agent.d.ts +167 -0
  6. package/dist/benchmarks/agents/uam-agent.d.ts.map +1 -0
  7. package/dist/benchmarks/agents/uam-agent.js +386 -0
  8. package/dist/benchmarks/agents/uam-agent.js.map +1 -0
  9. package/dist/benchmarks/benchmark.d.ts +328 -0
  10. package/dist/benchmarks/benchmark.d.ts.map +1 -0
  11. package/dist/benchmarks/benchmark.js +104 -0
  12. package/dist/benchmarks/benchmark.js.map +1 -0
  13. package/dist/benchmarks/execution-verifier.d.ts +41 -0
  14. package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
  15. package/dist/benchmarks/execution-verifier.js +301 -0
  16. package/dist/benchmarks/execution-verifier.js.map +1 -0
  17. package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
  18. package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
  19. package/dist/benchmarks/hierarchical-prompting.js +260 -0
  20. package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
  21. package/dist/benchmarks/improved-benchmark.d.ts +88 -0
  22. package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
  23. package/dist/benchmarks/improved-benchmark.js +533 -0
  24. package/dist/benchmarks/improved-benchmark.js.map +1 -0
  25. package/dist/benchmarks/index.d.ts +10 -0
  26. package/dist/benchmarks/index.d.ts.map +1 -0
  27. package/dist/benchmarks/index.js +10 -0
  28. package/dist/benchmarks/index.js.map +1 -0
  29. package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
  30. package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
  31. package/dist/benchmarks/multi-turn-agent.js +235 -0
  32. package/dist/benchmarks/multi-turn-agent.js.map +1 -0
  33. package/dist/benchmarks/runner.d.ts +2 -0
  34. package/dist/benchmarks/runner.d.ts.map +1 -0
  35. package/dist/benchmarks/runner.js +2 -0
  36. package/dist/benchmarks/runner.js.map +1 -0
  37. package/dist/benchmarks/tasks.d.ts +19 -0
  38. package/dist/benchmarks/tasks.d.ts.map +1 -0
  39. package/dist/benchmarks/tasks.js +371 -0
  40. package/dist/benchmarks/tasks.js.map +1 -0
  41. package/dist/bin/cli.js +0 -0
  42. package/dist/index.d.ts +5 -0
  43. package/dist/index.d.ts.map +1 -1
  44. package/dist/index.js +4 -0
  45. package/dist/index.js.map +1 -1
  46. package/dist/memory/backends/qdrant-cloud.d.ts +1 -1
  47. package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -1
  48. package/dist/memory/backends/qdrant-cloud.js +6 -4
  49. package/dist/memory/backends/qdrant-cloud.js.map +1 -1
  50. package/dist/memory/dynamic-retrieval.d.ts +26 -0
  51. package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
  52. package/dist/memory/dynamic-retrieval.js +378 -0
  53. package/dist/memory/dynamic-retrieval.js.map +1 -0
  54. package/dist/memory/embeddings.d.ts +82 -0
  55. package/dist/memory/embeddings.d.ts.map +1 -0
  56. package/dist/memory/embeddings.js +297 -0
  57. package/dist/memory/embeddings.js.map +1 -0
  58. package/dist/memory/task-classifier.d.ts +33 -0
  59. package/dist/memory/task-classifier.d.ts.map +1 -0
  60. package/dist/memory/task-classifier.js +277 -0
  61. package/dist/memory/task-classifier.js.map +1 -0
  62. package/dist/utils/rate-limiter.d.ts +62 -0
  63. package/dist/utils/rate-limiter.d.ts.map +1 -0
  64. package/dist/utils/rate-limiter.js +150 -0
  65. package/dist/utils/rate-limiter.js.map +1 -0
  66. package/dist/utils/validate-json.d.ts +52 -0
  67. package/dist/utils/validate-json.d.ts.map +1 -0
  68. package/dist/utils/validate-json.js +99 -0
  69. package/dist/utils/validate-json.js.map +1 -0
  70. package/package.json +2 -1
  71. package/templates/CLAUDE.template.md +51 -1
@@ -0,0 +1,301 @@
1
+ /**
2
+ * Execution Verification System for UAM Benchmarks
3
+ *
4
+ * Provides real code execution and verification instead of just pattern matching.
5
+ * Runs generated code in isolated environments and validates output.
6
+ */
7
+ import { execSync, spawn } from 'child_process';
8
+ import { writeFileSync, existsSync, mkdirSync, rmSync } from 'fs';
9
+ import { join } from 'path';
10
+ import { randomUUID } from 'crypto';
11
+ const SANDBOX_DIR = '/tmp/uam-sandbox';
12
/**
 * Create a fresh, uniquely named sandbox directory for one verification run.
 *
 * The directory lives under SANDBOX_DIR and is named after the first eight
 * characters of a random UUID.
 *
 * @returns {string} Path of the newly created sandbox directory.
 * @throws {Error} Whatever mkdirSync raises when the directory cannot be created.
 */
function createSandbox() {
    const sandboxId = randomUUID().slice(0, 8);
    const sandboxPath = join(SANDBOX_DIR, sandboxId);
    // `recursive: true` also creates SANDBOX_DIR when it is missing, so a
    // separate existence check (and its check-then-create race) is unnecessary.
    mkdirSync(sandboxPath, { recursive: true });
    return sandboxPath;
}
24
/**
 * Best-effort removal of a sandbox directory and everything inside it.
 *
 * Any error raised by the removal is ignored.
 *
 * @param sandboxPath Directory to delete.
 */
function cleanupSandbox(sandboxPath) {
    const removalOptions = { recursive: true, force: true };
    try {
        rmSync(sandboxPath, removalOptions);
    }
    catch {
        // Deliberately swallowed: cleanup is best-effort.
    }
}
35
/**
 * Verify generated code by executing it against the configured test cases.
 *
 * When `config.requiresExecution` is falsy only the pattern check is run.
 * Otherwise the code is written into a fresh sandbox, the setup commands are
 * run, TypeScript sources are compiled with `npx tsc`, every test case is
 * executed and its trimmed stdout compared with the trimmed expected output,
 * and finally the expected patterns (if any) are checked against the source.
 *
 * @param code Source code to verify.
 * @param config Verification settings: `language`, `requiresExecution`,
 *   `testCases`, and optionally `expectedPatterns`, `setupCommands` and
 *   `timeout` (per-test timeout in milliseconds).
 * @returns Promise of a result object with `success`, `executionSucceeded`,
 *   `testsRun`, `testsPassed`, `errors`, `output` and `executionTimeMs`.
 *   Setup, compile and test failures are reported through `errors` rather
 *   than by rejecting; a failure to create the sandbox does reject.
 */
export async function verifyCodeExecution(code, config) {
    const startTime = Date.now();
    const result = {
        success: false,
        executionSucceeded: false,
        testsRun: 0,
        testsPassed: 0,
        errors: [],
        output: '',
        executionTimeMs: 0,
    };
    // If execution not required, just do pattern matching
    if (!config.requiresExecution) {
        return verifyPatterns(code, config.expectedPatterns || []);
    }
    const sandboxPath = createSandbox();
    try {
        // Write code to file
        const filename = getFilename(config.language);
        const filePath = join(sandboxPath, filename);
        writeFileSync(filePath, code, 'utf-8');
        // Run setup commands if any
        if (config.setupCommands) {
            for (const cmd of config.setupCommands) {
                try {
                    execSync(cmd, {
                        cwd: sandboxPath,
                        timeout: 30000,
                        stdio: 'pipe',
                    });
                }
                catch {
                    // A failed setup step is recorded but does not abort the run.
                    result.errors.push(`Setup failed: ${cmd}`);
                }
            }
        }
        // Compile if TypeScript
        if (config.language === 'typescript') {
            try {
                execSync(`npx tsc ${filename} --outDir . --esModuleInterop --skipLibCheck 2>&1`, {
                    cwd: sandboxPath,
                    timeout: 30000,
                    encoding: 'utf-8',
                });
            }
            catch (error) {
                const errMsg = error instanceof Error ? error.message : String(error);
                // `2>&1` sends the compiler diagnostics to stdout, which execSync
                // attaches to the thrown error instead of putting it in the message.
                const diagnostics = error && typeof error.stdout === 'string' ? error.stdout.trim() : '';
                result.errors.push(diagnostics
                    ? `TypeScript compilation failed: ${errMsg}\n${diagnostics}`
                    : `TypeScript compilation failed: ${errMsg}`);
                result.executionTimeMs = Date.now() - startTime;
                // The finally block below removes the sandbox; no explicit cleanup here.
                return result;
            }
        }
        result.executionSucceeded = true;
        // Run test cases
        for (const testCase of config.testCases) {
            result.testsRun++;
            try {
                const output = await runTestCase(sandboxPath, config.language, testCase, config.timeout);
                const normalizedOutput = output.trim();
                const normalizedExpected = testCase.expectedOutput.trim();
                if (normalizedOutput === normalizedExpected) {
                    result.testsPassed++;
                }
                else {
                    result.errors.push(`Test "${testCase.description || 'unnamed'}": Expected "${normalizedExpected}", got "${normalizedOutput}"`);
                }
                result.output += output + '\n';
            }
            catch (error) {
                const errMsg = error instanceof Error ? error.message : String(error);
                result.errors.push(`Test execution error: ${errMsg}`);
            }
        }
        // Verify patterns if specified; pattern errors only count when the
        // pattern check as a whole fails.
        if (config.expectedPatterns && config.expectedPatterns.length > 0) {
            const patternResult = verifyPatterns(code, config.expectedPatterns);
            if (!patternResult.success) {
                result.errors.push(...patternResult.errors);
            }
        }
        // Any recorded error (including a failed setup step) fails the run.
        result.success = result.testsPassed === result.testsRun && result.errors.length === 0;
    }
    catch (error) {
        const errMsg = error instanceof Error ? error.message : String(error);
        result.errors.push(`Verification error: ${errMsg}`);
    }
    finally {
        cleanupSandbox(sandboxPath);
    }
    result.executionTimeMs = Date.now() - startTime;
    return result;
}
131
/**
 * Run the sandboxed solution once for a single test case.
 *
 * The interpreter command for `language` is combined with `testCase.input`
 * and executed through `bash -c` inside `sandboxPath`.
 *
 * NOTE(review): `testCase.input` is interpolated into the shell command
 * unescaped, so it must come from trusted configuration.
 *
 * @param sandboxPath Directory containing the solution file.
 * @param language One of 'typescript', 'javascript', 'python', 'shell'.
 * @param testCase Test case; its `input` is appended to the command line.
 * @param timeout Maximum run time in milliseconds (default 10000).
 * @returns Promise resolving to the process stdout when it exits with code 0.
 *   Rejects with 'Execution timeout', with `Exit code N: <stderr>` on a
 *   non-zero exit, with the spawn error, or for an unsupported language.
 */
async function runTestCase(sandboxPath, language, testCase, timeout = 10000) {
    const commands = {
        'typescript': `node ${getFilename('javascript')}`,
        'javascript': `node ${getFilename('javascript')}`,
        'python': `python3 ${getFilename('python')}`,
        'shell': `bash ${getFilename('shell')}`,
    };
    const runner = commands[language];
    if (!runner) {
        // Previously this ran the literal command "undefined <input>" in bash.
        throw new Error(`Unsupported language: ${language}`);
    }
    const command = `${runner} ${testCase.input}`;
    return new Promise((resolve, reject) => {
        const proc = spawn('bash', ['-c', command], {
            cwd: sandboxPath,
            // stdin is not used; 'ignore' gives the child an immediate EOF instead
            // of a pipe that is never closed (which blocks programs reading stdin).
            stdio: ['ignore', 'pipe', 'pipe'],
        });
        let stdout = '';
        let stderr = '';
        proc.stdout.on('data', (data) => { stdout += data.toString(); });
        proc.stderr.on('data', (data) => { stderr += data.toString(); });
        // Single timeout mechanism: spawn's own `timeout` option is not used
        // because it raced with this timer and could surface as
        // "Exit code null" instead of "Execution timeout".
        const timer = setTimeout(() => {
            proc.kill();
            reject(new Error('Execution timeout'));
        }, timeout);
        proc.on('close', (code) => {
            clearTimeout(timer);
            if (code === 0) {
                resolve(stdout);
            }
            else {
                // After a timeout this reject is a no-op: the promise is already settled.
                reject(new Error(`Exit code ${code}: ${stderr}`));
            }
        });
        proc.on('error', (error) => {
            clearTimeout(timer);
            reject(error);
        });
    });
}
171
/**
 * Verify that code contains the expected text patterns (fallback verification).
 *
 * Matching is a case-insensitive substring search. Each pattern counts as one
 * test; the check succeeds when at least 70% of the patterns are found. An
 * empty pattern list succeeds, because there is nothing that could fail.
 *
 * @param code Source text to search.
 * @param patterns Substrings expected to occur in `code`.
 * @returns Result object with `success`, `executionSucceeded` (always true),
 *   `testsRun` (number of patterns), `testsPassed`, `errors` (one
 *   `Missing pattern: "..."` entry per miss), `output` and `executionTimeMs`.
 */
function verifyPatterns(code, patterns) {
    const result = {
        success: false,
        executionSucceeded: true,
        testsRun: patterns.length,
        testsPassed: 0,
        errors: [],
        output: '',
        executionTimeMs: 0,
    };
    const normalizedCode = code.toLowerCase();
    for (const pattern of patterns) {
        if (normalizedCode.includes(pattern.toLowerCase())) {
            result.testsPassed++;
        }
        else {
            result.errors.push(`Missing pattern: "${pattern}"`);
        }
    }
    // Guard the empty list: 0 / 0 is NaN, and `NaN >= 0.7` is false, which used
    // to report a failure with no errors attached.
    const matchRatio = result.testsRun === 0 ? 1 : result.testsPassed / result.testsRun;
    // Success if at least 70% of patterns match
    result.success = matchRatio >= 0.7;
    return result;
}
198
/**
 * Map a language name to the file name used for the solution in the sandbox.
 *
 * @param language 'typescript', 'javascript', 'python' or 'shell'.
 * @returns The matching `solution.*` file name; `undefined` for a language
 *   that is not in the table.
 */
function getFilename(language) {
    // Computed-key destructuring performs the same property lookup as indexing.
    const { [language]: filename } = {
        'typescript': 'solution.ts',
        'javascript': 'solution.js',
        'python': 'solution.py',
        'shell': 'solution.sh',
    };
    return filename;
}
210
/**
 * Verification configurations for the benchmark tasks, keyed by task id.
 *
 * Each entry names the language, whether the generated code must actually be
 * executed, the test cases to run, and the text patterns expected in the code.
 * Setup commands run inside the sandbox after the generated code has been
 * written to the solution file.
 */
export const TASK_VERIFICATION_CONFIGS = {
    'task-001-code-generation': {
        language: 'typescript',
        requiresExecution: true,
        testCases: [
            { input: '', expectedOutput: '0', description: 'empty array returns 0' },
        ],
        expectedPatterns: ['function calculateAverage', 'number[]', ': number'],
        setupCommands: [
            // Append a driver call to the generated solution. The previous command
            // used `>` and replaced solution.ts with a reference implementation, so
            // the generated code itself was never compiled or run.
            "printf '\\nconsole.log(calculateAverage([]));\\n' >> solution.ts",
        ],
    },
    'task-002-bug-fix': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['i < nums.length', 'function sumPositive', 'return sum'],
    },
    'task-003-pattern-application': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: [
            'class ConfigManager',
            'private constructor',
            'static getInstance',
            'private static instance',
            'Map',
        ],
    },
    'task-004-refactoring': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['interface', 'class', 'implements', 'process'],
    },
    'task-005-memory-context': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['async', 'zod', 'AppError', '@param', 'validateAndParseJSON'],
    },
    'task-006-complex-algorithm': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['function findShortestPath', 'Map<string', 'distance', 'path', 'while'],
    },
    'task-007-multi-step-task': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: [
            'interface RateLimiterConfig',
            'class RateLimiter',
            'isAllowed',
            'getRemainingRequests',
            'reset',
            'Map',
            'export',
        ],
    },
    'task-008-error-handling': {
        language: 'typescript',
        requiresExecution: false,
        testCases: [],
        expectedPatterns: ['async function fetchWithRetry', 'retry', 'backoff', 'catch', 'throw'],
    },
};
282
/**
 * Verify the code generated for one benchmark task.
 *
 * Tasks with an entry in TASK_VERIFICATION_CONFIGS are handed to
 * verifyCodeExecution. Any other task id falls back to a length check:
 * the response passes when it is longer than 50 characters.
 *
 * @param taskId Benchmark task identifier.
 * @param generatedCode Code produced for the task.
 * @returns Promise of the verification result object.
 */
export async function verifyBenchmarkTask(taskId, generatedCode) {
    const config = TASK_VERIFICATION_CONFIGS[taskId];
    if (config) {
        return verifyCodeExecution(generatedCode, config);
    }
    // No configuration for this task: judge the response by its length only.
    const longEnough = generatedCode.length > 50;
    const tooShort = generatedCode.length <= 50;
    return {
        success: longEnough,
        executionSucceeded: true,
        testsRun: 1,
        testsPassed: longEnough ? 1 : 0,
        errors: tooShort ? ['Response too short'] : [],
        output: '',
        executionTimeMs: 0,
    };
}
301
+ //# sourceMappingURL=execution-verifier.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"execution-verifier.js","sourceRoot":"","sources":["../../src/benchmarks/execution-verifier.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,KAAK,EAAE,MAAM,eAAe,CAAC;AAChD,OAAO,EAAE,aAAa,EAAE,UAAU,EAAE,SAAS,EAAE,MAAM,EAAE,MAAM,IAAI,CAAC;AAClE,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,UAAU,EAAE,MAAM,QAAQ,CAAC;AA2BpC,MAAM,WAAW,GAAG,kBAAkB,CAAC;AAEvC;;GAEG;AACH,SAAS,aAAa;IACpB,MAAM,SAAS,GAAG,UAAU,EAAE,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC;IAC3C,MAAM,WAAW,GAAG,IAAI,CAAC,WAAW,EAAE,SAAS,CAAC,CAAC;IAEjD,IAAI,CAAC,UAAU,CAAC,WAAW,CAAC,EAAE,CAAC;QAC7B,SAAS,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC9C,CAAC;IAED,SAAS,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,CAAC,CAAC;IAC5C,OAAO,WAAW,CAAC;AACrB,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,WAAmB;IACzC,IAAI,CAAC;QACH,MAAM,CAAC,WAAW,EAAE,EAAE,SAAS,EAAE,IAAI,EAAE,KAAK,EAAE,IAAI,EAAE,CAAC,CAAC;IACxD,CAAC;IAAC,MAAM,CAAC;QACP,wBAAwB;IAC1B,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,IAAY,EACZ,MAA8B;IAE9B,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,MAAM,MAAM,GAAuB;QACjC,OAAO,EAAE,KAAK;QACd,kBAAkB,EAAE,KAAK;QACzB,QAAQ,EAAE,CAAC;QACX,WAAW,EAAE,CAAC;QACd,MAAM,EAAE,EAAE;QACV,MAAM,EAAE,EAAE;QACV,eAAe,EAAE,CAAC;KACnB,CAAC;IAEF,sDAAsD;IACtD,IAAI,CAAC,MAAM,CAAC,iBAAiB,EAAE,CAAC;QAC9B,OAAO,cAAc,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,IAAI,EAAE,CAAC,CAAC;IAC7D,CAAC;IAED,MAAM,WAAW,GAAG,aAAa,EAAE,CAAC;IAEpC,IAAI,CAAC;QACH,qBAAqB;QACrB,MAAM,QAAQ,GAAG,WAAW,CAAC,MAAM,CAAC,QAAQ,CAAC,CAAC;QAC9C,MAAM,QAAQ,GAAG,IAAI,CAAC,WAAW,EAAE,QAAQ,CAAC,CAAC;QAC7C,aAAa,CAAC,QAAQ,EAAE,IAAI,EAAE,OAAO,CAAC,CAAC;QAEvC,4BAA4B;QAC5B,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;YACzB,KAAK,MAAM,GAAG,IAAI,MAAM,CAAC,aAAa,EAAE,CAAC;gBACvC,IAAI,CAAC;oBACH,QAAQ,CAAC,GAAG,EAAE;wBACZ,GAAG,EAAE,WAAW;wBAChB,OAAO,EAAE,KAAK;wBACd,KAAK,EAAE,MAAM;qBACd,CAAC,CAAC;gBACL,CAAC;gBAAC,OAAO,KAAK,EAAE,CAAC;oBACf,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,iBAAiB,GAAG,EAAE,CAAC,CAAC;gBAC7C,CAAC;YACH,CAAC;QACH,CAAC;QAED,wBAAwB;QACxB,IAAI,MAAM,CAAC,QAAQ,KAAK,YAAY,
EAAE,CAAC;YACrC,IAAI,CAAC;gBACH,QAAQ,CAAC,WAAW,QAAQ,mDAAmD,EAAE;oBAC/E,GAAG,EAAE,WAAW;oBAChB,OAAO,EAAE,KAAK;oBACd,QAAQ,EAAE,OAAO;iBAClB,CAAC,CAAC;YACL,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,kCAAkC,MAAM,EAAE,CAAC,CAAC;gBAC/D,MAAM,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;gBAChD,cAAc,CAAC,WAAW,CAAC,CAAC;gBAC5B,OAAO,MAAM,CAAC;YAChB,CAAC;QACH,CAAC;QAED,MAAM,CAAC,kBAAkB,GAAG,IAAI,CAAC;QAEjC,iBAAiB;QACjB,KAAK,MAAM,QAAQ,IAAI,MAAM,CAAC,SAAS,EAAE,CAAC;YACxC,MAAM,CAAC,QAAQ,EAAE,CAAC;YAElB,IAAI,CAAC;gBACH,MAAM,MAAM,GAAG,MAAM,WAAW,CAAC,WAAW,EAAE,MAAM,CAAC,QAAQ,EAAE,QAAQ,EAAE,MAAM,CAAC,OAAO,CAAC,CAAC;gBACzF,MAAM,gBAAgB,GAAG,MAAM,CAAC,IAAI,EAAE,CAAC;gBACvC,MAAM,kBAAkB,GAAG,QAAQ,CAAC,cAAc,CAAC,IAAI,EAAE,CAAC;gBAE1D,IAAI,gBAAgB,KAAK,kBAAkB,EAAE,CAAC;oBAC5C,MAAM,CAAC,WAAW,EAAE,CAAC;gBACvB,CAAC;qBAAM,CAAC;oBACN,MAAM,CAAC,MAAM,CAAC,IAAI,CAChB,SAAS,QAAQ,CAAC,WAAW,IAAI,SAAS,gBAAgB,kBAAkB,WAAW,gBAAgB,GAAG,CAC3G,CAAC;gBACJ,CAAC;gBAED,MAAM,CAAC,MAAM,IAAI,MAAM,GAAG,IAAI,CAAC;YACjC,CAAC;YAAC,OAAO,KAAK,EAAE,CAAC;gBACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;gBACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,yBAAyB,MAAM,EAAE,CAAC,CAAC;YACxD,CAAC;QACH,CAAC;QAED,+BAA+B;QAC/B,IAAI,MAAM,CAAC,gBAAgB,IAAI,MAAM,CAAC,gBAAgB,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAClE,MAAM,aAAa,GAAG,cAAc,CAAC,IAAI,EAAE,MAAM,CAAC,gBAAgB,CAAC,CAAC;YACpE,IAAI,CAAC,aAAa,CAAC,OAAO,EAAE,CAAC;gBAC3B,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,aAAa,CAAC,MAAM,CAAC,CAAC;YAC9C,CAAC;QACH,CAAC;QAED,MAAM,CAAC,OAAO,GAAG,MAAM,CAAC,WAAW,KAAK,MAAM,CAAC,QAAQ,IAAI,MAAM,CAAC,MAAM,CAAC,MAAM,KAAK,CAAC,CAAC;IAExF,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,MAAM,MAAM,GAAG,KAAK,YAAY,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC;QACtE,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,uBAAuB,MAAM,EAAE,CAAC,CAAC;IACtD,CAAC;YAAS,CAAC;QACT,cAAc,CAAC,WAAW,CAAC,CAAC;IAC9B,CAAC;IAED,M
AAM,CAAC,eAAe,GAAG,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS,CAAC;IAChD,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,WAAW,CACxB,WAAmB,EACnB,QAA4C,EAC5C,QAAkB,EAClB,UAAkB,KAAK;IAEvB,MAAM,QAAQ,GAA2B;QACvC,YAAY,EAAE,QAAQ,WAAW,CAAC,YAAY,CAAC,EAAE;QACjD,YAAY,EAAE,QAAQ,WAAW,CAAC,YAAY,CAAC,EAAE;QACjD,QAAQ,EAAE,WAAW,WAAW,CAAC,QAAQ,CAAC,EAAE;QAC5C,OAAO,EAAE,QAAQ,WAAW,CAAC,OAAO,CAAC,EAAE;KACxC,CAAC;IAEF,MAAM,OAAO,GAAG,GAAG,QAAQ,CAAC,QAAQ,CAAC,IAAI,QAAQ,CAAC,KAAK,EAAE,CAAC;IAE1D,OAAO,IAAI,OAAO,CAAC,CAAC,OAAO,EAAE,MAAM,EAAE,EAAE;QACrC,MAAM,IAAI,GAAG,KAAK,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,OAAO,CAAC,EAAE;YAC1C,GAAG,EAAE,WAAW;YAChB,OAAO;YACP,KAAK,EAAE,CAAC,MAAM,EAAE,MAAM,EAAE,MAAM,CAAC;SAChC,CAAC,CAAC;QAEH,IAAI,MAAM,GAAG,EAAE,CAAC;QAChB,IAAI,MAAM,GAAG,EAAE,CAAC;QAEhB,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QACjE,IAAI,CAAC,MAAM,CAAC,EAAE,CAAC,MAAM,EAAE,CAAC,IAAI,EAAE,EAAE,GAAG,MAAM,IAAI,IAAI,CAAC,QAAQ,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC;QAEjE,MAAM,KAAK,GAAG,UAAU,CAAC,GAAG,EAAE;YAC5B,IAAI,CAAC,IAAI,EAAE,CAAC;YACZ,MAAM,CAAC,IAAI,KAAK,CAAC,mBAAmB,CAAC,CAAC,CAAC;QACzC,CAAC,EAAE,OAAO,CAAC,CAAC;QAEZ,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,IAAI,EAAE,EAAE;YACxB,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;gBACf,OAAO,CAAC,MAAM,CAAC,CAAC;YAClB,CAAC;iBAAM,CAAC;gBACN,MAAM,CAAC,IAAI,KAAK,CAAC,aAAa,IAAI,KAAK,MAAM,EAAE,CAAC,CAAC,CAAC;YACpD,CAAC;QACH,CAAC,CAAC,CAAC;QAEH,IAAI,CAAC,EAAE,CAAC,OAAO,EAAE,CAAC,KAAK,EAAE,EAAE;YACzB,YAAY,CAAC,KAAK,CAAC,CAAC;YACpB,MAAM,CAAC,KAAK,CAAC,CAAC;QAChB,CAAC,CAAC,CAAC;IACL,CAAC,CAAC,CAAC;AACL,CAAC;AAED;;GAEG;AACH,SAAS,cAAc,CAAC,IAAY,EAAE,QAAkB;IACtD,MAAM,MAAM,GAAuB;QACjC,OAAO,EAAE,KAAK;QACd,kBAAkB,EAAE,IAAI;QACxB,QAAQ,EAAE,QAAQ,CAAC,MAAM;QACzB,WAAW,EAAE,CAAC;QACd,MAAM,EAAE,EAAE;QACV,MAAM,EAAE,EAAE;QACV,eAAe,EAAE,CAAC;KACnB,CAAC;IAEF,MAAM,cAAc,GAAG,IAAI,CAAC,WAAW,EAAE,CAAC;IAE1C,KAAK,MAAM,OAAO,IAAI,QAAQ,EAAE,CAAC;QAC/B,IAAI,cAAc,CAAC,QAAQ,CAAC,OAAO,CAAC,WAAW,EAAE,CAAC,EAAE,CAAC;YACnD,MAAM,CA
AC,WAAW,EAAE,CAAC;QACvB,CAAC;aAAM,CAAC;YACN,MAAM,CAAC,MAAM,CAAC,IAAI,CAAC,qBAAqB,OAAO,GAAG,CAAC,CAAC;QACtD,CAAC;IACH,CAAC;IAED,sEAAsE;IACtE,MAAM,UAAU,GAAG,MAAM,CAAC,WAAW,GAAG,MAAM,CAAC,QAAQ,CAAC;IACxD,MAAM,CAAC,OAAO,GAAG,UAAU,IAAI,GAAG,CAAC;IAEnC,OAAO,MAAM,CAAC;AAChB,CAAC;AAED;;GAEG;AACH,SAAS,WAAW,CAAC,QAA4C;IAC/D,MAAM,UAAU,GAA2B;QACzC,YAAY,EAAE,aAAa;QAC3B,YAAY,EAAE,aAAa;QAC3B,QAAQ,EAAE,aAAa;QACvB,OAAO,EAAE,aAAa;KACvB,CAAC;IACF,OAAO,UAAU,CAAC,QAAQ,CAAC,CAAC;AAC9B,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,MAAM,yBAAyB,GAA2C;IAC/E,0BAA0B,EAAE;QAC1B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,IAAI;QACvB,SAAS,EAAE;YACT,EAAE,KAAK,EAAE,EAAE,EAAE,cAAc,EAAE,GAAG,EAAE,WAAW,EAAE,uBAAuB,EAAE;SACzE;QACD,gBAAgB,EAAE,CAAC,2BAA2B,EAAE,UAAU,EAAE,UAAU,CAAC;QACvE,aAAa,EAAE;YACb,kLAAkL;SACnL;KACF;IAED,kBAAkB,EAAE;QAClB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,iBAAiB,EAAE,sBAAsB,EAAE,YAAY,CAAC;KAC5E;IAED,8BAA8B,EAAE;QAC9B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE;YAChB,qBAAqB;YACrB,qBAAqB;YACrB,oBAAoB;YACpB,yBAAyB;YACzB,KAAK;SACN;KACF;IAED,sBAAsB,EAAE;QACtB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,WAAW,EAAE,OAAO,EAAE,YAAY,EAAE,SAAS,CAAC;KAClE;IAED,yBAAyB,EAAE;QACzB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,OAAO,EAAE,KAAK,EAAE,UAAU,EAAE,QAAQ,EAAE,sBAAsB,CAAC;KACjF;IAED,4BAA4B,EAAE;QAC5B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,2BAA2B,EAAE,YAAY,EAAE,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC;KAC3F;IAED,0BAA0B,EAAE;QAC1B,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE;YAChB,6BAA6B;YAC7B,mBAAmB;YACnB,WAAW;YACX,sBAAsB;YACtB,OAAO;YACP,KAAK;YACL,QAAQ;SACT;KACF;IAED,yBAAyB,EAAE;QACzB,QAAQ,EAAE,YAAY;QACtB,iBAAiB,EAAE,KAAK;QACxB,SAAS,EAAE,EAAE;QACb,gBAAgB,EAAE,CAAC,+BAA+B,EAAE,OAAO,EAAE,SAAS,EAAE,OAAO,EAAE,OAAO,CAAC;KAC1F;CACF,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB,CACvC,MAAc,EACd,aAAqB;IAErB,MAAM,MAAM,GA
AG,yBAAyB,CAAC,MAAM,CAAC,CAAC;IAEjD,IAAI,CAAC,MAAM,EAAE,CAAC;QACZ,yCAAyC;QACzC,OAAO;YACL,OAAO,EAAE,aAAa,CAAC,MAAM,GAAG,EAAE;YAClC,kBAAkB,EAAE,IAAI;YACxB,QAAQ,EAAE,CAAC;YACX,WAAW,EAAE,aAAa,CAAC,MAAM,GAAG,EAAE,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;YAC9C,MAAM,EAAE,aAAa,CAAC,MAAM,IAAI,EAAE,CAAC,CAAC,CAAC,CAAC,oBAAoB,CAAC,CAAC,CAAC,CAAC,EAAE;YAChE,MAAM,EAAE,EAAE;YACV,eAAe,EAAE,CAAC;SACnB,CAAC;IACJ,CAAC;IAED,OAAO,mBAAmB,CAAC,aAAa,EAAE,MAAM,CAAC,CAAC;AACpD,CAAC"}
@@ -0,0 +1,37 @@
1
+ /**
2
+ * Hierarchical Prompting System for UAM
3
+ *
4
+ * Based on Droid's #1 Terminal-Bench strategy:
5
+ * 1. Tool Descriptions: High-level capabilities
6
+ * 2. System Prompts: Behavioral guidelines
7
+ * 3. System Notifications: Time-sensitive context (at END for recency bias)
8
+ */
9
+ import { type TaskClassification } from '../memory/task-classifier.js';
10
+ export interface HierarchicalPrompt { // The separate text layers of a hierarchical prompt. NOTE(review): the functions declared in this file return plain strings; confirm where this shape is consumed.
11
+ toolDescriptions: string; // high-level capabilities
12
+ systemPrompt: string; // behavioral guidelines
13
+ taskPrompt: string; // presumably the task instruction text; verify against caller
14
+ memoryContext: string; // presumably the relevant knowledge retrieved from memory; verify against caller
15
+ systemNotification: string; // time-sensitive context, intended to come last
16
+ }
17
+ /**
18
+ * Build a hierarchical prompt optimized for agentic models
+ *
+ * The sections are joined with blank lines in this order: capabilities,
+ * guidelines, memory context (only when non-empty), task, critical reminders.
+ *
+ * @param taskInstruction Text placed under the "## Task" heading.
+ * @param classification Task classification; its category selects the category-specific text.
+ * @param memoryContext Text placed under "## Memory Context"; the section is omitted when empty.
+ * @param options Optional settings. `timeRemaining` is in milliseconds and adds a
+ *   time warning below 60000; `attemptNumber` above 1 adds a retry notice;
+ *   at most the first three `previousErrors` are listed on a retry.
+ * @returns The assembled prompt text.
19
+ */
20
+ export declare function buildHierarchicalPrompt(taskInstruction: string, classification: TaskClassification, memoryContext: string, options?: {
21
+ timeRemaining?: number;
22
+ attemptNumber?: number;
23
+ previousErrors?: string[];
24
+ }): string;
25
+ /**
26
+ * Build environment bootstrap prompt (gather system info)
+ *
+ * @returns A fixed Markdown section containing a bash snippet that prints
+ *   system, tool, resource and working-directory information, followed by
+ *   notes on how to use that information.
27
+ */
28
+ export declare function buildEnvironmentBootstrap(): string;
29
+ /**
30
+ * Build planning prompt
+ *
+ * @param _task Not used when building the text.
+ * @param steps Plan steps; they are numbered from 1, the first is marked
+ *   "[>]" and the rest "[ ]".
+ * @returns An "## Execution Plan" Markdown section with the step list and
+ *   fixed instructions.
31
+ */
32
+ export declare function buildPlanningPrompt(_task: string, steps: string[]): string;
33
+ /**
34
+ * Update planning prompt with progress
+ *
+ * @param steps Plan steps, numbered from 1 in the output.
+ * @param completedSteps Number of leading steps marked "[x]".
+ * @param currentStepInProgress When true, the step at index `completedSteps`
+ *   is marked "[>]"; all remaining steps are marked "[ ]".
+ * @returns A "## Progress Update" Markdown section ending with a
+ *   "completed/total" status line.
35
+ */
36
+ export declare function updatePlanningPrompt(steps: string[], completedSteps: number, currentStepInProgress: boolean): string;
37
+ //# sourceMappingURL=hierarchical-prompting.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hierarchical-prompting.d.ts","sourceRoot":"","sources":["../../src/benchmarks/hierarchical-prompting.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAEH,OAAO,EAAE,KAAK,kBAAkB,EAAE,MAAM,8BAA8B,CAAC;AAEvE,MAAM,WAAW,kBAAkB;IACjC,gBAAgB,EAAE,MAAM,CAAC;IACzB,YAAY,EAAE,MAAM,CAAC;IACrB,UAAU,EAAE,MAAM,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,kBAAkB,EAAE,MAAM,CAAC;CAC5B;AAED;;GAEG;AACH,wBAAgB,uBAAuB,CACrC,eAAe,EAAE,MAAM,EACvB,cAAc,EAAE,kBAAkB,EAClC,aAAa,EAAE,MAAM,EACrB,OAAO,GAAE;IACP,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,aAAa,CAAC,EAAE,MAAM,CAAC;IACvB,cAAc,CAAC,EAAE,MAAM,EAAE,CAAC;CACtB,GACL,MAAM,CAqBR;AA+LD;;GAEG;AACH,wBAAgB,yBAAyB,IAAI,MAAM,CA2BlD;AAED;;GAEG;AACH,wBAAgB,mBAAmB,CAAC,KAAK,EAAE,MAAM,EAAE,KAAK,EAAE,MAAM,EAAE,GAAG,MAAM,CAgB1E;AAED;;GAEG;AACH,wBAAgB,oBAAoB,CAClC,KAAK,EAAE,MAAM,EAAE,EACf,cAAc,EAAE,MAAM,EACtB,qBAAqB,EAAE,OAAO,GAC7B,MAAM,CAmBR"}
@@ -0,0 +1,260 @@
1
+ /**
2
+ * Hierarchical Prompting System for UAM
3
+ *
4
+ * Based on Droid's #1 Terminal-Bench strategy:
5
+ * 1. Tool Descriptions: High-level capabilities
6
+ * 2. System Prompts: Behavioral guidelines
7
+ * 3. System Notifications: Time-sensitive context (at END for recency bias)
8
+ */
9
/**
 * Assemble the full layered prompt for a task.
 *
 * Layers, in order: capabilities, guidelines, memory context (skipped when
 * `memoryContext` is falsy), the task itself, and the critical reminders,
 * which go last on purpose (recency bias). Layers are separated by a blank line.
 *
 * @param taskInstruction Text placed under the "## Task" heading.
 * @param classification Task classification; its category picks the category-specific text.
 * @param memoryContext Text placed under the "## Memory Context" heading.
 * @param options Optional `timeRemaining`, `attemptNumber` and `previousErrors`.
 * @returns The assembled prompt text.
 */
export function buildHierarchicalPrompt(taskInstruction, classification, memoryContext, options = {}) {
    return [
        // Layer 1: high-level capabilities
        getToolDescriptions(classification),
        // Layer 2: behavioral guidelines
        getSystemPrompt(classification),
        // Layer 3: relevant knowledge, only when there is any
        ...(memoryContext ? [`## Memory Context\n\n${memoryContext}`] : []),
        // Layer 4: the task
        `## Task\n\n${taskInstruction}`,
        // Layer 5: time-sensitive reminders, kept at the end
        getSystemNotification(classification, options),
    ].join('\n\n');
}
28
/**
 * Produce the "Available Capabilities" section of the prompt.
 *
 * Every prompt gets the same four general capability bullets. When
 * `classification.category` names a known category, three category-specific
 * bullets follow directly after them.
 *
 * @param classification Object whose `category` selects the extra bullets.
 * @returns The capabilities section as Markdown text.
 */
function getToolDescriptions(classification) {
    // Each category block starts on a new line right after the general bullets.
    const bulletBlock = (...bullets) => ['', ...bullets].join('\n');
    const general = [
        '## Available Capabilities',
        '',
        'You have access to these capabilities:',
        '- **File Operations**: Read, write, create, and modify files',
        '- **Shell Execution**: Run commands in bash/shell',
        '- **Code Generation**: Write code in multiple languages',
        '- **Analysis**: Understand and analyze code, logs, and data',
    ].join('\n');
    const extrasByCategory = {
        'sysadmin': bulletBlock(
            '- **System Administration**: Configure services, manage processes, networking',
            '- **Package Management**: Install/update packages via apt, yum, pip, npm',
            '- **Service Control**: systemctl, journalctl for service management',
        ),
        'security': bulletBlock(
            '- **Security Analysis**: Identify vulnerabilities, audit code',
            '- **Cryptography**: Hash, encrypt, decrypt, certificate management',
            '- **Secret Management**: Handle credentials securely',
        ),
        'ml-training': bulletBlock(
            '- **ML Frameworks**: PyTorch, TensorFlow, scikit-learn, transformers',
            '- **Data Processing**: pandas, numpy, dataset handling',
            '- **GPU Operations**: CUDA, model training, inference',
        ),
        'debugging': bulletBlock(
            '- **Debugging Tools**: Stack traces, logging, profiling',
            '- **Version Management**: git, conda, pip, dependency resolution',
            '- **Error Analysis**: Identify root causes, propose fixes',
        ),
        'coding': bulletBlock(
            '- **Code Quality**: Linting, formatting, type checking',
            '- **Design Patterns**: Implement standard patterns correctly',
            '- **Testing**: Write and run tests, verify behavior',
        ),
        'testing': bulletBlock(
            '- **Test Frameworks**: vitest, jest, pytest, mocha',
            '- **Coverage Analysis**: Measure and improve test coverage',
            '- **Mocking**: Create mocks, stubs, spies for isolation',
        ),
    };
    const extras = extrasByCategory[classification.category];
    return extras ? general + extras : general;
}
67
/**
 * Produce the "Guidelines" section of the prompt.
 *
 * The four core principles are always present. When
 * `classification.category` names a known category, a category heading with
 * five guideline bullets is appended.
 *
 * @param classification Object whose `category` selects the extra guidelines.
 * @returns The guidelines section as Markdown text.
 */
function getSystemPrompt(classification) {
    // A category section: newline, heading, then its bullets, one per line.
    const section = (heading, ...bullets) => ['', heading, ...bullets].join('\n');
    const corePrinciples = [
        '## Guidelines',
        '',
        '### Core Principles',
        '1. **State assumptions explicitly** before writing code',
        '2. **Handle edge cases** - empty inputs, null values, errors',
        '3. **Verify your solution** works before reporting success',
        '4. **Follow existing patterns** in the codebase',
    ].join('\n');
    const guidelinesByCategory = {
        'sysadmin': section(
            '### System Administration Guidelines',
            '- Use modern commands: `ip` over `ifconfig`, `ss` over `netstat`',
            '- Check service status with `systemctl status` before changes',
            '- Backup configs before modifying: `cp file file.bak`',
            '- Use `journalctl -u service` for service logs',
            '- Parallel builds: `make -j$(nproc)`',
        ),
        'security': section(
            '### Security Guidelines',
            '- NEVER log sensitive data (passwords, tokens, keys)',
            '- Use parameterized queries, never string concatenation',
            '- Validate ALL user input before processing',
            '- Research CVE details before attempting exploits',
            '- Use secure defaults (HTTPS, strong hashing)',
        ),
        'ml-training': section(
            '### ML Training Guidelines',
            '- Start with smaller models for faster iteration',
            '- Cache datasets to avoid repeated downloads',
            '- Use `CUDA_VISIBLE_DEVICES` for GPU selection',
            '- Monitor memory usage during training',
            '- Save checkpoints periodically',
        ),
        'debugging': section(
            '### Debugging Guidelines',
            '- Reproduce the error before attempting fixes',
            '- Check logs and stack traces carefully',
            '- Use `pip check` / `conda list` for dependency issues',
            '- Use `git reflog` to recover lost work',
            '- Add verbose flags (-v, --debug) for more info',
        ),
        'coding': section(
            '### Coding Guidelines',
            '- Follow existing code style and patterns',
            '- Write self-documenting code with clear names',
            '- Include JSDoc/docstrings for public APIs',
            '- Handle errors explicitly with try/catch',
            '- Export types alongside implementations',
        ),
        'testing': section(
            '### Testing Guidelines',
            '- Test edge cases: empty, null, undefined',
            '- Use mocks for external dependencies',
            '- One assertion per test when possible',
            '- Name tests descriptively: "should X when Y"',
            '- Run tests before committing',
        ),
    };
    const extra = guidelinesByCategory[classification.category];
    return extra ? corePrinciples + extra : corePrinciples;
}
124
/**
 * Produce the closing "critical reminders" section of the prompt.
 *
 * Contents, in order: an optional time warning (when `options.timeRemaining`
 * is defined and below 60000 ms), an optional retry notice (when
 * `options.attemptNumber` is above 1, listing at most three previous errors),
 * the category-specific final checks, and the universal pre-submit checklist.
 *
 * @param classification Object whose `category` selects the final checks.
 * @param options Object with optional `timeRemaining`, `attemptNumber`, `previousErrors`.
 * @returns The reminders section; lines are joined with newlines.
 */
function getSystemNotification(classification, options) {
    const lines = ['## ⚠️ CRITICAL REMINDERS'];
    const asBullet = (text) => `- ${text}`;
    // Time warning if relevant
    const timeIsShort = options.timeRemaining !== undefined && options.timeRemaining < 60000;
    if (timeIsShort) {
        lines.push(
            `\n**TIME WARNING**: Only ${Math.round(options.timeRemaining / 1000)}s remaining!`,
            '- Focus on completing the core requirement',
            '- Skip optional optimizations',
        );
    }
    // Attempt warning
    if (options.attemptNumber && options.attemptNumber > 1) {
        lines.push(`\n**ATTEMPT ${options.attemptNumber}**: Previous attempts failed.`);
        const earlierErrors = options.previousErrors;
        if (earlierErrors && earlierErrors.length > 0) {
            lines.push('\n**Previous errors to fix:**');
            // Only the first three errors are repeated.
            lines.push(...earlierErrors.slice(0, 3).map(asBullet));
        }
    }
    // Category-specific critical reminders
    const remindersByCategory = {
        'sysadmin': [
            'Verify service is running after changes',
            'Check firewall rules if network issues',
        ],
        'security': [
            'Never expose secrets in output or logs',
            'Sanitize all external input',
        ],
        'ml-training': [
            'Check GPU memory before large models',
            'Verify dataset paths exist',
        ],
        'debugging': [
            'Identify root cause, not just symptoms',
            'Test fix actually resolves the issue',
        ],
        'coding': [
            'Return ONLY the code requested',
            'Include all necessary imports',
        ],
        'testing': [
            'Ensure tests actually run assertions',
            'Mock external dependencies',
        ],
    };
    const finalChecks = remindersByCategory[classification.category] || [];
    if (finalChecks.length > 0) {
        lines.push('\n**Final checks:**');
        for (const check of finalChecks) {
            lines.push(asBullet(check));
        }
    }
    // Universal final reminder
    lines.push(
        '\n**Before submitting:**',
        '- Verify solution compiles/runs',
        '- Check all requirements are met',
        '- Handle edge cases explicitly',
    );
    return lines.join('\n');
}
186
+ /**
187
+ * Build environment bootstrap prompt (gather system info)
+ *
+ * Takes no arguments and always returns the same text: a Markdown section
+ * with a bash snippet that prints system, OS, tool, disk, memory, working
+ * directory and git information, followed by notes on how to use it.
188
+ */
189
+ export function buildEnvironmentBootstrap() {
190
+ return `## Environment Discovery
191
+
192
+ Run these commands to understand the environment:
193
+
194
+ \`\`\`bash
195
+ # System info
196
+ echo "=== SYSTEM ===" && uname -a
197
+ echo "=== OS ===" && cat /etc/os-release 2>/dev/null | head -5
198
+
199
+ # Available tools
200
+ echo "=== TOOLS ===" && which python python3 pip pip3 npm node go cargo 2>/dev/null
201
+
202
+ # Resources
203
+ echo "=== DISK ===" && df -h / 2>/dev/null
204
+ echo "=== MEM ===" && free -h 2>/dev/null
205
+
206
+ # Current context
207
+ echo "=== CWD ===" && pwd && ls -la
208
+ echo "=== GIT ===" && git status 2>/dev/null | head -5
209
+ \`\`\`
210
+
211
+ Use this information to:
212
+ 1. Choose appropriate tools (use what's available)
213
+ 2. Check resource constraints
214
+ 3. Understand the current state
215
+ `;
216
+ }
217
/**
 * Render the initial execution plan for a task.
 *
 * Steps are numbered from 1; the first step is marked "[>]" and every other
 * step "[ ]". A fixed instruction list follows the steps.
 *
 * @param _task Not used when building the text.
 * @param steps Plan steps in execution order.
 * @returns The "## Execution Plan" section, ending with a newline.
 */
export function buildPlanningPrompt(_task, steps) {
    const numberedSteps = steps.map((text, index) => {
        let marker = '[ ]';
        if (index === 0) {
            marker = '[>]';
        }
        return `${marker} ${index + 1}. ${text}`;
    });
    return [
        '## Execution Plan',
        '',
        numberedSteps.join('\n'),
        '',
        '**Instructions:**',
        '- Complete steps in order',
        '- Mark each step done after completion',
        '- If a step fails, debug before continuing',
        '- Update plan if new steps are discovered',
        '',
    ].join('\n');
}
236
/**
 * Render the plan with progress markers.
 *
 * Steps before index `completedSteps` are marked "[x]"; the step at that
 * index is marked "[>]" when `currentStepInProgress` is truthy; everything
 * else is marked "[ ]". A "completed/total" status line closes the section.
 *
 * @param steps Plan steps in execution order.
 * @param completedSteps Number of steps already finished.
 * @param currentStepInProgress Whether the next step has been started.
 * @returns The "## Progress Update" section, ending with a newline.
 */
export function updatePlanningPrompt(steps, completedSteps, currentStepInProgress) {
    const markerFor = (index) => {
        if (index < completedSteps) {
            return '[x]';
        }
        return index === completedSteps && currentStepInProgress ? '[>]' : '[ ]';
    };
    const renderedSteps = steps.map((text, index) => `${markerFor(index)} ${index + 1}. ${text}`);
    return [
        '## Progress Update',
        '',
        renderedSteps.join('\n'),
        '',
        `**Status:** ${completedSteps}/${steps.length} steps completed`,
        '',
    ].join('\n');
}
260
+ //# sourceMappingURL=hierarchical-prompting.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"hierarchical-prompting.js","sourceRoot":"","sources":["../../src/benchmarks/hierarchical-prompting.ts"],"names":[],"mappings":"AAAA;;;;;;;GAOG;AAYH;;GAEG;AACH,MAAM,UAAU,uBAAuB,CACrC,eAAuB,EACvB,cAAkC,EAClC,aAAqB,EACrB,UAII,EAAE;IAEN,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,uDAAuD;IACvD,QAAQ,CAAC,IAAI,CAAC,mBAAmB,CAAC,cAAc,CAAC,CAAC,CAAC;IAEnD,iDAAiD;IACjD,QAAQ,CAAC,IAAI,CAAC,eAAe,CAAC,cAAc,CAAC,CAAC,CAAC;IAE/C,+CAA+C;IAC/C,IAAI,aAAa,EAAE,CAAC;QAClB,QAAQ,CAAC,IAAI,CAAC,wBAAwB,aAAa,EAAE,CAAC,CAAC;IACzD,CAAC;IAED,uBAAuB;IACvB,QAAQ,CAAC,IAAI,CAAC,cAAc,eAAe,EAAE,CAAC,CAAC;IAE/C,oEAAoE;IACpE,QAAQ,CAAC,IAAI,CAAC,qBAAqB,CAAC,cAAc,EAAE,OAAO,CAAC,CAAC,CAAC;IAE9D,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAS,mBAAmB,CAAC,cAAkC;IAC7D,MAAM,SAAS,GAAG;;;;;;4DAMwC,CAAC;IAE3D,MAAM,aAAa,GAA2B;QAC5C,UAAU,EAAE;;;oEAGoD;QAEhE,UAAU,EAAE;;;qDAGqC;QAEjD,aAAa,EAAE;;;sDAGmC;QAElD,WAAW,EAAE;;;0DAGyC;QAEtD,QAAQ,EAAE;;;oDAGsC;QAEhD,SAAS,EAAE;;;wDAGyC;KACrD,CAAC;IAEF,OAAO,SAAS,GAAG,CAAC,aAAa,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;AACpE,CAAC;AAED;;GAEG;AACH,SAAS,eAAe,CAAC,cAAkC;IACzD,MAAM,UAAU,GAAG;;;;;;gDAM2B,CAAC;IAE/C,MAAM,kBAAkB,GAA2B;QACjD,UAAU,EAAE;;;;;;uCAMuB;QAEnC,UAAU,EAAE;;;;;;8CAM8B;QAE1C,aAAa,EAAE;;;;;;gCAMa;QAE5B,WAAW,EAAE;;;;;;gDAM+B;QAE5C,QAAQ,EAAE;;;;;;yCAM2B;QAErC,SAAS,EAAE;;;;;;8BAMe;KAC3B,CAAC;IAEF,OAAO,UAAU,GAAG,CAAC,kBAAkB,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC,CAAC;AAC1E,CAAC;AAED;;GAEG;AACH,SAAS,qBAAqB,CAC5B,cAAkC,EAClC,OAIC;IAED,MAAM,aAAa,GAAa,CAAC,0BAA0B,CAAC,CAAC;IAE7D,2BAA2B;IAC3B,IAAI,OAAO,CAAC,aAAa,KAAK,SAAS,IAAI,OAAO,CAAC,aAAa,GAAG,KAAK,EAAE,CAAC;QACzE,aAAa,CAAC,IAAI,CAAC,4BAA4B,IAAI,CAAC,KAAK,CAAC,OAAO,CAAC,aAAa,GAAG,IAAI,CAAC,cAAc,CAAC,CAAC;QACvG,aAAa,CAAC,IAAI,CAAC,4CAA4C,CAAC,CAAC;QACjE,aAAa,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;IACtD,CAAC;IAED,kBAAkB;IAClB,IAAI,OAAO,CAAC,aAAa,IAAI,OAAO,CAAC,aAAa,GAAG,CAAC,EAAE,CAAC;QACvD,aAAa,CAAC,IAAI,CAAC,eAAe,OAAO,CAAC,aAAa,+BAA+B,CAAC,CAAC;QAExF,IAAI,OAAO,CAAC,cAAc,IAAI,OAAO,CAAC,cAAc,C
AAC,MAAM,GAAG,CAAC,EAAE,CAAC;YAChE,aAAa,CAAC,IAAI,CAAC,+BAA+B,CAAC,CAAC;YACpD,KAAK,MAAM,KAAK,IAAI,OAAO,CAAC,cAAc,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;gBACvD,aAAa,CAAC,IAAI,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;YACnC,CAAC;QACH,CAAC;IACH,CAAC;IAED,uCAAuC;IACvC,MAAM,iBAAiB,GAA6B;QAClD,UAAU,EAAE;YACV,yCAAyC;YACzC,wCAAwC;SACzC;QACD,UAAU,EAAE;YACV,wCAAwC;YACxC,6BAA6B;SAC9B;QACD,aAAa,EAAE;YACb,sCAAsC;YACtC,4BAA4B;SAC7B;QACD,WAAW,EAAE;YACX,wCAAwC;YACxC,sCAAsC;SACvC;QACD,QAAQ,EAAE;YACR,gCAAgC;YAChC,+BAA+B;SAChC;QACD,SAAS,EAAE;YACT,sCAAsC;YACtC,4BAA4B;SAC7B;KACF,CAAC;IAEF,MAAM,SAAS,GAAG,iBAAiB,CAAC,cAAc,CAAC,QAAQ,CAAC,IAAI,EAAE,CAAC;IACnE,IAAI,SAAS,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzB,aAAa,CAAC,IAAI,CAAC,qBAAqB,CAAC,CAAC;QAC1C,KAAK,MAAM,QAAQ,IAAI,SAAS,EAAE,CAAC;YACjC,aAAa,CAAC,IAAI,CAAC,KAAK,QAAQ,EAAE,CAAC,CAAC;QACtC,CAAC;IACH,CAAC;IAED,2BAA2B;IAC3B,aAAa,CAAC,IAAI,CAAC,0BAA0B,CAAC,CAAC;IAC/C,aAAa,CAAC,IAAI,CAAC,iCAAiC,CAAC,CAAC;IACtD,aAAa,CAAC,IAAI,CAAC,kCAAkC,CAAC,CAAC;IACvD,aAAa,CAAC,IAAI,CAAC,gCAAgC,CAAC,CAAC;IAErD,OAAO,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,yBAAyB;IACvC,OAAO;;;;;;;;;;;;;;;;;;;;;;;;;CAyBR,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,mBAAmB,CAAC,KAAa,EAAE,KAAe;IAChE,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACjC,MAAM,MAAM,GAAG,CAAC,KAAK,CAAC,CAAC,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,KAAK,CAAC;QACvC,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO;;EAEP,IAAI;;;;;;;CAOL,CAAC;AACF,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,oBAAoB,CAClC,KAAe,EACf,cAAsB,EACtB,qBAA8B;IAE9B,MAAM,IAAI,GAAG,KAAK,CAAC,GAAG,CAAC,CAAC,IAAI,EAAE,CAAC,EAAE,EAAE;QACjC,IAAI,MAAc,CAAC;QACnB,IAAI,CAAC,GAAG,cAAc,EAAE,CAAC;YACvB,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,IAAI,CAAC,KAAK,cAAc,IAAI,qBAAqB,EAAE,CAAC;YACzD,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;aAAM,CAAC;YACN,MAAM,GAAG,KAAK,CAAC;QACjB,CAAC;QACD,OAAO,GAAG,MAAM,IAAI,CAAC,GAAG,CAAC,KAAK,IAAI,EAAE,CAAC;IACvC,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;IAEd,OAAO
;;EAEP,IAAI;;cAEQ,cAAc,IAAI,KAAK,CAAC,MAAM;CAC3C,CAAC;AACF,CAAC"}