universal-agent-memory 0.6.2 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
- package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/naive-agent.js +144 -0
- package/dist/benchmarks/agents/naive-agent.js.map +1 -0
- package/dist/benchmarks/agents/uam-agent.d.ts +167 -0
- package/dist/benchmarks/agents/uam-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/uam-agent.js +386 -0
- package/dist/benchmarks/agents/uam-agent.js.map +1 -0
- package/dist/benchmarks/benchmark.d.ts +328 -0
- package/dist/benchmarks/benchmark.d.ts.map +1 -0
- package/dist/benchmarks/benchmark.js +104 -0
- package/dist/benchmarks/benchmark.js.map +1 -0
- package/dist/benchmarks/execution-verifier.d.ts +41 -0
- package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
- package/dist/benchmarks/execution-verifier.js +342 -0
- package/dist/benchmarks/execution-verifier.js.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.js +260 -0
- package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
- package/dist/benchmarks/improved-benchmark.d.ts +88 -0
- package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
- package/dist/benchmarks/improved-benchmark.js +533 -0
- package/dist/benchmarks/improved-benchmark.js.map +1 -0
- package/dist/benchmarks/index.d.ts +10 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +10 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
- package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
- package/dist/benchmarks/multi-turn-agent.js +235 -0
- package/dist/benchmarks/multi-turn-agent.js.map +1 -0
- package/dist/benchmarks/runner.d.ts +2 -0
- package/dist/benchmarks/runner.d.ts.map +1 -0
- package/dist/benchmarks/runner.js +2 -0
- package/dist/benchmarks/runner.js.map +1 -0
- package/dist/benchmarks/tasks.d.ts +19 -0
- package/dist/benchmarks/tasks.d.ts.map +1 -0
- package/dist/benchmarks/tasks.js +371 -0
- package/dist/benchmarks/tasks.js.map +1 -0
- package/dist/index.d.ts +14 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +11 -0
- package/dist/index.js.map +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -1
- package/dist/memory/backends/qdrant-cloud.js +6 -4
- package/dist/memory/backends/qdrant-cloud.js.map +1 -1
- package/dist/memory/context-compressor.d.ts +66 -0
- package/dist/memory/context-compressor.d.ts.map +1 -0
- package/dist/memory/context-compressor.js +250 -0
- package/dist/memory/context-compressor.js.map +1 -0
- package/dist/memory/dynamic-retrieval.d.ts +26 -0
- package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
- package/dist/memory/dynamic-retrieval.js +378 -0
- package/dist/memory/dynamic-retrieval.js.map +1 -0
- package/dist/memory/embeddings.d.ts +93 -0
- package/dist/memory/embeddings.d.ts.map +1 -0
- package/dist/memory/embeddings.js +391 -0
- package/dist/memory/embeddings.js.map +1 -0
- package/dist/memory/hierarchical-memory.d.ts +116 -0
- package/dist/memory/hierarchical-memory.d.ts.map +1 -0
- package/dist/memory/hierarchical-memory.js +299 -0
- package/dist/memory/hierarchical-memory.js.map +1 -0
- package/dist/memory/memory-consolidator.d.ts +88 -0
- package/dist/memory/memory-consolidator.d.ts.map +1 -0
- package/dist/memory/memory-consolidator.js +348 -0
- package/dist/memory/memory-consolidator.js.map +1 -0
- package/dist/memory/speculative-cache.d.ts +89 -0
- package/dist/memory/speculative-cache.d.ts.map +1 -0
- package/dist/memory/speculative-cache.js +259 -0
- package/dist/memory/speculative-cache.js.map +1 -0
- package/dist/memory/task-classifier.d.ts +33 -0
- package/dist/memory/task-classifier.d.ts.map +1 -0
- package/dist/memory/task-classifier.js +277 -0
- package/dist/memory/task-classifier.js.map +1 -0
- package/dist/utils/rate-limiter.d.ts +62 -0
- package/dist/utils/rate-limiter.d.ts.map +1 -0
- package/dist/utils/rate-limiter.js +150 -0
- package/dist/utils/rate-limiter.js.map +1 -0
- package/dist/utils/validate-json.d.ts +52 -0
- package/dist/utils/validate-json.d.ts.map +1 -0
- package/dist/utils/validate-json.js +99 -0
- package/dist/utils/validate-json.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
/**
 * Terminal-Bench Adapter for UAM - Index
 *
 * Exports everything needed for benchmarking.
 *
 * Barrel module: re-exports every public name of the benchmark definitions,
 * the two agent implementations (naive and UAM) and the task catalogue, so
 * consumers can import from this single entry point.
 */
export * from './benchmark.js';
export * from './agents/naive-agent.js';
export * from './agents/uam-agent.js';
export * from './tasks.js';
|
|
10
|
+
//# sourceMappingURL=index.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/benchmarks/index.ts"],"names":[],"mappings":"AAAA;;;;GAIG;AAEH,cAAc,gBAAgB,CAAC;AAC/B,cAAc,yBAAyB,CAAC;AACxC,cAAc,uBAAuB,CAAC;AACtC,cAAc,YAAY,CAAC"}
|
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-Turn Agent Loop for UAM Benchmarks
|
|
3
|
+
*
|
|
4
|
+
* Implements iterative refinement with error feedback.
|
|
5
|
+
* Based on Droid's approach: explore, act, verify, retry.
|
|
6
|
+
*/
|
|
7
|
+
import { type VerificationResult } from './execution-verifier.js';
|
|
8
|
+
/**
 * Record of a single attempt (turn) made by the multi-turn agent loop.
 */
export interface AgentTurn {
    /** 1-based index of this turn within the loop. */
    turnNumber: number;
    /** Prompt sent for this turn (the loop may store a truncated copy). */
    prompt: string;
    /** Raw agent output; empty string when the agent invocation itself failed. */
    response: string;
    /** Outcome of verifying `response` for the task. */
    verification: VerificationResult;
    /** Optional feedback derived from a failed verification, included in the next turn's prompt. */
    feedback?: string;
    /** Wall-clock time spent on this turn, in milliseconds. */
    durationMs: number;
}
|
|
16
|
+
/**
 * Aggregate outcome of running one task through the multi-turn loop.
 */
export interface MultiTurnResult {
    /** True when some turn passed verification. */
    success: boolean;
    /** Number of turns attempted before the loop stopped. */
    totalTurns: number;
    /** Per-turn records, in execution order. */
    turns: AgentTurn[];
    /** Response of the passing turn, or of the last recorded turn (possibly '') on failure. */
    finalResponse: string;
    /** Wall-clock time for the whole task, in milliseconds. */
    totalDurationMs: number;
    /** True when memory context retrieval completed without throwing. */
    memoryContextUsed: boolean;
}
|
|
24
|
+
/**
 * Settings for the multi-turn loop. Callers pass a `Partial` of this and
 * missing fields fall back to the module's defaults.
 */
export interface MultiTurnConfig {
    /** Maximum number of attempts per task. */
    maxTurns: number;
    /** Time limit for each agent CLI invocation, in milliseconds. */
    timeout: number;
    /** Model identifier forwarded to the agent CLI via `--model`. */
    model: string;
    /** API key exposed to the agent CLI as `FACTORY_API_KEY`. */
    apiKey: string;
    /** Whether to retrieve dynamic memory context before the first turn. */
    useMemory: boolean;
    /** Project root handed to the memory retrieval step. */
    projectRoot: string;
    /** When true, progress is logged to the console. */
    verbose: boolean;
}
|
|
33
|
+
/**
 * Execute task with multi-turn refinement.
 *
 * Runs the task prompt through the agent, verifies the response, and retries
 * with feedback until verification passes or the turn budget is exhausted.
 *
 * @param taskId - Identifier of the benchmark task, used for verification.
 * @param taskPrompt - The task description given to the agent.
 * @param config - Optional overrides for the default loop settings.
 * @returns The aggregated result, including every recorded turn.
 */
export declare function executeWithMultiTurn(taskId: string, taskPrompt: string, config?: Partial<MultiTurnConfig>): Promise<MultiTurnResult>;
|
|
37
|
+
/**
 * Execute a batch of tasks with multi-turn.
 *
 * Tasks are processed one after another (not in parallel).
 *
 * @param tasks - Tasks to run; each needs an `id` and a `prompt`.
 * @param config - Optional overrides applied to every task in the batch.
 * @returns A map from task id to that task's result.
 */
export declare function executeBatchWithMultiTurn(tasks: Array<{
    id: string;
    prompt: string;
}>, config?: Partial<MultiTurnConfig>): Promise<Map<string, MultiTurnResult>>;
|
|
44
|
+
//# sourceMappingURL=multi-turn-agent.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-agent.d.ts","sourceRoot":"","sources":["../../src/benchmarks/multi-turn-agent.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAKH,OAAO,EAAuB,KAAK,kBAAkB,EAAE,MAAM,yBAAyB,CAAC;AAGvF,MAAM,WAAW,SAAS;IACxB,UAAU,EAAE,MAAM,CAAC;IACnB,MAAM,EAAE,MAAM,CAAC;IACf,QAAQ,EAAE,MAAM,CAAC;IACjB,YAAY,EAAE,kBAAkB,CAAC;IACjC,QAAQ,CAAC,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;CACpB;AAED,MAAM,WAAW,eAAe;IAC9B,OAAO,EAAE,OAAO,CAAC;IACjB,UAAU,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,SAAS,EAAE,CAAC;IACnB,aAAa,EAAE,MAAM,CAAC;IACtB,eAAe,EAAE,MAAM,CAAC;IACxB,iBAAiB,EAAE,OAAO,CAAC;CAC5B;AAED,MAAM,WAAW,eAAe;IAC9B,QAAQ,EAAE,MAAM,CAAC;IACjB,OAAO,EAAE,MAAM,CAAC;IAChB,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,OAAO,CAAC;IACnB,WAAW,EAAE,MAAM,CAAC;IACpB,OAAO,EAAE,OAAO,CAAC;CAClB;AAYD;;GAEG;AACH,wBAAsB,oBAAoB,CACxC,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,EAClB,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM,GACpC,OAAO,CAAC,eAAe,CAAC,CAiH1B;AA2HD;;GAEG;AACH,wBAAsB,yBAAyB,CAC7C,KAAK,EAAE,KAAK,CAAC;IAAE,EAAE,EAAE,MAAM,CAAC;IAAC,MAAM,EAAE,MAAM,CAAA;CAAE,CAAC,EAC5C,MAAM,GAAE,OAAO,CAAC,eAAe,CAAM,GACpC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,eAAe,CAAC,CAAC,CAYvC"}
|
|
@@ -0,0 +1,235 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-Turn Agent Loop for UAM Benchmarks
|
|
3
|
+
*
|
|
4
|
+
* Implements iterative refinement with error feedback.
|
|
5
|
+
* Based on Droid's approach: explore, act, verify, retry.
|
|
6
|
+
*/
|
|
7
|
+
import { execFileSync, execSync } from 'child_process';
import { existsSync, mkdirSync, rmSync, writeFileSync } from 'fs';
import { join } from 'path';
import { verifyBenchmarkTask } from './execution-verifier.js';
import { retrieveDynamicMemoryContext } from '../memory/dynamic-retrieval.js';
|
|
12
|
+
/**
 * Baseline settings for the multi-turn loop. Caller-supplied options are
 * spread on top of these, so any field left out by the caller takes the
 * value below.
 */
const DEFAULT_CONFIG = {
    // Up to three attempts per task.
    maxTurns: 3,
    // Five minutes, in milliseconds, for each agent CLI invocation.
    timeout: 5 * 60 * 1000,
    model: 'claude-opus-4-5-20251101',
    // Empty by default; callers are expected to provide a key.
    apiKey: '',
    useMemory: true,
    // Captured once, when this module is first loaded.
    projectRoot: process.cwd(),
    verbose: false,
};
|
|
21
|
+
/**
 * Execute task with multi-turn refinement.
 *
 * Flow: optionally retrieve memory context once, then for up to
 * `maxTurns` attempts build a prompt, run it through the droid CLI, and
 * verify the response. The first response that passes verification ends the
 * loop; otherwise feedback from the failed attempt is attached to the turn
 * so the next prompt can reference it.
 *
 * @param taskId - Benchmark task identifier passed to the verifier.
 * @param taskPrompt - Task description sent to the agent on every turn.
 * @param config - Partial overrides merged over DEFAULT_CONFIG.
 * @returns Result with `success`, every recorded turn, the final response
 *   ('' if the last turn produced none) and timing information.
 */
export async function executeWithMultiTurn(taskId, taskPrompt, config = {}) {
    const cfg = { ...DEFAULT_CONFIG, ...config };
    const startTime = Date.now();
    const turns = [];
    let memoryContextUsed = false;
    // Get memory context if enabled. Retrieval is best-effort: a failure is
    // only logged (when verbose) and the task proceeds without context.
    let memoryContext = '';
    if (cfg.useMemory) {
        try {
            const dynamicContext = await retrieveDynamicMemoryContext(taskPrompt, cfg.projectRoot);
            memoryContext = dynamicContext.formattedContext;
            memoryContextUsed = true;
            if (cfg.verbose) {
                console.log(`  [Memory] Retrieved ${dynamicContext.relevantMemories.length} memories`);
                console.log(`  [Memory] Category: ${dynamicContext.classification.category}`);
            }
        }
        catch (error) {
            if (cfg.verbose) {
                console.log(`  [Memory] Failed to retrieve context: ${error}`);
            }
        }
    }
    for (let turn = 1; turn <= cfg.maxTurns; turn++) {
        const turnStartTime = Date.now();
        if (cfg.verbose) {
            console.log(`  [Turn ${turn}/${cfg.maxTurns}] Executing...`);
        }
        // Build prompt with memory and feedback from the previous turn
        const fullPrompt = buildPromptForTurn(turn, taskPrompt, memoryContext, turns);
        // Execute via droid CLI
        let response = '';
        try {
            response = await executeDroidPrompt(fullPrompt, cfg.model, cfg.apiKey, cfg.timeout);
        }
        catch (error) {
            turns.push({
                turnNumber: turn,
                prompt: fullPrompt,
                response: '',
                verification: {
                    success: false,
                    executionSucceeded: false,
                    testsRun: 0,
                    testsPassed: 0,
                    errors: [`Execution failed: ${error}`],
                    output: '',
                    executionTimeMs: 0,
                },
                // Give the next turn something concrete to refer to. The raw
                // error text is deliberately NOT echoed into the prompt: it
                // can contain the failed command line and environment details.
                feedback: '- The previous attempt produced no response (agent execution failed or timed out)',
                durationMs: Date.now() - turnStartTime,
            });
            if (cfg.verbose) {
                console.log(`  [Turn ${turn}] Failed: ${error}`);
            }
            continue;
        }
        // Verify the response
        const verification = await verifyBenchmarkTask(taskId, response);
        const agentTurn = {
            turnNumber: turn,
            // Only a 500-character preview of the prompt is kept for verified turns
            prompt: fullPrompt.slice(0, 500) + '...',
            response: response,
            verification,
            durationMs: Date.now() - turnStartTime,
        };
        if (cfg.verbose) {
            console.log(`  [Turn ${turn}] Verification: ${verification.success ? 'PASS' : 'FAIL'}`);
            if (!verification.success && verification.errors.length > 0) {
                console.log(`  [Turn ${turn}] Errors: ${verification.errors.slice(0, 2).join(', ')}`);
            }
        }
        // If successful, we're done
        if (verification.success) {
            turns.push(agentTurn);
            return {
                success: true,
                totalTurns: turn,
                turns,
                finalResponse: response,
                totalDurationMs: Date.now() - startTime,
                memoryContextUsed,
            };
        }
        // Generate feedback for next turn
        agentTurn.feedback = generateFeedback(verification);
        turns.push(agentTurn);
    }
    // All turns exhausted without success
    return {
        success: false,
        // Report the turns actually recorded, which stays correct even when
        // maxTurns is zero or negative and the loop never ran.
        totalTurns: turns.length,
        turns,
        finalResponse: turns[turns.length - 1]?.response || '',
        totalDurationMs: Date.now() - startTime,
        memoryContextUsed,
    };
}
|
|
125
|
+
/**
 * Assemble the prompt text for one turn of the loop.
 *
 * Section order: memory context (first turn only, when non-empty), the task
 * prompt, the previous turn's feedback (retries only, when it has any), and
 * a closing reminder block (retries only). Sections are separated by a
 * blank line.
 *
 * @param turn - 1-based turn number.
 * @param taskPrompt - The task description.
 * @param memoryContext - Pre-formatted memory context, or '' for none.
 * @param previousTurns - Turns recorded so far; only the last one is read.
 * @returns The full prompt string.
 */
function buildPromptForTurn(turn, taskPrompt, memoryContext, previousTurns) {
    const isRetry = turn > 1;
    // Memory context leads the prompt, and only on the very first turn.
    const parts = memoryContext && turn === 1 ? [memoryContext, taskPrompt] : [taskPrompt];
    if (isRetry && previousTurns.length > 0) {
        const { feedback } = previousTurns[previousTurns.length - 1];
        if (feedback) {
            parts.push(`
## Previous Attempt Feedback

Your previous response did not pass verification.

**Issues found:**
${feedback}

**Instructions:**
- Review the issues above carefully
- Fix the specific problems mentioned
- Ensure your response addresses ALL requirements
- Do not repeat the same mistakes

Please provide a corrected solution.
`);
        }
    }
    // The reminder block goes last so it is the final thing the agent reads.
    if (isRetry) {
        parts.push(`
## CRITICAL REMINDERS
- This is attempt ${turn} - previous attempts failed
- Focus on fixing the specific errors mentioned above
- Verify your solution handles edge cases
- Return ONLY the corrected code
`);
    }
    return parts.join('\n\n');
}
|
|
170
|
+
/**
 * Turn a verification result into a bullet list of feedback lines.
 *
 * Emits, in order: a line when execution did not succeed, a line with the
 * failed/total test count when some tests failed, and up to five of the
 * reported errors. If nothing applies, a single generic line is returned.
 *
 * @param verification - Verification result for the failed attempt.
 * @returns Newline-separated bullet lines, each starting with "- ".
 */
function generateFeedback(verification) {
    const { executionSucceeded, testsRun, testsPassed, errors } = verification;
    const bullets = [];
    if (!executionSucceeded) {
        bullets.push('- Code failed to compile or execute');
    }
    if (testsRun > 0 && testsPassed < testsRun) {
        const failedCount = testsRun - testsPassed;
        bullets.push(`- ${failedCount}/${testsRun} test cases failed`);
    }
    // Cap the error list at five entries to keep the prompt short.
    bullets.push(...errors.slice(0, 5).map((message) => `- ${message}`));
    return bullets.length > 0
        ? bullets.join('\n')
        : '- Response did not meet the expected requirements';
}
|
|
189
|
+
/**
 * Execute prompt via droid CLI.
 *
 * Writes the prompt to a temporary file under /tmp/uam-benchmark, runs
 * `droid exec --model <model> --auto medium -f <file>` and returns its
 * trimmed stdout. The temporary file is removed whether or not the
 * command succeeds.
 *
 * The CLI is spawned directly with an argument array (no shell), so neither
 * `model` nor the file path can be interpreted as shell syntax, and the API
 * key is passed only through the child's environment so it never shows up
 * on the command line or in "Command failed: ..." error messages.
 *
 * @param prompt - Full prompt text for the agent.
 * @param model - Model identifier for `--model`.
 * @param apiKey - Value for FACTORY_API_KEY; when empty, the inherited
 *   environment is passed through unchanged.
 * @param timeout - Time limit for the CLI call, in milliseconds.
 * @returns Trimmed stdout of the CLI.
 * @throws Whatever the spawn throws (missing binary, non-zero exit, timeout).
 */
async function executeDroidPrompt(prompt, model, apiKey, timeout) {
    const tmpDir = '/tmp/uam-benchmark';
    // pid + random suffix keep same-millisecond or concurrent invocations
    // from overwriting each other's prompt file.
    const uniqueSuffix = `${Date.now()}-${process.pid}-${Math.random().toString(36).slice(2, 10)}`;
    const promptFile = join(tmpDir, `prompt-${uniqueSuffix}.txt`);
    try {
        mkdirSync(tmpDir, { recursive: true });
        // Owner-only permissions: the prompt may contain project memory.
        writeFileSync(promptFile, prompt, { encoding: 'utf-8', mode: 0o600 });
        const result = execFileSync('droid', ['exec', '--model', model, '--auto', 'medium', '-f', promptFile], {
            encoding: 'utf-8',
            timeout,
            maxBuffer: 10 * 1024 * 1024,
            // Do not clobber an inherited FACTORY_API_KEY with an empty string.
            env: apiKey ? { ...process.env, FACTORY_API_KEY: apiKey } : process.env,
        });
        return result.trim();
    }
    finally {
        // Best-effort cleanup; never let it mask the real outcome.
        try {
            rmSync(promptFile, { force: true });
        }
        catch { }
    }
}
|
|
222
|
+
/**
 * Execute a batch of tasks with multi-turn.
 *
 * Tasks run strictly one at a time, in iteration order, with a one-second
 * pause between consecutive tasks. No pause is added after the final task,
 * so an empty or single-task batch returns without any artificial delay.
 *
 * @param tasks - Iterable of `{ id, prompt }` tasks.
 * @param config - Partial config forwarded unchanged to every task.
 * @returns Map from task id to its result; a repeated id keeps the result
 *   of the later task.
 */
export async function executeBatchWithMultiTurn(tasks, config = {}) {
    const results = new Map();
    let isFirstTask = true;
    for (const task of tasks) {
        // Small delay between tasks (skipped before the first one)
        if (!isFirstTask) {
            await new Promise((resolve) => setTimeout(resolve, 1000));
        }
        isFirstTask = false;
        const result = await executeWithMultiTurn(task.id, task.prompt, config);
        results.set(task.id, result);
    }
    return results;
}
|
|
235
|
+
//# sourceMappingURL=multi-turn-agent.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"multi-turn-agent.js","sourceRoot":"","sources":["../../src/benchmarks/multi-turn-agent.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAEH,OAAO,EAAE,QAAQ,EAAE,MAAM,eAAe,CAAC;AACzC,OAAO,EAAE,aAAa,EAAE,UAAU,EAAE,MAAM,IAAI,CAAC;AAC/C,OAAO,EAAE,IAAI,EAAE,MAAM,MAAM,CAAC;AAC5B,OAAO,EAAE,mBAAmB,EAA2B,MAAM,yBAAyB,CAAC;AACvF,OAAO,EAAE,4BAA4B,EAAE,MAAM,gCAAgC,CAAC;AA8B9E,MAAM,cAAc,GAAoB;IACtC,QAAQ,EAAE,CAAC;IACX,OAAO,EAAE,MAAM;IACf,KAAK,EAAE,0BAA0B;IACjC,MAAM,EAAE,EAAE;IACV,SAAS,EAAE,IAAI;IACf,WAAW,EAAE,OAAO,CAAC,GAAG,EAAE;IAC1B,OAAO,EAAE,KAAK;CACf,CAAC;AAEF;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,oBAAoB,CACxC,MAAc,EACd,UAAkB,EAClB,SAAmC,EAAE;IAErC,MAAM,GAAG,GAAG,EAAE,GAAG,cAAc,EAAE,GAAG,MAAM,EAAE,CAAC;IAC7C,MAAM,SAAS,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;IAC7B,MAAM,KAAK,GAAgB,EAAE,CAAC;IAC9B,IAAI,aAAa,GAAG,UAAU,CAAC;IAC/B,IAAI,iBAAiB,GAAG,KAAK,CAAC;IAE9B,gCAAgC;IAChC,IAAI,aAAa,GAAG,EAAE,CAAC;IACvB,IAAI,GAAG,CAAC,SAAS,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,MAAM,cAAc,GAAG,MAAM,4BAA4B,CAAC,UAAU,EAAE,GAAG,CAAC,WAAW,CAAC,CAAC;YACvF,aAAa,GAAG,cAAc,CAAC,gBAAgB,CAAC;YAChD,iBAAiB,GAAG,IAAI,CAAC;YAEzB,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,wBAAwB,cAAc,CAAC,gBAAgB,CAAC,MAAM,WAAW,CAAC,CAAC;gBACvF,OAAO,CAAC,GAAG,CAAC,wBAAwB,cAAc,CAAC,cAAc,CAAC,QAAQ,EAAE,CAAC,CAAC;YAChF,CAAC;QACH,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,0CAA0C,KAAK,EAAE,CAAC,CAAC;YACjE,CAAC;QACH,CAAC;IACH,CAAC;IAED,KAAK,IAAI,IAAI,GAAG,CAAC,EAAE,IAAI,IAAI,GAAG,CAAC,QAAQ,EAAE,IAAI,EAAE,EAAE,CAAC;QAChD,MAAM,aAAa,GAAG,IAAI,CAAC,GAAG,EAAE,CAAC;QAEjC,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,IAAI,GAAG,CAAC,QAAQ,gBAAgB,CAAC,CAAC;QAC/D,CAAC;QAED,wCAAwC;QACxC,MAAM,UAAU,GAAG,kBAAkB,CAAC,IAAI,EAAE,aAAa,EAAE,aAAa,EAAE,KAAK,CAAC,CAAC;QAEjF,wBAAwB;QACxB,IAAI,QAAQ,GAAG,EAAE,CAAC;QAClB,IAAI,CAAC;YACH,QAAQ,GAAG,MAAM,kBAAkB,CAAC,UAAU,EAAE,GAAG,CAAC,KAAK,EAAE,GAAG,CAAC,MAAM,EAAE,GAAG,CAAC,OAAO,CAAC,CAAC;QACtF,CAAC;QAAC,OAAO,KAAK,EAAE,CAAC;YACf,MAAM,SAAS,GAAc
;gBAC3B,UAAU,EAAE,IAAI;gBAChB,MAAM,EAAE,UAAU;gBAClB,QAAQ,EAAE,EAAE;gBACZ,YAAY,EAAE;oBACZ,OAAO,EAAE,KAAK;oBACd,kBAAkB,EAAE,KAAK;oBACzB,QAAQ,EAAE,CAAC;oBACX,WAAW,EAAE,CAAC;oBACd,MAAM,EAAE,CAAC,qBAAqB,KAAK,EAAE,CAAC;oBACtC,MAAM,EAAE,EAAE;oBACV,eAAe,EAAE,CAAC;iBACnB;gBACD,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;aACvC,CAAC;YACF,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAEtB,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;gBAChB,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,aAAa,KAAK,EAAE,CAAC,CAAC;YACnD,CAAC;YACD,SAAS;QACX,CAAC;QAED,sBAAsB;QACtB,MAAM,YAAY,GAAG,MAAM,mBAAmB,CAAC,MAAM,EAAE,QAAQ,CAAC,CAAC;QAEjE,MAAM,SAAS,GAAc;YAC3B,UAAU,EAAE,IAAI;YAChB,MAAM,EAAE,UAAU,CAAC,KAAK,CAAC,CAAC,EAAE,GAAG,CAAC,GAAG,KAAK;YACxC,QAAQ,EAAE,QAAQ;YAClB,YAAY;YACZ,UAAU,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,aAAa;SACvC,CAAC;QAEF,IAAI,GAAG,CAAC,OAAO,EAAE,CAAC;YAChB,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,mBAAmB,YAAY,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC;YACxF,IAAI,CAAC,YAAY,CAAC,OAAO,IAAI,YAAY,CAAC,MAAM,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;gBAC5D,OAAO,CAAC,GAAG,CAAC,WAAW,IAAI,aAAa,YAAY,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,CAAC,IAAI,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;YACxF,CAAC;QACH,CAAC;QAED,4BAA4B;QAC5B,IAAI,YAAY,CAAC,OAAO,EAAE,CAAC;YACzB,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAEtB,OAAO;gBACL,OAAO,EAAE,IAAI;gBACb,UAAU,EAAE,IAAI;gBAChB,KAAK;gBACL,aAAa,EAAE,QAAQ;gBACvB,eAAe,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;gBACvC,iBAAiB;aAClB,CAAC;QACJ,CAAC;QAED,kCAAkC;QAClC,SAAS,CAAC,QAAQ,GAAG,gBAAgB,CAAC,YAAY,CAAC,CAAC;QACpD,KAAK,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;QAEtB,4CAA4C;QAC5C,aAAa,GAAG,UAAU,CAAC;IAC7B,CAAC;IAED,sCAAsC;IACtC,OAAO;QACL,OAAO,EAAE,KAAK;QACd,UAAU,EAAE,GAAG,CAAC,QAAQ;QACxB,KAAK;QACL,aAAa,EAAE,KAAK,CAAC,KAAK,CAAC,MAAM,GAAG,CAAC,CAAC,EAAE,QAAQ,IAAI,EAAE;QACtD,eAAe,EAAE,IAAI,CAAC,GAAG,EAAE,GAAG,SAAS;QACvC,iBAAiB;KAClB,CAAC;AACJ,CAAC;AAED;;GAEG;AACH,SAAS,kBAAkB,CACzB,IAAY,EACZ,UAAkB,EAClB,aAAqB,EACrB,aAA0B;IAE1B,MAAM,QAAQ,GAAa,EAAE,CAAC;IAE9B,8DAA8D;IAC9D,IAAI,aAAa,IAAI,IAAI,KAAK,CAAC,EAAE,CAAC;QAChC,QAAQ,CAAC,IAAI,CAAC,aAAa,CAAC,CAAC;I
AC/B,CAAC;IAED,kBAAkB;IAClB,QAAQ,CAAC,IAAI,CAAC,UAAU,CAAC,CAAC;IAE1B,mCAAmC;IACnC,IAAI,IAAI,GAAG,CAAC,IAAI,aAAa,CAAC,MAAM,GAAG,CAAC,EAAE,CAAC;QACzC,MAAM,QAAQ,GAAG,aAAa,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC;QACzD,IAAI,QAAQ,CAAC,QAAQ,EAAE,CAAC;YACtB,QAAQ,CAAC,IAAI,CAAC;;;;;;EAMlB,QAAQ,CAAC,QAAQ;;;;;;;;;CASlB,CAAC,CAAC;QACC,CAAC;IACH,CAAC;IAED,4CAA4C;IAC5C,IAAI,IAAI,GAAG,CAAC,EAAE,CAAC;QACb,QAAQ,CAAC,IAAI,CAAC;;oBAEE,IAAI;;;;CAIvB,CAAC,CAAC;IACD,CAAC;IAED,OAAO,QAAQ,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;AAC/B,CAAC;AAED;;GAEG;AACH,SAAS,gBAAgB,CAAC,YAAgC;IACxD,MAAM,aAAa,GAAa,EAAE,CAAC;IAEnC,IAAI,CAAC,YAAY,CAAC,kBAAkB,EAAE,CAAC;QACrC,aAAa,CAAC,IAAI,CAAC,qCAAqC,CAAC,CAAC;IAC5D,CAAC;IAED,IAAI,YAAY,CAAC,QAAQ,GAAG,CAAC,IAAI,YAAY,CAAC,WAAW,GAAG,YAAY,CAAC,QAAQ,EAAE,CAAC;QAClF,aAAa,CAAC,IAAI,CAAC,KAAK,YAAY,CAAC,QAAQ,GAAG,YAAY,CAAC,WAAW,IAAI,YAAY,CAAC,QAAQ,oBAAoB,CAAC,CAAC;IACzH,CAAC;IAED,KAAK,MAAM,KAAK,IAAI,YAAY,CAAC,MAAM,CAAC,KAAK,CAAC,CAAC,EAAE,CAAC,CAAC,EAAE,CAAC;QACpD,aAAa,CAAC,IAAI,CAAC,KAAK,KAAK,EAAE,CAAC,CAAC;IACnC,CAAC;IAED,IAAI,aAAa,CAAC,MAAM,KAAK,CAAC,EAAE,CAAC;QAC/B,aAAa,CAAC,IAAI,CAAC,mDAAmD,CAAC,CAAC;IAC1E,CAAC;IAED,OAAO,aAAa,CAAC,IAAI,CAAC,IAAI,CAAC,CAAC;AAClC,CAAC;AAED;;GAEG;AACH,KAAK,UAAU,kBAAkB,CAC/B,MAAc,EACd,KAAa,EACb,MAAc,EACd,OAAe;IAEf,MAAM,MAAM,GAAG,oBAAoB,CAAC;IACpC,MAAM,UAAU,GAAG,IAAI,CAAC,MAAM,EAAE,UAAU,IAAI,CAAC,GAAG,EAAE,MAAM,CAAC,CAAC;IAE5D,IAAI,CAAC;QACH,IAAI,CAAC,UAAU,CAAC,MAAM,CAAC,EAAE,CAAC;YACxB,QAAQ,CAAC,YAAY,MAAM,EAAE,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QACxD,CAAC;QAED,aAAa,CAAC,UAAU,EAAE,MAAM,EAAE,OAAO,CAAC,CAAC;QAE3C,MAAM,MAAM,GAAG,QAAQ,CACrB,oBAAoB,MAAM,yBAAyB,KAAK,uBAAuB,UAAU,GAAG,EAC5F;YACE,QAAQ,EAAE,OAAO;YACjB,OAAO;YACP,SAAS,EAAE,EAAE,GAAG,IAAI,GAAG,IAAI;YAC3B,GAAG,EAAE,EAAE,GAAG,OAAO,CAAC,GAAG,EAAE,eAAe,EAAE,MAAM,EAAE;SACjD,CACF,CAAC;QAEF,WAAW;QACX,IAAI,CAAC;YAAC,QAAQ,CAAC,OAAO,UAAU,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,CAAC;QAExE,OAAO,MAAM,CAAC,IAAI,EAAE,CAAC;IACvB,CAAC;IAAC,OAAO,KAAK,EAAE,CAAC;QACf,oBA
AoB;QACpB,IAAI,CAAC;YAAC,QAAQ,CAAC,OAAO,UAAU,GAAG,EAAE,EAAE,QAAQ,EAAE,OAAO,EAAE,CAAC,CAAC;QAAC,CAAC;QAAC,MAAM,CAAC,CAAC,CAAC;QACxE,MAAM,KAAK,CAAC;IACd,CAAC;AACH,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,yBAAyB,CAC7C,KAA4C,EAC5C,SAAmC,EAAE;IAErC,MAAM,OAAO,GAAG,IAAI,GAAG,EAA2B,CAAC;IAEnD,KAAK,MAAM,IAAI,IAAI,KAAK,EAAE,CAAC;QACzB,MAAM,MAAM,GAAG,MAAM,oBAAoB,CAAC,IAAI,CAAC,EAAE,EAAE,IAAI,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;QACxE,OAAO,CAAC,GAAG,CAAC,IAAI,CAAC,EAAE,EAAE,MAAM,CAAC,CAAC;QAE7B,4BAA4B;QAC5B,MAAM,IAAI,OAAO,CAAC,CAAC,CAAC,EAAE,CAAC,UAAU,CAAC,CAAC,EAAE,IAAI,CAAC,CAAC,CAAC;IAC9C,CAAC;IAED,OAAO,OAAO,CAAC;AACjB,CAAC"}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.d.ts","sourceRoot":"","sources":["../../src/benchmarks/runner.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"runner.js","sourceRoot":"","sources":["../../src/benchmarks/runner.ts"],"names":[],"mappings":""}
|
|
@@ -0,0 +1,19 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Benchmark Tasks for Terminal-Bench Adapter
|
|
3
|
+
*
|
|
4
|
+
* These tasks simulate real terminal-style scenarios that benefit from
|
|
5
|
+
* persistent memory and context awareness.
|
|
6
|
+
*
|
|
7
|
+
* Tasks are designed to test:
|
|
8
|
+
* 1. Memory of past decisions and outcomes
|
|
9
|
+
* 2. Knowledge of project structure and patterns
|
|
10
|
+
* 3. Ability to avoid repeating mistakes
|
|
11
|
+
* 4. Coordination of multi-step workflows
|
|
12
|
+
*/
|
|
13
|
+
import { BenchmarkTask } from './benchmark.js';
|
|
14
|
+
/** The catalogue of benchmark tasks exposed by this module. */
export declare const BENCHMARK_TASKS: BenchmarkTask[];
/**
 * Look up a single task by its id.
 * NOTE(review): presumably returns `undefined` when no task has that id; confirm against the implementation.
 */
export declare function getTaskById(id: string): BenchmarkTask | undefined;
/** Return the tasks for the given difficulty level ('easy', 'medium' or 'hard'). */
export declare function getTasksByDifficulty(difficulty: 'easy' | 'medium' | 'hard'): BenchmarkTask[];
/**
 * Return the tasks for the given category.
 * NOTE(review): the set of valid category strings is not visible here; see the BenchmarkTask definition.
 */
export declare function getTasksByCategory(category: string): BenchmarkTask[];
/**
 * Return all tasks.
 * NOTE(review): whether this is a copy or the BENCHMARK_TASKS array itself is not visible from the declaration.
 */
export declare function getAllTasks(): BenchmarkTask[];
|
|
19
|
+
//# sourceMappingURL=tasks.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"tasks.d.ts","sourceRoot":"","sources":["../../src/benchmarks/tasks.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;GAWG;AAEH,OAAO,EAAE,aAAa,EAAE,MAAM,gBAAgB,CAAC;AAuF/C,eAAO,MAAM,eAAe,EAAE,aAAa,EA+S1C,CAAC;AAMF,wBAAgB,WAAW,CAAC,EAAE,EAAE,MAAM,GAAG,aAAa,GAAG,SAAS,CAEjE;AAED,wBAAgB,oBAAoB,CAAC,UAAU,EAAE,MAAM,GAAG,QAAQ,GAAG,MAAM,GAAG,aAAa,EAAE,CAE5F;AAED,wBAAgB,kBAAkB,CAAC,QAAQ,EAAE,MAAM,GAAG,aAAa,EAAE,CAEpE;AAED,wBAAgB,WAAW,IAAI,aAAa,EAAE,CAE7C"}
|