universal-agent-memory 0.6.2 → 0.6.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
- package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/naive-agent.js +144 -0
- package/dist/benchmarks/agents/naive-agent.js.map +1 -0
- package/dist/benchmarks/agents/uam-agent.d.ts +167 -0
- package/dist/benchmarks/agents/uam-agent.d.ts.map +1 -0
- package/dist/benchmarks/agents/uam-agent.js +386 -0
- package/dist/benchmarks/agents/uam-agent.js.map +1 -0
- package/dist/benchmarks/benchmark.d.ts +328 -0
- package/dist/benchmarks/benchmark.d.ts.map +1 -0
- package/dist/benchmarks/benchmark.js +104 -0
- package/dist/benchmarks/benchmark.js.map +1 -0
- package/dist/benchmarks/execution-verifier.d.ts +41 -0
- package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
- package/dist/benchmarks/execution-verifier.js +301 -0
- package/dist/benchmarks/execution-verifier.js.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
- package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
- package/dist/benchmarks/hierarchical-prompting.js +260 -0
- package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
- package/dist/benchmarks/improved-benchmark.d.ts +88 -0
- package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
- package/dist/benchmarks/improved-benchmark.js +533 -0
- package/dist/benchmarks/improved-benchmark.js.map +1 -0
- package/dist/benchmarks/index.d.ts +10 -0
- package/dist/benchmarks/index.d.ts.map +1 -0
- package/dist/benchmarks/index.js +10 -0
- package/dist/benchmarks/index.js.map +1 -0
- package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
- package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
- package/dist/benchmarks/multi-turn-agent.js +235 -0
- package/dist/benchmarks/multi-turn-agent.js.map +1 -0
- package/dist/benchmarks/runner.d.ts +2 -0
- package/dist/benchmarks/runner.d.ts.map +1 -0
- package/dist/benchmarks/runner.js +2 -0
- package/dist/benchmarks/runner.js.map +1 -0
- package/dist/benchmarks/tasks.d.ts +19 -0
- package/dist/benchmarks/tasks.d.ts.map +1 -0
- package/dist/benchmarks/tasks.js +371 -0
- package/dist/benchmarks/tasks.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +4 -0
- package/dist/index.js.map +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts +1 -1
- package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -1
- package/dist/memory/backends/qdrant-cloud.js +6 -4
- package/dist/memory/backends/qdrant-cloud.js.map +1 -1
- package/dist/memory/dynamic-retrieval.d.ts +26 -0
- package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
- package/dist/memory/dynamic-retrieval.js +378 -0
- package/dist/memory/dynamic-retrieval.js.map +1 -0
- package/dist/memory/embeddings.d.ts +82 -0
- package/dist/memory/embeddings.d.ts.map +1 -0
- package/dist/memory/embeddings.js +297 -0
- package/dist/memory/embeddings.js.map +1 -0
- package/dist/memory/task-classifier.d.ts +33 -0
- package/dist/memory/task-classifier.d.ts.map +1 -0
- package/dist/memory/task-classifier.js +277 -0
- package/dist/memory/task-classifier.js.map +1 -0
- package/dist/utils/rate-limiter.d.ts +62 -0
- package/dist/utils/rate-limiter.d.ts.map +1 -0
- package/dist/utils/rate-limiter.js +150 -0
- package/dist/utils/rate-limiter.js.map +1 -0
- package/dist/utils/validate-json.d.ts +52 -0
- package/dist/utils/validate-json.d.ts.map +1 -0
- package/dist/utils/validate-json.js +99 -0
- package/dist/utils/validate-json.js.map +1 -0
- package/package.json +2 -1
|
@@ -0,0 +1,328 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Terminal-Bench Adapter for UAM
|
|
3
|
+
*
|
|
4
|
+
* Assumptions:
|
|
5
|
+
* - Target: Compare UAM-enabled agents vs naive agents on terminal-style tasks
|
|
6
|
+
* - Tasks require knowledge of project structure, past decisions, and patterns
|
|
7
|
+
* - UAM provides persistent memory across task sessions
|
|
8
|
+
*
|
|
9
|
+
* What this handles:
|
|
10
|
+
* - Benchmark task definitions
|
|
11
|
+
* - Memory-enabled agent wrapper
|
|
12
|
+
* - Performance comparison framework
|
|
13
|
+
* - Results aggregation and reporting
|
|
14
|
+
*
|
|
15
|
+
* What this does NOT handle:
|
|
16
|
+
* - Full Terminal-Bench framework integration (use tb run CLI for that)
|
|
17
|
+
* - Real Docker environment sandboxing
|
|
18
|
+
* - Multi-agent coordination (future enhancement)
|
|
19
|
+
*/
|
|
20
|
+
import { z } from 'zod';
|
|
21
|
+
export declare const BenchmarkTaskSchema: z.ZodObject<{
|
|
22
|
+
id: z.ZodString;
|
|
23
|
+
name: z.ZodString;
|
|
24
|
+
description: z.ZodString;
|
|
25
|
+
instruction: z.ZodString;
|
|
26
|
+
difficulty: z.ZodEnum<["easy", "medium", "hard"]>;
|
|
27
|
+
category: z.ZodEnum<["memory", "coordination", "code-quality", "performance", "testing"]>;
|
|
28
|
+
verify: z.ZodFunction<z.ZodTuple<[], z.ZodUnknown>, z.ZodPromise<z.ZodObject<{
|
|
29
|
+
success: z.ZodBoolean;
|
|
30
|
+
details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
|
|
31
|
+
}, "strip", z.ZodTypeAny, {
|
|
32
|
+
success: boolean;
|
|
33
|
+
details?: Record<string, any> | undefined;
|
|
34
|
+
}, {
|
|
35
|
+
success: boolean;
|
|
36
|
+
details?: Record<string, any> | undefined;
|
|
37
|
+
}>>>;
|
|
38
|
+
estimatedMinutes: z.ZodOptional<z.ZodNumber>;
|
|
39
|
+
}, "strip", z.ZodTypeAny, {
|
|
40
|
+
name: string;
|
|
41
|
+
description: string;
|
|
42
|
+
id: string;
|
|
43
|
+
category: "memory" | "coordination" | "testing" | "code-quality" | "performance";
|
|
44
|
+
instruction: string;
|
|
45
|
+
difficulty: "medium" | "easy" | "hard";
|
|
46
|
+
verify: (...args: unknown[]) => Promise<{
|
|
47
|
+
success: boolean;
|
|
48
|
+
details?: Record<string, any> | undefined;
|
|
49
|
+
}>;
|
|
50
|
+
estimatedMinutes?: number | undefined;
|
|
51
|
+
}, {
|
|
52
|
+
name: string;
|
|
53
|
+
description: string;
|
|
54
|
+
id: string;
|
|
55
|
+
category: "memory" | "coordination" | "testing" | "code-quality" | "performance";
|
|
56
|
+
instruction: string;
|
|
57
|
+
difficulty: "medium" | "easy" | "hard";
|
|
58
|
+
verify: (...args: unknown[]) => Promise<{
|
|
59
|
+
success: boolean;
|
|
60
|
+
details?: Record<string, any> | undefined;
|
|
61
|
+
}>;
|
|
62
|
+
estimatedMinutes?: number | undefined;
|
|
63
|
+
}>;
|
|
64
|
+
export type BenchmarkTask = z.infer<typeof BenchmarkTaskSchema>;
|
|
65
|
+
export declare const AgentExecutionSchema: z.ZodObject<{
|
|
66
|
+
taskId: z.ZodString;
|
|
67
|
+
agent: z.ZodString;
|
|
68
|
+
startTime: z.ZodNumber;
|
|
69
|
+
endTime: z.ZodNumber;
|
|
70
|
+
durationMs: z.ZodNumber;
|
|
71
|
+
success: z.ZodBoolean;
|
|
72
|
+
attempts: z.ZodNumber;
|
|
73
|
+
memoryQueries: z.ZodOptional<z.ZodNumber>;
|
|
74
|
+
tokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
75
|
+
errors: z.ZodArray<z.ZodString, "many">;
|
|
76
|
+
}, "strip", z.ZodTypeAny, {
|
|
77
|
+
success: boolean;
|
|
78
|
+
errors: string[];
|
|
79
|
+
taskId: string;
|
|
80
|
+
agent: string;
|
|
81
|
+
startTime: number;
|
|
82
|
+
endTime: number;
|
|
83
|
+
durationMs: number;
|
|
84
|
+
attempts: number;
|
|
85
|
+
memoryQueries?: number | undefined;
|
|
86
|
+
tokensUsed?: number | undefined;
|
|
87
|
+
}, {
|
|
88
|
+
success: boolean;
|
|
89
|
+
errors: string[];
|
|
90
|
+
taskId: string;
|
|
91
|
+
agent: string;
|
|
92
|
+
startTime: number;
|
|
93
|
+
endTime: number;
|
|
94
|
+
durationMs: number;
|
|
95
|
+
attempts: number;
|
|
96
|
+
memoryQueries?: number | undefined;
|
|
97
|
+
tokensUsed?: number | undefined;
|
|
98
|
+
}>;
|
|
99
|
+
export type AgentExecution = z.infer<typeof AgentExecutionSchema>;
|
|
100
|
+
export declare const BenchmarkResultSchema: z.ZodObject<{
|
|
101
|
+
taskId: z.ZodString;
|
|
102
|
+
taskName: z.ZodString;
|
|
103
|
+
results: z.ZodArray<z.ZodObject<{
|
|
104
|
+
taskId: z.ZodString;
|
|
105
|
+
agent: z.ZodString;
|
|
106
|
+
startTime: z.ZodNumber;
|
|
107
|
+
endTime: z.ZodNumber;
|
|
108
|
+
durationMs: z.ZodNumber;
|
|
109
|
+
success: z.ZodBoolean;
|
|
110
|
+
attempts: z.ZodNumber;
|
|
111
|
+
memoryQueries: z.ZodOptional<z.ZodNumber>;
|
|
112
|
+
tokensUsed: z.ZodOptional<z.ZodNumber>;
|
|
113
|
+
errors: z.ZodArray<z.ZodString, "many">;
|
|
114
|
+
}, "strip", z.ZodTypeAny, {
|
|
115
|
+
success: boolean;
|
|
116
|
+
errors: string[];
|
|
117
|
+
taskId: string;
|
|
118
|
+
agent: string;
|
|
119
|
+
startTime: number;
|
|
120
|
+
endTime: number;
|
|
121
|
+
durationMs: number;
|
|
122
|
+
attempts: number;
|
|
123
|
+
memoryQueries?: number | undefined;
|
|
124
|
+
tokensUsed?: number | undefined;
|
|
125
|
+
}, {
|
|
126
|
+
success: boolean;
|
|
127
|
+
errors: string[];
|
|
128
|
+
taskId: string;
|
|
129
|
+
agent: string;
|
|
130
|
+
startTime: number;
|
|
131
|
+
endTime: number;
|
|
132
|
+
durationMs: number;
|
|
133
|
+
attempts: number;
|
|
134
|
+
memoryQueries?: number | undefined;
|
|
135
|
+
tokensUsed?: number | undefined;
|
|
136
|
+
}>, "many">;
|
|
137
|
+
summary: z.ZodObject<{
|
|
138
|
+
uamSuccessRate: z.ZodNumber;
|
|
139
|
+
naiveSuccessRate: z.ZodNumber;
|
|
140
|
+
uamAvgDuration: z.ZodNumber;
|
|
141
|
+
naiveAvgDuration: z.ZodNumber;
|
|
142
|
+
improvement: z.ZodObject<{
|
|
143
|
+
successDelta: z.ZodNumber;
|
|
144
|
+
speedup: z.ZodNumber;
|
|
145
|
+
memoryQueries: z.ZodNumber;
|
|
146
|
+
}, "strip", z.ZodTypeAny, {
|
|
147
|
+
memoryQueries: number;
|
|
148
|
+
successDelta: number;
|
|
149
|
+
speedup: number;
|
|
150
|
+
}, {
|
|
151
|
+
memoryQueries: number;
|
|
152
|
+
successDelta: number;
|
|
153
|
+
speedup: number;
|
|
154
|
+
}>;
|
|
155
|
+
}, "strip", z.ZodTypeAny, {
|
|
156
|
+
uamSuccessRate: number;
|
|
157
|
+
naiveSuccessRate: number;
|
|
158
|
+
uamAvgDuration: number;
|
|
159
|
+
naiveAvgDuration: number;
|
|
160
|
+
improvement: {
|
|
161
|
+
memoryQueries: number;
|
|
162
|
+
successDelta: number;
|
|
163
|
+
speedup: number;
|
|
164
|
+
};
|
|
165
|
+
}, {
|
|
166
|
+
uamSuccessRate: number;
|
|
167
|
+
naiveSuccessRate: number;
|
|
168
|
+
uamAvgDuration: number;
|
|
169
|
+
naiveAvgDuration: number;
|
|
170
|
+
improvement: {
|
|
171
|
+
memoryQueries: number;
|
|
172
|
+
successDelta: number;
|
|
173
|
+
speedup: number;
|
|
174
|
+
};
|
|
175
|
+
}>;
|
|
176
|
+
}, "strip", z.ZodTypeAny, {
|
|
177
|
+
taskId: string;
|
|
178
|
+
taskName: string;
|
|
179
|
+
results: {
|
|
180
|
+
success: boolean;
|
|
181
|
+
errors: string[];
|
|
182
|
+
taskId: string;
|
|
183
|
+
agent: string;
|
|
184
|
+
startTime: number;
|
|
185
|
+
endTime: number;
|
|
186
|
+
durationMs: number;
|
|
187
|
+
attempts: number;
|
|
188
|
+
memoryQueries?: number | undefined;
|
|
189
|
+
tokensUsed?: number | undefined;
|
|
190
|
+
}[];
|
|
191
|
+
summary: {
|
|
192
|
+
uamSuccessRate: number;
|
|
193
|
+
naiveSuccessRate: number;
|
|
194
|
+
uamAvgDuration: number;
|
|
195
|
+
naiveAvgDuration: number;
|
|
196
|
+
improvement: {
|
|
197
|
+
memoryQueries: number;
|
|
198
|
+
successDelta: number;
|
|
199
|
+
speedup: number;
|
|
200
|
+
};
|
|
201
|
+
};
|
|
202
|
+
}, {
|
|
203
|
+
taskId: string;
|
|
204
|
+
taskName: string;
|
|
205
|
+
results: {
|
|
206
|
+
success: boolean;
|
|
207
|
+
errors: string[];
|
|
208
|
+
taskId: string;
|
|
209
|
+
agent: string;
|
|
210
|
+
startTime: number;
|
|
211
|
+
endTime: number;
|
|
212
|
+
durationMs: number;
|
|
213
|
+
attempts: number;
|
|
214
|
+
memoryQueries?: number | undefined;
|
|
215
|
+
tokensUsed?: number | undefined;
|
|
216
|
+
}[];
|
|
217
|
+
summary: {
|
|
218
|
+
uamSuccessRate: number;
|
|
219
|
+
naiveSuccessRate: number;
|
|
220
|
+
uamAvgDuration: number;
|
|
221
|
+
naiveAvgDuration: number;
|
|
222
|
+
improvement: {
|
|
223
|
+
memoryQueries: number;
|
|
224
|
+
successDelta: number;
|
|
225
|
+
speedup: number;
|
|
226
|
+
};
|
|
227
|
+
};
|
|
228
|
+
}>;
|
|
229
|
+
export type BenchmarkResult = z.infer<typeof BenchmarkResultSchema>;
|
|
230
|
+
export declare const OverallBenchmarkStatsSchema: z.ZodObject<{
|
|
231
|
+
totalTasks: z.ZodNumber;
|
|
232
|
+
uamSuccess: z.ZodNumber;
|
|
233
|
+
naiveSuccess: z.ZodNumber;
|
|
234
|
+
uamSuccessRate: z.ZodNumber;
|
|
235
|
+
naiveSuccessRate: z.ZodNumber;
|
|
236
|
+
uamAvgDuration: z.ZodNumber;
|
|
237
|
+
naiveAvgDuration: z.ZodNumber;
|
|
238
|
+
overallSpeedup: z.ZodNumber;
|
|
239
|
+
byDifficulty: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
240
|
+
count: z.ZodNumber;
|
|
241
|
+
uamSuccess: z.ZodNumber;
|
|
242
|
+
naiveSuccess: z.ZodNumber;
|
|
243
|
+
}, "strip", z.ZodTypeAny, {
|
|
244
|
+
count: number;
|
|
245
|
+
uamSuccess: number;
|
|
246
|
+
naiveSuccess: number;
|
|
247
|
+
}, {
|
|
248
|
+
count: number;
|
|
249
|
+
uamSuccess: number;
|
|
250
|
+
naiveSuccess: number;
|
|
251
|
+
}>>;
|
|
252
|
+
byCategory: z.ZodRecord<z.ZodString, z.ZodObject<{
|
|
253
|
+
count: z.ZodNumber;
|
|
254
|
+
uamSuccess: z.ZodNumber;
|
|
255
|
+
naiveSuccess: z.ZodNumber;
|
|
256
|
+
}, "strip", z.ZodTypeAny, {
|
|
257
|
+
count: number;
|
|
258
|
+
uamSuccess: number;
|
|
259
|
+
naiveSuccess: number;
|
|
260
|
+
}, {
|
|
261
|
+
count: number;
|
|
262
|
+
uamSuccess: number;
|
|
263
|
+
naiveSuccess: number;
|
|
264
|
+
}>>;
|
|
265
|
+
}, "strip", z.ZodTypeAny, {
|
|
266
|
+
uamSuccessRate: number;
|
|
267
|
+
naiveSuccessRate: number;
|
|
268
|
+
uamAvgDuration: number;
|
|
269
|
+
naiveAvgDuration: number;
|
|
270
|
+
totalTasks: number;
|
|
271
|
+
uamSuccess: number;
|
|
272
|
+
naiveSuccess: number;
|
|
273
|
+
overallSpeedup: number;
|
|
274
|
+
byDifficulty: Record<string, {
|
|
275
|
+
count: number;
|
|
276
|
+
uamSuccess: number;
|
|
277
|
+
naiveSuccess: number;
|
|
278
|
+
}>;
|
|
279
|
+
byCategory: Record<string, {
|
|
280
|
+
count: number;
|
|
281
|
+
uamSuccess: number;
|
|
282
|
+
naiveSuccess: number;
|
|
283
|
+
}>;
|
|
284
|
+
}, {
|
|
285
|
+
uamSuccessRate: number;
|
|
286
|
+
naiveSuccessRate: number;
|
|
287
|
+
uamAvgDuration: number;
|
|
288
|
+
naiveAvgDuration: number;
|
|
289
|
+
totalTasks: number;
|
|
290
|
+
uamSuccess: number;
|
|
291
|
+
naiveSuccess: number;
|
|
292
|
+
overallSpeedup: number;
|
|
293
|
+
byDifficulty: Record<string, {
|
|
294
|
+
count: number;
|
|
295
|
+
uamSuccess: number;
|
|
296
|
+
naiveSuccess: number;
|
|
297
|
+
}>;
|
|
298
|
+
byCategory: Record<string, {
|
|
299
|
+
count: number;
|
|
300
|
+
uamSuccess: number;
|
|
301
|
+
naiveSuccess: number;
|
|
302
|
+
}>;
|
|
303
|
+
}>;
|
|
304
|
+
export type OverallBenchmarkStats = z.infer<typeof OverallBenchmarkStatsSchema>;
|
|
305
|
+
export declare const BenchmarkConfigSchema: z.ZodObject<{
|
|
306
|
+
maxAttempts: z.ZodDefault<z.ZodNumber>;
|
|
307
|
+
timeoutMs: z.ZodDefault<z.ZodNumber>;
|
|
308
|
+
agents: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
|
|
309
|
+
memoryEnabled: z.ZodDefault<z.ZodBoolean>;
|
|
310
|
+
verbose: z.ZodDefault<z.ZodBoolean>;
|
|
311
|
+
outputDir: z.ZodDefault<z.ZodString>;
|
|
312
|
+
}, "strip", z.ZodTypeAny, {
|
|
313
|
+
agents: string[];
|
|
314
|
+
maxAttempts: number;
|
|
315
|
+
timeoutMs: number;
|
|
316
|
+
memoryEnabled: boolean;
|
|
317
|
+
verbose: boolean;
|
|
318
|
+
outputDir: string;
|
|
319
|
+
}, {
|
|
320
|
+
agents?: string[] | undefined;
|
|
321
|
+
maxAttempts?: number | undefined;
|
|
322
|
+
timeoutMs?: number | undefined;
|
|
323
|
+
memoryEnabled?: boolean | undefined;
|
|
324
|
+
verbose?: boolean | undefined;
|
|
325
|
+
outputDir?: string | undefined;
|
|
326
|
+
}>;
|
|
327
|
+
export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
|
|
328
|
+
//# sourceMappingURL=benchmark.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/benchmarks/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAc9B,CAAC;AAEH,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAMhE,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAW/B,CAAC;AAEH,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAMlE,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAehC,CAAC;AAEH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC;AAEpE,eAAO,MAAM,2BAA2B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAmBtC,CAAC;AAEH,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,2BAA2B,CAAC,CAAC;AAMhF,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;EAOhC,CAAC;AAEH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Terminal-Bench Adapter for UAM
|
|
3
|
+
*
|
|
4
|
+
* Assumptions:
|
|
5
|
+
* - Target: Compare UAM-enabled agents vs naive agents on terminal-style tasks
|
|
6
|
+
* - Tasks require knowledge of project structure, past decisions, and patterns
|
|
7
|
+
* - UAM provides persistent memory across task sessions
|
|
8
|
+
*
|
|
9
|
+
* What this handles:
|
|
10
|
+
* - Benchmark task definitions
|
|
11
|
+
* - Memory-enabled agent wrapper
|
|
12
|
+
* - Performance comparison framework
|
|
13
|
+
* - Results aggregation and reporting
|
|
14
|
+
*
|
|
15
|
+
* What this does NOT handle:
|
|
16
|
+
* - Full Terminal-Bench framework integration (use tb run CLI for that)
|
|
17
|
+
* - Real Docker environment sandboxing
|
|
18
|
+
* - Multi-agent coordination (future enhancement)
|
|
19
|
+
*/
|
|
20
|
+
import { z } from 'zod';
|
|
21
|
+
// ============================================================================
|
|
22
|
+
// Task Types
|
|
23
|
+
// ============================================================================
|
|
24
|
+
export const BenchmarkTaskSchema = z.object({
|
|
25
|
+
id: z.string(),
|
|
26
|
+
name: z.string(),
|
|
27
|
+
description: z.string(),
|
|
28
|
+
instruction: z.string(),
|
|
29
|
+
difficulty: z.enum(['easy', 'medium', 'hard']),
|
|
30
|
+
category: z.enum(['memory', 'coordination', 'code-quality', 'performance', 'testing']),
|
|
31
|
+
// Verification: checks if agent solved the task correctly
|
|
32
|
+
verify: z.function().returns(z.promise(z.object({
|
|
33
|
+
success: z.boolean(),
|
|
34
|
+
details: z.record(z.any()).optional(),
|
|
35
|
+
}))),
|
|
36
|
+
// Estimated time to complete
|
|
37
|
+
estimatedMinutes: z.number().optional(),
|
|
38
|
+
});
|
|
39
|
+
// ============================================================================
|
|
40
|
+
// Agent Types
|
|
41
|
+
// ============================================================================
|
|
42
|
+
export const AgentExecutionSchema = z.object({
|
|
43
|
+
taskId: z.string(),
|
|
44
|
+
agent: z.string(),
|
|
45
|
+
startTime: z.number(),
|
|
46
|
+
endTime: z.number(),
|
|
47
|
+
durationMs: z.number(),
|
|
48
|
+
success: z.boolean(),
|
|
49
|
+
attempts: z.number(),
|
|
50
|
+
memoryQueries: z.number().optional(),
|
|
51
|
+
tokensUsed: z.number().optional(),
|
|
52
|
+
errors: z.array(z.string()),
|
|
53
|
+
});
|
|
54
|
+
// ============================================================================
|
|
55
|
+
// Benchmark Result Types
|
|
56
|
+
// ============================================================================
|
|
57
|
+
export const BenchmarkResultSchema = z.object({
|
|
58
|
+
taskId: z.string(),
|
|
59
|
+
taskName: z.string(),
|
|
60
|
+
results: z.array(AgentExecutionSchema),
|
|
61
|
+
summary: z.object({
|
|
62
|
+
uamSuccessRate: z.number(),
|
|
63
|
+
naiveSuccessRate: z.number(),
|
|
64
|
+
uamAvgDuration: z.number(), // in seconds
|
|
65
|
+
naiveAvgDuration: z.number(), // in seconds
|
|
66
|
+
improvement: z.object({
|
|
67
|
+
successDelta: z.number(), // percentage points
|
|
68
|
+
speedup: z.number(), // ratio >1 means UAM is faster
|
|
69
|
+
memoryQueries: z.number(),
|
|
70
|
+
}),
|
|
71
|
+
}),
|
|
72
|
+
});
|
|
73
|
+
export const OverallBenchmarkStatsSchema = z.object({
|
|
74
|
+
totalTasks: z.number(),
|
|
75
|
+
uamSuccess: z.number(),
|
|
76
|
+
naiveSuccess: z.number(),
|
|
77
|
+
uamSuccessRate: z.number(),
|
|
78
|
+
naiveSuccessRate: z.number(),
|
|
79
|
+
uamAvgDuration: z.number(),
|
|
80
|
+
naiveAvgDuration: z.number(),
|
|
81
|
+
overallSpeedup: z.number(),
|
|
82
|
+
byDifficulty: z.record(z.object({
|
|
83
|
+
count: z.number(),
|
|
84
|
+
uamSuccess: z.number(),
|
|
85
|
+
naiveSuccess: z.number(),
|
|
86
|
+
})),
|
|
87
|
+
byCategory: z.record(z.object({
|
|
88
|
+
count: z.number(),
|
|
89
|
+
uamSuccess: z.number(),
|
|
90
|
+
naiveSuccess: z.number(),
|
|
91
|
+
})),
|
|
92
|
+
});
|
|
93
|
+
// ============================================================================
|
|
94
|
+
// Benchmark Configuration
|
|
95
|
+
// ============================================================================
|
|
96
|
+
export const BenchmarkConfigSchema = z.object({
|
|
97
|
+
maxAttempts: z.number().default(3),
|
|
98
|
+
timeoutMs: z.number().default(300000), // 5 minutes per task
|
|
99
|
+
agents: z.array(z.string()).default(['uam-agent', 'naive-agent']),
|
|
100
|
+
memoryEnabled: z.boolean().default(true),
|
|
101
|
+
verbose: z.boolean().default(false),
|
|
102
|
+
outputDir: z.string().default('./benchmarks/results'),
|
|
103
|
+
});
|
|
104
|
+
//# sourceMappingURL=benchmark.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/benchmarks/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,+EAA+E;AAC/E,aAAa;AACb,+EAA+E;AAE/E,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC1C,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC9C,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,cAAc,EAAE,cAAc,EAAE,aAAa,EAAE,SAAS,CAAC,CAAC;IACtF,0DAA0D;IAC1D,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9C,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE;QACpB,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE;KACtC,CAAC,CAAC,CAAC;IACJ,6BAA6B;IAC7B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CACxC,CAAC,CAAC;AAIH,+EAA+E;AAC/E,cAAc;AACd,+EAA+E;AAE/E,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3C,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;IACjB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE;IACrB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;IACnB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE;IACpB,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE;IACpB,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;CAC5B,CAAC,CAAC;AAIH,+EAA+E;AAC/E,yBAAyB;AACzB,+EAA+E;AAE/E,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE;IACpB,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,oBAAoB,CAAC;IACtC,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC;QAChB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;QAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;QAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,aAAa;QACzC,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,aAAa;QAC3C,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;YACpB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,oBAAoB;YAC9C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,+BAA+B;YACpD,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE;SAC1B,CAAC;KACH,CAAC;CACH,CAAC,CAAC;AAIH,MAAM,CAAC,MAAM,2BAA2B,GAAG,CAAC,CAAC,MAAM,CAAC;IAClD,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;IACxB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;IAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;IAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,YAAY,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;QACjB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;QACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;KACzB,CAAC,CAAC;IACH,UAAU,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC5B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;QACjB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;QACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;KACzB,CAAC,CAAC;CACJ,CAAC,CAAC;AAIH,+EAA+E;AAC/E,0BAA0B;AAC1B,+EAA+E;AAE/E,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAClC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,qBAAqB;IAC5D,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;IACjE,aAAa,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC;IACxC,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;IACnC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,sBAAsB,CAAC;CACtD,CAAC,CAAC"}
|
|
@@ -0,0 +1,41 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Execution Verification System for UAM Benchmarks
|
|
3
|
+
*
|
|
4
|
+
* Provides real code execution and verification instead of just pattern matching.
|
|
5
|
+
* Runs generated code in isolated environments and validates output.
|
|
6
|
+
*/
|
|
7
|
+
export interface TestCase {
|
|
8
|
+
input: string;
|
|
9
|
+
expectedOutput: string;
|
|
10
|
+
description?: string;
|
|
11
|
+
}
|
|
12
|
+
export interface VerificationResult {
|
|
13
|
+
success: boolean;
|
|
14
|
+
executionSucceeded: boolean;
|
|
15
|
+
testsRun: number;
|
|
16
|
+
testsPassed: number;
|
|
17
|
+
errors: string[];
|
|
18
|
+
output: string;
|
|
19
|
+
executionTimeMs: number;
|
|
20
|
+
}
|
|
21
|
+
export interface TaskVerificationConfig {
|
|
22
|
+
language: 'typescript' | 'javascript' | 'python' | 'shell';
|
|
23
|
+
setupCommands?: string[];
|
|
24
|
+
testCases: TestCase[];
|
|
25
|
+
expectedPatterns?: string[];
|
|
26
|
+
timeout?: number;
|
|
27
|
+
requiresExecution?: boolean;
|
|
28
|
+
}
|
|
29
|
+
/**
|
|
30
|
+
* Verify generated code by executing it
|
|
31
|
+
*/
|
|
32
|
+
export declare function verifyCodeExecution(code: string, config: TaskVerificationConfig): Promise<VerificationResult>;
|
|
33
|
+
/**
|
|
34
|
+
* Enhanced task verification configurations for benchmark tasks
|
|
35
|
+
*/
|
|
36
|
+
export declare const TASK_VERIFICATION_CONFIGS: Record<string, TaskVerificationConfig>;
|
|
37
|
+
/**
|
|
38
|
+
* Verify a benchmark task result
|
|
39
|
+
*/
|
|
40
|
+
export declare function verifyBenchmarkTask(taskId: string, generatedCode: string): Promise<VerificationResult>;
|
|
41
|
+
//# sourceMappingURL=execution-verifier.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"execution-verifier.d.ts","sourceRoot":"","sources":["../../src/benchmarks/execution-verifier.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,OAAO,CAAC;IACjB,kBAAkB,EAAE,OAAO,CAAC;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,EAAE,YAAY,GAAG,YAAY,GAAG,QAAQ,GAAG,OAAO,CAAC;IAC3D,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AA8BD;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,sBAAsB,GAC7B,OAAO,CAAC,kBAAkB,CAAC,CAsG7B;AAkGD;;GAEG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAAM,CAAC,MAAM,EAAE,sBAAsB,CA2E5E,CAAC;AAEF;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,MAAM,EAAE,MAAM,EACd,aAAa,EAAE,MAAM,GACpB,OAAO,CAAC,kBAAkB,CAAC,CAiB7B"}
|