universal-agent-memory 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (85)
  1. package/dist/benchmarks/agents/naive-agent.d.ts +60 -0
  2. package/dist/benchmarks/agents/naive-agent.d.ts.map +1 -0
  3. package/dist/benchmarks/agents/naive-agent.js +144 -0
  4. package/dist/benchmarks/agents/naive-agent.js.map +1 -0
  5. package/dist/benchmarks/agents/uam-agent.d.ts +167 -0
  6. package/dist/benchmarks/agents/uam-agent.d.ts.map +1 -0
  7. package/dist/benchmarks/agents/uam-agent.js +386 -0
  8. package/dist/benchmarks/agents/uam-agent.js.map +1 -0
  9. package/dist/benchmarks/benchmark.d.ts +328 -0
  10. package/dist/benchmarks/benchmark.d.ts.map +1 -0
  11. package/dist/benchmarks/benchmark.js +104 -0
  12. package/dist/benchmarks/benchmark.js.map +1 -0
  13. package/dist/benchmarks/execution-verifier.d.ts +41 -0
  14. package/dist/benchmarks/execution-verifier.d.ts.map +1 -0
  15. package/dist/benchmarks/execution-verifier.js +342 -0
  16. package/dist/benchmarks/execution-verifier.js.map +1 -0
  17. package/dist/benchmarks/hierarchical-prompting.d.ts +37 -0
  18. package/dist/benchmarks/hierarchical-prompting.d.ts.map +1 -0
  19. package/dist/benchmarks/hierarchical-prompting.js +260 -0
  20. package/dist/benchmarks/hierarchical-prompting.js.map +1 -0
  21. package/dist/benchmarks/improved-benchmark.d.ts +88 -0
  22. package/dist/benchmarks/improved-benchmark.d.ts.map +1 -0
  23. package/dist/benchmarks/improved-benchmark.js +533 -0
  24. package/dist/benchmarks/improved-benchmark.js.map +1 -0
  25. package/dist/benchmarks/index.d.ts +10 -0
  26. package/dist/benchmarks/index.d.ts.map +1 -0
  27. package/dist/benchmarks/index.js +10 -0
  28. package/dist/benchmarks/index.js.map +1 -0
  29. package/dist/benchmarks/multi-turn-agent.d.ts +44 -0
  30. package/dist/benchmarks/multi-turn-agent.d.ts.map +1 -0
  31. package/dist/benchmarks/multi-turn-agent.js +235 -0
  32. package/dist/benchmarks/multi-turn-agent.js.map +1 -0
  33. package/dist/benchmarks/runner.d.ts +2 -0
  34. package/dist/benchmarks/runner.d.ts.map +1 -0
  35. package/dist/benchmarks/runner.js +2 -0
  36. package/dist/benchmarks/runner.js.map +1 -0
  37. package/dist/benchmarks/tasks.d.ts +19 -0
  38. package/dist/benchmarks/tasks.d.ts.map +1 -0
  39. package/dist/benchmarks/tasks.js +371 -0
  40. package/dist/benchmarks/tasks.js.map +1 -0
  41. package/dist/index.d.ts +14 -0
  42. package/dist/index.d.ts.map +1 -1
  43. package/dist/index.js +11 -0
  44. package/dist/index.js.map +1 -1
  45. package/dist/memory/backends/qdrant-cloud.d.ts +1 -1
  46. package/dist/memory/backends/qdrant-cloud.d.ts.map +1 -1
  47. package/dist/memory/backends/qdrant-cloud.js +6 -4
  48. package/dist/memory/backends/qdrant-cloud.js.map +1 -1
  49. package/dist/memory/context-compressor.d.ts +66 -0
  50. package/dist/memory/context-compressor.d.ts.map +1 -0
  51. package/dist/memory/context-compressor.js +250 -0
  52. package/dist/memory/context-compressor.js.map +1 -0
  53. package/dist/memory/dynamic-retrieval.d.ts +26 -0
  54. package/dist/memory/dynamic-retrieval.d.ts.map +1 -0
  55. package/dist/memory/dynamic-retrieval.js +378 -0
  56. package/dist/memory/dynamic-retrieval.js.map +1 -0
  57. package/dist/memory/embeddings.d.ts +93 -0
  58. package/dist/memory/embeddings.d.ts.map +1 -0
  59. package/dist/memory/embeddings.js +391 -0
  60. package/dist/memory/embeddings.js.map +1 -0
  61. package/dist/memory/hierarchical-memory.d.ts +116 -0
  62. package/dist/memory/hierarchical-memory.d.ts.map +1 -0
  63. package/dist/memory/hierarchical-memory.js +299 -0
  64. package/dist/memory/hierarchical-memory.js.map +1 -0
  65. package/dist/memory/memory-consolidator.d.ts +88 -0
  66. package/dist/memory/memory-consolidator.d.ts.map +1 -0
  67. package/dist/memory/memory-consolidator.js +348 -0
  68. package/dist/memory/memory-consolidator.js.map +1 -0
  69. package/dist/memory/speculative-cache.d.ts +89 -0
  70. package/dist/memory/speculative-cache.d.ts.map +1 -0
  71. package/dist/memory/speculative-cache.js +259 -0
  72. package/dist/memory/speculative-cache.js.map +1 -0
  73. package/dist/memory/task-classifier.d.ts +33 -0
  74. package/dist/memory/task-classifier.d.ts.map +1 -0
  75. package/dist/memory/task-classifier.js +277 -0
  76. package/dist/memory/task-classifier.js.map +1 -0
  77. package/dist/utils/rate-limiter.d.ts +62 -0
  78. package/dist/utils/rate-limiter.d.ts.map +1 -0
  79. package/dist/utils/rate-limiter.js +150 -0
  80. package/dist/utils/rate-limiter.js.map +1 -0
  81. package/dist/utils/validate-json.d.ts +52 -0
  82. package/dist/utils/validate-json.d.ts.map +1 -0
  83. package/dist/utils/validate-json.js +99 -0
  84. package/dist/utils/validate-json.js.map +1 -0
  85. package/package.json +2 -1
@@ -0,0 +1,328 @@
1
+ /**
2
+ * Terminal-Bench Adapter for UAM
3
+ *
4
+ * Assumptions:
5
+ * - Target: Compare UAM-enabled agents vs naive agents on terminal-style tasks
6
+ * - Tasks require knowledge of project structure, past decisions, and patterns
7
+ * - UAM provides persistent memory across task sessions
8
+ *
9
+ * What this handles:
10
+ * - Benchmark task definitions
11
+ * - Memory-enabled agent wrapper
12
+ * - Performance comparison framework
13
+ * - Results aggregation and reporting
14
+ *
15
+ * What this does NOT handle:
16
+ * - Full Terminal-Bench framework integration (use tb run CLI for that)
17
+ * - Real Docker environment sandboxing
18
+ * - Multi-agent coordination (future enhancement)
19
+ */
20
+ import { z } from 'zod';
21
+ export declare const BenchmarkTaskSchema: z.ZodObject<{
22
+ id: z.ZodString;
23
+ name: z.ZodString;
24
+ description: z.ZodString;
25
+ instruction: z.ZodString;
26
+ difficulty: z.ZodEnum<["easy", "medium", "hard"]>;
27
+ category: z.ZodEnum<["memory", "coordination", "code-quality", "performance", "testing"]>;
28
+ verify: z.ZodFunction<z.ZodTuple<[], z.ZodUnknown>, z.ZodPromise<z.ZodObject<{
29
+ success: z.ZodBoolean;
30
+ details: z.ZodOptional<z.ZodRecord<z.ZodString, z.ZodAny>>;
31
+ }, "strip", z.ZodTypeAny, {
32
+ success: boolean;
33
+ details?: Record<string, any> | undefined;
34
+ }, {
35
+ success: boolean;
36
+ details?: Record<string, any> | undefined;
37
+ }>>>;
38
+ estimatedMinutes: z.ZodOptional<z.ZodNumber>;
39
+ }, "strip", z.ZodTypeAny, {
40
+ name: string;
41
+ description: string;
42
+ id: string;
43
+ category: "memory" | "coordination" | "testing" | "code-quality" | "performance";
44
+ instruction: string;
45
+ difficulty: "medium" | "easy" | "hard";
46
+ verify: (...args: unknown[]) => Promise<{
47
+ success: boolean;
48
+ details?: Record<string, any> | undefined;
49
+ }>;
50
+ estimatedMinutes?: number | undefined;
51
+ }, {
52
+ name: string;
53
+ description: string;
54
+ id: string;
55
+ category: "memory" | "coordination" | "testing" | "code-quality" | "performance";
56
+ instruction: string;
57
+ difficulty: "medium" | "easy" | "hard";
58
+ verify: (...args: unknown[]) => Promise<{
59
+ success: boolean;
60
+ details?: Record<string, any> | undefined;
61
+ }>;
62
+ estimatedMinutes?: number | undefined;
63
+ }>;
64
+ export type BenchmarkTask = z.infer<typeof BenchmarkTaskSchema>;
65
+ export declare const AgentExecutionSchema: z.ZodObject<{
66
+ taskId: z.ZodString;
67
+ agent: z.ZodString;
68
+ startTime: z.ZodNumber;
69
+ endTime: z.ZodNumber;
70
+ durationMs: z.ZodNumber;
71
+ success: z.ZodBoolean;
72
+ attempts: z.ZodNumber;
73
+ memoryQueries: z.ZodOptional<z.ZodNumber>;
74
+ tokensUsed: z.ZodOptional<z.ZodNumber>;
75
+ errors: z.ZodArray<z.ZodString, "many">;
76
+ }, "strip", z.ZodTypeAny, {
77
+ success: boolean;
78
+ errors: string[];
79
+ taskId: string;
80
+ agent: string;
81
+ startTime: number;
82
+ endTime: number;
83
+ durationMs: number;
84
+ attempts: number;
85
+ memoryQueries?: number | undefined;
86
+ tokensUsed?: number | undefined;
87
+ }, {
88
+ success: boolean;
89
+ errors: string[];
90
+ taskId: string;
91
+ agent: string;
92
+ startTime: number;
93
+ endTime: number;
94
+ durationMs: number;
95
+ attempts: number;
96
+ memoryQueries?: number | undefined;
97
+ tokensUsed?: number | undefined;
98
+ }>;
99
+ export type AgentExecution = z.infer<typeof AgentExecutionSchema>;
100
+ export declare const BenchmarkResultSchema: z.ZodObject<{
101
+ taskId: z.ZodString;
102
+ taskName: z.ZodString;
103
+ results: z.ZodArray<z.ZodObject<{
104
+ taskId: z.ZodString;
105
+ agent: z.ZodString;
106
+ startTime: z.ZodNumber;
107
+ endTime: z.ZodNumber;
108
+ durationMs: z.ZodNumber;
109
+ success: z.ZodBoolean;
110
+ attempts: z.ZodNumber;
111
+ memoryQueries: z.ZodOptional<z.ZodNumber>;
112
+ tokensUsed: z.ZodOptional<z.ZodNumber>;
113
+ errors: z.ZodArray<z.ZodString, "many">;
114
+ }, "strip", z.ZodTypeAny, {
115
+ success: boolean;
116
+ errors: string[];
117
+ taskId: string;
118
+ agent: string;
119
+ startTime: number;
120
+ endTime: number;
121
+ durationMs: number;
122
+ attempts: number;
123
+ memoryQueries?: number | undefined;
124
+ tokensUsed?: number | undefined;
125
+ }, {
126
+ success: boolean;
127
+ errors: string[];
128
+ taskId: string;
129
+ agent: string;
130
+ startTime: number;
131
+ endTime: number;
132
+ durationMs: number;
133
+ attempts: number;
134
+ memoryQueries?: number | undefined;
135
+ tokensUsed?: number | undefined;
136
+ }>, "many">;
137
+ summary: z.ZodObject<{
138
+ uamSuccessRate: z.ZodNumber;
139
+ naiveSuccessRate: z.ZodNumber;
140
+ uamAvgDuration: z.ZodNumber;
141
+ naiveAvgDuration: z.ZodNumber;
142
+ improvement: z.ZodObject<{
143
+ successDelta: z.ZodNumber;
144
+ speedup: z.ZodNumber;
145
+ memoryQueries: z.ZodNumber;
146
+ }, "strip", z.ZodTypeAny, {
147
+ memoryQueries: number;
148
+ successDelta: number;
149
+ speedup: number;
150
+ }, {
151
+ memoryQueries: number;
152
+ successDelta: number;
153
+ speedup: number;
154
+ }>;
155
+ }, "strip", z.ZodTypeAny, {
156
+ uamSuccessRate: number;
157
+ naiveSuccessRate: number;
158
+ uamAvgDuration: number;
159
+ naiveAvgDuration: number;
160
+ improvement: {
161
+ memoryQueries: number;
162
+ successDelta: number;
163
+ speedup: number;
164
+ };
165
+ }, {
166
+ uamSuccessRate: number;
167
+ naiveSuccessRate: number;
168
+ uamAvgDuration: number;
169
+ naiveAvgDuration: number;
170
+ improvement: {
171
+ memoryQueries: number;
172
+ successDelta: number;
173
+ speedup: number;
174
+ };
175
+ }>;
176
+ }, "strip", z.ZodTypeAny, {
177
+ summary: {
178
+ uamSuccessRate: number;
179
+ naiveSuccessRate: number;
180
+ uamAvgDuration: number;
181
+ naiveAvgDuration: number;
182
+ improvement: {
183
+ memoryQueries: number;
184
+ successDelta: number;
185
+ speedup: number;
186
+ };
187
+ };
188
+ taskId: string;
189
+ taskName: string;
190
+ results: {
191
+ success: boolean;
192
+ errors: string[];
193
+ taskId: string;
194
+ agent: string;
195
+ startTime: number;
196
+ endTime: number;
197
+ durationMs: number;
198
+ attempts: number;
199
+ memoryQueries?: number | undefined;
200
+ tokensUsed?: number | undefined;
201
+ }[];
202
+ }, {
203
+ summary: {
204
+ uamSuccessRate: number;
205
+ naiveSuccessRate: number;
206
+ uamAvgDuration: number;
207
+ naiveAvgDuration: number;
208
+ improvement: {
209
+ memoryQueries: number;
210
+ successDelta: number;
211
+ speedup: number;
212
+ };
213
+ };
214
+ taskId: string;
215
+ taskName: string;
216
+ results: {
217
+ success: boolean;
218
+ errors: string[];
219
+ taskId: string;
220
+ agent: string;
221
+ startTime: number;
222
+ endTime: number;
223
+ durationMs: number;
224
+ attempts: number;
225
+ memoryQueries?: number | undefined;
226
+ tokensUsed?: number | undefined;
227
+ }[];
228
+ }>;
229
+ export type BenchmarkResult = z.infer<typeof BenchmarkResultSchema>;
230
+ export declare const OverallBenchmarkStatsSchema: z.ZodObject<{
231
+ totalTasks: z.ZodNumber;
232
+ uamSuccess: z.ZodNumber;
233
+ naiveSuccess: z.ZodNumber;
234
+ uamSuccessRate: z.ZodNumber;
235
+ naiveSuccessRate: z.ZodNumber;
236
+ uamAvgDuration: z.ZodNumber;
237
+ naiveAvgDuration: z.ZodNumber;
238
+ overallSpeedup: z.ZodNumber;
239
+ byDifficulty: z.ZodRecord<z.ZodString, z.ZodObject<{
240
+ count: z.ZodNumber;
241
+ uamSuccess: z.ZodNumber;
242
+ naiveSuccess: z.ZodNumber;
243
+ }, "strip", z.ZodTypeAny, {
244
+ count: number;
245
+ uamSuccess: number;
246
+ naiveSuccess: number;
247
+ }, {
248
+ count: number;
249
+ uamSuccess: number;
250
+ naiveSuccess: number;
251
+ }>>;
252
+ byCategory: z.ZodRecord<z.ZodString, z.ZodObject<{
253
+ count: z.ZodNumber;
254
+ uamSuccess: z.ZodNumber;
255
+ naiveSuccess: z.ZodNumber;
256
+ }, "strip", z.ZodTypeAny, {
257
+ count: number;
258
+ uamSuccess: number;
259
+ naiveSuccess: number;
260
+ }, {
261
+ count: number;
262
+ uamSuccess: number;
263
+ naiveSuccess: number;
264
+ }>>;
265
+ }, "strip", z.ZodTypeAny, {
266
+ uamSuccessRate: number;
267
+ naiveSuccessRate: number;
268
+ uamAvgDuration: number;
269
+ naiveAvgDuration: number;
270
+ totalTasks: number;
271
+ uamSuccess: number;
272
+ naiveSuccess: number;
273
+ overallSpeedup: number;
274
+ byDifficulty: Record<string, {
275
+ count: number;
276
+ uamSuccess: number;
277
+ naiveSuccess: number;
278
+ }>;
279
+ byCategory: Record<string, {
280
+ count: number;
281
+ uamSuccess: number;
282
+ naiveSuccess: number;
283
+ }>;
284
+ }, {
285
+ uamSuccessRate: number;
286
+ naiveSuccessRate: number;
287
+ uamAvgDuration: number;
288
+ naiveAvgDuration: number;
289
+ totalTasks: number;
290
+ uamSuccess: number;
291
+ naiveSuccess: number;
292
+ overallSpeedup: number;
293
+ byDifficulty: Record<string, {
294
+ count: number;
295
+ uamSuccess: number;
296
+ naiveSuccess: number;
297
+ }>;
298
+ byCategory: Record<string, {
299
+ count: number;
300
+ uamSuccess: number;
301
+ naiveSuccess: number;
302
+ }>;
303
+ }>;
304
+ export type OverallBenchmarkStats = z.infer<typeof OverallBenchmarkStatsSchema>;
305
+ export declare const BenchmarkConfigSchema: z.ZodObject<{
306
+ maxAttempts: z.ZodDefault<z.ZodNumber>;
307
+ timeoutMs: z.ZodDefault<z.ZodNumber>;
308
+ agents: z.ZodDefault<z.ZodArray<z.ZodString, "many">>;
309
+ memoryEnabled: z.ZodDefault<z.ZodBoolean>;
310
+ verbose: z.ZodDefault<z.ZodBoolean>;
311
+ outputDir: z.ZodDefault<z.ZodString>;
312
+ }, "strip", z.ZodTypeAny, {
313
+ agents: string[];
314
+ maxAttempts: number;
315
+ timeoutMs: number;
316
+ memoryEnabled: boolean;
317
+ verbose: boolean;
318
+ outputDir: string;
319
+ }, {
320
+ agents?: string[] | undefined;
321
+ maxAttempts?: number | undefined;
322
+ timeoutMs?: number | undefined;
323
+ memoryEnabled?: boolean | undefined;
324
+ verbose?: boolean | undefined;
325
+ outputDir?: string | undefined;
326
+ }>;
327
+ export type BenchmarkConfig = z.infer<typeof BenchmarkConfigSchema>;
328
+ //# sourceMappingURL=benchmark.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark.d.ts","sourceRoot":"","sources":["../../src/benchmarks/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAMxB,eAAO,MAAM,mBAAmB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAc9B,CAAC;AAEH,MAAM,MAAM,aAAa,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,mBAAmB,CAAC,CAAC;AAMhE,eAAO,MAAM,oBAAoB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAW/B,CAAC;AAEH,MAAM,MAAM,cAAc,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,oBAAoB,CAAC,CAAC;AAMlE,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAehC,CAAC;AAEH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC;AAEpE,eAAO,MAAM,2BAA2B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;EAmBtC,CAAC;AAEH,MAAM,MAAM,qBAAqB,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,2BAA2B,CAAC,CAAC;AAMhF,eAAO,MAAM,qBAAqB;;;;;;;;;;;;;;;;;;;;;EAOhC,CAAC;AAEH,MAAM,MAAM,eAAe,GAAG,CAAC,CAAC,KAAK,CAAC,OAAO,qBAAqB,CAAC,CAAC"}
@@ -0,0 +1,104 @@
1
+ /**
2
+ * Terminal-Bench Adapter for UAM
3
+ *
4
+ * Assumptions:
5
+ * - Target: Compare UAM-enabled agents vs naive agents on terminal-style tasks
6
+ * - Tasks require knowledge of project structure, past decisions, and patterns
7
+ * - UAM provides persistent memory across task sessions
8
+ *
9
+ * What this handles:
10
+ * - Benchmark task definitions
11
+ * - Memory-enabled agent wrapper
12
+ * - Performance comparison framework
13
+ * - Results aggregation and reporting
14
+ *
15
+ * What this does NOT handle:
16
+ * - Full Terminal-Bench framework integration (use tb run CLI for that)
17
+ * - Real Docker environment sandboxing
18
+ * - Multi-agent coordination (future enhancement)
19
+ */
20
+ import { z } from 'zod';
21
+ // ============================================================================
22
+ // Task Types
23
+ // ============================================================================
24
+ export const BenchmarkTaskSchema = z.object({
25
+ id: z.string(),
26
+ name: z.string(),
27
+ description: z.string(),
28
+ instruction: z.string(),
29
+ difficulty: z.enum(['easy', 'medium', 'hard']),
30
+ category: z.enum(['memory', 'coordination', 'code-quality', 'performance', 'testing']),
31
+ // Verification: checks if agent solved the task correctly
32
+ verify: z.function().returns(z.promise(z.object({
33
+ success: z.boolean(),
34
+ details: z.record(z.any()).optional(),
35
+ }))),
36
+ // Estimated time to complete
37
+ estimatedMinutes: z.number().optional(),
38
+ });
39
+ // ============================================================================
40
+ // Agent Types
41
+ // ============================================================================
42
+ export const AgentExecutionSchema = z.object({
43
+ taskId: z.string(),
44
+ agent: z.string(),
45
+ startTime: z.number(),
46
+ endTime: z.number(),
47
+ durationMs: z.number(),
48
+ success: z.boolean(),
49
+ attempts: z.number(),
50
+ memoryQueries: z.number().optional(),
51
+ tokensUsed: z.number().optional(),
52
+ errors: z.array(z.string()),
53
+ });
54
+ // ============================================================================
55
+ // Benchmark Result Types
56
+ // ============================================================================
57
+ export const BenchmarkResultSchema = z.object({
58
+ taskId: z.string(),
59
+ taskName: z.string(),
60
+ results: z.array(AgentExecutionSchema),
61
+ summary: z.object({
62
+ uamSuccessRate: z.number(),
63
+ naiveSuccessRate: z.number(),
64
+ uamAvgDuration: z.number(), // in seconds
65
+ naiveAvgDuration: z.number(), // in seconds
66
+ improvement: z.object({
67
+ successDelta: z.number(), // percentage points
68
+ speedup: z.number(), // ratio >1 means UAM is faster
69
+ memoryQueries: z.number(),
70
+ }),
71
+ }),
72
+ });
73
+ export const OverallBenchmarkStatsSchema = z.object({
74
+ totalTasks: z.number(),
75
+ uamSuccess: z.number(),
76
+ naiveSuccess: z.number(),
77
+ uamSuccessRate: z.number(),
78
+ naiveSuccessRate: z.number(),
79
+ uamAvgDuration: z.number(),
80
+ naiveAvgDuration: z.number(),
81
+ overallSpeedup: z.number(),
82
+ byDifficulty: z.record(z.object({
83
+ count: z.number(),
84
+ uamSuccess: z.number(),
85
+ naiveSuccess: z.number(),
86
+ })),
87
+ byCategory: z.record(z.object({
88
+ count: z.number(),
89
+ uamSuccess: z.number(),
90
+ naiveSuccess: z.number(),
91
+ })),
92
+ });
93
+ // ============================================================================
94
+ // Benchmark Configuration
95
+ // ============================================================================
96
+ export const BenchmarkConfigSchema = z.object({
97
+ maxAttempts: z.number().default(3),
98
+ timeoutMs: z.number().default(300000), // 5 minutes per task
99
+ agents: z.array(z.string()).default(['uam-agent', 'naive-agent']),
100
+ memoryEnabled: z.boolean().default(true),
101
+ verbose: z.boolean().default(false),
102
+ outputDir: z.string().default('./benchmarks/results'),
103
+ });
104
+ //# sourceMappingURL=benchmark.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"benchmark.js","sourceRoot":"","sources":["../../src/benchmarks/benchmark.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;GAkBG;AAEH,OAAO,EAAE,CAAC,EAAE,MAAM,KAAK,CAAC;AAExB,+EAA+E;AAC/E,aAAa;AACb,+EAA+E;AAE/E,MAAM,CAAC,MAAM,mBAAmB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC1C,EAAE,EAAE,CAAC,CAAC,MAAM,EAAE;IACd,IAAI,EAAE,CAAC,CAAC,MAAM,EAAE;IAChB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE;IACvB,UAAU,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,MAAM,EAAE,QAAQ,EAAE,MAAM,CAAC,CAAC;IAC9C,QAAQ,EAAE,CAAC,CAAC,IAAI,CAAC,CAAC,QAAQ,EAAE,cAAc,EAAE,cAAc,EAAE,aAAa,EAAE,SAAS,CAAC,CAAC;IACtF,0DAA0D;IAC1D,MAAM,EAAE,CAAC,CAAC,QAAQ,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9C,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE;QACpB,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,GAAG,EAAE,CAAC,CAAC,QAAQ,EAAE;KACtC,CAAC,CAAC,CAAC;IACJ,6BAA6B;IAC7B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;CACxC,CAAC,CAAC;AAIH,+EAA+E;AAC/E,cAAc;AACd,+EAA+E;AAE/E,MAAM,CAAC,MAAM,oBAAoB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC3C,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;IACjB,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE;IACrB,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE;IACnB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE;IACpB,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE;IACpB,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACpC,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,QAAQ,EAAE;IACjC,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC;CAC5B,CAAC,CAAC;AAIH,+EAA+E;AAC/E,yBAAyB;AACzB,+EAA+E;AAE/E,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,MAAM,EAAE,CAAC,CAAC,MAAM,EAAE;IAClB,QAAQ,EAAE,CAAC,CAAC,MAAM,EAAE;IACpB,OAAO,EAAE,CAAC,CAAC,KAAK,CAAC,oBAAoB,CAAC;IACtC,OAAO,EAAE,CAAC,CAAC,MAAM,CAAC;QAChB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;QAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;QAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,aAAa;QACzC,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,aAAa;QAC3C,WAAW,EAAE,CAAC,CAAC,MAAM,CAAC;YACpB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,oBAAoB;YAC9C,OAAO,EAAE,CAAC,CAAC,MAAM,EAAE,EAAE,+BAA+B;YACpD,aAAa,EAAE,CAAC,CAAC,MAAM,EAAE;SAC1B,CAAC;KACH,CAAC;CACH,CAAC,CAAC;AAIH,MAAM,CAAC,MAAM,2BAA2B,GAAG,CAAC,CAAC,MAAM,CAAC;IAClD,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;IACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;IACxB,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;IAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,gBAAgB,EAAE,CAAC,CAAC,MAAM,EAAE;IAC5B,cAAc,EAAE,CAAC,CAAC,MAAM,EAAE;IAC1B,YAAY,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC9B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;QACjB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;QACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;KACzB,CAAC,CAAC;IACH,UAAU,EAAE,CAAC,CAAC,MAAM,CAAC,CAAC,CAAC,MAAM,CAAC;QAC5B,KAAK,EAAE,CAAC,CAAC,MAAM,EAAE;QACjB,UAAU,EAAE,CAAC,CAAC,MAAM,EAAE;QACtB,YAAY,EAAE,CAAC,CAAC,MAAM,EAAE;KACzB,CAAC,CAAC;CACJ,CAAC,CAAC;AAIH,+EAA+E;AAC/E,0BAA0B;AAC1B,+EAA+E;AAE/E,MAAM,CAAC,MAAM,qBAAqB,GAAG,CAAC,CAAC,MAAM,CAAC;IAC5C,WAAW,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,CAAC,CAAC;IAClC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,MAAM,CAAC,EAAE,qBAAqB;IAC5D,MAAM,EAAE,CAAC,CAAC,KAAK,CAAC,CAAC,CAAC,MAAM,EAAE,CAAC,CAAC,OAAO,CAAC,CAAC,WAAW,EAAE,aAAa,CAAC,CAAC;IACjE,aAAa,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,IAAI,CAAC;IACxC,OAAO,EAAE,CAAC,CAAC,OAAO,EAAE,CAAC,OAAO,CAAC,KAAK,CAAC;IACnC,SAAS,EAAE,CAAC,CAAC,MAAM,EAAE,CAAC,OAAO,CAAC,sBAAsB,CAAC;CACtD,CAAC,CAAC"}
@@ -0,0 +1,41 @@
1
+ /**
2
+ * Execution Verification System for UAM Benchmarks
3
+ *
4
+ * Provides real code execution and verification instead of just pattern matching.
5
+ * Runs generated code in isolated environments and validates output.
6
+ */
7
+ export interface TestCase {
8
+ input: string;
9
+ expectedOutput: string;
10
+ description?: string;
11
+ }
12
+ export interface VerificationResult {
13
+ success: boolean;
14
+ executionSucceeded: boolean;
15
+ testsRun: number;
16
+ testsPassed: number;
17
+ errors: string[];
18
+ output: string;
19
+ executionTimeMs: number;
20
+ }
21
+ export interface TaskVerificationConfig {
22
+ language: 'typescript' | 'javascript' | 'python' | 'shell';
23
+ setupCommands?: string[];
24
+ testCases: TestCase[];
25
+ expectedPatterns?: string[];
26
+ timeout?: number;
27
+ requiresExecution?: boolean;
28
+ }
29
+ /**
30
+ * Verify generated code by executing it
31
+ */
32
+ export declare function verifyCodeExecution(code: string, config: TaskVerificationConfig): Promise<VerificationResult>;
33
+ /**
34
+ * Enhanced task verification configurations for benchmark tasks
35
+ */
36
+ export declare const TASK_VERIFICATION_CONFIGS: Record<string, TaskVerificationConfig>;
37
+ /**
38
+ * Verify a benchmark task result
39
+ */
40
+ export declare function verifyBenchmarkTask(taskId: string, generatedCode: string): Promise<VerificationResult>;
41
+ //# sourceMappingURL=execution-verifier.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"execution-verifier.d.ts","sourceRoot":"","sources":["../../src/benchmarks/execution-verifier.ts"],"names":[],"mappings":"AAAA;;;;;GAKG;AAOH,MAAM,WAAW,QAAQ;IACvB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,EAAE,MAAM,CAAC;IACvB,WAAW,CAAC,EAAE,MAAM,CAAC;CACtB;AAED,MAAM,WAAW,kBAAkB;IACjC,OAAO,EAAE,OAAO,CAAC;IACjB,kBAAkB,EAAE,OAAO,CAAC;IAC5B,QAAQ,EAAE,MAAM,CAAC;IACjB,WAAW,EAAE,MAAM,CAAC;IACpB,MAAM,EAAE,MAAM,EAAE,CAAC;IACjB,MAAM,EAAE,MAAM,CAAC;IACf,eAAe,EAAE,MAAM,CAAC;CACzB;AAED,MAAM,WAAW,sBAAsB;IACrC,QAAQ,EAAE,YAAY,GAAG,YAAY,GAAG,QAAQ,GAAG,OAAO,CAAC;IAC3D,aAAa,CAAC,EAAE,MAAM,EAAE,CAAC;IACzB,SAAS,EAAE,QAAQ,EAAE,CAAC;IACtB,gBAAgB,CAAC,EAAE,MAAM,EAAE,CAAC;IAC5B,OAAO,CAAC,EAAE,MAAM,CAAC;IACjB,iBAAiB,CAAC,EAAE,OAAO,CAAC;CAC7B;AA0ED;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,IAAI,EAAE,MAAM,EACZ,MAAM,EAAE,sBAAsB,GAC7B,OAAO,CAAC,kBAAkB,CAAC,CAyG7B;AAkGD;;GAEG;AACH,eAAO,MAAM,yBAAyB,EAAE,MAAM,CAAC,MAAM,EAAE,sBAAsB,CA4E5E,CAAC;AAEF;;GAEG;AACH,wBAAsB,mBAAmB,CACvC,MAAM,EAAE,MAAM,EACd,aAAa,EAAE,MAAM,GACpB,OAAO,CAAC,kBAAkB,CAAC,CAiB7B"}