beercan 0.6.13 → 0.6.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. package/README.md +39 -1
  2. package/dist/cli.js +199 -0
  3. package/dist/cli.js.map +1 -1
  4. package/dist/core/roles.js +1 -1
  5. package/dist/core/roles.js.map +1 -1
  6. package/dist/index.d.ts +30 -0
  7. package/dist/index.d.ts.map +1 -1
  8. package/dist/index.js +45 -0
  9. package/dist/index.js.map +1 -1
  10. package/dist/skills/index.d.ts.map +1 -1
  11. package/dist/skills/index.js +44 -0
  12. package/dist/skills/index.js.map +1 -1
  13. package/dist/tools/builtin/email.d.ts +5 -0
  14. package/dist/tools/builtin/email.d.ts.map +1 -0
  15. package/dist/tools/builtin/email.js +171 -0
  16. package/dist/tools/builtin/email.js.map +1 -0
  17. package/dist/training/curriculum.d.ts +4 -0
  18. package/dist/training/curriculum.d.ts.map +1 -0
  19. package/dist/training/curriculum.js +512 -0
  20. package/dist/training/curriculum.js.map +1 -0
  21. package/dist/training/evaluator.d.ts +21 -0
  22. package/dist/training/evaluator.d.ts.map +1 -0
  23. package/dist/training/evaluator.js +163 -0
  24. package/dist/training/evaluator.js.map +1 -0
  25. package/dist/training/exporter.d.ts +35 -0
  26. package/dist/training/exporter.d.ts.map +1 -0
  27. package/dist/training/exporter.js +377 -0
  28. package/dist/training/exporter.js.map +1 -0
  29. package/dist/training/index.d.ts +5 -0
  30. package/dist/training/index.d.ts.map +1 -0
  31. package/dist/training/index.js +5 -0
  32. package/dist/training/index.js.map +1 -0
  33. package/dist/training/sandbox-manager.d.ts +58 -0
  34. package/dist/training/sandbox-manager.d.ts.map +1 -0
  35. package/dist/training/sandbox-manager.js +416 -0
  36. package/dist/training/sandbox-manager.js.map +1 -0
  37. package/dist/training/types.d.ts +790 -0
  38. package/dist/training/types.d.ts.map +1 -0
  39. package/dist/training/types.js +154 -0
  40. package/dist/training/types.js.map +1 -0
  41. package/package.json +1 -1
@@ -0,0 +1,512 @@
1
+ // ── Training Curriculum ───────────────────────────────────────
2
+ // 25 scenarios across 4 difficulty levels, testing key agent capabilities.
3
+ // ── Novice (6 scenarios) ─────────────────────────────────────
4
/**
 * Novice tier: six introductory scenarios covering memory, basic tools,
 * reasoning and planning.
 *
 * Every entry has the same field layout: an `id`, a display `name`, the
 * `difficulty` / `category` tags, the `goal` prompt, a human-readable
 * `evaluationCriteria`, an `evaluatorType` with its `evaluatorConfig`,
 * the `teaches` / `requiredTools` lists, `prerequisites` (ids of other
 * scenarios), `maxAttempts`, and `timeoutMs`.
 *
 * NOTE(review): how each evaluatorType consumes its config is defined
 * outside this file — confirm against the evaluator before changing patterns.
 */
const NOVICE_SCENARIOS = [
    {
        id: "memory-hello",
        name: "Memory Hello World",
        difficulty: "novice",
        category: "memory",
        goal: "Use the memory_store tool to store a fact about yourself: 'I am a BeerCan agent learning to use memory tools.' Then immediately use memory_search to retrieve it and report back what you retrieved.",
        evaluationCriteria: "The agent stored a memory and then retrieved it, reporting the retrieved content.",
        evaluatorType: "contains",
        evaluatorConfig: { pattern: "retrieved", passThreshold: 0.5 },
        teaches: ["memory_store", "memory_search", "basic memory workflow"],
        requiredTools: ["memory_store", "memory_search"],
        prerequisites: [],
        maxAttempts: 3,
        timeoutMs: 120_000,
    },
    {
        id: "file-explorer",
        name: "File Explorer",
        difficulty: "novice",
        category: "tools",
        goal: "Use the list_directory tool to list the current working directory and report what files and folders you find there.",
        evaluationCriteria: "The agent used list_directory and reported the contents of the current directory.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use list_directory (or equivalent), and does the response contain a listing of directory contents? Score high if they reported actual directory contents.",
            passThreshold: 0.6,
        },
        teaches: ["list_directory", "filesystem exploration"],
        requiredTools: ["list_directory"],
        prerequisites: [],
        maxAttempts: 3,
        timeoutMs: 60_000,
    },
    {
        id: "web-basics",
        name: "Web Basics",
        difficulty: "novice",
        category: "tools",
        goal: "Fetch the webpage at https://example.com using web_fetch and write a 2-3 sentence summary of what the page is about.",
        evaluationCriteria: "The agent fetched example.com and provided an accurate summary of its contents.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent fetch the URL and provide a reasonable summary mentioning it is a sample/example domain? Score high if the summary is coherent and mentions key details from the page.",
            passThreshold: 0.6,
        },
        teaches: ["web_fetch", "content summarization"],
        requiredTools: ["web_fetch"],
        prerequisites: [],
        maxAttempts: 3,
        timeoutMs: 120_000,
    },
    {
        id: "memory-chain",
        name: "Memory Chain",
        difficulty: "novice",
        category: "memory",
        goal: "Store the number 42 in memory with the title 'my_number'. Then retrieve it, double it, and report the result as a number.",
        evaluationCriteria: "The agent stored a number, retrieved it, doubled it, and reports 84.",
        evaluatorType: "regex",
        // Word boundaries keep "84" from matching inside longer digit or
        // identifier runs (e.g. "1840" or a memory id such as "a842f"),
        // which would otherwise pass an answer that never reported 84.
        evaluatorConfig: { pattern: "\\b84\\b", passThreshold: 0.5 },
        teaches: ["memory workflow", "arithmetic", "memory_store", "memory_search"],
        requiredTools: ["memory_store", "memory_search"],
        prerequisites: ["memory-hello"],
        maxAttempts: 3,
        timeoutMs: 120_000,
    },
    {
        id: "self-reflection",
        name: "Self Reflection",
        difficulty: "novice",
        category: "reasoning",
        goal: "Describe your capabilities as an AI agent: what tools are available to you, what kinds of tasks you can perform, and what your limitations are. Be specific and accurate.",
        evaluationCriteria: "The agent accurately describes its capabilities, available tools, and limitations.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Does the response include specific tool names (like memory_store, web_fetch, list_directory, etc.)? Does it accurately describe agent capabilities? Does it mention limitations? Score high for accurate, specific self-assessment.",
            passThreshold: 0.6,
        },
        teaches: ["self-awareness", "tool enumeration", "capability assessment"],
        requiredTools: [],
        prerequisites: [],
        maxAttempts: 3,
        timeoutMs: 90_000,
    },
    {
        id: "simple-plan",
        name: "Simple Plan",
        difficulty: "novice",
        category: "planning",
        goal: "Break down the task 'make a cup of tea' into ordered, logical steps. Write these steps to a file called 'tea-steps.txt' in the current directory.",
        evaluationCriteria: "The agent created a file with logical, ordered steps for making tea.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent write a file? Do the steps make logical sense for making tea (boil water, steep, etc.)? Are they ordered correctly? Score high for a complete, logical step-by-step plan written to a file.",
            passThreshold: 0.6,
        },
        teaches: ["planning", "write_file", "task decomposition"],
        requiredTools: ["write_file"],
        prerequisites: [],
        maxAttempts: 3,
        timeoutMs: 90_000,
    },
];
108
+ // ── Apprentice (6 scenarios) ──────────────────────────────────
109
/**
 * Apprentice tier: six scenarios that build on the novice ones
 * (debugging, web research, memory persistence, skill creation,
 * tool selection and error recovery).
 *
 * Each scenario is declared as a named constant and the tier is returned
 * as an ordered array; the field order inside every object is kept
 * uniform across scenarios.
 */
const APPRENTICE_SCENARIOS = (() => {
    const TIER = "apprentice";
    const ATTEMPT_LIMIT = 3;
    const SECOND_MS = 1_000;

    const debugScript = {
        id: "debug-script",
        name: "Debug a Script",
        difficulty: TIER,
        category: "coding",
        // Multi-line prompt assembled line by line; blank entries are blank lines.
        goal: [
            "First, write this JavaScript file to 'buggy.js':",
            "",
            "```javascript",
            "function addNumbers(a, b) {",
            "return a - b; // BUG: should be addition",
            "}",
            "",
            "const result = addNumbers(5, 3);",
            "console.log('5 + 3 =', result); // Should print 8, not 2",
            "```",
            "",
            "Then find the bug, fix it, and write the corrected version to 'fixed.js'. Report what the bug was and what you changed.",
        ].join("\n"),
        evaluationCriteria: "The agent identified the subtraction bug, fixed it to addition, and wrote the corrected file.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent correctly identify the bug (using subtraction instead of addition)? Did they write a fixed version? Is the fix correct (changing - to +)? Score high for correct identification and fix.",
            passThreshold: 0.7,
        },
        teaches: ["debugging", "read_file", "write_file", "code analysis"],
        requiredTools: ["write_file", "read_file"],
        prerequisites: ["simple-plan"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 180 * SECOND_MS,
    };

    const researchSynthesize = {
        id: "research-synthesize",
        name: "Research and Synthesize",
        difficulty: TIER,
        category: "reasoning",
        goal: "Research what 'retrieval augmented generation' (RAG) is by fetching information from the web using web_fetch. Then write a 3-paragraph summary to 'rag-summary.txt' covering: (1) what RAG is, (2) how it works, (3) why it is useful.",
        evaluationCriteria: "The agent fetched web content, synthesized it, and wrote a 3-paragraph summary of RAG.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use web_fetch? Did it write a summary file? Does the summary accurately explain RAG (combining retrieval with LLM generation)? Are there 3 distinct paragraphs covering the required topics? Score high for accurate, well-structured content.",
            passThreshold: 0.65,
        },
        teaches: ["web research", "synthesis", "web_fetch", "write_file", "structured writing"],
        requiredTools: ["web_fetch", "write_file"],
        prerequisites: ["web-basics"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const persistentMemory = {
        id: "persistent-memory",
        name: "Persistent Memory",
        difficulty: TIER,
        category: "memory",
        goal: "Store these 3 facts in memory: (1) 'Paris is the capital of France', (2) 'Python was created in 1991', (3) 'The speed of light is approximately 299,792 km/s'. Then search your memory for 'capital of France' and use that retrieved fact to answer: What is the capital of France?",
        evaluationCriteria: "The agent stored 3 facts and then retrieved and used one to answer the question.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent store the facts using memory_store? Did they use memory_search to retrieve the France fact? Does the final answer correctly state Paris is the capital of France, demonstrating memory retrieval was used? Score high for demonstrating the full memory store-then-retrieve workflow.",
            passThreshold: 0.7,
        },
        teaches: ["memory persistence", "memory_store", "memory_search", "fact retrieval"],
        requiredTools: ["memory_store", "memory_search"],
        prerequisites: ["memory-chain"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 180 * SECOND_MS,
    };

    const createFirstSkill = {
        id: "create-first-skill",
        name: "Create Your First Skill",
        difficulty: TIER,
        category: "self_improvement",
        goal: "Review the work you have done so far. Identify a useful pattern or workflow (such as 'store facts then retrieve them' or 'research then summarize'). Create a skill using the create_skill tool that captures this pattern with clear instructions for future use.",
        evaluationCriteria: "The agent created a skill that captures a useful pattern from their work.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use the create_skill tool? Does the skill capture a useful, reusable pattern from prior work (not just a trivial or empty skill)? Score high for a skill with clear instructions and meaningful triggers.",
            passThreshold: 0.6,
        },
        teaches: ["create_skill", "meta-learning", "self-improvement", "skill creation"],
        requiredTools: ["create_skill"],
        prerequisites: ["persistent-memory"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 180 * SECOND_MS,
    };

    const toolSelection = {
        id: "tool-selection",
        name: "Tool Selection",
        difficulty: TIER,
        category: "reasoning",
        goal: "For each of these 5 tasks, identify which built-in tool is BEST suited and explain why: (1) Reading a local file, (2) Getting current date and time, (3) Searching through past memories, (4) Running a shell command, (5) Fetching a webpage. Write your answers to 'tool-selection.txt'.",
        evaluationCriteria: "The agent correctly identified the best tool for each task and explained the reasoning.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent correctly identify: (1) read_file, (2) get_datetime, (3) memory_search, (4) exec_command, (5) web_fetch? Does the explanation show understanding of when to use each tool? Score high for correct tool selection with good reasoning.",
            passThreshold: 0.65,
        },
        teaches: ["tool knowledge", "tool selection", "reasoning about capabilities"],
        requiredTools: ["write_file"],
        prerequisites: ["self-reflection"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 120 * SECOND_MS,
    };

    const errorRecovery = {
        id: "error-recovery",
        name: "Error Recovery",
        difficulty: TIER,
        category: "reasoning",
        goal: "Attempt to read a file called 'this-file-does-not-exist-12345.txt'. When you encounter an error, handle it gracefully. Then report: what error occurred, how you handled it, and what you would do differently to avoid this error in production.",
        evaluationCriteria: "The agent attempted the read, handled the error gracefully, and explained error handling.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent attempt to read the file? Did they handle the error without crashing/giving up? Does their explanation show understanding of error handling (checking file existence first, try/catch patterns, etc.)? Score high for graceful handling and good explanation.",
            passThreshold: 0.65,
        },
        teaches: ["error handling", "graceful degradation", "read_file", "defensive programming"],
        requiredTools: ["read_file"],
        prerequisites: ["file-explorer"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 120 * SECOND_MS,
    };

    return [
        debugScript,
        researchSynthesize,
        persistentMemory,
        createFirstSkill,
        toolSelection,
        errorRecovery,
    ];
})();
230
+ // ── Journeyman (7 scenarios) ──────────────────────────────────
231
/**
 * Journeyman tier: seven scenarios covering tool authoring, bloop
 * spawning, memory search, knowledge graphs, self-improvement, a JSON
 * data pipeline and the verify-and-integrate flow.
 *
 * Each scenario is declared as a named constant and the tier is returned
 * as an ordered array; the field order inside every object is kept
 * uniform across scenarios.
 */
const JOURNEYMAN_SCENARIOS = (() => {
    const TIER = "journeyman";
    const ATTEMPT_LIMIT = 3;
    const SECOND_MS = 1_000;

    const buildATool = {
        id: "build-a-tool",
        name: "Build a Tool",
        difficulty: TIER,
        category: "coding",
        // Multi-line prompt assembled line by line; blank entries are blank lines.
        goal: [
            "Create a custom JavaScript tool file that capitalizes text. Write the file to the current directory as 'capitalize-tool.js'. The tool should:",
            "- Be named 'capitalize_text'",
            "- Accept a 'text' parameter (string)",
            "- Return the text with the first letter of each word capitalized",
            "- Follow the BeerCan tool export pattern: export const definition = {...}; export async function handler({text}) {...}",
            "",
            "After writing the file, verify it contains a valid export statement.",
        ].join("\n"),
        evaluationCriteria: "The agent created a valid BeerCan tool file with proper exports and capitalization logic.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent write a JavaScript file? Does it export a 'definition' object with name, description, and inputSchema? Does it export a 'handler' function? Does the logic capitalize text? Score high for a complete, valid tool definition.",
            passThreshold: 0.7,
        },
        teaches: ["tool creation", "write_file", "JavaScript", "BeerCan tool pattern"],
        requiredTools: ["write_file", "read_file"],
        prerequisites: ["debug-script"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const multiStepPlan = {
        id: "multi-step-plan",
        name: "Multi-Step Plan with Spawning",
        difficulty: TIER,
        category: "planning",
        goal: "Use spawn_bloop to break down a research task into 2 parallel sub-tasks. Spawn one bloop to research 'what is machine learning' and another to research 'what is deep learning'. After both complete, synthesize their results into a comparison. Write the synthesis to 'ml-vs-dl.txt'.",
        evaluationCriteria: "The agent spawned child bloops for parallel research and synthesized the results.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use spawn_bloop to create child bloops? Were at least 2 research tasks spawned? Was a synthesis file created? Does the synthesis compare ML vs DL? Score high for proper use of spawning with coherent synthesis.",
            passThreshold: 0.65,
        },
        teaches: ["spawn_bloop", "parallel execution", "synthesis", "task decomposition"],
        requiredTools: ["spawn_bloop", "get_bloop_result", "write_file"],
        prerequisites: ["research-synthesize"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 600 * SECOND_MS,
    };

    const crossProjectMemory = {
        id: "cross-project-memory",
        name: "Cross-Project Memory",
        difficulty: TIER,
        category: "memory",
        goal: "Store a memory with the title 'cross-project-test' and content 'This memory was created to test cross-project memory search'. Then use memory_search to search for 'cross-project-test' and confirm you can find it. Report the memory ID and confirm the content matches.",
        evaluationCriteria: "The agent stored a memory and then successfully retrieved it using search.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent store a memory? Did they search for it using memory_search? Did they report the memory ID? Did they confirm the content matches? Score high for demonstrating the complete memory store-search-verify workflow.",
            passThreshold: 0.7,
        },
        teaches: ["memory_store", "memory_search", "memory verification", "cross-project"],
        requiredTools: ["memory_store", "memory_search"],
        prerequisites: ["persistent-memory"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 180 * SECOND_MS,
    };

    const knowledgeGraph = {
        id: "knowledge-graph",
        name: "Knowledge Graph",
        difficulty: TIER,
        category: "memory",
        goal: "Use memory_link to create a small knowledge graph about programming languages: create entities for 'Python', 'JavaScript', and 'TypeScript'. Create edges: Python 'relates_to' JavaScript (both are popular languages), TypeScript 'depends_on' JavaScript (TypeScript is a superset). Then use memory_query_graph to traverse from TypeScript and report what you find.",
        evaluationCriteria: "The agent created KG entities and edges, then traversed the graph.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use memory_link to create entities? Were edges created between entities? Did they use memory_query_graph to traverse? Does the traversal result show connected nodes? Score high for creating and traversing a real knowledge graph.",
            passThreshold: 0.65,
        },
        teaches: ["memory_link", "memory_query_graph", "knowledge graphs", "entity relationships"],
        requiredTools: ["memory_link", "memory_query_graph"],
        prerequisites: ["cross-project-memory"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const selfImprove = {
        id: "self-improve",
        name: "Self-Improvement",
        difficulty: TIER,
        category: "self_improvement",
        goal: "Use search_previous_attempts to look for patterns in your past bloop results. Identify one recurring pattern or improvement opportunity. Then either update an existing skill or create a new one using create_skill or update_skill that captures this pattern.",
        evaluationCriteria: "The agent searched past results, identified a pattern, and created/updated a skill.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent use search_previous_attempts? Did they identify a pattern from past work? Did they create or update a skill? Does the skill capture a useful, recurring pattern? Score high for demonstrating genuine self-improvement based on past performance.",
            passThreshold: 0.65,
        },
        teaches: ["search_previous_attempts", "create_skill", "update_skill", "meta-learning", "self-reflection"],
        requiredTools: ["search_previous_attempts", "create_skill"],
        prerequisites: ["create-first-skill"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const dataPipeline = {
        id: "data-pipeline",
        name: "Data Pipeline",
        difficulty: TIER,
        category: "coding",
        // Multi-line prompt assembled line by line; blank entries are blank lines.
        goal: [
            "Create a JSON data pipeline. First, write this JSON file to 'input.json':",
            "[",
            '{"name": "Alice", "score": 85},',
            '{"name": "Bob", "score": 92},',
            '{"name": "Charlie", "score": 78},',
            '{"name": "Diana", "score": 95}',
            "]",
            "",
            "Then read it, transform it by adding a 'grade' field ('A' for score >= 90, 'B' for 80-89, 'C' for below 80), and write the transformed data to 'output.json'. Report the number of records processed.",
        ].join("\n"),
        evaluationCriteria: "The agent read JSON data, transformed it correctly, and wrote the output.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent write input.json? Did they read and process it? Did they add grade fields correctly (Alice=B, Bob=A, Charlie=C, Diana=A)? Did they write output.json? Score high for correct data transformation.",
            passThreshold: 0.7,
        },
        teaches: ["data transformation", "read_file", "write_file", "JSON processing"],
        requiredTools: ["write_file", "read_file"],
        prerequisites: ["debug-script"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const verifyIntegrate = {
        id: "verify-integrate",
        name: "Verify and Integrate",
        difficulty: TIER,
        category: "coding",
        goal: "Use the verify_and_integrate tool to build and validate a simple tool. First create a tool file called 'greet-tool.js' that exports a 'greet_user' tool which takes a 'name' parameter and returns 'Hello, {name}!'. Then use verify_and_integrate to validate and register it.",
        evaluationCriteria: "The agent created a tool file and used verify_and_integrate to register it.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent write a tool file with a valid tool definition? Did they use verify_and_integrate? Was there an attempt to register the tool? Score high for using the full build-verify-integrate pipeline.",
            passThreshold: 0.65,
        },
        teaches: ["verify_and_integrate", "tool validation", "tool registration", "build pipeline"],
        requiredTools: ["write_file", "verify_and_integrate"],
        prerequisites: ["build-a-tool"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 600 * SECOND_MS,
    };

    return [
        buildATool,
        multiStepPlan,
        crossProjectMemory,
        knowledgeGraph,
        selfImprove,
        dataPipeline,
        verifyIntegrate,
    ];
})();
373
+ // ── Expert (6 scenarios) ─────────────────────────────────────
374
/**
 * Expert tier: six scenarios covering system architecture, teaching,
 * meta-optimization, concurrent bloops, self-scheduling and a capstone
 * that combines the earlier tool families.
 *
 * Each scenario is declared as a named constant and the tier is returned
 * as an ordered array; the field order inside every object is kept
 * uniform across scenarios.
 */
const EXPERT_SCENARIOS = (() => {
    const TIER = "expert";
    const ATTEMPT_LIMIT = 3;
    const SECOND_MS = 1_000;

    const architectSystem = {
        id: "architect-system",
        name: "Architect a System",
        difficulty: TIER,
        category: "planning",
        goal: "Design and partially implement a 3-component system: (1) a DataCollector component that reads files from a directory, (2) a DataProcessor component that transforms the data, (3) a DataReporter component that formats and outputs results. Write: architecture.md describing the design with clear interfaces, and stub implementation files for each component (collector.js, processor.js, reporter.js) with documented interfaces.",
        evaluationCriteria: "The agent created a coherent 3-component architecture with documented interfaces.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent create an architecture document? Are all 3 components described with clear interfaces? Were stub files created for each? Do the interfaces make sense together (outputs of one feed into the next)? Score high for a coherent, well-documented architecture with implementation stubs.",
            passThreshold: 0.7,
        },
        teaches: ["system design", "architecture", "interfaces", "write_file", "documentation"],
        requiredTools: ["write_file"],
        prerequisites: ["data-pipeline", "verify-integrate"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 600 * SECOND_MS,
    };

    const teachStudent = {
        id: "teach-student",
        name: "Teach a Student",
        difficulty: TIER,
        category: "creativity",
        goal: "Write detailed, step-by-step instructions for how a junior agent should accomplish this complex task: 'Research a topic using web_fetch, store key facts in memory, build a knowledge graph of the topic, and create a skill for future use'. Write the instructions to 'teaching-guide.md'. The guide should be so clear that a junior agent with no prior knowledge could follow it.",
        evaluationCriteria: "The agent wrote clear, complete instructions covering all 4 major steps.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent write a guide file? Does it cover all 4 steps (web research, memory storage, knowledge graph, skill creation)? Are instructions specific enough to follow (mentioning tool names, parameters)? Would a junior agent understand? Score high for complete, actionable, well-structured instructions.",
            passThreshold: 0.7,
        },
        teaches: ["teaching", "documentation", "knowledge transfer", "write_file"],
        requiredTools: ["write_file"],
        prerequisites: ["knowledge-graph", "self-improve"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 300 * SECOND_MS,
    };

    const metaOptimization = {
        id: "meta-optimization",
        name: "Meta-Optimization",
        difficulty: TIER,
        category: "self_improvement",
        goal: "Perform a deep meta-cognitive analysis: (1) Search your memories for patterns and insights, (2) Identify 3+ recurring patterns or optimization opportunities, (3) Consolidate related memories that say the same thing, (4) Create or update a comprehensive skill that captures your most important learnings. Write a 'meta-analysis.md' documenting your findings.",
        evaluationCriteria: "The agent performed memory analysis, identified patterns, and created an optimized skill.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent search memories? Were patterns identified? Was a skill created or updated? Was a meta-analysis file written? Does the analysis show genuine meta-cognitive insight (patterns across multiple bloops, not just current state)? Score high for demonstrating real meta-cognition.",
            passThreshold: 0.7,
        },
        teaches: ["meta-cognition", "memory_search", "create_skill", "pattern recognition", "self-optimization"],
        requiredTools: ["memory_search", "create_skill", "write_file"],
        prerequisites: ["self-improve"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 600 * SECOND_MS,
    };

    const concurrentAgents = {
        id: "concurrent-agents",
        name: "Concurrent Agents",
        difficulty: TIER,
        category: "planning",
        goal: "Use spawn_bloop to run 3 parallel research tasks simultaneously: (1) 'What are the key principles of functional programming', (2) 'What are the key principles of object-oriented programming', (3) 'What are the key principles of procedural programming'. Wait for all to complete using get_bloop_result. Then synthesize the results into a comprehensive comparison file 'paradigms-comparison.md'.",
        evaluationCriteria: "The agent spawned 3 parallel research bloops and synthesized their results.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent spawn 3 child bloops? Did it use get_bloop_result to collect results? Was a synthesis file created? Does it compare all 3 paradigms? Score high for actual parallel execution and coherent synthesis.",
            passThreshold: 0.7,
        },
        teaches: ["parallel execution", "spawn_bloop", "get_bloop_result", "synthesis", "concurrent agents"],
        requiredTools: ["spawn_bloop", "get_bloop_result", "write_file"],
        prerequisites: ["multi-step-plan"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 900 * SECOND_MS,
    };

    const selfSchedule = {
        id: "self-schedule",
        name: "Self-Schedule",
        difficulty: TIER,
        category: "self_improvement",
        goal: "Create a heartbeat monitoring schedule for yourself. Use create_schedule to set up a cron-based schedule that runs every hour (use '0 * * * *' as the cron expression) with the goal 'Heartbeat: check memory health, review recent bloops, identify any needed improvements'. After creating it, use list_schedules to verify it was created and report the schedule ID.",
        evaluationCriteria: "The agent created a cron schedule for self-monitoring.",
        evaluatorType: "contains",
        evaluatorConfig: { pattern: "schedule", passThreshold: 0.5 },
        teaches: ["create_schedule", "self-scheduling", "autonomous monitoring", "cron expressions"],
        requiredTools: ["create_schedule", "list_schedules"],
        prerequisites: ["meta-optimization"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 180 * SECOND_MS,
    };

    const capstone = {
        id: "capstone",
        name: "Capstone Challenge",
        difficulty: TIER,
        category: "reasoning",
        // Multi-line prompt assembled line by line; blank entries are blank lines.
        goal: [
            "Complete this real-world task that requires combining all your skills:",
            "",
            "1. Research using web_fetch: Fetch https://example.com and https://httpbin.org/json, gather content from both",
            "2. Memory: Store 2 key facts from your research",
            "3. File operations: Create a report file 'capstone-report.md' with your findings organized in sections",
            "4. Knowledge graph: Create entities for the 2 websites and link them with a 'relates_to' edge",
            "5. Skill creation: Create a skill called 'web-research-workflow' that captures the process you just followed",
            "",
            "Report completion of all 5 steps.",
        ].join("\n"),
        evaluationCriteria: "The agent completed all 5 steps: web research, memory storage, file report, knowledge graph, and skill creation.",
        evaluatorType: "llm",
        evaluatorConfig: {
            criteria: "Did the agent complete all 5 steps? (1) web_fetch used for both URLs, (2) facts stored in memory, (3) report file created, (4) KG entities and edge created, (5) skill created. Score high for completing all steps with evidence of each.",
            passThreshold: 0.65,
        },
        teaches: ["integrated workflow", "all tools", "comprehensive task completion", "multi-step execution"],
        requiredTools: ["web_fetch", "memory_store", "write_file", "memory_link", "create_skill"],
        prerequisites: ["architect-system", "concurrent-agents", "self-schedule"],
        maxAttempts: ATTEMPT_LIMIT,
        timeoutMs: 900 * SECOND_MS,
    };

    return [
        architectSystem,
        teachStudent,
        metaOptimization,
        concurrentAgents,
        selfSchedule,
        capstone,
    ];
})();
489
+ // ── Full Curriculum ───────────────────────────────────────────
490
/**
 * The complete default curriculum: every tier's scenarios in a single
 * flat array, ordered novice → apprentice → journeyman → expert.
 * A new array is built; the scenario objects themselves are shared with
 * the per-tier arrays.
 */
export const DEFAULT_CURRICULUM = [
    NOVICE_SCENARIOS,
    APPRENTICE_SCENARIOS,
    JOURNEYMAN_SCENARIOS,
    EXPERT_SCENARIOS,
].flat();
496
+ // ── Graduation Criteria ───────────────────────────────────────
497
/**
 * Graduation requirements for the training curriculum.
 *
 * - `minPassRateByLevel`: minimum fraction of scenarios to pass in each
 *   difficulty tier. Each threshold is chosen so the pass count named in
 *   its comment clears it while one fewer does not.
 * - `requiredScenarioIds`: scenario ids listed as mandatory.
 * - `minToolsCreated` / `minSkillsCreated`: both currently zero.
 *
 * NOTE(review): assumes the consumer compares with `passRate >= threshold`
 * — confirm against the evaluator.
 */
export const GRADUATION_CRITERIA = {
    minPassRateByLevel: {
        novice: 0.8, // Pass 5/6 novice scenarios (5/6 ≈ 0.833)
        // Was 0.67, which 4/6 ≈ 0.6667 does not reach, so the threshold
        // silently required 5/6 despite the documented 4/6.
        apprentice: 0.66, // Pass 4/6 apprentice scenarios
        journeyman: 0.57, // Pass 4/7 journeyman scenarios (4/7 ≈ 0.571)
        expert: 0.5, // Pass 3/6 expert scenarios
    },
    requiredScenarioIds: [
        "memory-hello", // Must understand memory basics
        "file-explorer", // Must understand filesystem tools
        "capstone", // Must complete the capstone
    ],
    minToolsCreated: 0,
    minSkillsCreated: 0,
};
512
+ //# sourceMappingURL=curriculum.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"curriculum.js","sourceRoot":"","sources":["../../src/training/curriculum.ts"],"names":[],"mappings":"AAEA,iEAAiE;AACjE,2EAA2E;AAE3E,gEAAgE;AAEhE,MAAM,gBAAgB,GAAuB;IAC3C;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,oBAAoB;QAC1B,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,sMAAsM;QAC5M,kBAAkB,EAAE,mFAAmF;QACvG,aAAa,EAAE,UAAU;QACzB,eAAe,EAAE,EAAE,OAAO,EAAE,WAAW,EAAE,aAAa,EAAE,GAAG,EAAE;QAC7D,OAAO,EAAE,CAAC,cAAc,EAAE,eAAe,EAAE,uBAAuB,CAAC;QACnE,aAAa,EAAE,CAAC,cAAc,EAAE,eAAe,CAAC;QAChD,aAAa,EAAE,EAAE;QACjB,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,eAAe;QACrB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,OAAO;QACjB,IAAI,EAAE,qHAAqH;QAC3H,kBAAkB,EAAE,mFAAmF;QACvG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,yKAAyK;YACnL,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,gBAAgB,EAAE,wBAAwB,CAAC;QACrD,aAAa,EAAE,CAAC,gBAAgB,CAAC;QACjC,aAAa,EAAE,EAAE;QACjB,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,MAAM;KAClB;IACD;QACE,EAAE,EAAE,YAAY;QAChB,IAAI,EAAE,YAAY;QAClB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,OAAO;QACjB,IAAI,EAAE,sHAAsH;QAC5H,kBAAkB,EAAE,iFAAiF;QACrG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,sLAAsL;YAChM,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,WAAW,EAAE,uBAAuB,CAAC;QAC/C,aAAa,EAAE,CAAC,WAAW,CAAC;QAC5B,aAAa,EAAE,EAAE;QACjB,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,cAAc;QACpB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,2HAA2H;QACjI,kBAAkB,EAAE,sEAAsE;QAC1F,aAAa,EAAE,OAAO;QACtB,eAAe,EAAE,EAAE,OAAO,EAAE,IAAI,EAAE,aAAa,EAAE,GAAG,EAAE;QACtD,OAAO,EAAE,CAAC,iBAAiB,EAAE,YAAY,EAAE,cAAc,EAAE,eAAe,CAAC;QAC3E,aAAa,EAAE,CAAC,cAAc,EAAE,eAAe,CAAC;QAChD,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,iBAAiB;QACrB,IAAI,EAAE,iBAAiB;QACvB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,WAAW;QACrB,IAAI,EAAE,2KAA2K;QACjL,kBAAkB,EAAE,oFAAoF;QACxG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,qOAAqO;YAC/O,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,gBAAgB,EAAE,kBAAkB,EAAE,uBAAuB,CAAC;QACxE,aAAa,EAAE,EAA
E;QACjB,aAAa,EAAE,EAAE;QACjB,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,MAAM;KAClB;IACD;QACE,EAAE,EAAE,aAAa;QACjB,IAAI,EAAE,aAAa;QACnB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,UAAU;QACpB,IAAI,EAAE,mJAAmJ;QACzJ,kBAAkB,EAAE,sEAAsE;QAC1F,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,2MAA2M;YACrN,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,UAAU,EAAE,YAAY,EAAE,oBAAoB,CAAC;QACzD,aAAa,EAAE,CAAC,YAAY,CAAC;QAC7B,aAAa,EAAE,EAAE;QACjB,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,MAAM;KAClB;CACF,CAAC;AAEF,iEAAiE;AAEjE,MAAM,oBAAoB,GAAuB;IAC/C;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,gBAAgB;QACtB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE;;;;;;;;;;;wHAW8G;QACpH,kBAAkB,EAAE,+FAA+F;QACnH,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,wMAAwM;YAClN,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,WAAW,EAAE,WAAW,EAAE,YAAY,EAAE,eAAe,CAAC;QAClE,aAAa,EAAE,CAAC,YAAY,EAAE,WAAW,CAAC;QAC1C,aAAa,EAAE,CAAC,aAAa,CAAC;QAC9B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,qBAAqB;QACzB,IAAI,EAAE,yBAAyB;QAC/B,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,WAAW;QACrB,IAAI,EAAE,wOAAwO;QAC9O,kBAAkB,EAAE,wFAAwF;QAC5G,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,8PAA8P;YACxQ,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,cAAc,EAAE,WAAW,EAAE,WAAW,EAAE,YAAY,EAAE,oBAAoB,CAAC;QACvF,aAAa,EAAE,CAAC,WAAW,EAAE,YAAY,CAAC;QAC1C,aAAa,EAAE,CAAC,YAAY,CAAC;QAC7B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,mBAAmB;QACvB,IAAI,EAAE,mBAAmB;QACzB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,sRAAsR;QAC5R,kBAAkB,EAAE,kFAAkF;QACtG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,qSAAqS;YAC/S,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,oBAAoB,EAAE,cAAc,EAAE,eAAe,EAAE,gBAAgB,CAAC;QAClF,aAAa,EAAE,CAAC,cAAc,EAAE,eAAe,CAAC;QAChD,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,oBAAoB;QACxB,IAAI,EAAE,yBAAyB;QAC/B,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,kBAAkB;QAC5B,IAAI,EAAE,oQAAoQ;QAC1Q,kBAAkB,EAAE,2EAA2E;QAC/F,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,yNAAyN;YACnO,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,cAAc,EAAE,eAAe,EAAE,kBA
AkB,EAAE,gBAAgB,CAAC;QAChF,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,aAAa,EAAE,CAAC,mBAAmB,CAAC;QACpC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,gBAAgB;QACpB,IAAI,EAAE,gBAAgB;QACtB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,WAAW;QACrB,IAAI,EAAE,4RAA4R;QAClS,kBAAkB,EAAE,yFAAyF;QAC7G,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,qPAAqP;YAC/P,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,gBAAgB,EAAE,gBAAgB,EAAE,8BAA8B,CAAC;QAC7E,aAAa,EAAE,CAAC,YAAY,CAAC;QAC7B,aAAa,EAAE,CAAC,iBAAiB,CAAC;QAClC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,gBAAgB;QACpB,IAAI,EAAE,gBAAgB;QACtB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,WAAW;QACrB,IAAI,EAAE,mPAAmP;QACzP,kBAAkB,EAAE,2FAA2F;QAC/G,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,6QAA6Q;YACvR,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,gBAAgB,EAAE,sBAAsB,EAAE,WAAW,EAAE,uBAAuB,CAAC;QACzF,aAAa,EAAE,CAAC,WAAW,CAAC;QAC5B,aAAa,EAAE,CAAC,eAAe,CAAC;QAChC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;CACF,CAAC;AAEF,iEAAiE;AAEjE,MAAM,oBAAoB,GAAuB;IAC/C;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,cAAc;QACpB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE;;;;;;qEAM2D;QACjE,kBAAkB,EAAE,2FAA2F;QAC/G,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,6OAA6O;YACvP,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,eAAe,EAAE,YAAY,EAAE,YAAY,EAAE,sBAAsB,CAAC;QAC9E,aAAa,EAAE,CAAC,YAAY,EAAE,WAAW,CAAC;QAC1C,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,iBAAiB;QACrB,IAAI,EAAE,+BAA+B;QACrC,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,UAAU;QACpB,IAAI,EAAE,0RAA0R;QAChS,kBAAkB,EAAE,mFAAmF;QACvG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,iOAAiO;YAC3O,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,aAAa,EAAE,oBAAoB,EAAE,WAAW,EAAE,oBAAoB,CAAC;QACjF,aAAa,EAAE,CAAC,aAAa,EAAE,kBAAkB,EAAE,YAAY,CAAC;QAChE,aAAa,EAAE,CAAC,qBAAqB,CAAC;QACtC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,sBAAsB;QAC1B,IAAI,EAAE,sBAAsB;QAC5B,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,4QAA4Q;QAClR,kBAAkB,EAAE,4EAA4E;QAChG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,+NAA
+N;YACzO,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,cAAc,EAAE,eAAe,EAAE,qBAAqB,EAAE,eAAe,CAAC;QAClF,aAAa,EAAE,CAAC,cAAc,EAAE,eAAe,CAAC;QAChD,aAAa,EAAE,CAAC,mBAAmB,CAAC;QACpC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,iBAAiB;QACrB,IAAI,EAAE,iBAAiB;QACvB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,0WAA0W;QAChX,kBAAkB,EAAE,oEAAoE;QACxF,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,oPAAoP;YAC9P,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,aAAa,EAAE,oBAAoB,EAAE,kBAAkB,EAAE,sBAAsB,CAAC;QAC1F,aAAa,EAAE,CAAC,aAAa,EAAE,oBAAoB,CAAC;QACpD,aAAa,EAAE,CAAC,sBAAsB,CAAC;QACvC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,cAAc;QAClB,IAAI,EAAE,kBAAkB;QACxB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,kBAAkB;QAC5B,IAAI,EAAE,kQAAkQ;QACxQ,kBAAkB,EAAE,qFAAqF;QACzG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,iQAAiQ;YAC3Q,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,0BAA0B,EAAE,cAAc,EAAE,cAAc,EAAE,eAAe,EAAE,iBAAiB,CAAC;QACzG,aAAa,EAAE,CAAC,0BAA0B,EAAE,cAAc,CAAC;QAC3D,aAAa,EAAE,CAAC,oBAAoB,CAAC;QACrC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,eAAe;QACrB,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE;;;;;;;;sMAQ4L;QAClM,kBAAkB,EAAE,2EAA2E;QAC/F,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,iNAAiN;YAC3N,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,qBAAqB,EAAE,WAAW,EAAE,YAAY,EAAE,iBAAiB,CAAC;QAC9E,aAAa,EAAE,CAAC,YAAY,EAAE,WAAW,CAAC;QAC1C,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,kBAAkB;QACtB,IAAI,EAAE,sBAAsB;QAC5B,UAAU,EAAE,YAAY;QACxB,QAAQ,EAAE,QAAQ;QAClB,IAAI,EAAE,iRAAiR;QACvR,kBAAkB,EAAE,6EAA6E;QACjG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,4MAA4M;YACtN,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,sBAAsB,EAAE,iBAAiB,EAAE,mBAAmB,EAAE,gBAAgB,CAAC;QAC3F,aAAa,EAAE,CAAC,YAAY,EAAE,sBAAsB,CAAC;QACrD,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;CACF,CAAC;AAEF,gEAAgE;AAEhE,MAAM,gBAAgB,GAAuB;IAC3C;QACE,EAAE,EAAE,kBAAkB;QACtB,IAAI,EAAE,oBAAoB;QAC1B,UAAU,EAAE,QAAQ;QACp
B,QAAQ,EAAE,UAAU;QACpB,IAAI,EAAE,yaAAya;QAC/a,kBAAkB,EAAE,mFAAmF;QACvG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,sSAAsS;YAChT,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,eAAe,EAAE,cAAc,EAAE,YAAY,EAAE,YAAY,EAAE,eAAe,CAAC;QACvF,aAAa,EAAE,CAAC,YAAY,CAAC;QAC7B,aAAa,EAAE,CAAC,eAAe,EAAE,kBAAkB,CAAC;QACpD,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,iBAAiB;QACvB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,YAAY;QACtB,IAAI,EAAE,wXAAwX;QAC9X,kBAAkB,EAAE,0EAA0E;QAC9F,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,kTAAkT;YAC5T,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,UAAU,EAAE,eAAe,EAAE,oBAAoB,EAAE,YAAY,CAAC;QAC1E,aAAa,EAAE,CAAC,YAAY,CAAC;QAC7B,aAAa,EAAE,CAAC,iBAAiB,EAAE,cAAc,CAAC;QAClD,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,mBAAmB;QACvB,IAAI,EAAE,mBAAmB;QACzB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,kBAAkB;QAC5B,IAAI,EAAE,uWAAuW;QAC7W,kBAAkB,EAAE,2FAA2F;QAC/G,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,+RAA+R;YACzS,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,gBAAgB,EAAE,eAAe,EAAE,cAAc,EAAE,qBAAqB,EAAE,mBAAmB,CAAC;QACxG,aAAa,EAAE,CAAC,eAAe,EAAE,cAAc,EAAE,YAAY,CAAC;QAC9D,aAAa,EAAE,CAAC,cAAc,CAAC;QAC/B,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,mBAAmB;QACvB,IAAI,EAAE,mBAAmB;QACzB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,UAAU;QACpB,IAAI,EAAE,2YAA2Y;QACjZ,kBAAkB,EAAE,6EAA6E;QACjG,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,qNAAqN;YAC/N,aAAa,EAAE,GAAG;SACnB;QACD,OAAO,EAAE,CAAC,oBAAoB,EAAE,aAAa,EAAE,kBAAkB,EAAE,WAAW,EAAE,mBAAmB,CAAC;QACpG,aAAa,EAAE,CAAC,aAAa,EAAE,kBAAkB,EAAE,YAAY,CAAC;QAChE,aAAa,EAAE,CAAC,iBAAiB,CAAC;QAClC,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,eAAe;QACnB,IAAI,EAAE,eAAe;QACrB,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,kBAAkB;QAC5B,IAAI,EAAE,2WAA2W;QACjX,kBAAkB,EAAE,wDAAwD;QAC5E,aAAa,EAAE,UAAU;QACzB,eAAe,EAAE,EAAE,OAAO,EAAE,UAAU,EAAE,aAAa,EAAE,GAAG,EAAE;QAC5D,OAAO,EAAE,CAAC,iBAAiB,EAAE,iBAAiB,EAAE,uBAAuB,EAAE,kBAAkB,CAAC;QAC5F,aAAa,EAAE,CAAC,iBAAiB,EAAE,gBAAgB,CAAC;QACpD,aAAa,EAAE,CAAC,mBAAmB,CAAC;QACpC,WAAW,EAAE,
CAAC;QACd,SAAS,EAAE,OAAO;KACnB;IACD;QACE,EAAE,EAAE,UAAU;QACd,IAAI,EAAE,oBAAoB;QAC1B,UAAU,EAAE,QAAQ;QACpB,QAAQ,EAAE,WAAW;QACrB,IAAI,EAAE;;;;;;;;kCAQwB;QAC9B,kBAAkB,EAAE,kHAAkH;QACtI,aAAa,EAAE,KAAK;QACpB,eAAe,EAAE;YACf,QAAQ,EAAE,4OAA4O;YACtP,aAAa,EAAE,IAAI;SACpB;QACD,OAAO,EAAE,CAAC,qBAAqB,EAAE,WAAW,EAAE,+BAA+B,EAAE,sBAAsB,CAAC;QACtG,aAAa,EAAE,CAAC,WAAW,EAAE,cAAc,EAAE,YAAY,EAAE,aAAa,EAAE,cAAc,CAAC;QACzF,aAAa,EAAE,CAAC,kBAAkB,EAAE,mBAAmB,EAAE,eAAe,CAAC;QACzE,WAAW,EAAE,CAAC;QACd,SAAS,EAAE,OAAO;KACnB;CACF,CAAC;AAEF,iEAAiE;AAEjE,MAAM,CAAC,MAAM,kBAAkB,GAAuB;IACpD,GAAG,gBAAgB;IACnB,GAAG,oBAAoB;IACvB,GAAG,oBAAoB;IACvB,GAAG,gBAAgB;CACpB,CAAC;AAEF,iEAAiE;AAEjE,MAAM,CAAC,MAAM,mBAAmB,GAAuB;IACrD,kBAAkB,EAAE;QAClB,MAAM,EAAE,GAAG,EAAQ,4BAA4B;QAC/C,UAAU,EAAE,IAAI,EAAG,gCAAgC;QACnD,UAAU,EAAE,IAAI,EAAG,gCAAgC;QACnD,MAAM,EAAE,GAAG,EAAQ,4BAA4B;KAChD;IACD,mBAAmB,EAAE;QACnB,cAAc,EAAM,gCAAgC;QACpD,eAAe,EAAK,mCAAmC;QACvD,UAAU,EAAU,6BAA6B;KAClD;IACD,eAAe,EAAE,CAAC;IAClB,gBAAgB,EAAE,CAAC;CACpB,CAAC"}
@@ -0,0 +1,21 @@
1
+ import type { LLMProvider } from "../providers/types.js";
2
+ import type { TrainingScenario } from "./types.js";
3
+ import type { ToolCallRecord } from "../schemas.js";
4
/**
 * Outcome of evaluating one scenario run; the value that
 * `ScenarioEvaluator.evaluate` resolves to.
 */
export interface EvaluationResult {
    /**
     * Numeric score for the run.
     * NOTE(review): the scale is not visible here; presumably 0-1 so it can be
     * compared with a scenario's `passThreshold` -- confirm.
     */
    score: number;
    /** Whether the run is considered a pass. */
    passed: boolean;
    /** Free-text feedback accompanying the verdict. */
    feedback: string;
}
9
/**
 * Evaluates the results of completed training scenarios.
 */
export declare class ScenarioEvaluator {
    private provider;
    /**
     * @param provider - LLM provider kept by the evaluator.
     *   NOTE(review): presumably used for scenarios whose evaluator type is
     *   "llm" -- confirm in the implementation.
     */
    constructor(provider: LLMProvider);
    /**
     * Evaluate the result of a completed scenario bloop, using the evaluator
     * type given in the scenario's configuration.
     *
     * @param scenario - The scenario that was run.
     * @param bloopResult - The bloop's result, as a string.
     * @param toolCalls - Tool-call records supplied alongside the result.
     * @returns A promise resolving to the evaluation's score, pass flag and feedback.
     */
    evaluate(
        scenario: TrainingScenario,
        bloopResult: string,
        toolCalls: ToolCallRecord[],
    ): Promise<EvaluationResult>;
    private evaluateContains;
    private evaluateRegex;
    private evaluateLLM;
}
21
+ //# sourceMappingURL=evaluator.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"evaluator.d.ts","sourceRoot":"","sources":["../../src/training/evaluator.ts"],"names":[],"mappings":"AACA,OAAO,KAAK,EAAE,WAAW,EAA4B,MAAM,uBAAuB,CAAC;AACnF,OAAO,KAAK,EAAE,gBAAgB,EAAmB,MAAM,YAAY,CAAC;AACpE,OAAO,KAAK,EAAE,cAAc,EAAE,MAAM,eAAe,CAAC;AAmCpD,MAAM,WAAW,gBAAgB;IAC/B,KAAK,EAAE,MAAM,CAAC;IACd,MAAM,EAAE,OAAO,CAAC;IAChB,QAAQ,EAAE,MAAM,CAAC;CAClB;AAID,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,QAAQ,CAAc;gBAElB,QAAQ,EAAE,WAAW;IAIjC;;;OAGG;IACG,QAAQ,CACZ,QAAQ,EAAE,gBAAgB,EAC1B,WAAW,EAAE,MAAM,EACnB,SAAS,EAAE,cAAc,EAAE,GAC1B,OAAO,CAAC,gBAAgB,CAAC;IAuC5B,OAAO,CAAC,gBAAgB;IAmBxB,OAAO,CAAC,aAAa;YA0BP,WAAW;CAiF1B"}