joonecli 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (184) hide show
  1. package/README.md +12 -12
  2. package/dist/__tests__/optimizations.test.js.map +1 -1
  3. package/dist/__tests__/promptBuilder.test.js +14 -20
  4. package/dist/__tests__/promptBuilder.test.js.map +1 -1
  5. package/dist/agents/agentRegistry.d.ts +37 -0
  6. package/dist/agents/agentRegistry.js +58 -0
  7. package/dist/agents/agentRegistry.js.map +1 -0
  8. package/dist/agents/agentSpec.d.ts +54 -0
  9. package/dist/agents/agentSpec.js +9 -0
  10. package/dist/agents/agentSpec.js.map +1 -0
  11. package/dist/agents/builtinAgents.d.ts +20 -0
  12. package/{src/agents/builtinAgents.ts → dist/agents/builtinAgents.js} +84 -101
  13. package/dist/agents/builtinAgents.js.map +1 -0
  14. package/dist/cli/config.d.ts +4 -0
  15. package/dist/cli/config.js.map +1 -1
  16. package/dist/cli/index.js +29 -2
  17. package/dist/cli/index.js.map +1 -1
  18. package/dist/cli/postinstall.d.ts +2 -0
  19. package/dist/cli/postinstall.js +25 -0
  20. package/dist/cli/postinstall.js.map +1 -0
  21. package/dist/commands/builtinCommands.d.ts +21 -0
  22. package/dist/commands/builtinCommands.js +241 -0
  23. package/dist/commands/builtinCommands.js.map +1 -0
  24. package/dist/commands/commandRegistry.d.ts +92 -0
  25. package/dist/commands/commandRegistry.js +128 -0
  26. package/dist/commands/commandRegistry.js.map +1 -0
  27. package/dist/core/agentLoop.d.ts +7 -2
  28. package/dist/core/agentLoop.js +35 -13
  29. package/dist/core/agentLoop.js.map +1 -1
  30. package/dist/core/autoSave.d.ts +41 -0
  31. package/dist/core/autoSave.js +69 -0
  32. package/dist/core/autoSave.js.map +1 -0
  33. package/dist/core/compactor.d.ts +66 -0
  34. package/dist/core/compactor.js +170 -0
  35. package/dist/core/compactor.js.map +1 -0
  36. package/dist/core/contextGuard.d.ts +38 -0
  37. package/dist/core/contextGuard.js +122 -0
  38. package/dist/core/contextGuard.js.map +1 -0
  39. package/dist/core/events.d.ts +45 -0
  40. package/dist/core/events.js +8 -0
  41. package/dist/core/events.js.map +1 -0
  42. package/dist/core/promptBuilder.d.ts +16 -1
  43. package/dist/core/promptBuilder.js +27 -14
  44. package/dist/core/promptBuilder.js.map +1 -1
  45. package/dist/core/sessionResumer.js +3 -3
  46. package/dist/core/sessionResumer.js.map +1 -1
  47. package/dist/core/sessionStore.js +3 -2
  48. package/dist/core/sessionStore.js.map +1 -1
  49. package/dist/core/subAgent.d.ts +56 -0
  50. package/dist/core/subAgent.js +240 -0
  51. package/dist/core/subAgent.js.map +1 -0
  52. package/dist/core/tokenCounter.d.ts +8 -1
  53. package/dist/core/tokenCounter.js +28 -0
  54. package/dist/core/tokenCounter.js.map +1 -1
  55. package/dist/debug_google.d.ts +1 -0
  56. package/dist/debug_google.js +23 -0
  57. package/dist/debug_google.js.map +1 -0
  58. package/dist/middleware/permission.js +1 -0
  59. package/dist/middleware/permission.js.map +1 -1
  60. package/dist/test_google.d.ts +1 -0
  61. package/dist/test_google.js +32 -89
  62. package/dist/test_google.js.map +1 -0
  63. package/dist/tools/browser.js +4 -1
  64. package/dist/tools/browser.js.map +1 -1
  65. package/dist/tools/index.d.ts +2 -1
  66. package/dist/tools/index.js +11 -3
  67. package/dist/tools/index.js.map +1 -1
  68. package/dist/tools/installHostDeps.d.ts +2 -0
  69. package/dist/tools/installHostDeps.js +37 -0
  70. package/dist/tools/installHostDeps.js.map +1 -0
  71. package/dist/tools/router.js +3 -0
  72. package/dist/tools/router.js.map +1 -1
  73. package/dist/tools/spawnAgent.d.ts +19 -0
  74. package/dist/tools/spawnAgent.js +132 -0
  75. package/dist/tools/spawnAgent.js.map +1 -0
  76. package/dist/tracing/sessionTracer.d.ts +1 -0
  77. package/dist/tracing/sessionTracer.js +4 -1
  78. package/dist/tracing/sessionTracer.js.map +1 -1
  79. package/dist/ui/App.js +94 -6
  80. package/dist/ui/App.js.map +1 -1
  81. package/dist/ui/components/ActionLog.d.ts +7 -0
  82. package/dist/ui/components/ActionLog.js +63 -0
  83. package/dist/ui/components/ActionLog.js.map +1 -0
  84. package/dist/ui/components/FileBrowser.d.ts +2 -0
  85. package/dist/ui/components/FileBrowser.js +41 -0
  86. package/dist/ui/components/FileBrowser.js.map +1 -0
  87. package/package.json +5 -6
  88. package/AGENTS.md +0 -56
  89. package/Handover.md +0 -115
  90. package/PROGRESS.md +0 -160
  91. package/docs/01_insights_and_patterns.md +0 -27
  92. package/docs/02_edge_cases_and_mitigations.md +0 -143
  93. package/docs/03_initial_implementation_plan.md +0 -66
  94. package/docs/04_tech_stack_proposal.md +0 -20
  95. package/docs/05_prd.md +0 -87
  96. package/docs/06_user_stories.md +0 -72
  97. package/docs/07_system_architecture.md +0 -138
  98. package/docs/08_roadmap.md +0 -200
  99. package/e2b/Dockerfile +0 -26
  100. package/src/__tests__/bootstrap.test.ts +0 -111
  101. package/src/__tests__/config.test.ts +0 -97
  102. package/src/__tests__/m55.test.ts +0 -238
  103. package/src/__tests__/middleware.test.ts +0 -219
  104. package/src/__tests__/modelFactory.test.ts +0 -63
  105. package/src/__tests__/optimizations.test.ts +0 -201
  106. package/src/__tests__/promptBuilder.test.ts +0 -141
  107. package/src/__tests__/sandbox.test.ts +0 -102
  108. package/src/__tests__/security.test.ts +0 -122
  109. package/src/__tests__/streaming.test.ts +0 -82
  110. package/src/__tests__/toolRouter.test.ts +0 -52
  111. package/src/__tests__/tools.test.ts +0 -146
  112. package/src/__tests__/tracing.test.ts +0 -196
  113. package/src/agents/agentRegistry.ts +0 -69
  114. package/src/agents/agentSpec.ts +0 -67
  115. package/src/cli/config.ts +0 -124
  116. package/src/cli/index.ts +0 -730
  117. package/src/cli/modelFactory.ts +0 -174
  118. package/src/cli/providers.ts +0 -107
  119. package/src/commands/builtinCommands.ts +0 -293
  120. package/src/commands/commandRegistry.ts +0 -194
  121. package/src/core/agentLoop.d.ts.map +0 -1
  122. package/src/core/agentLoop.ts +0 -312
  123. package/src/core/autoSave.ts +0 -95
  124. package/src/core/compactor.ts +0 -252
  125. package/src/core/contextGuard.ts +0 -129
  126. package/src/core/errors.ts +0 -202
  127. package/src/core/promptBuilder.d.ts.map +0 -1
  128. package/src/core/promptBuilder.ts +0 -139
  129. package/src/core/reasoningRouter.ts +0 -121
  130. package/src/core/retry.ts +0 -75
  131. package/src/core/sessionResumer.ts +0 -90
  132. package/src/core/sessionStore.ts +0 -215
  133. package/src/core/subAgent.ts +0 -339
  134. package/src/core/tokenCounter.ts +0 -64
  135. package/src/evals/dataset.ts +0 -67
  136. package/src/evals/evaluator.ts +0 -81
  137. package/src/hitl/bridge.ts +0 -160
  138. package/src/middleware/commandSanitizer.ts +0 -60
  139. package/src/middleware/loopDetection.ts +0 -63
  140. package/src/middleware/permission.ts +0 -72
  141. package/src/middleware/pipeline.ts +0 -75
  142. package/src/middleware/preCompletion.ts +0 -94
  143. package/src/middleware/types.ts +0 -45
  144. package/src/sandbox/bootstrap.ts +0 -121
  145. package/src/sandbox/manager.ts +0 -239
  146. package/src/sandbox/sync.ts +0 -157
  147. package/src/skills/loader.ts +0 -143
  148. package/src/skills/tools.ts +0 -99
  149. package/src/skills/types.ts +0 -13
  150. package/src/test_cache.ts +0 -72
  151. package/src/test_google.js +0 -40
  152. package/src/test_google.ts +0 -40
  153. package/src/tools/askUser.ts +0 -47
  154. package/src/tools/browser.ts +0 -137
  155. package/src/tools/index.d.ts.map +0 -1
  156. package/src/tools/index.ts +0 -237
  157. package/src/tools/registry.ts +0 -198
  158. package/src/tools/router.ts +0 -78
  159. package/src/tools/security.ts +0 -220
  160. package/src/tools/spawnAgent.ts +0 -158
  161. package/src/tools/webSearch.ts +0 -142
  162. package/src/tracing/analyzer.ts +0 -265
  163. package/src/tracing/langsmith.ts +0 -63
  164. package/src/tracing/sessionTracer.ts +0 -202
  165. package/src/tracing/types.ts +0 -49
  166. package/src/types/valyu.d.ts +0 -37
  167. package/src/ui/App.tsx +0 -404
  168. package/src/ui/components/HITLPrompt.tsx +0 -119
  169. package/src/ui/components/Header.tsx +0 -51
  170. package/src/ui/components/MessageBubble.tsx +0 -46
  171. package/src/ui/components/StatusBar.tsx +0 -138
  172. package/src/ui/components/StreamingText.tsx +0 -48
  173. package/src/ui/components/ToolCallPanel.tsx +0 -80
  174. package/tests/commands/commands.test.ts +0 -356
  175. package/tests/core/compactor.test.ts +0 -217
  176. package/tests/core/retryAndErrors.test.ts +0 -164
  177. package/tests/core/sessionResumer.test.ts +0 -95
  178. package/tests/core/sessionStore.test.ts +0 -84
  179. package/tests/core/stability.test.ts +0 -165
  180. package/tests/core/subAgent.test.ts +0 -238
  181. package/tests/hitl/hitlBridge.test.ts +0 -115
  182. package/tsconfig.json +0 -16
  183. package/vitest.config.ts +0 -10
  184. package/vitest.out +0 -48
@@ -1,339 +0,0 @@
1
- /**
2
- * Sub-Agent Manager
3
- *
4
- * Spawns and orchestrates isolated sub-agents for scoped tasks.
5
- * Each sub-agent gets its own ExecutionHarness with a separate conversation
6
- * history. Only the final SubAgentResult is returned to the main agent,
7
- * discarding the sub-agent's internal conversation to save context.
8
- *
9
- * Supports both synchronous (blocking) and asynchronous (non-blocking) modes.
10
- *
11
- * Safety:
12
- * - Depth limit of 1: sub-agents cannot spawn other sub-agents
13
- * - maxTurns cap per agent prevents doom-loops
14
- * - Concurrent async agent cap of 3 prevents resource exhaustion
15
- * - Per-agent token budget tracking
16
- */
17
-
18
- import { AgentSpec, SubAgentResult } from "../agents/agentSpec.js";
19
- import { AgentRegistry } from "../agents/agentRegistry.js";
20
- import { DynamicToolInterface, ToolResult } from "../tools/index.js";
21
- import { ContextState } from "../core/promptBuilder.js";
22
- import { countMessageTokens } from "../core/tokenCounter.js";
23
- import { BaseMessage, HumanMessage, AIMessage, SystemMessage, ToolMessage } from "@langchain/core/messages";
24
- import { BaseChatModel } from "@langchain/core/language_models/chat_models";
25
- import { Runnable } from "@langchain/core/runnables";
26
-
27
- // ─── Constants ──────────────────────────────────────────────────────────────────
28
-
29
- const DEFAULT_MAX_TURNS = 10;
30
- const MAX_CONCURRENT_ASYNC = 3;
31
- const ASYNC_EXPIRY_MS = 5 * 60 * 1000; // 5 minutes
32
-
33
- // ─── Async Task State ───────────────────────────────────────────────────────────
34
-
35
- interface AsyncTask {
36
- taskId: string;
37
- agentName: string;
38
- taskDescription: string;
39
- promise: Promise<SubAgentResult>;
40
- result?: SubAgentResult;
41
- startedAt: number;
42
- completed: boolean;
43
- }
44
-
45
- // ─── SubAgentManager ────────────────────────────────────────────────────────────
46
-
47
- export class SubAgentManager {
48
- private registry: AgentRegistry;
49
- private allTools: DynamicToolInterface[];
50
- private llm: Runnable | BaseChatModel;
51
- private asyncTasks: Map<string, AsyncTask> = new Map();
52
- private taskCounter = 0;
53
-
54
- constructor(
55
- registry: AgentRegistry,
56
- tools: DynamicToolInterface[],
57
- llm: Runnable | BaseChatModel,
58
- ) {
59
- this.registry = registry;
60
- // Filter out spawn_agent and check_agent to prevent recursive nesting (depth-1 limit)
61
- this.allTools = tools.filter(
62
- (t) => t.name !== "spawn_agent" && t.name !== "check_agent"
63
- );
64
- this.llm = llm;
65
- }
66
-
67
- /**
68
- * Synchronous spawn — blocks until the sub-agent finishes.
69
- */
70
- async spawn(
71
- agentName: string,
72
- task: string,
73
- maxTurnsOverride?: number
74
- ): Promise<SubAgentResult> {
75
- const spec = this.registry.get(agentName);
76
- if (!spec) {
77
- return this.makeErrorResult(
78
- agentName,
79
- task,
80
- `Unknown agent "${agentName}". Available: ${this.registry.getNames().join(", ")}`
81
- );
82
- }
83
-
84
- return this.runAgent(spec, task, maxTurnsOverride);
85
- }
86
-
87
- /**
88
- * Asynchronous spawn — returns immediately with a taskId.
89
- * The main agent can poll with getResult(taskId).
90
- */
91
- async spawnAsync(
92
- agentName: string,
93
- task: string,
94
- maxTurnsOverride?: number
95
- ): Promise<string> {
96
- // Cap concurrent async agents
97
- this.cleanupExpired();
98
- const activeCount = Array.from(this.asyncTasks.values())
99
- .filter((t) => !t.completed).length;
100
-
101
- if (activeCount >= MAX_CONCURRENT_ASYNC) {
102
- throw new Error(
103
- `Maximum concurrent async agents reached (${MAX_CONCURRENT_ASYNC}). ` +
104
- `Wait for existing tasks to complete or check them with check_agent.`
105
- );
106
- }
107
-
108
- const spec = this.registry.get(agentName);
109
- if (!spec) {
110
- throw new Error(
111
- `Unknown agent "${agentName}". Available: ${this.registry.getNames().join(", ")}`
112
- );
113
- }
114
-
115
- const taskId = `task_${++this.taskCounter}_${Date.now()}`;
116
-
117
- const promise = this.runAgent(spec, task, maxTurnsOverride).then((result) => {
118
- const asyncTask = this.asyncTasks.get(taskId);
119
- if (asyncTask) {
120
- asyncTask.result = result;
121
- asyncTask.completed = true;
122
- }
123
- return result;
124
- });
125
-
126
- this.asyncTasks.set(taskId, {
127
- taskId,
128
- agentName,
129
- taskDescription: task,
130
- promise,
131
- startedAt: Date.now(),
132
- completed: false,
133
- });
134
-
135
- return taskId;
136
- }
137
-
138
- /**
139
- * Check the status or get the result of an async task.
140
- * Returns the result if completed, or a status message if still running.
141
- */
142
- async getResult(taskId: string): Promise<SubAgentResult | string> {
143
- const asyncTask = this.asyncTasks.get(taskId);
144
- if (!asyncTask) {
145
- return `Unknown task ID: ${taskId}. No such async task exists.`;
146
- }
147
-
148
- if (asyncTask.completed && asyncTask.result) {
149
- // Clean up the task
150
- this.asyncTasks.delete(taskId);
151
- return asyncTask.result;
152
- }
153
-
154
- const elapsed = Math.round((Date.now() - asyncTask.startedAt) / 1000);
155
- return `Task "${asyncTask.taskDescription}" (agent: ${asyncTask.agentName}) ` +
156
- `is still running (${elapsed}s elapsed).`;
157
- }
158
-
159
- /**
160
- * Core execution loop for a sub-agent.
161
- * Creates an isolated conversation and runs a multi-turn loop.
162
- */
163
- private async runAgent(
164
- spec: AgentSpec,
165
- task: string,
166
- maxTurnsOverride?: number
167
- ): Promise<SubAgentResult> {
168
- const startTime = Date.now();
169
- const maxTurns = maxTurnsOverride ?? spec.maxTurns ?? DEFAULT_MAX_TURNS;
170
-
171
- // Resolve available tools for this agent
172
- const agentTools = spec.tools
173
- ? this.allTools.filter((t) => spec.tools!.includes(t.name))
174
- : this.allTools;
175
-
176
- // Create isolated conversation history
177
- const systemPrompt = new SystemMessage(
178
- `${spec.systemPrompt}\n\n--- Current Task ---\n${task}`
179
- );
180
-
181
- const history: BaseMessage[] = [
182
- new HumanMessage(task),
183
- ];
184
-
185
- let promptTokens = 0;
186
- let completionTokens = 0;
187
- let toolCallCount = 0;
188
- let turnsUsed = 0;
189
- let lastResponse = "";
190
- const filesModified: Set<string> = new Set();
191
-
192
- // Build LangChain tool declarations for binding
193
- const toolDeclarations = agentTools.map((t) => ({
194
- name: t.name,
195
- description: t.description,
196
- schema: t.schema,
197
- }));
198
-
199
- try {
200
- // Bind tools to the LLM for this sub-agent session
201
- let boundLlm: any;
202
- if ("bindTools" in this.llm && typeof (this.llm as any).bindTools === "function") {
203
- boundLlm = (this.llm as any).bindTools(toolDeclarations);
204
- } else {
205
- boundLlm = this.llm;
206
- }
207
-
208
- for (let turn = 0; turn < maxTurns; turn++) {
209
- turnsUsed++;
210
-
211
- // Build the full message array
212
- const messages = [systemPrompt, ...history];
213
- const stepPromptTokens = countMessageTokens(messages);
214
- promptTokens += stepPromptTokens;
215
-
216
- // Invoke the LLM
217
- const response = await boundLlm.invoke(messages);
218
- const responseTokens = countMessageTokens([response as AIMessage]);
219
- completionTokens += responseTokens;
220
-
221
- const aiMessage = response as AIMessage;
222
- history.push(aiMessage);
223
-
224
- // Extract text content
225
- if (typeof aiMessage.content === "string" && aiMessage.content.length > 0) {
226
- lastResponse = aiMessage.content;
227
- }
228
-
229
- // Check for tool calls
230
- if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
231
- // No tool calls — agent is done
232
- break;
233
- }
234
-
235
- // Execute tool calls
236
- for (const call of aiMessage.tool_calls) {
237
- if (!call.id) continue;
238
-
239
- const tool = agentTools.find((t) => t.name === call.name);
240
- if (!tool) {
241
- history.push(new ToolMessage({
242
- content: `Error: Tool "${call.name}" is not available to this sub-agent.`,
243
- tool_call_id: call.id,
244
- }));
245
- continue;
246
- }
247
-
248
- toolCallCount++;
249
-
250
- try {
251
- const result = await tool.execute(call.args);
252
- const output = typeof result === "string" ? result : (result as ToolResult).content;
253
-
254
- // Track file modifications
255
- if (call.name === "write_file" && call.args?.path) {
256
- filesModified.add(call.args.path);
257
- }
258
-
259
- history.push(new ToolMessage({
260
- content: output,
261
- tool_call_id: call.id,
262
- }));
263
- } catch (err: any) {
264
- history.push(new ToolMessage({
265
- content: `Tool error: ${err.message}`,
266
- tool_call_id: call.id,
267
- }));
268
- }
269
- }
270
- }
271
-
272
- // Determine outcome
273
- const outcome = turnsUsed >= maxTurns ? "partial" : "success";
274
-
275
- return {
276
- agentName: spec.name,
277
- taskDescription: task,
278
- outcome,
279
- result: lastResponse || "(Sub-agent produced no text output)",
280
- filesModified: Array.from(filesModified),
281
- toolCallCount,
282
- tokenUsage: { prompt: promptTokens, completion: completionTokens },
283
- duration: Date.now() - startTime,
284
- turnsUsed,
285
- };
286
- } catch (error: any) {
287
- return this.makeErrorResult(
288
- spec.name,
289
- task,
290
- `Sub-agent error: ${error.message}`,
291
- { promptTokens, completionTokens, toolCallCount, turnsUsed, startTime, filesModified }
292
- );
293
- }
294
- }
295
-
296
- /**
297
- * Creates an error SubAgentResult.
298
- */
299
- private makeErrorResult(
300
- agentName: string,
301
- task: string,
302
- errorMsg: string,
303
- partial?: {
304
- promptTokens: number;
305
- completionTokens: number;
306
- toolCallCount: number;
307
- turnsUsed: number;
308
- startTime: number;
309
- filesModified: Set<string>;
310
- }
311
- ): SubAgentResult {
312
- return {
313
- agentName,
314
- taskDescription: task,
315
- outcome: "failure",
316
- result: errorMsg,
317
- filesModified: partial ? Array.from(partial.filesModified) : [],
318
- toolCallCount: partial?.toolCallCount ?? 0,
319
- tokenUsage: {
320
- prompt: partial?.promptTokens ?? 0,
321
- completion: partial?.completionTokens ?? 0,
322
- },
323
- duration: partial ? Date.now() - partial.startTime : 0,
324
- turnsUsed: partial?.turnsUsed ?? 0,
325
- };
326
- }
327
-
328
- /**
329
- * Clean up expired async tasks.
330
- */
331
- private cleanupExpired(): void {
332
- const now = Date.now();
333
- for (const [taskId, task] of this.asyncTasks.entries()) {
334
- if (now - task.startedAt > ASYNC_EXPIRY_MS) {
335
- this.asyncTasks.delete(taskId);
336
- }
337
- }
338
- }
339
- }
@@ -1,64 +0,0 @@
1
- import { BaseMessage } from "@langchain/core/messages";
2
-
3
- /**
4
- * Lightweight token counter using character-based heuristic.
5
- *
6
- * Approximation: ~4 characters per token for English text.
7
- * This avoids a dependency on tiktoken while being accurate enough
8
- * for capacity threshold decisions (~90% accuracy for English).
9
- *
10
- * For production accuracy, swap to tiktoken with the appropriate
11
- * model-specific encoding.
12
- */
13
-
14
- const CHARS_PER_TOKEN = 4;
15
-
16
- /**
17
- * Estimates the token count for a string.
18
- */
19
- export function estimateTokens(text: string): number {
20
- return Math.ceil(text.length / CHARS_PER_TOKEN);
21
- }
22
-
23
- /**
24
- * Estimates the total token count across a list of messages.
25
- */
26
- export function countMessageTokens(messages: BaseMessage[]): number {
27
- let total = 0;
28
-
29
- for (const msg of messages) {
30
- if (typeof msg.content === "string") {
31
- total += estimateTokens(msg.content);
32
- } else if (Array.isArray(msg.content)) {
33
- // Handle multi-part messages (text + tool calls)
34
- for (const part of msg.content) {
35
- if (typeof part === "string") {
36
- total += estimateTokens(part);
37
- } else if ("text" in part && typeof part.text === "string") {
38
- total += estimateTokens(part.text);
39
- }
40
- }
41
- }
42
-
43
- // Account for role/name overhead (~4 tokens per message)
44
- total += 4;
45
- }
46
-
47
- return total;
48
- }
49
-
50
- /**
51
- * Checks if the message history is approaching the context window limit.
52
- *
53
- * @param messages - The current conversation messages.
54
- * @param maxTokens - The model's context window size.
55
- * @param threshold - Fraction of capacity to trigger compaction (default: 0.8 = 80%).
56
- */
57
- export function isNearCapacity(
58
- messages: BaseMessage[],
59
- maxTokens: number,
60
- threshold = 0.8
61
- ): boolean {
62
- const used = countMessageTokens(messages);
63
- return used >= maxTokens * threshold;
64
- }
@@ -1,67 +0,0 @@
1
- import { Client } from "langsmith";
2
-
3
- const client = new Client();
4
- const DATASET_NAME = "joone-baseline-v1";
5
-
6
- /**
7
- * Definition of our baseline evaluation dataset.
8
- */
9
- const BASELINE_EXAMPLES = [
10
- {
11
- inputs: {
12
- instruction: "Write a python script that calculates the 10th fibonacci number and saves the result to /workspace/fib_result.txt",
13
- },
14
- outputs: {
15
- expected_file: "/workspace/fib_result.txt",
16
- expected_content: "55\n", // 0,1,1,2,3,5,8,13,21,34,55
17
- },
18
- },
19
- {
20
- inputs: {
21
- instruction: `Create a TypeScript file at /workspace/math.ts with a function 'add(a: number, b: number)' that returns their sum.
22
- Then write a test file at /workspace/math.test.ts using the 'node:assert' module.
23
- Finally, use the bash tool to run 'npx tsx math.test.ts' to verify it passes.`,
24
- },
25
- outputs: {
26
- expected_file: "/workspace/math.ts",
27
- expected_test_execution: true,
28
- },
29
- },
30
- {
31
- inputs: {
32
- instruction: "List all files in the current project root directory and save the output to /workspace/ls.txt",
33
- },
34
- outputs: {
35
- expected_file: "/workspace/ls.txt",
36
- },
37
- },
38
- ];
39
-
40
- /**
41
- * Programmatically creates the baseline dataset in LangSmith if it doesn't already exist.
42
- */
43
- export async function ensureBaselineDataset(): Promise<string> {
44
- try {
45
- const dataset = await client.readDataset({ datasetName: DATASET_NAME });
46
- console.log(`[Eval] Dataset '${DATASET_NAME}' already exists (ID: ${dataset.id}).`);
47
- return DATASET_NAME;
48
- } catch (error: any) {
49
- if (error?.message?.includes("not found") || error?.status === 404) {
50
- console.log(`[Eval] Creating dataset '${DATASET_NAME}' from scratch...`);
51
- const dataset = await client.createDataset(DATASET_NAME, {
52
- description: "Baseline tasks to evaluate Joone's core sandbox, tool routing, and reasoning precision.",
53
- });
54
-
55
- for (const example of BASELINE_EXAMPLES) {
56
- await client.createExample(
57
- example.inputs,
58
- example.outputs,
59
- { datasetId: dataset.id }
60
- );
61
- }
62
- console.log(`[Eval] Successfully seeded dataset '${DATASET_NAME}' with ${BASELINE_EXAMPLES.length} examples.`);
63
- return DATASET_NAME;
64
- }
65
- throw error;
66
- }
67
- }
@@ -1,81 +0,0 @@
1
- import { Run, Example } from "langsmith";
2
- import { EvaluationResult } from "langsmith/evaluation";
3
-
4
- /**
5
- * Custom evaluator: Success Validator
6
- * Checks if the agent crashed or returned a fatal error trace.
7
- */
8
- export async function successEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
9
- // If the trace has an error field, the harness threw an unhandled exception.
10
- const isError = !!run.error;
11
-
12
- return {
13
- key: "execution_success",
14
- score: isError ? 0 : 1,
15
- comment: isError ? run.error : "Agent completed execution loop cleanly.",
16
- };
17
- }
18
-
19
- /**
20
- * Custom evaluator: Cache Efficiency
21
- * Checks if the run utilized Anthropic Prompt Caching efficiently (> 70%).
22
- *
23
- * Note: Requires the LLM to emit `cache_creation_input_tokens` and `cache_read_input_tokens`
24
- * in its usage metadata payload, which is currently extracted by the SessionTracer.
25
- */
26
- export async function cacheEfficiencyEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
27
- const outputs = run.outputs || {};
28
- const metrics = outputs.metrics; // We will attach metrics to the harness output
29
-
30
- if (!metrics || !metrics.totalTokens) {
31
- return {
32
- key: "cache_hit_rate",
33
- score: null, // N/A (e.g., OpenAI or missing data)
34
- comment: "No token metrics found in run output.",
35
- };
36
- }
37
-
38
- const creationTokens = metrics.cacheCreationTokens || 0;
39
- const readTokens = metrics.cacheReadTokens || 0;
40
-
41
- if (creationTokens === 0 && readTokens === 0) {
42
- return {
43
- key: "cache_hit_rate",
44
- score: 0,
45
- comment: "Prompt caching is not active or not supported by this provider.",
46
- };
47
- }
48
-
49
- const totalInputTokens = metrics.promptTokens;
50
- const hitRate = readTokens / totalInputTokens;
51
-
52
- return {
53
- key: "cache_hit_rate",
54
- score: hitRate,
55
- comment: `Cache Hit Rate: ${(hitRate * 100).toFixed(1)}% (${readTokens} / ${totalInputTokens} input tokens)`,
56
- };
57
- }
58
-
59
- /**
60
- * Custom evaluator: Output Artifact Check
61
- * Verifies if the file the agent was instructed to create actually exists
62
- * in the Sandbox after execution.
63
- */
64
- export async function filePresenceEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
65
- if (!example?.outputs?.expected_file) {
66
- return { key: "expected_file_created", score: null };
67
- }
68
-
69
- // The harnessed output should return a manifest or state snapshot we can verify
70
- const outputs = run.outputs || {};
71
- const fileManifest = outputs.fileManifest || [];
72
-
73
- const expectedFile = example.outputs.expected_file;
74
- const didCreate = fileManifest.includes(expectedFile);
75
-
76
- return {
77
- key: "expected_file_created",
78
- score: didCreate ? 1 : 0,
79
- comment: didCreate ? `File ${expectedFile} created successfully.` : `Failed to create expected file: ${expectedFile}`,
80
- };
81
- }