joonecli 0.1.1 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (235)
  1. package/dist/__tests__/config.test.js +1 -0
  2. package/dist/__tests__/config.test.js.map +1 -1
  3. package/dist/__tests__/installHostDeps.test.js +45 -0
  4. package/dist/__tests__/installHostDeps.test.js.map +1 -0
  5. package/dist/__tests__/whitelistedBackend.test.js +18 -0
  6. package/dist/__tests__/whitelistedBackend.test.js.map +1 -0
  7. package/dist/cli/config.d.ts +2 -0
  8. package/dist/cli/config.js +1 -0
  9. package/dist/cli/config.js.map +1 -1
  10. package/dist/cli/index.js +84 -97
  11. package/dist/cli/index.js.map +1 -1
  12. package/dist/commands/builtinCommands.js +6 -6
  13. package/dist/commands/builtinCommands.js.map +1 -1
  14. package/dist/commands/commandRegistry.d.ts +3 -1
  15. package/dist/commands/commandRegistry.js.map +1 -1
  16. package/dist/core/agentLoop.d.ts +11 -28
  17. package/dist/core/agentLoop.js +68 -229
  18. package/dist/core/agentLoop.js.map +1 -1
  19. package/dist/core/compactor.js +2 -2
  20. package/dist/core/compactor.js.map +1 -1
  21. package/dist/core/contextGuard.d.ts +5 -0
  22. package/dist/core/contextGuard.js +30 -3
  23. package/dist/core/contextGuard.js.map +1 -1
  24. package/dist/core/events.d.ts +45 -0
  25. package/dist/core/events.js +8 -0
  26. package/dist/core/events.js.map +1 -0
  27. package/dist/core/promptBuilder.js.map +1 -1
  28. package/dist/core/sessionStore.js +3 -2
  29. package/dist/core/sessionStore.js.map +1 -1
  30. package/dist/core/tokenCounter.d.ts +8 -1
  31. package/dist/core/tokenCounter.js +28 -0
  32. package/dist/core/tokenCounter.js.map +1 -1
  33. package/dist/hitl/bridge.js +1 -27
  34. package/dist/hitl/bridge.js.map +1 -1
  35. package/dist/middleware/loopDetection.d.ts +7 -23
  36. package/dist/middleware/loopDetection.js +38 -42
  37. package/dist/middleware/loopDetection.js.map +1 -1
  38. package/dist/sandbox/whitelistedBackend.d.ts +5 -0
  39. package/dist/sandbox/whitelistedBackend.js +27 -0
  40. package/dist/sandbox/whitelistedBackend.js.map +1 -0
  41. package/dist/tools/askUser.d.ts +12 -3
  42. package/dist/tools/askUser.js +16 -28
  43. package/dist/tools/askUser.js.map +1 -1
  44. package/dist/tools/bashTool.d.ts +11 -0
  45. package/dist/tools/bashTool.js +51 -0
  46. package/dist/tools/bashTool.js.map +1 -0
  47. package/dist/tools/index.d.ts +15 -27
  48. package/dist/tools/index.js +9 -181
  49. package/dist/tools/index.js.map +1 -1
  50. package/dist/tools/installHostDeps.d.ts +8 -0
  51. package/dist/tools/installHostDeps.js +44 -0
  52. package/dist/tools/installHostDeps.js.map +1 -0
  53. package/dist/tracing/sessionTracer.d.ts +1 -0
  54. package/dist/tracing/sessionTracer.js +4 -1
  55. package/dist/tracing/sessionTracer.js.map +1 -1
  56. package/dist/ui/App.js +116 -55
  57. package/dist/ui/App.js.map +1 -1
  58. package/dist/ui/components/ActionLog.d.ts +7 -0
  59. package/dist/ui/components/ActionLog.js +63 -0
  60. package/dist/ui/components/ActionLog.js.map +1 -0
  61. package/dist/ui/components/FileBrowser.d.ts +2 -0
  62. package/dist/ui/components/FileBrowser.js +41 -0
  63. package/dist/ui/components/FileBrowser.js.map +1 -0
  64. package/dist/ui/components/MessageBubble.js +1 -1
  65. package/dist/ui/components/MessageBubble.js.map +1 -1
  66. package/package.json +8 -5
  67. package/AGENTS.md +0 -56
  68. package/Handover.md +0 -115
  69. package/PROGRESS.md +0 -160
  70. package/dist/__tests__/m55.test.js +0 -160
  71. package/dist/__tests__/m55.test.js.map +0 -1
  72. package/dist/__tests__/middleware.test.js +0 -169
  73. package/dist/__tests__/middleware.test.js.map +0 -1
  74. package/dist/__tests__/optimizations.test.d.ts +0 -1
  75. package/dist/__tests__/optimizations.test.js +0 -136
  76. package/dist/__tests__/optimizations.test.js.map +0 -1
  77. package/dist/__tests__/security.test.d.ts +0 -1
  78. package/dist/__tests__/security.test.js +0 -86
  79. package/dist/__tests__/security.test.js.map +0 -1
  80. package/dist/__tests__/streaming.test.d.ts +0 -1
  81. package/dist/__tests__/streaming.test.js +0 -71
  82. package/dist/__tests__/streaming.test.js.map +0 -1
  83. package/dist/__tests__/toolRouter.test.d.ts +0 -1
  84. package/dist/__tests__/toolRouter.test.js +0 -37
  85. package/dist/__tests__/toolRouter.test.js.map +0 -1
  86. package/dist/__tests__/tools.test.d.ts +0 -1
  87. package/dist/__tests__/tools.test.js +0 -112
  88. package/dist/__tests__/tools.test.js.map +0 -1
  89. package/dist/core/subAgent.d.ts +0 -56
  90. package/dist/core/subAgent.js +0 -240
  91. package/dist/core/subAgent.js.map +0 -1
  92. package/dist/debug_google.d.ts +0 -1
  93. package/dist/debug_google.js +0 -23
  94. package/dist/debug_google.js.map +0 -1
  95. package/dist/middleware/commandSanitizer.d.ts +0 -18
  96. package/dist/middleware/commandSanitizer.js +0 -50
  97. package/dist/middleware/commandSanitizer.js.map +0 -1
  98. package/dist/middleware/permission.d.ts +0 -17
  99. package/dist/middleware/permission.js +0 -59
  100. package/dist/middleware/permission.js.map +0 -1
  101. package/dist/middleware/pipeline.d.ts +0 -31
  102. package/dist/middleware/pipeline.js +0 -62
  103. package/dist/middleware/pipeline.js.map +0 -1
  104. package/dist/middleware/preCompletion.d.ts +0 -29
  105. package/dist/middleware/preCompletion.js +0 -82
  106. package/dist/middleware/preCompletion.js.map +0 -1
  107. package/dist/middleware/types.d.ts +0 -40
  108. package/dist/middleware/types.js +0 -8
  109. package/dist/middleware/types.js.map +0 -1
  110. package/dist/skills/loader.d.ts +0 -55
  111. package/dist/skills/loader.js +0 -132
  112. package/dist/skills/loader.js.map +0 -1
  113. package/dist/skills/tools.d.ts +0 -5
  114. package/dist/skills/tools.js +0 -78
  115. package/dist/skills/tools.js.map +0 -1
  116. package/dist/test_cache.d.ts +0 -1
  117. package/dist/test_cache.js +0 -55
  118. package/dist/test_cache.js.map +0 -1
  119. package/dist/test_google.d.ts +0 -1
  120. package/dist/test_google.js +0 -36
  121. package/dist/test_google.js.map +0 -1
  122. package/dist/tools/browser.d.ts +0 -19
  123. package/dist/tools/browser.js +0 -111
  124. package/dist/tools/browser.js.map +0 -1
  125. package/dist/tools/registry.d.ts +0 -31
  126. package/dist/tools/registry.js +0 -168
  127. package/dist/tools/registry.js.map +0 -1
  128. package/dist/tools/router.d.ts +0 -34
  129. package/dist/tools/router.js +0 -75
  130. package/dist/tools/router.js.map +0 -1
  131. package/dist/tools/security.d.ts +0 -28
  132. package/dist/tools/security.js +0 -183
  133. package/dist/tools/security.js.map +0 -1
  134. package/dist/tools/spawnAgent.d.ts +0 -19
  135. package/dist/tools/spawnAgent.js +0 -130
  136. package/dist/tools/spawnAgent.js.map +0 -1
  137. package/dist/tools/webSearch.d.ts +0 -6
  138. package/dist/tools/webSearch.js +0 -120
  139. package/dist/tools/webSearch.js.map +0 -1
  140. package/docs/01_insights_and_patterns.md +0 -27
  141. package/docs/02_edge_cases_and_mitigations.md +0 -143
  142. package/docs/03_initial_implementation_plan.md +0 -66
  143. package/docs/04_tech_stack_proposal.md +0 -20
  144. package/docs/05_prd.md +0 -87
  145. package/docs/06_user_stories.md +0 -72
  146. package/docs/07_system_architecture.md +0 -138
  147. package/docs/08_roadmap.md +0 -200
  148. package/e2b/Dockerfile +0 -26
  149. package/src/__tests__/bootstrap.test.ts +0 -111
  150. package/src/__tests__/config.test.ts +0 -97
  151. package/src/__tests__/m55.test.ts +0 -238
  152. package/src/__tests__/middleware.test.ts +0 -219
  153. package/src/__tests__/modelFactory.test.ts +0 -63
  154. package/src/__tests__/optimizations.test.ts +0 -201
  155. package/src/__tests__/promptBuilder.test.ts +0 -141
  156. package/src/__tests__/sandbox.test.ts +0 -102
  157. package/src/__tests__/security.test.ts +0 -122
  158. package/src/__tests__/streaming.test.ts +0 -82
  159. package/src/__tests__/toolRouter.test.ts +0 -52
  160. package/src/__tests__/tools.test.ts +0 -146
  161. package/src/__tests__/tracing.test.ts +0 -196
  162. package/src/agents/agentRegistry.ts +0 -69
  163. package/src/agents/agentSpec.ts +0 -67
  164. package/src/agents/builtinAgents.ts +0 -142
  165. package/src/cli/config.ts +0 -124
  166. package/src/cli/index.ts +0 -742
  167. package/src/cli/modelFactory.ts +0 -174
  168. package/src/cli/postinstall.ts +0 -28
  169. package/src/cli/providers.ts +0 -107
  170. package/src/commands/builtinCommands.ts +0 -293
  171. package/src/commands/commandRegistry.ts +0 -194
  172. package/src/core/agentLoop.d.ts.map +0 -1
  173. package/src/core/agentLoop.ts +0 -312
  174. package/src/core/autoSave.ts +0 -95
  175. package/src/core/compactor.ts +0 -252
  176. package/src/core/contextGuard.ts +0 -129
  177. package/src/core/errors.ts +0 -202
  178. package/src/core/promptBuilder.d.ts.map +0 -1
  179. package/src/core/promptBuilder.ts +0 -139
  180. package/src/core/reasoningRouter.ts +0 -121
  181. package/src/core/retry.ts +0 -75
  182. package/src/core/sessionResumer.ts +0 -90
  183. package/src/core/sessionStore.ts +0 -216
  184. package/src/core/subAgent.ts +0 -339
  185. package/src/core/tokenCounter.ts +0 -64
  186. package/src/evals/dataset.ts +0 -67
  187. package/src/evals/evaluator.ts +0 -81
  188. package/src/hitl/bridge.ts +0 -160
  189. package/src/middleware/commandSanitizer.ts +0 -60
  190. package/src/middleware/loopDetection.ts +0 -63
  191. package/src/middleware/permission.ts +0 -72
  192. package/src/middleware/pipeline.ts +0 -75
  193. package/src/middleware/preCompletion.ts +0 -94
  194. package/src/middleware/types.ts +0 -45
  195. package/src/sandbox/bootstrap.ts +0 -121
  196. package/src/sandbox/manager.ts +0 -239
  197. package/src/sandbox/sync.ts +0 -157
  198. package/src/skills/loader.ts +0 -143
  199. package/src/skills/tools.ts +0 -99
  200. package/src/skills/types.ts +0 -13
  201. package/src/test_cache.ts +0 -72
  202. package/src/tools/askUser.ts +0 -47
  203. package/src/tools/browser.ts +0 -137
  204. package/src/tools/index.d.ts.map +0 -1
  205. package/src/tools/index.ts +0 -237
  206. package/src/tools/registry.ts +0 -198
  207. package/src/tools/router.ts +0 -78
  208. package/src/tools/security.ts +0 -220
  209. package/src/tools/spawnAgent.ts +0 -158
  210. package/src/tools/webSearch.ts +0 -142
  211. package/src/tracing/analyzer.ts +0 -265
  212. package/src/tracing/langsmith.ts +0 -63
  213. package/src/tracing/sessionTracer.ts +0 -202
  214. package/src/tracing/types.ts +0 -49
  215. package/src/types/valyu.d.ts +0 -37
  216. package/src/ui/App.tsx +0 -404
  217. package/src/ui/components/HITLPrompt.tsx +0 -119
  218. package/src/ui/components/Header.tsx +0 -51
  219. package/src/ui/components/MessageBubble.tsx +0 -46
  220. package/src/ui/components/StatusBar.tsx +0 -138
  221. package/src/ui/components/StreamingText.tsx +0 -48
  222. package/src/ui/components/ToolCallPanel.tsx +0 -80
  223. package/tests/commands/commands.test.ts +0 -356
  224. package/tests/core/compactor.test.ts +0 -217
  225. package/tests/core/retryAndErrors.test.ts +0 -164
  226. package/tests/core/sessionResumer.test.ts +0 -95
  227. package/tests/core/sessionStore.test.ts +0 -84
  228. package/tests/core/stability.test.ts +0 -165
  229. package/tests/core/subAgent.test.ts +0 -238
  230. package/tests/hitl/hitlBridge.test.ts +0 -115
  231. package/tsconfig.json +0 -16
  232. package/vitest.config.ts +0 -10
  233. package/vitest.out +0 -48
  234. /package/dist/__tests__/{m55.test.d.ts → installHostDeps.test.d.ts} +0 -0
  235. /package/dist/__tests__/{middleware.test.d.ts → whitelistedBackend.test.d.ts} +0 -0
@@ -1,339 +0,0 @@
1
- /**
2
- * Sub-Agent Manager
3
- *
4
- * Spawns and orchestrates isolated sub-agents for scoped tasks.
5
- * Each sub-agent gets its own ExecutionHarness with a separate conversation
6
- * history. Only the final SubAgentResult is returned to the main agent,
7
- * discarding the sub-agent's internal conversation to save context.
8
- *
9
- * Supports both synchronous (blocking) and asynchronous (non-blocking) modes.
10
- *
11
- * Safety:
12
- * - Depth limit of 1: sub-agents cannot spawn other sub-agents
13
- * - maxTurns cap per agent prevents doom-loops
14
- * - Concurrent async agent cap of 3 prevents resource exhaustion
15
- * - Per-agent token budget tracking
16
- */
17
-
18
- import { AgentSpec, SubAgentResult } from "../agents/agentSpec.js";
19
- import { AgentRegistry } from "../agents/agentRegistry.js";
20
- import { DynamicToolInterface, ToolResult } from "../tools/index.js";
21
- import { ContextState } from "../core/promptBuilder.js";
22
- import { countMessageTokens } from "../core/tokenCounter.js";
23
- import { BaseMessage, HumanMessage, AIMessage, SystemMessage, ToolMessage } from "@langchain/core/messages";
24
- import { BaseChatModel } from "@langchain/core/language_models/chat_models";
25
- import { Runnable } from "@langchain/core/runnables";
26
-
27
- // ─── Constants ──────────────────────────────────────────────────────────────────
28
-
/** Turn cap used when neither the caller nor the agent spec supplies one. */
const DEFAULT_MAX_TURNS = 10;

/** Maximum number of async sub-agents allowed to be running at the same time. */
const MAX_CONCURRENT_ASYNC = 3;

/** Age, in milliseconds, after which an async task entry is evicted (5 minutes). */
const ASYNC_EXPIRY_MS = 1000 * 60 * 5;
32
-
33
- // ─── Async Task State ───────────────────────────────────────────────────────────
34
-
/**
 * Bookkeeping record for one sub-agent started in non-blocking mode.
 */
type AsyncTask = {
  /** Identifier handed back to the caller when the task was started. */
  taskId: string;
  /** Name of the agent spec the task was started with. */
  agentName: string;
  /** The task text given to the sub-agent. */
  taskDescription: string;
  /** Epoch milliseconds at which the task was registered. */
  startedAt: number;
  /** Promise that settles with the sub-agent's final result. */
  promise: Promise<SubAgentResult>;
  /** Set to true once `result` has been stored. */
  completed: boolean;
  /** Final result; absent while the task is still running. */
  result?: SubAgentResult;
};
44
-
45
- // ─── SubAgentManager ────────────────────────────────────────────────────────────
46
-
/**
 * Spawns and orchestrates isolated sub-agents for scoped tasks.
 *
 * Every run builds its own conversation history; only the final
 * SubAgentResult is returned, so the sub-agent's internal messages never reach
 * the caller. Two modes are offered: blocking (`spawn`) and non-blocking
 * (`spawnAsync` followed by `getResult`).
 */
export class SubAgentManager {
  private readonly registry: AgentRegistry;
  private readonly allTools: DynamicToolInterface[];
  private readonly llm: Runnable | BaseChatModel;
  private readonly asyncTasks: Map<string, AsyncTask> = new Map();
  private taskCounter = 0;

  /**
   * @param registry - Lookup for agent specs by name.
   * @param tools - Tools offered to sub-agents; `spawn_agent` and
   *   `check_agent` are removed so a sub-agent cannot start further agents.
   * @param llm - Model used for every sub-agent turn.
   */
  constructor(
    registry: AgentRegistry,
    tools: DynamicToolInterface[],
    llm: Runnable | BaseChatModel,
  ) {
    this.registry = registry;
    // Filter out spawn_agent and check_agent to prevent recursive nesting (depth-1 limit)
    this.allTools = tools.filter(
      (t) => t.name !== "spawn_agent" && t.name !== "check_agent"
    );
    this.llm = llm;
  }

  /**
   * Synchronous spawn — resolves only once the sub-agent has finished.
   *
   * @param agentName - Name of a registered agent spec.
   * @param task - Task text handed to the sub-agent.
   * @param maxTurnsOverride - Optional turn cap that takes precedence over the spec's.
   * @returns The sub-agent's result; an unknown agent name yields a
   *   `"failure"` result rather than a thrown error.
   */
  async spawn(
    agentName: string,
    task: string,
    maxTurnsOverride?: number
  ): Promise<SubAgentResult> {
    const spec = this.registry.get(agentName);
    if (!spec) {
      return this.makeErrorResult(
        agentName,
        task,
        `Unknown agent "${agentName}". Available: ${this.registry.getNames().join(", ")}`
      );
    }

    return this.runAgent(spec, task, maxTurnsOverride);
  }

  /**
   * Asynchronous spawn — starts the sub-agent and returns a task id without
   * waiting for it. Poll with `getResult(taskId)`.
   *
   * @param agentName - Name of a registered agent spec.
   * @param task - Task text handed to the sub-agent.
   * @param maxTurnsOverride - Optional turn cap that takes precedence over the spec's.
   * @returns The id of the newly registered task.
   * @throws Error when MAX_CONCURRENT_ASYNC tasks are already running, or when
   *   `agentName` is not registered.
   */
  async spawnAsync(
    agentName: string,
    task: string,
    maxTurnsOverride?: number
  ): Promise<string> {
    // Cap concurrent async agents (expired entries are dropped before counting).
    this.cleanupExpired();
    const activeCount = Array.from(this.asyncTasks.values())
      .filter((t) => !t.completed).length;

    if (activeCount >= MAX_CONCURRENT_ASYNC) {
      throw new Error(
        `Maximum concurrent async agents reached (${MAX_CONCURRENT_ASYNC}). ` +
        `Wait for existing tasks to complete or check them with check_agent.`
      );
    }

    const spec = this.registry.get(agentName);
    if (!spec) {
      throw new Error(
        `Unknown agent "${agentName}". Available: ${this.registry.getNames().join(", ")}`
      );
    }

    const taskId = `task_${++this.taskCounter}_${Date.now()}`;

    const promise = this.runAgent(spec, task, maxTurnsOverride).then((result) => {
      // The entry may already have been evicted by cleanupExpired(); in that
      // case the result is not stored anywhere.
      const asyncTask = this.asyncTasks.get(taskId);
      if (asyncTask) {
        asyncTask.result = result;
        asyncTask.completed = true;
      }
      return result;
    });

    this.asyncTasks.set(taskId, {
      taskId,
      agentName,
      taskDescription: task,
      promise,
      startedAt: Date.now(),
      completed: false,
    });

    return taskId;
  }

  /**
   * Check the status or fetch the result of an async task.
   *
   * @param taskId - Id previously returned by `spawnAsync`.
   * @returns The result when the task has completed (the entry is then
   *   removed), otherwise a human-readable status string. An unknown id also
   *   yields a string.
   */
  async getResult(taskId: string): Promise<SubAgentResult | string> {
    const asyncTask = this.asyncTasks.get(taskId);
    if (!asyncTask) {
      return `Unknown task ID: ${taskId}. No such async task exists.`;
    }

    if (asyncTask.completed && asyncTask.result) {
      // Results are handed out once; drop the entry afterwards.
      this.asyncTasks.delete(taskId);
      return asyncTask.result;
    }

    const elapsed = Math.round((Date.now() - asyncTask.startedAt) / 1000);
    return `Task "${asyncTask.taskDescription}" (agent: ${asyncTask.agentName}) ` +
      `is still running (${elapsed}s elapsed).`;
  }

  /**
   * Core execution loop for a sub-agent.
   *
   * Builds an isolated conversation, then alternates model invocations and
   * tool executions until the model answers without tool calls or the turn
   * cap is reached.
   *
   * @returns `"success"` when the model finished on its own, `"partial"` when
   *   the turn cap ended the loop first, `"failure"` when an error escaped the loop.
   */
  private async runAgent(
    spec: AgentSpec,
    task: string,
    maxTurnsOverride?: number
  ): Promise<SubAgentResult> {
    const startTime = Date.now();
    const maxTurns = maxTurnsOverride ?? spec.maxTurns ?? DEFAULT_MAX_TURNS;

    // Resolve available tools for this agent: a spec without a tool list gets all of them.
    const allowedToolNames = spec.tools;
    const agentTools = allowedToolNames
      ? this.allTools.filter((t) => allowedToolNames.includes(t.name))
      : this.allTools;

    // The directive is sent as a human message and is re-prepended on every
    // turn; it is not part of `history`.
    const systemPrompt = new HumanMessage(
      `<system-directive>\n${spec.systemPrompt}\n\n--- Current Task ---\n${task}\n</system-directive>`
    );

    const history: BaseMessage[] = [
      new HumanMessage(task),
    ];

    let promptTokens = 0;
    let completionTokens = 0;
    let toolCallCount = 0;
    let turnsUsed = 0;
    let lastResponse = "";
    // True only when the model ended the loop itself by answering without tool calls.
    let finished = false;
    const filesModified: Set<string> = new Set();

    // Build LangChain tool declarations for binding
    const toolDeclarations = agentTools.map((t) => ({
      name: t.name,
      description: t.description,
      schema: t.schema,
    }));

    try {
      // Bind tools to the LLM for this sub-agent session when it supports binding.
      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- bindTools is not declared on Runnable
      let boundLlm: any;
      if ("bindTools" in this.llm && typeof (this.llm as any).bindTools === "function") {
        boundLlm = (this.llm as any).bindTools(toolDeclarations);
      } else {
        boundLlm = this.llm;
      }

      for (let turn = 0; turn < maxTurns; turn++) {
        turnsUsed++;

        // Build the full message array
        const messages = [systemPrompt, ...history];
        promptTokens += countMessageTokens(messages);

        // Invoke the LLM
        const response = await boundLlm.invoke(messages);
        const aiMessage = response as AIMessage;
        completionTokens += countMessageTokens([aiMessage]);
        history.push(aiMessage);

        // Remember the latest non-empty plain-text answer.
        if (typeof aiMessage.content === "string" && aiMessage.content.length > 0) {
          lastResponse = aiMessage.content;
        }

        // No tool calls — the agent is done.
        if (!aiMessage.tool_calls || aiMessage.tool_calls.length === 0) {
          finished = true;
          break;
        }

        // Execute tool calls in the order the model emitted them.
        for (const call of aiMessage.tool_calls) {
          // A ToolMessage needs the call id to reference; calls without one are skipped.
          if (!call.id) continue;

          const tool = agentTools.find((t) => t.name === call.name);
          if (!tool) {
            history.push(new ToolMessage({
              content: `Error: Tool "${call.name}" is not available to this sub-agent.`,
              tool_call_id: call.id,
            }));
            continue;
          }

          toolCallCount++;

          try {
            const result = await tool.execute(call.args);
            const output = typeof result === "string" ? result : (result as ToolResult).content;

            // Track file modifications (only `write_file` calls are recorded).
            if (call.name === "write_file" && call.args?.path) {
              filesModified.add(call.args.path);
            }

            history.push(new ToolMessage({
              content: output,
              tool_call_id: call.id,
            }));
          } catch (err: unknown) {
            // A failing tool is reported back to the model instead of aborting the run.
            history.push(new ToolMessage({
              content: `Tool error: ${SubAgentManager.describeError(err)}`,
              tool_call_id: call.id,
            }));
          }
        }
      }

      // Finishing on the last allowed turn is still a success; only hitting the
      // cap with tool calls still pending counts as partial.
      const outcome = finished ? "success" : "partial";

      return {
        agentName: spec.name,
        taskDescription: task,
        outcome,
        result: lastResponse || "(Sub-agent produced no text output)",
        filesModified: Array.from(filesModified),
        toolCallCount,
        tokenUsage: { prompt: promptTokens, completion: completionTokens },
        duration: Date.now() - startTime,
        turnsUsed,
      };
    } catch (error: unknown) {
      return this.makeErrorResult(
        spec.name,
        task,
        `Sub-agent error: ${SubAgentManager.describeError(error)}`,
        { promptTokens, completionTokens, toolCallCount, turnsUsed, startTime, filesModified }
      );
    }
  }

  /**
   * Creates a `"failure"` SubAgentResult.
   *
   * @param agentName - Agent name to report.
   * @param task - Task text to report.
   * @param errorMsg - Message stored as the result text.
   * @param partial - Counters gathered before the failure; when omitted every
   *   counter is reported as zero/empty.
   */
  private makeErrorResult(
    agentName: string,
    task: string,
    errorMsg: string,
    partial?: {
      promptTokens: number;
      completionTokens: number;
      toolCallCount: number;
      turnsUsed: number;
      startTime: number;
      filesModified: Set<string>;
    }
  ): SubAgentResult {
    return {
      agentName,
      taskDescription: task,
      outcome: "failure",
      result: errorMsg,
      filesModified: partial ? Array.from(partial.filesModified) : [],
      toolCallCount: partial?.toolCallCount ?? 0,
      tokenUsage: {
        prompt: partial?.promptTokens ?? 0,
        completion: partial?.completionTokens ?? 0,
      },
      duration: partial ? Date.now() - partial.startTime : 0,
      turnsUsed: partial?.turnsUsed ?? 0,
    };
  }

  /**
   * Drops async task entries older than ASYNC_EXPIRY_MS.
   *
   * NOTE(review): eviction is by age only, so an entry is removed even if its
   * sub-agent is still running. Such a task no longer counts toward the
   * concurrency cap and its eventual result is not stored — confirm this is
   * the intended trade-off.
   */
  private cleanupExpired(): void {
    const now = Date.now();
    for (const [taskId, task] of this.asyncTasks.entries()) {
      if (now - task.startedAt > ASYNC_EXPIRY_MS) {
        this.asyncTasks.delete(taskId);
      }
    }
  }

  /**
   * Turns any thrown value into printable text: the `message` property when
   * the value is an object carrying one, otherwise the value's string form.
   */
  private static describeError(err: unknown): string {
    if (typeof err === "object" && err !== null && "message" in err) {
      return String((err as { message: unknown }).message);
    }
    return String(err);
  }
}
@@ -1,64 +0,0 @@
1
- import { BaseMessage } from "@langchain/core/messages";
2
-
3
- /**
4
- * Lightweight token counter using character-based heuristic.
5
- *
6
- * Approximation: ~4 characters per token for English text.
7
- * This avoids a dependency on tiktoken while being accurate enough
8
- * for capacity threshold decisions (~90% accuracy for English).
9
- *
10
- * For production accuracy, swap to tiktoken with the appropriate
11
- * model-specific encoding.
12
- */
13
-
/** Number of characters assumed to correspond to a single token. */
const CHARS_PER_TOKEN = 4;

/**
 * Approximates how many tokens a string occupies.
 *
 * @param text - The text to measure.
 * @returns The character count divided by CHARS_PER_TOKEN, rounded up.
 */
export function estimateTokens(text: string): number {
  const charCount = text.length;
  return Math.ceil(charCount / CHARS_PER_TOKEN);
}
22
-
/**
 * Estimates the combined token count of a list of messages.
 *
 * String content is measured directly. Array content is measured part by
 * part, counting string parts and parts that carry a string `text` field;
 * any other part contributes nothing. Each message adds a flat 4 tokens
 * for role/name overhead.
 *
 * @param messages - Messages to measure.
 * @returns The estimated total number of tokens.
 */
export function countMessageTokens(messages: BaseMessage[]): number {
  const perMessageOverhead = 4;

  return messages.reduce((runningTotal, message) => {
    const body = message.content;
    let bodyTokens = 0;

    if (typeof body === "string") {
      bodyTokens = estimateTokens(body);
    } else if (Array.isArray(body)) {
      // Multi-part message: only textual parts are counted.
      for (const segment of body) {
        if (typeof segment === "string") {
          bodyTokens += estimateTokens(segment);
        } else if ("text" in segment && typeof segment.text === "string") {
          bodyTokens += estimateTokens(segment.text);
        }
      }
    }

    return runningTotal + bodyTokens + perMessageOverhead;
  }, 0);
}
49
-
/**
 * Reports whether a conversation has reached a given share of the context window.
 *
 * @param messages - The current conversation messages.
 * @param maxTokens - The model's context window size.
 * @param threshold - Fraction of `maxTokens` at which the answer becomes true (default: 0.8 = 80%).
 * @returns True when the estimated token count is at or above `maxTokens * threshold`.
 */
export function isNearCapacity(
  messages: BaseMessage[],
  maxTokens: number,
  threshold = 0.8
): boolean {
  const estimatedUsage = countMessageTokens(messages);
  const triggerPoint = maxTokens * threshold;
  return estimatedUsage >= triggerPoint;
}
@@ -1,67 +0,0 @@
1
- import { Client } from "langsmith";
2
-
/** LangSmith client shared by this module. */
const client = new Client();

/** Name under which the baseline dataset is looked up and created. */
const DATASET_NAME = "joone-baseline-v1";

/** Shape of one baseline example: the instruction given and the expectations checked. */
interface BaselineExample {
  inputs: { instruction: string };
  outputs: Record<string, string | boolean>;
}

/**
 * The baseline evaluation dataset: each entry pairs an instruction with the
 * artifacts expected after the run.
 */
const BASELINE_EXAMPLES: BaselineExample[] = [
  // Compute a value and persist it to a file.
  {
    inputs: {
      instruction: "Write a python script that calculates the 10th fibonacci number and saves the result to /workspace/fib_result.txt",
    },
    outputs: {
      expected_file: "/workspace/fib_result.txt",
      expected_content: "55\n", // 0,1,1,2,3,5,8,13,21,34,55
    },
  },
  // Write code plus a test, then run the test through the bash tool.
  {
    inputs: {
      instruction: `Create a TypeScript file at /workspace/math.ts with a function 'add(a: number, b: number)' that returns their sum.
Then write a test file at /workspace/math.test.ts using the 'node:assert' module.
Finally, use the bash tool to run 'npx tsx math.test.ts' to verify it passes.`,
    },
    outputs: {
      expected_file: "/workspace/math.ts",
      expected_test_execution: true,
    },
  },
  // Capture a directory listing in a file.
  {
    inputs: {
      instruction: "List all files in the current project root directory and save the output to /workspace/ls.txt",
    },
    outputs: {
      expected_file: "/workspace/ls.txt",
    },
  },
];
39
-
/**
 * Makes sure the baseline dataset exists in LangSmith, creating and seeding
 * it from BASELINE_EXAMPLES when the lookup reports it as missing.
 *
 * @returns The dataset name (DATASET_NAME).
 * @throws Re-throws any lookup error that is not a "not found" / HTTP 404
 *   condition; errors raised while creating or seeding propagate as well.
 */
export async function ensureBaselineDataset(): Promise<string> {
  try {
    const existing = await client.readDataset({ datasetName: DATASET_NAME });
    console.log(`[Eval] Dataset '${DATASET_NAME}' already exists (ID: ${existing.id}).`);
    return DATASET_NAME;
  } catch (error: any) {
    // Only a missing dataset is recoverable; everything else goes back to the caller.
    const datasetIsMissing = error?.message?.includes("not found") || error?.status === 404;
    if (!datasetIsMissing) {
      throw error;
    }

    console.log(`[Eval] Creating dataset '${DATASET_NAME}' from scratch...`);
    const created = await client.createDataset(DATASET_NAME, {
      description: "Baseline tasks to evaluate Joone's core sandbox, tool routing, and reasoning precision.",
    });

    // Seed one example at a time, in declaration order.
    for (const { inputs, outputs } of BASELINE_EXAMPLES) {
      await client.createExample(inputs, outputs, { datasetId: created.id });
    }

    console.log(`[Eval] Successfully seeded dataset '${DATASET_NAME}' with ${BASELINE_EXAMPLES.length} examples.`);
    return DATASET_NAME;
  }
}
@@ -1,81 +0,0 @@
1
- import { Run, Example } from "langsmith";
2
- import { EvaluationResult } from "langsmith/evaluation";
3
-
/**
 * Custom evaluator: Success Validator.
 * Scores a run 0 when its `error` field is set and 1 otherwise.
 *
 * @param run - The run to inspect.
 * @param example - Unused; accepted to match the evaluator signature.
 * @returns An `execution_success` result whose comment is the run's error
 *   text on failure.
 */
export async function successEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
  // A populated error field means the harness threw an unhandled exception.
  if (run.error) {
    return {
      key: "execution_success",
      score: 0,
      comment: run.error,
    };
  }

  return {
    key: "execution_success",
    score: 1,
    comment: "Agent completed execution loop cleanly.",
  };
}
18
-
/**
 * Custom evaluator: Cache Efficiency.
 * Reports the share of prompt tokens that were served from the prompt cache,
 * read from `run.outputs.metrics`. No pass/fail threshold is applied here;
 * the raw ratio is returned as the score.
 *
 * Note: relies on the run output carrying `cacheCreationTokens`,
 * `cacheReadTokens`, `promptTokens` and `totalTokens` in its metrics.
 *
 * @param run - The run whose output metrics are inspected.
 * @param example - Unused; accepted to match the evaluator signature.
 * @returns A `cache_hit_rate` result. The score is `null` when token metrics
 *   are missing or the prompt token count is not a positive number, `0` when
 *   no cache activity was recorded, and `cacheReadTokens / promptTokens` otherwise.
 */
export async function cacheEfficiencyEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
  const outputs = run.outputs || {};
  const metrics = outputs.metrics;

  if (!metrics || !metrics.totalTokens) {
    return {
      key: "cache_hit_rate",
      score: null, // N/A (e.g., OpenAI or missing data)
      comment: "No token metrics found in run output.",
    };
  }

  const creationTokens = metrics.cacheCreationTokens || 0;
  const readTokens = metrics.cacheReadTokens || 0;

  if (creationTokens === 0 && readTokens === 0) {
    return {
      key: "cache_hit_rate",
      score: 0,
      comment: "Prompt caching is not active or not supported by this provider.",
    };
  }

  const totalInputTokens = metrics.promptTokens;

  // Without a positive denominator the ratio would be NaN or Infinity, which
  // is not a meaningful score; report it as not applicable instead.
  if (typeof totalInputTokens !== "number" || !(totalInputTokens > 0)) {
    return {
      key: "cache_hit_rate",
      score: null,
      comment: "No prompt token count found in run output; cache hit rate cannot be computed.",
    };
  }

  const hitRate = readTokens / totalInputTokens;

  return {
    key: "cache_hit_rate",
    score: hitRate,
    comment: `Cache Hit Rate: ${(hitRate * 100).toFixed(1)}% (${readTokens} / ${totalInputTokens} input tokens)`,
  };
}
58
-
/**
 * Custom evaluator: Output Artifact Check.
 * Checks whether the example's `expected_file` is listed in the run output's
 * `fileManifest`.
 *
 * @param run - The run whose output manifest is inspected.
 * @param example - Dataset example that may name an `expected_file`.
 * @returns An `expected_file_created` result: score `null` when the example
 *   names no expected file, `1` when the manifest includes it, `0` otherwise.
 */
export async function filePresenceEvaluator(run: Run, example?: Example): Promise<EvaluationResult> {
  const expectedFile = example?.outputs?.expected_file;
  if (!expectedFile) {
    return { key: "expected_file_created", score: null };
  }

  // A missing outputs object or manifest is treated as an empty manifest.
  const manifest = run.outputs?.fileManifest || [];
  const wasCreated = manifest.includes(expectedFile);

  if (wasCreated) {
    return {
      key: "expected_file_created",
      score: 1,
      comment: `File ${expectedFile} created successfully.`,
    };
  }

  return {
    key: "expected_file_created",
    score: 0,
    comment: `Failed to create expected file: ${expectedFile}`,
  };
}