npm - keystone-cli - Versions diffs - 2.0.1 → 2.1.1 - Mend

keystone-cli 2.0.1 → 2.1.1

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (58)

package/README.md +30 -4
package/package.json +17 -3
package/src/cli.ts +3 -2
package/src/commands/event.ts +9 -0
package/src/commands/run.ts +17 -0
package/src/db/dynamic-state-manager.ts +12 -9
package/src/db/memory-db.test.ts +19 -1
package/src/db/memory-db.ts +101 -22
package/src/db/workflow-db.ts +181 -9
package/src/expression/evaluator.ts +4 -1
package/src/parser/schema.ts +2 -1
package/src/runner/__test__/llm-test-setup.ts +43 -11
package/src/runner/durable-timers.test.ts +1 -1
package/src/runner/executors/dynamic-executor.ts +125 -88
package/src/runner/executors/engine-executor.ts +10 -39
package/src/runner/executors/file-executor.ts +38 -0
package/src/runner/executors/foreach-executor.ts +170 -17
package/src/runner/executors/human-executor.ts +18 -0
package/src/runner/executors/llm/stream-handler.ts +103 -0
package/src/runner/executors/llm/tool-manager.ts +342 -0
package/src/runner/executors/llm-executor.ts +313 -550
package/src/runner/executors/memory-executor.ts +41 -34
package/src/runner/executors/shell-executor.ts +141 -54
package/src/runner/executors/subworkflow-executor.ts +16 -0
package/src/runner/executors/types.ts +3 -1
package/src/runner/executors/verification_fixes.test.ts +46 -0
package/src/runner/join-scheduling.test.ts +2 -1
package/src/runner/llm-adapter.integration.test.ts +10 -5
package/src/runner/llm-adapter.ts +46 -17
package/src/runner/llm-clarification.test.ts +4 -1
package/src/runner/llm-executor.test.ts +21 -7
package/src/runner/mcp-client.ts +36 -2
package/src/runner/mcp-server.ts +65 -36
package/src/runner/memoization.test.ts +2 -2
package/src/runner/recovery-security.test.ts +5 -2
package/src/runner/reflexion.test.ts +6 -3
package/src/runner/services/context-builder.ts +13 -4
package/src/runner/services/workflow-validator.ts +2 -1
package/src/runner/shell-executor.test.ts +107 -1
package/src/runner/standard-tools-ast.test.ts +4 -2
package/src/runner/standard-tools-execution.test.ts +14 -1
package/src/runner/standard-tools-integration.test.ts +6 -0
package/src/runner/standard-tools.ts +13 -10
package/src/runner/step-executor.ts +2 -2
package/src/runner/tool-integration.test.ts +4 -1
package/src/runner/workflow-runner.test.ts +23 -12
package/src/runner/workflow-runner.ts +174 -85
package/src/runner/workflow-state.ts +186 -111
package/src/ui/dashboard.tsx +17 -3
package/src/utils/config-loader.ts +4 -0
package/src/utils/constants.ts +4 -0
package/src/utils/context-injector.test.ts +27 -27
package/src/utils/context-injector.ts +68 -26
package/src/utils/process-sandbox.ts +138 -148
package/src/utils/redactor.ts +39 -9
package/src/utils/resource-loader.ts +24 -19
package/src/utils/sandbox.ts +6 -0
package/src/utils/stream-utils.ts +58 -0

package/src/runner/llm-executor.test.ts CHANGED Viewed

@@ -29,6 +29,7 @@ import type { ExpressionContext } from '../expression/evaluator';
 import * as agentParser from '../parser/agent-parser';
 import type { Agent, LlmStep, Step } from '../parser/schema';
 import { ConfigLoader } from '../utils/config-loader';
+import * as llmAdapter from './llm-adapter';
 import type { StepResult } from './step-executor';
 // Note: mock.module() for llm-adapter is now handled by the preload file
@@ -66,6 +67,7 @@ describe('llm-executor', () => {
   let spawnSpy: ReturnType<typeof spyOn>;
   let resolveAgentPathSpy: ReturnType<typeof spyOn>;
   let parseAgentSpy: ReturnType<typeof spyOn>;
+  let getModelSpy: ReturnType<typeof spyOn>;
   // Default Mock Chat Logic
   const defaultMockChat = async (messages: LLMMessage[], _options: any) => {
@@ -184,7 +186,9 @@ describe('llm-executor', () => {
     ConfigLoader.clear();
     setupLlmMocks();
     resetLlmMocks();
-    mockGetModel.mockResolvedValue(createUnifiedMockModel());
+    // Spy on getModel to return our mock model directly
+    getModelSpy = spyOn(llmAdapter, 'getModel').mockResolvedValue(createUnifiedMockModel() as any);
     // Mock agent parser to avoid file dependencies
     resolveAgentPathSpy = spyOn(agentParser, 'resolveAgentPath').mockReturnValue('test-agent.md');
@@ -215,6 +219,7 @@ describe('llm-executor', () => {
   afterEach(() => {
     resolveAgentPathSpy?.mockRestore();
     parseAgentSpy?.mockRestore();
+    getModelSpy?.mockRestore();
   });
   afterAll(() => {
@@ -254,21 +259,25 @@ describe('llm-executor', () => {
       needs: [],
       maxIterations: 10,
     };
-    const logger = { log: mock(), error: mock(), warn: mock(), info: mock(), debug: mock() };
+    const loggerSpy = { log: mock(), error: mock(), warn: mock(), info: mock(), debug: mock() };
+    const consoleSpy = spyOn(console, 'error').mockImplementation(() => {});
     await executeLlmStep(
       step,
       { inputs: {}, steps: {} },
       async () => ({ status: 'success', output: 'ok' }),
-      logger
+      loggerSpy
     );
-    expect(logger.log).toHaveBeenCalledWith(
+    consoleSpy.mockRestore();
+    expect(loggerSpy.log).toHaveBeenCalledWith(
       expect.stringContaining('  🛠️  Tool Call: test-tool {"val":123}')
     );
   });
-  it('should return raw output logic if schema schema validation fails (no retry implemented)', async () => {
+  it('should return failed status if schema validation fails and JSON cannot be extracted', async () => {
     setupMockModel(defaultMockChat as any);
     const step: LlmStep = {
       id: 'l1',
@@ -282,13 +291,14 @@ describe('llm-executor', () => {
     // Case 1: Model returns text that is NOT valid JSON
     setupMockModel(async () => ({ message: { role: 'assistant', content: 'Not JSON' } }));
     const result = await executeLlmStep(step, { inputs: {}, steps: {} }, async () => ({
       status: 'success',
       output: 'ok',
     }));
-    // current simple refactor doesn't implement retry, just returns text or throws
-    expect(result.output).toBe('Not JSON');
+    expect(result.status).toBe('failed');
+    expect(result.error).toContain('Failed to extract valid JSON');
   });
   it('should handle tool not found', async () => {
@@ -301,11 +311,15 @@ describe('llm-executor', () => {
       maxIterations: 10,
     };
+    const consoleSpy = spyOn(console, 'error').mockImplementation(() => {});
     const result = await executeLlmStep(step, { inputs: {}, steps: {} }, async () => ({
       status: 'success',
       output: 'ok',
     }));
+    consoleSpy.mockRestore();
     expect(result.status).toBe('success');
   });

package/src/runner/mcp-client.ts CHANGED Viewed

@@ -13,6 +13,9 @@ export const MCP_PROTOCOL_VERSION = MCP.PROTOCOL_VERSION;
 // Maximum buffer size for incoming messages (10MB) to prevent memory exhaustion
 const MAX_BUFFER_SIZE = 10 * 1024 * 1024;
+// Track if we have already warned about SSRF limitations to avoid log spam
+let hasWarnedSSRF = false;
 /**
  * Efficient line splitting without regex to prevent ReDoS attacks.
  * Handles \r\n, \r, and \n line endings.
@@ -99,7 +102,7 @@ function isPrivateIpAddress(address: string): boolean {
 export async function validateRemoteUrl(
   url: string,
-  options: { allowInsecure?: boolean } = {}
+  options: { allowInsecure?: boolean; logger?: Logger } = {}
 ): Promise<void> {
   let parsed: URL;
   try {
@@ -114,6 +117,11 @@ export async function validateRemoteUrl(
   }
   // Require HTTPS in production
+  // SECURITY WARNING: This check is susceptible to TOCTOU (Time-of-Check to Time-of-Use)
+  // DNS rebinding attacks. A malicious domain could resolve to a public IP during this check
+  // and then switch to a private IP (e.g. 127.0.0.1) when the connection is actually made.
+  // Full protection requires resolving the IP once and using that IP for the connection,
+  // or using a proxy that enforces these rules.
   if (parsed.protocol !== 'https:') {
     throw new Error(
       `SSRF Protection: URL must use HTTPS. Got: ${parsed.protocol}. Set allowInsecure option to true if you trust this server.`
@@ -154,12 +162,28 @@ export async function validateRemoteUrl(
   // Resolve DNS to prevent hostnames that map to private IPs (DNS rebinding checks)
   // WARNING: This check is vulnerable to Time-of-Check Time-of-Use (TOCTOU) DNS Rebinding attacks.
   // A malicious DNS server could return a public IP here, then switch to a private IP for the actual fetch.
-  // In a nodejs environment using standard fetch/native DNS, this is hard to fully prevent without
+  // In a nodejs/bun environment using standard fetch/native DNS, this is hard to fully prevent without
   // a custom agent that pins the IP or low-level socket inspection.
+  // Users requiring high security should run this in an isolated network environment (container/VM).
   // For now, this check provides "defense in depth" against accidental internal access.
+  // CRITICAL SECURITY NOTE: In high-security environments, do NOT rely solely on this check.
+  // Use network-level isolation (e.g. firewalls, service meshes, or egress proxies) to strictly block
+  // internal traffic from the Keystone process.
+  //
+  // Recommendation: Use 'allowInsecure: true' only in trusted environments.
   if (!isIP(hostname)) {
     try {
+      // WARNING: This check is vulnerable to DNS Rebinding (TOCTOU)
+      if (options.logger?.warn && !hasWarnedSSRF) {
+        options.logger.warn(
+          '  ⚠️  Security Note: Remote URL validation provides defense-in-depth but does not fully prevent DNS rebinding attacks.\n' +
+            '      For high-security environments, ensure network-level isolation (e.g. firewalls).'
+        );
+        hasWarnedSSRF = true;
+      }
       const resolved = await lookup(hostname, { all: true });
       for (const record of resolved) {
         if (isPrivateIpAddress(record.address)) {
           throw new Error(
@@ -168,6 +192,16 @@ export async function validateRemoteUrl(
         }
       }
     } catch (error) {
+      if (error instanceof Error && error.message.startsWith('SSRF Protection')) {
+        throw error;
+      }
+      if (options.logger?.warn) {
+        options.logger.warn(
+          `[Security Warning] validateRemoteUrl check for ${hostname} failed/bypassed: ${error}`
+        );
+      }
       throw new Error(
         `SSRF Protection: Failed to resolve hostname "${hostname}": ${
           error instanceof Error ? error.message : String(error)

package/src/runner/mcp-server.ts CHANGED Viewed

@@ -248,14 +248,22 @@ export class MCPServer {
             const path = WorkflowRegistry.resolvePath(workflow_name);
             const workflow = WorkflowParser.loadWorkflow(path);
-            // Use a custom logger that captures logs for the MCP response
+            // Use a fixed-size ring buffer for logs to prevent memory leaks
+            const MAX_LOG_LINES = 1000;
             const logs: string[] = [];
+            const addLog = (msg: string) => {
+              if (logs.length >= MAX_LOG_LINES) {
+                logs.shift(); // Remove oldest
+              }
+              logs.push(msg);
+            };
             const logger = {
-              log: (msg: string) => logs.push(msg),
-              error: (msg: string) => logs.push(`ERROR: ${msg}`),
-              warn: (msg: string) => logs.push(`WARN: ${msg}`),
-              info: (msg: string) => logs.push(`INFO: ${msg}`),
-              debug: (msg: string) => logs.push(`DEBUG: ${msg}`),
+              log: (msg: string) => addLog(msg),
+              error: (msg: string) => addLog(`ERROR: ${msg}`),
+              warn: (msg: string) => addLog(`WARN: ${msg}`),
+              info: (msg: string) => addLog(`INFO: ${msg}`),
+              debug: (msg: string) => addLog(`DEBUG: ${msg}`),
             };
             const runner = this.runnerFactory(workflow, {
@@ -545,37 +553,58 @@ export class MCPServer {
             const runId = runner.getRunId();
             // Start the workflow asynchronously
-            runner.run().then(
-              async (outputs) => {
-                // Update DB with success on completion
-                await this.db.updateRunStatus(runId, 'success', outputs);
-              },
-              async (error) => {
-                // Update DB with failure
-                if (error instanceof WorkflowSuspendedError) {
-                  await this.db.updateRunStatus(runId, 'paused');
-                  this.sendNotification('notifications/keystone.human_input', {
-                    run_id: runId,
-                    workflow: workflow_name,
-                    status: 'paused',
-                    message: error.message,
-                    step_id: error.stepId,
-                    input_type: error.inputType,
-                    instructions:
-                      error.inputType === 'confirm'
-                        ? 'Use answer_human_input with input="confirm" to proceed.'
-                        : 'Use answer_human_input with the required text input.',
-                  });
-                } else {
-                  await this.db.updateRunStatus(
-                    runId,
-                    'failed',
-                    undefined,
-                    error instanceof Error ? error.message : String(error)
-                  );
+            // Start the workflow asynchronously
+            runner
+              .run()
+              .then(
+                async (outputs) => {
+                  try {
+                    // Update DB with success on completion
+                    await this.db.updateRunStatus(runId, 'success', outputs);
+                  } catch (e) {
+                    this.logger.error(
+                      `[McpServer] Failed to update success status for run ${runId}: ${e}`
+                    );
+                  }
+                },
+                async (error) => {
+                  try {
+                    // Update DB with failure
+                    if (error instanceof WorkflowSuspendedError) {
+                      await this.db.updateRunStatus(runId, 'paused');
+                      this.sendNotification('notifications/keystone.human_input', {
+                        run_id: runId,
+                        workflow: workflow_name,
+                        status: 'paused',
+                        message: error.message,
+                        step_id: error.stepId,
+                        input_type: error.inputType,
+                        instructions:
+                          error.inputType === 'confirm'
+                            ? 'Use answer_human_input with input="confirm" to proceed.'
+                            : 'Use answer_human_input with the required text input.',
+                      });
+                    } else {
+                      await this.db.updateRunStatus(
+                        runId,
+                        'failed',
+                        undefined,
+                        error instanceof Error ? error.message : String(error)
+                      );
+                    }
+                  } catch (e) {
+                    this.logger.error(
+                      `[McpServer] Failed to update failure status for run ${runId}: ${e}`
+                    );
+                  }
                 }
-              }
-            );
+              )
+              .catch((e) => {
+                // Catch any other errors in the promise chain construction
+                this.logger.error(
+                  `[McpServer] Unexpected error in async workflow execution for run ${runId}: ${e}`
+                );
+              });
             return {
               jsonrpc: '2.0',

package/src/runner/memoization.test.ts CHANGED Viewed

@@ -86,13 +86,13 @@ describe('Workflow Memoization (Auto-Hashing)', () => {
     // We can check if `executeLlmStep` was called.
     let called = false;
-    const trackingExecute = async (s: any, c: any) => {
+    // Match signature of executeLlmStep (at least the required args)
+    const trackingExecute = async (s: any, c: any, _execFn: any, ..._args: any[]) => {
       called = true;
       return mockExecuteLlmStep(s, c);
     };
     // Override the executor for runner2 to track calls
-    // @ts-ignore - hacking private property or constructor option
     // Actually we passed it in constructor option.
     const runner2Tracked = new WorkflowRunner(workflow, {
       dbPath,

package/src/runner/recovery-security.test.ts CHANGED Viewed

@@ -10,20 +10,23 @@ import {
 import { ConfigLoader } from '../utils/config-loader';
-import { beforeEach, describe, expect, jest, mock, test } from 'bun:test';
+import { beforeEach, describe, expect, jest, mock, spyOn, test } from 'bun:test';
 import type { Step, Workflow } from '../parser/schema';
+import * as llmAdapter from './llm-adapter';
 // Note: mock.module() for llm-adapter is now handled by the preload file
 // We should NOT mock 'ai' globally as it breaks other tests using the real ai SDK.
 // Instead, we use a mock model that the real ai SDK calls.
 describe('WorkflowRunner Recovery Security', () => {
+  let getModelSpy: ReturnType<typeof spyOn>;
   beforeEach(() => {
     jest.restoreAllMocks();
     ConfigLoader.clear();
     setupLlmMocks();
     resetLlmMocks();
-    mockGetModel.mockResolvedValue(createUnifiedMockModel());
+    getModelSpy = spyOn(llmAdapter, 'getModel').mockResolvedValue(createUnifiedMockModel() as any);
   });
   test('should NOT allow reflexion to overwrite critical step properties', async () => {

package/src/runner/reflexion.test.ts CHANGED Viewed

@@ -7,9 +7,10 @@ import {
   setupLlmMocks,
 } from './__test__/llm-test-setup';
-import { beforeAll, beforeEach, describe, expect, jest, mock, test } from 'bun:test';
+import { beforeAll, beforeEach, describe, expect, jest, mock, spyOn, test } from 'bun:test';
 import type { Step, Workflow } from '../parser/schema';
 import { ConfigLoader } from '../utils/config-loader';
+import * as llmAdapter from './llm-adapter';
 // Note: mock.module() for llm-adapter is now handled by the preload file
 // We should NOT mock 'ai' globally as it breaks other tests using the real ai SDK.
@@ -19,6 +20,8 @@ import { ConfigLoader } from '../utils/config-loader';
 let WorkflowRunner: any;
 describe('WorkflowRunner Reflexion', () => {
+  let getModelSpy: ReturnType<typeof spyOn>;
   beforeAll(async () => {
     // Set up config
     ConfigLoader.setConfig({
@@ -34,7 +37,7 @@ describe('WorkflowRunner Reflexion', () => {
       expression: { strict: false },
     } as any);
-    mockGetModel.mockResolvedValue(createUnifiedMockModel());
+    getModelSpy = spyOn(llmAdapter, 'getModel').mockResolvedValue(createUnifiedMockModel() as any);
     setupLlmMocks();
     setCurrentChatFn(async () => ({
@@ -50,8 +53,8 @@ describe('WorkflowRunner Reflexion', () => {
     ConfigLoader.clear();
     jest.restoreAllMocks();
     setupLlmMocks();
-    setupLlmMocks();
     resetLlmMocks();
+    getModelSpy = spyOn(llmAdapter, 'getModel').mockResolvedValue(createUnifiedMockModel() as any);
     setCurrentChatFn(async () => ({
       message: { role: 'assistant', content: JSON.stringify({ run: 'echo "fixed"' }) },
     }));

package/src/runner/services/context-builder.ts CHANGED Viewed

@@ -1,6 +1,6 @@
 import type { ExpressionContext } from '../../expression/evaluator.ts';
 import { ExpressionEvaluator } from '../../expression/evaluator.ts';
-import type { Workflow } from '../../parser/schema.ts';
+import type { Step, Workflow } from '../../parser/schema.ts';
 import type { Logger } from '../../utils/logger.ts';
 import type { WorkflowState } from '../workflow-state.ts';
@@ -92,7 +92,7 @@ export class ContextBuilder {
   /**
    * Builds input object for a specific step.
    */
-  public buildStepInputs(step: any, context: ExpressionContext): Record<string, unknown> {
+  public buildStepInputs(step: Step, context: ExpressionContext): Record<string, unknown> {
     const stripUndefined = (value: Record<string, unknown>) => {
       const result: Record<string, unknown> = {};
       for (const [key, val] of Object.entries(value)) {
@@ -165,8 +165,17 @@ export class ContextBuilder {
           inputType: step.inputType,
         });
       case 'sleep': {
-        const evaluated = ExpressionEvaluator.evaluate(step.duration.toString(), context);
-        return { duration: Number(evaluated) };
+        return stripUndefined({
+          duration:
+            step.duration !== undefined
+              ? Number(ExpressionEvaluator.evaluate(step.duration.toString(), context))
+              : undefined,
+          until:
+            step.until !== undefined
+              ? ExpressionEvaluator.evaluateString(step.until, context)
+              : undefined,
+          durable: step.durable,
+        });
       }
       case 'llm':
         return stripUndefined({

package/src/runner/services/workflow-validator.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { Workflow, WorkflowInput } from '../../parser/schema.ts';
+import { Redactor } from '../../utils/redactor.ts';
 import { validateJsonSchema } from '../../utils/schema-validator.ts';
 import { SecretManager } from './secret-manager.ts';
@@ -6,7 +7,7 @@ import { SecretManager } from './secret-manager.ts';
  * Service for validating workflow inputs and applying defaults.
  */
 export class WorkflowValidator {
-  public static readonly REDACTED_PLACEHOLDER = '[REDACTED]';
+  public static readonly REDACTED_PLACEHOLDER = Redactor.REDACTED_PLACEHOLDER;
   constructor(
     private workflow: Workflow,

package/src/runner/shell-executor.test.ts CHANGED Viewed

@@ -1,9 +1,17 @@
 import { describe, expect, it } from 'bun:test';
+import { realpathSync } from 'node:fs';
+import { tmpdir } from 'node:os';
+import { basename, resolve as resolvePath, sep } from 'node:path';
 import type { ExpressionContext } from '../expression/evaluator';
+import { ConfigSchema } from '../parser/config-schema';
 import type { ShellStep } from '../parser/schema';
-import { escapeShellArg, executeShell } from './executors/shell-executor.ts';
+import { ConfigLoader } from '../utils/config-loader';
+import { ConsoleLogger } from '../utils/logger';
+import { escapeShellArg, executeShell, executeShellStep } from './executors/shell-executor.ts';
 describe('shell-executor', () => {
+  const logger = new ConsoleLogger();
   describe('escapeShellArg', () => {
     it('should wrap in single quotes', () => {
       expect(escapeShellArg('hello')).toBe("'hello'");
@@ -174,4 +182,102 @@ describe('shell-executor', () => {
       expect(result.stdout.trim()).toBe('match');
     });
   });
+  describe('executeShellStep (args)', () => {
+    const context: ExpressionContext = {
+      inputs: {},
+      steps: {},
+      env: {},
+    };
+    it('should reject empty args', async () => {
+      const step: ShellStep = {
+        id: 'test',
+        type: 'shell',
+        needs: [],
+        args: [],
+      };
+      await expect(executeShellStep(step, context, logger)).rejects.toThrow(
+        /args must contain at least one element/
+      );
+    });
+    it('should apply step env for args execution', async () => {
+      const bunPath = process.execPath;
+      const step: ShellStep = {
+        id: 'test',
+        type: 'shell',
+        needs: [],
+        args: [bunPath, '-e', 'console.log(process.env.TEST_VAR ?? "")'],
+        env: { TEST_VAR: 'args-env' },
+      };
+      const result = await executeShellStep(step, context, logger);
+      expect(result.output?.stdout?.trim()).toBe('args-env');
+    });
+    it('should enforce denylist for args execution', async () => {
+      const bunPath = process.execPath;
+      const denied = basename(bunPath);
+      ConfigLoader.setConfig(
+        ConfigSchema.parse({
+          engines: { denylist: [denied] },
+        })
+      );
+      try {
+        const step: ShellStep = {
+          id: 'test',
+          type: 'shell',
+          needs: [],
+          args: [bunPath, '-e', 'console.log("nope")'],
+        };
+        await expect(executeShellStep(step, context, logger)).rejects.toThrow(/denylist/);
+      } finally {
+        ConfigLoader.clear();
+      }
+    });
+    it('should enforce allowOutsideCwd for args execution', async () => {
+      const bunPath = process.execPath;
+      const cwd = resolvePath(process.cwd());
+      let outsideDir = resolvePath(tmpdir());
+      if (outsideDir.startsWith(`${cwd}${sep}`)) {
+        const parent = resolvePath(cwd, '..');
+        if (parent !== cwd) {
+          outsideDir = parent;
+        }
+      }
+      if (outsideDir === cwd) {
+        return;
+      }
+      const step: ShellStep = {
+        id: 'test',
+        type: 'shell',
+        needs: [],
+        args: [bunPath, '-e', 'console.log(process.cwd())'],
+        dir: outsideDir,
+      };
+      await expect(executeShellStep(step, context, logger)).rejects.toThrow(
+        /outside the project directory/
+      );
+      const allowedStep: ShellStep = {
+        ...step,
+        allowOutsideCwd: true,
+      };
+      const result = await executeShellStep(allowedStep, context, logger);
+      const resolvedOutput = realpathSync(resolvePath(result.output?.stdout?.trim() || ''));
+      const resolvedOutside = realpathSync(outsideDir);
+      expect(resolvedOutput).toBe(resolvedOutside);
+    });
+  });
 });

package/src/runner/standard-tools-ast.test.ts CHANGED Viewed

@@ -142,7 +142,8 @@ describe('AST-Grep Tools', () => {
       };
       expect(() => {
-        vm.runInNewContext(script, sandbox);
+        // Wrap in async IIFE to support top-level return
+        vm.runInNewContext(`(async () => { ${script} })();`, sandbox);
       }).not.toThrow();
     });
   });
@@ -189,7 +190,8 @@ describe('AST-Grep Tools', () => {
       };
       expect(() => {
-        vm.runInNewContext(script, sandbox);
+        // Wrap in async IIFE to support top-level return
+        vm.runInNewContext(`(async () => { ${script} })();`, sandbox);
       }).not.toThrow();
     });
   });

package/src/runner/standard-tools-execution.test.ts CHANGED Viewed

@@ -54,12 +54,25 @@ describe('Standard Tools Execution Verification', () => {
               }),
             };
           }
+          if (mod === 'node:worker_threads') {
+            return {
+              Worker: class MockWorker {
+                on() {}
+                terminate() {}
+              },
+              parentPort: null,
+              workerData: null,
+            };
+          }
           return {};
         },
       };
       expect(() => {
-        vm.runInNewContext(script, sandbox);
+        // Wrap the script in an async IIFE to match ProcessSandbox behavior
+        // ProcessSandbox wraps scripts: const __result = await (async () => { ${code} })();
+        const wrappedScript = `(async () => { ${script} })()`;
+        vm.runInNewContext(wrappedScript, sandbox);
       }).not.toThrow();
     });
   }

package/src/runner/standard-tools-integration.test.ts CHANGED Viewed

@@ -25,6 +25,7 @@ import type { ExpressionContext } from '../expression/evaluator';
 import * as agentParser from '../parser/agent-parser';
 import type { Agent, LlmStep, Step } from '../parser/schema';
 import { ConfigLoader } from '../utils/config-loader';
+import * as llmAdapter from './llm-adapter';
 import type { StepResult } from './step-executor';
 // Note: mock.module() is now handled by the preload file
@@ -40,6 +41,7 @@ describe('Standard Tools Integration', () => {
   const testDir = join(process.cwd(), '.e2e-tmp', 'standard-tools-test');
   let resolveAgentPathSpy: ReturnType<typeof spyOn>;
   let parseAgentSpy: ReturnType<typeof spyOn>;
+  let getModelSpy: ReturnType<typeof spyOn>;
   beforeAll(async () => {
     // Setup config before importing the executor
@@ -54,6 +56,9 @@ describe('Standard Tools Integration', () => {
       model_mappings: {},
     } as any);
+    // Spy on getModel to return mock model
+    getModelSpy = spyOn(llmAdapter, 'getModel').mockResolvedValue(createUnifiedMockModel() as any);
     // Ensure the mock model is set up
     setupLlmMocks();
@@ -88,6 +93,7 @@ describe('Standard Tools Integration', () => {
   afterEach(() => {
     resolveAgentPathSpy?.mockRestore();
     parseAgentSpy?.mockRestore();
+    getModelSpy?.mockClear();
     resetLlmMocks();
   });