keystone-cli 0.6.0 → 0.6.1

This diff compares the content of publicly available package versions released to a supported registry. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registry.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "keystone-cli",
3
- "version": "0.6.0",
3
+ "version": "0.6.1",
4
4
  "description": "A local-first, declarative, agentic workflow orchestrator built on Bun",
5
5
  "type": "module",
6
6
  "bin": {
package/src/cli.ts CHANGED
@@ -9,7 +9,7 @@ import architectAgent from './templates/agents/keystone-architect.md' with { typ
9
9
  // Default templates
10
10
  import scaffoldWorkflow from './templates/scaffold-feature.yaml' with { type: 'text' };
11
11
 
12
- import { WorkflowDb } from './db/workflow-db.ts';
12
+ import { WorkflowDb, type WorkflowRun } from './db/workflow-db.ts';
13
13
  import { WorkflowParser } from './parser/workflow-parser.ts';
14
14
  import { ConfigLoader } from './utils/config-loader.ts';
15
15
  import { ConsoleLogger } from './utils/logger.ts';
@@ -279,7 +279,79 @@ program
279
279
  }
280
280
  });
281
281
 
282
- // ... (optimize command remains here) ...
282
+ // ===== keystone workflows =====
283
+ program
284
+ .command('workflows')
285
+ .description('List available workflows')
286
+ .action(() => {
287
+ const workflows = WorkflowRegistry.listWorkflows();
288
+ if (workflows.length === 0) {
289
+ console.log('No workflows found. Run "keystone init" to seed default workflows.');
290
+ return;
291
+ }
292
+
293
+ console.log('\nšŸ›ļø Available Workflows:');
294
+ for (const w of workflows) {
295
+ console.log(`\n ${w.name}`);
296
+ if (w.description) {
297
+ console.log(` ${w.description}`);
298
+ }
299
+ }
300
+ console.log('');
301
+ });
302
+
303
+ // ===== keystone optimize =====
304
+ program
305
+ .command('optimize')
306
+ .description('Optimize a specific step in a workflow using iterative evaluation')
307
+ .argument('<workflow>', 'Workflow name or path to workflow file')
308
+ .requiredOption('-t, --target <step_id>', 'Target step ID to optimize')
309
+ .option('-n, --iterations <number>', 'Number of optimization iterations', '5')
310
+ .option('-i, --input <key=value...>', 'Input values for evaluation')
311
+ .action(async (workflowPath, options) => {
312
+ try {
313
+ const { OptimizationRunner } = await import('./runner/optimization-runner.ts');
314
+ const resolvedPath = WorkflowRegistry.resolvePath(workflowPath);
315
+ const workflow = WorkflowParser.loadWorkflow(resolvedPath);
316
+
317
+ // Parse inputs
318
+ const inputs: Record<string, unknown> = {};
319
+ if (options.input) {
320
+ for (const pair of options.input) {
321
+ const index = pair.indexOf('=');
322
+ if (index > 0) {
323
+ const key = pair.slice(0, index);
324
+ const value = pair.slice(index + 1);
325
+ try {
326
+ inputs[key] = JSON.parse(value);
327
+ } catch {
328
+ inputs[key] = value;
329
+ }
330
+ }
331
+ }
332
+ }
333
+
334
+ const runner = new OptimizationRunner(workflow, {
335
+ workflowPath: resolvedPath,
336
+ targetStepId: options.target,
337
+ iterations: Number.parseInt(options.iterations, 10),
338
+ inputs,
339
+ });
340
+
341
+ console.log('šŸ›ļø Keystone Prompt Optimization');
342
+ const { bestPrompt, bestScore } = await runner.optimize();
343
+
344
+ console.log('\n✨ Optimization Complete!');
345
+ console.log(`🏆 Best Score: ${bestScore}/100`);
346
+ console.log('\nBest Prompt/Command:');
347
+ console.log(''.padEnd(80, '-'));
348
+ console.log(bestPrompt);
349
+ console.log(''.padEnd(80, '-'));
350
+ } catch (error) {
351
+ console.error('✗ Optimization failed:', error instanceof Error ? error.message : error);
352
+ process.exit(1);
353
+ }
354
+ });
283
355
 
284
356
  // ===== keystone resume =====
285
357
  program
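
A note on the new `optimize` command's `-i, --input <key=value...>` flag added above: each pair is split on the first `=`, and the value is parsed as JSON where possible, falling back to a plain string. A minimal standalone sketch of that behaviour (the `parseInputPairs` helper is illustrative, not part of the package):

    // Mirrors the input-parsing loop in the `optimize` action above:
    // JSON values (numbers, booleans, objects) are decoded, anything else
    // stays a string, and pairs without a key before '=' are ignored.
    function parseInputPairs(pairs: string[]): Record<string, unknown> {
      const inputs: Record<string, unknown> = {};
      for (const pair of pairs) {
        const index = pair.indexOf('=');
        if (index <= 0) continue; // malformed pair: no key before '='
        const key = pair.slice(0, index);
        const value = pair.slice(index + 1);
        try {
          inputs[key] = JSON.parse(value);
        } catch {
          inputs[key] = value;
        }
      }
      return inputs;
    }

    // parseInputPairs(['retries=3', 'name=draft', 'opts={"strict":true}'])
    //   -> { retries: 3, name: 'draft', opts: { strict: true } }
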
@@ -347,40 +419,180 @@ program
347
419
  }
348
420
  });
349
421
 
350
- // ... (other commands) ...
351
-
352
- // ===== keystone maintenance =====
422
+ // ===== keystone history =====
353
423
  program
354
- .command('maintenance')
355
- .description('Perform database maintenance (prune old runs and vacuum)')
356
- .option('--days <days>', 'Delete runs older than this many days', '30')
424
+ .command('history')
425
+ .description('Show recent workflow runs')
426
+ .option('-l, --limit <number>', 'Limit the number of runs to show', '50')
357
427
  .action(async (options) => {
358
428
  try {
359
- const days = Number.parseInt(options.days, 10);
360
- if (Number.isNaN(days) || days < 0) {
361
- console.error('✗ Invalid days value. Must be a positive number.');
362
- process.exit(1);
429
+ const db = new WorkflowDb();
430
+ const limit = Number.parseInt(options.limit, 10);
431
+ const runs = await db.listRuns(limit);
432
+ db.close();
433
+
434
+ if (runs.length === 0) {
435
+ console.log('No workflow runs found.');
436
+ return;
363
437
  }
364
438
 
365
- console.log('🧹 Starting maintenance...');
439
+ console.log('\nšŸ›ļø Workflow Run History:');
440
+ console.log(''.padEnd(100, '-'));
441
+ console.log(
442
+ `${'ID'.padEnd(10)} ${'Workflow'.padEnd(25)} ${'Status'.padEnd(15)} ${'Started At'}`
443
+ );
444
+ console.log(''.padEnd(100, '-'));
445
+
446
+ for (const run of runs) {
447
+ const id = run.id.slice(0, 8);
448
+ const status = run.status;
449
+ const color =
450
+ status === 'success' ? '\x1b[32m' : status === 'failed' ? '\x1b[31m' : '\x1b[33m';
451
+ const reset = '\x1b[0m';
452
+
453
+ console.log(
454
+ `${id.padEnd(10)} ${run.workflow_name.padEnd(25)} ${color}${status.padEnd(
455
+ 15
456
+ )}${reset} ${new Date(run.started_at).toLocaleString()}`
457
+ );
458
+ }
459
+ console.log('');
460
+ } catch (error) {
461
+ console.error('✗ Failed to list runs:', error instanceof Error ? error.message : error);
462
+ process.exit(1);
463
+ }
464
+ });
465
+
466
+ // ===== keystone logs =====
467
+ program
468
+ .command('logs')
469
+ .description('Show logs for a specific workflow run')
470
+ .argument('<run_id>', 'Run ID to show logs for')
471
+ .option('-v, --verbose', 'Show detailed step outputs')
472
+ .action(async (runId, options) => {
473
+ try {
366
474
  const db = new WorkflowDb();
475
+ const run = await db.getRun(runId);
367
476
 
368
- console.log(` Pruning runs older than ${days} days...`);
369
- const deleted = await db.pruneRuns(days);
370
- console.log(` ✓ Deleted ${deleted} run(s)`);
477
+ if (!run) {
478
+ // Try searching by short ID
479
+ const allRuns = await db.listRuns(200);
480
+ const matching = allRuns.find((r) => r.id.startsWith(runId));
481
+ if (matching) {
482
+ const detailedRun = await db.getRun(matching.id);
483
+ if (detailedRun) {
484
+ await showRunLogs(detailedRun, db, !!options.verbose);
485
+ db.close();
486
+ return;
487
+ }
488
+ }
371
489
 
372
- console.log(' Vacuuming database (reclaiming space)...');
373
- await db.vacuum();
374
- console.log(' ✓ Vacuum complete');
490
+ console.error(`✗ Run not found: ${runId}`);
491
+ db.close();
492
+ process.exit(1);
493
+ }
375
494
 
495
+ await showRunLogs(run, db, !!options.verbose);
376
496
  db.close();
377
- console.log('\n✨ Maintenance completed successfully!');
378
497
  } catch (error) {
379
- console.error('✗ Maintenance failed:', error instanceof Error ? error.message : error);
498
+ console.error('✗ Failed to show logs:', error instanceof Error ? error.message : error);
380
499
  process.exit(1);
381
500
  }
382
501
  });
383
502
 
503
+ async function showRunLogs(run: WorkflowRun, db: WorkflowDb, verbose: boolean) {
504
+ console.log(`\nšŸ›ļø Run: ${run.workflow_name} (${run.id})`);
505
+ console.log(` Status: ${run.status}`);
506
+ console.log(` Started: ${new Date(run.started_at).toLocaleString()}`);
507
+ if (run.completed_at) {
508
+ console.log(` Completed: ${new Date(run.completed_at).toLocaleString()}`);
509
+ }
510
+
511
+ const steps = await db.getStepsByRun(run.id);
512
+ console.log(`\nSteps (${steps.length}):`);
513
+ console.log(''.padEnd(100, '-'));
514
+
515
+ for (const step of steps) {
516
+ const statusColor =
517
+ step.status === 'success' ? '\x1b[32m' : step.status === 'failed' ? '\x1b[31m' : '\x1b[33m';
518
+ const reset = '\x1b[0m';
519
+
520
+ let label = step.step_id;
521
+ if (step.iteration_index !== null) {
522
+ label += ` [${step.iteration_index}]`;
523
+ }
524
+
525
+ console.log(`${statusColor}${step.status.toUpperCase().padEnd(10)}${reset} ${label}`);
526
+
527
+ if (step.error) {
528
+ console.log(` \x1b[31mError: ${step.error}\x1b[0m`);
529
+ }
530
+
531
+ if (verbose && step.output) {
532
+ try {
533
+ const output = JSON.parse(step.output);
534
+ console.log(
535
+ ` Output: ${JSON.stringify(output, null, 2).replace(/\n/g, '\n ')}`
536
+ );
537
+ } catch {
538
+ console.log(` Output: ${step.output}`);
539
+ }
540
+ }
541
+ }
542
+
543
+ if (run.outputs) {
544
+ console.log('\nFinal Outputs:');
545
+ try {
546
+ const parsed = JSON.parse(run.outputs);
547
+ console.log(JSON.stringify(parsed, null, 2));
548
+ } catch {
549
+ console.log(run.outputs);
550
+ }
551
+ }
552
+
553
+ if (run.error) {
554
+ console.log(`\n\x1b[31mWorkflow Error:\x1b[0m ${run.error}`);
555
+ }
556
+ }
557
+
558
+ // ===== keystone prune / maintenance =====
559
+ async function performMaintenance(days: number) {
560
+ try {
561
+ console.log(`🧹 Starting maintenance (pruning runs older than ${days} days)...`);
562
+ const db = new WorkflowDb();
563
+ const count = await db.pruneRuns(days);
564
+ console.log(` ✓ Pruned ${count} old run(s)`);
565
+
566
+ console.log(' Vacuuming database (reclaiming space)...');
567
+ await db.vacuum();
568
+ console.log(' ✓ Vacuum complete');
569
+
570
+ db.close();
571
+ console.log('\n✨ Maintenance completed successfully!');
572
+ } catch (error) {
573
+ console.error('✗ Maintenance failed:', error instanceof Error ? error.message : error);
574
+ process.exit(1);
575
+ }
576
+ }
577
+
578
+ program
579
+ .command('prune')
580
+ .description('Delete old workflow runs from the database (alias for maintenance)')
581
+ .option('--days <number>', 'Days to keep', '30')
582
+ .action(async (options) => {
583
+ const days = Number.parseInt(options.days, 10);
584
+ await performMaintenance(days);
585
+ });
586
+
587
+ program
588
+ .command('maintenance')
589
+ .description('Perform database maintenance (prune old runs and vacuum)')
590
+ .option('--days <days>', 'Delete runs older than this many days', '30')
591
+ .action(async (options) => {
592
+ const days = Number.parseInt(options.days, 10);
593
+ await performMaintenance(days);
594
+ });
595
+
384
596
  // ===== keystone ui =====
385
597
  program
386
598
  .command('ui')
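
The new `logs` command above accepts either a full run ID or the 8-character prefix printed by `history`: it tries an exact `getRun` first, then falls back to a prefix match over the 200 most recent runs. A small sketch of that lookup, assuming the `WorkflowDb` API shown in this diff (the `resolveRun` helper itself is illustrative):

    import { WorkflowDb, type WorkflowRun } from './db/workflow-db.ts';

    // Exact match first, then match a truncated ID against recent runs,
    // exactly as the `logs` action above does before calling showRunLogs().
    async function resolveRun(db: WorkflowDb, runId: string): Promise<WorkflowRun | null> {
      const exact = await db.getRun(runId);
      if (exact) return exact;

      const recent = await db.listRuns(200); // `history` prints only 8-char IDs
      const match = recent.find((r) => r.id.startsWith(runId));
      return match ? (await db.getRun(match.id)) ?? null : null;
    }
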
@@ -1,5 +1,7 @@
1
1
  import type { Database } from 'bun:sqlite';
2
2
  import { randomUUID } from 'node:crypto';
3
+ import { existsSync, mkdirSync } from 'node:fs';
4
+ import { dirname } from 'node:path';
3
5
  import * as sqliteVec from 'sqlite-vec';
4
6
  import './sqlite-setup.ts';
5
7
 
@@ -22,6 +24,10 @@ export class MemoryDb {
22
24
  this.db = cached.db;
23
25
  } else {
24
26
  const { Database } = require('bun:sqlite');
27
+ const dir = dirname(dbPath);
28
+ if (!existsSync(dir)) {
29
+ mkdirSync(dir, { recursive: true });
30
+ }
25
31
  this.db = new Database(dbPath, { create: true });
26
32
 
27
33
  // Load sqlite-vec extension
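
Both `MemoryDb` (above) and `WorkflowDb` (below) now create the parent directory of their SQLite file before opening it, presumably so a first run without an existing `.keystone/` directory no longer fails. The guard, reduced to a self-contained sketch:

    import { Database } from 'bun:sqlite';
    import { existsSync, mkdirSync } from 'node:fs';
    import { dirname } from 'node:path';

    // Ensure the directory that will hold the SQLite file exists before
    // bun:sqlite opens it; { create: true } creates the file, not missing
    // directories.
    function openDatabase(dbPath: string): Database {
      const dir = dirname(dbPath);
      if (!existsSync(dir)) {
        mkdirSync(dir, { recursive: true }); // e.g. creates .keystone/ for the default path
      }
      return new Database(dbPath, { create: true });
    }
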
@@ -0,0 +1,47 @@
1
+ import { afterEach, describe, expect, it, mock, spyOn } from 'bun:test';
2
+ import type { Logger } from '../utils/logger';
3
+ import { setupSqlite } from './sqlite-setup';
4
+
5
+ describe('setupSqlite', () => {
6
+ const originalPlatform = process.platform;
7
+
8
+ afterEach(() => {
9
+ Object.defineProperty(process, 'platform', {
10
+ value: originalPlatform,
11
+ });
12
+ });
13
+
14
+ it('does nothing on non-darwin platforms', () => {
15
+ Object.defineProperty(process, 'platform', { value: 'linux' });
16
+ const logger: Logger = {
17
+ log: mock(() => {}),
18
+ warn: mock(() => {}),
19
+ error: mock(() => {}),
20
+ info: mock(() => {}),
21
+ };
22
+ setupSqlite(logger);
23
+ expect(logger.log).not.toHaveBeenCalled();
24
+ expect(logger.warn).not.toHaveBeenCalled();
25
+ });
26
+
27
+ it('logs warning if no custom sqlite found on darwin', () => {
28
+ Object.defineProperty(process, 'platform', { value: 'darwin' });
29
+ const logger: Logger = {
30
+ log: mock(() => {}),
31
+ warn: mock(() => {}),
32
+ error: mock(() => {}),
33
+ info: mock(() => {}),
34
+ };
35
+
36
+ // Mock Bun.spawnSync for brew
37
+ const spawnSpy = spyOn(Bun, 'spawnSync').mockImplementation(
38
+ () => ({ success: false }) as unknown as ReturnType<typeof Bun.spawnSync>
39
+ );
40
+
41
+ try {
42
+ setupSqlite(logger);
43
+ } finally {
44
+ spawnSpy.mockRestore();
45
+ }
46
+ });
47
+ });
@@ -1,4 +1,6 @@
1
1
  import { Database } from 'bun:sqlite';
2
+ import { existsSync, mkdirSync } from 'node:fs';
3
+ import { dirname } from 'node:path';
2
4
  import './sqlite-setup.ts';
3
5
  import {
4
6
  StepStatus as StepStatusConst,
@@ -40,6 +42,10 @@ export class WorkflowDb {
40
42
  private db: Database;
41
43
 
42
44
  constructor(public readonly dbPath = '.keystone/state.db') {
45
+ const dir = dirname(dbPath);
46
+ if (!existsSync(dir)) {
47
+ mkdirSync(dir, { recursive: true });
48
+ }
43
49
  this.db = new Database(dbPath, { create: true });
44
50
  this.db.exec('PRAGMA journal_mode = WAL;'); // Write-ahead logging
45
51
  this.db.exec('PRAGMA foreign_keys = ON;'); // Enable foreign key enforcement
@@ -1,19 +1,27 @@
1
- import { describe, expect, test } from 'bun:test';
1
+ import { describe, expect, mock, spyOn, test } from 'bun:test';
2
+ import * as cp from 'node:child_process';
3
+ import * as fs from 'node:fs';
2
4
  import { PassThrough } from 'node:stream';
3
5
  import type { ExpressionContext } from '../expression/evaluator.ts';
4
6
  import type { Step } from '../parser/schema.ts';
7
+ import type { Logger } from '../utils/logger.ts';
5
8
  import { DebugRepl } from './debug-repl.ts';
6
9
 
7
10
  describe('DebugRepl', () => {
8
11
  const mockContext: ExpressionContext = { inputs: { foo: 'bar' } };
9
- // biome-ignore lint/suspicious/noExplicitAny: mock step typing
10
- const mockStep: Step = { id: 'test-step', type: 'shell', run: 'echo "fail"' } as any;
12
+ // mock step typing
13
+ const mockStep: Step = { id: 'test-step', type: 'shell', run: 'echo "fail"' } as unknown as Step;
11
14
  const mockError = new Error('Test Error');
12
15
 
13
16
  test('should resolve with "skip" when user types "skip"', async () => {
14
17
  const input = new PassThrough();
15
18
  const output = new PassThrough();
16
- const mockLogger = { log: () => {}, error: () => {}, warn: () => {} };
19
+ const mockLogger: Logger = {
20
+ log: mock(() => {}),
21
+ error: mock(() => {}),
22
+ warn: mock(() => {}),
23
+ info: mock(() => {}),
24
+ };
17
25
  const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
18
26
 
19
27
  const promise = repl.start();
@@ -30,7 +38,12 @@ describe('DebugRepl', () => {
30
38
  test('should resolve with "retry" when user types "retry"', async () => {
31
39
  const input = new PassThrough();
32
40
  const output = new PassThrough();
33
- const mockLogger = { log: () => {}, error: () => {}, warn: () => {} };
41
+ const mockLogger: Logger = {
42
+ log: mock(() => {}),
43
+ error: mock(() => {}),
44
+ warn: mock(() => {}),
45
+ info: mock(() => {}),
46
+ };
34
47
  const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
35
48
 
36
49
  const promise = repl.start();
@@ -48,7 +61,12 @@ describe('DebugRepl', () => {
48
61
  test('should resolve with "continue_failure" when user types "exit"', async () => {
49
62
  const input = new PassThrough();
50
63
  const output = new PassThrough();
51
- const mockLogger = { log: () => {}, error: () => {}, warn: () => {} };
64
+ const mockLogger: Logger = {
65
+ log: mock(() => {}),
66
+ error: mock(() => {}),
67
+ warn: mock(() => {}),
68
+ info: mock(() => {}),
69
+ };
52
70
  const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
53
71
 
54
72
  const promise = repl.start();
@@ -60,6 +78,137 @@ describe('DebugRepl', () => {
60
78
  expect(result).toEqual({ type: 'continue_failure' });
61
79
  });
62
80
 
81
+ test('should handle "context" command', async () => {
82
+ const input = new PassThrough();
83
+ const output = new PassThrough();
84
+ const mockLogger: Logger = {
85
+ log: mock(() => {}),
86
+ error: mock(() => {}),
87
+ warn: mock(() => {}),
88
+ info: mock(() => {}),
89
+ };
90
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
91
+
92
+ repl.start();
93
+
94
+ await new Promise((r) => setTimeout(r, 10));
95
+ input.write('context\n');
96
+ await new Promise((r) => setTimeout(r, 10));
97
+
98
+ expect(mockLogger.log).toHaveBeenCalled();
99
+ // biome-ignore lint/suspicious/noExplicitAny: accessing mock property
100
+ const lastCall = (mockLogger.log as unknown as any).mock.calls.find((call: any[]) =>
101
+ String(call[0]).includes('foo')
102
+ );
103
+ expect(lastCall?.[0]).toContain('bar');
104
+ input.write('exit\n');
105
+ });
106
+
107
+ test('should handle "eval" command', async () => {
108
+ const input = new PassThrough();
109
+ const output = new PassThrough();
110
+ const mockLogger: Logger = {
111
+ log: mock(() => {}),
112
+ error: mock(() => {}),
113
+ warn: mock(() => {}),
114
+ info: mock(() => {}),
115
+ };
116
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
117
+
118
+ repl.start();
119
+
120
+ await new Promise((r) => setTimeout(r, 10));
121
+ input.write('eval inputs.foo\n');
122
+ await new Promise((r) => setTimeout(r, 10));
123
+
124
+ expect(mockLogger.log).toHaveBeenCalledWith('bar');
125
+ input.write('exit\n');
126
+ });
127
+
128
+ test('should handle "eval" command with error', async () => {
129
+ const input = new PassThrough();
130
+ const output = new PassThrough();
131
+ const mockLogger: Logger = {
132
+ log: mock(() => {}),
133
+ error: mock(() => {}),
134
+ warn: mock(() => {}),
135
+ info: mock(() => {}),
136
+ };
137
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
138
+
139
+ repl.start();
140
+
141
+ await new Promise((r) => setTimeout(r, 10));
142
+ input.write('eval nonExistent.bar\n');
143
+ await new Promise((r) => setTimeout(r, 10));
144
+
145
+ expect(mockLogger.error).toHaveBeenCalled();
146
+ input.write('exit\n');
147
+ });
148
+
149
+ test('should handle "eval" command without arguments', async () => {
150
+ const input = new PassThrough();
151
+ const output = new PassThrough();
152
+ const mockLogger: Logger = {
153
+ log: mock(() => {}),
154
+ error: mock(() => {}),
155
+ warn: mock(() => {}),
156
+ info: mock(() => {}),
157
+ };
158
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
159
+
160
+ repl.start();
161
+
162
+ await new Promise((r) => setTimeout(r, 10));
163
+ input.write('eval\n');
164
+ await new Promise((r) => setTimeout(r, 10));
165
+
166
+ expect(mockLogger.log).toHaveBeenCalledWith('Usage: eval <expression>');
167
+ input.write('exit\n');
168
+ });
169
+
170
+ test('should handle unknown command', async () => {
171
+ const input = new PassThrough();
172
+ const output = new PassThrough();
173
+ const mockLogger: Logger = {
174
+ log: mock(() => {}),
175
+ error: mock(() => {}),
176
+ warn: mock(() => {}),
177
+ info: mock(() => {}),
178
+ };
179
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
180
+
181
+ repl.start();
182
+
183
+ await new Promise((r) => setTimeout(r, 10));
184
+ input.write('unknown_cmd\n');
185
+ await new Promise((r) => setTimeout(r, 10));
186
+
187
+ expect(mockLogger.log).toHaveBeenCalledWith('Unknown command: unknown_cmd');
188
+ input.write('exit\n');
189
+ });
190
+
191
+ test('should handle empty input', async () => {
192
+ const input = new PassThrough();
193
+ const output = new PassThrough();
194
+ const mockLogger: Logger = {
195
+ log: mock(() => {}),
196
+ error: mock(() => {}),
197
+ warn: mock(() => {}),
198
+ info: mock(() => {}),
199
+ };
200
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
201
+
202
+ repl.start();
203
+
204
+ await new Promise((r) => setTimeout(r, 10));
205
+ input.write('\n');
206
+ await new Promise((r) => setTimeout(r, 10));
207
+
208
+ expect(mockLogger.log).not.toHaveBeenCalledWith('Unknown command: ');
209
+ input.write('exit\n');
210
+ });
211
+
63
212
  test('should parse shell commands correctly', () => {
64
213
  // We import the function dynamically to test it, or we assume it's exported
65
214
  const { parseShellCommand } = require('./debug-repl.ts');
@@ -71,4 +220,89 @@ describe('DebugRepl', () => {
71
220
  expect(parseShellCommand('editor -a -b -c')).toEqual(['editor', '-a', '-b', '-c']);
72
221
  expect(parseShellCommand(' spaced command ')).toEqual(['spaced', 'command']);
73
222
  });
223
+
224
+ test('should handle "edit" command and update step', async () => {
225
+ const input = new PassThrough();
226
+ const output = new PassThrough();
227
+ const mockLogger: Logger = {
228
+ log: mock(() => {}),
229
+ error: mock(() => {}),
230
+ warn: mock(() => {}),
231
+ info: mock(() => {}),
232
+ };
233
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
234
+
235
+ const spySpawnSync = spyOn(cp, 'spawnSync').mockImplementation(
236
+ // biome-ignore lint/suspicious/noExplicitAny: mocking child_process
237
+ () => ({ error: null, status: 0 }) as any
238
+ );
239
+ const spyWriteFileSync = spyOn(fs, 'writeFileSync').mockImplementation(() => {});
240
+ const updatedStep = { ...mockStep, run: 'echo "fixed"' };
241
+ const spyReadFileSync = spyOn(fs, 'readFileSync').mockImplementation((() =>
242
+ JSON.stringify(updatedStep)) as unknown as typeof fs.readFileSync);
243
+ const spyExistsSync = spyOn(fs, 'existsSync').mockImplementation(() => true);
244
+ const spyUnlinkSync = spyOn(fs, 'unlinkSync').mockImplementation(() => {});
245
+
246
+ try {
247
+ repl.start();
248
+ await new Promise((r) => setTimeout(r, 50));
249
+ input.write('edit\n');
250
+ await new Promise((r) => setTimeout(r, 50));
251
+
252
+ expect(mockLogger.log).toHaveBeenCalledWith(
253
+ expect.stringContaining('Step definition updated')
254
+ );
255
+
256
+ input.write('retry\n');
257
+ await new Promise((r) => setTimeout(r, 50));
258
+ } finally {
259
+ spySpawnSync.mockRestore();
260
+ spyWriteFileSync.mockRestore();
261
+ spyReadFileSync.mockRestore();
262
+ spyExistsSync.mockRestore();
263
+ spyUnlinkSync.mockRestore();
264
+ }
265
+ });
266
+
267
+ test('should handle "edit" command with parse error', async () => {
268
+ const input = new PassThrough();
269
+ const output = new PassThrough();
270
+ const mockLogger: Logger = {
271
+ log: mock(() => {}),
272
+ error: mock(() => {}),
273
+ warn: mock(() => {}),
274
+ info: mock(() => {}),
275
+ };
276
+ const repl = new DebugRepl(mockContext, mockStep, mockError, mockLogger, input, output);
277
+
278
+ const spySpawnSync = spyOn(cp, 'spawnSync').mockImplementation(
279
+ // biome-ignore lint/suspicious/noExplicitAny: mocking child_process
280
+ () => ({ error: null, status: 0 }) as any
281
+ );
282
+ const spyWriteFileSync = spyOn(fs, 'writeFileSync').mockImplementation(() => {});
283
+ const spyReadFileSync = spyOn(fs, 'readFileSync').mockImplementation(
284
+ (() => 'invalid json') as unknown as typeof fs.readFileSync
285
+ );
286
+ const spyExistsSync = spyOn(fs, 'existsSync').mockImplementation(() => true);
287
+ const spyUnlinkSync = spyOn(fs, 'unlinkSync').mockImplementation(() => {});
288
+
289
+ try {
290
+ repl.start();
291
+ await new Promise((r) => setTimeout(r, 50));
292
+ input.write('edit\n');
293
+ await new Promise((r) => setTimeout(r, 50));
294
+
295
+ expect(mockLogger.error).toHaveBeenCalledWith(
296
+ expect.stringContaining('Failed to parse JSON')
297
+ );
298
+ input.write('exit\n');
299
+ await new Promise((r) => setTimeout(r, 50));
300
+ } finally {
301
+ spySpawnSync.mockRestore();
302
+ spyWriteFileSync.mockRestore();
303
+ spyReadFileSync.mockRestore();
304
+ spyExistsSync.mockRestore();
305
+ spyUnlinkSync.mockRestore();
306
+ }
307
+ });
74
308
  });
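
The DebugRepl tests above replace the old ad-hoc `{ log, error, warn }` object with a fully typed `Logger` whose methods are `bun:test` mocks, which both satisfies the interface's `info` member and enables assertions such as `toHaveBeenCalledWith`. The same four-line mock recurs in every test; a tiny factory (illustrative only, not part of the package) would express it once:

    import { mock } from 'bun:test';
    import type { Logger } from '../utils/logger.ts';

    // Build a Logger whose methods are inspectable bun:test mocks.
    function makeMockLogger(): Logger {
      return {
        log: mock(() => {}),
        warn: mock(() => {}),
        error: mock(() => {}),
        info: mock(() => {}),
      };
    }
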
@@ -105,7 +105,9 @@ describe('AnthropicAdapter', () => {
105
105
  // @ts-ignore
106
106
  const fetchMock = global.fetch as MockFetch;
107
107
  // @ts-ignore
108
- const [url, init] = fetchMock.mock.calls[0];
108
+ // @ts-ignore
109
+ // biome-ignore lint/suspicious/noExplicitAny: mock fetch init
110
+ const [url, init] = fetchMock.mock.calls[0] as [string, any];
109
111
 
110
112
  expect(url).toBe('https://api.anthropic.com/v1/messages');
111
113
  expect(init.headers['x-api-key']).toBe('fake-anthropic-key');
@@ -179,7 +181,8 @@ describe('AnthropicAdapter', () => {
179
181
  ]);
180
182
 
181
183
  // @ts-ignore
182
- const init = global.fetch.mock.calls[0][1];
184
+ // biome-ignore lint/suspicious/noExplicitAny: mock fetch init
185
+ const init = global.fetch.mock.calls[0][1] as any;
183
186
  const body = JSON.parse(init.body);
184
187
  expect(body.messages[0].role).toBe('assistant');
185
188
  expect(body.messages[0].content).toHaveLength(2);
@@ -208,7 +211,8 @@ describe('AnthropicAdapter', () => {
208
211
  ]);
209
212
 
210
213
  // @ts-ignore
211
- const init = global.fetch.mock.calls[0][1];
214
+ // biome-ignore lint/suspicious/noExplicitAny: mock fetch init
215
+ const init = global.fetch.mock.calls[0][1] as any;
212
216
  const body = JSON.parse(init.body);
213
217
  expect(body.messages[0].role).toBe('user');
214
218
  expect(body.messages[0].content[0]).toEqual({
@@ -255,7 +259,9 @@ describe('CopilotAdapter', () => {
255
259
  // @ts-ignore
256
260
  const fetchMock = global.fetch as MockFetch;
257
261
  // @ts-ignore
258
- const [url, init] = fetchMock.mock.calls[0];
262
+ // @ts-ignore
263
+ // biome-ignore lint/suspicious/noExplicitAny: mock fetch init
264
+ const [url, init] = fetchMock.mock.calls[0] as [string, any];
259
265
  expect(url).toBe('https://api.githubcopilot.com/chat/completions');
260
266
  expect(init.headers.Authorization).toBe('Bearer mock-token');
261
267
  spy.mockRestore();
@@ -13,6 +13,7 @@ import * as dns from 'node:dns/promises';
13
13
  import { mkdirSync, rmSync } from 'node:fs';
14
14
  import { tmpdir } from 'node:os';
15
15
  import { join } from 'node:path';
16
+ import type { MemoryDb } from '../db/memory-db';
16
17
  import type { ExpressionContext } from '../expression/evaluator';
17
18
  import type {
18
19
  FileStep,
@@ -22,6 +23,8 @@ import type {
22
23
  SleepStep,
23
24
  WorkflowStep,
24
25
  } from '../parser/schema';
26
+ import type { SafeSandbox } from '../utils/sandbox';
27
+ import type { getAdapter } from './llm-adapter';
25
28
  import { executeStep } from './step-executor';
26
29
 
27
30
  // Mock executeLlmStep
@@ -227,6 +230,196 @@ describe('step-executor', () => {
227
230
  }
228
231
  }
229
232
  });
233
+
234
+ it('should block path traversal outside cwd by default', async () => {
235
+ const outsidePath = join(process.cwd(), '..', 'outside.txt');
236
+ const step: FileStep = {
237
+ id: 'f1',
238
+ type: 'file',
239
+ needs: [],
240
+ op: 'read',
241
+ path: outsidePath,
242
+ };
243
+
244
+ const result = await executeStep(step, context);
245
+ expect(result.status).toBe('failed');
246
+ expect(result.error).toContain('Access denied');
247
+ });
248
+
249
+ it('should block path traversal with .. inside path resolving outside', async () => {
250
+ const outsidePath = 'foo/../../passwd';
251
+ const step: FileStep = {
252
+ id: 'f1',
253
+ type: 'file',
254
+ needs: [],
255
+ op: 'read',
256
+ path: outsidePath,
257
+ };
258
+
259
+ const result = await executeStep(step, context);
260
+ expect(result.status).toBe('failed');
261
+ expect(result.error).toContain('Access denied');
262
+ });
263
+ });
264
+
265
+ describe('script', () => {
266
+ const mockSandbox = {
267
+ execute: mock((code) => {
268
+ if (code === 'fail') throw new Error('Script failed');
269
+ return Promise.resolve('script-result');
270
+ }),
271
+ };
272
+
273
+ it('should fail if allowInsecure is not set', async () => {
274
+ // @ts-ignore
275
+ const step = {
276
+ id: 's1',
277
+ type: 'script',
278
+ run: 'console.log("hello")',
279
+ };
280
+ const result = await executeStep(step, context, undefined, {
281
+ sandbox: mockSandbox as unknown as typeof SafeSandbox,
282
+ });
283
+ expect(result.status).toBe('failed');
284
+ expect(result.error).toContain('Script execution is disabled by default');
285
+ });
286
+
287
+ it('should execute script if allowInsecure is true', async () => {
288
+ // @ts-ignore
289
+ const step = {
290
+ id: 's1',
291
+ type: 'script',
292
+ run: 'console.log("hello")',
293
+ allowInsecure: true,
294
+ };
295
+ const result = await executeStep(step, context, undefined, {
296
+ sandbox: mockSandbox as unknown as typeof SafeSandbox,
297
+ });
298
+ expect(result.status).toBe('success');
299
+ expect(result.output).toBe('script-result');
300
+ });
301
+
302
+ it('should handle script failure', async () => {
303
+ // @ts-ignore
304
+ const step = {
305
+ id: 's1',
306
+ type: 'script',
307
+ run: 'fail',
308
+ allowInsecure: true,
309
+ };
310
+ const result = await executeStep(step, context, undefined, {
311
+ sandbox: mockSandbox as unknown as typeof SafeSandbox,
312
+ });
313
+ expect(result.status).toBe('failed');
314
+ expect(result.error).toBe('Script failed');
315
+ });
316
+ });
317
+
318
+ describe('memory', () => {
319
+ const mockMemoryDb = {
320
+ store: mock(() => Promise.resolve('mem-id')),
321
+ search: mock(() => Promise.resolve([{ content: 'found', similarity: 0.9 }])),
322
+ };
323
+
324
+ const mockGetAdapter = mock((model) => {
325
+ if (model === 'no-embed') return { adapter: {}, resolvedModel: model };
326
+ return {
327
+ adapter: {
328
+ embed: mock((text) => Promise.resolve([0.1, 0.2, 0.3])),
329
+ },
330
+ resolvedModel: model,
331
+ };
332
+ });
333
+
334
+ it('should fail if memoryDb is not provided', async () => {
335
+ // @ts-ignore
336
+ const step = { id: 'm1', type: 'memory', op: 'store', text: 'foo' };
337
+ const result = await executeStep(step, context, undefined, {
338
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
339
+ });
340
+ expect(result.status).toBe('failed');
341
+ expect(result.error).toBe('Memory database not initialized');
342
+ });
343
+
344
+ it('should fail if adapter does not support embedding', async () => {
345
+ // @ts-ignore
346
+ const step = { id: 'm1', type: 'memory', op: 'store', text: 'foo', model: 'no-embed' };
347
+ // @ts-ignore
348
+ const result = await executeStep(step, context, undefined, {
349
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
350
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
351
+ });
352
+ expect(result.status).toBe('failed');
353
+ expect(result.error).toContain('does not support embeddings');
354
+ });
355
+
356
+ it('should store memory', async () => {
357
+ // @ts-ignore
358
+ const step = {
359
+ id: 'm1',
360
+ type: 'memory',
361
+ op: 'store',
362
+ text: 'foo',
363
+ metadata: { source: 'test' },
364
+ };
365
+ // @ts-ignore
366
+ const result = await executeStep(step, context, undefined, {
367
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
368
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
369
+ });
370
+ expect(result.status).toBe('success');
371
+ expect(result.output).toEqual({ id: 'mem-id', status: 'stored' });
372
+ expect(mockMemoryDb.store).toHaveBeenCalledWith('foo', [0.1, 0.2, 0.3], { source: 'test' });
373
+ });
374
+
375
+ it('should search memory', async () => {
376
+ // @ts-ignore
377
+ const step = { id: 'm1', type: 'memory', op: 'search', query: 'foo', limit: 5 };
378
+ // @ts-ignore
379
+ const result = await executeStep(step, context, undefined, {
380
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
381
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
382
+ });
383
+ expect(result.status).toBe('success');
384
+ expect(result.output).toEqual([{ content: 'found', similarity: 0.9 }]);
385
+ expect(mockMemoryDb.search).toHaveBeenCalledWith([0.1, 0.2, 0.3], 5);
386
+ });
387
+
388
+ it('should fail store if text is missing', async () => {
389
+ // @ts-ignore
390
+ const step = { id: 'm1', type: 'memory', op: 'store' };
391
+ // @ts-ignore
392
+ const result = await executeStep(step, context, undefined, {
393
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
394
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
395
+ });
396
+ expect(result.status).toBe('failed');
397
+ expect(result.error).toBe('Text is required for memory store operation');
398
+ });
399
+
400
+ it('should fail search if query is missing', async () => {
401
+ // @ts-ignore
402
+ const step = { id: 'm1', type: 'memory', op: 'search' };
403
+ // @ts-ignore
404
+ const result = await executeStep(step, context, undefined, {
405
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
406
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
407
+ });
408
+ expect(result.status).toBe('failed');
409
+ expect(result.error).toBe('Query is required for memory search operation');
410
+ });
411
+
412
+ it('should fail for unknown memory operation', async () => {
413
+ // @ts-ignore
414
+ const step = { id: 'm1', type: 'memory', op: 'unknown', text: 'foo' };
415
+ // @ts-ignore
416
+ const result = await executeStep(step, context, undefined, {
417
+ memoryDb: mockMemoryDb as unknown as MemoryDb,
418
+ getAdapter: mockGetAdapter as unknown as typeof getAdapter,
419
+ });
420
+ expect(result.status).toBe('failed');
421
+ expect(result.error).toContain('Unknown memory operation');
422
+ });
230
423
  });
231
424
 
232
425
  describe('sleep', () => {
@@ -517,7 +710,7 @@ describe('step-executor', () => {
517
710
  );
518
711
 
519
712
  // @ts-ignore
520
- const result = await executeStep(step, context, undefined, executeWorkflowFn);
713
+ const result = await executeStep(step, context, undefined, { executeWorkflowFn });
521
714
  expect(result.status).toBe('success');
522
715
  expect(result.output).toBe('child-output');
523
716
  expect(executeWorkflowFn).toHaveBeenCalled();
@@ -48,6 +48,20 @@ export interface StepResult {
48
48
  };
49
49
  }
50
50
 
51
+ /**
52
+ * Execute a single step based on its type
53
+ */
54
+ export interface StepExecutorOptions {
55
+ executeWorkflowFn?: (step: WorkflowStep, context: ExpressionContext) => Promise<StepResult>;
56
+ mcpManager?: MCPManager;
57
+ memoryDb?: MemoryDb;
58
+ workflowDir?: string;
59
+ dryRun?: boolean;
60
+ // Dependency injection for testing
61
+ getAdapter?: typeof getAdapter;
62
+ sandbox?: typeof SafeSandbox;
63
+ }
64
+
51
65
  /**
52
66
  * Execute a single step based on its type
53
67
  */
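
The interface above replaces the long tail of optional positional parameters that `executeStep` took in 0.6.0 with a single options object, which also gives tests a place to inject `getAdapter` and `sandbox`. A sketch of the call-site migration (import paths assume a module sitting next to `step-executor.ts`; the `runStep` wrapper is illustrative):

    import type { ExpressionContext } from '../expression/evaluator.ts';
    import type { Step } from '../parser/schema.ts';
    import { executeStep, type StepExecutorOptions, type StepResult } from './step-executor.ts';

    // 0.6.0: executeStep(step, context, logger, executeWorkflowFn, mcpManager,
    //                    memoryDb, workflowDir, dryRun)
    // 0.6.1: executeStep(step, context, logger, { ...named options })
    async function runStep(
      step: Step,
      context: ExpressionContext,
      options: StepExecutorOptions = {}
    ): Promise<StepResult> {
      // Passing undefined for the logger keeps the ConsoleLogger default,
      // mirroring how the updated tests in this diff call executeStep.
      return executeStep(step, context, undefined, options);
    }
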
@@ -55,12 +69,18 @@ export async function executeStep(
55
69
  step: Step,
56
70
  context: ExpressionContext,
57
71
  logger: Logger = new ConsoleLogger(),
58
- executeWorkflowFn?: (step: WorkflowStep, context: ExpressionContext) => Promise<StepResult>,
59
- mcpManager?: MCPManager,
60
- memoryDb?: MemoryDb,
61
- workflowDir?: string,
62
- dryRun?: boolean
72
+ options: StepExecutorOptions = {}
63
73
  ): Promise<StepResult> {
74
+ const {
75
+ executeWorkflowFn,
76
+ mcpManager,
77
+ memoryDb,
78
+ workflowDir,
79
+ dryRun,
80
+ getAdapter: injectedGetAdapter,
81
+ sandbox: injectedSandbox,
82
+ } = options;
83
+
64
84
  try {
65
85
  let result: StepResult;
66
86
  switch (step.type) {
@@ -83,15 +103,14 @@ export async function executeStep(
83
103
  result = await executeLlmStep(
84
104
  step,
85
105
  context,
86
- (s, c) =>
87
- executeStep(s, c, logger, executeWorkflowFn, mcpManager, memoryDb, workflowDir, dryRun),
106
+ (s, c) => executeStep(s, c, logger, options),
88
107
  logger,
89
108
  mcpManager,
90
109
  workflowDir
91
110
  );
92
111
  break;
93
112
  case 'memory':
94
- result = await executeMemoryStep(step, context, logger, memoryDb);
113
+ result = await executeMemoryStep(step, context, logger, memoryDb, injectedGetAdapter);
95
114
  break;
96
115
  case 'workflow':
97
116
  if (!executeWorkflowFn) {
@@ -100,7 +119,7 @@ export async function executeStep(
100
119
  result = await executeWorkflowFn(step, context);
101
120
  break;
102
121
  case 'script':
103
- result = await executeScriptStep(step, context, logger);
122
+ result = await executeScriptStep(step, context, logger, injectedSandbox);
104
123
  break;
105
124
  default:
106
125
  throw new Error(`Unknown step type: ${(step as Step).type}`);
@@ -383,7 +402,7 @@ async function executeRequestStep(
383
402
  output: {
384
403
  status: response.status,
385
404
  statusText: response.statusText,
386
- headers: Object.fromEntries(response.headers.entries()),
405
+ headers: Object.fromEntries(response.headers as unknown as Iterable<[string, string]>),
387
406
  data: responseData,
388
407
  },
389
408
  status: response.ok ? 'success' : 'failed',
@@ -503,7 +522,8 @@ async function executeSleepStep(
503
522
  async function executeScriptStep(
504
523
  step: ScriptStep,
505
524
  context: ExpressionContext,
506
- _logger: Logger
525
+ _logger: Logger,
526
+ sandbox = SafeSandbox
507
527
  ): Promise<StepResult> {
508
528
  try {
509
529
  if (!step.allowInsecure) {
@@ -513,7 +533,7 @@ async function executeScriptStep(
513
533
  );
514
534
  }
515
535
 
516
- const result = await SafeSandbox.execute(
536
+ const result = await sandbox.execute(
517
537
  step.run,
518
538
  {
519
539
  inputs: context.inputs,
@@ -546,14 +566,15 @@ async function executeMemoryStep(
546
566
  step: MemoryStep,
547
567
  context: ExpressionContext,
548
568
  logger: Logger,
549
- memoryDb?: MemoryDb
569
+ memoryDb?: MemoryDb,
570
+ getAdapterFn = getAdapter
550
571
  ): Promise<StepResult> {
551
572
  if (!memoryDb) {
552
573
  throw new Error('Memory database not initialized');
553
574
  }
554
575
 
555
576
  try {
556
- const { adapter, resolvedModel } = getAdapter(step.model || 'local');
577
+ const { adapter, resolvedModel } = getAdapterFn(step.model || 'local');
557
578
  if (!adapter.embed) {
558
579
  throw new Error(`Provider for model ${step.model || 'local'} does not support embeddings`);
559
580
  }
@@ -4,16 +4,24 @@ import { processOpenAIStream } from './stream-utils';
4
4
  const encoder = new TextEncoder();
5
5
 
6
6
  function responseFromChunks(chunks: string[]): Response {
7
- const stream = new ReadableStream({
8
- start(controller) {
9
- for (const chunk of chunks) {
10
- controller.enqueue(encoder.encode(chunk));
7
+ let index = 0;
8
+ const reader = {
9
+ async read(): Promise<{ done: boolean; value?: Uint8Array }> {
10
+ if (index >= chunks.length) {
11
+ return { done: true, value: undefined };
11
12
  }
12
- controller.close();
13
+ const value = encoder.encode(chunks[index]);
14
+ index += 1;
15
+ return { done: false, value };
13
16
  },
14
- });
17
+ async cancel(): Promise<void> {},
18
+ };
15
19
 
16
- return new Response(stream);
20
+ return {
21
+ body: {
22
+ getReader: () => reader,
23
+ },
24
+ } as Response;
17
25
  }
18
26
 
19
27
  describe('processOpenAIStream', () => {
@@ -61,5 +69,103 @@ describe('processOpenAIStream', () => {
61
69
 
62
70
  expect(result.message.content).toBe('ok');
63
71
  expect(logger.warn).toHaveBeenCalledTimes(1);
72
+ expect(logger.warn.mock.calls[0][0]).toContain('Malformed JSON line');
73
+ });
74
+
75
+ it('throws error when buffer size is exceeded', async () => {
76
+ const response = responseFromChunks(['a'.repeat(1024 * 1024 + 1)]);
77
+ await expect(processOpenAIStream(response)).rejects.toThrow(
78
+ 'LLM stream line exceed maximum size'
79
+ );
80
+ });
81
+
82
+ it('throws error when response size limit is exceeded', async () => {
83
+ const response = responseFromChunks([
84
+ `data: {"choices":[{"delta":{"content":"${'a'.repeat(600 * 1024)}"}}]}\n`,
85
+ `data: {"choices":[{"delta":{"content":"${'a'.repeat(500 * 1024)}"}}]}\n`,
86
+ ]);
87
+ await expect(processOpenAIStream(response)).rejects.toThrow(
88
+ 'LLM response exceeds maximum size'
89
+ );
90
+ });
91
+
92
+ it('throws error when tool call arguments size limit is exceeded', async () => {
93
+ const response = responseFromChunks([
94
+ `data: {"choices":[{"delta":{"tool_calls":[{"index":0,"function":{"arguments":"${'a'.repeat(600 * 1024)}"}}]}}]}\n`,
95
+ `data: {"choices":[{"delta":{"tool_calls":[{"index":0,"function":{"arguments":"${'a'.repeat(500 * 1024)}"}}]}}]}\n`,
96
+ ]);
97
+ await expect(processOpenAIStream(response)).rejects.toThrow(
98
+ 'LLM tool call arguments exceed maximum size'
99
+ );
100
+ });
101
+
102
+ it('handles and logs generic errors during chunk processing', async () => {
103
+ const logger = {
104
+ log: mock(() => {}),
105
+ error: mock(() => {}),
106
+ warn: mock(() => {}),
107
+ info: mock(() => {}),
108
+ };
109
+ // Mocking JSON.parse to throw a non-SyntaxError
110
+ const originalParse = JSON.parse;
111
+ JSON.parse = (str: string) => {
112
+ if (str === '{"trigger_error":true}') throw new Error('Generic error');
113
+ return originalParse(str);
114
+ };
115
+
116
+ try {
117
+ const response = responseFromChunks(['data: {"trigger_error":true}\n']);
118
+ await processOpenAIStream(response, { logger });
119
+ expect(logger.warn).toHaveBeenCalledTimes(1);
120
+ expect(logger.warn.mock.calls[0][0]).toContain(
121
+ 'Error processing chunk: Error: Generic error'
122
+ );
123
+ } finally {
124
+ JSON.parse = originalParse;
125
+ }
126
+ });
127
+
128
+ it('handles errors in the final line processing', async () => {
129
+ const logger = {
130
+ log: mock(() => {}),
131
+ error: mock(() => {}),
132
+ warn: mock(() => {}),
133
+ info: mock(() => {}),
134
+ };
135
+ const response = responseFromChunks(['data: {bad json}']); // No newline, triggers buffer processing
136
+
137
+ await processOpenAIStream(response, { logger });
138
+
139
+ expect(logger.warn).toHaveBeenCalledTimes(1);
140
+ expect(logger.warn.mock.calls[0][0]).toContain('Malformed JSON line');
141
+ });
142
+
143
+ it('throws size limit error in final line processing', async () => {
144
+ const response = responseFromChunks([
145
+ `data: {"choices":[{"delta":{"content":"${'a'.repeat(600 * 1024)}"}}]}\n`,
146
+ `data: {"choices":[{"delta":{"content":"${'a'.repeat(500 * 1024)}"}}]}`,
147
+ ]);
148
+ // The first line is ok, the second line is in the final buffer and exceeds size
149
+ await expect(processOpenAIStream(response)).rejects.toThrow(
150
+ 'LLM response exceeds maximum size'
151
+ );
152
+ });
153
+
154
+ it('bubbles up reader cancel errors', async () => {
155
+ const reader = {
156
+ read: async () => {
157
+ throw new Error('Read error');
158
+ },
159
+ cancel: async () => {
160
+ throw new Error('Cancel error');
161
+ },
162
+ };
163
+ const response = {
164
+ body: {
165
+ getReader: () => reader,
166
+ },
167
+ } as unknown as Response;
168
+
169
+ await expect(processOpenAIStream(response)).rejects.toThrow('Read error');
64
170
  });
65
171
  });
@@ -67,7 +67,7 @@ export async function processOpenAIStream(
67
67
  const toolCall = tc as ToolCallDelta;
68
68
  if (!toolCalls[toolCall.index]) {
69
69
  toolCalls[toolCall.index] = {
70
- id: toolCall.id,
70
+ id: toolCall.id || '',
71
71
  type: 'function',
72
72
  function: { name: '', arguments: '' },
73
73
  };
@@ -93,7 +93,7 @@ export async function processOpenAIStream(
93
93
  const activeLogger = options?.logger || new ConsoleLogger();
94
94
 
95
95
  // Rethrow size limit errors so they bubble up
96
- if (String(e).toLowerCase().includes('exceed maximum size')) {
96
+ if (e instanceof Error && e.message.toLowerCase().includes('maximum size')) {
97
97
  throw e;
98
98
  }
99
99
 
@@ -137,7 +137,7 @@ export async function processOpenAIStream(
137
137
  const toolCall = tc as ToolCallDelta;
138
138
  if (!toolCalls[toolCall.index]) {
139
139
  toolCalls[toolCall.index] = {
140
- id: toolCall.id,
140
+ id: toolCall.id || '',
141
141
  type: 'function',
142
142
  function: { name: '', arguments: '' },
143
143
  };
@@ -161,7 +161,7 @@ export async function processOpenAIStream(
161
161
  }
162
162
  }
163
163
  } catch (e) {
164
- if (String(e).toLowerCase().includes('exceed maximum size')) {
164
+ if (e instanceof Error && e.message.toLowerCase().includes('maximum size')) {
165
165
  throw e;
166
166
  }
167
167
  const activeLogger = options?.logger || new ConsoleLogger();
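
Both catch blocks in `processOpenAIStream` now use the same stricter rethrow test: the error must be a real `Error` and its message must mention "maximum size", so messages phrased either "exceed maximum size" or "exceeds maximum size" (both appear in the new tests) bubble up instead of being swallowed as malformed-chunk warnings. The predicate, pulled out as a sketch:

    // Only genuine Error instances whose message mentions "maximum size" are
    // treated as size-limit violations and re-thrown; anything else is logged
    // via the active logger and streaming continues.
    function isSizeLimitError(e: unknown): e is Error {
      return e instanceof Error && e.message.toLowerCase().includes('maximum size');
    }

    // isSizeLimitError(new Error('LLM response exceeds maximum size')) -> true
    // isSizeLimitError('exceed maximum size') -> false (plain strings no longer match)
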
@@ -630,16 +630,13 @@ export class WorkflowRunner {
630
630
  }
631
631
 
632
632
  const operation = async () => {
633
- const result = await executeStep(
634
- stepToExecute,
635
- context,
636
- this.logger,
637
- this.executeSubWorkflow.bind(this),
638
- this.mcpManager,
639
- this.memoryDb,
640
- this.options.workflowDir,
641
- this.options.dryRun
642
- );
633
+ const result = await executeStep(stepToExecute, context, this.logger, {
634
+ executeWorkflowFn: this.executeSubWorkflow.bind(this),
635
+ mcpManager: this.mcpManager,
636
+ memoryDb: this.memoryDb,
637
+ workflowDir: this.options.workflowDir,
638
+ dryRun: this.options.dryRun,
639
+ });
643
640
  if (result.status === 'failed') {
644
641
  throw new Error(result.error || 'Step failed');
645
642
  }
@@ -868,16 +865,13 @@ Do not change the 'id' or 'type' or 'auto_heal' fields.
868
865
 
869
866
  // Execute the agent step
870
867
  // We use a fresh context but share secrets/env
871
- const result = await executeStep(
872
- agentStep,
873
- context,
874
- this.logger,
875
- this.executeSubWorkflow.bind(this),
876
- this.mcpManager,
877
- this.memoryDb,
878
- this.options.workflowDir,
879
- this.options.dryRun
880
- );
868
+ const result = await executeStep(agentStep, context, this.logger, {
869
+ executeWorkflowFn: this.executeSubWorkflow.bind(this),
870
+ mcpManager: this.mcpManager,
871
+ memoryDb: this.memoryDb,
872
+ workflowDir: this.options.workflowDir,
873
+ dryRun: this.options.dryRun,
874
+ });
881
875
 
882
876
  if (result.status !== 'success' || !result.output) {
883
877
  throw new Error(`Healer agent failed: ${result.error || 'No output'}`);