npm - @qianxude/tem - Versions diffs - 0.4.0 → 0.4.3 - Mend

@qianxude/tem 0.4.0 → 0.4.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/README.md +196 -38
package/package.json +2 -1
package/src/cli/README.md +218 -0
package/src/core/tem.ts +6 -2
package/src/database/index.ts +11 -108
package/src/database/schema.sql +17 -1
package/src/interfaces/index.ts +3 -0
package/src/mock-server/README.md +180 -13
package/src/services/batch-interruption.ts +8 -2

package/README.md CHANGED Viewed

@@ -6,6 +6,14 @@ Built for **single-process, IO-bound scenarios** where you need reliable task ex
 ---
+## Installation
+```sh
+bun add @qianxude/tem
+```
+---
 ## Features
 - **SQLite Persistence** — Tasks survive process restarts using `bun:sqlite` with WAL mode
@@ -43,38 +51,37 @@ Don't use tem when you need:
 ```typescript
 import { TEM } from "@qianxude/tem";
-// Initialize
 const tem = new TEM({
-  dbPath: "./tem.db",
-  concurrency: 5,           // Max 5 concurrent tasks
-  pollInterval: 1000,       // Check for new tasks every 1s
-  rateLimit: {
-    perMinute: 60,          // Respect LLM provider limits
-    perSecond: 5
-  }
+  databasePath: "./tem.db",
+  concurrency: 5,
+  pollIntervalMs: 1000,
+  rateLimit: { requests: 60, windowMs: 60000 }  // 60 req/min
 });
 // Create a batch
 const batch = await tem.batch.create({
-  code: "2026-02-15-llm-fix",  // Your custom tag
+  code: "2026-02-15-llm-fix",
   type: "rewrite-docs"
 });
-// Enqueue tasks
-await tem.task.enqueueMany([
+// Create tasks
+await tem.task.createMany([
   { batchId: batch.id, type: "rewrite", payload: { docId: 1 } },
   { batchId: batch.id, type: "rewrite", payload: { docId: 2 } },
   { batchId: batch.id, type: "rewrite", payload: { docId: 3 } }
 ]);
-// Register handler
-tem.worker.register("rewrite", async (task) => {
-  const result = await callLLM(task.payload);
+// Register handler — payload is your task data, context has metadata
+tem.worker.register("rewrite", async (payload, context) => {
+  const result = await callLLM(payload);
   return result;  // Stored in task.result
 });
 // Start processing
 tem.worker.start();
+// Stop when done
+await tem.stop();
 ```
 ---
@@ -99,6 +106,111 @@ failed
 ---
+## Core Concepts
+- **Batch** — A named group of tasks. All recovery operations (resume, retry) work at batch level.
+- **Task** — A unit of work with a `type`, opaque `payload`, and tracked `status`.
+- **Worker** — Polls for pending tasks and dispatches them to registered handlers by type.
+- **Payload** — Opaque JSON; the framework never parses it. Your handler receives it as-is.
+- **Claim model** — Tasks are acquired atomically (`UPDATE ... WHERE status='pending'`), preventing duplicate execution.
+### Task Ordering
+Tasks within a batch are claimed and executed in **FIFO order** (First-In-First-Out) based on creation time.
+When multiple tasks are pending, the task created first will be claimed first:
+```typescript
+// These tasks will be claimed in order: task1, then task2, then task3
+await tem.task.create({ batchId: batch.id, type: "process", payload: { id: 1 } }); // task1
+await tem.task.create({ batchId: batch.id, type: "process", payload: { id: 2 } }); // task2
+await tem.task.create({ batchId: batch.id, type: "process", payload: { id: 3 } }); // task3
+```
+---
+## Error Handling
+By default, any thrown error causes the task to retry up to `defaultMaxAttempts`:
+```typescript
+tem.worker.register("process", async (payload, context) => {
+  console.log(`Attempt ${context.attempt}`);
+  const result = await callAPI(payload);  // throws → auto-retry
+  return result;
+});
+```
+For permanent failures that should not be retried, throw `NonRetryableError`:
+```typescript
+import { TEM, NonRetryableError } from "@qianxude/tem";
+tem.worker.register("validate", async (payload) => {
+  if (!payload.id) {
+    throw new NonRetryableError("Missing required field: id");
+    // Task goes directly to 'failed', no retries
+  }
+  return process(payload);
+});
+```
+---
+## Batch Interruption
+Automatically stop a batch when error thresholds are exceeded:
+```typescript
+const batch = await tem.batch.create({
+  code: "llm-run-01",
+  type: "summarize",
+  interruptionCriteria: {
+    maxErrorRate: 0.3,          // Stop if >30% of tasks fail
+    maxFailedTasks: 10,         // Stop if >10 tasks fail
+    maxConsecutiveFailures: 5,  // Stop if 5 failures in a row
+  }
+});
+```
+Check interruption details after the batch stops:
+```typescript
+const logs = await tem.interruption.getInterruptionLog(batchId);
+// [{ reason, message, statsAtInterruption }]
+```
+Manually interrupt a running batch:
+```typescript
+await tem.interruptBatch(batchId, "manual", "Stopping due to bad data");
+```
+---
+## Auto-Detect Constraints
+Probe an API endpoint to discover its concurrency and rate limits before running tasks:
+```typescript
+const config = await TEM.detectConstraints({
+  url: "https://api.example.com/v1/endpoint",
+  method: "POST",
+  headers: { Authorization: "Bearer " + process.env.API_KEY },
+  body: { /* minimal valid request */ },
+  timeoutMs: 30000,
+  maxConcurrencyToTest: 50,
+  rateLimitTestDurationMs: 10000,
+});
+const tem = new TEM({
+  databasePath: "./tasks.db",
+  concurrency: config.concurrency,
+  rateLimit: config.rateLimit,
+});
+```
+---
 ## Recovery Patterns
 ### Resume After Crash
@@ -133,7 +245,7 @@ TEM
 ├── Worker             # Execution loop with concurrency/rate limiting
 ├── ConcurrencyController  # Semaphore for local concurrency
 ├── RateLimiter        # Token bucket for API rate limits
-└── RetryStrategy      # Configurable retry logic
+└── BatchInterruptionService  # Auto-stop on error thresholds
 ```
 ### Why Claim-Based?
@@ -199,12 +311,13 @@ This ensures:
 ```typescript
 interface TEMConfig {
-  dbPath: string;           // SQLite file path
-  concurrency?: number;     // Default: 5
-  pollInterval?: number;    // Default: 1000ms
+  databasePath: string;       // SQLite file path
+  concurrency?: number;       // Default: 5
+  pollIntervalMs?: number;    // Default: 1000ms
+  defaultMaxAttempts?: number; // Default: 3
   rateLimit?: {
-    perMinute?: number;
-    perSecond?: number;
+    requests: number;         // Number of requests
+    windowMs: number;         // Time window in ms (e.g. 60000 for per-minute)
   };
 }
 ```
@@ -216,57 +329,71 @@ interface TEMConfig {
 const batch = await tem.batch.create({
   code: "unique-batch-code",
   type: "batch-type",
-  metadata?: { ... }
+  metadata?: { ... },
+  interruptionCriteria?: {
+    maxErrorRate?: number;
+    maxFailedTasks?: number;
+    maxConsecutiveFailures?: number;
+  }
 });
-// Get batch info
-const batch = await tem.batch.get(batchId);
-// List batches
-const batches = await tem.batch.list({ type?: "..." });
+// Get batch by ID
+const batch = await tem.batch.getById(batchId);
 // Get statistics
 const stats = await tem.batch.getStats(batchId);
-// { pending: 5, running: 2, completed: 10, failed: 3 }
+// { pending, running, completed, failed, total }
 // Resume after crash (running → pending)
 await tem.batch.resume(batchId);
-// Retry all failed (failed → pending, attempt=0)
+// Retry all failed (failed → pending, attempt reset)
 await tem.batch.retryFailed(batchId);
 ```
 ### Task Operations
 ```typescript
-// Enqueue single task
-await tem.task.enqueue({
+// Create single task
+await tem.task.create({
   batchId: string,
   type: string,
   payload: object,
-  maxAttempt?: number  // Default: 3
+  maxAttempts?: number
 });
-// Bulk enqueue (transaction)
-await tem.task.enqueueMany([
+// Bulk create (single transaction)
+await tem.task.createMany([
   { batchId, type, payload },
   ...
 ]);
+// Get task by ID
+const task = await tem.task.getById(taskId);
 ```
 ### Worker
 ```typescript
 // Register handler
-tem.worker.register("task-type", async (task) => {
-  // task.id, task.batchId, task.payload, task.attempt
-  const result = await doWork(task.payload);
-  return result;  // Will be JSON-serialized to task.result
+// payload: your task data; context: { taskId, batchId, attempt }
+tem.worker.register("task-type", async (payload, context) => {
+  const result = await doWork(payload);
+  return result;  // JSON-serialized to task.result
 });
 // Control execution
 tem.worker.start();
-await tem.worker.stop();
+await tem.stop();  // Stops worker and closes DB
+```
+### NonRetryableError
+```typescript
+import { NonRetryableError } from "@qianxude/tem";
+throw new NonRetryableError("reason");
+// Task goes to 'failed' immediately, skipping remaining attempts
 ```
 ---
@@ -298,6 +425,37 @@ await tem.worker.stop();
 ---
+## Mock Server
+Tem includes a built-in mock HTTP server for testing task execution under various constraints. Use it to simulate APIs with:
+- **Concurrency limits** — Test how your tasks handle 503 errors
+- **Rate limiting** — Verify retry behavior against 429 responses
+- **Error simulation** — Test resilience with configurable failure rates
+See [src/mock-server/README.md](src/mock-server/README.md) for detailed documentation.
+---
+## CLI
+Tem includes a CLI for batch diagnostics and monitoring:
+```sh
+# Generate diagnostic report
+tem report ./tem.db my-batch
+# List failed tasks
+tem list ./tem.db --batch my-batch --status failed
+# Watch batch progress in real-time
+tem watch ./tem.db --latest
+```
+See [src/cli/README.md](src/cli/README.md) for full documentation.
+---
 ## License
 MIT

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@qianxude/tem",
-  "version": "0.4.0",
+  "version": "0.4.3",
   "description": "A lightweight task execution engine for IO-bound workloads with SQLite persistence, retry, and rate limiting",
   "module": "src/index.ts",
   "type": "module",
@@ -22,6 +22,7 @@
   "scripts": {
     "typecheck": "tsc --noEmit",
     "test": "bun test",
+    "coverage": "bun test --coverage",
     "test:integration": "bun test tests/integration/*.test.ts",
     "test:mock-server": "bun test tests/integration/mock-server.test.ts",
     "test:simple-tasks": "bun test tests/integration/tem-with-mock-server.test.ts",

package/src/cli/README.md ADDED Viewed

@@ -0,0 +1,218 @@
+# tem CLI
+Command-line interface for batch diagnostics and monitoring.
+## Installation
+The CLI is included with the tem package:
+```sh
+bun add @qianxude/tem
+```
+You can run it directly with bun:
+```sh
+bun run src/cli/index.ts <command> [options]
+```
+Or install globally:
+```sh
+bun link
+```
+## Usage
+```
+tem <command> [options]
+```
+## Commands
+### `report`
+Generate a diagnostic report for batches.
+```sh
+tem report <db-path> [batch-code]
+```
+**Arguments:**
+- `db-path` - Path to the SQLite database file (required)
+- `batch-code` - Specific batch code to report on (optional)
+**Options:**
+- `--latest` - Report on the most recently created batch
+- `--limit-errors N` - Show top N error patterns (default: 10)
+**Examples:**
+```sh
+# Summary report for all batches
+tem report ./tem.db
+# Detailed report for specific batch
+tem report ./tem.db my-batch-code
+# Report on latest batch
+tem report ./tem.db --latest
+# Show top 20 error patterns
+tem report ./tem.db my-batch-code --limit-errors 20
+```
+**Report includes:**
+- Batch overview (code, type, status, timestamps, duration)
+- Status breakdown with counts and percentages
+- Timing analysis (avg/min/max task times, throughput)
+- Error patterns for failed tasks
+- Retry analysis statistics
+- Detection of stuck tasks (running > 5 minutes)
+---
+### `list`
+List tasks with filtering options.
+```sh
+tem list <db-path>
+```
+**Arguments:**
+- `db-path` - Path to the SQLite database file (required)
+**Options:**
+- `--batch <code>` - Filter by batch code
+- `--status <status>` - Filter by status: `pending`, `running`, `completed`, or `failed`
+- `--type <type>` - Filter by task type
+- `--limit <n>` - Limit results (default: 100)
+**Examples:**
+```sh
+# List all tasks (up to 100)
+tem list ./tem.db
+# List failed tasks from a specific batch
+tem list ./tem.db --batch my-batch --status failed
+# List pending tasks of a specific type
+tem list ./tem.db --status pending --type rewrite --limit 20
+```
+**Output columns:**
+- ID - Task UUID
+- Batch - Batch code
+- Type - Task type
+- Status - Current status
+- Attempts - Current attempt / max attempts
+- Created - Timestamp
+- Completed - Completion timestamp
+- Error - Truncated error message (if failed)
+---
+### `watch`
+Monitor a running batch in real-time.
+```sh
+tem watch <db-path> [batch-code]
+```
+**Arguments:**
+- `db-path` - Path to the SQLite database file (required)
+- `batch-code` - Specific batch code to watch (optional if using `--latest`)
+**Options:**
+- `--latest` - Watch the most recently created batch
+- `--interval N` - Refresh interval in seconds (default: 5)
+- `--timeout N` - Maximum watch time in seconds (default: 3600)
+- `--no-clear` - Don't clear screen between updates
+**Examples:**
+```sh
+# Watch the latest batch
+tem watch ./tem.db --latest
+# Watch specific batch with 10-second refresh
+tem watch ./tem.db my-batch-code --interval 10
+# Watch for up to 5 minutes
+tem watch ./tem.db --latest --timeout 300
+# Watch without clearing screen (for logging)
+tem watch ./tem.db --latest --no-clear
+```
+**Watch display includes:**
+- Visual progress bar
+- Batch status with color coding:
+  - 🟢 Green - Completed
+  - 🔴 Red - Failed
+  - 🟡 Yellow - Running
+  - 🔵 Cyan - Pending
+- Real-time statistics (pending, running, completed, failed, total)
+- Throughput and ETA
+- Recent errors (last 3)
+- Stuck task warnings (> 5 minutes running)
+Press `Ctrl+C` to stop watching. A final report is displayed when the batch completes.
+---
+## Exit Codes
+| Code | Meaning |
+|------|---------|
+| 0 | Success |
+| 1 | Runtime error (database issues, batch not found, timeout) |
+| 2 | Usage error (missing arguments, invalid commands/options) |
+| 130 | Interrupted by user (SIGINT) |
+---
+## Global Options
+- `--help, -h` - Show help message for any command
+## Common Workflows
+### Debug a failing batch
+```sh
+# Watch the batch in one terminal
+tem watch ./tem.db my-batch
+# In another terminal, list failed tasks
+tem list ./tem.db --batch my-batch --status failed
+# Generate detailed report
+tem report ./tem.db my-batch --limit-errors 20
+```
+### Monitor a long-running job
+```sh
+# Watch with longer interval to reduce database queries
+tem watch ./tem.db my-batch --interval 30 --timeout 7200
+```
+### Quick status check
+```sh
+# Summary of all batches
+tem report ./tem.db
+```

package/src/core/tem.ts CHANGED Viewed

@@ -6,6 +6,7 @@ import {
   type DetectOptions,
   type DetectedConfig,
 } from '../utils/auto-detect.js';
+import type { BatchInterruptionReason, BatchInterruptionCriteria } from '../interfaces/index.js';
 export type { DetectOptions, DetectedConfig };
@@ -30,6 +31,9 @@ export interface TEMConfig {
   // Optional: Specific batch ID to process (if set, only processes this batch)
   batchId?: string;
+  // Default interruption criteria for all batches (batch-level overrides these)
+  defaultInterruptionCriteria?: BatchInterruptionCriteria;
 }
 export class TEM {
@@ -84,7 +88,7 @@ export class TEM {
     // Initialize services
     this.batch = new BatchService(this.database);
     this.task = new TaskService(this.database);
-    this.interruption = new BatchInterruptionService(this.database, this.batch);
+    this.interruption = new BatchInterruptionService(this.database, this.batch, config.defaultInterruptionCriteria);
     // Initialize worker with config
     const workerConfig: WorkerConfig = {
@@ -116,7 +120,7 @@ export class TEM {
    */
   async interruptBatch(
     batchId: string,
-    reason?: import('../interfaces/index.js').BatchInterruptionReason,
+    reason?: BatchInterruptionReason,
     message?: string
   ): Promise<void> {
     await this.interruption.interrupt(

package/src/database/index.ts CHANGED Viewed

@@ -1,4 +1,6 @@
 import { Database as SQLiteDatabase, type SQLQueryBindings } from 'bun:sqlite';
+import { readFileSync } from 'fs';
+import { join } from 'path';
 import * as i from '../interfaces/index.js';
 export interface DatabaseOptions {
@@ -13,121 +15,22 @@ export class Database implements i.DatabaseConnection {
     this.db = new SQLiteDatabase(options.path);
     // Enable WAL mode for better concurrency
-    this.db.exec('PRAGMA journal_mode = WAL;');
+    this.db.run('PRAGMA journal_mode = WAL;');
     // Set busy timeout for concurrent access safety (default 5 seconds)
     const timeout = options.busyTimeout ?? 5000;
-    this.db.exec(`PRAGMA busy_timeout = ${timeout};`);
+    this.db.run(`PRAGMA busy_timeout = ${timeout};`);
-    // Run migrations
-    this.migrate();
+    // Initialize schema
+    this.initSchema();
   }
-  private migrate(): void {
-    // Create migration tracking table first
-    this.db.exec(`
-      CREATE TABLE IF NOT EXISTS _migration (
-        id INTEGER PRIMARY KEY AUTOINCREMENT,
-        name TEXT NOT NULL UNIQUE,
-        applied_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
-      )
-    `);
+  private initSchema(): void {
+    // Read schema from file
+    const schemaPath = join(import.meta.dirname, 'schema.sql');
+    const schema = readFileSync(schemaPath, 'utf-8');
-    // Check and apply migrations in order
-    const migrations = [
-      { name: '001_initial_schema', apply: () => this.applyInitialSchema() },
-      { name: '002_batch_interruption', apply: () => this.applyBatchInterruptionMigration() },
-    ];
-    for (const migration of migrations) {
-      const migrationCount = this.db
-        .query('SELECT COUNT(*) as count FROM _migration WHERE name = $name')
-        .get({ $name: migration.name }) as { count: number };
-      if (migrationCount.count === 0) {
-        migration.apply();
-      }
-    }
-  }
-  private applyInitialSchema(): void {
-    const schema = `
-      -- Batch: Groups of related tasks
-      CREATE TABLE IF NOT EXISTS batch (
-        id TEXT PRIMARY KEY,
-        code TEXT NOT NULL UNIQUE,
-        type TEXT NOT NULL,
-        created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
-        completed_at DATETIME,
-        metadata TEXT
-      );
-      -- Task: Individual units of work
-      CREATE TABLE IF NOT EXISTS task (
-        id TEXT PRIMARY KEY,
-        batch_id TEXT REFERENCES batch(id) ON DELETE CASCADE,
-        type TEXT NOT NULL,
-        status TEXT NOT NULL CHECK(status IN ('pending', 'running', 'completed', 'failed')),
-        payload TEXT NOT NULL,
-        result TEXT,
-        error TEXT,
-        attempt INTEGER NOT NULL DEFAULT 0,
-        max_attempt INTEGER NOT NULL DEFAULT 3,
-        claimed_at DATETIME,
-        completed_at DATETIME,
-        version INTEGER NOT NULL DEFAULT 0,
-        created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
-      );
-      -- Indexes for performance
-      CREATE INDEX IF NOT EXISTS idx_batch_code ON batch(code);
-      CREATE INDEX IF NOT EXISTS idx_batch_type ON batch(type);
-      CREATE INDEX IF NOT EXISTS idx_task_batch_id ON task(batch_id);
-      CREATE INDEX IF NOT EXISTS idx_task_status ON task(status);
-      CREATE INDEX IF NOT EXISTS idx_task_type ON task(type);
-      CREATE INDEX IF NOT EXISTS idx_task_claim ON task(status, claimed_at);
-      CREATE INDEX IF NOT EXISTS idx_task_pending ON task(status, created_at) WHERE status = 'pending';
-    `;
-    this.transaction(() => {
-      this.db.exec(schema);
-      this.db
-        .query('INSERT INTO _migration (name) VALUES ($name)')
-        .run({ $name: '001_initial_schema' });
-    });
-  }
-  private applyBatchInterruptionMigration(): void {
-    const migration = `
-      -- Add status to batch table
-      ALTER TABLE batch ADD COLUMN status TEXT NOT NULL DEFAULT 'active'
-        CHECK(status IN ('active', 'interrupted', 'completed'));
-      -- Add interruption criteria storage (JSON)
-      ALTER TABLE batch ADD COLUMN interruption_criteria TEXT;
-      -- Index for quickly finding active batches
-      CREATE INDEX IF NOT EXISTS idx_batch_status ON batch(status);
-      -- New table: interruption log
-      CREATE TABLE IF NOT EXISTS batch_interrupt_log (
-        id TEXT PRIMARY KEY,
-        batch_id TEXT NOT NULL REFERENCES batch(id) ON DELETE CASCADE,
-        reason TEXT NOT NULL,
-        message TEXT NOT NULL,
-        stats_snapshot TEXT NOT NULL, -- JSON of BatchStats
-        created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
-      );
-      CREATE INDEX IF NOT EXISTS idx_interrupt_log_batch_id ON batch_interrupt_log(batch_id);
-    `;
-    this.transaction(() => {
-      this.db.exec(migration);
-      this.db
-        .query('INSERT INTO _migration (name) VALUES ($name)')
-        .run({ $name: '002_batch_interruption' });
-    });
+    this.db.run(schema);
   }
   query<T = unknown>(sql: string, params?: SQLQueryBindings[]): T[] {

package/src/database/schema.sql CHANGED Viewed

@@ -1,5 +1,6 @@
 -- TEM Database Schema
 -- SQLite with WAL mode
+-- Complete schema - single source of truth for new databases
 -- Migration tracking
 CREATE TABLE IF NOT EXISTS _migration (
@@ -13,9 +14,12 @@ CREATE TABLE IF NOT EXISTS batch (
   id TEXT PRIMARY KEY,
   code TEXT NOT NULL UNIQUE,
   type TEXT NOT NULL,
+  status TEXT NOT NULL DEFAULT 'active'
+    CHECK(status IN ('active', 'interrupted', 'completed')),
   created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
   completed_at DATETIME,
-  metadata TEXT -- JSON object
+  metadata TEXT, -- JSON object
+  interruption_criteria TEXT -- JSON object
 );
 -- Task: Individual units of work
@@ -35,11 +39,23 @@ CREATE TABLE IF NOT EXISTS task (
   created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
 );
+-- Interruption log: Records batch interruption events
+CREATE TABLE IF NOT EXISTS batch_interrupt_log (
+  id TEXT PRIMARY KEY,
+  batch_id TEXT NOT NULL REFERENCES batch(id) ON DELETE CASCADE,
+  reason TEXT NOT NULL,
+  message TEXT NOT NULL,
+  stats_snapshot TEXT NOT NULL, -- JSON of BatchStats
+  created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP
+);
 -- Indexes for performance
 CREATE INDEX IF NOT EXISTS idx_batch_code ON batch(code);
 CREATE INDEX IF NOT EXISTS idx_batch_type ON batch(type);
+CREATE INDEX IF NOT EXISTS idx_batch_status ON batch(status);
 CREATE INDEX IF NOT EXISTS idx_task_batch_id ON task(batch_id);
 CREATE INDEX IF NOT EXISTS idx_task_status ON task(status);
 CREATE INDEX IF NOT EXISTS idx_task_type ON task(type);
 CREATE INDEX IF NOT EXISTS idx_task_claim ON task(status, claimed_at);
 CREATE INDEX IF NOT EXISTS idx_task_pending ON task(status, created_at) WHERE status = 'pending';
+CREATE INDEX IF NOT EXISTS idx_interrupt_log_batch_id ON batch_interrupt_log(batch_id);

package/src/interfaces/index.ts CHANGED Viewed

@@ -105,6 +105,9 @@ export interface TEMConfig {
   // Polling
   pollIntervalMs: number;
+  // Default interruption criteria for all batches (batch-level overrides these)
+  defaultInterruptionCriteria?: BatchInterruptionCriteria;
 }
 // ============================================================================

package/src/mock-server/README.md CHANGED Viewed

@@ -1,14 +1,15 @@
 # Mock Server
-A lightweight HTTP server for simulating external API services with configurable concurrency and rate limiting constraints. Used for testing TEM's task execution capabilities under various load conditions.
+A lightweight HTTP server for simulating external API services with configurable concurrency, rate limiting, and error simulation. Used for testing TEM's task execution capabilities under various load conditions.
 ## Overview
 The mock server provides a controlled environment to test how TEM handles:
-- **Concurrency limits** - Simulate services that reject requests when too many are in flight
-- **Rate limiting** - Test backoff and retry behavior against rate-limited endpoints
-- **Processing delays** - Verify timeout handling and async processing
+- **Concurrency limits** — Simulate services that reject requests when too many are in flight (503 errors)
+- **Rate limiting** — Test backoff and retry behavior against rate-limited endpoints (429 errors)
+- **Error simulation** — Verify resilience with configurable random failure rates
+- **Processing delays** — Verify timeout handling and async processing
 ## Architecture
@@ -30,6 +31,8 @@ The mock server provides a controlled environment to test how TEM handles:
 | Component | File | Purpose |
 |-----------|------|---------|
 | `startMockServer` | `server.ts` | Server lifecycle management |
+| `createMockService` | `server.ts` | Client helper to create services |
+| `createErrorSimulation` | `server.ts` | Helper to create error simulation config |
 | `createRouter` | `router.ts` | HTTP routing and request handling |
 | `MockService` | `service.ts` | Per-service concurrency and rate limiting |
 | `RejectingRateLimiter` | `service.ts` | Token bucket rate limiter with immediate reject |
@@ -49,7 +52,8 @@ startMockServer({
   defaultService: {
     maxConcurrency: 3,
     rateLimit: { limit: 10, windowMs: 1000 },
-    delayMs: [10, 50]
+    delayMs: [10, 50],
+    errorSimulation: { rate: 0.1, statusCode: 500 }
   }
 });
 ```
@@ -61,10 +65,7 @@ Dynamic service creation and management. Each service has its own concurrency/ra
 **Use case:** Complex tests with multiple services having different constraints.
 ```typescript
-startMockServer({
-  port: 8080,
-  mode: 'multi'
-});
+startMockServer({ port: 8080, mode: 'multi' });
 // Create services dynamically via HTTP API
 ```
@@ -75,7 +76,7 @@ startMockServer({
 | Method | Path | Description |
 |--------|------|-------------|
-| `GET` | `/` | Access the default service |
+| `GET` / `POST` | `/` | Access the default service |
 | `POST` | `/shutdown` | Shutdown the server |
 ### Multi Mode Endpoints
@@ -84,7 +85,7 @@ startMockServer({
 |--------|------|-------------|
 | `POST` | `/service/:name` | Create or replace a service |
 | `DELETE` | `/service/:name` | Delete a service |
-| `GET` | `/mock/:name` | Access a service |
+| `GET` / `POST` | `/mock/:name` | Access a service |
 | `POST` | `/shutdown` | Shutdown the server |
 ### Create Service (Multi Mode Only)
@@ -99,7 +100,12 @@ Content-Type: application/json
     "limit": 10,                 // Requests per window (required)
     "windowMs": 1000             // Window size in ms (required)
   },
-  "delayMs": [10, 200]           // [min, max] processing delay (optional, default: [10, 200])
+  "delayMs": [10, 200],          // [min, max] processing delay (optional, default: [10, 200])
+  "errorSimulation": {           // Optional error simulation config
+    "rate": 0.1,                 // Error rate 0-1 (10% = 0.1)
+    "statusCode": 503,           // HTTP status to return (default: 500)
+    "errorMessage": "simulated_error"  // Error message (default: "internal_server_error")
+  }
 }
 ```
@@ -114,11 +120,30 @@ Content-Type: application/json
 }
 ```
+### Delete Service (Multi Mode Only)
+```http
+DELETE /service/:name
+```
+**Response:**
+```http
+HTTP/1.1 200 OK
+Content-Type: application/json
+{
+  "service": "test1",
+  "status": "deleted"
+}
+```
 ### Access Service
 ```http
 GET /mock/:name        # Multi mode
+POST /mock/:name       # Multi mode (body ignored)
 GET /                  # Single mode
+POST /                 # Single mode (body ignored)
 ```
 **Success Response (200):**
@@ -152,6 +177,8 @@ Content-Type: application/json
 }
 ```
+The server will stop accepting new connections and exit the process after a brief delay.
 ## Error Responses
 | Status | Error Code | Description |
@@ -163,6 +190,21 @@ Content-Type: application/json
 | `429` | `rate_limit_exceeded` | Rate limit reached |
 | `503` | `concurrency_limit_exceeded` | Concurrency limit reached |
+### Error Simulation
+When `errorSimulation` is configured, requests may randomly fail with:
+```http
+HTTP/1.1 500 Internal Server Error  // Or configured statusCode
+Content-Type: application/json
+{
+  "error": "internal_server_error"  // Or configured errorMessage
+}
+```
+The error is checked **before** acquiring concurrency/rate limit resources, so simulated errors don't count against limits.
 ## Configuration
 ### ServerConfig
@@ -185,6 +227,21 @@ interface ServiceConfig {
     windowMs: number;              // Window duration in milliseconds
   };
   delayMs: [number, number];       // [min, max] simulated processing delay
+  errorSimulation?: {              // Optional error simulation
+    rate: number;                  // Error rate 0-1
+    statusCode?: number;           // HTTP status code (default: 500)
+    errorMessage?: string;         // Error message (default: "internal_server_error")
+  };
+}
+```
+### ErrorSimulationConfig
+```typescript
+interface ErrorSimulationConfig {
+  rate: number;        // Error rate 0-1 (e.g., 0.1 = 10% error rate)
+  statusCode?: number; // HTTP status code to return (default: 500)
+  errorMessage?: string; // Error message (default: "internal_server_error")
 }
 ```
@@ -315,6 +372,29 @@ console.log(r3.status);  // 429
 console.log(await r3.json());  // { error: 'rate_limit_exceeded' }
 ```
+### Testing Error Simulation
+```typescript
+import { startMockServer, stopMockServer, createErrorSimulation } from './src/mock-server';
+startMockServer({
+  port: 8080,
+  mode: 'single',
+  defaultService: {
+    maxConcurrency: 10,
+    rateLimit: { limit: 100, windowMs: 1000 },
+    delayMs: [10, 50],
+    errorSimulation: createErrorSimulation(0.3, 503, "service_unavailable")
+  }
+});
+// Approximately 30% of requests will fail with 503
+for (let i = 0; i < 10; i++) {
+  const res = await fetch('http://localhost:8080/');
+  console.log(res.status);  // Mix of 200 and 503
+}
+```
 ## Rate Limiting Algorithm
 The mock server uses a **token bucket** algorithm for rate limiting:
@@ -332,15 +412,31 @@ Concurrency is tracked per-service:
 - Decrements when request completes (in `finally` block)
 - If `currentConcurrency >= maxConcurrency`, new requests get 503
+## Error Simulation
+Error simulation is checked before acquiring resources:
+```typescript
+// Pseudocode
+if (Math.random() < errorSimulation.rate) {
+  return errorResponse;  // Doesn't consume concurrency/rate limit tokens
+}
+// Continue with normal request handling...
+```
+This ensures that simulated errors don't deplete your concurrency slots or rate limit budget.
 ## Programmatic API
+### Server Lifecycle
 ```typescript
 import { startMockServer, stopMockServer, getServerState } from './src/mock-server';
 // Start server
 startMockServer(config: ServerConfig): void
-// Stop server programmatically
+// Stop server programmatically (for testing)
 stopMockServer(): void
 // Get current state (for testing)
@@ -350,3 +446,74 @@ getServerState(): {
   hasDefaultService: boolean;
 }
 ```
+### Client Helpers
+```typescript
+import { createMockService, createErrorSimulation } from './src/mock-server';
+// Create a service programmatically (wraps HTTP call)
+createMockService(
+  name: string,
+  config: CreateServiceRequest,
+  mockUrl?: string  // defaults to http://localhost:19999
+): Promise<Response>
+// Create error simulation config with validation
+createErrorSimulation(
+  rate: number,           // 0-1 error rate
+  statusCode?: number,    // HTTP status (default: 500)
+  errorMessage?: string   // Error message
+): ErrorSimulationConfig
+```
+Example using client helpers:
+```typescript
+import { createMockService, createErrorSimulation } from './src/mock-server';
+// Create service with error simulation
+await createMockService('flaky-api', {
+  maxConcurrency: 5,
+  rateLimit: { limit: 10, windowMs: 1000 },
+  delayMs: [50, 100],
+  errorSimulation: createErrorSimulation(0.2, 503)
+});
+// Use the service
+const res = await fetch('http://localhost:19999/mock/flaky-api');
+```
+## Testing with TEM
+The mock server is designed to integrate seamlessly with TEM for testing retry and error handling:
+```typescript
+import { TEM } from '@qianxude/tem';
+import { startMockServer, createMockService, createErrorSimulation } from './src/mock-server';
+// Start mock server with flaky service
+startMockServer({ port: 19999, mode: 'multi' });
+await createMockService('api', {
+  maxConcurrency: 3,
+  rateLimit: { limit: 10, windowMs: 1000 },
+  errorSimulation: createErrorSimulation(0.2)  // 20% failure rate
+});
+// Configure TEM to match mock server limits
+const tem = new TEM({
+  databasePath: ':memory:',
+  concurrency: 3,
+  rateLimit: { requests: 10, windowMs: 1000 }
+});
+// Register handler that calls mock server
+tem.worker.register('test', async (payload) => {
+  const res = await fetch('http://localhost:19999/mock/api');
+  if (!res.ok) throw new Error(`HTTP ${res.status}`);
+  return res.json();
+});
+// TEM's retry mechanism will handle the 20% failure rate
+```

package/src/services/batch-interruption.ts CHANGED Viewed

@@ -13,7 +13,8 @@ export interface BatchInterruptionRow {
 export class BatchInterruptionService implements i.BatchInterruptionService {
   constructor(
     private db: Database,
-    private batchService: BatchService
+    private batchService: BatchService,
+    private defaultCriteria?: i.BatchInterruptionCriteria
   ) {}
   /**
@@ -30,13 +31,18 @@ export class BatchInterruptionService implements i.BatchInterruptionService {
     }
   ): Promise<boolean> {
     // Fetch batch with its interruption criteria
-    const { batch, criteria } = await this.batchService.getWithCriteria(batchId);
+    const { batch, criteria: batchCriteria } = await this.batchService.getWithCriteria(batchId);
     // If already interrupted or completed, no need to check
     if (batch.status !== 'active') {
       return false;
     }
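+    // Merge criteria: TEM-level defaults are spread last, so they override batch-level values.
+    // NOTE(review): TEMConfig.defaultInterruptionCriteria is documented as "batch-level overrides these" — confirm the intended precedence.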
+    const criteria: i.BatchInterruptionCriteria | undefined = batchCriteria || this.defaultCriteria
+      ? { ...batchCriteria, ...this.defaultCriteria }
+      : undefined;
     // If no criteria set, never interrupt
     if (!criteria) {
       return false;