npm - @kodrunhq/opencode-autopilot - Versions diffs - 1.3.0 → 1.5.0 - Mend

@kodrunhq/opencode-autopilot 1.3.0 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (50) hide show

package/assets/commands/brainstorm.md +7 -0
package/assets/commands/stocktake.md +7 -0
package/assets/commands/tdd.md +7 -0
package/assets/commands/update-docs.md +7 -0
package/assets/commands/write-plan.md +7 -0
package/assets/skills/brainstorming/SKILL.md +295 -0
package/assets/skills/code-review/SKILL.md +241 -0
package/assets/skills/e2e-testing/SKILL.md +266 -0
package/assets/skills/git-worktrees/SKILL.md +296 -0
package/assets/skills/go-patterns/SKILL.md +240 -0
package/assets/skills/plan-executing/SKILL.md +258 -0
package/assets/skills/plan-writing/SKILL.md +278 -0
package/assets/skills/python-patterns/SKILL.md +255 -0
package/assets/skills/rust-patterns/SKILL.md +293 -0
package/assets/skills/strategic-compaction/SKILL.md +217 -0
package/assets/skills/systematic-debugging/SKILL.md +299 -0
package/assets/skills/tdd-workflow/SKILL.md +311 -0
package/assets/skills/typescript-patterns/SKILL.md +278 -0
package/assets/skills/verification/SKILL.md +240 -0
package/package.json +1 -1
package/src/index.ts +72 -1
package/src/observability/context-monitor.ts +102 -0
package/src/observability/event-emitter.ts +136 -0
package/src/observability/event-handlers.ts +322 -0
package/src/observability/event-store.ts +226 -0
package/src/observability/index.ts +53 -0
package/src/observability/log-reader.ts +152 -0
package/src/observability/log-writer.ts +93 -0
package/src/observability/mock/mock-provider.ts +72 -0
package/src/observability/mock/types.ts +31 -0
package/src/observability/retention.ts +57 -0
package/src/observability/schemas.ts +83 -0
package/src/observability/session-logger.ts +63 -0
package/src/observability/summary-generator.ts +209 -0
package/src/observability/token-tracker.ts +97 -0
package/src/observability/types.ts +24 -0
package/src/orchestrator/skill-injection.ts +38 -0
package/src/review/sanitize.ts +1 -1
package/src/skills/adaptive-injector.ts +122 -0
package/src/skills/dependency-resolver.ts +88 -0
package/src/skills/linter.ts +113 -0
package/src/skills/loader.ts +88 -0
package/src/templates/skill-template.ts +4 -0
package/src/tools/create-skill.ts +12 -0
package/src/tools/logs.ts +178 -0
package/src/tools/mock-fallback.ts +100 -0
package/src/tools/pipeline-report.ts +148 -0
package/src/tools/session-stats.ts +185 -0
package/src/tools/stocktake.ts +170 -0
package/src/tools/update-docs.ts +116 -0

package/assets/skills/typescript-patterns/SKILL.md ADDED Viewed

@@ -0,0 +1,278 @@
+---
+name: typescript-patterns
+description: TypeScript and Bun runtime patterns, testing idioms, type-level programming, and performance best practices
+stacks:
+  - typescript
+  - bun
+requires:
+  - coding-standards
+---
+# TypeScript & Bun Patterns
+TypeScript-specific patterns for projects running on the Bun runtime. Covers type-level programming, Bun-specific APIs, testing with bun:test, error handling, module design, and immutability idioms. Apply these when writing, reviewing, or refactoring TypeScript code.
+## 1. Type-Level Patterns
+**DO:** Use the type system to make invalid states unrepresentable.
+- Prefer `interface` over `type` for object shapes -- better error messages, declaration merging, and extendability
+- Use discriminated unions for state machines:
+  ```ts
+  type RequestState =
+    | { status: "idle" }
+    | { status: "loading" }
+    | { status: "success"; data: Response }
+    | { status: "error"; error: Error }
+  ```
+- Use `readonly` arrays and properties by default. Only remove `readonly` when mutation is explicitly required
+- Use `as const` for literal types and frozen configuration objects
+- Use branded types for domain identifiers to prevent mixing:
+  ```ts
+  type UserId = string & { readonly __brand: "UserId" }
+  type OrderId = string & { readonly __brand: "OrderId" }
+  function fetchUser(id: UserId): Promise<User> { ... }
+  // fetchUser(orderId) is now a compile error
+  ```
+- Use template literal types for string patterns:
+  ```ts
+  type ApiRoute = `/api/${string}`
+  type EventName = `on${Capitalize<string>}`
+  ```
+- Use `satisfies` to validate types without widening:
+  ```ts
+  const config = {
+    port: 3000,
+    host: "localhost",
+  } satisfies ServerConfig
+  // config keeps its own inferred shape (it is not widened to ServerConfig); add `as const` if `port` must stay the literal `3000`
+  ```
+**DON'T:**
+- Use `any` -- use `unknown` and narrow with type guards. If `any` is unavoidable, add a `// biome-ignore lint/suspicious/noExplicitAny: [reason]` comment
+- Use `enum` -- use `as const` objects or union types instead (enums have runtime cost and quirky behavior)
+- Use `!` non-null assertion -- handle the null case explicitly or use optional chaining
+- Cast with `as` when a type guard or conditional check is possible
+- Use `Function` type -- use specific signatures: `(arg: string) => void`
+## 2. Bun Runtime Patterns
+**DO:** Use Bun-native APIs where they provide clear advantages.
+- Use `bun test` for testing -- built-in, fast, Jest-compatible API, no configuration needed
+- Use `node:fs/promises` for all file I/O -- not `Bun.file()` or `Bun.write()` (portability and testability per project constraints)
+- Use `Bun.serve()` for HTTP servers -- not Express or other Node frameworks
+- Import from `bun:sqlite` for SQLite -- zero-dependency, built into the runtime
+- Use `Bun.spawn()` for subprocesses -- streams stdout/stderr natively
+- Use `Bun.hash()` for fast hashing -- faster than Node's crypto for non-cryptographic hashes
+- Use `Bun.env` for environment variables -- typed access with auto-completion
+**DON'T:**
+- Install `jest`, `vitest`, or `mocha` -- `bun test` covers all standard test patterns
+- Use `Bun.file()` or `Bun.write()` in library code -- prefer `node:fs/promises` for portability
+- Use `node:child_process` when `Bun.spawn()` is available
+- Mix CommonJS `require()` with ES module `import` -- use `import` exclusively
+## 3. Error Handling
+**DO:** Use result types for expected failures. Reserve exceptions for unexpected bugs.
+- Return result types instead of throwing:
+  ```ts
+  type Result<T, E = string> =
+    | { success: true; data: T }
+    | { success: false; error: E }
+  ```
+- Catch at boundaries (HTTP handlers, CLI entry points), not in business logic
+- Use `isEnoentError()` pattern for filesystem errors -- check error code, not message:
+  ```ts
+  function isEnoentError(error: unknown): boolean {
+    return error instanceof Error && "code" in error && error.code === "ENOENT"
+  }
+  ```
+- Use `unknown` for catch clause variables and narrow before accessing properties:
+  ```ts
+  try { ... } catch (error: unknown) {
+    if (error instanceof ValidationError) { ... }
+    throw error // re-throw unknown errors
+  }
+  ```
+- Validate external data at system boundaries with Zod schemas:
+  ```ts
+  const result = schema.safeParse(input)
+  if (!result.success) return { success: false, error: result.error.message }
+  ```
+**DON'T:**
+- Catch without re-throwing or logging -- silent catch is a bug
+- Throw strings -- always throw `Error` instances or custom error classes
+- Use `try/catch` for control flow -- use conditional checks or result types
+- Ignore the return value of `safeParse` -- always check `.success`
+## 4. Module Patterns
+**DO:** Design modules for composability and testability.
+- Export pure functions and interfaces, not classes (unless state encapsulation is genuinely needed)
+- Use barrel files (`index.ts`) only for public API surfaces -- internal modules import directly
+- Follow strict top-down dependency flow -- no cycles. Use the dependency tree:
+  ```
+  entry point -> tools -> templates + utils -> Node built-ins
+  ```
+- Follow the `*Core` function pattern: export a testable core function that accepts dependencies, and a thin wrapper that supplies defaults:
+  ```ts
+  // Testable core
+  export async function createAgentCore(name: string, baseDir: string): Promise<Result> { ... }
+  // Thin wrapper for production
+  export function tool() {
+    return { execute: (args) => createAgentCore(args.name, getGlobalConfigDir()) }
+  }
+  ```
+- Target 200-400 lines per file, hard maximum of 800
+**DON'T:**
+- Create circular dependencies -- if A imports B and B imports A, extract shared types to C
+- Use dynamic `import()` for modules that can be statically imported
+- Re-export everything from a barrel file -- explicitly list public API
+- Put multiple unrelated exports in a single file
+## 5. Testing Patterns
+**DO:** Write focused tests that verify behavior, not implementation.
+- Use `describe`/`test` (not `it`) for consistency across the project:
+  ```ts
+  describe("validateAssetName", () => {
+    test("accepts lowercase with hyphens", () => {
+      expect(validateAssetName("my-agent")).toEqual({ valid: true })
+    })
+    test("rejects uppercase characters", () => {
+      const result = validateAssetName("MyAgent")
+      expect(result.valid).toBe(false)
+    })
+  })
+  ```
+- Test pure functions: input goes in, output comes out, no mocks needed
+- Test side effects: mock the boundary (filesystem, network), verify the interaction:
+  ```ts
+  import { mock } from "bun:test"
+  const writeMock = mock(() => Promise.resolve())
+  // inject mock, call function, verify writeMock was called with expected args
+  ```
+- Use `beforeEach` for test isolation, `afterEach` for cleanup
+- Use `expect().toThrow()` for exception testing:
+  ```ts
+  expect(() => parseConfig(invalid)).toThrow("Invalid config")
+  ```
+- Use `expect().toMatchSnapshot()` only for complex output where manual assertion is impractical
+**DON'T:**
+- Test implementation details (private methods, internal state)
+- Use `it` instead of `test` -- project convention is `describe`/`test`
+- Write tests that depend on execution order or shared mutable state
+- Skip tests with `.skip` without a tracking comment (`// TODO(#123): flaky on CI`)
+- Use `any` in test files to bypass type checking -- tests should be as typed as production code
+## 6. Immutability Patterns
+**DO:** Build new objects instead of mutating existing ones.
+- Use object spread for updates:
+  ```ts
+  const updated = { ...existing, status: "active" }
+  ```
+- Use array spread for additions:
+  ```ts
+  const withNew = [...existing, newItem]
+  ```
+- Use `Object.freeze()` for constants and configuration:
+  ```ts
+  const DEFAULTS = Object.freeze({
+    maxRetries: 3,
+    timeoutMs: 5000,
+  })
+  ```
+- Use `ReadonlyArray<T>` and `Readonly<Record<K, V>>` for function parameters:
+  ```ts
+  function process(items: ReadonlyArray<Item>): Result { ... }
+  ```
+- Use `map`, `filter`, `reduce` instead of mutating loops:
+  ```ts
+  const active = users.filter(u => u.isActive)
+  const names = active.map(u => u.name)
+  ```
+**DON'T:**
+- Push to arrays: `items.push(x)` -- use `[...items, x]`
+- Reassign properties: `obj.status = "done"` -- use `{ ...obj, status: "done" }`
+- Use `splice`, `pop`, `shift` on shared arrays
+- Mutate function arguments -- always return new values
+**Exception:** Mutation is acceptable when an API explicitly requires it (OpenCode config hooks, database transaction builders, stream writers). Document the mutation with a comment.
+## 7. Anti-Pattern Catalog
+**Anti-Pattern: Over-typed Generics**
+Writing `function get<T extends Record<string, unknown>, K extends keyof T>(obj: T, key: K): T[K]` when `function get(obj: Record<string, unknown>, key: string): unknown` suffices. Generics should earn their complexity by providing caller-site type narrowing.
+**Anti-Pattern: Barrel File Hell**
+Every directory gets an `index.ts` that re-exports everything. This creates implicit coupling, breaks tree-shaking, and makes imports ambiguous. Use barrel files only for the package's public API surface.
+**Anti-Pattern: Type Assertion Chains**
+`(value as unknown as TargetType)` is a code smell. If you need two casts, the types are wrong. Fix the source type or add a proper type guard.
+**Anti-Pattern: Promise Constructor Anti-Pattern**
+Wrapping an async function in `new Promise()` when you can just return the promise directly. If the function returns a promise, use `async/await` -- don't wrap it.
+**Anti-Pattern: Callback-Style Error Handling**
+Passing `(error, result)` tuples in TypeScript. Use `Result<T, E>` types or throw -- callbacks are a Node.js legacy, not a TypeScript pattern.
+**Anti-Pattern: Default Export Confusion**
+Using `export default` in library code makes imports inconsistent across consumers (each file names it differently). Use named exports: `export function createAgent()` not `export default function()`. Default exports are acceptable only for plugin/framework entry points that require them.
+## 8. Performance Patterns
+**DO:** Write efficient TypeScript that leverages Bun's runtime characteristics.
+- Pre-compute values at module level for constants used in hot paths:
+  ```ts
+  // Module level -- computed once
+  const VALID_PATTERN = /^[a-z0-9]+(-[a-z0-9]+)*$/
+  const DEFAULT_CONFIG = Object.freeze({ maxRetries: 3, timeoutMs: 5000 })
+  // Not inside the function -- recomputed every call
+  ```
+- Use `Map` and `Set` for frequent lookups instead of plain objects and arrays:
+  ```ts
+  const BUILT_IN_COMMANDS = new Set(["help", "quit", "config"])
+  // O(1) lookup vs O(n) array.includes()
+  ```
+- Use `structuredClone()` for deep copies -- built into the runtime, handles circular references
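+- Avoid unnecessary `await` in return position (the exception is inside `try`/`catch`, where `return await` is required for the `catch` to see a rejection):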
+  ```ts
+  // DO: Return the promise directly
+  function fetchUser(id: string): Promise<User> {
+    return db.query("SELECT * FROM users WHERE id = ?", [id])
+  }
+  // DON'T: Unnecessary await
+  async function fetchUser(id: string): Promise<User> {
+    return await db.query("SELECT * FROM users WHERE id = ?", [id])
+  }
+  ```
+**DON'T:**
+- Create regex objects inside loops or frequently-called functions
+- Use `JSON.parse(JSON.stringify(obj))` for deep cloning -- use `structuredClone()`
+- Allocate in hot paths -- pre-compute, cache, or use pooling for frequently created objects
+- Use `Array.from()` when spread `[...iterable]` works -- spread is faster in Bun

package/assets/skills/verification/SKILL.md ADDED Viewed

@@ -0,0 +1,240 @@
+---
+name: verification
+description: Pre-completion verification checklist methodology to catch issues before marking work as done
+stacks: []
+requires: []
+---
+# Verification
+A systematic pre-completion checklist methodology. Apply this before marking any task, feature, or PR as complete. Every step is a gate — if it fails, the work is not done.
+## When to Use
+- Before marking any task as complete
+- Before committing code
+- Before opening or merging a pull request
+- Before deploying to any environment
+- Before saying "this is done" to anyone
+- After refactoring existing code
+- After fixing a bug (to verify the fix and check for regressions)
+Catching issues before completion is 10x cheaper than catching them after merge and 100x cheaper than catching them in production. This checklist exists because developers consistently overestimate the completeness of their own work.
+## The Verification Checklist
+### Step 1: Requirements Check
+Re-read the original requirement, task description, or issue. Do not rely on your memory of what was asked.
+**Process:**
+1. Open the original requirement (ticket, issue, plan task, PR description)
+2. List every stated requirement — each one is a checkbox
+3. For each requirement, identify the specific code that satisfies it
+4. Mark each requirement as satisfied or not
+5. If any requirement is not satisfied, the work is not done
+**What to check:**
+- Every explicit requirement has a corresponding implementation
+- Edge cases mentioned in the requirement are handled
+- Acceptance criteria (if provided) are met
+- The implementation does not introduce behavior that contradicts the requirement
+- Optional requirements are either implemented or explicitly deferred with a reason
+**Red flags:**
+- You cannot point to specific code for a requirement — it is missing
+- You implemented something adjacent to the requirement but not the requirement itself
+- You added features that were not requested (scope creep)
+### Step 2: Code Quality Check
+Run automated quality checks. Do not skip these because "it is a small change."
+**Process:**
+1. Run the linter: `bun run lint` (or the project equivalent)
+2. Run the type checker: `bunx tsc --noEmit` (for TypeScript projects)
+3. Run the formatter: `bun run format` (or the project equivalent)
+4. Search for debug artifacts: `console.log`, `debugger`, `print()` statements
+5. Search for deferred work: `TODO`, `FIXME`, `HACK`, `XXX` comments that should be resolved
+6. Check file sizes — no file should exceed 800 lines
+**What to check:**
+- Zero linter errors (warnings are acceptable only if pre-existing)
+- Zero type errors
+- No formatting violations
+- No debug statements left in production code
+- No new TODO/FIXME comments that should be resolved before merge
+- All new files are under 400 lines (target), none over 800 lines (hard limit)
+**Red flags:**
+- Suppressing linter rules with inline comments (`// eslint-disable`) without justification
+- Type assertions (`as any`, `as unknown`) used to silence type errors instead of fixing them
+- Large functions (over 50 lines) or deeply nested code (over 4 levels)
+### Step 3: Test Verification
+Run the test suite. No exceptions.
+**Process:**
+1. Run the full test suite: `bun test`
+2. Check that all existing tests pass (zero regressions)
+3. Verify that new code has test coverage
+4. If new functionality has no tests, write them before proceeding
+5. Check test quality — are tests testing behavior or implementation details?
+**What to check:**
+- All tests pass (not just the ones you think are relevant)
+- New public functions and endpoints have at least one test
+- Error paths are tested (not just the happy path)
+- Edge cases identified in Step 1 have corresponding tests
+- Tests are deterministic — no flaky tests introduced
+**Red flags:**
+- Skipped tests (`it.skip`, `xit`, `@pytest.mark.skip`) without a tracking issue
+- Tests that pass by coincidence (testing the wrong thing)
+- Tests that mock so heavily they do not test real behavior
+- Missing tests for error handling paths
+**Reference:** Use the tdd-workflow skill for writing tests when coverage is missing.
+### Step 4: Integration Check
+Does the change work with the rest of the system? Unit tests passing is necessary but not sufficient.
+**Process:**
+1. Trace all imports — are new exports consumed correctly by their callers?
+2. Check type compatibility at module boundaries — do interfaces match between producer and consumer?
+3. Run the application and manually verify the feature works end-to-end
+4. Check that the feature integrates with existing features without breaking them
+5. Verify configuration — are all required config values, environment variables, and feature flags set?
+**What to check:**
+- No broken imports or missing exports
+- Type interfaces match between modules (producer returns what consumer expects)
+- The feature works when invoked through its actual entry point (not just in isolation)
+- Existing features that interact with the changed code still work
+- Database migrations (if any) apply cleanly and are reversible
+**Red flags:**
+- The feature works in tests but fails when run for real
+- You only tested the feature in isolation, never with the full system
+- New environment variables or configuration are undocumented
+### Step 5: Edge Case Review
+Think adversarially. What inputs or conditions could break this?
+**Process:**
+1. For each input, consider: empty, null/undefined, very large, malformed, special characters
+2. For each external call, consider: timeout, network failure, unexpected response, rate limiting
+3. For each concurrent operation, consider: race conditions, duplicate submissions, stale data
+4. For each state transition, consider: invalid state, repeated transitions, partial failure
+**What to check:**
+- Empty input does not crash (returns appropriate error or default)
+- Very large input does not cause memory issues or timeouts
+- Null/undefined values are handled (not passed through to crash later)
+- Concurrent access is safe (no race conditions on shared state)
+- Network failures are handled gracefully (retry, timeout, fallback)
+- Partial failures do not leave the system in an inconsistent state
+**Red flags:**
+- Functions that assume input is always valid without checking
+- No timeout on external calls (HTTP requests, database queries)
+- Shared mutable state without synchronization
+- Error handling that swallows the error and continues with bad data
+### Step 6: Security Scan
+Check for common security issues. This is not a full security audit — it is a pre-commit sanity check.
+**Process:**
+1. Search for hardcoded secrets: API keys, passwords, tokens, connection strings
+2. Verify all user inputs are validated before use
+3. Check that error messages do not leak sensitive data (stack traces, SQL queries, internal paths)
+4. Verify authentication and authorization on new endpoints or tools
+5. Check for injection risks: SQL injection, XSS, command injection, path traversal
+**What to check:**
+- No secrets in source code (use environment variables or secret managers)
+- All user input is validated at the boundary (schema validation preferred)
+- Error messages are safe for end users (no internal details)
+- New endpoints require authentication
+- Privileged actions check authorization (permissions), not just authentication
+- Dynamic queries use parameterized statements (never string concatenation)
+**Red flags:**
+- API keys or tokens in source code or committed config files
+- User input passed directly to database queries, shell commands, or HTML output
+- Detailed error messages exposed to end users
+- New endpoints accessible without authentication
+## Integration with Our Tools
+Use these tools as part of the verification process:
+- **`oc_review`** — Invoke before marking any task as complete. Provides automated code review that catches issues you might miss reviewing your own code. This is the single most important verification step.
+- **`oc_doctor`** — Run to verify plugin health and configuration integrity. Catches broken tool registrations, missing assets, and config corruption.
+- **`oc_session_stats`** — Check for error patterns in the current session. If the session shows repeated errors, investigate before declaring the work complete.
+- **`oc_forensics`** — When a verification step fails and the root cause is not obvious, use forensics to trace the issue systematically.
+## Anti-Pattern Catalog
+### Anti-Pattern: "Works on My Machine"
+**What goes wrong:** You test only in your local environment and miss environment-specific issues (different OS, different Node/Bun version, different config, missing env vars).
+**Instead:** Check for environment-specific assumptions. Hardcoded paths, OS-specific APIs, version-specific features. If CI exists, verify it passes there too.
+### Anti-Pattern: Skipping Tests for Small Changes
+**What goes wrong:** "It is just a one-line change" — and that one line breaks three other things. Small changes cause big bugs because they slip through review.
+**Instead:** Always run the full test suite. A full run costs far less than tracking down a regression after merge.
+### Anti-Pattern: Reviewing Your Own Code
+**What goes wrong:** You will miss the same things you missed when writing the code. Confirmation bias means you see what you expect to see, not what is actually there.
+**Instead:** Use `oc_review` for an independent automated review. For critical changes, request a human review as well.
+### Anti-Pattern: Verifying Only the Happy Path
+**What goes wrong:** The feature works perfectly with valid input. It crashes spectacularly with empty input, null values, or unexpected types.
+**Instead:** Step 5 (Edge Case Review) exists for this reason. Test the boundaries, not just the center.
+### Anti-Pattern: Deferring Verification to Later
+**What goes wrong:** "I will add tests later" or "I will check security before release." Later never comes, and the technical debt compounds.
+**Instead:** Verify now. Every step of this checklist should pass before the work leaves your hands.
+## Failure Modes
+### Linter Fails
+Fix the issues before proceeding. If a linter rule is genuinely wrong for your case, add a justified inline suppression comment — but question whether the rule is actually wrong or your code needs restructuring.
+### Tests Fail
+Do not comment out or skip the failing test. Diagnose the failure using the systematic-debugging skill. The test may be wrong (update it), or your code may be wrong (fix it). Determine which before changing anything.
+### Type Errors
+Trace the type mismatch to its source. Do not use `as any` to suppress the error. The type system is telling you something — usually that your mental model of the data does not match reality.
+### Security Issue Found
+Stop and fix it immediately. Do not defer security issues. If the fix requires significant changes, that is a sign the code needs restructuring, not that the security issue should be ignored.
+### Integration Failure
+If the feature works in isolation but fails in integration, the issue is at a module boundary. Check: are you producing the data the consumer expects? Are interfaces aligned? Is the contract documented?
+## Quick Reference
+For a fast pre-commit check, verify at minimum:
+1. `bun run lint` passes
+2. `bun test` passes
+3. No hardcoded secrets
+4. `oc_review` has no CRITICAL findings
+5. Every requirement has corresponding code
+The full 6-step checklist is for marking work as complete. The quick reference is for every commit.

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
 	"name": "@kodrunhq/opencode-autopilot",
-	"version": "1.3.0",
+	"version": "1.5.0",
 	"description": "Curated agents, skills, and commands for the OpenCode AI coding CLI — autonomous orchestrator, multi-agent code review, model fallback, and in-session asset creation tools.",
 	"main": "src/index.ts",
 	"keywords": [

package/src/index.ts CHANGED Viewed

@@ -3,6 +3,16 @@ import { configHook } from "./agents";
 import { isFirstLoad, loadConfig } from "./config";
 import { runHealthChecks } from "./health/runner";
 import { installAssets } from "./installer";
+import { ContextMonitor } from "./observability/context-monitor";
+import {
+	createObservabilityEventHandler,
+	createToolExecuteAfterHandler as createObsToolAfterHandler,
+	createToolExecuteBeforeHandler,
+} from "./observability/event-handlers";
+import { SessionEventStore } from "./observability/event-store";
+import { writeSessionLog } from "./observability/log-writer";
+import { pruneOldLogs } from "./observability/retention";
+import type { SessionEvent } from "./observability/types";
 import type { SdkOperations } from "./orchestrator/fallback";
 import {
 	createChatMessageHandler,
@@ -24,12 +34,18 @@ import { ocCreateCommand } from "./tools/create-command";
 import { ocCreateSkill } from "./tools/create-skill";
 import { ocDoctor } from "./tools/doctor";
 import { ocForensics } from "./tools/forensics";
+import { ocLogs } from "./tools/logs";
+import { ocMockFallback } from "./tools/mock-fallback";
 import { ocOrchestrate } from "./tools/orchestrate";
 import { ocPhase } from "./tools/phase";
+import { ocPipelineReport } from "./tools/pipeline-report";
 import { ocPlan } from "./tools/plan";
 import { ocQuick } from "./tools/quick";
 import { ocReview } from "./tools/review";
+import { ocSessionStats } from "./tools/session-stats";
 import { ocState } from "./tools/state";
+import { ocStocktake } from "./tools/stocktake";
+import { ocUpdateDocs } from "./tools/update-docs";
 let openCodeConfig: Config | null = null;
@@ -70,6 +86,15 @@ const plugin: Plugin = async (input) => {
 		// Health check failures are non-fatal — oc_doctor provides manual diagnostics
 	});
+	// --- Observability subsystem initialization ---
+	const eventStore = new SessionEventStore();
+	const contextMonitor = new ContextMonitor();
+	// Retention pruning on load (non-blocking per D-14)
+	pruneOldLogs().catch((err) => {
+		console.error("[opencode-autopilot]", err);
+	});
 	// --- Fallback subsystem initialization ---
 	const sdkOps: SdkOperations = {
 		abortSession: async (sessionID) => {
@@ -123,6 +148,32 @@ const plugin: Plugin = async (input) => {
 	const chatMessageHandler = createChatMessageHandler(manager);
 	const toolExecuteAfterHandler = createToolExecuteAfterHandler(manager);
+	// --- Observability handlers ---
+	const toolStartTimes = new Map<string, number>();
+	const observabilityEventHandler = createObservabilityEventHandler({
+		eventStore,
+		contextMonitor,
+		showToast: sdkOps.showToast,
+		writeSessionLog: async (sessionData) => {
+			if (!sessionData) return;
+			// Filter to schema-valid event types that match SessionEvent discriminated union
+			const schemaEvents: SessionEvent[] = sessionData.events.filter(
+				(e): e is SessionEvent =>
+					e.type === "fallback" ||
+					e.type === "error" ||
+					e.type === "decision" ||
+					e.type === "model_switch",
+			);
+			await writeSessionLog({
+				sessionId: sessionData.sessionId,
+				startedAt: sessionData.startedAt,
+				events: schemaEvents,
+			});
+		},
+	});
+	const obsToolBeforeHandler = createToolExecuteBeforeHandler(toolStartTimes);
+	const obsToolAfterHandler = createObsToolAfterHandler(eventStore, toolStartTimes);
 	return {
 		tool: {
 			oc_configure: ocConfigure,
@@ -138,8 +189,18 @@ const plugin: Plugin = async (input) => {
 			oc_quick: ocQuick,
 			oc_forensics: ocForensics,
 			oc_review: ocReview,
+			oc_logs: ocLogs,
+			oc_session_stats: ocSessionStats,
+			oc_pipeline_report: ocPipelineReport,
+			oc_mock_fallback: ocMockFallback,
+			oc_stocktake: ocStocktake,
+			oc_update_docs: ocUpdateDocs,
 		},
 		event: async ({ event }) => {
+			// 1. Observability: collect (pure observer, no side effects on session)
+			await observabilityEventHandler({ event });
+			// 2. First-load toast
 			if (event.type === "session.created" && isFirstLoad(config)) {
 				await sdkOps.showToast(
 					"Welcome to OpenCode Autopilot!",
@@ -148,7 +209,7 @@ const plugin: Plugin = async (input) => {
 				);
 			}
-			// Fallback event handling (runs for all events)
+			// 3. Fallback event handling
 			if (fallbackConfig.enabled) {
 				await fallbackEventHandler({ event });
 			}
@@ -173,6 +234,12 @@ const plugin: Plugin = async (input) => {
 				await chatMessageHandler(hookInput, output);
 			}
 		},
+		"tool.execute.before": async (
+			input: { tool: string; sessionID: string; callID: string },
+			output: { args: unknown },
+		) => {
+			obsToolBeforeHandler({ ...input, args: output.args });
+		},
 		"tool.execute.after": async (
 			hookInput: {
 				readonly tool: string;
@@ -182,6 +249,10 @@ const plugin: Plugin = async (input) => {
 			},
 			output: { title: string; output: string; metadata: unknown },
 		) => {
+			// Observability: record tool execution (pure observer)
+			obsToolAfterHandler(hookInput, output);
+			// Fallback handling
 			if (fallbackConfig.enabled) {
 				await toolExecuteAfterHandler(hookInput, output);
 			}