@nathapp/nax 0.18.3 → 0.18.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/rules/01-project-conventions.md +34 -0
- package/.claude/rules/02-test-architecture.md +39 -0
- package/.claude/rules/03-test-writing.md +58 -0
- package/.claude/rules/04-forbidden-patterns.md +29 -0
- package/.githooks/pre-commit +13 -0
- package/CHANGELOG.md +9 -0
- package/CLAUDE.md +45 -122
- package/docker-compose.test.yml +1 -3
- package/docs/ROADMAP.md +9 -27
- package/package.json +1 -1
- package/src/config/schemas.ts +2 -0
- package/src/config/types.ts +5 -1
- package/src/execution/post-verify.ts +30 -12
- package/src/pipeline/stages/execution.ts +10 -2
- package/src/pipeline/stages/routing.ts +18 -4
- package/src/pipeline/stages/verify.ts +8 -1
- package/src/routing/strategies/keyword.ts +7 -4
- package/src/routing/strategies/llm.ts +40 -4
- package/test/{US-002-orchestrator.test.ts → integration/precheck-orchestrator.test.ts} +3 -3
- package/test/{execution/post-verify-bug026.test.ts → unit/execution/post-verify-regression.test.ts} +22 -50
- package/test/{execution → unit/execution}/post-verify.test.ts +1 -1
- package/test/unit/pipeline/routing-partial-override.test.ts +15 -36
- package/test/unit/pipeline/verify-smart-runner.test.ts +5 -6
- package/test/unit/routing/routing-stability.test.ts +207 -0
- package/test/unit/storyid-events.test.ts +20 -32
- package/test/unit/verification/smart-runner-config.test.ts +162 -0
- package/test/unit/{smart-test-runner.test.ts → verification/smart-runner-discovery.test.ts} +5 -164
- package/test/TEST_COVERAGE_US001.md +0 -217
- package/test/TEST_COVERAGE_US003.md +0 -84
- package/test/TEST_COVERAGE_US005.md +0 -86
|
@@ -0,0 +1,34 @@
|
|
|
1
|
+
# Project Conventions
|
|
2
|
+
|
|
3
|
+
## Language & Runtime
|
|
4
|
+
|
|
5
|
+
- **Bun-native only.** Use `Bun.file()`, `Bun.write()`, `Bun.spawn()`, `Bun.sleep()`. Never use Node.js equivalents (`fs.readFile`, `child_process.spawn`, `setTimeout` for delays).
|
|
6
|
+
- TypeScript strict mode. No `any` unless unavoidable (document why).
|
|
7
|
+
- Target: Bun 1.3.7+.
|
|
8
|
+
|
|
9
|
+
## File Size
|
|
10
|
+
|
|
11
|
+
- **400-line hard limit** for all source and test files.
|
|
12
|
+
- If a file approaches 400 lines, split it before adding more code.
|
|
13
|
+
- Split by logical concern (one function/class per file when possible).
|
|
14
|
+
|
|
15
|
+
## Module Structure
|
|
16
|
+
|
|
17
|
+
- Every directory with 2+ exports gets a barrel `index.ts`.
|
|
18
|
+
- Types go in `types.ts` per module directory.
|
|
19
|
+
- Import from barrels (`src/routing`), **never from internal paths** (`src/routing/router`). This prevents singleton fragmentation in Bun's module registry.
|
|
20
|
+
|
|
21
|
+
## Logging
|
|
22
|
+
|
|
23
|
+
- Use the project logger (`src/logger`). Never use `console.log` / `console.error` in source code.
|
|
24
|
+
- Log format: no emojis. Use `[OK]`, `[WARN]`, `[FAIL]`, `->`. Machine-parseable.
|
|
25
|
+
|
|
26
|
+
## Commits
|
|
27
|
+
|
|
28
|
+
- Conventional commits: `feat:`, `fix:`, `refactor:`, `test:`, `docs:`, `chore:`.
|
|
29
|
+
- Atomic — one logical change per commit.
|
|
30
|
+
- Never include `[run-release]` unless explicitly told to.
|
|
31
|
+
|
|
32
|
+
## Formatting
|
|
33
|
+
|
|
34
|
+
- Biome handles formatting and linting. Run `bun run lint` before committing.
|
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
# Test Architecture
|
|
2
|
+
|
|
3
|
+
## Directory Structure
|
|
4
|
+
|
|
5
|
+
Tests **must** mirror the `src/` directory structure:
|
|
6
|
+
|
|
7
|
+
```
|
|
8
|
+
src/routing/strategies/foo.ts → test/unit/routing/strategies/foo.test.ts
|
|
9
|
+
src/execution/runner.ts → test/unit/execution/runner.test.ts
|
|
10
|
+
src/pipeline/stages/verify.ts → test/unit/pipeline/stages/verify.test.ts
|
|
11
|
+
src/verification/smart-runner.ts → test/unit/verification/smart-runner.test.ts
|
|
12
|
+
```
|
|
13
|
+
|
|
14
|
+
## Test Categories
|
|
15
|
+
|
|
16
|
+
| Category | Location | Purpose |
|
|
17
|
+
|:---|:---|:---|
|
|
18
|
+
| Unit | `test/unit/<mirror-of-src>/` | Test individual functions/classes in isolation |
|
|
19
|
+
| Integration | `test/integration/<feature>.test.ts` | Test multiple modules working together |
|
|
20
|
+
| UI | `test/ui/` | TUI component tests |
|
|
21
|
+
|
|
22
|
+
## Placement Rules
|
|
23
|
+
|
|
24
|
+
1. **Never create test files in `test/` root.** Always place in the appropriate subdirectory.
|
|
25
|
+
2. **Never create standalone bug-fix test files** like `test/execution/post-verify-bug026.test.ts`. Add tests to the existing relevant test file instead. If the relevant file would exceed 400 lines, split the file by describe block — not by bug number.
|
|
26
|
+
3. **Never create `TEST_COVERAGE_*.md` or documentation files in `test/`.** Put docs in `docs/`.
|
|
27
|
+
4. **Unit test directories must exist under `test/unit/`**, mirroring `src/`. Do not create top-level test directories like `test/execution/` or `test/context/` — use `test/unit/execution/` and `test/unit/context/`.
|
|
28
|
+
|
|
29
|
+
## File Naming
|
|
30
|
+
|
|
31
|
+
- Test files: `<source-file-name>.test.ts` — must match the source file name exactly.
|
|
32
|
+
- One test file per source file (for unit tests).
|
|
33
|
+
- If a test file needs splitting, split by describe block into `<module>-<concern>.test.ts`.
|
|
34
|
+
|
|
35
|
+
## Temp Files & Fixtures
|
|
36
|
+
|
|
37
|
+
- Use `mkdtempSync(join(tmpdir(), "nax-test-"))` for temporary directories.
|
|
38
|
+
- Clean up in `afterAll()` — never leave files in `test/tmp/`.
|
|
39
|
+
- Integration tests needing git: always `git init` + `git add .` + `git commit` in the temp fixture before testing.
|
|
@@ -0,0 +1,58 @@
|
|
|
1
|
+
# Test Writing Rules
|
|
2
|
+
|
|
3
|
+
## Mocking
|
|
4
|
+
|
|
5
|
+
### Never use `mock.module()`
|
|
6
|
+
|
|
7
|
+
`mock.module()` in Bun 1.x is **globally scoped and leaks between test files**. It poisons the ESM module registry for the entire test run. `mock.restore()` does NOT undo `mock.module()` overrides.
|
|
8
|
+
|
|
9
|
+
**Instead, use dependency injection:**
|
|
10
|
+
|
|
11
|
+
```typescript
|
|
12
|
+
// In source file: export a swappable deps object
|
|
13
|
+
export const _deps = {
|
|
14
|
+
readConfig: () => loadConfig(),
|
|
15
|
+
runCommand: (cmd: string) => Bun.spawn(cmd.split(" ")),
|
|
16
|
+
};
|
|
17
|
+
|
|
18
|
+
// In test file: override _deps directly
|
|
19
|
+
import { _deps } from "src/mymodule";
|
|
20
|
+
|
|
21
|
+
beforeEach(() => {
|
|
22
|
+
_deps.readConfig = mock(() => fakeConfig);
|
|
23
|
+
});
|
|
24
|
+
|
|
25
|
+
afterEach(() => {
|
|
26
|
+
mock.restore(); // restores mock() spies (NOT mock.module)
|
|
27
|
+
_deps.readConfig = originalReadConfig; // captured before overriding: const originalReadConfig = _deps.readConfig;
|
|
28
|
+
});
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
### General Mocking Rules
|
|
32
|
+
|
|
33
|
+
- Always call `mock.restore()` in `afterEach()`.
|
|
34
|
+
- Use `mock()` (function-level) freely — it's properly scoped.
|
|
35
|
+
- Never rely on test file execution order. Each file must be independently runnable.
|
|
36
|
+
- Store original function references before overriding `_deps` and restore in `afterEach`.
|
|
37
|
+
|
|
38
|
+
## CI Compatibility
|
|
39
|
+
|
|
40
|
+
- Tests requiring the `claude` binary: guard with `const skipInCI = process.env.CI ? test.skip : test;`
|
|
41
|
+
- Tests requiring specific OS features: guard with platform checks.
|
|
42
|
+
- Never send real signals (`process.kill`) — mock `process.on()` instead.
|
|
43
|
+
|
|
44
|
+
## Spawning & Subprocesses
|
|
45
|
+
|
|
46
|
+
- Never spawn full `nax` processes in tests — prechecks fail in temp dirs.
|
|
47
|
+
- Wrap `Bun.spawn()` in try/catch — throws `ENOENT` for missing binaries (not a failed exit code).
|
|
48
|
+
|
|
49
|
+
## Test Structure
|
|
50
|
+
|
|
51
|
+
- One `describe()` block per source function or class being tested.
|
|
52
|
+
- Keep test files under 400 lines. Split by `describe()` block if needed.
|
|
53
|
+
- Use `test/helpers/` for shared mock factories and fixtures. Don't copy-paste mocking setup between files.
|
|
54
|
+
|
|
55
|
+
## Imports
|
|
56
|
+
|
|
57
|
+
- **Import from barrels** (`src/routing`), not internal paths (`src/routing/router`).
|
|
58
|
+
- This matches the project convention and prevents Bun singleton fragmentation where the same module loaded via two different paths creates two separate instances.
|
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
# Forbidden Patterns
|
|
2
|
+
|
|
3
|
+
These patterns are **banned** from the nax codebase. Violations must be caught during implementation, not after.
|
|
4
|
+
|
|
5
|
+
## Source Code
|
|
6
|
+
|
|
7
|
+
| ❌ Forbidden | ✅ Use Instead | Why |
|
|
8
|
+
|:---|:---|:---|
|
|
9
|
+
| `mock.module()` | Dependency injection (`_deps` pattern) | Leaks globally in Bun 1.x, poisons other test files |
|
|
10
|
+
| `console.log` / `console.error` in src/ | Project logger (`src/logger`) | Unstructured output breaks test capture and log parsing |
|
|
11
|
+
| `fs.readFileSync` / `fs.writeFileSync` | `Bun.file()` / `Bun.write()` | Bun-native project — no Node.js file APIs |
|
|
12
|
+
| `child_process.spawn` / `child_process.exec` | `Bun.spawn()` / `Bun.spawnSync()` | Bun-native project — no Node.js process APIs |
|
|
13
|
+
| `setTimeout` / `setInterval` for delays | `Bun.sleep()` | Bun-native equivalent |
|
|
14
|
+
| Hardcoded timeouts in logic | Config values from schema | Hardcoded values can't be tuned per-environment |
|
|
15
|
+
| `import from "src/module/internal-file"` | `import from "src/module"` (barrel) | Prevents singleton fragmentation (BUG-035) |
|
|
16
|
+
| Files > 400 lines | Split by concern | Unmaintainable; violates project convention |
|
|
17
|
+
|
|
18
|
+
## Test Files
|
|
19
|
+
|
|
20
|
+
| ❌ Forbidden | ✅ Use Instead | Why |
|
|
21
|
+
|:---|:---|:---|
|
|
22
|
+
| Test files in `test/` root | `test/unit/`, `test/integration/`, etc. | Orphaned files with no clear ownership |
|
|
23
|
+
| Standalone bug-fix test files (`*-bug026.test.ts`) | Add to existing relevant test file | Fragments test coverage, creates ownership confusion |
|
|
24
|
+
| `TEST_COVERAGE_*.md` in test/ | `docs/` directory | Test dir is for test code only |
|
|
25
|
+
| `rm -rf` in test cleanup | `mkdtempSync` + OS temp dir | Accidental deletion risk |
|
|
26
|
+
| Tests depending on alphabetical file execution order | Independent, self-contained test files | Cross-file coupling causes phantom failures |
|
|
27
|
+
| Copy-pasted mock setup across files | `test/helpers/` shared factories | DRY; single place to update when interfaces change |
|
|
28
|
+
| Spawning full `nax` process in tests | Mock the relevant module | Prechecks fail in temp dirs; slow; flaky |
|
|
29
|
+
| Real signal sending (`process.kill`) | Mock `process.on()` | Can kill the test runner |
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
#!/usr/bin/env bash
|
|
2
|
+
# nax pre-commit hook — runs typecheck + lint
|
|
3
|
+
# Install: git config core.hooksPath .githooks
|
|
4
|
+
|
|
5
|
+
set -e
|
|
6
|
+
|
|
7
|
+
echo "[pre-commit] Running typecheck..."
|
|
8
|
+
bun run typecheck
|
|
9
|
+
|
|
10
|
+
echo "[pre-commit] Running lint..."
|
|
11
|
+
bun run lint
|
|
12
|
+
|
|
13
|
+
echo "[pre-commit] OK"
|
package/CHANGELOG.md
CHANGED
|
@@ -5,6 +5,15 @@ All notable changes to this project will be documented in this file.
|
|
|
5
5
|
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
|
6
6
|
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
|
7
7
|
|
|
8
|
+
## [0.18.4] - 2026-03-04
|
|
9
|
+
|
|
10
|
+
### Fixed
|
|
11
|
+
- **BUG-031:** Keyword classifier no longer drifts across retries — `description` excluded from complexity/strategy classification (only `title`, `acceptanceCriteria`, `tags` used). Prevents prior error context from upgrading story complexity mid-run.
|
|
12
|
+
- **BUG-033:** LLM routing now retries on timeout/transient failure. New config: `routing.llm.retries` (default: 1), `routing.llm.retryDelayMs` (default: 1000ms). Default timeout raised from 15s to 30s.
|
|
13
|
+
|
|
14
|
+
### Added
|
|
15
|
+
- Pre-commit hook (`.githooks/pre-commit`) — runs `typecheck` + `lint` before every commit. Install with: `git config core.hooksPath .githooks`
|
|
16
|
+
|
|
8
17
|
## [0.10.0] - 2026-02-23
|
|
9
18
|
|
|
10
19
|
### Added
|
package/CLAUDE.md
CHANGED
|
@@ -1,10 +1,9 @@
|
|
|
1
1
|
# nax — AI Coding Agent Orchestrator
|
|
2
2
|
|
|
3
|
-
Bun + TypeScript CLI that orchestrates AI coding agents with model routing,
|
|
3
|
+
Bun + TypeScript CLI that orchestrates AI coding agents with model routing, TDD strategies, and lifecycle hooks.
|
|
4
4
|
|
|
5
5
|
## Git Identity
|
|
6
6
|
|
|
7
|
-
Always set before committing:
|
|
8
7
|
```bash
|
|
9
8
|
git config user.name "subrina.tai"
|
|
10
9
|
git config user.email "subrina8080@outlook.com"
|
|
@@ -12,148 +11,72 @@ git config user.email "subrina8080@outlook.com"
|
|
|
12
11
|
|
|
13
12
|
## Commands
|
|
14
13
|
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
- Bun-native APIs only (Bun.file, Bun.write, Bun.spawn, Bun.sleep) — no Node.js equivalents
|
|
25
|
-
- Functional style for pure logic; classes only for stateful adapters (e.g., ClaudeCodeAdapter)
|
|
26
|
-
- Types in `types.ts` per module, barrel exports via `index.ts`
|
|
27
|
-
- Max ~400 lines per file — split if larger
|
|
28
|
-
- Biome for formatting/linting
|
|
29
|
-
|
|
30
|
-
## Testing
|
|
31
|
-
|
|
32
|
-
- Framework: `bun:test` (describe/test/expect)
|
|
33
|
-
- Unit tests: `test/unit/<module>.test.ts`
|
|
34
|
-
- Integration tests: `test/integration/<feature>.test.ts`
|
|
35
|
-
- Routing tests: `test/routing/<router>.test.ts`
|
|
36
|
-
- UI tests: `test/ui/` (TUI testing, rarely needed)
|
|
37
|
-
- All routing, classification, and isolation logic must have unit tests
|
|
14
|
+
```bash
|
|
15
|
+
bun test # Full test suite
|
|
16
|
+
bun test test/unit/foo.test.ts # Specific file
|
|
17
|
+
bun run typecheck # tsc --noEmit
|
|
18
|
+
bun run lint # Biome
|
|
19
|
+
bun run build # Production build
|
|
20
|
+
bun test && bun run typecheck # Pre-commit check
|
|
21
|
+
```
|
|
38
22
|
|
|
39
23
|
## Architecture
|
|
40
24
|
|
|
41
|
-
### Execution Flow
|
|
42
|
-
|
|
43
25
|
```
|
|
44
|
-
Runner.run() [src/execution/runner.ts]
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
-> registry.teardownAll()
|
|
26
|
+
Runner.run() [src/execution/runner.ts — thin orchestrator only]
|
|
27
|
+
→ loadPlugins()
|
|
28
|
+
→ for each story:
|
|
29
|
+
→ Pipeline.execute() [src/pipeline/pipeline.ts]
|
|
30
|
+
→ stages: queueCheck → routing → constitution → context → prompt
|
|
31
|
+
→ execution → verify → review → completion
|
|
32
|
+
→ Reporter.emit()
|
|
33
|
+
→ registry.teardownAll()
|
|
53
34
|
```
|
|
54
35
|
|
|
55
36
|
### Key Directories
|
|
56
37
|
|
|
57
38
|
| Directory | Purpose |
|
|
58
39
|
|:---|:---|
|
|
59
|
-
| `src/execution/` | Runner loop, agent adapters
|
|
60
|
-
| `src/execution/lifecycle/` |
|
|
61
|
-
| `src/execution/escalation/` |
|
|
62
|
-
| `src/execution/acceptance/` |
|
|
63
|
-
| `src/pipeline/stages/` | Pipeline stages
|
|
64
|
-
| `src/routing/` | Model routing — tier classification, router chain
|
|
65
|
-
| `src/plugins/` | Plugin system — loader, registry, validator
|
|
40
|
+
| `src/execution/` | Runner loop, agent adapters, TDD strategies |
|
|
41
|
+
| `src/execution/lifecycle/` | Lifecycle hooks, startup/teardown |
|
|
42
|
+
| `src/execution/escalation/` | Escalation logic on repeated failures |
|
|
43
|
+
| `src/execution/acceptance/` | Acceptance-loop iteration |
|
|
44
|
+
| `src/pipeline/stages/` | Pipeline stages |
|
|
45
|
+
| `src/routing/` | Model routing — tier classification, router chain |
|
|
46
|
+
| `src/plugins/` | Plugin system — loader, registry, validator |
|
|
66
47
|
| `src/config/` | Config schema, loader (layered global + project) |
|
|
67
|
-
| `src/
|
|
68
|
-
| `src/
|
|
69
|
-
| `src/
|
|
70
|
-
| `
|
|
48
|
+
| `src/agents/adapters/` | Agent integrations (Claude Code) |
|
|
49
|
+
| `src/cli/` + `src/commands/` | CLI commands (check both locations) |
|
|
50
|
+
| `src/verification/` | Test execution, smart test runner |
|
|
51
|
+
| `src/review/` | Post-verify review (typecheck, lint, plugin reviewers) |
|
|
71
52
|
|
|
72
|
-
### Plugin System
|
|
73
|
-
|
|
74
|
-
Plugins extend nax via 4 extension points:
|
|
53
|
+
### Plugin System (4 extension points)
|
|
75
54
|
|
|
76
55
|
| Extension | Interface | Integration Point |
|
|
77
56
|
|:---|:---|:---|
|
|
78
|
-
|
|
|
79
|
-
|
|
|
80
|
-
|
|
|
81
|
-
|
|
|
82
|
-
|
|
83
|
-
Plugin loading order: global (`~/.nax/plugins/`) -> project (`<workdir>/nax/plugins/`) -> config (`plugins[]` in config.json).
|
|
57
|
+
| Context Provider | `IContextProvider` | `context.ts` stage — injects into prompts |
|
|
58
|
+
| Reviewer | `IReviewer` | Review stage — after built-in checks |
|
|
59
|
+
| Reporter | `IReporter` | Runner — onRunStart/onStoryComplete/onRunEnd |
|
|
60
|
+
| Router | `IRoutingStrategy` | Router chain — overrides model routing |
|
|
84
61
|
|
|
85
62
|
### Config
|
|
86
63
|
|
|
87
|
-
- Global: `~/.nax/config.json`
|
|
88
|
-
-
|
|
89
|
-
- Key settings: `execution.contextProviderTokenBudget` (default: 2000), `plugins[]` array
|
|
90
|
-
|
|
91
|
-
## Target Architecture (v0.15.0+)
|
|
92
|
-
|
|
93
|
-
### File Size Hard Limit
|
|
94
|
-
|
|
95
|
-
**400 lines maximum per file.** If you are about to exceed it, STOP and split first.
|
|
96
|
-
|
|
97
|
-
### execution/ Module Re-architecture Goal
|
|
98
|
-
|
|
99
|
-
Keep `runner.ts` as a **thin orchestrator only**. Extract:
|
|
64
|
+
- Global: `~/.nax/config.json` → Project: `<workdir>/nax/config.json`
|
|
65
|
+
- Schema: `src/config/schema.ts` — no hardcoded flags or credentials
|
|
100
66
|
|
|
101
|
-
|
|
102
|
-
- `parallel-runner.ts` — parallel story execution (future)
|
|
103
|
-
- `acceptance-loop.ts` — retry/escalation logic for failed stories
|
|
104
|
-
- `reporter-notifier.ts` — plugin event emission (onRunStart, onStoryComplete, onRunEnd)
|
|
105
|
-
- `lifecycle/` subdir — startup, teardown, cleanup handlers
|
|
106
|
-
- `escalation/` subdir — escalation strategies when acceptance loop fails
|
|
67
|
+
## Design Principles
|
|
107
68
|
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
Do not duplicate test execution logic across pipeline stages. When building new verification features (typecheck, lint, test, acceptance checks), put the logic in `src/verification/` and call from pipeline stages. This prevents scattered test invocations and ensures consistent test result parsing.
|
|
113
|
-
|
|
114
|
-
### Plugin Extension Points
|
|
115
|
-
|
|
116
|
-
When adding new agent integrations (e.g., Devin, Aider, Cursor):
|
|
117
|
-
|
|
118
|
-
1. Add adapter class to `src/agents/adapters/<name>.ts`
|
|
119
|
-
2. Register in `src/agents/adapters/index.ts`
|
|
120
|
-
3. Do NOT inline agent logic in `runner.ts` or `claude.ts`
|
|
121
|
-
|
|
122
|
-
### Logging Style
|
|
123
|
-
|
|
124
|
-
- No emojis in log messages
|
|
125
|
-
- Use `[OK]`, `[WARN]`, `[FAIL]`, `->` instead
|
|
126
|
-
- Keep logs machine-parseable
|
|
127
|
-
|
|
128
|
-
### Configuration
|
|
129
|
-
|
|
130
|
-
- No hardcoded flags or credentials
|
|
131
|
-
- Always read from config schema (`src/config/schema.ts`)
|
|
132
|
-
- Validate config at startup
|
|
133
|
-
|
|
134
|
-
### Closure Passing for Long-Lived Handlers
|
|
135
|
-
|
|
136
|
-
Pass **closures, not values** to long-lived handlers (crash handlers, heartbeat timers). This ensures handlers always reference the latest state, not stale snapshots.
|
|
137
|
-
|
|
138
|
-
```typescript
|
|
139
|
-
// WRONG: Captures stale value
|
|
140
|
-
const handler = () => cleanup(currentStory)
|
|
141
|
-
|
|
142
|
-
// CORRECT: Closure references latest state
|
|
143
|
-
const handler = () => cleanup(() => getCurrentStory())
|
|
144
|
-
```
|
|
69
|
+
- **`runner.ts` is a thin orchestrator.** Never add new concerns — extract into focused sub-modules.
|
|
70
|
+
- **`src/verification/` is the single test execution layer.** Don't duplicate test invocation in pipeline stages.
|
|
71
|
+
- **Closures over values** for long-lived handlers (crash handlers, timers) — prevents stale state capture.
|
|
72
|
+
- **New agent adapters** go in `src/agents/adapters/<name>.ts` — never inline in runner or existing adapters.
|
|
145
73
|
|
|
146
|
-
##
|
|
74
|
+
## Rules
|
|
147
75
|
|
|
148
|
-
|
|
149
|
-
- **Integration tests that need git:** Always `git init` + `git add` + `git commit` in the test fixture before running any code that triggers nax precheck validation.
|
|
150
|
-
- **Test files for crash/signal handling:** Use process-level mocks (e.g., mock `process.on('SIGTERM', ...)`) — do not send real signals in tests.
|
|
151
|
-
- **Context files:** If a test needs specific context files, create them in the test fixture directory — don't rely on auto-detection from the real workspace.
|
|
76
|
+
Detailed coding standards, test architecture, and forbidden patterns are in `.claude/rules/`. Claude Code loads these automatically.
|
|
152
77
|
|
|
153
78
|
## IMPORTANT
|
|
154
79
|
|
|
155
|
-
-
|
|
156
|
-
-
|
|
157
|
-
-
|
|
158
|
-
- Keep commits atomic — one logical change per commit
|
|
159
|
-
- Do NOT push to remote — let the human review and push
|
|
80
|
+
- Do NOT push to remote — let the human review and push.
|
|
81
|
+
- Never hardcode API keys — agents use their own auth from env.
|
|
82
|
+
- Agent adapters spawn external processes — always handle timeouts and cleanup.
|
package/docker-compose.test.yml
CHANGED
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
version: "3.9"
|
|
2
2
|
services:
|
|
3
3
|
app:
|
|
4
|
-
image:
|
|
4
|
+
image: nathapp/bun:1.3.8-ci
|
|
5
5
|
working_dir: /app
|
|
6
6
|
volumes:
|
|
7
7
|
- .:/app
|
|
8
8
|
command: >
|
|
9
9
|
sh -c "
|
|
10
|
-
echo 'Running pre-step...' &&
|
|
11
|
-
apt-get update && apt-get install -y --no-install-recommends git &&
|
|
12
10
|
bun install &&
|
|
13
11
|
bun run test:unit
|
|
14
12
|
"
|
package/docs/ROADMAP.md
CHANGED
|
@@ -107,14 +107,14 @@
|
|
|
107
107
|
|
|
108
108
|
---
|
|
109
109
|
|
|
110
|
-
## v0.18.4 — Routing Stability
|
|
110
|
+
## v0.18.4 — Routing Stability ✅
|
|
111
111
|
|
|
112
112
|
**Theme:** Fix routing classifier consistency and LLM routing reliability
|
|
113
|
-
**Status:**
|
|
113
|
+
**Status:** ✅ Shipped (2026-03-04)
|
|
114
114
|
|
|
115
115
|
### Bugfixes
|
|
116
|
-
- [
|
|
117
|
-
- [
|
|
116
|
+
- [x] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. `priorErrors` text shifts keyword classification. **Fix:** Keyword classifier should only use original story fields; or lock `story.routing.testStrategy` once set.
|
|
117
|
+
- [x] **BUG-033:** LLM routing has no retry on timeout — single 15s attempt, then keyword fallback. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Raise default timeout to 30s for batch routing.
|
|
118
118
|
|
|
119
119
|
---
|
|
120
120
|
|
|
@@ -150,6 +150,7 @@
|
|
|
150
150
|
| Version | Theme | Date | Details |
|
|
151
151
|
|:---|:---|:---|:---|
|
|
152
152
|
| v0.18.1 | Type Safety + CI Pipeline | 2026-03-03 | 60 TS errors + 12 lint errors fixed, GitLab CI green (1952/56/0) |
|
|
153
|
+
| v0.18.4 | Routing Stability | 2026-03-04 | BUG-031 keyword drift, BUG-033 LLM retry, pre-commit hook |
|
|
153
154
|
| v0.18.3 | Execution Reliability + Smart Runner | 2026-03-04 | BUG-026/028/029/030/032 + SFC-001/002 + STR-007, all items complete |
|
|
154
155
|
| v0.18.2 | Smart Test Runner + Routing Fix | 2026-03-03 | FIX-001 + STR-001–006, 2038 pass/11 skip/0 fail |
|
|
155
156
|
| v0.18.0 | Orchestration Quality | 2026-03-03 | BUG-016/017/018/019/020/021/022/023/025 all fixed |
|
|
@@ -186,28 +187,9 @@
|
|
|
186
187
|
- [x] ~~BUG-012: Greenfield detection ignores pre-existing test files~~
|
|
187
188
|
- [x] ~~BUG-013: Escalation routing not applied in iterations~~
|
|
188
189
|
- [x] ~~BUG-014: buildAllowedEnv() strips USER/LOGNAME~~
|
|
189
|
-
<<<<<<< Updated upstream
|
|
190
190
|
- [x] ~~**BUG-015:** `loadConstitution()` leaks global `~/.nax/constitution.md` into unit tests — fixed via `skipGlobal: true` in all unit tests~~
|
|
191
|
-
|
|
192
|
-
- [
|
|
193
|
-
- [ ] **BUG-027:** `runPrecheck()` always prints to stdout — pollutes test output when called programmatically.
|
|
194
|
-
- **Observed (2026-03-03):** `bun test` output starts with precheck JSON from `US-002-orchestrator.test.ts` calling `runPrecheck()`, which unconditionally calls `console.log()`. nax verify stage captures this, making every failure look like a `git-repo-exists` blocker.
|
|
195
|
-
- **Root cause:** `runPrecheck()` mixes side-effects (printing) with logic (returning result).
|
|
196
|
-
- **Fix:** Add `silent?: boolean` to `PrecheckOptions`; test callers pass `silent: true`.
|
|
197
|
-
- **Workaround (active):** `silent` option + test update shipped in v0.18.2 branch.
|
|
198
|
-
- **Target:** v0.18.2
|
|
199
|
-
- [ ] **BUG-028:** Routing cache ignores escalation tier — escalated stories re-run at original tier.
|
|
200
|
-
- **Observed (2026-03-03):** STR-006 escalated to `powerful`. Router returned LLM cache hit from prior `balanced` run → agent ran as `balanced` anyway.
|
|
201
|
-
- **Root cause:** Cache key does not include requested tier. Lower-tier cache hit served for higher-tier request.
|
|
202
|
-
- **Fix:** Include `requestedTier` in cache key; only serve cache hit if cached tier >= requested tier.
|
|
203
|
-
- **Target:** v0.19.0
|
|
204
|
-
- [ ] **BUG-026:** Regression gate failure triggers full story re-implementation instead of targeted rectification.
|
|
205
|
-
- **Observed (2026-03-03):** During v0.18.2 smart-runner development on Mac01, STR-001 passed scoped verification (5/5 tests green) but the full-suite regression gate timed out (exit code 132, SIGILL/Bun crash). nax treated this as a story failure and re-ran the coding agent, which rewrote already-correct code. The retry agent then produced a different (worse) implementation that failed verification.
|
|
206
|
-
- **Root cause:** Escalation logic does not distinguish between "story code is wrong" and "story code is fine but introduced a regression". Both flow through the same retry path.
|
|
207
|
-
- **Fix:** After regression gate failure, spawn a rectification agent with context of what regressed (failing test names + diff), not a full story re-implementation. Only fall back to full re-implementation if rectification also fails.
|
|
208
|
-
- **Workaround (active):** Disabled regression gate via `rectification.enabled: false` in project nax/config.json for self-dev runs. CI on VPS is the regression gate instead.
|
|
209
|
-
- **Target:** v0.19.0
|
|
210
|
-
>>>>>>> Stashed changes
|
|
191
|
+
- [x] ~~**BUG-027:** `runPrecheck()` always prints to stdout — pollutes test output when called programmatically. Shipped in v0.18.2.~~
|
|
192
|
+
- [x] ~~**BUG-028:** Routing cache ignores escalation tier — escalated stories re-run at original tier. Shipped in v0.18.3.~~
|
|
211
193
|
- [x] ~~**BUG-016:** Hardcoded 120s timeout in pipeline verify stage → fixed in v0.18.0~~
|
|
212
194
|
- [x] ~~**BUG-017:** run.complete not emitted on SIGTERM → fixed in v0.18.0~~
|
|
213
195
|
- [x] ~~**BUG-018:** Test-writer wastes ~3min/retry when tests already exist → fixed in v0.18.0~~
|
|
@@ -220,9 +202,9 @@
|
|
|
220
202
|
|
|
221
203
|
- [x] **BUG-029:** Escalation resets story to `pending` → bypasses BUG-022 retry priority. `handleTierEscalation()` sets `status: "pending"` after escalation, but `getNextStory()` Priority 1 only checks `status === "failed"`. Result: after BUG-026 escalated (iter 1), nax moved to BUG-028 (iter 2) instead of retrying BUG-026 immediately. **Location:** `src/prd/index.ts:getNextStory()` + `src/execution/escalation/tier-escalation.ts`. **Fix:** `getNextStory()` should also prioritize stories with `story.routing.modelTier` that changed since last attempt (escalation marker), or `handleTierEscalation` should use a distinct status like `"retry-pending"` that Priority 1 recognizes.
|
|
222
204
|
- [x] **BUG-030:** Review lint failure → hard `"fail"`, no rectification or retry. `src/pipeline/stages/review.ts:92` returns `{ action: "fail" }` for all review failures including lint. In `pipeline-result-handler.ts`, `"fail"` calls `markStoryFailed()` — permanently dead. But lint errors are auto-fixable (agent can run `biome check --fix`). Contrast with verify stage which returns `"escalate"` on test failure, allowing retry. SFC-001 and SFC-002 both hit this — tests passed but 5 Biome lint errors killed the stories permanently. **Fix:** Review stage should return `"escalate"` (not `"fail"`) for lint/typecheck failures, or add a review-rectification loop (like verify has) that gives the agent one retry with the lint output as context. Reserve `"fail"` for unfixable review issues (e.g. plugin reviewer rejection).
|
|
223
|
-
- [
|
|
205
|
+
- [x] **BUG-031:** Keyword fallback classifier gives inconsistent strategy across retries for same story. BUG-026 was classified as `test-after` on iter 1 (keyword fallback), but `three-session-tdd-lite` on iter 5 (same keyword fallback). The keyword classifier in `src/routing/strategies/keyword.ts:classifyComplexity()` may be influenced by `priorErrors` text added between attempts, shifting the keyword match result. **Location:** `src/routing/strategies/keyword.ts`. **Fix:** Keyword classifier should only consider the story's original title + description + acceptance criteria, not accumulated `priorErrors` or `priorFailures`. Alternatively, once a strategy is set in `story.routing.testStrategy`, the routing stage should preserve it across retries (already partially done in `routing.ts:40-41` but may not apply when LLM falls back to keyword).
|
|
224
206
|
- [x] **BUG-032:** Routing stage overrides escalated `modelTier` with complexity-derived tier. `src/pipeline/stages/routing.ts:43` always runs `complexityToModelTier(routing.complexity, config)` even when `story.routing.modelTier` was explicitly set by `handleTierEscalation()`. BUG-026 was escalated to `balanced` (logged in iteration header), but `Task classified` shows `modelTier=fast` because `complexityToModelTier("simple", config)` → `"fast"`. Related to BUG-013 (escalation routing not applied) which was marked fixed, but the fix in `applyCachedRouting()` in `pipeline-result-handler.ts:295-310` runs **after** the routing stage — too late. **Location:** `src/pipeline/stages/routing.ts:43`. **Fix:** When `story.routing.modelTier` is explicitly set (by escalation), skip `complexityToModelTier()` and use the cached tier directly. Only derive from complexity when `story.routing.modelTier` is absent.
|
|
225
|
-
- [
|
|
207
|
+
- [x] **BUG-033:** LLM routing has no retry on timeout — single attempt with hardcoded 15s default. All 5 LLM routing attempts in the v0.18.3 run timed out at 15s, forcing keyword fallback every time. `src/routing/strategies/llm.ts:63` reads `llmConfig?.timeoutMs ?? 15000` but there's no retry logic — one timeout = immediate fallback. **Location:** `src/routing/strategies/llm.ts:callLlm()`. **Fix:** Add `routing.llm.retries` config (default: 1) with backoff. Also surface `routing.llm.timeoutMs` in `nax config --explain` and consider raising default to 30s for batch routing which processes multiple stories.
|
|
226
208
|
|
|
227
209
|
### Features
|
|
228
210
|
- [x] ~~`nax unlock` command~~
|
package/package.json
CHANGED
package/src/config/schemas.ts
CHANGED
|
@@ -212,6 +212,8 @@ const LlmRoutingConfigSchema = z.object({
|
|
|
212
212
|
mode: z.enum(["one-shot", "per-story", "hybrid"]).optional(),
|
|
213
213
|
batchMode: z.boolean().optional(), // deprecated, for backward compat
|
|
214
214
|
timeoutMs: z.number().int().positive({ message: "llm.timeoutMs must be > 0" }).optional(),
|
|
215
|
+
retries: z.number().int().min(0, { message: "llm.retries must be >= 0" }).optional(),
|
|
216
|
+
retryDelayMs: z.number().int().min(0, { message: "llm.retryDelayMs must be >= 0" }).optional(),
|
|
215
217
|
});
|
|
216
218
|
|
|
217
219
|
const RoutingConfigSchema = z
|
package/src/config/types.ts
CHANGED
|
@@ -365,8 +365,12 @@ export interface LlmRoutingConfig {
|
|
|
365
365
|
mode?: LlmRoutingMode;
|
|
366
366
|
/** @deprecated Use mode instead. Will be removed in v1.0 */
|
|
367
367
|
batchMode?: boolean;
|
|
368
|
-
/** Timeout for LLM call in milliseconds (default: 15000) */
|
|
368
|
+
/** Timeout for LLM call in milliseconds (default: 30000) */
|
|
369
369
|
timeoutMs?: number;
|
|
370
|
+
/** Number of retries on LLM timeout or transient failure (default: 1) */
|
|
371
|
+
retries?: number;
|
|
372
|
+
/** Delay between retries in milliseconds (default: 1000) */
|
|
373
|
+
retryDelayMs?: number;
|
|
370
374
|
}
|
|
371
375
|
|
|
372
376
|
/** Routing config */
|
package/src/execution/post-verify.ts
CHANGED
|
@@ -26,7 +26,7 @@ function buildStructuredFailure(
|
|
|
26
26
|
): StructuredFailure {
|
|
27
27
|
const testFailures =
|
|
28
28
|
verificationResult.status === "TEST_FAILURE" && verificationResult.output
|
|
29
|
-
? parseBunTestOutput(verificationResult.output).failures.map((f) => ({
|
|
29
|
+
? _postVerifyDeps.parseBunTestOutput(verificationResult.output).failures.map((f) => ({
|
|
30
30
|
file: f.file,
|
|
31
31
|
testName: f.testName,
|
|
32
32
|
error: f.error,
|
|
@@ -121,9 +121,9 @@ export async function runPostAgentVerification(opts: PostVerifyOptions): Promise
|
|
|
121
121
|
const testCommand = scopeTestCommand(config.quality.commands.test, changedTestFiles);
|
|
122
122
|
const timeoutRetryCount = timeoutRetryCountMap.get(story.id) || 0;
|
|
123
123
|
|
|
124
|
-
const verificationResult = await runVerification({
|
|
124
|
+
const verificationResult = await _postVerifyDeps.runVerification({
|
|
125
125
|
workingDirectory: workdir,
|
|
126
|
-
expectedFiles: getExpectedFiles(story),
|
|
126
|
+
expectedFiles: _postVerifyDeps.getExpectedFiles(story),
|
|
127
127
|
command: testCommand,
|
|
128
128
|
timeoutSeconds: config.execution.verificationTimeoutSeconds,
|
|
129
129
|
forceExit: config.quality.forceExit,
|
|
@@ -141,7 +141,7 @@ export async function runPostAgentVerification(opts: PostVerifyOptions): Promise
|
|
|
141
141
|
if (verificationResult.success) {
|
|
142
142
|
logger?.info("verification", "Scoped verification passed");
|
|
143
143
|
if (verificationResult.output) {
|
|
144
|
-
const analysis = parseTestOutput(verificationResult.output, 0);
|
|
144
|
+
const analysis = _postVerifyDeps.parseTestOutput(verificationResult.output, 0);
|
|
145
145
|
if (analysis.passCount > 0) {
|
|
146
146
|
logger?.debug("verification", "Scoped test results", {
|
|
147
147
|
passCount: analysis.passCount,
|
|
@@ -175,7 +175,7 @@ export async function runPostAgentVerification(opts: PostVerifyOptions): Promise
|
|
|
175
175
|
regressionVerificationResult,
|
|
176
176
|
"Full-suite regression detected",
|
|
177
177
|
);
|
|
178
|
-
const updatedPrd = await revertStoriesOnFailure({
|
|
178
|
+
const updatedPrd = await _postVerifyDeps.revertStoriesOnFailure({
|
|
179
179
|
prd,
|
|
180
180
|
prdPath,
|
|
181
181
|
story,
|
|
@@ -193,7 +193,7 @@ export async function runPostAgentVerification(opts: PostVerifyOptions): Promise
|
|
|
193
193
|
// Attempt rectification if enabled and tests failed (not timeout/env)
|
|
194
194
|
const isTestFailure = verificationResult.status === "TEST_FAILURE" && verificationResult.output;
|
|
195
195
|
if (rectificationEnabled && isTestFailure && verificationResult.output) {
|
|
196
|
-
const fixed = await runRectificationLoop({
|
|
196
|
+
const fixed = await _postVerifyDeps.runRectificationLoop({
|
|
197
197
|
config,
|
|
198
198
|
workdir,
|
|
199
199
|
story,
|
|
@@ -222,7 +222,7 @@ export async function runPostAgentVerification(opts: PostVerifyOptions): Promise
|
|
|
222
222
|
// Revert stories and save
|
|
223
223
|
const diagnosticContext = verificationResult.error || `Verification failed: ${verificationResult.status}`;
|
|
224
224
|
const verifyFailure = buildStructuredFailure(story, "verify", verificationResult, diagnosticContext);
|
|
225
|
-
const updatedPrd = await revertStoriesOnFailure({
|
|
225
|
+
const updatedPrd = await _postVerifyDeps.revertStoriesOnFailure({
|
|
226
226
|
prd,
|
|
227
227
|
prdPath,
|
|
228
228
|
story,
|
|
@@ -263,9 +263,9 @@ async function runRegressionGate(
|
|
|
263
263
|
|
|
264
264
|
logger?.info("regression-gate", "Running full-suite regression gate");
|
|
265
265
|
const fullSuiteCommand = config.quality.commands.test ?? "bun test";
|
|
266
|
-
const regressionResult = await runVerification({
|
|
266
|
+
const regressionResult = await _postVerifyDeps.runVerification({
|
|
267
267
|
workingDirectory: workdir,
|
|
268
|
-
expectedFiles: getExpectedFiles(story),
|
|
268
|
+
expectedFiles: _postVerifyDeps.getExpectedFiles(story),
|
|
269
269
|
command: fullSuiteCommand,
|
|
270
270
|
timeoutSeconds: config.execution.regressionGate.timeoutSeconds,
|
|
271
271
|
forceExit: config.quality.forceExit,
|
|
@@ -297,7 +297,7 @@ async function runRegressionGate(
|
|
|
297
297
|
// Attempt rectification on regression failures
|
|
298
298
|
const isTestFailure = regressionResult.status === "TEST_FAILURE" && regressionResult.output;
|
|
299
299
|
if (rectificationEnabled && isTestFailure && regressionResult.output) {
|
|
300
|
-
const fixed = await runRectificationLoop({
|
|
300
|
+
const fixed = await _postVerifyDeps.runRectificationLoop({
|
|
301
301
|
config,
|
|
302
302
|
workdir,
|
|
303
303
|
story,
|
|
@@ -321,10 +321,12 @@ function checkEnvironmentalEscalation(
|
|
|
321
321
|
logger: ReturnType<typeof getSafeLogger>,
|
|
322
322
|
): void {
|
|
323
323
|
const currentTier = story.routing?.modelTier || config.autoMode.escalation.tierOrder[0]?.tier;
|
|
324
|
-
const tierCfg = currentTier
|
|
324
|
+
const tierCfg = currentTier
|
|
325
|
+
? _postVerifyDeps.getTierConfig(currentTier, config.autoMode.escalation.tierOrder)
|
|
326
|
+
: undefined;
|
|
325
327
|
if (!tierCfg) return;
|
|
326
328
|
|
|
327
|
-
const threshold = getEnvironmentalEscalationThreshold(
|
|
329
|
+
const threshold = _postVerifyDeps.getEnvironmentalEscalationThreshold(
|
|
328
330
|
tierCfg.attempts,
|
|
329
331
|
config.quality.environmentalEscalationDivisor,
|
|
330
332
|
);
|
|
@@ -336,3 +338,19 @@ function checkEnvironmentalEscalation(
|
|
|
336
338
|
});
|
|
337
339
|
}
|
|
338
340
|
}
|
|
341
|
+
|
|
342
|
+
/**
|
|
343
|
+
* Swappable dependencies for testing (avoids mock.module() which leaks in Bun 1.x).
|
|
344
|
+
*/
|
|
345
|
+
export const _postVerifyDeps = {
|
|
346
|
+
parseBunTestOutput,
|
|
347
|
+
parseTestOutput,
|
|
348
|
+
runVerification,
|
|
349
|
+
getExpectedFiles,
|
|
350
|
+
savePRD,
|
|
351
|
+
revertStoriesOnFailure,
|
|
352
|
+
runRectificationLoop,
|
|
353
|
+
appendProgress,
|
|
354
|
+
getTierConfig,
|
|
355
|
+
getEnvironmentalEscalationThreshold,
|
|
356
|
+
};
|