nano-brain 2026.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. package/AGENTS_SNIPPET.md +36 -0
  2. package/CHANGELOG.md +68 -0
  3. package/README.md +281 -0
  4. package/SKILL.md +153 -0
  5. package/bin/cli.js +18 -0
  6. package/index.html +929 -0
  7. package/nano-brain +4 -0
  8. package/opencode-mcp.json +9 -0
  9. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/.openspec.yaml +2 -0
  10. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/design.md +68 -0
  11. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/proposal.md +27 -0
  12. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/mcp-integration-testing/spec.md +50 -0
  13. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/mcp-server/spec.md +40 -0
  14. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/search-pipeline/spec.md +29 -0
  15. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/tasks.md +37 -0
  16. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/.openspec.yaml +2 -0
  17. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/design.md +111 -0
  18. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/proposal.md +30 -0
  19. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/mcp-server/spec.md +33 -0
  20. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/storage-limits/spec.md +90 -0
  21. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/workspace-scoping/spec.md +66 -0
  22. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/tasks.md +199 -0
  23. package/openspec/changes/codebase-indexing/.openspec.yaml +2 -0
  24. package/openspec/changes/codebase-indexing/design.md +169 -0
  25. package/openspec/changes/codebase-indexing/proposal.md +30 -0
  26. package/openspec/changes/codebase-indexing/specs/codebase-collection/spec.md +187 -0
  27. package/openspec/changes/codebase-indexing/specs/mcp-server/spec.md +36 -0
  28. package/openspec/changes/codebase-indexing/tasks.md +56 -0
  29. package/openspec/specs/mcp-integration-testing/spec.md +50 -0
  30. package/openspec/specs/mcp-server/spec.md +75 -0
  31. package/openspec/specs/search-pipeline/spec.md +29 -0
  32. package/openspec/specs/storage-limits/spec.md +94 -0
  33. package/openspec/specs/workspace-scoping/spec.md +70 -0
  34. package/package.json +34 -0
  35. package/site/build.js +66 -0
  36. package/site/partials/_api.html +83 -0
  37. package/site/partials/_compare.html +100 -0
  38. package/site/partials/_config.html +23 -0
  39. package/site/partials/_features.html +43 -0
  40. package/site/partials/_footer.html +6 -0
  41. package/site/partials/_hero.html +9 -0
  42. package/site/partials/_how-it-works.html +26 -0
  43. package/site/partials/_models.html +18 -0
  44. package/site/partials/_quick-start.html +15 -0
  45. package/site/partials/_stats.html +1 -0
  46. package/site/partials/_tech-stack.html +13 -0
  47. package/site/script.js +12 -0
  48. package/site/shell.html +44 -0
  49. package/site/styles.css +548 -0
  50. package/src/chunker.ts +427 -0
  51. package/src/codebase.ts +331 -0
  52. package/src/collections.ts +192 -0
  53. package/src/embeddings.ts +293 -0
  54. package/src/expansion.ts +79 -0
  55. package/src/harvester.ts +306 -0
  56. package/src/index.ts +503 -0
  57. package/src/reranker.ts +103 -0
  58. package/src/search.ts +294 -0
  59. package/src/server.ts +664 -0
  60. package/src/storage.ts +221 -0
  61. package/src/store.ts +623 -0
  62. package/src/types.ts +202 -0
  63. package/src/watcher.ts +384 -0
  64. package/test/chunker.test.ts +479 -0
  65. package/test/cli.test.ts +309 -0
  66. package/test/codebase-chunker.test.ts +446 -0
  67. package/test/codebase.test.ts +678 -0
  68. package/test/collections.test.ts +571 -0
  69. package/test/harvester.test.ts +636 -0
  70. package/test/integration.test.ts +150 -0
  71. package/test/llm.test.ts +322 -0
  72. package/test/search.test.ts +572 -0
  73. package/test/server.test.ts +541 -0
  74. package/test/storage.test.ts +302 -0
  75. package/test/store.test.ts +465 -0
  76. package/test/watcher.test.ts +656 -0
  77. package/test/workspace.test.ts +239 -0
  78. package/tsconfig.json +19 -0
  79. package/vitest.config.ts +16 -0
package/nano-brain ADDED
@@ -0,0 +1,4 @@
1
+ #!/usr/bin/env bash
2
+ set -e
3
+ DIR="$(cd "$(dirname "$0")" && pwd)"
4
+ exec bun "$DIR/src/index.ts" "$@"
@@ -0,0 +1,9 @@
1
+ {
2
+ "mcp": {
3
+ "nano-brain": {
4
+ "type": "local",
5
+ "command": ["bun", "run", "/path/to/nano-brain/src/index.ts", "mcp"],
6
+ "description": "Memory system with hybrid search (BM25 + vector + LLM reranking)"
7
+ }
8
+ }
9
+ }
@@ -0,0 +1,2 @@
1
+ schema: spec-driven
2
+ created: 2026-02-16
@@ -0,0 +1,68 @@
1
+ ## Context
2
+
3
+ nano-brain is an MCP server that provides hybrid search (BM25 + vector) over indexed markdown documents. It runs as a subprocess spawned by OpenCode via stdio transport. The server uses better-sqlite3 with FTS5 for full-text search and sqlite-vec for vector search.
4
+
5
+ Three categories of bugs surfaced during real-world testing that unit tests failed to catch:
6
+
7
+ 1. **FTS5 query injection** — User queries passed directly to FTS5 `MATCH` without sanitization. FTS5 interprets bare words matching column names (`filepath`, `title`, `body`) as column references, and hyphens as `NOT` operators.
8
+ 2. **ESM/CJS mismatch** — `require('crypto')` in an ESM module crashes under Node.js. Tests use vitest which transpiles CJS-style requires, masking the error.
9
+ 3. **Static config loading** — Collections loaded once at server startup. Adding a collection requires server restart.
10
+
11
+ ## Goals / Non-Goals
12
+
13
+ **Goals:**
14
+ - All MCP tool handlers work correctly when called through real JSON-RPC over stdio
15
+ - User queries with hyphens, special characters, and FTS5 reserved words search correctly
16
+ - Zero `require()` calls in any `.ts` source file
17
+ - Integration tests that catch runtime errors before deployment
18
+
19
+ **Non-Goals:**
20
+ - Changing the MCP tool API or adding new tools
21
+ - Improving search relevance or ranking algorithms
22
+ - Adding new features beyond bug fixes
23
+
24
+ ## Decisions
25
+
26
+ ### D1: FTS5 Query Sanitization Strategy
27
+
28
+ **Initial proposal (superseded by the final decision below):** Quote each search term individually and join with implicit AND.
29
+
30
+ **Rationale:** FTS5 has complex query syntax where bare words can be interpreted as column names, `AND`/`OR`/`NOT`/`NEAR` as operators, and `-` as NOT prefix. Rather than trying to escape individual characters, we split the user query into tokens, wrap each in double quotes (`"term"`), and join them. This ensures every token is treated as a literal search term.
31
+
32
+ **Example:** `nano-brain architecture` → `"nano" "brain" "architecture"` (hyphen splits into separate quoted terms since FTS5 treats `-` as NOT)
33
+
34
+ Actually, better approach: wrap the entire query in double quotes to preserve phrases, and escape any internal double quotes.
35
+
36
+ **Final decision:** `nano-brain architecture` → `"nano-brain architecture"` (single quoted phrase). Escape internal `"` as `""`.
37
+
38
+ **Alternatives considered:**
39
+ - Column-prefixed queries (`body:"query"`) — too restrictive, we want to search all columns
40
+ - FTS5 `simple` tokenizer — would lose porter stemming benefits
41
+
42
+ ### D2: Integration Test Approach
43
+
44
+ **Decision:** Create a test helper that starts the real MCP server in-process, sends JSON-RPC messages, and asserts on responses. Use a temporary SQLite database with pre-indexed test documents.
45
+
46
+ **Rationale:** The current tests mock `Store` and `SearchProviders`, which means the actual SQL queries, FTS5 interactions, and parameter binding are never tested. An integration test that uses a real database catches the exact class of bugs we hit.
47
+
48
+ **Alternatives considered:**
49
+ - Subprocess-based testing (spawn `node bin/cli.js mcp` and pipe JSON-RPC) — too slow, harder to debug
50
+ - Just adding more unit tests — wouldn't catch ESM/CJS issues or SQL parameter binding bugs
51
+
52
+ ### D3: ESM Compliance Audit
53
+
54
+ **Decision:** `grep -r "require(" src/` as a CI-enforceable check. All imports must use ESM `import` syntax.
55
+
56
+ **Rationale:** vitest transpiles `require()` calls, so they work in tests but fail at runtime under Node.js ESM. A simple grep catches this at lint time.
57
+
58
+ ### D4: Dynamic Config Reload in memory_update
59
+
60
+ **Decision:** `memory_update` tool handler reloads `config.yml` on every invocation instead of using the cached startup value.
61
+
62
+ **Rationale:** Users add collections via CLI (`collection add`) which writes to config.yml. The MCP server is a long-running process — it shouldn't require restart to see new collections. The config file is tiny (< 1KB), so re-reading it on each update call has negligible cost.
63
+
64
+ ## Risks / Trade-offs
65
+
66
+ - **FTS5 quoting may reduce search flexibility** — Users can't use FTS5 advanced syntax (OR, NEAR, column filters). This is acceptable because MCP tool users are AI agents, not power users writing FTS5 queries. → Mitigation: If needed later, add a `raw` parameter to bypass sanitization.
67
+ - **Integration tests add test runtime** — Real SQLite operations are slower than mocks. → Mitigation: Keep integration test count small (5-10 critical paths), run unit tests separately.
68
+ - **Dynamic config reload on every update** — Tiny performance cost. → Mitigation: Config file is < 1KB, `fs.readFileSync` + YAML parse is < 1ms.
@@ -0,0 +1,27 @@
1
+ ## Why
2
+
3
+ The MCP server has multiple runtime bugs that only surface in real usage (not caught by unit tests). The server crashes or returns SQL errors when users call the tools through OpenCode. These bugs exist because: (1) tests mock internals rather than exercising real code paths, (2) ESM/CJS incompatibilities weren't caught, and (3) FTS5 query syntax isn't sanitized.
4
+
5
+ ## What Changes
6
+
7
+ - Fix FTS5 search query sanitization — raw user queries containing words that match FTS5 syntax (column names, operators) cause `no such column` errors. Queries must be properly quoted/escaped before passing to `MATCH`.
8
+ - Remove all `require()` calls from ESM source files — `require('crypto')` in `server.ts` crashes at runtime under Node.js ESM mode. All CJS-style requires must use ESM `import`.
9
+ - Add integration tests that exercise real MCP tool handlers end-to-end — current tests mock store/server internals, missing runtime errors like the above.
10
+ - Fix `memory_update` to reload collection config dynamically — already patched but needs proper test coverage.
11
+ - Audit all prepared statements for parameter binding correctness — ensure collection filter parameters are bound in the right order.
12
+
13
+ ## Capabilities
14
+
15
+ ### New Capabilities
16
+ - `mcp-integration-testing`: End-to-end integration tests that start the real MCP server, send JSON-RPC requests, and verify responses against a real SQLite database with indexed documents.
17
+
18
+ ### Modified Capabilities
19
+ - `search-pipeline`: Fix FTS5 query sanitization so user queries with hyphens, special characters, and words matching FTS5 column names don't cause SQL errors.
20
+ - `mcp-server`: Fix ESM compatibility (no `require()`), fix dynamic config reload in `memory_update`, ensure all tool handlers work under Node.js/tsx runtime.
21
+
22
+ ## Impact
23
+
24
+ - **src/store.ts**: `searchFTS()` — must sanitize/quote FTS5 MATCH queries
25
+ - **src/server.ts**: Remove `require('crypto')` (already done), verify all tool handlers work end-to-end
26
+ - **test/**: New integration test file exercising real MCP tool calls against real DB
27
+ - **No breaking changes** — all fixes are internal, MCP tool API unchanged
@@ -0,0 +1,50 @@
1
+ ## ADDED Requirements
2
+
3
+ ### Requirement: Integration test infrastructure
4
+ The project SHALL have an integration test file that exercises MCP tool handlers against a real SQLite database with real FTS5 indexes and real sqlite-vec tables.
5
+
6
+ #### Scenario: Test setup creates real database with indexed documents
7
+ - **WHEN** the integration test suite starts
8
+ - **THEN** a temporary SQLite database is created with sqlite-vec loaded
9
+ - **THEN** at least 2 test documents are indexed with FTS5 entries
10
+ - **THEN** the MCP server's tool handlers are initialized with the real store
11
+
12
+ #### Scenario: Test teardown cleans up
13
+ - **WHEN** the integration test suite completes
14
+ - **THEN** the temporary database file is deleted
15
+ - **THEN** no test artifacts remain on disk
16
+
17
+ ### Requirement: Search integration tests
18
+ Integration tests SHALL verify that `memory_search` works end-to-end with real FTS5 queries.
19
+
20
+ #### Scenario: Search finds indexed document
21
+ - **WHEN** `memory_search` handler is called with a query matching an indexed document
22
+ - **THEN** the response contains the matching document with title, path, and snippet
23
+
24
+ #### Scenario: Search with hyphenated query
25
+ - **WHEN** `memory_search` handler is called with query `nano-brain`
26
+ - **THEN** the response completes without error
27
+ - **THEN** results include documents containing the term
28
+
29
+ #### Scenario: Search with collection filter
30
+ - **WHEN** `memory_search` handler is called with a collection filter
31
+ - **THEN** only documents from that collection are returned
32
+
33
+ #### Scenario: Search with empty query
34
+ - **WHEN** `memory_search` handler is called with an empty string query
35
+ - **THEN** the response returns empty results without error
36
+
37
+ ### Requirement: Update integration tests
38
+ Integration tests SHALL verify that `memory_update` works end-to-end.
39
+
40
+ #### Scenario: Update indexes new files
41
+ - **WHEN** a new markdown file is added to a collection directory
42
+ - **THEN** calling the `memory_update` handler indexes the new file
43
+ - **THEN** the file is searchable via `memory_search`
44
+
45
+ ### Requirement: Status integration tests
46
+ Integration tests SHALL verify that `memory_status` returns accurate information.
47
+
48
+ #### Scenario: Status reflects indexed documents
49
+ - **WHEN** documents have been indexed
50
+ - **THEN** `memory_status` handler returns correct document count and collection info
@@ -0,0 +1,40 @@
1
+ ## MODIFIED Requirements
2
+
3
+ ### Requirement: ESM module compliance
4
+ All source files in `src/` SHALL use ESM `import` syntax exclusively. No `require()` calls SHALL exist in any TypeScript source file.
5
+
6
+ #### Scenario: Server starts under Node.js ESM runtime
7
+ - **WHEN** the MCP server is started via `node bin/cli.js mcp`
8
+ - **THEN** the server starts without `require is not defined` errors
9
+ - **THEN** all tool handlers execute without CJS/ESM compatibility errors
10
+
11
+ #### Scenario: No require() in source files
12
+ - **WHEN** running `grep -r "require(" src/` on the source directory
13
+ - **THEN** zero matches are returned (excluding comments and string literals)
14
+
15
+ ### Requirement: Dynamic collection config reload
16
+ The `memory_update` tool handler SHALL reload the collection configuration file on every invocation, not use the cached startup value.
17
+
18
+ #### Scenario: Collection added after server start
19
+ - **WHEN** a user adds a collection via CLI (`collection add`) while the MCP server is running
20
+ - **THEN** calling `memory_update` through MCP indexes documents from the newly added collection
21
+ - **THEN** no server restart is required
22
+
23
+ #### Scenario: Collection removed after server start
24
+ - **WHEN** a user removes a collection via CLI while the MCP server is running
25
+ - **THEN** calling `memory_update` through MCP no longer indexes documents from the removed collection
26
+
27
+ ### Requirement: All MCP tool handlers return valid responses
28
+ Every registered MCP tool SHALL return a valid JSON-RPC response for valid inputs, never an unhandled exception.
29
+
30
+ #### Scenario: memory_search with valid query
31
+ - **WHEN** `memory_search` is called with `{"query": "test"}` via JSON-RPC
32
+ - **THEN** a valid response with `content` array is returned
33
+
34
+ #### Scenario: memory_update with configured collections
35
+ - **WHEN** `memory_update` is called via JSON-RPC with collections configured
36
+ - **THEN** a valid response with reindex summary is returned, not a runtime error
37
+
38
+ #### Scenario: memory_status returns health info
39
+ - **WHEN** `memory_status` is called via JSON-RPC
40
+ - **THEN** a valid response with document count, chunk count, and collection info is returned
@@ -0,0 +1,29 @@
1
+ ## MODIFIED Requirements
2
+
3
+ ### Requirement: FTS5 query sanitization
4
+ The `searchFTS` function SHALL sanitize user queries before passing them to FTS5 `MATCH`. All user-provided query strings MUST be treated as literal search text, never as FTS5 syntax.
5
+
6
+ #### Scenario: Query containing hyphenated words
7
+ - **WHEN** user searches for `nano-brain`
8
+ - **THEN** the search treats the entire hyphenated term as a literal phrase, not as `nano NOT brain`
9
+
10
+ #### Scenario: Query containing FTS5 column names
11
+ - **WHEN** user searches for `memory architecture`
12
+ - **THEN** the search treats `memory` as a search term, not as a column reference
13
+ - **THEN** no `no such column` error is thrown
14
+
15
+ #### Scenario: Query containing FTS5 operators
16
+ - **WHEN** user searches for `AND OR NOT NEAR`
17
+ - **THEN** the search treats these as literal words, not as FTS5 boolean operators
18
+
19
+ #### Scenario: Query containing double quotes
20
+ - **WHEN** user searches for `he said "hello"`
21
+ - **THEN** internal double quotes are escaped and the search completes without SQL error
22
+
23
+ #### Scenario: Empty or whitespace-only query
24
+ - **WHEN** user searches for ` ` or empty string
25
+ - **THEN** the search returns an empty result set without error
26
+
27
+ #### Scenario: Normal multi-word query
28
+ - **WHEN** user searches for `sqlite vector search`
29
+ - **THEN** the search returns documents containing those terms, ranked by BM25 relevance
@@ -0,0 +1,37 @@
1
+ ## 1. FTS5 Query Sanitization
2
+
3
+ - [x] 1.1 Create `sanitizeFTS5Query(query: string): string` helper in `src/store.ts` that wraps user input in double quotes and escapes internal double quotes
4
+ - [x] 1.2 Handle edge cases: empty/whitespace-only queries return empty string, hyphenated words preserved as phrases
5
+ - [x] 1.3 Apply `sanitizeFTS5Query` in `searchFTS()` before passing to prepared statements
6
+ - [x] 1.4 Add unit tests for `sanitizeFTS5Query`: normal words, hyphens, FTS5 operators, double quotes, empty input, column name words
7
+ - [x] 1.5 Verify `memory_search` works via MCP with query `nano-brain architecture` (manual test)
8
+
9
+ ## 2. ESM Compliance
10
+
11
+ - [x] 2.1 Audit all `src/*.ts` files for `require()` calls — confirm the `require('crypto')` fix in server.ts is the only instance
12
+ - [x] 2.2 Add a lint check script in package.json: `"lint:esm": "grep -r 'require(' src/ --include='*.ts' && exit 1 || exit 0"`
13
+ - [x] 2.3 Run lint:esm and verify it passes
14
+
15
+ ## 3. Dynamic Config Reload
16
+
17
+ - [x] 3.1 Verify `memory_update` handler in server.ts reloads config from disk (already patched — confirm code is correct)
18
+ - [x] 3.2 Add test: start server with empty config, add collection to config file, call memory_update, verify it indexes the new collection's documents
19
+
20
+ ## 4. Integration Tests
21
+
22
+ - [x] 4.1 Create `test/integration.test.ts` with test helper that creates a real temp SQLite DB with sqlite-vec loaded
23
+ - [x] 4.2 Add test fixture: index 2-3 markdown documents into the real DB with FTS5 triggers firing
24
+ - [x] 4.3 Test `memory_search` handler end-to-end: valid query returns results with title, path, snippet
25
+ - [x] 4.4 Test `memory_search` with hyphenated query (`nano-brain`) — no SQL error
26
+ - [x] 4.5 Test `memory_search` with FTS5 operator words (`AND OR NOT`) — no SQL error
27
+ - [x] 4.6 Test `memory_search` with collection filter — only matching collection returned
28
+ - [x] 4.7 Test `memory_search` with empty query — returns empty results, no error
29
+ - [x] 4.8 Test `memory_update` handler: add file to collection dir, call update, verify document count increases
30
+ - [x] 4.9 Test `memory_status` handler: returns correct document count and collection info
31
+ - [x] 4.10 Teardown: verify temp DB is cleaned up after tests
32
+
33
+ ## 5. Verification
34
+
35
+ - [x] 5.1 Run full test suite (`npm test`) — all existing 246 tests + new integration tests pass
36
+ - [x] 5.2 Start MCP server via `node bin/cli.js mcp`, send JSON-RPC initialize + tools/list, verify 8 tools listed
37
+ - [x] 5.3 Manual end-to-end: write memory, update index, search — full cycle works through MCP tools
@@ -0,0 +1,2 @@
1
+ schema: spec-driven
2
+ created: 2026-02-23
@@ -0,0 +1,111 @@
1
+ ## Context
2
+
3
+ nano-brain is an MCP server providing persistent memory across OpenCode sessions. It harvests session JSON from `~/.local/share/opencode/storage/`, converts to markdown, indexes into SQLite (FTS5 + sqlite-vec), and exposes search via MCP tools.
4
+
5
+ Current state:
6
+ - **956 sessions** across 7 workspaces are mixed into one flat index (30MB SQLite DB)
7
+ - **No workspace awareness** — searching in project A returns results from projects B, C, D
8
+ - **No storage limits** — DB grows unbounded, no eviction, no disk safety checks
9
+ - The MCP server is launched per-workspace by OpenCode with `PWD` set to the workspace root
10
+ - Harvested sessions are already organized by projectHash on disk (`sessions/{hash}/*.md`)
11
+ - The `documents` table has a `collection` column but no `project_hash` column
12
+
13
+ Key constraint: The MCP server runs as a stdio process spawned by OpenCode. It knows `PWD` but receives no explicit workspace identifier.
14
+
15
+ ## Goals / Non-Goals
16
+
17
+ **Goals:**
18
+ - Search results scoped to current workspace by default
19
+ - Cross-workspace search available via explicit opt-in
20
+ - Configurable storage cap with automatic eviction of oldest sessions
21
+ - Disk safety guard preventing writes when disk is critically low
22
+ - Backward-compatible config (all new fields optional with safe defaults)
23
+ - Zero data loss for recent sessions during eviction
24
+
25
+ **Non-Goals:**
26
+ - Per-workspace separate SQLite databases (too complex, breaks cross-workspace search)
27
+ - Real-time workspace switching within a session (server restarts on workspace change anyway)
28
+ - Compression or deduplication of session content
29
+ - Cloud sync or backup of memory data
30
+ - Embedding eviction (only session documents are evicted, embeddings are orphaned and cleaned lazily)
31
+
32
+ ## Decisions
33
+
34
+ ### D1: Workspace detection via PWD + SHA-256 hash
35
+
36
+ **Decision**: Compute `projectHash = sha256(process.cwd()).substring(0, 12)` at server startup. Store as `currentProjectHash` on the server context.
37
+
38
+ **Why**: OpenCode already sets `PWD` to the workspace root when spawning MCP servers. The harvester already uses the same `sha256(directory).substring(0, 12)` scheme for organizing session output directories. Reusing this ensures consistency.
39
+
40
+ **Alternative considered**: Use an explicit `--workspace` CLI flag. Rejected because it requires config changes in every OpenCode installation and the PWD approach works automatically.
41
+
42
+ ### D2: Document-level project_hash column in SQLite
43
+
44
+ **Decision**: Add a `project_hash TEXT` column to the `documents` table. Populate it during indexing by extracting the projectHash from the file path (sessions are stored at `sessions/{projectHash}/*.md`). For non-session documents (MEMORY.md, daily logs), set `project_hash = 'global'`.
45
+
46
+ **Why**: Column-level filtering is fast (indexed), works with existing FTS5 queries, and doesn't require restructuring collections.
47
+
48
+ **Alternative considered**: Separate collections per workspace. Rejected because it would require dynamic collection creation and complicate the config.
49
+
50
+ **Migration**: `ALTER TABLE documents ADD COLUMN project_hash TEXT DEFAULT 'global'`. Then backfill from file paths for existing documents.
51
+
52
+ ### D3: Search filtering with workspace parameter
53
+
54
+ **Decision**: All search MCP tools (`memory_search`, `memory_vsearch`, `memory_query`) gain an optional `workspace` parameter:
55
+ - Default: `undefined` → filter to `currentProjectHash` + `'global'` documents
56
+ - `"all"` → no filtering, search everything
57
+ - `"<specific-hash>"` → filter to that project
58
+
59
+ **Why**: Default scoping prevents cross-project pollution. The `"all"` escape hatch preserves the ability to search across workspaces when needed. Including `'global'` in default ensures MEMORY.md and daily logs are always searchable.
60
+
61
+ ### D4: Storage config with safe defaults
62
+
63
+ **Decision**: New `storage` section in `config.yml`:
64
+ ```yaml
65
+ storage:
66
+ maxSize: 2GB # Max total size (DB + sessions dir)
67
+ retention: 90d # Auto-evict sessions older than this
68
+ minFreeDisk: 100MB # Stop writes if disk free below this
69
+ ```
70
+
71
+ All fields optional. Defaults: `maxSize: 2GB`, `retention: 90d`, `minFreeDisk: 100MB`.
72
+
73
+ **Why**: These defaults are safe for most users. 2GB accommodates ~100K sessions. 90 days keeps recent context. 100MB prevents disk-full crashes.
74
+
75
+ **Parsing**: `maxSize` and `minFreeDisk` accept human-readable sizes (`500MB`, `2GB`, `1TB`). `retention` accepts duration strings (`30d`, `90d`, `1y`).
76
+
77
+ ### D5: Eviction strategy — oldest sessions first
78
+
79
+ **Decision**: Eviction runs during the harvest cycle (every 2 minutes). Order of operations:
80
+ 1. **Retention eviction**: Delete session markdown files older than `retention` period. Remove corresponding documents from SQLite.
81
+ 2. **Size eviction**: If total size still exceeds `maxSize`, delete oldest remaining sessions until under limit.
82
+ 3. **Orphan cleanup**: Periodically (every 10 cycles) remove orphaned embeddings whose documents no longer exist.
83
+
84
+ **Why**: Oldest-first is simple, predictable, and preserves the most relevant (recent) context. Running during harvest avoids adding a separate eviction timer.
85
+
86
+ **Alternative considered**: LRU (least recently accessed). Rejected because tracking access timestamps adds complexity and the access pattern (search) doesn't map cleanly to document-level access.
87
+
88
+ ### D6: Disk safety guard via statfs
89
+
90
+ **Decision**: Before any write operation (harvest, reindex, embed), call `fs.statfs()` (Node 18.15+) or `child_process.execSync('df')` to check available disk space. If below `minFreeDisk`, skip the write and log `[storage] Disk space critically low (<100MB free), skipping writes`.
91
+
92
+ **Why**: Prevents the most catastrophic failure mode (disk full → SQLite corruption). The check is cheap (~1ms) and runs at most every 2 minutes.
93
+
94
+ **Fallback**: If `statfs` is unavailable (older Node), skip the check and log a warning.
95
+
96
+ ## Risks / Trade-offs
97
+
98
+ **[Risk] Existing documents lack project_hash** → Migration backfills from file paths. Documents not matching `sessions/{hash}/*.md` pattern get `project_hash = 'global'`. No data loss.
99
+
100
+ **[Risk] PWD might not match session directory** → The session's `directory` field in JSON is the original workspace path. PWD is the current workspace. These should match for the current project's sessions. For harvested sessions from other projects, the projectHash from the file path is authoritative.
101
+
102
+ **[Risk] Eviction deletes data permanently** → Eviction only removes harvested markdown and index entries. The original OpenCode session JSON in `~/.local/share/opencode/storage/` is never touched. Sessions can be re-harvested if needed.
103
+
104
+ **[Risk] Size calculation is approximate** → Checking DB file size + sessions dir size via `fs.statSync` is fast but doesn't account for WAL files or pending transactions. Acceptable for a soft limit.
105
+
106
+ **[Risk] statfs not available in all environments** → Docker containers and some Node versions may not support `fs.statfs`. Fallback: skip the check, rely on size-based eviction only.
107
+
108
+ ## Open Questions
109
+
110
+ - Should evicted sessions be logged somewhere (e.g., `eviction.log`) for auditability?
111
+ - Should there be a `memory_evict` MCP tool for manual eviction, or is automatic-only sufficient?
@@ -0,0 +1,30 @@
1
+ ## Why
2
+
3
+ All sessions from every workspace are harvested into a single flat index. Searching in project A returns results from unrelated projects B, C, D — polluting context and wasting tokens. Additionally, storage grows unbounded: a heavy user accumulates GBs of sessions over months with no eviction, eventually hitting OOM or disk-full crashes. Both problems must be solved before the auto-persistence pipeline is production-ready.
4
+
5
+ ## What Changes
6
+
7
+ - **Workspace-scoped search by default**: The MCP server detects the current workspace from `PWD` at startup, computes its projectHash, and filters all search results to that workspace. Cross-workspace search remains available via explicit parameter.
8
+ - **Per-workspace session collections**: Harvested sessions are organized by projectHash (already the case on disk). The indexer tags each document with its projectHash so filtering is efficient at the database level.
9
+ - **Configurable storage limits**: New `storage` section in `config.yml` with `maxSize`, `retention`, and `minFreeDisk` options.
10
+ - **Automatic eviction**: When storage exceeds `maxSize`, oldest sessions are evicted first (by date). Sessions older than `retention` period are cleaned up on each harvest cycle.
11
+ - **Disk safety guard**: Before any write operation (harvest, reindex, embed), check available disk space. If below `minFreeDisk`, stop all writes and log a warning.
12
+ - **Global memory preserved**: `MEMORY.md` and daily logs remain unscoped — they are the user's personal cross-project notes.
13
+
14
+ ## Capabilities
15
+
16
+ ### New Capabilities
17
+ - `workspace-scoping`: Workspace detection from PWD, projectHash computation, per-workspace search filtering, cross-workspace search opt-in, document-level project tagging
18
+ - `storage-limits`: Configurable maxSize/retention/minFreeDisk, automatic eviction of oldest sessions, disk space safety checks, storage config parsing and validation
19
+
20
+ ### Modified Capabilities
21
+ - `mcp-server`: Search tools (`memory_search`, `memory_vsearch`, `memory_query`) gain default workspace filtering and optional cross-workspace parameter. `memory_status` reports per-workspace and total storage usage.
22
+
23
+ ## Impact
24
+
25
+ - **Config schema**: New `storage` section in `config.yml` (backward-compatible — all fields optional with safe defaults)
26
+ - **Database schema**: Documents table needs a `project_hash` column (migration required for existing DBs)
27
+ - **MCP tool API**: Search tools gain optional `workspace` parameter (omitted → current workspace plus global documents, `"all"` for cross-workspace, or a specific projectHash). Non-breaking — existing calls work unchanged.
28
+ - **Files affected**: `src/server.ts`, `src/store.ts`, `src/watcher.ts`, `src/harvester.ts`, `src/collections.ts`, `src/types.ts`, `config.yml` schema
29
+ - **Disk I/O**: Eviction adds periodic delete operations. Disk space check adds `statfs` call before writes.
30
+ - **No new dependencies required**
@@ -0,0 +1,33 @@
1
+ ## ADDED Requirements
2
+
3
+ ### Requirement: Search tools support workspace filtering
4
+ The `memory_search`, `memory_vsearch`, and `memory_query` MCP tools SHALL accept an optional `workspace` parameter. When omitted, results are scoped to the current workspace and global documents. When set to `"all"`, results include all workspaces. When set to a specific project hash, results are scoped to that workspace and global documents.
5
+
6
+ #### Scenario: memory_search with default workspace scoping
7
+ - **WHEN** `memory_search` is called with `{"query": "test"}` and no `workspace` parameter
8
+ - **THEN** results are filtered to `currentProjectHash` and `'global'` documents only
9
+
10
+ #### Scenario: memory_vsearch with workspace="all"
11
+ - **WHEN** `memory_vsearch` is called with `{"query": "test", "workspace": "all"}`
12
+ - **THEN** results include documents from all workspaces
13
+
14
+ #### Scenario: memory_query with specific workspace
15
+ - **WHEN** `memory_query` is called with `{"query": "test", "workspace": "abc123def456"}`
16
+ - **THEN** results are filtered to `project_hash = 'abc123def456'` and `project_hash = 'global'`
17
+
18
+ ### Requirement: memory_status reports storage usage
19
+ The `memory_status` tool SHALL report per-workspace document counts and total storage size, in addition to existing health information.
20
+
21
+ #### Scenario: memory_status with workspace data
22
+ - **WHEN** `memory_status` is called after documents from multiple workspaces are indexed
23
+ - **THEN** the response includes a breakdown of document counts per workspace (projectHash)
24
+ - **THEN** the response includes total storage size (DB + sessions directory)
25
+ - **THEN** the response includes storage limit configuration (maxSize, retention, minFreeDisk)
26
+
27
+ ### Requirement: Search tool parameter schema includes workspace
28
+ The MCP tool registration for `memory_search`, `memory_vsearch`, and `memory_query` SHALL include `workspace` in their input schema as an optional string parameter with description explaining the scoping behavior.
29
+
30
+ #### Scenario: Tool schema advertises workspace parameter
31
+ - **WHEN** an MCP client lists available tools
32
+ - **THEN** `memory_search`, `memory_vsearch`, and `memory_query` each show a `workspace` parameter in their input schema
33
+ - **THEN** the parameter description explains: omit for current workspace, `"all"` for cross-workspace search, or a specific project hash to search that workspace
@@ -0,0 +1,90 @@
1
+ ## ADDED Requirements
2
+
3
+ ### Requirement: Storage configuration with safe defaults
4
+ The `config.yml` SHALL support a `storage` section with `maxSize`, `retention`, and `minFreeDisk` fields. All fields SHALL be optional with safe defaults: `maxSize: 2GB`, `retention: 90d`, `minFreeDisk: 100MB`.
5
+
6
+ #### Scenario: Config with all storage fields
7
+ - **WHEN** config.yml contains `storage: { maxSize: "1GB", retention: "30d", minFreeDisk: "200MB" }`
8
+ - **THEN** the server uses those values for eviction and disk safety
9
+
10
+ #### Scenario: Config with no storage section
11
+ - **WHEN** config.yml has no `storage` section
12
+ - **THEN** the server uses defaults: maxSize=2GB, retention=90d, minFreeDisk=100MB
13
+
14
+ #### Scenario: Config with partial storage section
15
+ - **WHEN** config.yml contains `storage: { maxSize: "500MB" }`
16
+ - **THEN** `maxSize` is 500MB, `retention` defaults to 90d, `minFreeDisk` defaults to 100MB
17
+
18
+ ### Requirement: Human-readable size and duration parsing
19
+ The storage config parser SHALL accept human-readable size strings (`500MB`, `2GB`, `1TB`) and duration strings (`30d`, `90d`, `1y`). Invalid values SHALL cause a warning log and fall back to defaults.
20
+
21
+ #### Scenario: Valid size string
22
+ - **WHEN** `maxSize` is set to `"2GB"`
23
+ - **THEN** it is parsed as 2,147,483,648 bytes
24
+
25
+ #### Scenario: Valid duration string
26
+ - **WHEN** `retention` is set to `"30d"`
27
+ - **THEN** it is parsed as 30 days (2,592,000,000 milliseconds)
28
+
29
+ #### Scenario: Invalid size string
30
+ - **WHEN** `maxSize` is set to `"banana"`
31
+ - **THEN** a warning is logged: `[storage] Invalid maxSize "banana", using default 2GB`
32
+ - **THEN** the default value of 2GB is used
33
+
34
+ ### Requirement: Retention-based eviction
35
+ During each harvest cycle, the system SHALL delete session markdown files older than the `retention` period and remove their corresponding documents from the SQLite database.
36
+
37
+ #### Scenario: Session older than retention period
38
+ - **WHEN** a session file has mtime older than `retention` (e.g., 91 days old with 90d retention)
39
+ - **THEN** the session markdown file is deleted from disk
40
+ - **THEN** the corresponding document rows are removed from the `documents` table
41
+
42
+ #### Scenario: Session within retention period
43
+ - **WHEN** a session file has mtime within the `retention` period (e.g., 30 days old with 90d retention)
44
+ - **THEN** the session file is not deleted
45
+ - **THEN** the document rows remain in the database
46
+
47
+ ### Requirement: Size-based eviction
48
+ After retention eviction, if total storage (SQLite DB + sessions directory) still exceeds `maxSize`, the system SHALL delete the oldest remaining session files until total size is under the limit.
49
+
50
+ #### Scenario: Storage exceeds maxSize after retention eviction
51
+ - **WHEN** retention eviction has completed and total storage is still 2.5GB with `maxSize` set to 2GB
52
+ - **THEN** the oldest session files are deleted one by one
53
+ - **THEN** deletion stops when total size drops below 2GB
54
+
55
+ #### Scenario: Storage under maxSize
56
+ - **WHEN** total storage is 1.5GB and `maxSize` is 2GB
57
+ - **THEN** no size-based eviction occurs
58
+
59
+ ### Requirement: Original session JSON is never deleted
60
+ Eviction SHALL only remove harvested markdown files and their database entries. The original OpenCode session JSON files in `~/.local/share/opencode/storage/` SHALL never be touched by eviction.
61
+
62
+ #### Scenario: Session evicted
63
+ - **WHEN** a session is evicted due to retention or size limits
64
+ - **THEN** only the harvested markdown file in `~/.nano-brain/sessions/` is deleted
65
+ - **THEN** the original JSON in `~/.local/share/opencode/storage/sessions/` remains untouched
66
+
67
+ ### Requirement: Disk safety guard
68
+ Before any write operation (harvest, reindex, embed), the system SHALL check available disk space. If free disk space is below `minFreeDisk`, all write operations SHALL be skipped and a warning logged.
69
+
70
+ #### Scenario: Disk space below minFreeDisk
71
+ - **WHEN** available disk space is 50MB and `minFreeDisk` is 100MB
72
+ - **THEN** harvest, reindex, and embed operations are skipped
73
+ - **THEN** a warning is logged: `[storage] Disk space critically low (<100MB free), skipping writes`
74
+
75
+ #### Scenario: Disk space above minFreeDisk
76
+ - **WHEN** available disk space is 500MB and `minFreeDisk` is 100MB
77
+ - **THEN** all write operations proceed normally
78
+
79
+ #### Scenario: statfs unavailable
80
+ - **WHEN** `fs.statfs()` is not available (older Node.js or restricted environment)
81
+ - **THEN** the disk check is skipped with a warning: `[storage] statfs unavailable, disk safety check disabled`
82
+ - **THEN** all other storage limits (maxSize, retention) still function normally
83
+
84
+ ### Requirement: Orphan embedding cleanup
85
+ Periodically (every 10 harvest cycles), the system SHALL remove embedding vectors whose corresponding documents no longer exist in the `documents` table.
86
+
87
+ #### Scenario: Document deleted but embedding remains
88
+ - **WHEN** a document is evicted and its row removed from `documents`
89
+ - **THEN** on the next orphan cleanup cycle, the corresponding embedding vector is removed
90
+ - **THEN** no orphaned embeddings accumulate indefinitely
@@ -0,0 +1,66 @@
1
+ ## ADDED Requirements
2
+
3
+ ### Requirement: Workspace detection from PWD
4
+ The MCP server SHALL compute a `projectHash` from `process.cwd()` at startup using `sha256(cwd).substring(0, 12)`. This hash SHALL be stored as `currentProjectHash` on the server context and used for all default search filtering.
5
+
6
+ #### Scenario: Server starts in a workspace directory
7
+ - **WHEN** the MCP server starts with `PWD=/Users/alice/projects/my-app`
8
+ - **THEN** `currentProjectHash` is set to the first 12 characters of `sha256("/Users/alice/projects/my-app")`
9
+ - **THEN** the hash is consistent across restarts in the same directory
10
+
11
+ #### Scenario: Hash matches harvester convention
12
+ - **WHEN** the MCP server computes `currentProjectHash` for a workspace
13
+ - **THEN** the hash matches the directory name used by the harvester for that workspace's sessions (`sessions/{projectHash}/*.md`)
14
+
15
+ ### Requirement: Document-level project tagging
16
+ The `documents` table SHALL have a `project_hash TEXT` column. Every document indexed from a session file SHALL be tagged with the projectHash extracted from its file path. Non-session documents (MEMORY.md, daily logs) SHALL be tagged with `'global'`.
17
+
18
+ #### Scenario: New document indexed from session file
19
+ - **WHEN** a document is indexed from path `sessions/abc123def456/session-title.md`
20
+ - **THEN** the document's `project_hash` column is set to `abc123def456`
21
+
22
+ #### Scenario: New document indexed from non-session file
23
+ - **WHEN** a document is indexed from `MEMORY.md` or a daily log file
24
+ - **THEN** the document's `project_hash` column is set to `'global'`
25
+
26
+ #### Scenario: Document path does not match session pattern
27
+ - **WHEN** a document is indexed from a path that does not match `sessions/{hash}/*.md`
28
+ - **THEN** the document's `project_hash` column is set to `'global'`
29
+
30
+ ### Requirement: Database migration for existing documents
31
+ On startup, the store SHALL add the `project_hash` column if it does not exist, then backfill existing documents by extracting the projectHash from their file paths.
32
+
33
+ #### Scenario: First startup after upgrade
34
+ - **WHEN** the store opens a database that lacks the `project_hash` column
35
+ - **THEN** the column is added via `ALTER TABLE documents ADD COLUMN project_hash TEXT DEFAULT 'global'`
36
+ - **THEN** existing documents with paths matching `sessions/{hash}/*.md` are updated with the correct projectHash
37
+ - **THEN** existing documents not matching the pattern retain `project_hash = 'global'`
38
+
39
+ #### Scenario: Subsequent startup
40
+ - **WHEN** the store opens a database that already has the `project_hash` column
41
+ - **THEN** no migration runs
42
+ - **THEN** no data is modified
43
+
44
+ ### Requirement: Default search scoping to current workspace
45
+ All search operations SHALL filter results to documents matching `currentProjectHash` or `'global'` by default. This ensures searches return only results relevant to the current workspace plus cross-project notes.
46
+
47
+ #### Scenario: Search without workspace parameter
48
+ - **WHEN** `memory_search` is called with `{"query": "authentication"}` and no `workspace` parameter
49
+ - **THEN** only documents with `project_hash = currentProjectHash` or `project_hash = 'global'` are returned
50
+ - **THEN** documents from other workspaces are excluded
51
+
52
+ #### Scenario: Global documents always included
53
+ - **WHEN** a search is performed with default workspace scoping
54
+ - **THEN** MEMORY.md entries and daily logs (tagged `'global'`) are included in results
55
+ - **THEN** session documents from other workspaces are excluded
56
+
57
+ ### Requirement: Cross-workspace search opt-in
58
+ All search tools SHALL accept an optional `workspace` parameter. When set to `"all"`, search results SHALL include documents from all workspaces. When set to a specific hash, results SHALL be filtered to that workspace plus `'global'`.
59
+
60
+ #### Scenario: Search with workspace="all"
61
+ - **WHEN** `memory_search` is called with `{"query": "auth", "workspace": "all"}`
62
+ - **THEN** documents from all workspaces are included in results
63
+
64
+ #### Scenario: Search with specific workspace hash
65
+ - **WHEN** `memory_search` is called with `{"query": "auth", "workspace": "abc123def456"}`
66
+ - **THEN** only documents with `project_hash = 'abc123def456'` or `project_hash = 'global'` are returned