@nano-step/nano-brain 2026.1.14

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (93) hide show
  1. package/.opencode/command/nano-brain-init.md +13 -0
  2. package/.opencode/command/nano-brain-reindex.md +11 -0
  3. package/.opencode/command/nano-brain-status.md +12 -0
  4. package/AGENTS.md +41 -0
  5. package/AGENTS_SNIPPET.md +44 -0
  6. package/CHANGELOG.md +186 -0
  7. package/README.md +298 -0
  8. package/SKILL.md +109 -0
  9. package/bin/cli.js +29 -0
  10. package/commands/nano-brain-init.md +36 -0
  11. package/commands/nano-brain-reindex.md +31 -0
  12. package/commands/nano-brain-status.md +32 -0
  13. package/index.html +929 -0
  14. package/nano-brain +4 -0
  15. package/opencode-mcp.json +9 -0
  16. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/.openspec.yaml +2 -0
  17. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/design.md +68 -0
  18. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/proposal.md +27 -0
  19. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/mcp-integration-testing/spec.md +50 -0
  20. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/mcp-server/spec.md +40 -0
  21. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/specs/search-pipeline/spec.md +29 -0
  22. package/openspec/changes/archive/2026-02-16-fix-mcp-server-bugs/tasks.md +37 -0
  23. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/.openspec.yaml +2 -0
  24. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/design.md +111 -0
  25. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/proposal.md +30 -0
  26. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/mcp-server/spec.md +33 -0
  27. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/storage-limits/spec.md +90 -0
  28. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/specs/workspace-scoping/spec.md +66 -0
  29. package/openspec/changes/archive/2026-02-23-workspace-scoped-memory-and-storage-limits/tasks.md +199 -0
  30. package/openspec/changes/codebase-indexing/.openspec.yaml +2 -0
  31. package/openspec/changes/codebase-indexing/design.md +169 -0
  32. package/openspec/changes/codebase-indexing/proposal.md +30 -0
  33. package/openspec/changes/codebase-indexing/specs/codebase-collection/spec.md +187 -0
  34. package/openspec/changes/codebase-indexing/specs/mcp-server/spec.md +36 -0
  35. package/openspec/changes/codebase-indexing/tasks.md +56 -0
  36. package/openspec/changes/fix-session-harvest-workspace-scoping/.openspec.yaml +2 -0
  37. package/openspec/changes/fix-session-harvest-workspace-scoping/design.md +84 -0
  38. package/openspec/changes/fix-session-harvest-workspace-scoping/proposal.md +26 -0
  39. package/openspec/changes/fix-session-harvest-workspace-scoping/specs/workspace-scoping/spec.md +65 -0
  40. package/openspec/changes/fix-session-harvest-workspace-scoping/tasks.md +33 -0
  41. package/openspec/changes/performance-and-search-quality/.openspec.yaml +2 -0
  42. package/openspec/changes/performance-and-search-quality/proposal.md +37 -0
  43. package/openspec/specs/mcp-integration-testing/spec.md +50 -0
  44. package/openspec/specs/mcp-server/spec.md +75 -0
  45. package/openspec/specs/search-pipeline/spec.md +29 -0
  46. package/openspec/specs/storage-limits/spec.md +94 -0
  47. package/openspec/specs/workspace-scoping/spec.md +70 -0
  48. package/package.json +37 -0
  49. package/site/build.js +66 -0
  50. package/site/partials/_api.html +83 -0
  51. package/site/partials/_compare.html +100 -0
  52. package/site/partials/_config.html +23 -0
  53. package/site/partials/_features.html +43 -0
  54. package/site/partials/_footer.html +6 -0
  55. package/site/partials/_hero.html +9 -0
  56. package/site/partials/_how-it-works.html +26 -0
  57. package/site/partials/_models.html +18 -0
  58. package/site/partials/_quick-start.html +15 -0
  59. package/site/partials/_stats.html +1 -0
  60. package/site/partials/_tech-stack.html +13 -0
  61. package/site/script.js +12 -0
  62. package/site/shell.html +44 -0
  63. package/site/styles.css +548 -0
  64. package/src/chunker.ts +427 -0
  65. package/src/codebase.ts +425 -0
  66. package/src/collections.ts +217 -0
  67. package/src/embeddings.ts +325 -0
  68. package/src/expansion.ts +79 -0
  69. package/src/harvester.ts +306 -0
  70. package/src/index.ts +778 -0
  71. package/src/reranker.ts +103 -0
  72. package/src/search.ts +294 -0
  73. package/src/server.ts +876 -0
  74. package/src/storage.ts +221 -0
  75. package/src/store.ts +653 -0
  76. package/src/types.ts +215 -0
  77. package/src/watcher.ts +389 -0
  78. package/test/chunker.test.ts +479 -0
  79. package/test/cli.test.ts +309 -0
  80. package/test/codebase-chunker.test.ts +446 -0
  81. package/test/codebase.test.ts +678 -0
  82. package/test/collections.test.ts +571 -0
  83. package/test/harvester.test.ts +636 -0
  84. package/test/integration.test.ts +219 -0
  85. package/test/llm.test.ts +322 -0
  86. package/test/search.test.ts +572 -0
  87. package/test/server.test.ts +541 -0
  88. package/test/storage.test.ts +302 -0
  89. package/test/store.test.ts +530 -0
  90. package/test/watcher.test.ts +717 -0
  91. package/test/workspace.test.ts +239 -0
  92. package/tsconfig.json +19 -0
  93. package/vitest.config.ts +16 -0
@@ -0,0 +1,56 @@
1
+ ## 1. Types and Configuration
2
+
3
+ - [x] 1.1 Add `CodebaseConfig` interface to `src/types.ts` with fields: `enabled: boolean`, `exclude?: string[]`, `extensions?: string[]`, `maxFileSize?: string`, `maxSize?: string`
4
+ - [x] 1.2 Add optional `codebase?: CodebaseConfig` field to `CollectionConfig` interface in `src/types.ts`
5
+ - [x] 1.3 Add `CodebaseIndexResult` interface to `src/types.ts` with fields: `filesScanned`, `filesIndexed`, `filesSkippedUnchanged`, `filesSkippedTooLarge`, `filesSkippedBudget`, `chunksCreated`, `storageUsedBytes`, `maxSizeBytes`
6
+ - [x] 1.4 Add `codebase` stats to `IndexHealth` interface: `codebase?: { enabled: boolean; documents: number; chunks: number; extensions: string[]; excludeCount: number; storageUsed: number; maxSize: number }`
7
+
8
+ ## 2. Codebase Scanner Module
9
+
10
+ - [x] 2.1 Create `src/codebase.ts` with built-in default exclude patterns, project type marker file map
11
+ - [x] 2.2 Implement `detectProjectType(workspaceRoot: string): string[]` — check marker files, return merged extensions list, always include `.md`
12
+ - [x] 2.3 Implement `loadGitignorePatterns(workspaceRoot: string): string[]` — parse `.gitignore` from workspace root, return patterns array, return empty array if file missing
13
+ - [x] 2.4 Implement `mergeExcludePatterns(config: CodebaseConfig, workspaceRoot: string): string[]` — merge config excludes + .gitignore + built-in defaults into single array
14
+ - [x] 2.5 Implement `resolveExtensions(config: CodebaseConfig, workspaceRoot: string): string[]` — return config extensions if set, otherwise auto-detect from project type
15
+ - [x] 2.6 Implement `scanCodebaseFiles(workspaceRoot: string, config: CodebaseConfig): Promise<{ files: string[]; skippedTooLarge: number }>` — use fast-glob with resolved extensions as pattern and merged excludes as ignore, filter by maxFileSize (default 5MB), return absolute paths
16
+ - [x] 2.7 Implement `indexCodebase(store, workspaceRoot, config, projectHash, embedder?): Promise<CodebaseIndexResult>` — scan files, compute hashes, skip unchanged, chunk with `chunkSourceCode`, index via store, deactivate deleted files, embed new chunks, enforce maxSize budget
17
+
18
+ ## 3. Source Code Chunker
19
+
20
+ - [x] 3.1 Add `findSourceCodeBreakPoints(content: string): BreakPoint[]` to `src/chunker.ts` — score structural boundaries: double blank lines (score 90), function/class/type definitions at line start (score 80), single blank lines (score 40), import/export blocks (score 60), regular line breaks (score 1)
21
+ - [x] 3.2 Add `chunkSourceCode(content: string, hash: string, filePath: string, workspaceRoot: string, options?: ChunkOptions): MemoryChunk[]` to `src/chunker.ts` — split using source code break points, prepend metadata header (`File:`, `Language:`, `Lines:`) to each chunk, use same target size (3600 chars) and overlap (540 chars) as markdown chunker
22
+ - [x] 3.3 Add `inferLanguage(filePath: string): string` helper — map file extension to language name (`.ts` → `typescript`, `.py` → `python`, `.go` → `go`, etc.)
23
+
24
+ ## 4. Watcher Integration
25
+
26
+ - [x] 4.1 Add `codebaseConfig?: CodebaseConfig` and `workspaceRoot?: string` and `projectHash?: string` fields to `WatcherOptions` interface in `src/watcher.ts`
27
+ - [x] 4.2 In `setupWatcher()`, when `codebaseConfig?.enabled`, add workspace root as additional chokidar watch target with merged exclude patterns as `ignored` option
28
+ - [x] 4.3 In watcher file change handlers (`add`, `change`, `unlink`), check if file matches codebase extensions (not just `.md`) and trigger `handleFileChange` accordingly
29
+ - [x] 4.4 In `triggerReindex()`, after collection reindex loop, if codebase is enabled, call `indexCodebase()` for the workspace root
30
+
31
+ ## 5. MCP Server Integration
32
+
33
+ - [x] 5.1 Register `memory_index_codebase` tool in `src/server.ts` — no required params, calls `indexCodebase()`, returns `CodebaseIndexResult` summary with storage usage. If codebase not enabled, return error message.
34
+ - [x] 5.2 Update `memory_status` handler in `src/server.ts` to include codebase stats section (enabled, document count, storage used/limit, resolved extensions, exclude count) when codebase is enabled
35
+ - [x] 5.3 Load codebase config from `CollectionConfig.codebase` at server startup and pass to watcher setup
36
+
37
+ ## 6. Storage Budget
38
+
39
+ - [x] 6.1 Add `maxSize?: string` to `CodebaseConfig` (default 2GB)
40
+ - [x] 6.2 Add `getCollectionStorageSize(collection: string): number` to Store interface and implement in `src/store.ts`
41
+ - [x] 6.3 Enforce budget in `indexCodebase()` — track cumulative storage, skip files when over limit
42
+ - [x] 6.4 Report storage usage in `getCodebaseStats()` and `formatStatus()`
43
+
44
+ ## 7. Tests
45
+
46
+ - [ ] 7.1 Add unit tests for `detectProjectType()` — Node.js, Python, Go, Rust, multi-marker, no-marker scenarios
47
+ - [ ] 7.2 Add unit tests for `loadGitignorePatterns()` — existing .gitignore, missing .gitignore, complex patterns
48
+ - [ ] 7.3 Add unit tests for `mergeExcludePatterns()` — all three sources, missing sources, deduplication
49
+ - [ ] 7.4 Add unit tests for `resolveExtensions()` — explicit config, auto-detect, fallback
50
+ - [ ] 7.5 Add unit tests for `chunkSourceCode()` — TypeScript file, Python file, small file (single chunk), large file (multiple chunks with overlap), metadata header format
51
+ - [ ] 7.6 Add unit tests for `findSourceCodeBreakPoints()` — function defs, class defs, blank lines, import blocks
52
+ - [ ] 7.7 Add unit tests for `inferLanguage()` — all supported extensions, unknown extension
53
+ - [ ] 7.8 Add unit tests for `scanCodebaseFiles()` — respects exclude patterns, respects extensions, skips files over maxFileSize
54
+ - [ ] 7.9 Add integration test for `indexCodebase()` — indexes files, skips unchanged, detects deleted, tags with projectHash, enforces budget
55
+ - [ ] 7.10 Add integration test for `memory_index_codebase` MCP tool — enabled case, disabled case
56
+ - [ ] 7.11 Add integration test for `getCollectionStorageSize()` — returns correct size for collection
@@ -0,0 +1,2 @@
1
+ schema: spec-driven
2
+ created: 2026-02-28
@@ -0,0 +1,84 @@
1
+ ## Context
2
+
3
+ The `workspace-scoped-memory-and-storage-limits` change (archived 2026-02-23) introduced workspace-scoped search by adding a `project_hash` column to the `documents` table and filtering search results by `currentProjectHash`. The spec requires that session documents be tagged with the projectHash **extracted from their file path** (`sessions/{hash}/*.md`).
4
+
5
+ However, the implementation has a bug in four code paths where session documents are indexed with the **wrong** projectHash:
6
+
7
+ 1. **`watcher.ts` → `triggerReindex()`** (line 122): Passes the watcher's own `projectHash` (current workspace) to `indexDocument()` for ALL collection files, including session files from other workspaces.
8
+ 2. **`index.ts` → `handleInit()`** (line 429): Indexes session collection files with no `projectHash`, defaulting to `undefined` → `'global'`.
9
+ 3. **`index.ts` → `handleUpdate()`** (line 539): Same issue as `handleInit`.
10
+ 4. **`server.ts` → `memory_update` tool** (line 460): Indexes all collection files without projectHash extraction.
11
+
12
+ The session files on disk are correctly organized by projectHash (`~/.nano-brain/sessions/{projectHash}/*.md`), and the harvester correctly writes them there. The bug is purely in the indexing step that reads these files into the database.
13
+
14
+ ## Goals / Non-Goals
15
+
16
+ **Goals:**
17
+ - Fix all four indexing code paths to extract projectHash from session file paths
18
+ - Create a shared utility function for projectHash extraction
19
+ - Ensure existing incorrectly-tagged documents get re-tagged on next reindex
20
+ - Maintain backward compatibility — no API or config changes
21
+
22
+ **Non-Goals:**
23
+ - Changing the harvester itself (it already works correctly)
24
+ - Modifying the search layer (already correctly filters by projectHash)
25
+ - Adding new MCP tools or parameters
26
+ - Changing the session file format or directory structure
27
+
28
+ ## Decisions
29
+
30
+ ### Decision 1: Shared `extractProjectHashFromPath()` utility
31
+
32
+ **Choice**: Create a function in `store.ts` (or a new `utils.ts`) that extracts projectHash from a file path.
33
+
34
+ ```typescript
35
+ export function extractProjectHashFromPath(filePath: string, sessionsDir: string): string | undefined {
36
+ // If filePath is under sessionsDir, extract the projectHash from the subdirectory name
37
+ // e.g., ~/.nano-brain/sessions/abc123def456/2026-02-16-session.md → 'abc123def456'
38
+ // Returns undefined for non-session files (caller defaults to their own projectHash or 'global')
39
+ }
40
+ ```
41
+
42
+ **Rationale**: All four bug sites need the same logic. A shared function prevents divergence and is easy to test.
43
+
44
+ **Alternative considered**: Parsing YAML frontmatter from session files to read `projectHash`. Rejected because:
45
+ - Slower (requires reading and parsing file content)
46
+ - The directory structure is the canonical source of truth (set by the harvester)
47
+ - Frontmatter could be missing or malformed
48
+
49
+ ### Decision 2: Collection-aware indexing in watcher
50
+
51
+ **Choice**: In `triggerReindex()`, check if the collection being indexed is `sessions`. If so, extract projectHash from each file's path. Otherwise, use the watcher's own `projectHash`.
52
+
53
+ ```typescript
54
+ for (const filePath of files) {
55
+ const effectiveProjectHash = collection.name === 'sessions'
56
+ ? extractProjectHashFromPath(filePath, outputDir) ?? projectHash
57
+ : projectHash;
58
+ indexDocument(store, collection.name, filePath, content, title, effectiveProjectHash);
59
+ }
60
+ ```
61
+
62
+ **Rationale**: Only session files have per-project scoping. Memory files, codebase files, and custom collections should continue using the watcher's projectHash.
63
+
64
+ **Alternative considered**: Always extracting from path for all collections. Rejected because non-session collections don't have projectHash in their directory structure.
65
+
66
+ ### Decision 3: Self-healing via reindex
67
+
68
+ **Choice**: No explicit migration for existing incorrectly-tagged documents. The fix naturally corrects tags on the next reindex cycle because:
69
+ - The watcher periodically reindexes all collections
70
+ - `indexDocument()` uses `INSERT OR REPLACE` (replace-on-conflict, not an `ON CONFLICT DO UPDATE` upsert), so re-indexing a file rewrites its row, including its `project_hash`
71
+ - Running `memory_update` or restarting the MCP server triggers a full reindex
72
+
73
+ **Rationale**: Simpler than writing a one-time migration. The data self-heals within one reindex cycle (typically < 2 minutes after server restart).
74
+
75
+ **Alternative considered**: Adding a startup migration that scans all session documents and fixes their `project_hash`. Rejected because:
76
+ - Adds complexity for a one-time fix
77
+ - The reindex already handles it
78
+ - Users can trigger immediate fix via `memory_update` tool
79
+
80
+ ## Risks / Trade-offs
81
+
82
+ - **[Risk] Existing sessions tagged with wrong projectHash until reindex** → Mitigation: First reindex after the fix corrects all tags. Users can force immediate reindex via `memory_update`. Document this in release notes.
83
+ - **[Risk] Collection name `sessions` is hardcoded in the check** → Mitigation: The sessions collection name is already hardcoded in `handleInit()` and is a core convention. If collection naming changes, this would need updating — but that's a broader refactor.
84
+ - **[Risk] Path parsing assumes `sessions/{hash}/` directory structure** → Mitigation: The harvester is the only writer to this directory, and it always uses this structure. The extraction function validates the hash format (12-char hex) as a safety check.
@@ -0,0 +1,26 @@
1
+ ## Why
2
+
3
+ Session harvesting collects ALL sessions from every workspace into a single `~/.nano-brain/sessions/` directory, and the watcher's reindex stamps every session document with the **current workspace's** `projectHash` instead of extracting it from the session file's actual path. This means if nano-brain runs in workspace A, sessions from workspaces B, C, D all get tagged as belonging to A — defeating the workspace-scoped search that was implemented in the previous `workspace-scoped-memory-and-storage-limits` change. The result: searching in any project returns sessions from all projects, polluting context and wasting tokens.
4
+
5
+ ## What Changes
6
+
7
+ - **Fix projectHash extraction during session indexing**: The watcher's `triggerReindex()` currently passes its own `projectHash` to `indexDocument()` for ALL files including session files. For the `sessions` collection, the projectHash must be extracted from the file's directory structure (`sessions/{projectHash}/*.md`) instead of using the watcher's workspace hash.
8
+ - **Fix `handleInit` session indexing**: The `init` command indexes session collection files with no projectHash, defaulting to `'global'`. It should also extract projectHash from the file path.
9
+ - **Fix `handleUpdate` session indexing**: Same issue — indexes all collection files without workspace-aware projectHash extraction.
10
+ - **Fix `memory_update` tool**: The MCP tool's reindex handler indexes all collection files without extracting projectHash from session paths.
11
+ - **Add projectHash extraction utility**: Create a shared helper that extracts projectHash from a session file path by matching the `sessions/{hash}/` directory pattern, returning `undefined` for non-session files so that each caller can fall back to its own projectHash (or `'global'`).
12
+
13
+ ## Capabilities
14
+
15
+ ### New Capabilities
16
+
17
+ ### Modified Capabilities
18
+ - `workspace-scoping`: The "Document-level project tagging" requirement is already specified correctly (extract from file path), but the implementation violates it. This change fixes the implementation to match the spec. No spec text changes needed — only implementation fixes.
19
+
20
+ ## Impact
21
+
22
+ - **Files affected**: `src/watcher.ts` (triggerReindex projectHash logic), `src/index.ts` (handleInit, handleUpdate session indexing), `src/server.ts` (memory_update tool), `src/store.ts` or new utility (projectHash extraction helper)
23
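+ - **Database**: Existing documents with incorrect `project_hash` values need re-tagging. No dedicated migration is planned: the next reindex (watcher cycle, MCP server restart, or a manual `memory_update`) rewrites the stale values from the session file paths.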
24
+ - **No API changes**: MCP tool interfaces remain unchanged.
25
+ - **No new dependencies**: Pure logic fix using existing path parsing.
26
+ - **Risk**: Low — this is a bug fix aligning implementation with existing spec. All search tools already support workspace filtering; they just receive wrong data today.
@@ -0,0 +1,65 @@
1
+ ## ADDED Requirements
2
+
3
+ ### Requirement: ProjectHash extraction from session file path
4
+ The system SHALL provide a utility function `extractProjectHashFromPath(filePath, sessionsDir)` that extracts the projectHash from a session file's path. The function SHALL match the pattern `{sessionsDir}/{projectHash}/*.md` where `projectHash` is a 12-character hexadecimal string. For paths that do not match this pattern, the function SHALL return `undefined`.
5
+
6
+ #### Scenario: Valid session file path
7
+ - **WHEN** `extractProjectHashFromPath` is called with path `~/.nano-brain/sessions/abc123def456/2026-02-16-session.md` and sessionsDir `~/.nano-brain/sessions`
8
+ - **THEN** the function returns `'abc123def456'`
9
+
10
+ #### Scenario: Non-session file path
11
+ - **WHEN** `extractProjectHashFromPath` is called with path `~/.nano-brain/memory/2026-02-16.md` and sessionsDir `~/.nano-brain/sessions`
12
+ - **THEN** the function returns `undefined`
13
+
14
+ #### Scenario: Nested path under session directory without valid hash
15
+ - **WHEN** `extractProjectHashFromPath` is called with a path under sessionsDir where the subdirectory name is not a 12-character hex string
16
+ - **THEN** the function returns `undefined`
17
+
18
+ ### Requirement: Collection-aware projectHash during watcher reindex
19
+ The watcher's `triggerReindex()` SHALL use collection-aware projectHash assignment when indexing documents. For the `sessions` collection, the projectHash SHALL be extracted from each file's path using `extractProjectHashFromPath()`. For all other collections, the watcher's own `projectHash` (current workspace) SHALL be used. If extraction returns `undefined` for a session file, the watcher's `projectHash` SHALL be used as fallback.
20
+
21
+ #### Scenario: Watcher reindexes session file from another workspace
22
+ - **WHEN** the watcher runs in workspace A (projectHash `aaa111bbb222`) and reindexes a session file at `sessions/ccc333ddd444/2026-02-16-session.md`
23
+ - **THEN** the document is indexed with `project_hash = 'ccc333ddd444'` (extracted from path)
24
+ - **THEN** the document is NOT indexed with `project_hash = 'aaa111bbb222'`
25
+
26
+ #### Scenario: Watcher reindexes non-session collection file
27
+ - **WHEN** the watcher runs in workspace A (projectHash `aaa111bbb222`) and reindexes a memory file at `memory/2026-02-16.md`
28
+ - **THEN** the document is indexed with `project_hash = 'aaa111bbb222'` (watcher's own hash)
29
+
30
+ #### Scenario: Watcher reindexes session file with unrecognized path structure
31
+ - **WHEN** the watcher reindexes a session file whose path does not match the `sessions/{hash}/*.md` pattern
32
+ - **THEN** the document is indexed with the watcher's own `projectHash` as fallback
33
+
34
+ ### Requirement: Correct projectHash during init indexing
35
+ The `init` command SHALL extract projectHash from session file paths when indexing the `sessions` collection, using the same `extractProjectHashFromPath()` utility. Non-session collections SHALL use the workspace's projectHash.
36
+
37
+ #### Scenario: Init indexes session files from multiple workspaces
38
+ - **WHEN** `nano-brain init` runs in workspace A and indexes session files from `sessions/aaa111bbb222/` and `sessions/ccc333ddd444/`
39
+ - **THEN** documents from `sessions/aaa111bbb222/` are tagged with `project_hash = 'aaa111bbb222'`
40
+ - **THEN** documents from `sessions/ccc333ddd444/` are tagged with `project_hash = 'ccc333ddd444'`
41
+
42
+ ### Requirement: Correct projectHash during manual update
43
+ The `memory_update` MCP tool and the `update` CLI command SHALL extract projectHash from session file paths when reindexing the `sessions` collection, using the same `extractProjectHashFromPath()` utility.
44
+
45
+ #### Scenario: memory_update reindexes session files
46
+ - **WHEN** the `memory_update` tool triggers a reindex
47
+ - **THEN** session documents are tagged with projectHash extracted from their file paths
48
+ - **THEN** non-session documents retain their existing projectHash assignment
49
+
50
+ ## MODIFIED Requirements
51
+
52
+ ### Requirement: Document-level project tagging
53
+ The `documents` table SHALL have a `project_hash TEXT` column. Every document indexed from a session file SHALL be tagged with the projectHash extracted from its file path by the `extractProjectHashFromPath()` utility. Non-session documents (memory files, daily logs, codebase files) SHALL be tagged with the indexer's contextual projectHash (workspace hash for codebase, `'global'` for shared files). All indexing code paths (watcher reindex, init, update, memory_update tool) SHALL use this extraction consistently.
54
+
55
+ #### Scenario: New document indexed from session file
56
+ - **WHEN** a document is indexed from path `sessions/abc123def456/session-title.md`
57
+ - **THEN** the document's `project_hash` column is set to `abc123def456`
58
+
59
+ #### Scenario: New document indexed from non-session file
60
+ - **WHEN** a document is indexed from `MEMORY.md` or a daily log file
61
+ - **THEN** the document's `project_hash` column is set to `'global'`
62
+
63
+ #### Scenario: Document path does not match session pattern
64
+ - **WHEN** a document is indexed from a path that does not match `sessions/{hash}/*.md`
65
+ - **THEN** the document's `project_hash` column is set to the indexer's contextual projectHash
@@ -0,0 +1,33 @@
1
+ ## 1. ProjectHash Extraction Utility
2
+
3
+ - [x] 1.1 Create `extractProjectHashFromPath(filePath: string, sessionsDir: string): string | undefined` function in `src/store.ts` (or `src/utils.ts`). It should parse the path to find a `{sessionsDir}/{12-char-hex}/` segment and return the hex string, or `undefined` if not matched.
4
+ - [x] 1.2 Export the function so it can be imported by `watcher.ts`, `index.ts`, and `server.ts`.
5
+ - [x] 1.3 Add unit tests for `extractProjectHashFromPath` covering: valid session path, non-session path, path with non-hex subdirectory, path without sessionsDir prefix, edge cases (empty string, trailing slashes).
6
+
7
+ ## 2. Fix Watcher Reindex
8
+
9
+ - [x] 2.1 In `src/watcher.ts` `triggerReindex()`, import `extractProjectHashFromPath` and the sessions output directory path.
10
+ - [x] 2.2 Modify the indexing loop: for the `sessions` collection, call `extractProjectHashFromPath(filePath, sessionsOutputDir)` and use the result (falling back to the watcher's `projectHash` if `undefined`). For other collections, keep using the watcher's `projectHash`.
11
+ - [x] 2.3 Add/update test in `test/watcher.test.ts` verifying that session files from different workspaces get tagged with their respective projectHash, not the watcher's.
12
+
13
+ ## 3. Fix Init Command Indexing
14
+
15
+ - [x] 3.1 In `src/index.ts` `handleInit()`, modify the session collection indexing loop (around line 423-435) to extract projectHash from each session file path using `extractProjectHashFromPath`. Pass the extracted hash to `indexDocument()`.
16
+ - [x] 3.2 Ensure non-session collections (`memory`) continue using the workspace's `projectHash`.
17
+
18
+ ## 4. Fix Update Command and MCP Tool
19
+
20
+ - [x] 4.1 In `src/index.ts` `handleUpdate()`, modify the collection indexing loop to use `extractProjectHashFromPath` for the `sessions` collection.
21
+ - [x] 4.2 In `src/server.ts` `memory_update` tool handler, modify the reindex loop to use `extractProjectHashFromPath` for the `sessions` collection. Pass the server's `outputDir + '/sessions'` as the sessionsDir.
22
+
23
+ ## 5. Integration Testing
24
+
25
+ - [x] 5.1 Add an integration test that: (a) creates session files in two different projectHash subdirectories, (b) runs a reindex, (c) verifies each document has the correct `project_hash` in the database.
26
+ - [x] 5.2 Add a test verifying that after reindex, searching with workspace=projectHashA returns only sessions from A (not B), and workspace="all" returns both.
27
+ - [x] 5.3 Run full test suite (`npm test`) and verify all existing tests pass.
28
+
29
+ ## 6. Verification
30
+
31
+ - [ ] 6.1 Run `npx nano-brain init` in a workspace and verify `memory_status` shows correct per-workspace document counts.
32
+ - [ ] 6.2 Run `memory_search` with default workspace scoping and confirm only current-workspace sessions appear.
33
+ - [ ] 6.3 Run `memory_search` with `workspace="all"` and confirm cross-workspace sessions appear.
@@ -0,0 +1,2 @@
1
+ schema: spec-driven
2
+ created: 2026-02-24
@@ -0,0 +1,37 @@
1
+ ## Why
2
+
3
+ nano-brain's search quality and embedding pipeline have several gaps discovered during real-world usage: embedding truncation loses context from the second half of chunks, the chunking strategy doesn't optimize for embedding quality, there's no way to measure or benchmark search relevance, and the embedding pipeline processes documents sequentially with no batching optimization. These issues compound — poor embeddings lead to poor vector search, which degrades hybrid search, which makes the entire memory system less useful for AI agents.
4
+
5
+ ## What Changes
6
+
7
+ - **Smarter embedding truncation**: Replace the hard `substring(0, 1800)` cut with word/sentence-boundary-aware truncation that preserves semantic completeness
8
+ - **Chunk size alignment**: Align chunk size with embedding model's effective context window so chunks don't need truncation at all — currently chunks are 3600 chars but only the first 1800 are embedded, wasting half the stored content for vector search
9
+ - **Embedding batch pipeline**: Implement proper batch embedding with configurable concurrency, progress tracking, and resume-on-failure — currently processes one doc at a time with no parallelism
10
+ - **Search result scoring transparency**: Add `--explain` flag to CLI that shows which search mode (FTS/vector/hybrid) contributed to each result's score, enabling users to diagnose search quality issues
11
+ - **Cross-workspace search**: Allow querying across all workspace DBs from any workspace, with results tagged by project — currently each workspace is fully isolated
12
+ - **Embedding model warm-up**: Pre-warm the Ollama embedding model on MCP server start to avoid cold-start latency on first query (currently 1.4s first request vs 150ms warm)
13
+ - **Incremental session indexing**: Index new sessions into the DB immediately after harvest instead of requiring a separate `embed` step — currently harvest writes markdown files but doesn't trigger indexing or embedding
14
+
15
+ ## Capabilities
16
+
17
+ ### New Capabilities
18
+ - `embedding-pipeline`: Batch embedding with concurrency, progress tracking, resume-on-failure, and model warm-up
19
+ - `search-explain`: Transparent scoring with `--explain` flag showing per-result breakdown of FTS score, vector similarity, and RRF fusion contribution
20
+ - `cross-workspace-search`: Query across all workspace DBs with project-tagged results
21
+
22
+ ### Modified Capabilities
23
+ - `search-pipeline`: Chunk size alignment with embedding window, word-boundary-aware truncation, and scoring transparency
24
+ - `storage-limits`: Incremental session indexing after harvest (currently harvest and indexing are decoupled)
25
+
26
+ ## Impact
27
+
28
+ - **`src/chunker.ts`**: Adjust `maxChunkSize` default and add overlap tuning to align with embedding window
29
+ - **`src/codebase.ts`**: Replace `truncateForEmbedding()` with sentence-boundary-aware truncation, add batch concurrency to `embedPendingCodebase()`
30
+ - **`src/search.ts`**: Add explain/debug metadata to search results, implement cross-workspace query routing
31
+ - **`src/index.ts`**: Add `--explain` CLI flag, add cross-workspace `--all` flag, wire incremental indexing after harvest
32
+ - **`src/server.ts`**: Add model warm-up on startup, expose explain metadata in MCP tool responses
33
+ - **`src/harvester.ts`**: Trigger document indexing + embedding after successful harvest
34
+ - **`src/embeddings.ts`**: Add batch concurrency control, warm-up method, progress callbacks
35
+ - **`src/store.ts`**: Support querying multiple DB files for cross-workspace search
36
+ - **Config**: New `embedding.batchSize`, `embedding.concurrency`, `embedding.warmup` options in `config.yml`
37
+ - **Dependencies**: No new dependencies expected — all changes use existing SQLite, Ollama, and fast-glob
@@ -0,0 +1,50 @@
1
+ ## Requirements
2
+
3
+ ### Requirement: Integration test infrastructure
4
+ The project SHALL have an integration test file that exercises MCP tool handlers against a real SQLite database with real FTS5 indexes and real sqlite-vec tables.
5
+
6
+ #### Scenario: Test setup creates real database with indexed documents
7
+ - **WHEN** the integration test suite starts
8
+ - **THEN** a temporary SQLite database is created with sqlite-vec loaded
9
+ - **THEN** at least 2 test documents are indexed with FTS5 entries
10
+ - **THEN** the MCP server's tool handlers are initialized with the real store
11
+
12
+ #### Scenario: Test teardown cleans up
13
+ - **WHEN** the integration test suite completes
14
+ - **THEN** the temporary database file is deleted
15
+ - **THEN** no test artifacts remain on disk
16
+
17
+ ### Requirement: Search integration tests
18
+ Integration tests SHALL verify that `memory_search` works end-to-end with real FTS5 queries.
19
+
20
+ #### Scenario: Search finds indexed document
21
+ - **WHEN** `memory_search` handler is called with a query matching an indexed document
22
+ - **THEN** the response contains the matching document with title, path, and snippet
23
+
24
+ #### Scenario: Search with hyphenated query
25
+ - **WHEN** `memory_search` handler is called with query `nano-brain`
26
+ - **THEN** the response completes without error
27
+ - **THEN** results include documents containing the term
28
+
29
+ #### Scenario: Search with collection filter
30
+ - **WHEN** `memory_search` handler is called with a collection filter
31
+ - **THEN** only documents from that collection are returned
32
+
33
+ #### Scenario: Search with empty query
34
+ - **WHEN** `memory_search` handler is called with an empty string query
35
+ - **THEN** the response returns empty results without error
36
+
37
+ ### Requirement: Update integration tests
38
+ Integration tests SHALL verify that `memory_update` works end-to-end.
39
+
40
+ #### Scenario: Update indexes new files
41
+ - **WHEN** a new markdown file is added to a collection directory
42
+ - **THEN** calling the `memory_update` handler indexes the new file
43
+ - **THEN** the file is searchable via `memory_search`
44
+
45
+ ### Requirement: Status integration tests
46
+ Integration tests SHALL verify that `memory_status` returns accurate information.
47
+
48
+ #### Scenario: Status reflects indexed documents
49
+ - **WHEN** documents have been indexed
50
+ - **THEN** `memory_status` handler returns correct document count and collection info
@@ -0,0 +1,75 @@
1
+ ## Purpose
2
+
3
+ MCP server providing persistent memory tools (search, status, update, get) for AI coding agents via the Model Context Protocol.
4
+ ## Requirements
5
+ ### Requirement: ESM module compliance
6
+ All source files in `src/` SHALL use ESM `import` syntax exclusively. No `require()` calls SHALL exist in any TypeScript source file.
7
+
8
+ #### Scenario: Server starts under Node.js ESM runtime
9
+ - **WHEN** the MCP server is started via `node bin/cli.js mcp`
10
+ - **THEN** the server starts without `require is not defined` errors
11
+ - **THEN** all tool handlers execute without CJS/ESM compatibility errors
12
+
13
+ #### Scenario: No require() in source files
14
+ - **WHEN** running `grep -r "require(" src/` on the source directory
15
+ - **THEN** zero matches are returned (excluding comments and string literals)
16
+
17
+ ### Requirement: Dynamic collection config reload
18
+ The `memory_update` tool handler SHALL reload the collection configuration file on every invocation, not use the cached startup value.
19
+
20
+ #### Scenario: Collection added after server start
21
+ - **WHEN** a user adds a collection via CLI (`collection add`) while the MCP server is running
22
+ - **THEN** calling `memory_update` through MCP indexes documents from the newly added collection
23
+ - **THEN** no server restart is required
24
+
25
+ #### Scenario: Collection removed after server start
26
+ - **WHEN** a user removes a collection via CLI while the MCP server is running
27
+ - **THEN** calling `memory_update` through MCP no longer indexes documents from the removed collection
28
+
29
+ ### Requirement: All MCP tool handlers return valid responses
30
+ Every registered MCP tool SHALL return a valid JSON-RPC response for valid inputs, never an unhandled exception.
31
+
32
+ #### Scenario: memory_search with valid query
33
+ - **WHEN** `memory_search` is called with `{"query": "test"}` via JSON-RPC
34
+ - **THEN** a valid response with `content` array is returned
35
+
36
+ #### Scenario: memory_update with configured collections
37
+ - **WHEN** `memory_update` is called via JSON-RPC with collections configured
38
+ - **THEN** a valid response with reindex summary is returned, not a runtime error
39
+
40
+ #### Scenario: memory_status returns health info
41
+ - **WHEN** `memory_status` is called via JSON-RPC
42
+ - **THEN** a valid response with document count, chunk count, and collection info is returned
43
+
44
+ ### Requirement: Search tools support workspace filtering
45
+ The `memory_search`, `memory_vsearch`, and `memory_query` MCP tools SHALL accept an optional `workspace` parameter. When omitted, results are scoped to the current workspace and global documents. When set to `"all"`, results include all workspaces.
46
+
47
+ #### Scenario: memory_search with default workspace scoping
48
+ - **WHEN** `memory_search` is called with `{"query": "test"}` and no `workspace` parameter
49
+ - **THEN** results are filtered to `currentProjectHash` and `'global'` documents only
50
+
51
+ #### Scenario: memory_vsearch with workspace="all"
52
+ - **WHEN** `memory_vsearch` is called with `{"query": "test", "workspace": "all"}`
53
+ - **THEN** results include documents from all workspaces
54
+
55
+ #### Scenario: memory_query with specific workspace
56
+ - **WHEN** `memory_query` is called with `{"query": "test", "workspace": "abc123def456"}`
57
+ - **THEN** results are filtered to documents matching `project_hash = 'abc123def456'` or `project_hash = 'global'`
58
+
59
+ ### Requirement: memory_status reports storage usage
60
+ The `memory_status` tool SHALL report per-workspace document counts and total storage size, in addition to existing health information.
61
+
62
+ #### Scenario: memory_status with workspace data
63
+ - **WHEN** `memory_status` is called after documents from multiple workspaces are indexed
64
+ - **THEN** the response includes a breakdown of document counts per workspace (projectHash)
65
+ - **THEN** the response includes total storage size (DB + sessions directory)
66
+ - **THEN** the response includes storage limit configuration (maxSize, retention, minFreeDisk)
67
+
68
+ ### Requirement: Search tool parameter schema includes workspace
69
+ The MCP tool registration for `memory_search`, `memory_vsearch`, and `memory_query` SHALL include `workspace` in their input schema as an optional string parameter with description explaining the scoping behavior.
70
+
71
+ #### Scenario: Tool schema advertises workspace parameter
72
+ - **WHEN** an MCP client lists available tools
73
+ - **THEN** `memory_search`, `memory_vsearch`, and `memory_query` each show a `workspace` parameter in their input schema
74
+ - **THEN** the parameter description explains: omit for current workspace, `"all"` for cross-workspace search
75
+
@@ -0,0 +1,29 @@
1
+ ## Requirements
2
+
3
+ ### Requirement: FTS5 query sanitization
4
+ The `searchFTS` function SHALL sanitize user queries before passing them to FTS5 `MATCH`. All user-provided query strings MUST be treated as literal search text, never as FTS5 syntax.
5
+
6
+ #### Scenario: Query containing hyphenated words
7
+ - **WHEN** user searches for `nano-brain`
8
+ - **THEN** the search treats the entire hyphenated term as a literal phrase, not as `nano NOT brain`
9
+
10
+ #### Scenario: Query containing FTS5 column names
11
+ - **WHEN** user searches for `memory architecture`
12
+ - **THEN** the search treats `memory` as a search term, not as a column reference
13
+ - **THEN** no `no such column` error is thrown
14
+
15
+ #### Scenario: Query containing FTS5 operators
16
+ - **WHEN** user searches for `AND OR NOT NEAR`
17
+ - **THEN** the search treats these as literal words, not as FTS5 boolean operators
18
+
19
+ #### Scenario: Query containing double quotes
20
+ - **WHEN** user searches for `he said "hello"`
21
+ - **THEN** internal double quotes are escaped and the search completes without SQL error
22
+
23
+ #### Scenario: Empty or whitespace-only query
24
+ - **WHEN** user searches for ` ` or empty string
25
+ - **THEN** the search returns an empty result set without error
26
+
27
+ #### Scenario: Normal multi-word query
28
+ - **WHEN** user searches for `sqlite vector search`
29
+ - **THEN** the search returns documents containing those terms, ranked by BM25 relevance
@@ -0,0 +1,94 @@
1
+ # storage-limits Specification
2
+
3
+ ## Purpose
4
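+ Keep nano-brain's on-disk footprint bounded: configurable storage limits (`maxSize`, `retention`, `minFreeDisk`), retention- and size-based eviction of harvested session files, a free-disk safety guard before writes, and periodic cleanup of orphaned embeddings. (Created by archiving change workspace-scoped-memory-and-storage-limits.)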
5
+ ## Requirements
6
+ ### Requirement: Storage configuration with safe defaults
7
+ The `config.yml` SHALL support a `storage` section with `maxSize`, `retention`, and `minFreeDisk` fields. All fields SHALL be optional with safe defaults: `maxSize: 2GB`, `retention: 90d`, `minFreeDisk: 100MB`.
8
+
9
+ #### Scenario: Config with all storage fields
10
+ - **WHEN** config.yml contains `storage: { maxSize: "1GB", retention: "30d", minFreeDisk: "200MB" }`
11
+ - **THEN** the server uses those values for eviction and disk safety
12
+
13
+ #### Scenario: Config with no storage section
14
+ - **WHEN** config.yml has no `storage` section
15
+ - **THEN** the server uses defaults: maxSize=2GB, retention=90d, minFreeDisk=100MB
16
+
17
+ #### Scenario: Config with partial storage section
18
+ - **WHEN** config.yml contains `storage: { maxSize: "500MB" }`
19
+ - **THEN** `maxSize` is 500MB, `retention` defaults to 90d, `minFreeDisk` defaults to 100MB
20
+
21
+ ### Requirement: Human-readable size and duration parsing
22
+ The storage config parser SHALL accept human-readable size strings (`500MB`, `2GB`, `1TB`) and duration strings (`30d`, `90d`, `1y`). For any invalid value, the parser SHALL log a warning and fall back to that field's default.
23
+
24
+ #### Scenario: Valid size string
25
+ - **WHEN** `maxSize` is set to `"2GB"`
26
+ - **THEN** it is parsed as 2,147,483,648 bytes
27
+
28
+ #### Scenario: Valid duration string
29
+ - **WHEN** `retention` is set to `"30d"`
30
+ - **THEN** it is parsed as 30 days (2,592,000,000 milliseconds)
31
+
32
+ #### Scenario: Invalid size string
33
+ - **WHEN** `maxSize` is set to `"banana"`
34
+ - **THEN** a warning is logged: `[storage] Invalid maxSize "banana", using default 2GB`
35
+ - **THEN** the default value of 2GB is used
36
+
37
+ ### Requirement: Retention-based eviction
38
+ During each harvest cycle, the system SHALL delete session markdown files older than the `retention` period and remove their corresponding documents from the SQLite database.
39
+
40
+ #### Scenario: Session older than retention period
41
+ - **WHEN** a session file has mtime older than `retention` (e.g., 91 days old with 90d retention)
42
+ - **THEN** the session markdown file is deleted from disk
43
+ - **THEN** the corresponding document rows are removed from the `documents` table
44
+
45
+ #### Scenario: Session within retention period
46
+ - **WHEN** a session file has mtime within the `retention` period (e.g., 30 days old with 90d retention)
47
+ - **THEN** the session file is not deleted
48
+ - **THEN** the document rows remain in the database
49
+
50
+ ### Requirement: Size-based eviction
51
+ After retention eviction, if total storage (SQLite DB + sessions directory) still exceeds `maxSize`, the system SHALL delete the oldest remaining session files until total size is under the limit.
52
+
53
+ #### Scenario: Storage exceeds maxSize after retention eviction
54
+ - **WHEN** total storage is 2.5GB and `maxSize` is 2GB after retention eviction
55
+ - **THEN** the oldest session files are deleted one by one
56
+ - **THEN** deletion stops when total size drops below 2GB
57
+
58
+ #### Scenario: Storage under maxSize
59
+ - **WHEN** total storage is 1.5GB and `maxSize` is 2GB
60
+ - **THEN** no size-based eviction occurs
61
+
62
+ ### Requirement: Original session JSON is never deleted
63
+ Eviction SHALL only remove harvested markdown files and their database entries. The original OpenCode session JSON files in `~/.local/share/opencode/storage/` SHALL never be touched by eviction.
64
+
65
+ #### Scenario: Session evicted
66
+ - **WHEN** a session is evicted due to retention or size limits
67
+ - **THEN** only the harvested markdown file in `~/.nano-brain/sessions/` is deleted
68
+ - **THEN** the original JSON in `~/.local/share/opencode/storage/sessions/` remains untouched
69
+
70
+ ### Requirement: Disk safety guard
71
+ Before any write operation (harvest, reindex, embed), the system SHALL check available disk space. If free disk space is below `minFreeDisk`, all write operations SHALL be skipped and a warning logged.
72
+
73
+ #### Scenario: Disk space below minFreeDisk
74
+ - **WHEN** available disk space is 50MB and `minFreeDisk` is 100MB
75
+ - **THEN** harvest, reindex, and embed operations are skipped
76
+ - **THEN** a warning is logged: `[storage] Disk space critically low (<100MB free), skipping writes`
77
+
78
+ #### Scenario: Disk space above minFreeDisk
79
+ - **WHEN** available disk space is 500MB and `minFreeDisk` is 100MB
80
+ - **THEN** all write operations proceed normally
81
+
82
+ #### Scenario: statfs unavailable
83
+ - **WHEN** `fs.statfs()` is not available (older Node.js or restricted environment)
84
+ - **THEN** the disk check is skipped with a warning: `[storage] statfs unavailable, disk safety check disabled`
85
+ - **THEN** all other storage limits (maxSize, retention) still function normally
86
+
87
+ ### Requirement: Orphan embedding cleanup
88
+ Periodically (every 10 harvest cycles), the system SHALL remove embedding vectors whose corresponding documents no longer exist in the `documents` table.
89
+
90
+ #### Scenario: Document deleted but embedding remains
91
+ - **WHEN** a document is evicted and its row removed from `documents`
92
+ - **THEN** on the next orphan cleanup cycle, the corresponding embedding vector is removed
93
+ - **THEN** no orphaned embeddings accumulate indefinitely
94
+