@nomos-arc/arc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160) hide show
  1. package/.claude/settings.local.json +10 -0
  2. package/.nomos-config.json +5 -0
  3. package/CLAUDE.md +108 -0
  4. package/LICENSE +190 -0
  5. package/README.md +569 -0
  6. package/dist/cli.js +21120 -0
  7. package/docs/auth/googel_plan.yaml +1093 -0
  8. package/docs/auth/google_task.md +235 -0
  9. package/docs/auth/hardened_blueprint.yaml +1658 -0
  10. package/docs/auth/red_team_report.yaml +336 -0
  11. package/docs/auth/session_state.yaml +162 -0
  12. package/docs/certificate/cer_enhance_plan.md +605 -0
  13. package/docs/certificate/certificate_report.md +338 -0
  14. package/docs/dev_overview.md +419 -0
  15. package/docs/feature_assessment.md +156 -0
  16. package/docs/how_it_works.md +78 -0
  17. package/docs/infrastructure/map.md +867 -0
  18. package/docs/init/master_plan.md +3581 -0
  19. package/docs/init/red_team_report.md +215 -0
  20. package/docs/init/report_phase_1a.md +304 -0
  21. package/docs/integrity-gate/enhance_drift.md +703 -0
  22. package/docs/integrity-gate/overview.md +108 -0
  23. package/docs/management/manger-task.md +99 -0
  24. package/docs/management/scafffold.md +76 -0
  25. package/docs/map/ATOMIC_BLUEPRINT.md +1349 -0
  26. package/docs/map/RED_TEAM_REPORT.md +159 -0
  27. package/docs/map/map_task.md +147 -0
  28. package/docs/map/semantic_graph_task.md +792 -0
  29. package/docs/map/semantic_master_plan.md +705 -0
  30. package/docs/phase7/TEAM_RED.md +249 -0
  31. package/docs/phase7/plan.md +1682 -0
  32. package/docs/phase7/task.md +275 -0
  33. package/docs/prompts/USAGE.md +312 -0
  34. package/docs/prompts/architect.md +165 -0
  35. package/docs/prompts/executer.md +190 -0
  36. package/docs/prompts/hardener.md +190 -0
  37. package/docs/prompts/red_team.md +146 -0
  38. package/docs/verification/goveranance-overview.md +396 -0
  39. package/docs/verification/governance-overview.md +245 -0
  40. package/docs/verification/verification-arc-ar.md +560 -0
  41. package/docs/verification/verification-architecture.md +560 -0
  42. package/docs/very_next.md +52 -0
  43. package/docs/whitepaper.md +89 -0
  44. package/overview.md +1469 -0
  45. package/package.json +63 -0
  46. package/src/adapters/__tests__/git.test.ts +296 -0
  47. package/src/adapters/__tests__/stdio.test.ts +70 -0
  48. package/src/adapters/git.ts +226 -0
  49. package/src/adapters/pty.ts +159 -0
  50. package/src/adapters/stdio.ts +113 -0
  51. package/src/cli.ts +83 -0
  52. package/src/commands/apply.ts +47 -0
  53. package/src/commands/auth.ts +301 -0
  54. package/src/commands/certificate.ts +89 -0
  55. package/src/commands/discard.ts +24 -0
  56. package/src/commands/drift.ts +116 -0
  57. package/src/commands/index.ts +78 -0
  58. package/src/commands/init.ts +121 -0
  59. package/src/commands/list.ts +75 -0
  60. package/src/commands/map.ts +55 -0
  61. package/src/commands/plan.ts +30 -0
  62. package/src/commands/review.ts +58 -0
  63. package/src/commands/run.ts +63 -0
  64. package/src/commands/search.ts +147 -0
  65. package/src/commands/show.ts +63 -0
  66. package/src/commands/status.ts +59 -0
  67. package/src/core/__tests__/budget.test.ts +213 -0
  68. package/src/core/__tests__/certificate.test.ts +385 -0
  69. package/src/core/__tests__/config.test.ts +191 -0
  70. package/src/core/__tests__/preflight.test.ts +24 -0
  71. package/src/core/__tests__/prompt.test.ts +358 -0
  72. package/src/core/__tests__/review.test.ts +161 -0
  73. package/src/core/__tests__/state.test.ts +362 -0
  74. package/src/core/auth/__tests__/manager.test.ts +166 -0
  75. package/src/core/auth/__tests__/server.test.ts +220 -0
  76. package/src/core/auth/gcp-projects.ts +160 -0
  77. package/src/core/auth/manager.ts +114 -0
  78. package/src/core/auth/server.ts +141 -0
  79. package/src/core/budget.ts +119 -0
  80. package/src/core/certificate.ts +502 -0
  81. package/src/core/config.ts +212 -0
  82. package/src/core/errors.ts +54 -0
  83. package/src/core/factory.ts +49 -0
  84. package/src/core/graph/__tests__/builder.test.ts +272 -0
  85. package/src/core/graph/__tests__/contract-writer.test.ts +175 -0
  86. package/src/core/graph/__tests__/enricher.test.ts +299 -0
  87. package/src/core/graph/__tests__/parser.test.ts +200 -0
  88. package/src/core/graph/__tests__/pipeline.test.ts +202 -0
  89. package/src/core/graph/__tests__/renderer.test.ts +128 -0
  90. package/src/core/graph/__tests__/resolver.test.ts +185 -0
  91. package/src/core/graph/__tests__/scanner.test.ts +231 -0
  92. package/src/core/graph/__tests__/show.test.ts +134 -0
  93. package/src/core/graph/builder.ts +303 -0
  94. package/src/core/graph/constraints.ts +94 -0
  95. package/src/core/graph/contract-writer.ts +93 -0
  96. package/src/core/graph/drift/__tests__/classifier.test.ts +215 -0
  97. package/src/core/graph/drift/__tests__/comparator.test.ts +335 -0
  98. package/src/core/graph/drift/__tests__/drift.test.ts +453 -0
  99. package/src/core/graph/drift/__tests__/reporter.test.ts +203 -0
  100. package/src/core/graph/drift/classifier.ts +165 -0
  101. package/src/core/graph/drift/comparator.ts +205 -0
  102. package/src/core/graph/drift/reporter.ts +77 -0
  103. package/src/core/graph/enricher.ts +251 -0
  104. package/src/core/graph/grammar-paths.ts +30 -0
  105. package/src/core/graph/html-template.ts +493 -0
  106. package/src/core/graph/map-schema.ts +137 -0
  107. package/src/core/graph/parser.ts +336 -0
  108. package/src/core/graph/pipeline.ts +209 -0
  109. package/src/core/graph/renderer.ts +92 -0
  110. package/src/core/graph/resolver.ts +195 -0
  111. package/src/core/graph/scanner.ts +145 -0
  112. package/src/core/logger.ts +46 -0
  113. package/src/core/orchestrator.ts +792 -0
  114. package/src/core/plan-file-manager.ts +66 -0
  115. package/src/core/preflight.ts +64 -0
  116. package/src/core/prompt.ts +173 -0
  117. package/src/core/review.ts +95 -0
  118. package/src/core/state.ts +294 -0
  119. package/src/core/worktree-coordinator.ts +77 -0
  120. package/src/search/__tests__/chunk-extractor.test.ts +339 -0
  121. package/src/search/__tests__/embedder-auth.test.ts +124 -0
  122. package/src/search/__tests__/embedder.test.ts +267 -0
  123. package/src/search/__tests__/graph-enricher.test.ts +178 -0
  124. package/src/search/__tests__/indexer.test.ts +518 -0
  125. package/src/search/__tests__/integration.test.ts +649 -0
  126. package/src/search/__tests__/query-engine.test.ts +334 -0
  127. package/src/search/__tests__/similarity.test.ts +78 -0
  128. package/src/search/__tests__/vector-store.test.ts +281 -0
  129. package/src/search/chunk-extractor.ts +167 -0
  130. package/src/search/embedder.ts +209 -0
  131. package/src/search/graph-enricher.ts +95 -0
  132. package/src/search/indexer.ts +483 -0
  133. package/src/search/lexical-searcher.ts +190 -0
  134. package/src/search/query-engine.ts +225 -0
  135. package/src/search/vector-store.ts +311 -0
  136. package/src/types/index.ts +572 -0
  137. package/src/utils/__tests__/ansi.test.ts +54 -0
  138. package/src/utils/__tests__/frontmatter.test.ts +79 -0
  139. package/src/utils/__tests__/sanitize.test.ts +229 -0
  140. package/src/utils/ansi.ts +19 -0
  141. package/src/utils/context.ts +44 -0
  142. package/src/utils/frontmatter.ts +27 -0
  143. package/src/utils/sanitize.ts +78 -0
  144. package/test/e2e/lifecycle.test.ts +330 -0
  145. package/test/fixtures/mock-planner-hang.ts +5 -0
  146. package/test/fixtures/mock-planner.ts +26 -0
  147. package/test/fixtures/mock-reviewer-bad.ts +8 -0
  148. package/test/fixtures/mock-reviewer-retry.ts +34 -0
  149. package/test/fixtures/mock-reviewer.ts +18 -0
  150. package/test/fixtures/sample-project/src/circular-a.ts +6 -0
  151. package/test/fixtures/sample-project/src/circular-b.ts +6 -0
  152. package/test/fixtures/sample-project/src/config.ts +15 -0
  153. package/test/fixtures/sample-project/src/main.ts +19 -0
  154. package/test/fixtures/sample-project/src/services/product-service.ts +20 -0
  155. package/test/fixtures/sample-project/src/services/user-service.ts +18 -0
  156. package/test/fixtures/sample-project/src/types.ts +14 -0
  157. package/test/fixtures/sample-project/src/utils/index.ts +14 -0
  158. package/test/fixtures/sample-project/src/utils/validate.ts +12 -0
  159. package/tsconfig.json +20 -0
  160. package/vitest.config.ts +12 -0
@@ -0,0 +1,1682 @@
1
+ # Atomic Implementation Blueprint — Phase 7: Global Semantic Search (Remediated)
2
+
3
+ **Source Specification:** `docs/phase7/task.md`
4
+ **Red Team Audit:** `docs/phase7/TEAM_RED.md`
5
+ **Predecessor Phases:** 0–6 (complete). `project_map.json` and `*.semantic.md` files exist.
6
+ **Status:** Ready for execution — all Critical Blockers neutralized.
7
+
8
+ ---
9
+
10
+ ## 0. Executive Summary of Fixes
11
+
12
+ Every finding from the Red Team Adversarial Audit has been addressed. The following table maps each finding to its resolution in this remediated plan.
13
+
14
+ ### Critical Blockers — Neutralized
15
+
16
+ | Finding | Original Flaw | Resolution | Location |
17
+ |---|---|---|---|
18
+ | **BLOCKER-1** | `reset()` + `upsert()` not atomic — zero-availability window | **Table-swap strategy**: full re-index writes to the staging table `nomos_vectors_staging`, then promotes it to replace the live table `nomos_vectors`. Search always hits a live table. | Step 7.1.1 |
19
+ | **BLOCKER-2** | `index-meta.json` written after vector upsert — crash leaves inconsistent state | **Status field**: metadata written at start with `status: "in_progress"`, updated to `"complete"` after upsert. On startup with `in_progress`, force full re-index. | Steps 7.0.5, 7.4.1 |
20
+ | **BLOCKER-3** | No vector dimension validation on incremental index | **Dimension guard**: incremental flow step 2 compares `IndexMetadata.embedding_model` + `vector_dimensions` against current config. Mismatch → forced full re-index with warning. | Step 7.4.1 |
21
+ | **BLOCKER-4** | All vectors accumulated in memory before upsert | **Streaming batch upsert**: after each embedding batch completes, compose records and upsert immediately. References released per-batch. | Step 7.4.1 |
22
+
23
+ ### Ambiguity Traps — Eliminated
24
+
25
+ | Finding | Original Flaw | Resolution | Location |
26
+ |---|---|---|---|
27
+ | **TRAP-1** | `mergeInsert` fallback strategy undefined | **Explicit version detection**: check for `mergeInsert` method existence on table prototype at init time. Store capability flag. Fallback uses single-transaction `overwrite` mode, not delete-then-add. | Step 7.1.1 |
28
+ | **TRAP-2** | `max_concurrent_requests` contradicts "process sequentially" | **Removed `max_concurrent_requests`**. Replaced with `embedding_requests_per_minute` (rate limiter). Batches processed sequentially with delay. One config, one behavior. | Step 7.0.4 |
29
+ | **TRAP-3** | De-duplication logic vague ("takes priority") | **Hard rule**: if a symbol result and its parent file result are both present and within 0.05 absolute similarity, **remove the file-level result**. Keep only the more specific symbol result. | Step 7.5.2 |
30
+ | **TRAP-4** | `graph_depth: -1` sentinel leaks to CLI output | **Stale result handling**: results with `graph_depth === -1` display `"⚠ Stale — file removed since last index"` instead of depth. No negative numbers in output. | Steps 7.5.1, 7.6.2 |
31
+ | **TRAP-5** | `.semantic.md` path derivation fragile for multi-extension files | **Lookup via `project_map.json` `semantic` field**: if `FileNode.semantic` is non-null, the data is inline — no file read needed. If null, skip `.semantic.md` read entirely. No regex-based path derivation. | Step 7.3.1 |
32
+
33
+ ### Resilience Gaps — Sealed
34
+
35
+ | Finding | Original Flaw | Resolution | Location |
36
+ |---|---|---|---|
37
+ | **GAP-1** | Partial embedding failure → permanent blind spots | **`failed_files: string[]`** in `IndexMetadata`. Incremental index treats failed files as "changed" regardless of hash. Cleared on successful re-embed. | Steps 7.0.5, 7.4.1 |
38
+ | **GAP-2** | LanceDB corruption / version mismatch undetected | **`lancedb.connect()` wrapped in try-catch** with actionable error: `"Vector index corrupted. Run: arc index --force"`. **LanceDB pinned to exact version** (`"0.14.3"`, not `"^0.14.3"`). | Steps 7.0.1, 7.1.1 |
39
+ | **GAP-3** | SIGINT between `reset()` and `upsert()` → unrecoverable empty store | **Eliminated by table-swap** (BLOCKER-1 fix). SIGINT during indexing leaves the old table intact. The orphaned staging table is harmless — `cleanupStaging()` removes it at the start of the next index run. | Step 7.4.1 |
40
+ | **GAP-4** | No timeout on Gemini API calls | **30-second `AbortController` timeout** on every embedding request. Timeout throws `NomosError('search_embedding_failed', 'Embedding request timed out after 30s')`. | Step 7.2.1 |
41
+ | **GAP-5** | `project_map.json` loaded on every search query | **Lazy-loaded and cached** in `GraphEnricher`. Parsed once per `QueryEngine` instance lifetime. For CLI usage (one search per process), this eliminates redundant parsing. | Step 7.5.1 |
42
+
43
+ ### Secondary Concerns — Addressed
44
+
45
+ | Finding | Resolution | Location |
46
+ |---|---|---|
47
+ | **S-1** | PF-5 changed: if `src/search/` exists, verify known files present (idempotent re-entry). | Pre-Flight |
48
+ | **S-2** | `arc index --dry-run` added — extracts and counts chunks without embedding or writing. | Step 7.6.1 |
49
+ | **S-3** | Distance-to-similarity conversion centralized in `VectorStore.query()`. Assertion: returned similarity ∈ [0, 1]. | Step 7.1.1 |
50
+ | **S-4** | Mock-embedder integration test added — runs in CI with deterministic fake vectors. | Step 7.7.1 |
51
+ | **S-5** | `SearchResult` type excludes `vector` field. JSON output verified to not leak raw vectors. | Step 7.0.5 |
52
+ | **S-6** | `content_hash` computed from raw inputs (file path + semantic data + symbol signatures), not composed text. | Step 7.3.1 |
53
+ | **S-7** | Build script modification reads current script, inserts `--external` flags before `--banner`. | Step 7.0.2 |
54
+ | **S-8** | Rate limit delays logged: `"[nomos:search:warn] Rate limited. Waiting {N}ms before next batch..."`. | Step 7.2.1 |
55
+
56
+ ---
57
+
58
+ ## 1. Pre-Flight Checklist
59
+
60
+ Every check must pass before execution begins. Failure on any check = STOP.
61
+
62
+ | # | Check | Command | Expected |
63
+ |---|-------|---------|----------|
64
+ | PF-1 | Node.js >= 20 | `node --version` | Output starts with `v20` or higher |
65
+ | PF-2 | Project root valid | `ls package.json src/cli.ts` | Both exist, exit code 0 |
66
+ | PF-3 | Clean dependency install | `npm install` | Exit code 0 |
67
+ | PF-4 | Existing tests pass | `npx vitest run` | 0 failures |
68
+ | PF-5 | `src/search/` is either absent OR contains Phase 7 files | `ls src/search/vector-store.ts 2>&1` | Either "No such file" (fresh) OR file exists (re-entry). If `src/search/` exists but `vector-store.ts` does not → STOP: unknown directory, investigate. |
69
+ | PF-6 | `project_map.json` exists | `ls tasks-management/graph/project_map.json` | File exists |
70
+ | PF-7 | Git working tree clean | `git status --porcelain` | Empty output (or acknowledged untracked) |
71
+ | PF-8 | `GEMINI_API_KEY` set | `echo $GEMINI_API_KEY \| head -c 5` | Non-empty (first 5 chars visible) |
72
+ | PF-9 | Existing graph module intact | `ls src/core/graph/pipeline.ts` | File exists, exit code 0 |
73
+ | PF-10 | Build script readable | `node -e "const p=JSON.parse(require('fs').readFileSync('package.json'));console.log(p.scripts.build.includes('--banner'))"` | `true` — confirms `--banner` flag exists as insertion anchor |
74
+
75
+ ---
76
+
77
+ ## 2. Atomic Execution Sequence
78
+
79
+ ---
80
+
81
+ ### Task 7.0: Prerequisites — Dependencies & Configuration
82
+
83
+ ---
84
+
85
+ #### Step 7.0.1: Install LanceDB (Pinned Version)
86
+
87
+ **Pre-Condition:** PF-1 through PF-7 pass. `package.json` exists at project root.
88
+
89
+ **Action:**
90
+ ```bash
91
+ npm install @lancedb/lancedb@0.14.3 apache-arrow@18.1.0
92
+ ```
93
+
94
+ **Why pinned versions (GAP-2):** LanceDB is pre-1.0. No semver stability guarantees. A minor version bump could change the on-disk format, break the Arrow schema, or alter the `mergeInsert` API. Apache Arrow is pinned as its peer dependency for schema compatibility.
95
+
96
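+ **Post-install:** Manually edit `package.json` to ensure the version specifiers are exact (`"0.14.3"`, not `"^0.14.3"`). `npm install` by default writes `^` — this MUST be stripped. (Alternatively, run the install with `--save-exact` so npm writes the exact versions itself; the validation greps below still apply.)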
97
+
98
+ **Validation:**
99
+ ```bash
100
+ node -e "import('@lancedb/lancedb').then(()=>console.log('ok'))" # → "ok"
101
+ node -e "import('apache-arrow').then(()=>console.log('ok'))" # → "ok"
102
+ grep '"@lancedb/lancedb": "0.14.3"' package.json # → exact match (no ^)
103
+ grep '"apache-arrow": "18.1.0"' package.json # → exact match (no ^)
104
+ ```
105
+
106
+ **Rollback:** `npm uninstall @lancedb/lancedb apache-arrow`
107
+
108
+ **Idempotency:** `npm install` with already-installed packages is a no-op. Version pin check is idempotent.
109
+
110
+ ---
111
+
112
+ #### Step 7.0.2: Update esbuild externals
113
+
114
+ **Pre-Condition:** Step 7.0.1 complete. `package.json` `build` script exists.
115
+
116
+ **Action:** Read the current `build` script from `package.json`. Locate the `--banner:js=` flag. Insert `--external:@lancedb/lancedb --external:apache-arrow` immediately **before** the `--banner` flag. This preserves argument ordering and avoids breaking existing esbuild flag positions.
117
+
118
+ Do NOT blindly append to the end of the script string.
119
+
120
+ **Validation:**
121
+ ```bash
122
+ grep 'external:@lancedb/lancedb' package.json # → match
123
+ grep 'external:apache-arrow' package.json # → match
124
+ npm run build # → exit code 0
125
+ node dist/cli.js --help # → shows help without crash
126
+ ```
127
+
128
+ **Rollback:** `git checkout package.json && npm install`
129
+
130
+ **Idempotency:** Check if `--external:@lancedb/lancedb` already exists in the build script before modifying.
131
+
132
+ ---
133
+
134
+ #### Step 7.0.3: Add new `NomosErrorCode` values
135
+
136
+ **Pre-Condition:** `src/core/errors.ts` exists. Contains `NomosErrorCode` union type.
137
+
138
+ **Action:** Add six new error code literals to the `NomosErrorCode` union type, immediately after `'graph_write_failed'`:
139
+ ```typescript
140
+ | 'search_index_not_found'
141
+ | 'search_index_failed'
142
+ | 'search_index_corrupted'
143
+ | 'search_embedding_failed'
144
+ | 'search_query_failed'
145
+ | 'search_api_key_missing'
146
+ ```
147
+
148
+ Note: `search_index_corrupted` is new (addresses GAP-2 — explicit error code for corrupted/unreadable vector store).
149
+
150
+ **Validation:**
151
+ ```bash
152
+ grep 'search_index_not_found' src/core/errors.ts # → exactly 1 match
153
+ grep 'search_index_corrupted' src/core/errors.ts # → exactly 1 match
154
+ npx tsc --noEmit # → exit code 0
155
+ ```
156
+
157
+ **Rollback:** `git checkout src/core/errors.ts`
158
+
159
+ **Idempotency:** Check if `'search_index_not_found'` already exists before inserting. If present, skip.
160
+
161
+ ---
162
+
163
+ #### Step 7.0.4: Extend `NomosConfig` with `search` section
164
+
165
+ **Pre-Condition:** `src/core/config.ts` and `src/types/index.ts` exist.
166
+
167
+ **Action (types/index.ts):** Add `search` section to the `NomosConfig` interface:
168
+ ```typescript
169
+ search: {
170
+ embedding_model: string;
171
+ embedding_dimensions: number;
172
+ vector_store_path: string;
173
+ default_top_k: number;
174
+ default_threshold: number;
175
+ batch_size: number;
176
+ embedding_requests_per_minute: number;
177
+ request_timeout_ms: number;
178
+ };
179
+ ```
180
+
181
+ **TRAP-2 resolution:** `max_concurrent_requests` is **removed**. It implied concurrent batch processing, but the embedder processes sequentially with rate-limit delays. A dead config that implies a non-existent capability is worse than no config. Replaced by `embedding_requests_per_minute` (already present) and `request_timeout_ms` (new, addresses GAP-4).
182
+
183
+ **Action (config.ts):** Add `SearchConfigSchema` and register it in `NomosConfigSchema`:
184
+ ```typescript
185
+ const SearchConfigSchema = z.object({
186
+ embedding_model: z.string().default('gemini-embedding-001'),
187
+ embedding_dimensions: z.number().int().positive().default(768),
188
+ vector_store_path: z.string().default('tasks-management/graph/vector_index'),
189
+ default_top_k: z.number().int().positive().default(5),
190
+ default_threshold: z.number().min(0).max(1).default(0.7),
191
+ batch_size: z.number().int().positive().max(100).default(50),
192
+ embedding_requests_per_minute: z.number().int().positive().default(300),
193
+ request_timeout_ms: z.number().int().positive().default(30_000),
194
+ });
195
+ ```
196
+
197
+ Add to `NomosConfigSchema`:
198
+ ```typescript
199
+ search: SearchConfigSchema.default(() => SearchConfigSchema.parse({})),
200
+ ```
201
+
202
+ **Validation:**
203
+ ```bash
204
+ npx tsc --noEmit # → exit code 0
205
+ npx vitest run # → 0 failures (no regressions)
206
+ node -e "
207
+ import { NomosConfigSchema } from './src/core/config.js';
208
+ const c = NomosConfigSchema.parse({});
209
+ console.log(c.search.embedding_model); // → 'gemini-embedding-001'
210
+ console.log(c.search.default_top_k); // → 5
211
+ console.log(c.search.embedding_dimensions); // → 768
212
+ console.log(c.search.request_timeout_ms); // → 30000
213
+ console.log(Object.keys(c.search).includes('max_concurrent_requests')); // → false
214
+ "
215
+ ```
216
+
217
+ **Rollback:** `git checkout src/core/config.ts src/types/index.ts`
218
+
219
+ **Idempotency:** Check if `SearchConfigSchema` already exists before adding.
220
+
221
+ ---
222
+
223
+ #### Step 7.0.5: Add search types to `types/index.ts`
224
+
225
+ **Pre-Condition:** Step 7.0.4 complete.
226
+
227
+ **Action:** Append new type definitions to `src/types/index.ts`:
228
+
229
+ ```typescript
230
+ // ─── Semantic Search Types ──────────────────────────────────────────────────
231
+
232
+ export type ChunkType = 'file' | 'symbol';
233
+
234
+ export interface TextChunk {
235
+ id: string; // "src/foo.ts" or "src/foo.ts::MyClass"
236
+ type: ChunkType;
237
+ file_path: string;
238
+ text: string; // concatenated searchable text
239
+ symbol_name: string | null; // non-null for symbol-level chunks
240
+ symbol_type: string | null; // 'function' | 'class' | etc.
241
+ line_start: number | null;
242
+ line_end: number | null;
243
+ parent_file_id: string | null; // for symbol chunks, points to parent file chunk
244
+ content_hash: string; // SHA-256 of raw inputs (NOT composed text) [S-6]
245
+ }
246
+
247
+ export interface VectorRecord {
248
+ id: string;
249
+ type: ChunkType;
250
+ vector: Float32Array;
251
+ file_path: string;
252
+ module: string; // directory name or logical module
253
+ purpose: string;
254
+ symbol_name: string | null;
255
+ symbol_type: string | null;
256
+ line_start: number | null;
257
+ line_end: number | null;
258
+ parent_file_id: string | null;
259
+ graph_depth: number;
260
+ dependents_count: number;
261
+ last_indexed: string; // ISO 8601
262
+ content_hash: string; // SHA-256 of raw inputs
263
+ }
264
+
265
+ /**
266
+ * SearchResult — returned to consumers.
267
+ * CRITICAL [S-5]: Does NOT include `vector` field.
268
+ * JSON.stringify(SearchResult) must never leak raw Float32Array data.
269
+ */
270
+ export interface SearchResult {
271
+ id: string;
272
+ type: ChunkType;
273
+ file_path: string;
274
+ symbol_name: string | null;
275
+ symbol_type: string | null;
276
+ line_start: number | null;
277
+ line_end: number | null;
278
+ purpose: string;
279
+ similarity_score: number; // always in [0, 1] — enforced by VectorStore.query()
280
+ graph_depth: number; // -1 = stale (file deleted since last index)
281
+ dependents_count: number;
282
+ is_core_module: boolean;
283
+ is_stale: boolean; // true when graph_depth === -1 [TRAP-4]
284
+ }
285
+
286
+ export type IndexStatus = 'in_progress' | 'complete';
287
+
288
+ export interface IndexMetadata {
289
+ status: IndexStatus; // [BLOCKER-2] written at start and end of indexing
290
+ last_full_index: string; // ISO 8601
291
+ last_incremental_index: string | null;
292
+ total_files_indexed: number;
293
+ total_symbols_indexed: number;
294
+ total_chunks: number;
295
+ embedding_model: string; // [BLOCKER-3] compared on incremental index
296
+ vector_dimensions: number; // [BLOCKER-3] compared on incremental index
297
+ failed_files: string[]; // [GAP-1] files that failed embedding — re-indexed next run
298
+ files: Record<string, {
299
+ last_indexed: string;
300
+ content_hash: string;
301
+ chunk_count: number;
302
+ }>;
303
+ }
304
+ ```
305
+
306
+ **Validation:**
307
+ ```bash
308
+ npx tsc --noEmit # → exit code 0
309
+ ```
310
+
311
+ **Rollback:** `git checkout src/types/index.ts`
312
+
313
+ **Idempotency:** Check if `ChunkType` already exists before appending.
314
+
315
+ ---
316
+
317
+ #### Step 7.0.6: Update `.gitignore`
318
+
319
+ **Pre-Condition:** `.gitignore` exists at project root.
320
+
321
+ **Action:** Append the following lines to `.gitignore` (if not already present):
322
+ ```
323
+ # Phase 7: Vector search index (local, regenerable)
324
+ tasks-management/graph/vector_index/
325
+ ```
326
+
327
+ **Validation:**
328
+ ```bash
329
+ grep 'vector_index' .gitignore # → match
330
+ ```
331
+
332
+ **Rollback:** Remove the added lines from `.gitignore`.
333
+
334
+ **Idempotency:** Grep for `vector_index` before appending. If present, skip.
335
+
336
+ ---
337
+
338
+ ### Task 7.1: Vector Store — LanceDB Interface (Table-Swap Architecture)
339
+
340
+ ---
341
+
342
+ #### Step 7.1.1: Create `src/search/vector-store.ts`
343
+
344
+ **Pre-Condition:** Steps 7.0.1–7.0.5 complete. `@lancedb/lancedb` installed. Types defined.
345
+
346
+ **Action:** Create `src/search/vector-store.ts` implementing the `VectorStore` class.
347
+
348
+ **Class API:**
349
+
350
+ ```typescript
351
+ export class VectorStore {
352
+ private db: Connection | null = null;
353
+ private hasMergeInsert: boolean = false; // [TRAP-1] detected at init
354
+
355
+ constructor(
356
+ private readonly storePath: string,
357
+ private readonly logger: Logger,
358
+ );
359
+
360
+ /** Initialize the DB connection. Detects mergeInsert capability. [GAP-2] wrapped in try-catch. */
361
+ async init(): Promise<void>;
362
+
363
+ /**
364
+ * Upsert a batch of vector records into the LIVE table.
365
+ * Uses mergeInsert if available; otherwise overwrites by id.
366
+ * Called per-batch during indexing [BLOCKER-4] — NOT after accumulating all records.
367
+ */
368
+ async upsert(records: VectorRecord[]): Promise<void>;
369
+
370
+ /**
371
+ * Write records to a TEMPORARY table for full re-index [BLOCKER-1].
372
+ * Does NOT touch the live table. Called per-batch.
373
+ */
374
+ async upsertToStaging(records: VectorRecord[]): Promise<void>;
375
+
376
+ /**
377
+ * Atomic table swap: drop live table, rename staging → live [BLOCKER-1].
378
+ * If swap fails, the live table remains untouched.
379
+ */
380
+ async promoteStagingToLive(): Promise<void>;
381
+
382
+ /** Drop the staging table if it exists (cleanup after failed index). */
383
+ async cleanupStaging(): Promise<void>;
384
+
385
+ /**
386
+ * Query the LIVE table with a vector. Returns top-K results above threshold.
387
+ * Cosine distance → similarity conversion happens HERE and ONLY here [S-3].
388
+ * Returns similarity_score ∈ [0, 1] — asserted before return.
389
+ */
390
+ async query(
391
+ vector: Float32Array,
392
+ topK: number,
393
+ threshold: number,
394
+ ): Promise<Array<Omit<VectorRecord, 'vector'> & { similarity_score: number }>>;
395
+
396
+ /**
397
+ * Delete all records whose file_path matches any of the given paths.
398
+ * Used for incremental re-indexing: delete stale → upsert fresh.
399
+ */
400
+ async deleteByFilePaths(filePaths: string[]): Promise<void>;
401
+
402
+ /** Return total record count in live table. */
403
+ async count(): Promise<number>;
404
+ }
405
+ ```
406
+
407
+ **Internal Design:**
408
+
409
+ 1. **Connection [GAP-2]:**
410
+ ```typescript
411
+ async init(): Promise<void> {
412
+ try {
413
+ this.db = await lancedb.connect(this.storePath);
414
+ } catch (err) {
415
+ throw new NomosError(
416
+ 'search_index_corrupted',
417
+ `Failed to open vector store at ${this.storePath}. ` +
418
+ `The index may be corrupted. Run: arc index --force\n` +
419
+ `Original error: ${err instanceof Error ? err.message : String(err)}`
420
+ );
421
+ }
422
+ // [TRAP-1] Detect mergeInsert capability
423
+ try {
424
+ const names = await this.db.tableNames();
425
+ if (names.includes(LIVE_TABLE)) {
426
+ const table = await this.db.openTable(LIVE_TABLE);
427
+ this.hasMergeInsert = typeof table.mergeInsert === 'function';
428
+ }
429
+ } catch {
430
+ this.hasMergeInsert = false;
431
+ }
432
+ }
433
+ ```
434
+
435
+ 2. **Table names:**
436
+ ```typescript
437
+ const LIVE_TABLE = 'nomos_vectors';
438
+ const STAGING_TABLE = 'nomos_vectors_staging';
439
+ ```
440
+
441
+ 3. **Full re-index flow [BLOCKER-1] — Table-swap strategy:**
442
+ - Indexer calls `cleanupStaging()` at start (remove orphaned staging table from prior crash).
443
+ - For each embedding batch, indexer calls `upsertToStaging(batchRecords)`.
444
+ - After all batches complete, indexer calls `promoteStagingToLive()`.
445
+ - `promoteStagingToLive()` implementation:
446
+ ```typescript
447
+ async promoteStagingToLive(): Promise<void> {
448
+ const names = await this.db!.tableNames();
449
+ if (!names.includes(STAGING_TABLE)) {
450
+ throw new NomosError('search_index_failed', 'Staging table does not exist. Index may have failed.');
451
+ }
452
+ // Drop old live table if it exists
453
+ if (names.includes(LIVE_TABLE)) {
454
+ await this.db!.dropTable(LIVE_TABLE);
455
+ }
456
+ // Rename staging → live
457
+ // LanceDB does not have a native rename. Workaround:
458
+ // Read all from staging, create new live table, drop staging.
459
+ const staging = await this.db!.openTable(STAGING_TABLE);
460
+ const allData = await staging.query().toArray();
461
+ await this.db!.createTable(LIVE_TABLE, allData, { mode: 'overwrite' });
462
+ await this.db!.dropTable(STAGING_TABLE);
463
+ }
464
+ ```
465
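+ - **SIGINT safety [GAP-3]:** If SIGINT fires during indexing, the live table is never touched (staging is the only write target). On next run, `cleanupStaging()` removes the orphaned staging table. The one remaining exposure is inside `promoteStagingToLive()` itself: between `dropTable(LIVE_TABLE)` and `createTable(LIVE_TABLE, …)` no live table exists, so an interruption or failure there leaves the store empty until the next full index. Because `createTable(..., { mode: 'overwrite' })` already replaces an existing table, the explicit drop can be omitted to close that window.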
466
+
467
+ 4. **Upsert strategy [TRAP-1]:**
468
+ - If `hasMergeInsert === true`: use `table.mergeInsert('id')` for incremental upserts to the live table.
469
+ - If `hasMergeInsert === false`: use `table.add(records, { mode: 'overwrite' })`, which replaces all data in the table in a single write. For incremental upserts, fall back to: read existing data into memory, merge by `id`, then write the merged set back with `mode: 'overwrite'`. This is safe because incremental re-indexes only change a subset of files.
470
+ - **No delete-then-add fallback.** Delete-then-add creates a window where records are missing.
471
+
472
+ 5. **Query with centralized distance conversion [S-3]:**
473
+ ```typescript
474
+ async query(vector, topK, threshold): Promise<...> {
475
+ const table = await this.db!.openTable(LIVE_TABLE);
476
+ const raw = await table.vectorSearch(vector)
477
+ .distanceType('cosine')
478
+ .limit(topK * 2) // over-fetch to allow post-filter
479
+ .toArray();
480
+
481
+ return raw
482
+ .map(r => {
483
+ const similarity = 1 - r._distance;
484
+ // [S-3] Clamp similarity into [0, 1] — cosine distance spans [0, 2], so 1 - distance can go negative
485
+ const clamped = Math.max(0, Math.min(1, similarity));
486
+ return { ...r, similarity_score: clamped, vector: undefined };
487
+ })
488
+ .filter(r => r.similarity_score >= threshold)
489
+ .slice(0, topK);
490
+ }
491
+ ```
492
+ The `vector` field is stripped from query results — it MUST NOT appear in any output.
493
+
494
+ **Validation:**
495
+ ```bash
496
+ npx tsc --noEmit # → exit code 0
497
+ # Unit test (Step 7.1.2):
498
+ npx vitest run src/search/__tests__/vector-store.test.ts
499
+ ```
500
+
501
+ **Rollback:** `rm src/search/vector-store.ts`
502
+
503
+ ---
504
+
505
+ #### Step 7.1.2: Unit test for `VectorStore`
506
+
507
+ **Pre-Condition:** Step 7.1.1 complete.
508
+
509
+ **Action:** Create `src/search/__tests__/vector-store.test.ts`.
510
+
511
+ **Test cases:**
512
+ 1. `init()` creates the DB directory if it does not exist.
513
+ 2. `init()` wraps connection failure in `NomosError('search_index_corrupted')` [GAP-2].
514
+ 3. `upsert()` inserts records; `count()` returns correct total.
515
+ 4. `upsert()` with duplicate `id` overwrites (not duplicates) — regardless of `mergeInsert` availability.
516
+ 5. `query()` returns results ranked by cosine similarity (closest first).
517
+ 6. `query()` with threshold filters out low-similarity results.
518
+ 7. `query()` returns `similarity_score` ∈ [0, 1] — never negative, never > 1 [S-3].
519
+ 8. `query()` results do NOT contain `vector` field [S-5].
520
+ 9. `deleteByFilePaths()` removes only matching records.
521
+ 10. **Table-swap full cycle [BLOCKER-1]:**
522
+ - `upsertToStaging()` writes to staging table.
523
+ - During staging, `query()` on live table still returns old data (zero-downtime).
524
+ - `promoteStagingToLive()` swaps tables atomically.
525
+ - After promotion, `query()` returns new data.
526
+ 11. `cleanupStaging()` removes orphaned staging table without affecting live table [GAP-3].
527
+ 12. Concurrent `upsert()` calls do not corrupt the store.
528
+
529
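+ **Test setup:** Create a fresh temporary directory for each test (e.g., `fs.mkdtemp(path.join(os.tmpdir(), 'vector-store-'))`) — never use `os.tmpdir()` itself as the store path. Clean up that per-test directory with `fs.rm(dir, { recursive: true })` in `afterEach`.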
530
+
531
+ **Validation:**
532
+ ```bash
533
+ npx vitest run src/search/__tests__/vector-store.test.ts # → 0 failures
534
+ ```
535
+
536
+ ---
537
+
538
+ ### Task 7.2: Embedding Client — Gemini `gemini-embedding-001`
539
+
540
+ ---
541
+
542
+ #### Step 7.2.1: Create `src/search/embedder.ts`
543
+
544
+ **Pre-Condition:** `@google/generative-ai` already installed (Phase 6). Config search section defined.
545
+
546
+ **Action:** Create `src/search/embedder.ts` implementing the `Embedder` class.
547
+
548
+ **Class API:**
549
+
550
+ ```typescript
551
+ export class Embedder {
552
+ constructor(config: NomosConfig['search'], logger: Logger);
553
+
554
+ /**
555
+ * Embed a single text string. Returns a Float32Array vector.
556
+ * Used for query-time embedding.
557
+ * Subject to request_timeout_ms [GAP-4].
558
+ */
559
+ async embedOne(text: string): Promise<Float32Array>;
560
+
561
+ /**
562
+ * Embed a batch of text strings. Returns Float32Array[] in the same order.
563
+ * Processes batches SEQUENTIALLY with rate-limit delay [TRAP-2 resolution].
564
+ * Each API call subject to request_timeout_ms [GAP-4].
565
+ * Logs rate-limit delays [S-8].
566
+ */
567
+ async embedBatch(
568
+ texts: string[],
569
+ onBatchComplete?: (batchIndex: number, totalBatches: number) => void,
570
+ ): Promise<Float32Array[]>;
571
+
572
+ /** Return the vector dimensions for the configured model. */
573
+ get dimensions(): number;
574
+ }
575
+ ```
576
+
577
+ **Internal Design:**
578
+
579
+ 1. **Client initialization:** `new GoogleGenerativeAI(apiKey)` where `apiKey = process.env['GEMINI_API_KEY']`. Throw `NomosError('search_api_key_missing', ...)` if not set.
580
+ 2. **Model:** `client.getGenerativeModel({ model: config.embedding_model })` → defaults to `gemini-embedding-001`.
581
+ 3. **Embedding call:** `model.embedContent(text)` for single. `model.batchEmbedContents(requests)` for batch.
582
+
583
+ 4. **Request timeout [GAP-4]:**
584
+ ```typescript
585
+ private async withTimeout<T>(promise: Promise<T>, label: string): Promise<T> {
586
+ const controller = new AbortController();
587
+ const timer = setTimeout(() => controller.abort(), this.config.request_timeout_ms);
588
+ try {
589
+ // Pass abort signal to the request if the SDK supports it.
590
+ // If not, use Promise.race as a fallback:
591
+ return await Promise.race([
592
+ promise,
593
+ new Promise<never>((_, reject) => {
594
+ controller.signal.addEventListener('abort', () => {
595
+ reject(new NomosError(
596
+ 'search_embedding_failed',
597
+ `Embedding request timed out after ${this.config.request_timeout_ms}ms (${label})`
598
+ ));
599
+ });
600
+ }),
601
+ ]);
602
+ } finally {
603
+ clearTimeout(timer);
604
+ }
605
+ }
606
+ ```
607
+
608
+ 5. **Sequential batching with rate-limit logging [TRAP-2, S-8]:**
609
+ ```typescript
610
+ async embedBatch(texts: string[], onBatchComplete?): Promise<Float32Array[]> {
611
+ const results: Float32Array[] = [];
612
+ const batches = chunk(texts, this.config.batch_size);
613
+ const delayMs = Math.ceil(60_000 / this.config.embedding_requests_per_minute);
614
+
615
+ for (let i = 0; i < batches.length; i++) {
616
+ if (i > 0) {
617
+ this.logger.warn(
618
+ `[nomos:search:warn] Rate limiting. Waiting ${delayMs}ms before batch ${i + 1}/${batches.length}...`
619
+ );
620
+ await sleep(delayMs);
621
+ }
622
+ const batch = batches[i];
623
+ const vectors = await this.withTimeout(
624
+ this.embedBatchRaw(batch),
625
+ `batch ${i + 1}/${batches.length}`
626
+ );
627
+ results.push(...vectors);
628
+ onBatchComplete?.(i, batches.length);
629
+ }
630
+ return results;
631
+ }
632
+ ```
633
+ No concurrency. No `max_concurrent_requests`. One batch at a time. One config controls timing.
634
+
635
+ 6. **Retry:** Exponential backoff (2s, 4s, 8s) with jitter on 429/5xx errors. Max 3 retries per batch. On permanent failure, throw `NomosError('search_embedding_failed', ...)`.
636
+ 7. **Output normalization:** Gemini returns `{ embedding: { values: number[] } }`. Convert `values` to `new Float32Array(values)`.
637
+ 8. **Vector dimensions:** Configurable via `config.embedding_dimensions` (default: 768). `gemini-embedding-001` supports 768, 1536, and 3072. Pass `outputDimensionality` to the API call: `model.embedContent({ content, outputDimensionality: config.embedding_dimensions })`. Exposed via `get dimensions()` which returns `config.embedding_dimensions`.
638
+
639
+ **Validation:**
640
+ ```bash
641
+ npx tsc --noEmit # → exit code 0
642
+ ```
643
+
644
+ **Rollback:** `rm src/search/embedder.ts`
645
+
646
+ ---
647
+
648
+ #### Step 7.2.2: Unit test for `Embedder`
649
+
650
+ **Pre-Condition:** Step 7.2.1 complete.
651
+
652
+ **Action:** Create `src/search/__tests__/embedder.test.ts`.
653
+
654
+ **Test cases (mocked — no real API calls):**
655
+ 1. `embedOne()` calls Gemini API with correct model and returns Float32Array.
656
+ 2. `embedBatch()` splits input into chunks of `batch_size` and processes sequentially.
657
+ 3. `embedBatch()` respects rate limit delay between batches (verify `setTimeout` calls).
658
+ 4. `embedBatch()` logs rate-limit wait message [S-8].
659
+ 5. `embedOne()` retries on 429 error with exponential backoff.
660
+ 6. `embedOne()` throws `NomosError('search_embedding_failed')` after max retries.
661
+ 7. `embedOne()` throws `NomosError('search_api_key_missing')` when GEMINI_API_KEY is unset.
662
+ 8. **Timeout [GAP-4]:** `embedOne()` throws `NomosError('search_embedding_failed', /timed out/)` when API hangs past `request_timeout_ms`.
663
+ 9. Vector dimensions match `config.embedding_dimensions` (default 768). Verify by checking `embedder.dimensions === config.embedding_dimensions` and `vector.length === config.embedding_dimensions`.
664
+ 10. `onBatchComplete` callback fires after each batch.
665
+
666
+ **Mock strategy:** Mock `@google/generative-ai` module. Return predictable vectors (e.g., all-zeros with known dimensions) from mocked `embedContent` / `batchEmbedContents`. For timeout test, return a never-resolving promise.
667
+
668
+ **Validation:**
669
+ ```bash
670
+ npx vitest run src/search/__tests__/embedder.test.ts # → 0 failures
671
+ ```
672
+
673
+ ---
674
+
675
+ ### Task 7.3: Chunk Extractor — Text Preparation
676
+
677
+ ---
678
+
679
+ #### Step 7.3.1: Create `src/search/chunk-extractor.ts`
680
+
681
+ **Pre-Condition:** Types from Step 7.0.5 defined. `ProjectMap` type available.
682
+
683
+ **Action:** Create `src/search/chunk-extractor.ts` implementing the `ChunkExtractor` class.
684
+
685
+ **Class API:**
686
+
687
+ ```typescript
688
+ export class ChunkExtractor {
689
+ constructor(private readonly projectRoot: string, logger: Logger);
690
+
691
+ /**
692
+ * Extract file-level and symbol-level TextChunks from a ProjectMap.
693
+ * Uses inline semantic data from ProjectMap — no .semantic.md file reads [TRAP-5].
694
+ */
695
+ extract(map: ProjectMap): TextChunk[];
696
+ }
697
+ ```
698
+
699
+ **Internal Design:**
700
+
701
+ 1. **File-level chunks:** For each `FileNode` in `map.files`:
702
+ - **If `semantic` is non-null (inline in ProjectMap):** Compose text from `semantic.overview`, `semantic.purpose`, `semantic.key_logic[]`, `semantic.usage_context[]`. Delimit fields with labeled headers:
703
+ ```
704
+ File: {file_path}
705
+ Purpose: {semantic.purpose}
706
+ Overview: {semantic.overview}
707
+ Key Logic: {key_logic.join('; ')}
708
+ Usage Context: {usage_context.join('; ')}
709
+ Exports: {symbols.filter(s => s.exported).map(s => s.name).join(', ')}
710
+ Dependencies: {dependencies.join(', ')}
711
+ ```
712
+ - **If `semantic` is null (fallback):** Use file path, symbol names, and import sources as the text. Log a warning per file.
713
+ - **Chunk ID:** `file_path` (relative, e.g., `"src/services/payment.ts"`).
714
+
715
+ 2. **Content hash [S-6]:**
716
+ ```typescript
717
+ // Hash RAW INPUTS, not composed text.
718
+ // This decouples hashing from text composition logic.
719
+ // If composition format changes, hashes remain stable.
720
+ const hashInput = JSON.stringify({
721
+ file_path: fileNode.file_path,
722
+ semantic: fileNode.semantic, // null-safe
723
+ symbols: fileNode.symbols.map(s => ({ name: s.name, kind: s.kind, signature: s.signature })),
724
+ dependencies: fileNode.dependencies,
725
+ });
726
+ const content_hash = crypto.createHash('sha256').update(hashInput).digest('hex');
727
+ ```
728
+
729
+ 3. **Symbol-level chunks:** For each `FileNode`, iterate over `symbols[]` where `exported === true` OR `kind === 'class'` OR `kind === 'function'`:
730
+ - Compose text:
731
+ ```
732
+ Symbol: {symbol.name} ({symbol.kind})
733
+ File: {file_path}
734
+ Signature: {symbol.signature ?? 'N/A'}
735
+ Lines: {symbol.line}-{symbol.end_line ?? '?'}
736
+ File Purpose: {semantic?.purpose ?? file_path}
737
+ ```
738
+ - **Chunk ID:** `"{file_path}::{symbol.name}"`.
739
+ - **`parent_file_id`:** `file_path`.
740
+ - **`content_hash`:** Computed from raw symbol data (name, kind, signature, line, end_line) + parent file semantic hash.
741
+
742
+ 4. **No `.semantic.md` file reading [TRAP-5 resolution]:**
743
+ The original plan derived `.semantic.md` paths via fragile regex (`file_path.replace(/\.[^.]+$/, '.semantic.md')`). This breaks for:
744
+ - Multi-extension files (`config.test.ts` → `config.test.semantic.md` — wrong).
745
+ - Extensionless files (`Makefile` → unchanged path — wrong).
746
+
747
+ **Fix:** The `ProjectMap` already contains semantic data inline in `FileNode.semantic`. Use it directly. No filesystem reads for `.semantic.md`. This eliminates the path derivation problem entirely.
748
+
749
+ 5. **Empty text guard:** Skip chunks where composed text is shorter than 20 characters.
750
+
751
+ **Validation:**
752
+ ```bash
753
+ npx tsc --noEmit # → exit code 0
754
+ ```
755
+
756
+ **Rollback:** `rm src/search/chunk-extractor.ts`
757
+
758
+ ---
759
+
760
+ #### Step 7.3.2: Unit test for `ChunkExtractor`
761
+
762
+ **Pre-Condition:** Step 7.3.1 complete.
763
+
764
+ **Action:** Create `src/search/__tests__/chunk-extractor.test.ts`.
765
+
766
+ **Test cases:**
767
+ 1. File with `semantic` data produces a file-level chunk with all fields concatenated.
768
+ 2. File without `semantic` data produces a fallback chunk with file path and symbol names.
769
+ 3. Exported symbols produce symbol-level chunks.
770
+ 4. Non-exported, non-class, non-function symbols are skipped.
771
+ 5. Chunk IDs are correct: file path for file-level, `file::symbol` for symbol-level.
772
+ 6. `parent_file_id` on symbol chunks points to parent file.
773
+ 7. Chunks shorter than 20 chars are skipped.
774
+ 8. `content_hash` is deterministic (same input → same hash) [S-6].
775
+ 9. `content_hash` does NOT change when text composition format changes (only when raw inputs change) [S-6].
776
+ 10. No filesystem reads for `.semantic.md` files [TRAP-5].
777
+
778
+ **Test setup:** Create a minimal `ProjectMap` object in-memory with 2–3 `FileNode` entries (one with semantic, one without, one with symbols).
779
+
780
+ **Validation:**
781
+ ```bash
782
+ npx vitest run src/search/__tests__/chunk-extractor.test.ts # → 0 failures
783
+ ```
784
+
785
+ ---
786
+
787
+ ### Task 7.4: Indexing Pipeline — Extract → Embed → Upsert (Streaming + Table-Swap)
788
+
789
+ ---
790
+
791
+ #### Step 7.4.1: Create `src/search/indexer.ts`
792
+
793
+ **Pre-Condition:** Tasks 7.1–7.3 complete. All sub-modules functional.
794
+
795
+ **Action:** Create `src/search/indexer.ts` implementing the `SearchIndexer` class.
796
+
797
+ **Class API:**
798
+
799
+ ```typescript
800
+ export class SearchIndexer {
801
+ constructor(
802
+ projectRoot: string,
803
+ config: NomosConfig,
804
+ logger: Logger,
805
+ );
806
+
807
+ /**
808
+ * Full index: extract all chunks, embed in streaming batches, upsert to staging,
809
+ * then atomic table-swap to live [BLOCKER-1].
810
+ * Writes metadata with status tracking [BLOCKER-2].
811
+ * Returns IndexMetadata.
812
+ */
813
+ async fullIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata>;
814
+
815
+ /**
816
+ * Incremental index: validates dimensions [BLOCKER-3], re-indexes changed + failed files [GAP-1].
817
+ * Returns updated IndexMetadata.
818
+ */
819
+ async incrementalIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata>;
820
+
821
+ /**
822
+ * Dry-run: extract and count chunks without embedding or writing [S-2].
823
+ * Returns chunk count summary.
824
+ */
825
+ async dryRun(): Promise<{ fileChunks: number; symbolChunks: number; totalChunks: number }>;
826
+ }
827
+ ```
828
+
829
+ **Internal Design — Full Index Flow [BLOCKER-1, BLOCKER-2, BLOCKER-4]:**
830
+
831
+ ```
832
+ 1. Load project_map.json from config.graph.output_dir
833
+ └─ If not found: throw NomosError('search_index_failed', 'project_map.json not found. Run: arc map')
834
+
835
+ 2. Write IndexMetadata with status: "in_progress" [BLOCKER-2]
836
+ └─ This marks the index as incomplete BEFORE any mutation occurs.
837
+ └─ If process crashes after this point, next startup detects "in_progress" → forces full re-index.
838
+
839
+ 3. ChunkExtractor.extract(projectMap) → TextChunk[]
840
+ └─ Log: "[nomos:search:info] Extracted {N} chunks ({F} file-level, {S} symbol-level)"
841
+
842
+ 4. VectorStore.cleanupStaging() → remove orphaned staging table from prior crash [GAP-3]
843
+
844
+ 5. STREAMING BATCH LOOP [BLOCKER-4]:
845
+ └─ Split TextChunk[] into batches of config.batch_size
846
+ └─ For each batch:
847
+ a. Check cancellationFlag — if cancelled, stop the loop and skip to step 7, bypassing the step 6 promotion (partial completion)
848
+ b. Embedder.embedBatch(batch.map(c => c.text)) → Float32Array[]
849
+ └─ On batch failure: log error, record file_paths in failedFiles[], continue [GAP-1]
850
+ c. Compose VectorRecord[] for THIS BATCH ONLY
851
+ └─ graph_depth: from projectMap.files[file_path].depth
852
+ └─ dependents_count: from projectMap.files[file_path].dependents.length
853
+ └─ last_indexed: new Date().toISOString()
854
+ d. VectorStore.upsertToStaging(batchRecords) → immediate write, release references
855
+ e. Log: "[nomos:search:info] Embedded batch {i}/{total} ({N} chunks)"
856
+
857
+ 6. VectorStore.promoteStagingToLive() [BLOCKER-1]
858
+ └─ Atomic swap: staging table becomes live table.
859
+ └─ If this fails before the old live table is dropped: live table still contains OLD data (safe). Throw error.
860
+
861
+ 7. Write IndexMetadata with status: "complete" [BLOCKER-2]
862
+ └─ Includes: total counts, per-file hashes, embedding_model, vector_dimensions, failed_files
863
+ └─ Atomic write: write to .tmp file, then rename.
864
+ └─ If cancelled in step 5: write with status: "in_progress" + partial counts + failed_files.
865
+ (Next run detects "in_progress" → forces full re-index.)
866
+
867
+ 8. Return IndexMetadata
868
+ ```
869
+
870
+ **Internal Design — Incremental Index Flow [BLOCKER-3, GAP-1]:**
871
+
872
+ ```
873
+ 1. Load project_map.json
874
+
875
+ 2. Load existing IndexMetadata from {vector_store_path}/index-meta.json
876
+ └─ If not found: fall back to fullIndex()
877
+ └─ If status === "in_progress": fall back to fullIndex() [BLOCKER-2]
878
+ Log: "[nomos:search:warn] Previous index incomplete. Running full re-index."
879
+
880
+ 3. DIMENSION VALIDATION [BLOCKER-3]:
881
+ └─ Compare IndexMetadata.embedding_model against config.search.embedding_model
882
+ └─ Compare IndexMetadata.vector_dimensions against config.search.embedding_dimensions
883
+ └─ If EITHER mismatches:
884
+ Log: "[nomos:search:warn] Embedding model/dimensions changed ({old} → {new}). Forcing full re-index."
885
+ Fall back to fullIndex()
886
+
887
+ 4. ChunkExtractor.extract(projectMap) → TextChunk[]
888
+
889
+ 5. Compute diff:
890
+ └─ changed = chunks where content_hash differs from IndexMetadata.files[file_path].content_hash
891
+ └─ new_files = chunks whose file_path is not in IndexMetadata.files
892
+ └─ removed = files in IndexMetadata.files but not in projectMap
893
+ └─ failed_retry = chunks whose file_path is in IndexMetadata.failed_files [GAP-1]
894
+ └─ to_reindex = union(changed, new_files, failed_retry)
895
+
896
+ 6. If to_reindex is empty AND removed is empty:
897
+ └─ Log: "[nomos:search:info] Index is up-to-date. No changes detected."
898
+ └─ Return existing IndexMetadata unchanged.
899
+
900
+ 7. Write IndexMetadata with status: "in_progress" [BLOCKER-2]
901
+
902
+ 8. VectorStore.deleteByFilePaths([...removed, ...to_reindex.map(c => c.file_path)])
903
+
904
+ 9. STREAMING BATCH LOOP for to_reindex [BLOCKER-4]:
905
+ └─ (Same pattern as full index step 5 — embed batch, compose records, upsert immediately)
906
+
907
+ 10. Update IndexMetadata:
908
+ └─ status: "complete"
909
+ └─ Update per-file entries for changed/new files
910
+ └─ Remove entries for removed files
911
+ └─ Clear failed_files for successfully re-embedded files; keep any that failed again [GAP-1]
912
+ └─ Update last_incremental_index, total counts
913
+
914
+ 11. Write IndexMetadata (atomic: .tmp then rename) [BLOCKER-2]
915
+
916
+ 12. Return IndexMetadata
917
+ ```
918
+
919
+ **Cancellation safety [GAP-3]:**
920
+ - The cancellation flag is checked at the TOP of each batch iteration (full-index step 5a / incremental-index step 9).
921
+ - On cancellation during full index: staging table is orphaned (harmless). Live table untouched. Metadata written as `"in_progress"`. Next run cleans up staging and forces full re-index.
922
+ - On cancellation during incremental index: partial upserts are durable in the live table (LanceDB transactional writes). Metadata written as `"in_progress"`. Next run forces full re-index.
923
+
924
+ **Validation:**
925
+ ```bash
926
+ npx tsc --noEmit # → exit code 0
927
+ ```
928
+
929
+ **Rollback:** `rm src/search/indexer.ts`
930
+
931
+ ---
932
+
933
+ #### Step 7.4.2: Unit test for `SearchIndexer`
934
+
935
+ **Pre-Condition:** Step 7.4.1 complete.
936
+
937
+ **Action:** Create `src/search/__tests__/indexer.test.ts`.
938
+
939
+ **Test cases (mocked Embedder, real VectorStore with temp dir):**
940
+ 1. `fullIndex()` loads project map, extracts chunks, embeds, upserts via staging, swaps, writes metadata.
941
+ 2. `fullIndex()` uses table-swap (staging → live) NOT `reset()` + `upsert()` [BLOCKER-1].
942
+ 3. `fullIndex()` writes `status: "in_progress"` BEFORE embedding, `status: "complete"` AFTER [BLOCKER-2].
943
+ 4. `fullIndex()` upserts per-batch, not all-at-once [BLOCKER-4].
944
+ 5. `incrementalIndex()` only re-embeds changed files (verify by checking `embedBatch` call count).
945
+ 6. `incrementalIndex()` re-embeds files in `failed_files` even if hash unchanged [GAP-1].
946
+ 7. `incrementalIndex()` deletes records for removed files.
947
+ 8. `incrementalIndex()` falls back to `fullIndex()` when no metadata exists.
948
+ 9. `incrementalIndex()` falls back to `fullIndex()` when `status === "in_progress"` [BLOCKER-2].
949
+ 10. `incrementalIndex()` falls back to `fullIndex()` on embedding model mismatch [BLOCKER-3].
950
+ 11. `incrementalIndex()` falls back to `fullIndex()` on vector dimension mismatch [BLOCKER-3].
951
+ 12. Index metadata file is written with correct totals, per-file hashes, and `failed_files`.
952
+ 13. Cancellation flag stops processing between batches; metadata written as `"in_progress"`.
953
+ 14. Missing `project_map.json` throws `NomosError('search_index_failed')`.
954
+ 15. Partial embedding failure: failed files recorded in `IndexMetadata.failed_files` [GAP-1], remaining files indexed successfully.
955
+ 16. `dryRun()` returns chunk counts without calling Embedder or VectorStore [S-2].
956
+
957
+ **Validation:**
958
+ ```bash
959
+ npx vitest run src/search/__tests__/indexer.test.ts # → 0 failures
960
+ ```
961
+
962
+ ---
963
+
964
+ ### Task 7.5: Query Engine — Search Flow
965
+
966
+ ---
967
+
968
+ #### Step 7.5.1: Create `src/search/graph-enricher.ts`
969
+
970
+ **Pre-Condition:** `ProjectMap` type available.
971
+
972
+ **Action:** Create `src/search/graph-enricher.ts` implementing the `GraphEnricher` class.
973
+
974
+ **Class API:**
975
+
976
+ ```typescript
977
+ export class GraphEnricher {
978
+ private projectMap: ProjectMap | null = null;
979
+
980
+ constructor(
981
+ private readonly projectMapPath: string,
982
+ private readonly logger: Logger,
983
+ );
984
+
985
+ /**
986
+ * Lazy-load and cache project_map.json [GAP-5].
987
+ * Parsed once per instance lifetime. Subsequent calls return cached data.
988
+ */
989
+ private async loadMap(): Promise<ProjectMap>;
990
+
991
+ /**
992
+ * Enrich raw search results with dependency graph metadata.
993
+ * Stale results (file deleted since index) get is_stale = true [TRAP-4].
994
+ */
995
+ async enrich(results: RawSearchResult[]): Promise<SearchResult[]>;
996
+ }
997
+
998
+ interface RawSearchResult {
999
+ id: string;
1000
+ type: ChunkType;
1001
+ file_path: string;
1002
+ symbol_name: string | null;
1003
+ symbol_type: string | null;
1004
+ line_start: number | null;
1005
+ line_end: number | null;
1006
+ purpose: string;
1007
+ similarity_score: number;
1008
+ }
1009
+ ```
1010
+
1011
+ **Internal Design:**
1012
+
1013
+ 1. **Lazy loading [GAP-5]:**
1014
+ ```typescript
1015
+ private async loadMap(): Promise<ProjectMap> {
1016
+ if (this.projectMap) return this.projectMap;
1017
+ const raw = await fs.readFile(this.projectMapPath, 'utf-8');
1018
+ this.projectMap = JSON.parse(raw);
1019
+ return this.projectMap;
1020
+ }
1021
+ ```
1022
+ For CLI usage (one search per process), the map is loaded once. No repeated 20MB parses.
1023
+
1024
+ 2. For each result, look up `projectMap.files[result.file_path]`.
1025
+ 3. If found: set `graph_depth`, `dependents_count`, `is_core_module`, `is_stale = false`.
1026
+ 4. **If NOT found [TRAP-4]:** set `graph_depth = -1`, `dependents_count = 0`, `is_core_module = false`, `is_stale = true`. The `-1` sentinel is an internal value — it NEVER reaches CLI output directly. The `is_stale` boolean is the public signal.
1027
+ 5. Sort results by `similarity_score` descending (preserve embedding rank).
1028
+
1029
+ **Validation:**
1030
+ ```bash
1031
+ npx tsc --noEmit # → exit code 0
1032
+ ```
1033
+
1034
+ **Rollback:** `rm src/search/graph-enricher.ts`
1035
+
1036
+ ---
1037
+
1038
+ #### Step 7.5.2: Create `src/search/query-engine.ts`
1039
+
1040
+ **Pre-Condition:** Steps 7.1.1, 7.2.1, 7.5.1 complete.
1041
+
1042
+ **Action:** Create `src/search/query-engine.ts` implementing the `QueryEngine` class.
1043
+
1044
+ **Class API:**
1045
+
1046
+ ```typescript
1047
+ export class QueryEngine {
1048
+ constructor(
1049
+ projectRoot: string,
1050
+ config: NomosConfig,
1051
+ logger: Logger,
1052
+ );
1053
+
1054
+ /**
1055
+ * Execute a semantic search query.
1056
+ * Pipeline: embed query → vector search → graph enrich → deduplicate → rank → return.
1057
+ */
1058
+ async search(query: string, options?: {
1059
+ topK?: number;
1060
+ threshold?: number;
1061
+ }): Promise<SearchResult[]>;
1062
+ }
1063
+ ```
1064
+
1065
+ **Internal Design:**
1066
+
1067
+ ```
1068
+ 1. Validate: query string must be non-empty after trim.
1069
+ └─ Throw NomosError('search_query_failed', 'Query must be a non-empty string.') otherwise.
1070
+
1071
+ 2. Check vector index exists:
1072
+ └─ Read {vector_store_path}/index-meta.json
1073
+ └─ If not found: throw NomosError('search_index_not_found', 'No index found. Run: arc index')
1074
+ └─ If status === "in_progress": log warning "Index is incomplete. Results may be partial."
1075
+
1076
+ 3. Initialize VectorStore [GAP-2]:
1077
+ └─ VectorStore.init() — wrapped in try-catch that produces actionable error.
1078
+
1079
+ 4. Embed query:
1080
+ └─ Embedder.embedOne(query.trim()) → Float32Array
1081
+ └─ Subject to request_timeout_ms [GAP-4]
1082
+
1083
+ 5. Vector search:
1084
+ └─ VectorStore.query(queryVector, topK, threshold) → results with similarity_score
1085
+ └─ similarity_score already ∈ [0, 1] — conversion done in VectorStore [S-3]
1086
+
1087
+ 6. Graph enrich:
1088
+ └─ GraphEnricher.enrich(rawResults) → SearchResult[]
1089
+ └─ project_map.json loaded lazily and cached [GAP-5]
1090
+
1091
+ 7. De-duplicate [TRAP-3 — DETERMINISTIC RULE]:
1092
+ └─ Group results by file_path.
1093
+ └─ For each file_path that has BOTH a 'file' type result AND one or more 'symbol' type results:
1094
+ a. Compute score gap = abs(file_result.similarity_score - max(symbol_results.similarity_score))
1095
+ b. If score gap <= 0.05: REMOVE the file-level result. Keep only symbol results.
1096
+ c. If score gap > 0.05: keep both (they are sufficiently distinct in relevance).
1097
+ └─ This is a hard rule. "Remove" means filter out of the result array entirely.
1098
+
1099
+ 8. Sort by similarity_score descending.
1100
+
1101
+ 9. Return top-K SearchResult[]
1102
+ ```
1103
+
1104
+ **Stale index warning:** If the `last_full_index` timestamp in `index-meta.json` is older than the `generated_at` timestamp in `project_map.json`, log: `"[nomos:search:warn] Index is older than project map. Consider running: arc index --incremental"`.
1105
+
1106
+ **Validation:**
1107
+ ```bash
1108
+ npx tsc --noEmit # → exit code 0
1109
+ ```
1110
+
1111
+ **Rollback:** `rm src/search/query-engine.ts`
1112
+
1113
+ ---
1114
+
1115
+ #### Step 7.5.3: Unit tests for `GraphEnricher` and `QueryEngine`
1116
+
1117
+ **Pre-Condition:** Steps 7.5.1–7.5.2 complete.
1118
+
1119
+ **Action:** Create:
1120
+ - `src/search/__tests__/graph-enricher.test.ts`
1121
+ - `src/search/__tests__/query-engine.test.ts`
1122
+
1123
+ **GraphEnricher test cases:**
1124
+ 1. Results are enriched with correct `graph_depth` and `dependents_count` from project map.
1125
+ 2. Core modules are correctly flagged (`is_core_module = true`).
1126
+ 3. Missing file in project map sets `is_stale = true` and `graph_depth = -1` [TRAP-4].
1127
+ 4. Results maintain similarity_score ranking after enrichment.
1128
+ 5. `loadMap()` is called only once across multiple `enrich()` calls (caching) [GAP-5].
1129
+
1130
+ **QueryEngine test cases (mocked Embedder + VectorStore):**
1131
+ 1. `search()` embeds query, queries store, enriches, and returns ranked results.
1132
+ 2. `search()` throws `NomosError('search_index_not_found')` when no index exists.
1133
+ 3. `search()` throws `NomosError('search_query_failed')` on empty query string.
1134
+ 4. `search()` respects `topK` and `threshold` overrides.
1135
+ 5. Stale index warning is logged when index is older than project map.
1136
+ 6. **De-duplication [TRAP-3]:** symbol-level result within 0.05 of parent file-level result → file-level result removed.
1137
+ 7. **De-duplication [TRAP-3]:** symbol-level result MORE than 0.05 from parent file-level result → both kept.
1138
+ 8. `search()` wraps VectorStore.init() failure with actionable error message [GAP-2].
1139
+
1140
+ **Validation:**
1141
+ ```bash
1142
+ npx vitest run src/search/__tests__/graph-enricher.test.ts # → 0 failures
1143
+ npx vitest run src/search/__tests__/query-engine.test.ts # → 0 failures
1144
+ ```
1145
+
1146
+ ---
1147
+
1148
+ ### Task 7.6: CLI Commands — `arc index` and `arc search`
1149
+
1150
+ ---
1151
+
1152
+ #### Step 7.6.1: Create `src/commands/index.ts`
1153
+
1154
+ **Pre-Condition:** Task 7.4 complete. `SearchIndexer` functional.
1155
+
1156
+ **Action:** Create `src/commands/index.ts` following the existing command registration pattern.
1157
+
1158
+ **Command signature:**
1159
+ ```
1160
+ arc index [--incremental] [--force] [--dry-run]
1161
+ ```
1162
+
1163
+ | Flag | Behavior |
1164
+ |---|---|
1165
+ | (no flags) | Full re-index. Table-swap strategy. |
1166
+ | `--incremental` | Only re-index changed + failed files (content hash + failed_files comparison). |
1167
+ | `--force` | Force full re-index even if incremental metadata exists. Same as no flags. |
1168
+ | `--dry-run` | Extract and count chunks without embedding or writing [S-2]. No API calls. No cost. |
1169
+
1170
+ **Implementation:**
1171
+
1172
+ ```typescript
1173
+ export function registerIndexCommand(program: Command): void {
1174
+ program
1175
+ .command('index')
1176
+ .description('Build or rebuild the vector search index from project map')
1177
+ .option('--incremental', 'Only re-index files changed since last indexing run')
1178
+ .option('--force', 'Force full re-index (ignore incremental metadata)')
1179
+ .option('--dry-run', 'Count chunks without embedding (no API calls, no writes)')
1180
+ .action(async (opts: { incremental?: boolean; force?: boolean; dryRun?: boolean }) => {
1181
+ // 1. loadConfig()
1182
+ // 2. Create logger
1183
+ // 3. Create SearchIndexer
1184
+ // 4. If --dry-run: call dryRun(), print summary, exit 0
1185
+ // 5. Register SIGINT handler:
1186
+ // - Set cancellationFlag.cancelled = true
1187
+ // - Log: "[nomos:search:warn] SIGINT received. Finishing current batch..."
1188
+ // - The indexer will write partial metadata and exit cleanly.
1189
+ // 6. Call fullIndex() or incrementalIndex() based on flags
1190
+ // 7. Print summary:
1191
+ // "Indexed {total_chunks} chunks ({files} files, {symbols} symbols) in {duration}s"
1192
+ // "Vector index stored at: {vector_store_path}"
1193
+ // If failed_files.length > 0:
1194
+ // "⚠ {N} files failed embedding. They will be retried on next incremental index."
1195
+ // 8. Exit 0 on success, 1 on fatal error
1196
+ });
1197
+ }
1198
+ ```
1199
+
1200
+ **Progress output:** Print to stderr (not stdout — preserves machine-parseability):
1201
+ ```
1202
+ [nomos:search:info] Extracting chunks from project map...
1203
+ [nomos:search:info] Extracted 342 chunks (142 file-level, 200 symbol-level)
1204
+ [nomos:search:info] Embedding batch 1/7 (50 chunks)...
1205
+ [nomos:search:warn] Rate limiting. Waiting 200ms before batch 2/7...
1206
+ [nomos:search:info] Embedding batch 2/7 (50 chunks)...
1207
+ ...
1208
+ [nomos:search:info] Writing to staging table...
1209
+ [nomos:search:info] Promoting staging to live (atomic swap)...
1210
+ [nomos:search:info] Writing index metadata...
1211
+ ```
1212
+
1213
+ **Dry-run output [S-2]:**
1214
+ ```
1215
+ [nomos:search:info] DRY RUN — no API calls, no writes.
1216
+ [nomos:search:info] Would index: 342 chunks (142 file-level, 200 symbol-level)
1217
+ [nomos:search:info] Estimated API calls: 7 batches × 50 chunks
1218
+ ```
1219
+
1220
+ **Validation:**
1221
+ ```bash
1222
+ npx tsc --noEmit # → exit code 0
1223
+ npm run build && node dist/cli.js index --help # → shows --incremental, --force, --dry-run
1224
+ ```
1225
+
1226
+ **Rollback:** `rm src/commands/index.ts`
1227
+
1228
+ ---
1229
+
1230
+ #### Step 7.6.2: Create `src/commands/search.ts`
1231
+
1232
+ **Pre-Condition:** Task 7.5 complete. `QueryEngine` functional.
1233
+
1234
+ **Action:** Create `src/commands/search.ts` following the existing command registration pattern.
1235
+
1236
+ **Command signature:**
1237
+ ```
1238
+ arc search <query> [--top <N>] [--threshold <score>] [--json]
1239
+ ```
1240
+
1241
+ | Flag | Default | Behavior |
1242
+ |---|---|---|
1243
+ | `<query>` | (required) | Natural language search query |
1244
+ | `--top <N>` | 5 | Maximum number of results |
1245
+ | `--threshold <score>` | 0.7 | Minimum similarity score (0.0–1.0) |
1246
+ | `--json` | false | Output raw JSON instead of formatted table |
1247
+
1248
+ **Human-readable output format:**
1249
+
1250
+ ```
1251
+ Results for: "how is refund handled?"
1252
+
1253
+ 1. src/services/payment.ts :: processRefund() [0.96] L45-82
1254
+ "Processes a refund request via Stripe, validates eligibility, updates order state"
1255
+ ⚠ Core Module (depth 5) — modifying this affects 10 dependents
1256
+
1257
+ 2. src/services/payment.ts [0.91]
1258
+ "Handles payment processing and refund logic"
1259
+ ⚠ Core Module (depth 5) — modifying this affects 10 dependents
1260
+
1261
+ 3. src/middleware/billing.ts [0.84]
1262
+ "Validates billing state before checkout"
1263
+ Leaf Module (depth 1) — 2 dependents
1264
+
1265
+ Found 3 results (threshold: 0.70, top: 5)
1266
+ ```
1267
+
1268
+ **Stale result formatting [TRAP-4]:**
1269
+ ```
1270
+ 4. src/legacy/old-handler.ts [0.78]
1271
+ "Legacy request handler for v1 API"
1272
+ ⚠ Stale — file removed since last index. Run: arc index --incremental
1273
+ ```
1274
+ When `is_stale === true`: do NOT print `depth -1` or any numeric depth. Print the stale warning message. No negative numbers in CLI output.
1275
+
1276
+ **JSON output format (`--json`) [S-5]:**
1277
+ ```json
1278
+ {
1279
+ "query": "how is refund handled?",
1280
+ "results": [
1281
+ {
1282
+ "id": "src/services/payment.ts::processRefund",
1283
+ "type": "symbol",
1284
+ "file_path": "src/services/payment.ts",
1285
+ "symbol_name": "processRefund",
1286
+ "symbol_type": "function",
1287
+ "line_start": 45,
1288
+ "line_end": 82,
1289
+ "purpose": "Processes a refund request via Stripe...",
1290
+ "similarity_score": 0.96,
1291
+ "graph_depth": 5,
1292
+ "dependents_count": 10,
1293
+ "is_core_module": true,
1294
+ "is_stale": false
1295
+ }
1296
+ ],
1297
+ "metadata": {
1298
+ "top_k": 5,
1299
+ "threshold": 0.7,
1300
+ "total_results": 3,
1301
+ "index_age": "2026-04-06T12:00:00Z"
1302
+ }
1303
+ }
1304
+ ```
1305
+
1306
+ **Verify:** `JSON.stringify(result)` must NOT contain a `vector` field. The `SearchResult` type excludes it by design [S-5].
1307
+
1308
+ Print JSON to stdout. Print nothing to stderr in `--json` mode (machine-parseable output).
1309
+
1310
+ **Edge cases:**
1311
+ - No results found: Print `"No results found above threshold {threshold}."` Exit 0.
1312
+ - Index not found: Print error message, suggest `arc index`. Exit 1.
1313
+ - Index status `in_progress`: Print warning, proceed with partial search. Exit 0.
1314
+ - Empty query string: Print usage help. Exit 1.
1315
+
1316
+ **Validation:**
1317
+ ```bash
1318
+ npx tsc --noEmit # → exit code 0
1319
+ npm run build && node dist/cli.js search --help # → shows options
1320
+ ```
1321
+
1322
+ **Rollback:** `rm src/commands/search.ts`
1323
+
1324
+ ---
1325
+
1326
+ #### Step 7.6.3: Register commands in `src/cli.ts`
1327
+
1328
+ **Pre-Condition:** Steps 7.6.1–7.6.2 complete.
1329
+
1330
+ **Action:**
1331
+
1332
+ 1. Add imports at the top of `src/cli.ts`:
1333
+ ```typescript
1334
+ import { registerIndexCommand } from './commands/index.js';
1335
+ import { registerSearchCommand } from './commands/search.js';
1336
+ ```
1337
+
1338
+ 2. Add to the registration array (in the same position pattern as existing commands):
1339
+ ```typescript
1340
+ registerIndexCommand,
1341
+ registerSearchCommand,
1342
+ ```
1343
+
1344
+ **Validation:**
1345
+ ```bash
1346
+ npx tsc --noEmit # → exit code 0
1347
+ npm run build # → exit code 0
1348
+ node dist/cli.js --help # → shows 'index' and 'search' commands
1349
+ node dist/cli.js index --help # → shows --incremental, --force, --dry-run
1350
+ node dist/cli.js search --help # → shows <query>, --top, --threshold, --json
1351
+ ```
1352
+
1353
+ **Rollback:** `git checkout src/cli.ts`
1354
+
1355
+ ---
1356
+
1357
+ ### Task 7.7: Integration Testing & End-to-End Verification
1358
+
1359
+ ---
1360
+
1361
+ #### Step 7.7.1: Integration tests — Full pipeline + CI-compatible mock
1362
+
1363
+ **Pre-Condition:** All previous tasks complete. `project_map.json` exists with enriched data.
1364
+
1365
+ **Action:** Create `src/search/__tests__/integration.test.ts`.
1366
+
1367
+ **Test A — Live API (requires GEMINI_API_KEY — skip in CI if unset):**
1368
+
1369
+ 1. **Setup:** Copy a minimal `project_map.json` fixture (5 files, ~15 symbols) to a temp directory.
1370
+ 2. **Full index:**
1371
+ - Create `SearchIndexer` with temp directory config.
1372
+ - Call `fullIndex()`.
1373
+ - Assert: `IndexMetadata.status === "complete"` [BLOCKER-2].
1374
+ - Assert: `IndexMetadata.total_files_indexed === 5`.
1375
+ - Assert: `IndexMetadata.total_chunks > 5` (file + symbol chunks).
1376
+ - Assert: `IndexMetadata.embedding_model === 'gemini-embedding-001'` [BLOCKER-3].
1377
+ - Assert: `IndexMetadata.vector_dimensions === config.search.embedding_dimensions` [BLOCKER-3].
1378
+ - Assert: `IndexMetadata.failed_files.length === 0` [GAP-1].
1379
+ - Assert: Vector store `count() > 0`.
1380
+ - Assert: `index-meta.json` file exists.
1381
+ 3. **Search:**
1382
+ - Create `QueryEngine`.
1383
+ - Call `search("error handling and retry logic")`.
1384
+ - Assert: results.length > 0.
1385
+ - Assert: each result has `similarity_score >= 0.7` and `similarity_score <= 1.0` [S-3].
1386
+ - Assert: no result has a `vector` field [S-5].
1387
+ - Assert: each result has valid `file_path`, `graph_depth`, `dependents_count`.
1388
+ - Assert: stale results (if any) have `is_stale === true` [TRAP-4].
1389
+ 4. **Incremental re-index:**
1390
+ - Modify one file's semantic data in the fixture map.
1391
+ - Call `incrementalIndex()`.
1392
+ - Assert: only 1 file re-embedded (check embed call count via spy).
1393
+ - Assert: metadata `status === "complete"`.
1394
+ 5. **Cleanup:** Remove temp directory.
1395
+
1396
+ **Guard:** `describe.skipIf(!process.env['GEMINI_API_KEY'])` — skipped if API key absent.
1397
+
1398
+ **Test B — Mock-embedder integration (runs in CI always) [S-4]:**
1399
+
1400
+ 1. **Setup:** Same fixture. Create a `MockEmbedder` that returns deterministic vectors matching `config.search.embedding_dimensions` (e.g., hash-based: SHA-256 of text → repeated to fill `config.search.embedding_dimensions` floats as Float32Array).
1401
+ 2. **Full index with mock embedder.**
1402
+ 3. **Search with mock embedder.**
1403
+ 4. Assert: pipeline completes end-to-end. Results are ranked. Metadata is valid.
1404
+ 5. Assert: table-swap occurred (staging table does not exist after completion) [BLOCKER-1].
1405
+ 6. Assert: de-duplication rule applied correctly [TRAP-3].
1406
+
1407
+ This test runs without `GEMINI_API_KEY` — it exercises the full pipeline minus the actual API call.
1408
+
1409
+ **Validation:**
1410
+ ```bash
1411
+ # CI-compatible (always runs):
1412
+ npx vitest run src/search/__tests__/integration.test.ts -t "mock-embedder"
1413
+
1414
+ # Full integration (requires API key):
1415
+ GEMINI_API_KEY=$GEMINI_API_KEY npx vitest run src/search/__tests__/integration.test.ts
1416
+ ```
1417
+
1418
+ ---
1419
+
1420
+ #### Step 7.7.2: End-to-end CLI verification
1421
+
1422
+ **Pre-Condition:** Step 7.7.1 passes. Project is built.
1423
+
1424
+ **Action:** Manual CLI verification sequence:
1425
+
1426
+ ```bash
1427
+ # 1. Build
1428
+ npm run build
1429
+
1430
+ # 2. Ensure project map exists
1431
+ node dist/cli.js map --no-ai # Quick structural map (no API calls)
1432
+
1433
+ # 3. Dry run (no API calls, no cost) [S-2]
1434
+ node dist/cli.js index --dry-run
1435
+ # Expected: "Would index: {N} chunks ({F} file-level, {S} symbol-level)"
1436
+ # Expected: exit code 0
1437
+
1438
+ # 4. Full index
1439
+ node dist/cli.js index
1440
+ # Expected: "Indexed {N} chunks ({F} files, {S} symbols) in {T}s"
1441
+ # Expected: exit code 0
1442
+
1443
+ # 5. Verify metadata
1444
+ cat tasks-management/graph/vector_index/index-meta.json | node -e "
1445
+ let d=''; process.stdin.on('data',c=>d+=c);
1446
+ process.stdin.on('end',()=>{
1447
+ const m=JSON.parse(d);
1448
+ console.log('status:', m.status); // → 'complete'
1449
+ console.log('model:', m.embedding_model); // → 'gemini-embedding-001'
1450
+ console.log('dims:', m.vector_dimensions); // → matches config.search.embedding_dimensions
1451
+ console.log('failed:', m.failed_files.length); // → 0
1452
+ });
1453
+ "
1454
+
1455
+ # 6. Search — natural language
1456
+ node dist/cli.js search "error handling"
1457
+ # Expected: ranked results with file paths, scores, and dependency info
1458
+ # Expected: no results with "depth -1" in output [TRAP-4]
1459
+
1460
+ # 7. Search — JSON output [S-5]
1461
+ node dist/cli.js search "configuration loading" --json | node -e "
1462
+ let d=''; process.stdin.on('data',c=>d+=c);
1463
+ process.stdin.on('end',()=>{
1464
+ const j=JSON.parse(d);
1465
+ console.log('results:', j.results.length);
1466
+ console.log('has vector field:', j.results.some(r => 'vector' in r)); // → false
1467
+ console.log('valid JSON: true');
1468
+ });
1469
+ "
1470
+ # Expected: "has vector field: false"
1471
+
1472
+ # 8. Search — threshold and top-K
1473
+ node dist/cli.js search "state management" --top 3 --threshold 0.8
1474
+ # Expected: at most 3 results, all with score >= 0.80
1475
+
1476
+ # 9. Incremental index
1477
+ node dist/cli.js index --incremental
1478
+ # Expected: "Index is up-to-date. No changes detected." (if nothing changed)
1479
+
1480
+ # 10. Stale index detection
1481
+ # Modify a file, re-run arc map, then arc search
1482
+ # Expected: "[nomos:search:warn] Index is older than project map..."
1483
+
1484
+ # 11. Missing index error
1485
+ rm -rf tasks-management/graph/vector_index
1486
+ node dist/cli.js search "anything"
1487
+ # Expected: error message suggesting "arc index"
1488
+ # Expected: exit code 1
1489
+
1490
+ # 12. Type check and full test suite
1491
+ npx tsc --noEmit # → exit code 0
1492
+ npx vitest run # → 0 failures (all existing + new tests)
1493
+ ```
1494
+
1495
+ ---
1496
+
1497
+ #### Step 7.7.3: Verify cosine similarity correctness
1498
+
1499
+ **Pre-Condition:** Vector store operational.
1500
+
1501
+ **Action:** Create `src/search/__tests__/similarity.test.ts`:
1502
+
1503
+ ```typescript
1504
+ test('similar texts produce similarity > 0.8', async () => {
1505
+ const embedder = new Embedder(config, logger);
1506
+ const v1 = await embedder.embedOne('process payment refund');
1507
+ const v2 = await embedder.embedOne('handle refund for customer payment');
1508
+ const similarity = cosineSimilarity(v1, v2);
1509
+ expect(similarity).toBeGreaterThan(0.8);
1510
+ });
1511
+
1512
+ test('unrelated texts produce similarity < 0.5', async () => {
1513
+ const embedder = new Embedder(config, logger);
1514
+ const v1 = await embedder.embedOne('process payment refund');
1515
+ const v2 = await embedder.embedOne('configure webpack build optimization');
1516
+ const similarity = cosineSimilarity(v1, v2);
1517
+ expect(similarity).toBeLessThan(0.5);
1518
+ });
1519
+
1520
+ test('similarity is always in [0, 1]', async () => {
1521
+ const embedder = new Embedder(config, logger);
1522
+ const v1 = await embedder.embedOne('any arbitrary text');
1523
+ const v2 = await embedder.embedOne('completely different content');
1524
+ const similarity = cosineSimilarity(v1, v2);
1525
+ expect(similarity).toBeGreaterThanOrEqual(0);
1526
+ expect(similarity).toBeLessThanOrEqual(1);
1527
+ });
1528
+
1529
+ function cosineSimilarity(a: Float32Array, b: Float32Array): number {
1530
+ let dot = 0, normA = 0, normB = 0;
1531
+ for (let i = 0; i < a.length; i++) {
1532
+ dot += a[i] * b[i];
1533
+ normA += a[i] * a[i];
1534
+ normB += b[i] * b[i];
1535
+ }
1536
+ return dot / (Math.sqrt(normA) * Math.sqrt(normB));
1537
+ }
1538
+ ```
1539
+
1540
+ **Guard:** `describe.skipIf(!process.env['GEMINI_API_KEY'])`.
1541
+
1542
+ **Validation:**
1543
+ ```bash
1544
+ GEMINI_API_KEY=$GEMINI_API_KEY npx vitest run src/search/__tests__/similarity.test.ts
1545
+ ```
1546
+
1547
+ ---
1548
+
1549
+ ## 3. Final Acceptance Checklist
1550
+
1551
+ Every check must pass. If any check fails, investigation is required.
1552
+
1553
+ | # | Criterion | Verification | Addresses |
1554
+ |---|-----------|-------------|-----------|
1555
+ | AC-1 | `arc index` builds vector index from project map | `node dist/cli.js index` → exit 0, index files created | Core |
1556
+ | AC-2 | File-level AND symbol-level embeddings generated | `index-meta.json` shows `total_symbols_indexed > 0` | Core |
1557
+ | AC-3 | `arc index --incremental` only re-indexes changed files | Modify 1 file, re-map, re-index: only 1 file re-embedded | Core |
1558
+ | AC-4 | `arc search` returns semantically relevant results | Search for concept not in any symbol name → gets relevant results | Core |
1559
+ | AC-5 | Symbol-level results include line range | Result includes `L{start}-{end}` for symbol chunks | Core |
1560
+ | AC-6 | Dependency-aware enrichment present | Results show `graph_depth`, `dependents_count`, core/leaf label | Core |
1561
+ | AC-7 | JSON output valid and vector-free | `--json` output parses; no `vector` field in results | S-5 |
1562
+ | AC-8 | Threshold filtering works | `--threshold 0.99` returns fewer results than `--threshold 0.5` | Core |
1563
+ | AC-9 | Top-K limiting works | `--top 1` returns exactly 1 result (if any match) | Core |
1564
+ | AC-10 | Missing index → clear error | `arc search` without index → error, suggests `arc index` | Core |
1565
+ | AC-11 | Missing API key → clear error | `arc index` without GEMINI_API_KEY → clear error message | Core |
1566
+ | AC-12 | No regressions | `npx vitest run` → 0 failures across entire test suite | Core |
1567
+ | AC-13 | Type safety | `npx tsc --noEmit` → exit 0 | Core |
1568
+ | AC-14 | Build succeeds | `npm run build` → exit 0, `node dist/cli.js --help` shows all commands | Core |
1569
+ | AC-15 | Search < 2s for 500 indexed files | `time arc search "query"` on indexed project | GAP-5 |
1570
+ | AC-16 | Full re-index is zero-downtime | During `arc index`, concurrent `arc search` returns old results (not empty/error) | BLOCKER-1 |
1571
+ | AC-17 | Crash recovery works | Kill indexer mid-run; next `arc index --incremental` detects `"in_progress"` → full re-index | BLOCKER-2 |
1572
+ | AC-18 | Model change triggers full re-index | Change `embedding_model` in config; `--incremental` falls back to full | BLOCKER-3 |
1573
+ | AC-19 | Failed files retried on incremental | Simulate embedding failure; next `--incremental` re-embeds failed files | GAP-1 |
1574
+ | AC-20 | API timeout handled | Mock hanging API; indexer throws after `request_timeout_ms` | GAP-4 |
1575
+ | AC-21 | De-duplication deterministic | Symbol + parent file within 0.05 → file-level result removed | TRAP-3 |
1576
+ | AC-22 | Stale results formatted correctly | Deleted file in results → shows stale warning, no `depth -1` | TRAP-4 |
1577
+ | AC-23 | Dry-run makes no API calls | `arc index --dry-run` → chunk count printed, no embeddings, no writes | S-2 |
1578
+ | AC-24 | CI integration test passes | Mock-embedder integration test runs without API key | S-4 |
1579
+ | AC-25 | LanceDB pinned to exact version | `package.json` shows `"0.14.3"` not `"^0.14.3"` | M-9 |
1580
+
1581
+ ---
1582
+
1583
+ ## 4. Dependency Graph (Execution Order)
1584
+
1585
+ ```
1586
+ 7.0.1 ─── 7.0.2 ─┐
1587
+
1588
+ 7.0.3 ────────────┤
1589
+
1590
+ 7.0.4 ─── 7.0.5 ─┤
1591
+
1592
+ 7.0.6 ────────────┘
1593
+
1594
+ ┌─────────┼──────────┐
1595
+ ▼ ▼ ▼
1596
+ 7.1.1 7.2.1 7.3.1
1597
+ │ │ │
1598
+ 7.1.2 7.2.2 7.3.2
1599
+ │ │ │
1600
+ └─────────┼──────────┘
1601
+
1602
+ 7.4.1
1603
+
1604
+ 7.4.2
1605
+
1606
+ ┌───────┴───────┐
1607
+ ▼ ▼
1608
+ 7.5.1 7.5.2
1609
+ │ │
1610
+ └───────┬───────┘
1611
+
1612
+ 7.5.3
1613
+
1614
+ ┌───────┼───────┐
1615
+ ▼ ▼ ▼
1616
+ 7.6.1 7.6.2 7.6.3
1617
+ │ │ │
1618
+ └───────┼───────┘
1619
+
1620
+ ┌───────┼───────┐
1621
+ ▼ ▼ ▼
1622
+ 7.7.1 7.7.2 7.7.3
1623
+ ```
1624
+
1625
+ **Parallelizable:** Tasks 7.1, 7.2, and 7.3 can be implemented in parallel after Task 7.0 completes. Steps 7.6.1 and 7.6.2 can be implemented in parallel after Task 7.5; Step 7.6.3 must follow both, because its pre-condition requires Steps 7.6.1–7.6.2 to be complete.
1626
+
1627
+ ---
1628
+
1629
+ ## 5. Risk Register (Updated)
1630
+
1631
+ | Risk | Probability | Impact | Mitigation | Status |
1632
+ |---|---|---|---|---|
1633
+ | LanceDB native binding fails in esbuild bundle | Medium | Blocker | `--external` in esbuild. Post-build verification. | Mitigated |
1634
+ | Gemini rate limits during large index | High | Delays | Sequential batching, `embedding_requests_per_minute`, backoff. No dead config. | Mitigated (TRAP-2) |
1635
+ | LanceDB API breaking changes (pre-1.0) | Low | Medium | **Pinned to exact `0.14.3`** (M-9). Test suite catches regressions. | Mitigated |
1636
+ | Vector dimensions mismatch on model change | Low | Data corruption | **Dimension validation in incremental flow** (BLOCKER-3). Auto-triggers full re-index. | Mitigated |
1637
+ | Large projects exceed memory during embedding | Medium | Crash | **Streaming batch upsert** (BLOCKER-4). Records written and released per-batch. | Mitigated |
1638
+ | `.semantic.md` files missing or path derivation fails | High | Degraded quality | **Uses inline `ProjectMap.semantic` data** (TRAP-5). No filesystem reads for .semantic.md. | Mitigated |
1639
+ | Process crash during full re-index | Medium | Data loss | **Table-swap** (BLOCKER-1) + **status field** (BLOCKER-2). Old index preserved during crash. | Mitigated |
1640
+ | Partial embedding failure → permanent blind spots | Medium | Search gaps | **`failed_files[]`** in metadata (GAP-1). Re-indexed on next incremental run. | Mitigated |
1641
+ | API hangs indefinitely | Low | CLI freeze | **30s AbortController timeout** (GAP-4) on every API call. | Mitigated |
1642
+ | SIGINT during indexing | Medium | Corruption | **Table-swap** eliminates danger zone. SIGINT leaves old index intact. | Mitigated (GAP-3) |
1643
+
1644
+ ---
1645
+
1646
+ ## 6. File Inventory
1647
+
1648
+ **New files (16):**
1649
+ ```
1650
+ src/search/vector-store.ts
1651
+ src/search/embedder.ts
1652
+ src/search/chunk-extractor.ts
1653
+ src/search/indexer.ts
1654
+ src/search/query-engine.ts
1655
+ src/search/graph-enricher.ts
1656
+ src/search/__tests__/vector-store.test.ts
1657
+ src/search/__tests__/embedder.test.ts
1658
+ src/search/__tests__/chunk-extractor.test.ts
1659
+ src/search/__tests__/indexer.test.ts
1660
+ src/search/__tests__/graph-enricher.test.ts
1661
+ src/search/__tests__/query-engine.test.ts
1662
+ src/search/__tests__/similarity.test.ts
1663
+ src/search/__tests__/integration.test.ts
1664
+ src/commands/index.ts
1665
+ src/commands/search.ts
1666
+ ```
1667
+
1668
+ **Modified files (6):**
1669
+ ```
1670
+ src/types/index.ts ← new search types (SearchResult, IndexMetadata with status + failed_files)
1671
+ src/core/config.ts ← SearchConfigSchema (no max_concurrent_requests)
1672
+ src/core/errors.ts ← new error codes (including search_index_corrupted)
1673
+ src/cli.ts ← register index + search commands
1674
+ package.json ← lancedb@0.14.3 (pinned) + esbuild externals
1675
+ .gitignore ← exclude vector_index/
1676
+ ```
1677
+
1678
+ **Runtime artifacts (generated, gitignored):**
1679
+ ```
1680
+ tasks-management/graph/vector_index/ ← LanceDB data directory
1681
+ tasks-management/graph/vector_index/index-meta.json ← index metadata
1682
+ ```