@nomos-arc/arc 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +10 -0
- package/.nomos-config.json +5 -0
- package/CLAUDE.md +108 -0
- package/LICENSE +190 -0
- package/README.md +569 -0
- package/dist/cli.js +21120 -0
- package/docs/auth/googel_plan.yaml +1093 -0
- package/docs/auth/google_task.md +235 -0
- package/docs/auth/hardened_blueprint.yaml +1658 -0
- package/docs/auth/red_team_report.yaml +336 -0
- package/docs/auth/session_state.yaml +162 -0
- package/docs/certificate/cer_enhance_plan.md +605 -0
- package/docs/certificate/certificate_report.md +338 -0
- package/docs/dev_overview.md +419 -0
- package/docs/feature_assessment.md +156 -0
- package/docs/how_it_works.md +78 -0
- package/docs/infrastructure/map.md +867 -0
- package/docs/init/master_plan.md +3581 -0
- package/docs/init/red_team_report.md +215 -0
- package/docs/init/report_phase_1a.md +304 -0
- package/docs/integrity-gate/enhance_drift.md +703 -0
- package/docs/integrity-gate/overview.md +108 -0
- package/docs/management/manger-task.md +99 -0
- package/docs/management/scafffold.md +76 -0
- package/docs/map/ATOMIC_BLUEPRINT.md +1349 -0
- package/docs/map/RED_TEAM_REPORT.md +159 -0
- package/docs/map/map_task.md +147 -0
- package/docs/map/semantic_graph_task.md +792 -0
- package/docs/map/semantic_master_plan.md +705 -0
- package/docs/phase7/TEAM_RED.md +249 -0
- package/docs/phase7/plan.md +1682 -0
- package/docs/phase7/task.md +275 -0
- package/docs/prompts/USAGE.md +312 -0
- package/docs/prompts/architect.md +165 -0
- package/docs/prompts/executer.md +190 -0
- package/docs/prompts/hardener.md +190 -0
- package/docs/prompts/red_team.md +146 -0
- package/docs/verification/goveranance-overview.md +396 -0
- package/docs/verification/governance-overview.md +245 -0
- package/docs/verification/verification-arc-ar.md +560 -0
- package/docs/verification/verification-architecture.md +560 -0
- package/docs/very_next.md +52 -0
- package/docs/whitepaper.md +89 -0
- package/overview.md +1469 -0
- package/package.json +63 -0
- package/src/adapters/__tests__/git.test.ts +296 -0
- package/src/adapters/__tests__/stdio.test.ts +70 -0
- package/src/adapters/git.ts +226 -0
- package/src/adapters/pty.ts +159 -0
- package/src/adapters/stdio.ts +113 -0
- package/src/cli.ts +83 -0
- package/src/commands/apply.ts +47 -0
- package/src/commands/auth.ts +301 -0
- package/src/commands/certificate.ts +89 -0
- package/src/commands/discard.ts +24 -0
- package/src/commands/drift.ts +116 -0
- package/src/commands/index.ts +78 -0
- package/src/commands/init.ts +121 -0
- package/src/commands/list.ts +75 -0
- package/src/commands/map.ts +55 -0
- package/src/commands/plan.ts +30 -0
- package/src/commands/review.ts +58 -0
- package/src/commands/run.ts +63 -0
- package/src/commands/search.ts +147 -0
- package/src/commands/show.ts +63 -0
- package/src/commands/status.ts +59 -0
- package/src/core/__tests__/budget.test.ts +213 -0
- package/src/core/__tests__/certificate.test.ts +385 -0
- package/src/core/__tests__/config.test.ts +191 -0
- package/src/core/__tests__/preflight.test.ts +24 -0
- package/src/core/__tests__/prompt.test.ts +358 -0
- package/src/core/__tests__/review.test.ts +161 -0
- package/src/core/__tests__/state.test.ts +362 -0
- package/src/core/auth/__tests__/manager.test.ts +166 -0
- package/src/core/auth/__tests__/server.test.ts +220 -0
- package/src/core/auth/gcp-projects.ts +160 -0
- package/src/core/auth/manager.ts +114 -0
- package/src/core/auth/server.ts +141 -0
- package/src/core/budget.ts +119 -0
- package/src/core/certificate.ts +502 -0
- package/src/core/config.ts +212 -0
- package/src/core/errors.ts +54 -0
- package/src/core/factory.ts +49 -0
- package/src/core/graph/__tests__/builder.test.ts +272 -0
- package/src/core/graph/__tests__/contract-writer.test.ts +175 -0
- package/src/core/graph/__tests__/enricher.test.ts +299 -0
- package/src/core/graph/__tests__/parser.test.ts +200 -0
- package/src/core/graph/__tests__/pipeline.test.ts +202 -0
- package/src/core/graph/__tests__/renderer.test.ts +128 -0
- package/src/core/graph/__tests__/resolver.test.ts +185 -0
- package/src/core/graph/__tests__/scanner.test.ts +231 -0
- package/src/core/graph/__tests__/show.test.ts +134 -0
- package/src/core/graph/builder.ts +303 -0
- package/src/core/graph/constraints.ts +94 -0
- package/src/core/graph/contract-writer.ts +93 -0
- package/src/core/graph/drift/__tests__/classifier.test.ts +215 -0
- package/src/core/graph/drift/__tests__/comparator.test.ts +335 -0
- package/src/core/graph/drift/__tests__/drift.test.ts +453 -0
- package/src/core/graph/drift/__tests__/reporter.test.ts +203 -0
- package/src/core/graph/drift/classifier.ts +165 -0
- package/src/core/graph/drift/comparator.ts +205 -0
- package/src/core/graph/drift/reporter.ts +77 -0
- package/src/core/graph/enricher.ts +251 -0
- package/src/core/graph/grammar-paths.ts +30 -0
- package/src/core/graph/html-template.ts +493 -0
- package/src/core/graph/map-schema.ts +137 -0
- package/src/core/graph/parser.ts +336 -0
- package/src/core/graph/pipeline.ts +209 -0
- package/src/core/graph/renderer.ts +92 -0
- package/src/core/graph/resolver.ts +195 -0
- package/src/core/graph/scanner.ts +145 -0
- package/src/core/logger.ts +46 -0
- package/src/core/orchestrator.ts +792 -0
- package/src/core/plan-file-manager.ts +66 -0
- package/src/core/preflight.ts +64 -0
- package/src/core/prompt.ts +173 -0
- package/src/core/review.ts +95 -0
- package/src/core/state.ts +294 -0
- package/src/core/worktree-coordinator.ts +77 -0
- package/src/search/__tests__/chunk-extractor.test.ts +339 -0
- package/src/search/__tests__/embedder-auth.test.ts +124 -0
- package/src/search/__tests__/embedder.test.ts +267 -0
- package/src/search/__tests__/graph-enricher.test.ts +178 -0
- package/src/search/__tests__/indexer.test.ts +518 -0
- package/src/search/__tests__/integration.test.ts +649 -0
- package/src/search/__tests__/query-engine.test.ts +334 -0
- package/src/search/__tests__/similarity.test.ts +78 -0
- package/src/search/__tests__/vector-store.test.ts +281 -0
- package/src/search/chunk-extractor.ts +167 -0
- package/src/search/embedder.ts +209 -0
- package/src/search/graph-enricher.ts +95 -0
- package/src/search/indexer.ts +483 -0
- package/src/search/lexical-searcher.ts +190 -0
- package/src/search/query-engine.ts +225 -0
- package/src/search/vector-store.ts +311 -0
- package/src/types/index.ts +572 -0
- package/src/utils/__tests__/ansi.test.ts +54 -0
- package/src/utils/__tests__/frontmatter.test.ts +79 -0
- package/src/utils/__tests__/sanitize.test.ts +229 -0
- package/src/utils/ansi.ts +19 -0
- package/src/utils/context.ts +44 -0
- package/src/utils/frontmatter.ts +27 -0
- package/src/utils/sanitize.ts +78 -0
- package/test/e2e/lifecycle.test.ts +330 -0
- package/test/fixtures/mock-planner-hang.ts +5 -0
- package/test/fixtures/mock-planner.ts +26 -0
- package/test/fixtures/mock-reviewer-bad.ts +8 -0
- package/test/fixtures/mock-reviewer-retry.ts +34 -0
- package/test/fixtures/mock-reviewer.ts +18 -0
- package/test/fixtures/sample-project/src/circular-a.ts +6 -0
- package/test/fixtures/sample-project/src/circular-b.ts +6 -0
- package/test/fixtures/sample-project/src/config.ts +15 -0
- package/test/fixtures/sample-project/src/main.ts +19 -0
- package/test/fixtures/sample-project/src/services/product-service.ts +20 -0
- package/test/fixtures/sample-project/src/services/user-service.ts +18 -0
- package/test/fixtures/sample-project/src/types.ts +14 -0
- package/test/fixtures/sample-project/src/utils/index.ts +14 -0
- package/test/fixtures/sample-project/src/utils/validate.ts +12 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +12 -0
|
@@ -0,0 +1,1682 @@
|
|
|
1
|
+
# Atomic Implementation Blueprint — Phase 7: Global Semantic Search (Remediated)
|
|
2
|
+
|
|
3
|
+
**Source Specification:** `docs/phase7/task.md`
|
|
4
|
+
**Red Team Audit:** `docs/phase7/TEAM_RED.md`
|
|
5
|
+
**Predecessor Phases:** 0–6 (complete). `project_map.json` and `*.semantic.md` files exist.
|
|
6
|
+
**Status:** Ready for execution — all Critical Blockers neutralized.
|
|
7
|
+
|
|
8
|
+
---
|
|
9
|
+
|
|
10
|
+
## 0. Executive Summary of Fixes
|
|
11
|
+
|
|
12
|
+
Every finding from the Red Team Adversarial Audit has been addressed. The following table maps each finding to its resolution in this remediated plan.
|
|
13
|
+
|
|
14
|
+
### Critical Blockers — Neutralized
|
|
15
|
+
|
|
16
|
+
| Finding | Original Flaw | Resolution | Location |
|
|
17
|
+
|---|---|---|---|
|
|
18
|
+
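| **BLOCKER-1** | `reset()` + `upsert()` not atomic — zero-availability window | **Table-swap strategy**: full re-index writes to the staging table `nomos_vectors_staging`, then promotes it to the live `nomos_vectors` table (LanceDB has no native rename, so promotion copies staging into live and drops staging). Search always hits a live table. | Step 7.1.1 |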
|
|
19
|
+
| **BLOCKER-2** | `index-meta.json` written after vector upsert — crash leaves inconsistent state | **Status field**: metadata written at start with `status: "in_progress"`, updated to `"complete"` after upsert. On startup with `in_progress`, force full re-index. | Steps 7.0.5, 7.4.1 |
|
|
20
|
+
| **BLOCKER-3** | No vector dimension validation on incremental index | **Dimension guard**: incremental flow step 2 compares `IndexMetadata.embedding_model` + `vector_dimensions` against current config. Mismatch → forced full re-index with warning. | Step 7.4.1 |
|
|
21
|
+
| **BLOCKER-4** | All vectors accumulated in memory before upsert | **Streaming batch upsert**: after each embedding batch completes, compose records and upsert immediately. References released per-batch. | Step 7.4.1 |
|
|
22
|
+
|
|
23
|
+
### Ambiguity Traps — Eliminated
|
|
24
|
+
|
|
25
|
+
| Finding | Original Flaw | Resolution | Location |
|
|
26
|
+
|---|---|---|---|
|
|
27
|
+
| **TRAP-1** | `mergeInsert` fallback strategy undefined | **Explicit version detection**: check for `mergeInsert` method existence on table prototype at init time. Store capability flag. Fallback uses single-transaction `overwrite` mode, not delete-then-add. | Step 7.1.1 |
|
|
28
|
+
| **TRAP-2** | `max_concurrent_requests` contradicts "process sequentially" | **Removed `max_concurrent_requests`**. Replaced with `embedding_requests_per_minute` (rate limiter). Batches processed sequentially with delay. One config, one behavior. | Step 7.0.4 |
|
|
29
|
+
| **TRAP-3** | De-duplication logic vague ("takes priority") | **Hard rule**: if a symbol result and its parent file result are both present and within 0.05 absolute similarity, **remove the file-level result**. Keep only the more specific symbol result. | Step 7.5.2 |
|
|
30
|
+
| **TRAP-4** | `graph_depth: -1` sentinel leaks to CLI output | **Stale result handling**: results with `graph_depth === -1` display `"⚠ Stale — file removed since last index"` instead of depth. No negative numbers in output. | Steps 7.5.1, 7.6.2 |
|
|
31
|
+
| **TRAP-5** | `.semantic.md` path derivation fragile for multi-extension files | **Lookup via `project_map.json` `semantic` field**: if `FileNode.semantic` is non-null, the data is inline — no file read needed. If null, skip `.semantic.md` read entirely. No regex-based path derivation. | Step 7.3.1 |
|
|
32
|
+
|
|
33
|
+
### Resilience Gaps — Sealed
|
|
34
|
+
|
|
35
|
+
| Finding | Original Flaw | Resolution | Location |
|
|
36
|
+
|---|---|---|---|
|
|
37
|
+
| **GAP-1** | Partial embedding failure → permanent blind spots | **`failed_files: string[]`** in `IndexMetadata`. Incremental index treats failed files as "changed" regardless of hash. Cleared on successful re-embed. | Steps 7.0.5, 7.4.1 |
|
|
38
|
+
| **GAP-2** | LanceDB corruption / version mismatch undetected | **`lancedb.connect()` wrapped in try-catch** with actionable error: `"Vector index corrupted. Run: arc index --force"`. **LanceDB pinned to exact version** (`"0.14.3"`, not `"^0.14.3"`). | Steps 7.0.1, 7.1.1 |
|
|
39
|
+
| **GAP-3** | SIGINT between `reset()` and `upsert()` → unrecoverable empty store | **Eliminated by table-swap** (BLOCKER-1 fix). SIGINT during indexing leaves the old table intact. Temp table is orphaned but harmless — cleaned up on next index run. | Step 7.4.1 |
|
|
40
|
+
| **GAP-4** | No timeout on Gemini API calls | **30-second `AbortController` timeout** on every embedding request. Timeout throws `NomosError('search_embedding_failed', 'Embedding request timed out after 30s')`. | Step 7.2.1 |
|
|
41
|
+
| **GAP-5** | `project_map.json` loaded on every search query | **Lazy-loaded and cached** in `GraphEnricher`. Parsed once per `QueryEngine` instance lifetime. For CLI usage (one search per process), this eliminates redundant parsing. | Step 7.5.1 |
|
|
42
|
+
|
|
43
|
+
### Secondary Concerns — Addressed
|
|
44
|
+
|
|
45
|
+
| Finding | Resolution | Location |
|
|
46
|
+
|---|---|---|
|
|
47
|
+
| **S-1** | PF-5 changed: if `src/search/` exists, verify known files present (idempotent re-entry). | Pre-Flight |
|
|
48
|
+
| **S-2** | `arc index --dry-run` added — extracts and counts chunks without embedding or writing. | Step 7.6.1 |
|
|
49
|
+
| **S-3** | Distance-to-similarity conversion centralized in `VectorStore.query()`. Assertion: returned similarity ∈ [0, 1]. | Step 7.1.1 |
|
|
50
|
+
| **S-4** | Mock-embedder integration test added — runs in CI with deterministic fake vectors. | Step 7.7.1 |
|
|
51
|
+
| **S-5** | `SearchResult` type excludes `vector` field. JSON output verified to not leak raw vectors. | Step 7.0.5 |
|
|
52
|
+
| **S-6** | `content_hash` computed from raw inputs (file path + semantic data + symbol signatures), not composed text. | Step 7.3.1 |
|
|
53
|
+
| **S-7** | Build script modification reads current script, inserts `--external` flags before `--banner`. | Step 7.0.2 |
|
|
54
|
+
| **S-8** | Rate limit delays logged: `"[nomos:search:warn] Rate limited. Waiting {N}ms before next batch..."`. | Step 7.2.1 |
|
|
55
|
+
|
|
56
|
+
---
|
|
57
|
+
|
|
58
|
+
## 1. Pre-Flight Checklist
|
|
59
|
+
|
|
60
|
+
Every check must pass before execution begins. Failure on any check = STOP.
|
|
61
|
+
|
|
62
|
+
| # | Check | Command | Expected |
|
|
63
|
+
|---|-------|---------|----------|
|
|
64
|
+
| PF-1 | Node.js >= 20 | `node --version` | Major version is 20 or higher (e.g. `v20.x.x`, `v22.x.x`) |
|
|
65
|
+
| PF-2 | Project root valid | `ls package.json src/cli.ts` | Both exist, exit code 0 |
|
|
66
|
+
| PF-3 | Clean dependency install | `npm install` | Exit code 0 |
|
|
67
|
+
| PF-4 | Existing tests pass | `npx vitest run` | 0 failures |
|
|
68
|
+
| PF-5 | `src/search/` is either absent OR contains Phase 7 files | `ls src/search/vector-store.ts 2>&1` | Either "No such file" (fresh) OR file exists (re-entry). If `src/search/` exists but `vector-store.ts` does not → STOP: unknown directory, investigate. |
|
|
69
|
+
| PF-6 | `project_map.json` exists | `ls tasks-management/graph/project_map.json` | File exists |
|
|
70
|
+
| PF-7 | Git working tree clean | `git status --porcelain` | Empty output (or acknowledged untracked) |
|
|
71
|
+
| PF-8 | `GEMINI_API_KEY` set | `echo $GEMINI_API_KEY \| head -c 5` | Non-empty (first 5 chars visible) |
|
|
72
|
+
| PF-9 | Existing graph module intact | `ls src/core/graph/pipeline.ts` | File exists, exit code 0 |
|
|
73
|
+
| PF-10 | Build script readable | `node -e "const p=JSON.parse(require('fs').readFileSync('package.json'));console.log(p.scripts.build.includes('--banner'))"` | `true` — confirms `--banner` flag exists as insertion anchor |
|
|
74
|
+
|
|
75
|
+
---
|
|
76
|
+
|
|
77
|
+
## 2. Atomic Execution Sequence
|
|
78
|
+
|
|
79
|
+
---
|
|
80
|
+
|
|
81
|
+
### Task 7.0: Prerequisites — Dependencies & Configuration
|
|
82
|
+
|
|
83
|
+
---
|
|
84
|
+
|
|
85
|
+
#### Step 7.0.1: Install LanceDB (Pinned Version)
|
|
86
|
+
|
|
87
|
+
**Pre-Condition:** PF-1 through PF-10 pass. `package.json` exists at project root.
|
|
88
|
+
|
|
89
|
+
**Action:**
|
|
90
|
+
```bash
|
|
91
|
+
npm install @lancedb/lancedb@0.14.3 apache-arrow@18.1.0
|
|
92
|
+
```
|
|
93
|
+
|
|
94
|
+
**Why pinned versions (M-9):** LanceDB is pre-1.0. No semver stability guarantees. A minor version bump could change the on-disk format, break the Arrow schema, or alter the `mergeInsert` API. Apache Arrow is pinned as its peer dependency for schema compatibility.
|
|
95
|
+
|
|
96
|
+
**Post-install:** Verify that the version specifiers in `package.json` are exact (`"0.14.3"`, not `"^0.14.3"`). `npm install` writes a `^` range by default — this MUST be stripped, either by editing `package.json` manually or by re-running the install with `--save-exact`.
|
|
97
|
+
|
|
98
|
+
**Validation:**
|
|
99
|
+
```bash
|
|
100
|
+
node -e "import('@lancedb/lancedb').then(()=>console.log('ok'))" # → "ok"
|
|
101
|
+
node -e "import('apache-arrow').then(()=>console.log('ok'))" # → "ok"
|
|
102
|
+
grep '"@lancedb/lancedb": "0.14.3"' package.json # → exact match (no ^)
|
|
103
|
+
grep '"apache-arrow": "18.1.0"' package.json # → exact match (no ^)
|
|
104
|
+
```
|
|
105
|
+
|
|
106
|
+
**Rollback:** `npm uninstall @lancedb/lancedb apache-arrow`
|
|
107
|
+
|
|
108
|
+
**Idempotency:** `npm install` with already-installed packages is a no-op. Version pin check is idempotent.
|
|
109
|
+
|
|
110
|
+
---
|
|
111
|
+
|
|
112
|
+
#### Step 7.0.2: Update esbuild externals
|
|
113
|
+
|
|
114
|
+
**Pre-Condition:** Step 7.0.1 complete. `package.json` `build` script exists.
|
|
115
|
+
|
|
116
|
+
**Action:** Read the current `build` script from `package.json`. Locate the `--banner:js=` flag. Insert `--external:@lancedb/lancedb --external:apache-arrow` immediately **before** the `--banner` flag. This preserves argument ordering and avoids breaking existing esbuild flag positions.
|
|
117
|
+
|
|
118
|
+
Do NOT blindly append to the end of the script string.
|
|
119
|
+
|
|
120
|
+
**Validation:**
|
|
121
|
+
```bash
|
|
122
|
+
grep 'external:@lancedb/lancedb' package.json # → match
|
|
123
|
+
grep 'external:apache-arrow' package.json # → match
|
|
124
|
+
npm run build # → exit code 0
|
|
125
|
+
node dist/cli.js --help # → shows help without crash
|
|
126
|
+
```
|
|
127
|
+
|
|
128
|
+
**Rollback:** `git checkout package.json && npm install`
|
|
129
|
+
|
|
130
|
+
**Idempotency:** Check if `--external:@lancedb/lancedb` already exists in the build script before modifying.
|
|
131
|
+
|
|
132
|
+
---
|
|
133
|
+
|
|
134
|
+
#### Step 7.0.3: Add new `NomosErrorCode` values
|
|
135
|
+
|
|
136
|
+
**Pre-Condition:** `src/core/errors.ts` exists. Contains `NomosErrorCode` union type.
|
|
137
|
+
|
|
138
|
+
**Action:** Add six new error code literals to the `NomosErrorCode` union type, immediately after `'graph_write_failed'`:
|
|
139
|
+
```typescript
|
|
140
|
+
| 'search_index_not_found'
|
|
141
|
+
| 'search_index_failed'
|
|
142
|
+
| 'search_index_corrupted'
|
|
143
|
+
| 'search_embedding_failed'
|
|
144
|
+
| 'search_query_failed'
|
|
145
|
+
| 'search_api_key_missing'
|
|
146
|
+
```
|
|
147
|
+
|
|
148
|
+
Note: `search_index_corrupted` is new (addresses GAP-2 — explicit error code for corrupted/unreadable vector store).
|
|
149
|
+
|
|
150
|
+
**Validation:**
|
|
151
|
+
```bash
|
|
152
|
+
grep 'search_index_not_found' src/core/errors.ts # → exactly 1 match
|
|
153
|
+
grep 'search_index_corrupted' src/core/errors.ts # → exactly 1 match
|
|
154
|
+
npx tsc --noEmit # → exit code 0
|
|
155
|
+
```
|
|
156
|
+
|
|
157
|
+
**Rollback:** `git checkout src/core/errors.ts`
|
|
158
|
+
|
|
159
|
+
**Idempotency:** Check if `'search_index_not_found'` already exists before inserting. If present, skip.
|
|
160
|
+
|
|
161
|
+
---
|
|
162
|
+
|
|
163
|
+
#### Step 7.0.4: Extend `NomosConfig` with `search` section
|
|
164
|
+
|
|
165
|
+
**Pre-Condition:** `src/core/config.ts` and `src/types/index.ts` exist.
|
|
166
|
+
|
|
167
|
+
**Action (types/index.ts):** Add `search` section to the `NomosConfig` interface:
|
|
168
|
+
```typescript
|
|
169
|
+
search: {
|
|
170
|
+
embedding_model: string;
|
|
171
|
+
embedding_dimensions: number;
|
|
172
|
+
vector_store_path: string;
|
|
173
|
+
default_top_k: number;
|
|
174
|
+
default_threshold: number;
|
|
175
|
+
batch_size: number;
|
|
176
|
+
embedding_requests_per_minute: number;
|
|
177
|
+
request_timeout_ms: number;
|
|
178
|
+
};
|
|
179
|
+
```
|
|
180
|
+
|
|
181
|
+
**TRAP-2 resolution:** `max_concurrent_requests` is **removed**. It implied concurrent batch processing, but the embedder processes sequentially with rate-limit delays. A dead config that implies a non-existent capability is worse than no config. Replaced by `embedding_requests_per_minute` (already present) and `request_timeout_ms` (new, addresses GAP-4).
|
|
182
|
+
|
|
183
|
+
**Action (config.ts):** Add `SearchConfigSchema` and register it in `NomosConfigSchema`:
|
|
184
|
+
```typescript
|
|
185
|
+
const SearchConfigSchema = z.object({
|
|
186
|
+
embedding_model: z.string().default('gemini-embedding-001'),
|
|
187
|
+
embedding_dimensions: z.number().int().positive().default(768),
|
|
188
|
+
vector_store_path: z.string().default('tasks-management/graph/vector_index'),
|
|
189
|
+
default_top_k: z.number().int().positive().default(5),
|
|
190
|
+
default_threshold: z.number().min(0).max(1).default(0.7),
|
|
191
|
+
batch_size: z.number().int().positive().max(100).default(50),
|
|
192
|
+
embedding_requests_per_minute: z.number().int().positive().default(300),
|
|
193
|
+
request_timeout_ms: z.number().int().positive().default(30_000),
|
|
194
|
+
});
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
Add to `NomosConfigSchema`:
|
|
198
|
+
```typescript
|
|
199
|
+
search: SearchConfigSchema.default(() => SearchConfigSchema.parse({})),
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
**Validation:**
|
|
203
|
+
```bash
|
|
204
|
+
npx tsc --noEmit # → exit code 0
|
|
205
|
+
npx vitest run # → 0 failures (no regressions)
|
|
206
|
+
node -e "
|
|
207
|
+
import { NomosConfigSchema } from './src/core/config.js';
|
|
208
|
+
const c = NomosConfigSchema.parse({});
|
|
209
|
+
console.log(c.search.embedding_model); // → 'gemini-embedding-001'
|
|
210
|
+
console.log(c.search.default_top_k); // → 5
|
|
211
|
+
console.log(c.search.embedding_dimensions); // → 768
|
|
212
|
+
console.log(c.search.request_timeout_ms); // → 30000
|
|
213
|
+
console.log(Object.keys(c.search).includes('max_concurrent_requests')); // → false
|
|
214
|
+
"
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
**Rollback:** `git checkout src/core/config.ts src/types/index.ts`
|
|
218
|
+
|
|
219
|
+
**Idempotency:** Check if `SearchConfigSchema` already exists before adding.
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
#### Step 7.0.5: Add search types to `types/index.ts`
|
|
224
|
+
|
|
225
|
+
**Pre-Condition:** Step 7.0.4 complete.
|
|
226
|
+
|
|
227
|
+
**Action:** Append new type definitions to `src/types/index.ts`:
|
|
228
|
+
|
|
229
|
+
```typescript
|
|
230
|
+
// ─── Semantic Search Types ──────────────────────────────────────────────────
|
|
231
|
+
|
|
232
|
+
export type ChunkType = 'file' | 'symbol';
|
|
233
|
+
|
|
234
|
+
export interface TextChunk {
|
|
235
|
+
id: string; // "src/foo.ts" or "src/foo.ts::MyClass"
|
|
236
|
+
type: ChunkType;
|
|
237
|
+
file_path: string;
|
|
238
|
+
text: string; // concatenated searchable text
|
|
239
|
+
symbol_name: string | null; // non-null for symbol-level chunks
|
|
240
|
+
symbol_type: string | null; // 'function' | 'class' | etc.
|
|
241
|
+
line_start: number | null;
|
|
242
|
+
line_end: number | null;
|
|
243
|
+
parent_file_id: string | null; // for symbol chunks, points to parent file chunk
|
|
244
|
+
content_hash: string; // SHA-256 of raw inputs (NOT composed text) [S-6]
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
export interface VectorRecord {
|
|
248
|
+
id: string;
|
|
249
|
+
type: ChunkType;
|
|
250
|
+
vector: Float32Array;
|
|
251
|
+
file_path: string;
|
|
252
|
+
module: string; // directory name or logical module
|
|
253
|
+
purpose: string;
|
|
254
|
+
symbol_name: string | null;
|
|
255
|
+
symbol_type: string | null;
|
|
256
|
+
line_start: number | null;
|
|
257
|
+
line_end: number | null;
|
|
258
|
+
parent_file_id: string | null;
|
|
259
|
+
graph_depth: number;
|
|
260
|
+
dependents_count: number;
|
|
261
|
+
last_indexed: string; // ISO 8601
|
|
262
|
+
content_hash: string; // SHA-256 of raw inputs
|
|
263
|
+
}
|
|
264
|
+
|
|
265
|
+
/**
|
|
266
|
+
* SearchResult — returned to consumers.
|
|
267
|
+
* CRITICAL [S-5]: Does NOT include `vector` field.
|
|
268
|
+
* JSON.stringify(SearchResult) must never leak raw Float32Array data.
|
|
269
|
+
*/
|
|
270
|
+
export interface SearchResult {
|
|
271
|
+
id: string;
|
|
272
|
+
type: ChunkType;
|
|
273
|
+
file_path: string;
|
|
274
|
+
symbol_name: string | null;
|
|
275
|
+
symbol_type: string | null;
|
|
276
|
+
line_start: number | null;
|
|
277
|
+
line_end: number | null;
|
|
278
|
+
purpose: string;
|
|
279
|
+
similarity_score: number; // always in [0, 1] — enforced by VectorStore.query()
|
|
280
|
+
graph_depth: number; // -1 = stale (file deleted since last index)
|
|
281
|
+
dependents_count: number;
|
|
282
|
+
is_core_module: boolean;
|
|
283
|
+
is_stale: boolean; // true when graph_depth === -1 [TRAP-4]
|
|
284
|
+
}
|
|
285
|
+
|
|
286
|
+
export type IndexStatus = 'in_progress' | 'complete';
|
|
287
|
+
|
|
288
|
+
export interface IndexMetadata {
|
|
289
|
+
status: IndexStatus; // [BLOCKER-2] written at start and end of indexing
|
|
290
|
+
last_full_index: string; // ISO 8601
|
|
291
|
+
last_incremental_index: string | null;
|
|
292
|
+
total_files_indexed: number;
|
|
293
|
+
total_symbols_indexed: number;
|
|
294
|
+
total_chunks: number;
|
|
295
|
+
embedding_model: string; // [BLOCKER-3] compared on incremental index
|
|
296
|
+
vector_dimensions: number; // [BLOCKER-3] compared on incremental index
|
|
297
|
+
failed_files: string[]; // [GAP-1] files that failed embedding — re-indexed next run
|
|
298
|
+
files: Record<string, {
|
|
299
|
+
last_indexed: string;
|
|
300
|
+
content_hash: string;
|
|
301
|
+
chunk_count: number;
|
|
302
|
+
}>;
|
|
303
|
+
}
|
|
304
|
+
```
|
|
305
|
+
|
|
306
|
+
**Validation:**
|
|
307
|
+
```bash
|
|
308
|
+
npx tsc --noEmit # → exit code 0
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
**Rollback:** `git checkout src/types/index.ts`
|
|
312
|
+
|
|
313
|
+
**Idempotency:** Check if `ChunkType` already exists before appending.
|
|
314
|
+
|
|
315
|
+
---
|
|
316
|
+
|
|
317
|
+
#### Step 7.0.6: Update `.gitignore`
|
|
318
|
+
|
|
319
|
+
**Pre-Condition:** `.gitignore` exists at project root.
|
|
320
|
+
|
|
321
|
+
**Action:** Append the following lines to `.gitignore` (if not already present):
|
|
322
|
+
```
|
|
323
|
+
# Phase 7: Vector search index (local, regenerable)
|
|
324
|
+
tasks-management/graph/vector_index/
|
|
325
|
+
```
|
|
326
|
+
|
|
327
|
+
**Validation:**
|
|
328
|
+
```bash
|
|
329
|
+
grep 'vector_index' .gitignore # → match
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
**Rollback:** Remove the added lines from `.gitignore`.
|
|
333
|
+
|
|
334
|
+
**Idempotency:** Grep for `vector_index` before appending. If present, skip.
|
|
335
|
+
|
|
336
|
+
---
|
|
337
|
+
|
|
338
|
+
### Task 7.1: Vector Store — LanceDB Interface (Table-Swap Architecture)
|
|
339
|
+
|
|
340
|
+
---
|
|
341
|
+
|
|
342
|
+
#### Step 7.1.1: Create `src/search/vector-store.ts`
|
|
343
|
+
|
|
344
|
+
**Pre-Condition:** Step 7.0.1–7.0.5 complete. `@lancedb/lancedb` installed. Types defined.
|
|
345
|
+
|
|
346
|
+
**Action:** Create `src/search/vector-store.ts` implementing the `VectorStore` class.
|
|
347
|
+
|
|
348
|
+
**Class API:**
|
|
349
|
+
|
|
350
|
+
```typescript
|
|
351
|
+
export class VectorStore {
|
|
352
|
+
private db: Connection | null = null;
|
|
353
|
+
private hasMergeInsert: boolean = false; // [TRAP-1] detected at init
|
|
354
|
+
|
|
355
|
+
constructor(
|
|
356
|
+
private readonly storePath: string,
|
|
357
|
+
private readonly logger: Logger,
|
|
358
|
+
);
|
|
359
|
+
|
|
360
|
+
/** Initialize the DB connection. Detects mergeInsert capability. [GAP-2] wrapped in try-catch. */
|
|
361
|
+
async init(): Promise<void>;
|
|
362
|
+
|
|
363
|
+
/**
|
|
364
|
+
* Upsert a batch of vector records into the LIVE table.
|
|
365
|
+
* Uses mergeInsert if available; otherwise overwrites by id.
|
|
366
|
+
* Called per-batch during indexing [BLOCKER-4] — NOT after accumulating all records.
|
|
367
|
+
*/
|
|
368
|
+
async upsert(records: VectorRecord[]): Promise<void>;
|
|
369
|
+
|
|
370
|
+
/**
|
|
371
|
+
* Write records to a TEMPORARY table for full re-index [BLOCKER-1].
|
|
372
|
+
* Does NOT touch the live table. Called per-batch.
|
|
373
|
+
*/
|
|
374
|
+
async upsertToStaging(records: VectorRecord[]): Promise<void>;
|
|
375
|
+
|
|
376
|
+
/**
|
|
377
|
+
* Atomic table swap: drop live table, rename staging → live [BLOCKER-1].
|
|
378
|
+
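* If staging is missing, throws before the live table is touched. NOTE: LanceDB has no native rename, so the swap is a copy; a failure after the old live table is dropped leaves no live table until promotion or indexing is re-run.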
|
|
379
|
+
*/
|
|
380
|
+
async promoteStagingToLive(): Promise<void>;
|
|
381
|
+
|
|
382
|
+
/** Drop the staging table if it exists (cleanup after failed index). */
|
|
383
|
+
async cleanupStaging(): Promise<void>;
|
|
384
|
+
|
|
385
|
+
/**
|
|
386
|
+
* Query the LIVE table with a vector. Returns top-K results above threshold.
|
|
387
|
+
* Cosine distance → similarity conversion happens HERE and ONLY here [S-3].
|
|
388
|
+
* Returns similarity_score ∈ [0, 1] — asserted before return.
|
|
389
|
+
*/
|
|
390
|
+
async query(
|
|
391
|
+
vector: Float32Array,
|
|
392
|
+
topK: number,
|
|
393
|
+
threshold: number,
|
|
394
|
+
): Promise<Array<Omit<VectorRecord, 'vector'> & { similarity_score: number }>>;
|
|
395
|
+
|
|
396
|
+
/**
|
|
397
|
+
* Delete all records whose file_path matches any of the given paths.
|
|
398
|
+
* Used for incremental re-indexing: delete stale → upsert fresh.
|
|
399
|
+
*/
|
|
400
|
+
async deleteByFilePaths(filePaths: string[]): Promise<void>;
|
|
401
|
+
|
|
402
|
+
/** Return total record count in live table. */
|
|
403
|
+
async count(): Promise<number>;
|
|
404
|
+
}
|
|
405
|
+
```
|
|
406
|
+
|
|
407
|
+
**Internal Design:**
|
|
408
|
+
|
|
409
|
+
1. **Connection [GAP-2]:**
|
|
410
|
+
```typescript
|
|
411
|
+
async init(): Promise<void> {
|
|
412
|
+
try {
|
|
413
|
+
this.db = await lancedb.connect(this.storePath);
|
|
414
|
+
} catch (err) {
|
|
415
|
+
throw new NomosError(
|
|
416
|
+
'search_index_corrupted',
|
|
417
|
+
`Failed to open vector store at ${this.storePath}. ` +
|
|
418
|
+
`The index may be corrupted. Run: arc index --force\n` +
|
|
419
|
+
`Original error: ${err instanceof Error ? err.message : String(err)}`
|
|
420
|
+
);
|
|
421
|
+
}
|
|
422
|
+
// [TRAP-1] Detect mergeInsert capability
|
|
423
|
+
try {
|
|
424
|
+
const names = await this.db.tableNames();
|
|
425
|
+
if (names.includes(LIVE_TABLE)) {
|
|
426
|
+
const table = await this.db.openTable(LIVE_TABLE);
|
|
427
|
+
this.hasMergeInsert = typeof table.mergeInsert === 'function';
|
|
428
|
+
}
|
|
429
|
+
} catch {
|
|
430
|
+
this.hasMergeInsert = false;
|
|
431
|
+
}
|
|
432
|
+
}
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
2. **Table names:**
|
|
436
|
+
```typescript
|
|
437
|
+
const LIVE_TABLE = 'nomos_vectors';
|
|
438
|
+
const STAGING_TABLE = 'nomos_vectors_staging';
|
|
439
|
+
```
|
|
440
|
+
|
|
441
|
+
3. **Full re-index flow [BLOCKER-1] — Table-swap strategy:**
|
|
442
|
+
- Indexer calls `cleanupStaging()` at start (remove orphaned staging table from prior crash).
|
|
443
|
+
- For each embedding batch, indexer calls `upsertToStaging(batchRecords)`.
|
|
444
|
+
- After all batches complete, indexer calls `promoteStagingToLive()`.
|
|
445
|
+
- `promoteStagingToLive()` implementation:
|
|
446
|
+
```typescript
|
|
447
|
+
async promoteStagingToLive(): Promise<void> {
|
|
448
|
+
const names = await this.db!.tableNames();
|
|
449
|
+
if (!names.includes(STAGING_TABLE)) {
|
|
450
|
+
throw new NomosError('search_index_failed', 'Staging table does not exist. Index may have failed.');
|
|
451
|
+
}
|
|
452
|
+
// Drop old live table if it exists
|
|
453
|
+
if (names.includes(LIVE_TABLE)) {
|
|
454
|
+
await this.db!.dropTable(LIVE_TABLE);
|
|
455
|
+
}
|
|
456
|
+
// Rename staging → live
|
|
457
|
+
// LanceDB does not have a native rename. Workaround:
|
|
458
|
+
// Read all from staging, create new live table, drop staging.
|
|
459
|
+
const staging = await this.db!.openTable(STAGING_TABLE);
|
|
460
|
+
const allData = await staging.query().toArray();
|
|
461
|
+
await this.db!.createTable(LIVE_TABLE, allData, { mode: 'overwrite' });
|
|
462
|
+
await this.db!.dropTable(STAGING_TABLE);
|
|
463
|
+
}
|
|
464
|
+
```
|
|
465
|
+
- **SIGINT safety [GAP-3]:** If SIGINT fires while batches are being embedded and written, the live table is never touched (staging is the only write target until `promoteStagingToLive()` runs). On the next run, `cleanupStaging()` removes the orphaned staging table.
|
|
466
|
+
|
|
467
|
+
4. **Upsert strategy [TRAP-1]:**
|
|
468
|
+
- If `hasMergeInsert === true`: use `table.mergeInsert('id')` for incremental upserts to the live table.
|
|
469
|
+
- If `hasMergeInsert === false`: use `table.overwrite(records)` which atomically replaces all data. For incremental upserts, fall back to: read existing data into memory, merge by `id`, then `overwrite`. This is safe because incremental re-indexes only change a subset of files.
|
|
470
|
+
- **No delete-then-add fallback.** Delete-then-add creates a window where records are missing.
|
|
471
|
+
|
|
472
|
+
5. **Query with centralized distance conversion [S-3]:**
|
|
473
|
+
```typescript
|
|
474
|
+
async query(vector, topK, threshold): Promise<...> {
|
|
475
|
+
const table = await this.db!.openTable(LIVE_TABLE);
|
|
476
|
+
const raw = await table.vectorSearch(vector)
|
|
477
|
+
.distanceType('cosine')
|
|
478
|
+
.limit(topK * 2) // over-fetch to allow post-filter
|
|
479
|
+
.toArray();
|
|
480
|
+
|
|
481
|
+
return raw
|
|
482
|
+
.map(r => {
|
|
483
|
+
const similarity = 1 - r._distance;
|
|
484
|
+
// [S-3] Assert similarity ∈ [0, 1]
|
|
485
|
+
const clamped = Math.max(0, Math.min(1, similarity));
|
|
486
|
+
return { ...r, similarity_score: clamped, vector: undefined };
|
|
487
|
+
})
|
|
488
|
+
.filter(r => r.similarity_score >= threshold)
|
|
489
|
+
.slice(0, topK);
|
|
490
|
+
}
|
|
491
|
+
```
|
|
492
|
+
The `vector` field is stripped from query results — it MUST NOT appear in any output.
|
|
493
|
+
|
|
494
|
+
**Validation:**
|
|
495
|
+
```bash
|
|
496
|
+
npx tsc --noEmit # → exit code 0
|
|
497
|
+
# Unit test (Step 7.1.2):
|
|
498
|
+
npx vitest run src/search/__tests__/vector-store.test.ts
|
|
499
|
+
```
|
|
500
|
+
|
|
501
|
+
**Rollback:** `rm src/search/vector-store.ts`
|
|
502
|
+
|
|
503
|
+
---
|
|
504
|
+
|
|
505
|
+
#### Step 7.1.2: Unit test for `VectorStore`
|
|
506
|
+
|
|
507
|
+
**Pre-Condition:** Step 7.1.1 complete.
|
|
508
|
+
|
|
509
|
+
**Action:** Create `src/search/__tests__/vector-store.test.ts`.
|
|
510
|
+
|
|
511
|
+
**Test cases:**
|
|
512
|
+
1. `init()` creates the DB directory if it does not exist.
|
|
513
|
+
2. `init()` wraps connection failure in `NomosError('search_index_corrupted')` [GAP-2].
|
|
514
|
+
3. `upsert()` inserts records; `count()` returns correct total.
|
|
515
|
+
4. `upsert()` with duplicate `id` overwrites (not duplicates) — regardless of `mergeInsert` availability.
|
|
516
|
+
5. `query()` returns results ranked by cosine similarity (closest first).
|
|
517
|
+
6. `query()` with threshold filters out low-similarity results.
|
|
518
|
+
7. `query()` returns `similarity_score` ∈ [0, 1] — never negative, never > 1 [S-3].
|
|
519
|
+
8. `query()` results do NOT contain `vector` field [S-5].
|
|
520
|
+
9. `deleteByFilePaths()` removes only matching records.
|
|
521
|
+
10. **Table-swap full cycle [BLOCKER-1]:**
|
|
522
|
+
- `upsertToStaging()` writes to staging table.
|
|
523
|
+
- During staging, `query()` on live table still returns old data (zero-downtime).
|
|
524
|
+
- `promoteStagingToLive()` swaps tables atomically.
|
|
525
|
+
- After promotion, `query()` returns new data.
|
|
526
|
+
11. `cleanupStaging()` removes orphaned staging table without affecting live table [GAP-3].
|
|
527
|
+
12. Concurrent `upsert()` calls do not corrupt the store.
|
|
528
|
+
|
|
529
|
+
**Test setup:** Use a temporary directory (`os.tmpdir()`) for each test. Clean up with `fs.rm(dir, { recursive: true })` in `afterEach`.
|
|
530
|
+
|
|
531
|
+
**Validation:**
|
|
532
|
+
```bash
|
|
533
|
+
npx vitest run src/search/__tests__/vector-store.test.ts # → 0 failures
|
|
534
|
+
```
|
|
535
|
+
|
|
536
|
+
---
|
|
537
|
+
|
|
538
|
+
### Task 7.2: Embedding Client — Gemini `gemini-embedding-001`
|
|
539
|
+
|
|
540
|
+
---
|
|
541
|
+
|
|
542
|
+
#### Step 7.2.1: Create `src/search/embedder.ts`
|
|
543
|
+
|
|
544
|
+
**Pre-Condition:** `@google/generative-ai` already installed (Phase 6). Config search section defined.
|
|
545
|
+
|
|
546
|
+
**Action:** Create `src/search/embedder.ts` implementing the `Embedder` class.
|
|
547
|
+
|
|
548
|
+
**Class API:**
|
|
549
|
+
|
|
550
|
+
```typescript
|
|
551
|
+
export class Embedder {
|
|
552
|
+
constructor(config: NomosConfig['search'], logger: Logger);
|
|
553
|
+
|
|
554
|
+
/**
|
|
555
|
+
* Embed a single text string. Returns a Float32Array vector.
|
|
556
|
+
* Used for query-time embedding.
|
|
557
|
+
* Subject to request_timeout_ms [GAP-4].
|
|
558
|
+
*/
|
|
559
|
+
async embedOne(text: string): Promise<Float32Array>;
|
|
560
|
+
|
|
561
|
+
/**
|
|
562
|
+
* Embed a batch of text strings. Returns Float32Array[] in the same order.
|
|
563
|
+
* Processes batches SEQUENTIALLY with rate-limit delay [TRAP-2 resolution].
|
|
564
|
+
* Each API call subject to request_timeout_ms [GAP-4].
|
|
565
|
+
* Logs rate-limit delays [S-8].
|
|
566
|
+
*/
|
|
567
|
+
async embedBatch(
|
|
568
|
+
texts: string[],
|
|
569
|
+
onBatchComplete?: (batchIndex: number, totalBatches: number) => void,
|
|
570
|
+
): Promise<Float32Array[]>;
|
|
571
|
+
|
|
572
|
+
/** Return the vector dimensions for the configured model. */
|
|
573
|
+
get dimensions(): number;
|
|
574
|
+
}
|
|
575
|
+
```
|
|
576
|
+
|
|
577
|
+
**Internal Design:**
|
|
578
|
+
|
|
579
|
+
1. **Client initialization:** `new GoogleGenerativeAI(apiKey)` where `apiKey = process.env['GEMINI_API_KEY']`. Throw `NomosError('search_api_key_missing', ...)` if not set.
|
|
580
|
+
2. **Model:** `client.getGenerativeModel({ model: config.embedding_model })` → defaults to `gemini-embedding-001`.
|
|
581
|
+
3. **Embedding call:** `model.embedContent(text)` for a single text; `model.batchEmbedContents(requests)` for a batch.
|
|
582
|
+
|
|
583
|
+
4. **Request timeout [GAP-4]:**
|
|
584
|
+
```typescript
|
|
585
|
+
private async withTimeout<T>(promise: Promise<T>, label: string): Promise<T> {
|
|
586
|
+
const controller = new AbortController();
|
|
587
|
+
const timer = setTimeout(() => controller.abort(), this.config.request_timeout_ms);
|
|
588
|
+
try {
|
|
589
|
+
// Pass abort signal to the request if the SDK supports it.
|
|
590
|
+
// If not, use Promise.race as a fallback:
|
|
591
|
+
return await Promise.race([
|
|
592
|
+
promise,
|
|
593
|
+
new Promise<never>((_, reject) => {
|
|
594
|
+
controller.signal.addEventListener('abort', () => {
|
|
595
|
+
reject(new NomosError(
|
|
596
|
+
'search_embedding_failed',
|
|
597
|
+
`Embedding request timed out after ${this.config.request_timeout_ms}ms (${label})`
|
|
598
|
+
));
|
|
599
|
+
});
|
|
600
|
+
}),
|
|
601
|
+
]);
|
|
602
|
+
} finally {
|
|
603
|
+
clearTimeout(timer);
|
|
604
|
+
}
|
|
605
|
+
}
|
|
606
|
+
```
|
|
607
|
+
|
|
608
|
+
5. **Sequential batching with rate-limit logging [TRAP-2, S-8]:**
|
|
609
|
+
```typescript
|
|
610
|
+
async embedBatch(texts: string[], onBatchComplete?): Promise<Float32Array[]> {
|
|
611
|
+
const results: Float32Array[] = [];
|
|
612
|
+
const batches = chunk(texts, this.config.batch_size);
|
|
613
|
+
const delayMs = Math.ceil(60_000 / this.config.embedding_requests_per_minute);
|
|
614
|
+
|
|
615
|
+
for (let i = 0; i < batches.length; i++) {
|
|
616
|
+
if (i > 0) {
|
|
617
|
+
this.logger.warn(
|
|
618
|
+
`[nomos:search:warn] Rate limiting. Waiting ${delayMs}ms before batch ${i + 1}/${batches.length}...`
|
|
619
|
+
);
|
|
620
|
+
await sleep(delayMs);
|
|
621
|
+
}
|
|
622
|
+
const batch = batches[i];
|
|
623
|
+
const vectors = await this.withTimeout(
|
|
624
|
+
this.embedBatchRaw(batch),
|
|
625
|
+
`batch ${i + 1}/${batches.length}`
|
|
626
|
+
);
|
|
627
|
+
results.push(...vectors);
|
|
628
|
+
onBatchComplete?.(i, batches.length);
|
|
629
|
+
}
|
|
630
|
+
return results;
|
|
631
|
+
}
|
|
632
|
+
```
|
|
633
|
+
No concurrency. No `max_concurrent_requests`. One batch at a time. A single config value (`embedding_requests_per_minute`) controls timing.
|
|
634
|
+
|
|
635
|
+
6. **Retry:** Exponential backoff (2s, 4s, 8s) with jitter on 429/5xx errors. Max 3 retries per batch. On permanent failure, throw `NomosError('search_embedding_failed', ...)`.
|
|
636
|
+
7. **Output normalization:** Gemini returns `{ embedding: { values: number[] } }`. Convert `values` to `new Float32Array(values)`.
|
|
637
|
+
8. **Vector dimensions:** Configurable via `config.embedding_dimensions` (default: 768). `gemini-embedding-001` supports 768, 1536, and 3072. Pass `outputDimensionality` to the API call: `model.embedContent({ content, outputDimensionality: config.embedding_dimensions })`. Exposed via `get dimensions()` which returns `config.embedding_dimensions`.
|
|
638
|
+
|
|
639
|
+
**Validation:**
|
|
640
|
+
```bash
|
|
641
|
+
npx tsc --noEmit # → exit code 0
|
|
642
|
+
```
|
|
643
|
+
|
|
644
|
+
**Rollback:** `rm src/search/embedder.ts`
|
|
645
|
+
|
|
646
|
+
---
|
|
647
|
+
|
|
648
|
+
#### Step 7.2.2: Unit test for `Embedder`
|
|
649
|
+
|
|
650
|
+
**Pre-Condition:** Step 7.2.1 complete.
|
|
651
|
+
|
|
652
|
+
**Action:** Create `src/search/__tests__/embedder.test.ts`.
|
|
653
|
+
|
|
654
|
+
**Test cases (mocked — no real API calls):**
|
|
655
|
+
1. `embedOne()` calls Gemini API with correct model and returns Float32Array.
|
|
656
|
+
2. `embedBatch()` splits input into chunks of `batch_size` and processes sequentially.
|
|
657
|
+
3. `embedBatch()` respects rate limit delay between batches (verify `setTimeout` calls).
|
|
658
|
+
4. `embedBatch()` logs rate-limit wait message [S-8].
|
|
659
|
+
5. `embedOne()` retries on 429 error with exponential backoff.
|
|
660
|
+
6. `embedOne()` throws `NomosError('search_embedding_failed')` after max retries.
|
|
661
|
+
7. `embedOne()` throws `NomosError('search_api_key_missing')` when GEMINI_API_KEY is unset.
|
|
662
|
+
8. **Timeout [GAP-4]:** `embedOne()` throws `NomosError('search_embedding_failed', /timed out/)` when API hangs past `request_timeout_ms`.
|
|
663
|
+
9. Vector dimensions match `config.embedding_dimensions` (default 768). Verify by checking `embedder.dimensions === config.embedding_dimensions` and `vector.length === config.embedding_dimensions`.
|
|
664
|
+
10. `onBatchComplete` callback fires after each batch.
|
|
665
|
+
|
|
666
|
+
**Mock strategy:** Mock `@google/generative-ai` module. Return predictable vectors (e.g., all-zeros with known dimensions) from mocked `embedContent` / `batchEmbedContents`. For timeout test, return a never-resolving promise.
|
|
667
|
+
|
|
668
|
+
**Validation:**
|
|
669
|
+
```bash
|
|
670
|
+
npx vitest run src/search/__tests__/embedder.test.ts # → 0 failures
|
|
671
|
+
```
|
|
672
|
+
|
|
673
|
+
---
|
|
674
|
+
|
|
675
|
+
### Task 7.3: Chunk Extractor — Text Preparation
|
|
676
|
+
|
|
677
|
+
---
|
|
678
|
+
|
|
679
|
+
#### Step 7.3.1: Create `src/search/chunk-extractor.ts`
|
|
680
|
+
|
|
681
|
+
**Pre-Condition:** Types from Step 7.0.5 defined. `ProjectMap` type available.
|
|
682
|
+
|
|
683
|
+
**Action:** Create `src/search/chunk-extractor.ts` implementing the `ChunkExtractor` class.
|
|
684
|
+
|
|
685
|
+
**Class API:**
|
|
686
|
+
|
|
687
|
+
```typescript
|
|
688
|
+
export class ChunkExtractor {
|
|
689
|
+
constructor(private readonly projectRoot: string, logger: Logger);
|
|
690
|
+
|
|
691
|
+
/**
|
|
692
|
+
* Extract file-level and symbol-level TextChunks from a ProjectMap.
|
|
693
|
+
* Uses inline semantic data from ProjectMap — no .semantic.md file reads [TRAP-5].
|
|
694
|
+
*/
|
|
695
|
+
extract(map: ProjectMap): TextChunk[];
|
|
696
|
+
}
|
|
697
|
+
```
|
|
698
|
+
|
|
699
|
+
**Internal Design:**
|
|
700
|
+
|
|
701
|
+
1. **File-level chunks:** For each `FileNode` in `map.files`:
|
|
702
|
+
- **If `semantic` is non-null (inline in ProjectMap):** Compose text from `semantic.overview`, `semantic.purpose`, `semantic.key_logic[]`, `semantic.usage_context[]`. Delimit fields with labeled headers:
|
|
703
|
+
```
|
|
704
|
+
File: {file_path}
|
|
705
|
+
Purpose: {semantic.purpose}
|
|
706
|
+
Overview: {semantic.overview}
|
|
707
|
+
Key Logic: {key_logic.join('; ')}
|
|
708
|
+
Usage Context: {usage_context.join('; ')}
|
|
709
|
+
Exports: {symbols.filter(s => s.exported).map(s => s.name).join(', ')}
|
|
710
|
+
Dependencies: {dependencies.join(', ')}
|
|
711
|
+
```
|
|
712
|
+
- **If `semantic` is null (fallback):** Use file path, symbol names, and import sources as the text. Log a warning per file.
|
|
713
|
+
- **Chunk ID:** `file_path` (relative, e.g., `"src/services/payment.ts"`).
|
|
714
|
+
|
|
715
|
+
2. **Content hash [S-6]:**
|
|
716
|
+
```typescript
|
|
717
|
+
// Hash RAW INPUTS, not composed text.
|
|
718
|
+
// This decouples hashing from text composition logic.
|
|
719
|
+
// If composition format changes, hashes remain stable.
|
|
720
|
+
const hashInput = JSON.stringify({
|
|
721
|
+
file_path: fileNode.file_path,
|
|
722
|
+
semantic: fileNode.semantic, // null-safe
|
|
723
|
+
symbols: fileNode.symbols.map(s => ({ name: s.name, kind: s.kind, signature: s.signature })),
|
|
724
|
+
dependencies: fileNode.dependencies,
|
|
725
|
+
});
|
|
726
|
+
const content_hash = crypto.createHash('sha256').update(hashInput).digest('hex');
|
|
727
|
+
```
|
|
728
|
+
|
|
729
|
+
3. **Symbol-level chunks:** For each `FileNode`, iterate over `symbols[]` where `exported === true` OR `kind === 'class'` OR `kind === 'function'`:
|
|
730
|
+
- Compose text:
|
|
731
|
+
```
|
|
732
|
+
Symbol: {symbol.name} ({symbol.kind})
|
|
733
|
+
File: {file_path}
|
|
734
|
+
Signature: {symbol.signature ?? 'N/A'}
|
|
735
|
+
Lines: {symbol.line}-{symbol.end_line ?? '?'}
|
|
736
|
+
File Purpose: {semantic.purpose ?? file_path}
|
|
737
|
+
```
|
|
738
|
+
- **Chunk ID:** `"{file_path}::{symbol.name}"`.
|
|
739
|
+
- **`parent_file_id`:** `file_path`.
|
|
740
|
+
- **`content_hash`:** Computed from raw symbol data (name, kind, signature, line, end_line) + parent file semantic hash.
|
|
741
|
+
|
|
742
|
+
4. **No `.semantic.md` file reading [TRAP-5 resolution]:**
|
|
743
|
+
The original plan derived `.semantic.md` paths via fragile regex (`file_path.replace(/\.[^.]+$/, '.semantic.md')`). This breaks for:
|
|
744
|
+
- Multi-extension files (`config.test.ts` → `config.test.semantic.md` — wrong).
|
|
745
|
+
- Extensionless files (`Makefile` → the regex finds no extension to replace, so the path is left unchanged — wrong).
|
|
746
|
+
|
|
747
|
+
**Fix:** The `ProjectMap` already contains semantic data inline in `FileNode.semantic`. Use it directly. No filesystem reads for `.semantic.md`. This eliminates the path derivation problem entirely.
|
|
748
|
+
|
|
749
|
+
5. **Empty text guard:** Skip chunks where composed text is shorter than 20 characters.
|
|
750
|
+
|
|
751
|
+
**Validation:**
|
|
752
|
+
```bash
|
|
753
|
+
npx tsc --noEmit # → exit code 0
|
|
754
|
+
```
|
|
755
|
+
|
|
756
|
+
**Rollback:** `rm src/search/chunk-extractor.ts`
|
|
757
|
+
|
|
758
|
+
---
|
|
759
|
+
|
|
760
|
+
#### Step 7.3.2: Unit test for `ChunkExtractor`
|
|
761
|
+
|
|
762
|
+
**Pre-Condition:** Step 7.3.1 complete.
|
|
763
|
+
|
|
764
|
+
**Action:** Create `src/search/__tests__/chunk-extractor.test.ts`.
|
|
765
|
+
|
|
766
|
+
**Test cases:**
|
|
767
|
+
1. File with `semantic` data produces a file-level chunk with all fields concatenated.
|
|
768
|
+
2. File without `semantic` data produces a fallback chunk with file path and symbol names.
|
|
769
|
+
3. Exported symbols produce symbol-level chunks.
|
|
770
|
+
4. Non-exported, non-class, non-function symbols are skipped.
|
|
771
|
+
5. Chunk IDs are correct: file path for file-level, `file::symbol` for symbol-level.
|
|
772
|
+
6. `parent_file_id` on symbol chunks points to parent file.
|
|
773
|
+
7. Chunks shorter than 20 chars are skipped.
|
|
774
|
+
8. `content_hash` is deterministic (same input → same hash) [S-6].
|
|
775
|
+
9. `content_hash` does NOT change when text composition format changes (only when raw inputs change) [S-6].
|
|
776
|
+
10. No filesystem reads for `.semantic.md` files [TRAP-5].
|
|
777
|
+
|
|
778
|
+
**Test setup:** Create a minimal `ProjectMap` object in-memory with 2–3 `FileNode` entries (one with semantic, one without, one with symbols).
|
|
779
|
+
|
|
780
|
+
**Validation:**
|
|
781
|
+
```bash
|
|
782
|
+
npx vitest run src/search/__tests__/chunk-extractor.test.ts # → 0 failures
|
|
783
|
+
```
|
|
784
|
+
|
|
785
|
+
---
|
|
786
|
+
|
|
787
|
+
### Task 7.4: Indexing Pipeline — Extract → Embed → Upsert (Streaming + Table-Swap)
|
|
788
|
+
|
|
789
|
+
---
|
|
790
|
+
|
|
791
|
+
#### Step 7.4.1: Create `src/search/indexer.ts`
|
|
792
|
+
|
|
793
|
+
**Pre-Condition:** Tasks 7.1–7.3 complete. All sub-modules functional.
|
|
794
|
+
|
|
795
|
+
**Action:** Create `src/search/indexer.ts` implementing the `SearchIndexer` class.
|
|
796
|
+
|
|
797
|
+
**Class API:**
|
|
798
|
+
|
|
799
|
+
```typescript
|
|
800
|
+
export class SearchIndexer {
|
|
801
|
+
constructor(
|
|
802
|
+
projectRoot: string,
|
|
803
|
+
config: NomosConfig,
|
|
804
|
+
logger: Logger,
|
|
805
|
+
);
|
|
806
|
+
|
|
807
|
+
/**
|
|
808
|
+
* Full index: extract all chunks, embed in streaming batches, upsert to staging,
|
|
809
|
+
* then atomic table-swap to live [BLOCKER-1].
|
|
810
|
+
* Writes metadata with status tracking [BLOCKER-2].
|
|
811
|
+
* Returns IndexMetadata.
|
|
812
|
+
*/
|
|
813
|
+
async fullIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata>;
|
|
814
|
+
|
|
815
|
+
/**
|
|
816
|
+
* Incremental index: validates dimensions [BLOCKER-3], re-indexes changed + failed files [GAP-1].
|
|
817
|
+
* Returns updated IndexMetadata.
|
|
818
|
+
*/
|
|
819
|
+
async incrementalIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata>;
|
|
820
|
+
|
|
821
|
+
/**
|
|
822
|
+
* Dry-run: extract and count chunks without embedding or writing [S-2].
|
|
823
|
+
* Returns chunk count summary.
|
|
824
|
+
*/
|
|
825
|
+
async dryRun(): Promise<{ fileChunks: number; symbolChunks: number; totalChunks: number }>;
|
|
826
|
+
}
|
|
827
|
+
```
|
|
828
|
+
|
|
829
|
+
**Internal Design — Full Index Flow [BLOCKER-1, BLOCKER-2, BLOCKER-4]:**
|
|
830
|
+
|
|
831
|
+
```
|
|
832
|
+
1. Load project_map.json from config.graph.output_dir
|
|
833
|
+
└─ If not found: throw NomosError('search_index_failed', 'project_map.json not found. Run: arc map')
|
|
834
|
+
|
|
835
|
+
2. Write IndexMetadata with status: "in_progress" [BLOCKER-2]
|
|
836
|
+
└─ This marks the index as incomplete BEFORE any mutation occurs.
|
|
837
|
+
└─ If process crashes after this point, next startup detects "in_progress" → forces full re-index.
|
|
838
|
+
|
|
839
|
+
3. ChunkExtractor.extract(projectMap) → TextChunk[]
|
|
840
|
+
└─ Log: "[nomos:search:info] Extracted {N} chunks ({F} file-level, {S} symbol-level)"
|
|
841
|
+
|
|
842
|
+
4. VectorStore.cleanupStaging() → remove orphaned staging table from prior crash [GAP-3]
|
|
843
|
+
|
|
844
|
+
5. STREAMING BATCH LOOP [BLOCKER-4]:
|
|
845
|
+
└─ Split TextChunk[] into batches of config.batch_size
|
|
846
|
+
└─ For each batch:
|
|
847
|
+
a. Check cancellationFlag — if cancelled, skip the promotion in step 6 and go to step 7 (partial completion)
|
|
848
|
+
b. Embedder.embedBatch(batch.map(c => c.text)) → Float32Array[]
|
|
849
|
+
└─ On batch failure: log error, record file_paths in failedFiles[], continue [GAP-1]
|
|
850
|
+
c. Compose VectorRecord[] for THIS BATCH ONLY
|
|
851
|
+
└─ graph_depth: from projectMap.files[file_path].depth
|
|
852
|
+
└─ dependents_count: from projectMap.files[file_path].dependents.length
|
|
853
|
+
└─ last_indexed: new Date().toISOString()
|
|
854
|
+
d. VectorStore.upsertToStaging(batchRecords) → immediate write, release references
|
|
855
|
+
e. Log: "[nomos:search:info] Embedded batch {i}/{total} ({N} chunks)"
|
|
856
|
+
|
|
857
|
+
6. VectorStore.promoteStagingToLive() [BLOCKER-1]
|
|
858
|
+
└─ Atomic swap: staging table becomes live table.
|
|
859
|
+
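└─ If this fails before the old live table is dropped (e.g. staging table missing): live table still contains OLD data (safe). A failure after the drop leaves no live table; metadata is still "in_progress", so the next run forces a full re-index. Throw error.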
|
|
860
|
+
|
|
861
|
+
7. Write IndexMetadata with status: "complete" [BLOCKER-2]
|
|
862
|
+
└─ Includes: total counts, per-file hashes, embedding_model, vector_dimensions, failed_files
|
|
863
|
+
└─ Atomic write: write to .tmp file, then rename.
|
|
864
|
+
└─ If cancelled in step 5: write with status: "in_progress" + partial counts + failed_files.
|
|
865
|
+
(Next run detects "in_progress" → forces full re-index.)
|
|
866
|
+
|
|
867
|
+
8. Return IndexMetadata
|
|
868
|
+
```
|
|
869
|
+
|
|
870
|
+
**Internal Design — Incremental Index Flow [BLOCKER-3, GAP-1]:**
|
|
871
|
+
|
|
872
|
+
```
|
|
873
|
+
1. Load project_map.json
|
|
874
|
+
|
|
875
|
+
2. Load existing IndexMetadata from {vector_store_path}/index-meta.json
|
|
876
|
+
└─ If not found: fall back to fullIndex()
|
|
877
|
+
└─ If status === "in_progress": fall back to fullIndex() [BLOCKER-2]
|
|
878
|
+
Log: "[nomos:search:warn] Previous index incomplete. Running full re-index."
|
|
879
|
+
|
|
880
|
+
3. DIMENSION VALIDATION [BLOCKER-3]:
|
|
881
|
+
└─ Compare IndexMetadata.embedding_model against config.search.embedding_model
|
|
882
|
+
└─ Compare IndexMetadata.vector_dimensions against config.search.embedding_dimensions
|
|
883
|
+
└─ If EITHER mismatches:
|
|
884
|
+
Log: "[nomos:search:warn] Embedding model/dimensions changed ({old} → {new}). Forcing full re-index."
|
|
885
|
+
Fall back to fullIndex()
|
|
886
|
+
|
|
887
|
+
4. ChunkExtractor.extract(projectMap) → TextChunk[]
|
|
888
|
+
|
|
889
|
+
5. Compute diff:
|
|
890
|
+
└─ changed = chunks where content_hash differs from IndexMetadata.files[file_path].content_hash
|
|
891
|
+
└─ new_files = chunks whose file_path is not in IndexMetadata.files
|
|
892
|
+
└─ removed = files in IndexMetadata.files but not in projectMap
|
|
893
|
+
└─ failed_retry = chunks whose file_path is in IndexMetadata.failed_files [GAP-1]
|
|
894
|
+
└─ to_reindex = union(changed, new_files, failed_retry)
|
|
895
|
+
|
|
896
|
+
6. If to_reindex is empty AND removed is empty:
|
|
897
|
+
└─ Log: "[nomos:search:info] Index is up-to-date. No changes detected."
|
|
898
|
+
└─ Return existing IndexMetadata unchanged.
|
|
899
|
+
|
|
900
|
+
7. Write IndexMetadata with status: "in_progress" [BLOCKER-2]
|
|
901
|
+
|
|
902
|
+
8. VectorStore.deleteByFilePaths([...removed, ...to_reindex.map(c => c.file_path)])
|
|
903
|
+
|
|
904
|
+
9. STREAMING BATCH LOOP for to_reindex [BLOCKER-4]:
|
|
905
|
+
└─ (Same pattern as full index step 5 — embed batch, compose records, upsert immediately)
|
|
906
|
+
|
|
907
|
+
10. Update IndexMetadata:
|
|
908
|
+
└─ status: "complete"
|
|
909
|
+
└─ Update per-file entries for changed/new files
|
|
910
|
+
└─ Remove entries for removed files
|
|
911
|
+
└─ Clear failed_files for successfully re-embedded files; keep any that failed again [GAP-1]
|
|
912
|
+
└─ Update last_incremental_index, total counts
|
|
913
|
+
|
|
914
|
+
11. Write IndexMetadata (atomic: .tmp then rename) [BLOCKER-2]
|
|
915
|
+
|
|
916
|
+
12. Return IndexMetadata
|
|
917
|
+
```
|
|
918
|
+
|
|
919
|
+
**Cancellation safety [GAP-3]:**
|
|
920
|
+
- The cancellation flag is checked at the TOP of each batch iteration (full index step 5a / incremental index step 9).
|
|
921
|
+
- On cancellation during full index: staging table is orphaned (harmless). Live table untouched. Metadata written as `"in_progress"`. Next run cleans up staging and forces full re-index.
|
|
922
|
+
- On cancellation during incremental index: partial upserts are durable in the live table (LanceDB transactional writes). Metadata written as `"in_progress"`. Next run forces full re-index.
|
|
923
|
+
|
|
924
|
+
**Validation:**
|
|
925
|
+
```bash
|
|
926
|
+
npx tsc --noEmit # → exit code 0
|
|
927
|
+
```
|
|
928
|
+
|
|
929
|
+
**Rollback:** `rm src/search/indexer.ts`
|
|
930
|
+
|
|
931
|
+
---
|
|
932
|
+
|
|
933
|
+
#### Step 7.4.2: Unit test for `SearchIndexer`
|
|
934
|
+
|
|
935
|
+
**Pre-Condition:** Step 7.4.1 complete.
|
|
936
|
+
|
|
937
|
+
**Action:** Create `src/search/__tests__/indexer.test.ts`.
|
|
938
|
+
|
|
939
|
+
**Test cases (mocked Embedder, real VectorStore with temp dir):**
|
|
940
|
+
1. `fullIndex()` loads project map, extracts chunks, embeds, upserts via staging, swaps, writes metadata.
|
|
941
|
+
2. `fullIndex()` uses table-swap (staging → live) NOT `reset()` + `upsert()` [BLOCKER-1].
|
|
942
|
+
3. `fullIndex()` writes `status: "in_progress"` BEFORE embedding, `status: "complete"` AFTER [BLOCKER-2].
|
|
943
|
+
4. `fullIndex()` upserts per-batch, not all-at-once [BLOCKER-4].
|
|
944
|
+
5. `incrementalIndex()` only re-embeds changed files (verify by checking `embedBatch` call count).
|
|
945
|
+
6. `incrementalIndex()` re-embeds files in `failed_files` even if hash unchanged [GAP-1].
|
|
946
|
+
7. `incrementalIndex()` deletes records for removed files.
|
|
947
|
+
8. `incrementalIndex()` falls back to `fullIndex()` when no metadata exists.
|
|
948
|
+
9. `incrementalIndex()` falls back to `fullIndex()` when `status === "in_progress"` [BLOCKER-2].
|
|
949
|
+
10. `incrementalIndex()` falls back to `fullIndex()` on embedding model mismatch [BLOCKER-3].
|
|
950
|
+
11. `incrementalIndex()` falls back to `fullIndex()` on vector dimension mismatch [BLOCKER-3].
|
|
951
|
+
12. Index metadata file is written with correct totals, per-file hashes, and `failed_files`.
|
|
952
|
+
13. Cancellation flag stops processing between batches; metadata written as `"in_progress"`.
|
|
953
|
+
14. Missing `project_map.json` throws `NomosError('search_index_failed')`.
|
|
954
|
+
15. Partial embedding failure: failed files recorded in `IndexMetadata.failed_files` [GAP-1], remaining files indexed successfully.
|
|
955
|
+
16. `dryRun()` returns chunk counts without calling Embedder or VectorStore [S-2].
|
|
956
|
+
|
|
957
|
+
**Validation:**
|
|
958
|
+
```bash
|
|
959
|
+
npx vitest run src/search/__tests__/indexer.test.ts # → 0 failures
|
|
960
|
+
```
|
|
961
|
+
|
|
962
|
+
---
|
|
963
|
+
|
|
964
|
+
### Task 7.5: Query Engine — Search Flow
|
|
965
|
+
|
|
966
|
+
---
|
|
967
|
+
|
|
968
|
+
#### Step 7.5.1: Create `src/search/graph-enricher.ts`
|
|
969
|
+
|
|
970
|
+
**Pre-Condition:** `ProjectMap` type available.
|
|
971
|
+
|
|
972
|
+
**Action:** Create `src/search/graph-enricher.ts` implementing the `GraphEnricher` class.
|
|
973
|
+
|
|
974
|
+
**Class API:**
|
|
975
|
+
|
|
976
|
+
```typescript
|
|
977
|
+
export class GraphEnricher {
|
|
978
|
+
private projectMap: ProjectMap | null = null;
|
|
979
|
+
|
|
980
|
+
constructor(
|
|
981
|
+
private readonly projectMapPath: string,
|
|
982
|
+
private readonly logger: Logger,
|
|
983
|
+
);
|
|
984
|
+
|
|
985
|
+
/**
|
|
986
|
+
* Lazy-load and cache project_map.json [GAP-5].
|
|
987
|
+
* Parsed once per instance lifetime. Subsequent calls return cached data.
|
|
988
|
+
*/
|
|
989
|
+
private async loadMap(): Promise<ProjectMap>;
|
|
990
|
+
|
|
991
|
+
/**
|
|
992
|
+
* Enrich raw search results with dependency graph metadata.
|
|
993
|
+
* Stale results (file deleted since index) get is_stale = true [TRAP-4].
|
|
994
|
+
*/
|
|
995
|
+
async enrich(results: RawSearchResult[]): Promise<SearchResult[]>;
|
|
996
|
+
}
|
|
997
|
+
|
|
998
|
+
interface RawSearchResult {
|
|
999
|
+
id: string;
|
|
1000
|
+
type: ChunkType;
|
|
1001
|
+
file_path: string;
|
|
1002
|
+
symbol_name: string | null;
|
|
1003
|
+
symbol_type: string | null;
|
|
1004
|
+
line_start: number | null;
|
|
1005
|
+
line_end: number | null;
|
|
1006
|
+
purpose: string;
|
|
1007
|
+
similarity_score: number;
|
|
1008
|
+
}
|
|
1009
|
+
```
|
|
1010
|
+
|
|
1011
|
+
**Internal Design:**
|
|
1012
|
+
|
|
1013
|
+
1. **Lazy loading [GAP-5]:**
|
|
1014
|
+
```typescript
|
|
1015
|
+
private async loadMap(): Promise<ProjectMap> {
|
|
1016
|
+
if (this.projectMap) return this.projectMap;
|
|
1017
|
+
const raw = await fs.readFile(this.projectMapPath, 'utf-8');
|
|
1018
|
+
this.projectMap = JSON.parse(raw);
|
|
1019
|
+
return this.projectMap;
|
|
1020
|
+
}
|
|
1021
|
+
```
|
|
1022
|
+
For CLI usage (one search per process), the map is loaded once. No repeated 20MB parses.
|
|
1023
|
+
|
|
1024
|
+
2. For each result, look up `projectMap.files[result.file_path]`.
|
|
1025
|
+
3. If found: set `graph_depth`, `dependents_count`, `is_core_module`, `is_stale = false`.
|
|
1026
|
+
4. **If NOT found [TRAP-4]:** set `graph_depth = -1`, `dependents_count = 0`, `is_core_module = false`, `is_stale = true`. The `-1` sentinel is an internal value — it NEVER reaches CLI output directly. The `is_stale` boolean is the public signal.
|
|
1027
|
+
5. Sort results by `similarity_score` descending (preserve embedding rank).
|
|
1028
|
+
|
|
1029
|
+
**Validation:**
|
|
1030
|
+
```bash
|
|
1031
|
+
npx tsc --noEmit # → exit code 0
|
|
1032
|
+
```
|
|
1033
|
+
|
|
1034
|
+
**Rollback:** `rm src/search/graph-enricher.ts`
|
|
1035
|
+
|
|
1036
|
+
---
|
|
1037
|
+
|
|
1038
|
+
#### Step 7.5.2: Create `src/search/query-engine.ts`
|
|
1039
|
+
|
|
1040
|
+
**Pre-Condition:** Steps 7.1.1, 7.2.1, 7.5.1 complete.
|
|
1041
|
+
|
|
1042
|
+
**Action:** Create `src/search/query-engine.ts` implementing the `QueryEngine` class.
|
|
1043
|
+
|
|
1044
|
+
**Class API:**
|
|
1045
|
+
|
|
1046
|
+
```typescript
|
|
1047
|
+
export class QueryEngine {
|
|
1048
|
+
constructor(
|
|
1049
|
+
projectRoot: string,
|
|
1050
|
+
config: NomosConfig,
|
|
1051
|
+
logger: Logger,
|
|
1052
|
+
);
|
|
1053
|
+
|
|
1054
|
+
/**
|
|
1055
|
+
* Execute a semantic search query.
|
|
1056
|
+
* Pipeline: embed query → vector search → graph enrich → deduplicate → rank → return.
|
|
1057
|
+
*/
|
|
1058
|
+
async search(query: string, options?: {
|
|
1059
|
+
topK?: number;
|
|
1060
|
+
threshold?: number;
|
|
1061
|
+
}): Promise<SearchResult[]>;
|
|
1062
|
+
}
|
|
1063
|
+
```
|
|
1064
|
+
|
|
1065
|
+
**Internal Design:**
|
|
1066
|
+
|
|
1067
|
+
```
|
|
1068
|
+
1. Validate: query string must be non-empty after trim.
|
|
1069
|
+
└─ Throw NomosError('search_query_failed', 'Query must be a non-empty string.') otherwise.
|
|
1070
|
+
|
|
1071
|
+
2. Check vector index exists:
|
|
1072
|
+
└─ Read {vector_store_path}/index-meta.json
|
|
1073
|
+
└─ If not found: throw NomosError('search_index_not_found', 'No index found. Run: arc index')
|
|
1074
|
+
└─ If status === "in_progress": log warning "Index is incomplete. Results may be partial."
|
|
1075
|
+
|
|
1076
|
+
3. Initialize VectorStore [GAP-2]:
|
|
1077
|
+
└─ VectorStore.init() — wrapped in try-catch that produces actionable error.
|
|
1078
|
+
|
|
1079
|
+
4. Embed query:
|
|
1080
|
+
└─ Embedder.embedOne(query.trim()) → Float32Array
|
|
1081
|
+
└─ Subject to request_timeout_ms [GAP-4]
|
|
1082
|
+
|
|
1083
|
+
5. Vector search:
|
|
1084
|
+
└─ VectorStore.query(queryVector, topK, threshold) → results with similarity_score
|
|
1085
|
+
└─ similarity_score already ∈ [0, 1] — conversion done in VectorStore [S-3]
|
|
1086
|
+
|
|
1087
|
+
6. Graph enrich:
|
|
1088
|
+
└─ GraphEnricher.enrich(rawResults) → SearchResult[]
|
|
1089
|
+
└─ project_map.json loaded lazily and cached [GAP-5]
|
|
1090
|
+
|
|
1091
|
+
7. De-duplicate [TRAP-3 — DETERMINISTIC RULE]:
|
|
1092
|
+
└─ Group results by file_path.
|
|
1093
|
+
└─ For each file_path that has BOTH a 'file' type result AND one or more 'symbol' type results:
|
|
1094
|
+
a. Compute score gap = abs(file_result.similarity_score - max(symbol_results.similarity_score))
|
|
1095
|
+
b. If score gap <= 0.05: REMOVE the file-level result. Keep only symbol results.
|
|
1096
|
+
c. If score gap > 0.05: keep both (they are sufficiently distinct in relevance).
|
|
1097
|
+
└─ This is a hard rule. "Remove" means filter out of the result array entirely.
|
|
1098
|
+
|
|
1099
|
+
8. Sort by similarity_score descending.
|
|
1100
|
+
|
|
1101
|
+
9. Return top-K SearchResult[]
|
|
1102
|
+
```
|
|
1103
|
+
|
|
1104
|
+
**Stale index warning:** If `index-meta.json` `last_full_index` is older than `project_map.json` `generated_at`, log: `"[nomos:search:warn] Index is older than project map. Consider running: arc index --incremental"`.
|
|
1105
|
+
|
|
1106
|
+
**Validation:**
|
|
1107
|
+
```bash
|
|
1108
|
+
npx tsc --noEmit # → exit code 0
|
|
1109
|
+
```
|
|
1110
|
+
|
|
1111
|
+
**Rollback:** `rm src/search/query-engine.ts`
|
|
1112
|
+
|
|
1113
|
+
---
|
|
1114
|
+
|
|
1115
|
+
#### Step 7.5.3: Unit tests for `GraphEnricher` and `QueryEngine`
|
|
1116
|
+
|
|
1117
|
+
**Pre-Condition:** Steps 7.5.1–7.5.2 complete.
|
|
1118
|
+
|
|
1119
|
+
**Action:** Create:
|
|
1120
|
+
- `src/search/__tests__/graph-enricher.test.ts`
|
|
1121
|
+
- `src/search/__tests__/query-engine.test.ts`
|
|
1122
|
+
|
|
1123
|
+
**GraphEnricher test cases:**
|
|
1124
|
+
1. Results are enriched with correct `graph_depth` and `dependents_count` from project map.
|
|
1125
|
+
2. Core modules are correctly flagged (`is_core_module = true`).
|
|
1126
|
+
3. Missing file in project map sets `is_stale = true` and `graph_depth = -1` [TRAP-4].
|
|
1127
|
+
4. Results maintain similarity_score ranking after enrichment.
|
|
1128
|
+
5. `loadMap()` is called only once across multiple `enrich()` calls (caching) [GAP-5].
|
|
1129
|
+
|
|
1130
|
+
**QueryEngine test cases (mocked Embedder + VectorStore):**
|
|
1131
|
+
1. `search()` embeds query, queries store, enriches, and returns ranked results.
|
|
1132
|
+
2. `search()` throws `NomosError('search_index_not_found')` when no index exists.
|
|
1133
|
+
3. `search()` throws `NomosError('search_query_failed')` on empty query string.
|
|
1134
|
+
4. `search()` respects `topK` and `threshold` overrides.
|
|
1135
|
+
5. Stale index warning is logged when index is older than project map.
|
|
1136
|
+
6. **De-duplication [TRAP-3]:** symbol-level result within 0.05 of parent file-level result → file-level result removed.
|
|
1137
|
+
7. **De-duplication [TRAP-3]:** symbol-level result MORE than 0.05 from parent file-level result → both kept.
|
|
1138
|
+
8. `search()` wraps VectorStore.init() failure with actionable error message [GAP-2].
|
|
1139
|
+
|
|
1140
|
+
**Validation:**
|
|
1141
|
+
```bash
|
|
1142
|
+
npx vitest run src/search/__tests__/graph-enricher.test.ts # → 0 failures
|
|
1143
|
+
npx vitest run src/search/__tests__/query-engine.test.ts # → 0 failures
|
|
1144
|
+
```
|
|
1145
|
+
|
|
1146
|
+
---
|
|
1147
|
+
|
|
1148
|
+
### Task 7.6: CLI Commands — `arc index` and `arc search`
|
|
1149
|
+
|
|
1150
|
+
---
|
|
1151
|
+
|
|
1152
|
+
#### Step 7.6.1: Create `src/commands/index.ts`
|
|
1153
|
+
|
|
1154
|
+
**Pre-Condition:** Task 7.4 complete. `SearchIndexer` functional.
|
|
1155
|
+
|
|
1156
|
+
**Action:** Create `src/commands/index.ts` following the existing command registration pattern.
|
|
1157
|
+
|
|
1158
|
+
**Command signature:**
|
|
1159
|
+
```
|
|
1160
|
+
arc index [--incremental] [--force] [--dry-run]
|
|
1161
|
+
```
|
|
1162
|
+
|
|
1163
|
+
| Flag | Behavior |
|
|
1164
|
+
|---|---|
|
|
1165
|
+
| (no flags) | Full re-index. Table-swap strategy. |
|
|
1166
|
+
| `--incremental` | Only re-index changed + failed files (content hash + failed_files comparison). |
|
|
1167
|
+
| `--force` | Force full re-index even if incremental metadata exists. Same as no flags. |
|
|
1168
|
+
| `--dry-run` | Extract and count chunks without embedding or writing [S-2]. No API calls. No cost. |
|
|
1169
|
+
|
|
1170
|
+
**Implementation:**
|
|
1171
|
+
|
|
1172
|
+
```typescript
|
|
1173
|
+
export function registerIndexCommand(program: Command): void {
|
|
1174
|
+
program
|
|
1175
|
+
.command('index')
|
|
1176
|
+
.description('Build or rebuild the vector search index from project map')
|
|
1177
|
+
.option('--incremental', 'Only re-index files changed since last indexing run')
|
|
1178
|
+
.option('--force', 'Force full re-index (ignore incremental metadata)')
|
|
1179
|
+
.option('--dry-run', 'Count chunks without embedding (no API calls, no writes)')
|
|
1180
|
+
.action(async (opts: { incremental?: boolean; force?: boolean; dryRun?: boolean }) => {
|
|
1181
|
+
// 1. loadConfig()
|
|
1182
|
+
// 2. Create logger
|
|
1183
|
+
// 3. Create SearchIndexer
|
|
1184
|
+
// 4. If --dry-run: call dryRun(), print summary, exit 0
|
|
1185
|
+
// 5. Register SIGINT handler:
|
|
1186
|
+
// - Set cancellationFlag.cancelled = true
|
|
1187
|
+
// - Log: "[nomos:search:warn] SIGINT received. Finishing current batch..."
|
|
1188
|
+
// - The indexer will write partial metadata and exit cleanly.
|
|
1189
|
+
// 6. Call fullIndex() or incrementalIndex() based on flags
|
|
1190
|
+
// 7. Print summary:
|
|
1191
|
+
// "Indexed {total_chunks} chunks ({files} files, {symbols} symbols) in {duration}s"
|
|
1192
|
+
// "Vector index stored at: {vector_store_path}"
|
|
1193
|
+
// If failed_files.length > 0:
|
|
1194
|
+
// "⚠ {N} files failed embedding. They will be retried on next incremental index."
|
|
1195
|
+
// 8. Exit 0 on success, 1 on fatal error
|
|
1196
|
+
});
|
|
1197
|
+
}
|
|
1198
|
+
```
|
|
1199
|
+
|
|
1200
|
+
**Progress output:** Print to stderr (not stdout — preserves machine-parseability):
|
|
1201
|
+
```
|
|
1202
|
+
[nomos:search:info] Extracting chunks from project map...
|
|
1203
|
+
[nomos:search:info] Extracted 342 chunks (142 file-level, 200 symbol-level)
|
|
1204
|
+
[nomos:search:info] Embedding batch 1/7 (50 chunks)...
|
|
1205
|
+
[nomos:search:warn] Rate limiting. Waiting 200ms before batch 2/7...
|
|
1206
|
+
[nomos:search:info] Embedding batch 2/7 (50 chunks)...
|
|
1207
|
+
...
|
|
1208
|
+
[nomos:search:info] Writing to staging table...
|
|
1209
|
+
[nomos:search:info] Promoting staging to live (atomic swap)...
|
|
1210
|
+
[nomos:search:info] Writing index metadata...
|
|
1211
|
+
```
|
|
1212
|
+
|
|
1213
|
+
**Dry-run output [S-2]:**
|
|
1214
|
+
```
|
|
1215
|
+
[nomos:search:info] DRY RUN — no API calls, no writes.
|
|
1216
|
+
[nomos:search:info] Would index: 342 chunks (142 file-level, 200 symbol-level)
|
|
1217
|
+
[nomos:search:info] Estimated API calls: 7 batches × 50 chunks
|
|
1218
|
+
```
|
|
1219
|
+
|
|
1220
|
+
**Validation:**
|
|
1221
|
+
```bash
|
|
1222
|
+
npx tsc --noEmit # → exit code 0
|
|
1223
|
+
npm run build && node dist/cli.js index --help # → shows --incremental, --force, --dry-run
|
|
1224
|
+
```
|
|
1225
|
+
|
|
1226
|
+
**Rollback:** `rm src/commands/index.ts`
|
|
1227
|
+
|
|
1228
|
+
---
|
|
1229
|
+
|
|
1230
|
+
#### Step 7.6.2: Create `src/commands/search.ts`
|
|
1231
|
+
|
|
1232
|
+
**Pre-Condition:** Task 7.5 complete. `QueryEngine` functional.
|
|
1233
|
+
|
|
1234
|
+
**Action:** Create `src/commands/search.ts` following the existing command registration pattern.
|
|
1235
|
+
|
|
1236
|
+
**Command signature:**
|
|
1237
|
+
```
|
|
1238
|
+
arc search <query> [--top <N>] [--threshold <score>] [--json]
|
|
1239
|
+
```
|
|
1240
|
+
|
|
1241
|
+
| Flag | Default | Behavior |
|
|
1242
|
+
|---|---|---|
|
|
1243
|
+
| `<query>` | (required) | Natural language search query |
|
|
1244
|
+
| `--top <N>` | 5 | Maximum number of results |
|
|
1245
|
+
| `--threshold <score>` | 0.7 | Minimum similarity score (0.0–1.0) |
|
|
1246
|
+
| `--json` | false | Output raw JSON instead of formatted table |
|
|
1247
|
+
|
|
1248
|
+
**Human-readable output format:**
|
|
1249
|
+
|
|
1250
|
+
```
|
|
1251
|
+
Results for: "how is refund handled?"
|
|
1252
|
+
|
|
1253
|
+
1. src/services/payment.ts :: processRefund() [0.96] L45-82
|
|
1254
|
+
"Processes a refund request via Stripe, validates eligibility, updates order state"
|
|
1255
|
+
⚠ Core Module (depth 5) — modifying this affects 10 dependents
|
|
1256
|
+
|
|
1257
|
+
2. src/services/payment.ts [0.91]
|
|
1258
|
+
"Handles payment processing and refund logic"
|
|
1259
|
+
⚠ Core Module (depth 5) — modifying this affects 10 dependents
|
|
1260
|
+
|
|
1261
|
+
3. src/middleware/billing.ts [0.84]
|
|
1262
|
+
"Validates billing state before checkout"
|
|
1263
|
+
Leaf Module (depth 1) — 2 dependents
|
|
1264
|
+
|
|
1265
|
+
Found 3 results (threshold: 0.70, top: 5)
|
|
1266
|
+
```
|
|
1267
|
+
|
|
1268
|
+
**Stale result formatting [TRAP-4]:**
|
|
1269
|
+
```
|
|
1270
|
+
4. src/legacy/old-handler.ts [0.78]
|
|
1271
|
+
"Legacy request handler for v1 API"
|
|
1272
|
+
⚠ Stale — file removed since last index. Run: arc index --incremental
|
|
1273
|
+
```
|
|
1274
|
+
When `is_stale === true`: do NOT print `depth -1` or any numeric depth. Print the stale warning message. No negative numbers in CLI output.
|
|
1275
|
+
|
|
1276
|
+
**JSON output format (`--json`) [S-5]:**
|
|
1277
|
+
```json
|
|
1278
|
+
{
|
|
1279
|
+
"query": "how is refund handled?",
|
|
1280
|
+
"results": [
|
|
1281
|
+
{
|
|
1282
|
+
"id": "src/services/payment.ts::processRefund",
|
|
1283
|
+
"type": "symbol",
|
|
1284
|
+
"file_path": "src/services/payment.ts",
|
|
1285
|
+
"symbol_name": "processRefund",
|
|
1286
|
+
"symbol_type": "function",
|
|
1287
|
+
"line_start": 45,
|
|
1288
|
+
"line_end": 82,
|
|
1289
|
+
"purpose": "Processes a refund request via Stripe...",
|
|
1290
|
+
"similarity_score": 0.96,
|
|
1291
|
+
"graph_depth": 5,
|
|
1292
|
+
"dependents_count": 10,
|
|
1293
|
+
"is_core_module": true,
|
|
1294
|
+
"is_stale": false
|
|
1295
|
+
}
|
|
1296
|
+
],
|
|
1297
|
+
"metadata": {
|
|
1298
|
+
"top_k": 5,
|
|
1299
|
+
"threshold": 0.7,
|
|
1300
|
+
"total_results": 3,
|
|
1301
|
+
"index_age": "2026-04-06T12:00:00Z"
|
|
1302
|
+
}
|
|
1303
|
+
}
|
|
1304
|
+
```
|
|
1305
|
+
|
|
1306
|
+
**Verify:** `JSON.stringify(result)` must NOT contain a `vector` field. The `SearchResult` type excludes it by design [S-5].
|
|
1307
|
+
|
|
1308
|
+
Print JSON to stdout. Print nothing to stderr in `--json` mode (machine-parseable output).
|
|
1309
|
+
|
|
1310
|
+
**Edge cases:**
|
|
1311
|
+
- No results found: Print `"No results found above threshold {threshold}."` Exit 0.
|
|
1312
|
+
- Index not found: Print error message, suggest `arc index`. Exit 1.
|
|
1313
|
+
- Index status `in_progress`: Print warning, proceed with partial search. Exit 0.
|
|
1314
|
+
- Empty query string: Print usage help. Exit 1.
|
|
1315
|
+
|
|
1316
|
+
**Validation:**
|
|
1317
|
+
```bash
|
|
1318
|
+
npx tsc --noEmit # → exit code 0
|
|
1319
|
+
npm run build && node dist/cli.js search --help # → shows options
|
|
1320
|
+
```
|
|
1321
|
+
|
|
1322
|
+
**Rollback:** `rm src/commands/search.ts`
|
|
1323
|
+
|
|
1324
|
+
---
|
|
1325
|
+
|
|
1326
|
+
#### Step 7.6.3: Register commands in `src/cli.ts`
|
|
1327
|
+
|
|
1328
|
+
**Pre-Condition:** Steps 7.6.1–7.6.2 complete.
|
|
1329
|
+
|
|
1330
|
+
**Action:**
|
|
1331
|
+
|
|
1332
|
+
1. Add imports at the top of `src/cli.ts`:
|
|
1333
|
+
```typescript
|
|
1334
|
+
import { registerIndexCommand } from './commands/index.js';
|
|
1335
|
+
import { registerSearchCommand } from './commands/search.js';
|
|
1336
|
+
```
|
|
1337
|
+
|
|
1338
|
+
2. Add to the registration array (in the same position pattern as existing commands):
|
|
1339
|
+
```typescript
|
|
1340
|
+
registerIndexCommand,
|
|
1341
|
+
registerSearchCommand,
|
|
1342
|
+
```
|
|
1343
|
+
|
|
1344
|
+
**Validation:**
|
|
1345
|
+
```bash
|
|
1346
|
+
npx tsc --noEmit # → exit code 0
|
|
1347
|
+
npm run build # → exit code 0
|
|
1348
|
+
node dist/cli.js --help # → shows 'index' and 'search' commands
|
|
1349
|
+
node dist/cli.js index --help # → shows --incremental, --force, --dry-run
|
|
1350
|
+
node dist/cli.js search --help # → shows <query>, --top, --threshold, --json
|
|
1351
|
+
```
|
|
1352
|
+
|
|
1353
|
+
**Rollback:** `git checkout src/cli.ts`
|
|
1354
|
+
|
|
1355
|
+
---
|
|
1356
|
+
|
|
1357
|
+
### Task 7.7: Integration Testing & End-to-End Verification
|
|
1358
|
+
|
|
1359
|
+
---
|
|
1360
|
+
|
|
1361
|
+
#### Step 7.7.1: Integration tests — Full pipeline + CI-compatible mock
|
|
1362
|
+
|
|
1363
|
+
**Pre-Condition:** All previous tasks complete. `project_map.json` exists with enriched data.
|
|
1364
|
+
|
|
1365
|
+
**Action:** Create `src/search/__tests__/integration.test.ts`.
|
|
1366
|
+
|
|
1367
|
+
**Test A — Live API (requires GEMINI_API_KEY — skip in CI if unset):**
|
|
1368
|
+
|
|
1369
|
+
1. **Setup:** Copy a minimal `project_map.json` fixture (5 files, ~15 symbols) to a temp directory.
|
|
1370
|
+
2. **Full index:**
|
|
1371
|
+
- Create `SearchIndexer` with temp directory config.
|
|
1372
|
+
- Call `fullIndex()`.
|
|
1373
|
+
- Assert: `IndexMetadata.status === "complete"` [BLOCKER-2].
|
|
1374
|
+
- Assert: `IndexMetadata.total_files_indexed === 5`.
|
|
1375
|
+
- Assert: `IndexMetadata.total_chunks > 5` (file + symbol chunks).
|
|
1376
|
+
- Assert: `IndexMetadata.embedding_model === 'gemini-embedding-001'` [BLOCKER-3].
|
|
1377
|
+
- Assert: `IndexMetadata.vector_dimensions === config.search.embedding_dimensions` [BLOCKER-3].
|
|
1378
|
+
- Assert: `IndexMetadata.failed_files.length === 0` [GAP-1].
|
|
1379
|
+
- Assert: Vector store `count() > 0`.
|
|
1380
|
+
- Assert: `index-meta.json` file exists.
|
|
1381
|
+
3. **Search:**
|
|
1382
|
+
- Create `QueryEngine`.
|
|
1383
|
+
- Call `search("error handling and retry logic")`.
|
|
1384
|
+
- Assert: results.length > 0.
|
|
1385
|
+
- Assert: each result has `similarity_score >= 0.7` and `similarity_score <= 1.0` [S-3].
|
|
1386
|
+
- Assert: no result has a `vector` field [S-5].
|
|
1387
|
+
- Assert: each result has valid `file_path`, `graph_depth`, `dependents_count`.
|
|
1388
|
+
- Assert: stale results (if any) have `is_stale === true` [TRAP-4].
|
|
1389
|
+
4. **Incremental re-index:**
|
|
1390
|
+
- Modify one file's semantic data in the fixture map.
|
|
1391
|
+
- Call `incrementalIndex()`.
|
|
1392
|
+
- Assert: only 1 file re-embedded (check embed call count via spy).
|
|
1393
|
+
- Assert: metadata `status === "complete"`.
|
|
1394
|
+
5. **Cleanup:** Remove temp directory.
|
|
1395
|
+
|
|
1396
|
+
**Guard:** `describe.skipIf(!process.env['GEMINI_API_KEY'])` — skipped if API key absent.
|
|
1397
|
+
|
|
1398
|
+
**Test B — Mock-embedder integration (runs in CI always) [S-4]:**
|
|
1399
|
+
|
|
1400
|
+
1. **Setup:** Same fixture. Create a `MockEmbedder` that returns deterministic vectors matching `config.search.embedding_dimensions` (e.g., hash-based: SHA-256 of text → repeated to fill `config.search.embedding_dimensions` floats as Float32Array).
|
|
1401
|
+
2. **Full index with mock embedder.**
|
|
1402
|
+
3. **Search with mock embedder.**
|
|
1403
|
+
4. Assert: pipeline completes end-to-end. Results are ranked. Metadata is valid.
|
|
1404
|
+
5. Assert: table-swap occurred (staging table does not exist after completion) [BLOCKER-1].
|
|
1405
|
+
6. Assert: de-duplication rule applied correctly [TRAP-3].
|
|
1406
|
+
|
|
1407
|
+
This test runs without `GEMINI_API_KEY` — it exercises the full pipeline minus the actual API call.
|
|
1408
|
+
|
|
1409
|
+
**Validation:**
|
|
1410
|
+
```bash
|
|
1411
|
+
# CI-compatible (always runs):
|
|
1412
|
+
npx vitest run src/search/__tests__/integration.test.ts -t "mock-embedder"
|
|
1413
|
+
|
|
1414
|
+
# Full integration (requires API key):
|
|
1415
|
+
GEMINI_API_KEY=$GEMINI_API_KEY npx vitest run src/search/__tests__/integration.test.ts
|
|
1416
|
+
```
|
|
1417
|
+
|
|
1418
|
+
---
|
|
1419
|
+
|
|
1420
|
+
#### Step 7.7.2: End-to-end CLI verification
|
|
1421
|
+
|
|
1422
|
+
**Pre-Condition:** Step 7.7.1 passes. Project is built.
|
|
1423
|
+
|
|
1424
|
+
**Action:** Manual CLI verification sequence:
|
|
1425
|
+
|
|
1426
|
+
```bash
|
|
1427
|
+
# 1. Build
|
|
1428
|
+
npm run build
|
|
1429
|
+
|
|
1430
|
+
# 2. Ensure project map exists
|
|
1431
|
+
node dist/cli.js map --no-ai # Quick structural map (no API calls)
|
|
1432
|
+
|
|
1433
|
+
# 3. Dry run (no API calls, no cost) [S-2]
|
|
1434
|
+
node dist/cli.js index --dry-run
|
|
1435
|
+
# Expected: "Would index: {N} chunks ({F} file-level, {S} symbol-level)"
|
|
1436
|
+
# Expected: exit code 0
|
|
1437
|
+
|
|
1438
|
+
# 4. Full index
|
|
1439
|
+
node dist/cli.js index
|
|
1440
|
+
# Expected: "Indexed {N} chunks ({F} files, {S} symbols) in {T}s"
|
|
1441
|
+
# Expected: exit code 0
|
|
1442
|
+
|
|
1443
|
+
# 5. Verify metadata
|
|
1444
|
+
cat tasks-management/graph/vector_index/index-meta.json | node -e "
|
|
1445
|
+
let d=''; process.stdin.on('data',c=>d+=c);
|
|
1446
|
+
process.stdin.on('end',()=>{
|
|
1447
|
+
const m=JSON.parse(d);
|
|
1448
|
+
console.log('status:', m.status); // → 'complete'
|
|
1449
|
+
console.log('model:', m.embedding_model); // → 'gemini-embedding-001'
|
|
1450
|
+
console.log('dims:', m.vector_dimensions); // → matches config.embedding_dimensions
|
|
1451
|
+
console.log('failed:', m.failed_files.length); // → 0
|
|
1452
|
+
});
|
|
1453
|
+
"
|
|
1454
|
+
|
|
1455
|
+
# 6. Search — natural language
|
|
1456
|
+
node dist/cli.js search "error handling"
|
|
1457
|
+
# Expected: ranked results with file paths, scores, and dependency info
|
|
1458
|
+
# Expected: no results with "depth -1" in output [TRAP-4]
|
|
1459
|
+
|
|
1460
|
+
# 7. Search — JSON output [S-5]
|
|
1461
|
+
node dist/cli.js search "configuration loading" --json | node -e "
|
|
1462
|
+
let d=''; process.stdin.on('data',c=>d+=c);
|
|
1463
|
+
process.stdin.on('end',()=>{
|
|
1464
|
+
const j=JSON.parse(d);
|
|
1465
|
+
console.log('results:', j.results.length);
|
|
1466
|
+
console.log('has vector field:', j.results.some(r => 'vector' in r)); // → false
|
|
1467
|
+
console.log('valid JSON: true');
|
|
1468
|
+
});
|
|
1469
|
+
"
|
|
1470
|
+
# Expected: "has vector field: false"
|
|
1471
|
+
|
|
1472
|
+
# 8. Search — threshold and top-K
|
|
1473
|
+
node dist/cli.js search "state management" --top 3 --threshold 0.8
|
|
1474
|
+
# Expected: at most 3 results, all with score >= 0.80
|
|
1475
|
+
|
|
1476
|
+
# 9. Incremental index
|
|
1477
|
+
node dist/cli.js index --incremental
|
|
1478
|
+
# Expected: "Index is up-to-date. No changes detected." (if nothing changed)
|
|
1479
|
+
|
|
1480
|
+
# 10. Stale index detection
|
|
1481
|
+
# Modify a file, re-run arc map, then arc search
|
|
1482
|
+
# Expected: "[nomos:search:warn] Index is older than project map..."
|
|
1483
|
+
|
|
1484
|
+
# 11. Missing index error
|
|
1485
|
+
rm -rf tasks-management/graph/vector_index
|
|
1486
|
+
node dist/cli.js search "anything"
|
|
1487
|
+
# Expected: error message suggesting "arc index"
|
|
1488
|
+
# Expected: exit code 1
|
|
1489
|
+
|
|
1490
|
+
# 12. Type check and full test suite
|
|
1491
|
+
npx tsc --noEmit # → exit code 0
|
|
1492
|
+
npx vitest run # → 0 failures (all existing + new tests)
|
|
1493
|
+
```
|
|
1494
|
+
|
|
1495
|
+
---
|
|
1496
|
+
|
|
1497
|
+
#### Step 7.7.3: Verify cosine similarity correctness
|
|
1498
|
+
|
|
1499
|
+
**Pre-Condition:** Vector store operational.
|
|
1500
|
+
|
|
1501
|
+
**Action:** Create `src/search/__tests__/similarity.test.ts`:
|
|
1502
|
+
|
|
1503
|
+
```typescript
|
|
1504
|
+
test('similar texts produce similarity > 0.8', async () => {
|
|
1505
|
+
const embedder = new Embedder(config, logger);
|
|
1506
|
+
const v1 = await embedder.embedOne('process payment refund');
|
|
1507
|
+
const v2 = await embedder.embedOne('handle refund for customer payment');
|
|
1508
|
+
const similarity = cosineSimilarity(v1, v2);
|
|
1509
|
+
expect(similarity).toBeGreaterThan(0.8);
|
|
1510
|
+
});
|
|
1511
|
+
|
|
1512
|
+
test('unrelated texts produce similarity < 0.5', async () => {
|
|
1513
|
+
const embedder = new Embedder(config, logger);
|
|
1514
|
+
const v1 = await embedder.embedOne('process payment refund');
|
|
1515
|
+
const v2 = await embedder.embedOne('configure webpack build optimization');
|
|
1516
|
+
const similarity = cosineSimilarity(v1, v2);
|
|
1517
|
+
expect(similarity).toBeLessThan(0.5);
|
|
1518
|
+
});
|
|
1519
|
+
|
|
1520
|
+
test('similarity is always in [0, 1]', async () => {
|
|
1521
|
+
const embedder = new Embedder(config, logger);
|
|
1522
|
+
const v1 = await embedder.embedOne('any arbitrary text');
|
|
1523
|
+
const v2 = await embedder.embedOne('completely different content');
|
|
1524
|
+
const similarity = cosineSimilarity(v1, v2);
|
|
1525
|
+
expect(similarity).toBeGreaterThanOrEqual(0);
|
|
1526
|
+
expect(similarity).toBeLessThanOrEqual(1);
|
|
1527
|
+
});
|
|
1528
|
+
|
|
1529
|
+
function cosineSimilarity(a: Float32Array, b: Float32Array): number {
|
|
1530
|
+
let dot = 0, normA = 0, normB = 0;
|
|
1531
|
+
for (let i = 0; i < a.length; i++) {
|
|
1532
|
+
dot += a[i] * b[i];
|
|
1533
|
+
normA += a[i] * a[i];
|
|
1534
|
+
normB += b[i] * b[i];
|
|
1535
|
+
}
|
|
1536
|
+
return dot / (Math.sqrt(normA) * Math.sqrt(normB));
|
|
1537
|
+
}
|
|
1538
|
+
```
|
|
1539
|
+
|
|
1540
|
+
**Guard:** `describe.skipIf(!process.env['GEMINI_API_KEY'])`.
|
|
1541
|
+
|
|
1542
|
+
**Validation:**
|
|
1543
|
+
```bash
|
|
1544
|
+
GEMINI_API_KEY=$GEMINI_API_KEY npx vitest run src/search/__tests__/similarity.test.ts
|
|
1545
|
+
```
|
|
1546
|
+
|
|
1547
|
+
---
|
|
1548
|
+
|
|
1549
|
+
## 3. Final Acceptance Checklist
|
|
1550
|
+
|
|
1551
|
+
Every check must pass. Failure on any = investigation required.
|
|
1552
|
+
|
|
1553
|
+
| # | Criterion | Verification | Addresses |
|
|
1554
|
+
|---|-----------|-------------|-----------|
|
|
1555
|
+
| AC-1 | `arc index` builds vector index from project map | `node dist/cli.js index` → exit 0, index files created | Core |
|
|
1556
|
+
| AC-2 | File-level AND symbol-level embeddings generated | `index-meta.json` shows `total_symbols_indexed > 0` | Core |
|
|
1557
|
+
| AC-3 | `arc index --incremental` only re-indexes changed files | Modify 1 file, re-map, re-index: only 1 file re-embedded | Core |
|
|
1558
|
+
| AC-4 | `arc search` returns semantically relevant results | Search for concept not in any symbol name → gets relevant results | Core |
|
|
1559
|
+
| AC-5 | Symbol-level results include line range | Result includes `L{start}-{end}` for symbol chunks | Core |
|
|
1560
|
+
| AC-6 | Dependency-aware enrichment present | Results show `graph_depth`, `dependents_count`, core/leaf label | Core |
|
|
1561
|
+
| AC-7 | JSON output valid and vector-free | `--json` output parses; no `vector` field in results | S-5 |
|
|
1562
|
+
| AC-8 | Threshold filtering works | `--threshold 0.99` returns fewer results than `--threshold 0.5` | Core |
|
|
1563
|
+
| AC-9 | Top-K limiting works | `--top 1` returns exactly 1 result (if any match) | Core |
|
|
1564
|
+
| AC-10 | Missing index → clear error | `arc search` without index → error, suggests `arc index` | Core |
|
|
1565
|
+
| AC-11 | Missing API key → clear error | `arc index` without GEMINI_API_KEY → clear error message | Core |
|
|
1566
|
+
| AC-12 | No regressions | `npx vitest run` → 0 failures across entire test suite | Core |
|
|
1567
|
+
| AC-13 | Type safety | `npx tsc --noEmit` → exit 0 | Core |
|
|
1568
|
+
| AC-14 | Build succeeds | `npm run build` → exit 0, `node dist/cli.js --help` shows all commands | Core |
|
|
1569
|
+
| AC-15 | Search < 2s for 500 indexed files | `time arc search "query"` on indexed project | GAP-5 |
|
|
1570
|
+
| AC-16 | Full re-index is zero-downtime | During `arc index`, concurrent `arc search` returns old results (not empty/error) | BLOCKER-1 |
|
|
1571
|
+
| AC-17 | Crash recovery works | Kill indexer mid-run; next `arc index --incremental` detects `"in_progress"` → full re-index | BLOCKER-2 |
|
|
1572
|
+
| AC-18 | Model change triggers full re-index | Change `embedding_model` in config; `--incremental` falls back to full | BLOCKER-3 |
|
|
1573
|
+
| AC-19 | Failed files retried on incremental | Simulate embedding failure; next `--incremental` re-embeds failed files | GAP-1 |
|
|
1574
|
+
| AC-20 | API timeout handled | Mock hanging API; indexer throws after `request_timeout_ms` | GAP-4 |
|
|
1575
|
+
| AC-21 | De-duplication deterministic | Symbol + parent file within 0.05 → file-level result removed | TRAP-3 |
|
|
1576
|
+
| AC-22 | Stale results formatted correctly | Deleted file in results → shows stale warning, no `depth -1` | TRAP-4 |
|
|
1577
|
+
| AC-23 | Dry-run makes no API calls | `arc index --dry-run` → chunk count printed, no embeddings, no writes | S-2 |
|
|
1578
|
+
| AC-24 | CI integration test passes | Mock-embedder integration test runs without API key | S-4 |
|
|
1579
|
+
| AC-25 | LanceDB pinned to exact version | `package.json` shows `"0.14.3"` not `"^0.14.3"` | M-9 |
|
|
1580
|
+
|
|
1581
|
+
---
|
|
1582
|
+
|
|
1583
|
+
## 4. Dependency Graph (Execution Order)
|
|
1584
|
+
|
|
1585
|
+
```
|
|
1586
|
+
7.0.1 ─── 7.0.2 ─┐
|
|
1587
|
+
│
|
|
1588
|
+
7.0.3 ────────────┤
|
|
1589
|
+
│
|
|
1590
|
+
7.0.4 ─── 7.0.5 ─┤
|
|
1591
|
+
│
|
|
1592
|
+
7.0.6 ────────────┘
|
|
1593
|
+
│
|
|
1594
|
+
┌─────────┼──────────┐
|
|
1595
|
+
▼ ▼ ▼
|
|
1596
|
+
7.1.1 7.2.1 7.3.1
|
|
1597
|
+
│ │ │
|
|
1598
|
+
7.1.2 7.2.2 7.3.2
|
|
1599
|
+
│ │ │
|
|
1600
|
+
└─────────┼──────────┘
|
|
1601
|
+
│
|
|
1602
|
+
7.4.1
|
|
1603
|
+
│
|
|
1604
|
+
7.4.2
|
|
1605
|
+
│
|
|
1606
|
+
┌───────┴───────┐
|
|
1607
|
+
▼ ▼
|
|
1608
|
+
7.5.1 7.5.2
|
|
1609
|
+
│ │
|
|
1610
|
+
└───────┬───────┘
|
|
1611
|
+
│
|
|
1612
|
+
7.5.3
|
|
1613
|
+
│
|
|
1614
|
+
┌───────┼───────┐
|
|
1615
|
+
▼ ▼ ▼
|
|
1616
|
+
7.6.1 7.6.2 7.6.3
|
|
1617
|
+
│ │ │
|
|
1618
|
+
└───────┼───────┘
|
|
1619
|
+
│
|
|
1620
|
+
┌───────┼───────┐
|
|
1621
|
+
▼ ▼ ▼
|
|
1622
|
+
7.7.1 7.7.2 7.7.3
|
|
1623
|
+
```
|
|
1624
|
+
|
|
1625
|
+
**Parallelizable:** Tasks 7.1, 7.2, and 7.3 can be implemented in parallel after Task 7.0 completes. Steps 7.6.1 and 7.6.2 can be implemented in parallel after Task 7.5; Step 7.6.3 registers both commands, so it must follow them (see its pre-condition).
|
|
1626
|
+
|
|
1627
|
+
---
|
|
1628
|
+
|
|
1629
|
+
## 5. Risk Register (Updated)
|
|
1630
|
+
|
|
1631
|
+
| Risk | Probability | Impact | Mitigation | Status |
|
|
1632
|
+
|---|---|---|---|---|
|
|
1633
|
+
| LanceDB native binding fails in esbuild bundle | Medium | Blocker | `--external` in esbuild. Post-build verification. | Mitigated |
|
|
1634
|
+
| Gemini rate limits during large index | High | Delays | Sequential batching, `embedding_requests_per_minute`, backoff. No dead config. | Mitigated (TRAP-2) |
|
|
1635
|
+
| LanceDB API breaking changes (pre-1.0) | Low | Medium | **Pinned to exact `0.14.3`** (M-9). Test suite catches regressions. | Mitigated |
|
|
1636
|
+
| Vector dimensions mismatch on model change | Low | Data corruption | **Dimension validation in incremental flow** (BLOCKER-3). Auto-triggers full re-index. | Mitigated |
|
|
1637
|
+
| Large projects exceed memory during embedding | Medium | Crash | **Streaming batch upsert** (BLOCKER-4). Records written and released per-batch. | Mitigated |
|
|
1638
|
+
| `.semantic.md` files missing or path derivation fails | High | Degraded quality | **Uses inline `ProjectMap.semantic` data** (TRAP-5). No filesystem reads for .semantic.md. | Mitigated |
|
|
1639
|
+
| Process crash during full re-index | Medium | Data loss | **Table-swap** (BLOCKER-1) + **status field** (BLOCKER-2). Old index preserved during crash. | Mitigated |
|
|
1640
|
+
| Partial embedding failure → permanent blind spots | Medium | Search gaps | **`failed_files[]`** in metadata (GAP-1). Re-indexed on next incremental run. | Mitigated |
|
|
1641
|
+
| API hangs indefinitely | Low | CLI freeze | **30s AbortController timeout** (GAP-4) on every API call. | Mitigated |
|
|
1642
|
+
| SIGINT during indexing | Medium | Corruption | **Table-swap** eliminates danger zone. SIGINT leaves old index intact. | Mitigated (GAP-3) |
|
|
1643
|
+
|
|
1644
|
+
---
|
|
1645
|
+
|
|
1646
|
+
## 6. File Inventory
|
|
1647
|
+
|
|
1648
|
+
**New files (16):**
|
|
1649
|
+
```
|
|
1650
|
+
src/search/vector-store.ts
|
|
1651
|
+
src/search/embedder.ts
|
|
1652
|
+
src/search/chunk-extractor.ts
|
|
1653
|
+
src/search/indexer.ts
|
|
1654
|
+
src/search/query-engine.ts
|
|
1655
|
+
src/search/graph-enricher.ts
|
|
1656
|
+
src/search/__tests__/vector-store.test.ts
|
|
1657
|
+
src/search/__tests__/embedder.test.ts
|
|
1658
|
+
src/search/__tests__/chunk-extractor.test.ts
|
|
1659
|
+
src/search/__tests__/indexer.test.ts
|
|
1660
|
+
src/search/__tests__/graph-enricher.test.ts
|
|
1661
|
+
src/search/__tests__/query-engine.test.ts
|
|
1662
|
+
src/search/__tests__/similarity.test.ts
|
|
1663
|
+
src/search/__tests__/integration.test.ts
|
|
1664
|
+
src/commands/index.ts
|
|
1665
|
+
src/commands/search.ts
|
|
1666
|
+
```
|
|
1667
|
+
|
|
1668
|
+
**Modified files (6):**
|
|
1669
|
+
```
|
|
1670
|
+
src/types/index.ts ← new search types (SearchResult, IndexMetadata with status + failed_files)
|
|
1671
|
+
src/core/config.ts ← SearchConfigSchema (no max_concurrent_requests)
|
|
1672
|
+
src/core/errors.ts ← new error codes (including search_index_corrupted)
|
|
1673
|
+
src/cli.ts ← register index + search commands
|
|
1674
|
+
package.json ← lancedb@0.14.3 (pinned) + esbuild externals
|
|
1675
|
+
.gitignore ← exclude vector_index/
|
|
1676
|
+
```
|
|
1677
|
+
|
|
1678
|
+
**Runtime artifacts (generated, gitignored):**
|
|
1679
|
+
```
|
|
1680
|
+
tasks-management/graph/vector_index/ ← LanceDB data directory
|
|
1681
|
+
tasks-management/graph/vector_index/index-meta.json ← index metadata
|
|
1682
|
+
```
|