@nomos-arc/arc 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (160)
  1. package/.claude/settings.local.json +10 -0
  2. package/.nomos-config.json +5 -0
  3. package/CLAUDE.md +108 -0
  4. package/LICENSE +190 -0
  5. package/README.md +569 -0
  6. package/dist/cli.js +21120 -0
  7. package/docs/auth/googel_plan.yaml +1093 -0
  8. package/docs/auth/google_task.md +235 -0
  9. package/docs/auth/hardened_blueprint.yaml +1658 -0
  10. package/docs/auth/red_team_report.yaml +336 -0
  11. package/docs/auth/session_state.yaml +162 -0
  12. package/docs/certificate/cer_enhance_plan.md +605 -0
  13. package/docs/certificate/certificate_report.md +338 -0
  14. package/docs/dev_overview.md +419 -0
  15. package/docs/feature_assessment.md +156 -0
  16. package/docs/how_it_works.md +78 -0
  17. package/docs/infrastructure/map.md +867 -0
  18. package/docs/init/master_plan.md +3581 -0
  19. package/docs/init/red_team_report.md +215 -0
  20. package/docs/init/report_phase_1a.md +304 -0
  21. package/docs/integrity-gate/enhance_drift.md +703 -0
  22. package/docs/integrity-gate/overview.md +108 -0
  23. package/docs/management/manger-task.md +99 -0
  24. package/docs/management/scafffold.md +76 -0
  25. package/docs/map/ATOMIC_BLUEPRINT.md +1349 -0
  26. package/docs/map/RED_TEAM_REPORT.md +159 -0
  27. package/docs/map/map_task.md +147 -0
  28. package/docs/map/semantic_graph_task.md +792 -0
  29. package/docs/map/semantic_master_plan.md +705 -0
  30. package/docs/phase7/TEAM_RED.md +249 -0
  31. package/docs/phase7/plan.md +1682 -0
  32. package/docs/phase7/task.md +275 -0
  33. package/docs/prompts/USAGE.md +312 -0
  34. package/docs/prompts/architect.md +165 -0
  35. package/docs/prompts/executer.md +190 -0
  36. package/docs/prompts/hardener.md +190 -0
  37. package/docs/prompts/red_team.md +146 -0
  38. package/docs/verification/goveranance-overview.md +396 -0
  39. package/docs/verification/governance-overview.md +245 -0
  40. package/docs/verification/verification-arc-ar.md +560 -0
  41. package/docs/verification/verification-architecture.md +560 -0
  42. package/docs/very_next.md +52 -0
  43. package/docs/whitepaper.md +89 -0
  44. package/overview.md +1469 -0
  45. package/package.json +63 -0
  46. package/src/adapters/__tests__/git.test.ts +296 -0
  47. package/src/adapters/__tests__/stdio.test.ts +70 -0
  48. package/src/adapters/git.ts +226 -0
  49. package/src/adapters/pty.ts +159 -0
  50. package/src/adapters/stdio.ts +113 -0
  51. package/src/cli.ts +83 -0
  52. package/src/commands/apply.ts +47 -0
  53. package/src/commands/auth.ts +301 -0
  54. package/src/commands/certificate.ts +89 -0
  55. package/src/commands/discard.ts +24 -0
  56. package/src/commands/drift.ts +116 -0
  57. package/src/commands/index.ts +78 -0
  58. package/src/commands/init.ts +121 -0
  59. package/src/commands/list.ts +75 -0
  60. package/src/commands/map.ts +55 -0
  61. package/src/commands/plan.ts +30 -0
  62. package/src/commands/review.ts +58 -0
  63. package/src/commands/run.ts +63 -0
  64. package/src/commands/search.ts +147 -0
  65. package/src/commands/show.ts +63 -0
  66. package/src/commands/status.ts +59 -0
  67. package/src/core/__tests__/budget.test.ts +213 -0
  68. package/src/core/__tests__/certificate.test.ts +385 -0
  69. package/src/core/__tests__/config.test.ts +191 -0
  70. package/src/core/__tests__/preflight.test.ts +24 -0
  71. package/src/core/__tests__/prompt.test.ts +358 -0
  72. package/src/core/__tests__/review.test.ts +161 -0
  73. package/src/core/__tests__/state.test.ts +362 -0
  74. package/src/core/auth/__tests__/manager.test.ts +166 -0
  75. package/src/core/auth/__tests__/server.test.ts +220 -0
  76. package/src/core/auth/gcp-projects.ts +160 -0
  77. package/src/core/auth/manager.ts +114 -0
  78. package/src/core/auth/server.ts +141 -0
  79. package/src/core/budget.ts +119 -0
  80. package/src/core/certificate.ts +502 -0
  81. package/src/core/config.ts +212 -0
  82. package/src/core/errors.ts +54 -0
  83. package/src/core/factory.ts +49 -0
  84. package/src/core/graph/__tests__/builder.test.ts +272 -0
  85. package/src/core/graph/__tests__/contract-writer.test.ts +175 -0
  86. package/src/core/graph/__tests__/enricher.test.ts +299 -0
  87. package/src/core/graph/__tests__/parser.test.ts +200 -0
  88. package/src/core/graph/__tests__/pipeline.test.ts +202 -0
  89. package/src/core/graph/__tests__/renderer.test.ts +128 -0
  90. package/src/core/graph/__tests__/resolver.test.ts +185 -0
  91. package/src/core/graph/__tests__/scanner.test.ts +231 -0
  92. package/src/core/graph/__tests__/show.test.ts +134 -0
  93. package/src/core/graph/builder.ts +303 -0
  94. package/src/core/graph/constraints.ts +94 -0
  95. package/src/core/graph/contract-writer.ts +93 -0
  96. package/src/core/graph/drift/__tests__/classifier.test.ts +215 -0
  97. package/src/core/graph/drift/__tests__/comparator.test.ts +335 -0
  98. package/src/core/graph/drift/__tests__/drift.test.ts +453 -0
  99. package/src/core/graph/drift/__tests__/reporter.test.ts +203 -0
  100. package/src/core/graph/drift/classifier.ts +165 -0
  101. package/src/core/graph/drift/comparator.ts +205 -0
  102. package/src/core/graph/drift/reporter.ts +77 -0
  103. package/src/core/graph/enricher.ts +251 -0
  104. package/src/core/graph/grammar-paths.ts +30 -0
  105. package/src/core/graph/html-template.ts +493 -0
  106. package/src/core/graph/map-schema.ts +137 -0
  107. package/src/core/graph/parser.ts +336 -0
  108. package/src/core/graph/pipeline.ts +209 -0
  109. package/src/core/graph/renderer.ts +92 -0
  110. package/src/core/graph/resolver.ts +195 -0
  111. package/src/core/graph/scanner.ts +145 -0
  112. package/src/core/logger.ts +46 -0
  113. package/src/core/orchestrator.ts +792 -0
  114. package/src/core/plan-file-manager.ts +66 -0
  115. package/src/core/preflight.ts +64 -0
  116. package/src/core/prompt.ts +173 -0
  117. package/src/core/review.ts +95 -0
  118. package/src/core/state.ts +294 -0
  119. package/src/core/worktree-coordinator.ts +77 -0
  120. package/src/search/__tests__/chunk-extractor.test.ts +339 -0
  121. package/src/search/__tests__/embedder-auth.test.ts +124 -0
  122. package/src/search/__tests__/embedder.test.ts +267 -0
  123. package/src/search/__tests__/graph-enricher.test.ts +178 -0
  124. package/src/search/__tests__/indexer.test.ts +518 -0
  125. package/src/search/__tests__/integration.test.ts +649 -0
  126. package/src/search/__tests__/query-engine.test.ts +334 -0
  127. package/src/search/__tests__/similarity.test.ts +78 -0
  128. package/src/search/__tests__/vector-store.test.ts +281 -0
  129. package/src/search/chunk-extractor.ts +167 -0
  130. package/src/search/embedder.ts +209 -0
  131. package/src/search/graph-enricher.ts +95 -0
  132. package/src/search/indexer.ts +483 -0
  133. package/src/search/lexical-searcher.ts +190 -0
  134. package/src/search/query-engine.ts +225 -0
  135. package/src/search/vector-store.ts +311 -0
  136. package/src/types/index.ts +572 -0
  137. package/src/utils/__tests__/ansi.test.ts +54 -0
  138. package/src/utils/__tests__/frontmatter.test.ts +79 -0
  139. package/src/utils/__tests__/sanitize.test.ts +229 -0
  140. package/src/utils/ansi.ts +19 -0
  141. package/src/utils/context.ts +44 -0
  142. package/src/utils/frontmatter.ts +27 -0
  143. package/src/utils/sanitize.ts +78 -0
  144. package/test/e2e/lifecycle.test.ts +330 -0
  145. package/test/fixtures/mock-planner-hang.ts +5 -0
  146. package/test/fixtures/mock-planner.ts +26 -0
  147. package/test/fixtures/mock-reviewer-bad.ts +8 -0
  148. package/test/fixtures/mock-reviewer-retry.ts +34 -0
  149. package/test/fixtures/mock-reviewer.ts +18 -0
  150. package/test/fixtures/sample-project/src/circular-a.ts +6 -0
  151. package/test/fixtures/sample-project/src/circular-b.ts +6 -0
  152. package/test/fixtures/sample-project/src/config.ts +15 -0
  153. package/test/fixtures/sample-project/src/main.ts +19 -0
  154. package/test/fixtures/sample-project/src/services/product-service.ts +20 -0
  155. package/test/fixtures/sample-project/src/services/user-service.ts +18 -0
  156. package/test/fixtures/sample-project/src/types.ts +14 -0
  157. package/test/fixtures/sample-project/src/utils/index.ts +14 -0
  158. package/test/fixtures/sample-project/src/utils/validate.ts +12 -0
  159. package/tsconfig.json +20 -0
  160. package/vitest.config.ts +12 -0
@@ -0,0 +1,483 @@
1
+ import fs from 'node:fs/promises';
2
+ import path from 'node:path';
3
+ import type { Logger } from 'winston';
4
+ import { NomosError } from '../core/errors.js';
5
+ import type { IndexMetadata, NomosConfig, ProjectMap, TextChunk, VectorRecord } from '../types/index.js';
6
+ import { ChunkExtractor } from './chunk-extractor.js';
7
+ import { Embedder } from './embedder.js';
8
+ import { VectorStore } from './vector-store.js';
9
+ import { AuthManager } from '../core/auth/manager.js';
10
+
11
+ // ─── SearchIndexer ─────────────────────────────────────────────────────────────
12
+
13
+ export class SearchIndexer {
14
+ private readonly extractor: ChunkExtractor;
15
+ private _embedder: Embedder | null = null; // lazy — not created until first embedding call
16
+ private readonly store: VectorStore;
17
+ private readonly metaPath: string;
18
+
19
+ constructor(
20
+ private readonly projectRoot: string,
21
+ private readonly config: NomosConfig,
22
+ private readonly logger: Logger,
23
+ private readonly authManager?: AuthManager | null,
24
+ ) {
25
+ this.extractor = new ChunkExtractor(projectRoot, logger);
26
+ this.store = new VectorStore(
27
+ config.search.vector_store_path,
28
+ logger,
29
+ config.search.embedding_dimensions,
30
+ );
31
+ this.metaPath = path.join(config.search.vector_store_path, 'index-meta.json');
32
+ }
33
+
34
+ /** Lazy Embedder accessor — throws at construction time only if actually needed. */
35
+ private async getEmbedder(): Promise<Embedder> {
36
+ if (!this._embedder) {
37
+ this._embedder = await Embedder.create(
38
+ this.config.search, this.logger, this.authManager,
39
+ );
40
+ }
41
+ return this._embedder;
42
+ }
43
+
44
+ // ─── Public API ─────────────────────────────────────────────────────────────
45
+
46
+ /**
47
+ * Full index: extract all chunks, embed in streaming batches, upsert to staging,
48
+ * then atomic table-swap to live [BLOCKER-1].
49
+ * Writes metadata with status tracking [BLOCKER-2].
50
+ */
51
+ async fullIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata> {
52
+ // Pre-check: access embedder once to validate API key before any mutations [AC-11]
53
+ await this.getEmbedder();
54
+
55
+ // Step 1: Load project_map.json
56
+ const projectMap = await this.loadProjectMap();
57
+
58
+ // Step 2: Write IndexMetadata with status: "in_progress" BEFORE any mutation [BLOCKER-2]
59
+ const startedAt = new Date().toISOString();
60
+ const inProgressMeta: IndexMetadata = {
61
+ status: 'in_progress',
62
+ last_full_index: startedAt,
63
+ last_incremental_index: null,
64
+ total_files_indexed: 0,
65
+ total_symbols_indexed: 0,
66
+ total_chunks: 0,
67
+ embedding_model: this.config.search.embedding_model,
68
+ vector_dimensions: this.config.search.embedding_dimensions,
69
+ failed_files: [],
70
+ files: {},
71
+ };
72
+ await this.writeMeta(inProgressMeta);
73
+
74
+ // Step 3: Extract all chunks
75
+ const allChunks = this.extractor.extract(projectMap);
76
+ const fileChunks = allChunks.filter(c => c.type === 'file');
77
+ const symbolChunks = allChunks.filter(c => c.type === 'symbol');
78
+ this.logger.info(
79
+ `[nomos:search:info] Extracted ${allChunks.length} chunks (${fileChunks.length} file-level, ${symbolChunks.length} symbol-level)`,
80
+ );
81
+
82
+ // Step 4: Init store + cleanup orphaned staging table from prior crash [GAP-3]
83
+ await this.store.init();
84
+ await this.store.cleanupStaging();
85
+
86
+ // Step 5: Streaming batch loop [BLOCKER-4]
87
+ const batches = chunk(allChunks, this.config.search.batch_size);
88
+ const failedFiles: string[] = [];
89
+ let batchesDone = 0;
90
+ let cancelled = false;
91
+
92
+ for (let i = 0; i < batches.length; i++) {
93
+ // 5a: Check cancellation flag at TOP of each iteration
94
+ if (cancellationFlag?.cancelled) {
95
+ cancelled = true;
96
+ break;
97
+ }
98
+
99
+ const batch = batches[i]!;
100
+
101
+ // 5b: Embed batch — on failure, record file paths and continue [GAP-1]
102
+ let vectors: Float32Array[];
103
+ try {
104
+ vectors = await (await this.getEmbedder()).embedBatch(batch.map(c => c.text));
105
+ } catch (err) {
106
+ this.logger.error(
107
+ `[nomos:search:error] Batch ${i + 1}/${batches.length} embedding failed: ${(err as Error).message}`,
108
+ );
109
+ for (const c of batch) {
110
+ if (!failedFiles.includes(c.file_path)) {
111
+ failedFiles.push(c.file_path);
112
+ }
113
+ }
114
+ continue;
115
+ }
116
+
117
+ // 5c: Compose VectorRecord[] for THIS BATCH ONLY
118
+ const records = this.composeRecords(batch, vectors, projectMap);
119
+
120
+ // 5d: Upsert batch to staging immediately, release references
121
+ await this.store.upsertToStaging(records);
122
+ batchesDone++;
123
+
124
+ this.logger.info(
125
+ `[nomos:search:info] Embedded batch ${i + 1}/${batches.length} (${batch.length} chunks)`,
126
+ );
127
+ }
128
+
129
+ if (cancelled) {
130
+ // Partial run: metadata written as "in_progress"; next run cleans staging + forces full re-index
131
+ const partialMeta: IndexMetadata = {
132
+ ...inProgressMeta,
133
+ status: 'in_progress',
134
+ failed_files: failedFiles,
135
+ total_chunks: batchesDone * this.config.search.batch_size,
136
+ };
137
+ await this.writeMeta(partialMeta);
138
+ return partialMeta;
139
+ }
140
+
141
+ // Step 6: Atomic table-swap [BLOCKER-1]
142
+ await this.store.promoteStagingToLive();
143
+
144
+ // Step 7: Write IndexMetadata with status: "complete" [BLOCKER-2]
145
+ const finalMeta = this.buildFinalMeta(
146
+ allChunks,
147
+ startedAt,
148
+ null,
149
+ failedFiles,
150
+ projectMap,
151
+ );
152
+ await this.writeMeta(finalMeta);
153
+
154
+ // Step 8: Return IndexMetadata
155
+ return finalMeta;
156
+ }
157
+
158
+ /**
159
+ * Incremental index: validates dimensions [BLOCKER-3], re-indexes changed + failed files [GAP-1].
160
+ */
161
+ async incrementalIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata> {
162
+ // Pre-check: access embedder once to validate API key before any mutations [AC-11]
163
+ await this.getEmbedder();
164
+
165
+ // Step 1: Load project_map.json
166
+ const projectMap = await this.loadProjectMap();
167
+
168
+ // Step 2: Load existing IndexMetadata
169
+ let existingMeta: IndexMetadata | null = null;
170
+ try {
171
+ const raw = await fs.readFile(this.metaPath, 'utf-8');
172
+ existingMeta = JSON.parse(raw) as IndexMetadata;
173
+ } catch {
174
+ // Not found → fall back to fullIndex
175
+ this.logger.info('[nomos:search:info] No existing index metadata. Running full index.');
176
+ return this.fullIndex(cancellationFlag);
177
+ }
178
+
179
+ // If previous run was interrupted, force full re-index [BLOCKER-2]
180
+ if (existingMeta.status === 'in_progress') {
181
+ this.logger.warn(
182
+ '[nomos:search:warn] Previous index incomplete. Running full re-index.',
183
+ );
184
+ return this.fullIndex(cancellationFlag);
185
+ }
186
+
187
+ // Step 3: Dimension validation [BLOCKER-3]
188
+ const modelChanged = existingMeta.embedding_model !== this.config.search.embedding_model;
189
+ const dimsChanged = existingMeta.vector_dimensions !== this.config.search.embedding_dimensions;
190
+ if (modelChanged || dimsChanged) {
191
+ const oldModel = `${existingMeta.embedding_model}/${existingMeta.vector_dimensions}`;
192
+ const newModel = `${this.config.search.embedding_model}/${this.config.search.embedding_dimensions}`;
193
+ this.logger.warn(
194
+ `[nomos:search:warn] Embedding model/dimensions changed (${oldModel} → ${newModel}). Forcing full re-index.`,
195
+ );
196
+ return this.fullIndex(cancellationFlag);
197
+ }
198
+
199
+ // Step 4: Extract all chunks
200
+ const allChunks = this.extractor.extract(projectMap);
201
+
202
+ // Step 5: Compute diff — decisions are made at file-path granularity.
203
+ // The metadata stores one content_hash per file path (from the file-level chunk).
204
+ // Symbol chunks share the same file_path and are always re-indexed alongside their parent.
205
+ const chunksByFile = groupByFilePath(allChunks);
206
+ const currentFilePaths = new Set(Object.keys(chunksByFile));
207
+
208
+ const toReindexPaths = new Set<string>();
209
+
210
+ for (const [fp, chunks] of Object.entries(chunksByFile)) {
211
+ const existing = existingMeta.files[fp];
212
+ if (!existing) {
213
+ // new file
214
+ toReindexPaths.add(fp);
215
+ continue;
216
+ }
217
+ // Use the file-level chunk's hash for comparison (matches what buildFinalMeta stores)
218
+ const fileChunk = chunks.find(c => c.type === 'file');
219
+ if (fileChunk && existing.content_hash !== fileChunk.content_hash) {
220
+ toReindexPaths.add(fp);
221
+ continue;
222
+ }
223
+ if (existingMeta.failed_files.includes(fp)) {
224
+ toReindexPaths.add(fp);
225
+ }
226
+ }
227
+
228
+ // Removed: files in metadata not in current projectMap
229
+ const removed = Object.keys(existingMeta.files).filter(fp => !currentFilePaths.has(fp));
230
+
231
+ const toReindex = allChunks.filter(c => toReindexPaths.has(c.file_path));
232
+
233
+ // Step 6: Nothing to do
234
+ if (toReindex.length === 0 && removed.length === 0) {
235
+ this.logger.info('[nomos:search:info] Index is up-to-date. No changes detected.');
236
+ return existingMeta;
237
+ }
238
+
239
+ // Step 7: Mark in_progress [BLOCKER-2]
240
+ await this.writeMeta({ ...existingMeta, status: 'in_progress' });
241
+
242
+ // Step 8: Init store and delete stale records
243
+ await this.store.init();
244
+ const toDeletePaths = unique([...removed, ...toReindex.map(c => c.file_path)]);
245
+ await this.store.deleteByFilePaths(toDeletePaths);
246
+
247
+ // Step 9: Streaming batch loop for toReindex [BLOCKER-4]
248
+ const batches = chunk(toReindex, this.config.search.batch_size);
249
+ const newFailedFiles: string[] = [];
250
+
251
+ for (let i = 0; i < batches.length; i++) {
252
+ // Check cancellation at TOP of each iteration
253
+ if (cancellationFlag?.cancelled) {
254
+ const partialMeta: IndexMetadata = {
255
+ ...existingMeta,
256
+ status: 'in_progress',
257
+ failed_files: newFailedFiles,
258
+ };
259
+ await this.writeMeta(partialMeta);
260
+ return partialMeta;
261
+ }
262
+
263
+ const batch = batches[i]!;
264
+
265
+ let vectors: Float32Array[];
266
+ try {
267
+ vectors = await (await this.getEmbedder()).embedBatch(batch.map(c => c.text));
268
+ } catch (err) {
269
+ this.logger.error(
270
+ `[nomos:search:error] Incremental batch ${i + 1}/${batches.length} failed: ${(err as Error).message}`,
271
+ );
272
+ for (const c of batch) {
273
+ if (!newFailedFiles.includes(c.file_path)) {
274
+ newFailedFiles.push(c.file_path);
275
+ }
276
+ }
277
+ continue;
278
+ }
279
+
280
+ const records = this.composeRecords(batch, vectors, projectMap);
281
+ await this.store.upsert(records);
282
+
283
+ this.logger.info(
284
+ `[nomos:search:info] Embedded batch ${i + 1}/${batches.length} (${batch.length} chunks)`,
285
+ );
286
+ }
287
+
288
+ // Step 10: Update metadata
289
+ const now = new Date().toISOString();
290
+ const updatedFiles: IndexMetadata['files'] = { ...existingMeta.files };
291
+
292
+ // Remove deleted file entries
293
+ for (const fp of removed) {
294
+ delete updatedFiles[fp];
295
+ }
296
+
297
+ // Update/add re-indexed file entries
298
+ for (const c of toReindex) {
299
+ if (!newFailedFiles.includes(c.file_path)) {
300
+ const existingEntry = updatedFiles[c.file_path];
301
+ updatedFiles[c.file_path] = {
302
+ last_indexed: now,
303
+ content_hash: c.content_hash,
304
+ chunk_count: (existingEntry?.chunk_count ?? 0),
305
+ };
306
+ }
307
+ }
308
+
309
+ // Recount from updated files entries
310
+ const totalFileChunks = allChunks.filter(c => c.type === 'file' && updatedFiles[c.file_path]).length;
311
+ const totalSymbolChunks = allChunks.filter(c => c.type === 'symbol' && updatedFiles[c.file_path]).length;
312
+
313
+ // Retain failed_files that were not successfully re-embedded; clear those that succeeded
314
+ const persistedFailedFiles = [
315
+ ...existingMeta.failed_files.filter(fp => newFailedFiles.includes(fp)),
316
+ ...newFailedFiles.filter(fp => !existingMeta.failed_files.includes(fp)),
317
+ ];
318
+
319
+ const updatedMeta: IndexMetadata = {
320
+ status: 'complete',
321
+ last_full_index: existingMeta.last_full_index,
322
+ last_incremental_index: now,
323
+ total_files_indexed: totalFileChunks,
324
+ total_symbols_indexed: totalSymbolChunks,
325
+ total_chunks: totalFileChunks + totalSymbolChunks,
326
+ embedding_model: this.config.search.embedding_model,
327
+ vector_dimensions: this.config.search.embedding_dimensions,
328
+ failed_files: persistedFailedFiles,
329
+ files: updatedFiles,
330
+ };
331
+
332
+ // Step 11: Write metadata atomically [BLOCKER-2]
333
+ await this.writeMeta(updatedMeta);
334
+
335
+ // Step 12: Return updated metadata
336
+ return updatedMeta;
337
+ }
338
+
339
+ /**
340
+ * Dry-run: extract and count chunks without embedding or writing [S-2].
341
+ */
342
+ async dryRun(): Promise<{ fileChunks: number; symbolChunks: number; totalChunks: number }> {
343
+ const projectMap = await this.loadProjectMap();
344
+ const allChunks = this.extractor.extract(projectMap);
345
+ const fileChunks = allChunks.filter(c => c.type === 'file').length;
346
+ const symbolChunks = allChunks.filter(c => c.type === 'symbol').length;
347
+ return { fileChunks, symbolChunks, totalChunks: allChunks.length };
348
+ }
349
+
350
+ // ─── Private helpers ─────────────────────────────────────────────────────────
351
+
352
+ private async loadProjectMap(): Promise<ProjectMap> {
353
+ const mapPath = path.join(this.config.graph.output_dir, 'project_map.json');
354
+ try {
355
+ const raw = await fs.readFile(mapPath, 'utf-8');
356
+ return JSON.parse(raw) as ProjectMap;
357
+ } catch {
358
+ throw new NomosError(
359
+ 'search_index_failed',
360
+ `project_map.json not found at ${mapPath}. Run: arc map`,
361
+ );
362
+ }
363
+ }
364
+
365
+ private composeRecords(
366
+ chunks: TextChunk[],
367
+ vectors: Float32Array[],
368
+ projectMap: ProjectMap,
369
+ ): VectorRecord[] {
370
+ const now = new Date().toISOString();
371
+ return chunks.map((c, idx) => {
372
+ const fileNode = projectMap.files[c.file_path];
373
+ return {
374
+ id: c.id,
375
+ type: c.type,
376
+ vector: vectors[idx]!,
377
+ file_path: c.file_path,
378
+ module: path.dirname(c.file_path),
379
+ purpose: fileNode?.semantic?.purpose ?? c.file_path,
380
+ symbol_name: c.symbol_name,
381
+ symbol_type: c.symbol_type,
382
+ line_start: c.line_start,
383
+ line_end: c.line_end,
384
+ parent_file_id: c.parent_file_id,
385
+ graph_depth: fileNode?.depth ?? 0,
386
+ dependents_count: fileNode?.dependents?.length ?? 0,
387
+ last_indexed: now,
388
+ content_hash: c.content_hash,
389
+ };
390
+ });
391
+ }
392
+
393
+ private buildFinalMeta(
394
+ allChunks: TextChunk[],
395
+ startedAt: string,
396
+ lastIncremental: string | null,
397
+ failedFiles: string[],
398
+ projectMap: ProjectMap,
399
+ ): IndexMetadata {
400
+ const now = new Date().toISOString();
401
+ const successfulChunks = allChunks.filter(c => !failedFiles.includes(c.file_path));
402
+ const fileChunks = successfulChunks.filter(c => c.type === 'file');
403
+ const symbolChunks = successfulChunks.filter(c => c.type === 'symbol');
404
+
405
+ // Build per-file entry: group chunks by file_path
406
+ const fileEntries: IndexMetadata['files'] = {};
407
+ for (const c of successfulChunks) {
408
+ if (!fileEntries[c.file_path]) {
409
+ fileEntries[c.file_path] = {
410
+ last_indexed: now,
411
+ content_hash: c.content_hash,
412
+ chunk_count: 0,
413
+ };
414
+ }
415
+ fileEntries[c.file_path]!.chunk_count++;
416
+ }
417
+
418
+ // Resolve content_hash per file from the ProjectMap-derived chunk [S-6]
419
+ for (const [fp, entry] of Object.entries(fileEntries)) {
420
+ const fileChunk = allChunks.find(c => c.file_path === fp && c.type === 'file');
421
+ if (fileChunk) {
422
+ entry.content_hash = fileChunk.content_hash;
423
+ }
424
+ }
425
+
426
+ return {
427
+ status: 'complete',
428
+ last_full_index: startedAt,
429
+ last_incremental_index: lastIncremental,
430
+ total_files_indexed: fileChunks.length,
431
+ total_symbols_indexed: symbolChunks.length,
432
+ total_chunks: successfulChunks.length,
433
+ embedding_model: this.config.search.embedding_model,
434
+ vector_dimensions: this.config.search.embedding_dimensions,
435
+ failed_files: failedFiles,
436
+ files: fileEntries,
437
+ };
438
+ }
439
+
440
+ /**
441
+ * Atomically write IndexMetadata: write to .tmp file then rename [BLOCKER-2].
442
+ */
443
+ private async writeMeta(meta: IndexMetadata): Promise<void> {
444
+ await fs.mkdir(this.config.search.vector_store_path, { recursive: true });
445
+ const tmpPath = `${this.metaPath}.tmp`;
446
+ await fs.writeFile(tmpPath, JSON.stringify(meta, null, 2), 'utf-8');
447
+ await fs.rename(tmpPath, this.metaPath);
448
+ }
449
+ }
450
+
451
+ // ─── Utilities ────────────────────────────────────────────────────────────────
452
+
453
+ function chunk<T>(arr: T[], size: number): T[][] {
454
+ const result: T[][] = [];
455
+ for (let i = 0; i < arr.length; i += size) {
456
+ result.push(arr.slice(i, i + size));
457
+ }
458
+ return result;
459
+ }
460
+
461
+ /** Deduplicate TextChunks by id, preserving first occurrence. */
462
+ function dedup(chunks: TextChunk[]): TextChunk[] {
463
+ const seen = new Set<string>();
464
+ return chunks.filter(c => {
465
+ if (seen.has(c.id)) return false;
466
+ seen.add(c.id);
467
+ return true;
468
+ });
469
+ }
470
+
471
+ /** Return unique strings, preserving order. */
472
+ function unique(arr: string[]): string[] {
473
+ return [...new Set(arr)];
474
+ }
475
+
476
+ /** Group TextChunks by file_path. */
477
+ function groupByFilePath(chunks: TextChunk[]): Record<string, TextChunk[]> {
478
+ const map: Record<string, TextChunk[]> = {};
479
+ for (const c of chunks) {
480
+ (map[c.file_path] ??= []).push(c);
481
+ }
482
+ return map;
483
+ }
@@ -0,0 +1,190 @@
1
+ import type { ProjectMap, SearchResult } from '../types/index.js';
2
+
3
+ // ─── LexicalSearcher ─────────────────────────────────────────────────────────
4
+
5
+ /**
6
+ * Keyword-based search for files that are not yet vector-indexed.
7
+ * Scores files based on term matches in file path, symbol names,
8
+ * symbol signatures, and import sources.
9
+ *
10
+ * Used as a fallback when the vector index is unavailable or incomplete,
11
+ * enabling `arc search` to work immediately after `arc map --no-ai`.
12
+ */
13
+ export class LexicalSearcher {
14
+
15
+ /**
16
+ * Search a subset of files from the project map using keyword matching.
17
+ *
18
+ * @param query - The user's search query string
19
+ * @param projectMap - The full project map
20
+ * @param filePaths - Set of file paths to search (e.g., files not in vector index)
21
+ * @param topK - Maximum results to return
22
+ * @param threshold - Minimum score (0–1) to include a result
23
+ */
24
+ search(
25
+ query: string,
26
+ projectMap: ProjectMap,
27
+ filePaths: Set<string>,
28
+ topK: number,
29
+ threshold: number,
30
+ ): SearchResult[] {
31
+ const terms = this.tokenize(query);
32
+ if (terms.length === 0) return [];
33
+
34
+ const results: SearchResult[] = [];
35
+ const coreModuleSet = new Set(projectMap.stats.core_modules);
36
+
37
+ for (const fp of filePaths) {
38
+ const fileNode = projectMap.files[fp];
39
+ if (!fileNode) continue;
40
+
41
+ const score = this.scoreFile(terms, fp, fileNode);
42
+ if (score < threshold) continue;
43
+
44
+ results.push({
45
+ id: fp,
46
+ type: 'file',
47
+ file_path: fp,
48
+ symbol_name: null,
49
+ symbol_type: null,
50
+ line_start: null,
51
+ line_end: null,
52
+ purpose: fileNode.semantic?.purpose ?? fp,
53
+ similarity_score: score,
54
+ graph_depth: fileNode.depth,
55
+ dependents_count: fileNode.dependents.length,
56
+ is_core_module: coreModuleSet.has(fp),
57
+ is_stale: false,
58
+ });
59
+
60
+ // Also score individual symbols
61
+ for (const symbol of fileNode.symbols) {
62
+ if (!symbol.exported && symbol.kind !== 'class' && symbol.kind !== 'function') continue;
63
+
64
+ const symbolScore = this.scoreSymbol(terms, fp, symbol.name, symbol.kind, symbol.signature);
65
+ if (symbolScore < threshold) continue;
66
+
67
+ results.push({
68
+ id: `${fp}::${symbol.name}`,
69
+ type: 'symbol',
70
+ file_path: fp,
71
+ symbol_name: symbol.name,
72
+ symbol_type: symbol.kind,
73
+ line_start: symbol.line,
74
+ line_end: symbol.end_line,
75
+ purpose: fileNode.semantic?.purpose ?? fp,
76
+ similarity_score: symbolScore,
77
+ graph_depth: fileNode.depth,
78
+ dependents_count: fileNode.dependents.length,
79
+ is_core_module: coreModuleSet.has(fp),
80
+ is_stale: false,
81
+ });
82
+ }
83
+ }
84
+
85
+ // Sort by score descending, return top-K
86
+ results.sort((a, b) => b.similarity_score - a.similarity_score);
87
+ return results.slice(0, topK);
88
+ }
89
+
90
+ // ─── Private ───────────────────────────────────────────────────────────────
91
+
92
+ /**
93
+ * Tokenize a query into lowercase terms, splitting on whitespace and common
94
+ * code separators (camelCase, underscores, hyphens, dots, slashes).
95
+ */
96
+ private tokenize(query: string): string[] {
97
+ // Split on whitespace first
98
+ const words = query.toLowerCase().split(/\s+/).filter(Boolean);
99
+
100
+ // Further split camelCase/PascalCase and separators
101
+ const terms: string[] = [];
102
+ for (const word of words) {
103
+ // Split on separators: _ - . /
104
+ const parts = word.split(/[_\-./]+/).filter(Boolean);
105
+ for (const part of parts) {
106
+ // Split camelCase: "getAccessToken" → ["get", "access", "token"]
107
+ const camelParts = part.replace(/([a-z])([A-Z])/g, '$1 $2').toLowerCase().split(' ');
108
+ terms.push(...camelParts.filter(Boolean));
109
+ }
110
+ }
111
+
112
+ return [...new Set(terms)]; // deduplicate
113
+ }
114
+
115
+ /**
116
+ * Score a file based on how many query terms appear in its searchable text.
117
+ * Returns a value in [0, 1].
118
+ */
119
+ private scoreFile(
120
+ terms: string[],
121
+ filePath: string,
122
+ fileNode: { symbols: Array<{ name: string }>; imports: Array<{ source: string }> },
123
+ ): number {
124
+ // Build searchable corpus from file metadata
125
+ const corpus = this.buildCorpus(filePath, fileNode);
126
+ return this.computeScore(terms, corpus);
127
+ }
128
+
129
+ /**
130
+ * Score a symbol based on how many query terms match its name/signature.
131
+ */
132
+ private scoreSymbol(
133
+ terms: string[],
134
+ filePath: string,
135
+ symbolName: string,
136
+ kind: string,
137
+ signature: string | null,
138
+ ): number {
139
+ const parts = [
140
+ filePath.toLowerCase(),
141
+ symbolName.toLowerCase(),
142
+ kind.toLowerCase(),
143
+ signature?.toLowerCase() ?? '',
144
+ ];
145
+ const corpus = parts.join(' ');
146
+ // Boost symbol score if the symbol name directly matches a term
147
+ const baseScore = this.computeScore(terms, corpus);
148
+ const nameTokens = this.tokenize(symbolName);
149
+ const nameOverlap = terms.filter(t => nameTokens.includes(t)).length;
150
+ const nameBoost = nameOverlap > 0 ? 0.1 * Math.min(nameOverlap / terms.length, 1) : 0;
151
+ return Math.min(baseScore + nameBoost, 1);
152
+ }
153
+
154
+ /**
155
+ * Build a searchable text corpus from a file's metadata.
156
+ */
157
+ private buildCorpus(
158
+ filePath: string,
159
+ fileNode: { symbols: Array<{ name: string }>; imports: Array<{ source: string }> },
160
+ ): string {
161
+ const parts = [
162
+ filePath.toLowerCase(),
163
+ ...fileNode.symbols.map(s => s.name.toLowerCase()),
164
+ ...fileNode.imports.map(i => i.source.toLowerCase()),
165
+ ];
166
+ return parts.join(' ');
167
+ }
168
+
169
+ /**
170
+ * Compute a [0, 1] score based on term coverage.
171
+ * Exact matches score higher than substring matches.
172
+ */
173
+ private computeScore(terms: string[], corpus: string): number {
174
+ if (terms.length === 0) return 0;
175
+
176
+ let matched = 0;
177
+ for (const term of terms) {
178
+ if (corpus.includes(term)) {
179
+ matched++;
180
+ }
181
+ }
182
+
183
+ // Base score: fraction of terms that matched
184
+ const coverage = matched / terms.length;
185
+
186
+ // Scale to a reasonable similarity range [0, 0.85]
187
+ // Lexical results are capped below 0.85 so vector results naturally rank higher
188
+ return coverage * 0.85;
189
+ }
190
+ }