@nomos-arc/arc 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.claude/settings.local.json +10 -0
- package/.nomos-config.json +5 -0
- package/CLAUDE.md +108 -0
- package/LICENSE +190 -0
- package/README.md +569 -0
- package/dist/cli.js +21120 -0
- package/docs/auth/googel_plan.yaml +1093 -0
- package/docs/auth/google_task.md +235 -0
- package/docs/auth/hardened_blueprint.yaml +1658 -0
- package/docs/auth/red_team_report.yaml +336 -0
- package/docs/auth/session_state.yaml +162 -0
- package/docs/certificate/cer_enhance_plan.md +605 -0
- package/docs/certificate/certificate_report.md +338 -0
- package/docs/dev_overview.md +419 -0
- package/docs/feature_assessment.md +156 -0
- package/docs/how_it_works.md +78 -0
- package/docs/infrastructure/map.md +867 -0
- package/docs/init/master_plan.md +3581 -0
- package/docs/init/red_team_report.md +215 -0
- package/docs/init/report_phase_1a.md +304 -0
- package/docs/integrity-gate/enhance_drift.md +703 -0
- package/docs/integrity-gate/overview.md +108 -0
- package/docs/management/manger-task.md +99 -0
- package/docs/management/scafffold.md +76 -0
- package/docs/map/ATOMIC_BLUEPRINT.md +1349 -0
- package/docs/map/RED_TEAM_REPORT.md +159 -0
- package/docs/map/map_task.md +147 -0
- package/docs/map/semantic_graph_task.md +792 -0
- package/docs/map/semantic_master_plan.md +705 -0
- package/docs/phase7/TEAM_RED.md +249 -0
- package/docs/phase7/plan.md +1682 -0
- package/docs/phase7/task.md +275 -0
- package/docs/prompts/USAGE.md +312 -0
- package/docs/prompts/architect.md +165 -0
- package/docs/prompts/executer.md +190 -0
- package/docs/prompts/hardener.md +190 -0
- package/docs/prompts/red_team.md +146 -0
- package/docs/verification/goveranance-overview.md +396 -0
- package/docs/verification/governance-overview.md +245 -0
- package/docs/verification/verification-arc-ar.md +560 -0
- package/docs/verification/verification-architecture.md +560 -0
- package/docs/very_next.md +52 -0
- package/docs/whitepaper.md +89 -0
- package/overview.md +1469 -0
- package/package.json +63 -0
- package/src/adapters/__tests__/git.test.ts +296 -0
- package/src/adapters/__tests__/stdio.test.ts +70 -0
- package/src/adapters/git.ts +226 -0
- package/src/adapters/pty.ts +159 -0
- package/src/adapters/stdio.ts +113 -0
- package/src/cli.ts +83 -0
- package/src/commands/apply.ts +47 -0
- package/src/commands/auth.ts +301 -0
- package/src/commands/certificate.ts +89 -0
- package/src/commands/discard.ts +24 -0
- package/src/commands/drift.ts +116 -0
- package/src/commands/index.ts +78 -0
- package/src/commands/init.ts +121 -0
- package/src/commands/list.ts +75 -0
- package/src/commands/map.ts +55 -0
- package/src/commands/plan.ts +30 -0
- package/src/commands/review.ts +58 -0
- package/src/commands/run.ts +63 -0
- package/src/commands/search.ts +147 -0
- package/src/commands/show.ts +63 -0
- package/src/commands/status.ts +59 -0
- package/src/core/__tests__/budget.test.ts +213 -0
- package/src/core/__tests__/certificate.test.ts +385 -0
- package/src/core/__tests__/config.test.ts +191 -0
- package/src/core/__tests__/preflight.test.ts +24 -0
- package/src/core/__tests__/prompt.test.ts +358 -0
- package/src/core/__tests__/review.test.ts +161 -0
- package/src/core/__tests__/state.test.ts +362 -0
- package/src/core/auth/__tests__/manager.test.ts +166 -0
- package/src/core/auth/__tests__/server.test.ts +220 -0
- package/src/core/auth/gcp-projects.ts +160 -0
- package/src/core/auth/manager.ts +114 -0
- package/src/core/auth/server.ts +141 -0
- package/src/core/budget.ts +119 -0
- package/src/core/certificate.ts +502 -0
- package/src/core/config.ts +212 -0
- package/src/core/errors.ts +54 -0
- package/src/core/factory.ts +49 -0
- package/src/core/graph/__tests__/builder.test.ts +272 -0
- package/src/core/graph/__tests__/contract-writer.test.ts +175 -0
- package/src/core/graph/__tests__/enricher.test.ts +299 -0
- package/src/core/graph/__tests__/parser.test.ts +200 -0
- package/src/core/graph/__tests__/pipeline.test.ts +202 -0
- package/src/core/graph/__tests__/renderer.test.ts +128 -0
- package/src/core/graph/__tests__/resolver.test.ts +185 -0
- package/src/core/graph/__tests__/scanner.test.ts +231 -0
- package/src/core/graph/__tests__/show.test.ts +134 -0
- package/src/core/graph/builder.ts +303 -0
- package/src/core/graph/constraints.ts +94 -0
- package/src/core/graph/contract-writer.ts +93 -0
- package/src/core/graph/drift/__tests__/classifier.test.ts +215 -0
- package/src/core/graph/drift/__tests__/comparator.test.ts +335 -0
- package/src/core/graph/drift/__tests__/drift.test.ts +453 -0
- package/src/core/graph/drift/__tests__/reporter.test.ts +203 -0
- package/src/core/graph/drift/classifier.ts +165 -0
- package/src/core/graph/drift/comparator.ts +205 -0
- package/src/core/graph/drift/reporter.ts +77 -0
- package/src/core/graph/enricher.ts +251 -0
- package/src/core/graph/grammar-paths.ts +30 -0
- package/src/core/graph/html-template.ts +493 -0
- package/src/core/graph/map-schema.ts +137 -0
- package/src/core/graph/parser.ts +336 -0
- package/src/core/graph/pipeline.ts +209 -0
- package/src/core/graph/renderer.ts +92 -0
- package/src/core/graph/resolver.ts +195 -0
- package/src/core/graph/scanner.ts +145 -0
- package/src/core/logger.ts +46 -0
- package/src/core/orchestrator.ts +792 -0
- package/src/core/plan-file-manager.ts +66 -0
- package/src/core/preflight.ts +64 -0
- package/src/core/prompt.ts +173 -0
- package/src/core/review.ts +95 -0
- package/src/core/state.ts +294 -0
- package/src/core/worktree-coordinator.ts +77 -0
- package/src/search/__tests__/chunk-extractor.test.ts +339 -0
- package/src/search/__tests__/embedder-auth.test.ts +124 -0
- package/src/search/__tests__/embedder.test.ts +267 -0
- package/src/search/__tests__/graph-enricher.test.ts +178 -0
- package/src/search/__tests__/indexer.test.ts +518 -0
- package/src/search/__tests__/integration.test.ts +649 -0
- package/src/search/__tests__/query-engine.test.ts +334 -0
- package/src/search/__tests__/similarity.test.ts +78 -0
- package/src/search/__tests__/vector-store.test.ts +281 -0
- package/src/search/chunk-extractor.ts +167 -0
- package/src/search/embedder.ts +209 -0
- package/src/search/graph-enricher.ts +95 -0
- package/src/search/indexer.ts +483 -0
- package/src/search/lexical-searcher.ts +190 -0
- package/src/search/query-engine.ts +225 -0
- package/src/search/vector-store.ts +311 -0
- package/src/types/index.ts +572 -0
- package/src/utils/__tests__/ansi.test.ts +54 -0
- package/src/utils/__tests__/frontmatter.test.ts +79 -0
- package/src/utils/__tests__/sanitize.test.ts +229 -0
- package/src/utils/ansi.ts +19 -0
- package/src/utils/context.ts +44 -0
- package/src/utils/frontmatter.ts +27 -0
- package/src/utils/sanitize.ts +78 -0
- package/test/e2e/lifecycle.test.ts +330 -0
- package/test/fixtures/mock-planner-hang.ts +5 -0
- package/test/fixtures/mock-planner.ts +26 -0
- package/test/fixtures/mock-reviewer-bad.ts +8 -0
- package/test/fixtures/mock-reviewer-retry.ts +34 -0
- package/test/fixtures/mock-reviewer.ts +18 -0
- package/test/fixtures/sample-project/src/circular-a.ts +6 -0
- package/test/fixtures/sample-project/src/circular-b.ts +6 -0
- package/test/fixtures/sample-project/src/config.ts +15 -0
- package/test/fixtures/sample-project/src/main.ts +19 -0
- package/test/fixtures/sample-project/src/services/product-service.ts +20 -0
- package/test/fixtures/sample-project/src/services/user-service.ts +18 -0
- package/test/fixtures/sample-project/src/types.ts +14 -0
- package/test/fixtures/sample-project/src/utils/index.ts +14 -0
- package/test/fixtures/sample-project/src/utils/validate.ts +12 -0
- package/tsconfig.json +20 -0
- package/vitest.config.ts +12 -0
|
@@ -0,0 +1,483 @@
|
|
|
1
|
+
import fs from 'node:fs/promises';
|
|
2
|
+
import path from 'node:path';
|
|
3
|
+
import type { Logger } from 'winston';
|
|
4
|
+
import { NomosError } from '../core/errors.js';
|
|
5
|
+
import type { IndexMetadata, NomosConfig, ProjectMap, TextChunk, VectorRecord } from '../types/index.js';
|
|
6
|
+
import { ChunkExtractor } from './chunk-extractor.js';
|
|
7
|
+
import { Embedder } from './embedder.js';
|
|
8
|
+
import { VectorStore } from './vector-store.js';
|
|
9
|
+
import { AuthManager } from '../core/auth/manager.js';
|
|
10
|
+
|
|
11
|
+
// ─── SearchIndexer ─────────────────────────────────────────────────────────────
|
|
12
|
+
|
|
13
|
+
export class SearchIndexer {
|
|
14
|
+
private readonly extractor: ChunkExtractor;
|
|
15
|
+
private _embedder: Embedder | null = null; // lazy — not created until first embedding call
|
|
16
|
+
private readonly store: VectorStore;
|
|
17
|
+
private readonly metaPath: string;
|
|
18
|
+
|
|
19
|
+
/**
 * Wire up the collaborators used for indexing.
 *
 * @param projectRoot - Project root; handed to the ChunkExtractor.
 * @param config - Configuration; `config.search` supplies the vector store path
 *   and the embedding dimensions.
 * @param logger - Logger shared with the extractor and the vector store.
 * @param authManager - Optional; only stored here as a parameter property.
 */
constructor(
  private readonly projectRoot: string,
  private readonly config: NomosConfig,
  private readonly logger: Logger,
  private readonly authManager?: AuthManager | null,
) {
  this.extractor = new ChunkExtractor(projectRoot, logger);

  const { vector_store_path: storeDir, embedding_dimensions: dimensions } = config.search;
  this.store = new VectorStore(storeDir, logger, dimensions);

  // The index metadata file is kept inside the vector store directory.
  this.metaPath = path.join(storeDir, 'index-meta.json');
}
|
|
33
|
+
|
|
34
|
+
/**
 * Lazy Embedder accessor: the Embedder is created on the first call and the
 * same instance is returned afterwards, so any error from `Embedder.create`
 * surfaces only when an embedder is actually requested.
 */
private async getEmbedder(): Promise<Embedder> {
  this._embedder ??= await Embedder.create(this.config.search, this.logger, this.authManager);
  return this._embedder;
}
|
|
43
|
+
|
|
44
|
+
// ─── Public API ─────────────────────────────────────────────────────────────
|
|
45
|
+
|
|
46
|
+
/**
 * Full index: extract all chunks, embed in streaming batches, upsert to staging,
 * then atomic table-swap to live [BLOCKER-1].
 * Writes metadata with status tracking [BLOCKER-2].
 *
 * @param cancellationFlag - Optional shared flag, checked before each batch. When it
 *   is set the run stops before promotion and the metadata stays "in_progress".
 * @returns The metadata written last: "complete" after promotion, or "in_progress"
 *   when the run was cancelled.
 * @throws Errors from `getEmbedder`, `loadProjectMap`, the vector store or the
 *   metadata write propagate; only per-batch embedding errors are caught and
 *   recorded in `failed_files`.
 */
async fullIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata> {
  // Pre-check: resolve the embedder once to validate API key before any mutations [AC-11]
  const embedder = await this.getEmbedder();

  // Step 1: Load project_map.json
  const projectMap = await this.loadProjectMap();

  // Step 2: Write IndexMetadata with status: "in_progress" BEFORE any mutation [BLOCKER-2]
  const startedAt = new Date().toISOString();
  const inProgressMeta: IndexMetadata = {
    status: 'in_progress',
    last_full_index: startedAt,
    last_incremental_index: null,
    total_files_indexed: 0,
    total_symbols_indexed: 0,
    total_chunks: 0,
    embedding_model: this.config.search.embedding_model,
    vector_dimensions: this.config.search.embedding_dimensions,
    failed_files: [],
    files: {},
  };
  await this.writeMeta(inProgressMeta);

  // Step 3: Extract all chunks
  const allChunks = this.extractor.extract(projectMap);
  const fileChunks = allChunks.filter(c => c.type === 'file');
  const symbolChunks = allChunks.filter(c => c.type === 'symbol');
  this.logger.info(
    `[nomos:search:info] Extracted ${allChunks.length} chunks (${fileChunks.length} file-level, ${symbolChunks.length} symbol-level)`,
  );

  // Step 4: Init store + cleanup orphaned staging table from prior crash [GAP-3]
  await this.store.init();
  await this.store.cleanupStaging();

  // Step 5: Streaming batch loop [BLOCKER-4]
  const batches = chunk(allChunks, this.config.search.batch_size);
  // A Set keeps failed paths unique in insertion order without rescanning an array per chunk.
  const failedPaths = new Set<string>();
  let chunksStaged = 0;
  let cancelled = false;

  for (let i = 0; i < batches.length; i++) {
    // 5a: Check cancellation flag at TOP of each iteration
    if (cancellationFlag?.cancelled) {
      cancelled = true;
      break;
    }

    const batch = batches[i]!;

    // 5b: Embed batch — on failure, record file paths and continue [GAP-1]
    let vectors: Float32Array[];
    try {
      vectors = await embedder.embedBatch(batch.map(c => c.text));
    } catch (err) {
      // Anything can be thrown, so do not assume an Error instance.
      const reason = err instanceof Error ? err.message : String(err);
      this.logger.error(
        `[nomos:search:error] Batch ${i + 1}/${batches.length} embedding failed: ${reason}`,
      );
      for (const c of batch) {
        failedPaths.add(c.file_path);
      }
      continue;
    }

    // 5c: Compose VectorRecord[] for THIS BATCH ONLY
    const records = this.composeRecords(batch, vectors, projectMap);

    // 5d: Upsert batch to staging immediately, release references
    await this.store.upsertToStaging(records);
    chunksStaged += batch.length;

    this.logger.info(
      `[nomos:search:info] Embedded batch ${i + 1}/${batches.length} (${batch.length} chunks)`,
    );
  }

  const failedFiles = [...failedPaths];

  if (cancelled) {
    // Partial run: metadata stays "in_progress", which makes incrementalIndex
    // fall back to a full re-index on its next invocation.
    const partialMeta: IndexMetadata = {
      ...inProgressMeta,
      status: 'in_progress',
      failed_files: failedFiles,
      total_chunks: chunksStaged,
    };
    await this.writeMeta(partialMeta);
    return partialMeta;
  }

  // Step 6: Atomic table-swap [BLOCKER-1]
  await this.store.promoteStagingToLive();

  // Step 7: Write IndexMetadata with status: "complete" [BLOCKER-2]
  const finalMeta = this.buildFinalMeta(
    allChunks,
    startedAt,
    null,
    failedFiles,
    projectMap,
  );
  await this.writeMeta(finalMeta);

  // Step 8: Return IndexMetadata
  return finalMeta;
}
|
|
157
|
+
|
|
158
|
+
/**
 * Incremental index: validates dimensions [BLOCKER-3], re-indexes changed + failed files [GAP-1].
 *
 * Falls back to `fullIndex` when no readable metadata exists, when the previous
 * run was left "in_progress", or when the embedding model/dimensions changed.
 *
 * @param cancellationFlag - Optional shared flag, checked before each batch. When it
 *   is set, metadata is written as "in_progress" and returned immediately.
 * @returns The metadata after this run (or the existing metadata when nothing changed).
 * @throws Errors from `getEmbedder`, `loadProjectMap`, the vector store or the
 *   metadata write propagate; only per-batch embedding errors are caught and
 *   recorded in `failed_files`.
 */
async incrementalIndex(cancellationFlag?: { cancelled: boolean }): Promise<IndexMetadata> {
  // Pre-check: resolve the embedder once to validate API key before any mutations [AC-11]
  const embedder = await this.getEmbedder();

  // Step 1: Load project_map.json
  const projectMap = await this.loadProjectMap();

  // Step 2: Load existing IndexMetadata
  let loadedMeta: IndexMetadata;
  try {
    const raw = await fs.readFile(this.metaPath, 'utf-8');
    loadedMeta = JSON.parse(raw) as IndexMetadata;
  } catch {
    // Missing or unreadable metadata → fall back to fullIndex
    this.logger.info('[nomos:search:info] No existing index metadata. Running full index.');
    return this.fullIndex(cancellationFlag);
  }
  const existingMeta = loadedMeta;

  // If previous run was interrupted, force full re-index [BLOCKER-2]
  if (existingMeta.status === 'in_progress') {
    this.logger.warn(
      '[nomos:search:warn] Previous index incomplete. Running full re-index.',
    );
    return this.fullIndex(cancellationFlag);
  }

  // Step 3: Dimension validation [BLOCKER-3]
  const modelChanged = existingMeta.embedding_model !== this.config.search.embedding_model;
  const dimsChanged = existingMeta.vector_dimensions !== this.config.search.embedding_dimensions;
  if (modelChanged || dimsChanged) {
    const oldModel = `${existingMeta.embedding_model}/${existingMeta.vector_dimensions}`;
    const newModel = `${this.config.search.embedding_model}/${this.config.search.embedding_dimensions}`;
    this.logger.warn(
      `[nomos:search:warn] Embedding model/dimensions changed (${oldModel} → ${newModel}). Forcing full re-index.`,
    );
    return this.fullIndex(cancellationFlag);
  }

  // Step 4: Extract all chunks
  const allChunks = this.extractor.extract(projectMap);

  // Step 5: Compute diff — decisions are made at file-path granularity.
  // The metadata stores one content_hash per file path (from the file-level chunk).
  // Symbol chunks share the same file_path and are always re-indexed alongside their parent.
  const chunksByFile = groupByFilePath(allChunks);
  const currentFilePaths = new Set(Object.keys(chunksByFile));
  const previouslyFailed = new Set(existingMeta.failed_files);

  const toReindexPaths = new Set<string>();

  for (const [fp, chunks] of Object.entries(chunksByFile)) {
    const existing = existingMeta.files[fp];
    if (!existing) {
      // new file
      toReindexPaths.add(fp);
      continue;
    }
    // Use the file-level chunk's hash for comparison (matches what buildFinalMeta stores)
    const fileChunk = chunks.find(c => c.type === 'file');
    if (fileChunk && existing.content_hash !== fileChunk.content_hash) {
      toReindexPaths.add(fp);
      continue;
    }
    if (previouslyFailed.has(fp)) {
      toReindexPaths.add(fp);
    }
  }

  // Removed: files in metadata not in current projectMap
  const removed = Object.keys(existingMeta.files).filter(fp => !currentFilePaths.has(fp));

  const toReindex = allChunks.filter(c => toReindexPaths.has(c.file_path));

  // Step 6: Nothing to do
  if (toReindex.length === 0 && removed.length === 0) {
    this.logger.info('[nomos:search:info] Index is up-to-date. No changes detected.');
    return existingMeta;
  }

  // Step 7: Mark in_progress [BLOCKER-2]
  await this.writeMeta({ ...existingMeta, status: 'in_progress' });

  // Step 8: Init store and delete stale records.
  // `removed` and `toReindexPaths` are disjoint, so the concatenation is already unique.
  await this.store.init();
  const toDeletePaths = [...removed, ...toReindexPaths];
  await this.store.deleteByFilePaths(toDeletePaths);

  // Step 9: Streaming batch loop for toReindex [BLOCKER-4]
  const batches = chunk(toReindex, this.config.search.batch_size);
  // A Set keeps failed paths unique in insertion order without rescanning an array per chunk.
  const failedPaths = new Set<string>();

  for (let i = 0; i < batches.length; i++) {
    // Check cancellation at TOP of each iteration
    if (cancellationFlag?.cancelled) {
      const partialMeta: IndexMetadata = {
        ...existingMeta,
        status: 'in_progress',
        failed_files: [...failedPaths],
      };
      await this.writeMeta(partialMeta);
      return partialMeta;
    }

    const batch = batches[i]!;

    let vectors: Float32Array[];
    try {
      vectors = await embedder.embedBatch(batch.map(c => c.text));
    } catch (err) {
      // Anything can be thrown, so do not assume an Error instance.
      const reason = err instanceof Error ? err.message : String(err);
      this.logger.error(
        `[nomos:search:error] Incremental batch ${i + 1}/${batches.length} failed: ${reason}`,
      );
      for (const c of batch) {
        failedPaths.add(c.file_path);
      }
      continue;
    }

    const records = this.composeRecords(batch, vectors, projectMap);
    await this.store.upsert(records);

    this.logger.info(
      `[nomos:search:info] Embedded batch ${i + 1}/${batches.length} (${batch.length} chunks)`,
    );
  }

  // Step 10: Update metadata
  const now = new Date().toISOString();
  const updatedFiles: IndexMetadata['files'] = { ...existingMeta.files };

  // Remove deleted file entries
  for (const fp of removed) {
    delete updatedFiles[fp];
  }

  // Update/add one entry per successfully re-indexed file. The stored hash must be
  // the file-level chunk's hash because Step 5 compares against exactly that value;
  // storing a symbol chunk's hash would make the file look changed on every run.
  // chunk_count is recounted from the current chunks instead of carried over.
  for (const fp of toReindexPaths) {
    if (failedPaths.has(fp)) continue;
    const chunksForFile = chunksByFile[fp] ?? [];
    const hashSource = chunksForFile.find(c => c.type === 'file') ?? chunksForFile[0];
    if (!hashSource) continue;
    updatedFiles[fp] = {
      last_indexed: now,
      content_hash: hashSource.content_hash,
      chunk_count: chunksForFile.length,
    };
  }

  // Recount from updated files entries
  const totalFileChunks = allChunks.filter(c => c.type === 'file' && updatedFiles[c.file_path]).length;
  const totalSymbolChunks = allChunks.filter(c => c.type === 'symbol' && updatedFiles[c.file_path]).length;

  // Retain failed_files that were not successfully re-embedded; clear those that succeeded
  const persistedFailedFiles = [
    ...existingMeta.failed_files.filter(fp => failedPaths.has(fp)),
    ...[...failedPaths].filter(fp => !previouslyFailed.has(fp)),
  ];

  const updatedMeta: IndexMetadata = {
    status: 'complete',
    last_full_index: existingMeta.last_full_index,
    last_incremental_index: now,
    total_files_indexed: totalFileChunks,
    total_symbols_indexed: totalSymbolChunks,
    total_chunks: totalFileChunks + totalSymbolChunks,
    embedding_model: this.config.search.embedding_model,
    vector_dimensions: this.config.search.embedding_dimensions,
    failed_files: persistedFailedFiles,
    files: updatedFiles,
  };

  // Step 11: Write metadata atomically [BLOCKER-2]
  await this.writeMeta(updatedMeta);

  // Step 12: Return updated metadata
  return updatedMeta;
}
|
|
338
|
+
|
|
339
|
+
/**
 * Dry-run: extract and count chunks without embedding or writing [S-2].
 *
 * @returns Counts of file-level and symbol-level chunks plus the total number
 *   of extracted chunks.
 */
async dryRun(): Promise<{ fileChunks: number; symbolChunks: number; totalChunks: number }> {
  const extracted = this.extractor.extract(await this.loadProjectMap());

  let fileChunks = 0;
  let symbolChunks = 0;
  for (const { type } of extracted) {
    if (type === 'file') {
      fileChunks += 1;
    } else if (type === 'symbol') {
      symbolChunks += 1;
    }
  }

  return { fileChunks, symbolChunks, totalChunks: extracted.length };
}
|
|
349
|
+
|
|
350
|
+
// ─── Private helpers ─────────────────────────────────────────────────────────
|
|
351
|
+
|
|
352
|
+
/**
 * Read and parse `project_map.json` from the configured graph output directory.
 *
 * @returns The parsed JSON, cast to ProjectMap (no schema validation is done here).
 * @throws NomosError with code `search_index_failed` when the file cannot be read
 *   or does not contain valid JSON.
 */
private async loadProjectMap(): Promise<ProjectMap> {
  const mapPath = path.join(this.config.graph.output_dir, 'project_map.json');

  let raw: string;
  try {
    raw = await fs.readFile(mapPath, 'utf-8');
  } catch {
    throw new NomosError(
      'search_index_failed',
      `project_map.json not found at ${mapPath}. Run: arc map`,
    );
  }

  try {
    return JSON.parse(raw) as ProjectMap;
  } catch (err) {
    // The file exists but is corrupt: say so instead of claiming it is missing.
    const reason = err instanceof Error ? err.message : String(err);
    throw new NomosError(
      'search_index_failed',
      `project_map.json at ${mapPath} is not valid JSON (${reason}). Run: arc map`,
    );
  }
}
|
|
364
|
+
|
|
365
|
+
/**
 * Pair each chunk with the vector at the same index and attach file-level
 * fields looked up in the project map.
 *
 * @param chunks - Chunks of one batch.
 * @param vectors - Embeddings, read positionally (`vectors[i]` belongs to `chunks[i]`).
 * @param projectMap - Source of purpose, depth and dependents for each file path.
 * @returns One VectorRecord per chunk, all stamped with the same `last_indexed` time.
 */
private composeRecords(
  chunks: TextChunk[],
  vectors: Float32Array[],
  projectMap: ProjectMap,
): VectorRecord[] {
  const indexedAt = new Date().toISOString();
  const records: VectorRecord[] = [];

  for (const [position, piece] of chunks.entries()) {
    // May be undefined when the chunk's path is absent from the map; defaults apply below.
    const node = projectMap.files[piece.file_path];

    records.push({
      id: piece.id,
      type: piece.type,
      vector: vectors[position]!,
      file_path: piece.file_path,
      module: path.dirname(piece.file_path),
      purpose: node?.semantic?.purpose ?? piece.file_path,
      symbol_name: piece.symbol_name,
      symbol_type: piece.symbol_type,
      line_start: piece.line_start,
      line_end: piece.line_end,
      parent_file_id: piece.parent_file_id,
      graph_depth: node?.depth ?? 0,
      dependents_count: node?.dependents?.length ?? 0,
      last_indexed: indexedAt,
      content_hash: piece.content_hash,
    });
  }

  return records;
}
|
|
392
|
+
|
|
393
|
+
/**
 * Build the "complete" IndexMetadata for a finished full index.
 *
 * Chunks whose file path is listed in `failedFiles` are excluded from the totals
 * and from the per-file entries.
 *
 * @param allChunks - Every extracted chunk, including those of failed files.
 * @param startedAt - ISO timestamp stored as `last_full_index`.
 * @param lastIncremental - Value stored as `last_incremental_index`.
 * @param failedFiles - File paths whose embedding failed; stored as `failed_files`.
 * @param projectMap - Not read by this method; kept so the signature is unchanged.
 * @returns Metadata with status "complete".
 */
private buildFinalMeta(
  allChunks: TextChunk[],
  startedAt: string,
  lastIncremental: string | null,
  failedFiles: string[],
  projectMap: ProjectMap,
): IndexMetadata {
  const now = new Date().toISOString();
  const failed = new Set(failedFiles);

  // A Map avoids collisions with Object.prototype keys (e.g. a path named
  // "constructor") and keeps first-seen order for the final object.
  const entries = new Map<string, IndexMetadata['files'][string]>();
  const hashFromFileChunk = new Set<string>();
  let fileCount = 0;
  let symbolCount = 0;
  let totalCount = 0;

  for (const c of allChunks) {
    if (failed.has(c.file_path)) continue;

    totalCount += 1;
    if (c.type === 'file') {
      fileCount += 1;
    } else if (c.type === 'symbol') {
      symbolCount += 1;
    }

    let entry = entries.get(c.file_path);
    if (!entry) {
      // Seed with the first chunk's hash; replaced below once a file-level chunk is seen.
      entry = { last_indexed: now, content_hash: c.content_hash, chunk_count: 0 };
      entries.set(c.file_path, entry);
    }
    entry.chunk_count += 1;

    // Resolve content_hash per file from the first file-level chunk [S-6]
    if (c.type === 'file' && !hashFromFileChunk.has(c.file_path)) {
      entry.content_hash = c.content_hash;
      hashFromFileChunk.add(c.file_path);
    }
  }

  return {
    status: 'complete',
    last_full_index: startedAt,
    last_incremental_index: lastIncremental,
    total_files_indexed: fileCount,
    total_symbols_indexed: symbolCount,
    total_chunks: totalCount,
    embedding_model: this.config.search.embedding_model,
    vector_dimensions: this.config.search.embedding_dimensions,
    failed_files: failedFiles,
    files: Object.fromEntries(entries),
  };
}
|
|
439
|
+
|
|
440
|
+
/**
 * Atomically write IndexMetadata: write to .tmp file then rename [BLOCKER-2].
 *
 * The vector store directory is created first if needed. When the write or the
 * rename fails, the temporary file is removed (best effort) and the original
 * error is rethrown.
 *
 * @param meta - Metadata to serialise as pretty-printed JSON.
 */
private async writeMeta(meta: IndexMetadata): Promise<void> {
  await fs.mkdir(this.config.search.vector_store_path, { recursive: true });

  const tmpPath = `${this.metaPath}.tmp`;
  try {
    await fs.writeFile(tmpPath, JSON.stringify(meta, null, 2), 'utf-8');
    await fs.rename(tmpPath, this.metaPath);
  } catch (err) {
    // Do not leave a stale .tmp behind; a cleanup failure must not mask the real error.
    await fs.rm(tmpPath, { force: true }).catch(() => undefined);
    throw err;
  }
}
|
|
449
|
+
}
|
|
450
|
+
|
|
451
|
+
// ─── Utilities ────────────────────────────────────────────────────────────────
|
|
452
|
+
|
|
453
|
+
/**
 * Split an array into consecutive slices of at most `size` elements.
 *
 * @param arr - Items to split; not modified.
 * @param size - Maximum slice length; must be at least 1.
 * @returns The slices in order; an empty array for empty input.
 * @throws RangeError when `size` is NaN or below 1. Zero or negative values
 *   would otherwise never advance the loop.
 */
function chunk<T>(arr: T[], size: number): T[][] {
  // Written as a negated comparison so that NaN is rejected as well.
  if (!(size >= 1)) {
    throw new RangeError(`chunk size must be >= 1, got ${size}`);
  }
  const result: T[][] = [];
  for (let i = 0; i < arr.length; i += size) {
    result.push(arr.slice(i, i + size));
  }
  return result;
}
|
|
460
|
+
|
|
461
|
+
/** Drop chunks whose id was already seen; the first occurrence wins and order is kept. */
function dedup(chunks: TextChunk[]): TextChunk[] {
  // Map iteration follows insertion order, i.e. the order of first occurrence.
  const firstById = new Map<string, TextChunk>();
  for (const candidate of chunks) {
    if (!firstById.has(candidate.id)) {
      firstById.set(candidate.id, candidate);
    }
  }
  return [...firstById.values()];
}
|
|
470
|
+
|
|
471
|
+
/** Remove duplicate strings, keeping each value at the position of its first occurrence. */
function unique(arr: string[]): string[] {
  const distinct = new Set<string>(arr);
  return Array.from(distinct);
}
|
|
475
|
+
|
|
476
|
+
/**
 * Group TextChunks by file_path.
 *
 * @param chunks - Chunks to group; their relative order is kept inside each group.
 * @returns A prototype-less object mapping each file path to its chunks. Having no
 *   prototype means paths such as "constructor" or "__proto__" are ordinary keys
 *   instead of resolving to inherited Object.prototype members.
 */
function groupByFilePath(chunks: TextChunk[]): Record<string, TextChunk[]> {
  const map: Record<string, TextChunk[]> = Object.create(null);
  for (const c of chunks) {
    (map[c.file_path] ??= []).push(c);
  }
  return map;
}
|
|
@@ -0,0 +1,190 @@
|
|
|
1
|
+
import type { ProjectMap, SearchResult } from '../types/index.js';
|
|
2
|
+
|
|
3
|
+
// ─── LexicalSearcher ─────────────────────────────────────────────────────────
|
|
4
|
+
|
|
5
|
+
/**
|
|
6
|
+
* Keyword-based search for files that are not yet vector-indexed.
|
|
7
|
+
* Scores files based on term matches in file path, symbol names,
|
|
8
|
+
* symbol signatures, and import sources.
|
|
9
|
+
*
|
|
10
|
+
* Used as a fallback when the vector index is unavailable or incomplete,
|
|
11
|
+
* enabling `arc search` to work immediately after `arc map --no-ai`.
|
|
12
|
+
*/
|
|
13
|
+
export class LexicalSearcher {
|
|
14
|
+
|
|
15
|
+
/**
 * Keyword search over a subset of the project map's files.
 *
 * Each listed file is scored against the tokenized query. A file scoring below
 * `threshold` is skipped together with its symbols; otherwise the file is added
 * and its symbols are scored individually.
 *
 * @param query - The user's search query string
 * @param projectMap - The full project map
 * @param filePaths - Set of file paths to search (e.g., files not in vector index)
 * @param topK - Maximum results to return
 * @param threshold - Minimum score a file or symbol needs to be included
 * @returns Up to `topK` results, highest score first; empty when the query yields no terms.
 */
search(
  query: string,
  projectMap: ProjectMap,
  filePaths: Set<string>,
  topK: number,
  threshold: number,
): SearchResult[] {
  const terms = this.tokenize(query);
  if (terms.length === 0) return [];

  const coreModules = new Set(projectMap.stats.core_modules);
  const hits: SearchResult[] = [];

  for (const fp of filePaths) {
    const node = projectMap.files[fp];
    if (!node) continue;

    const fileScore = this.scoreFile(terms, fp, node);
    if (fileScore < threshold) continue;

    // File-level fields repeated on every hit produced for this file.
    const purpose = node.semantic?.purpose ?? fp;
    const graphDepth = node.depth;
    const dependentsCount = node.dependents.length;
    const isCoreModule = coreModules.has(fp);

    hits.push({
      id: fp,
      type: 'file',
      file_path: fp,
      symbol_name: null,
      symbol_type: null,
      line_start: null,
      line_end: null,
      purpose,
      similarity_score: fileScore,
      graph_depth: graphDepth,
      dependents_count: dependentsCount,
      is_core_module: isCoreModule,
      is_stale: false,
    });

    // Symbols are considered when exported, or when they are classes or functions.
    for (const symbol of node.symbols) {
      const eligible = symbol.exported || symbol.kind === 'class' || symbol.kind === 'function';
      if (!eligible) continue;

      const symbolScore = this.scoreSymbol(terms, fp, symbol.name, symbol.kind, symbol.signature);
      if (symbolScore < threshold) continue;

      hits.push({
        id: `${fp}::${symbol.name}`,
        type: 'symbol',
        file_path: fp,
        symbol_name: symbol.name,
        symbol_type: symbol.kind,
        line_start: symbol.line,
        line_end: symbol.end_line,
        purpose,
        similarity_score: symbolScore,
        graph_depth: graphDepth,
        dependents_count: dependentsCount,
        is_core_module: isCoreModule,
        is_stale: false,
      });
    }
  }

  // Highest score first, then cut to the requested size.
  hits.sort((left, right) => right.similarity_score - left.similarity_score);
  return hits.slice(0, topK);
}
|
|
89
|
+
|
|
90
|
+
// ─── Private ───────────────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
/**
 * Tokenize a query into unique lowercase terms.
 *
 * The text is split on whitespace, then on common code separators
 * (underscores, hyphens, dots, slashes), and finally on camelCase
 * boundaries (a lowercase letter followed by an uppercase letter), e.g.
 * "getAccessToken" → ["get", "access", "token"].
 *
 * @param query - Raw query text or identifier to break into terms.
 * @returns De-duplicated lowercase terms in first-seen order; an empty array
 *          when the input contains nothing but whitespace and separators.
 */
private tokenize(query: string): string[] {
  const terms = new Set<string>();

  // Case must be preserved until after the camelCase split: lowercasing the
  // query up front erases the lower→upper boundaries the split relies on.
  for (const word of query.split(/\s+/)) {
    // Split on separators: _ - . /
    for (const part of word.split(/[_\-./]+/)) {
      // Split camelCase: "getAccessToken" → "get Access Token"
      const pieces = part.replace(/([a-z])([A-Z])/g, '$1 $2').split(' ');
      for (const piece of pieces) {
        // Empty pieces come from leading/trailing whitespace or separators.
        if (piece) terms.add(piece.toLowerCase());
      }
    }
  }

  // A Set iterates in insertion order, so terms keep their first-seen order.
  return [...terms];
}
|
|
114
|
+
|
|
115
|
+
/**
 * Score a file against the query terms.
 *
 * The file's searchable text is assembled by `buildCorpus` and the result is
 * whatever `computeScore` yields for that text.
 *
 * @param terms - Query terms to look for.
 * @param filePath - Path of the file being scored.
 * @param fileNode - File metadata providing symbol names and import sources.
 * @returns The term-coverage score of the file's corpus.
 */
private scoreFile(
  terms: string[],
  filePath: string,
  fileNode: { symbols: Array<{ name: string }>; imports: Array<{ source: string }> },
): number {
  return this.computeScore(terms, this.buildCorpus(filePath, fileNode));
}
|
|
128
|
+
|
|
129
|
+
/**
 * Score a symbol against the query terms.
 *
 * The base score is the term coverage over the lowercased file path, symbol
 * name, kind and signature. A boost of up to 0.1 is added in proportion to
 * the share of query terms that equal a token of the symbol name, and the
 * total is clamped to 1.
 *
 * @param terms - Query terms to look for.
 * @param filePath - Path of the file that declares the symbol.
 * @param symbolName - Name of the symbol.
 * @param kind - Symbol kind.
 * @param signature - Symbol signature, or null when there is none.
 * @returns The boosted score, never greater than 1.
 */
private scoreSymbol(
  terms: string[],
  filePath: string,
  symbolName: string,
  kind: string,
  signature: string | null,
): number {
  const corpus = [filePath, symbolName, kind, signature ?? '']
    .map(text => text.toLowerCase())
    .join(' ');
  const baseScore = this.computeScore(terms, corpus);

  // Count the query terms that are exactly one of the symbol name's tokens.
  const nameTokens = new Set(this.tokenize(symbolName));
  const nameOverlap = terms.reduce(
    (count, term) => (nameTokens.has(term) ? count + 1 : count),
    0,
  );

  // No overlap means no boost; this guard also avoids dividing by an empty
  // term list.
  if (nameOverlap === 0) {
    return Math.min(baseScore, 1);
  }

  const nameBoost = 0.1 * Math.min(nameOverlap / terms.length, 1);
  return Math.min(baseScore + nameBoost, 1);
}
|
|
153
|
+
|
|
154
|
+
/**
 * Assemble the searchable text for a file.
 *
 * @param filePath - Path of the file.
 * @param fileNode - File metadata providing symbol names and import sources.
 * @returns The lowercased file path, symbol names and import sources, in that
 *          order, joined by single spaces.
 */
private buildCorpus(
  filePath: string,
  fileNode: { symbols: Array<{ name: string }>; imports: Array<{ source: string }> },
): string {
  const symbolNames = fileNode.symbols.map(({ name }) => name);
  const importSources = fileNode.imports.map(({ source }) => source);

  return [filePath, ...symbolNames, ...importSources]
    .map(text => text.toLowerCase())
    .join(' ');
}
|
|
168
|
+
|
|
169
|
+
/**
 * Compute a term-coverage score for a corpus.
 *
 * A term counts as matched when it occurs anywhere in the corpus as a
 * substring. The score is the matched fraction of terms scaled by 0.85, so
 * results fall in [0, 0.85]; the cap keeps lexical scores below the range
 * left for vector results.
 *
 * @param terms - Query terms to look for.
 * @param corpus - Text to search; callers pass it already lowercased.
 * @returns 0 when there are no terms, otherwise (matched / total) * 0.85.
 */
private computeScore(terms: string[], corpus: string): number {
  const total = terms.length;
  if (total === 0) return 0;

  const hits = terms.filter(term => corpus.includes(term)).length;

  // Fraction of terms found, scaled into the lexical range [0, 0.85].
  return (hits / total) * 0.85;
}
|
|
190
|
+
}
|