@codragraph/cli 1.6.4 → 2.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -0
- package/dist/_shared/cgdb/schema-constants.d.ts +16 -0
- package/dist/_shared/cgdb/schema-constants.d.ts.map +1 -0
- package/dist/_shared/cgdb/schema-constants.js +67 -0
- package/dist/_shared/cgdb/schema-constants.js.map +1 -0
- package/dist/_shared/index.d.ts +2 -2
- package/dist/_shared/index.js +1 -1
- package/dist/cli/analyze.d.ts +22 -0
- package/dist/cli/analyze.js +109 -6
- package/dist/cli/compress-stats.d.ts +29 -0
- package/dist/cli/compress-stats.js +97 -0
- package/dist/cli/graphstore.d.ts +6 -2
- package/dist/cli/graphstore.js +45 -23
- package/dist/cli/index-repo.js +3 -3
- package/dist/cli/index.js +16 -2
- package/dist/cli/profile-heap.d.ts +35 -0
- package/dist/cli/profile-heap.js +126 -0
- package/dist/cli/setup.d.ts +13 -0
- package/dist/cli/setup.js +22 -11
- package/dist/cli/skill-gen.d.ts +14 -2
- package/dist/cli/skill-gen.js +52 -19
- package/dist/cli/tool.js +4 -0
- package/dist/cli/wiki.js +3 -3
- package/dist/core/augmentation/engine.js +7 -7
- package/dist/core/cgdb/cgdb-adapter.d.ts +176 -0
- package/dist/core/cgdb/cgdb-adapter.js +1320 -0
- package/dist/core/cgdb/content-read.d.ts +46 -0
- package/dist/core/cgdb/content-read.js +64 -0
- package/dist/core/cgdb/csv-generator.d.ts +29 -0
- package/dist/core/cgdb/csv-generator.js +492 -0
- package/dist/core/cgdb/pool-adapter.d.ts +93 -0
- package/dist/core/cgdb/pool-adapter.js +550 -0
- package/dist/core/cgdb/schema.d.ts +62 -0
- package/dist/core/cgdb/schema.js +502 -0
- package/dist/core/embeddings/embedding-pipeline.js +27 -10
- package/dist/core/graphstore/cgdb-row-source.d.ts +19 -0
- package/dist/core/graphstore/cgdb-row-source.js +141 -0
- package/dist/core/graphstore/index.d.ts +1 -1
- package/dist/core/graphstore/index.js +3 -3
- package/dist/core/group/bridge-db.d.ts +2 -2
- package/dist/core/group/bridge-db.js +123 -36
- package/dist/core/group/bridge-schema.d.ts +4 -4
- package/dist/core/group/bridge-schema.js +4 -4
- package/dist/core/group/cross-impact.js +3 -3
- package/dist/core/group/sync.js +4 -4
- package/dist/core/lbug/content-read.d.ts +46 -0
- package/dist/core/lbug/content-read.js +64 -0
- package/dist/core/lbug/csv-generator.d.ts +2 -6
- package/dist/core/lbug/csv-generator.js +45 -12
- package/dist/core/lbug/lbug-adapter.d.ts +4 -1
- package/dist/core/lbug/lbug-adapter.js +153 -21
- package/dist/core/lbug/schema.d.ts +7 -7
- package/dist/core/lbug/schema.js +18 -0
- package/dist/core/run-analyze.d.ts +13 -0
- package/dist/core/run-analyze.js +114 -27
- package/dist/core/search/bm25-index.d.ts +3 -3
- package/dist/core/search/bm25-index.js +75 -23
- package/dist/core/search/hybrid-search.js +2 -2
- package/dist/core/wiki/generator.d.ts +2 -2
- package/dist/core/wiki/generator.js +4 -4
- package/dist/core/wiki/graph-queries.d.ts +2 -2
- package/dist/core/wiki/graph-queries.js +5 -5
- package/dist/mcp/core/cgdb-adapter.d.ts +5 -0
- package/dist/mcp/core/cgdb-adapter.js +5 -0
- package/dist/mcp/core/embedder.js +1 -1
- package/dist/mcp/local/local-backend.d.ts +2 -2
- package/dist/mcp/local/local-backend.js +36 -19
- package/dist/mcp/server.js +3 -3
- package/dist/mcp/tools.js +1 -1
- package/dist/server/analyze-worker.js +2 -2
- package/dist/server/api.js +34 -33
- package/dist/storage/repo-manager.d.ts +42 -3
- package/dist/storage/repo-manager.js +23 -4
- package/hooks/claude/codragraph-hook.cjs +98 -5
- package/package.json +4 -4
- package/scripts/build-tree-sitter-proto.cjs +15 -3
- package/scripts/build.js +8 -9
- package/scripts/patch-tree-sitter-swift.cjs +17 -4
- package/skills/codragraph-api-surface.md +110 -0
- package/skills/codragraph-config-audit.md +146 -0
- package/skills/codragraph-cross-repo-impact.md +135 -0
- package/skills/codragraph-data-lineage.md +137 -0
- package/skills/codragraph-dead-code.md +119 -0
- package/skills/codragraph-gh-actions-debug.md +162 -0
- package/skills/codragraph-gh-issue-workflow.md +178 -0
- package/skills/codragraph-gh-pr-workflow.md +176 -0
- package/skills/codragraph-gh-release-workflow.md +187 -0
- package/skills/codragraph-git-bisect.md +176 -0
- package/skills/codragraph-git-force-push.md +147 -0
- package/skills/codragraph-git-history-rewrite.md +174 -0
- package/skills/codragraph-git-rebase-vs-merge.md +138 -0
- package/skills/codragraph-git-recovery.md +181 -0
- package/skills/codragraph-git-worktree.md +145 -0
- package/skills/codragraph-migration-tracking.md +130 -0
- package/skills/codragraph-notebook-context.md +136 -0
- package/skills/codragraph-observability-coverage.md +125 -0
- package/skills/codragraph-onboarding.md +129 -0
- package/skills/codragraph-perf-hotspots.md +132 -0
- package/skills/codragraph-project-switcher.md +116 -0
- package/skills/codragraph-security-audit.md +144 -0
- package/skills/codragraph-sql-tracing.md +122 -0
- package/skills/codragraph-supply-chain-audit.md +153 -0
- package/skills/codragraph-test-coverage.md +97 -0
- package/vendor/tree-sitter-proto/bindings/node/index.js +3 -3
- package/vendor/tree-sitter-proto/src/node-types.json +1 -1
|
@@ -8,6 +8,7 @@
|
|
|
8
8
|
* IMPORTANT: This module must NEVER call process.exit(). The caller (CLI
|
|
9
9
|
* wrapper or server worker) is responsible for process lifecycle.
|
|
10
10
|
*/
|
|
11
|
+
import type { ContentEncoding } from '@codragraph/graphstore';
|
|
11
12
|
export interface AnalyzeCallbacks {
|
|
12
13
|
onProgress: (phase: string, percent: number, message: string) => void;
|
|
13
14
|
onLog?: (message: string) => void;
|
|
@@ -41,6 +42,18 @@ export interface AnalyzeOptions {
|
|
|
41
42
|
* of a pipeline re-index.
|
|
42
43
|
*/
|
|
43
44
|
allowDuplicateName?: boolean;
|
|
45
|
+
/**
|
|
46
|
+
* RFC 0001 Phase 2 — opt into per-row content compression. `'none'`
|
|
47
|
+
* (or undefined) writes plain text and the schema-default tag, exactly
|
|
48
|
+
* as pre-Phase-2 indexes do. `'brotli'` and `'zstd'` route every
|
|
49
|
+
* content field through `encodeContent` before it hits the CSV; the
|
|
50
|
+
* read path decodes via the per-row `contentEncoding` tag.
|
|
51
|
+
*
|
|
52
|
+
* Choosing `'zstd'` requires Node ≥ 22.15 on the indexer (the runtime
|
|
53
|
+
* that wrote the rows). Readers on older Node will get a clear
|
|
54
|
+
* forward-compat error rather than silently bad content.
|
|
55
|
+
*/
|
|
56
|
+
compress?: ContentEncoding;
|
|
44
57
|
}
|
|
45
58
|
export interface AnalyzeResult {
|
|
46
59
|
repoName: string;
|
package/dist/core/run-analyze.js
CHANGED
|
@@ -10,14 +10,16 @@
|
|
|
10
10
|
*/
|
|
11
11
|
import path from 'path';
|
|
12
12
|
import fs from 'fs/promises';
|
|
13
|
+
import * as fsSync from 'node:fs';
|
|
14
|
+
import * as v8 from 'node:v8';
|
|
13
15
|
import { runPipelineFromRepo } from './ingestion/pipeline.js';
|
|
14
|
-
import {
|
|
15
|
-
import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, } from '../storage/repo-manager.js';
|
|
16
|
+
import { initCgdb, loadGraphToCgdb, getCgdbStats, executeQuery, executeWithReusedStatement, closeCgdb, loadCachedEmbeddings, } from './cgdb/cgdb-adapter.js';
|
|
17
|
+
import { getStoragePaths, saveMeta, loadMeta, addToGitignore, registerRepo, cleanupOldKuzuFiles, INDEX_SCHEMA_VERSION, } from '../storage/repo-manager.js';
|
|
16
18
|
import { getCurrentCommit, getRemoteUrl, hasGitDir, getInferredRepoName } from '../storage/git.js';
|
|
17
19
|
import { recordAnalysisSnapshot } from './graphstore/index.js';
|
|
18
20
|
import { generateAIContextFiles } from '../cli/ai-context.js';
|
|
19
|
-
import { EMBEDDING_TABLE_NAME } from './
|
|
20
|
-
import { STALE_HASH_SENTINEL } from './
|
|
21
|
+
import { EMBEDDING_TABLE_NAME } from './cgdb/schema.js';
|
|
22
|
+
import { STALE_HASH_SENTINEL } from './cgdb/schema.js';
|
|
21
23
|
/** Threshold: auto-skip embeddings for repos with more nodes than this */
|
|
22
24
|
const EMBEDDING_NODE_LIMIT = 50_000;
|
|
23
25
|
export const PHASE_LABELS = {
|
|
@@ -30,7 +32,7 @@ export const PHASE_LABELS = {
|
|
|
30
32
|
communities: 'Detecting communities',
|
|
31
33
|
processes: 'Detecting processes',
|
|
32
34
|
complete: 'Pipeline complete',
|
|
33
|
-
|
|
35
|
+
cgdb: 'Loading into LadybugDB',
|
|
34
36
|
fts: 'Creating search indexes',
|
|
35
37
|
embeddings: 'Generating embeddings',
|
|
36
38
|
done: 'Done',
|
|
@@ -51,8 +53,71 @@ export const PHASE_LABELS = {
|
|
|
51
53
|
*/
|
|
52
54
|
export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
53
55
|
const log = (msg) => callbacks.onLog?.(msg);
|
|
54
|
-
|
|
55
|
-
|
|
56
|
+
// RFC 0002 Phase 1 — optional heap-profile instrumentation. Set
|
|
57
|
+
// CODRAGRAPH_HEAP_PROFILE=1 (or run `codragraph profile-heap`) to write a
|
|
58
|
+
// v8 heap snapshot at every phase boundary, plus a `profile-summary.jsonl`
|
|
59
|
+
// log of `process.memoryUsage()` at the same boundaries. Snapshots land in
|
|
60
|
+
// `<repo>/.codragraph/heap-profiles/`. Open snapshots in Chrome DevTools
|
|
61
|
+
// (Memory → Load) to find which constructors dominate retained set; the
|
|
62
|
+
// JSONL is the cheap RSS / heapUsed timeline. Off by default — snapshot
|
|
63
|
+
// writes pause the event loop ~2-5s and consume ~100-500MB of disk each.
|
|
64
|
+
const heapProfileEnabled = process.env.CODRAGRAPH_HEAP_PROFILE === '1';
|
|
65
|
+
let heapProfileDir = '';
|
|
66
|
+
let heapProfileSummaryPath = '';
|
|
67
|
+
let lastProfilePhase = '';
|
|
68
|
+
if (heapProfileEnabled) {
|
|
69
|
+
heapProfileDir = path.join(repoPath, '.codragraph', 'heap-profiles');
|
|
70
|
+
heapProfileSummaryPath = path.join(heapProfileDir, 'profile-summary.jsonl');
|
|
71
|
+
try {
|
|
72
|
+
fsSync.mkdirSync(heapProfileDir, { recursive: true });
|
|
73
|
+
// Truncate any prior summary so a single run produces a clean log.
|
|
74
|
+
// We append crash-safely on each phase boundary below.
|
|
75
|
+
fsSync.writeFileSync(heapProfileSummaryPath, '');
|
|
76
|
+
}
|
|
77
|
+
catch {
|
|
78
|
+
/* permission issue — best-effort */
|
|
79
|
+
}
|
|
80
|
+
}
|
|
81
|
+
const progress = (phase, percent, message) => {
|
|
82
|
+
callbacks.onProgress(phase, percent, message);
|
|
83
|
+
// Only snapshot on phase transitions, not every tick. Phase strings come
|
|
84
|
+
// from runPipelineFromRepo / loadGraphToCgdb and are stable.
|
|
85
|
+
if (heapProfileEnabled && phase && phase !== lastProfilePhase) {
|
|
86
|
+
lastProfilePhase = phase;
|
|
87
|
+
const ts = Date.now();
|
|
88
|
+
const safe = phase.replace(/[^a-zA-Z0-9]+/g, '_').slice(0, 60);
|
|
89
|
+
const file = path.join(heapProfileDir, `${ts}-${safe}.heapsnapshot`);
|
|
90
|
+
// Capture the cheap memoryUsage timeline FIRST — even if writeHeapSnapshot
|
|
91
|
+
// crashes (out of disk, permissions), we still have the RSS curve which
|
|
92
|
+
// is the more useful artifact for the heap-pressure RFC.
|
|
93
|
+
try {
|
|
94
|
+
const mu = process.memoryUsage();
|
|
95
|
+
const entry = JSON.stringify({
|
|
96
|
+
ts,
|
|
97
|
+
phase,
|
|
98
|
+
percent,
|
|
99
|
+
rss: mu.rss,
|
|
100
|
+
heapUsed: mu.heapUsed,
|
|
101
|
+
heapTotal: mu.heapTotal,
|
|
102
|
+
external: mu.external,
|
|
103
|
+
arrayBuffers: mu.arrayBuffers,
|
|
104
|
+
snapshotFile: path.basename(file),
|
|
105
|
+
});
|
|
106
|
+
fsSync.appendFileSync(heapProfileSummaryPath, entry + '\n');
|
|
107
|
+
}
|
|
108
|
+
catch (err) {
|
|
109
|
+
log(`heap-profile: summary append failed (${err.message})`);
|
|
110
|
+
}
|
|
111
|
+
try {
|
|
112
|
+
v8.writeHeapSnapshot(file);
|
|
113
|
+
log(`heap-profile: wrote ${file}`);
|
|
114
|
+
}
|
|
115
|
+
catch (err) {
|
|
116
|
+
log(`heap-profile: write failed (${err.message})`);
|
|
117
|
+
}
|
|
118
|
+
}
|
|
119
|
+
};
|
|
120
|
+
const { storagePath, cgdbPath } = getStoragePaths(repoPath);
|
|
56
121
|
// Clean up stale KuzuDB files from before the LadybugDB migration.
|
|
57
122
|
const kuzuResult = await cleanupOldKuzuFiles(storagePath);
|
|
58
123
|
if (kuzuResult.found && kuzuResult.needsReindex) {
|
|
@@ -62,7 +127,17 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
62
127
|
const currentCommit = repoHasGit ? getCurrentCommit(repoPath) : '';
|
|
63
128
|
const existingMeta = await loadMeta(storagePath);
|
|
64
129
|
// ── Early-return: already up to date ──────────────────────────────
|
|
65
|
-
|
|
130
|
+
// Schema-version mismatch forces a full re-analyze regardless of commit
|
|
131
|
+
// equality: existing 1.7.x indexes have no `schemaVersion` field at all,
|
|
132
|
+
// and 1.8+ readers expect every node table to carry a `contentEncoding`
|
|
133
|
+
// column (RFC 0001 Phase 2). LadybugDB ALTER on existing tables is not
|
|
134
|
+
// validated end-to-end yet, so the supported migration path is
|
|
135
|
+
// re-analyze → fresh CREATE NODE TABLE.
|
|
136
|
+
const schemaUpToDate = !!existingMeta && (existingMeta.schemaVersion ?? 0) >= INDEX_SCHEMA_VERSION;
|
|
137
|
+
if (existingMeta &&
|
|
138
|
+
schemaUpToDate &&
|
|
139
|
+
!options.force &&
|
|
140
|
+
existingMeta.lastCommit === currentCommit) {
|
|
66
141
|
// Non-git folders have currentCommit = '' — always rebuild since we can't detect changes
|
|
67
142
|
if (currentCommit !== '') {
|
|
68
143
|
return {
|
|
@@ -73,21 +148,26 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
73
148
|
};
|
|
74
149
|
}
|
|
75
150
|
}
|
|
151
|
+
if (existingMeta && !schemaUpToDate) {
|
|
152
|
+
log(`Index schema version ${existingMeta.schemaVersion ?? '<missing>'} is older than ` +
|
|
153
|
+
`${INDEX_SCHEMA_VERSION} (RFC 0001 Phase 2 — adds contentEncoding column). ` +
|
|
154
|
+
`Re-analyzing.`);
|
|
155
|
+
}
|
|
76
156
|
// ── Cache embeddings from existing index before rebuild ────────────
|
|
77
157
|
let cachedEmbeddingNodeIds = new Set();
|
|
78
158
|
let cachedEmbeddings = [];
|
|
79
159
|
if (options.embeddings && existingMeta && !options.force) {
|
|
80
160
|
try {
|
|
81
161
|
progress('embeddings', 0, 'Caching embeddings...');
|
|
82
|
-
await
|
|
162
|
+
await initCgdb(cgdbPath);
|
|
83
163
|
const cached = await loadCachedEmbeddings();
|
|
84
164
|
cachedEmbeddingNodeIds = cached.embeddingNodeIds;
|
|
85
165
|
cachedEmbeddings = cached.embeddings;
|
|
86
|
-
await
|
|
166
|
+
await closeCgdb();
|
|
87
167
|
}
|
|
88
168
|
catch {
|
|
89
169
|
try {
|
|
90
|
-
await
|
|
170
|
+
await closeCgdb();
|
|
91
171
|
}
|
|
92
172
|
catch {
|
|
93
173
|
/* swallow */
|
|
@@ -101,10 +181,10 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
101
181
|
progress(p.phase, scaled, phaseLabel);
|
|
102
182
|
});
|
|
103
183
|
// ── Phase 2: LadybugDB (60–85%) ──────────────────────────────────
|
|
104
|
-
progress('
|
|
105
|
-
await
|
|
106
|
-
const
|
|
107
|
-
for (const f of
|
|
184
|
+
progress('cgdb', 60, 'Loading into LadybugDB...');
|
|
185
|
+
await closeCgdb();
|
|
186
|
+
const cgdbFiles = [cgdbPath, `${cgdbPath}.wal`, `${cgdbPath}.lock`];
|
|
187
|
+
for (const f of cgdbFiles) {
|
|
108
188
|
try {
|
|
109
189
|
await fs.rm(f, { recursive: true, force: true });
|
|
110
190
|
}
|
|
@@ -112,17 +192,22 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
112
192
|
/* swallow */
|
|
113
193
|
}
|
|
114
194
|
}
|
|
115
|
-
await
|
|
195
|
+
await initCgdb(cgdbPath);
|
|
116
196
|
try {
|
|
117
|
-
// All work after
|
|
197
|
+
// All work after initCgdb is wrapped in try/finally to ensure closeCgdb()
|
|
118
198
|
// is called even if an error occurs — the module-level singleton DB handle
|
|
119
199
|
// must be released to avoid blocking subsequent invocations.
|
|
120
|
-
let
|
|
121
|
-
await
|
|
122
|
-
|
|
123
|
-
const pct = Math.min(84, 60 + Math.round((
|
|
124
|
-
progress('
|
|
125
|
-
}
|
|
200
|
+
let cgdbMsgCount = 0;
|
|
201
|
+
await loadGraphToCgdb(pipelineResult.graph, pipelineResult.repoPath, storagePath, (msg) => {
|
|
202
|
+
cgdbMsgCount++;
|
|
203
|
+
const pct = Math.min(84, 60 + Math.round((cgdbMsgCount / (cgdbMsgCount + 10)) * 24));
|
|
204
|
+
progress('cgdb', pct, msg);
|
|
205
|
+
},
|
|
206
|
+
// RFC 0001 Phase 2: when --compress is set, every content row goes
|
|
207
|
+
// through encodeContent before hitting the CSV. Default 'none' is
|
|
208
|
+
// a true passthrough, so the on-disk layout is byte-identical to
|
|
209
|
+
// pre-Phase-2 indexes when no compression flag is passed.
|
|
210
|
+
{ compress: options.compress });
|
|
126
211
|
// ── Phase 2.5: Versioned-graph snapshot (best-effort) ────────────
|
|
127
212
|
// Phase 4 hook: snapshot the freshly-loaded graph into the
|
|
128
213
|
// content-addressed `.codragraph/graphstore/`. Failures here do NOT
|
|
@@ -159,7 +244,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
159
244
|
// ── Phase 3.5: Re-insert cached embeddings ────────────────────────
|
|
160
245
|
if (cachedEmbeddings.length > 0) {
|
|
161
246
|
const cachedDims = cachedEmbeddings[0].embedding.length;
|
|
162
|
-
const { EMBEDDING_DIMS } = await import('./
|
|
247
|
+
const { EMBEDDING_DIMS } = await import('./cgdb/schema.js');
|
|
163
248
|
if (cachedDims !== EMBEDDING_DIMS) {
|
|
164
249
|
// Dimensions changed (e.g. switched embedding model) — discard cache and re-embed all
|
|
165
250
|
log(`Embedding dimensions changed (${cachedDims}d -> ${EMBEDDING_DIMS}d), discarding cache`);
|
|
@@ -182,7 +267,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
182
267
|
}
|
|
183
268
|
}
|
|
184
269
|
// ── Phase 4: Embeddings (90–98%) ──────────────────────────────────
|
|
185
|
-
const stats = await
|
|
270
|
+
const stats = await getCgdbStats();
|
|
186
271
|
let embeddingSkipped = true;
|
|
187
272
|
if (options.embeddings) {
|
|
188
273
|
if (stats.nodes <= EMBEDDING_NODE_LIMIT) {
|
|
@@ -230,6 +315,8 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
230
315
|
repoPath,
|
|
231
316
|
lastCommit: currentCommit,
|
|
232
317
|
indexedAt: new Date().toISOString(),
|
|
318
|
+
schemaVersion: INDEX_SCHEMA_VERSION,
|
|
319
|
+
compress: options.compress ?? 'none',
|
|
233
320
|
// Captured here (not at registration) so it travels with the
|
|
234
321
|
// on-disk meta.json — sibling-clone fingerprinting works for
|
|
235
322
|
// out-of-tree consumers (group-status, future tooling) without
|
|
@@ -293,7 +380,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
293
380
|
// Best-effort — don't fail the entire analysis for context file issues
|
|
294
381
|
}
|
|
295
382
|
// ── Close LadybugDB ──────────────────────────────────────────────
|
|
296
|
-
await
|
|
383
|
+
await closeCgdb();
|
|
297
384
|
progress('done', 100, 'Done');
|
|
298
385
|
return {
|
|
299
386
|
repoName: projectName,
|
|
@@ -305,7 +392,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
|
|
|
305
392
|
catch (err) {
|
|
306
393
|
// Ensure LadybugDB is closed even on error
|
|
307
394
|
try {
|
|
308
|
-
await
|
|
395
|
+
await closeCgdb();
|
|
309
396
|
}
|
|
310
397
|
catch {
|
|
311
398
|
/* swallow */
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* Always reads from the database (no cached state to drift).
|
|
6
6
|
*
|
|
7
7
|
* FTS indexes are created lazily on first query (via `ensureFTSIndex`) — see
|
|
8
|
-
* `
|
|
8
|
+
* `cgdb-adapter.ts` for the rationale. This keeps `analyze` fast (the
|
|
9
9
|
* ~440 ms × 5 LadybugDB CREATE_FTS_INDEX cost dominates pipeline time on
|
|
10
10
|
* small repos / CI runners) at the cost of paying that overhead on the
|
|
11
11
|
* first `query`/`context` call in a session.
|
|
@@ -20,7 +20,7 @@ export interface BM25SearchResult {
|
|
|
20
20
|
* Drop all ensured-FTS cache entries for a given repoId.
|
|
21
21
|
*
|
|
22
22
|
* Called from the pool-close listener so that a pool teardown / recreation
|
|
23
|
-
* forces the next `
|
|
23
|
+
* forces the next `searchFTSFromCgdb` call to re-issue `CREATE_FTS_INDEX`
|
|
24
24
|
* against the fresh connection rather than trust stale ensure-state from a
|
|
25
25
|
* previous pool lifetime.
|
|
26
26
|
*
|
|
@@ -38,4 +38,4 @@ export declare function invalidateEnsuredFTSForRepo(repoId: string): void;
|
|
|
38
38
|
* @param repoId - If provided, queries will be routed via the MCP connection pool
|
|
39
39
|
* @returns Ranked search results from FTS indexes
|
|
40
40
|
*/
|
|
41
|
-
export declare const
|
|
41
|
+
export declare const searchFTSFromCgdb: (query: string, limit?: number, repoId?: string) => Promise<BM25SearchResult[]>;
|
|
@@ -5,24 +5,63 @@
|
|
|
5
5
|
* Always reads from the database (no cached state to drift).
|
|
6
6
|
*
|
|
7
7
|
* FTS indexes are created lazily on first query (via `ensureFTSIndex`) — see
|
|
8
|
-
* `
|
|
8
|
+
* `cgdb-adapter.ts` for the rationale. This keeps `analyze` fast (the
|
|
9
9
|
* ~440 ms × 5 LadybugDB CREATE_FTS_INDEX cost dominates pipeline time on
|
|
10
10
|
* small repos / CI runners) at the cost of paying that overhead on the
|
|
11
11
|
* first `query`/`context` call in a session.
|
|
12
12
|
*/
|
|
13
|
-
import { queryFTS, ensureFTSIndex, executeQuery as executeCoreQuery, } from '../
|
|
13
|
+
import { queryFTS, ensureFTSIndex, executeQuery as executeCoreQuery, } from '../cgdb/cgdb-adapter.js';
|
|
14
14
|
/**
|
|
15
|
-
* FTS
|
|
16
|
-
* CLI/pipeline path and the MCP pool path
|
|
17
|
-
*
|
|
15
|
+
* FTS table set served by `searchFTSFromCgdb`. Centralised so that both
|
|
16
|
+
* the CLI/pipeline path and the MCP pool path stay in lockstep.
|
|
17
|
+
*
|
|
18
|
+
* The properties list is computed at FTS-create time via `ftsPropertiesFor`
|
|
19
|
+
* — for repos that were analysed with `--compress brotli|zstd`, the
|
|
20
|
+
* `content` column holds base64-of-encoded-bytes and would tokenise to
|
|
21
|
+
* useless tokens. Those repos get name-only FTS so search at least
|
|
22
|
+
* matches function/class names instead of returning random hits on
|
|
23
|
+
* base64 alphabet. Plain (compress='none' / unset) repos get the full
|
|
24
|
+
* `name + content` index for body-text matches. RFC 0001 Phase 2.5.
|
|
18
25
|
*/
|
|
19
|
-
const
|
|
20
|
-
{ table: 'File', indexName: 'file_fts'
|
|
21
|
-
{ table: 'Function', indexName: 'function_fts'
|
|
22
|
-
{ table: 'Class', indexName: 'class_fts'
|
|
23
|
-
{ table: 'Method', indexName: 'method_fts'
|
|
24
|
-
{ table: 'Interface', indexName: 'interface_fts'
|
|
26
|
+
const FTS_TABLES = [
|
|
27
|
+
{ table: 'File', indexName: 'file_fts' },
|
|
28
|
+
{ table: 'Function', indexName: 'function_fts' },
|
|
29
|
+
{ table: 'Class', indexName: 'class_fts' },
|
|
30
|
+
{ table: 'Method', indexName: 'method_fts' },
|
|
31
|
+
{ table: 'Interface', indexName: 'interface_fts' },
|
|
25
32
|
];
|
|
33
|
+
/**
 * Pick the node properties an FTS index (and the fallback scan) should cover.
 *
 * @param compress - Content encoding recorded for the repo; may be unset.
 * @returns `['name', 'content']` when the encoding is unset or `'none'`,
 *          otherwise `['name']` only. A fresh array is returned on every call.
 */
const ftsPropertiesFor = (compress) => {
    // Unset / empty / 'none' all mean the content column is stored as plain text.
    const storedAsPlainText = compress === 'none' || !compress;
    if (storedAsPlainText) {
        return ['name', 'content'];
    }
    // Any other encoding: index names only.
    return ['name'];
};
|
|
34
|
+
/**
 * Look up the `compress` mode recorded in a repo's meta.
 *
 * Two lookup paths:
 *  - `repoId` given (MCP path): scan the registered repos and load the meta
 *    of the entry whose lower-cased name matches. An exact name match always
 *    wins; otherwise the entry with the LONGEST name `base` such that
 *    `repoId` starts with `${base}-` is used (the `${name}-${hash}` collision
 *    form). Preferring the longest prefix keeps a repo named `foo` from
 *    shadowing a sibling named `foo-bar`.
 *  - `repoId` omitted (CLI path): resolve the repo via `findRepo(process.cwd())`.
 *
 * Never throws. Any failure (module load, registry read, missing meta, no
 * matching entry) yields `'none'`, which selects the full `name + content`
 * property list. NOTE(review): for a repo that was actually indexed with
 * compression, that fallback means the encoded content column is searched,
 * so hits may be noisy.
 *
 * @param repoId - Registry-derived repo id; omit on the CLI path.
 * @returns The recorded compress mode, or `'none'` when it cannot be determined.
 */
async function getCompressMode(repoId) {
    try {
        const repoMod = await import('../../storage/repo-manager.js');
        if (!repoId) {
            // CLI path: no id available, resolve the repo from the working directory.
            const repo = await repoMod.findRepo(process.cwd());
            return repo?.meta?.compress ?? 'none';
        }
        const entries = await repoMod.listRegisteredRepos();
        let match;
        let matchBase = '';
        for (const entry of entries) {
            // Skip malformed registry rows instead of aborting the whole lookup.
            if (typeof entry?.name !== 'string') {
                continue;
            }
            const base = entry.name.toLowerCase();
            if (base === repoId) {
                // Exact name match is authoritative; stop searching.
                match = entry;
                break;
            }
            // `${name}-${hash}` collision form: keep the most specific (longest) base.
            if (repoId.startsWith(`${base}-`) &&
                (match === undefined || base.length > matchBase.length)) {
                match = entry;
                matchBase = base;
            }
        }
        if (match === undefined) {
            return 'none';
        }
        const meta = await repoMod.loadMeta(match.storagePath);
        return meta?.compress ?? 'none';
    }
    catch {
        // Best-effort lookup: fall back to the default mode on any error.
        return 'none';
    }
}
|
|
26
65
|
const FALLBACK_SCAN_LIMIT = 50_000;
|
|
27
66
|
const BOOLEAN_QUERY_TOKENS = new Set(['and', 'or', 'not']);
|
|
28
67
|
const FALLBACK_FIELD_WEIGHTS = {
|
|
@@ -33,7 +72,7 @@ const FALLBACK_FIELD_WEIGHTS = {
|
|
|
33
72
|
/**
|
|
34
73
|
* Per-process cache for the MCP pool path: tracks which `(repoId, table)`
|
|
35
74
|
* pairs have been ensured. The CLI/pipeline path gets its own cache inside
|
|
36
|
-
* `
|
|
75
|
+
* `cgdb-adapter.ts` keyed by table/index, scoped to the singleton connection.
|
|
37
76
|
*
|
|
38
77
|
* IMPORTANT: an entry is added ONLY when the index was confirmed to exist
|
|
39
78
|
* (CREATE_FTS_INDEX succeeded, or failed with `'already exists'`). Other
|
|
@@ -41,14 +80,14 @@ const FALLBACK_FIELD_WEIGHTS = {
|
|
|
41
80
|
* unset so the next query retries instead of silently caching the failure.
|
|
42
81
|
*
|
|
43
82
|
* Entries for a given repoId are invalidated when its pool is closed —
|
|
44
|
-
* see the `addPoolCloseListener` registration in `
|
|
83
|
+
* see the `addPoolCloseListener` registration in `searchFTSFromCgdb`.
|
|
45
84
|
*/
|
|
46
85
|
const ensuredPoolFTS = new Set();
|
|
47
86
|
/**
|
|
48
87
|
* Drop all ensured-FTS cache entries for a given repoId.
|
|
49
88
|
*
|
|
50
89
|
* Called from the pool-close listener so that a pool teardown / recreation
|
|
51
|
-
* forces the next `
|
|
90
|
+
* forces the next `searchFTSFromCgdb` call to re-issue `CREATE_FTS_INDEX`
|
|
52
91
|
* against the fresh connection rather than trust stale ensure-state from a
|
|
53
92
|
* previous pool lifetime.
|
|
54
93
|
*
|
|
@@ -184,9 +223,13 @@ async function queryFallbackViaExecutor(executor, tableName, properties, query,
|
|
|
184
223
|
return [];
|
|
185
224
|
}
|
|
186
225
|
}
|
|
187
|
-
async function fallbackSearchAllTables(executor, query, limit
|
|
226
|
+
async function fallbackSearchAllTables(executor, query, limit,
|
|
227
|
+
// Same compress-aware property selection as the FTS path. Default keeps
|
|
228
|
+
// pre-Phase-2 behaviour (`['name', 'content']`) for callers that don't
|
|
229
|
+
// pass a value.
|
|
230
|
+
properties = ['name', 'content']) {
|
|
188
231
|
const results = [];
|
|
189
|
-
for (const { table
|
|
232
|
+
for (const { table } of FTS_TABLES) {
|
|
190
233
|
results.push(await queryFallbackViaExecutor(executor, table, properties, query, limit));
|
|
191
234
|
}
|
|
192
235
|
return results;
|
|
@@ -202,7 +245,7 @@ async function fallbackSearchAllTables(executor, query, limit) {
|
|
|
202
245
|
* @param repoId - If provided, queries will be routed via the MCP connection pool
|
|
203
246
|
* @returns Ranked search results from FTS indexes
|
|
204
247
|
*/
|
|
205
|
-
export const
|
|
248
|
+
export const searchFTSFromCgdb = async (query, limit = 20, repoId) => {
|
|
206
249
|
if (!query.trim() || limit <= 0)
|
|
207
250
|
return [];
|
|
208
251
|
let fileResults, functionResults, classResults, methodResults, interfaceResults;
|
|
@@ -210,7 +253,7 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
|
|
|
210
253
|
// Use MCP connection pool via dynamic import
|
|
211
254
|
// IMPORTANT: FTS queries run sequentially to avoid connection contention.
|
|
212
255
|
// The MCP pool supports multiple connections, but FTS is best run serially.
|
|
213
|
-
const poolMod = await import('../
|
|
256
|
+
const poolMod = await import('../cgdb/pool-adapter.js');
|
|
214
257
|
const { executeQuery, addPoolCloseListener } = poolMod;
|
|
215
258
|
// Register the pool-close listener lazily on first use so a teardown of
|
|
216
259
|
// the pool entry (LRU eviction, idle timeout, explicit close) drops the
|
|
@@ -220,7 +263,12 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
|
|
|
220
263
|
const executor = (cypher) => executeQuery(repoId, cypher);
|
|
221
264
|
// Lazy-create FTS indexes on first query for this repo (analyze no longer
|
|
222
265
|
// creates them up-front, so we ensure them here). Cached per-process.
|
|
223
|
-
|
|
266
|
+
// RFC 0001 Phase 2.5: drop `content` from FTS properties for repos
|
|
267
|
+
// analysed with --compress brotli|zstd — the column holds encoded
|
|
268
|
+
// bytes and would tokenise to garbage.
|
|
269
|
+
const compress = await getCompressMode(repoId);
|
|
270
|
+
const properties = ftsPropertiesFor(compress);
|
|
271
|
+
for (const { table, indexName } of FTS_TABLES) {
|
|
224
272
|
await ensureFTSIndexViaExecutor(executor, repoId, table, indexName, properties);
|
|
225
273
|
}
|
|
226
274
|
fileResults = await queryFTSViaExecutor(executor, 'File', 'file_fts', query, limit);
|
|
@@ -235,13 +283,17 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
|
|
|
235
283
|
interfaceResults.length ===
|
|
236
284
|
0) {
|
|
237
285
|
[fileResults, functionResults, classResults, methodResults, interfaceResults] =
|
|
238
|
-
await fallbackSearchAllTables(executor, query, limit);
|
|
286
|
+
await fallbackSearchAllTables(executor, query, limit, properties);
|
|
239
287
|
}
|
|
240
288
|
}
|
|
241
289
|
else {
|
|
242
|
-
// Use core
|
|
290
|
+
// Use core cgdb adapter (CLI / pipeline context) — also sequential for safety.
|
|
243
291
|
// Lazy-create FTS indexes on first query (analyze no longer does it).
|
|
244
|
-
|
|
292
|
+
// RFC 0001 Phase 2.5 — same `compress`-aware property selection as the MCP
|
|
293
|
+
// path; the CLI walks up from cwd to find the repo's meta.json.
|
|
294
|
+
const compress = await getCompressMode();
|
|
295
|
+
const properties = ftsPropertiesFor(compress);
|
|
296
|
+
for (const { table, indexName } of FTS_TABLES) {
|
|
245
297
|
await ensureFTSIndex(table, indexName, [...properties]).catch(() => { });
|
|
246
298
|
}
|
|
247
299
|
fileResults = await queryFTS('File', 'file_fts', query, limit, false).catch(() => []);
|
|
@@ -256,7 +308,7 @@ export const searchFTSFromLbug = async (query, limit = 20, repoId) => {
|
|
|
256
308
|
interfaceResults.length ===
|
|
257
309
|
0) {
|
|
258
310
|
[fileResults, functionResults, classResults, methodResults, interfaceResults] =
|
|
259
|
-
await fallbackSearchAllTables(executeCoreQuery, query, limit);
|
|
311
|
+
await fallbackSearchAllTables(executeCoreQuery, query, limit, properties);
|
|
260
312
|
}
|
|
261
313
|
}
|
|
262
314
|
// Collect all node scores per filePath to track which nodes actually matched
|
|
@@ -7,7 +7,7 @@
|
|
|
7
7
|
* This is the same approach used by Elasticsearch, Pinecone, and other
|
|
8
8
|
* production search systems.
|
|
9
9
|
*/
|
|
10
|
-
import {
|
|
10
|
+
import { searchFTSFromCgdb } from './bm25-index.js';
|
|
11
11
|
/**
|
|
12
12
|
* RRF constant - standard value used in the literature
|
|
13
13
|
* Higher values give more weight to lower-ranked results
|
|
@@ -112,7 +112,7 @@ export const formatHybridResults = (results) => {
|
|
|
112
112
|
*/
|
|
113
113
|
export const hybridSearch = async (query, limit, executeQuery, semanticSearch) => {
|
|
114
114
|
// Use LadybugDB FTS for always-fresh BM25 results
|
|
115
|
-
const bm25Results = await
|
|
115
|
+
const bm25Results = await searchFTSFromCgdb(query, limit);
|
|
116
116
|
const semanticResults = await semanticSearch(executeQuery, query, limit);
|
|
117
117
|
return mergeWithRRF(bm25Results, semanticResults, limit);
|
|
118
118
|
};
|
|
@@ -41,14 +41,14 @@ export declare class WikiGenerator {
|
|
|
41
41
|
private repoPath;
|
|
42
42
|
private storagePath;
|
|
43
43
|
private wikiDir;
|
|
44
|
-
private
|
|
44
|
+
private cgdbPath;
|
|
45
45
|
private llmConfig;
|
|
46
46
|
private maxTokensPerModule;
|
|
47
47
|
private concurrency;
|
|
48
48
|
private options;
|
|
49
49
|
private onProgress;
|
|
50
50
|
private failedModules;
|
|
51
|
-
constructor(repoPath: string, storagePath: string,
|
|
51
|
+
constructor(repoPath: string, storagePath: string, cgdbPath: string, llmConfig: LLMConfig, options?: WikiOptions, onProgress?: ProgressCallback);
|
|
52
52
|
private lastPercent;
|
|
53
53
|
/**
|
|
54
54
|
* Create streaming options that report LLM progress to the progress bar.
|
|
@@ -26,18 +26,18 @@ export class WikiGenerator {
|
|
|
26
26
|
repoPath;
|
|
27
27
|
storagePath;
|
|
28
28
|
wikiDir;
|
|
29
|
-
|
|
29
|
+
cgdbPath;
|
|
30
30
|
llmConfig;
|
|
31
31
|
maxTokensPerModule;
|
|
32
32
|
concurrency;
|
|
33
33
|
options;
|
|
34
34
|
onProgress;
|
|
35
35
|
failedModules = [];
|
|
36
|
-
constructor(repoPath, storagePath,
|
|
36
|
+
constructor(repoPath, storagePath, cgdbPath, llmConfig, options = {}, onProgress) {
|
|
37
37
|
this.repoPath = repoPath;
|
|
38
38
|
this.storagePath = storagePath;
|
|
39
39
|
this.wikiDir = path.join(storagePath, WIKI_DIR);
|
|
40
|
-
this.
|
|
40
|
+
this.cgdbPath = cgdbPath;
|
|
41
41
|
this.options = options;
|
|
42
42
|
this.llmConfig = llmConfig;
|
|
43
43
|
this.maxTokensPerModule = options.maxTokensPerModule ?? DEFAULT_MAX_TOKENS_PER_MODULE;
|
|
@@ -134,7 +134,7 @@ export class WikiGenerator {
|
|
|
134
134
|
}
|
|
135
135
|
// Init graph
|
|
136
136
|
this.onProgress('init', 2, 'Connecting to knowledge graph...');
|
|
137
|
-
await initWikiDb(this.
|
|
137
|
+
await initWikiDb(this.cgdbPath);
|
|
138
138
|
let result;
|
|
139
139
|
try {
|
|
140
140
|
if (!forceMode && existingMeta && existingMeta.fromCommit) {
|
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
* Graph Queries for Wiki Generation
|
|
3
3
|
*
|
|
4
4
|
* Encapsulated Cypher queries against the CodraGraph knowledge graph.
|
|
5
|
-
* Uses the MCP-style pooled
|
|
5
|
+
* Uses the MCP-style pooled cgdb-adapter for connection management.
|
|
6
6
|
*/
|
|
7
7
|
/**
|
|
8
8
|
* Touch the wiki DB connection to prevent idle timeout during long LLM calls.
|
|
@@ -36,7 +36,7 @@ export interface ProcessInfo {
|
|
|
36
36
|
/**
|
|
37
37
|
* Initialize the LadybugDB connection for wiki generation.
|
|
38
38
|
*/
|
|
39
|
-
export declare function initWikiDb(
|
|
39
|
+
export declare function initWikiDb(cgdbPath: string): Promise<void>;
|
|
40
40
|
/**
|
|
41
41
|
* Close the LadybugDB connection.
|
|
42
42
|
*/
|
|
@@ -2,9 +2,9 @@
|
|
|
2
2
|
* Graph Queries for Wiki Generation
|
|
3
3
|
*
|
|
4
4
|
* Encapsulated Cypher queries against the CodraGraph knowledge graph.
|
|
5
|
-
* Uses the MCP-style pooled
|
|
5
|
+
* Uses the MCP-style pooled cgdb-adapter for connection management.
|
|
6
6
|
*/
|
|
7
|
-
import {
|
|
7
|
+
import { initCgdb, executeQuery, closeCgdb, touchRepo } from '../cgdb/pool-adapter.js';
|
|
8
8
|
const REPO_ID = '__wiki__';
|
|
9
9
|
/**
|
|
10
10
|
* Touch the wiki DB connection to prevent idle timeout during long LLM calls.
|
|
@@ -15,14 +15,14 @@ export function touchWikiDb() {
|
|
|
15
15
|
/**
|
|
16
16
|
* Initialize the LadybugDB connection for wiki generation.
|
|
17
17
|
*/
|
|
18
|
-
export async function initWikiDb(
|
|
19
|
-
await
|
|
18
|
+
export async function initWikiDb(cgdbPath) {
|
|
19
|
+
await initCgdb(REPO_ID, cgdbPath);
|
|
20
20
|
}
|
|
21
21
|
/**
|
|
22
22
|
* Close the LadybugDB connection.
|
|
23
23
|
*/
|
|
24
24
|
export async function closeWikiDb() {
|
|
25
|
-
await
|
|
25
|
+
await closeCgdb(REPO_ID);
|
|
26
26
|
}
|
|
27
27
|
/**
|
|
28
28
|
* Get all source files with their exported symbol names and types.
|
|
@@ -6,7 +6,7 @@
|
|
|
6
6
|
*/
|
|
7
7
|
import { pipeline, env } from '@huggingface/transformers';
|
|
8
8
|
import { isHttpMode, getHttpDimensions, httpEmbedQuery, } from '../../core/embeddings/http-client.js';
|
|
9
|
-
import { silenceStdout, restoreStdout, realStderrWrite } from '../../core/
|
|
9
|
+
import { silenceStdout, restoreStdout, realStderrWrite } from '../../core/cgdb/pool-adapter.js';
|
|
10
10
|
// Model config
|
|
11
11
|
const MODEL_ID = 'Snowflake/snowflake-arctic-embed-xs';
|
|
12
12
|
// Module-level state for singleton pattern
|
|
@@ -5,7 +5,7 @@
|
|
|
5
5
|
* Supports multiple indexed repositories via a global registry.
|
|
6
6
|
* LadybugDB connections are opened lazily per repo on first query.
|
|
7
7
|
*/
|
|
8
|
-
import { isWriteQuery } from '../../core/
|
|
8
|
+
import { isWriteQuery } from '../../core/cgdb/pool-adapter.js';
|
|
9
9
|
export { isWriteQuery };
|
|
10
10
|
import { type RegistryEntry } from '../../storage/repo-manager.js';
|
|
11
11
|
import { GroupService } from '../../core/group/service.js';
|
|
@@ -53,7 +53,7 @@ interface RepoHandle {
|
|
|
53
53
|
name: string;
|
|
54
54
|
repoPath: string;
|
|
55
55
|
storagePath: string;
|
|
56
|
-
|
|
56
|
+
cgdbPath: string;
|
|
57
57
|
indexedAt: string;
|
|
58
58
|
lastCommit: string;
|
|
59
59
|
remoteUrl?: string;
|