gitnexus 1.6.1 → 1.6.2-rc.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -234,6 +234,79 @@ Installed automatically by both `gitnexus analyze` (per-repo) and `gitnexus setu
234
234
  - Node.js >= 18
235
235
  - Git repository (uses git for commit tracking)
236
236
 
237
+ ## Release candidates
238
+
239
+ Stable releases publish to the default `latest` dist-tag. When a pull request
240
+ with non-documentation changes merges into `main`, an automated workflow also
241
+ publishes a prerelease build under the `rc` dist-tag, so early adopters can
242
+ try in-flight fixes without waiting for the next stable cut. (Docs-only
243
+ merges are skipped.)
244
+
245
+ ```bash
246
+ # Try the latest release candidate (pre-stable — may change at any time)
247
+ npm install -g gitnexus@rc
248
+ # — or —
249
+ npx gitnexus@rc analyze
250
+ ```
251
+
252
+ Release-candidate versions follow the standard semver prerelease format
253
+ `X.Y.Z-rc.N`, where `X.Y.Z` is the next stable target (bumped from the
254
+ current `latest` by patch by default; `minor` or `major` when kicking off a
255
+ bigger cycle) and `N` increments per published rc. Example sequence:
256
+ `1.6.2-rc.1`, `1.6.2-rc.2`, …, then once `1.6.2` ships stable,
257
+ `1.6.3-rc.1`. See the [Releases page](https://github.com/abhigyanpatwari/GitNexus/releases)
258
+ for the full list; stable `latest` is unaffected.
259
+
260
+ ## Troubleshooting
261
+
262
+ ### `Cannot destructure property 'package' of 'node.target' as it is null`
263
+
264
+ This crash was caused by a dependency URL format that is incompatible with
265
+ certain npm/arborist versions ([npm/cli#8126](https://github.com/npm/cli/issues/8126)).
266
+ It is fixed in **gitnexus v1.6.2+**. Upgrade to the latest version:
267
+
268
+ ```bash
269
+ npx gitnexus@latest analyze # always uses the newest release
270
+ # — or —
271
+ npm install -g gitnexus@latest # upgrade a global install
272
+ ```
273
+
274
+ If you still hit npm install issues after upgrading, these generic workarounds
275
+ may help:
276
+
277
+ ```bash
278
+ npm install -g npm@latest # update npm itself
279
+ npm cache clean --force # clear a possibly corrupt cache
280
+ ```
281
+
282
+ ### Installation fails with native module errors
283
+
284
+ Some optional language grammars (Dart, Kotlin, Swift) require native compilation. If they fail, GitNexus still works — those languages will be skipped.
285
+
286
+ If `npm install -g gitnexus` fails on native modules:
287
+
288
+ ```bash
289
+ # Ensure build tools are available (Linux/macOS)
290
+ # Ubuntu/Debian: sudo apt install python3 make g++
291
+ # macOS: xcode-select --install
292
+
293
+ # Retry installation
294
+ npm install -g gitnexus
295
+ ```
296
+
297
+ ### Analysis runs out of memory
298
+
299
+ For very large repositories:
300
+
301
+ ```bash
302
+ # Increase Node.js heap size
303
+ NODE_OPTIONS="--max-old-space-size=16384" npx gitnexus analyze
304
+
305
+ # Exclude large directories
306
+ echo "vendor/" >> .gitnexusignore
307
+ echo "dist/" >> .gitnexusignore
308
+ ```
309
+
237
310
  ## Privacy
238
311
 
239
312
  - All processing happens locally on your machine
@@ -232,7 +232,7 @@ export const analyzeCommand = async (inputPath, options) => {
232
232
  bar.stop();
233
233
  const msg = err.message || String(err);
234
234
  console.error(`\n Analysis failed: ${msg}\n`);
235
- // Provide helpful guidance for known large-repo failure modes
235
+ // Provide helpful guidance for known failure modes
236
236
  if (msg.includes('Maximum call stack size exceeded') ||
237
237
  msg.includes('call stack') ||
238
238
  msg.includes('Map maximum size') ||
@@ -248,6 +248,28 @@ export const analyzeCommand = async (inputPath, options) => {
248
248
  console.error(' 3. Increase stack size: NODE_OPTIONS="--stack-size=4096"');
249
249
  console.error('');
250
250
  }
251
+ else if (msg.includes('ERESOLVE') || msg.includes('Could not resolve dependency')) {
252
+ // Note: the original arborist "Cannot destructure property 'package' of
253
+ // 'node.target'" crash happens inside npm *before* gitnexus code runs,
254
+ // so it can't be caught here. This branch handles dependency-resolution
255
+ // errors that surface at runtime (e.g. dynamic require failures).
256
+ console.error(' This looks like an npm dependency resolution issue.');
257
+ console.error(' Suggestions:');
258
+ console.error(' 1. Clear the npm cache: npm cache clean --force');
259
+ console.error(' 2. Update npm: npm install -g npm@latest');
260
+ console.error(' 3. Reinstall gitnexus: npm install -g gitnexus@latest');
261
+ console.error(' 4. Or try npx directly: npx gitnexus@latest analyze');
262
+ console.error('');
263
+ }
264
+ else if (msg.includes('MODULE_NOT_FOUND') ||
265
+ msg.includes('Cannot find module') ||
266
+ msg.includes('ERR_MODULE_NOT_FOUND')) {
267
+ console.error(' A required module could not be loaded. The installation may be corrupt.');
268
+ console.error(' Suggestions:');
269
+ console.error(' 1. Reinstall: npm install -g gitnexus@latest');
270
+ console.error(' 2. Clear cache: npm cache clean --force && npx gitnexus@latest analyze');
271
+ console.error('');
272
+ }
251
273
  process.exitCode = 1;
252
274
  return;
253
275
  }
@@ -67,8 +67,8 @@ const queryEmbeddableNodes = async (executeQuery) => {
67
67
  * that occurs when UPDATEing nodes with large content fields
68
68
  */
69
69
  const batchInsertEmbeddings = async (executeWithReusedStatement, updates) => {
70
- // INSERT into separate embedding table - much more memory efficient!
71
- const cypher = `CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`;
70
+ // MERGE instead of CREATE: idempotent, and handles concurrent analyze runs and partial prior runs
71
+ const cypher = `MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`;
72
72
  const paramsList = updates.map((u) => ({ nodeId: u.id, embedding: u.embedding }));
73
73
  await executeWithReusedStatement(cypher, paramsList);
74
74
  };
@@ -7,7 +7,7 @@ export interface ProtoServiceInfo {
7
7
  protoPath: string;
8
8
  }
9
9
  export declare function buildProtoMap(repoPath: string): Promise<Map<string, ProtoServiceInfo[]>>;
10
- export declare function resolveProtoConflict(_serviceName: string, sourceFilePath: string, candidates: ProtoServiceInfo[]): ProtoServiceInfo | null;
10
+ export declare function resolveProtoConflict(serviceName: string, sourceFilePath: string, candidates: ProtoServiceInfo[]): ProtoServiceInfo | null;
11
11
  export declare function serviceContractId(pkg: string, serviceName: string): string;
12
12
  export declare class GrpcExtractor implements ContractExtractor {
13
13
  type: "grpc";
@@ -263,23 +263,31 @@ export async function buildProtoMap(repoPath) {
263
263
  const { servicesByName } = await buildProtoContext(repoPath);
264
264
  return servicesByName;
265
265
  }
266
- export function resolveProtoConflict(_serviceName, sourceFilePath, candidates) {
266
+ export function resolveProtoConflict(serviceName, sourceFilePath, candidates) {
267
267
  if (candidates.length === 0)
268
268
  return null;
269
269
  if (candidates.length === 1)
270
270
  return candidates[0];
271
271
  const sourceDir = normalizeProtoPath(path.dirname(sourceFilePath));
272
- let best = candidates[0];
273
- let bestScore = -1;
274
- for (const c of candidates) {
272
+ const scored = candidates.map((c) => {
275
273
  const protoDir = normalizeProtoPath(path.dirname(c.protoPath));
276
- const sharedRun = longestSharedSegmentRun(sourceDir, protoDir);
277
- if (sharedRun > bestScore) {
278
- bestScore = sharedRun;
279
- best = c;
280
- }
274
+ return { candidate: c, score: longestSharedSegmentRun(sourceDir, protoDir) };
275
+ });
276
+ let maxScore = -1;
277
+ for (const s of scored) {
278
+ if (s.score > maxScore)
279
+ maxScore = s.score;
281
280
  }
282
- return best;
281
+ const winners = scored.filter((s) => s.score === maxScore);
282
+ // Path heuristic cannot uniquely identify a winner — refuse to guess.
283
+ // Ties (including all-zero ties) would otherwise silently merge unrelated
284
+ // services under a fabricated package-qualified contract id.
285
+ if (winners.length !== 1) {
286
+ const paths = candidates.map((c) => c.protoPath).join(', ');
287
+ console.warn(`[grpc-extractor] Ambiguous proto resolution for service "${serviceName}" from ${sourceFilePath}: ${winners.length} candidates tied at score ${maxScore} among [${paths}] — skipping canonical contract`);
288
+ return null;
289
+ }
290
+ return winners[0].candidate;
283
291
  }
284
292
  export function serviceContractId(pkg, serviceName) {
285
293
  const prefix = pkg ? `${pkg}.${serviceName}` : serviceName;
@@ -339,7 +347,9 @@ export class GrpcExtractor {
339
347
  continue;
340
348
  }
341
349
  for (const d of detections) {
342
- out.push(this.detectionToContract(d, rel, protoMap));
350
+ const contract = this.detectionToContract(d, rel, protoMap);
351
+ if (contract)
352
+ out.push(contract);
343
353
  }
344
354
  }
345
355
  return this.dedupe(out);
@@ -352,8 +362,13 @@ export class GrpcExtractor {
352
362
  * based on whether the proto map had an entry.
353
363
  */
354
364
  detectionToContract(d, filePath, protoMap) {
355
- const candidates = protoMap.get(d.serviceName);
356
- const proto = resolveProtoConflict(d.serviceName, filePath, candidates ?? []);
365
+ const candidates = protoMap.get(d.serviceName) ?? [];
366
+ const proto = resolveProtoConflict(d.serviceName, filePath, candidates);
367
+ // If there were proto candidates but resolution was ambiguous, skip
368
+ // contract emission rather than fabricating a package-qualified id from
369
+ // an arbitrary candidate. resolveProtoConflict already warned.
370
+ if (candidates.length > 0 && proto === null)
371
+ return null;
357
372
  const pkg = proto?.package ?? '';
358
373
  const cid = d.methodName
359
374
  ? contractId(pkg, d.serviceName, d.methodName)
@@ -214,7 +214,29 @@ export class HttpRouteExtractor {
214
214
  const providerDetections = detections.filter((d) => d.role === 'provider');
215
215
  let handlerName = null;
216
216
  const normalizedRoute = normalizeHttpPath(routePath);
217
- const match = providerDetections.find((d) => normalizeHttpPath(d.path) === normalizedRoute);
217
+ // Candidates share the same normalized path. When multiple
218
+ // detections at the same path exist (e.g. GET + POST /api/orders
219
+ // in one router), a blind `.find()` silently returned the first
220
+ // verb — attaching the wrong handler and, when method was not
221
+ // already pinned by the route reason, the wrong method too.
222
+ // Disambiguate by method when we know it; refuse to guess when
223
+ // we don't.
224
+ const candidates = providerDetections.filter((d) => normalizeHttpPath(d.path) === normalizedRoute);
225
+ let match;
226
+ const ambiguousCandidates = !method && candidates.length > 1;
227
+ if (method) {
228
+ match = candidates.find((d) => d.method === method);
229
+ }
230
+ else if (candidates.length === 1) {
231
+ match = candidates[0];
232
+ }
233
+ // else: multiple candidates + unknown method → leave match
234
+ // undefined so handlerName stays null and skip symbol
235
+ // enrichment below, keeping the file-basename fallback instead
236
+ // of letting pickSymbolUid silently pick the first Function /
237
+ // Method in the file (which reintroduces the mis-attribution
238
+ // we were trying to avoid). Method stays at the conservative
239
+ // 'GET' default set below.
218
240
  if (match) {
219
241
  if (!method)
220
242
  method = match.method;
@@ -228,7 +250,7 @@ export class HttpRouteExtractor {
228
250
  let symbolName = path.basename(filePath) || 'handler';
229
251
  let symPath = filePath;
230
252
  const fileId = row.fileId ?? row[0];
231
- if (fileId) {
253
+ if (fileId && !ambiguousCandidates) {
232
254
  try {
233
255
  const syms = await db(CONTAINS_QUERY, { fileId });
234
256
  if (syms.length > 0) {
@@ -308,9 +330,17 @@ export class HttpRouteExtractor {
308
330
  // Prefer the plugin's detected method if we can find a matching
309
331
  // fetch/axios call in the same file.
310
332
  const detections = filePath ? getDetections(filePath) : [];
311
- const inferred = detections.find((d) => d.role === 'consumer' && normalizeConsumerPath(d.path) === pathNorm);
312
- if (inferred)
313
- method = inferred.method;
333
+ // Symmetric to the provider path: if multiple consumer calls in
334
+ // the same file share the same normalized path (e.g. a GET
335
+ // fetch AND a POST fetch to `/api/orders`), `.find()` silently
336
+ // picked the first verb and keyed the contract id on the wrong
337
+ // method. With no upstream method signal here, refuse to guess
338
+ // when candidates are ambiguous — leave `method` at its
339
+ // conservative 'GET' default.
340
+ const consumerCandidates = detections.filter((d) => d.role === 'consumer' && normalizeConsumerPath(d.path) === pathNorm);
341
+ if (consumerCandidates.length === 1) {
342
+ method = consumerCandidates[0].method;
343
+ }
314
344
  const cid = contractIdFor(method, pathNorm);
315
345
  let symbolUid = '';
316
346
  let symbolName = 'fetch';
@@ -16,6 +16,34 @@ function normalizeRoutePath(raw) {
16
16
  return '/';
17
17
  return collapsed.replace(/\/+$/, '');
18
18
  }
19
+ /**
20
+ * Split a manifest HTTP contract into its optional `METHOD::` prefix and
21
+ * its path portion.
22
+ *
23
+ * `buildContractId` recommends the explicit-method form `GET::/api/orders`
24
+ * in group.yaml; if we hand that raw string to `normalizeRoutePath` we get
25
+ * `/GET::/api/orders`, which can never match `Route.name = "/api/orders"`
26
+ * in the graph. This helper extracts the path portion so the Cypher
27
+ * lookup uses the canonical route name.
28
+ *
29
+ * The method prefix regex is shared with `buildContractId`, which calls this
30
+ * helper: case-insensitive `[A-Za-z]+` followed by `::`. The captured
31
+ * method is upper-cased for downstream use; method-constrained matching
32
+ * against `HANDLES_ROUTE` is a future enhancement (not yet wired).
33
+ *
34
+ * Edge cases:
35
+ * - `"::/api/orders"` — empty method portion, no alpha prefix match, so
36
+ * the whole string is treated as a bare path (matches buildContractId
37
+ * which also requires `[A-Za-z]+`).
38
+ * - `"GET::"` — method with empty path, returns `{ method: 'GET', path: '' }`;
39
+ * `normalizeRoutePath('')` resolves to `/` for caller.
40
+ */
41
+ function parseHttpContract(raw) {
42
+ const match = raw.match(/^([A-Za-z]+)::/);
43
+ if (!match)
44
+ return { method: null, path: raw };
45
+ return { method: match[1].toUpperCase(), path: raw.slice(match[0].length) };
46
+ }
19
47
  /**
20
48
  * Stable synthetic symbolUid for a manifest-declared contract whose target
21
49
  * symbol could not be resolved against the per-repo graph (resolveSymbol
@@ -111,7 +139,15 @@ export class ManifestExtractor {
111
139
  // core/ingestion/pipeline.ts ensureSlash + generateId('Route', ...)).
112
140
  // Normalize the manifest contract the same way so a user-written
113
141
  // "/api/orders" matches "api/orders" in the graph.
114
- const normalized = normalizeRoutePath(link.contract);
142
+ //
143
+ // The contract may also use the explicit-method form "GET::/api/orders"
144
+ // recommended by buildContractId. Strip the METHOD:: prefix before
145
+ // normalizing — otherwise `normalizeRoutePath('GET::/api/orders')`
146
+ // returns `/GET::/api/orders` and never matches Route.name. The
147
+ // captured method is not yet used to constrain the Cypher query
148
+ // (method-aware HANDLES_ROUTE matching is a future enhancement).
149
+ const parsed = parseHttpContract(link.contract);
150
+ const normalized = normalizeRoutePath(parsed.path);
115
151
  rows = await executor(`MATCH (handler)-[r:CodeRelation {type: 'HANDLES_ROUTE'}]->(route:Route)
116
152
  WHERE route.name = $normalized
117
153
  RETURN handler.id AS uid, handler.name AS name, handler.filePath AS filePath
@@ -214,9 +250,15 @@ export class ManifestExtractor {
214
250
  buildContractId(type, contract) {
215
251
  switch (type) {
216
252
  case 'http': {
217
- if (/^[A-Za-z]+::/.test(contract))
218
- return `http::${contract}`;
219
- return `http::*::${contract}`;
253
+ // Canonicalize method casing and path separators so logically
254
+ // equivalent inputs (`get::/api/orders` vs `GET::/api/orders`,
255
+ // or trailing-slash variants) produce the same contractId and
256
+ // matching `manifestSymbolUid` fallback. Without this, raw
257
+ // user casing leaks into cross-impact join keys and fragments
258
+ // matches across repos.
259
+ const { method, path: rawPath } = parseHttpContract(contract);
260
+ const normalizedPath = normalizeRoutePath(rawPath);
261
+ return method ? `http::${method}::${normalizedPath}` : `http::*::${normalizedPath}`;
220
262
  }
221
263
  case 'grpc':
222
264
  return `grpc::${contract}`;
@@ -21,8 +21,25 @@ import type { SyntaxNode } from './utils/ast-helpers.js';
21
21
  import type { NodeLabel } from '../../_shared/index.js';
22
22
  /** Tree-sitter query captures: capture name → AST node (or undefined if not captured). */
23
23
  export type CaptureMap = Record<string, SyntaxNode | undefined>;
24
- /** How a language handles imports — determines wildcard synthesis behavior. */
25
- export type ImportSemantics = 'named' | 'wildcard' | 'namespace';
24
+ /**
25
+ * How a language handles imports — determines wildcard synthesis behavior.
26
+ *
27
+ * Import resolution is a graph-traversal policy with multiple distinct strategies,
28
+ * analogous to MRO for method resolution. Each tag picks a strategy:
29
+ *
30
+ * | Tag | Mechanism | Traversal | Languages |
31
+ * |-----------------------|------------------------------------------------|---------------------|--------------------------------------------|
32
+ * | `named` | Per-symbol imports | None (use-site) | JS/TS, Java, C#, Rust, PHP, Kotlin, Vue |
33
+ * | `wildcard-transitive` | Textual paste, symbols chain through files | BFS closure | C, C++ (future: Obj-C, Fortran, Nim) |
34
+ * | `wildcard-leaf` | Whole public API, single hop | None (direct only) | Go, Ruby, Swift, Dart |
35
+ * | `namespace` | Qualified handle; symbols resolved at call site| None at import | Python |
36
+ * | `explicit-reexport` | Opt-in per-symbol re-export (SCAFFOLD) | Topological DAG | (future: TS `export *`, Rust `pub use`) |
37
+ *
38
+ * The `explicit-reexport` tag is a compile-time scaffold; no provider claims it yet.
39
+ * It falls through to `wildcard-leaf` behavior in synthesis so today's TS/Rust
40
+ * handling is unchanged. A future PR will implement the DAG walk for `export *`.
41
+ */
42
+ export type ImportSemantics = 'named' | 'wildcard-transitive' | 'wildcard-leaf' | 'namespace' | 'explicit-reexport';
26
43
  /**
27
44
  * Everything a language needs to provide.
28
45
  * Required fields must be explicitly set; optional fields have defaults
@@ -51,10 +68,12 @@ interface LanguageProviderConfig {
51
68
  /** Named binding extraction from import statements.
52
69
  * Default: undefined (language uses wildcard/whole-module imports). */
53
70
  readonly namedBindingExtractor?: NamedBindingExtractorFn;
54
- /** How this language handles imports.
71
+ /** How this language handles imports. See `ImportSemantics` for the full taxonomy.
55
72
  * - 'named': per-symbol imports (JS/TS, Java, C#, Rust, PHP, Kotlin)
56
- * - 'wildcard': whole-module imports, needs synthesis (Go, Ruby, C/C++, Swift)
57
- * - 'namespace': namespace imports, needs moduleAliasMap (Python)
73
+ * - 'wildcard-transitive': textual-include closure; imports chain through files (C, C++)
74
+ * - 'wildcard-leaf': whole-module single-hop imports; no transitive chaining (Go, Ruby, Swift, Dart)
75
+ * - 'namespace': qualified namespace imports, needs moduleAliasMap (Python)
76
+ * - 'explicit-reexport': opt-in per-symbol re-export (scaffold; no provider uses it yet)
58
77
  * Default: 'named'. */
59
78
  readonly importSemantics?: ImportSemantics;
60
79
  /** Language-specific transformation of raw import path text before resolution.
@@ -293,7 +293,7 @@ export const cProvider = defineLanguage({
293
293
  typeConfig: cCppConfig,
294
294
  exportChecker: cCppExportChecker,
295
295
  importResolver: resolveCImport,
296
- importSemantics: 'wildcard',
296
+ importSemantics: 'wildcard-transitive',
297
297
  fieldExtractor: createFieldExtractor(cFieldConfig),
298
298
  methodExtractor: createMethodExtractor({
299
299
  ...cMethodConfig,
@@ -310,7 +310,7 @@ export const cppProvider = defineLanguage({
310
310
  typeConfig: cCppConfig,
311
311
  exportChecker: cCppExportChecker,
312
312
  importResolver: resolveCppImport,
313
- importSemantics: 'wildcard',
313
+ importSemantics: 'wildcard-transitive',
314
314
  mroStrategy: 'leftmost-base',
315
315
  fieldExtractor: createFieldExtractor(cppFieldConfig),
316
316
  methodExtractor: createMethodExtractor({
@@ -2,7 +2,7 @@
2
2
  * Dart Language Provider
3
3
  *
4
4
  * Dart traits:
5
- * - importSemantics: 'wildcard' (Dart imports bring everything public into scope)
5
+ * - importSemantics: 'wildcard-leaf' (Dart imports bring everything public into scope)
6
6
  * - exportChecker: public if no leading underscore
7
7
  * - Dart SDK imports (dart:*) and external packages are skipped
8
8
  * - enclosingFunctionFinder: Dart's tree-sitter grammar places function_body
@@ -2,7 +2,7 @@
2
2
  * Dart Language Provider
3
3
  *
4
4
  * Dart traits:
5
- * - importSemantics: 'wildcard' (Dart imports bring everything public into scope)
5
+ * - importSemantics: 'wildcard-leaf' (Dart imports bring everything public into scope)
6
6
  * - exportChecker: public if no leading underscore
7
7
  * - Dart SDK imports (dart:*) and external packages are skipped
8
8
  * - enclosingFunctionFinder: Dart's tree-sitter grammar places function_body
@@ -83,7 +83,7 @@ export const dartProvider = defineLanguage({
83
83
  typeConfig: dartConfig,
84
84
  exportChecker: dartExportChecker,
85
85
  importResolver: resolveDartImport,
86
- importSemantics: 'wildcard',
86
+ importSemantics: 'wildcard-leaf',
87
87
  fieldExtractor: createFieldExtractor(dartFieldConfig),
88
88
  methodExtractor: createMethodExtractor(dartMethodConfig),
89
89
  classExtractor: createClassExtractor({
@@ -5,7 +5,7 @@
5
5
  * LanguageProvider, following the Strategy pattern used by the pipeline.
6
6
  *
7
7
  * Key Go traits:
8
- * - importSemantics: 'wildcard' (Go imports entire packages)
8
+ * - importSemantics: 'wildcard-leaf' (Go imports entire packages)
9
9
  * - callRouter: present (Go method calls may need routing)
10
10
  */
11
11
  export declare const goProvider: import("../language-provider.js").LanguageProvider;
@@ -5,7 +5,7 @@
5
5
  * LanguageProvider, following the Strategy pattern used by the pipeline.
6
6
  *
7
7
  * Key Go traits:
8
- * - importSemantics: 'wildcard' (Go imports entire packages)
8
+ * - importSemantics: 'wildcard-leaf' (Go imports entire packages)
9
9
  * - callRouter: present (Go method calls may need routing)
10
10
  */
11
11
  import { SupportedLanguages } from '../../../_shared/index.js';
@@ -26,7 +26,7 @@ export const goProvider = defineLanguage({
26
26
  typeConfig: goConfig,
27
27
  exportChecker: goExportChecker,
28
28
  importResolver: resolveGoImport,
29
- importSemantics: 'wildcard',
29
+ importSemantics: 'wildcard-leaf',
30
30
  fieldExtractor: createFieldExtractor(goFieldConfig),
31
31
  methodExtractor: createMethodExtractor(goMethodConfig),
32
32
  classExtractor: createClassExtractor({
@@ -99,7 +99,7 @@ export const rubyProvider = defineLanguage({
99
99
  exportChecker: rubyExportChecker,
100
100
  importResolver: resolveRubyImport,
101
101
  callRouter: routeRubyCall,
102
- importSemantics: 'wildcard',
102
+ importSemantics: 'wildcard-leaf',
103
103
  resolveEnclosingOwner(node) {
104
104
  // Ruby singleton_class (class << self) should resolve to the enclosing
105
105
  // class or module for owner/container resolution (HAS_METHOD edges, class IDs).
@@ -5,7 +5,7 @@
5
5
  * LanguageProvider, following the Strategy pattern used by the pipeline.
6
6
  *
7
7
  * Key Swift traits:
8
- * - importSemantics: 'wildcard' (Swift imports entire modules)
8
+ * - importSemantics: 'wildcard-leaf' (Swift imports entire modules)
9
9
  * - heritageDefaultEdge: 'IMPLEMENTS' (protocols are more common than class inheritance)
10
10
  * - implicitImportWirer: all files in the same SPM target see each other
11
11
  */
@@ -5,7 +5,7 @@
5
5
  * LanguageProvider, following the Strategy pattern used by the pipeline.
6
6
  *
7
7
  * Key Swift traits:
8
- * - importSemantics: 'wildcard' (Swift imports entire modules)
8
+ * - importSemantics: 'wildcard-leaf' (Swift imports entire modules)
9
9
  * - heritageDefaultEdge: 'IMPLEMENTS' (protocols are more common than class inheritance)
10
10
  * - implicitImportWirer: all files in the same SPM target see each other
11
11
  */
@@ -221,7 +221,7 @@ export const swiftProvider = defineLanguage({
221
221
  typeConfig: swiftConfig,
222
222
  exportChecker: swiftExportChecker,
223
223
  importResolver: resolveSwiftImport,
224
- importSemantics: 'wildcard',
224
+ importSemantics: 'wildcard-leaf',
225
225
  heritageDefaultEdge: 'IMPLEMENTS',
226
226
  fieldExtractor: createFieldExtractor(swiftFieldConfig),
227
227
  methodExtractor: createMethodExtractor({
@@ -14,12 +14,47 @@
14
14
  */
15
15
  import type { KnowledgeGraph } from '../../graph/types.js';
16
16
  import type { createResolutionContext } from '../model/resolution-context.js';
17
- import { SupportedLanguages } from '../../../_shared/index.js';
17
+ import type { SupportedLanguages } from '../../../_shared/index.js';
18
18
  /** Check if a language uses wildcard (whole-module) import semantics. */
19
19
  export declare function isWildcardImportLanguage(lang: SupportedLanguages): boolean;
20
20
  /** Check if a language needs synthesis before call resolution.
21
21
  * True for wildcard-import languages AND namespace-import languages (Python). */
22
22
  export declare function needsSynthesis(lang: SupportedLanguages): boolean;
23
+ /**
24
+ * Strategy implementation for `importSemantics: 'wildcard-transitive'` (C, C++).
25
+ *
26
+ * Textual-include languages chain symbols through files: if `dict.c` includes
27
+ * `server.h` and `server.h` includes `dict.h`, then `dict.c` sees symbols from
28
+ * all three files. This helper walks the include graph (combining both the
29
+ * ingestion-context `importMap` and the graph-level IMPORTS edges) until the
30
+ * closure is stable.
31
+ *
32
+ * **Order matters.** The returned `Set` preserves iteration order (insertion
33
+ * order). `synthesizeWildcardImportBindings` dedupes bindings by symbol name
34
+ * on a first-seen-wins basis, so this closure's ordering determines which
35
+ * declaration wins when multiple headers export the same name (e.g. overloaded
36
+ * free functions like `write_audit()` vs `write_audit(const char*)` in
37
+ * different headers). We therefore:
38
+ * 1. Seed the closure with direct imports in declaration order (matches the
39
+ * order of `#include` directives in the source file).
40
+ * 2. Use FIFO / true BFS (head-index dequeue) for transitive expansion, so
41
+ * closer headers are seen before deeper ones.
42
+ *
43
+ * Cycle-safe: the `closure.has(file)` guard prevents infinite loops on circular
44
+ * header includes, which are valid C/C++ when paired with `#pragma once` or
45
+ * include guards.
46
+ *
47
+ * Size-bounded: the closure is capped at `MAX_TRANSITIVE_CLOSURE_SIZE` files to
48
+ * prevent OOM on pathological codebases (e.g. boost, monoheader kernel code)
49
+ * where one translation unit can transitively reach tens of thousands of
50
+ * headers. Partial closures still yield useful bindings for the cluster of
51
+ * headers closest to the importer, which is what overload resolution and
52
+ * cross-file call resolution care about.
53
+ *
54
+ * Queue implementation: uses a head-index over a growing array (O(1) dequeue)
55
+ * instead of `Array.prototype.shift()` (O(n)) so deep chains stay linear.
56
+ */
57
+ export declare function expandTransitiveIncludeClosure(directImports: Iterable<string>, importMap: ReadonlyMap<string, ReadonlySet<string>>, graphImports: ReadonlyMap<string, ReadonlySet<string>>): Set<string>;
23
58
  /**
24
59
  * Synthesize namedImportMap entries for languages with whole-module imports.
25
60
  *
@@ -34,9 +34,26 @@ const IMPORTABLE_SYMBOL_LABELS = new Set([
34
34
  /** Max synthetic bindings per importing file — prevents memory bloat
35
35
  * for C/C++ files that include many large headers. */
36
36
  const MAX_SYNTHETIC_BINDINGS_PER_FILE = 1000;
37
+ /** Max files allowed in a single transitive include closure. Guards against
38
+ * OOM on pathological C/C++ codebases (boost, Linux kernel-style monoheaders)
39
+ * where a single translation unit can transitively reach many thousands of
40
+ * headers. When the cap is hit, BFS expansion stops early — the file still
41
+ * synthesizes bindings from the partial closure rather than failing. */
42
+ const MAX_TRANSITIVE_CLOSURE_SIZE = 5000;
43
+ /** Import semantics tags whose languages need synthesis of whole-module imports.
44
+ * `wildcard-transitive` (C/C++) and `wildcard-leaf` (Go, Ruby, Swift, Dart) are
45
+ * the file-based wildcard strategies. `explicit-reexport` is a scaffold tag —
46
+ * no provider uses it yet, but it goes through the same leaf-style synthesis
47
+ * path today because a re-exporter is still an importer; only the extra DAG
48
+ * walk to surface re-exported symbols is missing (future work). */
49
+ const WILDCARD_SEMANTICS = new Set([
50
+ 'wildcard-transitive',
51
+ 'wildcard-leaf',
52
+ 'explicit-reexport',
53
+ ]);
37
54
  /** Languages with whole-module import semantics (derived from providers at module load). */
38
55
  const WILDCARD_LANGUAGES = new Set(Object.values(providers)
39
- .filter((p) => p.importSemantics === 'wildcard')
56
+ .filter((p) => WILDCARD_SEMANTICS.has(p.importSemantics))
40
57
  .map((p) => p.id));
41
58
  /** Languages that need binding synthesis before call resolution. */
42
59
  const SYNTHESIS_LANGUAGES = new Set(Object.values(providers)
@@ -51,6 +68,82 @@ export function isWildcardImportLanguage(lang) {
51
68
  export function needsSynthesis(lang) {
52
69
  return SYNTHESIS_LANGUAGES.has(lang);
53
70
  }
71
+ // ── Strategy implementations ───────────────────────────────────────────────
72
/**
 * Strategy implementation for `importSemantics: 'wildcard-transitive'` (C, C++).
 *
 * Textual-include languages chain symbols through files: if `dict.c` includes
 * `server.h` and `server.h` includes `dict.h`, then `dict.c` sees symbols from
 * all three files. This helper walks the include graph (combining both the
 * ingestion-context `importMap` and the graph-level IMPORTS edges) until the
 * closure is stable or the size cap is reached.
 *
 * **Order matters.** The returned `Set` iterates in insertion order, and that
 * order is part of the contract: callers that dedupe bindings by symbol name
 * on a first-seen-wins basis let this ordering decide which declaration wins
 * when several headers export the same name. We therefore:
 *   1. Seed the closure with the direct imports in the order supplied
 *      (intended to match the order of `#include` directives).
 *   2. Expand breadth-first (FIFO), so closer headers are recorded before
 *      deeper ones. For each dequeued file, `importMap` neighbours are visited
 *      before `graphImports` neighbours.
 *
 * Cycle-safe: a file already in the closure is never enqueued again, so
 * circular header includes cannot loop forever.
 *
 * Size-bounded: the closure never grows beyond `maxClosureSize` files. When
 * the cap is hit the walk stops and the partial closure is returned instead of
 * failing.
 *
 * Queue implementation: a head index over a growing array gives O(1) dequeue;
 * `Array.prototype.shift()` is deliberately NOT used because it is O(n).
 *
 * @param directImports Files imported directly by the importing file, in declaration order.
 * @param importMap Ingestion-context import map (file → imported files).
 * @param graphImports Graph-level IMPORTS edges (file → imported files).
 * @param maxClosureSize Optional cap on the closure size; defaults to
 *   `MAX_TRANSITIVE_CLOSURE_SIZE`. A cap of 0 (or less) yields an empty closure.
 * @returns Insertion-ordered set containing the direct imports followed by
 *   every file reachable from them, truncated at the cap.
 */
export function expandTransitiveIncludeClosure(directImports, importMap, graphImports, maxClosureSize = MAX_TRANSITIVE_CLOSURE_SIZE) {
    const closure = new Set();
    const queue = [];
    // Adds `file` unless it is already present. Returns false only when the
    // file is new AND the cap leaves no room for it.
    const tryEnqueue = (file) => {
        if (closure.has(file)) {
            return true;
        }
        if (closure.size >= maxClosureSize) {
            return false;
        }
        closure.add(file);
        queue.push(file);
        return true;
    };
    // Seed direct imports in declaration order (see JSDoc on order-sensitivity).
    for (const file of directImports) {
        if (!tryEnqueue(file)) {
            return closure;
        }
    }
    // Breadth-first expansion. `head` advances instead of shift()-ing so deep
    // include chains stay linear.
    for (let head = 0; head < queue.length; head++) {
        if (closure.size >= maxClosureSize) {
            break;
        }
        const file = queue[head];
        // importMap edges first, then graph edges — this order is observable
        // through the Set's iteration order.
        for (const edges of [importMap, graphImports]) {
            const nested = edges.get(file);
            if (!nested) {
                continue;
            }
            for (const next of nested) {
                if (!tryEnqueue(next)) {
                    // Cap reached: nothing further can be added, so stop here.
                    return closure;
                }
            }
        }
    }
    return closure;
}
54
147
  // ── Main synthesis function ────────────────────────────────────────────────
55
148
  /**
56
149
  * Synthesize namedImportMap entries for languages with whole-module imports.
@@ -133,16 +226,61 @@ export function synthesizeWildcardImportBindings(graph, ctx) {
133
226
  }
134
227
  }
135
228
  };
136
- // Synthesize from ctx.importMap (Ruby, C/C++, Swift file-based imports)
229
/**
 * Route one importing file to the synthesis strategy selected by its
 * language provider's `importSemantics` tag.
 *
 * Strategy tags (see `ImportSemantics`):
 *   - `wildcard-transitive`: the direct imports are first expanded into the
 *     transitive include closure (C/C++ `#include` chains), then synthesized.
 *   - `wildcard-leaf`: synthesized from the direct imports only.
 *   - `explicit-reexport`: scaffold tag that currently shares the leaf path.
 *     TODO(#821): implement the re-export DAG walk for TS `export *` / Rust
 *     `pub use`; until then only direct imports are synthesized.
 *   - `namespace` / `named` (and any unrecognized tag): nothing to do here.
 *
 * Shared by Loop 1 (ctx.importMap) and Loop 2 (graphImports) so closure
 * expansion is applied consistently regardless of where the edges came from.
 *
 * @param filePath Path of the importing file.
 * @param importedFiles Files that `filePath` imports directly.
 * @param provider Language provider for `filePath`; only `importSemantics` is read.
 */
const dispatchSynthesis = (filePath, importedFiles, provider) => {
    const semantics = provider.importSemantics;
    if (semantics === 'wildcard-transitive') {
        const closure = expandTransitiveIncludeClosure(importedFiles, ctx.importMap, graphImports);
        synthesizeForFile(filePath, closure);
        return;
    }
    if (semantics === 'wildcard-leaf' || semantics === 'explicit-reexport') {
        synthesizeForFile(filePath, importedFiles);
    }
    // 'namespace', 'named' and any unknown tag intentionally fall through as no-ops.
};
267
+ // Loop 1: synthesize from ctx.importMap (Ruby, C/C++, Swift, Dart file-based imports).
137
268
  for (const [filePath, importedFiles] of ctx.importMap) {
138
269
  const lang = getLanguageFromFilename(filePath);
139
270
  if (!lang || !isWildcardImportLanguage(lang))
140
271
  continue;
141
- synthesizeForFile(filePath, importedFiles);
272
+ const provider = getProviderForFile(filePath);
273
+ if (!provider)
274
+ continue;
275
+ dispatchSynthesis(filePath, importedFiles, provider);
142
276
  }
143
- // Synthesize from graph IMPORTS edges (Go and other wildcard-import languages)
277
+ // Loop 2: synthesize from graph IMPORTS edges (Go and other wildcard-import
278
+ // languages whose edges live in the graph rather than ctx.importMap).
144
279
  for (const [filePath, importedFiles] of graphImports) {
145
- synthesizeForFile(filePath, importedFiles);
280
+ const provider = getProviderForFile(filePath);
281
+ if (!provider)
282
+ continue;
283
+ dispatchSynthesis(filePath, importedFiles, provider);
146
284
  }
147
285
  // Build Python module-alias maps for namespace-import languages.
148
286
  // `import models` in app.py → moduleAliasMap['app.py']['models'] = 'models.py'
@@ -246,14 +246,17 @@ export const streamAllCSVsToDisk = async (graph, repoPath, csvDir) => {
246
246
  Interface: interfaceWriter,
247
247
  CodeElement: codeElemWriter,
248
248
  };
249
- const seenFileIds = new Set();
249
+ // Deduplicate all node types — the pipeline can produce duplicate IDs across
250
+ // all symbol types (Class, Method, Function, etc.), not just File nodes.
251
+ // A single Set covering every label prevents PK violations on COPY.
252
+ const seenNodeIds = new Set();
250
253
  // --- SINGLE PASS over all nodes ---
251
254
  for (const node of graph.iterNodes()) {
255
+ if (seenNodeIds.has(node.id))
256
+ continue;
257
+ seenNodeIds.add(node.id);
252
258
  switch (node.label) {
253
259
  case 'File': {
254
- if (seenFileIds.has(node.id))
255
- break;
256
- seenFileIds.add(node.id);
257
260
  const content = await extractContent(node, contentCache);
258
261
  await fileWriter.addRow([
259
262
  escapeCSVField(node.id),
@@ -1,5 +1,33 @@
1
1
  import lbug from '@ladybugdb/core';
2
2
  import { KnowledgeGraph } from '../graph/types.js';
3
+ /** Factory for creating WriteStreams — injectable for testing. */
4
+ export type WriteStreamFactory = (filePath: string) => import('fs').WriteStream;
5
+ /** Result of splitting the relationship CSV into per-label-pair files. */
6
+ export interface RelCsvSplitResult {
7
+ relHeader: string;
8
+ relsByPairMeta: Map<string, {
9
+ csvPath: string;
10
+ rows: number;
11
+ }>;
12
+ pairWriteStreams: Map<string, import('fs').WriteStream>;
13
+ skippedRels: number;
14
+ totalValidRels: number;
15
+ }
16
+ /**
17
+ * Split a relationship CSV into per-label-pair files on disk.
18
+ *
19
+ * Streams the CSV line-by-line, routing each relationship to a file named
20
+ * `rel_{fromLabel}_{toLabel}.csv`. Handles backpressure correctly: only one
21
+ * drain listener per stream at a time, and readline resumes only when ALL
22
+ * backpressured streams have drained.
23
+ *
24
+ * @param csvPath Path to the combined relationship CSV
25
+ * @param csvDir Directory to write per-pair CSV files
26
+ * @param validTables Set of valid node table names
27
+ * @param getNodeLabel Function to extract the label from a node ID
28
+ * @param wsFactory Optional WriteStream factory (defaults to fs.createWriteStream)
29
+ */
30
+ export declare const splitRelCsvByLabelPair: (csvPath: string, csvDir: string, validTables: Set<string>, getNodeLabel: (id: string) => string, wsFactory?: WriteStreamFactory) => Promise<RelCsvSplitResult>;
3
31
  /** Expose the current Database for pool adapter reuse in tests. */
4
32
  export declare const getDatabase: () => lbug.Database | null;
5
33
  /**
@@ -5,6 +5,122 @@ import path from 'path';
5
5
  import lbug from '@ladybugdb/core';
6
6
  import { NODE_TABLES, REL_TABLE_NAME, SCHEMA_QUERIES, EMBEDDING_TABLE_NAME, } from './schema.js';
7
7
  import { streamAllCSVsToDisk } from './csv-generator.js';
8
/**
 * Split a relationship CSV into per-label-pair files on disk.
 *
 * Streams the CSV line-by-line, routing each relationship to a file named
 * `rel_{fromLabel}_{toLabel}.csv` inside `csvDir`. The first line of the input
 * is treated as the header and is copied to the top of every per-pair file.
 * Blank lines are ignored; lines whose first two quoted fields cannot be
 * extracted, or whose endpoint labels are not in `validTables`, are counted in
 * `skippedRels` and dropped.
 *
 * Backpressure: when a write reports a full buffer the reader is paused, at
 * most one `drain` listener is registered per stream at a time, and reading
 * resumes only once ALL backpressured streams have drained.
 *
 * Failure handling: an error on the input stream, on readline, on any
 * per-pair write stream, or a synchronous throw while routing a line (e.g.
 * from `getNodeLabel` or `wsFactory`) tears everything down and rejects the
 * returned Promise exactly once.
 *
 * The returned write streams are still open; the caller is responsible for
 * ending them.
 *
 * @param csvPath       Path to the combined relationship CSV
 * @param csvDir        Directory to write per-pair CSV files
 * @param validTables   Set of valid node table names
 * @param getNodeLabel  Function to extract the label from a node ID
 * @param wsFactory     Optional WriteStream factory (defaults to fs.createWriteStream)
 * @returns `{ relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels }`
 *   where both maps are keyed by `"fromLabel|toLabel"`.
 */
export const splitRelCsvByLabelPair = async (csvPath, csvDir, validTables, getNodeLabel, wsFactory = (p) => createWriteStream(p, 'utf-8')) => {
    let relHeader = '';
    const relsByPairMeta = new Map();
    const pairWriteStreams = new Map();
    let skippedRels = 0;
    let totalValidRels = 0;
    await new Promise((resolve, reject) => {
        const inputStream = createReadStream(csvPath, 'utf-8');
        const rl = createInterface({
            input: inputStream,
            crlfDelay: Infinity,
        });
        // Pair keys whose stream already has a pending drain listener.
        // rl.pause() is not synchronous — buffered line events keep firing
        // after pause(), and without this guard every such line targeting the
        // same pair would register another drain listener.
        const waitingForDrain = new Set();
        let settled = false;
        // Teardown is best-effort: a failure while closing one resource must
        // not prevent the others from being released or mask the root error.
        const quietly = (release) => {
            try {
                release();
            }
            catch { }
        };
        const cleanup = (err) => {
            if (settled)
                return;
            settled = true;
            quietly(() => rl.close());
            quietly(() => inputStream.destroy());
            for (const ws of pairWriteStreams.values()) {
                quietly(() => ws.destroy());
            }
            reject(err);
        };
        // Listen on the read stream directly (e.g. ENOENT on a missing file)
        // rather than relying on readline to forward input errors.
        inputStream.on('error', cleanup);
        rl.on('error', cleanup);
        let isFirst = true;
        // Routes one CSV line to its per-pair stream; may throw synchronously.
        const routeLine = (line) => {
            if (isFirst) {
                relHeader = line;
                isFirst = false;
                return;
            }
            if (!line.trim())
                return;
            const match = line.match(/"([^"]*)","([^"]*)"/);
            if (!match) {
                skippedRels++;
                return;
            }
            const fromLabel = getNodeLabel(match[1]);
            const toLabel = getNodeLabel(match[2]);
            if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
                skippedRels++;
                return;
            }
            const pairKey = `${fromLabel}|${toLabel}`;
            let ws = pairWriteStreams.get(pairKey);
            if (!ws) {
                const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
                ws = wsFactory(pairCsvPath);
                // Register before any further work so cleanup() can always
                // reach (and destroy) this stream.
                pairWriteStreams.set(pairKey, ws);
                relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
                // If any per-pair WriteStream errors (disk full, EMFILE, etc.),
                // tear down everything and reject. Without this handler, a
                // stream error while rl is paused waiting for drain would leave
                // the Promise hanging forever.
                ws.on('error', cleanup);
                ws.write(`${relHeader}\n`);
            }
            const ok = ws.write(`${line}\n`);
            relsByPairMeta.get(pairKey).rows++;
            totalValidRels++;
            // Backpressure: pause reading while any write buffer is full so
            // memory stays bounded on repos with millions of relationships.
            if (!ok && !waitingForDrain.has(pairKey)) {
                waitingForDrain.add(pairKey);
                rl.pause();
                ws.once('drain', () => {
                    waitingForDrain.delete(pairKey);
                    // Resume only when every backpressured stream has drained,
                    // to avoid writing into still-full streams.
                    if (waitingForDrain.size === 0)
                        rl.resume();
                });
            }
        };
        rl.on('line', (line) => {
            // Lines already buffered by readline keep arriving after close();
            // ignore them once the Promise has settled so nothing is written
            // into destroyed streams.
            if (settled)
                return;
            try {
                routeLine(line);
            }
            catch (err) {
                // A throw inside an event listener would otherwise surface as
                // an uncaught exception and leave the Promise pending.
                cleanup(err);
            }
        });
        rl.on('close', () => {
            if (!settled) {
                settled = true;
                resolve();
            }
        });
    });
    return { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels };
};
8
124
  let db = null;
9
125
  let conn = null;
10
126
  let currentDbPath = null;
@@ -215,69 +331,16 @@ export const loadGraphToLbug = async (graph, repoPath, storagePath, onProgress)
215
331
  }
216
332
  }
217
333
  // Bulk COPY relationships — split by FROM→TO label pair (LadybugDB requires it)
218
- // Stream-read the relation CSV line by line and write directly to per-pair
219
- // temp files on disk. This avoids accumulating potentially millions of CSV
220
- // lines in memory which could exceed V8 Map or array limits on large repos.
221
- let relHeader = '';
222
- const relsByPairMeta = new Map();
223
- const pairWriteStreams = new Map();
224
- let skippedRels = 0;
225
- let totalValidRels = 0;
226
- await new Promise((resolve, reject) => {
227
- const rl = createInterface({
228
- input: createReadStream(csvResult.relCsvPath, 'utf-8'),
229
- crlfDelay: Infinity,
230
- });
231
- let isFirst = true;
232
- rl.on('line', (line) => {
233
- if (isFirst) {
234
- relHeader = line;
235
- isFirst = false;
236
- return;
237
- }
238
- if (!line.trim())
239
- return;
240
- const match = line.match(/"([^"]*)","([^"]*)"/);
241
- if (!match) {
242
- skippedRels++;
243
- return;
244
- }
245
- const fromLabel = getNodeLabel(match[1]);
246
- const toLabel = getNodeLabel(match[2]);
247
- if (!validTables.has(fromLabel) || !validTables.has(toLabel)) {
248
- skippedRels++;
249
- return;
250
- }
251
- const pairKey = `${fromLabel}|${toLabel}`;
252
- let ws = pairWriteStreams.get(pairKey);
253
- if (!ws) {
254
- const pairCsvPath = path.join(csvDir, `rel_${fromLabel}_${toLabel}.csv`);
255
- ws = createWriteStream(pairCsvPath, 'utf-8');
256
- ws.write(relHeader + '\n');
257
- pairWriteStreams.set(pairKey, ws);
258
- relsByPairMeta.set(pairKey, { csvPath: pairCsvPath, rows: 0 });
259
- }
260
- const ok = ws.write(line + '\n');
261
- relsByPairMeta.get(pairKey).rows++;
262
- totalValidRels++;
263
- // Handle backpressure: pause reading when the write buffer is full,
264
- // resume when the stream drains. Prevents unbounded memory growth
265
- // on repos with millions of relationships.
266
- if (!ok) {
267
- rl.pause();
268
- ws.once('drain', () => rl.resume());
269
- }
270
- });
271
- rl.on('close', resolve);
272
- rl.on('error', (err) => {
273
- // Destroy all open write streams to avoid resource leaks
274
- for (const ws of pairWriteStreams.values())
275
- ws.destroy();
276
- reject(err);
277
- });
278
- });
334
+ const { relHeader, relsByPairMeta, pairWriteStreams, skippedRels, totalValidRels } = await splitRelCsvByLabelPair(csvResult.relCsvPath, csvDir, validTables, getNodeLabel);
279
335
  // Close all per-pair write streams before COPY
280
- await Promise.all(Array.from(pairWriteStreams.values()).map((ws) => new Promise((resolve, reject) => ws.end((err) => (err ? reject(err) : resolve())))));
336
+ await Promise.all(Array.from(pairWriteStreams.values()).map((ws) => new Promise((resolve, reject) => {
337
+ const onError = (err) => reject(err);
338
+ ws.on('error', onError);
339
+ ws.end(() => {
340
+ ws.removeListener('error', onError);
341
+ resolve();
342
+ });
343
+ })));
281
344
  const insertedRels = totalValidRels;
282
345
  const warnings = [];
283
346
  if (insertedRels > 0) {
@@ -149,7 +149,7 @@ export async function runFullAnalysis(repoPath, options, callbacks) {
149
149
  const batch = cachedEmbeddings.slice(i, i + EMBED_BATCH);
150
150
  const paramsList = batch.map((e) => ({ nodeId: e.nodeId, embedding: e.embedding }));
151
151
  try {
152
- await executeWithReusedStatement(`CREATE (e:CodeEmbedding {nodeId: $nodeId, embedding: $embedding})`, paramsList);
152
+ await executeWithReusedStatement(`MERGE (e:CodeEmbedding {nodeId: $nodeId}) SET e.embedding = $embedding`, paramsList);
153
153
  }
154
154
  catch {
155
155
  /* some may fail if node was removed, that's fine */
@@ -1277,6 +1277,26 @@ export const createServer = async (port, host = '127.0.0.1') => {
1277
1277
  const lbugPath = path.join(entry.storagePath, 'lbug');
1278
1278
  await withLbugDb(lbugPath, async () => {
1279
1279
  const { runEmbeddingPipeline } = await import('../core/embeddings/embedding-pipeline.js');
1280
+ // Skip nodes that already have embeddings — Kuzu forbids SET on vector-indexed properties.
1281
+ let skipNodeIds;
1282
+ try {
1283
+ const rows = await executeQuery('MATCH (e:CodeEmbedding) RETURN e.nodeId AS nodeId');
1284
+ if (rows && rows.length > 0) {
1285
+ skipNodeIds = new Set(rows.map((r) => r.nodeId ?? r[0]).filter(Boolean));
1286
+ console.log(`[embed] ${skipNodeIds.size} nodes already embedded — skipping in incremental run`);
1287
+ }
1288
+ }
1289
+ catch (err) {
1290
+ // Swallow only "table does not exist" — let real connection errors propagate.
1291
+ // Log so ops can see this path fire if Kuzu ever changes error wording.
1292
+ const msg = err?.message ?? '';
1293
+ if (msg.includes('does not exist') || msg.includes('not found')) {
1294
+ console.log(`[embed] CodeEmbedding table not yet present — full embedding run (${msg})`);
1295
+ }
1296
+ else {
1297
+ throw err;
1298
+ }
1299
+ }
1280
1300
  await runEmbeddingPipeline(executeQuery, executeWithReusedStatement, (p) => {
1281
1301
  embedJobManager.updateJob(job.id, {
1282
1302
  progress: {
@@ -1293,7 +1313,8 @@ export const createServer = async (port, host = '127.0.0.1') => {
1293
1313
  : `${p.phase} (${p.percent}%)`,
1294
1314
  },
1295
1315
  });
1296
- });
1316
+ }, {}, // config: use defaults (runEmbeddingPipeline signature: executeQuery, executeWithReusedStatement, onProgress, config, skipNodeIds)
1317
+ skipNodeIds);
1297
1318
  });
1298
1319
  clearTimeout(embedTimeout);
1299
1320
  releaseRepoLock(repoLockPath);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "gitnexus",
3
- "version": "1.6.1",
3
+ "version": "1.6.2-rc.2",
4
4
  "description": "Graph-powered code intelligence for AI agents. Index any codebase, query via MCP or CLI.",
5
5
  "author": "Abhigyan Patwari",
6
6
  "license": "PolyForm-Noncommercial-1.0.0",
@@ -84,7 +84,7 @@
84
84
  "uuid": "^13.0.0"
85
85
  },
86
86
  "optionalDependencies": {
87
- "tree-sitter-dart": "https://github.com/UserNobody14/tree-sitter-dart/archive/80e23c07b64494f7e21090bb3450223ef0b192f4.tar.gz",
87
+ "tree-sitter-dart": "git+https://github.com/UserNobody14/tree-sitter-dart.git#80e23c07b64494f7e21090bb3450223ef0b192f4",
88
88
  "tree-sitter-kotlin": "^0.3.8",
89
89
  "tree-sitter-proto": "file:./vendor/tree-sitter-proto",
90
90
  "tree-sitter-swift": "^0.6.0"