gitnexus 1.6.8-rc.41 → 1.6.8-rc.43

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -95,6 +95,23 @@ export interface PipelineOptions {
95
95
  /** Per-run `TAINT_PATH` edge cap (#2084 review P1-3). `undefined` ⇒
96
96
  * `DEFAULT_PDG_MAX_INTERPROC_EDGES` (1000); `0` ⇒ no cap. */
97
97
  pdgMaxInterprocEdges?: number;
98
+ /**
99
+ * Streaming/chunked PDG graph emit (#2202). When true, the BasicBlock +
100
+ * intra-file PDG-edge layer (CFG / REACHING_DEF / CDG / POST_DOMINATE /
101
+ * TAINTED / SANITIZES) is streamed to CSV-on-disk during the scope-resolution
102
+ * emit loop instead of being materialized in the in-memory graph, bounding
103
+ * peak RSS to O(chunk) rather than O(graph) at full-kernel scale. Already
104
+ * gated by the caller to full-rebuild runs only (the incremental writeback
105
+ * reads BasicBlocks back from the in-memory graph). Memory-only — produces a
106
+ * byte-identical persisted graph and is NOT part of `RepoMeta.pdg`, so
107
+ * toggling it never trips `pdgModeMismatch`. Default/false ⇒ today's
108
+ * whole-graph emit.
109
+ */
110
+ streamPdgEmit?: boolean;
111
+ /** Streamed PDG-emit write buffer (rows) when `streamPdgEmit` is on (#2202).
112
+ * `undefined` ⇒ `DEFAULT_PDG_EMIT_CHUNK_ROWS`. Memory-only; does not affect
113
+ * emitted bytes. */
114
+ pdgEmitChunkSize?: number;
98
115
  /**
99
116
  * Request parsing with the worker pool disabled. The sequential parser was
100
117
  * removed — the worker pool is the sole parse path — so setting this now
@@ -76,7 +76,10 @@ export const runPipelineFromRepo = async (repoPath, onProgress, options) => {
76
76
  const { totalFiles, usedWorkerPool } = getPhaseOutput(results, 'parse');
77
77
  let communityResult;
78
78
  let processResult;
79
- const resolutionOutcomes = getPhaseOutput(results, 'scopeResolution').resolutionOutcomes;
79
+ const scopeResolutionOutput = getPhaseOutput(results, 'scopeResolution');
80
+ const resolutionOutcomes = scopeResolutionOutput.resolutionOutcomes;
81
+ // Streamed PDG-emit manifest (#2202): present only when streaming was on.
82
+ const pdgEmitManifest = scopeResolutionOutput.pdgEmitManifest;
80
83
  if (!options?.skipGraphPhases) {
81
84
  communityResult = getPhaseOutput(results, 'communities').communityResult;
82
85
  processResult = getPhaseOutput(results, 'processes').processResult;
@@ -101,5 +104,6 @@ export const runPipelineFromRepo = async (repoPath, onProgress, options) => {
101
104
  processResult,
102
105
  resolutionOutcomes,
103
106
  usedWorkerPool,
107
+ pdgEmitManifest,
104
108
  };
105
109
  };
@@ -27,6 +27,7 @@ import type { PipelinePhase } from '../../pipeline-phases/types.js';
27
27
  import { SupportedLanguages } from '../../../../_shared/index.js';
28
28
  import type { ResolutionOutcome } from '../resolution-outcome.js';
29
29
  import type { FunctionSummary } from '../../taint/summary-model.js';
30
+ import { type PdgEmitManifest } from '../../../lbug/pdg-emit-sink.js';
30
31
  export interface ScopeResolutionOutput {
31
32
  /** True when at least one language ran. */
32
33
  readonly ran: boolean;
@@ -50,5 +51,13 @@ export interface ScopeResolutionOutput {
50
51
  * The `taintSummaries` phase composes these over the `CALLS` graph.
51
52
  */
52
53
  readonly functionSummaries: readonly FunctionSummary[];
54
+ /**
55
+ * Streamed PDG-emit COPY manifest (#2202). Present only when streaming was on
56
+ * (full rebuild + `--pdg` + enabled): the BasicBlock node CSV + per-pair PDG
57
+ * edge CSVs that were flushed to disk during the emit loop, for the persistence
58
+ * step to COPY alongside the structural CSVs. Absent ⇒ the PDG layer (if any)
59
+ * is in the in-memory graph and persists via the normal whole-graph emit.
60
+ */
61
+ readonly pdgEmitManifest?: PdgEmitManifest;
53
62
  }
54
63
  export declare const scopeResolutionPhase: PipelinePhase<ScopeResolutionOutput>;
@@ -34,6 +34,8 @@ import { isDev, isSemanticModelValidatorEnabled } from '../../utils/env.js';
34
34
  import { logHeapProbe } from '../../utils/heap-probe.js';
35
35
  import { clearParsedFileStore, loadParsedFilesForPaths, forceGc, } from '../../../../storage/parsedfile-store.js';
36
36
  import { buildFunctionNodeIndex } from '../../taint/summary-harvest-driver.js';
37
+ import { PdgEmitSink } from '../../../lbug/pdg-emit-sink.js';
38
+ import { resolveNativeSafeStorageDir } from '../../../lbug/lbug-config.js';
37
39
  import { logger } from '../../../logger.js';
38
40
  const NOOP_OUTPUT = Object.freeze({
39
41
  ran: false,
@@ -179,213 +181,265 @@ export const scopeResolutionPhase = {
179
181
  const sharedFnNodeIndex = ctx.options?.pdg === true && totalScopeFiles > 0
180
182
  ? buildFunctionNodeIndex(ctx.graph)
181
183
  : undefined;
182
- for (const [lang, provider] of SCOPE_RESOLVERS) {
183
- // Standalone providers (COBOL, JCL) don't emit graph edges yet
184
- // through the scope-resolution path. This is the canonical guard:
185
- // runScopeResolution is never called for standalone providers, which
186
- // keeps cobolPhase as the sole IMPORTS edge producer. Keep this guard
187
- // in sync with any additional standalone providers added to
188
- // SCOPE_RESOLVERS.
189
- if (provider.languageProvider.parseStrategy === 'standalone')
190
- continue;
191
- const primaryLangFiles = filesByLang.get(lang) ?? [];
192
- if (primaryLangFiles.length === 0)
193
- continue;
194
- const primaryFilePaths = primaryLangFiles.map((f) => f.path);
195
- // Load per-language import-resolution config (tsconfig paths,
196
- // composer.json autoload, go.mod, ...). One I/O round trip per
197
- // workspace pass — cached implicitly by the result handed to
198
- // every `resolveImportTarget` call below.
199
- const resolutionConfig = provider.loadResolutionConfig !== undefined
200
- ? await provider.loadResolutionConfig(ctx.repoPath)
201
- : undefined;
202
- // Some languages (e.g. Vue) expand their file universe beyond the
203
- // primary-language files via the `collectScopeContextPaths` hook.
204
- // The hook receives raw source contents of the primary files so it
205
- // can trace import closures without a second tree-sitter parse.
206
- //
207
- // To avoid reading primary files twice (once for the hook, once for
208
- // the resolution pass), we read them upfront and merge with the
209
- // extra context paths the hook may add.
210
- // Stream this language's pre-built ParsedFiles in from the disk store
211
- // FIRST (huge-repo path). Doing it before reading source lets us skip
212
- // loading content for files the store already covers — for a provider
213
- // with no content-consuming hook that source is pure dead weight once
214
- // extraction is served from the store (~1.5 GB on the kernel's C pass).
215
- // Merged into `preExtractedByPath`; the per-language release block below
216
- // evicts these again before the next language, so only one language's
217
- // ParsedFiles are resident at a time.
218
- const loadStoreFor = async (paths) => {
219
- if (!parsedFileStorePath)
220
- return;
221
- const fromDisk = await loadParsedFilesForPaths(parsedFileStorePath, paths);
222
- for (const [fp, pf] of fromDisk)
223
- preExtractedByPath.set(fp, pf);
224
- };
225
- // A provider that feeds source text into a post-extract hook
226
- // (populateWorkspaceOwners / populateNamespaceSiblings /
227
- // populateRangeBindings / emitPostResolutionEdges) needs content for ALL
228
- // its files; one without those hooks only needs content for files the
229
- // store does NOT cover (fresh-extract fallback). Keep this in sync with
230
- // the getFileContents() call-sites in run.ts.
231
- const providerNeedsAllContent = provider.populateWorkspaceOwners !== undefined ||
232
- provider.populateNamespaceSiblings !== undefined ||
233
- provider.populateRangeBindings !== undefined ||
234
- provider.emitPostResolutionEdges !== undefined;
235
- let scopeFilePaths;
236
- let contents;
237
- if (provider.collectScopeContextPaths !== undefined) {
238
- // Context-expanding providers (e.g. Vue) need every primary file's
239
- // source up front for the closure hook, so load it all.
240
- const entryFileContents = await readFileContents(ctx.repoPath, primaryFilePaths);
241
- scopeFilePaths = provider.collectScopeContextPaths({
242
- primaryFilePaths,
243
- preExtractedByPath,
244
- entryFileContents,
245
- allScannedPaths,
246
- resolutionConfig,
247
- });
248
- // Read only the extra context files (TS/JS etc.) not already loaded.
249
- const extraPaths = [...scopeFilePaths].filter((p) => !entryFileContents.has(p));
250
- const extraContents = await readFileContents(ctx.repoPath, extraPaths);
251
- contents = new Map([...entryFileContents, ...extraContents]);
252
- await loadStoreFor(scopeFilePaths);
184
+ // Streaming/chunked PDG emit (#2202): when enabled (the caller has already
185
+ // gated this to full-rebuild + `--pdg`), route the BasicBlock + intra-file
186
+ // PDG-edge layer to CSV-on-disk through one sink shared across every
187
+ // language pass, so it never accumulates in `ctx.graph` (peak RSS O(chunk)).
188
+ // Needs the storage dir (the parse-cache store path, the same `.gitnexus`
189
+ // dir loadGraphToLbug COPYs from); if that is somehow absent we skip
190
+ // streaming and fall back to the in-memory whole-graph emit.
191
+ let pdgEmitSink;
192
+ if (ctx.options?.streamPdgEmit === true && totalScopeFiles > 0) {
193
+ if (parsedFileStorePath) {
194
+ pdgEmitSink = new PdgEmitSink(ctx.graph,
195
+ // Same ASCII-safe relocation the structural CSVs get (#2202 review #2):
196
+ // on Windows non-ASCII storage paths the COPY can't open files under
197
+ // the native path, so the dir is relocated to a hashed os.tmpdir().
198
+ resolveNativeSafeStorageDir(parsedFileStorePath, 'pdg-csv'), ctx.options?.pdgEmitChunkSize);
253
199
  }
254
200
  else {
255
- scopeFilePaths = new Set(primaryFilePaths);
256
- await loadStoreFor(scopeFilePaths);
257
- const pathsToRead = providerNeedsAllContent
258
- ? primaryFilePaths
259
- : primaryFilePaths.filter((p) => !preExtractedByPath.has(p));
260
- contents = await readFileContents(ctx.repoPath, pathsToRead);
201
+ logger.warn('[scope-resolution] streaming PDG emit requested but no storage path is ' +
202
+ 'available; falling back to in-memory whole-graph emit');
261
203
  }
262
- const filePaths = [...scopeFilePaths];
263
- const files = [];
264
- for (const fp of filePaths) {
265
- const content = contents.get(fp);
266
- if (content !== undefined) {
267
- files.push({ path: fp, content });
204
+ }
205
+ // Cross-pass per-file dedup set for the streaming sink (#2202): one set
206
+ // shared across every language pass so a file emitted in two passes (e.g. a
207
+ // `.ts` module pulled into the Vue context pass) streams its PDG layer once.
208
+ // Only created when streaming — the in-memory-graph path dedups via its Map.
209
+ const pdgEmittedFiles = pdgEmitSink !== undefined ? new Set() : undefined;
210
+ // Stream the PDG layer with guaranteed writer cleanup: a throw escaping the
211
+ // per-language loop (outside run.ts's per-file try/catch — e.g. from
212
+ // finalize/propagate/a provider hook) must still release the sink's file
213
+ // descriptors. finalize() runs on the success path; the finally closes the
214
+ // sink only when finalize did not (idempotent via the sink's `finalized`).
215
+ let pdgEmitManifest;
216
+ let pdgSinkSettled = false;
217
+ try {
218
+ for (const [lang, provider] of SCOPE_RESOLVERS) {
219
+ // Standalone providers (COBOL, JCL) don't emit graph edges yet
220
+ // through the scope-resolution path. This is the canonical guard:
221
+ // runScopeResolution is never called for standalone providers, which
222
+ // keeps cobolPhase as the sole IMPORTS edge producer. Keep this guard
223
+ // in sync with any additional standalone providers added to
224
+ // SCOPE_RESOLVERS.
225
+ if (provider.languageProvider.parseStrategy === 'standalone')
226
+ continue;
227
+ const primaryLangFiles = filesByLang.get(lang) ?? [];
228
+ if (primaryLangFiles.length === 0)
229
+ continue;
230
+ const primaryFilePaths = primaryLangFiles.map((f) => f.path);
231
+ // Load per-language import-resolution config (tsconfig paths,
232
+ // composer.json autoload, go.mod, ...). One I/O round trip per
233
+ // workspace pass — cached implicitly by the result handed to
234
+ // every `resolveImportTarget` call below.
235
+ const resolutionConfig = provider.loadResolutionConfig !== undefined
236
+ ? await provider.loadResolutionConfig(ctx.repoPath)
237
+ : undefined;
238
+ // Some languages (e.g. Vue) expand their file universe beyond the
239
+ // primary-language files via the `collectScopeContextPaths` hook.
240
+ // The hook receives raw source contents of the primary files so it
241
+ // can trace import closures without a second tree-sitter parse.
242
+ //
243
+ // To avoid reading primary files twice (once for the hook, once for
244
+ // the resolution pass), we read them upfront and merge with the
245
+ // extra context paths the hook may add.
246
+ // Stream this language's pre-built ParsedFiles in from the disk store
247
+ // FIRST (huge-repo path). Doing it before reading source lets us skip
248
+ // loading content for files the store already covers — for a provider
249
+ // with no content-consuming hook that source is pure dead weight once
250
+ // extraction is served from the store (~1.5 GB on the kernel's C pass).
251
+ // Merged into `preExtractedByPath`; the per-language release block below
252
+ // evicts these again before the next language, so only one language's
253
+ // ParsedFiles are resident at a time.
254
+ const loadStoreFor = async (paths) => {
255
+ if (!parsedFileStorePath)
256
+ return;
257
+ const fromDisk = await loadParsedFilesForPaths(parsedFileStorePath, paths);
258
+ for (const [fp, pf] of fromDisk)
259
+ preExtractedByPath.set(fp, pf);
260
+ };
261
+ // A provider that feeds source text into a post-extract hook
262
+ // (populateWorkspaceOwners / populateNamespaceSiblings /
263
+ // populateRangeBindings / emitPostResolutionEdges) needs content for ALL
264
+ // its files; one without those hooks only needs content for files the
265
+ // store does NOT cover (fresh-extract fallback). Keep this in sync with
266
+ // the getFileContents() call-sites in run.ts.
267
+ const providerNeedsAllContent = provider.populateWorkspaceOwners !== undefined ||
268
+ provider.populateNamespaceSiblings !== undefined ||
269
+ provider.populateRangeBindings !== undefined ||
270
+ provider.emitPostResolutionEdges !== undefined;
271
+ let scopeFilePaths;
272
+ let contents;
273
+ if (provider.collectScopeContextPaths !== undefined) {
274
+ // Context-expanding providers (e.g. Vue) need every primary file's
275
+ // source up front for the closure hook, so load it all.
276
+ const entryFileContents = await readFileContents(ctx.repoPath, primaryFilePaths);
277
+ scopeFilePaths = provider.collectScopeContextPaths({
278
+ primaryFilePaths,
279
+ preExtractedByPath,
280
+ entryFileContents,
281
+ allScannedPaths,
282
+ resolutionConfig,
283
+ });
284
+ // Read only the extra context files (TS/JS etc.) not already loaded.
285
+ const extraPaths = [...scopeFilePaths].filter((p) => !entryFileContents.has(p));
286
+ const extraContents = await readFileContents(ctx.repoPath, extraPaths);
287
+ contents = new Map([...entryFileContents, ...extraContents]);
288
+ await loadStoreFor(scopeFilePaths);
268
289
  }
269
- else if (preExtractedByPath.has(fp)) {
270
- // Store covers extraction for this file and we deliberately skipped
271
- // reading its source; the empty string is never consumed (the
272
- // extract loop uses the pre-extracted ParsedFile and this provider
273
- // has no content hook).
274
- files.push({ path: fp, content: '' });
290
+ else {
291
+ scopeFilePaths = new Set(primaryFilePaths);
292
+ await loadStoreFor(scopeFilePaths);
293
+ const pathsToRead = providerNeedsAllContent
294
+ ? primaryFilePaths
295
+ : primaryFilePaths.filter((p) => !preExtractedByPath.has(p));
296
+ contents = await readFileContents(ctx.repoPath, pathsToRead);
275
297
  }
276
- // else: uncovered AND unreadable → skip (unchanged from prior behavior).
277
- }
278
- const langFileCount = files.length;
279
- logHeapProbe('scope-lang-start', `lang=${lang} files=${langFileCount} contentsLoaded=${contents.size}`);
280
- const langLabel = lang.charAt(0).toUpperCase() + lang.slice(1);
281
- currentLangIdx++;
282
- const langTag = totalScopeLangs > 1 ? `${langLabel} [${currentLangIdx}/${totalScopeLangs}]` : langLabel;
283
- if (totalScopeFiles > 0) {
284
- const pct = SCOPE_PCT_START + Math.round((processedScopeFiles / totalScopeFiles) * SCOPE_PCT_RANGE);
285
- ctx.onProgress({
286
- phase: 'scopeResolution',
287
- percent: pct,
288
- message: 'Resolving types',
289
- detail: `${langTag}, ${langFileCount.toLocaleString()} files`,
290
- });
291
- }
292
- const stats = runScopeResolution({
293
- graph: ctx.graph,
294
- model,
295
- files,
296
- resolutionConfig,
297
- prebuiltNodeLookup: sharedNodeLookup,
298
- prebuiltFunctionNodeIndex: sharedFnNodeIndex,
299
- preExtractedParsedFiles: preExtractedByPath,
300
- scopeIndexStorePath: parsedFileStorePath,
301
- // CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph.
302
- pdg: ctx.options?.pdg === true,
303
- pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction,
304
- pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction,
305
- pdgMaxCdgEdgesPerFunction: ctx.options?.pdgMaxCdgEdgesPerFunction,
306
- pdgMaxTaintFindingsPerFunction: ctx.options?.pdgMaxTaintFindingsPerFunction,
307
- pdgMaxTaintHops: ctx.options?.pdgMaxTaintHops,
308
- recordResolutionOutcome: (outcome) => {
309
- resolutionOutcomes.push(outcome);
310
- },
311
- onWarn: (msg) => {
312
- if (isSemanticModelValidatorEnabled()) {
313
- logger.warn(`[scope-resolution:${lang}] ${msg}`);
298
+ const filePaths = [...scopeFilePaths];
299
+ const files = [];
300
+ for (const fp of filePaths) {
301
+ const content = contents.get(fp);
302
+ if (content !== undefined) {
303
+ files.push({ path: fp, content });
314
304
  }
315
- },
316
- onProgress: totalScopeFiles > 0
317
- ? (subPhase, current, total) => {
318
- let langRatio;
319
- switch (subPhase) {
320
- case 'extracting':
321
- langRatio = total > 0 ? (current / total) * 0.5 : 0;
322
- break;
323
- case 'analyzing types':
324
- langRatio = 0.5;
325
- break;
326
- case 'resolving references':
327
- langRatio = 0.7;
328
- break;
329
- case 'linking symbols':
330
- langRatio = 0.85;
331
- break;
332
- default: {
333
- const _exhaustive = subPhase;
334
- langRatio = 0.85;
305
+ else if (preExtractedByPath.has(fp)) {
306
+ // Store covers extraction for this file and we deliberately skipped
307
+ // reading its source; the empty string is never consumed (the
308
+ // extract loop uses the pre-extracted ParsedFile and this provider
309
+ // has no content hook).
310
+ files.push({ path: fp, content: '' });
311
+ }
312
+ // else: uncovered AND unreadable → skip (unchanged from prior behavior).
313
+ }
314
+ const langFileCount = files.length;
315
+ logHeapProbe('scope-lang-start', `lang=${lang} files=${langFileCount} contentsLoaded=${contents.size}`);
316
+ const langLabel = lang.charAt(0).toUpperCase() + lang.slice(1);
317
+ currentLangIdx++;
318
+ const langTag = totalScopeLangs > 1 ? `${langLabel} [${currentLangIdx}/${totalScopeLangs}]` : langLabel;
319
+ if (totalScopeFiles > 0) {
320
+ const pct = SCOPE_PCT_START + Math.round((processedScopeFiles / totalScopeFiles) * SCOPE_PCT_RANGE);
321
+ ctx.onProgress({
322
+ phase: 'scopeResolution',
323
+ percent: pct,
324
+ message: 'Resolving types',
325
+ detail: `${langTag}, ${langFileCount.toLocaleString()} files`,
326
+ });
327
+ }
328
+ const stats = runScopeResolution({
329
+ graph: ctx.graph,
330
+ model,
331
+ files,
332
+ resolutionConfig,
333
+ prebuiltNodeLookup: sharedNodeLookup,
334
+ prebuiltFunctionNodeIndex: sharedFnNodeIndex,
335
+ preExtractedParsedFiles: preExtractedByPath,
336
+ scopeIndexStorePath: parsedFileStorePath,
337
+ // CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph.
338
+ pdg: ctx.options?.pdg === true,
339
+ pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction,
340
+ pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction,
341
+ pdgMaxCdgEdgesPerFunction: ctx.options?.pdgMaxCdgEdgesPerFunction,
342
+ pdgMaxTaintFindingsPerFunction: ctx.options?.pdgMaxTaintFindingsPerFunction,
343
+ pdgMaxTaintHops: ctx.options?.pdgMaxTaintHops,
344
+ // Streaming PDG-emit sink (#2202) — undefined ⇒ emit to the in-memory graph.
345
+ pdgEmitSink,
346
+ // Cross-pass per-file dedup set (#2202) — undefined when not streaming.
347
+ pdgEmittedFiles,
348
+ recordResolutionOutcome: (outcome) => {
349
+ resolutionOutcomes.push(outcome);
350
+ },
351
+ onWarn: (msg) => {
352
+ if (isSemanticModelValidatorEnabled()) {
353
+ logger.warn(`[scope-resolution:${lang}] ${msg}`);
354
+ }
355
+ },
356
+ onProgress: totalScopeFiles > 0
357
+ ? (subPhase, current, total) => {
358
+ let langRatio;
359
+ switch (subPhase) {
360
+ case 'extracting':
361
+ langRatio = total > 0 ? (current / total) * 0.5 : 0;
362
+ break;
363
+ case 'analyzing types':
364
+ langRatio = 0.5;
365
+ break;
366
+ case 'resolving references':
367
+ langRatio = 0.7;
368
+ break;
369
+ case 'linking symbols':
370
+ langRatio = 0.85;
371
+ break;
372
+ default: {
373
+ const _exhaustive = subPhase;
374
+ langRatio = 0.85;
375
+ }
335
376
  }
377
+ const overallRatio = Math.min(1, (processedScopeFiles + langRatio * langFileCount) / totalScopeFiles);
378
+ const pct = SCOPE_PCT_START + Math.round(overallRatio * SCOPE_PCT_RANGE);
379
+ ctx.onProgress({
380
+ phase: 'scopeResolution',
381
+ percent: pct,
382
+ message: 'Resolving types',
383
+ detail: subPhase === 'extracting'
384
+ ? `${langTag} — extracting ${current.toLocaleString()}/${total.toLocaleString()} files`
385
+ : `${langTag} — ${subPhase}`,
386
+ });
336
387
  }
337
- const overallRatio = Math.min(1, (processedScopeFiles + langRatio * langFileCount) / totalScopeFiles);
338
- const pct = SCOPE_PCT_START + Math.round(overallRatio * SCOPE_PCT_RANGE);
339
- ctx.onProgress({
340
- phase: 'scopeResolution',
341
- percent: pct,
342
- message: 'Resolving types',
343
- detail: subPhase === 'extracting'
344
- ? `${langTag} extracting ${current.toLocaleString()}/${total.toLocaleString()} files`
345
- : `${langTag} ${subPhase}`,
346
- });
347
- }
348
- : undefined,
349
- }, provider);
350
- // Release file contents and pre-extracted entries after each language
351
- // to reduce memory pressure. For large codebases (16K+ PHP files),
352
- // holding all source code simultaneously with scope trees causes OOM.
353
- // See: https://github.com/abhigyanpatwari/GitNexus/issues/1741
354
- //
355
- // Use `filePaths` (not `primaryFilePaths`) so that any context files
356
- // added by `collectScopeContextPaths` (e.g. TS/JS files pulled in for
357
- // Vue cross-file resolution) are also evicted and not held until GC.
358
- files.length = 0;
359
- contents.clear();
360
- for (const fp of filePaths) {
361
- preExtractedByPath.delete(fp);
362
- }
363
- // This language's ParsedFiles are now unreachable (runScopeResolution has
364
- // returned and the Map entries are deleted). Force a GC HERE so a heavy
365
- // language's ~17-20GB set (e.g. C/C++ on the Linux kernel) is reclaimed
366
- // BEFORE the next language's store-load — instead of leaving V8 to collect
367
- // it lazily under the next pass's allocation pressure (which, at a cap >=
368
- // RAM, degrades into swap-thrash). Collects only dead objects: the live
369
- // cross-file index of the next pass is untouched. The pre/post probe
370
- // confirms whether old-space fragmentation defeats the reclaim.
371
- logHeapProbe('lang-release-pre-gc', `lang=${lang}`);
372
- forceGc();
373
- logHeapProbe('lang-release-post-gc', `lang=${lang}`);
374
- logHeapProbe('scope-lang-end', `lang=${lang} filesProcessed=${stats.filesProcessed}`);
375
- processedScopeFiles += langFileCount;
376
- anyRan = true;
377
- functionSummaries.push(...stats.functionSummaries);
378
- totalFiles += stats.filesProcessed;
379
- totalImports += stats.importsEmitted;
380
- totalRefs += stats.referenceEdgesEmitted;
381
- perLanguage.set(lang, {
382
- filesProcessed: stats.filesProcessed,
383
- importsEmitted: stats.importsEmitted,
384
- referenceEdgesEmitted: stats.referenceEdgesEmitted,
385
- });
386
- if (isDev) {
387
- logger.info(`[scope-resolution:${lang}] ${stats.filesProcessed} files → ${stats.importsEmitted} IMPORTS + ${stats.referenceEdgesEmitted} reference edges (${stats.resolve.unresolved} unresolved sites, ${stats.referenceSkipped} skipped)`);
388
+ : undefined,
389
+ }, provider);
390
+ // Release file contents and pre-extracted entries after each language
391
+ // to reduce memory pressure. For large codebases (16K+ PHP files),
392
+ // holding all source code simultaneously with scope trees causes OOM.
393
+ // See: https://github.com/abhigyanpatwari/GitNexus/issues/1741
394
+ //
395
+ // Use `filePaths` (not `primaryFilePaths`) so that any context files
396
+ // added by `collectScopeContextPaths` (e.g. TS/JS files pulled in for
397
+ // Vue cross-file resolution) are also evicted and not held until GC.
398
+ files.length = 0;
399
+ contents.clear();
400
+ for (const fp of filePaths) {
401
+ preExtractedByPath.delete(fp);
402
+ }
403
+ // This language's ParsedFiles are now unreachable (runScopeResolution has
404
+ // returned and the Map entries are deleted). Force a GC HERE so a heavy
405
+ // language's ~17-20GB set (e.g. C/C++ on the Linux kernel) is reclaimed
406
+ // BEFORE the next language's store-load — instead of leaving V8 to collect
407
+ // it lazily under the next pass's allocation pressure (which, at a cap >=
408
+ // RAM, degrades into swap-thrash). Collects only dead objects: the live
409
+ // cross-file index of the next pass is untouched. The pre/post probe
410
+ // confirms whether old-space fragmentation defeats the reclaim.
411
+ logHeapProbe('lang-release-pre-gc', `lang=${lang}`);
412
+ forceGc();
413
+ logHeapProbe('lang-release-post-gc', `lang=${lang}`);
414
+ logHeapProbe('scope-lang-end', `lang=${lang} filesProcessed=${stats.filesProcessed}`);
415
+ processedScopeFiles += langFileCount;
416
+ anyRan = true;
417
+ functionSummaries.push(...stats.functionSummaries);
418
+ totalFiles += stats.filesProcessed;
419
+ totalImports += stats.importsEmitted;
420
+ totalRefs += stats.referenceEdgesEmitted;
421
+ perLanguage.set(lang, {
422
+ filesProcessed: stats.filesProcessed,
423
+ importsEmitted: stats.importsEmitted,
424
+ referenceEdgesEmitted: stats.referenceEdgesEmitted,
425
+ });
426
+ if (isDev) {
427
+ logger.info(`[scope-resolution:${lang}] ${stats.filesProcessed} files → ${stats.importsEmitted} IMPORTS + ${stats.referenceEdgesEmitted} reference edges (${stats.resolve.unresolved} unresolved sites, ${stats.referenceSkipped} skipped)`);
428
+ }
388
429
  }
430
+ // Finalize the streaming PDG sink (#2202) once after the last language:
431
+ // flush + close its CSV writers and capture the COPY manifest. forceGc at
432
+ // the boundary reclaims transient write buffers (mirrors the per-language
433
+ // release above).
434
+ pdgEmitManifest = pdgEmitSink?.finalize();
435
+ pdgSinkSettled = true;
436
+ if (pdgEmitSink !== undefined)
437
+ forceGc();
438
+ }
439
+ finally {
440
+ // Release fds if a throw skipped finalize (idempotent with finalize()).
441
+ if (pdgEmitSink !== undefined && !pdgSinkSettled)
442
+ pdgEmitSink.close();
389
443
  }
390
444
  if (totalScopeFiles > 0 && anyRan) {
391
445
  ctx.onProgress({
@@ -407,8 +461,11 @@ export const scopeResolutionPhase = {
407
461
  /* best-effort cleanup */
408
462
  }
409
463
  }
464
+ // Even when no language ran, surface a finalized manifest (its CSVs are on
465
+ // disk) so loadGraphToLbug COPYs them rather than orphaning them — empty in
466
+ // the no-files case, harmless.
410
467
  if (!anyRan)
411
- return NOOP_OUTPUT;
468
+ return pdgEmitManifest ? { ...NOOP_OUTPUT, pdgEmitManifest } : NOOP_OUTPUT;
412
469
  return {
413
470
  ran: true,
414
471
  filesProcessed: totalFiles,
@@ -417,6 +474,7 @@ export const scopeResolutionPhase = {
417
474
  resolutionOutcomes,
418
475
  perLanguage,
419
476
  functionSummaries,
477
+ pdgEmitManifest,
420
478
  };
421
479
  },
422
480
  };
@@ -84,6 +84,30 @@ interface RunScopeResolutionInput {
84
84
  * `reason`; consumed by the U4 taint emit step). `undefined` ⇒
85
85
  * `DEFAULT_PDG_MAX_TAINT_HOPS` (32); `0` ⇒ no cap. */
86
86
  readonly pdgMaxTaintHops?: number;
87
+ /**
88
+ * Streaming PDG-emit sink (#2202). When present (streaming on, full rebuild),
89
+ * the `--pdg` emit routes BasicBlock nodes + intra-file PDG edges to THIS
90
+ * graph-shaped target instead of the in-memory `graph`, so the bulky PDG
91
+ * layer never accumulates in memory (peak RSS O(chunk)). Typed as a plain
92
+ * `KnowledgeGraph` so this module stays decoupled from the persistence layer;
93
+ * the caller (the scope-resolution phase) owns its lifecycle and finalizes it
94
+ * after the last language. Absent ⇒ the emit writes to `graph` as before
95
+ * (byte-identical default).
96
+ */
97
+ readonly pdgEmitSink?: KnowledgeGraph;
98
+ /**
99
+ * Cross-pass per-file dedup set for streaming PDG emit (#2202). Shared across
100
+ * every language pass (owned by the scope-resolution phase). A file imported
101
+ * by more than one language (e.g. a `.ts` module pulled into the Vue context
102
+ * pass) is PDG-emitted in each pass over the same `cfgSideChannel`, producing
103
+ * identical ids; the in-memory graph dedups that by id, but the streaming sink
104
+ * is dedup-free (to stay O(write buffer), not O(total ids)). So when present
105
+ * (streaming on), the emit loop skips a file whose PDG already streamed and
106
+ * records the rest — keeping the streamed set byte-identical to the
107
+ * Map-deduped whole-graph emit, for any language-pass order. Absent ⇒ no skip
108
+ * (the graph Map dedups), so the default path is unchanged.
109
+ */
110
+ readonly pdgEmittedFiles?: Set<string>;
87
111
  /**
88
112
  * Optional graph-node lookup built ONCE by the caller and shared across
89
113
  * every language pass. `buildGraphNodeLookup` scans the whole graph and is