gitnexus 1.6.8-rc.41 → 1.6.8-rc.43
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/core/ingestion/pipeline.d.ts +17 -0
- package/dist/core/ingestion/pipeline.js +5 -1
- package/dist/core/ingestion/scope-resolution/pipeline/phase.d.ts +9 -0
- package/dist/core/ingestion/scope-resolution/pipeline/phase.js +257 -199
- package/dist/core/ingestion/scope-resolution/pipeline/run.d.ts +24 -0
- package/dist/core/ingestion/scope-resolution/pipeline/run.js +19 -4
- package/dist/core/ingestion/utils/env.d.ts +7 -0
- package/dist/core/ingestion/utils/env.js +12 -0
- package/dist/core/lbug/csv-generator.d.ts +10 -1
- package/dist/core/lbug/csv-generator.js +17 -8
- package/dist/core/lbug/lbug-adapter.d.ts +11 -1
- package/dist/core/lbug/lbug-adapter.js +40 -12
- package/dist/core/lbug/lbug-config.d.ts +19 -0
- package/dist/core/lbug/lbug-config.js +29 -0
- package/dist/core/lbug/pdg-emit-sink.d.ts +127 -0
- package/dist/core/lbug/pdg-emit-sink.js +358 -0
- package/dist/core/run-analyze.d.ts +44 -0
- package/dist/core/run-analyze.js +45 -1
- package/dist/types/pipeline.d.ts +9 -0
- package/package.json +1 -1
|
@@ -95,6 +95,23 @@ export interface PipelineOptions {
|
|
|
95
95
|
/** Per-run `TAINT_PATH` edge cap (#2084 review P1-3). `undefined` ⇒
|
|
96
96
|
* `DEFAULT_PDG_MAX_INTERPROC_EDGES` (1000); `0` ⇒ no cap. */
|
|
97
97
|
pdgMaxInterprocEdges?: number;
|
|
98
|
+
/**
|
|
99
|
+
* Streaming/chunked PDG graph emit (#2202). When true, the BasicBlock +
|
|
100
|
+
* intra-file PDG-edge layer (CFG / REACHING_DEF / CDG / POST_DOMINATE /
|
|
101
|
+
* TAINTED / SANITIZES) is streamed to CSV-on-disk during the scope-resolution
|
|
102
|
+
* emit loop instead of being materialized in the in-memory graph, bounding
|
|
103
|
+
* peak RSS to O(chunk) rather than O(graph) at full-kernel scale. Already
|
|
104
|
+
* gated by the caller to full-rebuild runs only (the incremental writeback
|
|
105
|
+
* reads BasicBlocks back from the in-memory graph). Memory-only — produces a
|
|
106
|
+
* byte-identical persisted graph and is NOT part of `RepoMeta.pdg`, so
|
|
107
|
+
* toggling it never trips `pdgModeMismatch`. Default/false ⇒ today's
|
|
108
|
+
* whole-graph emit.
|
|
109
|
+
*/
|
|
110
|
+
streamPdgEmit?: boolean;
|
|
111
|
+
/** Streamed PDG-emit write buffer (rows) when `streamPdgEmit` is on (#2202).
|
|
112
|
+
* `undefined` ⇒ `DEFAULT_PDG_EMIT_CHUNK_ROWS`. Memory-only; does not affect
|
|
113
|
+
* emitted bytes. */
|
|
114
|
+
pdgEmitChunkSize?: number;
|
|
98
115
|
/**
|
|
99
116
|
* Request parsing with the worker pool disabled. The sequential parser was
|
|
100
117
|
* removed — the worker pool is the sole parse path — so setting this now
|
|
@@ -76,7 +76,10 @@ export const runPipelineFromRepo = async (repoPath, onProgress, options) => {
|
|
|
76
76
|
const { totalFiles, usedWorkerPool } = getPhaseOutput(results, 'parse');
|
|
77
77
|
let communityResult;
|
|
78
78
|
let processResult;
|
|
79
|
-
const
|
|
79
|
+
const scopeResolutionOutput = getPhaseOutput(results, 'scopeResolution');
|
|
80
|
+
const resolutionOutcomes = scopeResolutionOutput.resolutionOutcomes;
|
|
81
|
+
// Streamed PDG-emit manifest (#2202): present only when streaming was on.
|
|
82
|
+
const pdgEmitManifest = scopeResolutionOutput.pdgEmitManifest;
|
|
80
83
|
if (!options?.skipGraphPhases) {
|
|
81
84
|
communityResult = getPhaseOutput(results, 'communities').communityResult;
|
|
82
85
|
processResult = getPhaseOutput(results, 'processes').processResult;
|
|
@@ -101,5 +104,6 @@ export const runPipelineFromRepo = async (repoPath, onProgress, options) => {
|
|
|
101
104
|
processResult,
|
|
102
105
|
resolutionOutcomes,
|
|
103
106
|
usedWorkerPool,
|
|
107
|
+
pdgEmitManifest,
|
|
104
108
|
};
|
|
105
109
|
};
|
|
@@ -27,6 +27,7 @@ import type { PipelinePhase } from '../../pipeline-phases/types.js';
|
|
|
27
27
|
import { SupportedLanguages } from '../../../../_shared/index.js';
|
|
28
28
|
import type { ResolutionOutcome } from '../resolution-outcome.js';
|
|
29
29
|
import type { FunctionSummary } from '../../taint/summary-model.js';
|
|
30
|
+
import { type PdgEmitManifest } from '../../../lbug/pdg-emit-sink.js';
|
|
30
31
|
export interface ScopeResolutionOutput {
|
|
31
32
|
/** True when at least one language ran. */
|
|
32
33
|
readonly ran: boolean;
|
|
@@ -50,5 +51,13 @@ export interface ScopeResolutionOutput {
|
|
|
50
51
|
* The `taintSummaries` phase composes these over the `CALLS` graph.
|
|
51
52
|
*/
|
|
52
53
|
readonly functionSummaries: readonly FunctionSummary[];
|
|
54
|
+
/**
|
|
55
|
+
* Streamed PDG-emit COPY manifest (#2202). Present only when streaming was on
|
|
56
|
+
* (full rebuild + `--pdg` + enabled): the BasicBlock node CSV + per-pair PDG
|
|
57
|
+
* edge CSVs that were flushed to disk during the emit loop, for the persistence
|
|
58
|
+
* step to COPY alongside the structural CSVs. Absent ⇒ the PDG layer (if any)
|
|
59
|
+
* is in the in-memory graph and persists via the normal whole-graph emit.
|
|
60
|
+
*/
|
|
61
|
+
readonly pdgEmitManifest?: PdgEmitManifest;
|
|
53
62
|
}
|
|
54
63
|
export declare const scopeResolutionPhase: PipelinePhase<ScopeResolutionOutput>;
|
|
@@ -34,6 +34,8 @@ import { isDev, isSemanticModelValidatorEnabled } from '../../utils/env.js';
|
|
|
34
34
|
import { logHeapProbe } from '../../utils/heap-probe.js';
|
|
35
35
|
import { clearParsedFileStore, loadParsedFilesForPaths, forceGc, } from '../../../../storage/parsedfile-store.js';
|
|
36
36
|
import { buildFunctionNodeIndex } from '../../taint/summary-harvest-driver.js';
|
|
37
|
+
import { PdgEmitSink } from '../../../lbug/pdg-emit-sink.js';
|
|
38
|
+
import { resolveNativeSafeStorageDir } from '../../../lbug/lbug-config.js';
|
|
37
39
|
import { logger } from '../../../logger.js';
|
|
38
40
|
const NOOP_OUTPUT = Object.freeze({
|
|
39
41
|
ran: false,
|
|
@@ -179,213 +181,265 @@ export const scopeResolutionPhase = {
|
|
|
179
181
|
const sharedFnNodeIndex = ctx.options?.pdg === true && totalScopeFiles > 0
|
|
180
182
|
? buildFunctionNodeIndex(ctx.graph)
|
|
181
183
|
: undefined;
|
|
182
|
-
|
|
183
|
-
|
|
184
|
-
|
|
185
|
-
|
|
186
|
-
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
192
|
-
|
|
193
|
-
|
|
194
|
-
|
|
195
|
-
|
|
196
|
-
|
|
197
|
-
// workspace pass — cached implicitly by the result handed to
|
|
198
|
-
// every `resolveImportTarget` call below.
|
|
199
|
-
const resolutionConfig = provider.loadResolutionConfig !== undefined
|
|
200
|
-
? await provider.loadResolutionConfig(ctx.repoPath)
|
|
201
|
-
: undefined;
|
|
202
|
-
// Some languages (e.g. Vue) expand their file universe beyond the
|
|
203
|
-
// primary-language files via the `collectScopeContextPaths` hook.
|
|
204
|
-
// The hook receives raw source contents of the primary files so it
|
|
205
|
-
// can trace import closures without a second tree-sitter parse.
|
|
206
|
-
//
|
|
207
|
-
// To avoid reading primary files twice (once for the hook, once for
|
|
208
|
-
// the resolution pass), we read them upfront and merge with the
|
|
209
|
-
// extra context paths the hook may add.
|
|
210
|
-
// Stream this language's pre-built ParsedFiles in from the disk store
|
|
211
|
-
// FIRST (huge-repo path). Doing it before reading source lets us skip
|
|
212
|
-
// loading content for files the store already covers — for a provider
|
|
213
|
-
// with no content-consuming hook that source is pure dead weight once
|
|
214
|
-
// extraction is served from the store (~1.5 GB on the kernel's C pass).
|
|
215
|
-
// Merged into `preExtractedByPath`; the per-language release block below
|
|
216
|
-
// evicts these again before the next language, so only one language's
|
|
217
|
-
// ParsedFiles are resident at a time.
|
|
218
|
-
const loadStoreFor = async (paths) => {
|
|
219
|
-
if (!parsedFileStorePath)
|
|
220
|
-
return;
|
|
221
|
-
const fromDisk = await loadParsedFilesForPaths(parsedFileStorePath, paths);
|
|
222
|
-
for (const [fp, pf] of fromDisk)
|
|
223
|
-
preExtractedByPath.set(fp, pf);
|
|
224
|
-
};
|
|
225
|
-
// A provider that feeds source text into a post-extract hook
|
|
226
|
-
// (populateWorkspaceOwners / populateNamespaceSiblings /
|
|
227
|
-
// populateRangeBindings / emitPostResolutionEdges) needs content for ALL
|
|
228
|
-
// its files; one without those hooks only needs content for files the
|
|
229
|
-
// store does NOT cover (fresh-extract fallback). Keep this in sync with
|
|
230
|
-
// the getFileContents() call-sites in run.ts.
|
|
231
|
-
const providerNeedsAllContent = provider.populateWorkspaceOwners !== undefined ||
|
|
232
|
-
provider.populateNamespaceSiblings !== undefined ||
|
|
233
|
-
provider.populateRangeBindings !== undefined ||
|
|
234
|
-
provider.emitPostResolutionEdges !== undefined;
|
|
235
|
-
let scopeFilePaths;
|
|
236
|
-
let contents;
|
|
237
|
-
if (provider.collectScopeContextPaths !== undefined) {
|
|
238
|
-
// Context-expanding providers (e.g. Vue) need every primary file's
|
|
239
|
-
// source up front for the closure hook, so load it all.
|
|
240
|
-
const entryFileContents = await readFileContents(ctx.repoPath, primaryFilePaths);
|
|
241
|
-
scopeFilePaths = provider.collectScopeContextPaths({
|
|
242
|
-
primaryFilePaths,
|
|
243
|
-
preExtractedByPath,
|
|
244
|
-
entryFileContents,
|
|
245
|
-
allScannedPaths,
|
|
246
|
-
resolutionConfig,
|
|
247
|
-
});
|
|
248
|
-
// Read only the extra context files (TS/JS etc.) not already loaded.
|
|
249
|
-
const extraPaths = [...scopeFilePaths].filter((p) => !entryFileContents.has(p));
|
|
250
|
-
const extraContents = await readFileContents(ctx.repoPath, extraPaths);
|
|
251
|
-
contents = new Map([...entryFileContents, ...extraContents]);
|
|
252
|
-
await loadStoreFor(scopeFilePaths);
|
|
184
|
+
// Streaming/chunked PDG emit (#2202): when enabled (the caller has already
|
|
185
|
+
// gated this to full-rebuild + `--pdg`), route the BasicBlock + intra-file
|
|
186
|
+
// PDG-edge layer to CSV-on-disk through one sink shared across every
|
|
187
|
+
// language pass, so it never accumulates in `ctx.graph` (peak RSS O(chunk)).
|
|
188
|
+
// Needs the storage dir (the parse-cache store path, the same `.gitnexus`
|
|
189
|
+
// dir loadGraphToLbug COPYs from); if that is somehow absent we skip
|
|
190
|
+
// streaming and fall back to the in-memory whole-graph emit.
|
|
191
|
+
let pdgEmitSink;
|
|
192
|
+
if (ctx.options?.streamPdgEmit === true && totalScopeFiles > 0) {
|
|
193
|
+
if (parsedFileStorePath) {
|
|
194
|
+
pdgEmitSink = new PdgEmitSink(ctx.graph,
|
|
195
|
+
// Same ASCII-safe relocation the structural CSVs get (#2202 review #2):
|
|
196
|
+
// on Windows non-ASCII storage paths the COPY can't open files under
|
|
197
|
+
// the native path, so the dir is relocated to a hashed os.tmpdir().
|
|
198
|
+
resolveNativeSafeStorageDir(parsedFileStorePath, 'pdg-csv'), ctx.options?.pdgEmitChunkSize);
|
|
253
199
|
}
|
|
254
200
|
else {
|
|
255
|
-
|
|
256
|
-
|
|
257
|
-
const pathsToRead = providerNeedsAllContent
|
|
258
|
-
? primaryFilePaths
|
|
259
|
-
: primaryFilePaths.filter((p) => !preExtractedByPath.has(p));
|
|
260
|
-
contents = await readFileContents(ctx.repoPath, pathsToRead);
|
|
201
|
+
logger.warn('[scope-resolution] streaming PDG emit requested but no storage path is ' +
|
|
202
|
+
'available; falling back to in-memory whole-graph emit');
|
|
261
203
|
}
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
266
|
-
|
|
267
|
-
|
|
204
|
+
}
|
|
205
|
+
// Cross-pass per-file dedup set for the streaming sink (#2202): one set
|
|
206
|
+
// shared across every language pass so a file emitted in two passes (e.g. a
|
|
207
|
+
// `.ts` module pulled into the Vue context pass) streams its PDG layer once.
|
|
208
|
+
// Only created when streaming — the in-memory-graph path dedups via its Map.
|
|
209
|
+
const pdgEmittedFiles = pdgEmitSink !== undefined ? new Set() : undefined;
|
|
210
|
+
// Stream the PDG layer with guaranteed writer cleanup: a throw escaping the
|
|
211
|
+
// per-language loop (outside run.ts's per-file try/catch — e.g. from
|
|
212
|
+
// finalize/propagate/a provider hook) must still release the sink's file
|
|
213
|
+
// descriptors. finalize() runs on the success path; the finally closes the
|
|
214
|
+
// sink only when finalize did not (idempotent via the sink's `finalized`).
|
|
215
|
+
let pdgEmitManifest;
|
|
216
|
+
let pdgSinkSettled = false;
|
|
217
|
+
try {
|
|
218
|
+
for (const [lang, provider] of SCOPE_RESOLVERS) {
|
|
219
|
+
// Standalone providers (COBOL, JCL) don't emit graph edges yet
|
|
220
|
+
// through the scope-resolution path. This is the canonical guard:
|
|
221
|
+
// runScopeResolution is never called for standalone providers, which
|
|
222
|
+
// keeps cobolPhase as the sole IMPORTS edge producer. Keep this guard
|
|
223
|
+
// in sync with any additional standalone providers added to
|
|
224
|
+
// SCOPE_RESOLVERS.
|
|
225
|
+
if (provider.languageProvider.parseStrategy === 'standalone')
|
|
226
|
+
continue;
|
|
227
|
+
const primaryLangFiles = filesByLang.get(lang) ?? [];
|
|
228
|
+
if (primaryLangFiles.length === 0)
|
|
229
|
+
continue;
|
|
230
|
+
const primaryFilePaths = primaryLangFiles.map((f) => f.path);
|
|
231
|
+
// Load per-language import-resolution config (tsconfig paths,
|
|
232
|
+
// composer.json autoload, go.mod, ...). One I/O round trip per
|
|
233
|
+
// workspace pass — cached implicitly by the result handed to
|
|
234
|
+
// every `resolveImportTarget` call below.
|
|
235
|
+
const resolutionConfig = provider.loadResolutionConfig !== undefined
|
|
236
|
+
? await provider.loadResolutionConfig(ctx.repoPath)
|
|
237
|
+
: undefined;
|
|
238
|
+
// Some languages (e.g. Vue) expand their file universe beyond the
|
|
239
|
+
// primary-language files via the `collectScopeContextPaths` hook.
|
|
240
|
+
// The hook receives raw source contents of the primary files so it
|
|
241
|
+
// can trace import closures without a second tree-sitter parse.
|
|
242
|
+
//
|
|
243
|
+
// To avoid reading primary files twice (once for the hook, once for
|
|
244
|
+
// the resolution pass), we read them upfront and merge with the
|
|
245
|
+
// extra context paths the hook may add.
|
|
246
|
+
// Stream this language's pre-built ParsedFiles in from the disk store
|
|
247
|
+
// FIRST (huge-repo path). Doing it before reading source lets us skip
|
|
248
|
+
// loading content for files the store already covers — for a provider
|
|
249
|
+
// with no content-consuming hook that source is pure dead weight once
|
|
250
|
+
// extraction is served from the store (~1.5 GB on the kernel's C pass).
|
|
251
|
+
// Merged into `preExtractedByPath`; the per-language release block below
|
|
252
|
+
// evicts these again before the next language, so only one language's
|
|
253
|
+
// ParsedFiles are resident at a time.
|
|
254
|
+
const loadStoreFor = async (paths) => {
|
|
255
|
+
if (!parsedFileStorePath)
|
|
256
|
+
return;
|
|
257
|
+
const fromDisk = await loadParsedFilesForPaths(parsedFileStorePath, paths);
|
|
258
|
+
for (const [fp, pf] of fromDisk)
|
|
259
|
+
preExtractedByPath.set(fp, pf);
|
|
260
|
+
};
|
|
261
|
+
// A provider that feeds source text into a post-extract hook
|
|
262
|
+
// (populateWorkspaceOwners / populateNamespaceSiblings /
|
|
263
|
+
// populateRangeBindings / emitPostResolutionEdges) needs content for ALL
|
|
264
|
+
// its files; one without those hooks only needs content for files the
|
|
265
|
+
// store does NOT cover (fresh-extract fallback). Keep this in sync with
|
|
266
|
+
// the getFileContents() call-sites in run.ts.
|
|
267
|
+
const providerNeedsAllContent = provider.populateWorkspaceOwners !== undefined ||
|
|
268
|
+
provider.populateNamespaceSiblings !== undefined ||
|
|
269
|
+
provider.populateRangeBindings !== undefined ||
|
|
270
|
+
provider.emitPostResolutionEdges !== undefined;
|
|
271
|
+
let scopeFilePaths;
|
|
272
|
+
let contents;
|
|
273
|
+
if (provider.collectScopeContextPaths !== undefined) {
|
|
274
|
+
// Context-expanding providers (e.g. Vue) need every primary file's
|
|
275
|
+
// source up front for the closure hook, so load it all.
|
|
276
|
+
const entryFileContents = await readFileContents(ctx.repoPath, primaryFilePaths);
|
|
277
|
+
scopeFilePaths = provider.collectScopeContextPaths({
|
|
278
|
+
primaryFilePaths,
|
|
279
|
+
preExtractedByPath,
|
|
280
|
+
entryFileContents,
|
|
281
|
+
allScannedPaths,
|
|
282
|
+
resolutionConfig,
|
|
283
|
+
});
|
|
284
|
+
// Read only the extra context files (TS/JS etc.) not already loaded.
|
|
285
|
+
const extraPaths = [...scopeFilePaths].filter((p) => !entryFileContents.has(p));
|
|
286
|
+
const extraContents = await readFileContents(ctx.repoPath, extraPaths);
|
|
287
|
+
contents = new Map([...entryFileContents, ...extraContents]);
|
|
288
|
+
await loadStoreFor(scopeFilePaths);
|
|
268
289
|
}
|
|
269
|
-
else
|
|
270
|
-
|
|
271
|
-
|
|
272
|
-
|
|
273
|
-
|
|
274
|
-
|
|
290
|
+
else {
|
|
291
|
+
scopeFilePaths = new Set(primaryFilePaths);
|
|
292
|
+
await loadStoreFor(scopeFilePaths);
|
|
293
|
+
const pathsToRead = providerNeedsAllContent
|
|
294
|
+
? primaryFilePaths
|
|
295
|
+
: primaryFilePaths.filter((p) => !preExtractedByPath.has(p));
|
|
296
|
+
contents = await readFileContents(ctx.repoPath, pathsToRead);
|
|
275
297
|
}
|
|
276
|
-
|
|
277
|
-
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
const langTag = totalScopeLangs > 1 ? `${langLabel} [${currentLangIdx}/${totalScopeLangs}]` : langLabel;
|
|
283
|
-
if (totalScopeFiles > 0) {
|
|
284
|
-
const pct = SCOPE_PCT_START + Math.round((processedScopeFiles / totalScopeFiles) * SCOPE_PCT_RANGE);
|
|
285
|
-
ctx.onProgress({
|
|
286
|
-
phase: 'scopeResolution',
|
|
287
|
-
percent: pct,
|
|
288
|
-
message: 'Resolving types',
|
|
289
|
-
detail: `${langTag}, ${langFileCount.toLocaleString()} files`,
|
|
290
|
-
});
|
|
291
|
-
}
|
|
292
|
-
const stats = runScopeResolution({
|
|
293
|
-
graph: ctx.graph,
|
|
294
|
-
model,
|
|
295
|
-
files,
|
|
296
|
-
resolutionConfig,
|
|
297
|
-
prebuiltNodeLookup: sharedNodeLookup,
|
|
298
|
-
prebuiltFunctionNodeIndex: sharedFnNodeIndex,
|
|
299
|
-
preExtractedParsedFiles: preExtractedByPath,
|
|
300
|
-
scopeIndexStorePath: parsedFileStorePath,
|
|
301
|
-
// CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph.
|
|
302
|
-
pdg: ctx.options?.pdg === true,
|
|
303
|
-
pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction,
|
|
304
|
-
pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction,
|
|
305
|
-
pdgMaxCdgEdgesPerFunction: ctx.options?.pdgMaxCdgEdgesPerFunction,
|
|
306
|
-
pdgMaxTaintFindingsPerFunction: ctx.options?.pdgMaxTaintFindingsPerFunction,
|
|
307
|
-
pdgMaxTaintHops: ctx.options?.pdgMaxTaintHops,
|
|
308
|
-
recordResolutionOutcome: (outcome) => {
|
|
309
|
-
resolutionOutcomes.push(outcome);
|
|
310
|
-
},
|
|
311
|
-
onWarn: (msg) => {
|
|
312
|
-
if (isSemanticModelValidatorEnabled()) {
|
|
313
|
-
logger.warn(`[scope-resolution:${lang}] ${msg}`);
|
|
298
|
+
const filePaths = [...scopeFilePaths];
|
|
299
|
+
const files = [];
|
|
300
|
+
for (const fp of filePaths) {
|
|
301
|
+
const content = contents.get(fp);
|
|
302
|
+
if (content !== undefined) {
|
|
303
|
+
files.push({ path: fp, content });
|
|
314
304
|
}
|
|
315
|
-
|
|
316
|
-
|
|
317
|
-
|
|
318
|
-
|
|
319
|
-
|
|
320
|
-
|
|
321
|
-
|
|
322
|
-
|
|
323
|
-
|
|
324
|
-
|
|
325
|
-
|
|
326
|
-
|
|
327
|
-
|
|
328
|
-
|
|
329
|
-
|
|
330
|
-
|
|
331
|
-
|
|
332
|
-
|
|
333
|
-
|
|
334
|
-
|
|
305
|
+
else if (preExtractedByPath.has(fp)) {
|
|
306
|
+
// Store covers extraction for this file and we deliberately skipped
|
|
307
|
+
// reading its source; the empty string is never consumed (the
|
|
308
|
+
// extract loop uses the pre-extracted ParsedFile and this provider
|
|
309
|
+
// has no content hook).
|
|
310
|
+
files.push({ path: fp, content: '' });
|
|
311
|
+
}
|
|
312
|
+
// else: uncovered AND unreadable → skip (unchanged from prior behavior).
|
|
313
|
+
}
|
|
314
|
+
const langFileCount = files.length;
|
|
315
|
+
logHeapProbe('scope-lang-start', `lang=${lang} files=${langFileCount} contentsLoaded=${contents.size}`);
|
|
316
|
+
const langLabel = lang.charAt(0).toUpperCase() + lang.slice(1);
|
|
317
|
+
currentLangIdx++;
|
|
318
|
+
const langTag = totalScopeLangs > 1 ? `${langLabel} [${currentLangIdx}/${totalScopeLangs}]` : langLabel;
|
|
319
|
+
if (totalScopeFiles > 0) {
|
|
320
|
+
const pct = SCOPE_PCT_START + Math.round((processedScopeFiles / totalScopeFiles) * SCOPE_PCT_RANGE);
|
|
321
|
+
ctx.onProgress({
|
|
322
|
+
phase: 'scopeResolution',
|
|
323
|
+
percent: pct,
|
|
324
|
+
message: 'Resolving types',
|
|
325
|
+
detail: `${langTag}, ${langFileCount.toLocaleString()} files`,
|
|
326
|
+
});
|
|
327
|
+
}
|
|
328
|
+
const stats = runScopeResolution({
|
|
329
|
+
graph: ctx.graph,
|
|
330
|
+
model,
|
|
331
|
+
files,
|
|
332
|
+
resolutionConfig,
|
|
333
|
+
prebuiltNodeLookup: sharedNodeLookup,
|
|
334
|
+
prebuiltFunctionNodeIndex: sharedFnNodeIndex,
|
|
335
|
+
preExtractedParsedFiles: preExtractedByPath,
|
|
336
|
+
scopeIndexStorePath: parsedFileStorePath,
|
|
337
|
+
// CFG/PDG emission (#2081 M1) — opt-in; off ⇒ byte-identical graph.
|
|
338
|
+
pdg: ctx.options?.pdg === true,
|
|
339
|
+
pdgMaxEdgesPerFunction: ctx.options?.pdgMaxEdgesPerFunction,
|
|
340
|
+
pdgMaxReachingDefEdgesPerFunction: ctx.options?.pdgMaxReachingDefEdgesPerFunction,
|
|
341
|
+
pdgMaxCdgEdgesPerFunction: ctx.options?.pdgMaxCdgEdgesPerFunction,
|
|
342
|
+
pdgMaxTaintFindingsPerFunction: ctx.options?.pdgMaxTaintFindingsPerFunction,
|
|
343
|
+
pdgMaxTaintHops: ctx.options?.pdgMaxTaintHops,
|
|
344
|
+
// Streaming PDG-emit sink (#2202) — undefined ⇒ emit to the in-memory graph.
|
|
345
|
+
pdgEmitSink,
|
|
346
|
+
// Cross-pass per-file dedup set (#2202) — undefined when not streaming.
|
|
347
|
+
pdgEmittedFiles,
|
|
348
|
+
recordResolutionOutcome: (outcome) => {
|
|
349
|
+
resolutionOutcomes.push(outcome);
|
|
350
|
+
},
|
|
351
|
+
onWarn: (msg) => {
|
|
352
|
+
if (isSemanticModelValidatorEnabled()) {
|
|
353
|
+
logger.warn(`[scope-resolution:${lang}] ${msg}`);
|
|
354
|
+
}
|
|
355
|
+
},
|
|
356
|
+
onProgress: totalScopeFiles > 0
|
|
357
|
+
? (subPhase, current, total) => {
|
|
358
|
+
let langRatio;
|
|
359
|
+
switch (subPhase) {
|
|
360
|
+
case 'extracting':
|
|
361
|
+
langRatio = total > 0 ? (current / total) * 0.5 : 0;
|
|
362
|
+
break;
|
|
363
|
+
case 'analyzing types':
|
|
364
|
+
langRatio = 0.5;
|
|
365
|
+
break;
|
|
366
|
+
case 'resolving references':
|
|
367
|
+
langRatio = 0.7;
|
|
368
|
+
break;
|
|
369
|
+
case 'linking symbols':
|
|
370
|
+
langRatio = 0.85;
|
|
371
|
+
break;
|
|
372
|
+
default: {
|
|
373
|
+
const _exhaustive = subPhase;
|
|
374
|
+
langRatio = 0.85;
|
|
375
|
+
}
|
|
335
376
|
}
|
|
377
|
+
const overallRatio = Math.min(1, (processedScopeFiles + langRatio * langFileCount) / totalScopeFiles);
|
|
378
|
+
const pct = SCOPE_PCT_START + Math.round(overallRatio * SCOPE_PCT_RANGE);
|
|
379
|
+
ctx.onProgress({
|
|
380
|
+
phase: 'scopeResolution',
|
|
381
|
+
percent: pct,
|
|
382
|
+
message: 'Resolving types',
|
|
383
|
+
detail: subPhase === 'extracting'
|
|
384
|
+
? `${langTag} — extracting ${current.toLocaleString()}/${total.toLocaleString()} files`
|
|
385
|
+
: `${langTag} — ${subPhase}`,
|
|
386
|
+
});
|
|
336
387
|
}
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
345
|
-
|
|
346
|
-
|
|
347
|
-
|
|
348
|
-
|
|
349
|
-
|
|
350
|
-
|
|
351
|
-
|
|
352
|
-
|
|
353
|
-
|
|
354
|
-
|
|
355
|
-
|
|
356
|
-
|
|
357
|
-
|
|
358
|
-
|
|
359
|
-
|
|
360
|
-
|
|
361
|
-
|
|
362
|
-
|
|
363
|
-
|
|
364
|
-
|
|
365
|
-
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
373
|
-
|
|
374
|
-
|
|
375
|
-
|
|
376
|
-
|
|
377
|
-
|
|
378
|
-
totalFiles += stats.filesProcessed;
|
|
379
|
-
totalImports += stats.importsEmitted;
|
|
380
|
-
totalRefs += stats.referenceEdgesEmitted;
|
|
381
|
-
perLanguage.set(lang, {
|
|
382
|
-
filesProcessed: stats.filesProcessed,
|
|
383
|
-
importsEmitted: stats.importsEmitted,
|
|
384
|
-
referenceEdgesEmitted: stats.referenceEdgesEmitted,
|
|
385
|
-
});
|
|
386
|
-
if (isDev) {
|
|
387
|
-
logger.info(`[scope-resolution:${lang}] ${stats.filesProcessed} files → ${stats.importsEmitted} IMPORTS + ${stats.referenceEdgesEmitted} reference edges (${stats.resolve.unresolved} unresolved sites, ${stats.referenceSkipped} skipped)`);
|
|
388
|
+
: undefined,
|
|
389
|
+
}, provider);
|
|
390
|
+
// Release file contents and pre-extracted entries after each language
|
|
391
|
+
// to reduce memory pressure. For large codebases (16K+ PHP files),
|
|
392
|
+
// holding all source code simultaneously with scope trees causes OOM.
|
|
393
|
+
// See: https://github.com/abhigyanpatwari/GitNexus/issues/1741
|
|
394
|
+
//
|
|
395
|
+
// Use `filePaths` (not `primaryFilePaths`) so that any context files
|
|
396
|
+
// added by `collectScopeContextPaths` (e.g. TS/JS files pulled in for
|
|
397
|
+
// Vue cross-file resolution) are also evicted and not held until GC.
|
|
398
|
+
files.length = 0;
|
|
399
|
+
contents.clear();
|
|
400
|
+
for (const fp of filePaths) {
|
|
401
|
+
preExtractedByPath.delete(fp);
|
|
402
|
+
}
|
|
403
|
+
// This language's ParsedFiles are now unreachable (runScopeResolution has
|
|
404
|
+
// returned and the Map entries are deleted). Force a GC HERE so a heavy
|
|
405
|
+
// language's ~17-20GB set (e.g. C/C++ on the Linux kernel) is reclaimed
|
|
406
|
+
// BEFORE the next language's store-load — instead of leaving V8 to collect
|
|
407
|
+
// it lazily under the next pass's allocation pressure (which, at a cap >=
|
|
408
|
+
// RAM, degrades into swap-thrash). Collects only dead objects: the live
|
|
409
|
+
// cross-file index of the next pass is untouched. The pre/post probe
|
|
410
|
+
// confirms whether old-space fragmentation defeats the reclaim.
|
|
411
|
+
logHeapProbe('lang-release-pre-gc', `lang=${lang}`);
|
|
412
|
+
forceGc();
|
|
413
|
+
logHeapProbe('lang-release-post-gc', `lang=${lang}`);
|
|
414
|
+
logHeapProbe('scope-lang-end', `lang=${lang} filesProcessed=${stats.filesProcessed}`);
|
|
415
|
+
processedScopeFiles += langFileCount;
|
|
416
|
+
anyRan = true;
|
|
417
|
+
functionSummaries.push(...stats.functionSummaries);
|
|
418
|
+
totalFiles += stats.filesProcessed;
|
|
419
|
+
totalImports += stats.importsEmitted;
|
|
420
|
+
totalRefs += stats.referenceEdgesEmitted;
|
|
421
|
+
perLanguage.set(lang, {
|
|
422
|
+
filesProcessed: stats.filesProcessed,
|
|
423
|
+
importsEmitted: stats.importsEmitted,
|
|
424
|
+
referenceEdgesEmitted: stats.referenceEdgesEmitted,
|
|
425
|
+
});
|
|
426
|
+
if (isDev) {
|
|
427
|
+
logger.info(`[scope-resolution:${lang}] ${stats.filesProcessed} files → ${stats.importsEmitted} IMPORTS + ${stats.referenceEdgesEmitted} reference edges (${stats.resolve.unresolved} unresolved sites, ${stats.referenceSkipped} skipped)`);
|
|
428
|
+
}
|
|
388
429
|
}
|
|
430
|
+
// Finalize the streaming PDG sink (#2202) once after the last language:
|
|
431
|
+
// flush + close its CSV writers and capture the COPY manifest. forceGc at
|
|
432
|
+
// the boundary reclaims transient write buffers (mirrors the per-language
|
|
433
|
+
// release above).
|
|
434
|
+
pdgEmitManifest = pdgEmitSink?.finalize();
|
|
435
|
+
pdgSinkSettled = true;
|
|
436
|
+
if (pdgEmitSink !== undefined)
|
|
437
|
+
forceGc();
|
|
438
|
+
}
|
|
439
|
+
finally {
|
|
440
|
+
// Release fds if a throw skipped finalize (idempotent with finalize()).
|
|
441
|
+
if (pdgEmitSink !== undefined && !pdgSinkSettled)
|
|
442
|
+
pdgEmitSink.close();
|
|
389
443
|
}
|
|
390
444
|
if (totalScopeFiles > 0 && anyRan) {
|
|
391
445
|
ctx.onProgress({
|
|
@@ -407,8 +461,11 @@ export const scopeResolutionPhase = {
|
|
|
407
461
|
/* best-effort cleanup */
|
|
408
462
|
}
|
|
409
463
|
}
|
|
464
|
+
// Even when no language ran, surface a finalized manifest (its CSVs are on
|
|
465
|
+
// disk) so loadGraphToLbug COPYs them rather than orphaning them — empty in
|
|
466
|
+
// the no-files case, harmless.
|
|
410
467
|
if (!anyRan)
|
|
411
|
-
return NOOP_OUTPUT;
|
|
468
|
+
return pdgEmitManifest ? { ...NOOP_OUTPUT, pdgEmitManifest } : NOOP_OUTPUT;
|
|
412
469
|
return {
|
|
413
470
|
ran: true,
|
|
414
471
|
filesProcessed: totalFiles,
|
|
@@ -417,6 +474,7 @@ export const scopeResolutionPhase = {
|
|
|
417
474
|
resolutionOutcomes,
|
|
418
475
|
perLanguage,
|
|
419
476
|
functionSummaries,
|
|
477
|
+
pdgEmitManifest,
|
|
420
478
|
};
|
|
421
479
|
},
|
|
422
480
|
};
|
|
@@ -84,6 +84,30 @@ interface RunScopeResolutionInput {
|
|
|
84
84
|
* `reason`; consumed by the U4 taint emit step). `undefined` ⇒
|
|
85
85
|
* `DEFAULT_PDG_MAX_TAINT_HOPS` (32); `0` ⇒ no cap. */
|
|
86
86
|
readonly pdgMaxTaintHops?: number;
|
|
87
|
+
/**
|
|
88
|
+
* Streaming PDG-emit sink (#2202). When present (streaming on, full rebuild),
|
|
89
|
+
* the `--pdg` emit routes BasicBlock nodes + intra-file PDG edges to THIS
|
|
90
|
+
* graph-shaped target instead of the in-memory `graph`, so the bulky PDG
|
|
91
|
+
* layer never accumulates in memory (peak RSS O(chunk)). Typed as a plain
|
|
92
|
+
* `KnowledgeGraph` so this module stays decoupled from the persistence layer;
|
|
93
|
+
* the caller (the scope-resolution phase) owns its lifecycle and finalizes it
|
|
94
|
+
* after the last language. Absent ⇒ the emit writes to `graph` as before
|
|
95
|
+
* (byte-identical default).
|
|
96
|
+
*/
|
|
97
|
+
readonly pdgEmitSink?: KnowledgeGraph;
|
|
98
|
+
/**
|
|
99
|
+
* Cross-pass per-file dedup set for streaming PDG emit (#2202). Shared across
|
|
100
|
+
* every language pass (owned by the scope-resolution phase). A file imported
|
|
101
|
+
* by more than one language (e.g. a `.ts` module pulled into the Vue context
|
|
102
|
+
* pass) is PDG-emitted in each pass over the same `cfgSideChannel`, producing
|
|
103
|
+
* identical ids; the in-memory graph dedups that by id, but the streaming sink
|
|
104
|
+
* is dedup-free (to stay O(write buffer), not O(total ids)). So when present
|
|
105
|
+
* (streaming on), the emit loop skips a file whose PDG already streamed and
|
|
106
|
+
* records the rest — keeping the streamed set byte-identical to the
|
|
107
|
+
* Map-deduped whole-graph emit, for any language-pass order. Absent ⇒ no skip
|
|
108
|
+
* (the graph Map dedups), so the default path is unchanged.
|
|
109
|
+
*/
|
|
110
|
+
readonly pdgEmittedFiles?: Set<string>;
|
|
87
111
|
/**
|
|
88
112
|
* Optional graph-node lookup built ONCE by the caller and shared across
|
|
89
113
|
* every language pass. `buildGraphNodeLookup` scans the whole graph and is
|