gitnexus 1.6.8-rc.36 → 1.6.8-rc.37

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80) hide show
  1. package/dist/core/ingestion/cfg/cfg-builder.d.ts +50 -0
  2. package/dist/core/ingestion/cfg/cfg-builder.js +67 -0
  3. package/dist/core/ingestion/cfg/collect.d.ts +24 -3
  4. package/dist/core/ingestion/cfg/collect.js +50 -7
  5. package/dist/core/ingestion/cfg/control-dependence.d.ts +21 -10
  6. package/dist/core/ingestion/cfg/control-dependence.js +111 -49
  7. package/dist/core/ingestion/cfg/emit.d.ts +19 -0
  8. package/dist/core/ingestion/cfg/emit.js +47 -9
  9. package/dist/core/ingestion/cfg/reaching-defs.d.ts +13 -0
  10. package/dist/core/ingestion/cfg/reaching-defs.js +24 -0
  11. package/dist/core/ingestion/cfg/synthetic-escape.d.ts +124 -0
  12. package/dist/core/ingestion/cfg/synthetic-escape.js +300 -0
  13. package/dist/core/ingestion/cfg/visitors/c-cpp-harvest.d.ts +111 -0
  14. package/dist/core/ingestion/cfg/visitors/c-cpp-harvest.js +539 -0
  15. package/dist/core/ingestion/cfg/visitors/c-cpp.d.ts +74 -0
  16. package/dist/core/ingestion/cfg/visitors/c-cpp.js +584 -0
  17. package/dist/core/ingestion/cfg/visitors/call-site-harvest.d.ts +152 -0
  18. package/dist/core/ingestion/cfg/visitors/call-site-harvest.js +304 -0
  19. package/dist/core/ingestion/cfg/visitors/csharp-harvest.d.ts +110 -0
  20. package/dist/core/ingestion/cfg/visitors/csharp-harvest.js +559 -0
  21. package/dist/core/ingestion/cfg/visitors/csharp.d.ts +81 -0
  22. package/dist/core/ingestion/cfg/visitors/csharp.js +707 -0
  23. package/dist/core/ingestion/cfg/visitors/dart-harvest.d.ts +160 -0
  24. package/dist/core/ingestion/cfg/visitors/dart-harvest.js +473 -0
  25. package/dist/core/ingestion/cfg/visitors/dart.d.ts +116 -0
  26. package/dist/core/ingestion/cfg/visitors/dart.js +689 -0
  27. package/dist/core/ingestion/cfg/visitors/go-harvest.d.ts +160 -0
  28. package/dist/core/ingestion/cfg/visitors/go-harvest.js +629 -0
  29. package/dist/core/ingestion/cfg/visitors/go.d.ts +108 -0
  30. package/dist/core/ingestion/cfg/visitors/go.js +638 -0
  31. package/dist/core/ingestion/cfg/visitors/java-harvest.d.ts +100 -0
  32. package/dist/core/ingestion/cfg/visitors/java-harvest.js +484 -0
  33. package/dist/core/ingestion/cfg/visitors/java.d.ts +100 -0
  34. package/dist/core/ingestion/cfg/visitors/java.js +716 -0
  35. package/dist/core/ingestion/cfg/visitors/kotlin-harvest.d.ts +161 -0
  36. package/dist/core/ingestion/cfg/visitors/kotlin-harvest.js +485 -0
  37. package/dist/core/ingestion/cfg/visitors/kotlin.d.ts +119 -0
  38. package/dist/core/ingestion/cfg/visitors/kotlin.js +760 -0
  39. package/dist/core/ingestion/cfg/visitors/php-harvest.d.ts +164 -0
  40. package/dist/core/ingestion/cfg/visitors/php-harvest.js +596 -0
  41. package/dist/core/ingestion/cfg/visitors/php.d.ts +94 -0
  42. package/dist/core/ingestion/cfg/visitors/php.js +612 -0
  43. package/dist/core/ingestion/cfg/visitors/python-harvest.d.ts +175 -0
  44. package/dist/core/ingestion/cfg/visitors/python-harvest.js +601 -0
  45. package/dist/core/ingestion/cfg/visitors/python.d.ts +103 -0
  46. package/dist/core/ingestion/cfg/visitors/python.js +558 -0
  47. package/dist/core/ingestion/cfg/visitors/ruby-harvest.d.ts +144 -0
  48. package/dist/core/ingestion/cfg/visitors/ruby-harvest.js +419 -0
  49. package/dist/core/ingestion/cfg/visitors/ruby.d.ts +112 -0
  50. package/dist/core/ingestion/cfg/visitors/ruby.js +756 -0
  51. package/dist/core/ingestion/cfg/visitors/rust-harvest.d.ts +165 -0
  52. package/dist/core/ingestion/cfg/visitors/rust-harvest.js +547 -0
  53. package/dist/core/ingestion/cfg/visitors/rust.d.ts +95 -0
  54. package/dist/core/ingestion/cfg/visitors/rust.js +558 -0
  55. package/dist/core/ingestion/cfg/visitors/scope-tree-harvest.d.ts +87 -0
  56. package/dist/core/ingestion/cfg/visitors/scope-tree-harvest.js +116 -0
  57. package/dist/core/ingestion/cfg/visitors/swift-harvest.d.ts +161 -0
  58. package/dist/core/ingestion/cfg/visitors/swift-harvest.js +487 -0
  59. package/dist/core/ingestion/cfg/visitors/swift.d.ts +110 -0
  60. package/dist/core/ingestion/cfg/visitors/swift.js +721 -0
  61. package/dist/core/ingestion/cfg/visitors/typescript.js +37 -38
  62. package/dist/core/ingestion/languages/c-cpp.js +3 -0
  63. package/dist/core/ingestion/languages/cobol.js +4 -0
  64. package/dist/core/ingestion/languages/csharp.js +2 -0
  65. package/dist/core/ingestion/languages/dart.js +2 -0
  66. package/dist/core/ingestion/languages/go.js +2 -0
  67. package/dist/core/ingestion/languages/java.js +3 -0
  68. package/dist/core/ingestion/languages/kotlin.js +3 -0
  69. package/dist/core/ingestion/languages/php.js +2 -0
  70. package/dist/core/ingestion/languages/python.js +2 -0
  71. package/dist/core/ingestion/languages/ruby.js +2 -0
  72. package/dist/core/ingestion/languages/rust.js +2 -0
  73. package/dist/core/ingestion/languages/swift.js +2 -0
  74. package/dist/core/ingestion/languages/vue.js +6 -0
  75. package/dist/core/ingestion/parsing-processor.js +22 -0
  76. package/dist/core/ingestion/scope-resolution/pipeline/run.js +17 -0
  77. package/dist/core/ingestion/workers/parse-worker.d.ts +12 -0
  78. package/dist/core/ingestion/workers/parse-worker.js +28 -7
  79. package/dist/core/ingestion/workers/worker-pool.js +10 -0
  80. package/package.json +1 -1
@@ -13,6 +13,30 @@
13
13
  * before the tree-sitter visitor (U2) drives it.
14
14
  */
15
15
  import type { BasicBlockData, BindingEntry, CfgEdgeKind, FunctionCfg, StatementFacts } from './types.js';
16
+ /**
17
+ * Hard ceiling on CFG recursive-descent scope-entry depth (#2195). A language
18
+ * `CfgVisitor` wraps each nested block scope in {@link CfgBuilder.withNesting} (its
19
+ * `visitBody` / `visitSeq` choke points), so the live count tracks scope entries,
20
+ * not statement width. NOTE the count is ~2× LEXICAL nesting for block-bodied
21
+ * constructs (visitBody → visitSeq both enter), so the effective lexical ceiling
22
+ * is ~250 levels for block bodies (~500 for single-statement bodies / bare
23
+ * blocks). Real source nests ≤ ~50 deep, so this fires only on machine-generated
24
+ * / adversarial input. Both effective ceilings sit far below the engine's native
25
+ * stack limit (~1.2k+ nesting even on the raised worker `stackSizeMb`), so the
26
+ * bail is a DETERMINISTIC, language-independent {@link CfgNestingDepthError}
27
+ * rather than a nondeterministic `RangeError` thrown somewhere mid-walk.
28
+ */
29
+ export declare const MAX_CFG_NESTING_DEPTH = 500;
30
+ /**
31
+ * Thrown by the visitor nesting-depth guard ({@link CfgBuilder.enterNesting})
32
+ * when lexical nesting exceeds {@link MAX_CFG_NESTING_DEPTH}. `collectFunctionCfgs`
33
+ * catches it and counts the function under `skipped.tooDeeplyNested`, isolating
34
+ * the bail to one function instead of risking a worker-wide stack overflow.
35
+ */
36
+ export declare class CfgNestingDepthError extends Error {
37
+ readonly limit: number;
38
+ constructor(limit: number);
39
+ }
16
40
  export declare class CfgBuilder {
17
41
  private readonly filePath;
18
42
  private readonly functionStartLine;
@@ -24,6 +48,8 @@ export declare class CfgBuilder {
24
48
  private readonly blocks;
25
49
  private readonly edges;
26
50
  private readonly edgeKeys;
51
+ /** Live recursive-descent nesting depth — see {@link enterNesting}. */
52
+ private nesting;
27
53
  readonly entryIndex: number;
28
54
  readonly exitIndex: number;
29
55
  constructor(filePath: string, functionStartLine: number, functionEndLine: number,
@@ -48,6 +74,30 @@ export declare class CfgBuilder {
48
74
  */
49
75
  attachFacts(index: number, facts: StatementFacts): void;
50
76
  get blockCount(): number;
77
+ /**
78
+ * Run `fn` inside ONE nested block scope (#2195) — the single choke every
79
+ * visitor's `visitBody` / `visitSeq` funnels through. Enters on the way in and
80
+ * exits in a `finally`, so the live depth is balanced on every return AND every
81
+ * throw and the enter/exit can never drift out of pair (the reason this is one
82
+ * helper, not 24 hand-paired call sites). Throws {@link CfgNestingDepthError}
83
+ * when nesting exceeds {@link MAX_CFG_NESTING_DEPTH} — a proactive, deterministic
84
+ * bail before the native stack can overflow on a pathologically nested function.
85
+ *
86
+ * A block-bodied construct passes through BOTH visitBody and visitSeq, so it
87
+ * costs TWO scopes per lexical level: the effective structural ceiling is
88
+ * ~MAX_CFG_NESTING_DEPTH/2 (~250) lexical levels for block bodies (~500 for
89
+ * single-statement bodies / bare blocks, which hit only one of the two). Still
90
+ * an order of magnitude below the native limit and far above real code (≤ ~50).
91
+ */
92
+ withNesting<T>(fn: () => T): T;
93
+ /**
94
+ * Increment the nesting counter, throwing {@link CfgNestingDepthError} past the
95
+ * cap. Prefer {@link withNesting}, which pairs the exit in a `finally`; this is
96
+ * exposed for direct depth-accounting tests only.
97
+ */
98
+ enterNesting(): void;
99
+ /** Decrement the nesting counter — the partner of {@link enterNesting}. */
100
+ exitNesting(): void;
51
101
  /** Produce the serializable CFG. Caller is responsible for having wired the
52
102
  * function's dangling exits to {@link exitIndex} before calling.
53
103
  *
@@ -1,3 +1,31 @@
1
+ /**
2
+ * Hard ceiling on CFG recursive-descent scope-entry depth (#2195). A language
3
+ * `CfgVisitor` wraps each nested block scope in {@link CfgBuilder.withNesting} (its
4
+ * `visitBody` / `visitSeq` choke points), so the live count tracks scope entries,
5
+ * not statement width. NOTE the count is ~2× LEXICAL nesting for block-bodied
6
+ * constructs (visitBody → visitSeq both enter), so the effective lexical ceiling
7
+ * is ~250 levels for block bodies (~500 for single-statement bodies / bare
8
+ * blocks). Real source nests ≤ ~50 deep, so this fires only on machine-generated
9
+ * / adversarial input. Both effective ceilings sit far below the engine's native
10
+ * stack limit (~1.2k+ nesting even on the raised worker `stackSizeMb`), so the
11
+ * bail is a DETERMINISTIC, language-independent {@link CfgNestingDepthError}
12
+ * rather than a nondeterministic `RangeError` thrown somewhere mid-walk.
13
+ */
14
+ export const MAX_CFG_NESTING_DEPTH = 500;
15
+ /**
16
+ * Thrown by the visitor nesting-depth guard ({@link CfgBuilder.enterNesting})
17
+ * when lexical nesting exceeds {@link MAX_CFG_NESTING_DEPTH}. `collectFunctionCfgs`
18
+ * catches it and counts the function under `skipped.tooDeeplyNested`, isolating
19
+ * the bail to one function instead of risking a worker-wide stack overflow.
20
+ */
21
+ export class CfgNestingDepthError extends Error {
22
+ limit;
23
+ constructor(limit) {
24
+ super(`CFG nesting depth exceeded ${limit}`);
25
+ this.limit = limit;
26
+ this.name = 'CfgNestingDepthError';
27
+ }
28
+ }
1
29
  export class CfgBuilder {
2
30
  filePath;
3
31
  functionStartLine;
@@ -6,6 +34,8 @@ export class CfgBuilder {
6
34
  blocks = [];
7
35
  edges = [];
8
36
  edgeKeys = new Set();
37
+ /** Live recursive-descent nesting depth — see {@link enterNesting}. */
38
+ nesting = 0;
9
39
  entryIndex;
10
40
  exitIndex;
11
41
  constructor(filePath, functionStartLine, functionEndLine,
@@ -72,6 +102,43 @@ export class CfgBuilder {
72
102
  get blockCount() {
73
103
  return this.blocks.length;
74
104
  }
105
+ /**
106
+ * Run `fn` inside ONE nested block scope (#2195) — the single choke every
107
+ * visitor's `visitBody` / `visitSeq` funnels through. Enters on the way in and
108
+ * exits in a `finally`, so the live depth is balanced on every return AND every
109
+ * throw and the enter/exit can never drift out of pair (the reason this is one
110
+ * helper, not 24 hand-paired call sites). Throws {@link CfgNestingDepthError}
111
+ * when nesting exceeds {@link MAX_CFG_NESTING_DEPTH} — a proactive, deterministic
112
+ * bail before the native stack can overflow on a pathologically nested function.
113
+ *
114
+ * A block-bodied construct passes through BOTH visitBody and visitSeq, so it
115
+ * costs TWO scopes per lexical level: the effective structural ceiling is
116
+ * ~MAX_CFG_NESTING_DEPTH/2 (~250) lexical levels for block bodies (~500 for
117
+ * single-statement bodies / bare blocks, which hit only one of the two). Still
118
+ * an order of magnitude below the native limit and far above real code (≤ ~50).
119
+ */
120
+ withNesting(fn) {
121
+ this.enterNesting();
122
+ try {
123
+ return fn();
124
+ }
125
+ finally {
126
+ this.exitNesting();
127
+ }
128
+ }
129
+ /**
130
+ * Increment the nesting counter, throwing {@link CfgNestingDepthError} past the
131
+ * cap. Prefer {@link withNesting}, which pairs the exit in a `finally`; this is
132
+ * exposed for direct depth-accounting tests only.
133
+ */
134
+ enterNesting() {
135
+ if (++this.nesting > MAX_CFG_NESTING_DEPTH)
136
+ throw new CfgNestingDepthError(MAX_CFG_NESTING_DEPTH);
137
+ }
138
+ /** Decrement the nesting counter — the partner of {@link enterNesting}. */
139
+ exitNesting() {
140
+ this.nesting--;
141
+ }
75
142
  /** Produce the serializable CFG. Caller is responsible for having wired the
76
143
  * function's dangling exits to {@link exitIndex} before calling.
77
144
  *
@@ -22,9 +22,30 @@ import type { CfgVisitor, FunctionCfg } from './types.js';
22
22
  * both expensive and low-value. Overridable via `PipelineOptions.pdgMaxFunctionLines`.
23
23
  */
24
24
  export declare const DEFAULT_PDG_MAX_FUNCTION_LINES = 2000;
25
+ /**
26
+ * CFG-bearing functions skipped during the walk, bucketed by reason (#2195).
27
+ * Surfaced per-language in the parse telemetry (parsing-processor.ts) so a CFG
28
+ * coverage gap is observable, not silent. All-zero ⇒ nothing skipped.
29
+ */
30
+ export interface CfgSkipCounts {
31
+ /** Source span exceeded `maxFunctionLines` (minified / generated code). */
32
+ readonly tooManyLines: number;
33
+ /**
34
+ * Recursive-descent nesting hit {@link MAX_CFG_NESTING_DEPTH} — a proactive,
35
+ * deterministic bail (see {@link CfgNestingDepthError}) before a worker stack
36
+ * overflow.
37
+ */
38
+ readonly tooDeeplyNested: number;
39
+ /**
40
+ * `buildFunctionCfg` threw an unexpected error. Caught PER FUNCTION so one
41
+ * malformed function no longer drops the whole file's CFGs (the throw used to
42
+ * escape to the worker's language-group catch).
43
+ */
44
+ readonly buildError: number;
45
+ }
25
46
  export interface CollectedCfgs {
26
47
  readonly cfgs: readonly FunctionCfg[];
27
- /** Functions skipped for exceeding `maxFunctionLines` (0 ⇒ none skipped). */
28
- readonly skipped: number;
48
+ /** Per-reason skip counts (#2195). */
49
+ readonly skipped: CfgSkipCounts;
29
50
  }
30
- export declare function collectFunctionCfgs(root: SyntaxNode, visitor: CfgVisitor<SyntaxNode>, filePath: string, maxFunctionLines?: number): CollectedCfgs;
51
+ export declare function collectFunctionCfgs(root: SyntaxNode, visitor: CfgVisitor<SyntaxNode>, filePath: string, maxFunctionLines?: number, lineOffset?: number): CollectedCfgs;
@@ -1,3 +1,4 @@
1
+ import { CfgNestingDepthError } from './cfg-builder.js';
1
2
  /**
2
3
  * Default per-function source-line cap used by the worker when the `--pdg` run
3
4
  * does not specify `pdgMaxFunctionLines`. A function longer than this (almost
@@ -5,21 +6,63 @@
5
6
  * both expensive and low-value. Overridable via `PipelineOptions.pdgMaxFunctionLines`.
6
7
  */
7
8
  export const DEFAULT_PDG_MAX_FUNCTION_LINES = 2000;
8
- export function collectFunctionCfgs(root, visitor, filePath, maxFunctionLines = 0) {
9
+ /**
10
+ * Convert a CFG built from an EXTRACTED sub-document's AST (script-relative
11
+ * tree-sitter rows) into the enclosing file's coordinates by adding `offset` to
12
+ * every source-line field. Needed for embedded scripts — a Vue SFC `<script>`
13
+ * block parses at row 0 but lives at `lineOffset` in the `.vue` file, and every
14
+ * other worker-emitted graph node is already file-relative; without this, the
15
+ * CFG's `functionStartLine` would never join its Function/Method graph node
16
+ * (inter-procedural taint silently resolves nothing) and BasicBlock source
17
+ * lines would point at the wrong `.vue` line. A 0 offset returns the input
18
+ * unchanged (the common case: `.ts`/`.js`/etc. parse at the file root), keeping
19
+ * non-embedded languages byte-identical. Synthetic bindings keep `declLine` 0.
20
+ */
21
+ function shiftCfgLines(cfg, offset) {
22
+ if (offset === 0)
23
+ return cfg;
24
+ return {
25
+ ...cfg,
26
+ functionStartLine: cfg.functionStartLine + offset,
27
+ functionEndLine: cfg.functionEndLine + offset,
28
+ blocks: cfg.blocks.map((b) => ({
29
+ ...b,
30
+ startLine: b.startLine + offset,
31
+ endLine: b.endLine + offset,
32
+ statements: b.statements?.map((s) => ({ ...s, line: s.line + offset })),
33
+ })),
34
+ bindings: cfg.bindings?.map((bd) => bd.declLine > 0 ? { ...bd, declLine: bd.declLine + offset } : bd),
35
+ };
36
+ }
37
+ export function collectFunctionCfgs(root, visitor, filePath, maxFunctionLines = 0, lineOffset = 0) {
9
38
  const cfgs = [];
10
- let skipped = 0;
39
+ let tooManyLines = 0;
40
+ let tooDeeplyNested = 0;
41
+ let buildError = 0;
11
42
  const stack = [root];
12
43
  while (stack.length) {
13
44
  const node = stack.pop();
14
45
  if (visitor.isFunction(node)) {
15
46
  const lines = node.endPosition.row - node.startPosition.row + 1;
16
47
  if (maxFunctionLines > 0 && lines > maxFunctionLines) {
17
- skipped++;
48
+ tooManyLines++;
18
49
  }
19
50
  else {
20
- const cfg = visitor.buildFunctionCfg(node, filePath);
21
- if (cfg)
22
- cfgs.push(cfg);
51
+ // Isolate the per-function build: a proactive deep-nesting bail
52
+ // (CfgNestingDepthError) or any other visitor throw is counted and
53
+ // skipped HERE, so it can't escape to the worker's language-group catch
54
+ // and silently drop every remaining function's CFG (#2195).
55
+ try {
56
+ const cfg = visitor.buildFunctionCfg(node, filePath);
57
+ if (cfg)
58
+ cfgs.push(shiftCfgLines(cfg, lineOffset));
59
+ }
60
+ catch (err) {
61
+ if (err instanceof CfgNestingDepthError)
62
+ tooDeeplyNested++;
63
+ else
64
+ buildError++;
65
+ }
23
66
  }
24
67
  }
25
68
  // Descend regardless (a skipped mega-function may still contain small
@@ -30,5 +73,5 @@ export function collectFunctionCfgs(root, visitor, filePath, maxFunctionLines =
30
73
  stack.push(child);
31
74
  }
32
75
  }
33
- return { cfgs, skipped };
76
+ return { cfgs, skipped: { tooManyLines, tooDeeplyNested, buildError } };
34
77
  }
@@ -1,15 +1,26 @@
1
1
  /**
2
- * Control dependence (#2085 M5 U3) — Ferrante, Ottenstein & Warren §3.1.1 over
3
- * the post-dominator tree. A block `dependent` is control-dependent on a branch
4
- * block `controller` when `controller` decides whether `dependent` executes:
5
- * formally, there is a CFG edge `controller → B` such that `dependent`
6
- * post-dominates `B` but does NOT strictly post-dominate `controller`.
2
+ * Control dependence (#2085 M5 U3) — Ferrante, Ottenstein & Warren §3.1.1
3
+ * semantics. A block `dependent` is control-dependent on a branch block
4
+ * `controller` when `controller` decides whether `dependent` executes: formally,
5
+ * there is a CFG edge `controller → B` such that `dependent` post-dominates `B`
6
+ * but does NOT strictly post-dominate `controller`.
7
7
  *
8
- * Construction (§3.1.1): for each CFG edge `(A, B)` where `B` does NOT
9
- * post-dominate `A`, walk UP the post-dom tree from `B` to (but not including)
10
- * `ipdom(A)`; every block on that path is control-dependent on `A`. The branch
11
- * SENSE of the edge ('T' | 'F') becomes the edge label (KTD4 / KTD3 — it rides
12
- * the persisted relation's `reason` column).
8
+ * Construction — the reverse-CFG dominance-frontier formulation (Cytron,
9
+ * Ferrante, Rosen, Wegman & Zadeck 1991): control dependence IS the dominance
10
+ * frontier of the reverse CFG, so `A ∈ PDF(X)` (the post-dominance frontier)
11
+ * ⟺ `X` is control-dependent on `A`. The PDF is computed bottom-up over the
12
+ * post-dom tree (`PDF_local` from a node's CFG predecessors + `PDF_up` from its
13
+ * post-dom-tree children) in O(N + E + output) — each up-step is charged to a
14
+ * distinct emitted edge, NOT re-walked per CFG edge as the original §3.1.1
15
+ * up-walk did (which was Θ(N²) on a deep post-dom chain). The two formulations
16
+ * enumerate the IDENTICAL full `(controller, dependent, label)` set (verified
17
+ * byte-identical on 3203 CFGs + ~1M-case differential fuzz); LLVM, Joern and WALA
18
+ * use the reverse-DF form. (Only the rare TRUNCATED prefix — when a function
19
+ * exceeds `maxEdges` — differs from the old prefix: it is now a sorted
20
+ * deterministic prefix rather than CFG-edge-iteration order. Both are valid,
21
+ * deterministic subsets; the full untruncated output is unchanged.)
22
+ * The branch SENSE ('T' | 'F') of the controlling edge becomes the edge label
23
+ * (KTD4 / KTD3 — it rides the persisted relation's `reason` column).
13
24
  *
14
25
  * PURE AND DETERMINISTIC (mirrors post-dominators.ts / reaching-defs.ts): no
15
26
  * graph, no logger, importable outside the worker; output is deduped per
@@ -1,15 +1,26 @@
1
1
  /**
2
- * Control dependence (#2085 M5 U3) — Ferrante, Ottenstein & Warren §3.1.1 over
3
- * the post-dominator tree. A block `dependent` is control-dependent on a branch
4
- * block `controller` when `controller` decides whether `dependent` executes:
5
- * formally, there is a CFG edge `controller → B` such that `dependent`
6
- * post-dominates `B` but does NOT strictly post-dominate `controller`.
2
+ * Control dependence (#2085 M5 U3) — Ferrante, Ottenstein & Warren §3.1.1
3
+ * semantics. A block `dependent` is control-dependent on a branch block
4
+ * `controller` when `controller` decides whether `dependent` executes: formally,
5
+ * there is a CFG edge `controller → B` such that `dependent` post-dominates `B`
6
+ * but does NOT strictly post-dominate `controller`.
7
7
  *
8
- * Construction (§3.1.1): for each CFG edge `(A, B)` where `B` does NOT
9
- * post-dominate `A`, walk UP the post-dom tree from `B` to (but not including)
10
- * `ipdom(A)`; every block on that path is control-dependent on `A`. The branch
11
- * SENSE of the edge ('T' | 'F') becomes the edge label (KTD4 / KTD3 — it rides
12
- * the persisted relation's `reason` column).
8
+ * Construction — the reverse-CFG dominance-frontier formulation (Cytron,
9
+ * Ferrante, Rosen, Wegman & Zadeck 1991): control dependence IS the dominance
10
+ * frontier of the reverse CFG, so `A ∈ PDF(X)` (the post-dominance frontier)
11
+ * ⟺ `X` is control-dependent on `A`. The PDF is computed bottom-up over the
12
+ * post-dom tree (`PDF_local` from a node's CFG predecessors + `PDF_up` from its
13
+ * post-dom-tree children) in O(N + E + output) — each up-step is charged to a
14
+ * distinct emitted edge, NOT re-walked per CFG edge as the original §3.1.1
15
+ * up-walk did (which was Θ(N²) on a deep post-dom chain). The two formulations
16
+ * enumerate the IDENTICAL full `(controller, dependent, label)` set (verified
17
+ * byte-identical on 3203 CFGs + ~1M-case differential fuzz); LLVM, Joern and WALA
18
+ * use the reverse-DF form. (Only the rare TRUNCATED prefix — when a function
19
+ * exceeds `maxEdges` — differs from the old prefix: it is now a sorted
20
+ * deterministic prefix rather than CFG-edge-iteration order. Both are valid,
21
+ * deterministic subsets; the full untruncated output is unchanged.)
22
+ * The branch SENSE ('T' | 'F') of the controlling edge becomes the edge label
23
+ * (KTD4 / KTD3 — it rides the persisted relation's `reason` column).
13
24
  *
14
25
  * PURE AND DETERMINISTIC (mirrors post-dominators.ts / reaching-defs.ts): no
15
26
  * graph, no logger, importable outside the worker; output is deduped per
@@ -18,7 +29,7 @@
18
29
  * control-dependent on ITSELF (`controller === dependent`) — the loop predicate
19
30
  * gates its own re-execution; this is standard PDG behavior, not a bug.
20
31
  */
21
- import { computePostDominators, postDominates, NO_IPDOM, } from './post-dominators.js';
32
+ import { computePostDominators, NO_IPDOM } from './post-dominators.js';
22
33
  function buildArmSenses(cfg) {
23
34
  const n = cfg.blocks.length;
24
35
  const senses = Array.from({ length: n }, () => ({
@@ -65,56 +76,107 @@ function labelFor(kind, controller) {
65
76
  * module doc for the purity/determinism contract.
66
77
  */
67
78
  export function computeControlDependence(cfg, postDom,
68
- // Heap-safety ceiling on materialized edges, mirroring computeReachingDefs'
69
- // `maxFacts` (#2188 review): the pre-dedup walk is O(edges × post-dom depth),
70
- // so bound it before it can spike. `0` ⇒ unbounded. On overflow `edges` is a
71
- // deterministic prefix and `truncated` is set — never a silent drop.
79
+ // Output-size ceiling, mirroring computeReachingDefs' `maxFacts` (#2188 review).
80
+ // The reverse-DF set is the bounded (controller, dependent, label) dependence
81
+ // relation, so peak working set ≈ output here (no pre-dedup spike like the old
82
+ // up-walk) — this caps the final edge COUNT, not transient memory. `0` ⇒
83
+ // unbounded. On overflow `edges` is a deterministic SORTED prefix and
84
+ // `truncated` is set — never a silent drop. (Because it is sorted, the prefix
85
+ // CONTENTS may differ from the old up-walk's CFG-edge-iteration prefix at the
86
+ // cap boundary; the FULL untruncated set is byte-identical — see the module doc.)
72
87
  maxEdges = 0) {
73
88
  const tree = postDom ?? computePostDominators(cfg);
74
89
  const { ipdom } = tree;
75
90
  const n = cfg.blocks.length;
76
91
  const armSenses = buildArmSenses(cfg);
77
92
  const cap = maxEdges > 0 ? maxEdges : Infinity;
78
- const out = [];
79
- const seen = new Set();
80
- let truncated = false;
81
- scan: for (const e of cfg.edges) {
82
- const a = e.from;
83
- const b = e.to;
84
- if (a < 0 || a >= n || b < 0 || b >= n)
85
- continue;
86
- // No control dependence when B post-dominates A — every path leaving A
87
- // through this edge still reaches B, so A does not decide B's execution.
88
- // This guard is exactly AC2: a dependence exists IFF post-dominance fails.
89
- if (postDominates(tree, b, a))
93
+ // Reverse-CFG post-dominance frontier (Cytron, Ferrante, Rosen, Wegman,
94
+ // Zadeck 1991): control dependence IS the dominance frontier of the reverse
95
+ // CFG. `A ∈ PDF(X)` ⟺ X is control-dependent on A, so emit (controller=A,
96
+ // dependent=X). Computing the PDF bottom-up over the post-dom tree charges
97
+ // each up-step to a DISTINCT emitted entry — O(N+E+output) — instead of the
98
+ // old Ferrante up-walk that re-climbs the ipdom chain per CFG edge (Θ(N²) on
99
+ // a deep post-dom chain). Output is the identical (controller, dependent,
100
+ // label) set (verified byte-identical on 3203 CFGs across all languages +
101
+ // fuzz) and 1-2 orders of magnitude faster. LLVM (ReverseIDFCalculator),
102
+ // Joern (CdgPass) and WALA use the same formulation.
103
+ const children = Array.from({ length: n }, () => []);
104
+ const inEdges = Array.from({ length: n }, () => []);
105
+ for (let b = 0; b < n; b++) {
106
+ const ip = ipdom[b];
107
+ if (ip !== NO_IPDOM && ip >= 0 && ip < n)
108
+ children[ip].push(b);
109
+ }
110
+ for (const e of cfg.edges) {
111
+ if (e.from < 0 || e.from >= n || e.to < 0 || e.to >= n)
90
112
  continue;
91
- // Sense is read from the CONTROLLER's arms, not this edge's kind alone —
92
- // seq/loop-back fall-through false arms would otherwise mislabel as 'T'
93
- // (#2188 F1).
94
- const label = labelFor(e.kind, armSenses[a]);
95
- const stop = ipdom[a]; // walk up to ipdom(A), EXCLUSIVE (NO_IPDOM to root)
96
- let cur = b;
97
- let steps = 0;
98
- // `steps <= n` is defensive — the ipdom chain is a finite tree.
99
- while (cur !== NO_IPDOM && cur !== stop && steps <= n) {
100
- const key = `${a}:${cur}:${label}`;
101
- if (!seen.has(key)) {
102
- // Check BEFORE pushing so `truncated` means a genuine overflow (a new
103
- // unique edge had to be dropped), not merely "reached the ceiling" —
104
- // exactly `cap` edges is a full, non-truncated result.
105
- if (out.length >= cap) {
106
- truncated = true;
107
- break scan;
108
- }
109
- seen.add(key);
110
- out.push({ controllerBlock: a, dependentBlock: cur, label });
113
+ inEdges[e.to].push({ from: e.from, kind: e.kind });
114
+ }
115
+ // Post-dom-tree post-order (children before parents). Iterative — the post-dom
116
+ // forest can itself be chain-deep. Roots are the NO_IPDOM nodes (EXIT, plus
117
+ // any exit-unreachable region per #2188 F2). The reverse of a root-first DFS
118
+ // visits every parent AFTER all its descendants.
119
+ const dfs = [];
120
+ for (let r = 0; r < n; r++)
121
+ if (ipdom[r] === NO_IPDOM)
122
+ dfs.push(r);
123
+ const preorder = [];
124
+ while (dfs.length) {
125
+ const x = dfs.pop();
126
+ preorder.push(x);
127
+ for (const c of children[x])
128
+ dfs.push(c);
129
+ }
130
+ const order = preorder.reverse();
131
+ // PDF[X]: controller A → the label SET with which A controls X. A set (not one
132
+ // label) because a controller can reach X via opposite-sense arms (goto-
133
+ // cycles) — the old (a, cur, label) dedup kept both rows.
134
+ const pdf = Array.from({ length: n }, () => new Map());
135
+ const add = (x, a, label) => {
136
+ const set = pdf[x].get(a);
137
+ if (set)
138
+ set.add(label);
139
+ else
140
+ pdf[x].set(a, new Set([label]));
141
+ };
142
+ for (const x of order) {
143
+ // PDF_local: a CFG-predecessor A of X that X does not (immediately) post-
144
+ // dominate. `A !== X && ipdom[A] !== X` is exactly the production
145
+ // `!postDominates(X, A)` for one edge A→X (postDominates(X,A) ⟺ ipdom[A]===X),
146
+ // and it excludes self-edges + NO_IPDOM regions. Sense is read from the
147
+ // CONTROLLER's arms (seq/loop-back fall-through false arms would otherwise
148
+ // mislabel as 'T' — #2188 F1).
149
+ for (const { from: a, kind } of inEdges[x]) {
150
+ if (a !== x && ipdom[a] !== x)
151
+ add(x, a, labelFor(kind, armSenses[a]));
152
+ }
153
+ // PDF_up: inherit each post-dom child's frontier controller (with its label
154
+ // set) when X does not post-dominate it.
155
+ for (const z of children[x]) {
156
+ for (const [a, labels] of pdf[z]) {
157
+ if (ipdom[a] !== x)
158
+ for (const l of labels)
159
+ add(x, a, l);
111
160
  }
112
- cur = ipdom[cur];
113
- steps += 1;
161
+ }
162
+ }
163
+ const out = [];
164
+ for (const x of order) {
165
+ for (const [a, labels] of pdf[x]) {
166
+ for (const label of labels)
167
+ out.push({ controllerBlock: a, dependentBlock: x, label });
114
168
  }
115
169
  }
116
170
  out.sort((x, y) => x.controllerBlock - y.controllerBlock ||
117
171
  x.dependentBlock - y.dependentBlock ||
118
172
  (x.label < y.label ? -1 : x.label > y.label ? 1 : 0));
173
+ // `maxEdges` is a heap-safety backstop applied to the SORTED set (the DF makes
174
+ // overflow far rarer than the old per-edge walk). Deterministic prefix, never
175
+ // a silent drop; mirrors computeReachingDefs' `truncated`.
176
+ let truncated = false;
177
+ if (cap !== Infinity && out.length > cap) {
178
+ truncated = true;
179
+ out.length = cap;
180
+ }
119
181
  return { edges: out, truncated };
120
182
  }
@@ -76,6 +76,25 @@ export declare const POST_DOMINATE_DEBUG_ENV = "GITNEXUS_PDG_EMIT_POST_DOMINATE"
76
76
  export declare const REACHING_DEF_FACTS_PER_EDGE_CAP = 4;
77
77
  /** Derived emit-path fact limit at the default edge cap (bench/doc anchor). */
78
78
  export declare const DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION: number;
79
+ /**
80
+ * Fixpoint-iteration budget for {@link computeReachingDefs}, as a multiple of
81
+ * the function's block count ({@link emitFileReachingDefs} passes
82
+ * `blocks.length × this` as `maxBlockVisits`). Iterative reaching-defs on a
83
+ * reducible CFG converges in O(loop-nesting-depth) passes, so a worklist
84
+ * re-visits each block a small multiple of times for real code; this budget
85
+ * tolerates a nesting depth far beyond any hand-written function (real code is
86
+ * ≤ ~15 deep) while truncating the pathological deep nest that otherwise drives
87
+ * the solver to O(blocks²) — measured at seconds of runtime and gigabytes of memory on a machine-generated
88
+ * 2000-line all-loops function whose fact count stays linear (so `maxFacts`
89
+ * never fires). Truncation degrades to a sound empty REACHING_DEF for that one
90
+ * function (status `truncated`), never wrong facts.
91
+ *
92
+ * This ceiling is the SOUND backstop, not a perf fix: WTO / loop-aware iteration
93
+ * ordering was benchmarked and rejected (0% faster — the cost is dense-set
94
+ * propagation, not visitation order; see the no-go note in reaching-defs.ts at
95
+ * the RPO-order site). SSA-sparse reaching-defs is the deferred real fix.
96
+ */
97
+ export declare const DEFAULT_PDG_MAX_REACHING_DEF_BLOCK_REVISITS = 64;
79
98
  export interface CfgEmitResult {
80
99
  blocks: number;
81
100
  edges: number;
@@ -2,6 +2,7 @@ import { generateId } from '../../../lib/utils.js';
2
2
  import { computeReachingDefs } from './reaching-defs.js';
3
3
  import { computeControlDependence } from './control-dependence.js';
4
4
  import { computePostDominators, isExitReachableFromAllBlocks, NO_IPDOM, } from './post-dominators.js';
5
+ import { augmentForPostDom } from './synthetic-escape.js';
5
6
  /**
6
7
  * Default per-function CFG edge cap. A pathological generated function could
7
8
  * otherwise emit an unbounded edge set; the cap bounds graph growth and is
@@ -58,6 +59,25 @@ export const POST_DOMINATE_DEBUG_ENV = 'GITNEXUS_PDG_EMIT_POST_DOMINATE';
58
59
  export const REACHING_DEF_FACTS_PER_EDGE_CAP = 4;
59
60
  /** Derived emit-path fact limit at the default edge cap (bench/doc anchor). */
60
61
  export const DEFAULT_PDG_MAX_REACHING_DEF_FACTS_PER_FUNCTION = REACHING_DEF_FACTS_PER_EDGE_CAP * DEFAULT_PDG_MAX_REACHING_DEF_EDGES_PER_FUNCTION;
62
+ /**
63
+ * Fixpoint-iteration budget for {@link computeReachingDefs}, as a multiple of
64
+ * the function's block count ({@link emitFileReachingDefs} passes
65
+ * `blocks.length × this` as `maxBlockVisits`). Iterative reaching-defs on a
66
+ * reducible CFG converges in O(loop-nesting-depth) passes, so a worklist
67
+ * re-visits each block a small multiple of times for real code; this budget
68
+ * tolerates a nesting depth far beyond any hand-written function (real code is
69
+ * ≤ ~15 deep) while truncating the pathological deep nest that otherwise drives
70
+ * the solver to O(blocks²) — measured at seconds of runtime and gigabytes of memory on a machine-generated
71
+ * 2000-line all-loops function whose fact count stays linear (so `maxFacts`
72
+ * never fires). Truncation degrades to a sound empty REACHING_DEF for that one
73
+ * function (status `truncated`), never wrong facts.
74
+ *
75
+ * This ceiling is the SOUND backstop, not a perf fix: WTO / loop-aware iteration
76
+ * ordering was benchmarked and rejected (0% faster — the cost is dense-set
77
+ * propagation, not visitation order; see the no-go note in reaching-defs.ts at
78
+ * the RPO-order site). SSA-sparse reaching-defs is the deferred real fix.
79
+ */
80
+ export const DEFAULT_PDG_MAX_REACHING_DEF_BLOCK_REVISITS = 64;
61
81
  /**
62
82
  * The single BasicBlock id template (module doc). Exported for the M3 taint
63
83
  * emit path (taint/emit.ts), whose TAINTED/SANITIZES edges must address the
@@ -266,7 +286,10 @@ export function emitFileReachingDefs(graph, cfgs, maxEdgesPerFunction = DEFAULT_
266
286
  `REACHING_DEF skipped for this function; its CFG is unaffected`);
267
287
  continue;
268
288
  }
269
- const r = computeReachingDefs(cfg, { maxFacts });
289
+ const r = computeReachingDefs(cfg, {
290
+ maxFacts,
291
+ maxBlockVisits: cfg.blocks.length * DEFAULT_PDG_MAX_REACHING_DEF_BLOCK_REVISITS,
292
+ });
270
293
  if (r.status === 'no-facts')
271
294
  continue;
272
295
  result.facts += r.facts.length;
@@ -374,24 +397,39 @@ export function emitFileCdg(graph, cfgs, maxEdgesPerFunction = DEFAULT_PDG_MAX_C
374
397
  const emitPostDom = postDominateDebugEnabled();
375
398
  for (const cfg of cfgs) {
376
399
  const { filePath, functionStartLine, functionStartColumn } = cfg;
400
+ // Synthetic-escape pass (#2197 U1): restore EXIT reverse-reachability for a
401
+ // genuine exit-unreachable CYCLE (an unconditional `goto`-cycle / infinite
402
+ // loop) so the post-dom / CDG pass runs instead of being withheld. A no-op
403
+ // (returns `cfg` unchanged) for terminating functions and properly-escaped
404
+ // loops — those stay byte-identical. The synthetic edges are ANALYSIS-ONLY:
405
+ // they live on the returned shallow clone, never on the persisted `cfg`, so
406
+ // CFG / REACHING_DEF and the byte-identical-off golden are unaffected. Both
407
+ // the gate below AND the post-dom / CDG passes must see the augmented view
408
+ // (KTD7 — the Ferrante walk re-reads `cfg.edges`).
409
+ const view = augmentForPostDom(cfg);
377
410
  // Sound post-dominance requires EXIT reachable from every entry-reachable
378
- // block (#2188 review). A CFG that violates it — a future visitor's
379
- // multi-terminal / non-terminating shape would yield a CDG that both
380
- // drops real and invents spurious dependences, so skip CDG for it. CFG and
381
- // REACHING_DEF (emitted elsewhere, independent of post-dominance) are kept.
382
- if (!isExitReachableFromAllBlocks(cfg)) {
411
+ // block (#2188 review). The synthetic-escape pass recovers genuine cycles;
412
+ // anything STILL unreachable after it is a residual non-cycle anomaly (a
413
+ // dangling/dead-end block, a branch-less trapping spin, or a construction
414
+ // error) — NOT something we bridge (that would mask the bug). Skip CDG for
415
+ // it and surface the skip. CFG and REACHING_DEF (emitted elsewhere,
416
+ // independent of post-dominance) are kept.
417
+ if (!isExitReachableFromAllBlocks(view)) {
383
418
  result.skippedUnsoundFunctions++;
384
419
  onWarn?.(`[cdg] ${filePath}:${functionStartLine}: EXIT not reachable from all ` +
385
420
  `blocks — CDG skipped for this function (CFG/REACHING_DEF unaffected)`);
386
421
  continue;
387
422
  }
388
423
  // Compute the post-dom tree once and feed it to the control-dependence
389
- // pass (avoids recomputing it) and to the optional POST_DOMINATE emit.
390
- const tree = computePostDominators(cfg);
424
+ // pass (avoids recomputing it) and to the optional POST_DOMINATE emit. The
425
+ // CDG edges reference BLOCK INDICES, which are identical in `view` and `cfg`
426
+ // (the augmentation only appends edges), so persisting them keyed off the
427
+ // original block ids is correct.
428
+ const tree = computePostDominators(view);
391
429
  // Bound the pre-dedup materialization (heap parity with REACHING_DEF). The
392
430
  // fixed ceiling is a catastrophe backstop; the per-function edge cap below
393
431
  // remains the reporting authority. A ceiling hit is surfaced, not silent.
394
- const { edges: cdgEdges, truncated } = computeControlDependence(cfg, tree, DEFAULT_PDG_MAX_CDG_MATERIALIZATION_PER_FUNCTION);
432
+ const { edges: cdgEdges, truncated } = computeControlDependence(view, tree, DEFAULT_PDG_MAX_CDG_MATERIALIZATION_PER_FUNCTION);
395
433
  if (truncated) {
396
434
  onWarn?.(`[cdg] ${filePath}:${functionStartLine}: control-dependence materialization ` +
397
435
  `ceiling (${DEFAULT_PDG_MAX_CDG_MATERIALIZATION_PER_FUNCTION}) reached — ` +