circle-ir 3.8.4 → 3.9.7

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (157) hide show
  1. package/README.md +82 -5
  2. package/dist/analysis/dfg-verifier.d.ts +3 -14
  3. package/dist/analysis/dfg-verifier.js +43 -74
  4. package/dist/analysis/dfg-verifier.js.map +1 -1
  5. package/dist/analysis/interprocedural.d.ts +5 -1
  6. package/dist/analysis/interprocedural.js +62 -60
  7. package/dist/analysis/interprocedural.js.map +1 -1
  8. package/dist/analysis/metrics/index.d.ts +2 -0
  9. package/dist/analysis/metrics/index.js +2 -0
  10. package/dist/analysis/metrics/index.js.map +1 -0
  11. package/dist/analysis/metrics/metric-pass.d.ts +27 -0
  12. package/dist/analysis/metrics/metric-pass.js +2 -0
  13. package/dist/analysis/metrics/metric-pass.js.map +1 -0
  14. package/dist/analysis/metrics/metric-runner.d.ts +21 -0
  15. package/dist/analysis/metrics/metric-runner.js +47 -0
  16. package/dist/analysis/metrics/metric-runner.js.map +1 -0
  17. package/dist/analysis/metrics/passes/cohesion-metrics-pass.d.ts +21 -0
  18. package/dist/analysis/metrics/passes/cohesion-metrics-pass.js +100 -0
  19. package/dist/analysis/metrics/passes/cohesion-metrics-pass.js.map +1 -0
  20. package/dist/analysis/metrics/passes/complexity-metrics-pass.d.ts +15 -0
  21. package/dist/analysis/metrics/passes/complexity-metrics-pass.js +76 -0
  22. package/dist/analysis/metrics/passes/complexity-metrics-pass.js.map +1 -0
  23. package/dist/analysis/metrics/passes/composite-metrics-pass.d.ts +17 -0
  24. package/dist/analysis/metrics/passes/composite-metrics-pass.js +77 -0
  25. package/dist/analysis/metrics/passes/composite-metrics-pass.js.map +1 -0
  26. package/dist/analysis/metrics/passes/coupling-metrics-pass.d.ts +19 -0
  27. package/dist/analysis/metrics/passes/coupling-metrics-pass.js +94 -0
  28. package/dist/analysis/metrics/passes/coupling-metrics-pass.js.map +1 -0
  29. package/dist/analysis/metrics/passes/data-flow-metrics-pass.d.ts +14 -0
  30. package/dist/analysis/metrics/passes/data-flow-metrics-pass.js +25 -0
  31. package/dist/analysis/metrics/passes/data-flow-metrics-pass.js.map +1 -0
  32. package/dist/analysis/metrics/passes/documentation-metrics-pass.d.ts +15 -0
  33. package/dist/analysis/metrics/passes/documentation-metrics-pass.js +64 -0
  34. package/dist/analysis/metrics/passes/documentation-metrics-pass.js.map +1 -0
  35. package/dist/analysis/metrics/passes/halstead-metrics-pass.d.ts +16 -0
  36. package/dist/analysis/metrics/passes/halstead-metrics-pass.js +95 -0
  37. package/dist/analysis/metrics/passes/halstead-metrics-pass.js.map +1 -0
  38. package/dist/analysis/metrics/passes/inheritance-metrics-pass.d.ts +18 -0
  39. package/dist/analysis/metrics/passes/inheritance-metrics-pass.js +73 -0
  40. package/dist/analysis/metrics/passes/inheritance-metrics-pass.js.map +1 -0
  41. package/dist/analysis/metrics/passes/size-metrics-pass.d.ts +11 -0
  42. package/dist/analysis/metrics/passes/size-metrics-pass.js +64 -0
  43. package/dist/analysis/metrics/passes/size-metrics-pass.js.map +1 -0
  44. package/dist/analysis/passes/circular-dependency-pass.d.ts +18 -0
  45. package/dist/analysis/passes/circular-dependency-pass.js +39 -0
  46. package/dist/analysis/passes/circular-dependency-pass.js.map +1 -0
  47. package/dist/analysis/passes/constant-propagation-pass.d.ts +22 -0
  48. package/dist/analysis/passes/constant-propagation-pass.js +44 -0
  49. package/dist/analysis/passes/constant-propagation-pass.js.map +1 -0
  50. package/dist/analysis/passes/cross-file-pass.d.ts +27 -0
  51. package/dist/analysis/passes/cross-file-pass.js +102 -0
  52. package/dist/analysis/passes/cross-file-pass.js.map +1 -0
  53. package/dist/analysis/passes/dead-code-pass.d.ts +25 -0
  54. package/dist/analysis/passes/dead-code-pass.js +117 -0
  55. package/dist/analysis/passes/dead-code-pass.js.map +1 -0
  56. package/dist/analysis/passes/dependency-fan-out-pass.d.ts +19 -0
  57. package/dist/analysis/passes/dependency-fan-out-pass.js +35 -0
  58. package/dist/analysis/passes/dependency-fan-out-pass.js.map +1 -0
  59. package/dist/analysis/passes/interprocedural-pass.d.ts +29 -0
  60. package/dist/analysis/passes/interprocedural-pass.js +169 -0
  61. package/dist/analysis/passes/interprocedural-pass.js.map +1 -0
  62. package/dist/analysis/passes/language-sources-pass.d.ts +76 -0
  63. package/dist/analysis/passes/language-sources-pass.js +491 -0
  64. package/dist/analysis/passes/language-sources-pass.js.map +1 -0
  65. package/dist/analysis/passes/leaked-global-pass.d.ts +34 -0
  66. package/dist/analysis/passes/leaked-global-pass.js +108 -0
  67. package/dist/analysis/passes/leaked-global-pass.js.map +1 -0
  68. package/dist/analysis/passes/missing-await-pass.d.ts +29 -0
  69. package/dist/analysis/passes/missing-await-pass.js +90 -0
  70. package/dist/analysis/passes/missing-await-pass.js.map +1 -0
  71. package/dist/analysis/passes/missing-public-doc-pass.d.ts +35 -0
  72. package/dist/analysis/passes/missing-public-doc-pass.js +148 -0
  73. package/dist/analysis/passes/missing-public-doc-pass.js.map +1 -0
  74. package/dist/analysis/passes/n-plus-one-pass.d.ts +29 -0
  75. package/dist/analysis/passes/n-plus-one-pass.js +100 -0
  76. package/dist/analysis/passes/n-plus-one-pass.js.map +1 -0
  77. package/dist/analysis/passes/null-deref-pass.d.ts +32 -0
  78. package/dist/analysis/passes/null-deref-pass.js +130 -0
  79. package/dist/analysis/passes/null-deref-pass.js.map +1 -0
  80. package/dist/analysis/passes/orphan-module-pass.d.ts +21 -0
  81. package/dist/analysis/passes/orphan-module-pass.js +38 -0
  82. package/dist/analysis/passes/orphan-module-pass.js.map +1 -0
  83. package/dist/analysis/passes/resource-leak-pass.d.ts +43 -0
  84. package/dist/analysis/passes/resource-leak-pass.js +156 -0
  85. package/dist/analysis/passes/resource-leak-pass.js.map +1 -0
  86. package/dist/analysis/passes/sink-filter-pass.d.ts +39 -0
  87. package/dist/analysis/passes/sink-filter-pass.js +231 -0
  88. package/dist/analysis/passes/sink-filter-pass.js.map +1 -0
  89. package/dist/analysis/passes/stale-doc-ref-pass.d.ts +21 -0
  90. package/dist/analysis/passes/stale-doc-ref-pass.js +96 -0
  91. package/dist/analysis/passes/stale-doc-ref-pass.js.map +1 -0
  92. package/dist/analysis/passes/string-concat-loop-pass.d.ts +26 -0
  93. package/dist/analysis/passes/string-concat-loop-pass.js +87 -0
  94. package/dist/analysis/passes/string-concat-loop-pass.js.map +1 -0
  95. package/dist/analysis/passes/sync-io-async-pass.d.ts +28 -0
  96. package/dist/analysis/passes/sync-io-async-pass.js +80 -0
  97. package/dist/analysis/passes/sync-io-async-pass.js.map +1 -0
  98. package/dist/analysis/passes/taint-matcher-pass.d.ts +24 -0
  99. package/dist/analysis/passes/taint-matcher-pass.js +71 -0
  100. package/dist/analysis/passes/taint-matcher-pass.js.map +1 -0
  101. package/dist/analysis/passes/taint-propagation-pass.d.ts +22 -0
  102. package/dist/analysis/passes/taint-propagation-pass.js +266 -0
  103. package/dist/analysis/passes/taint-propagation-pass.js.map +1 -0
  104. package/dist/analysis/passes/todo-in-prod-pass.d.ts +28 -0
  105. package/dist/analysis/passes/todo-in-prod-pass.js +71 -0
  106. package/dist/analysis/passes/todo-in-prod-pass.js.map +1 -0
  107. package/dist/analysis/passes/unchecked-return-pass.d.ts +34 -0
  108. package/dist/analysis/passes/unchecked-return-pass.js +106 -0
  109. package/dist/analysis/passes/unchecked-return-pass.js.map +1 -0
  110. package/dist/analysis/passes/unused-variable-pass.d.ts +36 -0
  111. package/dist/analysis/passes/unused-variable-pass.js +150 -0
  112. package/dist/analysis/passes/unused-variable-pass.js.map +1 -0
  113. package/dist/analysis/passes/variable-shadowing-pass.d.ts +41 -0
  114. package/dist/analysis/passes/variable-shadowing-pass.js +211 -0
  115. package/dist/analysis/passes/variable-shadowing-pass.js.map +1 -0
  116. package/dist/analysis/path-finder.d.ts +3 -13
  117. package/dist/analysis/path-finder.js +48 -63
  118. package/dist/analysis/path-finder.js.map +1 -1
  119. package/dist/analysis/taint-matcher.js +8 -1
  120. package/dist/analysis/taint-matcher.js.map +1 -1
  121. package/dist/analysis/taint-propagation.d.ts +5 -1
  122. package/dist/analysis/taint-propagation.js +44 -41
  123. package/dist/analysis/taint-propagation.js.map +1 -1
  124. package/dist/analyzer.d.ts +42 -1
  125. package/dist/analyzer.js +234 -1476
  126. package/dist/analyzer.js.map +1 -1
  127. package/dist/browser/circle-ir.js +3413 -1271
  128. package/dist/core/circle-ir-core.cjs +360 -106
  129. package/dist/core/circle-ir-core.js +360 -106
  130. package/dist/core/extractors/imports.js +18 -0
  131. package/dist/core/extractors/imports.js.map +1 -1
  132. package/dist/graph/analysis-pass.d.ts +68 -0
  133. package/dist/graph/analysis-pass.js +51 -0
  134. package/dist/graph/analysis-pass.js.map +1 -0
  135. package/dist/graph/code-graph.d.ts +92 -0
  136. package/dist/graph/code-graph.js +262 -0
  137. package/dist/graph/code-graph.js.map +1 -0
  138. package/dist/graph/import-graph.d.ts +33 -0
  139. package/dist/graph/import-graph.js +170 -0
  140. package/dist/graph/import-graph.js.map +1 -0
  141. package/dist/graph/index.d.ts +4 -0
  142. package/dist/graph/index.js +5 -0
  143. package/dist/graph/index.js.map +1 -0
  144. package/dist/graph/project-graph.d.ts +43 -0
  145. package/dist/graph/project-graph.js +80 -0
  146. package/dist/graph/project-graph.js.map +1 -0
  147. package/dist/graph/scope-graph.d.ts +63 -0
  148. package/dist/graph/scope-graph.js +89 -0
  149. package/dist/graph/scope-graph.js.map +1 -0
  150. package/dist/index.d.ts +2 -2
  151. package/dist/index.js +1 -1
  152. package/dist/index.js.map +1 -1
  153. package/dist/resolution/cross-file.js +52 -19
  154. package/dist/resolution/cross-file.js.map +1 -1
  155. package/dist/types/index.d.ts +151 -0
  156. package/docs/SPEC.md +10 -6
  157. package/package.json +1 -1
package/dist/analyzer.js CHANGED
@@ -3,532 +3,66 @@
3
3
  *
4
4
  * Main entry point for analyzing source code and producing Circle-IR output.
5
5
  * This is the core static analyzer. LLM-based verification and discovery are out of scope for this library.
6
+ *
7
+ * The analysis pipeline runs twenty-one sequential passes over a shared CodeGraph:
8
+ * 1. TaintMatcherPass — config-based source/sink extraction
9
+ * 2. ConstantPropagationPass — dead-code detection, symbol table, field taint
10
+ * 3. LanguageSourcesPass — language-specific sources/sinks (JS, Python, getters)
11
+ * 4. SinkFilterPass — four-stage false-positive elimination
12
+ * 5. TaintPropagationPass — DFG-based flow verification
13
+ * 6. InterproceduralPass — cross-method taint propagation
14
+ * 7. DeadCodePass — CFG blocks unreachable from entry (CWE-561)
15
+ * 8. MissingAwaitPass — unawaited async calls in JS/TS (CWE-252)
16
+ * 9. NPlusOnePass — DB/HTTP calls inside loop bodies (CWE-1049)
17
+ * 10. MissingPublicDocPass — public methods/types without doc comments
18
+ * 11. TodoInProdPass — TODO/FIXME/HACK markers in production code
19
+ * 12. StringConcatLoopPass — string += inside loops, O(n²) allocations (CWE-1046)
20
+ * 13. SyncIoAsyncPass — blocking *Sync calls inside async functions (CWE-1050)
21
+ * 14. UncheckedReturnPass — ignored boolean return from File.delete etc. (CWE-252)
22
+ * 15. NullDerefPass — null-assigned var dereferenced without guard (CWE-476)
23
+ * 16. ResourceLeakPass — stream/connection opened but never closed (CWE-772)
24
+ * 17. VariableShadowingPass — inner scope re-declares outer name (CWE-1109)
25
+ * 18. LeakedGlobalPass — assignment without declaration in JS/TS (CWE-1109)
26
+ * 19. UnusedVariablePass — local variable declared but value never read (CWE-561)
27
+ * 20. DependencyFanOutPass — module imports 20+ other modules (architecture smell)
28
+ * 21. StaleDocRefPass — doc comment references unknown symbol (CWE: none)
6
29
  */
7
30
  import { initParser, parse, extractMeta, extractTypes, extractCalls, extractImports, extractExports, buildCFG, buildDFG, collectAllNodes, } from './core/index.js';
8
- import { analyzeTaint, getDefaultConfig, detectUnresolved, propagateTaint, analyzeInterprocedural, findTaintBridges, analyzeConstantPropagation, isFalsePositive, isCorrelatedPredicateFP } from './analysis/index.js';
9
- import { registerBuiltinPlugins, getLanguagePlugin } from './languages/index.js';
31
+ import { analyzeTaint, getDefaultConfig, detectUnresolved, analyzeConstantPropagation, isFalsePositive, } from './analysis/index.js';
32
+ import { registerBuiltinPlugins } from './languages/index.js';
10
33
  import { logger } from './utils/logger.js';
11
- /**
12
- * Find getter methods that return tainted fields from constructor assignments.
13
- * This enables detection of taint through: constructor param → field → getter return.
14
- */
15
- function findGetterSources(types, instanceFieldTaint, sourceCode) {
16
- const sources = [];
17
- if (instanceFieldTaint.size === 0) {
18
- return sources;
19
- }
20
- // Iterate through all classes and methods
21
- for (const type of types) {
22
- for (const method of type.methods) {
23
- // Look for getter pattern: getXxx() returning a field
24
- const methodName = method.name;
25
- // Check for getter naming convention: getXxx, isXxx, or just xxx
26
- let potentialFieldName = null;
27
- if (methodName.startsWith('get') && methodName.length > 3) {
28
- // getField -> field (lowercase first letter)
29
- potentialFieldName = methodName.charAt(3).toLowerCase() + methodName.substring(4);
30
- }
31
- else if (methodName.startsWith('is') && methodName.length > 2) {
32
- // isField -> field
33
- potentialFieldName = methodName.charAt(2).toLowerCase() + methodName.substring(3);
34
- }
35
- // Check if the method body returns a tainted field
36
- // Simple check: method has no parameters and returns a field that's tracked as tainted
37
- if (method.parameters.length === 0) {
38
- // Check both the potential field name from naming convention and exact match
39
- const fieldsToCheck = potentialFieldName
40
- ? [potentialFieldName, methodName]
41
- : [methodName];
42
- for (const fieldName of fieldsToCheck) {
43
- const fieldTaint = instanceFieldTaint.get(fieldName);
44
- if (fieldTaint && fieldTaint.className === type.name) {
45
- sources.push({
46
- type: 'constructor_field',
47
- location: `${type.name}.${methodName}() returns tainted field '${fieldName}' (from constructor param '${fieldTaint.sourceParam}')`,
48
- severity: 'high',
49
- line: method.start_line,
50
- confidence: 0.95,
51
- });
52
- break; // Found a match, no need to check more fields
53
- }
54
- }
55
- }
56
- // Also check for direct field name match (e.g., method name() returns this.name)
57
- for (const [fieldName, fieldTaint] of instanceFieldTaint) {
58
- if (fieldTaint.className === type.name) {
59
- // Check if method name matches field name directly (common pattern)
60
- if (methodName === fieldName && method.parameters.length === 0) {
61
- // Avoid duplicates
62
- const alreadyAdded = sources.some(s => s.location.includes(`${type.name}.${methodName}()`));
63
- if (!alreadyAdded) {
64
- sources.push({
65
- type: 'constructor_field',
66
- location: `${type.name}.${methodName}() returns tainted field '${fieldName}' (from constructor param '${fieldTaint.sourceParam}')`,
67
- severity: 'high',
68
- line: method.start_line,
69
- confidence: 0.95,
70
- });
71
- }
72
- }
73
- }
74
- }
75
- }
76
- }
77
- return sources;
78
- }
79
- /**
80
- * DOM XSS sink property patterns.
81
- * Used to detect sinks in property assignments like: element.innerHTML = value
82
- */
83
- const JS_DOM_XSS_SINKS = [
84
- { pattern: /\.innerHTML\s*=/, type: 'xss', cwe: 'CWE-79', severity: 'critical' },
85
- { pattern: /\.outerHTML\s*=/, type: 'xss', cwe: 'CWE-79', severity: 'critical' },
86
- { pattern: /document\.write\s*\(/, type: 'xss', cwe: 'CWE-79', severity: 'critical' },
87
- { pattern: /document\.writeln\s*\(/, type: 'xss', cwe: 'CWE-79', severity: 'critical' },
88
- { pattern: /\.insertAdjacentHTML\s*\(/, type: 'xss', cwe: 'CWE-79', severity: 'critical' },
89
- { pattern: /\.src\s*=/, type: 'xss', cwe: 'CWE-79', severity: 'high' },
90
- { pattern: /\.href\s*=/, type: 'xss', cwe: 'CWE-79', severity: 'high' },
91
- ];
92
- /**
93
- * Tainted JavaScript property access patterns.
94
- * Used to detect sources in variable assignments like: var x = req.query.id
95
- */
96
- const JS_TAINTED_PATTERNS = [
97
- { pattern: /\breq\.query\b/, type: 'http_param' },
98
- { pattern: /\breq\.params\b/, type: 'http_param' },
99
- { pattern: /\breq\.body\b/, type: 'http_body' },
100
- { pattern: /\breq\.headers\b/, type: 'http_header' },
101
- { pattern: /\breq\.cookies\b/, type: 'http_cookie' },
102
- { pattern: /\breq\.url\b/, type: 'http_path' },
103
- { pattern: /\breq\.path\b/, type: 'http_path' },
104
- { pattern: /\breq\.originalUrl\b/, type: 'http_path' },
105
- { pattern: /\breq\.files?\b/, type: 'file_input' },
106
- { pattern: /\brequest\.query\b/, type: 'http_param' },
107
- { pattern: /\brequest\.params\b/, type: 'http_param' },
108
- { pattern: /\brequest\.body\b/, type: 'http_body' },
109
- { pattern: /\brequest\.headers\b/, type: 'http_header' },
110
- { pattern: /\bctx\.query\b/, type: 'http_param' },
111
- { pattern: /\bctx\.params\b/, type: 'http_param' },
112
- { pattern: /\bctx\.request\b/, type: 'http_body' },
113
- { pattern: /\bprocess\.env\b/, type: 'env_input' },
114
- { pattern: /\bprocess\.argv\b/, type: 'io_input' },
115
- { pattern: /\blocation\.search\b/, type: 'http_param' },
116
- { pattern: /\blocation\.hash\b/, type: 'http_param' },
117
- { pattern: /\blocation\.href\b/, type: 'http_path' },
118
- { pattern: /\bdocument\.getElementById\b/, type: 'dom_input' },
119
- { pattern: /\bdocument\.querySelector\b/, type: 'dom_input' },
120
- { pattern: /\.value\b/, type: 'dom_input' },
121
- ];
122
- /**
123
- * Python/Flask/Django tainted request access patterns.
124
- * Used to detect sources in assignments like: user_id = request.args.get('id')
125
- * Also covers subscript access: user_id = request.args['id']
126
- */
127
- const PYTHON_TAINTED_PATTERNS = [
128
- { pattern: /\brequest\.args\b/, type: 'http_param' },
129
- { pattern: /\brequest\.form\b/, type: 'http_body' },
130
- { pattern: /\brequest\.json\b/, type: 'http_body' },
131
- { pattern: /\brequest\.data\b/, type: 'http_body' },
132
- { pattern: /\brequest\.files?\b/, type: 'file_input' },
133
- { pattern: /\brequest\.headers?\b/, type: 'http_header' },
134
- { pattern: /\brequest\.cookies\b/, type: 'http_cookie' },
135
- { pattern: /\brequest\.GET\b/, type: 'http_param' },
136
- { pattern: /\brequest\.POST\b/, type: 'http_body' },
137
- { pattern: /\brequest\.META\b/, type: 'http_header' },
138
- { pattern: /\brequest\.FILES\b/, type: 'file_input' },
139
- { pattern: /\brequest\.query_params\b/, type: 'http_param' },
140
- { pattern: /\brequest\.path_params\b/, type: 'http_param' },
141
- // Flask raw query/body strings
142
- { pattern: /\brequest\.query_string\b/, type: 'http_param' },
143
- { pattern: /\brequest\.get_data\s*\(/, type: 'http_body' },
144
- // Request wrapper helper methods (common in OWASP-style benchmarks and real wrappers)
145
- { pattern: /\bget_form_parameter\s*\(/, type: 'http_body' },
146
- { pattern: /\bget_query_parameter\s*\(/, type: 'http_param' },
147
- { pattern: /\bget_header_value\s*\(/, type: 'http_header' },
148
- { pattern: /\bget_cookie_value\s*\(/, type: 'http_cookie' },
149
- ];
150
- /**
151
- * Find JavaScript taint sources from variable assignments.
152
- * Detects patterns like: var userId = req.query.id
153
- */
154
- function findJavaScriptAssignmentSources(sourceCode, language) {
155
- const sources = [];
156
- // Only apply to JavaScript/TypeScript
157
- if (!['javascript', 'typescript'].includes(language)) {
158
- return sources;
159
- }
160
- const lines = sourceCode.split('\n');
161
- for (let lineNum = 0; lineNum < lines.length; lineNum++) {
162
- const line = lines[lineNum];
163
- const lineNumber = lineNum + 1;
164
- // Look for variable assignments: var/let/const x = ...
165
- // or simple assignments: x = ...
166
- const assignmentMatch = line.match(/(?:(?:var|let|const)\s+)?(\w+)\s*=\s*(.+)/);
167
- if (assignmentMatch) {
168
- const varName = assignmentMatch[1];
169
- const rhs = assignmentMatch[2];
170
- // Check if RHS contains a tainted pattern
171
- for (const { pattern, type } of JS_TAINTED_PATTERNS) {
172
- if (pattern.test(rhs)) {
173
- // Don't add duplicates
174
- const alreadyExists = sources.some(s => s.line === lineNumber && s.type === type);
175
- if (!alreadyExists) {
176
- sources.push({
177
- type,
178
- location: `${varName} = ${rhs.trim().substring(0, 50)}${rhs.length > 50 ? '...' : ''}`,
179
- severity: 'high',
180
- line: lineNumber,
181
- confidence: 1.0,
182
- variable: varName,
183
- });
184
- }
185
- break; // Found a match, no need to check more patterns
186
- }
187
- }
188
- }
189
- }
190
- return sources;
191
- }
192
- /**
193
- * Find Python taint sources from variable assignments and subscript access.
194
- * Detects patterns like: user_id = request.args.get('id') or request.args['id']
195
- */
196
- function findPythonAssignmentSources(sourceCode, language) {
197
- const sources = [];
198
- if (language !== 'python') {
199
- return sources;
200
- }
201
- const lines = sourceCode.split('\n');
202
- for (let lineNum = 0; lineNum < lines.length; lineNum++) {
203
- const line = lines[lineNum];
204
- const lineNumber = lineNum + 1;
205
- // Skip comment lines
206
- if (line.trimStart().startsWith('#'))
207
- continue;
208
- // Look for assignments: x = ... or x: type = ...
209
- const assignmentMatch = line.match(/^(\s*\w[\w.]*)\s*(?::\s*\w[\w\[\], .]*)?\s*=\s*(.+)/);
210
- if (assignmentMatch) {
211
- const rhs = assignmentMatch[2];
212
- for (const { pattern, type } of PYTHON_TAINTED_PATTERNS) {
213
- if (pattern.test(rhs)) {
214
- const varMatch = line.match(/^\s*(\w+)\s*/);
215
- const varName = varMatch ? varMatch[1] : 'unknown';
216
- const alreadyExists = sources.some(s => s.line === lineNumber && s.type === type);
217
- if (!alreadyExists) {
218
- sources.push({
219
- type,
220
- location: `${varName} = ${rhs.trim().substring(0, 50)}${rhs.length > 50 ? '...' : ''}`,
221
- severity: 'high',
222
- line: lineNumber,
223
- confidence: 0.95,
224
- variable: varName,
225
- });
226
- }
227
- break;
228
- }
229
- }
230
- }
231
- }
232
- return sources;
233
- }
234
- /**
235
- * Build a map of tainted variable names → source line via simple forward
236
- * line-by-line taint propagation for Python.
237
- *
238
- * Seeds from PYTHON_TAINTED_PATTERNS; propagates through assignments where the
239
- * RHS contains a tainted variable. Uses per-key container taint to distinguish
240
- * map['tainted_key'] from map['safe_key'] and conf.get(s,tainted_k) vs conf.get(s,safe_k).
241
- */
242
- function buildPythonTaintedVars(sourceCode) {
243
- const tainted = new Map();
244
- // Per-key container taint: "map['key']" or "conf['section']['key']" → line number
245
- const containerTainted = new Map();
246
- const lines = sourceCode.split('\n');
247
- for (let i = 0; i < lines.length; i++) {
248
- const line = lines[i];
249
- if (line.trimStart().startsWith('#'))
250
- continue;
251
- // Subscript assignment: container['key'] = value
252
- // Tracks taint per-key so map['keyA']='safe' and map['keyB']=param are distinguished.
253
- const subscriptAssign = line.match(/^\s*(\w+)\[(['"])([^'"]+)\2\]\s*=\s*(.+)$/);
254
- if (subscriptAssign) {
255
- const [, container, , key, rhs2] = subscriptAssign;
256
- const isTaintedRhs = [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(rhs2));
257
- if (isTaintedRhs) {
258
- containerTainted.set(`${container}['${key}']`, i + 1);
259
- }
260
- continue; // subscript assignments don't match simple variable regex below
261
- }
262
- // ConfigParser set: obj.set('section', 'key', value)
263
- // Tracks per (section, key) so conf.get('s','keyA') and conf.get('s','keyB') are distinct.
264
- const setCallMatch = line.match(/^\s*(\w+)\.set\s*\(\s*(['"])([^'"]+)\2\s*,\s*(['"])([^'"]+)\4\s*,\s*(.+?)\s*\)$/);
265
- if (setCallMatch) {
266
- const [, obj, , section, , key, rhs2] = setCallMatch;
267
- const isTaintedRhs = [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(rhs2));
268
- if (isTaintedRhs) {
269
- containerTainted.set(`${obj}['${section}']['${key}']`, i + 1);
270
- }
271
- continue;
272
- }
273
- // Augmented assignment: var += expr — taint if either side is tainted
274
- const augAssign = line.match(/^\s*(\w+)\s*\+=\s*(.+)$/);
275
- if (augAssign) {
276
- const [, augLhs, augRhs] = augAssign;
277
- const rhsTainted = [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(augRhs));
278
- if (rhsTainted || tainted.has(augLhs)) {
279
- tainted.set(augLhs, tainted.get(augLhs) ?? (i + 1));
280
- }
281
- continue;
282
- }
283
- // For loop: for var in tainted_source — seed loop variable as tainted
284
- const forLoopMatch = line.match(/^\s*for\s+(\w+)\s+in\s+(.+?)(?:\s*:\s*)?$/);
285
- if (forLoopMatch) {
286
- const [, iterVar, iterExpr] = forLoopMatch;
287
- const isDirectSource = PYTHON_TAINTED_PATTERNS.some(p => p.pattern.test(iterExpr));
288
- const isPropagated = [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(iterExpr));
289
- if (isDirectSource || isPropagated) {
290
- tainted.set(iterVar, i + 1);
291
- }
292
- continue;
293
- }
294
- // Regular assignment: var = expr
295
- const assignMatch = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
296
- if (!assignMatch)
297
- continue;
298
- const [, lhs, rhs] = assignMatch;
299
- const isDirectSource = PYTHON_TAINTED_PATTERNS.some(p => p.pattern.test(rhs));
300
- let propagatedFrom;
301
- // Per-key dict access: bar = container['key']
302
- const dictAccessMatch = rhs.trim().match(/^(\w+)\[(['"])([^'"]+)\2\]$/);
303
- if (dictAccessMatch) {
304
- const [, container, , key] = dictAccessMatch;
305
- if (containerTainted.has(`${container}['${key}']`)) {
306
- propagatedFrom = `${container}['${key}']`;
307
- }
308
- }
309
- // Per-key configparser get: bar = conf.get('section', 'key')
310
- if (!propagatedFrom) {
311
- const confGetMatch = rhs.trim().match(/^(\w+)\.get\s*\(\s*(['"])([^'"]+)\2\s*,\s*(['"])([^'"]+)\4\s*\)$/);
312
- if (confGetMatch) {
313
- const [, obj, , section, , key] = confGetMatch;
314
- if (containerTainted.has(`${obj}['${section}']['${key}']`)) {
315
- propagatedFrom = `${obj}['${section}']['${key}']`;
316
- }
317
- }
318
- }
319
- // Standard variable propagation (skip os.environ/os.getenv — safe env reads)
320
- if (!propagatedFrom) {
321
- const isSafeEnvRead = /\bos\.environ\.get\s*\(/.test(rhs) || /\bos\.getenv\s*\(/.test(rhs);
322
- if (!isSafeEnvRead) {
323
- propagatedFrom = [...tainted.keys()].find(v => new RegExp(`\\b${v}\\b`).test(rhs));
324
- }
325
- }
326
- if (isDirectSource) {
327
- tainted.set(lhs, i + 1);
328
- }
329
- else if (propagatedFrom !== undefined) {
330
- tainted.set(lhs, i + 1);
331
- }
332
- else if (tainted.has(lhs)) {
333
- // Variable overwritten — preserve taint for null-guard patterns like:
334
- // if not param:
335
- // param = ""
336
- const prevNonBlank = lines.slice(0, i).reverse().find(l => l.trim() && !l.trimStart().startsWith('#'));
337
- const isNullGuard = prevNonBlank !== undefined && (new RegExp(`^\\s*if\\s+not\\s+${lhs}\\s*:`).test(prevNonBlank) ||
338
- new RegExp(`^\\s*if\\s+${lhs}\\s+is\\s+None\\s*:`).test(prevNonBlank));
339
- if (!isNullGuard) {
340
- tainted.delete(lhs);
341
- }
342
- }
343
- }
344
- return tainted;
345
- }
346
- /**
347
- * Forward taint propagation for JavaScript/TypeScript.
348
- * Tracks which local variables are tainted from HTTP request sources.
349
- * Used to filter spurious XSS sinks where the argument is NOT actually tainted
350
- * (e.g., res.send(stdout) where stdout is a callback param from exec(), not user input).
351
- */
352
- function buildJavaScriptTaintedVars(sourceCode, language) {
353
- if (!['javascript', 'typescript'].includes(language))
354
- return new Map();
355
- const tainted = new Map();
356
- const lines = sourceCode.split('\n');
357
- for (let i = 0; i < lines.length; i++) {
358
- const line = lines[i];
359
- // Skip comment lines
360
- const trimmed = line.trimStart();
361
- if (trimmed.startsWith('//') || trimmed.startsWith('*'))
362
- continue;
363
- // Match variable assignments: var/let/const x = rhs OR x = rhs
364
- const assignMatch = line.match(/(?:(?:var|let|const)\s+)?(\w+)\s*=\s*(.+)/);
365
- if (!assignMatch)
366
- continue;
367
- const [, lhs, rhs] = assignMatch;
368
- // Skip keywords that look like assignments but aren't variable names
369
- if (['if', 'while', 'for', 'return', 'true', 'false', 'null', 'undefined', 'case'].includes(lhs))
370
- continue;
371
- // Seed from direct source patterns (req.query.x, req.body, etc.)
372
- const isDirectSource = JS_TAINTED_PATTERNS.some(p => p.pattern.test(rhs));
373
- // Propagate from existing tainted variables
374
- const isTaintedPropagation = tainted.size > 0 &&
375
- [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(rhs));
376
- if (isDirectSource || isTaintedPropagation) {
377
- tainted.set(lhs, i + 1);
378
- }
379
- }
380
- return tainted;
381
- }
382
- /**
383
- * Detect Python apostrophe-check sanitizer guards, e.g.:
384
- * if "'" in bar:
385
- * return # or raise / abort
386
- * Returns the set of variable names that are guarded this way.
387
- */
388
- function findPythonQuoteSanitizedVars(sourceCode) {
389
- const sanitized = new Set();
390
- const lines = sourceCode.split('\n');
391
- for (let i = 0; i < lines.length - 1; i++) {
392
- // Match any apostrophe/quote check: if "'" in var:, if '\'' in var:, if '"' in var:
393
- // Uses full quoted-string pattern to handle Python's various literal forms.
394
- const m = lines[i].match(/^\s*if\s+(?:'(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")\s+in\s+(\w+)\s*:/);
395
- if (!m)
396
- continue;
397
- // Look ahead up to 5 lines for a return/raise/abort/continue/break
398
- // The guard body may be multi-line (e.g. RESPONSE += (...) \n return).
399
- // Stop early if we encounter a line at the same or lesser indentation as the if (block exit).
400
- const ifIndent = (lines[i].match(/^(\s*)/) ?? ['', ''])[1].length;
401
- let foundExit = false;
402
- for (let j = i + 1; j <= Math.min(i + 5, lines.length - 1); j++) {
403
- const jLine = lines[j] ?? '';
404
- if (!jLine.trim())
405
- continue; // skip blank lines
406
- const jIndent = (jLine.match(/^(\s*)/) ?? ['', ''])[1].length;
407
- if (jIndent <= ifIndent)
408
- break; // left the if-block
409
- if (/^(return|raise|abort|continue|break)\b/.test(jLine.trim())) {
410
- foundExit = true;
411
- break;
412
- }
413
- }
414
- if (foundExit) {
415
- sanitized.add(m[1]);
416
- }
417
- }
418
- return sanitized;
419
- }
420
- /**
421
- * Detect Python trust boundary violations:
422
- * flask.session[key] = value (or session[key] = value)
423
- * where key or value references a tainted variable.
424
- */
425
- function findPythonTrustBoundaryViolations(sourceCode, language, taintedVars) {
426
- if (language !== 'python' || taintedVars.size === 0)
427
- return [];
428
- const violations = [];
429
- const lines = sourceCode.split('\n');
430
- const SESSION_WRITE = /(?:flask\.)?session\[([^\]]+)\]\s*=\s*(.+)$/;
431
- const taintedKeys = [...taintedVars.keys()];
432
- const earliestSourceLine = Math.min(...[...taintedVars.values()]);
433
- for (let i = 0; i < lines.length; i++) {
434
- const line = lines[i];
435
- if (line.trimStart().startsWith('#'))
436
- continue;
437
- const m = line.match(SESSION_WRITE);
438
- if (!m)
439
- continue;
440
- const [, keyExpr, valueExpr] = m;
441
- const keyTainted = taintedKeys.some(v => new RegExp(`\\b${v}\\b`).test(keyExpr));
442
- const valueTainted = taintedKeys.some(v => new RegExp(`\\b${v}\\b`).test(valueExpr));
443
- if (keyTainted || valueTainted) {
444
- violations.push({ sourceLine: earliestSourceLine, sinkLine: i + 1 });
445
- }
446
- }
447
- return violations;
448
- }
449
- /**
450
- * Find Python XSS sinks in return/yield statements.
451
- * Flask/Django routes often return HTML strings directly:
452
- * return '<h1>' + user_input + '</h1>'
453
- * return f'<html>{user_input}</html>'
454
- * These are not call nodes so findSinks() never detects them.
455
- */
456
- function findPythonReturnXSSSinks(sourceCode, language, taintedVars) {
457
- if (language !== 'python' || taintedVars.size === 0)
458
- return [];
459
- const sinks = [];
460
- const lines = sourceCode.split('\n');
461
- const taintedKeys = [...taintedVars.keys()];
462
- for (let i = 0; i < lines.length; i++) {
463
- const line = lines[i];
464
- if (line.trimStart().startsWith('#'))
465
- continue;
466
- // Match return/yield statements with string content
467
- const returnMatch = line.match(/^\s*(?:return|yield)\s+(.+)$/);
468
- if (!returnMatch)
469
- continue;
470
- const expr = returnMatch[1];
471
- // Must contain a tainted variable
472
- const hasTaintedVar = taintedKeys.some(v => new RegExp(`\\b${v}\\b`).test(expr));
473
- if (!hasTaintedVar)
474
- continue;
475
- // Must look like HTML (contains '<', or is a string concatenation, or f-string with HTML)
476
- const looksLikeHTML = expr.includes('<') || /['"]\s*\+/.test(expr) || /\+\s*['"]/.test(expr) || /f['"][^'"]*\{/.test(expr);
477
- if (!looksLikeHTML)
478
- continue;
479
- sinks.push({ sinkLine: i + 1 });
480
- }
481
- return sinks;
482
- }
483
- /**
484
- * Find DOM XSS sinks from property assignments in JavaScript.
485
- * Detects patterns like: element.innerHTML = userInput
486
- */
487
- function findJavaScriptDOMSinks(sourceCode, language) {
488
- const sinks = [];
489
- // Only apply to JavaScript/TypeScript
490
- if (!['javascript', 'typescript'].includes(language)) {
491
- return sinks;
492
- }
493
- const lines = sourceCode.split('\n');
494
- for (let lineNum = 0; lineNum < lines.length; lineNum++) {
495
- const line = lines[lineNum];
496
- const lineNumber = lineNum + 1;
497
- // Check for DOM XSS sink patterns
498
- for (const { pattern, type, cwe, severity } of JS_DOM_XSS_SINKS) {
499
- if (pattern.test(line)) {
500
- // Extract the method/property being assigned
501
- let method = 'innerHTML';
502
- if (line.includes('.outerHTML'))
503
- method = 'outerHTML';
504
- else if (line.includes('document.write('))
505
- method = 'document.write';
506
- else if (line.includes('document.writeln('))
507
- method = 'document.writeln';
508
- else if (line.includes('.insertAdjacentHTML'))
509
- method = 'insertAdjacentHTML';
510
- else if (line.includes('.src'))
511
- method = 'src';
512
- else if (line.includes('.href'))
513
- method = 'href';
514
- // Don't add duplicates
515
- const alreadyExists = sinks.some(s => s.line === lineNumber && s.cwe === cwe);
516
- if (!alreadyExists) {
517
- sinks.push({
518
- type,
519
- cwe,
520
- severity,
521
- line: lineNumber,
522
- location: line.trim().substring(0, 80),
523
- method,
524
- });
525
- }
526
- break;
527
- }
528
- }
529
- }
530
- return sinks;
531
- }
34
+ import { CodeGraph, AnalysisPipeline, ProjectGraph } from './graph/index.js';
35
+ import { CrossFilePass } from './analysis/passes/cross-file-pass.js';
36
+ // Pass classes
37
+ import { TaintMatcherPass } from './analysis/passes/taint-matcher-pass.js';
38
+ import { ConstantPropagationPass } from './analysis/passes/constant-propagation-pass.js';
39
+ import { LanguageSourcesPass } from './analysis/passes/language-sources-pass.js';
40
+ import { SinkFilterPass, filterCleanVariableSinks, filterSanitizedSinks } from './analysis/passes/sink-filter-pass.js';
41
+ import { TaintPropagationPass } from './analysis/passes/taint-propagation-pass.js';
42
+ import { InterproceduralPass } from './analysis/passes/interprocedural-pass.js';
43
+ import { DeadCodePass } from './analysis/passes/dead-code-pass.js';
44
+ import { MissingAwaitPass } from './analysis/passes/missing-await-pass.js';
45
+ import { NPlusOnePass } from './analysis/passes/n-plus-one-pass.js';
46
+ import { MissingPublicDocPass } from './analysis/passes/missing-public-doc-pass.js';
47
+ import { TodoInProdPass } from './analysis/passes/todo-in-prod-pass.js';
48
+ import { StringConcatLoopPass } from './analysis/passes/string-concat-loop-pass.js';
49
+ import { SyncIoAsyncPass } from './analysis/passes/sync-io-async-pass.js';
50
+ import { UncheckedReturnPass } from './analysis/passes/unchecked-return-pass.js';
51
+ import { NullDerefPass } from './analysis/passes/null-deref-pass.js';
52
+ import { ResourceLeakPass } from './analysis/passes/resource-leak-pass.js';
53
+ import { VariableShadowingPass } from './analysis/passes/variable-shadowing-pass.js';
54
+ import { LeakedGlobalPass } from './analysis/passes/leaked-global-pass.js';
55
+ import { UnusedVariablePass } from './analysis/passes/unused-variable-pass.js';
56
+ import { DependencyFanOutPass } from './analysis/passes/dependency-fan-out-pass.js';
57
+ import { StaleDocRefPass } from './analysis/passes/stale-doc-ref-pass.js';
58
+ // Project-level pass imports
59
+ import { ImportGraph } from './graph/import-graph.js';
60
+ import { CircularDependencyPass } from './analysis/passes/circular-dependency-pass.js';
61
+ import { OrphanModulePass } from './analysis/passes/orphan-module-pass.js';
62
+ // Metrics
63
+ import { MetricRunner } from './analysis/metrics/index.js';
64
+ // Helpers used by analyzeForAPI
65
+ import { buildPythonTaintedVars, buildPythonSanitizedVars, findPythonTrustBoundaryViolations, } from './analysis/passes/language-sources-pass.js';
532
66
  let initialized = false;
533
67
  /**
534
68
  * Initialize the analyzer. Must be called before analyze().
@@ -601,6 +135,45 @@ function buildEnriched(types, _calls, sources, sinks) {
601
135
  functions: functions.length > 0 ? functions : undefined,
602
136
  };
603
137
  }
138
+ // ---------------------------------------------------------------------------
139
+ // Node type collection — shared by analyze() and analyzeForAPI()
140
+ // ---------------------------------------------------------------------------
141
+ function getNodeTypesForLanguage(language) {
142
+ switch (language) {
143
+ case 'rust':
144
+ return new Set([
145
+ 'call_expression', 'macro_invocation', 'function_item', 'struct_item',
146
+ 'impl_item', 'enum_item', 'trait_item', 'mod_item', 'use_declaration',
147
+ 'let_declaration', 'field_expression', 'scoped_identifier',
148
+ ]);
149
+ case 'python':
150
+ return new Set([
151
+ 'call', 'function_definition', 'class_definition', 'import_statement',
152
+ 'import_from_statement', 'assignment', 'attribute', 'subscript',
153
+ ]);
154
+ case 'javascript':
155
+ case 'typescript':
156
+ return new Set([
157
+ 'call_expression', 'new_expression', 'class_declaration', 'function_declaration',
158
+ 'arrow_function', 'method_definition', 'variable_declaration', 'lexical_declaration',
159
+ 'import_statement', 'export_statement', 'member_expression', 'assignment_expression',
160
+ ]);
161
+ case 'bash':
162
+ return new Set([
163
+ 'command', 'function_definition', 'variable_assignment', 'declaration_command',
164
+ 'if_statement', 'for_statement', 'c_style_for_statement', 'while_statement',
165
+ ]);
166
+ default:
167
+ return new Set([
168
+ 'method_invocation', 'object_creation_expression', 'class_declaration',
169
+ 'method_declaration', 'constructor_declaration', 'field_declaration',
170
+ 'import_declaration', 'interface_declaration', 'enum_declaration',
171
+ ]);
172
+ }
173
+ }
174
+ // ---------------------------------------------------------------------------
175
+ // Main analysis function
176
+ // ---------------------------------------------------------------------------
604
177
  /**
605
178
  * Analyze source code and produce Circle-IR output.
606
179
  */
@@ -613,87 +186,8 @@ export async function analyze(code, filePath, language, options = {}) {
613
186
  const tree = await parse(code, language);
614
187
  logger.trace('Parsed AST', { rootNodeType: tree.rootNode.type });
615
188
  // Collect all node types in a single traversal for better performance
616
- // Different languages have different AST node types
617
- const isJavaScript = language === 'javascript' || language === 'typescript';
618
- const isRust = language === 'rust';
619
- const isPython = language === 'python';
620
- let nodeTypesToCollect;
621
- if (isRust) {
622
- nodeTypesToCollect = new Set([
623
- // Rust AST nodes
624
- 'call_expression',
625
- 'macro_invocation',
626
- 'function_item',
627
- 'struct_item',
628
- 'impl_item',
629
- 'enum_item',
630
- 'trait_item',
631
- 'mod_item',
632
- 'use_declaration',
633
- 'let_declaration',
634
- 'field_expression',
635
- 'scoped_identifier',
636
- ]);
637
- }
638
- else if (isPython) {
639
- nodeTypesToCollect = new Set([
640
- // Python AST nodes
641
- 'call',
642
- 'function_definition',
643
- 'class_definition',
644
- 'import_statement',
645
- 'import_from_statement',
646
- 'assignment',
647
- 'attribute',
648
- 'subscript',
649
- ]);
650
- }
651
- else if (isJavaScript) {
652
- nodeTypesToCollect = new Set([
653
- // JavaScript/TypeScript AST nodes
654
- 'call_expression',
655
- 'new_expression',
656
- 'class_declaration',
657
- 'function_declaration',
658
- 'arrow_function',
659
- 'method_definition',
660
- 'variable_declaration',
661
- 'lexical_declaration',
662
- 'import_statement',
663
- 'export_statement',
664
- 'member_expression',
665
- 'assignment_expression',
666
- ]);
667
- }
668
- else if (language === 'bash') {
669
- nodeTypesToCollect = new Set([
670
- // Bash AST nodes
671
- 'command',
672
- 'function_definition',
673
- 'variable_assignment',
674
- 'declaration_command',
675
- 'if_statement',
676
- 'for_statement',
677
- 'c_style_for_statement',
678
- 'while_statement',
679
- ]);
680
- }
681
- else {
682
- nodeTypesToCollect = new Set([
683
- // Java AST nodes
684
- 'method_invocation',
685
- 'object_creation_expression',
686
- 'class_declaration',
687
- 'method_declaration',
688
- 'constructor_declaration',
689
- 'field_declaration',
690
- 'import_declaration',
691
- 'interface_declaration',
692
- 'enum_declaration',
693
- ]);
694
- }
695
- const nodeCache = collectAllNodes(tree.rootNode, nodeTypesToCollect);
696
- // Extract all components using the cached nodes
189
+ const nodeCache = collectAllNodes(tree.rootNode, getNodeTypesForLanguage(language));
190
+ // Extract all IR components
697
191
  const meta = extractMeta(code, tree, filePath, language);
698
192
  const types = extractTypes(tree, nodeCache, language);
699
193
  const calls = extractCalls(tree, nodeCache, language);
@@ -701,455 +195,51 @@ export async function analyze(code, filePath, language, options = {}) {
701
195
  const exports = extractExports(types);
702
196
  const cfg = buildCFG(tree, language);
703
197
  const dfg = buildDFG(tree, nodeCache, language);
704
- // Extract @sanitizer annotated method names (from Javadoc comments)
705
- const sanitizerMethods = [];
706
- for (const type of types) {
707
- for (const method of type.methods) {
708
- if (method.annotations.includes('sanitizer')) {
709
- sanitizerMethods.push(method.name);
710
- }
711
- }
712
- }
713
- // First, do a preliminary taint analysis to find inter-procedural parameter sources
714
- // These need to be passed to constant propagation so it can track taint from method parameters
715
- let baseConfig = options.taintConfig ?? getDefaultConfig();
716
- // Merge language plugin built-in sources/sinks into the config.
717
- // This handles languages (e.g. Bash) whose patterns are defined on the plugin
718
- // rather than in YAML config files loaded by getDefaultConfig().
719
- if (!options.taintConfig) {
720
- const plugin = getLanguagePlugin(language);
721
- if (plugin) {
722
- const pluginSources = plugin.getBuiltinSources();
723
- const pluginSinks = plugin.getBuiltinSinks();
724
- if (pluginSources.length > 0 || pluginSinks.length > 0) {
725
- baseConfig = {
726
- ...baseConfig,
727
- sources: [
728
- ...baseConfig.sources,
729
- ...pluginSources.map(s => ({
730
- method: s.method,
731
- class: s.class,
732
- annotation: s.annotation,
733
- type: s.type,
734
- severity: s.severity,
735
- return_tainted: s.returnTainted ?? false,
736
- })),
737
- ],
738
- sinks: [
739
- ...baseConfig.sinks,
740
- ...pluginSinks.map(s => ({
741
- method: s.method,
742
- class: s.class,
743
- type: s.type,
744
- cwe: s.cwe,
745
- severity: s.severity,
746
- arg_positions: s.argPositions,
747
- })),
748
- ],
749
- };
750
- }
751
- }
752
- }
753
- const preliminaryTaint = analyzeTaint(calls, types, baseConfig);
754
- // Extract inter-procedural parameter sources
755
- const taintedParameters = [];
756
- for (const source of preliminaryTaint.sources) {
757
- if (source.type === 'interprocedural_param') {
758
- // Location format: "ParamType paramName in methodName"
759
- const match = source.location.match(/(\S+)\s+(\S+)\s+in\s+(\S+)/);
760
- if (match) {
761
- taintedParameters.push({
762
- methodName: match[3],
763
- paramName: match[2],
764
- });
765
- }
766
- }
767
- }
768
- // Run constant propagation with tainted parameters
769
- const constPropResult = analyzeConstantPropagation(tree, code, {
770
- sanitizerMethods,
771
- taintedParameters,
772
- });
773
- // Analyze taint with config
774
- const taint = analyzeTaint(calls, types, baseConfig);
775
- // Add sources for getters that return tainted constructor fields
776
- const getterSources = findGetterSources(types, constPropResult.instanceFieldTaint, code);
777
- taint.sources.push(...getterSources);
778
- // Add sources for JavaScript variable assignments with tainted patterns
779
- const jsAssignmentSources = findJavaScriptAssignmentSources(code, language);
780
- taint.sources.push(...jsAssignmentSources);
781
- // Add sources for Python variable assignments with tainted request patterns
782
- const pythonAssignmentSources = findPythonAssignmentSources(code, language);
783
- taint.sources.push(...pythonAssignmentSources);
784
- // Add sinks for JavaScript DOM XSS patterns (innerHTML, document.write, etc.)
785
- const jsDOMSinks = findJavaScriptDOMSinks(code, language);
786
- for (const domSink of jsDOMSinks) {
787
- // Avoid duplicates
788
- const alreadyExists = taint.sinks.some(s => s.line === domSink.line && s.cwe === domSink.cwe);
789
- if (!alreadyExists) {
790
- taint.sinks.push({
791
- type: 'xss',
792
- cwe: domSink.cwe,
793
- line: domSink.line,
794
- location: domSink.location,
795
- method: domSink.method,
796
- confidence: 1.0,
797
- });
798
- }
799
- }
800
- logger.debug('Initial taint analysis', {
801
- sources: taint.sources.length,
802
- sinks: taint.sinks.length,
803
- sanitizers: taint.sanitizers?.length ?? 0,
804
- getterSources: getterSources.length,
805
- jsDOMSinks: jsDOMSinks.length,
198
+ // Build CodeGraph once shared across all passes.
199
+ // Taint is empty at construction time; sources/sinks/sanitizers are populated by passes.
200
+ const graph = new CodeGraph({
201
+ meta, types, calls, cfg, dfg,
202
+ taint: { sources: [], sinks: [], sanitizers: [] },
203
+ imports, exports, unresolved: [], enriched: {},
806
204
  });
807
- // Filter sinks that are in dead code (unreachable)
808
- taint.sinks = taint.sinks.filter(sink => !constPropResult.unreachableLines.has(sink.line));
809
- // Filter sinks that use clean array elements (strong updates)
810
- taint.sinks = filterCleanArraySinks(taint.sinks, calls, constPropResult.taintedArrayElements, constPropResult.symbols);
811
- // Filter sinks that use variables proven clean by constant propagation (strong updates)
812
- taint.sinks = filterCleanVariableSinks(taint.sinks, calls, constPropResult.tainted, constPropResult.symbols, dfg, constPropResult.sanitizedVars, constPropResult.synchronizedLines);
813
- // Filter sinks that are wrapped by sanitizers on the same line
814
- taint.sinks = filterSanitizedSinks(taint.sinks, taint.sanitizers ?? [], calls);
815
- // Python: reduce XPath false-positives using forward taint propagation +
816
- // apostrophe-guard sanitizer detection; also detect trust boundary violations
817
- // (flask.session[key] = value) which are subscript assignments, not call nodes.
818
- if (language === 'python') {
819
- const pyTaintedVars = buildPythonTaintedVars(code);
820
- const pySanitizedVars = findPythonQuoteSanitizedVars(code);
821
- // Propagate sanitization: if bar is sanitized and query = f"...{bar}...", query is also sanitized
822
- for (const line of code.split('\n')) {
823
- const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
824
- if (!am)
825
- continue;
826
- const [, lhs, rhs] = am;
827
- if ([...pySanitizedVars].some(v => new RegExp(`\\b${v}\\b`).test(rhs))) {
828
- pySanitizedVars.add(lhs);
829
- }
830
- }
831
- // Detect inline .replace() sanitizers: query = f"...{bar.replace('\'', '&apos;')}..."
832
- // The tainted var appears with .replace() in the rhs — treat lhs as XPath-safe
833
- for (const line of code.split('\n')) {
834
- const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
835
- if (!am)
836
- continue;
837
- const [, lhs, rhs] = am;
838
- const hasReplaceOnTainted = [...pyTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\.replace\\s*\\(`).test(rhs));
839
- if (hasReplaceOnTainted)
840
- pySanitizedVars.add(lhs);
841
- }
842
- const pySourceLines = code.split('\n');
843
- // Filter XPath sinks: keep only if a tainted var is used at the sink line
844
- taint.sinks = taint.sinks.filter(sink => {
845
- if (sink.type !== 'xpath_injection')
846
- return true;
847
- const sinkLineText = pySourceLines[sink.line - 1] ?? '';
848
- const taintedVarOnLine = [...pyTaintedVars.keys()].find(v => new RegExp(`\\b${v}\\b`).test(sinkLineText));
849
- if (!taintedVarOnLine)
850
- return false;
851
- if (pySanitizedVars.has(taintedVarOnLine))
852
- return false;
853
- // Suppress parameterized XPath: root.xpath(query, name=bar) where bar is a keyword arg
854
- if (new RegExp(`\\.xpath\\s*\\([^)]*\\b\\w+\\s*=\\s*\\b${taintedVarOnLine}\\b`).test(sinkLineText))
855
- return false;
856
- return true;
857
- });
858
- // Add trust boundary sinks from session subscript assignments
859
- const trustViolations = findPythonTrustBoundaryViolations(code, language, pyTaintedVars);
860
- for (const v of trustViolations) {
861
- const alreadyExists = taint.sinks.some(s => s.line === v.sinkLine && s.type === 'trust_boundary');
862
- if (!alreadyExists) {
863
- taint.sinks.push({
864
- type: 'trust_boundary',
865
- cwe: 'CWE-501',
866
- line: v.sinkLine,
867
- location: `session write at line ${v.sinkLine}`,
868
- confidence: 0.85,
869
- });
870
- }
871
- }
872
- // Add XSS sinks from return/yield statements (Flask/Django routes return HTML directly)
873
- const pyReturnXSS = findPythonReturnXSSSinks(code, language, pyTaintedVars);
874
- for (const r of pyReturnXSS) {
875
- const alreadyExists = taint.sinks.some(s => s.line === r.sinkLine && s.type === 'xss');
876
- if (!alreadyExists) {
877
- taint.sinks.push({
878
- type: 'xss',
879
- cwe: 'CWE-79',
880
- line: r.sinkLine,
881
- location: `return HTML with user input at line ${r.sinkLine}`,
882
- confidence: 0.9,
883
- });
884
- }
885
- }
886
- }
887
- // JavaScript/TypeScript: filter XSS sinks where the argument variable is NOT actually
888
- // tainted by user input (e.g., res.send(stdout) — stdout is a callback param from exec(),
889
- // not a variable derived from req.query/req.body). This prevents FP pairs like:
890
- // CWE-78 (correct) + CWE-79 (spurious) for the same source when the cmd output is sent.
891
- if (['javascript', 'typescript'].includes(language)) {
892
- const jsTaintedVars = buildJavaScriptTaintedVars(code, language);
893
- if (jsTaintedVars.size > 0) {
894
- const jsSourceLines = code.split('\n');
895
- taint.sinks = taint.sinks.filter(sink => {
896
- if (sink.type !== 'xss')
897
- return true;
898
- const sinkLineText = jsSourceLines[sink.line - 1] ?? '';
899
- // Keep if any known-tainted variable appears on this sink line
900
- if ([...jsTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\b`).test(sinkLineText)))
901
- return true;
902
- // Also keep if the sink line directly references a taint source (inline use, no assignment)
903
- if (JS_TAINTED_PATTERNS.some(p => p.pattern.test(sinkLineText)))
904
- return true;
905
- return false;
906
- });
907
- }
908
- }
909
- // Propagate taint through dataflow to find verified flows
910
- if (taint.sources.length > 0 && taint.sinks.length > 0) {
911
- const propagationResult = propagateTaint(dfg, calls, taint.sources, taint.sinks, taint.sanitizers ?? []);
912
- // Filter flows using constant propagation (eliminate false positives)
913
- const verifiedFlows = propagationResult.flows.filter(flow => {
914
- // Check if the sink line is in dead code
915
- if (constPropResult.unreachableLines.has(flow.sink.line)) {
916
- return false;
917
- }
918
- // Check each step in the path - if any variable has a constant value, skip
919
- for (const step of flow.path) {
920
- const fpCheck = isFalsePositive(constPropResult, step.line, step.variable);
921
- if (fpCheck.isFalsePositive) {
922
- return false;
923
- }
924
- }
925
- // Check for correlated predicates: if the sink is under condition !C
926
- // and the taint was added under condition C, they're mutually exclusive
927
- if (isCorrelatedPredicateFP(constPropResult, flow)) {
928
- return false;
929
- }
930
- return true;
931
- });
932
- // Convert flows to TaintFlowInfo format
933
- taint.flows = verifiedFlows.map(flow => ({
934
- source_line: flow.source.line,
935
- sink_line: flow.sink.line,
936
- source_type: flow.source.type,
937
- sink_type: flow.sink.type,
938
- path: flow.path.map(step => ({
939
- variable: step.variable,
940
- line: step.line,
941
- type: step.type,
942
- })),
943
- confidence: flow.confidence,
944
- sanitized: flow.sanitized,
945
- }));
946
- // Add array element flows that DFG-based analysis might miss
947
- const arrayFlows = detectArrayElementFlows(calls, taint.sources, taint.sinks, constPropResult.taintedArrayElements, constPropResult.unreachableLines);
948
- if (arrayFlows && arrayFlows.length > 0) {
949
- if (!taint.flows) {
950
- taint.flows = [];
951
- }
952
- for (const flow of arrayFlows) {
953
- // Avoid duplicates
954
- if (!taint.flows.some(f => f.source_line === flow.source_line && f.sink_line === flow.sink_line)) {
955
- taint.flows.push(flow);
956
- }
957
- }
958
- }
959
- // Add collection/iterator flows that DFG-based analysis might miss
960
- const collectionFlows = detectCollectionFlows(calls, taint.sources, taint.sinks, constPropResult.tainted, constPropResult.unreachableLines);
961
- if (collectionFlows && collectionFlows.length > 0) {
962
- if (!taint.flows) {
963
- taint.flows = [];
964
- }
965
- for (const flow of collectionFlows) {
966
- // Avoid duplicates
967
- if (taint.flows.some(f => f.source_line === flow.source_line && f.sink_line === flow.sink_line)) {
968
- continue;
969
- }
970
- // Apply the same filtering as DFG-based flows
971
- const flowForCheck = {
972
- source: { line: flow.source_line, type: flow.source_type },
973
- sink: { line: flow.sink_line, type: flow.sink_type },
974
- path: flow.path.map(p => ({ variable: p.variable, line: p.line })),
975
- };
976
- // eslint-disable-next-line @typescript-eslint/no-explicit-any
977
- if (isCorrelatedPredicateFP(constPropResult, flowForCheck)) {
978
- continue;
979
- }
980
- // Check if any step in the path is a false positive
981
- let isFP = false;
982
- for (const step of flow.path) {
983
- const fpCheck = isFalsePositive(constPropResult, step.line, step.variable);
984
- if (fpCheck.isFalsePositive) {
985
- isFP = true;
986
- break;
987
- }
988
- }
989
- if (isFP) {
990
- continue;
991
- }
992
- taint.flows.push(flow);
993
- }
994
- }
995
- // Add direct parameter-to-sink flows that DFG might miss
996
- const paramFlows = detectParameterSinkFlows(types, calls, taint.sources, taint.sinks, constPropResult.unreachableLines);
997
- if (paramFlows && paramFlows.length > 0) {
998
- if (!taint.flows) {
999
- taint.flows = [];
1000
- }
1001
- for (const flow of paramFlows) {
1002
- // Avoid duplicates
1003
- if (!taint.flows.some(f => f.source_line === flow.source_line && f.sink_line === flow.sink_line)) {
1004
- taint.flows.push(flow);
1005
- }
1006
- }
1007
- }
1008
- // Perform inter-procedural analysis
1009
- const interProc = analyzeInterprocedural(types, calls, dfg, taint.sources, taint.sinks, taint.sanitizers ?? [], {
1010
- taintedVariables: constPropResult.tainted,
1011
- });
1012
- // Add inter-procedural sinks to the taint sinks and generate flows
1013
- // Skip external_taint_escape (CWE-668) here: they are only used as a last resort
1014
- // in the fallback path below when no other sinks exist. Adding them when proper sinks
1015
- // already exist creates duplicate/spurious findings (e.g., http.get already reported
1016
- // as CWE-918 SSRF; also getting CWE-668 for the same call chain is a FP).
1017
- for (const sink of interProc.propagatedSinks) {
1018
- if (sink.type === 'external_taint_escape')
1019
- continue;
1020
- if (!taint.sinks.some(s => s.line === sink.line)) {
1021
- taint.sinks.push(sink);
1022
- }
1023
- }
1024
- // Generate flows for inter-procedural propagated sinks
1025
- // These sinks are inside called methods where tainted args were passed
1026
- if (interProc.propagatedSinks.length > 0 && taint.sources.length > 0) {
1027
- if (!taint.flows) {
1028
- taint.flows = [];
1029
- }
1030
- // Build set of sanitizer method names to skip (methods with @sanitizer annotation)
1031
- const sanitizerMethodNames = new Set();
1032
- for (const san of taint.sanitizers ?? []) {
1033
- if (san.type === 'javadoc_sanitizer') {
1034
- // Extract method name from "methodName()" format
1035
- const match = san.method.match(/^(\w+)\(\)$/);
1036
- if (match)
1037
- sanitizerMethodNames.add(match[1]);
1038
- else
1039
- sanitizerMethodNames.add(san.method);
1040
- }
1041
- }
1042
- for (const sink of interProc.propagatedSinks) {
1043
- // Skip external taint escape sinks (not real vulnerability sinks)
1044
- if (sink.type === 'external_taint_escape')
1045
- continue;
1046
- // Find which call edge brought taint to this sink's method
1047
- for (const edge of interProc.callEdges) {
1048
- if (!interProc.taintedMethods.has(edge.calleeMethod))
1049
- continue;
1050
- const method = interProc.methodNodes.get(edge.calleeMethod);
1051
- if (!method)
1052
- continue;
1053
- if (sink.line < method.startLine || sink.line > method.endLine)
1054
- continue;
1055
- // Skip sinks inside sanitizer methods (@sanitizer annotation)
1056
- if (sanitizerMethodNames.has(method.name))
1057
- continue;
1058
- // Find the source connected to this call
1059
- for (const source of taint.sources) {
1060
- // Source should be in the caller's scope, at or before the call line
1061
- if (source.line > edge.callLine)
1062
- continue;
1063
- // Skip low-confidence interprocedural_param sources
1064
- if (source.type === 'interprocedural_param' && source.confidence < 0.6)
1065
- continue;
1066
- if (taint.flows.some(f => f.source_line === source.line && f.sink_line === sink.line))
1067
- continue;
1068
- taint.flows.push({
1069
- source_line: source.line,
1070
- sink_line: sink.line,
1071
- source_type: source.type,
1072
- sink_type: sink.type,
1073
- path: [
1074
- { variable: source.location, line: source.line, type: 'source' },
1075
- { variable: `call to ${method.name}()`, line: edge.callLine, type: 'use' },
1076
- { variable: sink.location, line: sink.line, type: 'sink' },
1077
- ],
1078
- confidence: sink.confidence * source.confidence * 0.85,
1079
- sanitized: false,
1080
- });
1081
- break; // One source per sink is enough
1082
- }
1083
- break; // One call edge per sink is enough
1084
- }
1085
- }
1086
- }
1087
- // Build inter-procedural info
1088
- const taintBridges = findTaintBridges(interProc);
1089
- taint.interprocedural = {
1090
- tainted_methods: Array.from(interProc.taintedMethods),
1091
- taint_bridges: taintBridges,
1092
- method_flows: interProc.callEdges
1093
- .filter(edge => interProc.taintedMethods.has(edge.calleeMethod))
1094
- .map(edge => ({
1095
- caller: edge.callerMethod,
1096
- callee: edge.calleeMethod,
1097
- call_line: edge.callLine,
1098
- tainted_args: edge.taintedArgs,
1099
- returns_taint: interProc.taintedReturns.has(edge.calleeMethod),
1100
- })),
1101
- };
1102
- }
1103
- // Perform inter-procedural analysis even when no initial sinks (can detect external taint escapes)
1104
- if (taint.sources.length > 0 && taint.sinks.length === 0) {
1105
- const interProc = analyzeInterprocedural(types, calls, dfg, taint.sources, [], // No initial sinks
1106
- taint.sanitizers ?? [], {
1107
- taintedVariables: constPropResult.tainted,
1108
- });
1109
- // Add inter-procedural sinks (e.g., external_taint_escape)
1110
- for (const sink of interProc.propagatedSinks) {
1111
- if (!constPropResult.unreachableLines.has(sink.line) &&
1112
- !taint.sinks.some(s => s.line === sink.line)) {
1113
- taint.sinks.push(sink);
1114
- }
1115
- }
1116
- // Build inter-procedural info
1117
- if (interProc.taintedMethods.size > 0 || interProc.propagatedSinks.length > 0) {
1118
- const taintBridges = findTaintBridges(interProc);
1119
- taint.interprocedural = {
1120
- tainted_methods: Array.from(interProc.taintedMethods),
1121
- taint_bridges: taintBridges,
1122
- method_flows: interProc.callEdges
1123
- .filter(edge => interProc.taintedMethods.has(edge.calleeMethod))
1124
- .map(edge => ({
1125
- caller: edge.callerMethod,
1126
- callee: edge.calleeMethod,
1127
- call_line: edge.callLine,
1128
- tainted_args: edge.taintedArgs,
1129
- returns_taint: interProc.taintedReturns.has(edge.calleeMethod),
1130
- })),
1131
- };
1132
- }
1133
- // If we found new sinks, create flows from sources
1134
- if (taint.sinks.length > 0) {
1135
- taint.flows = taint.sinks.map(sink => ({
1136
- source_line: taint.sources[0].line,
1137
- sink_line: sink.line,
1138
- source_type: taint.sources[0].type,
1139
- sink_type: sink.type,
1140
- path: [
1141
- { variable: 'input', line: taint.sources[0].line, type: 'source' },
1142
- { variable: 'input', line: sink.line, type: 'sink' },
1143
- ],
1144
- confidence: taint.sources[0].confidence * sink.confidence,
1145
- sanitized: false,
1146
- }));
1147
- }
1148
- }
1149
- // Detect unresolved items
205
+ const config = options.taintConfig ?? getDefaultConfig();
206
+ // Run the analysis pipeline
207
+ const { results, findings } = new AnalysisPipeline()
208
+ .add(new TaintMatcherPass())
209
+ .add(new ConstantPropagationPass(tree))
210
+ .add(new LanguageSourcesPass())
211
+ .add(new SinkFilterPass())
212
+ .add(new TaintPropagationPass())
213
+ .add(new InterproceduralPass())
214
+ .add(new DeadCodePass())
215
+ .add(new MissingAwaitPass())
216
+ .add(new NPlusOnePass())
217
+ .add(new MissingPublicDocPass())
218
+ .add(new TodoInProdPass())
219
+ .add(new StringConcatLoopPass())
220
+ .add(new SyncIoAsyncPass())
221
+ .add(new UncheckedReturnPass())
222
+ .add(new NullDerefPass())
223
+ .add(new ResourceLeakPass())
224
+ .add(new VariableShadowingPass())
225
+ .add(new LeakedGlobalPass())
226
+ .add(new UnusedVariablePass())
227
+ .add(new DependencyFanOutPass())
228
+ .add(new StaleDocRefPass())
229
+ .run(graph, code, language, config);
230
+ const sinkFilter = results.get('sink-filter');
231
+ const interProc = results.get('interprocedural');
232
+ const taint = {
233
+ sources: sinkFilter.sources,
234
+ sinks: [...sinkFilter.sinks, ...interProc.additionalSinks],
235
+ sanitizers: sinkFilter.sanitizers,
236
+ flows: interProc.additionalFlows,
237
+ interprocedural: interProc.interprocedural,
238
+ };
1150
239
  const unresolved = detectUnresolved(calls, types, dfg);
1151
- // Build enriched section
1152
240
  const enriched = buildEnriched(types, calls, taint.sources, taint.sinks);
241
+ // Compute software metrics (CK suite, Halstead, composite scores)
242
+ const metricValues = new MetricRunner().run({ meta, types, calls, cfg, dfg, taint, imports, exports, unresolved, enriched }, code, language);
1153
243
  logger.debug('Analysis complete', {
1154
244
  filePath,
1155
245
  finalSources: taint.sources.length,
@@ -1158,18 +248,14 @@ export async function analyze(code, filePath, language, options = {}) {
1158
248
  unresolvedItems: unresolved.length,
1159
249
  });
1160
250
  return {
1161
- meta,
1162
- types,
1163
- calls,
1164
- cfg,
1165
- dfg,
1166
- taint,
1167
- imports,
1168
- exports,
1169
- unresolved,
1170
- enriched,
251
+ meta, types, calls, cfg, dfg, taint, imports, exports, unresolved, enriched,
252
+ findings: findings.length > 0 ? findings : undefined,
253
+ metrics: { file: filePath, metrics: metricValues },
1171
254
  };
1172
255
  }
256
+ // ---------------------------------------------------------------------------
257
+ // Simplified API response format
258
+ // ---------------------------------------------------------------------------
1173
259
  /**
1174
260
  * Analyze code and return a simplified API response format.
1175
261
  */
@@ -1182,39 +268,7 @@ export async function analyzeForAPI(code, filePath, language, options = {}) {
1182
268
  const tree = await parse(code, language);
1183
269
  const parseTime = performance.now() - parseStart;
1184
270
  const analysisStart = performance.now();
1185
- // Collect all node types in a single traversal for better performance
1186
- const isJavaScript = language === 'javascript' || language === 'typescript';
1187
- const isRust = language === 'rust';
1188
- const isPython = language === 'python';
1189
- let nodeTypesToCollect;
1190
- if (isRust) {
1191
- nodeTypesToCollect = new Set([
1192
- 'call_expression', 'macro_invocation', 'function_item', 'struct_item',
1193
- 'impl_item', 'enum_item', 'trait_item', 'mod_item', 'use_declaration',
1194
- 'let_declaration', 'field_expression', 'scoped_identifier',
1195
- ]);
1196
- }
1197
- else if (isPython) {
1198
- nodeTypesToCollect = new Set([
1199
- 'call', 'function_definition', 'class_definition', 'import_statement',
1200
- 'import_from_statement', 'assignment', 'attribute', 'subscript',
1201
- ]);
1202
- }
1203
- else if (isJavaScript) {
1204
- nodeTypesToCollect = new Set([
1205
- 'call_expression', 'new_expression', 'class_declaration', 'function_declaration',
1206
- 'arrow_function', 'method_definition', 'variable_declaration', 'lexical_declaration',
1207
- 'import_statement', 'export_statement',
1208
- ]);
1209
- }
1210
- else {
1211
- nodeTypesToCollect = new Set([
1212
- 'method_invocation', 'object_creation_expression', 'class_declaration',
1213
- 'method_declaration', 'field_declaration', 'import_declaration',
1214
- 'interface_declaration', 'enum_declaration',
1215
- ]);
1216
- }
1217
- const nodeCache = collectAllNodes(tree.rootNode, nodeTypesToCollect);
271
+ const nodeCache = collectAllNodes(tree.rootNode, getNodeTypesForLanguage(language));
1218
272
  const types = extractTypes(tree, nodeCache, language);
1219
273
  const calls = extractCalls(tree, nodeCache, language);
1220
274
  // Run constant propagation
@@ -1232,40 +286,17 @@ export async function analyzeForAPI(code, filePath, language, options = {}) {
1232
286
  let pythonTaintedVars = new Map();
1233
287
  if (language === 'python') {
1234
288
  pythonTaintedVars = buildPythonTaintedVars(code);
1235
- const pythonSanitizedVars = findPythonQuoteSanitizedVars(code);
1236
- // Propagate sanitization: if bar is sanitized and query = f"...{bar}...", query is also sanitized
1237
- for (const line of code.split('\n')) {
1238
- const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
1239
- if (!am)
1240
- continue;
1241
- const [, lhs, rhs] = am;
1242
- if ([...pythonSanitizedVars].some(v => new RegExp(`\\b${v}\\b`).test(rhs))) {
1243
- pythonSanitizedVars.add(lhs);
1244
- }
1245
- }
1246
- // Detect inline .replace() sanitizers: query = f"...{bar.replace('\'', '&apos;')}..."
1247
- for (const line of code.split('\n')) {
1248
- const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
1249
- if (!am)
1250
- continue;
1251
- const [, lhs, rhs] = am;
1252
- const hasReplaceOnTainted = [...pythonTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\.replace\\s*\\(`).test(rhs));
1253
- if (hasReplaceOnTainted)
1254
- pythonSanitizedVars.add(lhs);
1255
- }
289
+ const pythonSanitizedVars = buildPythonSanitizedVars(code, pythonTaintedVars);
1256
290
  const sourceLines = code.split('\n');
1257
291
  filteredSinks = filteredSinks.filter(sink => {
1258
292
  if (sink.type !== 'xpath_injection')
1259
293
  return true;
1260
- // Keep XPath sink only if a tainted variable is used at the sink line
1261
294
  const sinkLineText = sourceLines[sink.line - 1] ?? '';
1262
295
  const taintedVarOnLine = [...pythonTaintedVars.keys()].find(v => new RegExp(`\\b${v}\\b`).test(sinkLineText));
1263
296
  if (!taintedVarOnLine)
1264
297
  return false;
1265
- // Kill if the variable is protected by an apostrophe guard
1266
298
  if (pythonSanitizedVars.has(taintedVarOnLine))
1267
299
  return false;
1268
- // Suppress parameterized XPath: root.xpath(query, name=bar) where bar is a keyword arg
1269
300
  if (new RegExp(`\\.xpath\\s*\\([^)]*\\b\\w+\\s*=\\s*\\b${taintedVarOnLine}\\b`).test(sinkLineText))
1270
301
  return false;
1271
302
  return true;
@@ -1275,9 +306,8 @@ export async function analyzeForAPI(code, filePath, language, options = {}) {
1275
306
  const vulnerabilities = findVulnerabilities(taint.sources, filteredSinks, calls, constPropResult);
1276
307
  // Python: detect trust boundary violations (flask.session[key] = taintedVal)
1277
308
  if (language === 'python') {
1278
- const trustViolations = findPythonTrustBoundaryViolations(code, language, pythonTaintedVars);
309
+ const trustViolations = findPythonTrustBoundaryViolations(code, pythonTaintedVars);
1279
310
  for (const v of trustViolations) {
1280
- // Avoid duplicate: only add if no existing vulnerability for same sink line
1281
311
  const alreadyReported = vulnerabilities.some(existing => existing.sink.line === v.sinkLine && existing.type === 'trust_boundary');
1282
312
  if (!alreadyReported) {
1283
313
  vulnerabilities.push({
@@ -1307,6 +337,9 @@ export async function analyzeForAPI(code, filePath, language, options = {}) {
1307
337
  },
1308
338
  };
1309
339
  }
340
+ // ---------------------------------------------------------------------------
341
+ // Vulnerability matching (used by analyzeForAPI)
342
+ // ---------------------------------------------------------------------------
1310
343
  /**
1311
344
  * Find potential vulnerabilities by matching sources to sinks.
1312
345
  */
@@ -1409,363 +442,9 @@ function calculateVulnConfidence(source, sink) {
1409
442
  confidence = confidence * sink.confidence;
1410
443
  return Math.min(confidence, 1.0);
1411
444
  }
1412
- function evaluateSimpleExpression(expr, symbols) {
1413
- let evaluated = expr;
1414
- for (const [name, val] of symbols) {
1415
- if (val.type === 'int' || val.type === 'float') {
1416
- const regex = new RegExp(`\\b${name}\\b`, 'g');
1417
- evaluated = evaluated.replace(regex, String(val.value));
1418
- }
1419
- }
1420
- try {
1421
- if (/^[\d\s+\-*/().]+$/.test(evaluated)) {
1422
- const result = Function('"use strict"; return (' + evaluated + ')')();
1423
- if (typeof result === 'number' && !isNaN(result)) {
1424
- return String(Math.floor(result));
1425
- }
1426
- }
1427
- }
1428
- catch {
1429
- // Evaluation failed
1430
- }
1431
- return expr;
1432
- }
1433
- function filterCleanArraySinks(sinks, calls, taintedArrayElements, symbols) {
1434
- const callsByLine = new Map();
1435
- for (const call of calls) {
1436
- const existing = callsByLine.get(call.location.line) ?? [];
1437
- existing.push(call);
1438
- callsByLine.set(call.location.line, existing);
1439
- }
1440
- return sinks.filter(sink => {
1441
- const callsAtSink = callsByLine.get(sink.line) ?? [];
1442
- for (const call of callsAtSink) {
1443
- for (const arg of call.arguments) {
1444
- const arrayAccessMatch = arg.expression?.match(/^(\w+)\[(\d+|[^[\]]+)\]$/);
1445
- if (arrayAccessMatch) {
1446
- const arrayName = arrayAccessMatch[1];
1447
- let indexStr = arrayAccessMatch[2];
1448
- indexStr = evaluateSimpleExpression(indexStr, symbols);
1449
- const taintedIndices = taintedArrayElements.get(arrayName);
1450
- if (taintedIndices !== undefined) {
1451
- const isTainted = taintedIndices.has(indexStr) || taintedIndices.has('*');
1452
- if (!isTainted) {
1453
- return false;
1454
- }
1455
- }
1456
- }
1457
- }
1458
- }
1459
- return true;
1460
- });
1461
- }
1462
- function filterCleanVariableSinks(sinks, calls, taintedVars, symbols, dfg, sanitizedVars, synchronizedLines) {
1463
- const fieldNames = new Set();
1464
- if (dfg) {
1465
- for (const def of dfg.defs) {
1466
- if (def.kind === 'field') {
1467
- fieldNames.add(def.variable);
1468
- }
1469
- }
1470
- }
1471
- const callsByLine = new Map();
1472
- for (const call of calls) {
1473
- const existing = callsByLine.get(call.location.line) ?? [];
1474
- existing.push(call);
1475
- callsByLine.set(call.location.line, existing);
1476
- }
1477
- return sinks.filter(sink => {
1478
- const callsAtSink = callsByLine.get(sink.line) ?? [];
1479
- const isInSynchronizedBlock = synchronizedLines?.has(sink.line) ?? false;
1480
- for (const call of callsAtSink) {
1481
- let allArgsAreClean = true;
1482
- const methodName = call.in_method;
1483
- for (const arg of call.arguments) {
1484
- if (arg.variable && !arg.expression?.includes('[')) {
1485
- const varName = arg.variable;
1486
- const scopedName = methodName ? `${methodName}:${varName}` : varName;
1487
- if (fieldNames.has(varName) && !isInSynchronizedBlock) {
1488
- allArgsAreClean = false;
1489
- continue;
1490
- }
1491
- if (sanitizedVars?.has(scopedName) || sanitizedVars?.has(varName)) {
1492
- continue;
1493
- }
1494
- if (taintedVars.has(scopedName) || taintedVars.has(varName)) {
1495
- allArgsAreClean = false;
1496
- continue;
1497
- }
1498
- const symbolValue = symbols.get(scopedName) ?? symbols.get(varName);
1499
- if (symbolValue && symbolValue.type !== 'unknown') {
1500
- continue;
1501
- }
1502
- allArgsAreClean = false;
1503
- }
1504
- else {
1505
- // Check if the argument is a pure literal (string, number, boolean, etc.)
1506
- // Literals are inherently clean — they can't carry tainted data.
1507
- if (arg.literal != null) {
1508
- continue;
1509
- }
1510
- // Also check if the expression is a quoted string literal without variable interpolation
1511
- if (arg.expression && !arg.variable && isStringLiteralExpression(arg.expression)) {
1512
- continue;
1513
- }
1514
- allArgsAreClean = false;
1515
- }
1516
- }
1517
- if (allArgsAreClean && call.arguments.length > 0) {
1518
- return false;
1519
- }
1520
- }
1521
- return true;
1522
- });
1523
- }
1524
- function isStringLiteralExpression(expr) {
1525
- const trimmed = expr.trim();
1526
- return (trimmed.startsWith('"') && trimmed.endsWith('"')) ||
1527
- (trimmed.startsWith("'") && trimmed.endsWith("'"));
1528
- }
1529
- function filterSanitizedSinks(sinks, sanitizers, calls) {
1530
- if (!sanitizers || sanitizers.length === 0) {
1531
- return sinks;
1532
- }
1533
- const sanitizersByLine = new Map();
1534
- for (const san of sanitizers) {
1535
- const existing = sanitizersByLine.get(san.line) ?? [];
1536
- existing.push(san);
1537
- sanitizersByLine.set(san.line, existing);
1538
- }
1539
- const callsByLine = new Map();
1540
- for (const call of calls) {
1541
- const existing = callsByLine.get(call.location.line) ?? [];
1542
- existing.push(call);
1543
- callsByLine.set(call.location.line, existing);
1544
- }
1545
- return sinks.filter(sink => {
1546
- const lineSanitizers = sanitizersByLine.get(sink.line);
1547
- if (!lineSanitizers || lineSanitizers.length === 0) {
1548
- return true;
1549
- }
1550
- for (const san of lineSanitizers) {
1551
- if (san.sanitizes.includes(sink.type)) {
1552
- const lineCalls = callsByLine.get(sink.line) ?? [];
1553
- for (const call of lineCalls) {
1554
- for (const arg of call.arguments) {
1555
- const expr = arg.expression || '';
1556
- const sanMethodMatch = san.method.match(/(?:(\w+)\.)?(\w+)\(\)/);
1557
- if (sanMethodMatch) {
1558
- const sanMethodName = sanMethodMatch[2];
1559
- const sanClassName = sanMethodMatch[1];
1560
- if (sanClassName) {
1561
- if (expr.includes(`${sanClassName}.${sanMethodName}(`)) {
1562
- return false;
1563
- }
1564
- }
1565
- else if (expr.includes(`${sanMethodName}(`)) {
1566
- return false;
1567
- }
1568
- }
1569
- }
1570
- }
1571
- }
1572
- }
1573
- return true;
1574
- });
1575
- }
1576
- function detectCollectionFlows(calls, sources, sinks, taintedVars, unreachableLines) {
1577
- const flows = [];
1578
- const callsByLine = new Map();
1579
- for (const call of calls) {
1580
- const existing = callsByLine.get(call.location.line) ?? [];
1581
- existing.push(call);
1582
- callsByLine.set(call.location.line, existing);
1583
- }
1584
- for (const sink of sinks) {
1585
- if (unreachableLines.has(sink.line)) {
1586
- continue;
1587
- }
1588
- const callsAtSink = callsByLine.get(sink.line) ?? [];
1589
- for (const call of callsAtSink) {
1590
- for (const arg of call.arguments) {
1591
- if (arg.variable) {
1592
- const varName = arg.variable;
1593
- const scopedName = call.in_method ? `${call.in_method}:${varName}` : varName;
1594
- if (taintedVars.has(varName) || taintedVars.has(scopedName)) {
1595
- const source = sources[0];
1596
- if (source) {
1597
- flows.push({
1598
- source_line: source.line,
1599
- sink_line: sink.line,
1600
- source_type: source.type,
1601
- sink_type: sink.type,
1602
- path: [
1603
- { variable: varName, line: source.line, type: 'source' },
1604
- { variable: varName, line: sink.line, type: 'sink' },
1605
- ],
1606
- confidence: 0.8,
1607
- sanitized: false,
1608
- });
1609
- }
1610
- }
1611
- }
1612
- if (arg.expression) {
1613
- const expr = arg.expression;
1614
- const collectionMethods = ['getLast', 'getFirst', 'get', 'next', 'poll', 'peek', 'toArray'];
1615
- for (const method of collectionMethods) {
1616
- const methodPattern = new RegExp(`(\\w+)\\.${method}\\(`);
1617
- const match = expr.match(methodPattern);
1618
- if (match) {
1619
- const collectionVar = match[1];
1620
- const scopedCollection = call.in_method ? `${call.in_method}:${collectionVar}` : collectionVar;
1621
- if (taintedVars.has(collectionVar) || taintedVars.has(scopedCollection)) {
1622
- const source = sources[0];
1623
- if (source) {
1624
- flows.push({
1625
- source_line: source.line,
1626
- sink_line: sink.line,
1627
- source_type: source.type,
1628
- sink_type: sink.type,
1629
- path: [
1630
- { variable: collectionVar, line: source.line, type: 'source' },
1631
- { variable: collectionVar, line: sink.line, type: 'sink' },
1632
- ],
1633
- confidence: 0.75,
1634
- sanitized: false,
1635
- });
1636
- }
1637
- }
1638
- }
1639
- }
1640
- }
1641
- }
1642
- }
1643
- }
1644
- return flows;
1645
- }
1646
- function detectArrayElementFlows(calls, sources, sinks, taintedArrayElements, unreachableLines) {
1647
- const flows = [];
1648
- const callsByLine = new Map();
1649
- for (const call of calls) {
1650
- const existing = callsByLine.get(call.location.line) ?? [];
1651
- existing.push(call);
1652
- callsByLine.set(call.location.line, existing);
1653
- }
1654
- for (const sink of sinks) {
1655
- if (unreachableLines.has(sink.line)) {
1656
- continue;
1657
- }
1658
- const callsAtSink = callsByLine.get(sink.line) ?? [];
1659
- for (const call of callsAtSink) {
1660
- for (const arg of call.arguments) {
1661
- const arrayAccessMatch = arg.expression?.match(/^(\w+)\[(\d+|[^[\]]+)\]$/);
1662
- if (arrayAccessMatch) {
1663
- const arrayName = arrayAccessMatch[1];
1664
- const indexStr = arrayAccessMatch[2];
1665
- const taintedIndices = taintedArrayElements.get(arrayName);
1666
- if (taintedIndices) {
1667
- const isTainted = taintedIndices.has(indexStr) || taintedIndices.has('*');
1668
- if (isTainted) {
1669
- const source = sources[0];
1670
- if (source) {
1671
- flows.push({
1672
- source_line: source.line,
1673
- sink_line: sink.line,
1674
- source_type: source.type,
1675
- sink_type: sink.type,
1676
- path: [
1677
- { variable: arrayName, line: source.line, type: 'source' },
1678
- { variable: `${arrayName}[${indexStr}]`, line: sink.line, type: 'sink' },
1679
- ],
1680
- confidence: 0.85,
1681
- sanitized: false,
1682
- });
1683
- }
1684
- }
1685
- }
1686
- }
1687
- }
1688
- }
1689
- }
1690
- return flows;
1691
- }
1692
- /**
1693
- * Detect direct method parameter to sink flows.
1694
- * This handles cases where a tainted method parameter is directly used in a sink
1695
- * without intermediate variable assignments (which DFG chains might miss).
1696
- */
1697
- function detectParameterSinkFlows(types, calls, sources, sinks, unreachableLines) {
1698
- const flows = [];
1699
- // Build a map of method name -> parameter sources
1700
- const paramSourcesByMethod = new Map();
1701
- for (const source of sources) {
1702
- if (source.type === 'interprocedural_param') {
1703
- // Extract method and param name from location like "String paramName in methodName"
1704
- const match = source.location.match(/(\S+)\s+(\S+)\s+in\s+(\S+)/);
1705
- if (match) {
1706
- const paramName = match[2];
1707
- const methodName = match[3];
1708
- let methodParams = paramSourcesByMethod.get(methodName);
1709
- if (!methodParams) {
1710
- methodParams = new Map();
1711
- paramSourcesByMethod.set(methodName, methodParams);
1712
- }
1713
- methodParams.set(paramName, source);
1714
- }
1715
- }
1716
- }
1717
- if (paramSourcesByMethod.size === 0) {
1718
- return flows;
1719
- }
1720
- // Build map of calls by line
1721
- const callsByLine = new Map();
1722
- for (const call of calls) {
1723
- const existing = callsByLine.get(call.location.line) ?? [];
1724
- existing.push(call);
1725
- callsByLine.set(call.location.line, existing);
1726
- }
1727
- // For each sink, check if it uses a tainted parameter directly
1728
- for (const sink of sinks) {
1729
- if (unreachableLines.has(sink.line)) {
1730
- continue;
1731
- }
1732
- const callsAtSink = callsByLine.get(sink.line) ?? [];
1733
- for (const call of callsAtSink) {
1734
- const methodName = call.in_method;
1735
- if (!methodName)
1736
- continue;
1737
- const methodParamSources = paramSourcesByMethod.get(methodName);
1738
- if (!methodParamSources)
1739
- continue;
1740
- // Check if any argument is a tainted parameter
1741
- for (const arg of call.arguments) {
1742
- if (arg.variable) {
1743
- const paramSource = methodParamSources.get(arg.variable);
1744
- if (paramSource) {
1745
- // Found a direct parameter-to-sink flow
1746
- // Check if we already have this flow
1747
- const exists = flows.some(f => f.source_line === paramSource.line && f.sink_line === sink.line);
1748
- if (!exists) {
1749
- flows.push({
1750
- source_line: paramSource.line,
1751
- sink_line: sink.line,
1752
- source_type: paramSource.type,
1753
- sink_type: sink.type,
1754
- path: [
1755
- { variable: arg.variable, line: paramSource.line, type: 'source' },
1756
- { variable: arg.variable, line: sink.line, type: 'sink' },
1757
- ],
1758
- confidence: 0.75, // Lower confidence for interprocedural
1759
- sanitized: false,
1760
- });
1761
- }
1762
- }
1763
- }
1764
- }
1765
- }
1766
- }
1767
- return flows;
1768
- }
445
+ // ---------------------------------------------------------------------------
446
+ // Lifecycle
447
+ // ---------------------------------------------------------------------------
1769
448
  /**
1770
449
  * Check if the analyzer is initialized.
1771
450
  */
@@ -1778,4 +457,83 @@ export function isAnalyzerInitialized() {
1778
457
  export function resetAnalyzer() {
1779
458
  initialized = false;
1780
459
  }
460
+ // ---------------------------------------------------------------------------
461
+ // Project-level analysis (multi-file)
462
+ // ---------------------------------------------------------------------------
463
/**
 * Analyze a set of files as a project, finding cross-file taint flows.
 *
 * Runs single-file `analyze()` on each file sequentially, in input order,
 * then runs `CrossFilePass` over a `ProjectGraph` built from the per-file
 * results to surface flows that cross file boundaries. Circular-dependency
 * and orphan-module findings are computed from an `ImportGraph` and appended
 * to the `findings` array of the per-file result whose path matches
 * `finding.file`; findings whose file was not part of `files` are dropped.
 *
 * The top-level `findings` of the returned object is always empty — it
 * requires LLM enrichment which is out of scope for this library
 * (see CLAUDE.md and SPEC.md section 11).
 *
 * @param {Array<{code: string, filePath: string, language: string}>} files
 *   Files to analyze. `meta.language` is taken from the first entry and
 *   falls back to `'java'` when the list is empty.
 * @param {object} [options] Forwarded unchanged to every `analyze()` call.
 * @returns {Promise<object>} Object with `meta`, `files`, `type_hierarchy`,
 *   `cross_file_calls`, `taint_paths` and `findings`.
 */
export async function analyzeProject(files, options = {}) {
    const fileAnalyses = [];
    // Path -> per-file entry, so project-level findings can be attached
    // without a linear search over fileAnalyses for every finding.
    const analysisByFile = new Map();
    const projectGraph = new ProjectGraph();
    const sourceLinesByFile = new Map();
    // 1. Per-file analysis
    for (const { code, filePath, language } of files) {
        const ir = await analyze(code, filePath, language, options);
        const entry = { file: filePath, analysis: ir };
        fileAnalyses.push(entry);
        // Keep the first entry for a duplicated path: this matches what a
        // front-to-back search of fileAnalyses would have returned.
        if (!analysisByFile.has(filePath)) {
            analysisByFile.set(filePath, entry);
        }
        projectGraph.addFile(filePath, new CodeGraph(ir));
        sourceLinesByFile.set(filePath, code.split('\n'));
    }
    // 2. Cross-file analysis
    const crossFileResult = new CrossFilePass().run(projectGraph, sourceLinesByFile);
    // 3. Import-graph analysis (circular deps + orphan modules)
    const importGraph = new ImportGraph(projectGraph);
    const circularFindings = new CircularDependencyPass().run(projectGraph, importGraph);
    const orphanFindings = new OrphanModulePass().run(projectGraph, importGraph);
    // Attach project-level findings to the appropriate per-file CircleIR.findings
    for (const finding of [...circularFindings, ...orphanFindings]) {
        const fa = analysisByFile.get(finding.file);
        if (fa) {
            // Build a new array rather than pushing, so an array shared with
            // the single-file result is never mutated in place.
            fa.analysis.findings = [...(fa.analysis.findings ?? []), finding];
        }
    }
    // 4. Assemble ProjectMeta
    const filePaths = files.map(f => f.filePath);
    const totalLoc = fileAnalyses.reduce((sum, f) => sum + (f.analysis.meta.loc ?? 0), 0);
    const meta = {
        name: deriveProjectName(filePaths),
        root: deriveProjectRoot(filePaths),
        language: files[0]?.language ?? 'java',
        total_files: files.length,
        total_loc: totalLoc,
        analyzed_at: new Date().toISOString(),
    };
    return {
        meta,
        files: fileAnalyses,
        type_hierarchy: crossFileResult.typeHierarchy,
        cross_file_calls: crossFileResult.crossFileCalls,
        taint_paths: crossFileResult.taintPaths,
        findings: [],
    };
}
518
/**
 * Derive a project name from the file paths.
 *
 * The name is the last non-empty `/`-separated segment of the directory
 * returned by `deriveProjectRoot(paths)`.
 *
 * @param {string[]} paths File paths belonging to the project.
 * @returns {string} The derived name, or `'unknown'` when `paths` is empty
 *   or the root has no non-empty segment (e.g. the root is `'/'`).
 */
function deriveProjectName(paths) {
    if (paths.length === 0) {
        return 'unknown';
    }
    const rootSegments = deriveProjectRoot(paths)
        .split('/')
        .filter((segment) => segment.length > 0);
    // Indexing past the end of an empty list yields undefined, which falls
    // through to the default below.
    const lastSegment = rootSegments[rootSegments.length - 1];
    return lastSegment ?? 'unknown';
}
525
/**
 * Derive the common ancestor directory from a list of file paths.
 *
 * Every path is treated as a file path: its last `/`-separated segment is
 * the filename and is ignored. The result is the longest leading run of
 * directory segments shared by all paths, joined with `/`.
 *
 * @param {string[]} paths File paths, using `/` as the separator.
 * @returns {string} The shared directory, or `'/'` when `paths` is empty or
 *   the paths share no non-empty leading directory.
 */
function deriveProjectRoot(paths) {
    if (paths.length === 0)
        return '/';
    // Directory segments of the first file (filename stripped).
    let common = paths[0].split('/').slice(0, -1);
    for (const p of paths.slice(1)) {
        const dirSegments = p.split('/').slice(0, -1);
        // Count matching segments from the front and stop at the first
        // mismatch: a segment that happens to match at the same index
        // further down is not part of a common ancestor.
        let shared = 0;
        while (shared < common.length &&
            shared < dirSegments.length &&
            common[shared] === dirSegments[shared]) {
            shared++;
        }
        common = common.slice(0, shared);
        if (common.length === 0)
            break; // nothing left to narrow
    }
    return common.join('/') || '/';
}
537
+ // Re-export isFalsePositive for consumers that use it directly
538
+ export { isFalsePositive };
1781
539
  //# sourceMappingURL=analyzer.js.map