circle-ir 3.6.0 → 3.8.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/analyzer.js CHANGED
@@ -119,6 +119,34 @@ const JS_TAINTED_PATTERNS = [
119
119
  { pattern: /\bdocument\.querySelector\b/, type: 'dom_input' },
120
120
  { pattern: /\.value\b/, type: 'dom_input' },
121
121
  ];
/**
 * Python request-access patterns that are treated as taint sources.
 *
 * Each entry pairs a regular expression, tested against Python source text,
 * with the source type reported when it matches. Covers Flask-style and
 * Django-style request attributes plus request-wrapper helper functions,
 * for both call access (`request.args.get('id')`) and subscript access
 * (`request.args['id']`).
 *
 * Entries are tried in array order and consumers may stop at the first hit,
 * so new patterns are appended at the end to keep existing classifications
 * stable.
 */
const PYTHON_TAINTED_PATTERNS = [
    { pattern: /\brequest\.args\b/, type: 'http_param' },
    { pattern: /\brequest\.form\b/, type: 'http_body' },
    { pattern: /\brequest\.json\b/, type: 'http_body' },
    { pattern: /\brequest\.data\b/, type: 'http_body' },
    { pattern: /\brequest\.files?\b/, type: 'file_input' },
    { pattern: /\brequest\.headers?\b/, type: 'http_header' },
    { pattern: /\brequest\.cookies\b/, type: 'http_cookie' },
    { pattern: /\brequest\.GET\b/, type: 'http_param' },
    { pattern: /\brequest\.POST\b/, type: 'http_body' },
    { pattern: /\brequest\.META\b/, type: 'http_header' },
    { pattern: /\brequest\.FILES\b/, type: 'file_input' },
    { pattern: /\brequest\.query_params\b/, type: 'http_param' },
    { pattern: /\brequest\.path_params\b/, type: 'http_param' },
    // Flask raw query/body strings
    { pattern: /\brequest\.query_string\b/, type: 'http_param' },
    { pattern: /\brequest\.get_data\s*\(/, type: 'http_body' },
    // Request wrapper helper methods (common in OWASP-style benchmarks and real wrappers)
    { pattern: /\bget_form_parameter\s*\(/, type: 'http_body' },
    { pattern: /\bget_query_parameter\s*\(/, type: 'http_param' },
    { pattern: /\bget_header_value\s*\(/, type: 'http_header' },
    { pattern: /\bget_cookie_value\s*\(/, type: 'http_cookie' },
    // Flask: parsed JSON body via method call, and the combined args+form mapping
    { pattern: /\brequest\.get_json\s*\(/, type: 'http_body' },
    { pattern: /\brequest\.values\b/, type: 'http_param' },
    // Django: raw request body and the upper-case cookie mapping
    // (the lower-case `request.cookies` entry above does not match it)
    { pattern: /\brequest\.body\b/, type: 'http_body' },
    { pattern: /\brequest\.COOKIES\b/, type: 'http_cookie' },
];
122
150
  /**
123
151
  * Find JavaScript taint sources from variable assignments.
124
152
  * Detects patterns like: var userId = req.query.id
@@ -161,6 +189,297 @@ function findJavaScriptAssignmentSources(sourceCode, language) {
161
189
  }
162
190
  return sources;
163
191
  }
/**
 * Find Python taint sources from variable assignments and subscript access.
 *
 * Scans the source line by line for assignments (`x = ...`, `x: type = ...`,
 * `obj.attr = ...`) whose right-hand side matches one of
 * PYTHON_TAINTED_PATTERNS, e.g. `user_id = request.args.get('id')` or
 * `user_id = request.args['id']`. At most one source is reported per line,
 * using the first pattern that matches.
 *
 * @param {string} sourceCode - Full text of the file being analyzed.
 * @param {string} language - Language identifier; anything other than 'python' yields no sources.
 * @returns {Array<{type: string, location: string, severity: string, line: number, confidence: number, variable: string}>}
 *   One entry per matching assignment; `line` is 1-based.
 */
function findPythonAssignmentSources(sourceCode, language) {
    const sources = [];
    if (language !== 'python') {
        return sources;
    }
    // Longest right-hand-side excerpt shown in `location` before it is elided.
    const MAX_RHS_PREVIEW = 50;
    // Group 1: assignment target (may be dotted, e.g. self.user_id).
    // Group 2: right-hand side. An optional type annotation may sit in between.
    const ASSIGNMENT = /^\s*(\w[\w.]*)\s*(?::\s*\w[\w\[\], .]*)?\s*=\s*(.+)/;
    // Accept CRLF as well as LF so Windows-style files report clean excerpts.
    sourceCode.split(/\r?\n/).forEach((line, index) => {
        // Skip comment lines
        if (line.trimStart().startsWith('#')) {
            return;
        }
        const assignment = ASSIGNMENT.exec(line);
        if (!assignment) {
            return;
        }
        // Use the full target captured by the assignment regex; re-parsing the
        // line for a leading \w+ would report `self` for `self.user_id = ...`.
        const [, target, rhs] = assignment;
        const hit = PYTHON_TAINTED_PATTERNS.find(({ pattern }) => pattern.test(rhs));
        if (!hit) {
            return;
        }
        // Decide on the ellipsis from the same trimmed text that is displayed,
        // so trailing whitespace cannot trigger a bogus "...".
        const expr = rhs.trim();
        const preview = expr.length > MAX_RHS_PREVIEW
            ? `${expr.substring(0, MAX_RHS_PREVIEW)}...`
            : expr;
        sources.push({
            type: hit.type,
            location: `${target} = ${preview}`,
            severity: 'high',
            line: index + 1,
            confidence: 0.95,
            variable: target,
        });
    });
    return sources;
}
/**
 * Build a map of tainted variable names → line number via simple forward,
 * line-by-line taint propagation for Python.
 *
 * Seeds from PYTHON_TAINTED_PATTERNS and propagates through statements whose
 * right-hand side mentions an already-tainted variable. Handled statement
 * forms, checked in this order for every non-comment line:
 *   1. container['key'] = value        (per-key container taint)
 *   2. obj.set('section', 'key', value) (per (section, key) container taint)
 *   3. var += expr
 *   4. for var in expr
 *   5. var = expr
 * Per-key container taint distinguishes map['tainted_key'] from
 * map['safe_key'] and conf.get(s, tainted_k) from conf.get(s, safe_k).
 *
 * @param {string} sourceCode - Full text of the Python file.
 * @returns {Map<string, number>} Variable name → 1-based line where it became tainted.
 */
function buildPythonTaintedVars(sourceCode) {
    const tainted = new Map();
    // Per-key container taint: "map['key']" or "conf['section']['key']" → line number
    const containerTainted = new Map();
    // Split on CRLF as well as LF: a trailing '\r' would defeat every
    // `$`-anchored pattern below because `.` does not match '\r'.
    const lines = sourceCode.split(/\r?\n/);

    const isCommentLine = (text) => text.trimStart().startsWith('#');
    const hitsRequestSource = (expr) => PYTHON_TAINTED_PATTERNS.some(p => p.pattern.test(expr));
    const firstTaintedVarIn = (expr) => [...tainted.keys()].find(v => new RegExp(`\\b${v}\\b`).test(expr));
    // True when the expression reads a request source directly or mentions a tainted variable.
    const carriesTaint = (expr) => hitsRequestSource(expr) || firstTaintedVarIn(expr) !== undefined;
    // Closest preceding line that is neither blank nor a comment.
    const previousCodeLine = (index) => {
        for (let j = index - 1; j >= 0; j--) {
            if (lines[j].trim() && !isCommentLine(lines[j])) {
                return lines[j];
            }
        }
        return undefined;
    };

    for (let i = 0; i < lines.length; i++) {
        const line = lines[i];
        const lineNumber = i + 1;
        if (isCommentLine(line)) {
            continue;
        }

        // Subscript assignment: container['key'] = value
        // Tracks taint per-key so map['keyA']='safe' and map['keyB']=param are distinguished.
        const subscriptAssign = line.match(/^\s*(\w+)\[(['"])([^'"]+)\2\]\s*=\s*(.+)$/);
        if (subscriptAssign) {
            const [, container, , key, value] = subscriptAssign;
            if (carriesTaint(value)) {
                containerTainted.set(`${container}['${key}']`, lineNumber);
            }
            continue; // subscript assignments don't match simple variable regex below
        }

        // ConfigParser set: obj.set('section', 'key', value)
        // Tracks per (section, key) so conf.get('s','keyA') and conf.get('s','keyB') are distinct.
        const setCallMatch = line.match(/^\s*(\w+)\.set\s*\(\s*(['"])([^'"]+)\2\s*,\s*(['"])([^'"]+)\4\s*,\s*(.+?)\s*\)$/);
        if (setCallMatch) {
            const [, obj, , section, , key, value] = setCallMatch;
            if (carriesTaint(value)) {
                containerTainted.set(`${obj}['${section}']['${key}']`, lineNumber);
            }
            continue;
        }

        // Augmented assignment: var += expr — taint if either side is tainted.
        // An already-tainted variable keeps its original line number.
        const augAssign = line.match(/^\s*(\w+)\s*\+=\s*(.+)$/);
        if (augAssign) {
            const [, augLhs, augRhs] = augAssign;
            if (carriesTaint(augRhs) || tainted.has(augLhs)) {
                tainted.set(augLhs, tainted.get(augLhs) ?? lineNumber);
            }
            continue;
        }

        // For loop: for var in tainted_source — seed loop variable as tainted
        const forLoopMatch = line.match(/^\s*for\s+(\w+)\s+in\s+(.+?)(?:\s*:\s*)?$/);
        if (forLoopMatch) {
            const [, iterVar, iterExpr] = forLoopMatch;
            if (carriesTaint(iterExpr)) {
                tainted.set(iterVar, lineNumber);
            }
            continue;
        }

        // Regular assignment: var = expr
        const assignMatch = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
        if (!assignMatch) {
            continue;
        }
        const [, lhs, rhs] = assignMatch;
        const rhsExpr = rhs.trim();
        let propagatedFrom;

        // Per-key dict access: bar = container['key']
        const dictAccessMatch = rhsExpr.match(/^(\w+)\[(['"])([^'"]+)\2\]$/);
        if (dictAccessMatch) {
            const slot = `${dictAccessMatch[1]}['${dictAccessMatch[3]}']`;
            if (containerTainted.has(slot)) {
                propagatedFrom = slot;
            }
        }

        // Per-key configparser get: bar = conf.get('section', 'key')
        if (!propagatedFrom) {
            const confGetMatch = rhsExpr.match(/^(\w+)\.get\s*\(\s*(['"])([^'"]+)\2\s*,\s*(['"])([^'"]+)\4\s*\)$/);
            if (confGetMatch) {
                const slot = `${confGetMatch[1]}['${confGetMatch[3]}']['${confGetMatch[5]}']`;
                if (containerTainted.has(slot)) {
                    propagatedFrom = slot;
                }
            }
        }

        // Standard variable propagation (skip os.environ/os.getenv — safe env reads)
        if (!propagatedFrom) {
            const isSafeEnvRead = /\bos\.environ\.get\s*\(/.test(rhs) || /\bos\.getenv\s*\(/.test(rhs);
            if (!isSafeEnvRead) {
                propagatedFrom = firstTaintedVarIn(rhs);
            }
        }

        if (hitsRequestSource(rhs) || propagatedFrom !== undefined) {
            tainted.set(lhs, lineNumber);
            continue;
        }
        if (!tainted.has(lhs)) {
            continue;
        }
        // Variable overwritten with an untainted value — drop its taint unless
        // the overwrite is the body of a null-guard such as:
        //   if not param:
        //       param = ""
        const guardLine = previousCodeLine(i);
        const isNullGuard = guardLine !== undefined && (
            new RegExp(`^\\s*if\\s+not\\s+${lhs}\\s*:`).test(guardLine) ||
            new RegExp(`^\\s*if\\s+${lhs}\\s+is\\s+None\\s*:`).test(guardLine));
        if (!isNullGuard) {
            tainted.delete(lhs);
        }
    }
    return tainted;
}
/**
 * Forward taint propagation for JavaScript/TypeScript.
 *
 * Walks the source line by line and records every variable that is assigned
 * either directly from a request source (JS_TAINTED_PATTERNS) or from an
 * expression that mentions an already-tainted variable. The result is used to
 * filter spurious XSS sinks whose argument is not actually tainted
 * (e.g. res.send(stdout) where stdout is a callback param from exec()).
 *
 * Only real assignments count: `==`, `===` and arrow-function `=>` are not
 * treated as `=`, so `if (a === req.query.x)` does not taint `a` and
 * `item => ...` does not taint `item`.
 *
 * @param {string} sourceCode - Full text of the file being analyzed.
 * @param {string} language - Language identifier; only 'javascript' and 'typescript' are analyzed.
 * @returns {Map<string, number>} Variable name → 1-based line of the latest tainting assignment.
 */
function buildJavaScriptTaintedVars(sourceCode, language) {
    const tainted = new Map();
    if (!['javascript', 'typescript'].includes(language)) {
        return tainted;
    }
    // Words that can precede `=` in the regex below but are not variable names.
    const NON_VARIABLE_WORDS = new Set(['if', 'while', 'for', 'return', 'true', 'false', 'null', 'undefined', 'case']);
    // var/let/const x = rhs OR x = rhs. The lookahead rejects ==, === and =>.
    const ASSIGNMENT = /(?:(?:var|let|const)\s+)?(\w+)\s*=(?![=>])\s*(.+)/;
    sourceCode.split(/\r?\n/).forEach((line, index) => {
        // Skip comment lines
        const trimmed = line.trimStart();
        if (trimmed.startsWith('//') || trimmed.startsWith('*')) {
            return;
        }
        const assignment = ASSIGNMENT.exec(line);
        if (!assignment) {
            return;
        }
        const [, lhs, rhs] = assignment;
        if (NON_VARIABLE_WORDS.has(lhs)) {
            return;
        }
        // Seed from direct source patterns (req.query.x, req.body, etc.)
        const isDirectSource = JS_TAINTED_PATTERNS.some(p => p.pattern.test(rhs));
        // Propagate from existing tainted variables
        const isTaintedPropagation = [...tainted.keys()].some(v => new RegExp(`\\b${v}\\b`).test(rhs));
        if (isDirectSource || isTaintedPropagation) {
            tainted.set(lhs, index + 1);
        }
    });
    return tainted;
}
/**
 * Detect Python quote-check sanitizer guards, e.g.:
 *   if "'" in bar:
 *       return  # or raise / abort / continue / break
 * Returns the set of variable names that are guarded this way.
 *
 * A guard is recognised when:
 *   - the tested literal is exactly one quote character (' or "), optionally
 *     backslash-escaped, in either quoting style: "'", '\'', '"', "\"".
 *     Membership tests on other strings (e.g. `if "admin" in roles:`) are
 *     not sanitizers and are ignored;
 *   - and the guarded block exits: either on the same line
 *     (`if "'" in bar: return`) or within the next 5 lines, before the
 *     indentation drops back to the level of the `if`.
 *
 * @param {string} sourceCode - Full text of the Python file.
 * @returns {Set<string>} Names of variables protected by a quote-check guard.
 */
function findPythonQuoteSanitizedVars(sourceCode) {
    const sanitized = new Set();
    const lines = sourceCode.split(/\r?\n/);
    // Group 1: the quoted literal (with its quotes). Group 2: the variable being checked.
    const GUARD = /^\s*if\s+('(?:[^'\\]|\\.)*'|"(?:[^"\\]|\\.)*")\s+in\s+(\w+)\s*:/;
    // Literal body must be a single quote character, optionally escaped.
    const SINGLE_QUOTE_CHAR = /^\\?['"]$/;
    const BLOCK_EXIT = /^(return|raise|abort|continue|break)\b/;
    // How many lines after the `if` are inspected for an exit statement.
    const MAX_LOOKAHEAD = 5;
    const indentOf = (text) => text.match(/^\s*/)[0].length;

    for (let i = 0; i < lines.length; i++) {
        const guard = GUARD.exec(lines[i]);
        if (!guard) {
            continue;
        }
        const [header, literal, guardedVar] = guard;
        if (!SINGLE_QUOTE_CHAR.test(literal.slice(1, -1))) {
            continue;
        }
        // One-line form: the exit statement follows the colon directly.
        if (BLOCK_EXIT.test(lines[i].slice(header.length).trim())) {
            sanitized.add(guardedVar);
            continue;
        }
        // The guard body may be multi-line (e.g. RESPONSE += (...) \n return).
        const ifIndent = indentOf(lines[i]);
        const lastLine = Math.min(i + MAX_LOOKAHEAD, lines.length - 1);
        for (let j = i + 1; j <= lastLine; j++) {
            const bodyLine = lines[j];
            if (!bodyLine.trim()) {
                continue; // skip blank lines
            }
            if (indentOf(bodyLine) <= ifIndent) {
                break; // left the if-block
            }
            if (BLOCK_EXIT.test(bodyLine.trim())) {
                sanitized.add(guardedVar);
                break;
            }
        }
    }
    return sanitized;
}
/**
 * Detect Python trust boundary violations:
 *   flask.session[key] = value   (or session[key] = value)
 * where key or value references a tainted variable.
 *
 * Only writes are reported: a comparison such as `session[key] == value`
 * is not an assignment and is ignored. Comment lines are skipped.
 *
 * @param {string} sourceCode - Full text of the file being analyzed.
 * @param {string} language - Language identifier; anything other than 'python' yields no results.
 * @param {Map<string, number>} taintedVars - Tainted variable name → 1-based line number.
 * @returns {Array<{sourceLine: number, sinkLine: number}>} One entry per offending session
 *   write. `sinkLine` is the 1-based line of the write; `sourceLine` is the smallest line
 *   number found in `taintedVars` (the same value for every entry).
 */
function findPythonTrustBoundaryViolations(sourceCode, language, taintedVars) {
    if (language !== 'python' || taintedVars.size === 0) {
        return [];
    }
    // Group 1: subscript key expression. Group 2: assigned value.
    // `=(?!=)` keeps `==` comparisons from being read as writes.
    const SESSION_WRITE = /(?:flask\.)?session\[([^\]]+)\]\s*=(?!=)\s*(.+)$/;
    // Compile one word-boundary matcher per tainted name up front instead of per line.
    const taintedNameMatchers = [...taintedVars.keys()].map(name => new RegExp(`\\b${name}\\b`));
    const mentionsTainted = (expr) => taintedNameMatchers.some(re => re.test(expr));
    const earliestSourceLine = Math.min(...taintedVars.values());

    const violations = [];
    // Split on CRLF as well as LF; a trailing '\r' would defeat the `$` anchor above.
    sourceCode.split(/\r?\n/).forEach((line, index) => {
        if (line.trimStart().startsWith('#')) {
            return;
        }
        const write = SESSION_WRITE.exec(line);
        if (!write) {
            return;
        }
        const [, keyExpr, valueExpr] = write;
        if (mentionsTainted(keyExpr) || mentionsTainted(valueExpr)) {
            violations.push({ sourceLine: earliestSourceLine, sinkLine: index + 1 });
        }
    });
    return violations;
}
/**
 * Find Python XSS sinks in return/yield statements.
 * Flask/Django routes often return HTML strings directly:
 *   return '<h1>' + user_input + '</h1>'
 *   return f'<html>{user_input}</html>'
 * These are statements rather than call nodes, so they are detected here by
 * scanning the source text line by line.
 *
 * A line is reported when it is a `return`/`yield` whose expression mentions
 * a tainted variable and looks like HTML construction: it contains '<', is a
 * string concatenation, or is an f-string with a placeholder.
 *
 * @param {string} sourceCode - Full text of the file being analyzed.
 * @param {string} language - Language identifier; anything other than 'python' yields no results.
 * @param {Map<string, number>} taintedVars - Tainted variable name → line number; only the names are used.
 * @returns {Array<{sinkLine: number}>} 1-based line numbers of the offending statements.
 */
function findPythonReturnXSSSinks(sourceCode, language, taintedVars) {
    if (language !== 'python' || taintedVars.size === 0) {
        return [];
    }
    const RETURN_OR_YIELD = /^\s*(?:return|yield)\s+(.+)$/;
    // Compile one word-boundary matcher per tainted name up front instead of per line.
    const taintedNameMatchers = [...taintedVars.keys()].map(name => new RegExp(`\\b${name}\\b`));
    // Contains '<', or is a string concatenation, or is an f-string with a placeholder.
    const looksLikeHTML = (expr) => expr.includes('<') ||
        /['"]\s*\+/.test(expr) ||
        /\+\s*['"]/.test(expr) ||
        /f['"][^'"]*\{/.test(expr);

    const sinks = [];
    // Split on CRLF as well as LF; a trailing '\r' would defeat the `$` anchor above.
    sourceCode.split(/\r?\n/).forEach((line, index) => {
        if (line.trimStart().startsWith('#')) {
            return;
        }
        const statement = RETURN_OR_YIELD.exec(line);
        if (!statement) {
            return;
        }
        const expr = statement[1];
        // Must contain a tainted variable
        if (!taintedNameMatchers.some(re => re.test(expr))) {
            return;
        }
        if (!looksLikeHTML(expr)) {
            return;
        }
        sinks.push({ sinkLine: index + 1 });
    });
    return sinks;
}
164
483
  /**
165
484
  * Find DOM XSS sinks from property assignments in JavaScript.
166
485
  * Detects patterns like: element.innerHTML = userInput
@@ -459,6 +778,9 @@ export async function analyze(code, filePath, language, options = {}) {
459
778
  // Add sources for JavaScript variable assignments with tainted patterns
460
779
  const jsAssignmentSources = findJavaScriptAssignmentSources(code, language);
461
780
  taint.sources.push(...jsAssignmentSources);
781
+ // Add sources for Python variable assignments with tainted request patterns
782
+ const pythonAssignmentSources = findPythonAssignmentSources(code, language);
783
+ taint.sources.push(...pythonAssignmentSources);
462
784
  // Add sinks for JavaScript DOM XSS patterns (innerHTML, document.write, etc.)
463
785
  const jsDOMSinks = findJavaScriptDOMSinks(code, language);
464
786
  for (const domSink of jsDOMSinks) {
@@ -490,6 +812,100 @@ export async function analyze(code, filePath, language, options = {}) {
490
812
  taint.sinks = filterCleanVariableSinks(taint.sinks, calls, constPropResult.tainted, constPropResult.symbols, dfg, constPropResult.sanitizedVars, constPropResult.synchronizedLines);
491
813
  // Filter sinks that are wrapped by sanitizers on the same line
492
814
  taint.sinks = filterSanitizedSinks(taint.sinks, taint.sanitizers ?? [], calls);
815
+ // Python: reduce XPath false-positives using forward taint propagation +
816
+ // apostrophe-guard sanitizer detection; also detect trust boundary violations
817
+ // (flask.session[key] = value) which are subscript assignments, not call nodes.
818
+ if (language === 'python') {
819
+ const pyTaintedVars = buildPythonTaintedVars(code);
820
+ const pySanitizedVars = findPythonQuoteSanitizedVars(code);
821
+ // Propagate sanitization: if bar is sanitized and query = f"...{bar}...", query is also sanitized
822
+ for (const line of code.split('\n')) {
823
+ const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
824
+ if (!am)
825
+ continue;
826
+ const [, lhs, rhs] = am;
827
+ if ([...pySanitizedVars].some(v => new RegExp(`\\b${v}\\b`).test(rhs))) {
828
+ pySanitizedVars.add(lhs);
829
+ }
830
+ }
831
+ // Detect inline .replace() sanitizers: query = f"...{bar.replace('\'', '&apos;')}..."
832
+ // The tainted var appears with .replace() in the rhs — treat lhs as XPath-safe
833
+ for (const line of code.split('\n')) {
834
+ const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
835
+ if (!am)
836
+ continue;
837
+ const [, lhs, rhs] = am;
838
+ const hasReplaceOnTainted = [...pyTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\.replace\\s*\\(`).test(rhs));
839
+ if (hasReplaceOnTainted)
840
+ pySanitizedVars.add(lhs);
841
+ }
842
+ const pySourceLines = code.split('\n');
843
+ // Filter XPath sinks: keep only if a tainted var is used at the sink line
844
+ taint.sinks = taint.sinks.filter(sink => {
845
+ if (sink.type !== 'xpath_injection')
846
+ return true;
847
+ const sinkLineText = pySourceLines[sink.line - 1] ?? '';
848
+ const taintedVarOnLine = [...pyTaintedVars.keys()].find(v => new RegExp(`\\b${v}\\b`).test(sinkLineText));
849
+ if (!taintedVarOnLine)
850
+ return false;
851
+ if (pySanitizedVars.has(taintedVarOnLine))
852
+ return false;
853
+ // Suppress parameterized XPath: root.xpath(query, name=bar) where bar is a keyword arg
854
+ if (new RegExp(`\\.xpath\\s*\\([^)]*\\b\\w+\\s*=\\s*\\b${taintedVarOnLine}\\b`).test(sinkLineText))
855
+ return false;
856
+ return true;
857
+ });
858
+ // Add trust boundary sinks from session subscript assignments
859
+ const trustViolations = findPythonTrustBoundaryViolations(code, language, pyTaintedVars);
860
+ for (const v of trustViolations) {
861
+ const alreadyExists = taint.sinks.some(s => s.line === v.sinkLine && s.type === 'trust_boundary');
862
+ if (!alreadyExists) {
863
+ taint.sinks.push({
864
+ type: 'trust_boundary',
865
+ cwe: 'CWE-501',
866
+ line: v.sinkLine,
867
+ location: `session write at line ${v.sinkLine}`,
868
+ confidence: 0.85,
869
+ });
870
+ }
871
+ }
872
+ // Add XSS sinks from return/yield statements (Flask/Django routes return HTML directly)
873
+ const pyReturnXSS = findPythonReturnXSSSinks(code, language, pyTaintedVars);
874
+ for (const r of pyReturnXSS) {
875
+ const alreadyExists = taint.sinks.some(s => s.line === r.sinkLine && s.type === 'xss');
876
+ if (!alreadyExists) {
877
+ taint.sinks.push({
878
+ type: 'xss',
879
+ cwe: 'CWE-79',
880
+ line: r.sinkLine,
881
+ location: `return HTML with user input at line ${r.sinkLine}`,
882
+ confidence: 0.9,
883
+ });
884
+ }
885
+ }
886
+ }
887
+ // JavaScript/TypeScript: filter XSS sinks where the argument variable is NOT actually
888
+ // tainted by user input (e.g., res.send(stdout) — stdout is a callback param from exec(),
889
+ // not a variable derived from req.query/req.body). This prevents FP pairs like:
890
+ // CWE-78 (correct) + CWE-79 (spurious) for the same source when the cmd output is sent.
891
+ if (['javascript', 'typescript'].includes(language)) {
892
+ const jsTaintedVars = buildJavaScriptTaintedVars(code, language);
893
+ if (jsTaintedVars.size > 0) {
894
+ const jsSourceLines = code.split('\n');
895
+ taint.sinks = taint.sinks.filter(sink => {
896
+ if (sink.type !== 'xss')
897
+ return true;
898
+ const sinkLineText = jsSourceLines[sink.line - 1] ?? '';
899
+ // Keep if any known-tainted variable appears on this sink line
900
+ if ([...jsTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\b`).test(sinkLineText)))
901
+ return true;
902
+ // Also keep if the sink line directly references a taint source (inline use, no assignment)
903
+ if (JS_TAINTED_PATTERNS.some(p => p.pattern.test(sinkLineText)))
904
+ return true;
905
+ return false;
906
+ });
907
+ }
908
+ }
493
909
  // Propagate taint through dataflow to find verified flows
494
910
  if (taint.sources.length > 0 && taint.sinks.length > 0) {
495
911
  const propagationResult = propagateTaint(dfg, calls, taint.sources, taint.sinks, taint.sanitizers ?? []);
@@ -594,7 +1010,13 @@ export async function analyze(code, filePath, language, options = {}) {
594
1010
  taintedVariables: constPropResult.tainted,
595
1011
  });
596
1012
  // Add inter-procedural sinks to the taint sinks and generate flows
1013
+ // Skip external_taint_escape (CWE-668) here: they are only used as a last resort
1014
+ // in the fallback path below when no other sinks exist. Adding them when proper sinks
1015
+ // already exist creates duplicate/spurious findings (e.g., http.get already reported
1016
+ // as CWE-918 SSRF; also getting CWE-668 for the same call chain is a FP).
597
1017
  for (const sink of interProc.propagatedSinks) {
1018
+ if (sink.type === 'external_taint_escape')
1019
+ continue;
598
1020
  if (!taint.sinks.some(s => s.line === sink.line)) {
599
1021
  taint.sinks.push(sink);
600
1022
  }
@@ -805,8 +1227,70 @@ export async function analyzeForAPI(code, filePath, language, options = {}) {
805
1227
  filteredSinks = filterCleanVariableSinks(filteredSinks, calls, constPropResult.tainted, constPropResult.symbols, undefined, constPropResult.sanitizedVars, constPropResult.synchronizedLines);
806
1228
  // Filter sinks wrapped by sanitizers on the same line
807
1229
  filteredSinks = filterSanitizedSinks(filteredSinks, taint.sanitizers ?? [], calls);
1230
+ // Python: reduce XPath false-positives using forward taint propagation +
1231
+ // apostrophe-guard sanitizer detection.
1232
+ let pythonTaintedVars = new Map();
1233
+ if (language === 'python') {
1234
+ pythonTaintedVars = buildPythonTaintedVars(code);
1235
+ const pythonSanitizedVars = findPythonQuoteSanitizedVars(code);
1236
+ // Propagate sanitization: if bar is sanitized and query = f"...{bar}...", query is also sanitized
1237
+ for (const line of code.split('\n')) {
1238
+ const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
1239
+ if (!am)
1240
+ continue;
1241
+ const [, lhs, rhs] = am;
1242
+ if ([...pythonSanitizedVars].some(v => new RegExp(`\\b${v}\\b`).test(rhs))) {
1243
+ pythonSanitizedVars.add(lhs);
1244
+ }
1245
+ }
1246
+ // Detect inline .replace() sanitizers: query = f"...{bar.replace('\'', '&apos;')}..."
1247
+ for (const line of code.split('\n')) {
1248
+ const am = line.match(/^\s*(\w+)\s*=\s*(.+)$/);
1249
+ if (!am)
1250
+ continue;
1251
+ const [, lhs, rhs] = am;
1252
+ const hasReplaceOnTainted = [...pythonTaintedVars.keys()].some(v => new RegExp(`\\b${v}\\.replace\\s*\\(`).test(rhs));
1253
+ if (hasReplaceOnTainted)
1254
+ pythonSanitizedVars.add(lhs);
1255
+ }
1256
+ const sourceLines = code.split('\n');
1257
+ filteredSinks = filteredSinks.filter(sink => {
1258
+ if (sink.type !== 'xpath_injection')
1259
+ return true;
1260
+ // Keep XPath sink only if a tainted variable is used at the sink line
1261
+ const sinkLineText = sourceLines[sink.line - 1] ?? '';
1262
+ const taintedVarOnLine = [...pythonTaintedVars.keys()].find(v => new RegExp(`\\b${v}\\b`).test(sinkLineText));
1263
+ if (!taintedVarOnLine)
1264
+ return false;
1265
+ // Kill if the variable is protected by an apostrophe guard
1266
+ if (pythonSanitizedVars.has(taintedVarOnLine))
1267
+ return false;
1268
+ // Suppress parameterized XPath: root.xpath(query, name=bar) where bar is a keyword arg
1269
+ if (new RegExp(`\\.xpath\\s*\\([^)]*\\b\\w+\\s*=\\s*\\b${taintedVarOnLine}\\b`).test(sinkLineText))
1270
+ return false;
1271
+ return true;
1272
+ });
1273
+ }
808
1274
  // Generate vulnerabilities from source-sink pairs
809
1275
  const vulnerabilities = findVulnerabilities(taint.sources, filteredSinks, calls, constPropResult);
1276
+ // Python: detect trust boundary violations (flask.session[key] = taintedVal)
1277
+ if (language === 'python') {
1278
+ const trustViolations = findPythonTrustBoundaryViolations(code, language, pythonTaintedVars);
1279
+ for (const v of trustViolations) {
1280
+ // Avoid duplicate: only add if no existing vulnerability for same sink line
1281
+ const alreadyReported = vulnerabilities.some(existing => existing.sink.line === v.sinkLine && existing.type === 'trust_boundary');
1282
+ if (!alreadyReported) {
1283
+ vulnerabilities.push({
1284
+ type: 'trust_boundary',
1285
+ cwe: 'CWE-501',
1286
+ severity: 'medium',
1287
+ source: { line: v.sourceLine, type: 'http_param' },
1288
+ sink: { line: v.sinkLine, type: 'trust_boundary' },
1289
+ confidence: 0.85,
1290
+ });
1291
+ }
1292
+ }
1293
+ }
810
1294
  const analysisTime = performance.now() - analysisStart;
811
1295
  const totalTime = performance.now() - startTime;
812
1296
  return {