@clear-capabilities/agentic-security-scanner 0.74.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +1580 -0
- package/bin/.agentic-security/findings.json +1577 -0
- package/bin/.agentic-security/last-scan.json +1577 -0
- package/bin/.agentic-security/last-scan.json.sig +1 -0
- package/bin/.agentic-security/scan-history.json +465 -0
- package/bin/.agentic-security/streak.json +25 -0
- package/bin/agentic-security-audit.js +198 -0
- package/bin/agentic-security-consistency.js +80 -0
- package/bin/agentic-security-diff.js +136 -0
- package/bin/agentic-security-lsp.js +12 -0
- package/bin/agentic-security-mcp.js +40 -0
- package/bin/agentic-security-rule.js +153 -0
- package/bin/agentic-security.js +1683 -0
- package/dist/117.index.js +207 -0
- package/dist/178.index.js +250 -0
- package/dist/218.index.js +793 -0
- package/dist/227.index.js +192 -0
- package/dist/301.index.js +167 -0
- package/dist/384.index.js +18 -0
- package/dist/476.index.js +126 -0
- package/dist/513.index.js +373 -0
- package/dist/520.index.js +13 -0
- package/dist/601.index.js +1038 -0
- package/dist/634.index.js +1892 -0
- package/dist/637.index.js +216 -0
- package/dist/660.index.js +131 -0
- package/dist/675.index.js +451 -0
- package/dist/826.index.js +188 -0
- package/dist/830.index.js +133 -0
- package/dist/agentic-security.mjs +272 -0
- package/dist/agentic-security.mjs.sha256 +1 -0
- package/dist/calibration-seed.json +27 -0
- package/package.json +77 -0
- package/src/.agentic-security/findings.json +80844 -0
- package/src/.agentic-security/last-scan.json +80844 -0
- package/src/.agentic-security/last-scan.json.sig +1 -0
- package/src/.agentic-security/scan-history.json +8408 -0
- package/src/.agentic-security/streak.json +26 -0
- package/src/badge.js +188 -0
- package/src/compare.js +203 -0
- package/src/dataflow/.agentic-security/findings.json +3487 -0
- package/src/dataflow/.agentic-security/last-scan.json +3487 -0
- package/src/dataflow/.agentic-security/last-scan.json.sig +1 -0
- package/src/dataflow/.agentic-security/scan-history.json +735 -0
- package/src/dataflow/.agentic-security/streak.json +24 -0
- package/src/dataflow/CLAUDE.md +38 -0
- package/src/dataflow/access-paths.js +172 -0
- package/src/dataflow/async-sequencing.js +177 -0
- package/src/dataflow/backward.js +201 -0
- package/src/dataflow/catalog-expanded.js +485 -0
- package/src/dataflow/catalog.js +659 -0
- package/src/dataflow/cross-repo.js +219 -0
- package/src/dataflow/engine.js +588 -0
- package/src/dataflow/exception-flow.js +116 -0
- package/src/dataflow/exploit-prover.js +187 -0
- package/src/dataflow/higher-order.js +221 -0
- package/src/dataflow/ifds.js +347 -0
- package/src/dataflow/implicit-flow.js +129 -0
- package/src/dataflow/incremental.js +229 -0
- package/src/dataflow/index.js +181 -0
- package/src/dataflow/numeric-domain.js +192 -0
- package/src/dataflow/path-feasibility.js +114 -0
- package/src/dataflow/points-to.js +337 -0
- package/src/dataflow/polyglot.js +190 -0
- package/src/dataflow/proven-clean.js +159 -0
- package/src/dataflow/receiver-context.js +76 -0
- package/src/dataflow/sanitizer-proof.js +154 -0
- package/src/dataflow/soft-taint.js +140 -0
- package/src/dataflow/string-domain.js +234 -0
- package/src/dataflow/stub-aware-filter.js +100 -0
- package/src/dataflow/summaries.js +132 -0
- package/src/dataflow/symbolic-exec.js +238 -0
- package/src/dataflow/tabulation.js +135 -0
- package/src/engine.js +7763 -0
- package/src/history-scan.js +229 -0
- package/src/index.js +3 -0
- package/src/integrations/.agentic-security/findings.json +1504 -0
- package/src/integrations/.agentic-security/last-scan.json +1504 -0
- package/src/integrations/.agentic-security/scan-history.json +40 -0
- package/src/integrations/.agentic-security/streak.json +21 -0
- package/src/integrations/index.js +321 -0
- package/src/integrations/tickets.js +200 -0
- package/src/ir/.agentic-security/findings.json +3036 -0
- package/src/ir/.agentic-security/last-scan.json +3036 -0
- package/src/ir/.agentic-security/last-scan.json.sig +1 -0
- package/src/ir/.agentic-security/scan-history.json +364 -0
- package/src/ir/.agentic-security/streak.json +23 -0
- package/src/ir/CLAUDE.md +172 -0
- package/src/ir/callgraph.js +73 -0
- package/src/ir/class-hierarchy.js +195 -0
- package/src/ir/index.js +152 -0
- package/src/ir/parser-cs.js +260 -0
- package/src/ir/parser-java.js +286 -0
- package/src/ir/parser-js.js +413 -0
- package/src/ir/parser-kt.js +258 -0
- package/src/ir/parser-py-cst.js +136 -0
- package/src/ir/parser-py.helper.py +501 -0
- package/src/ir/parser-py.js +312 -0
- package/src/ir/ssa.js +315 -0
- package/src/ir/type-stubs.js +288 -0
- package/src/leaderboard.js +152 -0
- package/src/llm-validator/.agentic-security/findings.json +1891 -0
- package/src/llm-validator/.agentic-security/last-scan.json +1891 -0
- package/src/llm-validator/.agentic-security/last-scan.json.sig +1 -0
- package/src/llm-validator/.agentic-security/scan-history.json +168 -0
- package/src/llm-validator/.agentic-security/streak.json +20 -0
- package/src/llm-validator/consistency.js +141 -0
- package/src/llm-validator/index.js +437 -0
- package/src/lsp/.agentic-security/findings.json +28 -0
- package/src/lsp/.agentic-security/last-scan.json +28 -0
- package/src/lsp/.agentic-security/scan-history.json +79 -0
- package/src/lsp/.agentic-security/streak.json +22 -0
- package/src/lsp/server.js +275 -0
- package/src/mcp/.agentic-security/findings.json +8358 -0
- package/src/mcp/.agentic-security/last-scan.json +8358 -0
- package/src/mcp/.agentic-security/last-scan.json.sig +1 -0
- package/src/mcp/.agentic-security/scan-history.json +1125 -0
- package/src/mcp/.agentic-security/streak.json +22 -0
- package/src/mcp/CLAUDE.md +54 -0
- package/src/mcp/audit.js +136 -0
- package/src/mcp/redact.js +75 -0
- package/src/mcp/server.js +158 -0
- package/src/mcp/stdio.js +83 -0
- package/src/mcp/tools.js +940 -0
- package/src/mcp/validate.js +49 -0
- package/src/personality.js +164 -0
- package/src/poc-video.js +239 -0
- package/src/posture/.agentic-security/findings.json +51239 -0
- package/src/posture/.agentic-security/last-scan.json +51239 -0
- package/src/posture/.agentic-security/last-scan.json.sig +1 -0
- package/src/posture/.agentic-security/scan-history.json +5557 -0
- package/src/posture/.agentic-security/streak.json +24 -0
- package/src/posture/CLAUDE.md +42 -0
- package/src/posture/adversarial-self-test.js +114 -0
- package/src/posture/adversary-agent.js +204 -0
- package/src/posture/agents-memory.js +135 -0
- package/src/posture/ai-code-fingerprint.js +171 -0
- package/src/posture/aibom.js +284 -0
- package/src/posture/api-inventory.js +96 -0
- package/src/posture/attack-playbooks.js +305 -0
- package/src/posture/auditor-agent.js +115 -0
- package/src/posture/auth-posture-import.js +135 -0
- package/src/posture/baseline-compare.js +114 -0
- package/src/posture/blast-radius.js +836 -0
- package/src/posture/bounty-prediction.js +141 -0
- package/src/posture/business-logic.js +239 -0
- package/src/posture/calibration-drift.js +93 -0
- package/src/posture/calibration-seed.json +27 -0
- package/src/posture/calibration.js +204 -0
- package/src/posture/clustering.js +75 -0
- package/src/posture/concurrency-checker.js +265 -0
- package/src/posture/confidence.js +65 -0
- package/src/posture/container-runtime.js +149 -0
- package/src/posture/counterfactual.js +109 -0
- package/src/posture/cross-lang-graphql.js +165 -0
- package/src/posture/cross-lang-grpc.js +166 -0
- package/src/posture/cross-lang-meta.js +101 -0
- package/src/posture/cross-lang-openapi.js +187 -0
- package/src/posture/cross-lang-orm.js +153 -0
- package/src/posture/cross-lang-queues.js +210 -0
- package/src/posture/crown-jewels.js +110 -0
- package/src/posture/custom-rules.js +361 -0
- package/src/posture/cve-alert-daemon.js +433 -0
- package/src/posture/cve-lookup.js +129 -0
- package/src/posture/dead-code.js +430 -0
- package/src/posture/defender-agent.js +158 -0
- package/src/posture/deploy-platform.js +204 -0
- package/src/posture/detector-fuzz.js +61 -0
- package/src/posture/deterministic.js +99 -0
- package/src/posture/drift.js +165 -0
- package/src/posture/epss.js +156 -0
- package/src/posture/exploitability-probability.js +212 -0
- package/src/posture/exploitability.js +121 -0
- package/src/posture/feature-flags.js +110 -0
- package/src/posture/finding-defaults.js +132 -0
- package/src/posture/fix-history.js +411 -0
- package/src/posture/fix-plan.js +121 -0
- package/src/posture/fix-verify-loop.js +157 -0
- package/src/posture/fix-verify.js +130 -0
- package/src/posture/flow-narration.js +105 -0
- package/src/posture/grader-calibration.js +156 -0
- package/src/posture/harness-discovery.js +113 -0
- package/src/posture/holdout-eval.js +144 -0
- package/src/posture/iac-reachability.js +163 -0
- package/src/posture/iam-policy.js +128 -0
- package/src/posture/integrity.js +97 -0
- package/src/posture/learning.js +166 -0
- package/src/posture/license-policy.js +109 -0
- package/src/posture/llm-redteam-prompts.js +418 -0
- package/src/posture/llm-redteam.js +303 -0
- package/src/posture/material-change.js +163 -0
- package/src/posture/mitigation-composite.js +55 -0
- package/src/posture/mttr.js +91 -0
- package/src/posture/network-policy-import.js +126 -0
- package/src/posture/path-predicates.js +99 -0
- package/src/posture/persona-prioritization.js +153 -0
- package/src/posture/poc-cwe-map.js +51 -0
- package/src/posture/poc-generator.js +500 -0
- package/src/posture/policy-gate.js +174 -0
- package/src/posture/pre-incident-archaeology.js +110 -0
- package/src/posture/profile.js +93 -0
- package/src/posture/reachability-filter.js +42 -0
- package/src/posture/regression-test-gen.js +200 -0
- package/src/posture/reverse-blast-radius.js +110 -0
- package/src/posture/router.js +109 -0
- package/src/posture/rule-overrides.js +198 -0
- package/src/posture/rule-pack-signing.js +209 -0
- package/src/posture/rule-packs.js +143 -0
- package/src/posture/rule-synthesis.js +108 -0
- package/src/posture/ruleset-version.js +71 -0
- package/src/posture/sbom.js +129 -0
- package/src/posture/schema-aware-bridge.js +207 -0
- package/src/posture/security-trend.js +87 -0
- package/src/posture/semantic-clone.js +114 -0
- package/src/posture/specification-mining.js +170 -0
- package/src/posture/stable-id.js +75 -0
- package/src/posture/stack-playbook.js +229 -0
- package/src/posture/streak.js +249 -0
- package/src/posture/suppressions.js +135 -0
- package/src/posture/telemetry-ingest.js +112 -0
- package/src/posture/threat-model.js +145 -0
- package/src/posture/three-agent-pipeline.js +74 -0
- package/src/posture/triage.js +146 -0
- package/src/posture/trust-boundary-diagram.js +115 -0
- package/src/posture/type-narrowing.js +129 -0
- package/src/posture/validator-metrics.js +179 -0
- package/src/posture/verifier-ephemeral.js +118 -0
- package/src/posture/verifier-target.js +147 -0
- package/src/posture/verifier.js +257 -0
- package/src/posture/version.js +75 -0
- package/src/posture/waf-ingest.js +200 -0
- package/src/posture/why-fired.js +141 -0
- package/src/pr-comment.js +172 -0
- package/src/pr-delta.js +198 -0
- package/src/report/.agentic-security/findings.json +79 -0
- package/src/report/.agentic-security/last-scan.json +79 -0
- package/src/report/.agentic-security/last-scan.json.sig +1 -0
- package/src/report/.agentic-security/scan-history.json +332 -0
- package/src/report/.agentic-security/streak.json +23 -0
- package/src/report/index.js +1136 -0
- package/src/report/mascot.js +42 -0
- package/src/runScan.js +141 -0
- package/src/sast/.agentic-security/findings.json +5051 -0
- package/src/sast/.agentic-security/last-scan.json +5051 -0
- package/src/sast/.agentic-security/last-scan.json.sig +1 -0
- package/src/sast/.agentic-security/scan-history.json +788 -0
- package/src/sast/.agentic-security/streak.json +23 -0
- package/src/sast/CLAUDE.md +39 -0
- package/src/sast/_comment-strip.js +46 -0
- package/src/sast/agent-tool-escalation.js +131 -0
- package/src/sast/auth-provider.js +171 -0
- package/src/sast/authz.js +236 -0
- package/src/sast/bench-shape/.agentic-security/findings.json +28 -0
- package/src/sast/bench-shape/.agentic-security/last-scan.json +28 -0
- package/src/sast/bench-shape/.agentic-security/scan-history.json +24 -0
- package/src/sast/bench-shape/.agentic-security/streak.json +22 -0
- package/src/sast/bench-shape/index.js +62 -0
- package/src/sast/claude-hook-injection.js +199 -0
- package/src/sast/claude-md-prompt-injection.js +170 -0
- package/src/sast/claude-settings.js +165 -0
- package/src/sast/client-side.js +149 -0
- package/src/sast/cpp-bench-extras.js +122 -0
- package/src/sast/cpp-dataflow.js +430 -0
- package/src/sast/cpp.js +248 -0
- package/src/sast/csharp.js +152 -0
- package/src/sast/csrf.js +82 -0
- package/src/sast/dart-flutter.js +173 -0
- package/src/sast/db-rls.js +147 -0
- package/src/sast/db-taint.js +215 -0
- package/src/sast/defi-deep.js +242 -0
- package/src/sast/deserialization-gadgets.js +113 -0
- package/src/sast/django-hardening.js +230 -0
- package/src/sast/env-hygiene.js +125 -0
- package/src/sast/fastapi-hardening.js +145 -0
- package/src/sast/go-extended.js +84 -0
- package/src/sast/host-header.js +106 -0
- package/src/sast/index.js +17 -0
- package/src/sast/java-ast-folding.js +561 -0
- package/src/sast/java-bench-extras.js +708 -0
- package/src/sast/java-collection-passthrough.js +178 -0
- package/src/sast/java-constant-fold.js +244 -0
- package/src/sast/java-deserialization.js +125 -0
- package/src/sast/jndi.js +104 -0
- package/src/sast/juliet-shape.js +324 -0
- package/src/sast/jwt-exp.js +104 -0
- package/src/sast/kotlin.js +82 -0
- package/src/sast/laravel-hardening.js +198 -0
- package/src/sast/ldap-injection.js +100 -0
- package/src/sast/llm-owasp.js +465 -0
- package/src/sast/llm-stored-prompt.js +103 -0
- package/src/sast/llm-trading-agent.js +161 -0
- package/src/sast/llm.js +308 -0
- package/src/sast/logic.js +140 -0
- package/src/sast/mass-assignment.js +101 -0
- package/src/sast/mcp-audit.js +242 -0
- package/src/sast/mobile-manifest.js +195 -0
- package/src/sast/model-load.js +164 -0
- package/src/sast/mutation-xss.js +87 -0
- package/src/sast/nosql-injection.js +82 -0
- package/src/sast/open-redirect.js +119 -0
- package/src/sast/php.js +91 -0
- package/src/sast/pipeline.js +122 -0
- package/src/sast/primary-cwe-java.js +155 -0
- package/src/sast/prompt-firewall.js +151 -0
- package/src/sast/prompt-template.js +157 -0
- package/src/sast/prototype-pollution.js +112 -0
- package/src/sast/python-sinks.js +195 -0
- package/src/sast/quarkus-hardening.js +102 -0
- package/src/sast/rag-poisoning.js +118 -0
- package/src/sast/rate-limit.js +128 -0
- package/src/sast/response-splitting.js +138 -0
- package/src/sast/ruby.js +108 -0
- package/src/sast/rust.js +105 -0
- package/src/sast/solidity.js +167 -0
- package/src/sast/springboot-hardening.js +186 -0
- package/src/sast/ssrf-cloud-metadata.js +80 -0
- package/src/sast/ssti.js +116 -0
- package/src/sast/swift.js +162 -0
- package/src/sast/toctou.js +95 -0
- package/src/sast/webhook.js +101 -0
- package/src/sast/xpath-injection.js +51 -0
- package/src/sast/xxe.js +140 -0
- package/src/sast/zip-slip.js +200 -0
- package/src/sca/base-images.json +45 -0
- package/src/sca/container.js +107 -0
- package/src/sca/dep-confusion.js +134 -0
- package/src/sca/index.js +6 -0
- package/src/sca/popular-packages.json +41 -0
- package/src/sca/sarif-ingest.js +187 -0
- package/src/sca/vuln-function-hints.json +89 -0
- package/src/secrets/index.js +4 -0
|
@@ -0,0 +1,136 @@
|
|
|
1
|
+
// AST-backed Python parser — drop-in replacement for parser-py.js.
|
|
2
|
+
//
|
|
3
|
+
// Shells to `scanner/src/ir/parser-py.helper.py` which uses Python's stdlib
|
|
4
|
+
// `ast` module (zero external dependencies, ships with Python 3.8+) to
|
|
5
|
+
// produce the same IR shape parser-py.js emits, but computed from a real
|
|
6
|
+
// parser rather than a regex-balanced indentation walker.
|
|
7
|
+
//
|
|
8
|
+
// What this fixes (gaps in the regex parser, by its own admission):
|
|
9
|
+
// - Comprehensions, decorators, match statements, async/await, lambda
|
|
10
|
+
// bodies — all dropped by the regex parser; the AST parser preserves
|
|
11
|
+
// the function records even when the body has constructs we don't
|
|
12
|
+
// fully lower yet.
|
|
13
|
+
// - `def f(x=Foo(1, 2))` and `db.execute(sanitize(x))` — nested parens
|
|
14
|
+
// that the regex parser's call regex rejected.
|
|
15
|
+
// - Walrus `:=`, type hints (`def f(x: List[int]) -> Dict`), PEP-695
|
|
16
|
+
// generics — recognized cleanly by the real parser.
|
|
17
|
+
//
|
|
18
|
+
// Cost / fallback:
|
|
19
|
+
// - One python3 subprocess per `runScan` (batched: ALL .py files sent in
|
|
20
|
+
// one stdin payload). Not one process per file.
|
|
21
|
+
// - When python3 isn't on PATH, or is too old (< 3.8), or the helper
|
|
22
|
+
// fails — caller falls back to the regex parser (parser-py.js).
|
|
23
|
+
// - Capability probe is cached for the process; we don't re-spawn
|
|
24
|
+
// python3 every scan.
|
|
25
|
+
//
|
|
26
|
+
// Toggle:
|
|
27
|
+
// AGENTIC_SECURITY_PY_PARSER=cst → force this path (error if unavailable)
|
|
28
|
+
// AGENTIC_SECURITY_PY_PARSER=regex → force the legacy regex parser
|
|
29
|
+
// AGENTIC_SECURITY_PY_PARSER=auto → try CST, fall back silently (default)
|
|
30
|
+
|
|
31
|
+
import * as cp from 'node:child_process';
|
|
32
|
+
import * as path from 'node:path';
|
|
33
|
+
import * as fs from 'node:fs';
|
|
34
|
+
import { fileURLToPath } from 'node:url';
|
|
35
|
+
|
|
36
|
+
const HERE = path.dirname(fileURLToPath(import.meta.url));
|
|
37
|
+
const HELPER_PATH = path.join(HERE, 'parser-py.helper.py');
|
|
38
|
+
|
|
39
|
+
// Capability probe — cached per-process. Returns:
|
|
40
|
+
//   { ok: true,  python: 'python3', version: '3.12.2' }  on success (`python` is the bare command name that answered the probe, not a resolved path)
|
|
41
|
+
// { ok: false, reason: '...' } on failure
|
|
42
|
+
let _capability = null;
|
|
43
|
+
|
|
44
|
+
/**
 * Find a Python interpreter able to run the IR helper script.
 *
 * Tries `python3` and then `python`, running `<bin> --version` for each and
 * accepting the first one that reports a new enough version. The outcome
 * (success or failure) is memoized in the module-level `_capability`, so the
 * interpreter is probed at most once per process.
 *
 * @returns {{ok: true, python: string, version: string} | {ok: false, reason: string}}
 *   On success `python` is the bare command name that answered the probe and
 *   `version` is its `major.minor.patch` string.
 */
export function probePythonAvailable() {
  if (_capability) return _capability;

  // Minimum supported interpreter. The helper annotates function signatures
  // with builtin generics (`dict[str, Any]`, `list[...]`); those annotations
  // are evaluated when the `def` runs and raise TypeError before Python 3.9,
  // so an older interpreter would pass a 3.8 floor and then fail every batch.
  const MIN_MAJOR = 3;
  const MIN_MINOR = 9;

  // Try the canonical names in order. macOS / most Linux have python3;
  // some Linuxes only have python. Python 2 is rejected by the version floor.
  for (const bin of ['python3', 'python']) {
    let r;
    try {
      r = cp.spawnSync(bin, ['--version'], { encoding: 'utf8', timeout: 1500 });
    } catch { continue; }
    // spawnSync reports a missing binary or a timeout through `error` and a
    // null status rather than by throwing.
    if (!r || r.error || r.status !== 0) continue;
    // Output format: "Python 3.12.2". Python 2 prints its version on stderr,
    // hence the stdout-then-stderr fallback.
    const m = /Python\s+(\d+)\.(\d+)\.(\d+)/.exec(r.stdout || r.stderr || '');
    if (!m) continue;
    const major = parseInt(m[1], 10);
    const minor = parseInt(m[2], 10);
    if (major < MIN_MAJOR || (major === MIN_MAJOR && minor < MIN_MINOR)) continue;
    _capability = { ok: true, python: bin, version: `${m[1]}.${m[2]}.${m[3]}` };
    return _capability;
  }
  _capability = { ok: false, reason: 'no-python3-on-path' };
  return _capability;
}
|
|
66
|
+
|
|
67
|
+
// Single-file shim that matches parser-py.js's signature exactly.
|
|
68
|
+
//
|
|
69
|
+
// Internally we DON'T spawn a subprocess per file — that would be slow.
|
|
70
|
+
// Callers should use parsePythonFilesBatch() to amortize the spawn cost.
|
|
71
|
+
// This single-file form is kept for the test harness and for any caller
|
|
72
|
+
// that passes one file at a time.
|
|
73
|
+
/**
 * Parse one Python source file via the batch entry point.
 *
 * @param {string} file  Path of the file; must end in `.py` (case-insensitive).
 * @param {string} raw   Source text; must be a non-empty string of at most
 *                       1,000,000 characters.
 * @returns {object|null} The first record returned by parsePythonFilesBatch(),
 *   or null when the input is rejected, no usable Python is available, or the
 *   batch call yields nothing.
 */
export function parsePythonFile(file, raw) {
  const hasSource = typeof raw === 'string' && raw.length > 0;
  if (!file || !hasSource) return null;

  const isPython = /\.py$/i.test(file);
  const withinSizeCap = raw.length <= 1_000_000;
  if (!isPython || !withinSizeCap) return null;

  // Bail out before building a batch when no interpreter is usable.
  if (!probePythonAvailable().ok) return null;

  const results = parsePythonFilesBatch([{ file, content: raw }]);
  return results && results.length ? results[0] : null;
}
|
|
83
|
+
|
|
84
|
+
// Batch entry point. Pass [{file, content}, ...]; receive [{file, functions[], topLevel}, ...].
|
|
85
|
+
// Returns null on capability / subprocess failure — caller is expected to
|
|
86
|
+
// fall back to the regex parser.
|
|
87
|
+
/**
 * Parse a batch of Python files with a single helper subprocess.
 *
 * @param {Array<{file: string, content: string}>} entries
 *   Candidate files. Entries that are not `.py` paths, lack string content, or
 *   exceed 1,000,000 characters are dropped before the helper is invoked.
 * @returns {Array<object>|null}
 *   `[]` when there is nothing to parse; the helper's JSON array on success;
 *   `null` when Python is unavailable, the helper script is missing, the
 *   subprocess fails or times out, or its output is not a JSON array. A null
 *   result tells the caller to fall back to the regex parser.
 */
export function parsePythonFilesBatch(entries) {
  if (!Array.isArray(entries) || entries.length === 0) return [];
  const cap = probePythonAvailable();
  if (!cap.ok) return null;
  if (!fs.existsSync(HELPER_PATH)) return null;

  // Diagnostics are opt-in so a silent fallback stays silent by default.
  const debug = (msg) => {
    if (process.env.AGENTIC_SECURITY_PY_PARSER_DEBUG === '1') {
      process.stderr.write(`parser-py-cst: ${msg}\n`);
    }
  };

  const filtered = entries.filter(e =>
    e && typeof e.file === 'string' && /\.py$/i.test(e.file) &&
    typeof e.content === 'string' && e.content.length <= 1_000_000
  );
  if (filtered.length === 0) return [];

  let payload;
  try { payload = JSON.stringify(filtered); }
  catch { return null; }

  let r;
  try {
    r = cp.spawnSync(cap.python, [HELPER_PATH], {
      input: payload,
      encoding: 'utf8',
      // 10 s for a whole batch. The helper itself processes files in a
      // simple linear loop; on a 100-file repo a single-digit-second
      // budget is plenty. If a customer hits the timeout, the regex
      // parser fallback catches them.
      timeout: 10_000,
      maxBuffer: 64 * 1024 * 1024,
    });
  } catch (e) {
    debug(`spawn failed — ${e.message}`);
    return null;
  }

  // spawnSync does not throw for a timeout, a missing binary, or a maxBuffer
  // overflow; it sets `error` (usually with a null status). Any stdout captured
  // in that state may be truncated, so treat it as a failure and report why.
  if (r.error || r.status !== 0 || !r.stdout) {
    const cause = r.error ? ` error=${r.error.message}` : '';
    debug(`helper exit=${r.status}${cause} stderr=${r.stderr || ''}`);
    return null;
  }

  let out;
  try { out = JSON.parse(r.stdout); }
  catch (e) {
    debug(`helper output not JSON — ${e.message}`);
    return null;
  }

  // The documented contract is an array of per-file records; anything else
  // (an error object, a bare scalar) must not be handed on as a batch result.
  if (!Array.isArray(out)) {
    debug('helper output is not a JSON array');
    return null;
  }
  return out;
}
|
|
134
|
+
|
|
135
|
+
// Reset the cache — for tests.
|
|
136
|
+
/** Test hook: forget the memoized probe result so the next probe runs afresh. */
export function _resetCapabilityCacheForTests() {
  _capability = null;
}
|
|
@@ -0,0 +1,501 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
# Python IR helper for the agentic-security scanner.
|
|
3
|
+
#
|
|
4
|
+
# Reads a JSON list `[{"file": "...", "content": "..."}, ...]` from stdin.
|
|
5
|
+
# For each file, walks the Python AST (stdlib `ast`, no external deps) and
|
|
6
|
+
# emits the same IR shape the regex-based `parser-py.js` produces, but
|
|
7
|
+
# computed from a real parser. Writes a JSON array of `{file, functions[],
|
|
8
|
+
# topLevel}` blobs to stdout.
|
|
9
|
+
#
|
|
10
|
+
# IR shape (must mirror parser-py.js):
|
|
11
|
+
#
|
|
12
|
+
# { file, functions: [
|
|
13
|
+
# { qid, name, line, params, file,
|
|
14
|
+
# cfg: { entry: nodeId, exit: nodeId, nodes: { id: node } } }
|
|
15
|
+
# ], topLevel: null }
|
|
16
|
+
#
|
|
17
|
+
# node = {
|
|
18
|
+
# kind: 'entry' | 'exit' | 'noop' | 'loop-header' | 'assign' | 'call'
|
|
19
|
+
# | 'if' | 'return' | 'throw' | 'unknown',
|
|
20
|
+
# line, succ: [nodeId, ...], pred: [nodeId, ...],
|
|
21
|
+
# ...kind-specific fields
|
|
22
|
+
# }
|
|
23
|
+
#
|
|
24
|
+
# For assign: { target: str|None, source: expr }
|
|
25
|
+
# For call: { callee: str, args: [expr] }
|
|
26
|
+
# For if: { cond: expr }
|
|
27
|
+
# For return: { value: expr|None }
|
|
28
|
+
#
|
|
29
|
+
# expr = { kind: 'literal'|'ident'|'member'|'binary'|'logical'|'tpl'
|
|
30
|
+
#                  |'call'|'array'|'object'|'union'|'unknown',
|
|
31
|
+
# ...kind-specific fields }
|
|
32
|
+
#
|
|
33
|
+
# Constructs deliberately NOT yet lowered (emit `kind: 'unknown'`):
|
|
34
|
+
# - match statements (we tag the function as having one, but don't
|
|
35
|
+
# control-flow into it; future work).
|
|
36
|
+
#   - walrus assignment := (expression lowering forwards only the RHS value)
|
|
37
|
+
# - nested function defs inside comprehensions
|
|
38
|
+
# - decorators (function records keep the @-decorator names as metadata
|
|
39
|
+
# but the decorator expressions don't get full CFG nodes).
|
|
40
|
+
#
|
|
41
|
+
# Exit codes:
|
|
42
|
+
# 0 success — stdout is JSON
|
|
43
|
+
# 2 bad input (stdin not parseable)
|
|
44
|
+
# 3 no Python files in input
|
|
45
|
+
#
|
|
46
|
+
# This script is invoked by `scanner/src/ir/parser-py-cst.js`; never run
|
|
47
|
+
# directly by the scanner user.
|
|
48
|
+
|
|
49
|
+
import ast
|
|
50
|
+
import hashlib
|
|
51
|
+
import json
|
|
52
|
+
import sys
|
|
53
|
+
from typing import Any, Optional
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
# ─── ID generation ───────────────────────────────────────────────────────────
|
|
57
|
+
|
|
58
|
+
# Monotonic counter behind every CFG node id handed out by this process.
_node_id = 0


def _next_id() -> str:
    """Advance the module-wide counter and return the next id, e.g. ``pyn1``."""
    global _node_id
    _node_id = _node_id + 1
    return "pyn" + str(_node_id)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def _qid(file: str, name: str, line: int) -> str:
    """Build a function's qualified id: ``<file>::<name>@<line>#<digest>``.

    The digest is the first 8 hex characters of the SHA-1 of
    ``<file>:<name>:<line>`` encoded as UTF-8.
    """
    seed = "{}:{}:{}".format(file, name, line)
    digest = hashlib.sha1(seed.encode("utf-8")).hexdigest()
    return "{}::{}@{}#{}".format(file, name, line, digest[:8])
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
# ─── Expression lowering ─────────────────────────────────────────────────────
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _lower_expr(node: Optional[ast.AST]) -> dict[str, Any]:
    """Lower a Python AST expression into the JSON expression IR.

    Args:
        node: Any ``ast`` expression node, or ``None``.

    Returns:
        A dict whose ``kind`` is one of ``literal``, ``ident``, ``member``,
        ``tpl``, ``binary``, ``logical``, ``call``, ``array``, ``object``,
        ``union`` or ``unknown``, plus the fields for that kind. ``None`` and
        unrecognised node types lower to ``{"kind": "unknown"}``.

    Wrapper nodes (lambda, starred, walrus, unary op, await, yield) are
    transparent: the wrapped expression is lowered in their place.
    """
    if node is None:
        return {"kind": "unknown"}
    if isinstance(node, ast.Constant):
        v = node.value
        if isinstance(v, str):
            return {"kind": "literal", "value": repr(v)}
        if v is None:
            return {"kind": "literal", "value": "None"}
        if isinstance(v, float):
            import math  # local import: the module header does not import math

            # A literal such as `1e999` parses to float('inf'). Emitted as a
            # raw float it serialises to the bare token `Infinity`/`NaN`,
            # which is not valid JSON and makes the consumer reject the whole
            # batch — so fall back to the textual form instead.
            if not math.isfinite(v):
                return {"kind": "literal", "value": repr(v)}
        if isinstance(v, (int, float, bool)):
            return {"kind": "literal", "value": v}
        # bytes, complex, Ellipsis, ... are not JSON-representable as-is.
        return {"kind": "literal", "value": repr(v)}
    if isinstance(node, ast.Name):
        return {"kind": "ident", "name": node.id}
    if isinstance(node, ast.Attribute):
        return {"kind": "member", "object": _lower_expr(node.value), "prop": node.attr}
    if isinstance(node, ast.Subscript):
        # Surface as a member-with-slice; the slice expression itself is dropped.
        return {
            "kind": "member",
            "object": _lower_expr(node.value),
            "prop": "[]",
        }
    if isinstance(node, ast.JoinedStr):
        # f"...{expr}..." — only the interpolated expressions are kept; the
        # constant text segments are dropped.
        parts = []
        for p in node.values:
            if isinstance(p, ast.FormattedValue):
                parts.append(_lower_expr(p.value))
        return {"kind": "tpl", "parts": parts}
    if isinstance(node, ast.BinOp):
        op = type(node.op).__name__
        return {
            "kind": "binary", "op": op,
            "left": _lower_expr(node.left),
            "right": _lower_expr(node.right),
        }
    if isinstance(node, ast.BoolOp):
        # 'and' / 'or'. A multi-operand BoolOp (`a or b or c`) is folded
        # left-associatively into nested two-operand logical nodes.
        kind = "logical"
        op = "and" if isinstance(node.op, ast.And) else "or"
        vs = node.values or []
        if len(vs) == 0:
            return {"kind": "unknown"}
        cur = _lower_expr(vs[0])
        for v in vs[1:]:
            cur = {"kind": kind, "op": op, "left": cur, "right": _lower_expr(v)}
        return cur
    if isinstance(node, ast.Compare):
        # Only the first operand pair of a comparison chain is represented.
        left = _lower_expr(node.left)
        right = _lower_expr(node.comparators[0]) if node.comparators else {"kind": "unknown"}
        op = type(node.ops[0]).__name__ if node.ops else "Eq"
        return {"kind": "binary", "op": op, "left": left, "right": right}
    if isinstance(node, ast.Call):
        callee = _flatten_callee(node.func)
        args = [_lower_expr(a) for a in (node.args or [])]
        # Keyword args are appended after the positional ones; names are dropped.
        for kw in (node.keywords or []):
            args.append(_lower_expr(kw.value))
        return {"kind": "call", "callee": callee, "args": args}
    if isinstance(node, ast.List) or isinstance(node, ast.Tuple) or isinstance(node, ast.Set):
        return {"kind": "array", "elements": [_lower_expr(e) for e in (node.elts or [])]}
    if isinstance(node, ast.Dict):
        # Only the values are kept; keys are not represented.
        return {
            "kind": "object",
            "props": [
                {"value": _lower_expr(v)} for v in (node.values or [])
            ],
        }
    if isinstance(node, ast.IfExp):
        # Ternary `a if cond else b` — surface as union of both branches;
        # the condition is not represented.
        return {
            "kind": "union",
            "branches": [_lower_expr(node.body), _lower_expr(node.orelse)],
        }
    if isinstance(node, (ast.ListComp, ast.SetComp, ast.GeneratorExp)):
        # Comprehension — represented as a one-element array holding the
        # lowered element expression; the generators are not represented.
        return {
            "kind": "array",
            "elements": [_lower_expr(node.elt)],
        }
    if isinstance(node, ast.DictComp):
        return {
            "kind": "object",
            "props": [{"value": _lower_expr(node.value)}],
        }
    if isinstance(node, ast.Lambda):
        # The lambda is replaced by its lowered body, so whatever the body
        # references shows up in the enclosing expression.
        return _lower_expr(node.body)
    if isinstance(node, ast.Starred):
        return _lower_expr(node.value)
    if isinstance(node, ast.NamedExpr):
        # Walrus: `(x := expr)` — flow the RHS forward.
        return _lower_expr(node.value)
    if isinstance(node, ast.UnaryOp):
        return _lower_expr(node.operand)
    if isinstance(node, ast.Await):
        return _lower_expr(node.value)
    if isinstance(node, ast.Yield):
        return _lower_expr(node.value) if node.value else {"kind": "unknown"}
    if isinstance(node, ast.YieldFrom):
        return _lower_expr(node.value)
    return {"kind": "unknown"}
|
|
180
|
+
|
|
181
|
+
|
|
182
|
+
def _flatten_callee(node: ast.AST) -> Any:
    """Reduce a call's ``func`` expression to a callee name.

    Returns:
        - the bare name for ``foo``;
        - the dot-joined path for a pure attribute chain (``os.path.join``);
        - the outermost attribute name when the chain is rooted in something
          other than a plain name (``f()[0].attr`` gives ``"attr"``);
        - ``None`` for any other shape.

    A call or subscript at the top of the expression is looked through first,
    so ``g()()`` and ``x[0]`` resolve to ``"g"`` and ``"x"``.
    """
    # Peel leading call / subscript wrappers until something else is reached.
    while isinstance(node, (ast.Call, ast.Subscript)):
        node = node.func if isinstance(node, ast.Call) else node.value

    if isinstance(node, ast.Name):
        return node.id
    if not isinstance(node, ast.Attribute):
        return None

    # Collect attribute names from the outside in, then find the chain's root.
    attrs = []
    root = node
    while isinstance(root, ast.Attribute):
        attrs.append(root.attr)
        root = root.value

    if isinstance(root, ast.Name):
        return ".".join([root.id, *reversed(attrs)])
    # Mixed shape: only the outermost attribute name is reported.
    return attrs[0]
|
|
206
|
+
|
|
207
|
+
|
|
208
|
+
def _assign_target(node: ast.AST) -> Optional[str]:
|
|
209
|
+
"""Return a single identifier or dotted-path string for an assignment target,
|
|
210
|
+
or None for destructuring shapes we don't model."""
|
|
211
|
+
if isinstance(node, ast.Name):
|
|
212
|
+
return node.id
|
|
213
|
+
if isinstance(node, ast.Attribute):
|
|
214
|
+
parts: list[str] = []
|
|
215
|
+
cur: Any = node
|
|
216
|
+
while isinstance(cur, ast.Attribute):
|
|
217
|
+
parts.insert(0, cur.attr)
|
|
218
|
+
cur = cur.value
|
|
219
|
+
if isinstance(cur, ast.Name):
|
|
220
|
+
parts.insert(0, cur.id)
|
|
221
|
+
return ".".join(parts)
|
|
222
|
+
# ast.Tuple, ast.List, ast.Starred — destructuring, not yet modeled.
|
|
223
|
+
return None
|
|
224
|
+
|
|
225
|
+
|
|
226
|
+
# ─── CFG construction ────────────────────────────────────────────────────────
|
|
227
|
+
|
|
228
|
+
|
|
229
|
+
class CfgBuilder:
    """Walks a function body and emits a CFG matching the regex parser's shape.

    Every node is a dict stored in ``self.nodes`` under the id handed out by
    ``_next_id()``. Each node carries a ``kind``, a source ``line`` and
    ``succ``/``pred`` lists holding neighbouring node ids. The ``entry`` and
    ``exit`` nodes are created by the constructor.
    """

    def __init__(self, fn_name: str) -> None:
        self.fn_name = fn_name
        self.nodes: dict[str, dict[str, Any]] = {}
        self.entry = self._add({"kind": "entry", "line": 0})
        self.exit = self._add({"kind": "exit", "line": 0})

    def _add(self, node: dict[str, Any]) -> str:
        """Register ``node`` under a fresh id and return that id."""
        nid = _next_id()
        node.setdefault("succ", [])
        node.setdefault("pred", [])
        self.nodes[nid] = node
        return nid

    def _link(self, src_id: str, dst_id: str) -> None:
        """Record the edge ``src_id -> dst_id``; duplicate edges are ignored."""
        sn = self.nodes[src_id]
        dn = self.nodes[dst_id]
        if dst_id not in sn["succ"]:
            sn["succ"].append(dst_id)
        if src_id not in dn["pred"]:
            dn["pred"].append(src_id)

    def _chain(self, prev: str, node: dict[str, Any]) -> str:
        """Register ``node``, link it after ``prev`` and return its id."""
        nid = self._add(node)
        self._link(prev, nid)
        return nid

    def _finish_loop(self, stmt: Any, header: str, body_prev: str, line: int) -> str:
        """Lower a loop body plus optional ``else`` suite; return the join id.

        The body starts after ``body_prev`` and loops back to ``header``.
        The header always has an exit edge to the join node. When the loop
        has an ``else`` suite it is lowered as an additional branch from the
        header into the join, so its statements are part of the CFG.
        """
        body_tail = self._lower_block(stmt.body, body_prev)
        self._link(body_tail, header)
        # Loop exit edge (taken when the condition is false) goes to a join.
        join = self._chain(header, {"kind": "noop", "line": line})
        if stmt.orelse:
            else_tail = self._lower_block(stmt.orelse, header)
            self._link(else_tail, join)
        return join

    def lower(self, body: list[ast.stmt]) -> None:
        """Lower a whole function body between the entry and exit nodes."""
        tail = self._lower_block(body, self.entry)
        self._link(tail, self.exit)

    def _lower_block(self, body: list[ast.stmt], prev: str) -> str:
        """Lower a sequential list of statements; return the tail node id."""
        for stmt in body:
            prev = self._lower_stmt(stmt, prev)
        return prev

    def _lower_stmt(self, stmt: ast.stmt, prev: str) -> str:
        """Lower one statement after node ``prev``; return the new tail id."""
        line = getattr(stmt, "lineno", 0) or 0

        if isinstance(stmt, ast.Expr):
            # Bare expression: useful when it's a call (decorator pattern,
            # dispatch shape). `await f(x)` is unwrapped first so awaited
            # calls are not lost, mirroring how expressions treat Await as
            # transparent. Everything else is a noop.
            value: Any = stmt.value
            while isinstance(value, ast.Await):
                value = value.value
            if isinstance(value, ast.Call):
                return self._chain(prev, {
                    "kind": "call",
                    "callee": _flatten_callee(value.func),
                    "args": [_lower_expr(a) for a in (value.args or [])]
                    + [_lower_expr(kw.value) for kw in (value.keywords or [])],
                    "line": line,
                })
            return self._chain(prev, {"kind": "noop", "line": line})

        if isinstance(stmt, (ast.Assign, ast.AugAssign, ast.AnnAssign)):
            if isinstance(stmt, ast.AugAssign):
                # x += y  ->  assign x = x <op> y
                tgt = _assign_target(stmt.target)
                src = {
                    "kind": "binary",
                    "op": type(stmt.op).__name__,
                    "left": {"kind": "ident", "name": tgt or "?"},
                    "right": _lower_expr(stmt.value),
                }
            elif isinstance(stmt, ast.AnnAssign):
                # x: int = y  ->  assign x = y (noop when there is no value)
                tgt = _assign_target(stmt.target)
                if stmt.value is None:
                    return self._chain(prev, {"kind": "noop", "line": line})
                src = _lower_expr(stmt.value)
            else:
                # ast.Assign: targets may be multi (a = b = c). We use the first.
                tgt = _assign_target(stmt.targets[0]) if stmt.targets else None
                src = _lower_expr(stmt.value)
            return self._chain(
                prev, {"kind": "assign", "target": tgt, "source": src, "line": line}
            )

        if isinstance(stmt, ast.If):
            if_node = self._chain(prev, {
                "kind": "if",
                "cond": _lower_expr(stmt.test),
                "line": line,
            })
            t_tail = self._lower_block(stmt.body, if_node)
            join = self._chain(t_tail, {"kind": "noop", "line": line})
            if stmt.orelse:
                f_tail = self._lower_block(stmt.orelse, if_node)
                self._link(f_tail, join)
            else:
                self._link(if_node, join)
            return join

        if isinstance(stmt, (ast.For, ast.AsyncFor)):
            # for v in iter: body  ->  loop-header; assign v from iter; body
            lh = self._chain(prev, {"kind": "loop-header", "line": line})
            # Synthesize an assign for the loop variable so taint from the
            # iterable propagates to it. Targets that _assign_target cannot
            # name (destructuring) are skipped.
            tgt = _assign_target(stmt.target)
            if tgt is not None:
                body_prev = self._chain(lh, {
                    "kind": "assign", "target": tgt,
                    "source": _lower_expr(stmt.iter), "line": line,
                })
            else:
                body_prev = lh
            return self._finish_loop(stmt, lh, body_prev, line)

        if isinstance(stmt, ast.While):
            lh = self._chain(prev, {"kind": "loop-header", "line": line})
            return self._finish_loop(stmt, lh, lh, line)

        if isinstance(stmt, ast.Return):
            # Return implicitly flows to exit. We don't link here; the outer
            # `lower` method links the final tail to exit, and the engine
            # treats return as terminal.
            return self._chain(prev, {
                "kind": "return",
                "value": _lower_expr(stmt.value) if stmt.value else None,
                "line": line,
            })

        if isinstance(stmt, ast.Raise):
            return self._chain(prev, {"kind": "throw", "line": line})

        if isinstance(stmt, ast.Try):
            # try body + except handlers + finally. Treat the try body as a
            # plain sequential block; each except handler is an alternate
            # branch from the try head; finally runs after the union. This
            # is a conservative over-approximation that doesn't add false
            # taint but does see every reachable path.
            try_head = self._chain(prev, {"kind": "noop", "line": line})
            body_tail = self._lower_block(stmt.body, try_head)
            join = self._chain(body_tail, {"kind": "noop", "line": line})
            for handler in stmt.handlers:
                h_tail = self._lower_block(handler.body, try_head)
                self._link(h_tail, join)
            if stmt.orelse:
                else_tail = self._lower_block(stmt.orelse, body_tail)
                self._link(else_tail, join)
            if stmt.finalbody:
                return self._lower_block(stmt.finalbody, join)
            return join

        if isinstance(stmt, (ast.With, ast.AsyncWith)):
            # Treat `with X() as v: body` as `v = X()`-style assign followed by body.
            tail = prev
            for item in stmt.items:
                tgt = _assign_target(item.optional_vars) if item.optional_vars else None
                if tgt is not None:
                    tail = self._chain(tail, {
                        "kind": "assign", "target": tgt,
                        "source": _lower_expr(item.context_expr), "line": line,
                    })
            return self._lower_block(stmt.body, tail)

        if isinstance(stmt, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
            # Nested definition: emit a noop placeholder. The outer extractor
            # handles nested functions separately via ast.walk().
            return self._chain(prev, {"kind": "noop", "line": line})

        # ast.Match only exists on Python 3.10+; the getattr fallback keeps
        # this check from raising AttributeError on older interpreters.
        if isinstance(stmt, getattr(ast, "Match", ())):
            # Match statement: emit a noop for now. Future work: lower each
            # case as an alternate branch with its pattern guard.
            return self._chain(
                prev, {"kind": "noop", "line": line, "_unmodeled": "match"}
            )

        # ast.Pass, ast.Break, ast.Continue, ast.Import, ast.ImportFrom,
        # ast.Global, ast.Nonlocal, ast.Delete: all noops for taint.
        return self._chain(prev, {"kind": "noop", "line": line})
|
|
422
|
+
|
|
423
|
+
|
|
424
|
+
# ─── Function extraction ─────────────────────────────────────────────────────
|
|
425
|
+
|
|
426
|
+
|
|
427
|
+
def _extract_functions(tree: ast.Module, file: str) -> list[dict[str, Any]]:
|
|
428
|
+
"""Walk the module, capturing every function (top-level or nested) into
|
|
429
|
+
a flat list. Each function's body is lowered into a CFG."""
|
|
430
|
+
fns: list[dict[str, Any]] = []
|
|
431
|
+
for node in ast.walk(tree):
|
|
432
|
+
if not isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef)):
|
|
433
|
+
continue
|
|
434
|
+
params = [a.arg for a in node.args.args]
|
|
435
|
+
if node.args.vararg:
|
|
436
|
+
params.append(node.args.vararg.arg)
|
|
437
|
+
if node.args.kwarg:
|
|
438
|
+
params.append(node.args.kwarg.arg)
|
|
439
|
+
for a in node.args.kwonlyargs:
|
|
440
|
+
params.append(a.arg)
|
|
441
|
+
line = node.lineno or 0
|
|
442
|
+
builder = CfgBuilder(node.name)
|
|
443
|
+
builder.lower(node.body)
|
|
444
|
+
fns.append({
|
|
445
|
+
"qid": _qid(file, node.name, line),
|
|
446
|
+
"name": node.name,
|
|
447
|
+
"line": line,
|
|
448
|
+
"params": params,
|
|
449
|
+
"file": file,
|
|
450
|
+
"cfg": {
|
|
451
|
+
"entry": builder.entry,
|
|
452
|
+
"exit": builder.exit,
|
|
453
|
+
"nodes": builder.nodes,
|
|
454
|
+
},
|
|
455
|
+
})
|
|
456
|
+
return fns
|
|
457
|
+
|
|
458
|
+
|
|
459
|
+
# ─── Driver ──────────────────────────────────────────────────────────────────
|
|
460
|
+
|
|
461
|
+
|
|
462
|
+
def _process_one(file: str, content: str) -> dict[str, Any]:
|
|
463
|
+
if not isinstance(content, str):
|
|
464
|
+
return {"file": file, "functions": [], "topLevel": None, "_error": "content-not-string"}
|
|
465
|
+
if len(content) > 1_000_000:
|
|
466
|
+
return {"file": file, "functions": [], "topLevel": None, "_error": "file-too-large"}
|
|
467
|
+
try:
|
|
468
|
+
tree = ast.parse(content, filename=file)
|
|
469
|
+
except SyntaxError as e:
|
|
470
|
+
return {"file": file, "functions": [], "topLevel": None, "_error": f"syntax-error: {e.msg} (line {e.lineno})"}
|
|
471
|
+
fns = _extract_functions(tree, file)
|
|
472
|
+
return {"file": file, "functions": fns, "topLevel": None}
|
|
473
|
+
|
|
474
|
+
|
|
475
|
+
def main() -> int:
    """Read a JSON array of ``{"file", "content"}`` entries from stdin and
    write the per-file results as a JSON array to stdout.

    Entries that are not objects, or whose ``file`` is not a string ending in
    ``.py``, are skipped.

    Returns the process exit code:
      * 0 - results written to stdout
      * 2 - stdin was not valid JSON, or not a JSON array
      * 3 - the input contained no ``.py`` entries
    """
    try:
        payload = json.load(sys.stdin)
    except Exception as e:
        sys.stderr.write(f"parser-py.helper: bad stdin JSON: {e}\n")
        return 2
    if not isinstance(payload, list):
        sys.stderr.write("parser-py.helper: stdin must be a JSON array\n")
        return 2
    out: list[dict[str, Any]] = []
    for entry in payload:
        if not isinstance(entry, dict):
            continue
        file = entry.get("file") or ""
        # A non-string `file` (e.g. a number) is malformed input: skip it
        # like any other non-.py entry instead of crashing on `.endswith`.
        if not isinstance(file, str) or not file.endswith(".py"):
            continue
        content = entry.get("content") or ""
        out.append(_process_one(file, content))
    if not out:
        sys.stderr.write("parser-py.helper: no .py files in input\n")
        return 3
    json.dump(out, sys.stdout)
    return 0
|
|
498
|
+
|
|
499
|
+
|
|
500
|
+
if __name__ == "__main__":
    # Script entry point: main()'s return value becomes the exit status.
    raise SystemExit(main())
|