codejury 0.4.1__tar.gz → 0.5.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codejury-0.4.1 → codejury-0.5.0}/PKG-INFO +17 -5
- {codejury-0.4.1 → codejury-0.5.0}/README.md +16 -4
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/debate.py +18 -2
- codejury-0.5.0/codejury/agents/refuter.py +76 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/assembly.py +9 -1
- {codejury-0.4.1 → codejury-0.5.0}/codejury/cli.py +62 -4
- codejury-0.5.0/codejury/data/suppressions.yaml +43 -0
- codejury-0.5.0/codejury/integrations/__init__.py +1 -0
- codejury-0.5.0/codejury/integrations/github.py +88 -0
- codejury-0.5.0/codejury/orchestrators/challenge.py +67 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/resources.py +1 -0
- codejury-0.5.0/codejury/suppression.py +96 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/PKG-INFO +17 -5
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/SOURCES.txt +9 -0
- {codejury-0.4.1 → codejury-0.5.0}/pyproject.toml +1 -1
- codejury-0.5.0/tests/test_challenge.py +105 -0
- codejury-0.5.0/tests/test_integrations.py +82 -0
- codejury-0.5.0/tests/test_suppression.py +44 -0
- {codejury-0.4.1 → codejury-0.5.0}/LICENSE +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/parsing.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/verifier.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/authentication.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/authorization.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/business_logic.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/crypto.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/data_protection.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/dependency_config.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/error_logging.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/input_validation.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/output_encoding.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/secrets.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/session.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_bcrypt_password.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_jwt_noverify_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_jwt_verified_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_sha256_checksum_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_sha256_password.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authz_idor_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authz_owner_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/cmdi_ossystem_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/cmdi_subprocess_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/crypto_aesgcm_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/crypto_ecb_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/path_contained_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/path_traversal_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/secrets_env_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/secrets_hardcoded_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_format_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_fstring_query.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_parameterized_query.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/xss_innerhtml_constant_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/xss_innerhtml_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/tasks/audit_diff_debate.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/data/tasks/quick_scan_single.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/artifact.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/capability.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/context.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/observation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/result.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/evaluation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/infrastructure/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/infrastructure/json_parse.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/debate.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/pipeline.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/reflexion.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/single.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/anthropic.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/litellm.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/openai.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/openai_format.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/retry.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/reporting.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/callers.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/chunker.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/diff.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/function.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/repo.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/registry.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/dependency_links.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/entry_points.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/requires.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/top_level.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/setup.cfg +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_anthropic_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_assembly.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_audit_pipeline.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_callers.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_capability.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_cli_audit.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_context.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_debate_agents.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_debate_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_diff_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_evaluation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_function_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_json_parse.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_litellm_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_openai_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_pipeline_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_reflexion_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_repo_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_reporting.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_retry_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_tasks.py +0 -0
- {codejury-0.4.1 → codejury-0.5.0}/tests/test_verifier.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codejury
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.0
|
|
4
4
|
Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
|
|
5
5
|
Author: AISecLabs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -58,6 +58,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
|
|
|
58
58
|
## Quickstart
|
|
59
59
|
|
|
60
60
|
```bash
|
|
61
|
+
# CI gate: exit 1 if a high-severity issue is found
|
|
62
|
+
git diff origin/main... | codejury audit --fail-on high -
|
|
63
|
+
|
|
64
|
+
# Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
|
|
65
|
+
git diff origin/main... | codejury audit --github your-org/your-repo#123 -
|
|
66
|
+
|
|
61
67
|
# No API key needed -- prove the pipeline runs end to end with mock layers
|
|
62
68
|
codejury dry-run
|
|
63
69
|
|
|
@@ -76,9 +82,13 @@ git diff | codejury audit --provider anthropic
|
|
|
76
82
|
| `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
|
|
77
83
|
| `codejury eval` | Score the golden cases and report precision / recall. |
|
|
78
84
|
|
|
79
|
-
Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
|
|
85
|
+
Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
|
|
80
86
|
`--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
|
|
81
87
|
|
|
88
|
+
Findings in known-noise categories (availability/DoS, rate limiting, memory safety
|
|
89
|
+
outside C/C++) are dropped by versioned rules in
|
|
90
|
+
`codejury/data/suppressions.yaml`; disable with `--no-suppress`.
|
|
91
|
+
|
|
82
92
|
```bash
|
|
83
93
|
# Multi-round adversarial debate, rendered as Markdown
|
|
84
94
|
git diff | codejury audit --orchestrator debate --format markdown - > report.md
|
|
@@ -157,9 +167,11 @@ independently.
|
|
|
157
167
|
- **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
|
|
158
168
|
from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
|
|
159
169
|
ones like path traversal over-flag in single-file review because the verifier
|
|
160
|
-
can't see whether a value is attacker-controlled.
|
|
161
|
-
cross-file call sites for provenance
|
|
162
|
-
|
|
170
|
+
can't see whether a value is attacker-controlled. Mitigations that help but do
|
|
171
|
+
not fully solve it: `scan --callers` (cross-file call sites for provenance),
|
|
172
|
+
`--orchestrator challenge` (a recall-safe refutation pass that drops only
|
|
173
|
+
provably-safe flags), `--only` to scope, or `--orchestrator debate`. Real taint
|
|
174
|
+
precision needs data-flow analysis, not model skepticism.
|
|
163
175
|
- **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
|
|
164
176
|
not a quick check -- scope it with `--only`. Day to day, audit the diff.
|
|
165
177
|
|
|
@@ -29,6 +29,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
|
|
|
29
29
|
## Quickstart
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
+
# CI gate: exit 1 if a high-severity issue is found
|
|
33
|
+
git diff origin/main... | codejury audit --fail-on high -
|
|
34
|
+
|
|
35
|
+
# Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
|
|
36
|
+
git diff origin/main... | codejury audit --github your-org/your-repo#123 -
|
|
37
|
+
|
|
32
38
|
# No API key needed -- prove the pipeline runs end to end with mock layers
|
|
33
39
|
codejury dry-run
|
|
34
40
|
|
|
@@ -47,9 +53,13 @@ git diff | codejury audit --provider anthropic
|
|
|
47
53
|
| `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
|
|
48
54
|
| `codejury eval` | Score the golden cases and report precision / recall. |
|
|
49
55
|
|
|
50
|
-
Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
|
|
56
|
+
Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
|
|
51
57
|
`--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
|
|
52
58
|
|
|
59
|
+
Findings in known-noise categories (availability/DoS, rate limiting, memory safety
|
|
60
|
+
outside C/C++) are dropped by versioned rules in
|
|
61
|
+
`codejury/data/suppressions.yaml`; disable with `--no-suppress`.
|
|
62
|
+
|
|
53
63
|
```bash
|
|
54
64
|
# Multi-round adversarial debate, rendered as Markdown
|
|
55
65
|
git diff | codejury audit --orchestrator debate --format markdown - > report.md
|
|
@@ -128,9 +138,11 @@ independently.
|
|
|
128
138
|
- **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
|
|
129
139
|
from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
|
|
130
140
|
ones like path traversal over-flag in single-file review because the verifier
|
|
131
|
-
can't see whether a value is attacker-controlled.
|
|
132
|
-
cross-file call sites for provenance
|
|
133
|
-
|
|
141
|
+
can't see whether a value is attacker-controlled. Mitigations that help but do
|
|
142
|
+
not fully solve it: `scan --callers` (cross-file call sites for provenance),
|
|
143
|
+
`--orchestrator challenge` (a recall-safe refutation pass that drops only
|
|
144
|
+
provably-safe flags), `--only` to scope, or `--orchestrator debate`. Real taint
|
|
145
|
+
precision needs data-flow analysis, not model skepticism.
|
|
134
146
|
- **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
|
|
135
147
|
not a quick check -- scope it with `--only`. Day to day, audit the diff.
|
|
136
148
|
|
|
@@ -30,6 +30,20 @@ _FINDING_SHAPE = (
|
|
|
30
30
|
'"description": "...", "evidence": [{"file": "...", "line": 0, "code": "..."}], "confidence": 0.0}'
|
|
31
31
|
)
|
|
32
32
|
|
|
33
|
+
_DEEP_LENS = (
|
|
34
|
+
"Look past surface patterns for the deepest flaw:\n"
|
|
35
|
+
"- Trust anchors: what does this code trust to authenticate or authorize -- a key, token, header, "
|
|
36
|
+
"signature, role, or caller -- and who controls that value? If the attacker supplies what is used to "
|
|
37
|
+
"verify them (e.g. their own public key, an unconfigured key that disables verification), passing the "
|
|
38
|
+
"check proves nothing.\n"
|
|
39
|
+
"- Order of operations: is an external, irreversible, or privileged action performed before the local "
|
|
40
|
+
"state is committed, or before the check that should guard it? Can a check and the action it guards be "
|
|
41
|
+
"split apart under concurrency (race / TOCTOU) or partial failure (on-chain done, DB rolled back)?\n"
|
|
42
|
+
"- Attack chains: combine several weak points into one end-to-end exploit.\n"
|
|
43
|
+
"Prefer the deepest design/authorization/state flaw over surface issues like missing rate limiting or "
|
|
44
|
+
"verbose logging; report those only as secondary."
|
|
45
|
+
)
|
|
46
|
+
|
|
33
47
|
|
|
34
48
|
class _DebateAgent(Agent):
|
|
35
49
|
"""Shared provider plumbing for the three debate roles."""
|
|
@@ -60,7 +74,7 @@ class FinderAgent(_DebateAgent):
|
|
|
60
74
|
)
|
|
61
75
|
|
|
62
76
|
def run(self, ctx: AnalysisContext) -> list[Observation]:
|
|
63
|
-
parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _code(ctx.artifact)]
|
|
77
|
+
parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _DEEP_LENS, _code(ctx.artifact)]
|
|
64
78
|
if ctx.round_num > 1 and ctx.history:
|
|
65
79
|
parts.append(_render_history(ctx.history))
|
|
66
80
|
parts.append("Concede findings the rebuttals refute, keep the valid ones, and add any you missed.")
|
|
@@ -84,7 +98,9 @@ class ChallengerAgent(_DebateAgent):
|
|
|
84
98
|
def run(self, ctx: AnalysisContext) -> list[Observation]:
|
|
85
99
|
parts = [
|
|
86
100
|
"Challenge the findings below. For each one you believe is a false positive, write a rebuttal. "
|
|
87
|
-
"Add new_findings for any real issue that was missed
|
|
101
|
+
"Add new_findings for any real issue that was missed -- especially a deeper flaw the finder "
|
|
102
|
+
"stopped short of.",
|
|
103
|
+
_DEEP_LENS,
|
|
88
104
|
_code(ctx.artifact),
|
|
89
105
|
_render_history(ctx.history),
|
|
90
106
|
'Respond as JSON: {"rebuttals": [{"target": "finding title", "reason": "..."}], '
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""RefuterAgent -- a skeptic that tries to dismiss flagged verdicts as false positives.
|
|
2
|
+
|
|
3
|
+
Used by the challenge orchestrator: the verifier flags issues, then the refuter
|
|
4
|
+
gets the code plus the VULNERABLE verdicts (via ``ctx.history``) and argues which
|
|
5
|
+
are false positives -- e.g. a value that is not actually attacker-controlled or a
|
|
6
|
+
sink that is not reachable. It returns a Concession per verdict it refutes.
|
|
7
|
+
|
|
8
|
+
This is the cheap, focused alternative to a full debate: only flagged verdicts
|
|
9
|
+
are challenged, not the whole file.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from codejury.agents.base import Agent
|
|
15
|
+
from codejury.domain.context import AnalysisContext
|
|
16
|
+
from codejury.domain.observation import Concession, Observation, Verdict
|
|
17
|
+
from codejury.infrastructure.json_parse import extract_json_object
|
|
18
|
+
from codejury.providers.base import Message, Provider
|
|
19
|
+
|
|
20
|
+
# System prompt: biases the model toward KEEPING a flag unless the code proves
# the value is not attacker-controlled.
_SYSTEM = (
    "You are a careful security reviewer checking flagged issues for false positives. "
    "Security errs toward keeping a flag: refute one ONLY when the code in front of you "
    "affirmatively proves the value is not attacker-controlled. If a value's origin is not "
    "shown, or it could plausibly come from external/untrusted input, KEEP the flag. "
    "Respond with a single JSON object and nothing else."
)

# Example of the exact JSON object the model is asked to return.
_JSON_SHAPE = '{"refuted": [{"capability": "id.sub", "reason": "proof it is not attacker-controlled"}]}'


class RefuterAgent(Agent):
    """Ask the provider which flagged verdicts are false positives.

    The agent reads the ``Verdict`` observations found in ``ctx.history``,
    shows them to the model together with the code under review, and turns
    every well-formed entry of the model's ``"refuted"`` list into a
    ``Concession``.
    """

    def __init__(self, *, provider: Provider, model: str, max_tokens: int = 1024) -> None:
        """Store the provider, model name and completion token budget."""
        self._provider = provider
        self._model = model
        self._max_tokens = max_tokens

    def run(self, ctx: AnalysisContext) -> list[Observation]:
        """Return one ``Concession`` per verdict the model refutes.

        Returns an empty list, without calling the provider, when
        ``ctx.history`` holds no ``Verdict``. Malformed model output (no JSON
        object, a ``"refuted"`` value that is not a list, entries that are not
        objects, or a missing / non-string / blank ``"capability"``) is
        ignored rather than raised, so a sloppy reply can never drop a flag
        or crash the run.
        """
        flagged = [o for o in ctx.history if isinstance(o, Verdict)]
        if not flagged:
            return []
        flags = "\n".join(f"- {v.capability}: {v.reasoning}" for v in flagged)
        # Only include the call-site section when the artifact carries context.
        context_block = (
            f"Call sites elsewhere (for tracing where arguments come from):\n```\n{ctx.artifact.context}\n```\n\n"
            if ctx.artifact.context
            else ""
        )
        prompt = (
            f"Code under review ({ctx.artifact.path}):\n```\n{ctx.artifact.content}\n```\n\n"
            f"{context_block}"
            f"Flagged issues:\n{flags}\n\n"
            "This attacker-control reasoning applies ONLY to input-driven issues (injection, path "
            "traversal, SSRF). For those, refute a flag only if you can affirmatively prove the value "
            "is not attacker-controlled: a stored data field, or traced (here or in the call sites) to "
            "a trusted, config, or operator-supplied source. If its origin is not shown or could "
            "plausibly be external input, do NOT refute. For other issue types (hardcoded secrets, "
            "weak crypto, ...), a literal value is often the vulnerability itself -- do NOT refute "
            "those just because a value is constant.\n\n"
            "Respond with a single JSON object exactly like:\n" + _JSON_SHAPE
        )
        result = self._provider.complete(
            system=_SYSTEM,
            messages=[Message(role="user", content=prompt)],
            model=self._model,
            max_tokens=self._max_tokens,
        )
        obj = extract_json_object(result.text) or {}
        # The model may answer {"refuted": null} or some other non-list value;
        # treat that as "nothing refuted" instead of iterating it.
        refuted = obj.get("refuted") if isinstance(obj, dict) else None
        if not isinstance(refuted, list):
            return []
        out: list[Observation] = []
        for item in refuted:
            if not isinstance(item, dict):
                continue
            capability = item.get("capability")
            # A JSON null would stringify to the truthy text "None" and yield
            # a concession for a capability that does not exist.
            if not isinstance(capability, str):
                continue
            capability = capability.strip()
            if not capability:
                continue
            reason = item.get("reason")
            out.append(
                Concession(
                    capability=capability,
                    produced_by="refuter",
                    target=capability,
                    reason="" if reason is None else str(reason),
                )
            )
        return out
|
|
@@ -10,12 +10,14 @@ import os
|
|
|
10
10
|
|
|
11
11
|
from codejury.agents.base import Agent
|
|
12
12
|
from codejury.agents.debate import ChallengerAgent, FinderAgent, JudgeAgent
|
|
13
|
+
from codejury.agents.refuter import RefuterAgent
|
|
13
14
|
from codejury.agents.verifier import VerifierAgent
|
|
14
15
|
from codejury.domain.artifact import CodeArtifact
|
|
15
16
|
from codejury.domain.capability import Capability
|
|
16
17
|
from codejury.domain.context import AnalysisContext
|
|
17
18
|
from codejury.domain.result import AnalysisResult
|
|
18
19
|
from codejury.orchestrators.base import Orchestrator
|
|
20
|
+
from codejury.orchestrators.challenge import ChallengeOrchestrator
|
|
19
21
|
from codejury.orchestrators.debate import DebateOrchestrator
|
|
20
22
|
from codejury.orchestrators.pipeline import PipelineOrchestrator
|
|
21
23
|
from codejury.orchestrators.reflexion import ReflexionOrchestrator
|
|
@@ -27,7 +29,7 @@ from codejury.providers.openai import OpenAIProvider
|
|
|
27
29
|
from codejury.providers.retry import RetryProvider
|
|
28
30
|
from codejury.sources.base import Source
|
|
29
31
|
|
|
30
|
-
STRATEGIES = ("single", "pipeline", "debate", "reflexion")
|
|
32
|
+
STRATEGIES = ("single", "pipeline", "debate", "reflexion", "challenge")
|
|
31
33
|
PROVIDERS = ("anthropic", "openai", "litellm")
|
|
32
34
|
DEFAULT_MODEL = os.environ.get("CODEJURY_MODEL", "claude-sonnet-4-6")
|
|
33
35
|
DEFAULT_API_BASE = os.environ.get("CODEJURY_API_BASE")
|
|
@@ -61,6 +63,12 @@ def build_orchestration(
|
|
|
61
63
|
"critic": ChallengerAgent(provider=provider, model=model, max_tokens=max_tokens),
|
|
62
64
|
}
|
|
63
65
|
return agents, ReflexionOrchestrator()
|
|
66
|
+
if strategy == "challenge":
|
|
67
|
+
agents = {
|
|
68
|
+
"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens),
|
|
69
|
+
"refuter": RefuterAgent(provider=provider, model=model),
|
|
70
|
+
}
|
|
71
|
+
return agents, ChallengeOrchestrator()
|
|
64
72
|
verifier = {"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens)}
|
|
65
73
|
if strategy == "pipeline":
|
|
66
74
|
return verifier, PipelineOrchestrator()
|
|
@@ -9,6 +9,7 @@ library, backed by the Anthropic provider, under a chosen orchestration strategy
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import argparse
|
|
12
|
+
import os
|
|
12
13
|
import sys
|
|
13
14
|
|
|
14
15
|
from codejury.agents.mock import MockAgent
|
|
@@ -33,7 +34,9 @@ from codejury.orchestrators.single import SingleOrchestrator
|
|
|
33
34
|
from codejury.providers.base import Provider
|
|
34
35
|
from codejury.providers.mock import MockProvider
|
|
35
36
|
from codejury.reporting import to_json, to_markdown
|
|
36
|
-
from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, TASKS_DIR
|
|
37
|
+
from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, SUPPRESSIONS_FILE, TASKS_DIR
|
|
38
|
+
from codejury.suppression import filter_results, load_suppressions
|
|
39
|
+
from codejury.integrations.github import build_review, parse_pr_ref, post_review
|
|
37
40
|
from codejury.sources.chunker import Chunker
|
|
38
41
|
from codejury.sources.diff import DiffSource
|
|
39
42
|
from codejury.sources.repo import RepoSource
|
|
@@ -137,6 +140,50 @@ def _render_results(fmt: str, results: list[tuple[str, AnalysisResult]]) -> str:
|
|
|
137
140
|
return {"text": _render_audit, "markdown": to_markdown, "json": to_json}[fmt](results)
|
|
138
141
|
|
|
139
142
|
|
|
143
|
+
def _maybe_suppress(results: list[tuple[str, AnalysisResult]], enabled: bool) -> list[tuple[str, AnalysisResult]]:
|
|
144
|
+
if not enabled:
|
|
145
|
+
return results
|
|
146
|
+
filtered, suppressed = filter_results(results, load_suppressions(SUPPRESSIONS_FILE))
|
|
147
|
+
if suppressed:
|
|
148
|
+
print(f"suppressed {len(suppressed)} known-noise finding(s) by rule", file=sys.stderr)
|
|
149
|
+
return filtered
|
|
150
|
+
|
|
151
|
+
_FAIL_ON = ("critical", "high", "medium", "low")
|
|
152
|
+
_SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "info": 0}
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def _problem_rank(o: Observation) -> int:
|
|
156
|
+
if o.kind == "finding":
|
|
157
|
+
return _SEVERITY_RANK.get(o.severity.lower(), 2)
|
|
158
|
+
if o.kind == "verdict" and o.status == "VULNERABLE":
|
|
159
|
+
return _SEVERITY_RANK["high"]
|
|
160
|
+
if o.kind == "verdict" and o.status == "PARTIAL":
|
|
161
|
+
return _SEVERITY_RANK["medium"]
|
|
162
|
+
return -1
|
|
163
|
+
|
|
164
|
+
|
|
165
|
+
def _gate_exit(results: list[tuple[str, AnalysisResult]], fail_on: str | None) -> int:
|
|
166
|
+
if not fail_on:
|
|
167
|
+
return 0
|
|
168
|
+
worst = max((_problem_rank(o) for _, r in results for o in r.observations), default=-1)
|
|
169
|
+
return 1 if worst >= _SEVERITY_RANK[fail_on] else 0
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def _maybe_post_github(ref: str | None, results: list[tuple[str, AnalysisResult]]) -> None:
|
|
173
|
+
if not ref:
|
|
174
|
+
return
|
|
175
|
+
token = os.environ.get("GITHUB_TOKEN")
|
|
176
|
+
if not token:
|
|
177
|
+
print("GITHUB_TOKEN not set; skipping PR review", file=sys.stderr)
|
|
178
|
+
return
|
|
179
|
+
try:
|
|
180
|
+
owner, repo, pull = parse_pr_ref(ref)
|
|
181
|
+
post_review(owner, repo, pull, build_review(results), token=token)
|
|
182
|
+
print(f"posted review to {ref}", file=sys.stderr)
|
|
183
|
+
except Exception as exc:
|
|
184
|
+
print(f"github review failed: {exc}", file=sys.stderr)
|
|
185
|
+
|
|
186
|
+
|
|
140
187
|
def _render_metrics(m: Metrics) -> str:
|
|
141
188
|
return (
|
|
142
189
|
f"cases: {m.total} (tp={m.tp} fp={m.fp} tn={m.tn} fn={m.fn})\n"
|
|
@@ -168,6 +215,9 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
168
215
|
audit_p.add_argument("--retries", type=int, default=0, help="provider retry attempts on failure")
|
|
169
216
|
audit_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
|
|
170
217
|
audit_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
|
|
218
|
+
audit_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
219
|
+
audit_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
220
|
+
audit_p.add_argument("--github", default=None, help="post a PR review: owner/repo#number (needs GITHUB_TOKEN)")
|
|
171
221
|
|
|
172
222
|
scan_p = sub.add_parser("scan", help="audit a whole directory tree (deep, capability by capability)")
|
|
173
223
|
scan_p.add_argument("directory", help="directory to scan")
|
|
@@ -185,6 +235,8 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
185
235
|
)
|
|
186
236
|
scan_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
|
|
187
237
|
scan_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
|
|
238
|
+
scan_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
239
|
+
scan_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
188
240
|
|
|
189
241
|
run_p = sub.add_parser("run", help="run a named task preset against a unified diff")
|
|
190
242
|
run_p.add_argument("task", help="task name")
|
|
@@ -192,6 +244,8 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
192
244
|
run_p.add_argument("--tasks", default=TASKS_DIR, help="task YAML directory")
|
|
193
245
|
run_p.add_argument("--capabilities", default=CAPABILITIES_DIR, help="capability YAML directory")
|
|
194
246
|
run_p.add_argument("--format", choices=_FORMATS, default="text", dest="fmt")
|
|
247
|
+
run_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
248
|
+
run_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
195
249
|
|
|
196
250
|
eval_p = sub.add_parser("eval", help="score golden cases and report precision/recall")
|
|
197
251
|
eval_p.add_argument("--golden", default=GOLDEN_DIR, help="golden case YAML directory")
|
|
@@ -214,8 +268,10 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
214
268
|
max_tokens=args.max_tokens,
|
|
215
269
|
strategy=args.orchestrator,
|
|
216
270
|
)
|
|
271
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
217
272
|
print(_render_results(args.fmt, results))
|
|
218
|
-
|
|
273
|
+
_maybe_post_github(args.github, results)
|
|
274
|
+
return _gate_exit(results, args.fail_on)
|
|
219
275
|
|
|
220
276
|
if args.command == "scan":
|
|
221
277
|
capabilities = load_capabilities(args.capabilities)
|
|
@@ -234,8 +290,9 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
234
290
|
max_chars=args.max_chars,
|
|
235
291
|
with_callers=args.callers,
|
|
236
292
|
)
|
|
293
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
237
294
|
print(_render_results(args.fmt, results))
|
|
238
|
-
return
|
|
295
|
+
return _gate_exit(results, args.fail_on)
|
|
239
296
|
|
|
240
297
|
if args.command == "run":
|
|
241
298
|
tasks = load_tasks(args.tasks)
|
|
@@ -245,8 +302,9 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
245
302
|
results = run_task(
|
|
246
303
|
tasks[args.task], DiffSource(_read_diff(args.diff)), load_capabilities(args.capabilities)
|
|
247
304
|
)
|
|
305
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
248
306
|
print(_render_results(args.fmt, results))
|
|
249
|
-
return
|
|
307
|
+
return _gate_exit(results, args.fail_on)
|
|
250
308
|
|
|
251
309
|
if args.command == "eval":
|
|
252
310
|
try:
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Known-noise suppression rules (data-driven false-positive filter).
|
|
2
|
+
# Each rule drops a flagged finding whose text matches and whose path condition holds.
|
|
3
|
+
# Keep these to out-of-scope / low-signal CATEGORIES -- never key on a real
|
|
4
|
+
# vulnerability class, or you will drop true findings.
|
|
5
|
+
|
|
6
|
+
- id: SUP-AVAILABILITY
|
|
7
|
+
reason: availability / DoS / rate-limiting findings are out of scope and low-signal here
|
|
8
|
+
match_any:
|
|
9
|
+
- "denial of service"
|
|
10
|
+
- "denial-of-service"
|
|
11
|
+
- "rate limit"
|
|
12
|
+
- "rate-limit"
|
|
13
|
+
- "rate limiting"
|
|
14
|
+
- "resource exhaustion"
|
|
15
|
+
- "unbounded"
|
|
16
|
+
- "amplification"
|
|
17
|
+
|
|
18
|
+
- id: SUP-LOGGING-NOISE
|
|
19
|
+
reason: verbose / insufficient logging is noise unless a secret value is logged
|
|
20
|
+
match_any:
|
|
21
|
+
- "verbose logging"
|
|
22
|
+
- "insufficient logging"
|
|
23
|
+
- "excessive logging"
|
|
24
|
+
- "lack of logging"
|
|
25
|
+
- "log verbosity"
|
|
26
|
+
|
|
27
|
+
- id: SUP-MEMORY-SAFETY-NON-C
|
|
28
|
+
reason: memory-safety issues do not apply outside C/C++
|
|
29
|
+
match_any:
|
|
30
|
+
- "buffer overflow"
|
|
31
|
+
- "use after free"
|
|
32
|
+
- "use-after-free"
|
|
33
|
+
- "double free"
|
|
34
|
+
- "memory corruption"
|
|
35
|
+
- "out-of-bounds"
|
|
36
|
+
unless_path_ext: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
|
|
37
|
+
|
|
38
|
+
- id: SUP-REDOS
|
|
39
|
+
reason: regex denial-of-service / catastrophic backtracking is low-signal here
|
|
40
|
+
match_any:
|
|
41
|
+
- "redos"
|
|
42
|
+
- "catastrophic backtracking"
|
|
43
|
+
- "regex denial"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""codejury.integrations -- post results to external systems (GitHub PR reviews)."""
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Post audit results to a GitHub pull request as a review with inline comments.
|
|
2
|
+
|
|
3
|
+
``build_review`` is a pure function (results -> GitHub review payload) so it is
|
|
4
|
+
unit-testable; ``post_review`` does the HTTP POST and accepts an injectable
|
|
5
|
+
transport so it can be tested without a token or a live PR. Problems with a
|
|
6
|
+
usable file:line become inline comments; everything else is summarized in the
|
|
7
|
+
review body. The review requests changes when any problem is found.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import urllib.request
|
|
14
|
+
from typing import Any, Callable
|
|
15
|
+
|
|
16
|
+
from codejury.domain.observation import Observation
|
|
17
|
+
from codejury.domain.result import AnalysisResult
|
|
18
|
+
|
|
19
|
+
Results = list[tuple[str, AnalysisResult]]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_review(results: Results, *, max_comments: int = 50) -> dict:
    """Turn audit results into a GitHub pull-request review payload.

    A "problem" is a finding, or a verdict whose status is ``VULNERABLE``.
    Problems whose first evidence entry has both a file and a line become
    inline comments (at most ``max_comments`` of them). Problems without a
    usable location are listed in the review body rather than dropped, so they
    still count toward the total and still cause ``REQUEST_CHANGES``.

    Args:
        results: ``(label, result)`` pairs; each result's ``observations`` are
            scanned. The label is used to identify body-listed problems.
        max_comments: Cap on inline comments, and on the number of unanchored
            problems spelled out in the body.

    Returns:
        A dict with ``body``, ``event`` (``"REQUEST_CHANGES"`` when any problem
        was found, otherwise ``"COMMENT"``) and ``comments``.
    """
    comments: list[dict] = []
    unanchored: list[str] = []
    anchored = 0
    for path, result in results:
        for o in result.observations:
            headline = _headline(o)
            if headline is None:
                continue  # not a problem (e.g. a verdict that is not VULNERABLE)
            comment = _inline_comment(o)
            if comment is None:
                # No file:line to attach to -- keep it visible in the body.
                unanchored.append(f"- `{path}`: {headline}")
                continue
            anchored += 1
            if len(comments) < max_comments:
                comments.append(comment)

    problems = anchored + len(unanchored)
    body = f"codejury found {problems} issue(s)." if problems else "codejury found no issues."
    if anchored > len(comments):
        body += f" Showing {len(comments)} inline; {anchored - len(comments)} more omitted."
    if unanchored:
        listed = unanchored[: max(max_comments, 0)]
        body += f"\n\n{len(unanchored)} issue(s) have no file:line to anchor an inline comment:"
        if listed:
            body += "\n" + "\n".join(listed)
        if len(unanchored) > len(listed):
            body += f"\n- ... and {len(unanchored) - len(listed)} more"
    return {
        "body": body,
        "event": "REQUEST_CHANGES" if problems else "COMMENT",
        "comments": comments,
    }


def _headline(o: Observation) -> str | None:
    """Return the bold one-line summary of a problem, or None if ``o`` is not one."""
    if o.kind == "finding":
        cwe = f" ({o.cwe})" if o.cwe else ""
        return f"**{o.severity}{cwe}** {o.title}"
    if o.kind == "verdict" and o.status == "VULNERABLE":
        return f"**VULNERABLE** `{o.capability}`"
    return None


def _inline_comment(o: Observation) -> dict | None:
    """Build the inline review comment for a problem, or None if it cannot be placed.

    Only the first evidence entry is consulted; it must carry both a truthy
    ``file`` and a truthy ``line``. Non-problems also yield None.
    """
    headline = _headline(o)
    if headline is None:
        return None
    evidence = o.evidence[0] if o.evidence else None
    if not (evidence and evidence.file and evidence.line):
        return None
    # Findings explain themselves via ``description``; verdicts via ``reasoning``.
    detail = o.description if o.kind == "finding" else o.reasoning
    return {"path": evidence.file, "line": evidence.line, "body": f"{headline}\n\n{detail}"}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def post_review(
    owner: str,
    repo: str,
    pull: int,
    payload: dict,
    *,
    token: str,
    transport: Callable[[str, bytes, dict], Any] | None = None,
    timeout: float = 30.0,
) -> Any:
    """POST a review payload to a GitHub pull request.

    Args:
        owner: Repository owner (user or organisation).
        repo: Repository name.
        pull: Pull request number.
        payload: Review payload; serialised to JSON as the request body.
        token: Token sent as a ``Bearer`` credential in ``Authorization``.
        transport: Optional callable ``(url, data, headers)`` used instead of
            the network; its return value is returned unchanged.
        timeout: Seconds to wait on the real HTTP request before giving up, so
            a stalled connection cannot hang the caller indefinitely. Ignored
            when ``transport`` is supplied.

    Returns:
        The transport's return value, or the HTTP status of the response.

    Raises:
        urllib.error.URLError: On network failure, timeout, or (as its
            ``HTTPError`` subclass) an error status from the server.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull}/reviews"
    data = json.dumps(payload).encode()
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "Content-Type": "application/json",
    }
    if transport is not None:
        return transport(url, data, headers)
    request = urllib.request.Request(url, data=data, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.status
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def parse_pr_ref(ref: str) -> tuple[str, str, int]:
    """Parse 'owner/repo#123' into (owner, repo, pull_number).

    Raises:
        ValueError: If ``ref`` lacks an owner, a repo or a number, if the repo
            part contains a further ``/``, or if the number is not made up
            solely of ASCII digits.
    """
    repo_part, _, number = ref.partition("#")
    owner, _, repo = repo_part.partition("/")
    well_formed = (
        owner
        and repo
        # A second slash would leak into the API URL path as an extra segment.
        and "/" not in repo
        # str.isdigit() alone accepts characters like "²" that int() rejects.
        and number.isascii()
        and number.isdigit()
    )
    if not well_formed:
        raise ValueError(f"expected owner/repo#number, got {ref!r}")
    return owner, repo, int(number)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""ChallengeOrchestrator -- verify, then challenge the flagged verdicts.
|
|
2
|
+
|
|
3
|
+
The verifier rules on every capability; then a refuter is shown only the
|
|
4
|
+
VULNERABLE verdicts and the code, and argues which are false positives. A refuted
|
|
5
|
+
verdict becomes a dismissed Concession (recording why), so the report keeps the
|
|
6
|
+
SECURE/NOT_PRESENT verdicts, the surviving VULNERABLE ones, and a Dismissed list.
|
|
7
|
+
|
|
8
|
+
This targets taint-style false positives (which a lone verifier over-reports)
|
|
9
|
+
while paying the extra model call only for flagged verdicts, not the whole file.
|
|
10
|
+
|
|
11
|
+
Only verdicts from taint-prone capabilities are challenged. Local-pattern issues
|
|
12
|
+
(hardcoded secrets, weak crypto) are kept as-is: refuting them risks dropping a
|
|
13
|
+
real finding, and they do not have the attacker-control ambiguity that makes
|
|
14
|
+
taint checks over-report.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import dataclasses
|
|
20
|
+
|
|
21
|
+
from codejury.agents.base import Agent
|
|
22
|
+
from codejury.domain.context import AnalysisContext
|
|
23
|
+
from codejury.domain.observation import Concession, Observation, Verdict
|
|
24
|
+
from codejury.domain.result import AnalysisResult
|
|
25
|
+
from codejury.orchestrators.base import Orchestrator
|
|
26
|
+
|
|
27
|
+
_REQUIRED_ROLES = ("verifier", "refuter")
|
|
28
|
+
_DEFAULT_TAINT_CAPABILITIES = frozenset({"input_validation"})
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ChallengeOrchestrator(Orchestrator):
    """Run the verifier, then let a refuter challenge the flagged verdicts.

    Only ``VULNERABLE`` verdicts whose capability root (the part before the
    first ``.``) is in ``taint_capabilities`` are shown to the refuter. A
    verdict the refuter concedes is replaced by a ``Concession`` recording the
    reason; every other observation is passed through unchanged.
    """

    def __init__(self, *, taint_capabilities: frozenset[str] = _DEFAULT_TAINT_CAPABILITIES) -> None:
        """Store the capability roots whose VULNERABLE verdicts may be challenged."""
        self._taint_capabilities = taint_capabilities

    def run(self, agents: dict[str, Agent], context: AnalysisContext) -> AnalysisResult:
        """Verify, challenge the flagged verdicts, and return the merged observations.

        Args:
            agents: Must contain the ``"verifier"`` and ``"refuter"`` roles.
            context: Analysis context handed to the verifier; the refuter gets
                a copy whose ``history`` is the list of flagged verdicts.

        Returns:
            An ``AnalysisResult`` with the observations, or one carrying only
            ``error`` when a required role is missing.
        """
        missing = [role for role in _REQUIRED_ROLES if role not in agents]
        if missing:
            return AnalysisResult(error=f"challenge requires agents: {', '.join(missing)}")

        verdicts = agents["verifier"].run(context)
        flagged = [
            v
            for v in verdicts
            if isinstance(v, Verdict)
            and v.status == "VULNERABLE"
            and v.capability.split(".")[0] in self._taint_capabilities
        ]
        if not flagged:
            # Nothing to challenge: skip the extra refuter call entirely.
            return AnalysisResult(observations=verdicts)

        refutations = agents["refuter"].run(dataclasses.replace(context, history=flagged))
        # Honour only concessions aimed at verdicts that were actually put to
        # the refuter. Otherwise a stray target could dismiss a VULNERABLE
        # verdict from a capability that is meant to be kept as-is.
        challenged = {v.capability for v in flagged}
        reasons = {
            c.target: c.reason
            for c in refutations
            if isinstance(c, Concession) and c.target in challenged
        }

        observations: list[Observation] = []
        for v in verdicts:
            if isinstance(v, Verdict) and v.status == "VULNERABLE" and v.capability in reasons:
                observations.append(
                    Concession(
                        capability=v.capability,
                        produced_by="refuter",
                        target=v.capability,
                        reason=reasons[v.capability] or "refuted as a false positive",
                    )
                )
            else:
                observations.append(v)
        return AnalysisResult(observations=observations)
|