codejury 0.4.1__tar.gz → 0.5.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {codejury-0.4.1 → codejury-0.5.1}/PKG-INFO +20 -5
- {codejury-0.4.1 → codejury-0.5.1}/README.md +19 -4
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/debate.py +18 -2
- codejury-0.5.1/codejury/agents/refuter.py +76 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/assembly.py +9 -1
- {codejury-0.4.1 → codejury-0.5.1}/codejury/cli.py +73 -6
- codejury-0.5.1/codejury/data/suppressions.yaml +43 -0
- codejury-0.5.1/codejury/integrations/__init__.py +1 -0
- codejury-0.5.1/codejury/integrations/github.py +88 -0
- codejury-0.5.1/codejury/orchestrators/challenge.py +67 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/resources.py +1 -0
- codejury-0.5.1/codejury/sources/callers.py +104 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/repo.py +16 -2
- codejury-0.5.1/codejury/suppression.py +96 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/PKG-INFO +20 -5
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/SOURCES.txt +9 -0
- {codejury-0.4.1 → codejury-0.5.1}/pyproject.toml +1 -1
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_callers.py +27 -1
- codejury-0.5.1/tests/test_challenge.py +105 -0
- codejury-0.5.1/tests/test_integrations.py +82 -0
- codejury-0.5.1/tests/test_suppression.py +44 -0
- codejury-0.4.1/codejury/sources/callers.py +0 -46
- {codejury-0.4.1 → codejury-0.5.1}/LICENSE +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/parsing.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/verifier.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/authentication.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/authorization.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/business_logic.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/crypto.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/data_protection.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/dependency_config.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/error_logging.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/input_validation.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/output_encoding.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/secrets.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/session.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_bcrypt_password.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_jwt_noverify_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_jwt_verified_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_sha256_checksum_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_sha256_password.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authz_idor_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authz_owner_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/cmdi_ossystem_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/cmdi_subprocess_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/crypto_aesgcm_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/crypto_ecb_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/path_contained_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/path_traversal_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/secrets_env_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/secrets_hardcoded_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_format_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_fstring_query.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_parameterized_query.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/xss_innerhtml_constant_safe.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/xss_innerhtml_vuln.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/tasks/audit_diff_debate.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/data/tasks/quick_scan_single.yaml +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/artifact.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/capability.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/context.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/observation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/result.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/evaluation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/infrastructure/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/infrastructure/json_parse.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/debate.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/pipeline.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/reflexion.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/single.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/anthropic.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/litellm.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/openai.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/openai_format.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/retry.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/reporting.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/chunker.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/diff.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/function.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/mock.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/__init__.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/base.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/registry.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/dependency_links.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/entry_points.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/requires.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/top_level.txt +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/setup.cfg +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_anthropic_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_assembly.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_audit_pipeline.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_capability.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_cli_audit.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_context.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_debate_agents.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_debate_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_diff_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_evaluation.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_function_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_json_parse.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_litellm_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_openai_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_pipeline_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_reflexion_orchestrator.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_repo_source.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_reporting.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_retry_provider.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_tasks.py +0 -0
- {codejury-0.4.1 → codejury-0.5.1}/tests/test_verifier.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: codejury
|
|
3
|
-
Version: 0.4.1
|
|
3
|
+
Version: 0.5.1
|
|
4
4
|
Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
|
|
5
5
|
Author: AISecLabs
|
|
6
6
|
License-Expression: MIT
|
|
@@ -58,6 +58,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
|
|
|
58
58
|
## Quickstart
|
|
59
59
|
|
|
60
60
|
```bash
|
|
61
|
+
# CI gate: exit 1 if a high-severity issue is found
|
|
62
|
+
git diff origin/main... | codejury audit --fail-on high -
|
|
63
|
+
|
|
64
|
+
# Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
|
|
65
|
+
git diff origin/main... | codejury audit --github your-org/your-repo#123 -
|
|
66
|
+
|
|
61
67
|
# No API key needed -- prove the pipeline runs end to end with mock layers
|
|
62
68
|
codejury dry-run
|
|
63
69
|
|
|
@@ -76,9 +82,13 @@ git diff | codejury audit --provider anthropic
|
|
|
76
82
|
| `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
|
|
77
83
|
| `codejury eval` | Score the golden cases and report precision / recall. |
|
|
78
84
|
|
|
79
|
-
Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
|
|
85
|
+
Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
|
|
80
86
|
`--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
|
|
81
87
|
|
|
88
|
+
Findings in known-noise categories (availability/DoS, rate limiting, memory safety
|
|
89
|
+
outside C/C++) are dropped by versioned rules in
|
|
90
|
+
`codejury/data/suppressions.yaml`; disable with `--no-suppress`.
|
|
91
|
+
|
|
82
92
|
```bash
|
|
83
93
|
# Multi-round adversarial debate, rendered as Markdown
|
|
84
94
|
git diff | codejury audit --orchestrator debate --format markdown - > report.md
|
|
@@ -157,9 +167,14 @@ independently.
|
|
|
157
167
|
- **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
|
|
158
168
|
from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
|
|
159
169
|
ones like path traversal over-flag in single-file review because the verifier
|
|
160
|
-
can't see whether a value is attacker-controlled.
|
|
161
|
-
|
|
162
|
-
|
|
170
|
+
can't see whether a value is attacker-controlled. Mitigations that add context
|
|
171
|
+
but do not fully solve it: `scan --callers` (where this file's functions are
|
|
172
|
+
called) and `scan --callees` (the called code it delegates to, so a sink in
|
|
173
|
+
another file is visible) -- pair them for both directions; `--orchestrator
|
|
174
|
+
challenge` (a recall-safe
|
|
175
|
+
refutation pass that drops only provably-safe flags); `--only` to scope; or
|
|
176
|
+
`--orchestrator debate`. Real taint precision still needs data-flow analysis,
|
|
177
|
+
not model skepticism.
|
|
163
178
|
- **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
|
|
164
179
|
not a quick check -- scope it with `--only`. Day to day, audit the diff.
|
|
165
180
|
|
|
@@ -29,6 +29,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
|
|
|
29
29
|
## Quickstart
|
|
30
30
|
|
|
31
31
|
```bash
|
|
32
|
+
# CI gate: exit 1 if a high-severity issue is found
|
|
33
|
+
git diff origin/main... | codejury audit --fail-on high -
|
|
34
|
+
|
|
35
|
+
# Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
|
|
36
|
+
git diff origin/main... | codejury audit --github your-org/your-repo#123 -
|
|
37
|
+
|
|
32
38
|
# No API key needed -- prove the pipeline runs end to end with mock layers
|
|
33
39
|
codejury dry-run
|
|
34
40
|
|
|
@@ -47,9 +53,13 @@ git diff | codejury audit --provider anthropic
|
|
|
47
53
|
| `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
|
|
48
54
|
| `codejury eval` | Score the golden cases and report precision / recall. |
|
|
49
55
|
|
|
50
|
-
Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
|
|
56
|
+
Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
|
|
51
57
|
`--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
|
|
52
58
|
|
|
59
|
+
Findings in known-noise categories (availability/DoS, rate limiting, memory safety
|
|
60
|
+
outside C/C++) are dropped by versioned rules in
|
|
61
|
+
`codejury/data/suppressions.yaml`; disable with `--no-suppress`.
|
|
62
|
+
|
|
53
63
|
```bash
|
|
54
64
|
# Multi-round adversarial debate, rendered as Markdown
|
|
55
65
|
git diff | codejury audit --orchestrator debate --format markdown - > report.md
|
|
@@ -128,9 +138,14 @@ independently.
|
|
|
128
138
|
- **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
|
|
129
139
|
from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
|
|
130
140
|
ones like path traversal over-flag in single-file review because the verifier
|
|
131
|
-
can't see whether a value is attacker-controlled.
|
|
132
|
-
|
|
133
|
-
|
|
141
|
+
can't see whether a value is attacker-controlled. Mitigations that add context
|
|
142
|
+
but do not fully solve it: `scan --callers` (where this file's functions are
|
|
143
|
+
called) and `scan --callees` (the called code it delegates to, so a sink in
|
|
144
|
+
another file is visible) -- pair them for both directions; `--orchestrator
|
|
145
|
+
challenge` (a recall-safe
|
|
146
|
+
refutation pass that drops only provably-safe flags); `--only` to scope; or
|
|
147
|
+
`--orchestrator debate`. Real taint precision still needs data-flow analysis,
|
|
148
|
+
not model skepticism.
|
|
134
149
|
- **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
|
|
135
150
|
not a quick check -- scope it with `--only`. Day to day, audit the diff.
|
|
136
151
|
|
|
@@ -30,6 +30,20 @@ _FINDING_SHAPE = (
|
|
|
30
30
|
'"description": "...", "evidence": [{"file": "...", "line": 0, "code": "..."}], "confidence": 0.0}'
|
|
31
31
|
)
|
|
32
32
|
|
|
33
|
+
_DEEP_LENS = (
|
|
34
|
+
"Look past surface patterns for the deepest flaw:\n"
|
|
35
|
+
"- Trust anchors: what does this code trust to authenticate or authorize -- a key, token, header, "
|
|
36
|
+
"signature, role, or caller -- and who controls that value? If the attacker supplies what is used to "
|
|
37
|
+
"verify them (e.g. their own public key, an unconfigured key that disables verification), passing the "
|
|
38
|
+
"check proves nothing.\n"
|
|
39
|
+
"- Order of operations: is an external, irreversible, or privileged action performed before the local "
|
|
40
|
+
"state is committed, or before the check that should guard it? Can a check and the action it guards be "
|
|
41
|
+
"split apart under concurrency (race / TOCTOU) or partial failure (on-chain done, DB rolled back)?\n"
|
|
42
|
+
"- Attack chains: combine several weak points into one end-to-end exploit.\n"
|
|
43
|
+
"Prefer the deepest design/authorization/state flaw over surface issues like missing rate limiting or "
|
|
44
|
+
"verbose logging; report those only as secondary."
|
|
45
|
+
)
|
|
46
|
+
|
|
33
47
|
|
|
34
48
|
class _DebateAgent(Agent):
|
|
35
49
|
"""Shared provider plumbing for the three debate roles."""
|
|
@@ -60,7 +74,7 @@ class FinderAgent(_DebateAgent):
|
|
|
60
74
|
)
|
|
61
75
|
|
|
62
76
|
def run(self, ctx: AnalysisContext) -> list[Observation]:
|
|
63
|
-
parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _code(ctx.artifact)]
|
|
77
|
+
parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _DEEP_LENS, _code(ctx.artifact)]
|
|
64
78
|
if ctx.round_num > 1 and ctx.history:
|
|
65
79
|
parts.append(_render_history(ctx.history))
|
|
66
80
|
parts.append("Concede findings the rebuttals refute, keep the valid ones, and add any you missed.")
|
|
@@ -84,7 +98,9 @@ class ChallengerAgent(_DebateAgent):
|
|
|
84
98
|
def run(self, ctx: AnalysisContext) -> list[Observation]:
|
|
85
99
|
parts = [
|
|
86
100
|
"Challenge the findings below. For each one you believe is a false positive, write a rebuttal. "
|
|
87
|
-
"Add new_findings for any real issue that was missed.",
|
|
101
|
+
"Add new_findings for any real issue that was missed -- especially a deeper flaw the finder "
|
|
102
|
+
"stopped short of.",
|
|
103
|
+
_DEEP_LENS,
|
|
88
104
|
_code(ctx.artifact),
|
|
89
105
|
_render_history(ctx.history),
|
|
90
106
|
'Respond as JSON: {"rebuttals": [{"target": "finding title", "reason": "..."}], '
|
|
@@ -0,0 +1,76 @@
|
|
|
1
|
+
"""RefuterAgent -- a skeptic that tries to dismiss flagged verdicts as false positives.
|
|
2
|
+
|
|
3
|
+
Used by the challenge orchestrator: the verifier flags issues, then the refuter
|
|
4
|
+
gets the code plus the VULNERABLE verdicts (via ``ctx.history``) and argues which
|
|
5
|
+
are false positives -- e.g. a value that is not actually attacker-controlled or a
|
|
6
|
+
sink that is not reachable. It returns a Concession per verdict it refutes.
|
|
7
|
+
|
|
8
|
+
This is the cheap, focused alternative to a full debate: only flagged verdicts
|
|
9
|
+
are challenged, not the whole file.
|
|
10
|
+
"""
|
|
11
|
+
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
from codejury.agents.base import Agent
|
|
15
|
+
from codejury.domain.context import AnalysisContext
|
|
16
|
+
from codejury.domain.observation import Concession, Observation, Verdict
|
|
17
|
+
from codejury.infrastructure.json_parse import extract_json_object
|
|
18
|
+
from codejury.providers.base import Message, Provider
|
|
19
|
+
|
|
20
|
+
_SYSTEM = (
|
|
21
|
+
"You are a careful security reviewer checking flagged issues for false positives. "
|
|
22
|
+
"Security errs toward keeping a flag: refute one ONLY when the code in front of you "
|
|
23
|
+
"affirmatively proves the value is not attacker-controlled. If a value's origin is not "
|
|
24
|
+
"shown, or it could plausibly come from external/untrusted input, KEEP the flag. "
|
|
25
|
+
"Respond with a single JSON object and nothing else."
|
|
26
|
+
)
|
|
27
|
+
|
|
28
|
+
_JSON_SHAPE = '{"refuted": [{"capability": "id.sub", "reason": "proof it is not attacker-controlled"}]}'
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class RefuterAgent(Agent):
|
|
32
|
+
def __init__(self, *, provider: Provider, model: str, max_tokens: int = 1024) -> None:
|
|
33
|
+
self._provider = provider
|
|
34
|
+
self._model = model
|
|
35
|
+
self._max_tokens = max_tokens
|
|
36
|
+
|
|
37
|
+
def run(self, ctx: AnalysisContext) -> list[Observation]:
|
|
38
|
+
flagged = [o for o in ctx.history if isinstance(o, Verdict)]
|
|
39
|
+
if not flagged:
|
|
40
|
+
return []
|
|
41
|
+
flags = "\n".join(f"- {v.capability}: {v.reasoning}" for v in flagged)
|
|
42
|
+
context_block = (
|
|
43
|
+
f"Call sites elsewhere (for tracing where arguments come from):\n```\n{ctx.artifact.context}\n```\n\n"
|
|
44
|
+
if ctx.artifact.context
|
|
45
|
+
else ""
|
|
46
|
+
)
|
|
47
|
+
prompt = (
|
|
48
|
+
f"Code under review ({ctx.artifact.path}):\n```\n{ctx.artifact.content}\n```\n\n"
|
|
49
|
+
f"{context_block}"
|
|
50
|
+
f"Flagged issues:\n{flags}\n\n"
|
|
51
|
+
"This attacker-control reasoning applies ONLY to input-driven issues (injection, path "
|
|
52
|
+
"traversal, SSRF). For those, refute a flag only if you can affirmatively prove the value "
|
|
53
|
+
"is not attacker-controlled: a stored data field, or traced (here or in the call sites) to "
|
|
54
|
+
"a trusted, config, or operator-supplied source. If its origin is not shown or could "
|
|
55
|
+
"plausibly be external input, do NOT refute. For other issue types (hardcoded secrets, "
|
|
56
|
+
"weak crypto, ...), a literal value is often the vulnerability itself -- do NOT refute "
|
|
57
|
+
"those just because a value is constant.\n\n"
|
|
58
|
+
"Respond with a single JSON object exactly like:\n" + _JSON_SHAPE
|
|
59
|
+
)
|
|
60
|
+
result = self._provider.complete(
|
|
61
|
+
system=_SYSTEM,
|
|
62
|
+
messages=[Message(role="user", content=prompt)],
|
|
63
|
+
model=self._model,
|
|
64
|
+
max_tokens=self._max_tokens,
|
|
65
|
+
)
|
|
66
|
+
obj = extract_json_object(result.text) or {}
|
|
67
|
+
out: list[Observation] = []
|
|
68
|
+
for item in obj.get("refuted", []):
|
|
69
|
+
if not isinstance(item, dict):
|
|
70
|
+
continue
|
|
71
|
+
capability = str(item.get("capability", "")).strip()
|
|
72
|
+
if capability:
|
|
73
|
+
out.append(
|
|
74
|
+
Concession(capability=capability, produced_by="refuter", target=capability, reason=str(item.get("reason", "")))
|
|
75
|
+
)
|
|
76
|
+
return out
|
|
@@ -10,12 +10,14 @@ import os
|
|
|
10
10
|
|
|
11
11
|
from codejury.agents.base import Agent
|
|
12
12
|
from codejury.agents.debate import ChallengerAgent, FinderAgent, JudgeAgent
|
|
13
|
+
from codejury.agents.refuter import RefuterAgent
|
|
13
14
|
from codejury.agents.verifier import VerifierAgent
|
|
14
15
|
from codejury.domain.artifact import CodeArtifact
|
|
15
16
|
from codejury.domain.capability import Capability
|
|
16
17
|
from codejury.domain.context import AnalysisContext
|
|
17
18
|
from codejury.domain.result import AnalysisResult
|
|
18
19
|
from codejury.orchestrators.base import Orchestrator
|
|
20
|
+
from codejury.orchestrators.challenge import ChallengeOrchestrator
|
|
19
21
|
from codejury.orchestrators.debate import DebateOrchestrator
|
|
20
22
|
from codejury.orchestrators.pipeline import PipelineOrchestrator
|
|
21
23
|
from codejury.orchestrators.reflexion import ReflexionOrchestrator
|
|
@@ -27,7 +29,7 @@ from codejury.providers.openai import OpenAIProvider
|
|
|
27
29
|
from codejury.providers.retry import RetryProvider
|
|
28
30
|
from codejury.sources.base import Source
|
|
29
31
|
|
|
30
|
-
STRATEGIES = ("single", "pipeline", "debate", "reflexion")
|
|
32
|
+
STRATEGIES = ("single", "pipeline", "debate", "reflexion", "challenge")
|
|
31
33
|
PROVIDERS = ("anthropic", "openai", "litellm")
|
|
32
34
|
DEFAULT_MODEL = os.environ.get("CODEJURY_MODEL", "claude-sonnet-4-6")
|
|
33
35
|
DEFAULT_API_BASE = os.environ.get("CODEJURY_API_BASE")
|
|
@@ -61,6 +63,12 @@ def build_orchestration(
|
|
|
61
63
|
"critic": ChallengerAgent(provider=provider, model=model, max_tokens=max_tokens),
|
|
62
64
|
}
|
|
63
65
|
return agents, ReflexionOrchestrator()
|
|
66
|
+
if strategy == "challenge":
|
|
67
|
+
agents = {
|
|
68
|
+
"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens),
|
|
69
|
+
"refuter": RefuterAgent(provider=provider, model=model),
|
|
70
|
+
}
|
|
71
|
+
return agents, ChallengeOrchestrator()
|
|
64
72
|
verifier = {"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens)}
|
|
65
73
|
if strategy == "pipeline":
|
|
66
74
|
return verifier, PipelineOrchestrator()
|
|
@@ -9,6 +9,7 @@ library, backed by the Anthropic provider, under a chosen orchestration strategy
|
|
|
9
9
|
from __future__ import annotations
|
|
10
10
|
|
|
11
11
|
import argparse
|
|
12
|
+
import os
|
|
12
13
|
import sys
|
|
13
14
|
|
|
14
15
|
from codejury.agents.mock import MockAgent
|
|
@@ -33,7 +34,9 @@ from codejury.orchestrators.single import SingleOrchestrator
|
|
|
33
34
|
from codejury.providers.base import Provider
|
|
34
35
|
from codejury.providers.mock import MockProvider
|
|
35
36
|
from codejury.reporting import to_json, to_markdown
|
|
36
|
-
from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, TASKS_DIR
|
|
37
|
+
from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, SUPPRESSIONS_FILE, TASKS_DIR
|
|
38
|
+
from codejury.suppression import filter_results, load_suppressions
|
|
39
|
+
from codejury.integrations.github import build_review, parse_pr_ref, post_review
|
|
37
40
|
from codejury.sources.chunker import Chunker
|
|
38
41
|
from codejury.sources.diff import DiffSource
|
|
39
42
|
from codejury.sources.repo import RepoSource
|
|
@@ -83,10 +86,15 @@ def scan(
|
|
|
83
86
|
extensions: tuple[str, ...] = (".py",),
|
|
84
87
|
max_chars: int = 200_000,
|
|
85
88
|
with_callers: bool = False,
|
|
89
|
+
with_callees: bool = False,
|
|
86
90
|
) -> list[tuple[str, AnalysisResult]]:
|
|
87
91
|
"""Audit every matching file in a directory tree, returning (path, result) per artifact."""
|
|
88
92
|
source = RepoSource(
|
|
89
|
-
directory,
|
|
93
|
+
directory,
|
|
94
|
+
extensions=extensions,
|
|
95
|
+
chunker=Chunker(max_chars=max_chars),
|
|
96
|
+
with_callers=with_callers,
|
|
97
|
+
with_callees=with_callees,
|
|
90
98
|
)
|
|
91
99
|
artifacts = source.list_artifacts()
|
|
92
100
|
calls = len(artifacts) * len(capabilities)
|
|
@@ -137,6 +145,50 @@ def _render_results(fmt: str, results: list[tuple[str, AnalysisResult]]) -> str:
|
|
|
137
145
|
return {"text": _render_audit, "markdown": to_markdown, "json": to_json}[fmt](results)
|
|
138
146
|
|
|
139
147
|
|
|
148
|
+
def _maybe_suppress(results: list[tuple[str, AnalysisResult]], enabled: bool) -> list[tuple[str, AnalysisResult]]:
|
|
149
|
+
if not enabled:
|
|
150
|
+
return results
|
|
151
|
+
filtered, suppressed = filter_results(results, load_suppressions(SUPPRESSIONS_FILE))
|
|
152
|
+
if suppressed:
|
|
153
|
+
print(f"suppressed {len(suppressed)} known-noise finding(s) by rule", file=sys.stderr)
|
|
154
|
+
return filtered
|
|
155
|
+
|
|
156
|
+
_FAIL_ON = ("critical", "high", "medium", "low")
|
|
157
|
+
_SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "info": 0}
|
|
158
|
+
|
|
159
|
+
|
|
160
|
+
def _problem_rank(o: Observation) -> int:
|
|
161
|
+
if o.kind == "finding":
|
|
162
|
+
return _SEVERITY_RANK.get(o.severity.lower(), 2)
|
|
163
|
+
if o.kind == "verdict" and o.status == "VULNERABLE":
|
|
164
|
+
return _SEVERITY_RANK["high"]
|
|
165
|
+
if o.kind == "verdict" and o.status == "PARTIAL":
|
|
166
|
+
return _SEVERITY_RANK["medium"]
|
|
167
|
+
return -1
|
|
168
|
+
|
|
169
|
+
|
|
170
|
+
def _gate_exit(results: list[tuple[str, AnalysisResult]], fail_on: str | None) -> int:
|
|
171
|
+
if not fail_on:
|
|
172
|
+
return 0
|
|
173
|
+
worst = max((_problem_rank(o) for _, r in results for o in r.observations), default=-1)
|
|
174
|
+
return 1 if worst >= _SEVERITY_RANK[fail_on] else 0
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def _maybe_post_github(ref: str | None, results: list[tuple[str, AnalysisResult]]) -> None:
|
|
178
|
+
if not ref:
|
|
179
|
+
return
|
|
180
|
+
token = os.environ.get("GITHUB_TOKEN")
|
|
181
|
+
if not token:
|
|
182
|
+
print("GITHUB_TOKEN not set; skipping PR review", file=sys.stderr)
|
|
183
|
+
return
|
|
184
|
+
try:
|
|
185
|
+
owner, repo, pull = parse_pr_ref(ref)
|
|
186
|
+
post_review(owner, repo, pull, build_review(results), token=token)
|
|
187
|
+
print(f"posted review to {ref}", file=sys.stderr)
|
|
188
|
+
except Exception as exc:
|
|
189
|
+
print(f"github review failed: {exc}", file=sys.stderr)
|
|
190
|
+
|
|
191
|
+
|
|
140
192
|
def _render_metrics(m: Metrics) -> str:
|
|
141
193
|
return (
|
|
142
194
|
f"cases: {m.total} (tp={m.tp} fp={m.fp} tn={m.tn} fn={m.fn})\n"
|
|
@@ -168,6 +220,9 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
168
220
|
audit_p.add_argument("--retries", type=int, default=0, help="provider retry attempts on failure")
|
|
169
221
|
audit_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
|
|
170
222
|
audit_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
|
|
223
|
+
audit_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
224
|
+
audit_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
225
|
+
audit_p.add_argument("--github", default=None, help="post a PR review: owner/repo#number (needs GITHUB_TOKEN)")
|
|
171
226
|
|
|
172
227
|
scan_p = sub.add_parser("scan", help="audit a whole directory tree (deep, capability by capability)")
|
|
173
228
|
scan_p.add_argument("directory", help="directory to scan")
|
|
@@ -181,10 +236,15 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
181
236
|
scan_p.add_argument("--max-tokens", type=int, default=2048)
|
|
182
237
|
scan_p.add_argument("--max-chars", type=int, default=200_000, help="chunk budget; default keeps whole files")
|
|
183
238
|
scan_p.add_argument(
|
|
184
|
-
"--callers", action="store_true", help="add cross-file
|
|
239
|
+
"--callers", action="store_true", help="add cross-file context: where this file's functions are called"
|
|
240
|
+
)
|
|
241
|
+
scan_p.add_argument(
|
|
242
|
+
"--callees", action="store_true", help="add cross-file context: the called code this file delegates to"
|
|
185
243
|
)
|
|
186
244
|
scan_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
|
|
187
245
|
scan_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
|
|
246
|
+
scan_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
247
|
+
scan_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
188
248
|
|
|
189
249
|
run_p = sub.add_parser("run", help="run a named task preset against a unified diff")
|
|
190
250
|
run_p.add_argument("task", help="task name")
|
|
@@ -192,6 +252,8 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
192
252
|
run_p.add_argument("--tasks", default=TASKS_DIR, help="task YAML directory")
|
|
193
253
|
run_p.add_argument("--capabilities", default=CAPABILITIES_DIR, help="capability YAML directory")
|
|
194
254
|
run_p.add_argument("--format", choices=_FORMATS, default="text", dest="fmt")
|
|
255
|
+
run_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
|
|
256
|
+
run_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
|
|
195
257
|
|
|
196
258
|
eval_p = sub.add_parser("eval", help="score golden cases and report precision/recall")
|
|
197
259
|
eval_p.add_argument("--golden", default=GOLDEN_DIR, help="golden case YAML directory")
|
|
@@ -214,8 +276,10 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
214
276
|
max_tokens=args.max_tokens,
|
|
215
277
|
strategy=args.orchestrator,
|
|
216
278
|
)
|
|
279
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
217
280
|
print(_render_results(args.fmt, results))
|
|
218
|
-
|
|
281
|
+
_maybe_post_github(args.github, results)
|
|
282
|
+
return _gate_exit(results, args.fail_on)
|
|
219
283
|
|
|
220
284
|
if args.command == "scan":
|
|
221
285
|
capabilities = load_capabilities(args.capabilities)
|
|
@@ -233,9 +297,11 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
233
297
|
extensions=extensions,
|
|
234
298
|
max_chars=args.max_chars,
|
|
235
299
|
with_callers=args.callers,
|
|
300
|
+
with_callees=args.callees,
|
|
236
301
|
)
|
|
302
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
237
303
|
print(_render_results(args.fmt, results))
|
|
238
|
-
return
|
|
304
|
+
return _gate_exit(results, args.fail_on)
|
|
239
305
|
|
|
240
306
|
if args.command == "run":
|
|
241
307
|
tasks = load_tasks(args.tasks)
|
|
@@ -245,8 +311,9 @@ def main(argv: list[str] | None = None) -> int:
|
|
|
245
311
|
results = run_task(
|
|
246
312
|
tasks[args.task], DiffSource(_read_diff(args.diff)), load_capabilities(args.capabilities)
|
|
247
313
|
)
|
|
314
|
+
results = _maybe_suppress(results, not args.no_suppress)
|
|
248
315
|
print(_render_results(args.fmt, results))
|
|
249
|
-
return
|
|
316
|
+
return _gate_exit(results, args.fail_on)
|
|
250
317
|
|
|
251
318
|
if args.command == "eval":
|
|
252
319
|
try:
|
|
@@ -0,0 +1,43 @@
|
|
|
1
|
+
# Known-noise suppression rules (data-driven false-positive filter).
|
|
2
|
+
# Each rule drops a flagged finding whose text matches and whose path condition holds.
|
|
3
|
+
# Keep these to out-of-scope / low-signal CATEGORIES -- never key on a real
|
|
4
|
+
# vulnerability class, or you will drop true findings.
|
|
5
|
+
|
|
6
|
+
- id: SUP-AVAILABILITY
|
|
7
|
+
reason: availability / DoS / rate-limiting findings are out of scope and low-signal here
|
|
8
|
+
match_any:
|
|
9
|
+
- "denial of service"
|
|
10
|
+
- "denial-of-service"
|
|
11
|
+
- "rate limit"
|
|
12
|
+
- "rate-limit"
|
|
13
|
+
- "rate limiting"
|
|
14
|
+
- "resource exhaustion"
|
|
15
|
+
- "unbounded"
|
|
16
|
+
- "amplification"
|
|
17
|
+
|
|
18
|
+
- id: SUP-LOGGING-NOISE
|
|
19
|
+
reason: verbose / insufficient logging is noise unless a secret value is logged
|
|
20
|
+
match_any:
|
|
21
|
+
- "verbose logging"
|
|
22
|
+
- "insufficient logging"
|
|
23
|
+
- "excessive logging"
|
|
24
|
+
- "lack of logging"
|
|
25
|
+
- "log verbosity"
|
|
26
|
+
|
|
27
|
+
- id: SUP-MEMORY-SAFETY-NON-C
|
|
28
|
+
reason: memory-safety issues do not apply outside C/C++
|
|
29
|
+
match_any:
|
|
30
|
+
- "buffer overflow"
|
|
31
|
+
- "use after free"
|
|
32
|
+
- "use-after-free"
|
|
33
|
+
- "double free"
|
|
34
|
+
- "memory corruption"
|
|
35
|
+
- "out-of-bounds"
|
|
36
|
+
unless_path_ext: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
|
|
37
|
+
|
|
38
|
+
- id: SUP-REDOS
|
|
39
|
+
reason: regex denial-of-service / catastrophic backtracking is low-signal here
|
|
40
|
+
match_any:
|
|
41
|
+
- "redos"
|
|
42
|
+
- "catastrophic backtracking"
|
|
43
|
+
- "regex denial"
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
"""codejury.integrations -- post results to external systems (GitHub PR reviews)."""
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Post audit results to a GitHub pull request as a review with inline comments.
|
|
2
|
+
|
|
3
|
+
``build_review`` is a pure function (results -> GitHub review payload) so it is
|
|
4
|
+
unit-testable; ``post_review`` does the HTTP POST and accepts an injectable
|
|
5
|
+
transport so it can be tested without a token or a live PR. Problems with a
|
|
6
|
+
usable file:line become inline comments; everything else is summarized in the
|
|
7
|
+
review body. The review requests changes when any problem is found.
|
|
8
|
+
"""
|
|
9
|
+
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
import json
|
|
13
|
+
import urllib.request
|
|
14
|
+
from typing import Any, Callable
|
|
15
|
+
|
|
16
|
+
from codejury.domain.observation import Observation
|
|
17
|
+
from codejury.domain.result import AnalysisResult
|
|
18
|
+
|
|
19
|
+
Results = list[tuple[str, AnalysisResult]]
|
|
20
|
+
|
|
21
|
+
|
|
22
|
+
def build_review(results: Results, *, max_comments: int = 50) -> dict:
    """Turn audit results into a GitHub pull-request review payload.

    Every finding and every VULNERABLE verdict counts as a problem. Problems
    whose first evidence entry has both a file and a line become inline
    comments (at most ``max_comments`` of them). Problems that cannot be
    anchored to a line are listed in the review body instead of being
    silently dropped, so they still count and still request changes.

    Args:
        results: ``(path, result)`` pairs; each result's ``observations`` are scanned.
        max_comments: upper bound on inline comments, and on the number of
            unanchored problems listed individually in the body.

    Returns:
        A dict with ``body``, ``event`` (``"REQUEST_CHANGES"`` when any problem
        was found, otherwise ``"COMMENT"``) and ``comments``.
    """
    comments: list[dict] = []
    unanchored: list[str] = []
    anchored = 0
    for path, result in results:
        for o in result.observations:
            if not _is_problem(o):
                continue
            comment = _inline_comment(o)
            if comment is None:
                # No usable file:line -- keep it for the body summary.
                unanchored.append(f"- `{path}`: {_headline(o)}")
                continue
            anchored += 1
            if len(comments) < max_comments:
                comments.append(comment)

    problems = anchored + len(unanchored)
    body = (
        f"codejury found {problems} issue(s)." if problems else "codejury found no issues."
    )
    if anchored > len(comments):
        body += f" Showing {len(comments)} inline; {anchored - len(comments)} more omitted."
    if unanchored:
        # Cap the listing so a very noisy run cannot produce an oversized body.
        listed = unanchored[: max(max_comments, 0)]
        hidden = len(unanchored) - len(listed)
        if hidden:
            listed.append(f"- ... and {hidden} more")
        body += f"\n\n{len(unanchored)} issue(s) have no usable file:line:\n" + "\n".join(listed)
    return {
        "body": body,
        "event": "REQUEST_CHANGES" if problems else "COMMENT",
        "comments": comments,
    }


def _is_problem(o: Observation) -> bool:
    """Return True for findings and for verdicts whose status is VULNERABLE."""
    return o.kind == "finding" or (o.kind == "verdict" and o.status == "VULNERABLE")


def _headline(o: Observation) -> str:
    """Return the bold one-line headline used for a problem observation."""
    if o.kind == "finding":
        cwe = f" ({o.cwe})" if o.cwe else ""
        return f"**{o.severity}{cwe}** {o.title}"
    return f"**VULNERABLE** `{o.capability}`"


def _inline_comment(o: Observation) -> dict | None:
    """Build an inline review comment for a problem, or None if it cannot be placed.

    Only the first evidence entry is consulted; it must carry both a file and
    a line. Non-problem observations always yield None.
    """
    if not _is_problem(o):
        return None
    evidence = o.evidence[0] if o.evidence else None
    if not (evidence and evidence.file and evidence.line):
        return None
    detail = o.description if o.kind == "finding" else o.reasoning
    return {"path": evidence.file, "line": evidence.line, "body": f"{_headline(o)}\n\n{detail}"}
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def post_review(
    owner: str,
    repo: str,
    pull: int,
    payload: dict,
    *,
    token: str,
    transport: Callable[[str, bytes, dict], Any] | None = None,
    timeout: float = 30.0,
) -> Any:
    """POST a review payload to a GitHub pull request.

    Args:
        owner: repository owner, used in the API URL.
        repo: repository name, used in the API URL.
        pull: pull request number.
        payload: review body; serialized to JSON.
        token: sent as a ``Bearer`` token in the ``Authorization`` header.
        transport: optional callable ``(url, data, headers)``. When given, it
            is called instead of performing a real HTTP request and its
            return value is returned unchanged.
        timeout: seconds to wait on the real HTTP request, so a stalled
            connection cannot hang the caller indefinitely.

    Returns:
        The transport's return value, or the HTTP status of the response.
        Errors from ``urllib.request.urlopen`` are not caught here.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull}/reviews"
    data = json.dumps(payload).encode()
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "Content-Type": "application/json",
    }
    if transport is not None:
        return transport(url, data, headers)
    request = urllib.request.Request(url, data=data, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.status
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
def parse_pr_ref(ref: str) -> tuple[str, str, int]:
    """Parse 'owner/repo#123' into (owner, repo, pull_number).

    Raises:
        ValueError: if the owner or repo is empty, the repo contains a further
            ``/``, or the part after ``#`` is not a run of ASCII digits.
    """
    repo_part, _, number = ref.partition("#")
    owner, _, repo = repo_part.partition("/")
    # A repo with an extra "/" would produce a malformed API URL, and
    # str.isdigit() alone accepts characters such as "²" that int() rejects.
    well_formed = (
        bool(owner)
        and bool(repo)
        and "/" not in repo
        and number.isascii()
        and number.isdigit()
    )
    if not well_formed:
        raise ValueError(f"expected owner/repo#number, got {ref!r}")
    return owner, repo, int(number)
|
|
@@ -0,0 +1,67 @@
|
|
|
1
|
+
"""ChallengeOrchestrator -- verify, then challenge the flagged verdicts.
|
|
2
|
+
|
|
3
|
+
The verifier rules on every capability; then a refuter is shown only the
|
|
4
|
+
VULNERABLE verdicts and the code, and argues which are false positives. A refuted
|
|
5
|
+
verdict becomes a dismissed Concession (recording why), so the report keeps the
|
|
6
|
+
SECURE/NOT_PRESENT verdicts, the surviving VULNERABLE ones, and a Dismissed list.
|
|
7
|
+
|
|
8
|
+
This targets taint-style false positives (which a lone verifier over-reports)
|
|
9
|
+
while paying the extra model call only for flagged verdicts, not the whole file.
|
|
10
|
+
|
|
11
|
+
Only verdicts from taint-prone capabilities are challenged. Local-pattern issues
|
|
12
|
+
(hardcoded secrets, weak crypto) are kept as-is: refuting them risks dropping a
|
|
13
|
+
real finding, and they do not have the attacker-control ambiguity that makes
|
|
14
|
+
taint checks over-report.
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
from __future__ import annotations
|
|
18
|
+
|
|
19
|
+
import dataclasses
|
|
20
|
+
|
|
21
|
+
from codejury.agents.base import Agent
|
|
22
|
+
from codejury.domain.context import AnalysisContext
|
|
23
|
+
from codejury.domain.observation import Concession, Observation, Verdict
|
|
24
|
+
from codejury.domain.result import AnalysisResult
|
|
25
|
+
from codejury.orchestrators.base import Orchestrator
|
|
26
|
+
|
|
27
|
+
_REQUIRED_ROLES = ("verifier", "refuter")
|
|
28
|
+
_DEFAULT_TAINT_CAPABILITIES = frozenset({"input_validation"})
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class ChallengeOrchestrator(Orchestrator):
    """Run the verifier, then let a refuter challenge taint-prone VULNERABLE verdicts."""

    def __init__(self, *, taint_capabilities: frozenset[str] = _DEFAULT_TAINT_CAPABILITIES) -> None:
        """Store the capability prefixes whose VULNERABLE verdicts may be challenged.

        A verdict is eligible when the part of its capability before the first
        ``.`` is in ``taint_capabilities``.
        """
        self._taint_capabilities = taint_capabilities

    def run(self, agents: dict[str, Agent], context: AnalysisContext) -> AnalysisResult:
        """Verify, challenge the eligible VULNERABLE verdicts, and merge the outcome.

        Returns an error result when the ``verifier`` or ``refuter`` agent is
        missing. The refuter runs only when at least one verdict is eligible.
        Each eligible verdict the refuter concedes is replaced by a
        ``Concession``; every other observation is passed through unchanged.
        """
        missing = [role for role in _REQUIRED_ROLES if role not in agents]
        if missing:
            return AnalysisResult(error=f"challenge requires agents: {', '.join(missing)}")

        verdicts = agents["verifier"].run(context)
        flagged = [
            v
            for v in verdicts
            if isinstance(v, Verdict)
            and v.status == "VULNERABLE"
            and v.capability.split(".")[0] in self._taint_capabilities
        ]
        if not flagged:
            return AnalysisResult(observations=verdicts)

        refutations = agents["refuter"].run(dataclasses.replace(context, history=flagged))
        # Only honour concessions for verdicts that were actually shown to the
        # refuter; a stray target must not dismiss an unchallenged verdict.
        challenged = {v.capability for v in flagged}
        reasons = {
            c.target: c.reason
            for c in refutations
            if isinstance(c, Concession) and c.target in challenged
        }

        observations: list[Observation] = []
        for v in verdicts:
            if isinstance(v, Verdict) and v.status == "VULNERABLE" and v.capability in reasons:
                observations.append(
                    Concession(
                        capability=v.capability,
                        produced_by="refuter",
                        target=v.capability,
                        reason=reasons[v.capability] or "refuted as a false positive",
                    )
                )
            else:
                observations.append(v)
        return AnalysisResult(observations=observations)
|