codejury 0.4.1__tar.gz → 0.5.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (121) hide show
  1. {codejury-0.4.1 → codejury-0.5.0}/PKG-INFO +17 -5
  2. {codejury-0.4.1 → codejury-0.5.0}/README.md +16 -4
  3. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/debate.py +18 -2
  4. codejury-0.5.0/codejury/agents/refuter.py +76 -0
  5. {codejury-0.4.1 → codejury-0.5.0}/codejury/assembly.py +9 -1
  6. {codejury-0.4.1 → codejury-0.5.0}/codejury/cli.py +62 -4
  7. codejury-0.5.0/codejury/data/suppressions.yaml +43 -0
  8. codejury-0.5.0/codejury/integrations/__init__.py +1 -0
  9. codejury-0.5.0/codejury/integrations/github.py +88 -0
  10. codejury-0.5.0/codejury/orchestrators/challenge.py +67 -0
  11. {codejury-0.4.1 → codejury-0.5.0}/codejury/resources.py +1 -0
  12. codejury-0.5.0/codejury/suppression.py +96 -0
  13. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/PKG-INFO +17 -5
  14. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/SOURCES.txt +9 -0
  15. {codejury-0.4.1 → codejury-0.5.0}/pyproject.toml +1 -1
  16. codejury-0.5.0/tests/test_challenge.py +105 -0
  17. codejury-0.5.0/tests/test_integrations.py +82 -0
  18. codejury-0.5.0/tests/test_suppression.py +44 -0
  19. {codejury-0.4.1 → codejury-0.5.0}/LICENSE +0 -0
  20. {codejury-0.4.1 → codejury-0.5.0}/codejury/__init__.py +0 -0
  21. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/__init__.py +0 -0
  22. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/base.py +0 -0
  23. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/mock.py +0 -0
  24. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/parsing.py +0 -0
  25. {codejury-0.4.1 → codejury-0.5.0}/codejury/agents/verifier.py +0 -0
  26. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/authentication.yaml +0 -0
  27. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/authorization.yaml +0 -0
  28. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/business_logic.yaml +0 -0
  29. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/crypto.yaml +0 -0
  30. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/data_protection.yaml +0 -0
  31. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/dependency_config.yaml +0 -0
  32. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/error_logging.yaml +0 -0
  33. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/input_validation.yaml +0 -0
  34. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/output_encoding.yaml +0 -0
  35. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/secrets.yaml +0 -0
  36. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/capabilities/session.yaml +0 -0
  37. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_bcrypt_password.yaml +0 -0
  38. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_jwt_noverify_vuln.yaml +0 -0
  39. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_jwt_verified_safe.yaml +0 -0
  40. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_sha256_checksum_safe.yaml +0 -0
  41. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authn_sha256_password.yaml +0 -0
  42. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authz_idor_vuln.yaml +0 -0
  43. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/authz_owner_safe.yaml +0 -0
  44. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/cmdi_ossystem_vuln.yaml +0 -0
  45. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/cmdi_subprocess_safe.yaml +0 -0
  46. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/crypto_aesgcm_safe.yaml +0 -0
  47. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/crypto_ecb_vuln.yaml +0 -0
  48. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/path_contained_safe.yaml +0 -0
  49. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/path_traversal_vuln.yaml +0 -0
  50. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/secrets_env_safe.yaml +0 -0
  51. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/secrets_hardcoded_vuln.yaml +0 -0
  52. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_format_vuln.yaml +0 -0
  53. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_fstring_query.yaml +0 -0
  54. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/sqli_parameterized_query.yaml +0 -0
  55. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/xss_innerhtml_constant_safe.yaml +0 -0
  56. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/golden/xss_innerhtml_vuln.yaml +0 -0
  57. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/tasks/audit_diff_debate.yaml +0 -0
  58. {codejury-0.4.1 → codejury-0.5.0}/codejury/data/tasks/quick_scan_single.yaml +0 -0
  59. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/__init__.py +0 -0
  60. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/artifact.py +0 -0
  61. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/capability.py +0 -0
  62. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/context.py +0 -0
  63. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/observation.py +0 -0
  64. {codejury-0.4.1 → codejury-0.5.0}/codejury/domain/result.py +0 -0
  65. {codejury-0.4.1 → codejury-0.5.0}/codejury/evaluation.py +0 -0
  66. {codejury-0.4.1 → codejury-0.5.0}/codejury/infrastructure/__init__.py +0 -0
  67. {codejury-0.4.1 → codejury-0.5.0}/codejury/infrastructure/json_parse.py +0 -0
  68. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/__init__.py +0 -0
  69. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/base.py +0 -0
  70. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/debate.py +0 -0
  71. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/pipeline.py +0 -0
  72. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/reflexion.py +0 -0
  73. {codejury-0.4.1 → codejury-0.5.0}/codejury/orchestrators/single.py +0 -0
  74. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/__init__.py +0 -0
  75. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/anthropic.py +0 -0
  76. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/base.py +0 -0
  77. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/litellm.py +0 -0
  78. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/mock.py +0 -0
  79. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/openai.py +0 -0
  80. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/openai_format.py +0 -0
  81. {codejury-0.4.1 → codejury-0.5.0}/codejury/providers/retry.py +0 -0
  82. {codejury-0.4.1 → codejury-0.5.0}/codejury/reporting.py +0 -0
  83. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/__init__.py +0 -0
  84. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/base.py +0 -0
  85. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/callers.py +0 -0
  86. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/chunker.py +0 -0
  87. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/diff.py +0 -0
  88. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/function.py +0 -0
  89. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/mock.py +0 -0
  90. {codejury-0.4.1 → codejury-0.5.0}/codejury/sources/repo.py +0 -0
  91. {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/__init__.py +0 -0
  92. {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/base.py +0 -0
  93. {codejury-0.4.1 → codejury-0.5.0}/codejury/tasks/registry.py +0 -0
  94. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/dependency_links.txt +0 -0
  95. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/entry_points.txt +0 -0
  96. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/requires.txt +0 -0
  97. {codejury-0.4.1 → codejury-0.5.0}/codejury.egg-info/top_level.txt +0 -0
  98. {codejury-0.4.1 → codejury-0.5.0}/setup.cfg +0 -0
  99. {codejury-0.4.1 → codejury-0.5.0}/tests/test_anthropic_provider.py +0 -0
  100. {codejury-0.4.1 → codejury-0.5.0}/tests/test_assembly.py +0 -0
  101. {codejury-0.4.1 → codejury-0.5.0}/tests/test_audit_pipeline.py +0 -0
  102. {codejury-0.4.1 → codejury-0.5.0}/tests/test_callers.py +0 -0
  103. {codejury-0.4.1 → codejury-0.5.0}/tests/test_capability.py +0 -0
  104. {codejury-0.4.1 → codejury-0.5.0}/tests/test_cli_audit.py +0 -0
  105. {codejury-0.4.1 → codejury-0.5.0}/tests/test_context.py +0 -0
  106. {codejury-0.4.1 → codejury-0.5.0}/tests/test_debate_agents.py +0 -0
  107. {codejury-0.4.1 → codejury-0.5.0}/tests/test_debate_orchestrator.py +0 -0
  108. {codejury-0.4.1 → codejury-0.5.0}/tests/test_diff_source.py +0 -0
  109. {codejury-0.4.1 → codejury-0.5.0}/tests/test_evaluation.py +0 -0
  110. {codejury-0.4.1 → codejury-0.5.0}/tests/test_function_source.py +0 -0
  111. {codejury-0.4.1 → codejury-0.5.0}/tests/test_json_parse.py +0 -0
  112. {codejury-0.4.1 → codejury-0.5.0}/tests/test_litellm_provider.py +0 -0
  113. {codejury-0.4.1 → codejury-0.5.0}/tests/test_openai_provider.py +0 -0
  114. {codejury-0.4.1 → codejury-0.5.0}/tests/test_orchestrator.py +0 -0
  115. {codejury-0.4.1 → codejury-0.5.0}/tests/test_pipeline_orchestrator.py +0 -0
  116. {codejury-0.4.1 → codejury-0.5.0}/tests/test_reflexion_orchestrator.py +0 -0
  117. {codejury-0.4.1 → codejury-0.5.0}/tests/test_repo_source.py +0 -0
  118. {codejury-0.4.1 → codejury-0.5.0}/tests/test_reporting.py +0 -0
  119. {codejury-0.4.1 → codejury-0.5.0}/tests/test_retry_provider.py +0 -0
  120. {codejury-0.4.1 → codejury-0.5.0}/tests/test_tasks.py +0 -0
  121. {codejury-0.4.1 → codejury-0.5.0}/tests/test_verifier.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codejury
3
- Version: 0.4.1
3
+ Version: 0.5.0
4
4
  Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
5
5
  Author: AISecLabs
6
6
  License-Expression: MIT
@@ -58,6 +58,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
58
58
  ## Quickstart
59
59
 
60
60
  ```bash
61
+ # CI gate: exit 1 if a high-severity issue is found
62
+ git diff origin/main... | codejury audit --fail-on high -
63
+
64
+ # Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
65
+ git diff origin/main... | codejury audit --github your-org/your-repo#123 -
66
+
61
67
  # No API key needed -- prove the pipeline runs end to end with mock layers
62
68
  codejury dry-run
63
69
 
@@ -76,9 +82,13 @@ git diff | codejury audit --provider anthropic
76
82
  | `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
77
83
  | `codejury eval` | Score the golden cases and report precision / recall. |
78
84
 
79
- Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
85
+ Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
80
86
  `--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
81
87
 
88
+ Findings in known-noise categories (availability/DoS, rate limiting, memory safety
89
+ outside C/C++) are dropped by versioned rules in
90
+ `codejury/data/suppressions.yaml`; disable with `--no-suppress`.
91
+
82
92
  ```bash
83
93
  # Multi-round adversarial debate, rendered as Markdown
84
94
  git diff | codejury audit --orchestrator debate --format markdown - > report.md
@@ -157,9 +167,11 @@ independently.
157
167
  - **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
158
168
  from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
159
169
  ones like path traversal over-flag in single-file review because the verifier
160
- can't see whether a value is attacker-controlled. `scan --callers` adds
161
- cross-file call sites for provenance (helps some cases, not a full fix); also
162
- scope with `--only` or challenge findings with `--orchestrator debate`.
170
+ can't see whether a value is attacker-controlled. Mitigations that help but do
171
+ not fully solve it: `scan --callers` (cross-file call sites for provenance),
172
+ `--orchestrator challenge` (a recall-safe refutation pass that drops only
173
+ provably-safe flags), `--only` to scope, or `--orchestrator debate`. Real taint
174
+ precision needs data-flow analysis, not model skepticism.
163
175
  - **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
164
176
  not a quick check -- scope it with `--only`. Day to day, audit the diff.
165
177
 
@@ -29,6 +29,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
29
29
  ## Quickstart
30
30
 
31
31
  ```bash
32
+ # CI gate: exit 1 if a high-severity issue is found
33
+ git diff origin/main... | codejury audit --fail-on high -
34
+
35
+ # Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
36
+ git diff origin/main... | codejury audit --github your-org/your-repo#123 -
37
+
32
38
  # No API key needed -- prove the pipeline runs end to end with mock layers
33
39
  codejury dry-run
34
40
 
@@ -47,9 +53,13 @@ git diff | codejury audit --provider anthropic
47
53
  | `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
48
54
  | `codejury eval` | Score the golden cases and report precision / recall. |
49
55
 
50
- Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
56
+ Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
51
57
  `--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
52
58
 
59
+ Findings in known-noise categories (availability/DoS, rate limiting, memory safety
60
+ outside C/C++) are dropped by versioned rules in
61
+ `codejury/data/suppressions.yaml`; disable with `--no-suppress`.
62
+
53
63
  ```bash
54
64
  # Multi-round adversarial debate, rendered as Markdown
55
65
  git diff | codejury audit --orchestrator debate --format markdown - > report.md
@@ -128,9 +138,11 @@ independently.
128
138
  - **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
129
139
  from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
130
140
  ones like path traversal over-flag in single-file review because the verifier
131
- can't see whether a value is attacker-controlled. `scan --callers` adds
132
- cross-file call sites for provenance (helps some cases, not a full fix); also
133
- scope with `--only` or challenge findings with `--orchestrator debate`.
141
+ can't see whether a value is attacker-controlled. Mitigations that help but do
142
+ not fully solve it: `scan --callers` (cross-file call sites for provenance),
143
+ `--orchestrator challenge` (a recall-safe refutation pass that drops only
144
+ provably-safe flags), `--only` to scope, or `--orchestrator debate`. Real taint
145
+ precision needs data-flow analysis, not model skepticism.
134
146
  - **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
135
147
  not a quick check -- scope it with `--only`. Day to day, audit the diff.
136
148
 
@@ -30,6 +30,20 @@ _FINDING_SHAPE = (
30
30
  '"description": "...", "evidence": [{"file": "...", "line": 0, "code": "..."}], "confidence": 0.0}'
31
31
  )
32
32
 
33
+ _DEEP_LENS = (
34
+ "Look past surface patterns for the deepest flaw:\n"
35
+ "- Trust anchors: what does this code trust to authenticate or authorize -- a key, token, header, "
36
+ "signature, role, or caller -- and who controls that value? If the attacker supplies what is used to "
37
+ "verify them (e.g. their own public key, an unconfigured key that disables verification), passing the "
38
+ "check proves nothing.\n"
39
+ "- Order of operations: is an external, irreversible, or privileged action performed before the local "
40
+ "state is committed, or before the check that should guard it? Can a check and the action it guards be "
41
+ "split apart under concurrency (race / TOCTOU) or partial failure (on-chain done, DB rolled back)?\n"
42
+ "- Attack chains: combine several weak points into one end-to-end exploit.\n"
43
+ "Prefer the deepest design/authorization/state flaw over surface issues like missing rate limiting or "
44
+ "verbose logging; report those only as secondary."
45
+ )
46
+
33
47
 
34
48
  class _DebateAgent(Agent):
35
49
  """Shared provider plumbing for the three debate roles."""
@@ -60,7 +74,7 @@ class FinderAgent(_DebateAgent):
60
74
  )
61
75
 
62
76
  def run(self, ctx: AnalysisContext) -> list[Observation]:
63
- parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _code(ctx.artifact)]
77
+ parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _DEEP_LENS, _code(ctx.artifact)]
64
78
  if ctx.round_num > 1 and ctx.history:
65
79
  parts.append(_render_history(ctx.history))
66
80
  parts.append("Concede findings the rebuttals refute, keep the valid ones, and add any you missed.")
@@ -84,7 +98,9 @@ class ChallengerAgent(_DebateAgent):
84
98
  def run(self, ctx: AnalysisContext) -> list[Observation]:
85
99
  parts = [
86
100
  "Challenge the findings below. For each one you believe is a false positive, write a rebuttal. "
87
- "Add new_findings for any real issue that was missed.",
101
+ "Add new_findings for any real issue that was missed -- especially a deeper flaw the finder "
102
+ "stopped short of.",
103
+ _DEEP_LENS,
88
104
  _code(ctx.artifact),
89
105
  _render_history(ctx.history),
90
106
  'Respond as JSON: {"rebuttals": [{"target": "finding title", "reason": "..."}], '
@@ -0,0 +1,76 @@
1
+ """RefuterAgent -- a skeptic that tries to dismiss flagged verdicts as false positives.
2
+
3
+ Used by the challenge orchestrator: the verifier flags issues, then the refuter
4
+ gets the code plus the VULNERABLE verdicts (via ``ctx.history``) and argues which
5
+ are false positives -- e.g. a value that is not actually attacker-controlled or a
6
+ sink that is not reachable. It returns a Concession per verdict it refutes.
7
+
8
+ This is the cheap, focused alternative to a full debate: only flagged verdicts
9
+ are challenged, not the whole file.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from codejury.agents.base import Agent
15
+ from codejury.domain.context import AnalysisContext
16
+ from codejury.domain.observation import Concession, Observation, Verdict
17
+ from codejury.infrastructure.json_parse import extract_json_object
18
+ from codejury.providers.base import Message, Provider
19
+
20
+ _SYSTEM = (
21
+ "You are a careful security reviewer checking flagged issues for false positives. "
22
+ "Security errs toward keeping a flag: refute one ONLY when the code in front of you "
23
+ "affirmatively proves the value is not attacker-controlled. If a value's origin is not "
24
+ "shown, or it could plausibly come from external/untrusted input, KEEP the flag. "
25
+ "Respond with a single JSON object and nothing else."
26
+ )
27
+
28
+ _JSON_SHAPE = '{"refuted": [{"capability": "id.sub", "reason": "proof it is not attacker-controlled"}]}'
29
+
30
+
31
class RefuterAgent(Agent):
    """Skeptic agent that asks the model which flagged verdicts are false positives.

    The agent reads the ``Verdict`` observations found in ``ctx.history``, shows
    them to the provider together with the code under review (and any call-site
    context attached to the artifact), and turns each entry of the model's
    ``"refuted"`` list into a ``Concession``.
    """

    def __init__(self, *, provider: Provider, model: str, max_tokens: int = 1024) -> None:
        """Store the provider, model name and completion token budget."""
        self._provider = provider
        self._model = model
        self._max_tokens = max_tokens

    def run(self, ctx: AnalysisContext) -> list[Observation]:
        """Return one ``Concession`` per flagged capability the model refutes.

        Returns an empty list without calling the provider when ``ctx.history``
        holds no ``Verdict``. The model reply is untrusted text: a reply that is
        not a JSON object, or whose ``"refuted"`` value is not a list, yields no
        concessions instead of raising.
        """
        flagged = [o for o in ctx.history if isinstance(o, Verdict)]
        if not flagged:
            return []
        flags = "\n".join(f"- {v.capability}: {v.reasoning}" for v in flagged)
        # Call-site context is optional; omit the whole section when absent.
        context_block = (
            f"Call sites elsewhere (for tracing where arguments come from):\n```\n{ctx.artifact.context}\n```\n\n"
            if ctx.artifact.context
            else ""
        )
        prompt = (
            f"Code under review ({ctx.artifact.path}):\n```\n{ctx.artifact.content}\n```\n\n"
            f"{context_block}"
            f"Flagged issues:\n{flags}\n\n"
            "This attacker-control reasoning applies ONLY to input-driven issues (injection, path "
            "traversal, SSRF). For those, refute a flag only if you can affirmatively prove the value "
            "is not attacker-controlled: a stored data field, or traced (here or in the call sites) to "
            "a trusted, config, or operator-supplied source. If its origin is not shown or could "
            "plausibly be external input, do NOT refute. For other issue types (hardcoded secrets, "
            "weak crypto, ...), a literal value is often the vulnerability itself -- do NOT refute "
            "those just because a value is constant.\n\n"
            "Respond with a single JSON object exactly like:\n" + _JSON_SHAPE
        )
        result = self._provider.complete(
            system=_SYSTEM,
            messages=[Message(role="user", content=prompt)],
            model=self._model,
            max_tokens=self._max_tokens,
        )
        obj = extract_json_object(result.text)
        # Guard the shape: {"refuted": null} or a scalar must not crash the run.
        refuted = obj.get("refuted") if isinstance(obj, dict) else None
        if not isinstance(refuted, list):
            return []
        out: list[Observation] = []
        for item in refuted:
            if not isinstance(item, dict):
                continue
            # `or ""` keeps a JSON null from becoming the literal string "None".
            capability = str(item.get("capability") or "").strip()
            if capability:
                out.append(
                    Concession(
                        capability=capability,
                        produced_by="refuter",
                        target=capability,
                        reason=str(item.get("reason") or ""),
                    )
                )
        return out
@@ -10,12 +10,14 @@ import os
10
10
 
11
11
  from codejury.agents.base import Agent
12
12
  from codejury.agents.debate import ChallengerAgent, FinderAgent, JudgeAgent
13
+ from codejury.agents.refuter import RefuterAgent
13
14
  from codejury.agents.verifier import VerifierAgent
14
15
  from codejury.domain.artifact import CodeArtifact
15
16
  from codejury.domain.capability import Capability
16
17
  from codejury.domain.context import AnalysisContext
17
18
  from codejury.domain.result import AnalysisResult
18
19
  from codejury.orchestrators.base import Orchestrator
20
+ from codejury.orchestrators.challenge import ChallengeOrchestrator
19
21
  from codejury.orchestrators.debate import DebateOrchestrator
20
22
  from codejury.orchestrators.pipeline import PipelineOrchestrator
21
23
  from codejury.orchestrators.reflexion import ReflexionOrchestrator
@@ -27,7 +29,7 @@ from codejury.providers.openai import OpenAIProvider
27
29
  from codejury.providers.retry import RetryProvider
28
30
  from codejury.sources.base import Source
29
31
 
30
- STRATEGIES = ("single", "pipeline", "debate", "reflexion")
32
+ STRATEGIES = ("single", "pipeline", "debate", "reflexion", "challenge")
31
33
  PROVIDERS = ("anthropic", "openai", "litellm")
32
34
  DEFAULT_MODEL = os.environ.get("CODEJURY_MODEL", "claude-sonnet-4-6")
33
35
  DEFAULT_API_BASE = os.environ.get("CODEJURY_API_BASE")
@@ -61,6 +63,12 @@ def build_orchestration(
61
63
  "critic": ChallengerAgent(provider=provider, model=model, max_tokens=max_tokens),
62
64
  }
63
65
  return agents, ReflexionOrchestrator()
66
+ if strategy == "challenge":
67
+ agents = {
68
+ "verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens),
69
+ "refuter": RefuterAgent(provider=provider, model=model),
70
+ }
71
+ return agents, ChallengeOrchestrator()
64
72
  verifier = {"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens)}
65
73
  if strategy == "pipeline":
66
74
  return verifier, PipelineOrchestrator()
@@ -9,6 +9,7 @@ library, backed by the Anthropic provider, under a chosen orchestration strategy
9
9
  from __future__ import annotations
10
10
 
11
11
  import argparse
12
+ import os
12
13
  import sys
13
14
 
14
15
  from codejury.agents.mock import MockAgent
@@ -33,7 +34,9 @@ from codejury.orchestrators.single import SingleOrchestrator
33
34
  from codejury.providers.base import Provider
34
35
  from codejury.providers.mock import MockProvider
35
36
  from codejury.reporting import to_json, to_markdown
36
- from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, TASKS_DIR
37
+ from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, SUPPRESSIONS_FILE, TASKS_DIR
38
+ from codejury.suppression import filter_results, load_suppressions
39
+ from codejury.integrations.github import build_review, parse_pr_ref, post_review
37
40
  from codejury.sources.chunker import Chunker
38
41
  from codejury.sources.diff import DiffSource
39
42
  from codejury.sources.repo import RepoSource
@@ -137,6 +140,50 @@ def _render_results(fmt: str, results: list[tuple[str, AnalysisResult]]) -> str:
137
140
  return {"text": _render_audit, "markdown": to_markdown, "json": to_json}[fmt](results)
138
141
 
139
142
 
143
+ def _maybe_suppress(results: list[tuple[str, AnalysisResult]], enabled: bool) -> list[tuple[str, AnalysisResult]]:
144
+ if not enabled:
145
+ return results
146
+ filtered, suppressed = filter_results(results, load_suppressions(SUPPRESSIONS_FILE))
147
+ if suppressed:
148
+ print(f"suppressed {len(suppressed)} known-noise finding(s) by rule", file=sys.stderr)
149
+ return filtered
150
+
151
+ _FAIL_ON = ("critical", "high", "medium", "low")
152
+ _SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "info": 0}
153
+
154
+
155
+ def _problem_rank(o: Observation) -> int:
156
+ if o.kind == "finding":
157
+ return _SEVERITY_RANK.get(o.severity.lower(), 2)
158
+ if o.kind == "verdict" and o.status == "VULNERABLE":
159
+ return _SEVERITY_RANK["high"]
160
+ if o.kind == "verdict" and o.status == "PARTIAL":
161
+ return _SEVERITY_RANK["medium"]
162
+ return -1
163
+
164
+
165
+ def _gate_exit(results: list[tuple[str, AnalysisResult]], fail_on: str | None) -> int:
166
+ if not fail_on:
167
+ return 0
168
+ worst = max((_problem_rank(o) for _, r in results for o in r.observations), default=-1)
169
+ return 1 if worst >= _SEVERITY_RANK[fail_on] else 0
170
+
171
+
172
+ def _maybe_post_github(ref: str | None, results: list[tuple[str, AnalysisResult]]) -> None:
173
+ if not ref:
174
+ return
175
+ token = os.environ.get("GITHUB_TOKEN")
176
+ if not token:
177
+ print("GITHUB_TOKEN not set; skipping PR review", file=sys.stderr)
178
+ return
179
+ try:
180
+ owner, repo, pull = parse_pr_ref(ref)
181
+ post_review(owner, repo, pull, build_review(results), token=token)
182
+ print(f"posted review to {ref}", file=sys.stderr)
183
+ except Exception as exc:
184
+ print(f"github review failed: {exc}", file=sys.stderr)
185
+
186
+
140
187
  def _render_metrics(m: Metrics) -> str:
141
188
  return (
142
189
  f"cases: {m.total} (tp={m.tp} fp={m.fp} tn={m.tn} fn={m.fn})\n"
@@ -168,6 +215,9 @@ def main(argv: list[str] | None = None) -> int:
168
215
  audit_p.add_argument("--retries", type=int, default=0, help="provider retry attempts on failure")
169
216
  audit_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
170
217
  audit_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
218
+ audit_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
219
+ audit_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
220
+ audit_p.add_argument("--github", default=None, help="post a PR review: owner/repo#number (needs GITHUB_TOKEN)")
171
221
 
172
222
  scan_p = sub.add_parser("scan", help="audit a whole directory tree (deep, capability by capability)")
173
223
  scan_p.add_argument("directory", help="directory to scan")
@@ -185,6 +235,8 @@ def main(argv: list[str] | None = None) -> int:
185
235
  )
186
236
  scan_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
187
237
  scan_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
238
+ scan_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
239
+ scan_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
188
240
 
189
241
  run_p = sub.add_parser("run", help="run a named task preset against a unified diff")
190
242
  run_p.add_argument("task", help="task name")
@@ -192,6 +244,8 @@ def main(argv: list[str] | None = None) -> int:
192
244
  run_p.add_argument("--tasks", default=TASKS_DIR, help="task YAML directory")
193
245
  run_p.add_argument("--capabilities", default=CAPABILITIES_DIR, help="capability YAML directory")
194
246
  run_p.add_argument("--format", choices=_FORMATS, default="text", dest="fmt")
247
+ run_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
248
+ run_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
195
249
 
196
250
  eval_p = sub.add_parser("eval", help="score golden cases and report precision/recall")
197
251
  eval_p.add_argument("--golden", default=GOLDEN_DIR, help="golden case YAML directory")
@@ -214,8 +268,10 @@ def main(argv: list[str] | None = None) -> int:
214
268
  max_tokens=args.max_tokens,
215
269
  strategy=args.orchestrator,
216
270
  )
271
+ results = _maybe_suppress(results, not args.no_suppress)
217
272
  print(_render_results(args.fmt, results))
218
- return 0
273
+ _maybe_post_github(args.github, results)
274
+ return _gate_exit(results, args.fail_on)
219
275
 
220
276
  if args.command == "scan":
221
277
  capabilities = load_capabilities(args.capabilities)
@@ -234,8 +290,9 @@ def main(argv: list[str] | None = None) -> int:
234
290
  max_chars=args.max_chars,
235
291
  with_callers=args.callers,
236
292
  )
293
+ results = _maybe_suppress(results, not args.no_suppress)
237
294
  print(_render_results(args.fmt, results))
238
- return 0
295
+ return _gate_exit(results, args.fail_on)
239
296
 
240
297
  if args.command == "run":
241
298
  tasks = load_tasks(args.tasks)
@@ -245,8 +302,9 @@ def main(argv: list[str] | None = None) -> int:
245
302
  results = run_task(
246
303
  tasks[args.task], DiffSource(_read_diff(args.diff)), load_capabilities(args.capabilities)
247
304
  )
305
+ results = _maybe_suppress(results, not args.no_suppress)
248
306
  print(_render_results(args.fmt, results))
249
- return 0
307
+ return _gate_exit(results, args.fail_on)
250
308
 
251
309
  if args.command == "eval":
252
310
  try:
@@ -0,0 +1,43 @@
1
+ # Known-noise suppression rules (data-driven false-positive filter).
2
+ # Each drops a flagged finding whose text matches and whose path condition holds.
3
+ # Keep these to out-of-scope / low-signal CATEGORIES -- never key on a real
4
+ # vulnerability class, or you will drop true findings.
5
+
6
+ - id: SUP-AVAILABILITY
7
+ reason: availability / DoS / rate-limiting findings are out of scope and low-signal here
8
+ match_any:
9
+ - "denial of service"
10
+ - "denial-of-service"
11
+ - "rate limit"
12
+ - "rate-limit"
13
+ - "rate limiting"
14
+ - "resource exhaustion"
15
+ - "unbounded"
16
+ - "amplification"
17
+
18
+ - id: SUP-LOGGING-NOISE
19
+ reason: verbose / insufficient logging is noise unless a secret value is logged
20
+ match_any:
21
+ - "verbose logging"
22
+ - "insufficient logging"
23
+ - "excessive logging"
24
+ - "lack of logging"
25
+ - "log verbosity"
26
+
27
+ - id: SUP-MEMORY-SAFETY-NON-C
28
+ reason: memory-safety issues do not apply outside C/C++
29
+ match_any:
30
+ - "buffer overflow"
31
+ - "use after free"
32
+ - "use-after-free"
33
+ - "double free"
34
+ - "memory corruption"
35
+ - "out-of-bounds"
36
+ unless_path_ext: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
37
+
38
+ - id: SUP-REDOS
39
+ reason: regex denial-of-service / catastrophic backtracking is low-signal here
40
+ match_any:
41
+ - "redos"
42
+ - "catastrophic backtracking"
43
+ - "regex denial"
@@ -0,0 +1 @@
1
+ """codejury.integrations -- post results to external systems (GitHub PR reviews)."""
@@ -0,0 +1,88 @@
1
+ """Post audit results to a GitHub pull request as a review with inline comments.
2
+
3
+ ``build_review`` is a pure function (results -> GitHub review payload) so it is
4
+ unit-testable; ``post_review`` does the HTTP POST and accepts an injectable
5
+ transport so it can be tested without a token or a live PR. Problems with a
6
+ usable file:line become inline comments; everything else is summarized in the
7
+ review body. The review requests changes when any problem is found.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import urllib.request
14
+ from typing import Any, Callable
15
+
16
+ from codejury.domain.observation import Observation
17
+ from codejury.domain.result import AnalysisResult
18
+
19
+ Results = list[tuple[str, AnalysisResult]]
20
+
21
+
22
def build_review(results: Results, *, max_comments: int = 50) -> dict:
    """Turn audit results into a GitHub pull-request review payload.

    A "problem" is a finding, or a verdict whose status is ``VULNERABLE``.
    Every problem is counted. Problems whose first evidence carries both a
    file and a line become inline comments (at most ``max_comments`` of them);
    problems without a usable location are listed in the review body instead
    of being dropped, so they still cause the review to request changes.

    Args:
        results: ``(path, result)`` pairs; only ``result.observations`` is read.
        max_comments: Upper bound on the number of inline comments emitted.

    Returns:
        A dict with the keys ``body``, ``event`` and ``comments``.
    """
    comments: list[dict] = []
    unlocated: list[str] = []
    problems = 0
    for _path, result in results:
        for o in result.observations:
            headline = _problem_headline(o)
            if headline is None:
                continue  # not a problem (e.g. a verdict that is not VULNERABLE)
            problems += 1
            comment = _inline_comment(o)
            if comment is None:
                # No file:line to anchor on -- report it in the body instead.
                unlocated.append(headline)
            elif len(comments) < max_comments:
                comments.append(comment)

    body = (
        f"codejury found {problems} issue(s)." if problems else "codejury found no issues."
    )
    # Located problems that did not fit under the inline-comment cap.
    omitted = problems - len(unlocated) - len(comments)
    if omitted:
        body += f" Showing {len(comments)} inline; {omitted} more omitted."
    if unlocated:
        body += "\n\nWithout a usable file:line:\n" + "\n".join(
            f"- {headline}" for headline in unlocated
        )
    return {
        "body": body,
        "event": "REQUEST_CHANGES" if problems else "COMMENT",
        "comments": comments,
    }


def _problem_headline(o: Observation) -> str | None:
    """Return the one-line markdown headline for a problem, or None if ``o`` is not one."""
    if o.kind == "finding":
        cwe = f" ({o.cwe})" if o.cwe else ""
        return f"**{o.severity}{cwe}** {o.title}"
    if o.kind == "verdict" and o.status == "VULNERABLE":
        return f"**VULNERABLE** `{o.capability}`"
    return None


def _inline_comment(o: Observation) -> dict | None:
    """Build an inline review comment for a problem that has a usable location.

    Returns None when ``o`` is not a problem, or when its first evidence lacks
    a file or a line.
    """
    headline = _problem_headline(o)
    if headline is None:
        return None
    evidence = o.evidence[0] if o.evidence else None
    if not (evidence and evidence.file and evidence.line):
        return None
    # Findings explain themselves via ``description``; verdicts via ``reasoning``.
    detail = o.description if o.kind == "finding" else o.reasoning
    return {"path": evidence.file, "line": evidence.line, "body": f"{headline}\n\n{detail}"}
57
+
58
+
59
def post_review(
    owner: str,
    repo: str,
    pull: int,
    payload: dict,
    *,
    token: str,
    transport: Callable[[str, bytes, dict], Any] | None = None,
    timeout: float = 30.0,
) -> Any:
    """POST a review payload to a GitHub pull request.

    Args:
        owner: Repository owner (user or organization).
        repo: Repository name.
        pull: Pull-request number.
        payload: Review payload, JSON-encoded as the request body.
        token: Token sent as a ``Bearer`` Authorization header.
        transport: Optional callable ``(url, data, headers)`` used instead of
            a real HTTP request; its return value is returned unchanged.
        timeout: Seconds to wait on the real HTTP request before giving up.
            Ignored when ``transport`` is supplied.

    Returns:
        The transport's return value, or the HTTP status of the response.

    Errors raised by ``urllib.request.urlopen`` are not caught here and
    propagate to the caller.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull}/reviews"
    data = json.dumps(payload).encode()
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "Content-Type": "application/json",
    }
    if transport is not None:
        return transport(url, data, headers)
    request = urllib.request.Request(url, data=data, headers=headers, method="POST")
    # Without a timeout a stalled connection would block the caller forever.
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.status
80
+
81
+
82
def parse_pr_ref(ref: str) -> tuple[str, str, int]:
    """Parse 'owner/repo#123' into (owner, repo, pull_number).

    Raises:
        ValueError: If ``ref`` is not exactly ``owner/repo#number`` -- a part
            is empty, the repo contains a further ``/``, or the number is not
            made of ASCII digits.
    """
    repo_part, _, number = ref.partition("#")
    owner, _, repo = repo_part.partition("/")
    # str.isdigit() alone accepts characters such as "²" that int() rejects.
    number_ok = number.isascii() and number.isdigit()
    # partition() splits on the first "/", so extra segments would end up in repo.
    if not owner or not repo or "/" in repo or not number_ok:
        raise ValueError(f"expected owner/repo#number, got {ref!r}")
    return owner, repo, int(number)
@@ -0,0 +1,67 @@
1
+ """ChallengeOrchestrator -- verify, then challenge the flagged verdicts.
2
+
3
+ The verifier rules on every capability; then a refuter is shown only the
4
+ VULNERABLE verdicts and the code, and argues which are false positives. A refuted
5
+ verdict becomes a dismissed Concession (recording why), so the report keeps the
6
+ SECURE/NOT_PRESENT verdicts, the surviving VULNERABLE ones, and a Dismissed list.
7
+
8
+ This targets taint-style false positives (which a lone verifier over-reports)
9
+ while paying the extra model call only for flagged verdicts, not the whole file.
10
+
11
+ Only verdicts from taint-prone capabilities are challenged. Local-pattern issues
12
+ (hardcoded secrets, weak crypto) are kept as-is: refuting them risks dropping a
13
+ real finding, and they do not have the attacker-control ambiguity that makes
14
+ taint checks over-report.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import dataclasses
20
+
21
+ from codejury.agents.base import Agent
22
+ from codejury.domain.context import AnalysisContext
23
+ from codejury.domain.observation import Concession, Observation, Verdict
24
+ from codejury.domain.result import AnalysisResult
25
+ from codejury.orchestrators.base import Orchestrator
26
+
27
+ _REQUIRED_ROLES = ("verifier", "refuter")
28
+ _DEFAULT_TAINT_CAPABILITIES = frozenset({"input_validation"})
29
+
30
+
31
class ChallengeOrchestrator(Orchestrator):
    """Run the verifier, then let a refuter challenge the flagged verdicts.

    Only VULNERABLE verdicts whose capability prefix (the text before the
    first ``.``) is in ``taint_capabilities`` are shown to the refuter. A
    verdict the refuter concedes is replaced by a ``Concession`` recording the
    reason; every other observation is passed through unchanged.
    """

    def __init__(self, *, taint_capabilities: frozenset[str] = _DEFAULT_TAINT_CAPABILITIES) -> None:
        """Store the set of capability prefixes whose verdicts may be challenged."""
        self._taint_capabilities = taint_capabilities

    def run(self, agents: dict[str, Agent], context: AnalysisContext) -> AnalysisResult:
        """Verify, challenge the flagged verdicts, and return the merged result.

        Args:
            agents: Agents keyed by role; ``verifier`` and ``refuter`` are required.
            context: Analysis context handed to the verifier; the refuter gets
                a copy whose ``history`` is the list of flagged verdicts.

        Returns:
            An ``AnalysisResult`` with the verifier's observations, where each
            refuted flagged verdict is replaced by a ``Concession``. If a
            required agent is missing, an ``AnalysisResult`` carrying only an
            ``error`` message.
        """
        missing = [role for role in _REQUIRED_ROLES if role not in agents]
        if missing:
            return AnalysisResult(error=f"challenge requires agents: {', '.join(missing)}")

        verdicts = agents["verifier"].run(context)
        flagged = [
            v
            for v in verdicts
            if isinstance(v, Verdict)
            and v.status == "VULNERABLE"
            and v.capability.split(".")[0] in self._taint_capabilities
        ]
        if not flagged:
            # Nothing to challenge: skip the refuter call entirely.
            return AnalysisResult(observations=verdicts)

        refutations = agents["refuter"].run(dataclasses.replace(context, history=flagged))
        # Honour only concessions that target a verdict we actually challenged.
        # Otherwise a stray target from the refuter could dismiss a verdict
        # (e.g. a hardcoded secret) that was deliberately never shown to it.
        challenged = {v.capability for v in flagged}
        reasons = {
            c.target: c.reason
            for c in refutations
            if isinstance(c, Concession) and c.target in challenged
        }

        observations: list[Observation] = []
        for v in verdicts:
            if isinstance(v, Verdict) and v.status == "VULNERABLE" and v.capability in reasons:
                observations.append(
                    Concession(
                        capability=v.capability,
                        produced_by="refuter",
                        target=v.capability,
                        reason=reasons[v.capability] or "refuted as a false positive",
                    )
                )
            else:
                observations.append(v)
        return AnalysisResult(observations=observations)
@@ -11,3 +11,4 @@ _DATA = Path(__file__).resolve().parent / "data"
11
11
  CAPABILITIES_DIR = _DATA / "capabilities"
12
12
  TASKS_DIR = _DATA / "tasks"
13
13
  GOLDEN_DIR = _DATA / "golden"
14
+ SUPPRESSIONS_FILE = _DATA / "suppressions.yaml"