codejury 0.4.1__tar.gz → 0.5.1__tar.gz

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (122) hide show
  1. {codejury-0.4.1 → codejury-0.5.1}/PKG-INFO +20 -5
  2. {codejury-0.4.1 → codejury-0.5.1}/README.md +19 -4
  3. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/debate.py +18 -2
  4. codejury-0.5.1/codejury/agents/refuter.py +76 -0
  5. {codejury-0.4.1 → codejury-0.5.1}/codejury/assembly.py +9 -1
  6. {codejury-0.4.1 → codejury-0.5.1}/codejury/cli.py +73 -6
  7. codejury-0.5.1/codejury/data/suppressions.yaml +43 -0
  8. codejury-0.5.1/codejury/integrations/__init__.py +1 -0
  9. codejury-0.5.1/codejury/integrations/github.py +88 -0
  10. codejury-0.5.1/codejury/orchestrators/challenge.py +67 -0
  11. {codejury-0.4.1 → codejury-0.5.1}/codejury/resources.py +1 -0
  12. codejury-0.5.1/codejury/sources/callers.py +104 -0
  13. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/repo.py +16 -2
  14. codejury-0.5.1/codejury/suppression.py +96 -0
  15. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/PKG-INFO +20 -5
  16. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/SOURCES.txt +9 -0
  17. {codejury-0.4.1 → codejury-0.5.1}/pyproject.toml +1 -1
  18. {codejury-0.4.1 → codejury-0.5.1}/tests/test_callers.py +27 -1
  19. codejury-0.5.1/tests/test_challenge.py +105 -0
  20. codejury-0.5.1/tests/test_integrations.py +82 -0
  21. codejury-0.5.1/tests/test_suppression.py +44 -0
  22. codejury-0.4.1/codejury/sources/callers.py +0 -46
  23. {codejury-0.4.1 → codejury-0.5.1}/LICENSE +0 -0
  24. {codejury-0.4.1 → codejury-0.5.1}/codejury/__init__.py +0 -0
  25. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/__init__.py +0 -0
  26. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/base.py +0 -0
  27. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/mock.py +0 -0
  28. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/parsing.py +0 -0
  29. {codejury-0.4.1 → codejury-0.5.1}/codejury/agents/verifier.py +0 -0
  30. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/authentication.yaml +0 -0
  31. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/authorization.yaml +0 -0
  32. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/business_logic.yaml +0 -0
  33. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/crypto.yaml +0 -0
  34. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/data_protection.yaml +0 -0
  35. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/dependency_config.yaml +0 -0
  36. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/error_logging.yaml +0 -0
  37. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/input_validation.yaml +0 -0
  38. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/output_encoding.yaml +0 -0
  39. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/secrets.yaml +0 -0
  40. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/capabilities/session.yaml +0 -0
  41. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_bcrypt_password.yaml +0 -0
  42. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_jwt_noverify_vuln.yaml +0 -0
  43. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_jwt_verified_safe.yaml +0 -0
  44. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_sha256_checksum_safe.yaml +0 -0
  45. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authn_sha256_password.yaml +0 -0
  46. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authz_idor_vuln.yaml +0 -0
  47. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/authz_owner_safe.yaml +0 -0
  48. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/cmdi_ossystem_vuln.yaml +0 -0
  49. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/cmdi_subprocess_safe.yaml +0 -0
  50. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/crypto_aesgcm_safe.yaml +0 -0
  51. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/crypto_ecb_vuln.yaml +0 -0
  52. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/path_contained_safe.yaml +0 -0
  53. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/path_traversal_vuln.yaml +0 -0
  54. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/secrets_env_safe.yaml +0 -0
  55. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/secrets_hardcoded_vuln.yaml +0 -0
  56. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_format_vuln.yaml +0 -0
  57. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_fstring_query.yaml +0 -0
  58. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/sqli_parameterized_query.yaml +0 -0
  59. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/xss_innerhtml_constant_safe.yaml +0 -0
  60. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/golden/xss_innerhtml_vuln.yaml +0 -0
  61. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/tasks/audit_diff_debate.yaml +0 -0
  62. {codejury-0.4.1 → codejury-0.5.1}/codejury/data/tasks/quick_scan_single.yaml +0 -0
  63. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/__init__.py +0 -0
  64. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/artifact.py +0 -0
  65. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/capability.py +0 -0
  66. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/context.py +0 -0
  67. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/observation.py +0 -0
  68. {codejury-0.4.1 → codejury-0.5.1}/codejury/domain/result.py +0 -0
  69. {codejury-0.4.1 → codejury-0.5.1}/codejury/evaluation.py +0 -0
  70. {codejury-0.4.1 → codejury-0.5.1}/codejury/infrastructure/__init__.py +0 -0
  71. {codejury-0.4.1 → codejury-0.5.1}/codejury/infrastructure/json_parse.py +0 -0
  72. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/__init__.py +0 -0
  73. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/base.py +0 -0
  74. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/debate.py +0 -0
  75. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/pipeline.py +0 -0
  76. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/reflexion.py +0 -0
  77. {codejury-0.4.1 → codejury-0.5.1}/codejury/orchestrators/single.py +0 -0
  78. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/__init__.py +0 -0
  79. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/anthropic.py +0 -0
  80. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/base.py +0 -0
  81. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/litellm.py +0 -0
  82. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/mock.py +0 -0
  83. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/openai.py +0 -0
  84. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/openai_format.py +0 -0
  85. {codejury-0.4.1 → codejury-0.5.1}/codejury/providers/retry.py +0 -0
  86. {codejury-0.4.1 → codejury-0.5.1}/codejury/reporting.py +0 -0
  87. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/__init__.py +0 -0
  88. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/base.py +0 -0
  89. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/chunker.py +0 -0
  90. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/diff.py +0 -0
  91. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/function.py +0 -0
  92. {codejury-0.4.1 → codejury-0.5.1}/codejury/sources/mock.py +0 -0
  93. {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/__init__.py +0 -0
  94. {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/base.py +0 -0
  95. {codejury-0.4.1 → codejury-0.5.1}/codejury/tasks/registry.py +0 -0
  96. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/dependency_links.txt +0 -0
  97. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/entry_points.txt +0 -0
  98. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/requires.txt +0 -0
  99. {codejury-0.4.1 → codejury-0.5.1}/codejury.egg-info/top_level.txt +0 -0
  100. {codejury-0.4.1 → codejury-0.5.1}/setup.cfg +0 -0
  101. {codejury-0.4.1 → codejury-0.5.1}/tests/test_anthropic_provider.py +0 -0
  102. {codejury-0.4.1 → codejury-0.5.1}/tests/test_assembly.py +0 -0
  103. {codejury-0.4.1 → codejury-0.5.1}/tests/test_audit_pipeline.py +0 -0
  104. {codejury-0.4.1 → codejury-0.5.1}/tests/test_capability.py +0 -0
  105. {codejury-0.4.1 → codejury-0.5.1}/tests/test_cli_audit.py +0 -0
  106. {codejury-0.4.1 → codejury-0.5.1}/tests/test_context.py +0 -0
  107. {codejury-0.4.1 → codejury-0.5.1}/tests/test_debate_agents.py +0 -0
  108. {codejury-0.4.1 → codejury-0.5.1}/tests/test_debate_orchestrator.py +0 -0
  109. {codejury-0.4.1 → codejury-0.5.1}/tests/test_diff_source.py +0 -0
  110. {codejury-0.4.1 → codejury-0.5.1}/tests/test_evaluation.py +0 -0
  111. {codejury-0.4.1 → codejury-0.5.1}/tests/test_function_source.py +0 -0
  112. {codejury-0.4.1 → codejury-0.5.1}/tests/test_json_parse.py +0 -0
  113. {codejury-0.4.1 → codejury-0.5.1}/tests/test_litellm_provider.py +0 -0
  114. {codejury-0.4.1 → codejury-0.5.1}/tests/test_openai_provider.py +0 -0
  115. {codejury-0.4.1 → codejury-0.5.1}/tests/test_orchestrator.py +0 -0
  116. {codejury-0.4.1 → codejury-0.5.1}/tests/test_pipeline_orchestrator.py +0 -0
  117. {codejury-0.4.1 → codejury-0.5.1}/tests/test_reflexion_orchestrator.py +0 -0
  118. {codejury-0.4.1 → codejury-0.5.1}/tests/test_repo_source.py +0 -0
  119. {codejury-0.4.1 → codejury-0.5.1}/tests/test_reporting.py +0 -0
  120. {codejury-0.4.1 → codejury-0.5.1}/tests/test_retry_provider.py +0 -0
  121. {codejury-0.4.1 → codejury-0.5.1}/tests/test_tasks.py +0 -0
  122. {codejury-0.4.1 → codejury-0.5.1}/tests/test_verifier.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: codejury
3
- Version: 0.4.1
3
+ Version: 0.5.1
4
4
  Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
5
5
  Author: AISecLabs
6
6
  License-Expression: MIT
@@ -58,6 +58,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
58
58
  ## Quickstart
59
59
 
60
60
  ```bash
61
+ # CI gate: exit 1 if a high-severity issue is found
62
+ git diff origin/main... | codejury audit --fail-on high -
63
+
64
+ # Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
65
+ git diff origin/main... | codejury audit --github your-org/your-repo#123 -
66
+
61
67
  # No API key needed -- prove the pipeline runs end to end with mock layers
62
68
  codejury dry-run
63
69
 
@@ -76,9 +82,13 @@ git diff | codejury audit --provider anthropic
76
82
  | `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
77
83
  | `codejury eval` | Score the golden cases and report precision / recall. |
78
84
 
79
- Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
85
+ Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
80
86
  `--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
81
87
 
88
+ Findings in known-noise categories (availability/DoS, rate limiting, logging noise,
89
+ ReDoS, memory safety outside C/C++) are dropped by versioned rules in
90
+ `codejury/data/suppressions.yaml`; disable with `--no-suppress`.
91
+
82
92
  ```bash
83
93
  # Multi-round adversarial debate, rendered as Markdown
84
94
  git diff | codejury audit --orchestrator debate --format markdown - > report.md
@@ -157,9 +167,14 @@ independently.
157
167
  - **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
158
168
  from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
159
169
  ones like path traversal over-flag in single-file review because the verifier
160
- can't see whether a value is attacker-controlled. `scan --callers` adds
161
- cross-file call sites for provenance (helps some cases, not a full fix); also
162
- scope with `--only` or challenge findings with `--orchestrator debate`.
170
+ can't see whether a value is attacker-controlled. Mitigations that add context
171
+ but do not fully solve it: `scan --callers` (where this file's functions are
172
+ called) and `scan --callees` (the called code it delegates to, so a sink in
173
+ another file is visible) -- pair them for both directions; `--orchestrator
174
+ challenge` (a recall-safe refutation pass that drops only flags it can
175
+ prove are false positives); `--only` to scope; or
176
+ `--orchestrator debate`. Real taint precision still needs data-flow analysis,
177
+ not model skepticism.
163
178
  - **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
164
179
  not a quick check -- scope it with `--only`. Day to day, audit the diff.
165
180
 
@@ -29,6 +29,12 @@ pip install 'codejury[anthropic]' # the provider you'll use: anthropic | open
29
29
  ## Quickstart
30
30
 
31
31
  ```bash
32
+ # CI gate: exit 1 if a high-severity issue is found
33
+ git diff origin/main... | codejury audit --fail-on high -
34
+
35
+ # Post inline review comments on a GitHub pull request (needs GITHUB_TOKEN)
36
+ git diff origin/main... | codejury audit --github your-org/your-repo#123 -
37
+
32
38
  # No API key needed -- prove the pipeline runs end to end with mock layers
33
39
  codejury dry-run
34
40
 
@@ -47,9 +53,13 @@ git diff | codejury audit --provider anthropic
47
53
  | `codejury run <task>` | Run a named task preset (see [Tasks](#tasks)). |
48
54
  | `codejury eval` | Score the golden cases and report precision / recall. |
49
55
 
50
- Shared flags: `--orchestrator {single,pipeline,debate,reflexion}`,
56
+ Shared flags: `--orchestrator {single,pipeline,debate,reflexion,challenge}`,
51
57
  `--provider {anthropic,openai,litellm}`, `--model`, `--format {text,markdown,json}`.
52
58
 
59
+ Findings in known-noise categories (availability/DoS, rate limiting, logging noise,
60
+ ReDoS, memory safety outside C/C++) are dropped by versioned rules in
61
+ `codejury/data/suppressions.yaml`; disable with `--no-suppress`.
62
+
53
63
  ```bash
54
64
  # Multi-round adversarial debate, rendered as Markdown
55
65
  git diff | codejury audit --orchestrator debate --format markdown - > report.md
@@ -128,9 +138,14 @@ independently.
128
138
  - **Local-pattern checks are sharper than data-flow ones.** Capabilities judged
129
139
  from one spot (weak crypto, hardcoded secrets) are reliable; taint / data-flow
130
140
  ones like path traversal over-flag in single-file review because the verifier
131
- can't see whether a value is attacker-controlled. `scan --callers` adds
132
- cross-file call sites for provenance (helps some cases, not a full fix); also
133
- scope with `--only` or challenge findings with `--orchestrator debate`.
141
+ can't see whether a value is attacker-controlled. Mitigations that add context
142
+ but do not fully solve it: `scan --callers` (where this file's functions are
143
+ called) and `scan --callees` (the called code it delegates to, so a sink in
144
+ another file is visible) -- pair them for both directions; `--orchestrator
145
+ challenge` (a recall-safe refutation pass that drops only flags it can
146
+ prove are false positives); `--only` to scope; or
147
+ `--orchestrator debate`. Real taint precision still needs data-flow analysis,
148
+ not model skepticism.
134
149
  - **`scan` cost scales as files x capabilities.** It is a periodic deep audit,
135
150
  not a quick check -- scope it with `--only`. Day to day, audit the diff.
136
151
 
@@ -30,6 +30,20 @@ _FINDING_SHAPE = (
30
30
  '"description": "...", "evidence": [{"file": "...", "line": 0, "code": "..."}], "confidence": 0.0}'
31
31
  )
32
32
 
33
+ _DEEP_LENS = (
34
+ "Look past surface patterns for the deepest flaw:\n"
35
+ "- Trust anchors: what does this code trust to authenticate or authorize -- a key, token, header, "
36
+ "signature, role, or caller -- and who controls that value? If the attacker supplies what is used to "
37
+ "verify them (e.g. their own public key, an unconfigured key that disables verification), passing the "
38
+ "check proves nothing.\n"
39
+ "- Order of operations: is an external, irreversible, or privileged action performed before the local "
40
+ "state is committed, or before the check that should guard it? Can a check and the action it guards be "
41
+ "split apart under concurrency (race / TOCTOU) or partial failure (on-chain done, DB rolled back)?\n"
42
+ "- Attack chains: combine several weak points into one end-to-end exploit.\n"
43
+ "Prefer the deepest design/authorization/state flaw over surface issues like missing rate limiting or "
44
+ "verbose logging; report those only as secondary."
45
+ )
46
+
33
47
 
34
48
  class _DebateAgent(Agent):
35
49
  """Shared provider plumbing for the three debate roles."""
@@ -60,7 +74,7 @@ class FinderAgent(_DebateAgent):
60
74
  )
61
75
 
62
76
  def run(self, ctx: AnalysisContext) -> list[Observation]:
63
- parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _code(ctx.artifact)]
77
+ parts = ["Review the code for security vulnerabilities.", _hints(ctx.capabilities), _DEEP_LENS, _code(ctx.artifact)]
64
78
  if ctx.round_num > 1 and ctx.history:
65
79
  parts.append(_render_history(ctx.history))
66
80
  parts.append("Concede findings the rebuttals refute, keep the valid ones, and add any you missed.")
@@ -84,7 +98,9 @@ class ChallengerAgent(_DebateAgent):
84
98
  def run(self, ctx: AnalysisContext) -> list[Observation]:
85
99
  parts = [
86
100
  "Challenge the findings below. For each one you believe is a false positive, write a rebuttal. "
87
- "Add new_findings for any real issue that was missed.",
101
+ "Add new_findings for any real issue that was missed -- especially a deeper flaw the finder "
102
+ "stopped short of.",
103
+ _DEEP_LENS,
88
104
  _code(ctx.artifact),
89
105
  _render_history(ctx.history),
90
106
  'Respond as JSON: {"rebuttals": [{"target": "finding title", "reason": "..."}], '
@@ -0,0 +1,76 @@
1
+ """RefuterAgent -- a skeptic that tries to dismiss flagged verdicts as false positives.
2
+
3
+ Used by the challenge orchestrator: the verifier flags issues, then the refuter
4
+ gets the code plus the VULNERABLE verdicts (via ``ctx.history``) and argues which
5
+ are false positives -- e.g. a value that is not actually attacker-controlled or a
6
+ sink that is not reachable. It returns a Concession per verdict it refutes.
7
+
8
+ This is the cheap, focused alternative to a full debate: only flagged verdicts
9
+ are challenged, not the whole file.
10
+ """
11
+
12
+ from __future__ import annotations
13
+
14
+ from codejury.agents.base import Agent
15
+ from codejury.domain.context import AnalysisContext
16
+ from codejury.domain.observation import Concession, Observation, Verdict
17
+ from codejury.infrastructure.json_parse import extract_json_object
18
+ from codejury.providers.base import Message, Provider
19
+
20
+ _SYSTEM = (
21
+ "You are a careful security reviewer checking flagged issues for false positives. "
22
+ "Security errs toward keeping a flag: refute one ONLY when the code in front of you "
23
+ "affirmatively proves the value is not attacker-controlled. If a value's origin is not "
24
+ "shown, or it could plausibly come from external/untrusted input, KEEP the flag. "
25
+ "Respond with a single JSON object and nothing else."
26
+ )
27
+
28
+ _JSON_SHAPE = '{"refuted": [{"capability": "id.sub", "reason": "proof it is not attacker-controlled"}]}'
29
+
30
+
31
+ class RefuterAgent(Agent):
32
+ def __init__(self, *, provider: Provider, model: str, max_tokens: int = 1024) -> None:
33
+ self._provider = provider
34
+ self._model = model
35
+ self._max_tokens = max_tokens
36
+
37
+ def run(self, ctx: AnalysisContext) -> list[Observation]:
38
+ flagged = [o for o in ctx.history if isinstance(o, Verdict)]
39
+ if not flagged:
40
+ return []
41
+ flags = "\n".join(f"- {v.capability}: {v.reasoning}" for v in flagged)
42
+ context_block = (
43
+ f"Call sites elsewhere (for tracing where arguments come from):\n```\n{ctx.artifact.context}\n```\n\n"
44
+ if ctx.artifact.context
45
+ else ""
46
+ )
47
+ prompt = (
48
+ f"Code under review ({ctx.artifact.path}):\n```\n{ctx.artifact.content}\n```\n\n"
49
+ f"{context_block}"
50
+ f"Flagged issues:\n{flags}\n\n"
51
+ "This attacker-control reasoning applies ONLY to input-driven issues (injection, path "
52
+ "traversal, SSRF). For those, refute a flag only if you can affirmatively prove the value "
53
+ "is not attacker-controlled: a stored data field, or traced (here or in the call sites) to "
54
+ "a trusted, config, or operator-supplied source. If its origin is not shown or could "
55
+ "plausibly be external input, do NOT refute. For other issue types (hardcoded secrets, "
56
+ "weak crypto, ...), a literal value is often the vulnerability itself -- do NOT refute "
57
+ "those just because a value is constant.\n\n"
58
+ "Respond with a single JSON object exactly like:\n" + _JSON_SHAPE
59
+ )
60
+ result = self._provider.complete(
61
+ system=_SYSTEM,
62
+ messages=[Message(role="user", content=prompt)],
63
+ model=self._model,
64
+ max_tokens=self._max_tokens,
65
+ )
66
+ obj = extract_json_object(result.text) or {}
67
+ out: list[Observation] = []
68
+ for item in obj.get("refuted", []):
69
+ if not isinstance(item, dict):
70
+ continue
71
+ capability = str(item.get("capability", "")).strip()
72
+ if capability:
73
+ out.append(
74
+ Concession(capability=capability, produced_by="refuter", target=capability, reason=str(item.get("reason", "")))
75
+ )
76
+ return out
@@ -10,12 +10,14 @@ import os
10
10
 
11
11
  from codejury.agents.base import Agent
12
12
  from codejury.agents.debate import ChallengerAgent, FinderAgent, JudgeAgent
13
+ from codejury.agents.refuter import RefuterAgent
13
14
  from codejury.agents.verifier import VerifierAgent
14
15
  from codejury.domain.artifact import CodeArtifact
15
16
  from codejury.domain.capability import Capability
16
17
  from codejury.domain.context import AnalysisContext
17
18
  from codejury.domain.result import AnalysisResult
18
19
  from codejury.orchestrators.base import Orchestrator
20
+ from codejury.orchestrators.challenge import ChallengeOrchestrator
19
21
  from codejury.orchestrators.debate import DebateOrchestrator
20
22
  from codejury.orchestrators.pipeline import PipelineOrchestrator
21
23
  from codejury.orchestrators.reflexion import ReflexionOrchestrator
@@ -27,7 +29,7 @@ from codejury.providers.openai import OpenAIProvider
27
29
  from codejury.providers.retry import RetryProvider
28
30
  from codejury.sources.base import Source
29
31
 
30
- STRATEGIES = ("single", "pipeline", "debate", "reflexion")
32
+ STRATEGIES = ("single", "pipeline", "debate", "reflexion", "challenge")
31
33
  PROVIDERS = ("anthropic", "openai", "litellm")
32
34
  DEFAULT_MODEL = os.environ.get("CODEJURY_MODEL", "claude-sonnet-4-6")
33
35
  DEFAULT_API_BASE = os.environ.get("CODEJURY_API_BASE")
@@ -61,6 +63,12 @@ def build_orchestration(
61
63
  "critic": ChallengerAgent(provider=provider, model=model, max_tokens=max_tokens),
62
64
  }
63
65
  return agents, ReflexionOrchestrator()
66
+ if strategy == "challenge":
67
+ agents = {
68
+ "verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens),
69
+ "refuter": RefuterAgent(provider=provider, model=model),
70
+ }
71
+ return agents, ChallengeOrchestrator()
64
72
  verifier = {"verifier": VerifierAgent(provider=provider, model=model, max_tokens=max_tokens)}
65
73
  if strategy == "pipeline":
66
74
  return verifier, PipelineOrchestrator()
@@ -9,6 +9,7 @@ library, backed by the Anthropic provider, under a chosen orchestration strategy
9
9
  from __future__ import annotations
10
10
 
11
11
  import argparse
12
+ import os
12
13
  import sys
13
14
 
14
15
  from codejury.agents.mock import MockAgent
@@ -33,7 +34,9 @@ from codejury.orchestrators.single import SingleOrchestrator
33
34
  from codejury.providers.base import Provider
34
35
  from codejury.providers.mock import MockProvider
35
36
  from codejury.reporting import to_json, to_markdown
36
- from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, TASKS_DIR
37
+ from codejury.resources import CAPABILITIES_DIR, GOLDEN_DIR, SUPPRESSIONS_FILE, TASKS_DIR
38
+ from codejury.suppression import filter_results, load_suppressions
39
+ from codejury.integrations.github import build_review, parse_pr_ref, post_review
37
40
  from codejury.sources.chunker import Chunker
38
41
  from codejury.sources.diff import DiffSource
39
42
  from codejury.sources.repo import RepoSource
@@ -83,10 +86,15 @@ def scan(
83
86
  extensions: tuple[str, ...] = (".py",),
84
87
  max_chars: int = 200_000,
85
88
  with_callers: bool = False,
89
+ with_callees: bool = False,
86
90
  ) -> list[tuple[str, AnalysisResult]]:
87
91
  """Audit every matching file in a directory tree, returning (path, result) per artifact."""
88
92
  source = RepoSource(
89
- directory, extensions=extensions, chunker=Chunker(max_chars=max_chars), with_callers=with_callers
93
+ directory,
94
+ extensions=extensions,
95
+ chunker=Chunker(max_chars=max_chars),
96
+ with_callers=with_callers,
97
+ with_callees=with_callees,
90
98
  )
91
99
  artifacts = source.list_artifacts()
92
100
  calls = len(artifacts) * len(capabilities)
@@ -137,6 +145,50 @@ def _render_results(fmt: str, results: list[tuple[str, AnalysisResult]]) -> str:
137
145
  return {"text": _render_audit, "markdown": to_markdown, "json": to_json}[fmt](results)
138
146
 
139
147
 
148
+ def _maybe_suppress(results: list[tuple[str, AnalysisResult]], enabled: bool) -> list[tuple[str, AnalysisResult]]:
149
+ if not enabled:
150
+ return results
151
+ filtered, suppressed = filter_results(results, load_suppressions(SUPPRESSIONS_FILE))
152
+ if suppressed:
153
+ print(f"suppressed {len(suppressed)} known-noise finding(s) by rule", file=sys.stderr)
154
+ return filtered
155
+
156
+ _FAIL_ON = ("critical", "high", "medium", "low")
157
+ _SEVERITY_RANK = {"critical": 4, "high": 3, "medium": 2, "low": 1, "info": 0}
158
+
159
+
160
+ def _problem_rank(o: Observation) -> int:
161
+ if o.kind == "finding":
162
+ return _SEVERITY_RANK.get(o.severity.lower(), 2)
163
+ if o.kind == "verdict" and o.status == "VULNERABLE":
164
+ return _SEVERITY_RANK["high"]
165
+ if o.kind == "verdict" and o.status == "PARTIAL":
166
+ return _SEVERITY_RANK["medium"]
167
+ return -1
168
+
169
+
170
+ def _gate_exit(results: list[tuple[str, AnalysisResult]], fail_on: str | None) -> int:
171
+ if not fail_on:
172
+ return 0
173
+ worst = max((_problem_rank(o) for _, r in results for o in r.observations), default=-1)
174
+ return 1 if worst >= _SEVERITY_RANK[fail_on] else 0
175
+
176
+
177
+ def _maybe_post_github(ref: str | None, results: list[tuple[str, AnalysisResult]]) -> None:
178
+ if not ref:
179
+ return
180
+ token = os.environ.get("GITHUB_TOKEN")
181
+ if not token:
182
+ print("GITHUB_TOKEN not set; skipping PR review", file=sys.stderr)
183
+ return
184
+ try:
185
+ owner, repo, pull = parse_pr_ref(ref)
186
+ post_review(owner, repo, pull, build_review(results), token=token)
187
+ print(f"posted review to {ref}", file=sys.stderr)
188
+ except Exception as exc:
189
+ print(f"github review failed: {exc}", file=sys.stderr)
190
+
191
+
140
192
  def _render_metrics(m: Metrics) -> str:
141
193
  return (
142
194
  f"cases: {m.total} (tp={m.tp} fp={m.fp} tn={m.tn} fn={m.fn})\n"
@@ -168,6 +220,9 @@ def main(argv: list[str] | None = None) -> int:
168
220
  audit_p.add_argument("--retries", type=int, default=0, help="provider retry attempts on failure")
169
221
  audit_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
170
222
  audit_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
223
+ audit_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
224
+ audit_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
225
+ audit_p.add_argument("--github", default=None, help="post a PR review: owner/repo#number (needs GITHUB_TOKEN)")
171
226
 
172
227
  scan_p = sub.add_parser("scan", help="audit a whole directory tree (deep, capability by capability)")
173
228
  scan_p.add_argument("directory", help="directory to scan")
@@ -181,10 +236,15 @@ def main(argv: list[str] | None = None) -> int:
181
236
  scan_p.add_argument("--max-tokens", type=int, default=2048)
182
237
  scan_p.add_argument("--max-chars", type=int, default=200_000, help="chunk budget; default keeps whole files")
183
238
  scan_p.add_argument(
184
- "--callers", action="store_true", help="add cross-file call sites as context (cuts taint false positives)"
239
+ "--callers", action="store_true", help="add cross-file context: where this file's functions are called"
240
+ )
241
+ scan_p.add_argument(
242
+ "--callees", action="store_true", help="add cross-file context: the called code this file delegates to"
185
243
  )
186
244
  scan_p.add_argument("--api-base", default=DEFAULT_API_BASE, help="provider base URL (env: CODEJURY_API_BASE)")
187
245
  scan_p.add_argument("--api-key", default=DEFAULT_API_KEY, help="provider API key (env: CODEJURY_API_KEY)")
246
+ scan_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
247
+ scan_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
188
248
 
189
249
  run_p = sub.add_parser("run", help="run a named task preset against a unified diff")
190
250
  run_p.add_argument("task", help="task name")
@@ -192,6 +252,8 @@ def main(argv: list[str] | None = None) -> int:
192
252
  run_p.add_argument("--tasks", default=TASKS_DIR, help="task YAML directory")
193
253
  run_p.add_argument("--capabilities", default=CAPABILITIES_DIR, help="capability YAML directory")
194
254
  run_p.add_argument("--format", choices=_FORMATS, default="text", dest="fmt")
255
+ run_p.add_argument("--no-suppress", action="store_true", help="disable the known-noise suppression filter")
256
+ run_p.add_argument("--fail-on", choices=_FAIL_ON, default=None, dest="fail_on", help="exit 1 if a finding at/above this severity is found")
195
257
 
196
258
  eval_p = sub.add_parser("eval", help="score golden cases and report precision/recall")
197
259
  eval_p.add_argument("--golden", default=GOLDEN_DIR, help="golden case YAML directory")
@@ -214,8 +276,10 @@ def main(argv: list[str] | None = None) -> int:
214
276
  max_tokens=args.max_tokens,
215
277
  strategy=args.orchestrator,
216
278
  )
279
+ results = _maybe_suppress(results, not args.no_suppress)
217
280
  print(_render_results(args.fmt, results))
218
- return 0
281
+ _maybe_post_github(args.github, results)
282
+ return _gate_exit(results, args.fail_on)
219
283
 
220
284
  if args.command == "scan":
221
285
  capabilities = load_capabilities(args.capabilities)
@@ -233,9 +297,11 @@ def main(argv: list[str] | None = None) -> int:
233
297
  extensions=extensions,
234
298
  max_chars=args.max_chars,
235
299
  with_callers=args.callers,
300
+ with_callees=args.callees,
236
301
  )
302
+ results = _maybe_suppress(results, not args.no_suppress)
237
303
  print(_render_results(args.fmt, results))
238
- return 0
304
+ return _gate_exit(results, args.fail_on)
239
305
 
240
306
  if args.command == "run":
241
307
  tasks = load_tasks(args.tasks)
@@ -245,8 +311,9 @@ def main(argv: list[str] | None = None) -> int:
245
311
  results = run_task(
246
312
  tasks[args.task], DiffSource(_read_diff(args.diff)), load_capabilities(args.capabilities)
247
313
  )
314
+ results = _maybe_suppress(results, not args.no_suppress)
248
315
  print(_render_results(args.fmt, results))
249
- return 0
316
+ return _gate_exit(results, args.fail_on)
250
317
 
251
318
  if args.command == "eval":
252
319
  try:
@@ -0,0 +1,43 @@
1
+ # Known-noise suppression rules (data-driven false-positive filter).
2
+ # Each rule drops a flagged finding whose text matches and whose path condition (if any) holds.
3
+ # Keep these to out-of-scope / low-signal CATEGORIES -- never key on a real
4
+ # vulnerability class, or you will drop true findings.
5
+
6
+ - id: SUP-AVAILABILITY
7
+ reason: availability / DoS / rate-limiting findings are out of scope and low-signal here
8
+ match_any:
9
+ - "denial of service"
10
+ - "denial-of-service"
11
+ - "rate limit"
12
+ - "rate-limit"
13
+ - "rate limiting"
14
+ - "resource exhaustion"
15
+ - "unbounded"
16
+ - "amplification"
17
+
18
+ - id: SUP-LOGGING-NOISE
19
+ reason: verbose / insufficient logging is noise unless a secret value is logged
20
+ match_any:
21
+ - "verbose logging"
22
+ - "insufficient logging"
23
+ - "excessive logging"
24
+ - "lack of logging"
25
+ - "log verbosity"
26
+
27
+ - id: SUP-MEMORY-SAFETY-NON-C
28
+ reason: memory-safety issues do not apply outside C/C++
29
+ match_any:
30
+ - "buffer overflow"
31
+ - "use after free"
32
+ - "use-after-free"
33
+ - "double free"
34
+ - "memory corruption"
35
+ - "out-of-bounds"
36
+ unless_path_ext: [".c", ".cc", ".cpp", ".cxx", ".h", ".hpp"]
37
+
38
+ - id: SUP-REDOS
39
+ reason: regex denial-of-service / catastrophic backtracking is low-signal here
40
+ match_any:
41
+ - "redos"
42
+ - "catastrophic backtracking"
43
+ - "regex denial"
@@ -0,0 +1 @@
1
+ """codejury.integrations -- post results to external systems (GitHub PR reviews)."""
@@ -0,0 +1,88 @@
1
+ """Post audit results to a GitHub pull request as a review with inline comments.
2
+
3
+ ``build_review`` is a pure function (results -> GitHub review payload) so it is
4
+ unit-testable; ``post_review`` does the HTTP POST and accepts an injectable
5
+ transport so it can be tested without a token or a live PR. Problems with a
6
+ usable file:line become inline comments; everything else is summarized in the
7
+ review body. The review requests changes when any problem is found.
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import json
13
+ import urllib.request
14
+ from typing import Any, Callable
15
+
16
+ from codejury.domain.observation import Observation
17
+ from codejury.domain.result import AnalysisResult
18
+
19
+ Results = list[tuple[str, AnalysisResult]]
20
+
21
+
22
def build_review(results: Results, *, max_comments: int = 50) -> dict:
    """Turn audit results into a GitHub pull-request review payload.

    Every finding and every VULNERABLE verdict counts as a problem. A problem
    whose first evidence entry has both a file and a line becomes an inline
    comment (at most ``max_comments`` of them); a problem without a usable
    location is listed in the review body instead of being dropped, so a
    review can never report "no issues" while a problem exists.

    Args:
        results: ``(path, result)`` pairs; each result exposes ``observations``.
        max_comments: cap on inline comments, and on body lines used to list
            problems that have no location.

    Returns:
        A dict with ``body``, ``event`` (``REQUEST_CHANGES`` when any problem
        was found, otherwise ``COMMENT``) and ``comments``.
    """
    comments: list[dict] = []
    unlocated: list[str] = []
    located = 0
    for path, result in results:
        for o in result.observations:
            headline = _problem_headline(o)
            if headline is None:
                continue  # not a problem (e.g. a SECURE verdict)
            comment = _inline_comment(o)
            if comment is None:
                # A real problem without file:line: keep it visible in the body.
                unlocated.append(f"- `{path}`: {headline}")
                continue
            located += 1
            if len(comments) < max_comments:
                comments.append(comment)

    problems = located + len(unlocated)
    body = (
        f"codejury found {problems} issue(s)." if problems else "codejury found no issues."
    )
    if located > len(comments):
        body += f" Showing {len(comments)} inline; {located - len(comments)} more omitted."
    if unlocated:
        shown = unlocated[: max(max_comments, 0)]
        body += f"\n\n{len(unlocated)} issue(s) had no usable file:line:"
        if shown:
            body += "\n" + "\n".join(shown)
        if len(unlocated) > len(shown):
            body += f"\n- ... and {len(unlocated) - len(shown)} more"
    return {
        "body": body,
        "event": "REQUEST_CHANGES" if problems else "COMMENT",
        "comments": comments,
    }


def _problem_headline(o: Observation) -> str | None:
    """Return the one-line markdown headline for a problem, or None if ``o`` is not one."""
    if o.kind == "finding":
        cwe = f" ({o.cwe})" if o.cwe else ""
        return f"**{o.severity}{cwe}** {o.title}"
    if o.kind == "verdict" and o.status == "VULNERABLE":
        return f"**VULNERABLE** `{o.capability}`"
    return None


def _inline_comment(o: Observation) -> dict | None:
    """Build the inline review comment for a problem that has a usable location.

    Returns None when ``o`` is not a problem, or when its first evidence entry
    is missing or lacks a file or a line.
    """
    headline = _problem_headline(o)
    if headline is None:
        return None
    evidence = o.evidence[0] if o.evidence else None
    if not (evidence and evidence.file and evidence.line):
        return None
    # Findings carry a description; verdicts carry the verifier's reasoning.
    detail = o.description if o.kind == "finding" else o.reasoning
    return {"path": evidence.file, "line": evidence.line, "body": f"{headline}\n\n{detail}"}
57
+
58
+
59
def post_review(
    owner: str,
    repo: str,
    pull: int,
    payload: dict,
    *,
    token: str,
    transport: Callable[[str, bytes, dict], Any] | None = None,
    timeout: float = 30.0,
) -> Any:
    """POST a review payload to a GitHub pull request.

    Args:
        owner: repository owner (user or organization).
        repo: repository name.
        pull: pull-request number.
        payload: review payload, serialized to JSON as the request body.
        token: sent as a ``Bearer`` token in the ``Authorization`` header.
        transport: optional callable ``(url, data, headers)`` used instead of
            a real HTTP request; its return value is returned unchanged.
        timeout: seconds to wait on the real HTTP request. Without a timeout
            a stalled connection would block the caller indefinitely.

    Returns:
        The transport's return value, or the HTTP status of the response.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/pulls/{pull}/reviews"
    data = json.dumps(payload).encode()
    headers = {
        "Authorization": f"Bearer {token}",
        "Accept": "application/vnd.github+json",
        "Content-Type": "application/json",
    }
    if transport is not None:
        return transport(url, data, headers)
    request = urllib.request.Request(url, data=data, headers=headers, method="POST")
    with urllib.request.urlopen(request, timeout=timeout) as response:
        return response.status
80
+
81
+
82
def parse_pr_ref(ref: str) -> tuple[str, str, int]:
    """Parse 'owner/repo#123' into (owner, repo, pull_number).

    Raises:
        ValueError: if ``ref`` is not exactly ``owner/repo#number`` -- a
            missing part, extra path segments in the repo, or a pull number
            that is not plain ASCII digits.
    """
    repo_part, _, number = ref.partition("#")
    owner, _, repo = repo_part.partition("/")
    # partition() leaves any further "/" inside repo; reject it so
    # "owner/a/b#1" cannot produce a malformed API URL.
    well_formed = (
        bool(owner)
        and bool(repo)
        and "/" not in repo
        # isdigit() alone also accepts non-ASCII digit characters.
        and number.isascii()
        and number.isdigit()
    )
    if not well_formed:
        raise ValueError(f"expected owner/repo#number, got {ref!r}")
    return owner, repo, int(number)
@@ -0,0 +1,67 @@
1
+ """ChallengeOrchestrator -- verify, then challenge the flagged verdicts.
2
+
3
+ The verifier rules on every capability; then a refuter is shown only the
4
+ VULNERABLE verdicts and the code, and argues which are false positives. A refuted
5
+ verdict becomes a dismissed Concession (recording why), so the report keeps the
6
+ SECURE/NOT_PRESENT verdicts, the surviving VULNERABLE ones, and a Dismissed list.
7
+
8
+ This targets taint-style false positives (which a lone verifier over-reports)
9
+ while paying the extra model call only for flagged verdicts, not the whole file.
10
+
11
+ Only verdicts from taint-prone capabilities are challenged. Local-pattern issues
12
+ (hardcoded secrets, weak crypto) are kept as-is: refuting them risks dropping a
13
+ real finding, and they do not have the attacker-control ambiguity that makes
14
+ taint checks over-report.
15
+ """
16
+
17
+ from __future__ import annotations
18
+
19
+ import dataclasses
20
+
21
+ from codejury.agents.base import Agent
22
+ from codejury.domain.context import AnalysisContext
23
+ from codejury.domain.observation import Concession, Observation, Verdict
24
+ from codejury.domain.result import AnalysisResult
25
+ from codejury.orchestrators.base import Orchestrator
26
+
27
+ _REQUIRED_ROLES = ("verifier", "refuter")
28
+ _DEFAULT_TAINT_CAPABILITIES = frozenset({"input_validation"})
29
+
30
+
31
class ChallengeOrchestrator(Orchestrator):
    """Run the verifier, then let a refuter challenge the flagged verdicts.

    Only VULNERABLE verdicts whose capability's first dotted segment is in
    ``taint_capabilities`` are shown to the refuter. A challenged verdict that
    the refuter concedes is replaced by a Concession recording the reason;
    every other observation is passed through unchanged.
    """

    def __init__(self, *, taint_capabilities: frozenset[str] = _DEFAULT_TAINT_CAPABILITIES) -> None:
        """Store the set of capability prefixes whose verdicts may be challenged."""
        self._taint_capabilities = taint_capabilities

    def run(self, agents: dict[str, Agent], context: AnalysisContext) -> AnalysisResult:
        """Verify, challenge the flagged verdicts, and return the merged observations.

        Args:
            agents: must contain the ``verifier`` and ``refuter`` roles.
            context: analysis context handed to the verifier; the refuter gets
                a copy whose ``history`` is the list of flagged verdicts.

        Returns:
            An AnalysisResult with ``error`` set when a required role is
            missing; otherwise one holding the verifier's observations, with
            each refuted challenged verdict replaced by a Concession.
        """
        missing = [role for role in _REQUIRED_ROLES if role not in agents]
        if missing:
            return AnalysisResult(error=f"challenge requires agents: {', '.join(missing)}")

        verdicts = agents["verifier"].run(context)
        flagged = [
            v
            for v in verdicts
            if isinstance(v, Verdict)
            and v.status == "VULNERABLE"
            and v.capability.split(".")[0] in self._taint_capabilities
        ]
        if not flagged:
            # Nothing to challenge: skip the extra refuter call.
            return AnalysisResult(observations=verdicts)

        refutations = agents["refuter"].run(dataclasses.replace(context, history=flagged))
        # Honour a concession only if it targets a verdict that was actually
        # challenged. Otherwise a stray target could dismiss a non-taint
        # verdict (e.g. a hardcoded secret) that must be kept as-is.
        challenged = {v.capability for v in flagged}
        reasons = {
            c.target: c.reason
            for c in refutations
            if isinstance(c, Concession) and c.target in challenged
        }

        observations: list[Observation] = []
        for v in verdicts:
            if isinstance(v, Verdict) and v.status == "VULNERABLE" and v.capability in reasons:
                observations.append(
                    Concession(
                        capability=v.capability,
                        produced_by="refuter",
                        target=v.capability,
                        reason=reasons[v.capability] or "refuted as a false positive",
                    )
                )
            else:
                observations.append(v)
        return AnalysisResult(observations=observations)
@@ -11,3 +11,4 @@ _DATA = Path(__file__).resolve().parent / "data"
11
11
  CAPABILITIES_DIR = _DATA / "capabilities"
12
12
  TASKS_DIR = _DATA / "tasks"
13
13
  GOLDEN_DIR = _DATA / "golden"
14
+ SUPPRESSIONS_FILE = _DATA / "suppressions.yaml"