codejury 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- codejury/__init__.py +8 -0
- codejury/agents/__init__.py +6 -0
- codejury/agents/base.py +21 -0
- codejury/agents/debate.py +188 -0
- codejury/agents/mock.py +38 -0
- codejury/agents/parsing.py +42 -0
- codejury/agents/verifier.py +106 -0
- codejury/assembly.py +76 -0
- codejury/cli.py +196 -0
- codejury/data/capabilities/authentication.yaml +67 -0
- codejury/data/capabilities/authorization.yaml +55 -0
- codejury/data/capabilities/business_logic.yaml +58 -0
- codejury/data/capabilities/crypto.yaml +78 -0
- codejury/data/capabilities/data_protection.yaml +57 -0
- codejury/data/capabilities/dependency_config.yaml +52 -0
- codejury/data/capabilities/error_logging.yaml +49 -0
- codejury/data/capabilities/input_validation.yaml +92 -0
- codejury/data/capabilities/output_encoding.yaml +56 -0
- codejury/data/capabilities/secrets.yaml +51 -0
- codejury/data/capabilities/session.yaml +60 -0
- codejury/data/golden/authn_bcrypt_password.yaml +5 -0
- codejury/data/golden/authn_sha256_password.yaml +5 -0
- codejury/data/golden/sqli_fstring_query.yaml +5 -0
- codejury/data/golden/sqli_parameterized_query.yaml +5 -0
- codejury/data/tasks/audit_diff_debate.yaml +4 -0
- codejury/data/tasks/quick_scan_single.yaml +4 -0
- codejury/domain/__init__.py +5 -0
- codejury/domain/artifact.py +20 -0
- codejury/domain/capability.py +123 -0
- codejury/domain/context.py +26 -0
- codejury/domain/observation.py +104 -0
- codejury/domain/result.py +19 -0
- codejury/evaluation.py +107 -0
- codejury/infrastructure/__init__.py +4 -0
- codejury/infrastructure/json_parse.py +57 -0
- codejury/orchestrators/__init__.py +6 -0
- codejury/orchestrators/base.py +19 -0
- codejury/orchestrators/debate.py +57 -0
- codejury/orchestrators/pipeline.py +32 -0
- codejury/orchestrators/reflexion.py +58 -0
- codejury/orchestrators/single.py +24 -0
- codejury/providers/__init__.py +5 -0
- codejury/providers/anthropic.py +68 -0
- codejury/providers/base.py +42 -0
- codejury/providers/litellm.py +68 -0
- codejury/providers/mock.py +32 -0
- codejury/providers/openai.py +57 -0
- codejury/providers/openai_format.py +30 -0
- codejury/providers/retry.py +48 -0
- codejury/reporting.py +114 -0
- codejury/resources.py +13 -0
- codejury/sources/__init__.py +6 -0
- codejury/sources/base.py +17 -0
- codejury/sources/chunker.py +33 -0
- codejury/sources/diff.py +69 -0
- codejury/sources/function.py +35 -0
- codejury/sources/mock.py +25 -0
- codejury/sources/repo.py +44 -0
- codejury/tasks/__init__.py +6 -0
- codejury/tasks/base.py +55 -0
- codejury/tasks/registry.py +22 -0
- codejury-0.1.0.dist-info/METADATA +110 -0
- codejury-0.1.0.dist-info/RECORD +67 -0
- codejury-0.1.0.dist-info/WHEEL +5 -0
- codejury-0.1.0.dist-info/entry_points.txt +2 -0
- codejury-0.1.0.dist-info/licenses/LICENSE +21 -0
- codejury-0.1.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,48 @@
|
|
|
1
|
+
"""RetryProvider -- wrap any Provider, retrying complete() on transient failure.
|
|
2
|
+
|
|
3
|
+
Real model calls fail intermittently (timeouts, rate limits). This decorator
|
|
4
|
+
retries with linear backoff and re-raises the last error once attempts are
|
|
5
|
+
exhausted. ``sleep`` is injectable so tests do not actually wait.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import time
|
|
11
|
+
from typing import Callable
|
|
12
|
+
|
|
13
|
+
from codejury.providers.base import CompletionResult, Message, Provider
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class RetryProvider(Provider):
    """Provider wrapper that retries ``complete()`` when the inner provider raises.

    Any ``Exception`` raised by the wrapped provider triggers a retry. The wait
    before the next try grows linearly: ``base_delay * attempt``. When the last
    attempt also fails, its exception propagates unchanged.
    """

    def __init__(
        self,
        inner: Provider,
        *,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        sleep: Callable[[float], None] = time.sleep,
    ) -> None:
        """Wrap ``inner``.

        Args:
            inner: Provider whose ``complete()`` is retried.
            max_attempts: Total number of calls to make (first try included).
                Values below 1 are treated as 1.
            base_delay: Seconds multiplied by the attempt number between tries.
            sleep: Callable used to wait; injectable so tests do not block.
        """
        self._inner = inner
        # With max_attempts <= 0 the retry loop would never run and complete()
        # would fall through to the "unreachable" AssertionError without ever
        # calling the inner provider. Clamp so the call is made at least once.
        self._max_attempts = max(1, max_attempts)
        self._base_delay = base_delay
        self._sleep = sleep

    def complete(
        self,
        *,
        system: str,
        messages: list[Message],
        model: str,
        max_tokens: int,
        cache: bool = False,
    ) -> CompletionResult:
        """Forward the call to the inner provider, retrying on failure.

        Returns:
            The first successful result from the inner provider.

        Raises:
            Exception: Whatever the inner provider raised on the final attempt.
        """
        for attempt in range(1, self._max_attempts + 1):
            try:
                return self._inner.complete(
                    system=system, messages=messages, model=model, max_tokens=max_tokens, cache=cache
                )
            except Exception:
                # Out of attempts: surface the last error to the caller as-is.
                if attempt == self._max_attempts:
                    raise
                # Linear backoff: 1x, 2x, 3x ... base_delay.
                self._sleep(self._base_delay * attempt)
        raise AssertionError("unreachable")  # pragma: no cover
|
codejury/reporting.py
ADDED
|
@@ -0,0 +1,114 @@
|
|
|
1
|
+
"""Render audit results into machine- and human-readable reports.
|
|
2
|
+
|
|
3
|
+
Input is the per-file ``[(path, AnalysisResult)]`` the audit produces. JSON is
|
|
4
|
+
for tooling; Markdown is for a human reviewer and leads with the issues, then
|
|
5
|
+
shows what was checked and cleared (the "why it's fine" side) and what was
|
|
6
|
+
dismissed.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
import json
|
|
12
|
+
|
|
13
|
+
from codejury.domain.observation import Observation
|
|
14
|
+
from codejury.domain.result import AnalysisResult
|
|
15
|
+
|
|
16
|
+
Results = list[tuple[str, AnalysisResult]]
|
|
17
|
+
|
|
18
|
+
_SEVERITY_ORDER = {"CRITICAL": 0, "HIGH": 1, "MEDIUM": 2, "LOW": 3, "INFO": 4}
|
|
19
|
+
_CLEARED = ("SECURE", "NOT_PRESENT")
|
|
20
|
+
_PROBLEM_STATUSES = ("VULNERABLE", "PARTIAL")
|
|
21
|
+
|
|
22
|
+
|
|
23
|
+
def to_json(results: Results) -> str:
    """Serialize per-file audit results as an indented JSON document.

    Each ``(path, result)`` pair becomes an object with ``path``, ``error`` and
    ``observations`` (each observation rendered through its ``to_dict()``),
    collected under a top-level ``files`` list. Non-ASCII text is kept as-is.
    """
    files = []
    for file_path, analysis in results:
        entry = {
            "path": file_path,
            "error": analysis.error,
            "observations": [obs.to_dict() for obs in analysis.observations],
        }
        files.append(entry)
    return json.dumps({"files": files}, indent=2, ensure_ascii=False)
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
def to_markdown(results: Results) -> str:
    """Render audit results as a Markdown report.

    The report opens with a title and the summary bullets, followed by one
    ``##`` section per file. Within a file, issues come first (ordered by
    ``_rank``), then cleared verdicts, then dismissed concessions. A file with
    neither observations nor an error gets a ``_no observations_`` note.
    """
    out = ["# Security Audit Report", "", *_summary(results)]
    for path, result in results:
        out.extend(("", f"## {path}"))
        if result.error:
            out.append(f"> error: {result.error}")

        observations = result.observations
        issues = sorted(filter(_is_problem, observations), key=_rank)
        clear = [o for o in observations if o.kind == "verdict" and o.status in _CLEARED]
        conceded = [o for o in observations if o.kind == "concession"]

        if issues:
            out.extend(("", "### Issues"))
            for issue in issues:
                out.extend(_render_problem(issue))
        if clear:
            out.extend(("", "### Checked and clear"))
            out.extend(f"- {o.status} `{o.capability}`" for o in clear)
        if conceded:
            out.extend(("", "### Dismissed"))
            out.extend(f"- ~~{o.target}~~ — {o.reason}" for o in conceded)
        if not (observations or result.error):
            out.extend(("", "_no observations_"))
    return "\n".join(out)
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
def _summary(results: Results) -> list[str]:
    """Return the four summary bullet lines for the whole audit.

    Verdicts are tallied by status (problem statuses vs. cleared statuses);
    findings and concessions are counted by kind. Other kinds are ignored.
    """
    problem_verdicts = clear_verdicts = finding_count = concession_count = 0
    for _, result in results:
        for obs in result.observations:
            kind = obs.kind
            if kind == "finding":
                finding_count += 1
            elif kind == "concession":
                concession_count += 1
            elif kind == "verdict":
                if obs.status in _PROBLEM_STATUSES:
                    problem_verdicts += 1
                if obs.status in _CLEARED:
                    clear_verdicts += 1
    return [
        f"- files audited: {len(results)}",
        f"- issues: {problem_verdicts} vulnerable verdict(s), {finding_count} finding(s)",
        f"- checked and clear: {clear_verdicts}",
        f"- dismissed: {concession_count}",
    ]
|
|
81
|
+
|
|
82
|
+
|
|
83
|
+
def _is_problem(o: Observation) -> bool:
    """Tell whether an observation belongs in the "Issues" section.

    Every finding qualifies; a verdict qualifies only when its status is one of
    the problem statuses.
    """
    if o.kind == "finding":
        return True
    return o.kind == "verdict" and o.status in _PROBLEM_STATUSES
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _rank(o: Observation) -> int:
    """Sort key for issues: lower values are listed first.

    Findings rank by severity (unknown severities sort last, at 5). Among
    non-findings, a VULNERABLE status ranks -1 and everything else ranks 4.
    """
    if o.kind != "finding":
        # vulnerable verdicts float to the top
        if o.status == "VULNERABLE":
            return -1
        return 4
    return _SEVERITY_ORDER.get(o.severity, 5)
|
|
91
|
+
|
|
92
|
+
|
|
93
|
+
def _render_problem(o: Observation) -> list[str]:
    """Render one issue as Markdown bullet lines.

    A finding shows severity, optional CWE and title, with its description as
    a sub-bullet. Any other observation shows status, capability and the
    matched anti-patterns, with its reasoning as a sub-bullet. Evidence lines
    are appended in both cases.
    """
    if o.kind == "finding":
        cwe_suffix = f" ({o.cwe})" if o.cwe else ""
        headline = f"- **{o.severity}**{cwe_suffix} {o.title}"
        detail = o.description
    else:
        anti = ", ".join(o.matched_anti)
        tag = f" [{anti}]" if anti else ""
        headline = f"- **{o.status}** `{o.capability}`{tag}"
        detail = o.reasoning

    rendered = [headline]
    if detail:
        rendered.append(f" - {detail}")
    return rendered + _evidence_lines(o.evidence)
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
def _evidence_lines(evidence) -> list[str]:
|
|
109
|
+
lines = []
|
|
110
|
+
for e in evidence:
|
|
111
|
+
location = e.file + (f":{e.line}" if e.line else "")
|
|
112
|
+
code = f" `{e.code}`" if e.code else ""
|
|
113
|
+
lines.append(f" - {location}{code}")
|
|
114
|
+
return lines
|
codejury/resources.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
"""Locations of the knowledge base bundled inside the installed package.
|
|
2
|
+
|
|
3
|
+
These are the CLI defaults, resolved relative to the package so they work from
|
|
4
|
+
any working directory once installed. Override them with --capabilities etc.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from pathlib import Path
|
|
8
|
+
|
|
9
|
+
_DATA = Path(__file__).resolve().parent / "data"
|
|
10
|
+
|
|
11
|
+
CAPABILITIES_DIR = _DATA / "capabilities"
|
|
12
|
+
TASKS_DIR = _DATA / "tasks"
|
|
13
|
+
GOLDEN_DIR = _DATA / "golden"
|
codejury/sources/base.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Source ABC.
|
|
2
|
+
|
|
3
|
+
A Source turns some input (a PR diff, a file, a repo, a function) into a list of
|
|
4
|
+
CodeArtifacts an agent can analyze. Returning a list rather than one artifact
|
|
5
|
+
lets a single source fan out (e.g. one artifact per changed hunk).
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from abc import ABC, abstractmethod
|
|
11
|
+
|
|
12
|
+
from codejury.domain.artifact import CodeArtifact
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class Source(ABC):
    """Abstract input source: anything that can produce CodeArtifacts."""

    @abstractmethod
    def list_artifacts(self) -> list[CodeArtifact]:
        """Return the artifacts this source yields for analysis."""
|
|
@@ -0,0 +1,33 @@
|
|
|
1
|
+
"""Chunker -- split oversized file content so each artifact fits a context budget.
|
|
2
|
+
|
|
3
|
+
Splits on line boundaries into pieces of at most ``max_chars``. Small content is
|
|
4
|
+
returned unchanged as a single chunk keeping its path; split content gets a
|
|
5
|
+
``path#N`` suffix per chunk. A single line longer than the budget becomes its own
|
|
6
|
+
(over-budget) chunk rather than being cut mid-line.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
class Chunker:
    """Split text on line boundaries into pieces of bounded character length."""

    def __init__(self, max_chars: int = 8000) -> None:
        """Remember the per-chunk character budget."""
        self._max_chars = max_chars

    def split(self, path: str, content: str) -> list[tuple[str, str]]:
        """Return ``(chunk_path, chunk_text)`` pairs covering ``content``.

        Content within the budget comes back as one pair under the original
        ``path``. Otherwise lines are packed greedily and the chunks are named
        ``path#1``, ``path#2``, ... A line longer than the budget is never cut;
        it ends up alone in an over-budget chunk.
        """
        budget = self._max_chars
        if len(content) <= budget:
            return [(path, content)]

        pieces: list[str] = []
        pending: list[str] = []
        used = 0
        for line in content.splitlines(keepends=True):
            width = len(line)
            # Flush before adding a line that would overflow a non-empty chunk.
            if pending and used + width > budget:
                pieces.append("".join(pending))
                pending, used = [], 0
            pending.append(line)
            used += width
        if pending:
            pieces.append("".join(pending))
        return [(f"{path}#{number}", text) for number, text in enumerate(pieces, start=1)]
|
codejury/sources/diff.py
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
1
|
+
"""DiffSource -- turn a unified diff into one CodeArtifact per changed file.
|
|
2
|
+
|
|
3
|
+
Splits on ``diff --git`` headers (falling back to a single section for a plain
|
|
4
|
+
diff with no git header). The path is taken from the +++ line, then the ---
|
|
5
|
+
line, then the header -- skipping /dev/null so adds and deletes still resolve to
|
|
6
|
+
the real file.
|
|
7
|
+
"""
|
|
8
|
+
|
|
9
|
+
from __future__ import annotations
|
|
10
|
+
|
|
11
|
+
from codejury.domain.artifact import CodeArtifact
|
|
12
|
+
from codejury.sources.base import Source
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
class DiffSource(Source):
    """Source that yields one ``diff`` artifact per file section of a unified diff."""

    def __init__(self, diff_text: str) -> None:
        """Keep the raw diff text; it is parsed when artifacts are listed."""
        self._diff_text = diff_text

    def list_artifacts(self) -> list[CodeArtifact]:
        """Split the diff by file and wrap each section in a CodeArtifact."""
        artifacts = []
        for file_path, section in _split_by_file(self._diff_text):
            artifacts.append(CodeArtifact(kind="diff", path=file_path, content=section))
        return artifacts
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
def _split_by_file(diff_text: str) -> list[tuple[str, str]]:
    """Cut a diff into ``(path, section_text)`` pairs, one per file section.

    A new section starts at every ``diff --git`` line; text before the first
    such line (or a diff with no git headers at all) forms a section of its
    own. Sections for which no path can be resolved are dropped.
    """
    if not diff_text.strip():
        return []

    groups: list[list[str]] = []
    for raw in diff_text.splitlines(keepends=True):
        # The very first line always opens a section, header or not.
        if not groups or raw.startswith("diff --git "):
            groups.append([])
        groups[-1].append(raw)

    return [(name, "".join(group)) for group in groups if (name := _path_for(group))]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def _path_for(lines: list[str]) -> str:
|
|
49
|
+
plus = minus = header = None
|
|
50
|
+
for line in lines:
|
|
51
|
+
if line.startswith("+++ "):
|
|
52
|
+
plus = _clean(line[4:])
|
|
53
|
+
elif line.startswith("--- "):
|
|
54
|
+
minus = _clean(line[4:])
|
|
55
|
+
elif line.startswith("diff --git "):
|
|
56
|
+
parts = line.split()
|
|
57
|
+
if len(parts) >= 4:
|
|
58
|
+
header = _clean(parts[3])
|
|
59
|
+
for candidate in (plus, minus, header):
|
|
60
|
+
if candidate and candidate != "/dev/null":
|
|
61
|
+
return candidate
|
|
62
|
+
return ""
|
|
63
|
+
|
|
64
|
+
|
|
65
|
+
def _clean(path: str) -> str:
|
|
66
|
+
path = path.split("\t")[0].strip() # drop trailing timestamp from plain `diff -u`
|
|
67
|
+
if path.startswith(("a/", "b/")):
|
|
68
|
+
path = path[2:]
|
|
69
|
+
return path
|
|
@@ -0,0 +1,35 @@
|
|
|
1
|
+
"""FunctionSource -- split Python source into one CodeArtifact per function.
|
|
2
|
+
|
|
3
|
+
Parses the AST and emits an artifact for every function and method (including
|
|
4
|
+
async and nested ones), in source order. Good for deeply auditing one handler at
|
|
5
|
+
a time. The content must be valid Python; a parse failure raises SyntaxError.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
import ast
|
|
11
|
+
|
|
12
|
+
from codejury.domain.artifact import CodeArtifact
|
|
13
|
+
from codejury.sources.base import Source
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class FunctionSource(Source):
    """Source that yields one ``function`` artifact per function in Python code.

    Covers every ``def`` and ``async def`` found by walking the AST, including
    methods and nested functions, in source order. Each artifact's path is
    ``<path>::<function name>``.
    """

    def __init__(self, code: str, *, path: str = "<source>") -> None:
        """Keep the source text and the label used to build artifact paths."""
        self._code = code
        self._path = path

    def list_artifacts(self) -> list[CodeArtifact]:
        """Parse the code and return one artifact per function definition.

        Raises:
            SyntaxError: If the code is not valid Python.
        """
        # Passing filename makes a SyntaxError name the file under review
        # instead of the default "<unknown>".
        tree = ast.parse(self._code, filename=self._path)
        functions = [
            node for node in ast.walk(tree) if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
        ]
        # col_offset breaks ties between definitions that share a line.
        functions.sort(key=lambda n: (n.lineno, n.col_offset))

        # Split on "\n" only: str.splitlines() also breaks on form feeds and
        # similar characters, which would shift the line numbers used below.
        source_lines = [line + "\n" for line in self._code.split("\n")]

        artifacts = []
        for node in functions:
            segment = ast.get_source_segment(self._code, node)
            if not segment:
                continue
            content = self._decorator_text(source_lines, node) + segment
            artifacts.append(
                CodeArtifact(kind="function", path=f"{self._path}::{node.name}", content=content)
            )
        return artifacts

    @staticmethod
    def _decorator_text(source_lines: list[str], node: ast.AST) -> str:
        """Return the source lines of ``node``'s decorators, or ``""`` if none.

        ``ast.get_source_segment`` starts at the ``def`` keyword, so decorators
        would otherwise be missing from the artifact. The ``def``'s own indent
        is stripped from each line to match the segment's first line.
        """
        if not node.decorator_list:
            return ""
        first = min(decorator.lineno for decorator in node.decorator_list)
        indent = node.col_offset
        kept = []
        for line in source_lines[first - 1 : node.lineno - 1]:
            kept.append(line[indent:] if line[:indent].isspace() else line)
        return "".join(kept)
|
codejury/sources/mock.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
1
|
+
"""MockSource -- a Source that yields canned CodeArtifacts.
|
|
2
|
+
|
|
3
|
+
Used by the dry-run and tests so the pipeline has input without touching git or
|
|
4
|
+
the filesystem. Pass your own artifacts, or rely on the default illustrative diff.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from codejury.domain.artifact import CodeArtifact
|
|
10
|
+
from codejury.sources.base import Source
|
|
11
|
+
|
|
12
|
+
_DEFAULT_DIFF = """\
|
|
13
|
+
+def store_password(pwd: str) -> str:
|
|
14
|
+
+ return hashlib.sha256(pwd.encode()).hexdigest()
|
|
15
|
+
"""
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
class MockSource(Source):
    """Source that hands back a fixed list of artifacts."""

    def __init__(self, artifacts: list[CodeArtifact] | None = None) -> None:
        """Use ``artifacts`` when given, else a single default ``auth.py`` diff."""
        if artifacts is None:
            artifacts = [
                CodeArtifact(kind="diff", path="auth.py", content=_DEFAULT_DIFF),
            ]
        self._artifacts = artifacts

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return a fresh list holding the canned artifacts."""
        return [*self._artifacts]
|
codejury/sources/repo.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
1
|
+
"""RepoSource -- walk a repository into CodeArtifacts, one per file (chunked).
|
|
2
|
+
|
|
3
|
+
Selects files by extension, skips noise directories (.git, virtualenvs, caches),
|
|
4
|
+
and runs each file through a Chunker so large files fit the model's context
|
|
5
|
+
window. Artifact paths are relative to the repo root.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from pathlib import Path
|
|
11
|
+
|
|
12
|
+
from codejury.domain.artifact import CodeArtifact
|
|
13
|
+
from codejury.sources.base import Source
|
|
14
|
+
from codejury.sources.chunker import Chunker
|
|
15
|
+
|
|
16
|
+
_SKIP_DIRS = frozenset({".git", ".venv", "venv", "node_modules", "__pycache__", ".mypy_cache", ".pytest_cache"})
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class RepoSource(Source):
    """Source that walks a directory tree and yields ``repo`` artifacts."""

    def __init__(
        self,
        root: str | Path,
        *,
        extensions: tuple[str, ...] = (".py",),
        chunker: Chunker | None = None,
        skip_dirs: frozenset[str] = _SKIP_DIRS,
    ) -> None:
        """Configure the walk.

        Args:
            root: Directory to scan.
            extensions: File suffixes to include.
            chunker: Splitter applied to each file; a default Chunker if omitted.
            skip_dirs: Path components that exclude a file when present.
        """
        self._root = Path(root)
        self._extensions = extensions
        self._chunker = chunker or Chunker()
        self._skip_dirs = skip_dirs

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return artifacts for every matching file, in sorted path order.

        Files are read as UTF-8 with undecodable bytes replaced, then passed
        through the chunker; artifact paths are POSIX-style and relative to
        the root.
        """
        collected: list[CodeArtifact] = []
        for file_path in sorted(self._root.rglob("*")):
            if not (file_path.is_file() and file_path.suffix in self._extensions):
                continue
            relative = file_path.relative_to(self._root)
            if any(segment in self._skip_dirs for segment in relative.parts):
                continue
            text = file_path.read_text(encoding="utf-8", errors="replace")
            collected.extend(
                CodeArtifact(kind="repo", path=name, content=piece)
                for name, piece in self._chunker.split(relative.as_posix(), text)
            )
        return collected
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
"""codejury.tasks -- Layer 5 task presets.
|
|
2
|
+
|
|
3
|
+
A task is a named binding of capabilities + orchestrator + provider + model, so a
|
|
4
|
+
new audit is a new config file rather than new code. The input source is supplied
|
|
5
|
+
at run time because it carries the actual code under review.
|
|
6
|
+
"""
|
codejury/tasks/base.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
1
|
+
"""Task model and runner.
|
|
2
|
+
|
|
3
|
+
A Task selects which capabilities to check and under which orchestration and
|
|
4
|
+
model. ``run_task`` binds it to a runtime source and executes it.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass
|
|
10
|
+
from typing import Any
|
|
11
|
+
|
|
12
|
+
from codejury.assembly import DEFAULT_MODEL, build_orchestration, make_provider, run_over_source
|
|
13
|
+
from codejury.domain.capability import Capability
|
|
14
|
+
from codejury.domain.result import AnalysisResult
|
|
15
|
+
from codejury.sources.base import Source
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
@dataclass(frozen=True, kw_only=True)
class Task:
    """Named audit preset: which capabilities to check, and how to run them."""

    name: str
    orchestrator: str = "single"
    provider: str = "anthropic"
    model: str = DEFAULT_MODEL
    capabilities: tuple[str, ...] | None = None  # capability ids to check; None = all
    max_tokens: int = 2048
    retries: int = 0  # provider retry attempts on transient failure

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Task:
        """Build a Task from a plain mapping (e.g. a parsed YAML file).

        Only ``name`` is required; every other key falls back to the field
        default. ``capabilities`` may be a sequence of ids or a single id
        given as a bare string.

        Raises:
            KeyError: If ``name`` is missing.
        """
        caps = data.get("capabilities")
        if isinstance(caps, str):
            # A scalar such as `capabilities: authentication` must stay one id;
            # tuple("authentication") would split it into single characters
            # and select() would then silently match nothing.
            caps = (caps,)
        return cls(
            name=data["name"],
            orchestrator=data.get("orchestrator", "single"),
            provider=data.get("provider", "anthropic"),
            model=data.get("model", DEFAULT_MODEL),
            capabilities=tuple(caps) if caps is not None else None,
            max_tokens=int(data.get("max_tokens", 2048)),
            retries=int(data.get("retries", 0)),
        )

    def select(self, capabilities: list[Capability]) -> list[Capability]:
        """Return the capabilities this task checks, in their given order.

        With ``capabilities`` unset, all are returned; otherwise only those
        whose ``id`` is listed. Listed ids with no match are ignored.
        """
        if self.capabilities is None:
            return list(capabilities)
        wanted = set(self.capabilities)
        return [c for c in capabilities if c.id in wanted]
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
def run_task(
    task: Task, source: Source, capabilities: list[Capability]
) -> list[tuple[str, AnalysisResult]]:
    """Run ``task`` against ``source`` and return the per-artifact results.

    Builds the provider and orchestration named by the task, narrows
    ``capabilities`` to the task's selection, and runs them over the source.
    """
    agents, orchestrator = build_orchestration(
        task.orchestrator,
        provider=make_provider(task.provider, retries=task.retries),
        model=task.model,
        max_tokens=task.max_tokens,
    )
    selected = task.select(capabilities)
    return run_over_source(source, selected, agents, orchestrator)
|
|
@@ -0,0 +1,22 @@
|
|
|
1
|
+
"""Load Task presets from YAML files."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from pathlib import Path
|
|
6
|
+
|
|
7
|
+
import yaml
|
|
8
|
+
|
|
9
|
+
from codejury.tasks.base import Task
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
def load_task(path: str | Path) -> Task:
    """Read one YAML file and build a Task from its top-level mapping.

    Raises:
        ValueError: If the document's top level is not a mapping.
    """
    with open(path, encoding="utf-8") as handle:
        loaded = yaml.safe_load(handle)
    if isinstance(loaded, dict):
        return Task.from_dict(loaded)
    raise ValueError(f"{path}: expected a YAML mapping at the top level, got {type(loaded).__name__}")
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
def load_tasks(directory: str | Path) -> dict[str, Task]:
    """Load every ``*.yaml`` task in a directory, keyed by task name.

    Files are read in sorted order; when two files declare the same name, the
    later one replaces the earlier.
    """
    tasks: dict[str, Task] = {}
    for yaml_path in sorted(Path(directory).glob("*.yaml")):
        task = load_task(yaml_path)
        tasks[task.name] = task
    return tasks
|
|
@@ -0,0 +1,110 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: codejury
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
|
|
5
|
+
Author: 4234288
|
|
6
|
+
License-Expression: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/4234288/codejury
|
|
8
|
+
Project-URL: Repository, https://github.com/4234288/codejury
|
|
9
|
+
Keywords: security,appsec,static analysis,llm,owasp,asvs,code review
|
|
10
|
+
Classifier: Development Status :: 4 - Beta
|
|
11
|
+
Classifier: Intended Audience :: Developers
|
|
12
|
+
Classifier: Topic :: Security
|
|
13
|
+
Classifier: Topic :: Software Development :: Quality Assurance
|
|
14
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
15
|
+
Classifier: Operating System :: OS Independent
|
|
16
|
+
Requires-Python: >=3.12
|
|
17
|
+
Description-Content-Type: text/markdown
|
|
18
|
+
License-File: LICENSE
|
|
19
|
+
Requires-Dist: pyyaml>=6.0
|
|
20
|
+
Provides-Extra: anthropic
|
|
21
|
+
Requires-Dist: anthropic>=0.40; extra == "anthropic"
|
|
22
|
+
Provides-Extra: openai
|
|
23
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
24
|
+
Provides-Extra: litellm
|
|
25
|
+
Requires-Dist: litellm>=1.0; extra == "litellm"
|
|
26
|
+
Provides-Extra: dev
|
|
27
|
+
Requires-Dist: pytest>=8.0; extra == "dev"
|
|
28
|
+
Dynamic: license-file
|
|
29
|
+
|
|
30
|
+
# codejury
|
|
31
|
+
|
|
32
|
+
A general-purpose **Application Security AI audit framework**. Domain knowledge (11
|
|
33
|
+
capabilities aligned with OWASP ASVS) lives in versioned YAML files as
|
|
34
|
+
first-class data, keeping the framework core small.
|
|
35
|
+
|
|
36
|
+
The name comes from the core orchestration metaphor: code goes before a "jury"
|
|
37
|
+
of adversarial roles -- Finder / Challenger / Judge -- that converge on a verdict.
|
|
38
|
+
|
|
39
|
+
## Five-layer architecture
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
Layer 5 Task task configuration (source + capabilities + orchestrator + agents)
|
|
43
|
+
Layer 4 Capability YAML domain knowledge (authn / authz / input_validation ...)
|
|
44
|
+
Layer 3 Orchestrator strategy (single / debate / pipeline / reflexion)
|
|
45
|
+
Source input (diff / function / repo)
|
|
46
|
+
Agent audit role (finder / challenger / judge / verifier)
|
|
47
|
+
Layer 2 Provider model backend (anthropic / openai / litellm / mock)
|
|
48
|
+
Layer 1 Infrastructure cross-cutting utilities (json parsing, ...)
|
|
49
|
+
```
|
|
50
|
+
|
|
51
|
+
Layers talk only through typed data. Each layer is an abstract base class (ABC)
|
|
52
|
+
plus implementations, so the four axes (task / orchestration / model / input)
|
|
53
|
+
compose independently.
|
|
54
|
+
|
|
55
|
+
## Design notes
|
|
56
|
+
|
|
57
|
+
- **Domain knowledge is data, not prompts**: a capability YAML is readable by
|
|
58
|
+
the LLM, by a rule engine, and by a human, and is versioned alongside code.
|
|
59
|
+
- **Explains both "why it's wrong" and "why it's fine"**: every capability
|
|
60
|
+
yields a `Verdict`, recording safe matches too -- a checkup dimension rather
|
|
61
|
+
than an anomaly filter.
|
|
62
|
+
|
|
63
|
+
## Status
|
|
64
|
+
|
|
65
|
+
Usable end to end across all five layers:
|
|
66
|
+
|
|
67
|
+
- **Orchestrators**: single, pipeline, debate, reflexion
|
|
68
|
+
- **Sources**: diff, function, repo (with chunking)
|
|
69
|
+
- **Providers**: anthropic, openai, litellm, mock (plus an opt-in retry wrapper)
|
|
70
|
+
- **Capabilities**: all 11 OWASP ASVS areas
|
|
71
|
+
- **Tasks**: named presets in `tasks/` (e.g. `audit_diff_debate`)
|
|
72
|
+
- **Reporting**: text, markdown, json
|
|
73
|
+
- **Evaluation**: a golden-case precision/recall harness
|
|
74
|
+
|
|
75
|
+
The golden set ships with seed cases; real precision/recall numbers need a model
|
|
76
|
+
(`codejury eval` with a provider key).
|
|
77
|
+
|
|
78
|
+
## Install
|
|
79
|
+
|
|
80
|
+
```bash
|
|
81
|
+
pip install codejury # core + CLI
|
|
82
|
+
pip install 'codejury[anthropic]' # add the provider you'll use (anthropic / openai / litellm)
|
|
83
|
+
```
|
|
84
|
+
|
|
85
|
+
## Usage
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
# Audit a unified diff against the capability library
|
|
89
|
+
git diff | codejury audit --orchestrator debate --provider anthropic --format markdown -
|
|
90
|
+
|
|
91
|
+
# Run a named task preset (tasks/*.yaml)
|
|
92
|
+
git diff | codejury run audit_diff_debate -
|
|
93
|
+
|
|
94
|
+
# Score detection quality against the golden cases (needs a provider key)
|
|
95
|
+
codejury eval --provider anthropic
|
|
96
|
+
|
|
97
|
+
# No API key needed: prove the pipeline composes with mock layers
|
|
98
|
+
codejury dry-run
|
|
99
|
+
```
|
|
100
|
+
|
|
101
|
+
`audit` and `run` read a diff from a file argument or stdin (`-`). Real providers
|
|
102
|
+
read their key from the environment (e.g. `ANTHROPIC_API_KEY`).
|
|
103
|
+
|
|
104
|
+
## Development
|
|
105
|
+
|
|
106
|
+
```bash
|
|
107
|
+
python -m venv .venv && source .venv/bin/activate
|
|
108
|
+
pip install -e ".[dev]"
|
|
109
|
+
pytest
|
|
110
|
+
```
|