codejury 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (67) hide show
  1. codejury/__init__.py +8 -0
  2. codejury/agents/__init__.py +6 -0
  3. codejury/agents/base.py +21 -0
  4. codejury/agents/debate.py +188 -0
  5. codejury/agents/mock.py +38 -0
  6. codejury/agents/parsing.py +42 -0
  7. codejury/agents/verifier.py +106 -0
  8. codejury/assembly.py +76 -0
  9. codejury/cli.py +196 -0
  10. codejury/data/capabilities/authentication.yaml +67 -0
  11. codejury/data/capabilities/authorization.yaml +55 -0
  12. codejury/data/capabilities/business_logic.yaml +58 -0
  13. codejury/data/capabilities/crypto.yaml +78 -0
  14. codejury/data/capabilities/data_protection.yaml +57 -0
  15. codejury/data/capabilities/dependency_config.yaml +52 -0
  16. codejury/data/capabilities/error_logging.yaml +49 -0
  17. codejury/data/capabilities/input_validation.yaml +92 -0
  18. codejury/data/capabilities/output_encoding.yaml +56 -0
  19. codejury/data/capabilities/secrets.yaml +51 -0
  20. codejury/data/capabilities/session.yaml +60 -0
  21. codejury/data/golden/authn_bcrypt_password.yaml +5 -0
  22. codejury/data/golden/authn_sha256_password.yaml +5 -0
  23. codejury/data/golden/sqli_fstring_query.yaml +5 -0
  24. codejury/data/golden/sqli_parameterized_query.yaml +5 -0
  25. codejury/data/tasks/audit_diff_debate.yaml +4 -0
  26. codejury/data/tasks/quick_scan_single.yaml +4 -0
  27. codejury/domain/__init__.py +5 -0
  28. codejury/domain/artifact.py +20 -0
  29. codejury/domain/capability.py +123 -0
  30. codejury/domain/context.py +26 -0
  31. codejury/domain/observation.py +104 -0
  32. codejury/domain/result.py +19 -0
  33. codejury/evaluation.py +107 -0
  34. codejury/infrastructure/__init__.py +4 -0
  35. codejury/infrastructure/json_parse.py +57 -0
  36. codejury/orchestrators/__init__.py +6 -0
  37. codejury/orchestrators/base.py +19 -0
  38. codejury/orchestrators/debate.py +57 -0
  39. codejury/orchestrators/pipeline.py +32 -0
  40. codejury/orchestrators/reflexion.py +58 -0
  41. codejury/orchestrators/single.py +24 -0
  42. codejury/providers/__init__.py +5 -0
  43. codejury/providers/anthropic.py +68 -0
  44. codejury/providers/base.py +42 -0
  45. codejury/providers/litellm.py +68 -0
  46. codejury/providers/mock.py +32 -0
  47. codejury/providers/openai.py +57 -0
  48. codejury/providers/openai_format.py +30 -0
  49. codejury/providers/retry.py +48 -0
  50. codejury/reporting.py +114 -0
  51. codejury/resources.py +13 -0
  52. codejury/sources/__init__.py +6 -0
  53. codejury/sources/base.py +17 -0
  54. codejury/sources/chunker.py +33 -0
  55. codejury/sources/diff.py +69 -0
  56. codejury/sources/function.py +35 -0
  57. codejury/sources/mock.py +25 -0
  58. codejury/sources/repo.py +44 -0
  59. codejury/tasks/__init__.py +6 -0
  60. codejury/tasks/base.py +55 -0
  61. codejury/tasks/registry.py +22 -0
  62. codejury-0.1.0.dist-info/METADATA +110 -0
  63. codejury-0.1.0.dist-info/RECORD +67 -0
  64. codejury-0.1.0.dist-info/WHEEL +5 -0
  65. codejury-0.1.0.dist-info/entry_points.txt +2 -0
  66. codejury-0.1.0.dist-info/licenses/LICENSE +21 -0
  67. codejury-0.1.0.dist-info/top_level.txt +1 -0
@@ -0,0 +1,48 @@
1
+ """RetryProvider -- wrap any Provider, retrying complete() on transient failure.
2
+
3
+ Real model calls fail intermittently (timeouts, rate limits). This decorator
4
+ retries with linear backoff and re-raises the last error once attempts are
5
+ exhausted. ``sleep`` is injectable so tests do not actually wait.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import time
11
+ from typing import Callable
12
+
13
+ from codejury.providers.base import CompletionResult, Message, Provider
14
+
15
+
16
class RetryProvider(Provider):
    """Provider wrapper that retries ``complete()`` when the inner provider raises.

    Args:
        inner: the provider whose ``complete()`` call is retried.
        max_attempts: total number of calls made before giving up. Values
            below 1 are treated as 1, so the inner provider is always called
            at least once.
        base_delay: linear backoff unit; after failed attempt ``k`` the
            wrapper waits ``base_delay * k`` before trying again.
        sleep: callable invoked with each delay. Injectable so tests do not
            actually wait; defaults to ``time.sleep``.
    """

    def __init__(
        self,
        inner: Provider,
        *,
        max_attempts: int = 3,
        base_delay: float = 1.0,
        sleep: Callable[[float], None] = time.sleep,
    ) -> None:
        self._inner = inner
        # With max_attempts < 1 the retry loop would never run and complete()
        # would fall through to the "unreachable" assertion without calling
        # the provider at all; clamp so one attempt is always made.
        self._max_attempts = max(1, max_attempts)
        self._base_delay = base_delay
        self._sleep = sleep

    def complete(
        self,
        *,
        system: str,
        messages: list[Message],
        model: str,
        max_tokens: int,
        cache: bool = False,
    ) -> CompletionResult:
        """Forward the call to the inner provider, retrying on any ``Exception``.

        Returns the first successful result. Once the final attempt fails, its
        exception is re-raised unchanged.
        """
        for attempt in range(1, self._max_attempts + 1):
            try:
                return self._inner.complete(
                    system=system, messages=messages, model=model, max_tokens=max_tokens, cache=cache
                )
            except Exception:
                if attempt == self._max_attempts:
                    raise
                # Linear backoff: wait longer after each consecutive failure.
                self._sleep(self._base_delay * attempt)
        raise AssertionError("unreachable")  # pragma: no cover
codejury/reporting.py ADDED
@@ -0,0 +1,114 @@
1
+ """Render audit results into machine- and human-readable reports.
2
+
3
+ Input is the per-file ``[(path, AnalysisResult)]`` the audit produces. JSON is
4
+ for tooling; Markdown is for a human reviewer and leads with the issues, then
5
+ shows what was checked and cleared (the "why it's fine" side) and what was
6
+ dismissed.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ import json
12
+
13
+ from codejury.domain.observation import Observation
14
+ from codejury.domain.result import AnalysisResult
15
+
16
# Per-file audit output: (path, result) pairs in the order they are rendered.
Results = list[tuple[str, AnalysisResult]]

# Sort key for findings; a lower number is listed earlier.
_SEVERITY_ORDER = dict(CRITICAL=0, HIGH=1, MEDIUM=2, LOW=3, INFO=4)
# Verdict statuses listed under "Checked and clear".
_CLEARED = ("SECURE", "NOT_PRESENT")
# Verdict statuses reported as issues.
_PROBLEM_STATUSES = ("VULNERABLE", "PARTIAL")
21
+
22
+
23
def to_json(results: Results) -> str:
    """Serialize per-file results as an indented JSON document.

    The top-level object holds a single ``files`` list with one entry per
    ``(path, result)`` pair: the path, the result's ``error`` and the
    ``to_dict()`` form of every observation. Non-ASCII text is emitted as-is.
    """
    files = []
    for path, result in results:
        entry = {"path": path, "error": result.error}
        entry["observations"] = [obs.to_dict() for obs in result.observations]
        files.append(entry)
    return json.dumps({"files": files}, indent=2, ensure_ascii=False)
35
+
36
+
37
def to_markdown(results: Results) -> str:
    """Render per-file results as a Markdown report for a human reviewer.

    The report opens with a title and summary bullets, then one ``##`` section
    per file: an error line if the result has one, then "Issues" (sorted by
    ``_rank``), "Checked and clear" and "Dismissed" subsections, each emitted
    only when non-empty. A file with neither observations nor an error gets a
    ``_no observations_`` marker.
    """
    report = ["# Security Audit Report", ""]
    report.extend(_summary(results))
    for path, result in results:
        report.extend(["", f"## {path}"])
        if result.error:
            report.append(f"> error: {result.error}")

        observations = result.observations
        issues = [o for o in observations if _is_problem(o)]
        issues.sort(key=_rank)
        clear = [o for o in observations if o.kind == "verdict" and o.status in _CLEARED]
        conceded = [o for o in observations if o.kind == "concession"]

        if issues:
            report.extend(["", "### Issues"])
            for issue in issues:
                report.extend(_render_problem(issue))
        if clear:
            report.extend(["", "### Checked and clear"])
            report.extend(f"- {o.status} `{o.capability}`" for o in clear)
        if conceded:
            report.extend(["", "### Dismissed"])
            report.extend(f"- ~~{o.target}~~ — {o.reason}" for o in conceded)
        if not (observations or result.error):
            report.extend(["", "_no observations_"])
    return "\n".join(report)
62
+
63
+
64
def _summary(results: Results) -> list[str]:
    """Build the summary bullets: file count plus observation tallies.

    Verdicts are counted as issues or as cleared according to their status;
    findings and concessions are counted by kind. Other kinds are ignored.
    """
    tally = {"problem": 0, "clear": 0, "finding": 0, "concession": 0}
    for _, result in results:
        for o in result.observations:
            if o.kind == "verdict":
                if o.status in _PROBLEM_STATUSES:
                    tally["problem"] += 1
                if o.status in _CLEARED:
                    tally["clear"] += 1
            elif o.kind in ("finding", "concession"):
                tally[o.kind] += 1
    return [
        f"- files audited: {len(results)}",
        f"- issues: {tally['problem']} vulnerable verdict(s), {tally['finding']} finding(s)",
        f"- checked and clear: {tally['clear']}",
        f"- dismissed: {tally['concession']}",
    ]
81
+
82
+
83
def _is_problem(o: Observation) -> bool:
    """Tell whether an observation belongs in the "Issues" section.

    Every finding qualifies; a verdict qualifies only when its status is one
    of ``_PROBLEM_STATUSES``.
    """
    if o.kind == "finding":
        return True
    return o.kind == "verdict" and o.status in _PROBLEM_STATUSES
85
+
86
+
87
def _rank(o: Observation) -> int:
    """Sort key for the "Issues" section; smaller values are listed first."""
    if o.kind != "finding":
        # Vulnerable verdicts float to the top; other verdicts rank with INFO.
        if o.status == "VULNERABLE":
            return -1
        return 4
    # Findings rank by severity; unknown severities sort after INFO.
    return _SEVERITY_ORDER.get(o.severity, 5)
91
+
92
+
93
def _render_problem(o: Observation) -> list[str]:
    """Render one issue as Markdown list lines.

    A finding shows severity, optional CWE and title, followed by its
    description. Any other observation is rendered as a verdict: status,
    capability and the matched anti-patterns (if any), followed by its
    reasoning. Evidence lines are appended in both cases.
    """
    if o.kind == "finding":
        cwe = f" ({o.cwe})" if o.cwe else ""
        headline = f"- **{o.severity}**{cwe} {o.title}"
        detail = o.description
    else:
        matched = ", ".join(o.matched_anti)
        tag = f" [{matched}]" if matched else ""
        headline = f"- **{o.status}** `{o.capability}`{tag}"
        detail = o.reasoning

    rendered = [headline]
    if detail:
        rendered.append(f" - {detail}")
    return rendered + _evidence_lines(o.evidence)
106
+
107
+
108
+ def _evidence_lines(evidence) -> list[str]:
109
+ lines = []
110
+ for e in evidence:
111
+ location = e.file + (f":{e.line}" if e.line else "")
112
+ code = f" `{e.code}`" if e.code else ""
113
+ lines.append(f" - {location}{code}")
114
+ return lines
codejury/resources.py ADDED
@@ -0,0 +1,13 @@
1
+ """Locations of the knowledge base bundled inside the installed package.
2
+
3
+ These are the CLI defaults, resolved relative to the package so they work from
4
+ any working directory once installed. Override them with --capabilities etc.
5
+ """
6
+
7
+ from pathlib import Path
8
+
9
# The bundled ``data`` directory sits next to this module inside the package.
_DATA = Path(__file__).resolve().with_name("data")

# Default knowledge-base locations, one sub-directory each.
CAPABILITIES_DIR = _DATA.joinpath("capabilities")
TASKS_DIR = _DATA.joinpath("tasks")
GOLDEN_DIR = _DATA.joinpath("golden")
@@ -0,0 +1,6 @@
1
+ """codejury.sources -- where code to audit comes from.
2
+
3
+ Agents never read files directly; they receive CodeArtifacts from a Source. This
4
+ keeps the "any input" axis (diff / file / repo / function) independent from the
5
+ rest of the framework.
6
+ """
@@ -0,0 +1,17 @@
1
+ """Source ABC.
2
+
3
+ A Source turns some input (a PR diff, a file, a repo, a function) into a list of
4
+ CodeArtifacts an agent can analyze. Returning a list rather than one artifact
5
+ lets a single source fan out (e.g. one artifact per changed hunk).
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from abc import ABC, abstractmethod
11
+
12
+ from codejury.domain.artifact import CodeArtifact
13
+
14
+
15
class Source(ABC):
    """Abstract producer of the code artifacts to audit."""

    @abstractmethod
    def list_artifacts(self) -> list[CodeArtifact]:
        """Return the artifacts this source provides, as a list."""
@@ -0,0 +1,33 @@
1
+ """Chunker -- split oversized file content so each artifact fits a context budget.
2
+
3
+ Splits on line boundaries into pieces of at most ``max_chars``. Small content is
4
+ returned unchanged as a single chunk keeping its path; split content gets a
5
+ ``path#N`` suffix per chunk. A single line longer than the budget becomes its own
6
+ (over-budget) chunk rather than being cut mid-line.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+
12
class Chunker:
    """Split text on line boundaries into chunks of at most ``max_chars`` characters."""

    def __init__(self, max_chars: int = 8000) -> None:
        self._max_chars = max_chars

    def split(self, path: str, content: str) -> list[tuple[str, str]]:
        """Return ``(chunk_path, chunk_text)`` pairs covering ``content``.

        Content within budget comes back as a single pair under the original
        ``path``. Otherwise whole lines are packed greedily and the chunks are
        named ``path#1``, ``path#2``, ... A line longer than the budget is kept
        intact as its own over-budget chunk.
        """
        limit = self._max_chars
        if len(content) <= limit:
            return [(path, content)]

        groups: list[list[str]] = [[]]
        used = 0
        for line in content.splitlines(keepends=True):
            # Start a new group only when the current one already holds text
            # and this line would push it over budget.
            if groups[-1] and used + len(line) > limit:
                groups.append([])
                used = 0
            groups[-1].append(line)
            used += len(line)

        return [
            (f"{path}#{number}", "".join(group))
            for number, group in enumerate(filter(None, groups), start=1)
        ]
@@ -0,0 +1,69 @@
1
+ """DiffSource -- turn a unified diff into one CodeArtifact per changed file.
2
+
3
+ Splits on ``diff --git`` headers (falling back to a single section for a plain
4
+ diff with no git header). The path is taken from the +++ line, then the ---
5
+ line, then the header -- skipping /dev/null so adds and deletes still resolve to
6
+ the real file.
7
+ """
8
+
9
+ from __future__ import annotations
10
+
11
+ from codejury.domain.artifact import CodeArtifact
12
+ from codejury.sources.base import Source
13
+
14
+
15
class DiffSource(Source):
    """Source backed by the text of a unified diff."""

    def __init__(self, diff_text: str) -> None:
        self._diff_text = diff_text

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return one ``diff`` artifact per file section found in the diff text."""
        artifacts = []
        for file_path, section in _split_by_file(self._diff_text):
            artifacts.append(CodeArtifact(kind="diff", path=file_path, content=section))
        return artifacts
24
+
25
+
26
def _split_by_file(diff_text: str) -> list[tuple[str, str]]:
    """Split diff text into ``(path, section_text)`` pairs.

    A new section starts at every ``diff --git`` line; text without such a
    line forms a single section. Sections whose path cannot be determined are
    dropped. Blank input yields an empty list.
    """
    if not diff_text.strip():
        return []

    sections: list[list[str]] = []
    for line in diff_text.splitlines(keepends=True):
        # Open the first section unconditionally, and a fresh one at each git header.
        if not sections or line.startswith("diff --git "):
            sections.append([])
        sections[-1].append(line)

    return [(path, "".join(section)) for section in sections if (path := _path_for(section))]
46
+
47
+
48
+ def _path_for(lines: list[str]) -> str:
49
+ plus = minus = header = None
50
+ for line in lines:
51
+ if line.startswith("+++ "):
52
+ plus = _clean(line[4:])
53
+ elif line.startswith("--- "):
54
+ minus = _clean(line[4:])
55
+ elif line.startswith("diff --git "):
56
+ parts = line.split()
57
+ if len(parts) >= 4:
58
+ header = _clean(parts[3])
59
+ for candidate in (plus, minus, header):
60
+ if candidate and candidate != "/dev/null":
61
+ return candidate
62
+ return ""
63
+
64
+
65
+ def _clean(path: str) -> str:
66
+ path = path.split("\t")[0].strip() # drop trailing timestamp from plain `diff -u`
67
+ if path.startswith(("a/", "b/")):
68
+ path = path[2:]
69
+ return path
@@ -0,0 +1,35 @@
1
+ """FunctionSource -- split Python source into one CodeArtifact per function.
2
+
3
+ Parses the AST and emits an artifact for every function and method (including
4
+ async and nested ones), in source order. Good for deeply auditing one handler at
5
+ a time. The content must be valid Python; a parse failure raises SyntaxError.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ import ast
11
+
12
+ from codejury.domain.artifact import CodeArtifact
13
+ from codejury.sources.base import Source
14
+
15
+
16
class FunctionSource(Source):
    """Source that yields one artifact per function found in Python source text."""

    def __init__(self, code: str, *, path: str = "<source>") -> None:
        self._code = code
        self._path = path

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return a ``function`` artifact for every sync or async ``def``.

        Functions are ordered by the line they start on, and each artifact is
        named ``<path>::<function name>``. Nodes whose source segment cannot be
        recovered are skipped. ``ast.parse`` raises ``SyntaxError`` on invalid
        code.
        """
        module = ast.parse(self._code)
        definitions = sorted(
            (
                node
                for node in ast.walk(module)
                if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef))
            ),
            key=lambda node: node.lineno,
        )

        found = []
        for definition in definitions:
            text = ast.get_source_segment(self._code, definition)
            if not text:
                continue
            label = f"{self._path}::{definition.name}"
            found.append(CodeArtifact(kind="function", path=label, content=text))
        return found
@@ -0,0 +1,25 @@
1
+ """MockSource -- a Source that yields canned CodeArtifacts.
2
+
3
+ Used by the dry-run and tests so the pipeline has input without touching git or
4
+ the filesystem. Pass your own artifacts, or rely on the default illustrative diff.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from codejury.domain.artifact import CodeArtifact
10
+ from codejury.sources.base import Source
11
+
12
+ _DEFAULT_DIFF = """\
13
+ +def store_password(pwd: str) -> str:
14
+ + return hashlib.sha256(pwd.encode()).hexdigest()
15
+ """
16
+
17
+
18
class MockSource(Source):
    """Source that returns pre-built artifacts instead of reading real input."""

    def __init__(self, artifacts: list[CodeArtifact] | None = None) -> None:
        if artifacts is None:
            # No artifacts supplied: fall back to the single illustrative diff.
            artifacts = [CodeArtifact(kind="diff", path="auth.py", content=_DEFAULT_DIFF)]
        self._artifacts = artifacts

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return a fresh list so callers cannot mutate the stored one."""
        return [*self._artifacts]
@@ -0,0 +1,44 @@
1
+ """RepoSource -- walk a repository into CodeArtifacts, one per file (chunked).
2
+
3
+ Selects files by extension, skips noise directories (.git, virtualenvs, caches),
4
+ and runs each file through a Chunker so large files fit the model's context
5
+ window. Artifact paths are relative to the repo root.
6
+ """
7
+
8
+ from __future__ import annotations
9
+
10
+ from pathlib import Path
11
+
12
+ from codejury.domain.artifact import CodeArtifact
13
+ from codejury.sources.base import Source
14
+ from codejury.sources.chunker import Chunker
15
+
16
+ _SKIP_DIRS = frozenset({".git", ".venv", "venv", "node_modules", "__pycache__", ".mypy_cache", ".pytest_cache"})
17
+
18
+
19
class RepoSource(Source):
    """Source that walks a directory tree and yields chunked file artifacts."""

    def __init__(
        self,
        root: str | Path,
        *,
        extensions: tuple[str, ...] = (".py",),
        chunker: Chunker | None = None,
        skip_dirs: frozenset[str] = _SKIP_DIRS,
    ) -> None:
        self._root = Path(root)
        self._extensions = extensions
        self._chunker = chunker if chunker else Chunker()
        self._skip_dirs = skip_dirs

    def list_artifacts(self) -> list[CodeArtifact]:
        """Return ``repo`` artifacts for every selected file under the root.

        Files are visited in sorted path order. A file is selected when its
        suffix is in ``extensions`` and no component of its root-relative path
        is in ``skip_dirs``. Content is read as UTF-8 with undecodable bytes
        replaced, then split by the chunker; artifact paths are the POSIX-style
        relative paths the chunker returns.
        """
        artifacts: list[CodeArtifact] = []
        for path in sorted(self._root.rglob("*")):
            if not (path.is_file() and path.suffix in self._extensions):
                continue
            relative = path.relative_to(self._root)
            if any(part in self._skip_dirs for part in relative.parts):
                continue
            text = path.read_text(encoding="utf-8", errors="replace")
            artifacts.extend(
                CodeArtifact(kind="repo", path=chunk_path, content=chunk_text)
                for chunk_path, chunk_text in self._chunker.split(relative.as_posix(), text)
            )
        return artifacts
@@ -0,0 +1,6 @@
1
+ """codejury.tasks -- Layer 5 task presets.
2
+
3
+ A task is a named binding of capabilities + orchestrator + provider + model, so a
4
+ new audit is a new config file rather than new code. The input source is supplied
5
+ at run time because it carries the actual code under review.
6
+ """
codejury/tasks/base.py ADDED
@@ -0,0 +1,55 @@
1
+ """Task model and runner.
2
+
3
+ A Task selects which capabilities to check and under which orchestration and
4
+ model. ``run_task`` binds it to a runtime source and executes it.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass
10
+ from typing import Any
11
+
12
+ from codejury.assembly import DEFAULT_MODEL, build_orchestration, make_provider, run_over_source
13
+ from codejury.domain.capability import Capability
14
+ from codejury.domain.result import AnalysisResult
15
+ from codejury.sources.base import Source
16
+
17
+
18
@dataclass(frozen=True, kw_only=True)
class Task:
    """A named audit preset: which capabilities to check, and how to run them."""

    name: str
    orchestrator: str = "single"
    provider: str = "anthropic"
    model: str = DEFAULT_MODEL
    capabilities: tuple[str, ...] | None = None  # capability ids to check; None = all
    max_tokens: int = 2048
    retries: int = 0  # provider retry attempts on transient failure

    @classmethod
    def from_dict(cls, data: dict[str, Any]) -> Task:
        """Build a Task from a plain mapping (e.g. parsed YAML).

        ``name`` is required (``KeyError`` if absent); every other key falls
        back to the field default. ``capabilities`` may be a sequence of ids
        or a single id given as a string.
        """
        caps = data.get("capabilities")
        if isinstance(caps, str):
            # A YAML scalar (``capabilities: authentication``) loads as a str;
            # tuple() on it would split the id into single characters and
            # select() would then silently match nothing.
            caps = (caps,)
        return cls(
            name=data["name"],
            orchestrator=data.get("orchestrator", "single"),
            provider=data.get("provider", "anthropic"),
            model=data.get("model", DEFAULT_MODEL),
            capabilities=tuple(caps) if caps is not None else None,
            max_tokens=int(data.get("max_tokens", 2048)),
            retries=int(data.get("retries", 0)),
        )

    def select(self, capabilities: list[Capability]) -> list[Capability]:
        """Return the capabilities this task checks, in their given order.

        With ``capabilities`` unset every capability is returned; otherwise
        only those whose ``id`` is listed.
        """
        if self.capabilities is None:
            return list(capabilities)
        wanted = set(self.capabilities)
        return [c for c in capabilities if c.id in wanted]
46
+
47
+
48
def run_task(
    task: Task, source: Source, capabilities: list[Capability]
) -> list[tuple[str, AnalysisResult]]:
    """Execute a task against a source and return the per-artifact results.

    Builds the provider (with the task's retry setting) and the orchestration
    named by the task, then runs the task's selected capabilities over
    ``source``.
    """
    agents, orchestrator = build_orchestration(
        task.orchestrator,
        provider=make_provider(task.provider, retries=task.retries),
        model=task.model,
        max_tokens=task.max_tokens,
    )
    selected = task.select(capabilities)
    return run_over_source(source, selected, agents, orchestrator)
@@ -0,0 +1,22 @@
1
+ """Load Task presets from YAML files."""
2
+
3
+ from __future__ import annotations
4
+
5
+ from pathlib import Path
6
+
7
+ import yaml
8
+
9
+ from codejury.tasks.base import Task
10
+
11
+
12
def load_task(path: str | Path) -> Task:
    """Load a single Task preset from a YAML file.

    Raises:
        ValueError: if the document's top level is not a mapping.
    """
    with open(path, encoding="utf-8") as handle:
        document = yaml.safe_load(handle)
    if isinstance(document, dict):
        return Task.from_dict(document)
    raise ValueError(f"{path}: expected a YAML mapping at the top level, got {type(document).__name__}")
18
+
19
+
20
def load_tasks(directory: str | Path) -> dict[str, Task]:
    """Load every ``*.yaml`` task in a directory, keyed by task name.

    Files are read in sorted order, so when two presets share a name the one
    from the later file wins.
    """
    tasks: dict[str, Task] = {}
    for preset in sorted(Path(directory).glob("*.yaml")):
        task = load_task(preset)
        tasks[task.name] = task
    return tasks
@@ -0,0 +1,110 @@
1
+ Metadata-Version: 2.4
2
+ Name: codejury
3
+ Version: 0.1.0
4
+ Summary: General-purpose Application Security AI audit framework -- five-layer architecture, capabilities as first-class data
5
+ Author: 4234288
6
+ License-Expression: MIT
7
+ Project-URL: Homepage, https://github.com/4234288/codejury
8
+ Project-URL: Repository, https://github.com/4234288/codejury
9
+ Keywords: security,appsec,static analysis,llm,owasp,asvs,code review
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Topic :: Security
13
+ Classifier: Topic :: Software Development :: Quality Assurance
14
+ Classifier: Programming Language :: Python :: 3.12
15
+ Classifier: Operating System :: OS Independent
16
+ Requires-Python: >=3.12
17
+ Description-Content-Type: text/markdown
18
+ License-File: LICENSE
19
+ Requires-Dist: pyyaml>=6.0
20
+ Provides-Extra: anthropic
21
+ Requires-Dist: anthropic>=0.40; extra == "anthropic"
22
+ Provides-Extra: openai
23
+ Requires-Dist: openai>=1.0; extra == "openai"
24
+ Provides-Extra: litellm
25
+ Requires-Dist: litellm>=1.0; extra == "litellm"
26
+ Provides-Extra: dev
27
+ Requires-Dist: pytest>=8.0; extra == "dev"
28
+ Dynamic: license-file
29
+
30
+ # codejury
31
+
32
+ A general-purpose **Application Security AI audit framework**. Domain knowledge (11
33
+ capabilities aligned with OWASP ASVS) lives in versioned YAML files as
34
+ first-class data, keeping the framework core small.
35
+
36
+ The name comes from the core orchestration metaphor: code goes before a "jury"
37
+ of adversarial roles -- Finder / Challenger / Judge -- that converge on a verdict.
38
+
39
+ ## Five-layer architecture
40
+
41
+ ```
42
+ Layer 5 Task task configuration (source + capabilities + orchestrator + agents)
43
+ Layer 4 Capability YAML domain knowledge (authn / authz / input_validation ...)
44
+ Layer 3 Orchestrator strategy (single / debate / pipeline / reflexion)
45
+ Source input (diff / function / repo)
46
+ Agent audit role (finder / challenger / judge / verifier)
47
+ Layer 2 Provider model backend (anthropic / openai / litellm / mock)
48
+ Layer 1 Infrastructure cross-cutting utilities (json parsing, ...)
49
+ ```
50
+
51
+ Layers talk only through typed data. Each layer is an abstract base class (ABC)
52
+ plus implementations, so the four axes (task / orchestration / model / input)
53
+ compose independently.
54
+
55
+ ## Design notes
56
+
57
+ - **Domain knowledge is data, not prompts**: a capability YAML is readable by
58
+ the LLM, by a rule engine, and by a human, and is versioned alongside code.
59
+ - **Explains both "why it's wrong" and "why it's fine"**: every capability
60
+ yields a `Verdict`, recording safe matches too -- a checkup dimension rather
61
+ than an anomaly filter.
62
+
63
+ ## Status
64
+
65
+ Usable end to end across all five layers:
66
+
67
+ - **Orchestrators**: single, pipeline, debate, reflexion
68
+ - **Sources**: diff, function, repo (with chunking)
69
+ - **Providers**: anthropic, openai, litellm, mock (plus an opt-in retry wrapper)
70
+ - **Capabilities**: all 11 OWASP ASVS areas
71
+ - **Tasks**: named presets in `tasks/` (e.g. `audit_diff_debate`)
72
+ - **Reporting**: text, markdown, json
73
+ - **Evaluation**: a golden-case precision/recall harness
74
+
75
+ The golden set ships with seed cases; real precision/recall numbers need a model
76
+ (`codejury eval` with a provider key).
77
+
78
+ ## Install
79
+
80
+ ```bash
81
+ pip install codejury # core + CLI
82
+ pip install 'codejury[anthropic]' # add the provider you'll use (anthropic / openai / litellm)
83
+ ```
84
+
85
+ ## Usage
86
+
87
+ ```bash
88
+ # Audit a unified diff against the capability library
89
+ git diff | codejury audit --orchestrator debate --provider anthropic --format markdown -
90
+
91
+ # Run a named task preset (tasks/*.yaml)
92
+ git diff | codejury run audit_diff_debate -
93
+
94
+ # Score detection quality against the golden cases (needs a provider key)
95
+ codejury eval --provider anthropic
96
+
97
+ # No API key needed: prove the pipeline composes with mock layers
98
+ codejury dry-run
99
+ ```
100
+
101
+ `audit` and `run` read a diff from a file argument or stdin (`-`). Real providers
102
+ read their key from the environment (e.g. `ANTHROPIC_API_KEY`).
103
+
104
+ ## Development
105
+
106
+ ```bash
107
+ python -m venv .venv && source .venv/bin/activate
108
+ pip install -e ".[dev]"
109
+ pytest
110
+ ```