wardproof 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- wardproof/__init__.py +40 -0
- wardproof/agents/__init__.py +14 -0
- wardproof/agents/base.py +90 -0
- wardproof/agents/detector.py +70 -0
- wardproof/agents/responder.py +88 -0
- wardproof/agents/verifier.py +93 -0
- wardproof/audit/__init__.py +5 -0
- wardproof/audit/ledger.py +158 -0
- wardproof/cli.py +37 -0
- wardproof/config.py +28 -0
- wardproof/guardrails/__init__.py +17 -0
- wardproof/guardrails/_normalize.py +83 -0
- wardproof/guardrails/base.py +53 -0
- wardproof/guardrails/memory_poisoning.py +117 -0
- wardproof/guardrails/prompt_injection.py +193 -0
- wardproof/guardrails/tool_misuse.py +174 -0
- wardproof/llm/__init__.py +7 -0
- wardproof/llm/base.py +12 -0
- wardproof/llm/null.py +16 -0
- wardproof/llm/ollama_client.py +41 -0
- wardproof/orchestration/__init__.py +17 -0
- wardproof/orchestration/engine.py +199 -0
- wardproof/orchestration/factory.py +75 -0
- wardproof/sandbox/__init__.py +17 -0
- wardproof/sandbox/executor.py +145 -0
- wardproof/sandbox/permissions.py +86 -0
- wardproof/schema.py +89 -0
- wardproof-0.1.0.dist-info/METADATA +282 -0
- wardproof-0.1.0.dist-info/RECORD +32 -0
- wardproof-0.1.0.dist-info/WHEEL +4 -0
- wardproof-0.1.0.dist-info/entry_points.txt +2 -0
- wardproof-0.1.0.dist-info/licenses/LICENSE +21 -0
wardproof/__init__.py
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
1
|
+
"""Wardproof, local-first, verifiable defensive AI agent swarms."""
|
|
2
|
+
|
|
3
|
+
from wardproof.agents.base import BaseDefensiveAgent
|
|
4
|
+
from wardproof.agents.detector import DetectorAgent
|
|
5
|
+
from wardproof.agents.responder import ResponderAgent
|
|
6
|
+
from wardproof.agents.verifier import VerifierAgent
|
|
7
|
+
from wardproof.audit.ledger import AuditLedger
|
|
8
|
+
from wardproof.orchestration.engine import (
|
|
9
|
+
CircuitBreaker,
|
|
10
|
+
Outcome,
|
|
11
|
+
SwarmOrchestrator,
|
|
12
|
+
Watchdog,
|
|
13
|
+
)
|
|
14
|
+
from wardproof.orchestration.factory import build_default_swarm
|
|
15
|
+
from wardproof.sandbox.executor import SandboxExecutor, ToolRegistry
|
|
16
|
+
from wardproof.sandbox.permissions import PermissionBroker, ToolGrant
|
|
17
|
+
from wardproof.schema import Decision, Event, Finding, Severity, Verdict
|
|
18
|
+
|
|
19
|
+
# Version string of this package release.
__version__ = "0.1.0"
# Public API of the top-level package: the names exported by
# ``from wardproof import *``. Every entry is imported above.
__all__ = [
    # Core data types from wardproof.schema
    "Event",
    "Decision",
    "Finding",
    "Verdict",
    "Severity",
    # Audit ledger
    "AuditLedger",
    # Agents
    "BaseDefensiveAgent",
    "DetectorAgent",
    "VerifierAgent",
    "ResponderAgent",
    # Orchestration
    "SwarmOrchestrator",
    "Watchdog",
    "CircuitBreaker",
    "Outcome",
    "build_default_swarm",
    # Sandbox
    "PermissionBroker",
    "ToolGrant",
    "SandboxExecutor",
    "ToolRegistry",
]
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
"""Defensive agents: Detector, Verifier, Responder, and the base class."""
|
|
2
|
+
|
|
3
|
+
from wardproof.agents.base import BaseDefensiveAgent
|
|
4
|
+
from wardproof.agents.detector import DetectorAgent
|
|
5
|
+
from wardproof.agents.responder import ResponderAgent, Response
|
|
6
|
+
from wardproof.agents.verifier import VerifierAgent
|
|
7
|
+
|
|
8
|
+
# Names exported by ``from wardproof.agents import *``; each one is imported
# from its defining submodule above.
__all__ = [
    "BaseDefensiveAgent",
    "DetectorAgent",
    "VerifierAgent",
    "ResponderAgent",
    "Response",
]
|
wardproof/agents/base.py
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""Base class for every defensive agent in the swarm.
|
|
2
|
+
|
|
3
|
+
Lifecycle of ``process(event)``:
|
|
4
|
+
1. run_guardrails -> deterministic Findings, each recorded to the ledger
|
|
5
|
+
2. decide -> subclass turns Findings (+ optional LLM 2nd opinion)
|
|
6
|
+
into a Decision
|
|
7
|
+
3. the Decision itself is recorded to the ledger
|
|
8
|
+
|
|
9
|
+
Key principles baked in here:
|
|
10
|
+
* Guardrails run first and always, with or without an LLM.
|
|
11
|
+
* Every step is written to an append-only audit ledger the agent does not own.
|
|
12
|
+
* The agent's own LLM is treated as UNTRUSTED, subclasses must never let it
|
|
13
|
+
downgrade a hard guardrail signal (see DetectorAgent / VerifierAgent).
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
from abc import ABC, abstractmethod
|
|
19
|
+
|
|
20
|
+
from wardproof.audit.ledger import AuditLedger
|
|
21
|
+
from wardproof.guardrails.base import Guardrail
|
|
22
|
+
from wardproof.llm.base import LLMClient
|
|
23
|
+
from wardproof.llm.null import NullLLM
|
|
24
|
+
from wardproof.schema import Decision, Event, Finding
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class BaseDefensiveAgent(ABC):
    """Common guardrails-then-decide pipeline shared by the defensive agents.

    Subclasses supply :meth:`decide`. :meth:`process` runs every applicable
    guardrail, hands the findings to ``decide`` and writes the resulting
    decision to the ledger.
    """

    role: str = "agent"

    def __init__(
        self,
        agent_id: str,
        *,
        guardrails: list[Guardrail] | None = None,
        ledger: AuditLedger | None = None,
        llm: LLMClient | None = None,
    ) -> None:
        """Store the agent's identity and collaborators.

        A falsy ``guardrails``, ``ledger`` or ``llm`` argument is replaced by
        an empty list, a new ``AuditLedger()`` or a ``NullLLM()`` respectively.
        """
        self.agent_id = agent_id
        self.guardrails = guardrails if guardrails else []
        self.ledger = ledger if ledger else AuditLedger()
        self.llm = llm if llm else NullLLM()

    def run_guardrails(self, event: Event) -> list[Finding]:
        """Inspect ``event`` with each applicable guardrail.

        Returns every finding produced, in guardrail order. Only findings
        whose ``triggered`` flag is set are written to the ledger.
        """
        # Lazy filter: applies_to() is still consulted one guardrail at a
        # time, immediately before that guardrail's inspect() call.
        applicable = (g for g in self.guardrails if g.applies_to(event))
        collected: list[Finding] = []
        for guard in applicable:
            result = guard.inspect(event)
            collected.append(result)
            if not result.triggered:
                continue
            self.ledger.append(
                actor=self.agent_id,
                action="guardrail_triggered",
                data={
                    "event_id": event.id,
                    "guardrail": result.guardrail,
                    "risk": result.risk,
                    "severity": result.severity.value,
                    "reason": result.reason,
                },
            )
        return collected

    @abstractmethod
    def decide(self, event: Event, findings: list[Finding]) -> Decision:
        """Turn the guardrail findings for ``event`` into a Decision."""
        ...

    def process(self, event: Event) -> Decision:
        """Run guardrails, decide, record the decision and return it."""
        outcome = self.decide(event, self.run_guardrails(event))
        self.ledger.append(
            actor=self.agent_id,
            action="decision",
            data={
                "event_id": event.id,
                "verdict": outcome.verdict.value,
                "risk": outcome.risk,
                "rationale": outcome.rationale,
                "role": self.role,
            },
        )
        return outcome

    # Convenience for subclasses that want an LLM second opinion safely.
    def _llm_opinion(self, system: str, user: str) -> str:
        """Return the LLM's completion, or ``""`` if it is unavailable or fails."""
        if self.llm.available:
            try:
                return self.llm.complete(system, user)
            except Exception:  # pragma: no cover - never let the LLM crash defence
                return ""
        return ""
|
|
@@ -0,0 +1,70 @@
|
|
|
1
|
+
"""Detector, first-pass triage.
|
|
2
|
+
|
|
3
|
+
Aggregates guardrail findings into a single risk score and maps it to a verdict
|
|
4
|
+
using two thresholds. The LLM (if any) may only *raise* concern, never lower it:
|
|
5
|
+
its suggestion is clamped so it cannot pull risk below the deterministic floor.
|
|
6
|
+
"""
|
|
7
|
+
|
|
8
|
+
from __future__ import annotations
|
|
9
|
+
|
|
10
|
+
from wardproof.agents.base import BaseDefensiveAgent
|
|
11
|
+
from wardproof.schema import Decision, Event, Finding, Verdict, max_severity
|
|
12
|
+
|
|
13
|
+
# System prompt handed to the optional LLM when asking for a risk estimate.
_SYSTEM = (
    "You are a security detector. "
    "Given an event, reply with a single float 0.0-1.0 "
    "estimating how likely it is malicious. "
    "Reply with ONLY the number."
)
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
class DetectorAgent(BaseDefensiveAgent):
    """First-pass triage: score the guardrail findings and pick a verdict.

    ``low`` and ``high`` are the two risk thresholds; risk at or above
    ``high`` blocks/quarantines, risk at or above ``low`` escalates, and
    anything below is allowed.
    """

    role = "detector"

    def __init__(
        self, *args: object, low: float = 0.2, high: float = 0.6, **kwargs: object
    ) -> None:
        """Forward base-class arguments and store the two risk thresholds."""
        super().__init__(*args, **kwargs)  # type: ignore[arg-type]
        self.low = low
        self.high = high

    def _deterministic_risk(self, findings: list[Finding]) -> float:
        """Return the highest ``risk`` among ``findings`` (0.0 when there are none)."""
        if findings:
            return max((item.risk for item in findings), default=0.0)
        return 0.0

    def _llm_adjust(self, event: Event, floor: float) -> float:
        """Let the LLM raise, but never lower, the deterministic ``floor``.

        The first whitespace-separated token of the LLM reply is parsed as a
        float; an empty or unparsable reply leaves ``floor`` unchanged.
        """
        reply = self._llm_opinion(_SYSTEM, event.content[:2000])
        if reply:
            try:
                estimate = float(reply.strip().split()[0])
            except (ValueError, IndexError):
                pass
            else:
                # LLM may only escalate. Never trust it to reduce risk.
                return max(floor, min(1.0, estimate))
        return floor

    def decide(self, event: Event, findings: list[Finding]) -> Decision:
        """Map the findings (plus optional LLM opinion) to a Decision."""
        floor = self._deterministic_risk(findings)
        risk = self._llm_adjust(event, floor)

        fired = [item for item in findings if item.triggered]
        worst = max_severity([item.severity for item in fired]) if fired else None

        if risk >= self.high:
            if event.kind in ("memory_write", "tool_call"):
                verdict = Verdict.QUARANTINE
            else:
                verdict = Verdict.BLOCK
        elif risk >= self.low:
            verdict = Verdict.ESCALATE
        else:
            verdict = Verdict.ALLOW

        reason_text = "; ".join(item.reason for item in fired)
        if not reason_text:
            reason_text = "no guardrail signal"

        return Decision(
            agent_id=self.agent_id,
            event_id=event.id,
            verdict=verdict,
            risk=risk,
            findings=findings,
            rationale=f"risk={risk:.2f} (floor={floor:.2f}); {reason_text}",
            metadata={"severity": worst.value if worst else "info"},
        )
|
|
@@ -0,0 +1,88 @@
|
|
|
1
|
+
"""Responder, turns a verdict into a concrete action.
|
|
2
|
+
|
|
3
|
+
Mapping:
|
|
4
|
+
ALLOW -> pass through, no action
|
|
5
|
+
SANITIZE -> return cleaned content (strip injection markers)
|
|
6
|
+
ESCALATE -> hand to a human review queue (here: record + flag)
|
|
7
|
+
QUARANTINE -> invoke a mitigation tool (e.g. quarantine_chunk, freeze_account)
|
|
8
|
+
BLOCK -> invoke a mitigation tool and refuse
|
|
9
|
+
|
|
10
|
+
The Responder is the only agent allowed to *act*, and it acts through the
|
|
11
|
+
sandbox executor so every mitigation is itself permission-checked and audited.
|
|
12
|
+
"""
|
|
13
|
+
|
|
14
|
+
from __future__ import annotations
|
|
15
|
+
|
|
16
|
+
from dataclasses import dataclass
|
|
17
|
+
from typing import Any
|
|
18
|
+
|
|
19
|
+
from wardproof.audit.ledger import AuditLedger
|
|
20
|
+
from wardproof.guardrails.prompt_injection import strip_injection
|
|
21
|
+
from wardproof.llm.base import LLMClient
|
|
22
|
+
from wardproof.llm.null import NullLLM
|
|
23
|
+
from wardproof.sandbox.executor import SandboxExecutor
|
|
24
|
+
from wardproof.schema import Decision, Event, Verdict
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
@dataclass
class Response:
    """Result of the Responder acting on a decision."""

    # Name of the action taken; ResponderAgent fills this with "allow",
    # "sanitize", "escalate" or the verdict's ``value``.
    action: str
    # Human-readable description of what was done.
    detail: str
    # Optional extra result; ResponderAgent sets it to the cleaned content
    # for SANITIZE and leaves it as None otherwise.
    payload: Any = None
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
class ResponderAgent:
    """Turns a Decision's verdict into a concrete, ledger-recorded Response."""

    role = "responder"

    def __init__(
        self,
        agent_id: str,
        *,
        ledger: AuditLedger | None = None,
        executor: SandboxExecutor | None = None,
        mitigations: dict[Verdict, str] | None = None,
        llm: LLMClient | None = None,
    ) -> None:
        """Store collaborators; falsy ``ledger``/``llm``/``mitigations`` get defaults.

        ``mitigations`` maps a verdict to the name of the sandbox tool that
        should be run for it (used for BLOCK / QUARANTINE verdicts).
        """
        self.agent_id = agent_id
        self.ledger = ledger if ledger else AuditLedger()
        self.executor = executor
        self.llm = llm if llm else NullLLM()
        self.mitigations = mitigations if mitigations else {}

    def _mitigate(self, event: Event, verdict: Verdict) -> Response:
        """Run the mitigation tool configured for ``verdict``, if any.

        Without a configured tool or without an executor, nothing is run and
        the response detail says so.
        """
        tool_name = self.mitigations.get(verdict)
        if not tool_name or self.executor is None:
            return Response(
                action=verdict.value,
                detail=f"{verdict.value} (no mitigation tool configured)",
            )
        outcome = self.executor.run(
            self.agent_id,
            tool_name,
            {"event_id": event.id, "source": event.source},
        )
        return Response(
            action=verdict.value,
            detail=f"ran mitigation '{tool_name}': {outcome}",
        )

    def respond(self, event: Event, decision: Decision) -> Response:
        """Act on ``decision`` for ``event`` and record the response in the ledger."""
        verdict = decision.verdict
        if verdict == Verdict.ALLOW:
            outcome = Response("allow", "passed through")
        elif verdict == Verdict.SANITIZE:
            outcome = Response(
                "sanitize",
                "stripped injection markers",
                payload=strip_injection(event.content),
            )
        elif verdict == Verdict.ESCALATE:
            outcome = Response("escalate", "queued for human review")
        else:
            # Every remaining verdict (QUARANTINE or BLOCK) goes to mitigation.
            outcome = self._mitigate(event, verdict)

        self.ledger.append(
            actor=self.agent_id,
            action="response",
            data={
                "event_id": event.id,
                "verdict": verdict.value,
                "action": outcome.action,
                "detail": outcome.detail,
            },
        )
        return outcome
|
|
@@ -0,0 +1,93 @@
|
|
|
1
|
+
"""Verifier, independent second opinion + detector integrity check.
|
|
2
|
+
|
|
3
|
+
The Verifier exists to catch two failure modes:
|
|
4
|
+
1. The event really is malicious and slipped past (false negative).
|
|
5
|
+
2. The Detector ITSELF has been compromised/poisoned and is rubber-stamping
|
|
6
|
+
dangerous events (insider/collusion). If the Detector said ALLOW while the
|
|
7
|
+
deterministic guardrails clearly fired high risk, that is an integrity
|
|
8
|
+
alarm, we quarantine and flag it loudly.
|
|
9
|
+
|
|
10
|
+
It re-runs its own guardrails rather than trusting the Detector's findings.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
from wardproof.agents.base import BaseDefensiveAgent
|
|
16
|
+
from wardproof.schema import (
|
|
17
|
+
Decision,
|
|
18
|
+
Event,
|
|
19
|
+
Finding,
|
|
20
|
+
Verdict,
|
|
21
|
+
max_severity,
|
|
22
|
+
stricter_verdict,
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
|
|
26
|
+
class VerifierAgent(BaseDefensiveAgent):
    """Independent second opinion that also cross-checks the Detector."""

    role = "verifier"

    def __init__(self, *args: object, high: float = 0.6, **kwargs: object) -> None:
        """Forward base-class arguments and store the high-risk threshold."""
        super().__init__(*args, **kwargs)  # type: ignore[arg-type]
        self.high = high

    def decide(self, event: Event, findings: list[Finding]) -> Decision:
        """Derive a verdict from this agent's own triggered findings only."""
        fired = [item for item in findings if item.triggered]
        risk = max((item.risk for item in fired), default=0.0)

        if risk >= self.high:
            if event.kind in ("memory_write", "tool_call"):
                verdict = Verdict.QUARANTINE
            else:
                verdict = Verdict.BLOCK
        elif risk > 0.0:
            verdict = Verdict.ESCALATE
        else:
            verdict = Verdict.ALLOW

        worst = max_severity([item.severity for item in fired]) if fired else None
        return Decision(
            self.agent_id,
            event.id,
            verdict,
            risk,
            findings,
            f"independent risk={risk:.2f}",
            metadata={"severity": worst.value if worst else "info"},
        )

    def verify(self, event: Event, detector_decision: Decision) -> Decision:
        """Cross-check the Detector's decision against an independent pass.

        The returned verdict is the stricter of the two. If the Detector
        allowed an event whose independent risk reaches ``self.high``, the
        verdict is tightened to at least QUARANTINE and an ``integrity_alarm``
        entry naming the Detector is written to the ledger.
        """
        own = self.process(event)
        combined = stricter_verdict(own.verdict, detector_decision.verdict)

        # Detector said it's fine, but the deterministic floor says otherwise.
        integrity_alarm = (
            True
            if detector_decision.verdict == Verdict.ALLOW and own.risk >= self.high
            else False
        )
        if integrity_alarm:
            combined = stricter_verdict(combined, Verdict.QUARANTINE)
            self.ledger.append(
                actor=self.agent_id,
                action="integrity_alarm",
                data={
                    "event_id": event.id,
                    "suspect_agent": detector_decision.agent_id,
                    "detector_verdict": detector_decision.verdict.value,
                    "verifier_risk": own.risk,
                    "note": "detector allowed a high-risk event; possible compromise",
                },
            )

        alarm_tag = " [INTEGRITY ALARM]" if integrity_alarm else ""
        summary = (
            f"verifier={own.verdict.value} detector={detector_decision.verdict.value} "
            f"-> {combined.value}{alarm_tag}"
        )
        return Decision(
            agent_id=self.agent_id,
            event_id=event.id,
            verdict=combined,
            risk=max(own.risk, detector_decision.risk),
            findings=own.findings,
            rationale=summary,
            metadata={"integrity_alarm": integrity_alarm},
        )
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""Tamper-evident audit ledger.
|
|
2
|
+
|
|
3
|
+
Two layers of integrity, both optional to set up but always verifiable:
|
|
4
|
+
1. Hash chain -> stdlib only (hashlib). Detects any mutation/reordering/deletion.
|
|
5
|
+
2. Ed25519 signatures -> requires `cryptography`. Proves WHO appended each entry.
|
|
6
|
+
|
|
7
|
+
Design rule: the ledger is append-only and lives OUTSIDE the agents it audits.
|
|
8
|
+
An agent can write to it but must never be able to rewrite history.
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
import dataclasses
|
|
14
|
+
import hashlib
|
|
15
|
+
import json
|
|
16
|
+
import os
|
|
17
|
+
import threading
|
|
18
|
+
import time
|
|
19
|
+
from dataclasses import dataclass
|
|
20
|
+
from pathlib import Path
|
|
21
|
+
from typing import Any
|
|
22
|
+
|
|
23
|
+
try:
|
|
24
|
+
from cryptography.exceptions import InvalidSignature
|
|
25
|
+
from cryptography.hazmat.primitives import serialization
|
|
26
|
+
from cryptography.hazmat.primitives.asymmetric.ed25519 import (
|
|
27
|
+
Ed25519PrivateKey,
|
|
28
|
+
Ed25519PublicKey,
|
|
29
|
+
)
|
|
30
|
+
|
|
31
|
+
_HAS_CRYPTO = True
|
|
32
|
+
except Exception: # pragma: no cover - optional dependency
|
|
33
|
+
_HAS_CRYPTO = False
|
|
34
|
+
|
|
35
|
+
GENESIS_HASH = "0" * 64
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
def _canonical(data: Any) -> str:
    """Serialise ``data`` as compact JSON with sorted keys.

    Values the JSON encoder cannot handle natively are converted with ``str``.
    """
    compact = (",", ":")  # no whitespace after separators
    return json.dumps(data, default=str, separators=compact, sort_keys=True)
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
@dataclasses.dataclass
class AuditEntry:
    """One record of the audit ledger."""

    index: int  # position assigned when the entry was appended
    ts: float  # time.time() value taken at append
    actor: str
    action: str
    data: dict[str, Any]
    prev_hash: str  # entry_hash of the preceding entry (or the genesis hash)
    entry_hash: str
    signature: str | None = None  # hex signature over entry_hash, if signed

    def to_dict(self) -> dict[str, Any]:
        """Return the entry's fields as a plain dict (recursively copied)."""
        as_mapping = dataclasses.asdict(self)
        return as_mapping
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
class AuditLedger:
    """Append-only audit log with a SHA-256 hash chain and optional signatures.

    Each entry's hash covers its own fields plus the previous entry's hash, so
    :meth:`verify` can detect edits to the recorded chain. When a signing key
    is configured, every entry hash is also signed with Ed25519.
    """

    def __init__(
        self,
        signing_key_path: str | os.PathLike[str] | None = None,
        jsonl_path: str | os.PathLike[str] | None = None,
    ) -> None:
        """Create an empty ledger.

        Args:
            signing_key_path: File holding the raw Ed25519 private key. It is
                loaded if it exists and generated otherwise. ``None`` disables
                signing.
            jsonl_path: If given (and non-empty), every appended entry is also
                written as one JSON line to this file.

        Raises:
            RuntimeError: if signing is requested but ``cryptography`` could
                not be imported.
        """
        self._lock = threading.Lock()
        self._entries: list[AuditEntry] = []
        self._jsonl_path = Path(jsonl_path) if jsonl_path else None
        self._priv: Any = None
        self._pub_hex: str | None = None
        if signing_key_path is not None:
            self._load_or_create_key(Path(signing_key_path))

    # ---- signing key management ----
    def _load_or_create_key(self, path: Path) -> None:
        """Load the private key at ``path``, or generate and store a new one.

        Raises:
            RuntimeError: if ``cryptography`` is not installed.
        """
        if not _HAS_CRYPTO:
            raise RuntimeError(
                "Signing requested but 'cryptography' is not installed. "
                "Install with: pip install wardproof[crypto]"
            )
        if path.exists():
            self._priv = Ed25519PrivateKey.from_private_bytes(path.read_bytes())
        else:
            self._priv = Ed25519PrivateKey.generate()
            raw = self._priv.private_bytes(
                serialization.Encoding.Raw,
                serialization.PrivateFormat.Raw,
                serialization.NoEncryption(),
            )
            path.parent.mkdir(parents=True, exist_ok=True)
            # Create the file with owner-only permissions from the start, so
            # the key is never on disk with the default (umask-derived) mode.
            flags = os.O_WRONLY | os.O_CREAT | os.O_TRUNC | getattr(os, "O_BINARY", 0)
            fd = os.open(path, flags, 0o600)
            with os.fdopen(fd, "wb") as fh:
                fh.write(raw)
            # The mode passed to os.open() only applies to a newly created
            # file; tighten it explicitly in case the file already existed.
            os.chmod(path, 0o600)
        self._pub_hex = (
            self._priv.public_key()
            .public_bytes(serialization.Encoding.Raw, serialization.PublicFormat.Raw)
            .hex()
        )

    @property
    def public_key_hex(self) -> str | None:
        """Hex-encoded raw public key, or ``None`` when signing is disabled."""
        return self._pub_hex

    # ---- core ----
    def _compute_hash(
        self,
        index: int,
        ts: float,
        actor: str,
        action: str,
        data: dict[str, Any],
        prev_hash: str,
    ) -> str:
        """Return the SHA-256 hex digest binding an entry to its predecessor."""
        payload = f"{index}|{ts:.6f}|{actor}|{action}|{_canonical(data)}|{prev_hash}"
        return hashlib.sha256(payload.encode("utf-8")).hexdigest()

    def append(self, actor: str, action: str, data: dict[str, Any] | None = None) -> AuditEntry:
        """Append a new entry and return it.

        The entry is chained to the previous one, signed when a private key is
        loaded, and written to the JSONL file when one is configured. The
        whole operation runs under the ledger's lock.
        """
        with self._lock:
            index = len(self._entries)
            prev_hash = self._entries[-1].entry_hash if self._entries else GENESIS_HASH
            ts = time.time()
            data = data or {}
            entry_hash = self._compute_hash(index, ts, actor, action, data, prev_hash)
            signature = None
            if self._priv is not None:
                signature = self._priv.sign(entry_hash.encode("utf-8")).hex()
            entry = AuditEntry(index, ts, actor, action, data, prev_hash, entry_hash, signature)
            self._entries.append(entry)
            if self._jsonl_path is not None:
                with self._jsonl_path.open("a", encoding="utf-8") as fh:
                    fh.write(_canonical(entry.to_dict()) + "\n")
            return entry

    @property
    def entries(self) -> list[AuditEntry]:
        """A new list containing the ledger's entries, in append order."""
        return list(self._entries)

    def verify(self, public_key_hex: str | None = None) -> tuple[bool, str]:
        """Recompute the whole chain and (if a key is known) check signatures.

        Args:
            public_key_hex: Hex Ed25519 public key to check signatures against.
                Falls back to this ledger's own key when omitted.

        Returns:
            ``(ok, detail)``. Verification fails closed: when a public key is
            known, every entry must carry a valid signature, and the check
            fails if signatures cannot be verified at all.
        """
        pub_hex = public_key_hex or self._pub_hex
        pub: Any = None
        if pub_hex:
            if not _HAS_CRYPTO:
                # Reporting success here would claim a signature check that
                # never happened.
                return False, "cannot check signatures: 'cryptography' is not installed"
            try:
                pub = Ed25519PublicKey.from_public_bytes(bytes.fromhex(pub_hex))
            except ValueError:
                return False, "invalid public key"
        prev = GENESIS_HASH
        for e in self._entries:
            expect = self._compute_hash(e.index, e.ts, e.actor, e.action, e.data, prev)
            if expect != e.entry_hash:
                return False, f"hash mismatch at index {e.index}"
            if e.prev_hash != prev:
                return False, f"broken chain link at index {e.index}"
            if pub is not None:
                # The hash chain is unkeyed, so anyone can recompute it after
                # an edit; only the signature proves authorship. An entry with
                # its signature stripped must therefore fail.
                if e.signature is None:
                    return False, f"missing signature at index {e.index}"
                try:
                    pub.verify(bytes.fromhex(e.signature), e.entry_hash.encode("utf-8"))
                except (InvalidSignature, ValueError):
                    # ValueError: the stored signature is not valid hex.
                    return False, f"invalid signature at index {e.index}"
            prev = e.entry_hash
        return True, f"verified {len(self._entries)} entries"

    def export_jsonl(self, path: str | os.PathLike[str]) -> None:
        """Write every entry to ``path`` as one JSON object per line (overwrites)."""
        with Path(path).open("w", encoding="utf-8") as fh:
            for e in self._entries:
                fh.write(_canonical(e.to_dict()) + "\n")
|
wardproof/cli.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""Minimal CLI. Primary job: independently verify an exported audit ledger."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import argparse
|
|
6
|
+
import json
|
|
7
|
+
import sys
|
|
8
|
+
from pathlib import Path
|
|
9
|
+
|
|
10
|
+
from wardproof.audit.ledger import AuditEntry, AuditLedger
|
|
11
|
+
|
|
12
|
+
|
|
13
|
+
def _verify_file(path: str, pub: str | None) -> int:
    """Verify the JSONL ledger at ``path`` and print the outcome.

    Args:
        path: File with one JSON-encoded audit entry per line; blank lines
            are skipped.
        pub: Optional hex Ed25519 public key passed on to the signature check.

    Returns:
        0 when the ledger verifies, 1 otherwise. An unreadable file or a
        malformed entry is reported as ``FAIL`` rather than raised, so the
        exit status stays meaningful for arbitrary input files.
    """
    try:
        text = Path(path).read_text(encoding="utf-8")
    except (OSError, UnicodeDecodeError) as exc:
        print(f"FAIL: cannot read ledger: {exc}")
        return 1

    ledger = AuditLedger()
    for lineno, line in enumerate(text.splitlines(), start=1):
        if not line.strip():
            continue
        try:
            # TypeError covers a line that is valid JSON but not an object
            # with exactly the AuditEntry fields.
            ledger._entries.append(AuditEntry(**json.loads(line)))  # noqa: SLF001
        except (json.JSONDecodeError, TypeError) as exc:
            print(f"FAIL: malformed entry on line {lineno}: {exc}")
            return 1

    try:
        ok, detail = ledger.verify(public_key_hex=pub)
    except (TypeError, ValueError) as exc:
        # Wrongly typed field values (e.g. a non-numeric timestamp) or bad hex
        # surface here while the chain is being recomputed.
        ok, detail = False, f"malformed ledger data: {exc}"
    print(("OK: " if ok else "FAIL: ") + detail)
    return 0 if ok else 1
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and run the chosen subcommand.

    Returns the subcommand's exit status; ``verify-ledger`` is the only
    subcommand defined.
    """
    cli = argparse.ArgumentParser(prog="wardproof")
    commands = cli.add_subparsers(dest="cmd", required=True)

    verify_cmd = commands.add_parser(
        "verify-ledger", help="verify integrity of a JSONL audit ledger"
    )
    verify_cmd.add_argument("path")
    verify_cmd.add_argument(
        "--pubkey", default=None, help="hex Ed25519 public key for signature check"
    )

    options = cli.parse_args(argv)
    if options.cmd != "verify-ledger":
        return 1
    return _verify_file(options.path, options.pubkey)
|
|
34
|
+
|
|
35
|
+
|
|
36
|
+
# When run as a script, use main()'s return value as the process exit status.
if __name__ == "__main__":
    sys.exit(main())
|
wardproof/config.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"""Typed configuration. Fork-friendly: override defaults in one place."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import os
|
|
6
|
+
from dataclasses import dataclass
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
@dataclass
class WardproofConfig:
    """Wardproof settings with defaults that environment variables can override."""

    model: str = "llama3.1"  # presumably the Ollama model name; confirm against the LLM client
    ollama_url: str = "http://localhost:11434"
    # NOTE(review): these defaults match DetectorAgent's ``low``/``high``
    # defaults; the wiring itself is outside this file.
    detector_low: float = 0.2
    detector_high: float = 0.6
    signing_key_path: str | None = None
    ledger_path: str | None = None

    @staticmethod
    def _env_text(name: str) -> str | None:
        """Return the value of ``name``, or ``None`` if it is unset or blank."""
        value = os.getenv(name)
        if value is None or not value.strip():
            return None
        return value

    @classmethod
    def _env_float(cls, name: str, default: float) -> float:
        """Return ``name`` parsed as a float, or ``default`` if unset or blank.

        Raises:
            ValueError: if the variable is set to something that is not a number.
        """
        raw = cls._env_text(name)
        if raw is None:
            return default
        try:
            return float(raw)
        except ValueError as exc:
            # Name the offending variable; float()'s own message does not.
            raise ValueError(f"{name} must be a number, got {raw!r}") from exc

    @classmethod
    def from_env(cls) -> WardproofConfig:
        """Build a config from ``WARDPROOF_*`` environment variables.

        Unset variables keep the dataclass defaults. A variable set to an
        empty or whitespace-only string is treated as unset, so a blank
        ``WARDPROOF_SIGNING_KEY`` does not turn into the path ``""``.

        Raises:
            ValueError: if ``WARDPROOF_DETECTOR_LOW`` or
                ``WARDPROOF_DETECTOR_HIGH`` is not a valid number.
        """
        base = cls()
        return cls(
            model=cls._env_text("WARDPROOF_MODEL") or base.model,
            ollama_url=cls._env_text("WARDPROOF_OLLAMA_URL") or base.ollama_url,
            detector_low=cls._env_float("WARDPROOF_DETECTOR_LOW", base.detector_low),
            detector_high=cls._env_float("WARDPROOF_DETECTOR_HIGH", base.detector_high),
            signing_key_path=cls._env_text("WARDPROOF_SIGNING_KEY"),
            ledger_path=cls._env_text("WARDPROOF_LEDGER_PATH"),
        )
|
|
@@ -0,0 +1,17 @@
|
|
|
1
|
+
"""Deterministic guardrails, the first line of defence (no LLM required)."""
|
|
2
|
+
|
|
3
|
+
from wardproof.guardrails.base import Guardrail
|
|
4
|
+
from wardproof.guardrails.memory_poisoning import MemoryPoisoningGuardrail
|
|
5
|
+
from wardproof.guardrails.prompt_injection import (
|
|
6
|
+
PromptInjectionGuardrail,
|
|
7
|
+
strip_injection,
|
|
8
|
+
)
|
|
9
|
+
from wardproof.guardrails.tool_misuse import ToolMisuseGuardrail
|
|
10
|
+
|
|
11
|
+
# Names exported by ``from wardproof.guardrails import *``; each one is
# imported from its defining submodule above.
__all__ = [
    "Guardrail",
    "PromptInjectionGuardrail",
    "ToolMisuseGuardrail",
    "MemoryPoisoningGuardrail",
    "strip_injection",
]
|