consent-engine 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- consent_engine/__init__.py +11 -0
- consent_engine/api.py +83 -0
- consent_engine/cli.py +133 -0
- consent_engine/config.py +37 -0
- consent_engine/llm/__init__.py +0 -0
- consent_engine/llm/client.py +50 -0
- consent_engine/mcp_server.py +185 -0
- consent_engine/models/__init__.py +0 -0
- consent_engine/models/audit_request.py +23 -0
- consent_engine/models/audit_result.py +152 -0
- consent_engine/models/scan_result.py +57 -0
- consent_engine/models/vendor.py +28 -0
- consent_engine/tools/__init__.py +0 -0
- consent_engine/tools/cmp_clicker.py +901 -0
- consent_engine/tools/cmp_detector.py +413 -0
- consent_engine/tools/cmp_injector.py +420 -0
- consent_engine/tools/jurisdiction_detector.py +306 -0
- consent_engine/tools/tool_01_gtm_parser.py +160 -0
- consent_engine/tools/tool_02_violation_classifier.py +146 -0
- consent_engine/tools/tool_03_browser_scanner.py +1945 -0
- consent_engine/tools/tool_04_har_analyzer.py +99 -0
- consent_engine/tools/tool_05_vendor_library.py +262 -0
- consent_engine/tools/tool_06_ssgtm_detector.py +216 -0
- consent_engine/tools/tool_06b_pixel_detector.py +192 -0
- consent_engine/tools/tool_07_rag_retriever.py +291 -0
- consent_engine/tools/tool_08_report_generator.py +1767 -0
- consent_engine-0.1.0.dist-info/METADATA +226 -0
- consent_engine-0.1.0.dist-info/RECORD +31 -0
- consent_engine-0.1.0.dist-info/WHEEL +4 -0
- consent_engine-0.1.0.dist-info/entry_points.txt +3 -0
- consent_engine-0.1.0.dist-info/licenses/LICENSE +21 -0
|
@@ -0,0 +1,11 @@
|
|
|
1
|
+
"""consent-engine — forensic consent compliance audit engine.
|
|
2
|
+
|
|
3
|
+
Public package surface:
|
|
4
|
+
- consent_engine.cli CLI entrypoint (`consent-engine audit ...`)
|
|
5
|
+
- consent_engine.mcp_server MCP server entrypoint (`consent-engine-mcp`)
|
|
6
|
+
- consent_engine.tools.* Eight deterministic audit tools
|
|
7
|
+
- consent_engine.models.* Pydantic models (AuditResult, ScanResult, ...)
|
|
8
|
+
- consent_engine.llm.client LiteLLM-wrapped chat surface (agentic layer)
|
|
9
|
+
"""
|
|
10
|
+
|
|
11
|
+
__version__ = "0.1.0"
|
consent_engine/api.py
ADDED
|
@@ -0,0 +1,83 @@
|
|
|
1
|
+
"""Stripped FastAPI surface for the public consent-engine.
|
|
2
|
+
|
|
3
|
+
Single endpoint: POST /audit
|
|
4
|
+
- Accepts { "url": "https://example.com" }
|
|
5
|
+
- Returns the audit_result.json contents inline + a link to the report
|
|
6
|
+
bundle on disk (relative path).
|
|
7
|
+
|
|
8
|
+
For the full async / job-queue flow the private business app uses, fork this
|
|
9
|
+
file. This public version is deliberately small and synchronous so it's easy
|
|
10
|
+
to read.
|
|
11
|
+
"""
|
|
12
|
+
|
|
13
|
+
from __future__ import annotations
|
|
14
|
+
|
|
15
|
+
import json
|
|
16
|
+
from pathlib import Path
|
|
17
|
+
|
|
18
|
+
from fastapi import FastAPI, HTTPException
|
|
19
|
+
from pydantic import BaseModel, HttpUrl
|
|
20
|
+
|
|
21
|
+
from consent_engine import __version__
|
|
22
|
+
|
|
23
|
+
app = FastAPI(
|
|
24
|
+
title="consent-engine",
|
|
25
|
+
version=__version__,
|
|
26
|
+
description="Forensic consent compliance audit engine.",
|
|
27
|
+
)
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
class AuditRequest(BaseModel):
    """Request body accepted by ``POST /audit``."""

    # Typed as HttpUrl so pydantic validates the value before the handler runs.
    url: HttpUrl
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
@app.get("/healthz")
def healthz() -> dict[str, str]:
    """Return a static liveness payload that includes the package version."""
    payload: dict[str, str] = {"status": "ok"}
    payload["version"] = __version__
    return payload
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
@app.post("/audit")
async def audit(req: AuditRequest) -> dict:
    """Run a full audit and return the structured result inline.

    The scan, classification and report generation all run inside the
    request. The bundle (``evidence.jsonl``, ``report.html``,
    ``deck.marp.md``, ``audit_result.json``) is written to
    ``./out/<audit_id>/`` relative to the server's working directory.

    For long-running jobs swap this for an async job-queue (BackgroundTasks
    or a real queue like Celery/Arq).

    Returns:
        A dict with ``audit_id``, the ``bundle`` directory as a string and
        the JSON-mode dump of the audit result under ``result``.

    Raises:
        HTTPException: 502 when the page scan raises.
    """
    # Imported inside the handler so importing this module does not pull in
    # the scanner stack.
    from consent_engine.tools.tool_02_violation_classifier import classify
    from consent_engine.tools.tool_03_browser_scanner import scan_page
    from consent_engine.tools.tool_08_report_generator import generate_report

    try:
        scan = await scan_page(url=str(req.url))
    except Exception as e:  # noqa: BLE001
        raise HTTPException(status_code=502, detail=f"scan failed: {e}") from e

    audit_result = classify(scan)

    out_dir = Path("./out") / audit_result.audit_id
    out_dir.mkdir(parents=True, exist_ok=True)

    # Every file is written as UTF-8 explicitly: without `encoding=` Python
    # uses the locale's preferred encoding, which can fail on (or mangle)
    # non-ASCII report text on some platforms.
    with (out_dir / "evidence.jsonl").open("w", encoding="utf-8") as f:
        for r in scan.network_requests:
            f.write(json.dumps(r.model_dump(mode="json"), default=str) + "\n")

    report_html, deck_md = generate_report(audit_result)
    (out_dir / "report.html").write_text(report_html, encoding="utf-8")
    (out_dir / "deck.marp.md").write_text(deck_md, encoding="utf-8")

    # Dump the model once and reuse it for both the file and the response.
    result_payload = audit_result.model_dump(mode="json")
    (out_dir / "audit_result.json").write_text(
        json.dumps(result_payload, indent=2, default=str),
        encoding="utf-8",
    )

    return {
        "audit_id": audit_result.audit_id,
        "bundle": str(out_dir),
        "result": result_payload,
    }
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def cli(host: str = "0.0.0.0", port: int = 8080) -> None:  # noqa: S104
    """`uvicorn` entrypoint for the FastAPI surface.

    Args:
        host: Interface to bind. The default (all interfaces) is kept for
            backward compatibility.
        port: TCP port to listen on.

    NOTE(review): ``POST /audit`` has no authentication in this module and
    scans whatever URL the caller supplies, so binding to all interfaces
    exposes that capability to the network. Pass ``host="127.0.0.1"`` for
    local-only use.
    """
    import uvicorn

    uvicorn.run("consent_engine.api:app", host=host, port=port, reload=False)
|
|
80
|
+
|
|
81
|
+
|
|
82
|
+
if __name__ == "__main__": # pragma: no cover
|
|
83
|
+
cli()
|
consent_engine/cli.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
1
|
+
"""consent-engine CLI.
|
|
2
|
+
|
|
3
|
+
Usage:
|
|
4
|
+
consent-engine audit <url> [--output-dir DIR] [--gtm-json PATH] [--har PATH]
|
|
5
|
+
consent-engine chat <audit_id>
|
|
6
|
+
consent-engine version
|
|
7
|
+
|
|
8
|
+
The `audit` command writes a full audit bundle (report.html, audit_result.json,
|
|
9
|
+
evidence.jsonl, deck.marp.md) to ./out/<audit_id>/.
|
|
10
|
+
|
|
11
|
+
The `chat` command opens a per-audit Claude conversation grounded in the
|
|
12
|
+
captured evidence + audit result + wiki context cited by the audit. This closes
|
|
13
|
+
the loop on Fred Pike's glass-box principle.
|
|
14
|
+
"""
|
|
15
|
+
|
|
16
|
+
from __future__ import annotations
|
|
17
|
+
|
|
18
|
+
import argparse
|
|
19
|
+
import asyncio
|
|
20
|
+
import json
|
|
21
|
+
import sys
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
from consent_engine import __version__
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
def _audit_command(args: argparse.Namespace) -> int:
    """Run an audit against a URL. Writes the bundle to <output-dir>/<audit_id>/.

    Steps: scan the page, classify the scan, dump every captured network
    request to ``evidence.jsonl``, then render the HTML report and Marp deck.

    Args:
        args: Parsed CLI namespace. Reads ``url`` and ``output_dir``;
            ``gtm_json`` and ``har`` are only inspected to warn about them.

    Returns:
        ``0`` on success. Exceptions from the scanner, classifier or report
        generator are not caught here.
    """
    # Lazy imports so `--help` doesn't trigger Playwright load.
    from consent_engine.tools.tool_02_violation_classifier import classify
    from consent_engine.tools.tool_03_browser_scanner import scan_page
    from consent_engine.tools.tool_08_report_generator import generate_report

    url = args.url
    out_dir = Path(args.output_dir or "./out")
    out_dir.mkdir(parents=True, exist_ok=True)

    # These flags are parsed by `main` but nothing below consumes them; say so
    # instead of silently dropping user input.
    for flag, attr in (("--gtm-json", "gtm_json"), ("--har", "har")):
        if getattr(args, attr, None):
            print(
                f"warning: {flag} is accepted but not used by this command",
                file=sys.stderr,
            )

    print(f"[1/4] Scanning {url} …", flush=True)
    scan_result = asyncio.run(scan_page(url=url))

    print("[2/4] Classifying violations …", flush=True)
    audit_result = classify(scan_result)

    audit_dir = out_dir / audit_result.audit_id
    audit_dir.mkdir(parents=True, exist_ok=True)

    # Persist the network evidence per Fred Pike's "glass box" pattern —
    # every captured request goes to evidence.jsonl, audit-scoped.
    print("[3/4] Writing evidence log …", flush=True)
    with (audit_dir / "evidence.jsonl").open("w", encoding="utf-8") as f:
        for req in scan_result.network_requests:
            f.write(json.dumps(req.model_dump(mode="json"), default=str) + "\n")

    print("[4/4] Generating report + deck …", flush=True)
    report_html, deck_md = generate_report(audit_result)
    # Explicit UTF-8: the locale default encoding can fail on non-ASCII
    # report text on some platforms.
    (audit_dir / "report.html").write_text(report_html, encoding="utf-8")
    (audit_dir / "deck.marp.md").write_text(deck_md, encoding="utf-8")
    (audit_dir / "audit_result.json").write_text(
        json.dumps(audit_result.model_dump(mode="json"), indent=2, default=str),
        encoding="utf-8",
    )

    print()
    print(f"Audit complete: {audit_dir}")
    print(f"  Report:    {audit_dir / 'report.html'}")
    print(f"  Deck:      {audit_dir / 'deck.marp.md'}")
    print(f"  Evidence:  {audit_dir / 'evidence.jsonl'}")

    # NOTE(review): the AuditResult model in models/audit_result.py declares
    # `findings` but no `violations` / `warnings`. Use those attributes when
    # the classifier's result has them; otherwise fall back to the findings
    # count instead of raising after the bundle was already written.
    violations = getattr(audit_result, "violations", None)
    warnings = getattr(audit_result, "warnings", None)
    if violations is not None and warnings is not None:
        print(f"  Findings:  {len(violations)} violation(s), "
              f"{len(warnings)} warning(s)")
    else:
        findings = getattr(audit_result, "findings", None) or []
        print(f"  Findings:  {len(findings)} finding(s)")
    return 0
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
def _chat_command(args: argparse.Namespace) -> int:
    """Open a Claude conversation grounded in a completed audit.

    Loads ``audit_result.json`` and ``evidence.jsonl`` from
    ``./out/<audit_id>/`` and then answers questions in a read/print loop
    until the user types ``exit``/``quit``, submits an empty line, or sends
    EOF / Ctrl-C.

    Args:
        args: Parsed CLI namespace; reads ``audit_id``.

    Returns:
        ``0`` when the session ends normally, ``1`` when the bundle or the
        chat backend cannot be loaded.
    """
    audit_dir = Path("./out") / args.audit_id
    if not audit_dir.is_dir():
        print(f"error: no audit bundle at {audit_dir}", file=sys.stderr)
        return 1

    try:
        from consent_engine.llm.client import chat_with_context
    except ImportError as exc:
        # Include the underlying cause: a missing optional dependency and a
        # missing `chat_with_context` symbol both land here, and only the
        # former is fixed by installing the extra.
        print(
            f"error: chat requires `pip install consent-engine[chat]` ({exc})",
            file=sys.stderr,
        )
        return 1

    try:
        audit = json.loads(
            (audit_dir / "audit_result.json").read_text(encoding="utf-8")
        )
        evidence_lines = (
            (audit_dir / "evidence.jsonl").read_text(encoding="utf-8").splitlines()
        )
    except (OSError, ValueError) as exc:
        # OSError: file missing/unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        print(
            f"error: could not load audit bundle at {audit_dir}: {exc}",
            file=sys.stderr,
        )
        return 1

    print(f"Loaded audit {args.audit_id}. {len(evidence_lines)} network "
          f"events captured. Type 'exit' to quit.\n")

    while True:
        try:
            question = input("you> ").strip()
        except (EOFError, KeyboardInterrupt):
            print()
            return 0
        if not question or question.lower() in {"exit", "quit"}:
            return 0
        answer = chat_with_context(
            question=question,
            audit_result=audit,
            evidence=evidence_lines,
        )
        print(f"claude> {answer}\n")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
def main(argv: list[str] | None = None) -> int:
    """Parse the command line and dispatch to the selected sub-command.

    Args:
        argv: Argument list to parse; ``None`` is passed through to argparse.

    Returns:
        The integer status returned by the chosen sub-command handler.
    """

    def _version_command(_: argparse.Namespace) -> int:
        # Handler for `consent-engine version`.
        print(f"consent-engine {__version__}")
        return 0

    parser = argparse.ArgumentParser(
        prog="consent-engine",
        description="Forensic consent compliance audit engine.",
    )
    commands = parser.add_subparsers(dest="command", required=True)

    # audit <url> [--output-dir DIR] [--gtm-json PATH] [--har PATH]
    audit_parser = commands.add_parser("audit", help="Run an audit against a URL.")
    audit_parser.add_argument("url", help="The URL to audit.")
    optional_flags = (
        ("--output-dir", "Output directory (default: ./out)."),
        ("--gtm-json", "Optional GTM container JSON export."),
        ("--har", "Optional HAR file."),
    )
    for flag, help_text in optional_flags:
        audit_parser.add_argument(flag, help=help_text)
    audit_parser.set_defaults(func=_audit_command)

    # chat <audit_id>
    chat_parser = commands.add_parser("chat", help="Chat over a completed audit.")
    chat_parser.add_argument(
        "audit_id", help="Audit ID (the directory name under ./out/)."
    )
    chat_parser.set_defaults(func=_chat_command)

    # version
    version_parser = commands.add_parser("version", help="Print version + exit.")
    version_parser.set_defaults(func=_version_command)

    namespace = parser.parse_args(argv)
    handler = namespace.func
    return handler(namespace)
|
|
130
|
+
|
|
131
|
+
|
|
132
|
+
if __name__ == "__main__":
|
|
133
|
+
sys.exit(main())
|
consent_engine/config.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
from functools import lru_cache
|
|
2
|
+
|
|
3
|
+
from pydantic import field_validator
|
|
4
|
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class Settings(BaseSettings):
    """Package settings, read by pydantic-settings (including a ``.env`` file).

    Unknown keys are ignored (``extra="ignore"``).
    """

    model_config = SettingsConfigDict(
        env_file=".env",
        env_file_encoding="utf-8",
        extra="ignore",
    )

    # ── LLM ──────────────────────────────────────────────────────────────
    anthropic_api_key: str | None = None
    default_audit_model: str = "vertex_ai/gemini-2.5-pro"
    default_classify_model: str = "vertex_ai/gemini-2.5-flash"

    # ── Gemini / Vertex AI ───────────────────────────────────────────────
    gemini_api_key: str | None = None
    # GCP project ID for Vertex AI
    vertex_project: str | None = None
    vertex_location: str = "us-central1"

    # ── App ──────────────────────────────────────────────────────────────
    environment: str = "development"
    log_level: str = "INFO"

    # ── Playwright proxy (optional — empty string treated as None) ───────
    playwright_proxy_url: str | None = None

    @field_validator("playwright_proxy_url", mode="before")
    @classmethod
    def empty_str_to_none(cls, v: object) -> object:
        """Turn a blank or ``"placeholder"`` proxy string into ``None``.

        Non-string values and every other string are returned unchanged.
        """
        if not isinstance(v, str):
            return v
        trimmed = v.strip()
        if trimmed in ("", "placeholder"):
            return None
        return v
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
@lru_cache(maxsize=1)
def get_settings() -> Settings:
    """Build the ``Settings`` object once and return the cached instance after."""
    settings = Settings()  # type: ignore[call-arg,unused-ignore]
    return settings
|
|
File without changes
|
|
@@ -0,0 +1,50 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
import os
|
|
4
|
+
from typing import Any
|
|
5
|
+
|
|
6
|
+
import litellm
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
def _propagate_api_keys() -> None:
    """Propagate API keys from pydantic-settings into os.environ for LiteLLM.

    pydantic-settings reads .env into a Python object but does NOT set OS env
    vars. LiteLLM reads provider keys directly from os.environ, so we bridge
    the gap here without overwriting keys that were already set at process start.

    This stays best-effort: any failure is logged and swallowed so that
    constructing a client never fails because of settings loading.
    """
    import logging  # local import: only needed to report a failure below

    try:
        from consent_engine.config import get_settings  # local import to avoid circular

        settings = get_settings()
        pairs = [
            ("GEMINI_API_KEY", settings.gemini_api_key),
            ("ANTHROPIC_API_KEY", settings.anthropic_api_key),
        ]
        for env_var, value in pairs:
            # Only fill in variables that are unset or empty.
            if value and not os.environ.get(env_var):
                os.environ[env_var] = value
    except Exception as exc:  # noqa: BLE001
        # Previously a bare `pass`: a broken settings load then looked like
        # "no API key configured". Log the exception type only at WARNING so
        # that no configuration values end up in the log; the traceback is
        # available at DEBUG.
        logger = logging.getLogger(__name__)
        logger.warning(
            "Could not propagate API keys from settings (%s); "
            "relying on the existing environment.",
            type(exc).__name__,
        )
        logger.debug("API key propagation failure", exc_info=True)
|
|
29
|
+
|
|
30
|
+
|
|
31
|
+
class LLMClient:
    """Minimal wrapper over LiteLLM; the provider is selected by the `model` string."""

    def __init__(self, model: str) -> None:
        """Remember the model name and bridge settings API keys into the environment."""
        self.model = model
        _propagate_api_keys()

    async def complete(
        self,
        messages: list[dict[str, Any]],
        tools: list[dict[str, Any]] | None = None,
        system: str | None = None,
    ) -> dict[str, Any]:
        """Send one chat-completion request through ``litellm.acompletion``.

        Args:
            messages: Conversation messages, passed through as given.
            tools: Tool definitions; only forwarded when non-empty.
            system: System prompt; when non-empty it is prepended as a
                ``{"role": "system"}`` message.

        Returns:
            ``dict(response)`` of the LiteLLM response object.
        """
        conversation = messages
        if system:
            system_message = {"role": "system", "content": system}
            conversation = [system_message] + messages

        request: dict[str, Any] = {"model": self.model, "messages": conversation}
        if tools:
            request["tools"] = tools

        response = await litellm.acompletion(**request)
        return dict(response)
|
|
@@ -0,0 +1,185 @@
|
|
|
1
|
+
"""MCP server wrapper for consent-engine.
|
|
2
|
+
|
|
3
|
+
Exposes the audit pipeline as Model Context Protocol tools so Claude Desktop
|
|
4
|
+
(and any other MCP host) can run an audit, read the result, and query the
|
|
5
|
+
captured evidence from a conversation.
|
|
6
|
+
|
|
7
|
+
Run standalone:
|
|
8
|
+
uvx consent-engine-mcp
|
|
9
|
+
|
|
10
|
+
Register in Claude Desktop config:
|
|
11
|
+
{
|
|
12
|
+
"mcpServers": {
|
|
13
|
+
"consent-engine": {
|
|
14
|
+
"command": "uvx",
|
|
15
|
+
"args": ["consent-engine-mcp"]
|
|
16
|
+
}
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
"""
|
|
20
|
+
|
|
21
|
+
from __future__ import annotations
|
|
22
|
+
|
|
23
|
+
import asyncio
|
|
24
|
+
import json
|
|
25
|
+
from pathlib import Path
|
|
26
|
+
from typing import Any
|
|
27
|
+
|
|
28
|
+
# `mcp` is an optional dependency. If the user installed
|
|
29
|
+
# `pip install consent-engine[mcp]` we get it; otherwise we surface a clear
|
|
30
|
+
# error rather than failing on import.
|
|
31
|
+
try:
|
|
32
|
+
from mcp.server import Server
|
|
33
|
+
from mcp.server.stdio import stdio_server
|
|
34
|
+
from mcp.types import TextContent, Tool
|
|
35
|
+
except ImportError as e: # pragma: no cover
|
|
36
|
+
raise SystemExit(
|
|
37
|
+
"MCP support requires the optional [mcp] extra:\n"
|
|
38
|
+
" pip install 'consent-engine[mcp]'\n"
|
|
39
|
+
) from e
|
|
40
|
+
|
|
41
|
+
|
|
42
|
+
server: Server = Server("consent-engine")
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@server.list_tools()
async def list_tools() -> list[Tool]:
    """Advertise the MCP tools this server implements.

    The names and input schemas here must stay in step with the dispatch in
    ``call_tool``. The ``query_evidence`` description only promises the
    filters that its schema actually exposes (URL and host substrings).
    """
    return [
        Tool(
            name="audit_url",
            description=(
                "Run a forensic consent-compliance audit against a URL. "
                "Returns the audit_id, a one-paragraph executive summary, "
                "and a violations count. Use read_audit_result / "
                "query_evidence to drill into specifics."
            ),
            inputSchema={
                "type": "object",
                "properties": {"url": {"type": "string"}},
                "required": ["url"],
            },
        ),
        Tool(
            name="read_audit_result",
            description=(
                "Load the structured audit_result.json for a prior audit. "
                "Returns the full Pydantic model as JSON."
            ),
            inputSchema={
                "type": "object",
                "properties": {"audit_id": {"type": "string"}},
                "required": ["audit_id"],
            },
        ),
        Tool(
            name="query_evidence",
            # The earlier wording also advertised a time-window filter, but no
            # such parameter exists in the schema below, so the host model
            # was being told about a capability it could not invoke.
            description=(
                "Filter the captured network evidence for a prior audit. "
                "Use this when the user asks 'why did X fire'. "
                "Filter by url substring and/or host substring; at most "
                "max_results events are returned."
            ),
            inputSchema={
                "type": "object",
                "properties": {
                    "audit_id": {"type": "string"},
                    "url_contains": {"type": "string"},
                    "host_contains": {"type": "string"},
                    "max_results": {"type": "integer", "default": 20, "minimum": 1},
                },
                "required": ["audit_id"],
            },
        ),
    ]
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
@server.call_tool()
|
|
97
|
+
async def call_tool(name: str, arguments: dict[str, Any]) -> list[TextContent]:
|
|
98
|
+
if name == "audit_url":
|
|
99
|
+
return await _audit_url(arguments["url"])
|
|
100
|
+
if name == "read_audit_result":
|
|
101
|
+
return _read_audit_result(arguments["audit_id"])
|
|
102
|
+
if name == "query_evidence":
|
|
103
|
+
return _query_evidence(arguments)
|
|
104
|
+
raise ValueError(f"Unknown tool: {name}")
|
|
105
|
+
|
|
106
|
+
|
|
107
|
+
async def _audit_url(url: str) -> list[TextContent]:
    """Scan and classify ``url``, persist the bundle, and return a text summary.

    Writes ``audit_result.json`` and ``evidence.jsonl`` to
    ``./out/<audit_id>/`` so that ``read_audit_result`` and
    ``query_evidence`` can be used afterwards.

    Args:
        url: The page to audit.

    Returns:
        A single ``TextContent`` with the audit id, counts and the executive
        summary.
    """
    # Lazy import — avoids pulling Playwright at MCP server start
    from consent_engine.tools.tool_02_violation_classifier import classify
    from consent_engine.tools.tool_03_browser_scanner import scan_page
    from consent_engine.tools.tool_08_report_generator import generate_executive_summary

    scan = await scan_page(url=url)
    audit = classify(scan)

    audit_dir = Path("./out") / audit.audit_id
    audit_dir.mkdir(parents=True, exist_ok=True)
    (audit_dir / "audit_result.json").write_text(
        json.dumps(audit.model_dump(mode="json"), indent=2, default=str),
        encoding="utf-8",
    )
    with (audit_dir / "evidence.jsonl").open("w", encoding="utf-8") as f:
        for req in scan.network_requests:
            f.write(json.dumps(req.model_dump(mode="json"), default=str) + "\n")

    summary = generate_executive_summary(audit)

    # NOTE(review): the AuditResult model in models/audit_result.py declares
    # `findings` but no `violations` / `warnings`. Report those counts when
    # the classifier's result has them; otherwise fall back to the findings
    # count rather than failing after the bundle has been written.
    violations = getattr(audit, "violations", None)
    warnings = getattr(audit, "warnings", None)
    if violations is not None and warnings is not None:
        counts = (
            f"  Violations: {len(violations)}\n"
            f"  Warnings:   {len(warnings)}\n\n"
        )
    else:
        findings = getattr(audit, "findings", None) or []
        counts = f"  Findings:   {len(findings)}\n\n"

    return [TextContent(
        type="text",
        text=(
            f"Audit complete: {audit.audit_id}\n"
            f"  URL: {url}\n"
            f"{counts}"
            f"Summary:\n{summary}"
        ),
    )]
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def _read_audit_result(audit_id: str) -> list[TextContent]:
    """Return the stored ``audit_result.json`` for ``audit_id`` as text.

    Args:
        audit_id: Name of the bundle directory under ``./out/``.

    Returns:
        One ``TextContent`` holding the file contents, or a short message
        when the id is invalid or no bundle exists.
    """
    # audit_id comes from the MCP client. Accept only a single path component
    # so values like "../x" or an absolute path cannot point outside ./out.
    if (
        not isinstance(audit_id, str)
        or not audit_id
        or audit_id in {".", ".."}
        or Path(audit_id).name != audit_id
    ):
        return [TextContent(type="text", text=f"Invalid audit_id: {audit_id!r}")]

    path = Path("./out") / audit_id / "audit_result.json"
    if not path.is_file():
        return [TextContent(type="text", text=f"No audit bundle at {path}")]
    return [TextContent(type="text", text=path.read_text(encoding="utf-8"))]
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
def _query_evidence(args: dict[str, Any]) -> list[TextContent]:
    """Filter a prior audit's ``evidence.jsonl`` by URL and/or host substring.

    Args:
        args: Tool arguments. ``audit_id`` is required; ``url_contains``,
            ``host_contains`` and ``max_results`` (default 20) are optional.
            Substring matching is case-insensitive.

    Returns:
        One ``TextContent`` with the match count followed by the matching
        events as indented JSON, or a short message when the id is invalid
        or no evidence file exists.

    Raises:
        KeyError: If ``audit_id`` is missing from ``args``.
    """
    from urllib.parse import urlsplit  # local import: only this tool needs it

    audit_id = args["audit_id"]
    # audit_id comes from the MCP client. Accept only a single path component
    # so values like "../x" or an absolute path cannot point outside ./out.
    if (
        not isinstance(audit_id, str)
        or not audit_id
        or audit_id in {".", ".."}
        or Path(audit_id).name != audit_id
    ):
        return [TextContent(type="text", text=f"Invalid audit_id: {audit_id!r}")]

    # Tolerate an explicit null or a non-integer value instead of raising on
    # the comparison below; always return at least one match when available.
    raw_limit = args.get("max_results")
    try:
        max_results = 20 if raw_limit is None else int(raw_limit)
    except (TypeError, ValueError):
        max_results = 20
    max_results = max(max_results, 1)

    url_contains = str(args.get("url_contains") or "").lower()
    host_contains = str(args.get("host_contains") or "").lower()

    path = Path("./out") / audit_id / "evidence.jsonl"
    if not path.is_file():
        return [TextContent(type="text", text=f"No evidence at {path}")]

    matches: list[dict[str, Any]] = []
    # Stream the file so the scan can stop as soon as the cap is reached.
    with path.open(encoding="utf-8") as fh:
        for line in fh:
            try:
                evt = json.loads(line)
            except json.JSONDecodeError:
                continue  # blank or corrupt line
            if not isinstance(evt, dict):
                continue  # valid JSON, but not an event object
            u = str(evt.get("url") or "").lower()
            if url_contains and url_contains not in u:
                continue
            if host_contains:
                # Match against the hostname only. Testing the whole URL (as
                # before) also matched the path and query string, so a host
                # filter could hit any URL merely mentioning that name.
                try:
                    host = urlsplit(u).hostname or ""
                except ValueError:
                    host = ""  # malformed URL: no host to match
                if host_contains not in host:
                    continue
            matches.append(evt)
            if len(matches) >= max_results:
                break

    return [TextContent(
        type="text",
        text=f"{len(matches)} match(es):\n" + json.dumps(matches, indent=2, default=str),
    )]
|
|
172
|
+
|
|
173
|
+
|
|
174
|
+
def cli() -> None:
    """Entrypoint registered as `consent-engine-mcp` in pyproject.toml.

    Runs the async server loop to completion on a fresh event loop.
    """
    server_loop = _run()
    asyncio.run(server_loop)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
async def _run() -> None:
    """Run the MCP server over the stdio transport."""
    async with stdio_server() as streams:
        incoming, outgoing = streams
        init_options = server.create_initialization_options()
        await server.run(incoming, outgoing, init_options)
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
if __name__ == "__main__": # pragma: no cover
|
|
185
|
+
cli()
|
|
File without changes
|
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
from enum import StrEnum
|
|
2
|
+
from typing import Literal
|
|
3
|
+
|
|
4
|
+
from pydantic import BaseModel
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
class CMPProvider(StrEnum):
|
|
8
|
+
ONETRUST = "onetrust"
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
class ConsentState(StrEnum):
|
|
12
|
+
OPTED_IN = "opted_in"
|
|
13
|
+
OPTED_OUT = "opted_out"
|
|
14
|
+
GPC_OPTED_OUT = "gpc_opted_out"
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
class AuditRequest(BaseModel):
    """Inputs describing a single audit run."""

    # Target page.
    url: str

    # Restricted to OneTrust by the Literal type; defaults to it as well.
    cmp_provider: Literal[CMPProvider.ONETRUST] = CMPProvider.ONETRUST

    # Defaults to the opted-out state.
    consent_state: ConsentState = ConsentState.OPTED_OUT

    # Optional supporting inputs; all default to None.
    gtm_container_json: str | None = None
    onetrust_receipt_jwt: str | None = None
    har_file_path: str | None = None
|
|
@@ -0,0 +1,152 @@
|
|
|
1
|
+
from datetime import datetime
|
|
2
|
+
from enum import StrEnum
|
|
3
|
+
from typing import Literal
|
|
4
|
+
|
|
5
|
+
from pydantic import BaseModel
|
|
6
|
+
|
|
7
|
+
from .vendor import Vendor
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
class GTMExtractionMethod(StrEnum):
|
|
11
|
+
LIVE = "live" # Intercepted from gtm.js during scan — strongest evidence
|
|
12
|
+
PROVIDED = "provided" # User-supplied JSON export
|
|
13
|
+
NONE = "none" # No container data available
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
class ViolationStatus(StrEnum):
|
|
17
|
+
CONFIRMED = "confirmed_violation"
|
|
18
|
+
LIKELY = "likely_violation"
|
|
19
|
+
REQUIRES_INVESTIGATION = "requires_further_investigation"
|
|
20
|
+
NO_EVIDENCE = "no_evidence_of_violation"
|
|
21
|
+
ACM_COMPLIANT = "acm_cookieless_ping" # Google tag firing cookieless (correct ACM behaviour)
|
|
22
|
+
|
|
23
|
+
|
|
24
|
+
class MethodologyFlag(StrEnum):
|
|
25
|
+
S1 = "s1_baseline"
|
|
26
|
+
S2 = "s2_post_optout_no_reload"
|
|
27
|
+
S3 = "s3_fresh_load_optout_preset"
|
|
28
|
+
# S3 run completed, but consent injection could not be verified against
|
|
29
|
+
# a denied post-injection Consent Mode signal (e.g. unknown CMP, or
|
|
30
|
+
# injection silently did not suppress tracking). Treat as non-definitive.
|
|
31
|
+
INCONCLUSIVE_UNKNOWN_CMP = "s3_inconclusive_unknown_cmp"
|
|
32
|
+
# S3 run completed with a recognised CMP and a matching injection plan,
|
|
33
|
+
# but Google Consent Mode beacons continued firing with GCS=G111
|
|
34
|
+
# throughout the scan. This is definitive evidence that the site's tag
|
|
35
|
+
# wiring fires before or regardless of the consent state the CMP stores
|
|
36
|
+
# — the CMP is working; the integration is broken. Findings are legally
|
|
37
|
+
# defensible.
|
|
38
|
+
S3_CONSENT_WIRING_BROKEN = "s3_consent_wiring_broken"
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class GCSValue(BaseModel):
    """A GCS consent signal: the raw value plus its two storage components."""

    raw: str
    ad_storage: str
    analytics_storage: str
|
|
45
|
+
|
|
46
|
+
|
|
47
|
+
class TagConsentEntry(BaseModel):
    """Per-tag GTM consent configuration extracted by Tool 1."""

    tag_id: int
    tag_name: str
    # GTM function code, e.g. "__html", "__ua", "__ga4"
    tag_type: str
    is_google_tag: bool
    # e.g. ["ad_storage", "analytics_storage"]
    consent_types: list[str] = []
    # One of:
    #   "required"    — explicit consent settings present, enforced
    #   "optional"    — explicit consent settings, default_value=1
    #   "acm_managed" — Google tag — ACM handles consent via cookieless ping
    #   "missing"     — NON-Google tag with no consent settings — VIOLATION
    requirement: Literal["required", "optional", "acm_managed", "missing"]
|
|
61
|
+
|
|
62
|
+
|
|
63
|
+
class GCSHit(BaseModel):
    """A single GCS signal observation from HAR analysis (Tool 4)."""

    url: str
    gcs_value: GCSValue
    gcd_raw: str | None = None
    # milliseconds from first HAR entry
    timestamp_ms: float
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
class HarAnalysis(BaseModel):
    """Output of Tool 4 HAR file analysis."""

    gcs_timeline: list[GCSHit] = []
    # raw POST bodies (beacons, dataLayer pushes)
    post_payloads: list[str] = []
    # response bodies from consent endpoints
    consent_api_responses: list[str] = []
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
class PixelFiring(BaseModel):
    """A tracking pixel endpoint observed firing in network traffic post-consent-denial.

    This is the primary evidence method used by plaintiff attorneys — detecting
    known ad/analytics pixel endpoints in HAR/network traffic regardless of cookies.
    """

    # e.g. "Meta Pixel", "TikTok Pixel", "LinkedIn Insight Tag"
    vendor_name: str
    # full request URL observed
    url: str
    # "advertising" | "analytics" | "session_recording"
    category: str
    # "high" | "medium"
    legal_exposure: str
    # the pattern that triggered this match
    matched_pattern: str
    # True = Google ACM cookieless ping (G100+npa=1) — expected behavior,
    # not a violation
    is_acm_ping: bool = False
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
class VendorFinding(BaseModel):
    """One vendor's verdict, with the methodology and evidence behind it."""

    vendor: Vendor
    status: ViolationStatus
    methodology: MethodologyFlag

    # Supporting observations; every field below is optional.
    cookies_observed: list[str] = []
    gcs_value: GCSValue | None = None
    gpc_honored: bool | None = None
    evidence: list[str] = []
    notes: str = ""
|
|
106
|
+
|
|
107
|
+
|
|
108
|
+
class AuditResult(BaseModel):
    """Top-level result of one audit: scan metadata, consent signals and findings."""

    # ── Identity ─────────────────────────────────────────────────────────
    audit_id: str
    url: str
    timestamp: datetime
    methodology: MethodologyFlag

    # ── GTM / server-side GTM ────────────────────────────────────────────
    gtm_extraction_method: GTMExtractionMethod = GTMExtractionMethod.NONE
    # e.g. "GTM-XXXXXX"
    gtm_container_id: str | None = None
    ssgtm_detected: bool = False
    ssgtm_domain: str | None = None

    # ── GPC signal test ──────────────────────────────────────────────────
    # Populated when a dedicated GPC scan is run alongside the primary S3
    # scan. Lets the report show a clear pass/fail on whether the site
    # respected the Global Privacy Control opt-out signal.
    gpc_tested: bool = False
    # Sec-GPC: 1 HTTP header on all requests
    gpc_header_sent: bool = False
    # navigator.globalPrivacyControl = true injected
    gpc_navigator_api_set: bool = False
    # True = pixel count dropped; None = not tested
    gpc_signal_respected: bool | None = None
    # vendors still firing after GPC asserted
    gpc_vendors_after_signal: int = 0
    # pixel firings during primary S3 opt-out scan
    gpc_pixel_count_baseline: int = 0
    # pixel firings during GPC scan
    gpc_pixel_count_with_gpc: int = 0

    # ── Scan-level consent signals from primary scan network traffic ─────
    gcs_value: GCSValue | None = None
    gcd_raw: str | None = None
    # "cookie_injection" | "banner_click" | "banner_click_inconclusive"
    # | "banner_click_failed" | "banner_click_reverted"
    cmp_interaction_method: str | None = None
    detected_cmp: str | None = None
    cmp_detection_confidence: str | None = None
    bot_detection_encountered: bool = False
    # Records whether the primary Chromium scan succeeded ("playwright") or
    # the Scrapling/Camoufox stealthy fallback had to be engaged
    # ("stealthy") — set when the primary scan hit a WAF/bot challenge.
    scan_mode_used: Literal["playwright", "stealthy"] = "playwright"
    # "EU" | "US" | "CA"; str (not Literal) to allow extension without
    # schema migration
    detected_jurisdiction: str | None = None

    # ── Collected evidence and conclusions ───────────────────────────────
    tag_consent_map: list[TagConsentEntry] = []
    gcs_timeline: list[GCSHit] = []
    post_payloads: list[str] = []
    consent_api_responses: list[str] = []
    findings: list[VendorFinding] = []
    # Network-level pixel endpoint detections (plaintiff evidence)
    pixel_firings: list[PixelFiring] = []
    open_gaps: list[str] = []
    remediation: list[str] = []
|