ClawGuard-PII 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- clawguard_pii-0.1.0/PKG-INFO +12 -0
- clawguard_pii-0.1.0/README.md +57 -0
- clawguard_pii-0.1.0/pyproject.toml +30 -0
- clawguard_pii-0.1.0/setup.cfg +4 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/PKG-INFO +12 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/SOURCES.txt +14 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/dependency_links.txt +1 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/entry_points.txt +2 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/requires.txt +7 -0
- clawguard_pii-0.1.0/src/ClawGuard_PII.egg-info/top_level.txt +1 -0
- clawguard_pii-0.1.0/src/clawguard/__init__.py +3 -0
- clawguard_pii-0.1.0/src/clawguard/cli.py +15 -0
- clawguard_pii-0.1.0/src/clawguard/redactor.py +90 -0
- clawguard_pii-0.1.0/src/clawguard/server.py +102 -0
- clawguard_pii-0.1.0/tests/test_redactor.py +189 -0
- clawguard_pii-0.1.0/tests/test_server.py +207 -0
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ClawGuard-PII
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local PII redaction service for OpenClaw using nvidia/gliner-PII
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: gliner>=0.2.0
|
|
8
|
+
Requires-Dist: fastapi>=0.111.0
|
|
9
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
12
|
+
Requires-Dist: httpx>=0.27.0; extra == "dev"
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
# ClawGuard
|
|
2
|
+
|
|
3
|
+
Local PII redaction server for [OpenClaw](https://openclaw.ai). Runs entirely on-device using [`nvidia/gliner-PII`](https://huggingface.co/nvidia/gliner-PII) — no data leaves your machine.
|
|
4
|
+
|
|
5
|
+
Designed for use with the [pii-redactor](https://clawhub.com/skills/pii-redactor) OpenClaw skill.
|
|
6
|
+
|
|
7
|
+
## Requirements
|
|
8
|
+
|
|
9
|
+
- Python 3.10+
|
|
10
|
+
|
|
11
|
+
## Install
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
pip install ClawGuard-PII
|
|
15
|
+
```
|
|
16
|
+
|
|
17
|
+
## Quickstart
|
|
18
|
+
|
|
19
|
+
1. Generate a token:
|
|
20
|
+
|
|
21
|
+
```bash
|
|
22
|
+
python3 -c "import secrets; print(secrets.token_hex(32))"
|
|
23
|
+
```
|
|
24
|
+
|
|
25
|
+
2. Start the server:
|
|
26
|
+
|
|
27
|
+
```bash
|
|
28
|
+
CLAWGUARD_TOKEN=<your-token> clawguard serve
|
|
29
|
+
```
|
|
30
|
+
|
|
31
|
+
3. Set env vars in your agent:
|
|
32
|
+
|
|
33
|
+
```
|
|
34
|
+
CLAWGUARD_URL=http://localhost:8000
|
|
35
|
+
CLAWGUARD_TOKEN=<your-token>
|
|
36
|
+
```
|
|
37
|
+
## Supported Entity Types
|
|
38
|
+
`nvidia/gliner-PII` was fine-tuned to extract the following entity types:
|
|
39
|
+
|
|
40
|
+
* email
|
|
41
|
+
* phone_number
|
|
42
|
+
* ssn
|
|
43
|
+
* credit_card_number
|
|
44
|
+
* bank_account_number
|
|
45
|
+
* ip_address
|
|
46
|
+
* password
|
|
47
|
+
* api_key
|
|
48
|
+
* user_name
|
|
49
|
+
* date_of_birth
|
|
50
|
+
* drivers_license_number
|
|
51
|
+
* passport_number
|
|
52
|
+
* address
|
|
53
|
+
* medical_record_number
|
|
54
|
+
* health_insurance_id
|
|
55
|
+
|
|
56
|
+
|
|
57
|
+
**License: Apache 2.0**
|
|
@@ -0,0 +1,30 @@
|
|
|
1
|
+
[build-system]
|
|
2
|
+
requires = ["setuptools>=68.0", "wheel"]
|
|
3
|
+
build-backend = "setuptools.build_meta"
|
|
4
|
+
|
|
5
|
+
[project]
|
|
6
|
+
name = "ClawGuard-PII"
|
|
7
|
+
version = "0.1.0"
|
|
8
|
+
description = "Local PII redaction service for OpenClaw using nvidia/gliner-PII"
|
|
9
|
+
requires-python = ">=3.10"
|
|
10
|
+
license = "Apache-2.0"
|
|
11
|
+
dependencies = [
|
|
12
|
+
"gliner>=0.2.0",
|
|
13
|
+
"fastapi>=0.111.0",
|
|
14
|
+
"uvicorn[standard]>=0.30.0",
|
|
15
|
+
]
|
|
16
|
+
|
|
17
|
+
[project.scripts]
|
|
18
|
+
clawguard = "clawguard.cli:serve"
|
|
19
|
+
|
|
20
|
+
[project.optional-dependencies]
|
|
21
|
+
dev = [
|
|
22
|
+
"pytest>=8.0.0",
|
|
23
|
+
"httpx>=0.27.0",
|
|
24
|
+
]
|
|
25
|
+
|
|
26
|
+
[tool.setuptools.packages.find]
|
|
27
|
+
where = ["src"]
|
|
28
|
+
|
|
29
|
+
[tool.pytest.ini_options]
|
|
30
|
+
testpaths = ["tests"]
|
|
@@ -0,0 +1,12 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: ClawGuard-PII
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Local PII redaction service for OpenClaw using nvidia/gliner-PII
|
|
5
|
+
License-Expression: Apache-2.0
|
|
6
|
+
Requires-Python: >=3.10
|
|
7
|
+
Requires-Dist: gliner>=0.2.0
|
|
8
|
+
Requires-Dist: fastapi>=0.111.0
|
|
9
|
+
Requires-Dist: uvicorn[standard]>=0.30.0
|
|
10
|
+
Provides-Extra: dev
|
|
11
|
+
Requires-Dist: pytest>=8.0.0; extra == "dev"
|
|
12
|
+
Requires-Dist: httpx>=0.27.0; extra == "dev"
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
README.md
|
|
2
|
+
pyproject.toml
|
|
3
|
+
src/ClawGuard_PII.egg-info/PKG-INFO
|
|
4
|
+
src/ClawGuard_PII.egg-info/SOURCES.txt
|
|
5
|
+
src/ClawGuard_PII.egg-info/dependency_links.txt
|
|
6
|
+
src/ClawGuard_PII.egg-info/entry_points.txt
|
|
7
|
+
src/ClawGuard_PII.egg-info/requires.txt
|
|
8
|
+
src/ClawGuard_PII.egg-info/top_level.txt
|
|
9
|
+
src/clawguard/__init__.py
|
|
10
|
+
src/clawguard/cli.py
|
|
11
|
+
src/clawguard/redactor.py
|
|
12
|
+
src/clawguard/server.py
|
|
13
|
+
tests/test_redactor.py
|
|
14
|
+
tests/test_server.py
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
clawguard
|
|
@@ -0,0 +1,15 @@
|
|
|
1
|
+
"""CLI entrypoint for ClawGuard."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import sys
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
def serve() -> None:
    """Run the ClawGuard FastAPI app under uvicorn on 127.0.0.1:8000.

    uvicorn is imported lazily so that a missing installation produces a
    short, actionable message on stderr and exit status 1 instead of a
    traceback.
    """
    try:
        import uvicorn
    except ImportError:
        # The distribution is published as "ClawGuard-PII"; "clawguard" is only
        # the import package / console-script name, so the install hint must
        # name the distribution.
        print("uvicorn is not installed. Run: pip install ClawGuard-PII", file=sys.stderr)
        sys.exit(1)

    # The app is referenced by import string and served on the loopback
    # interface only, with auto-reload disabled.
    uvicorn.run("clawguard.server:app", host="127.0.0.1", port=8000, reload=False)
|
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
"""PII detection and redaction using nvidia/gliner-PII."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from dataclasses import dataclass, field
|
|
6
|
+
|
|
7
|
+
from gliner import GLiNER
|
|
8
|
+
|
|
9
|
+
|
|
10
|
+
DEFAULT_MODEL_ID = "nvidia/gliner-PII"
|
|
11
|
+
DEFAULT_THRESHOLD = 0.5
|
|
12
|
+
|
|
13
|
+
DEFAULT_LABELS: list[str] = [
|
|
14
|
+
"email",
|
|
15
|
+
"phone_number",
|
|
16
|
+
"ssn",
|
|
17
|
+
"credit_card_number",
|
|
18
|
+
"bank_account_number",
|
|
19
|
+
"ip_address",
|
|
20
|
+
"password",
|
|
21
|
+
"api_key",
|
|
22
|
+
"user_name",
|
|
23
|
+
"date_of_birth",
|
|
24
|
+
"drivers_license_number",
|
|
25
|
+
"passport_number",
|
|
26
|
+
"address",
|
|
27
|
+
"medical_record_number",
|
|
28
|
+
"health_insurance_id",
|
|
29
|
+
]
|
|
30
|
+
|
|
31
|
+
|
|
32
|
+
@dataclass
|
|
33
|
+
class RedactResult:
|
|
34
|
+
redacted_text: str
|
|
35
|
+
redacted_count: int
|
|
36
|
+
redacted_items: list[dict] = field(default_factory=list)
|
|
37
|
+
|
|
38
|
+
|
|
39
|
+
def _resolve_overlaps(entities: list[dict]) -> list[dict]:
|
|
40
|
+
"""Remove overlapping spans, keeping the highest-confidence entity per region."""
|
|
41
|
+
sorted_by_conf = sorted(entities, key=lambda e: e["score"], reverse=True)
|
|
42
|
+
kept: list[dict] = []
|
|
43
|
+
for ent in sorted_by_conf:
|
|
44
|
+
if any(ent["start"] < k["end"] and ent["end"] > k["start"] for k in kept):
|
|
45
|
+
continue
|
|
46
|
+
kept.append(ent)
|
|
47
|
+
return sorted(kept, key=lambda e: e["start"])
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
class PIIRedactor:
    """Detect PII spans with a GLiNER model and replace them with placeholders."""

    def __init__(
        self,
        model_id: str = DEFAULT_MODEL_ID,
        threshold: float = DEFAULT_THRESHOLD,
        labels: list[str] | None = None,
    ):
        """Load the model and store the detection settings.

        Args:
            model_id: Identifier passed to ``GLiNER.from_pretrained``.
            threshold: Value forwarded as ``threshold`` to the model on every
                ``redact`` call.
            labels: Entity labels to request from the model. ``None`` or an
                empty list selects ``DEFAULT_LABELS``.
        """
        self.threshold = threshold
        # Copy the list so that mutating ``self.labels`` later can never alter
        # the caller's list or the module-level DEFAULT_LABELS.
        self.labels = list(labels or DEFAULT_LABELS)
        self._model = GLiNER.from_pretrained(model_id)

    def redact(self, text: str) -> RedactResult:
        """Detect PII in ``text`` and replace each span with ``[LABEL_UPPER]``.

        Args:
            text: The text to scan.

        Returns:
            A ``RedactResult`` holding the rewritten text, the number of
            replaced spans, and one metadata dict per span (``original``,
            ``label``, ``replacement``, ``confidence``) in order of appearance.
        """
        if text == "":
            # Nothing to scan; skip the model call entirely.
            return RedactResult(redacted_text="", redacted_count=0, redacted_items=[])

        raw_entities = self._model.predict_entities(text, self.labels, threshold=self.threshold)
        entities = _resolve_overlaps(raw_entities)

        parts: list[str] = []
        redacted_items: list[dict] = []
        cursor = 0  # index in ``text`` up to which output has been emitted

        for ent in entities:
            start, end = ent["start"], ent["end"]
            placeholder = f"[{ent['label'].upper()}]"
            # Emit the untouched text before the span, then the placeholder.
            parts.append(text[cursor:start])
            parts.append(placeholder)
            cursor = end
            redacted_items.append(
                {
                    "original": text[start:end],
                    "label": ent["label"],
                    "replacement": placeholder,
                    "confidence": ent["score"],
                }
            )

        # Trailing text after the last span (or the whole text if none matched).
        parts.append(text[cursor:])

        return RedactResult(
            redacted_text="".join(parts),
            redacted_count=len(entities),
            redacted_items=redacted_items,
        )
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
"""FastAPI service exposing PII redaction for OpenClaw."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import asyncio
|
|
6
|
+
import hmac
|
|
7
|
+
import os
|
|
8
|
+
from contextlib import asynccontextmanager
|
|
9
|
+
from typing import Annotated
|
|
10
|
+
|
|
11
|
+
from fastapi import Depends, FastAPI, HTTPException, Query, Security
|
|
12
|
+
from fastapi.security import HTTPAuthorizationCredentials, HTTPBearer
|
|
13
|
+
from pydantic import BaseModel
|
|
14
|
+
|
|
15
|
+
from clawguard.redactor import PIIRedactor, RedactResult
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
_raw_token = os.environ.get("CLAWGUARD_TOKEN", "")
|
|
19
|
+
if not _raw_token or _raw_token == "change-me":
|
|
20
|
+
raise RuntimeError(
|
|
21
|
+
"CLAWGUARD_TOKEN environment variable is not set or uses the default placeholder. "
|
|
22
|
+
"Generate a secret with: python3 -c \"import secrets; print(secrets.token_hex(32))\""
|
|
23
|
+
)
|
|
24
|
+
|
|
25
|
+
API_TOKEN: str = _raw_token
|
|
26
|
+
MODEL_ID: str = os.environ.get("MODEL_ID", "nvidia/gliner-PII")
|
|
27
|
+
THRESHOLD: float = float(os.environ.get("THRESHOLD", "0.5"))
|
|
28
|
+
MAX_TEXT_LENGTH: int = int(os.environ.get("MAX_TEXT_LENGTH", "50000"))
|
|
29
|
+
|
|
30
|
+
security = HTTPBearer()
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
def verify_token(
    credentials: Annotated[HTTPAuthorizationCredentials, Security(security)],
) -> str:
    """Validate the bearer token of a request against ``API_TOKEN``.

    Args:
        credentials: Bearer credentials extracted by the ``security`` scheme.

    Returns:
        The supplied token string when it matches.

    Raises:
        HTTPException: 401 when the supplied token does not match.
    """
    # hmac.compare_digest only accepts ASCII-only str arguments and raises
    # TypeError otherwise. Comparing encoded bytes keeps the timing-safe
    # comparison while making a non-ASCII token a plain mismatch (401)
    # instead of an unhandled exception. "surrogatepass" keeps encode() total.
    supplied = credentials.credentials.encode("utf-8", "surrogatepass")
    expected = API_TOKEN.encode("utf-8", "surrogatepass")
    if not hmac.compare_digest(supplied, expected):
        raise HTTPException(status_code=401, detail="Invalid token")
    return credentials.credentials
|
|
39
|
+
|
|
40
|
+
|
|
41
|
+
class TextRequest(BaseModel):
|
|
42
|
+
text: str
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
class RedactedItem(BaseModel):
|
|
46
|
+
label: str
|
|
47
|
+
replacement: str
|
|
48
|
+
confidence: float
|
|
49
|
+
original: str | None = None
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
class RedactResponse(BaseModel):
|
|
53
|
+
redacted_text: str
|
|
54
|
+
redacted_count: int
|
|
55
|
+
redacted_items: list[RedactedItem]
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
@asynccontextmanager
|
|
59
|
+
async def lifespan(app: FastAPI):
|
|
60
|
+
app.state.redactor = PIIRedactor(model_id=MODEL_ID, threshold=THRESHOLD)
|
|
61
|
+
yield
|
|
62
|
+
|
|
63
|
+
|
|
64
|
+
app = FastAPI(title="ClawGuard PII Redaction Service", version="0.1.0", lifespan=lifespan)
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
@app.get("/health")
|
|
68
|
+
async def health():
|
|
69
|
+
return {"status": "ok"}
|
|
70
|
+
|
|
71
|
+
|
|
72
|
+
@app.post("/redact", response_model=RedactResponse, dependencies=[Depends(verify_token)])
async def redact(
    req: TextRequest,
    include_original: Annotated[bool, Query()] = False,
) -> RedactResponse:
    """Redact PII from the request text.

    Empty text returns an empty response without touching the redactor. Text
    longer than ``MAX_TEXT_LENGTH`` characters is rejected with HTTP 413.
    Otherwise the redactor stored on ``app.state`` runs in a worker thread.
    The ``original`` field of each item is populated only when
    ``include_original`` is true.
    """
    text = req.text

    if not text:
        return RedactResponse(redacted_text="", redacted_count=0, redacted_items=[])

    if len(text) > MAX_TEXT_LENGTH:
        raise HTTPException(
            status_code=413,
            detail=f"Text exceeds maximum allowed length of {MAX_TEXT_LENGTH} characters.",
        )

    # Run the redactor off the event loop thread.
    result: RedactResult = await asyncio.to_thread(app.state.redactor.redact, text)

    items = []
    for entry in result.redacted_items:
        items.append(
            RedactedItem(
                label=entry["label"],
                replacement=entry["replacement"],
                confidence=entry["confidence"],
                original=entry["original"] if include_original else None,
            )
        )

    return RedactResponse(
        redacted_text=result.redacted_text,
        redacted_count=result.redacted_count,
        redacted_items=items,
    )
|
|
@@ -0,0 +1,189 @@
|
|
|
1
|
+
"""Tests for PIIRedactor."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
from unittest.mock import MagicMock, patch
|
|
6
|
+
|
|
7
|
+
import pytest
|
|
8
|
+
|
|
9
|
+
from clawguard.redactor import (
|
|
10
|
+
DEFAULT_LABELS,
|
|
11
|
+
DEFAULT_THRESHOLD,
|
|
12
|
+
PIIRedactor,
|
|
13
|
+
RedactResult,
|
|
14
|
+
_resolve_overlaps,
|
|
15
|
+
)
|
|
16
|
+
|
|
17
|
+
|
|
18
|
+
def _make_redactor(predict_return: list[dict]) -> PIIRedactor:
|
|
19
|
+
with patch("clawguard.redactor.GLiNER") as MockGLiNER:
|
|
20
|
+
mock_model = MagicMock()
|
|
21
|
+
MockGLiNER.from_pretrained.return_value = mock_model
|
|
22
|
+
mock_model.predict_entities.return_value = predict_return
|
|
23
|
+
redactor = PIIRedactor()
|
|
24
|
+
return redactor
|
|
25
|
+
|
|
26
|
+
|
|
27
|
+
class TestDefaults:
|
|
28
|
+
def test_default_labels_is_nonempty_list(self):
|
|
29
|
+
assert isinstance(DEFAULT_LABELS, list)
|
|
30
|
+
assert len(DEFAULT_LABELS) > 0
|
|
31
|
+
|
|
32
|
+
def test_loads_nvidia_model_by_default(self):
|
|
33
|
+
with patch("clawguard.redactor.GLiNER") as MockGLiNER:
|
|
34
|
+
MockGLiNER.from_pretrained.return_value = MagicMock()
|
|
35
|
+
PIIRedactor()
|
|
36
|
+
MockGLiNER.from_pretrained.assert_called_once_with("nvidia/gliner-PII")
|
|
37
|
+
|
|
38
|
+
def test_default_threshold_is_0_5(self):
|
|
39
|
+
with patch("clawguard.redactor.GLiNER") as MockGLiNER:
|
|
40
|
+
MockGLiNER.from_pretrained.return_value = MagicMock()
|
|
41
|
+
r = PIIRedactor()
|
|
42
|
+
assert r.threshold == 0.5
|
|
43
|
+
|
|
44
|
+
def test_default_threshold_constant(self):
|
|
45
|
+
assert DEFAULT_THRESHOLD == 0.5
|
|
46
|
+
|
|
47
|
+
|
|
48
|
+
class TestRedactCleanText:
|
|
49
|
+
def test_clean_text_unchanged(self):
|
|
50
|
+
redactor = _make_redactor([])
|
|
51
|
+
result = redactor.redact("What is the capital of France?")
|
|
52
|
+
assert result.redacted_text == "What is the capital of France?"
|
|
53
|
+
assert result.redacted_count == 0
|
|
54
|
+
assert result.redacted_items == []
|
|
55
|
+
|
|
56
|
+
def test_returns_redact_result(self):
|
|
57
|
+
redactor = _make_redactor([])
|
|
58
|
+
result = redactor.redact("hello")
|
|
59
|
+
assert isinstance(result, RedactResult)
|
|
60
|
+
|
|
61
|
+
|
|
62
|
+
class TestRedactSingleEntity:
|
|
63
|
+
def test_email_redacted(self):
|
|
64
|
+
redactor = _make_redactor([
|
|
65
|
+
{"text": "john@example.com", "label": "email", "start": 12, "end": 28, "score": 0.99},
|
|
66
|
+
])
|
|
67
|
+
result = redactor.redact("Contact me: john@example.com")
|
|
68
|
+
assert "john@example.com" not in result.redacted_text
|
|
69
|
+
assert "[EMAIL]" in result.redacted_text
|
|
70
|
+
assert result.redacted_count == 1
|
|
71
|
+
|
|
72
|
+
def test_phone_redacted(self):
|
|
73
|
+
redactor = _make_redactor([
|
|
74
|
+
{"text": "555-123-4567", "label": "phone_number", "start": 9, "end": 21, "score": 0.97},
|
|
75
|
+
])
|
|
76
|
+
result = redactor.redact("Call me: 555-123-4567")
|
|
77
|
+
assert "555-123-4567" not in result.redacted_text
|
|
78
|
+
assert "[PHONE_NUMBER]" in result.redacted_text
|
|
79
|
+
|
|
80
|
+
def test_ssn_redacted(self):
|
|
81
|
+
redactor = _make_redactor([
|
|
82
|
+
{"text": "123-45-6789", "label": "ssn", "start": 12, "end": 23, "score": 0.98},
|
|
83
|
+
])
|
|
84
|
+
result = redactor.redact("My SSN is: 123-45-6789")
|
|
85
|
+
assert "123-45-6789" not in result.redacted_text
|
|
86
|
+
assert "[SSN]" in result.redacted_text
|
|
87
|
+
|
|
88
|
+
def test_label_uppercased(self):
|
|
89
|
+
redactor = _make_redactor([
|
|
90
|
+
{"text": "foo@bar.com", "label": "email", "start": 0, "end": 11, "score": 0.95},
|
|
91
|
+
])
|
|
92
|
+
result = redactor.redact("foo@bar.com")
|
|
93
|
+
assert result.redacted_text == "[EMAIL]"
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
class TestRedactMultipleEntities:
|
|
97
|
+
def test_two_entities_both_redacted(self):
|
|
98
|
+
redactor = _make_redactor([
|
|
99
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
100
|
+
{"text": "555-123-4567", "label": "phone_number", "start": 20, "end": 32, "score": 0.97},
|
|
101
|
+
])
|
|
102
|
+
result = redactor.redact("john@example.com and 555-123-4567")
|
|
103
|
+
assert result.redacted_count == 2
|
|
104
|
+
assert "[EMAIL]" in result.redacted_text
|
|
105
|
+
assert "[PHONE_NUMBER]" in result.redacted_text
|
|
106
|
+
assert "john@example.com" not in result.redacted_text
|
|
107
|
+
assert "555-123-4567" not in result.redacted_text
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
class TestOverlapResolution:
|
|
111
|
+
def test_higher_confidence_wins_on_overlap(self):
|
|
112
|
+
redactor = _make_redactor([
|
|
113
|
+
{"text": "john", "label": "user_name", "start": 0, "end": 4, "score": 0.9},
|
|
114
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
115
|
+
])
|
|
116
|
+
result = redactor.redact("john@example.com")
|
|
117
|
+
assert result.redacted_text == "[EMAIL]"
|
|
118
|
+
assert result.redacted_count == 1
|
|
119
|
+
|
|
120
|
+
def test_nested_span_discarded(self):
|
|
121
|
+
redactor = _make_redactor([
|
|
122
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
123
|
+
{"text": "john", "label": "user_name", "start": 0, "end": 4, "score": 0.5},
|
|
124
|
+
])
|
|
125
|
+
result = redactor.redact("john@example.com")
|
|
126
|
+
assert result.redacted_text == "[EMAIL]"
|
|
127
|
+
assert result.redacted_count == 1
|
|
128
|
+
|
|
129
|
+
def test_non_overlapping_spans_both_kept(self):
|
|
130
|
+
redactor = _make_redactor([
|
|
131
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
132
|
+
{"text": "555-123-4567", "label": "phone_number", "start": 20, "end": 32, "score": 0.97},
|
|
133
|
+
])
|
|
134
|
+
result = redactor.redact("john@example.com and 555-123-4567")
|
|
135
|
+
assert result.redacted_count == 2
|
|
136
|
+
|
|
137
|
+
def test_resolve_overlaps_unit(self):
|
|
138
|
+
entities = [
|
|
139
|
+
{"text": "john", "label": "user_name", "start": 0, "end": 4, "score": 0.9},
|
|
140
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
141
|
+
{"text": "555-123-4567", "label": "phone_number", "start": 20, "end": 32, "score": 0.97},
|
|
142
|
+
]
|
|
143
|
+
kept = _resolve_overlaps(entities)
|
|
144
|
+
assert len(kept) == 2
|
|
145
|
+
assert kept[0]["label"] == "email"
|
|
146
|
+
assert kept[1]["label"] == "phone_number"
|
|
147
|
+
|
|
148
|
+
|
|
149
|
+
class TestRedactItems:
|
|
150
|
+
def test_redacted_items_contain_metadata(self):
|
|
151
|
+
redactor = _make_redactor([
|
|
152
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
153
|
+
])
|
|
154
|
+
result = redactor.redact("john@example.com")
|
|
155
|
+
assert len(result.redacted_items) == 1
|
|
156
|
+
item = result.redacted_items[0]
|
|
157
|
+
assert item["original"] == "john@example.com"
|
|
158
|
+
assert item["label"] == "email"
|
|
159
|
+
assert item["replacement"] == "[EMAIL]"
|
|
160
|
+
assert item["confidence"] == 0.99
|
|
161
|
+
|
|
162
|
+
def test_redacted_items_ordered_by_appearance(self):
|
|
163
|
+
redactor = _make_redactor([
|
|
164
|
+
{"text": "555-123-4567", "label": "phone_number", "start": 20, "end": 32, "score": 0.97},
|
|
165
|
+
{"text": "john@example.com", "label": "email", "start": 0, "end": 16, "score": 0.99},
|
|
166
|
+
])
|
|
167
|
+
result = redactor.redact("john@example.com and 555-123-4567")
|
|
168
|
+
assert result.redacted_items[0]["label"] == "email"
|
|
169
|
+
assert result.redacted_items[1]["label"] == "phone_number"
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
class TestRedactPassesConfig:
|
|
173
|
+
def test_custom_threshold_passed_to_model(self):
|
|
174
|
+
with patch("clawguard.redactor.GLiNER") as MockGLiNER:
|
|
175
|
+
mock_model = MagicMock()
|
|
176
|
+
MockGLiNER.from_pretrained.return_value = mock_model
|
|
177
|
+
mock_model.predict_entities.return_value = []
|
|
178
|
+
redactor = PIIRedactor(threshold=0.8)
|
|
179
|
+
|
|
180
|
+
redactor.redact("test text")
|
|
181
|
+
args, kwargs = mock_model.predict_entities.call_args
|
|
182
|
+
threshold = kwargs.get("threshold") or (args[2] if len(args) > 2 else None)
|
|
183
|
+
assert threshold == 0.8
|
|
184
|
+
|
|
185
|
+
def test_custom_model_id_used(self):
|
|
186
|
+
with patch("clawguard.redactor.GLiNER") as MockGLiNER:
|
|
187
|
+
MockGLiNER.from_pretrained.return_value = MagicMock()
|
|
188
|
+
PIIRedactor(model_id="custom/model")
|
|
189
|
+
MockGLiNER.from_pretrained.assert_called_once_with("custom/model")
|
|
@@ -0,0 +1,207 @@
|
|
|
1
|
+
"""Tests for the FastAPI server."""
|
|
2
|
+
|
|
3
|
+
from __future__ import annotations
|
|
4
|
+
|
|
5
|
+
import importlib
|
|
6
|
+
from unittest.mock import MagicMock, patch
|
|
7
|
+
|
|
8
|
+
import pytest
|
|
9
|
+
from fastapi.testclient import TestClient
|
|
10
|
+
|
|
11
|
+
from clawguard.redactor import RedactResult
|
|
12
|
+
|
|
13
|
+
|
|
14
|
+
@pytest.fixture
def client(monkeypatch):
    """TestClient whose app uses a mock redactor that reports one email."""
    monkeypatch.setenv("CLAWGUARD_TOKEN", "test-token")

    mock_redactor = MagicMock()
    mock_redactor.redact.return_value = RedactResult(
        redacted_text="Contact [EMAIL]",
        redacted_count=1,
        redacted_items=[{
            "original": "john@example.com",
            "label": "email",
            "replacement": "[EMAIL]",
            "confidence": 0.99,
        }],
    )

    import clawguard.server as srv

    # Reload BEFORE patching. reload() re-executes the module's
    # `from clawguard.redactor import PIIRedactor`, which would rebind the name
    # and silently discard a patch applied earlier, so the lifespan would
    # build the real redactor.
    importlib.reload(srv)
    with patch.object(srv, "PIIRedactor", return_value=mock_redactor):
        with TestClient(srv.app) as tc:
            yield tc
|
|
35
|
+
|
|
36
|
+
|
|
37
|
+
@pytest.fixture
def clean_client(monkeypatch):
    """Client whose mock redactor returns no PII."""
    monkeypatch.setenv("CLAWGUARD_TOKEN", "test-token")

    mock_redactor = MagicMock()
    mock_redactor.redact.return_value = RedactResult(
        redacted_text="Hello world",
        redacted_count=0,
        redacted_items=[],
    )

    import clawguard.server as srv

    # Reload BEFORE patching. reload() re-executes the module's
    # `from clawguard.redactor import PIIRedactor`, which would rebind the name
    # and silently discard a patch applied earlier.
    importlib.reload(srv)
    with patch.object(srv, "PIIRedactor", return_value=mock_redactor):
        with TestClient(srv.app) as tc:
            yield tc
|
|
54
|
+
|
|
55
|
+
|
|
56
|
+
class TestHealth:
|
|
57
|
+
def test_health_returns_ok(self, client):
|
|
58
|
+
resp = client.get("/health")
|
|
59
|
+
assert resp.status_code == 200
|
|
60
|
+
assert resp.json()["status"] == "ok"
|
|
61
|
+
|
|
62
|
+
def test_health_requires_no_auth(self, client):
|
|
63
|
+
resp = client.get("/health")
|
|
64
|
+
assert resp.status_code == 200
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
class TestRedactEndpoint:
    """Behaviour of POST /redact for authenticated requests."""

    # Shared request pieces; the token matches the one the fixtures export.
    AUTH = {"Authorization": "Bearer test-token"}
    PAYLOAD = {"text": "Contact john@example.com"}

    def test_valid_request_returns_200(self, client):
        resp = client.post("/redact", json=self.PAYLOAD, headers=self.AUTH)
        assert resp.status_code == 200

    def test_response_shape(self, client):
        resp = client.post("/redact", json=self.PAYLOAD, headers=self.AUTH)
        data = resp.json()
        assert "redacted_text" in data
        assert "redacted_count" in data
        assert "redacted_items" in data

    def test_redacted_text_returned(self, client):
        resp = client.post("/redact", json=self.PAYLOAD, headers=self.AUTH)
        assert resp.json()["redacted_text"] == "Contact [EMAIL]"

    def test_redacted_count_returned(self, client):
        resp = client.post("/redact", json=self.PAYLOAD, headers=self.AUTH)
        assert resp.json()["redacted_count"] == 1

    def test_empty_text_short_circuits(self, clean_client):
        resp = clean_client.post("/redact", json={"text": ""}, headers=self.AUTH)
        assert resp.status_code == 200
        data = resp.json()
        assert data["redacted_text"] == ""
        assert data["redacted_count"] == 0
        assert data["redacted_items"] == []

    def test_oversized_text_rejected(self, monkeypatch):
        monkeypatch.setenv("CLAWGUARD_TOKEN", "test-token")
        monkeypatch.setenv("MAX_TEXT_LENGTH", "10")

        mock_redactor = MagicMock()
        mock_redactor.redact.return_value = RedactResult("", 0, [])

        import clawguard.server as srv

        # Reload first so MAX_TEXT_LENGTH is re-read from the environment,
        # then patch: reload() rebinds PIIRedactor and would otherwise undo
        # a patch applied before it.
        importlib.reload(srv)
        with patch.object(srv, "PIIRedactor", return_value=mock_redactor):
            with TestClient(srv.app) as tc:
                resp = tc.post("/redact", json={"text": "x" * 11}, headers=self.AUTH)
        assert resp.status_code == 413
|
|
132
|
+
|
|
133
|
+
|
|
134
|
+
class TestOriginalField:
|
|
135
|
+
def test_original_omitted_by_default(self, client):
|
|
136
|
+
resp = client.post(
|
|
137
|
+
"/redact",
|
|
138
|
+
json={"text": "Contact john@example.com"},
|
|
139
|
+
headers={"Authorization": "Bearer test-token"},
|
|
140
|
+
)
|
|
141
|
+
items = resp.json()["redacted_items"]
|
|
142
|
+
assert len(items) == 1
|
|
143
|
+
assert items[0].get("original") is None
|
|
144
|
+
|
|
145
|
+
def test_original_included_when_requested(self, client):
|
|
146
|
+
resp = client.post(
|
|
147
|
+
"/redact?include_original=true",
|
|
148
|
+
json={"text": "Contact john@example.com"},
|
|
149
|
+
headers={"Authorization": "Bearer test-token"},
|
|
150
|
+
)
|
|
151
|
+
items = resp.json()["redacted_items"]
|
|
152
|
+
assert len(items) == 1
|
|
153
|
+
assert items[0]["original"] == "john@example.com"
|
|
154
|
+
|
|
155
|
+
|
|
156
|
+
class TestAuth:
|
|
157
|
+
def test_missing_auth_rejected(self, client):
|
|
158
|
+
resp = client.post("/redact", json={"text": "hello"})
|
|
159
|
+
assert resp.status_code in (401, 403)
|
|
160
|
+
|
|
161
|
+
def test_wrong_token_rejected(self, client):
|
|
162
|
+
resp = client.post(
|
|
163
|
+
"/redact",
|
|
164
|
+
json={"text": "hello"},
|
|
165
|
+
headers={"Authorization": "Bearer wrong-token"},
|
|
166
|
+
)
|
|
167
|
+
assert resp.status_code == 401
|
|
168
|
+
|
|
169
|
+
def test_correct_token_accepted(self, client):
|
|
170
|
+
resp = client.post(
|
|
171
|
+
"/redact",
|
|
172
|
+
json={"text": "hello"},
|
|
173
|
+
headers={"Authorization": "Bearer test-token"},
|
|
174
|
+
)
|
|
175
|
+
assert resp.status_code == 200
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
class TestStartupGuard:
|
|
179
|
+
def test_missing_token_raises_on_import(self, monkeypatch):
|
|
180
|
+
monkeypatch.delenv("CLAWGUARD_TOKEN", raising=False)
|
|
181
|
+
import clawguard.server as srv
|
|
182
|
+
with pytest.raises(RuntimeError, match="CLAWGUARD_TOKEN"):
|
|
183
|
+
importlib.reload(srv)
|
|
184
|
+
|
|
185
|
+
def test_default_placeholder_raises_on_import(self, monkeypatch):
|
|
186
|
+
monkeypatch.setenv("CLAWGUARD_TOKEN", "change-me")
|
|
187
|
+
import clawguard.server as srv
|
|
188
|
+
with pytest.raises(RuntimeError, match="CLAWGUARD_TOKEN"):
|
|
189
|
+
importlib.reload(srv)
|
|
190
|
+
|
|
191
|
+
|
|
192
|
+
class TestValidation:
|
|
193
|
+
def test_missing_text_field_rejected(self, client):
|
|
194
|
+
resp = client.post(
|
|
195
|
+
"/redact",
|
|
196
|
+
json={},
|
|
197
|
+
headers={"Authorization": "Bearer test-token"},
|
|
198
|
+
)
|
|
199
|
+
assert resp.status_code == 422
|
|
200
|
+
|
|
201
|
+
def test_empty_text_accepted(self, client):
|
|
202
|
+
resp = client.post(
|
|
203
|
+
"/redact",
|
|
204
|
+
json={"text": ""},
|
|
205
|
+
headers={"Authorization": "Bearer test-token"},
|
|
206
|
+
)
|
|
207
|
+
assert resp.status_code == 200
|