sether 0.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- sether-0.1.0/LICENSE +21 -0
- sether-0.1.0/PKG-INFO +380 -0
- sether-0.1.0/README.md +334 -0
- sether-0.1.0/pyproject.toml +59 -0
- sether-0.1.0/setup.cfg +4 -0
- sether-0.1.0/src/sether/__init__.py +148 -0
- sether-0.1.0/src/sether/audit/__init__.py +20 -0
- sether-0.1.0/src/sether/audit/sinks.py +48 -0
- sether-0.1.0/src/sether/audit/types.py +104 -0
- sether-0.1.0/src/sether/core.py +97 -0
- sether-0.1.0/src/sether/detectors/__init__.py +63 -0
- sether-0.1.0/src/sether/detectors/basic.py +286 -0
- sether-0.1.0/src/sether/detectors/identity.py +402 -0
- sether-0.1.0/src/sether/detectors/secrets.py +195 -0
- sether-0.1.0/src/sether/detectors/types.py +49 -0
- sether-0.1.0/src/sether/middleware/__init__.py +15 -0
- sether-0.1.0/src/sether/middleware/_common.py +60 -0
- sether-0.1.0/src/sether/middleware/anthropic.py +137 -0
- sether-0.1.0/src/sether/middleware/asgi.py +117 -0
- sether-0.1.0/src/sether/middleware/httpx_client.py +115 -0
- sether-0.1.0/src/sether/middleware/openai.py +132 -0
- sether-0.1.0/src/sether/middleware/wsgi.py +106 -0
- sether-0.1.0/src/sether/py.typed +0 -0
- sether-0.1.0/src/sether/stream/__init__.py +49 -0
- sether-0.1.0/src/sether/stream/_tokens.py +41 -0
- sether-0.1.0/src/sether/stream/redact.py +234 -0
- sether-0.1.0/src/sether/stream/restore.py +109 -0
- sether-0.1.0/src/sether/stream/sse.py +112 -0
- sether-0.1.0/src/sether/vault/__init__.py +6 -0
- sether-0.1.0/src/sether/vault/memory.py +81 -0
- sether-0.1.0/src/sether/vault/types.py +34 -0
- sether-0.1.0/src/sether.egg-info/PKG-INFO +380 -0
- sether-0.1.0/src/sether.egg-info/SOURCES.txt +47 -0
- sether-0.1.0/src/sether.egg-info/dependency_links.txt +1 -0
- sether-0.1.0/src/sether.egg-info/requires.txt +20 -0
- sether-0.1.0/src/sether.egg-info/top_level.txt +1 -0
- sether-0.1.0/tests/test_audit.py +76 -0
- sether-0.1.0/tests/test_detectors_basic.py +96 -0
- sether-0.1.0/tests/test_detectors_identity.py +83 -0
- sether-0.1.0/tests/test_detectors_secrets.py +64 -0
- sether-0.1.0/tests/test_middleware_anthropic.py +48 -0
- sether-0.1.0/tests/test_middleware_asgi.py +96 -0
- sether-0.1.0/tests/test_middleware_httpx.py +71 -0
- sether-0.1.0/tests/test_middleware_openai.py +81 -0
- sether-0.1.0/tests/test_middleware_wsgi.py +60 -0
- sether-0.1.0/tests/test_sse.py +45 -0
- sether-0.1.0/tests/test_stream.py +122 -0
- sether-0.1.0/tests/test_sync.py +29 -0
- sether-0.1.0/tests/test_vault_memory.py +48 -0
sether-0.1.0/LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
MIT License
|
|
2
|
+
|
|
3
|
+
Copyright (c) 2026 Godfrey Lebo / Raeven Company LTD
|
|
4
|
+
|
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
6
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
7
|
+
in the Software without restriction, including without limitation the rights
|
|
8
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
9
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
10
|
+
furnished to do so, subject to the following conditions:
|
|
11
|
+
|
|
12
|
+
The above copyright notice and this permission notice shall be included in all
|
|
13
|
+
copies or substantial portions of the Software.
|
|
14
|
+
|
|
15
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
17
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
18
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
19
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
20
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
21
|
+
SOFTWARE.
|
sether-0.1.0/PKG-INFO
ADDED
|
@@ -0,0 +1,380 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sether
|
|
3
|
+
Version: 0.1.0
|
|
4
|
+
Summary: Streaming PII redaction for AI applications. The hiding place for sensitive data flowing into LLMs. Secrets pack, SSE-aware streaming, audit events, drop-in middlewares for httpx / ASGI / WSGI / OpenAI / Anthropic.
|
|
5
|
+
Author-email: Godfrey Lebo <emorylebo@gmail.com>
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://setherai.vercel.app
|
|
8
|
+
Project-URL: Documentation, https://setherai.vercel.app/docs/python
|
|
9
|
+
Project-URL: Sandbox, https://setherai.vercel.app/#sandbox
|
|
10
|
+
Project-URL: Repository, https://github.com/raeven-co/sether
|
|
11
|
+
Project-URL: Issues, https://github.com/raeven-co/sether/issues
|
|
12
|
+
Project-URL: Changelog, https://github.com/raeven-co/sether/blob/main/CHANGELOG.md
|
|
13
|
+
Keywords: ai,security,pii,redaction,stream,sse,llm,privacy,gdpr,hipaa,compliance,openai,anthropic,secrets,guardrails,agentic,mcp
|
|
14
|
+
Classifier: Development Status :: 4 - Beta
|
|
15
|
+
Classifier: Intended Audience :: Developers
|
|
16
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
17
|
+
Classifier: Operating System :: OS Independent
|
|
18
|
+
Classifier: Programming Language :: Python :: 3
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
20
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
21
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
22
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
23
|
+
Classifier: Programming Language :: Python :: 3.13
|
|
24
|
+
Classifier: Topic :: Security
|
|
25
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
26
|
+
Classifier: Typing :: Typed
|
|
27
|
+
Requires-Python: >=3.9
|
|
28
|
+
Description-Content-Type: text/markdown
|
|
29
|
+
License-File: LICENSE
|
|
30
|
+
Requires-Dist: phonenumbers>=8.13
|
|
31
|
+
Provides-Extra: openai
|
|
32
|
+
Requires-Dist: openai>=1.0; extra == "openai"
|
|
33
|
+
Provides-Extra: anthropic
|
|
34
|
+
Requires-Dist: anthropic>=0.39; extra == "anthropic"
|
|
35
|
+
Provides-Extra: httpx
|
|
36
|
+
Requires-Dist: httpx>=0.27; extra == "httpx"
|
|
37
|
+
Provides-Extra: all
|
|
38
|
+
Requires-Dist: openai>=1.0; extra == "all"
|
|
39
|
+
Requires-Dist: anthropic>=0.39; extra == "all"
|
|
40
|
+
Requires-Dist: httpx>=0.27; extra == "all"
|
|
41
|
+
Provides-Extra: dev
|
|
42
|
+
Requires-Dist: pytest>=7; extra == "dev"
|
|
43
|
+
Requires-Dist: phonenumbers>=8.13; extra == "dev"
|
|
44
|
+
Requires-Dist: httpx>=0.27; extra == "dev"
|
|
45
|
+
Dynamic: license-file
|
|
46
|
+
|
|
47
|
+
# Sether (Python)
|
|
48
|
+
|
|
49
|
+
> **Hide personal data from your AI before it reaches any LLM provider.**
|
|
50
|
+
>
|
|
51
|
+
> Named for the Hebrew *sether* (סֵתֶר), *the hiding place*. Psalm 32:7.
|
|
52
|
+
|
|
53
|
+
Sether is a streaming PII-redaction layer that sits between your application
|
|
54
|
+
and any LLM API. It detects sensitive data (email, phone, SSN, credit card,
|
|
55
|
+
IBAN, IP addresses, secrets, and labelled identity fields), swaps each match
|
|
56
|
+
for a stable token before the request leaves your boundary, then restores the
|
|
57
|
+
original values transparently in the response.
|
|
58
|
+
|
|
59
|
+
This is the Python port of [`@raeven-co/sether`](https://www.npmjs.com/package/@raeven-co/sether).
|
|
60
|
+
Same detection engine, same token format, same chunk-boundary streaming safety,
|
|
61
|
+
ported faithfully to Python with both synchronous and asynchronous streaming and
|
|
62
|
+
drop-in integrations for **httpx, ASGI (FastAPI / Starlette), WSGI (Flask),
|
|
63
|
+
the OpenAI SDK, and the Anthropic SDK**.
|
|
64
|
+
|
|
65
|
+
A product of **Raeven Company LTD**.
|
|
66
|
+
|
|
67
|
+
---
|
|
68
|
+
|
|
69
|
+
## Why this exists
|
|
70
|
+
|
|
71
|
+
If your application sends a customer's email, phone number, or any other PII to
|
|
72
|
+
an LLM provider, that is a sub-processor disclosure under GDPR Article 28.
|
|
73
|
+
Credit-card data pulls you into PCI DSS scope. Health identifiers trigger HIPAA.
|
|
74
|
+
Sether stops the leak at the boundary: sensitive substrings become stable tokens
|
|
75
|
+
before the bytes leave your process, and `restore()` swaps them back so your
|
|
76
|
+
application code does not need to branch on redacted text.
|
|
77
|
+
|
|
78
|
+
**This package does not phone home.** Streams are not sent to Raeven. The vault
|
|
79
|
+
stays in your process (or your own backing store if you implement `Vault`).
|
|
80
|
+
|
|
81
|
+
---
|
|
82
|
+
|
|
83
|
+
## Install
|
|
84
|
+
|
|
85
|
+
```bash
|
|
86
|
+
pip install sether
|
|
87
|
+
```
|
|
88
|
+
|
|
89
|
+
Requires Python 3.9+. The phone detector uses [`phonenumbers`](https://pypi.org/project/phonenumbers/)
|
|
90
|
+
(installed automatically). Integration extras are optional:
|
|
91
|
+
|
|
92
|
+
```bash
|
|
93
|
+
pip install "sether[openai]" # wrap_openai
|
|
94
|
+
pip install "sether[anthropic]" # wrap_anthropic
|
|
95
|
+
pip install "sether[httpx]" # wrap_httpx
|
|
96
|
+
pip install "sether[all]" # all of the above
|
|
97
|
+
```
|
|
98
|
+
|
|
99
|
+
The ASGI and WSGI middlewares have no extra dependency.
|
|
100
|
+
|
|
101
|
+
---
|
|
102
|
+
|
|
103
|
+
## 60-second quickstart
|
|
104
|
+
|
|
105
|
+
```python
|
|
106
|
+
from sether import Sether
|
|
107
|
+
|
|
108
|
+
sether = Sether()
|
|
109
|
+
|
|
110
|
+
# Outgoing: redact before sending to the LLM.
|
|
111
|
+
safe = sether.redact_sync("my email is alice@example.com")
|
|
112
|
+
# -> "my email is <EMAIL_xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx>"
|
|
113
|
+
|
|
114
|
+
# Incoming: restore before showing the user.
|
|
115
|
+
back = sether.restore_sync(safe)
|
|
116
|
+
# -> "my email is alice@example.com"
|
|
117
|
+
```
|
|
118
|
+
|
|
119
|
+
The same `Sether` instance shares its vault between redaction and restoration,
|
|
120
|
+
which is how the round-trip identity is preserved.
|
|
121
|
+
|
|
122
|
+
### Streaming (sync and async)
|
|
123
|
+
|
|
124
|
+
```python
|
|
125
|
+
sether = Sether()
|
|
126
|
+
|
|
127
|
+
# Synchronous: any iterable of text chunks.
|
|
128
|
+
def chunks():
|
|
129
|
+
yield "Contact alice@"
|
|
130
|
+
yield "example.com or call "
|
|
131
|
+
yield "+1 415 555 2671."
|
|
132
|
+
|
|
133
|
+
redacted = "".join(sether.redact_stream(chunks()))
|
|
134
|
+
restored = "".join(sether.restore_stream([redacted]))
|
|
135
|
+
|
|
136
|
+
# Asynchronous: any async iterable (e.g. an LLM token stream).
|
|
137
|
+
async def run(llm_stream):
|
|
138
|
+
async for piece in sether.aredact_stream(llm_stream):
|
|
139
|
+
await forward_to_model(piece)
|
|
140
|
+
```
|
|
141
|
+
|
|
142
|
+
The redact stream holds back `safe_distance_bytes` (default 256) at the tail of
|
|
143
|
+
each chunk so a PII pattern crossing a chunk boundary is still detected when the
|
|
144
|
+
next chunk arrives. A long whitespace-free value (a JWT, an API key) is held
|
|
145
|
+
back up to `max(safe_distance_bytes * 4, 8192)` bytes so it is never emitted
|
|
146
|
+
partially across a boundary. For values larger than that bound, raise
|
|
147
|
+
`safe_distance_bytes` or use `redact_sync` on complete payloads. Redact/restore round-trip
|
|
148
|
+
identity is verified by property-based tests over 60 random chunk partitions.
|
|
149
|
+
|
|
150
|
+
---
|
|
151
|
+
|
|
152
|
+
## Detectors
|
|
153
|
+
|
|
154
|
+
By default Sether runs the **basic pack**. Pass an explicit list to narrow scope,
|
|
155
|
+
or add the opt-in packs.
|
|
156
|
+
|
|
157
|
+
```python
|
|
158
|
+
from sether import Sether, basic_detectors, secrets_detectors, identity_detectors
|
|
159
|
+
|
|
160
|
+
sether = Sether(detectors=[*basic_detectors, *secrets_detectors, *identity_detectors])
|
|
161
|
+
```
|
|
162
|
+
|
|
163
|
+
### Basic pack (`basic_detectors`)
|
|
164
|
+
|
|
165
|
+
| Detector | Type | Method |
|
|
166
|
+
| --- | --- | --- |
|
|
167
|
+
| `email_detector` | `EMAIL` | RFC-5321-style regex. ASCII-only. |
|
|
168
|
+
| `phone_detector` | `PHONE` | `phonenumbers` (international format). |
|
|
169
|
+
| `credit_card_detector` | `CC` | Bounded regex + Luhn check. |
|
|
170
|
+
| `ssn_detector` | `SSN` | Regex + SSA invalid-prefix rules. |
|
|
171
|
+
| `ipv4_detector` | `IPV4` | Strict octet-bounded regex. |
|
|
172
|
+
| `ipv6_detector` | `IPV6` | Candidate regex + in-tree validator. |
|
|
173
|
+
| `iban_detector` | `IBAN` | Regex + mod-97 checksum. |
|
|
174
|
+
|
|
175
|
+
### Secrets pack (`secrets_detectors`)
|
|
176
|
+
|
|
177
|
+
`aws_access_key_detector`, `openai_key_detector`, `anthropic_key_detector`,
|
|
178
|
+
`github_pat_detector` (classic + fine-grained), `slack_token_detector`,
|
|
179
|
+
`stripe_key_detector`, `jwt_detector`, `high_entropy_detector`
|
|
180
|
+
(Shannon entropy >= 3.5 bits/char).
|
|
181
|
+
|
|
182
|
+
### Identity pack (`identity_detectors`, opt-in)
|
|
183
|
+
|
|
184
|
+
Label-anchored detection for names, dates of birth, passport numbers, and
|
|
185
|
+
addresses. A value is redacted only when it appears with the label that
|
|
186
|
+
introduces it (`Name:`, `DOB:`, `Passport No:`, `Address:`) or, for a few
|
|
187
|
+
distinctive standalone shapes (a street line with a house number, a UK
|
|
188
|
+
postcode), a structure strong enough to keep false positives low. Labels are
|
|
189
|
+
recognised across many languages (Latin-script plus CJK, Cyrillic, Arabic), and
|
|
190
|
+
value capture is Unicode-aware.
|
|
191
|
+
|
|
192
|
+
Free-text NER (unlabelled names, organisations, locations in running prose) is
|
|
193
|
+
not covered here; that is the separate `sether-ner` roadmap item.
|
|
194
|
+
|
|
195
|
+
### Custom detectors
|
|
196
|
+
|
|
197
|
+
Anything with a `type` string and a `detect(text)` method works:
|
|
198
|
+
|
|
199
|
+
```python
|
|
200
|
+
import re
|
|
201
|
+
from sether import DetectorMatch
|
|
202
|
+
|
|
203
|
+
class OrderIdDetector:
|
|
204
|
+
type = "ORDER_ID"
|
|
205
|
+
_re = re.compile(r"\bORD-\d{8}\b")
|
|
206
|
+
|
|
207
|
+
def detect(self, text):
|
|
208
|
+
return [DetectorMatch(m.start(), m.end(), m.group(0)) for m in self._re.finditer(text)]
|
|
209
|
+
```
|
|
210
|
+
|
|
211
|
+
---
|
|
212
|
+
|
|
213
|
+
## Token vault
|
|
214
|
+
|
|
215
|
+
Tokens map back to originals through a vault. Sether ships an in-memory LRU
|
|
216
|
+
vault (10,000 entries, 1-hour TTL by default). Implement the `Vault` protocol to
|
|
217
|
+
change eviction, encrypt at rest, or namespace tokens per tenant.
|
|
218
|
+
|
|
219
|
+
```python
|
|
220
|
+
from sether import Vault # a runtime-checkable Protocol: set/get/has/delete/size/clear
|
|
221
|
+
|
|
222
|
+
class NamespacedVault:
|
|
223
|
+
def __init__(self, prefix):
|
|
224
|
+
self._store = {}
|
|
225
|
+
self._prefix = prefix
|
|
226
|
+
def set(self, token, value): self._store[self._prefix + token] = value
|
|
227
|
+
def get(self, token): return self._store.get(self._prefix + token)
|
|
228
|
+
def has(self, token): return (self._prefix + token) in self._store
|
|
229
|
+
def delete(self, token): return self._store.pop(self._prefix + token, None) is not None
|
|
230
|
+
def size(self): return len(self._store)
|
|
231
|
+
def clear(self): self._store.clear()
|
|
232
|
+
|
|
233
|
+
sether = Sether(vault=NamespacedVault("tenant-42:"))
|
|
234
|
+
```
|
|
235
|
+
|
|
236
|
+
The `Vault` interface is **synchronous**: restore substitutes tokens as bytes
|
|
237
|
+
flow through and cannot `await` a lookup per token. Front an async store (Redis,
|
|
238
|
+
Postgres) with a synchronous in-process cache you hydrate before the restore
|
|
239
|
+
pass, or keep the vault in-process.
|
|
240
|
+
|
|
241
|
+
---
|
|
242
|
+
|
|
243
|
+
## SSE / JSON-stream mode
|
|
244
|
+
|
|
245
|
+
OpenAI and Anthropic streaming responses come back as Server-Sent Events. The
|
|
246
|
+
SSE-aware stream redacts payloads inside `data:` lines while preserving the
|
|
247
|
+
`data:` / `event:` / `id:` / `retry:` framing and blank-line separators
|
|
248
|
+
verbatim.
|
|
249
|
+
|
|
250
|
+
```python
|
|
251
|
+
from sether import create_sse_redact_stream, basic_detectors, MemoryVault
|
|
252
|
+
|
|
253
|
+
vault = MemoryVault()
|
|
254
|
+
stream = create_sse_redact_stream(basic_detectors, vault)
|
|
255
|
+
out = stream.feed(sse_chunk) + stream.finish()
|
|
256
|
+
# or the iterator helpers: sse_redact_iter(chunks, detectors, vault)
|
|
257
|
+
```
|
|
258
|
+
|
|
259
|
+
---
|
|
260
|
+
|
|
261
|
+
## Drop-in integrations
|
|
262
|
+
|
|
263
|
+
```python
|
|
264
|
+
from sether import Sether
|
|
265
|
+
sether = Sether()
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
### httpx
|
|
269
|
+
|
|
270
|
+
```python
|
|
271
|
+
import httpx
|
|
272
|
+
from sether import wrap_httpx
|
|
273
|
+
|
|
274
|
+
client = wrap_httpx(httpx.Client(), detectors=sether.detectors, vault=sether.vault)
|
|
275
|
+
r = client.post("https://api.example.com/v1/chat",
|
|
276
|
+
json={"q": "email alice@example.com"})
|
|
277
|
+
# The request carried <EMAIL_...>; any tokens the server echoed back are restored in r.text.
|
|
278
|
+
```
|
|
279
|
+
|
|
280
|
+
Works on `httpx.Client` and `httpx.AsyncClient`. Binary bodies pass through
|
|
281
|
+
untouched.
|
|
282
|
+
|
|
283
|
+
### ASGI (FastAPI / Starlette)
|
|
284
|
+
|
|
285
|
+
```python
|
|
286
|
+
from fastapi import FastAPI
|
|
287
|
+
from sether import SetherASGIMiddleware
|
|
288
|
+
|
|
289
|
+
app = FastAPI()
|
|
290
|
+
app.add_middleware(SetherASGIMiddleware, detectors=sether.detectors, vault=sether.vault)
|
|
291
|
+
```
|
|
292
|
+
|
|
293
|
+
### WSGI (Flask)
|
|
294
|
+
|
|
295
|
+
```python
|
|
296
|
+
from flask import Flask
|
|
297
|
+
from sether import SetherWSGIMiddleware
|
|
298
|
+
|
|
299
|
+
app = Flask(__name__)
|
|
300
|
+
app.wsgi_app = SetherWSGIMiddleware(app.wsgi_app, detectors=sether.detectors, vault=sether.vault)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
### OpenAI SDK
|
|
304
|
+
|
|
305
|
+
```python
|
|
306
|
+
from openai import OpenAI
|
|
307
|
+
from sether import wrap_openai
|
|
308
|
+
|
|
309
|
+
client = wrap_openai(OpenAI(), detectors=sether.detectors, vault=sether.vault)
|
|
310
|
+
# Redacts messages out, restores choices back. Sync, async, and streaming clients.
|
|
311
|
+
```
|
|
312
|
+
|
|
313
|
+
### Anthropic SDK
|
|
314
|
+
|
|
315
|
+
```python
|
|
316
|
+
import anthropic
|
|
317
|
+
from sether import wrap_anthropic
|
|
318
|
+
|
|
319
|
+
client = wrap_anthropic(anthropic.Anthropic(), detectors=sether.detectors, vault=sether.vault)
|
|
320
|
+
# Redacts messages/system out, restores content blocks back.
|
|
321
|
+
```
|
|
322
|
+
|
|
323
|
+
The SDK wrappers are **structurally typed**. Sether never imports `openai` or
|
|
324
|
+
`anthropic`; any object matching the `chat.completions.create` /
|
|
325
|
+
`messages.create` shape works.
|
|
326
|
+
|
|
327
|
+
---
|
|
328
|
+
|
|
329
|
+
## Audit events
|
|
330
|
+
|
|
331
|
+
Each redaction can be described by a structured `AuditEvent` that maps to the
|
|
332
|
+
regulation it satisfies (GDPR Art. 28, SOC 2 CC6.7, HIPAA, PCI DSS, and more;
|
|
333
|
+
see `DEFAULT_REGULATION_MAPPINGS`). **The original value is never carried in an
|
|
334
|
+
event, only its length.** The JSON wire shape matches the TypeScript package
|
|
335
|
+
(camelCase keys) so events are interchangeable across both.
|
|
336
|
+
|
|
337
|
+
```python
|
|
338
|
+
from sether import AuditEvent, ConsoleAuditSink, MemoryAuditSink
|
|
339
|
+
|
|
340
|
+
sink = ConsoleAuditSink() # JSONL to stderr; MemoryAuditSink accumulates for tests
|
|
341
|
+
sink.write(AuditEvent(timestamp="...", detector="EMAIL", value_length=17, token="<EMAIL_x>"))
|
|
342
|
+
```
|
|
343
|
+
|
|
344
|
+
---
|
|
345
|
+
|
|
346
|
+
## Honest limitations
|
|
347
|
+
|
|
348
|
+
These match the TypeScript package's documented limits:
|
|
349
|
+
|
|
350
|
+
- **Email detection is ASCII-only.** Internationalized (IDN) domains and Unicode local parts do not match.
|
|
351
|
+
- **IPv6 `::1` (loopback) is not detected.** The candidate regex requires 4+
|
|
352
|
+
characters, and `::1` has only three. Loopback is not customer PII.
|
|
353
|
+
- **Credit-card regex is permissive**, then validated by Luhn. False positives
|
|
354
|
+
in dense numeric content are possible.
|
|
355
|
+
- **Names / DOB / passport / address are label-anchored, not free-text NER.**
|
|
356
|
+
- **Very large whitespace-free values split across chunk boundaries** are held
|
|
357
|
+
back only up to `max(safe_distance_bytes * 4, 8192)` bytes. Raise
|
|
358
|
+
`safe_distance_bytes` or use `redact_sync` on complete payloads.
|
|
359
|
+
|
|
360
|
+
---
|
|
361
|
+
|
|
362
|
+
## Parity with the TypeScript package
|
|
363
|
+
|
|
364
|
+
This port reproduces the audited TypeScript engine 1:1: the same detector
|
|
365
|
+
regexes (compiled with `re.ASCII` so `\b` / `\d` stay ASCII-only as in JS), the
|
|
366
|
+
same Luhn / mod-97 / SSA validation, the same overlap resolution (longest match
|
|
367
|
+
wins), the same `<TYPE_uuid>` token format, and the same safe-distance and
|
|
368
|
+
long-value streaming guards. 76 tests cover detectors, vault, streaming
|
|
369
|
+
(including a property-based chunk-partition round-trip), SSE, audit, and all
|
|
370
|
+
five integrations.
|
|
371
|
+
|
|
372
|
+
---
|
|
373
|
+
|
|
374
|
+
## License
|
|
375
|
+
|
|
376
|
+
MIT (c) Godfrey Lebo / Raeven Company LTD
|
|
377
|
+
|
|
378
|
+
## Reporting security issues
|
|
379
|
+
|
|
380
|
+
Email `emorylebo@gmail.com`. Do not file public issues for security findings.
|