sether 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. sether-0.1.0/LICENSE +21 -0
  2. sether-0.1.0/PKG-INFO +380 -0
  3. sether-0.1.0/README.md +334 -0
  4. sether-0.1.0/pyproject.toml +59 -0
  5. sether-0.1.0/setup.cfg +4 -0
  6. sether-0.1.0/src/sether/__init__.py +148 -0
  7. sether-0.1.0/src/sether/audit/__init__.py +20 -0
  8. sether-0.1.0/src/sether/audit/sinks.py +48 -0
  9. sether-0.1.0/src/sether/audit/types.py +104 -0
  10. sether-0.1.0/src/sether/core.py +97 -0
  11. sether-0.1.0/src/sether/detectors/__init__.py +63 -0
  12. sether-0.1.0/src/sether/detectors/basic.py +286 -0
  13. sether-0.1.0/src/sether/detectors/identity.py +402 -0
  14. sether-0.1.0/src/sether/detectors/secrets.py +195 -0
  15. sether-0.1.0/src/sether/detectors/types.py +49 -0
  16. sether-0.1.0/src/sether/middleware/__init__.py +15 -0
  17. sether-0.1.0/src/sether/middleware/_common.py +60 -0
  18. sether-0.1.0/src/sether/middleware/anthropic.py +137 -0
  19. sether-0.1.0/src/sether/middleware/asgi.py +117 -0
  20. sether-0.1.0/src/sether/middleware/httpx_client.py +115 -0
  21. sether-0.1.0/src/sether/middleware/openai.py +132 -0
  22. sether-0.1.0/src/sether/middleware/wsgi.py +106 -0
  23. sether-0.1.0/src/sether/py.typed +0 -0
  24. sether-0.1.0/src/sether/stream/__init__.py +49 -0
  25. sether-0.1.0/src/sether/stream/_tokens.py +41 -0
  26. sether-0.1.0/src/sether/stream/redact.py +234 -0
  27. sether-0.1.0/src/sether/stream/restore.py +109 -0
  28. sether-0.1.0/src/sether/stream/sse.py +112 -0
  29. sether-0.1.0/src/sether/vault/__init__.py +6 -0
  30. sether-0.1.0/src/sether/vault/memory.py +81 -0
  31. sether-0.1.0/src/sether/vault/types.py +34 -0
  32. sether-0.1.0/src/sether.egg-info/PKG-INFO +380 -0
  33. sether-0.1.0/src/sether.egg-info/SOURCES.txt +47 -0
  34. sether-0.1.0/src/sether.egg-info/dependency_links.txt +1 -0
  35. sether-0.1.0/src/sether.egg-info/requires.txt +20 -0
  36. sether-0.1.0/src/sether.egg-info/top_level.txt +1 -0
  37. sether-0.1.0/tests/test_audit.py +76 -0
  38. sether-0.1.0/tests/test_detectors_basic.py +96 -0
  39. sether-0.1.0/tests/test_detectors_identity.py +83 -0
  40. sether-0.1.0/tests/test_detectors_secrets.py +64 -0
  41. sether-0.1.0/tests/test_middleware_anthropic.py +48 -0
  42. sether-0.1.0/tests/test_middleware_asgi.py +96 -0
  43. sether-0.1.0/tests/test_middleware_httpx.py +71 -0
  44. sether-0.1.0/tests/test_middleware_openai.py +81 -0
  45. sether-0.1.0/tests/test_middleware_wsgi.py +60 -0
  46. sether-0.1.0/tests/test_sse.py +45 -0
  47. sether-0.1.0/tests/test_stream.py +122 -0
  48. sether-0.1.0/tests/test_sync.py +29 -0
  49. sether-0.1.0/tests/test_vault_memory.py +48 -0
sether-0.1.0/LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Godfrey Lebo / Raeven Company LTD
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
sether-0.1.0/PKG-INFO ADDED
@@ -0,0 +1,380 @@
1
+ Metadata-Version: 2.4
2
+ Name: sether
3
+ Version: 0.1.0
4
+ Summary: Streaming PII redaction for AI applications. The hiding place for sensitive data flowing into LLMs. Secrets pack, SSE-aware streaming, audit events, drop-in middlewares for httpx / ASGI / WSGI / OpenAI / Anthropic.
5
+ Author-email: Godfrey Lebo <emorylebo@gmail.com>
6
+ License: MIT
7
+ Project-URL: Homepage, https://setherai.vercel.app
8
+ Project-URL: Documentation, https://setherai.vercel.app/docs/python
9
+ Project-URL: Sandbox, https://setherai.vercel.app/#sandbox
10
+ Project-URL: Repository, https://github.com/raeven-co/sether
11
+ Project-URL: Issues, https://github.com/raeven-co/sether/issues
12
+ Project-URL: Changelog, https://github.com/raeven-co/sether/blob/main/CHANGELOG.md
13
+ Keywords: ai,security,pii,redaction,stream,sse,llm,privacy,gdpr,hipaa,compliance,openai,anthropic,secrets,guardrails,agentic,mcp
14
+ Classifier: Development Status :: 4 - Beta
15
+ Classifier: Intended Audience :: Developers
16
+ Classifier: License :: OSI Approved :: MIT License
17
+ Classifier: Operating System :: OS Independent
18
+ Classifier: Programming Language :: Python :: 3
19
+ Classifier: Programming Language :: Python :: 3.9
20
+ Classifier: Programming Language :: Python :: 3.10
21
+ Classifier: Programming Language :: Python :: 3.11
22
+ Classifier: Programming Language :: Python :: 3.12
23
+ Classifier: Programming Language :: Python :: 3.13
24
+ Classifier: Topic :: Security
25
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
26
+ Classifier: Typing :: Typed
27
+ Requires-Python: >=3.9
28
+ Description-Content-Type: text/markdown
29
+ License-File: LICENSE
30
+ Requires-Dist: phonenumbers>=8.13
31
+ Provides-Extra: openai
32
+ Requires-Dist: openai>=1.0; extra == "openai"
33
+ Provides-Extra: anthropic
34
+ Requires-Dist: anthropic>=0.39; extra == "anthropic"
35
+ Provides-Extra: httpx
36
+ Requires-Dist: httpx>=0.27; extra == "httpx"
37
+ Provides-Extra: all
38
+ Requires-Dist: openai>=1.0; extra == "all"
39
+ Requires-Dist: anthropic>=0.39; extra == "all"
40
+ Requires-Dist: httpx>=0.27; extra == "all"
41
+ Provides-Extra: dev
42
+ Requires-Dist: pytest>=7; extra == "dev"
43
+ Requires-Dist: phonenumbers>=8.13; extra == "dev"
44
+ Requires-Dist: httpx>=0.27; extra == "dev"
45
+ Dynamic: license-file
46
+
47
+ # Sether (Python)
48
+
49
+ > **Hide personal data from your AI before it reaches any LLM provider.**
50
+ >
51
+ > Named for the Hebrew *sether* (סֵתֶר), *the hiding place*. Psalm 32:7.
52
+
53
+ Sether is a streaming PII-redaction layer that sits between your application
54
+ and any LLM API. It detects sensitive data (email, phone, SSN, credit card,
55
+ IBAN, IP addresses, secrets, and labelled identity fields), swaps each match
56
+ for a stable token before the request leaves your boundary, then restores the
57
+ original values transparently in the response.
58
+
59
+ This is the Python port of [`@raeven-co/sether`](https://www.npmjs.com/package/@raeven-co/sether).
60
+ Same detection engine, same token format, same chunk-boundary streaming safety,
61
+ ported faithfully to Python with both synchronous and asynchronous streaming and
62
+ drop-in integrations for **httpx, ASGI (FastAPI / Starlette), WSGI (Flask),
63
+ the OpenAI SDK, and the Anthropic SDK**.
64
+
65
+ A product of **Raeven Company LTD**.
66
+
67
+ ---
68
+
69
+ ## Why this exists
70
+
71
+ If your application sends a customer's email, phone number, or any other PII to
72
+ an LLM provider, that is a sub-processor disclosure under GDPR Article 28.
73
+ Credit-card data pulls you into PCI DSS scope. Health identifiers trigger HIPAA.
74
+ Sether stops the leak at the boundary: sensitive substrings become stable tokens
75
+ before the bytes leave your process, and `restore()` swaps them back so your
76
+ application code does not need to branch on redacted text.
77
+
78
+ **This package does not phone home.** Streams are not sent to Raeven. The vault
79
+ stays in your process (or your own backing store if you implement `Vault`).
80
+
81
+ ---
82
+
83
+ ## Install
84
+
85
+ ```bash
86
+ pip install sether
87
+ ```
88
+
89
+ Requires Python 3.9+. The phone detector uses [`phonenumbers`](https://pypi.org/project/phonenumbers/)
90
+ (installed automatically). Integration extras are optional:
91
+
92
+ ```bash
93
+ pip install "sether[openai]" # wrap_openai
94
+ pip install "sether[anthropic]" # wrap_anthropic
95
+ pip install "sether[httpx]" # wrap_httpx
96
+ pip install "sether[all]" # all of the above
97
+ ```
98
+
99
+ The ASGI and WSGI middlewares have no extra dependency.
100
+
101
+ ---
102
+
103
+ ## 60-second quickstart
104
+
105
+ ```python
106
+ from sether import Sether
107
+
108
+ sether = Sether()
109
+
110
+ # Outgoing: redact before sending to the LLM.
111
+ safe = sether.redact_sync("my email is alice@example.com")
112
+ # -> "my email is <EMAIL_xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx>"
113
+
114
+ # Incoming: restore before showing the user.
115
+ back = sether.restore_sync(safe)
116
+ # -> "my email is alice@example.com"
117
+ ```
118
+
119
+ The same `Sether` instance shares its vault between redaction and restoration,
120
+ which is how the round-trip identity is preserved.
121
+
122
+ ### Streaming (sync and async)
123
+
124
+ ```python
125
+ sether = Sether()
126
+
127
+ # Synchronous: any iterable of text chunks.
128
+ def chunks():
129
+     yield "Contact alice@"
130
+     yield "example.com or call "
131
+     yield "+1 415 555 2671."
132
+
133
+ redacted = "".join(sether.redact_stream(chunks()))
134
+ restored = "".join(sether.restore_stream([redacted]))
135
+
136
+ # Asynchronous: any async iterable (e.g. an LLM token stream).
137
+ async def run(llm_stream):
138
+     async for piece in sether.aredact_stream(llm_stream):
139
+         await forward_to_model(piece)
140
+ ```
141
+
142
+ The redact stream holds back `safe_distance_bytes` (default 256) at the tail of
143
+ each chunk so a PII pattern crossing a chunk boundary is still detected when the
144
+ next chunk arrives. A long whitespace-free value (a JWT, an API key) is held
145
+ back up to `max(safe_distance_bytes * 4, 8192)` bytes so it is never emitted
146
+ partially across a boundary. For values larger than that bound, raise
147
+ `safe_distance_bytes` or use `redact_sync` on complete payloads. This round-trip
148
+ identity is verified by property-based tests over 60 random chunk partitions.
149
+
150
+ ---
151
+
152
+ ## Detectors
153
+
154
+ By default Sether runs the **basic pack**. Pass an explicit list to narrow scope,
155
+ or add the opt-in packs.
156
+
157
+ ```python
158
+ from sether import Sether, basic_detectors, secrets_detectors, identity_detectors
159
+
160
+ sether = Sether(detectors=[*basic_detectors, *secrets_detectors, *identity_detectors])
161
+ ```
162
+
163
+ ### Basic pack (`basic_detectors`)
164
+
165
+ | Detector | Type | Method |
166
+ | --- | --- | --- |
167
+ | `email_detector` | `EMAIL` | RFC-5321-style regex. ASCII-only. |
168
+ | `phone_detector` | `PHONE` | `phonenumbers` (international format). |
169
+ | `credit_card_detector` | `CC` | Bounded regex + Luhn check. |
170
+ | `ssn_detector` | `SSN` | Regex + SSA invalid-prefix rules. |
171
+ | `ipv4_detector` | `IPV4` | Strict octet-bounded regex. |
172
+ | `ipv6_detector` | `IPV6` | Candidate regex + in-tree validator. |
173
+ | `iban_detector` | `IBAN` | Regex + mod-97 checksum. |
174
+
175
+ ### Secrets pack (`secrets_detectors`)
176
+
177
+ `aws_access_key_detector`, `openai_key_detector`, `anthropic_key_detector`,
178
+ `github_pat_detector` (classic + fine-grained), `slack_token_detector`,
179
+ `stripe_key_detector`, `jwt_detector`, `high_entropy_detector`
180
+ (Shannon entropy >= 3.5 bits/char).
181
+
182
+ ### Identity pack (`identity_detectors`, opt-in)
183
+
184
+ Label-anchored detection for names, dates of birth, passport numbers, and
185
+ addresses. A value is redacted only when it appears with the label that
186
+ introduces it (`Name:`, `DOB:`, `Passport No:`, `Address:`) or, for a few
187
+ distinctive standalone shapes (a street line with a house number, a UK
188
+ postcode), a structure strong enough to keep false positives low. Labels are
189
+ recognised across many languages (Latin-script plus CJK, Cyrillic, Arabic), and
190
+ value capture is Unicode-aware.
191
+
192
+ Free-text NER (unlabelled names, organisations, locations in running prose) is
193
+ not covered here; that is the separate `sether-ner` roadmap item.
194
+
195
+ ### Custom detectors
196
+
197
+ Anything with a `type` string and a `detect(text)` method works:
198
+
199
+ ```python
200
+ import re
201
+ from sether import DetectorMatch
202
+
203
+ class OrderIdDetector:
204
+     type = "ORDER_ID"
205
+     _re = re.compile(r"\bORD-\d{8}\b")
206
+
207
+     def detect(self, text):
208
+         return [DetectorMatch(m.start(), m.end(), m.group(0)) for m in self._re.finditer(text)]
209
+ ```
210
+
211
+ ---
212
+
213
+ ## Token vault
214
+
215
+ Tokens map back to originals through a vault. Sether ships an in-memory LRU
216
+ vault (10,000 entries, 1-hour TTL by default). Implement the `Vault` protocol to
217
+ change eviction, encrypt at rest, or namespace tokens per tenant.
218
+
219
+ ```python
220
+ from sether import Vault # a runtime-checkable Protocol: set/get/has/delete/size/clear
221
+
222
+ class NamespacedVault:
223
+     def __init__(self, prefix):
224
+         self._store = {}
225
+         self._prefix = prefix
226
+     def set(self, token, value): self._store[self._prefix + token] = value
227
+     def get(self, token): return self._store.get(self._prefix + token)
228
+     def has(self, token): return (self._prefix + token) in self._store
229
+     def delete(self, token): return self._store.pop(self._prefix + token, None) is not None
230
+     def size(self): return len(self._store)
231
+     def clear(self): self._store.clear()
232
+
233
+ sether = Sether(vault=NamespacedVault("tenant-42:"))
234
+ ```
235
+
236
+ The `Vault` interface is **synchronous**: restore substitutes tokens as bytes
237
+ flow through and cannot `await` a lookup per token. Front an async store (Redis,
238
+ Postgres) with a synchronous in-process cache you hydrate before the restore
239
+ pass, or keep the vault in-process.
240
+
241
+ ---
242
+
243
+ ## SSE / JSON-stream mode
244
+
245
+ OpenAI and Anthropic streaming responses come back as Server-Sent Events. The
246
+ SSE-aware stream redacts payloads inside `data:` lines while preserving the
247
+ `data:` / `event:` / `id:` / `retry:` framing and blank-line separators
248
+ verbatim.
249
+
250
+ ```python
251
+ from sether import create_sse_redact_stream, basic_detectors, MemoryVault
252
+
253
+ vault = MemoryVault()
254
+ stream = create_sse_redact_stream(basic_detectors, vault)
255
+ out = stream.feed(sse_chunk) + stream.finish()
256
+ # or the iterator helpers: sse_redact_iter(chunks, detectors, vault)
257
+ ```
258
+
259
+ ---
260
+
261
+ ## Drop-in integrations
262
+
263
+ ```python
264
+ from sether import Sether
265
+ sether = Sether()
266
+ ```
267
+
268
+ ### httpx
269
+
270
+ ```python
271
+ import httpx
272
+ from sether import wrap_httpx
273
+
274
+ client = wrap_httpx(httpx.Client(), detectors=sether.detectors, vault=sether.vault)
275
+ r = client.post("https://api.example.com/v1/chat",
276
+ json={"q": "email alice@example.com"})
277
+ # The request carried <EMAIL_...>; any tokens the server echoed back are restored in r.text.
278
+ ```
279
+
280
+ Works on `httpx.Client` and `httpx.AsyncClient`. Binary bodies pass through
281
+ untouched.
282
+
283
+ ### ASGI (FastAPI / Starlette)
284
+
285
+ ```python
286
+ from fastapi import FastAPI
287
+ from sether import SetherASGIMiddleware
288
+
289
+ app = FastAPI()
290
+ app.add_middleware(SetherASGIMiddleware, detectors=sether.detectors, vault=sether.vault)
291
+ ```
292
+
293
+ ### WSGI (Flask)
294
+
295
+ ```python
296
+ from flask import Flask
297
+ from sether import SetherWSGIMiddleware
298
+
299
+ app = Flask(__name__)
300
+ app.wsgi_app = SetherWSGIMiddleware(app.wsgi_app, detectors=sether.detectors, vault=sether.vault)
301
+ ```
302
+
303
+ ### OpenAI SDK
304
+
305
+ ```python
306
+ from openai import OpenAI
307
+ from sether import wrap_openai
308
+
309
+ client = wrap_openai(OpenAI(), detectors=sether.detectors, vault=sether.vault)
310
+ # Redacts messages out, restores choices back. Sync, async, and streaming clients.
311
+ ```
312
+
313
+ ### Anthropic SDK
314
+
315
+ ```python
316
+ import anthropic
317
+ from sether import wrap_anthropic
318
+
319
+ client = wrap_anthropic(anthropic.Anthropic(), detectors=sether.detectors, vault=sether.vault)
320
+ # Redacts messages/system out, restores content blocks back.
321
+ ```
322
+
323
+ The SDK wrappers are **structurally typed**. Sether never imports `openai` or
324
+ `anthropic`; any object matching the `chat.completions.create` /
325
+ `messages.create` shape works.
326
+
327
+ ---
328
+
329
+ ## Audit events
330
+
331
+ Each redaction can be described by a structured `AuditEvent` that maps to the
332
+ regulation it satisfies (GDPR Art. 28, SOC 2 CC6.7, HIPAA, PCI DSS, and more;
333
+ see `DEFAULT_REGULATION_MAPPINGS`). **The original value is never carried in an
334
+ event, only its length.** The JSON wire shape matches the TypeScript package
335
+ (camelCase keys) so events are interchangeable across both.
336
+
337
+ ```python
338
+ from sether import AuditEvent, ConsoleAuditSink, MemoryAuditSink
339
+
340
+ sink = ConsoleAuditSink() # JSONL to stderr; MemoryAuditSink accumulates for tests
341
+ sink.write(AuditEvent(timestamp="...", detector="EMAIL", value_length=17, token="<EMAIL_x>"))
342
+ ```
343
+
344
+ ---
345
+
346
+ ## Honest limitations
347
+
348
+ These match the TypeScript package's documented limits:
349
+
350
+ - **Email detection is ASCII-only.** IDN/Unicode local parts do not match.
351
+ - **IPv6 `::1` (loopback) is not detected.** The candidate regex requires 4+
352
+ chars. Loopback is not customer PII.
353
+ - **Credit-card regex is permissive**, then validated by Luhn. False positives
354
+ in dense numeric content are possible.
355
+ - **Names / DOB / passport / address are label-anchored, not free-text NER.**
356
+ - **Very large whitespace-free values split across chunk boundaries** are held
357
+ back only up to `max(safe_distance_bytes * 4, 8192)` bytes. Raise
358
+ `safe_distance_bytes` or use `redact_sync` on complete payloads.
359
+
360
+ ---
361
+
362
+ ## Parity with the TypeScript package
363
+
364
+ This port reproduces the audited TypeScript engine 1:1: the same detector
365
+ regexes (compiled with `re.ASCII` so `\b` / `\d` stay ASCII-only as in JS), the
366
+ same Luhn / mod-97 / SSA validation, the same overlap resolution (longest match
367
+ wins), the same `<TYPE_uuid>` token format, and the same safe-distance and
368
+ long-value streaming guards. 76 tests cover detectors, vault, streaming
369
+ (including a property-based chunk-partition round-trip), SSE, audit, and all
370
+ five integrations.
371
+
372
+ ---
373
+
374
+ ## License
375
+
376
+ MIT (c) Godfrey Lebo / Raeven Company LTD
377
+
378
+ ## Reporting security issues
379
+
380
+ Email `emorylebo@gmail.com`. Do not file public issues for security findings.