privaite 0.2.3__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. privaite-0.2.3/LICENSE +28 -0
  2. privaite-0.2.3/PKG-INFO +351 -0
  3. privaite-0.2.3/README.md +302 -0
  4. privaite-0.2.3/privaite/__init__.py +1 -0
  5. privaite-0.2.3/privaite/__main__.py +3 -0
  6. privaite-0.2.3/privaite/api/__init__.py +0 -0
  7. privaite-0.2.3/privaite/api/chat.py +126 -0
  8. privaite-0.2.3/privaite/api/completions.py +108 -0
  9. privaite-0.2.3/privaite/api/dependencies.py +22 -0
  10. privaite-0.2.3/privaite/api/embeddings.py +71 -0
  11. privaite-0.2.3/privaite/api/health.py +48 -0
  12. privaite-0.2.3/privaite/api/models.py +25 -0
  13. privaite-0.2.3/privaite/api/router.py +12 -0
  14. privaite-0.2.3/privaite/app.py +78 -0
  15. privaite-0.2.3/privaite/cli.py +45 -0
  16. privaite-0.2.3/privaite/config/__init__.py +4 -0
  17. privaite-0.2.3/privaite/config/loader.py +53 -0
  18. privaite-0.2.3/privaite/config/schema.py +184 -0
  19. privaite-0.2.3/privaite/middleware/__init__.py +0 -0
  20. privaite-0.2.3/privaite/middleware/auth.py +61 -0
  21. privaite-0.2.3/privaite/middleware/limits.py +103 -0
  22. privaite-0.2.3/privaite/pii/__init__.py +0 -0
  23. privaite-0.2.3/privaite/pii/anonymizer.py +68 -0
  24. privaite-0.2.3/privaite/pii/deanonymizer.py +66 -0
  25. privaite-0.2.3/privaite/pii/detector_base.py +23 -0
  26. privaite-0.2.3/privaite/pii/detector_bert_ner.py +93 -0
  27. privaite-0.2.3/privaite/pii/detector_mlmodel.py +116 -0
  28. privaite-0.2.3/privaite/pii/detector_onnx.py +262 -0
  29. privaite-0.2.3/privaite/pii/detector_presidio.py +158 -0
  30. privaite-0.2.3/privaite/pii/engine.py +315 -0
  31. privaite-0.2.3/privaite/pii/entity.py +88 -0
  32. privaite-0.2.3/privaite/pii/faker_providers.py +83 -0
  33. privaite-0.2.3/privaite/pii/mapping.py +43 -0
  34. privaite-0.2.3/privaite/pii/recognizer_context.py +106 -0
  35. privaite-0.2.3/privaite/pii/recognizer_custom.py +45 -0
  36. privaite-0.2.3/privaite/pii/recognizer_fr_date.py +60 -0
  37. privaite-0.2.3/privaite/pii/recognizer_location.py +58 -0
  38. privaite-0.2.3/privaite/pii/tracker.py +63 -0
  39. privaite-0.2.3/privaite/providers/__init__.py +0 -0
  40. privaite-0.2.3/privaite/providers/router.py +58 -0
  41. privaite-0.2.3/privaite/streaming/__init__.py +0 -0
  42. privaite-0.2.3/privaite/streaming/buffer.py +80 -0
  43. privaite-0.2.3/privaite/streaming/handler.py +77 -0
  44. privaite-0.2.3/privaite/streaming/sse.py +32 -0
  45. privaite-0.2.3/privaite/utils/__init__.py +0 -0
  46. privaite-0.2.3/privaite/utils/errors.py +59 -0
  47. privaite-0.2.3/privaite/utils/logging.py +26 -0
  48. privaite-0.2.3/privaite/utils/security.py +18 -0
  49. privaite-0.2.3/privaite.egg-info/PKG-INFO +351 -0
  50. privaite-0.2.3/privaite.egg-info/SOURCES.txt +54 -0
  51. privaite-0.2.3/privaite.egg-info/dependency_links.txt +1 -0
  52. privaite-0.2.3/privaite.egg-info/entry_points.txt +2 -0
  53. privaite-0.2.3/privaite.egg-info/requires.txt +31 -0
  54. privaite-0.2.3/privaite.egg-info/top_level.txt +1 -0
  55. privaite-0.2.3/pyproject.toml +87 -0
  56. privaite-0.2.3/setup.cfg +4 -0
privaite-0.2.3/LICENSE ADDED
@@ -0,0 +1,28 @@
1
+ BSD 3-Clause License
2
+
3
+ Copyright (c) 2025, Elie
4
+
5
+ Redistribution and use in source and binary forms, with or without
6
+ modification, are permitted provided that the following conditions are met:
7
+
8
+ 1. Redistributions of source code must retain the above copyright notice, this
9
+ list of conditions and the following disclaimer.
10
+
11
+ 2. Redistributions in binary form must reproduce the above copyright notice,
12
+ this list of conditions and the following disclaimer in the documentation
13
+ and/or other materials provided with the distribution.
14
+
15
+ 3. Neither the name of the copyright holder nor the names of its
16
+ contributors may be used to endorse or promote products derived from
17
+ this software without specific prior written permission.
18
+
19
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
23
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
privaite-0.2.3/PKG-INFO ADDED
@@ -0,0 +1,351 @@
1
+ Metadata-Version: 2.4
2
+ Name: privaite
3
+ Version: 0.2.3
4
+ Summary: Privacy-first LLM proxy with transparent PII anonymization
5
+ License-Expression: BSD-3-Clause
6
+ Project-URL: Homepage, https://github.com/crp4222/PrivAiTe
7
+ Project-URL: Repository, https://github.com/crp4222/PrivAiTe
8
+ Project-URL: Changelog, https://github.com/crp4222/PrivAiTe/blob/main/CHANGELOG.md
9
+ Keywords: privacy,pii,llm,proxy,anonymization,openai,gdpr
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Programming Language :: Python :: 3.11
13
+ Classifier: Programming Language :: Python :: 3.12
14
+ Classifier: Programming Language :: Python :: 3.13
15
+ Classifier: Topic :: Security
16
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
17
+ Requires-Python: >=3.11
18
+ Description-Content-Type: text/markdown
19
+ License-File: LICENSE
20
+ Requires-Dist: fastapi>=0.115.0
21
+ Requires-Dist: uvicorn[standard]>=0.30.0
22
+ Requires-Dist: sse-starlette>=2.0.0
23
+ Requires-Dist: litellm>=1.80.0
24
+ Requires-Dist: pydantic>=2.0
25
+ Requires-Dist: pyyaml>=6.0
26
+ Requires-Dist: python-dotenv>=1.0
27
+ Requires-Dist: httpx>=0.27.0
28
+ Requires-Dist: presidio-analyzer>=2.2
29
+ Requires-Dist: presidio-anonymizer>=2.2
30
+ Requires-Dist: spacy>=3.7
31
+ Requires-Dist: faker>=25.0
32
+ Requires-Dist: click>=8.0
33
+ Provides-Extra: onnx
34
+ Requires-Dist: onnxruntime>=1.17; extra == "onnx"
35
+ Requires-Dist: transformers>=4.40; extra == "onnx"
36
+ Requires-Dist: huggingface_hub>=0.23; extra == "onnx"
37
+ Provides-Extra: ml
38
+ Requires-Dist: transformers>=4.40; extra == "ml"
39
+ Requires-Dist: torch>=2.0; extra == "ml"
40
+ Provides-Extra: dev
41
+ Requires-Dist: pytest>=8.0; extra == "dev"
42
+ Requires-Dist: pytest-asyncio>=0.24; extra == "dev"
43
+ Requires-Dist: pytest-httpx>=0.30; extra == "dev"
44
+ Requires-Dist: respx>=0.21; extra == "dev"
45
+ Requires-Dist: coverage>=7.0; extra == "dev"
46
+ Requires-Dist: ruff>=0.5.0; extra == "dev"
47
+ Requires-Dist: mypy>=1.10; extra == "dev"
48
+ Dynamic: license-file
49
+
50
+ # PrivAiTe
51
+
52
+ [![CI](https://github.com/crp4222/PrivAiTe/actions/workflows/ci.yml/badge.svg)](https://github.com/crp4222/PrivAiTe/actions)
53
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
54
+ [![License](https://img.shields.io/badge/license-BSD--3--Clause-green.svg)](LICENSE)
55
+
56
+ A privacy proxy for LLMs. It sits between your app and the provider, replaces personal data with placeholders before the request leaves your machine, and restores it in the response — across message text, **tool-call arguments, and multimodal content**. Works with any OpenAI-compatible client.
57
+
58
+ ```
59
+ You type: "Je m'appelle Marie Dupont, email marie@acme.com"
60
+ LLM sees: "Je m'appelle <PERSON_1>, email <EMAIL_ADDRESS_1>"
61
+ LLM says: "Bonjour <PERSON_1>, votre email <EMAIL_ADDRESS_1> est noté."
62
+ You see: "Bonjour Marie Dupont, votre email marie@acme.com est noté."
63
+ ```
64
+
65
+ Detection runs locally. This is **local pseudonymization, not guaranteed anonymization** — what it does and doesn't protect against is spelled out in [Threat model](#threat-model).
66
+
67
+ ## How detection works
68
+
69
+ PrivAiTe uses two detection engines that can run together or separately:
70
+
71
+ ### Presidio (Microsoft) — regex + spaCy NER
72
+
73
+ The default engine. Handles structured PII through pattern matching and basic NER.
74
+
75
+ | What it detects | How |
76
+ |---|---|
77
+ | Emails | Regex |
78
+ | Phone numbers | Regex + international format validation |
79
+ | Credit cards | Regex + Luhn checksum |
80
+ | IBAN | Regex + checksum validation |
81
+ | IP addresses | Regex |
82
+ | US SSN | Regex + format validation |
83
+ | Person names (capitalized, 2+ words) | spaCy NER — only kept if all words are capitalized |
84
+ | Person names (lowercase or single word) | Contextual regex — only after "je m'appelle X", "my name is X", "ich heiße X", "Nom: X", etc. |
85
+ | Dates (FR/DE) | Custom regex — "15 mars 1987", "3. März 1990" |
86
+
87
+ Presidio is fast (~25ms/request) and produces zero false positives on code, news articles, and technical text. The tradeoff: it misses names that spaCy doesn't recognize (unusual names, single-word names without context) and doesn't detect secrets/passwords.
88
+
89
+ ### OpenAI Privacy Filter — contextual ML model
90
+
91
+ [OpenAI's open-source PII model](https://openai.com/index/introducing-openai-privacy-filter/) (1.5B params, 50M active, Apache 2.0). Runs locally via ONNX Runtime (~800MB, no PyTorch needed).
92
+
93
+ | What it adds over Presidio | How |
94
+ |---|---|
95
+ | Person names (any format, any case) | ML NER — understands context, not just capitalization |
96
+ | Passwords and secrets | Detects "SuperSecret2024!", API keys like "sk-proj-..." |
97
+ | Account numbers | Detects bank account numbers, policy numbers, etc. |
98
+ | Dates (all languages) | ML-based, not limited to FR/DE regex |
99
+
100
+ The Privacy Filter is slower (~400ms/request) and occasionally flags technical identifiers as account numbers (e.g., "CMD-2024-98765"). It runs as a second pass alongside Presidio — Presidio handles regex-based entities, the Privacy Filter handles contextual NER.
101
+
102
+ ### Why two engines?
103
+
104
+ Neither is perfect alone:
105
+ - **Presidio alone** misses names that spaCy doesn't recognize, and can't detect secrets. But it has zero false positives.
106
+ - **Privacy Filter alone** misses some names in credit/list formats, and doesn't have regex validators for IBAN/credit card checksums.
107
+ - **Both together** cover each other's blind spots. Presidio handles structured formats with validation, the Privacy Filter handles context-dependent PII.
108
+
109
+ ## Presets
110
+
111
+ Choose based on your needs:
112
+
113
+ | Preset | What runs | Detection | False positives | Speed | Install |
114
+ |--------|-----------|-----------|-----------------|-------|---------|
115
+ | `light` | Presidio only | 97% | **0%** | **23ms** | `pip install -e .` + spaCy models |
116
+ | `onnx` | Presidio + Privacy Filter | **100%** | ~7% | 400ms | `pip install "privaite[onnx]"` |
117
+
118
+ ```yaml
119
+ pii:
120
+ preset: "light" # Zero false positives, fast. Recommended for most users.
121
+ # preset: "onnx" # Catches everything including secrets. Needs the onnx extra.
122
+ ```
123
+
124
+ The `onnx` preset runs on onnxruntime plus the transformers tokenizer, so the
125
+ `onnx` extra does not pull in torch or scipy. The `ml` extra (the `standard` and
126
+ `full` presets) is the only one that installs torch.
127
+
128
+ **When to use `light`:** You want zero disruption. Code, news, business text all pass through untouched. Only clearly identifiable PII (names, emails, phones, cards, IBANs) is anonymized.
129
+
130
+ **When to use `onnx`:** You need maximum coverage. Secrets, passwords, API keys, account numbers, unusual names. Accept occasional false positives on technical identifiers.
131
+
132
+ Two other presets exist (`standard`, `full`) but are less useful in practice — they add BERT NER which doesn't improve much over spaCy and requires PyTorch.
133
+
134
+ ## Benchmark
135
+
136
+ Tested on 61 documents across 5 languages (FR, EN, DE, ES, IT). Corporate letters, contracts, invoices, medical referrals, CVs, bank transfers, news articles, codebases. Mix of synthetic data (valid checksums) and real-world public report extracts.
137
+
138
+ | | **light** | **onnx** |
139
+ |---|---|---|
140
+ | Detection | 96.7% (236/244) | **100% (244/244)** |
141
+ | False positives | **0/14 (0%)** | 1/14 (7%) |
142
+ | PERSON | 93% | **100%** |
143
+ | EMAIL | 98% | **100%** |
144
+ | PHONE | 100% | 100% |
145
+ | IBAN | 100% | 100% |
146
+ | CREDIT_CARD | 100% | 100% |
147
+ | DATE | 100% | 100% |
148
+ | SSN | 100% | 100% |
149
+ | Secrets | no | **yes** |
150
+
151
+ The `light` misses are all PERSON entities: single-word names, long multi-part Spanish names, and names spaCy doesn't recognize. Regex entities are 100% on both presets.
152
+
153
+ Full benchmark with all test data: [privaite-bench](https://github.com/crp4222/privaite-bench)
154
+
155
+ ## What's NOT detected by default
156
+
157
+ - **Locations/cities** — "Paris", "London" alone aren't PII (they don't identify anyone). Detecting them causes massive false positives on any text ("Kubernetes", "PIB", "Saturday" all get flagged as locations by spaCy). Disabled by default.
158
+ - **URLs** — Presidio's URL regex matches code like `logging.getLogger` because `.ge` is a valid TLD. Disabled by default.
159
+ - **Passwords/secrets** — Only the `onnx` preset detects these via the Privacy Filter model.
160
+
161
+ Location and URL detection can be re-enabled in the YAML config if your use case needs them; password/secret detection is turned on by selecting the `onnx` preset.
162
+
163
+ ## Threat model
164
+
165
+ PrivAiTe performs **local pseudonymization**, not guaranteed anonymization. Detection runs on your machine; the real ↔ placeholder mapping lives in memory only for the duration of a request and is dropped afterwards.
166
+
167
+ **What it protects against:** the LLM provider storing, training on, or logging your raw PII. The provider receives placeholders (`<PERSON_1>`, …) for everything the detector catches — across message content, tool-call arguments, and multimodal text.
168
+
169
+ **What it does NOT protect against:**
170
+
171
+ - **PII the detector misses.** Detection is statistical, so recall is never guaranteed to be 100% on your own data, even where the [benchmark](https://github.com/crp4222/privaite-bench) reports 100%. A name it doesn't recognize reaches the provider. The `onnx` preset has the best recall; treat the output as best-effort, not a guarantee.
172
+ - **Re-identification from context.** Even with names replaced, the surrounding text can stay identifying ("the CEO of `<ORG_1>` who resigned in March").
173
+ - **A compromised local machine.** The mapping and raw text live in local memory; this is not a defense against a local attacker.
174
+ - **Correlation by the provider.** The provider can still correlate requests within a session.
175
+
176
+ For GDPR/HIPAA: treat this as pseudonymization + transfer minimization, not anonymization. If you need irreversible removal, use `method: "redact"` instead of `method: "placeholder"`.
177
+
178
+ ## Alternatives
179
+
180
+ Keeping PII out of LLM calls is a crowded space, and PrivAiTe is not always the right pick. Based on each project's public docs as of June 2026:
181
+
182
+ - [AI Security Gateway](https://github.com/aisecuritygateway/aisecuritygateway) does more than PII: it adds secret detection and prompt-injection blocking. If you want those in the same proxy, start there. Its PII scanning targets plain message text.
183
+ - [Philter](https://philterd.ai/) is a mature, drop-in "change one URL" redaction proxy for plain-text prompts.
184
+ - LiteLLM has a built-in Presidio guardrail, the natural choice if you already run the LiteLLM proxy and want PII handling inline (there are a few open bugs around scrubbing requests and responses).
185
+ - Managed/cloud options exist too, such as Microsoft PII Shield and [LangChain's gateway redaction](https://docs.langchain.com/langsmith/llm-gateway-redaction).
186
+
187
+ Where PrivAiTe differs: it anonymizes PII **inside tool-call arguments and multimodal content**, not just message text (LangChain's gateway docs, for instance, note that tool-call arguments are not scanned), it **restores** the original values in the response, and it ships a [reproducible benchmark](https://github.com/crp4222/privaite-bench). If your traffic is agentic or multimodal, that gap is the reason this exists.
188
+
189
+ ## Quick start
190
+
191
+ ### 1. Install
192
+
193
+ ```bash
194
+ pip install -e .
195
+ python -m spacy download en_core_web_lg
196
+ python -m spacy download fr_core_news_md
197
+ ```
198
+
199
+ For the `onnx` preset (optional, torch-free):
200
+ ```bash
201
+ pip install -e ".[onnx]"
202
+ ```
203
+
204
+ ### 2. Configure
205
+
206
+ ```bash
207
+ cp .env.example .env
208
+ cp config/privaite.example.yaml config/privaite.yaml
209
+ ```
210
+
211
+ Edit `.env` with your API keys and `config/privaite.yaml` with your LLM providers.
212
+
213
+ ### 3. Run
214
+
215
+ ```bash
216
+ python -m privaite
217
+
218
+ # Dev mode (auto-reload)
219
+ python -m privaite --reload
220
+ ```
221
+
222
+ ### 4. Connect
223
+
224
+ Point any OpenAI-compatible client to `http://localhost:8400/v1` with your proxy API key.
225
+
226
+ **Open WebUI (Docker):** Admin → Settings → Connections → OpenAI API:
227
+ - URL: `http://host.docker.internal:8400/v1`
228
+ - Key: your `PRIVAITE_API_KEYS` value
229
+
230
+ If you would rather not run a separate proxy, there is also an in-process Open
231
+ WebUI filter (see [Open WebUI filter](#open-webui-filter) below).
232
+
233
+ ## Docker
234
+
235
+ ```bash
236
+ docker compose up -d
237
+ ```
238
+
239
+ ## Open WebUI filter
240
+
241
+ `integrations/openwebui/privaite_filter.py` is an Open WebUI Filter Function. It
242
+ runs the engine inside Open WebUI, so it anonymizes the outgoing request and
243
+ restores PII in the reply without a separate proxy. It covers message text,
244
+ tool-call arguments, and multimodal text.
245
+
246
+ To install it: Admin Panel → Functions → "+", paste the file, save, enable it,
247
+ then open its valves to pick the preset (`light` or `onnx`) and the languages.
248
+ The filter pulls Presidio and spaCy into Open WebUI and downloads the spaCy
249
+ models on first use, so the first request after enabling it can be slow. Setup
250
+ notes are in [`integrations/openwebui/README.md`](integrations/openwebui/README.md).
251
+
252
+ ## Configuration
253
+
254
+ ### LLM providers
255
+
256
+ Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works:
257
+
258
+ ```yaml
259
+ providers:
260
+ - model_name: "gpt-4o"
261
+ litellm_params:
262
+ model: "openai/gpt-4o"
263
+ api_key: "${OPENAI_API_KEY}"
264
+
265
+ - model_name: "local-llama"
266
+ litellm_params:
267
+ model: "ollama/llama3.1"
268
+ api_base: "http://localhost:11434"
269
+ ```
270
+
271
+ ### Anonymization method
272
+
273
+ ```yaml
274
+ pii:
275
+ anonymization:
276
+ method: "placeholder" # <PERSON_1>, <EMAIL_ADDRESS_1> — recommended
277
+ # method: "fake_replacement" # Realistic fakes via Faker (Jean → Michel)
278
+ # method: "redact" # [PERSON], [EMAIL_ADDRESS] — irreversible
279
+ # method: "mask" # ********
280
+ ```
281
+
282
+ ### Custom regex patterns
283
+
284
+ Add your own PII patterns without touching code:
285
+
286
+ ```yaml
287
+ pii:
288
+ custom_patterns:
289
+ - pattern: "KD-\\d{6}"
290
+ entity_type: "CUSTOMER_ID"
291
+ - pattern: "REF-[A-Z]{3}-\\d+"
292
+ entity_type: "REFERENCE"
293
+ ```
294
+
295
+ ### Languages
296
+
297
+ 7 languages supported with spaCy NER + contextual patterns: FR, EN, DE, ES, IT, PT, NL.
298
+
299
+ ```yaml
300
+ pii:
301
+ detectors:
302
+ presidio:
303
+ languages: ["fr", "en"] # Add "de", "es", etc.
304
+ ```
305
+
306
+ Each language needs its spaCy model: `python -m spacy download de_core_news_md`
307
+
308
+ ## API
309
+
310
+ OpenAI-compatible:
311
+
312
+ | Endpoint | Description |
313
+ |----------|-------------|
314
+ | `POST /v1/chat/completions` | Chat (streaming + non-streaming) |
315
+ | `POST /v1/completions` | Text completions |
316
+ | `POST /v1/embeddings` | Embeddings (anonymized, no de-anonymization) |
317
+ | `GET /v1/models` | List configured models |
318
+ | `GET /health` | Health check |
319
+ | `GET /ready` | Readiness check |
320
+ | `GET /stats` | PII detection stats per session |
321
+
322
+ ### What gets anonymized
323
+
324
+ PII is stripped from every field that carries user text to the provider:
325
+
326
+ - `messages[].content`, whether a plain string or a multimodal list of parts (text parts are scrubbed, images and audio are left alone).
327
+ - `tool_calls[].function.arguments` and the legacy `function_call.arguments`: parsed as JSON and scrubbed value by value, so object keys and the function name stay intact. Arguments that are not valid JSON are scrubbed as free text.
328
+ - `/v1/completions` `prompt` and `/v1/embeddings` `input`, as a string or a list of strings.
329
+
330
+ On the way back, the original values are restored in `message.content` and, for non-streaming chat, in returned `tool_calls`. Set `pii.passthrough.tool_calls: true` to forward tool-call arguments unchanged.
331
+
332
+ For a stricter posture, set `pii.strict: true`: any request whose content can't be inspected (a shape that is neither text nor a known media part) is rejected with `400` instead of being forwarded.
333
+
334
+ ## Known limitations
335
+
336
+ - **Single-word names** detected by spaCy are dropped (too many false positives). They are only caught by contextual patterns ("Nom: X") or by the `onnx` preset.
337
+ - **Lowercase names** need intro patterns ("je m'appelle X"). The `onnx` preset catches them without patterns.
338
+ - **Informal dates** ("last Tuesday", "il y a deux ans") are not detected.
339
+ - **No policy gate** — all requests are forwarded after pseudonymization.
340
+ - **Streaming tool calls**: argument deltas are not de-anonymized, so a streamed tool call may show placeholders instead of the original values. Request-side anonymization still applies, so no PII leaks.
341
+
342
+ ## Development
343
+
344
+ ```bash
345
+ pip install -e ".[dev]"
346
+ python -m pytest tests/ -v
347
+ ```
348
+
349
+ ## License
350
+
351
+ BSD 3-Clause. See [LICENSE](LICENSE).
privaite-0.2.3/README.md ADDED
@@ -0,0 +1,302 @@
1
+ # PrivAiTe
2
+
3
+ [![CI](https://github.com/crp4222/PrivAiTe/actions/workflows/ci.yml/badge.svg)](https://github.com/crp4222/PrivAiTe/actions)
4
+ [![Python 3.11+](https://img.shields.io/badge/python-3.11+-blue.svg)](https://www.python.org/downloads/)
5
+ [![License](https://img.shields.io/badge/license-BSD--3--Clause-green.svg)](LICENSE)
6
+
7
+ A privacy proxy for LLMs. It sits between your app and the provider, replaces personal data with placeholders before the request leaves your machine, and restores it in the response — across message text, **tool-call arguments, and multimodal content**. Works with any OpenAI-compatible client.
8
+
9
+ ```
10
+ You type: "Je m'appelle Marie Dupont, email marie@acme.com"
11
+ LLM sees: "Je m'appelle <PERSON_1>, email <EMAIL_ADDRESS_1>"
12
+ LLM says: "Bonjour <PERSON_1>, votre email <EMAIL_ADDRESS_1> est noté."
13
+ You see: "Bonjour Marie Dupont, votre email marie@acme.com est noté."
14
+ ```
15
+
16
+ Detection runs locally. This is **local pseudonymization, not guaranteed anonymization** — what it does and doesn't protect against is spelled out in [Threat model](#threat-model).
17
+
18
+ ## How detection works
19
+
20
+ PrivAiTe uses two detection engines that can run together or separately:
21
+
22
+ ### Presidio (Microsoft) — regex + spaCy NER
23
+
24
+ The default engine. Handles structured PII through pattern matching and basic NER.
25
+
26
+ | What it detects | How |
27
+ |---|---|
28
+ | Emails | Regex |
29
+ | Phone numbers | Regex + international format validation |
30
+ | Credit cards | Regex + Luhn checksum |
31
+ | IBAN | Regex + checksum validation |
32
+ | IP addresses | Regex |
33
+ | US SSN | Regex + format validation |
34
+ | Person names (capitalized, 2+ words) | spaCy NER — only kept if all words are capitalized |
35
+ | Person names (lowercase or single word) | Contextual regex — only after "je m'appelle X", "my name is X", "ich heiße X", "Nom: X", etc. |
36
+ | Dates (FR/DE) | Custom regex — "15 mars 1987", "3. März 1990" |
37
+
38
+ Presidio is fast (~25ms/request) and produces zero false positives on code, news articles, and technical text. The tradeoff: it misses names that spaCy doesn't recognize (unusual names, single-word names without context) and doesn't detect secrets/passwords.
39
+
40
+ ### OpenAI Privacy Filter — contextual ML model
41
+
42
+ [OpenAI's open-source PII model](https://openai.com/index/introducing-openai-privacy-filter/) (1.5B params, 50M active, Apache 2.0). Runs locally via ONNX Runtime (~800MB, no PyTorch needed).
43
+
44
+ | What it adds over Presidio | How |
45
+ |---|---|
46
+ | Person names (any format, any case) | ML NER — understands context, not just capitalization |
47
+ | Passwords and secrets | Detects "SuperSecret2024!", API keys like "sk-proj-..." |
48
+ | Account numbers | Detects bank account numbers, policy numbers, etc. |
49
+ | Dates (all languages) | ML-based, not limited to FR/DE regex |
50
+
51
+ The Privacy Filter is slower (~400ms/request) and occasionally flags technical identifiers as account numbers (e.g., "CMD-2024-98765"). It runs as a second pass alongside Presidio — Presidio handles regex-based entities, the Privacy Filter handles contextual NER.
52
+
53
+ ### Why two engines?
54
+
55
+ Neither is perfect alone:
56
+ - **Presidio alone** misses names that spaCy doesn't recognize, and can't detect secrets. But it has zero false positives.
57
+ - **Privacy Filter alone** misses some names in credit/list formats, and doesn't have regex validators for IBAN/credit card checksums.
58
+ - **Both together** cover each other's blind spots. Presidio handles structured formats with validation, the Privacy Filter handles context-dependent PII.
59
+
60
+ ## Presets
61
+
62
+ Choose based on your needs:
63
+
64
+ | Preset | What runs | Detection | False positives | Speed | Install |
65
+ |--------|-----------|-----------|-----------------|-------|---------|
66
+ | `light` | Presidio only | 97% | **0%** | **23ms** | `pip install -e .` + spaCy models |
67
+ | `onnx` | Presidio + Privacy Filter | **100%** | ~7% | 400ms | `pip install "privaite[onnx]"` |
68
+
69
+ ```yaml
70
+ pii:
71
+ preset: "light" # Zero false positives, fast. Recommended for most users.
72
+ # preset: "onnx" # Catches everything including secrets. Needs the onnx extra.
73
+ ```
74
+
75
+ The `onnx` preset runs on onnxruntime plus the transformers tokenizer, so the
76
+ `onnx` extra does not pull in torch or scipy. The `ml` extra (the `standard` and
77
+ `full` presets) is the only one that installs torch.
78
+
79
+ **When to use `light`:** You want zero disruption. Code, news, business text all pass through untouched. Only clearly identifiable PII (names, emails, phones, cards, IBANs) is anonymized.
80
+
81
+ **When to use `onnx`:** You need maximum coverage. Secrets, passwords, API keys, account numbers, unusual names. Accept occasional false positives on technical identifiers.
82
+
83
+ Two other presets exist (`standard`, `full`) but are less useful in practice — they add BERT NER which doesn't improve much over spaCy and requires PyTorch.
84
+
85
+ ## Benchmark
86
+
87
+ Tested on 61 documents across 5 languages (FR, EN, DE, ES, IT). Corporate letters, contracts, invoices, medical referrals, CVs, bank transfers, news articles, codebases. Mix of synthetic data (valid checksums) and real-world public report extracts.
88
+
89
+ | | **light** | **onnx** |
90
+ |---|---|---|
91
+ | Detection | 96.7% (236/244) | **100% (244/244)** |
92
+ | False positives | **0/14 (0%)** | 1/14 (7%) |
93
+ | PERSON | 93% | **100%** |
94
+ | EMAIL | 98% | **100%** |
95
+ | PHONE | 100% | 100% |
96
+ | IBAN | 100% | 100% |
97
+ | CREDIT_CARD | 100% | 100% |
98
+ | DATE | 100% | 100% |
99
+ | SSN | 100% | 100% |
100
+ | Secrets | no | **yes** |
101
+
102
+ The `light` misses are all PERSON entities: single-word names, long multi-part Spanish names, and names spaCy doesn't recognize. Regex entities are 100% on both presets.
103
+
104
+ Full benchmark with all test data: [privaite-bench](https://github.com/crp4222/privaite-bench)
105
+
106
+ ## What's NOT detected by default
107
+
108
+ - **Locations/cities** — "Paris", "London" alone aren't PII (they don't identify anyone). Detecting them causes massive false positives on any text ("Kubernetes", "PIB", "Saturday" all get flagged as locations by spaCy). Disabled by default.
109
+ - **URLs** — Presidio's URL regex matches code like `logging.getLogger` because `.ge` is a valid TLD. Disabled by default.
110
+ - **Passwords/secrets** — Only the `onnx` preset detects these via the Privacy Filter model.
111
+
112
+ Location and URL detection can be re-enabled in the YAML config if your use case needs them; password/secret detection is turned on by selecting the `onnx` preset.
113
+
114
+ ## Threat model
115
+
116
+ PrivAiTe performs **local pseudonymization**, not guaranteed anonymization. Detection runs on your machine; the real ↔ placeholder mapping lives in memory only for the duration of a request and is dropped afterwards.
117
+
118
+ **What it protects against:** the LLM provider storing, training on, or logging your raw PII. The provider receives placeholders (`<PERSON_1>`, …) for everything the detector catches — across message content, tool-call arguments, and multimodal text.
119
+
120
+ **What it does NOT protect against:**
121
+
122
+ - **PII the detector misses.** Detection is statistical, so recall is never guaranteed to be 100% on your own data, even where the [benchmark](https://github.com/crp4222/privaite-bench) reports 100%. A name it doesn't recognize reaches the provider. The `onnx` preset has the best recall; treat the output as best-effort, not a guarantee.
123
+ - **Re-identification from context.** Even with names replaced, the surrounding text can stay identifying ("the CEO of `<ORG_1>` who resigned in March").
124
+ - **A compromised local machine.** The mapping and raw text live in local memory; this is not a defense against a local attacker.
125
+ - **Correlation by the provider.** The provider can still correlate requests within a session.
126
+
127
+ For GDPR/HIPAA: treat this as pseudonymization + transfer minimization, not anonymization. If you need irreversible removal, use `method: "redact"` instead of `method: "placeholder"`.
128
+
129
+ ## Alternatives
130
+
131
+ Keeping PII out of LLM calls is a crowded space, and PrivAiTe is not always the right pick. Based on each project's public docs as of June 2026:
132
+
133
+ - [AI Security Gateway](https://github.com/aisecuritygateway/aisecuritygateway) does more than PII: it adds secret detection and prompt-injection blocking. If you want those in the same proxy, start there. Its PII scanning targets plain message text.
134
+ - [Philter](https://philterd.ai/) is a mature, drop-in "change one URL" redaction proxy for plain-text prompts.
135
+ - LiteLLM has a built-in Presidio guardrail, the natural choice if you already run the LiteLLM proxy and want PII handling inline (there are a few open bugs around scrubbing requests and responses).
136
+ - Managed/cloud options exist too, such as Microsoft PII Shield and [LangChain's gateway redaction](https://docs.langchain.com/langsmith/llm-gateway-redaction).
137
+
138
+ Where PrivAiTe differs: it anonymizes PII **inside tool-call arguments and multimodal content**, not just message text (LangChain's gateway docs, for instance, note that tool-call arguments are not scanned), it **restores** the original values in the response, and it ships a [reproducible benchmark](https://github.com/crp4222/privaite-bench). If your traffic is agentic or multimodal, that gap is the reason this exists.
139
+
140
+ ## Quick start
141
+
142
+ ### 1. Install
143
+
144
+ ```bash
145
+ pip install -e .
146
+ python -m spacy download en_core_web_lg
147
+ python -m spacy download fr_core_news_md
148
+ ```
149
+
150
+ For the `onnx` preset (optional, torch-free):
151
+ ```bash
152
+ pip install -e ".[onnx]"
153
+ ```
154
+
155
+ ### 2. Configure
156
+
157
+ ```bash
158
+ cp .env.example .env
159
+ cp config/privaite.example.yaml config/privaite.yaml
160
+ ```
161
+
162
+ Edit `.env` with your API keys and `config/privaite.yaml` with your LLM providers.
163
+
164
+ ### 3. Run
165
+
166
+ ```bash
167
+ python -m privaite
168
+
169
+ # Dev mode (auto-reload)
170
+ python -m privaite --reload
171
+ ```
172
+
173
+ ### 4. Connect
174
+
175
+ Point any OpenAI-compatible client to `http://localhost:8400/v1` with your proxy API key.
176
+
177
+ **Open WebUI (Docker):** Admin → Settings → Connections → OpenAI API:
178
+ - URL: `http://host.docker.internal:8400/v1`
179
+ - Key: your `PRIVAITE_API_KEYS` value
180
+
181
+ If you would rather not run a separate proxy, there is also an in-process Open
182
+ WebUI filter (see [Open WebUI filter](#open-webui-filter) below).
183
+
184
+ ## Docker
185
+
186
+ ```bash
187
+ docker compose up -d
188
+ ```
189
+
190
+ ## Open WebUI filter
191
+
192
+ `integrations/openwebui/privaite_filter.py` is an Open WebUI Filter Function. It
193
+ runs the engine inside Open WebUI, so it anonymizes the outgoing request and
194
+ restores PII in the reply without a separate proxy. It covers message text,
195
+ tool-call arguments, and multimodal text.
196
+
197
+ To install it: Admin Panel → Functions → "+", paste the file, save, enable it,
198
+ then open its valves to pick the preset (`light` or `onnx`) and the languages.
199
+ The filter pulls Presidio and spaCy into Open WebUI and downloads the spaCy
200
+ models on first use, so the first request after enabling it can be slow. Setup
201
+ notes are in [`integrations/openwebui/README.md`](integrations/openwebui/README.md).
202
+
203
+ ## Configuration
204
+
205
+ ### LLM providers
206
+
207
+ Any [LiteLLM-supported provider](https://docs.litellm.ai/docs/providers) works:
208
+
209
+ ```yaml
210
+ providers:
211
+ - model_name: "gpt-4o"
212
+ litellm_params:
213
+ model: "openai/gpt-4o"
214
+ api_key: "${OPENAI_API_KEY}"
215
+
216
+ - model_name: "local-llama"
217
+ litellm_params:
218
+ model: "ollama/llama3.1"
219
+ api_base: "http://localhost:11434"
220
+ ```
221
+
222
+ ### Anonymization method
223
+
224
+ ```yaml
225
+ pii:
226
+ anonymization:
227
+ method: "placeholder" # <PERSON_1>, <EMAIL_ADDRESS_1> — recommended
228
+ # method: "fake_replacement" # Realistic fakes via Faker (Jean → Michel)
229
+ # method: "redact" # [PERSON], [EMAIL_ADDRESS] — irreversible
230
+ # method: "mask" # ********
231
+ ```
232
+
233
+ ### Custom regex patterns
234
+
235
+ Add your own PII patterns without touching code:
236
+
237
+ ```yaml
238
+ pii:
239
+ custom_patterns:
240
+ - pattern: "KD-\\d{6}"
241
+ entity_type: "CUSTOMER_ID"
242
+ - pattern: "REF-[A-Z]{3}-\\d+"
243
+ entity_type: "REFERENCE"
244
+ ```
245
+
246
+ ### Languages
247
+
248
+ 7 languages supported with spaCy NER + contextual patterns: FR, EN, DE, ES, IT, PT, NL.
249
+
250
+ ```yaml
251
+ pii:
252
+ detectors:
253
+ presidio:
254
+ languages: ["fr", "en"] # Add "de", "es", etc.
255
+ ```
256
+
257
+ Each language needs its own spaCy model; for example, for German: `python -m spacy download de_core_news_md`
258
+
259
+ ## API
260
+
261
+ OpenAI-compatible:
262
+
263
+ | Endpoint | Description |
264
+ |----------|-------------|
265
+ | `POST /v1/chat/completions` | Chat (streaming + non-streaming) |
266
+ | `POST /v1/completions` | Text completions |
267
+ | `POST /v1/embeddings` | Embeddings (anonymized, no de-anonymization) |
268
+ | `GET /v1/models` | List configured models |
269
+ | `GET /health` | Health check |
270
+ | `GET /ready` | Readiness check |
271
+ | `GET /stats` | PII detection stats per session |
272
+
273
+ ### What gets anonymized
274
+
275
+ PII is stripped from every field that carries user text to the provider:
276
+
277
+ - `messages[].content`, whether a plain string or a multimodal list of parts (text parts are scrubbed, images and audio are left alone).
278
+ - `tool_calls[].function.arguments` and the legacy `function_call.arguments`: parsed as JSON and scrubbed value by value, so object keys and the function name stay intact. Arguments that are not valid JSON are scrubbed as free text.
279
+ - `/v1/completions` `prompt` and `/v1/embeddings` `input`, as a string or a list of strings.
280
+
281
+ On the way back, the original values are restored in `message.content` and, for non-streaming chat, in returned `tool_calls`. Set `pii.passthrough.tool_calls: true` to forward tool-call arguments unchanged.
282
+
283
+ For a stricter posture, set `pii.strict: true`: any request whose content can't be inspected (a shape that is neither text nor a known media part) is rejected with `400` instead of being forwarded.
284
+
285
+ ## Known limitations
286
+
287
+ - **Single-word names** from spaCy are dropped (too many false positives). They are caught instead by contextual patterns ("Nom: X") or by the `onnx` preset.
288
+ - **Lowercase names** need intro patterns ("je m'appelle X"). The `onnx` preset catches them without patterns.
289
+ - **Informal dates** ("last Tuesday", "il y a deux ans") are not detected.
290
+ - **No policy gate** — all requests are forwarded after pseudonymization.
291
+ - **Streaming tool calls**: argument deltas are not de-anonymized, so a streamed tool call may show placeholders instead of the original values. Request-side anonymization still applies, so no PII leaks.
292
+
293
+ ## Development
294
+
295
+ ```bash
296
+ pip install -e ".[dev]"
297
+ python -m pytest tests/ -v
298
+ ```
299
+
300
+ ## License
301
+
302
+ BSD 3-Clause. See [LICENSE](LICENSE).
@@ -0,0 +1 @@
1
+ __version__ = "0.2.3"