carnaval 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (124) hide show
  1. carnaval-0.1.0/.gitignore +36 -0
  2. carnaval-0.1.0/CHANGELOG.md +37 -0
  3. carnaval-0.1.0/LICENSE +15 -0
  4. carnaval-0.1.0/NOTICE +34 -0
  5. carnaval-0.1.0/PKG-INFO +312 -0
  6. carnaval-0.1.0/README.md +275 -0
  7. carnaval-0.1.0/assets/carnaval-mask.svg +69 -0
  8. carnaval-0.1.0/assets/dictionaries/cities/de.txt +15990 -0
  9. carnaval-0.1.0/assets/dictionaries/cities/en.txt +27401 -0
  10. carnaval-0.1.0/assets/dictionaries/cities/es.txt +30357 -0
  11. carnaval-0.1.0/assets/dictionaries/cities/fr.txt +22938 -0
  12. carnaval-0.1.0/assets/dictionaries/cities/it.txt +11546 -0
  13. carnaval-0.1.0/assets/dictionaries/cities/pt.txt +9597 -0
  14. carnaval-0.1.0/assets/dictionaries/firstnames/_stoplist.txt +11 -0
  15. carnaval-0.1.0/assets/dictionaries/firstnames/de.txt +102 -0
  16. carnaval-0.1.0/assets/dictionaries/firstnames/en.txt +78 -0
  17. carnaval-0.1.0/assets/dictionaries/firstnames/es.txt +62 -0
  18. carnaval-0.1.0/assets/dictionaries/firstnames/fr.txt +113 -0
  19. carnaval-0.1.0/assets/dictionaries/firstnames/it.txt +61 -0
  20. carnaval-0.1.0/assets/dictionaries/firstnames/pt.txt +47 -0
  21. carnaval-0.1.0/config/pipeline.yaml +29 -0
  22. carnaval-0.1.0/docs/00_overview.md +94 -0
  23. carnaval-0.1.0/docs/01_architecture_etages.md +153 -0
  24. carnaval-0.1.0/docs/02_install.md +113 -0
  25. carnaval-0.1.0/docs/03_deploiement_production.md +143 -0
  26. carnaval-0.1.0/docs/04_configuration.md +148 -0
  27. carnaval-0.1.0/docs/05_extension_listes.md +144 -0
  28. carnaval-0.1.0/docs/06_extension_recognizers.md +188 -0
  29. carnaval-0.1.0/docs/07_securite.md +148 -0
  30. carnaval-0.1.0/docs/08_format_entree_sortie.md +199 -0
  31. carnaval-0.1.0/docs/09_troubleshooting.md +158 -0
  32. carnaval-0.1.0/docs/10_api_reference.md +269 -0
  33. carnaval-0.1.0/profiles/README.md +49 -0
  34. carnaval-0.1.0/profiles/acknowledge/README.md +48 -0
  35. carnaval-0.1.0/profiles/acknowledge/allow_lists/product_refs.yaml +9 -0
  36. carnaval-0.1.0/profiles/acknowledge/deny_lists/organization_singleton.yaml +12 -0
  37. carnaval-0.1.0/profiles/acknowledge/deny_lists/organizations.yaml +32 -0
  38. carnaval-0.1.0/profiles/acknowledge/deny_lists/people.yaml +9 -0
  39. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/de.yaml +77 -0
  40. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/en.yaml +82 -0
  41. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/es.yaml +55 -0
  42. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/fr.yaml +96 -0
  43. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/it.yaml +62 -0
  44. carnaval-0.1.0/profiles/acknowledge/deny_lists/places/pt.yaml +49 -0
  45. carnaval-0.1.0/profiles/acknowledge/fixtures/sample_ack_globex.txt +34 -0
  46. carnaval-0.1.0/profiles/acknowledge/profile.yaml +25 -0
  47. carnaval-0.1.0/profiles/email/README.md +12 -0
  48. carnaval-0.1.0/profiles/email/deny_lists/organizations.yaml +14 -0
  49. carnaval-0.1.0/profiles/email/fixtures/sample_email_vandelay.txt +24 -0
  50. carnaval-0.1.0/profiles/email/profile.yaml +17 -0
  51. carnaval-0.1.0/profiles/invoice/README.md +20 -0
  52. carnaval-0.1.0/profiles/invoice/deny_lists/organization_singleton.yaml +4 -0
  53. carnaval-0.1.0/profiles/invoice/deny_lists/organizations.yaml +14 -0
  54. carnaval-0.1.0/profiles/invoice/fixtures/sample_invoice_initech.txt +34 -0
  55. carnaval-0.1.0/profiles/invoice/profile.yaml +25 -0
  56. carnaval-0.1.0/profiles_private/README.md +34 -0
  57. carnaval-0.1.0/profiles_private/example_acknowledge/README.md +40 -0
  58. carnaval-0.1.0/pyproject.toml +87 -0
  59. carnaval-0.1.0/src/carnaval/__init__.py +8 -0
  60. carnaval-0.1.0/src/carnaval/cli/__init__.py +6 -0
  61. carnaval-0.1.0/src/carnaval/cli/anonymize.py +157 -0
  62. carnaval-0.1.0/src/carnaval/cli/reinject.py +98 -0
  63. carnaval-0.1.0/src/carnaval/core/__init__.py +3 -0
  64. carnaval-0.1.0/src/carnaval/core/config_loader.py +215 -0
  65. carnaval-0.1.0/src/carnaval/core/language_detector.py +52 -0
  66. carnaval-0.1.0/src/carnaval/core/logger.py +84 -0
  67. carnaval-0.1.0/src/carnaval/core/serializers.py +236 -0
  68. carnaval-0.1.0/src/carnaval/core/span.py +94 -0
  69. carnaval-0.1.0/src/carnaval/core/vault.py +135 -0
  70. carnaval-0.1.0/src/carnaval/pipeline.py +110 -0
  71. carnaval-0.1.0/src/carnaval/recognizers/__init__.py +7 -0
  72. carnaval-0.1.0/src/carnaval/recognizers/ai/__init__.py +3 -0
  73. carnaval-0.1.0/src/carnaval/recognizers/ai/gliner_engine.py +112 -0
  74. carnaval-0.1.0/src/carnaval/recognizers/base.py +208 -0
  75. carnaval-0.1.0/src/carnaval/recognizers/denylist/__init__.py +3 -0
  76. carnaval-0.1.0/src/carnaval/recognizers/denylist/organizations.py +69 -0
  77. carnaval-0.1.0/src/carnaval/recognizers/denylist/people.py +34 -0
  78. carnaval-0.1.0/src/carnaval/recognizers/denylist/places.py +47 -0
  79. carnaval-0.1.0/src/carnaval/recognizers/denylist/singleton.py +86 -0
  80. carnaval-0.1.0/src/carnaval/recognizers/dictionary/__init__.py +13 -0
  81. carnaval-0.1.0/src/carnaval/recognizers/dictionary/_loader.py +117 -0
  82. carnaval-0.1.0/src/carnaval/recognizers/dictionary/cities.py +44 -0
  83. carnaval-0.1.0/src/carnaval/recognizers/dictionary/firstnames.py +49 -0
  84. carnaval-0.1.0/src/carnaval/recognizers/regex/__init__.py +3 -0
  85. carnaval-0.1.0/src/carnaval/recognizers/regex/address/__init__.py +51 -0
  86. carnaval-0.1.0/src/carnaval/recognizers/regex/address/de.py +79 -0
  87. carnaval-0.1.0/src/carnaval/recognizers/regex/address/en.py +67 -0
  88. carnaval-0.1.0/src/carnaval/recognizers/regex/address/es.py +57 -0
  89. carnaval-0.1.0/src/carnaval/recognizers/regex/address/fr.py +149 -0
  90. carnaval-0.1.0/src/carnaval/recognizers/regex/address/it.py +59 -0
  91. carnaval-0.1.0/src/carnaval/recognizers/regex/address/pt.py +65 -0
  92. carnaval-0.1.0/src/carnaval/recognizers/regex/address_fr.py +24 -0
  93. carnaval-0.1.0/src/carnaval/recognizers/regex/context_location.py +216 -0
  94. carnaval-0.1.0/src/carnaval/recognizers/regex/email.py +23 -0
  95. carnaval-0.1.0/src/carnaval/recognizers/regex/fiscal_fr.py +95 -0
  96. carnaval-0.1.0/src/carnaval/recognizers/regex/header_source.py +36 -0
  97. carnaval-0.1.0/src/carnaval/recognizers/regex/iban_bic.py +96 -0
  98. carnaval-0.1.0/src/carnaval/recognizers/regex/name_patterns.py +23 -0
  99. carnaval-0.1.0/src/carnaval/recognizers/regex/names/__init__.py +51 -0
  100. carnaval-0.1.0/src/carnaval/recognizers/regex/names/de.py +236 -0
  101. carnaval-0.1.0/src/carnaval/recognizers/regex/names/en.py +79 -0
  102. carnaval-0.1.0/src/carnaval/recognizers/regex/names/es.py +77 -0
  103. carnaval-0.1.0/src/carnaval/recognizers/regex/names/fr.py +167 -0
  104. carnaval-0.1.0/src/carnaval/recognizers/regex/names/it.py +81 -0
  105. carnaval-0.1.0/src/carnaval/recognizers/regex/names/pt.py +72 -0
  106. carnaval-0.1.0/src/carnaval/recognizers/regex/org_suffix.py +125 -0
  107. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/__init__.py +51 -0
  108. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/de.py +40 -0
  109. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/en.py +63 -0
  110. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/es.py +50 -0
  111. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/fr.py +31 -0
  112. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/it.py +36 -0
  113. carnaval-0.1.0/src/carnaval/recognizers/regex/phone/pt.py +50 -0
  114. carnaval-0.1.0/src/carnaval/recognizers/regex/phone_fr.py +14 -0
  115. carnaval-0.1.0/src/carnaval/recognizers/regex/url.py +27 -0
  116. carnaval-0.1.0/src/carnaval/stages/__init__.py +7 -0
  117. carnaval-0.1.0/src/carnaval/stages/documents.py +96 -0
  118. carnaval-0.1.0/src/carnaval/stages/s1_intake.py +71 -0
  119. carnaval-0.1.0/src/carnaval/stages/s2_preprocess.py +79 -0
  120. carnaval-0.1.0/src/carnaval/stages/s3_detect.py +369 -0
  121. carnaval-0.1.0/src/carnaval/stages/s4_resolve.py +155 -0
  122. carnaval-0.1.0/src/carnaval/stages/s5_mask.py +137 -0
  123. carnaval-0.1.0/src/carnaval/stages/s6_output.py +113 -0
  124. carnaval-0.1.0/src/carnaval/stages/s7_reinject.py +174 -0
@@ -0,0 +1,36 @@
1
+ # Virtual env
2
+ .venv/
3
+ venv/
4
+ __pycache__/
5
+ *.py[cod]
6
+
7
+ # Tests / coverage
8
+ .pytest_cache/
9
+ .coverage
10
+ htmlcov/
11
+
12
+ # Local data
13
+ .env
14
+ inbox/*
15
+ !inbox/.gitkeep
16
+ !inbox/README.md
17
+ outbox/**/*
18
+ !outbox/**/.gitkeep
19
+
20
+ # Private profiles: everything is ignored EXCEPT the README and the
21
+ # example_acknowledge/ template, shipped as a reference to copy.
22
+ profiles_private/*
23
+ !profiles_private/README.md
24
+ !profiles_private/.gitkeep
25
+ !profiles_private/example_acknowledge/
26
+ !profiles_private/example_acknowledge/**
27
+
28
+ # IDE
29
+ .vscode/
30
+ .idea/
31
+ .DS_Store
32
+
33
+ # Dist
34
+ dist/
35
+ build/
36
+ *.egg-info/
@@ -0,0 +1,37 @@
1
+ # Changelog
2
+
3
+ All notable changes to this project are documented in this file.
4
+
5
+ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/),
6
+ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
7
+
8
+ ## [Unreleased]
9
+
10
+ ## [0.1.0] - 2026-05-15
11
+
12
+ ### Added
13
+
14
+ - Reversible PII anonymization pipeline (stages S1 to S7): intake,
15
+ preprocess, detect, resolve, mask, output, reinject.
16
+ - Encrypted vault (AES-256-GCM) mapping placeholders to original values,
17
+ enabling lossless reinjection of LLM responses.
18
+ - Multi-source recognizers:
19
+ - Regex recognizers for emails, phones, IBAN/BIC, URLs, fiscal IDs
20
+ (SIREN/SIRET/VAT), names and addresses, across several languages
21
+ (FR, EN, DE, ES, IT, PT).
22
+ - Deny-list recognizers for organizations, people, places, and a
23
+ singleton "parent organization" placeholder `[ORG]`.
24
+ - Dictionary recognizers for first names and cities.
25
+ - Optional GLiNER zero-shot NER recognizer (extra `ai`).
26
+ - Automatic language detection (lingua) with manual override.
27
+ - Business profiles (`acknowledge`, `invoice`, `email`) bundling deny
28
+ lists, allow lists, patterns and policies, plus fictional test fixtures.
29
+ - Private-profile support (`profiles_private/`) with a fictional
30
+ `example_acknowledge` template for user-specific deny lists.
31
+ - Multi-format output: TXT, JSON, JSONL, XML, CoNLL and HTML.
32
+ - Command-line tools `carnaval-anonymize` and `carnaval-reinject`.
33
+ - Structured logging via structlog.
34
+ - Packaging with hatchling, src layout, Apache-2.0 license.
35
+
36
+ [Unreleased]: https://github.com/carnaval-ai/carnaval/compare/v0.1.0...HEAD
37
+ [0.1.0]: https://github.com/carnaval-ai/carnaval/releases/tag/v0.1.0
carnaval-0.1.0/LICENSE ADDED
@@ -0,0 +1,15 @@
1
+ Apache License
2
+ Version 2.0, January 2004
3
+ http://www.apache.org/licenses/
4
+
5
+ Licensed under the Apache License, Version 2.0 (the "License");
6
+ you may not use this file except in compliance with the License.
7
+ You may obtain a copy of the License at
8
+
9
+ http://www.apache.org/licenses/LICENSE-2.0
10
+
11
+ Unless required by applicable law or agreed to in writing, software
12
+ distributed under the License is distributed on an "AS IS" BASIS,
13
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14
+ See the License for the specific language governing permissions and
15
+ limitations under the License.
carnaval-0.1.0/NOTICE ADDED
@@ -0,0 +1,34 @@
1
+ Carnaval
2
+ Copyright 2026 - Patrice AUBERT
3
+
4
+ This product includes software developed by the Carnaval project.
5
+
6
+ Licensed under the Apache License, Version 2.0 (the "License");
7
+ you may not use this software except in compliance with the License.
8
+ You may obtain a copy of the License at
9
+
10
+ http://www.apache.org/licenses/LICENSE-2.0
11
+
12
+ Unless required by applicable law or agreed to in writing, software
13
+ distributed under the License is distributed on an "AS IS" BASIS,
14
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ See the License for the specific language governing permissions and
16
+ limitations under the License.
17
+
18
+ -------------------------------------------------------------------------------
19
+
20
+ Third-party components
21
+
22
+ This software depends on the following open-source packages, each
23
+ distributed under its own license:
24
+
25
+ - pycryptodome (BSD / Public Domain)
26
+ - PyYAML (MIT)
27
+ - python-dotenv (BSD-3-Clause)
28
+ - structlog (Apache-2.0 / MIT)
29
+ - lingua-language-detector (Apache-2.0)
30
+ - gliner (Apache-2.0, optional extra "ai")
31
+
32
+ The bundled dictionaries under assets/dictionaries/ are derived from public
33
+ open-data sources (e.g. GeoNames, INSEE) and remain subject to their
34
+ respective open-data licenses.
@@ -0,0 +1,312 @@
1
+ Metadata-Version: 2.4
2
+ Name: carnaval
3
+ Version: 0.1.0
4
+ Summary: Reversible PII anonymization framework with an encrypted vault
5
+ Project-URL: Homepage, https://github.com/carnaval-ai/carnaval
6
+ Project-URL: Repository, https://github.com/carnaval-ai/carnaval
7
+ Project-URL: Documentation, https://github.com/carnaval-ai/carnaval/wiki
8
+ Project-URL: Issues, https://github.com/carnaval-ai/carnaval/issues
9
+ Project-URL: Changelog, https://github.com/carnaval-ai/carnaval/blob/main/CHANGELOG.md
10
+ Author-email: Patrice AUBERT <carnaval.oss@gmail.com>
11
+ Maintainer-email: Patrice AUBERT <carnaval.oss@gmail.com>
12
+ License-Expression: Apache-2.0
13
+ License-File: LICENSE
14
+ License-File: NOTICE
15
+ Keywords: anonymization,compliance,data-masking,data-protection,gdpr,llm,named-entity-recognition,ner,nlp,pii,pii-anonymization,privacy,pseudonymization
16
+ Classifier: Development Status :: 4 - Beta
17
+ Classifier: Intended Audience :: Developers
18
+ Classifier: License :: OSI Approved :: Apache Software License
19
+ Classifier: Programming Language :: Python :: 3
20
+ Classifier: Programming Language :: Python :: 3.11
21
+ Classifier: Programming Language :: Python :: 3.12
22
+ Classifier: Programming Language :: Python :: 3.13
23
+ Classifier: Topic :: Security
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Requires-Python: >=3.11
26
+ Requires-Dist: lingua-language-detector>=2.0
27
+ Requires-Dist: pycryptodome>=3.20
28
+ Requires-Dist: python-dotenv>=1.0
29
+ Requires-Dist: pyyaml>=6.0
30
+ Requires-Dist: structlog>=24.1
31
+ Provides-Extra: ai
32
+ Requires-Dist: gliner>=0.2; extra == 'ai'
33
+ Provides-Extra: dev
34
+ Requires-Dist: pytest-cov>=5.0; extra == 'dev'
35
+ Requires-Dist: pytest>=8.0; extra == 'dev'
36
+ Description-Content-Type: text/markdown
37
+
38
+ <p align="center">
39
+ <img src="assets/carnaval-mask.svg" width="240" alt="Carnaval mask">
40
+ </p>
41
+
42
+ <h1 align="center">Carnaval</h1>
43
+
44
+ <p align="center"><em>The art of the mask - hide the identity, keep the meaning.</em></p>
45
+
46
+ <p align="center">
47
+ <a href="LICENSE"><img src="https://img.shields.io/badge/License-Apache_2.0-blue.svg" alt="License: Apache 2.0"></a>
48
+ <a href="https://doi.org/10.5281/zenodo.20219604"><img src="https://zenodo.org/badge/DOI/10.5281/zenodo.20219604.svg" alt="DOI"></a>
49
+ <img src="https://img.shields.io/badge/python-3.11%2B-blue.svg" alt="Python 3.11+">
50
+ <img src="https://img.shields.io/badge/tests-passing-brightgreen.svg" alt="Tests">
51
+ <img src="https://img.shields.io/badge/status-functional%20POC-orange.svg" alt="Status">
52
+ </p>
53
+
54
+ **Carnaval** is an open-source Python framework for **reversible PII anonymization**.
55
+ It masks sensitive entities in text documents *before* they are sent to a cloud
56
+ LLM, then restores the original values in the structured response the LLM returns.
57
+
58
+ ---
59
+
60
+ ## The problem
61
+
62
+ You want to use a cloud LLM (Claude, GPT, Mistral, Gemini...) to process text
63
+ documents - order acknowledgements, invoices, business emails, contracts - but
64
+ those documents contain personal or confidential data that must never leave
65
+ your infrastructure in clear text.
66
+
67
+ ## The solution
68
+
69
+ ```
70
+ RAW DOCUMENT ──▶ [ Carnaval ] ──▶ MASKED DOCUMENT ──▶ Cloud LLM
71
+
72
+ FINAL DOCUMENT ◀── [ Carnaval ] ◀── JSON / XML response ◀──┘
73
+ ```
74
+
75
+ 1. **Before sending** - sensitive entities are replaced with placeholders such
76
+ as `[PERSON_1]`, `[EMAIL_2]`, `[ORG]`. The placeholder ↔ real-value mapping
77
+ is stored in an **encrypted local vault**.
78
+ 2. **After the response** - the original values are re-injected into the JSON
79
+ or XML structure returned by the LLM.
80
+
81
+ No sensitive data ever leaves your machine in clear text, and the LLM still receives a
82
+ coherent, structured document it can reason about.
83
+
84
+ ---
85
+
86
+ ## Key features
87
+
88
+ - **Reversible** - every masked entity maps to a unique placeholder; the mapping
89
+ lives in an AES-256-GCM encrypted vault.
90
+ - **Coherent** - the same value always receives the same placeholder within a
91
+ run, so the LLM can reason about cross-references.
92
+ - **Local-first** - no network calls to anonymize. The optional neural model
93
+ runs on your own machine.
94
+ - **10 entity types** - `PERSON`, `ORGANIZATION`, `LOCATION`, `EMAIL`, `PHONE`,
95
+ `IBAN`, `BIC`, `VAT`, `SIREN`/`SIRET`, `URL`.
96
+ - **Layered detection** - regex recognizers, deny lists, bundled dictionaries
97
+ (GeoNames cities, first names), and an optional zero-shot neural recognizer
98
+ (GLiNER).
99
+ - **Multilingual** - 6 languages: French, English, German, Spanish, Italian,
100
+ Portuguese.
101
+ - **Business profiles** - `acknowledge`, `invoice`, `email`, plus private
102
+ per-client profiles kept out of version control.
103
+ - **8 output formats** - TXT, JSON, JSONL, XML, CoNLL, HTML, encrypted vault,
104
+ audit metadata - all produced in a single pass.
105
+ - **CLI and library** - use the `anonymize.py` / `reinject.py` scripts, or
106
+ import `carnaval` directly into your Python code.
107
+
108
+ ---
109
+
110
+ ## Pipeline
111
+
112
+ Carnaval is built as **7 self-contained stages**, each with a clear
113
+ input → output contract:
114
+
115
+ ```
116
+ TXT ──▶ S1 Intake ──▶ S2 Preprocess ──▶ S3 Detect ──▶ S4 Resolve ──▶ S5 Mask ──▶ S6 Output
117
+ (read) (language, (recognizers) (dedup, (placeholders (8 formats)
118
+ normalize) arbitration) + vault)
119
+
120
+ JSON / XML ──▶ S7 Reinject ──▶ JSON / XML with original values restored
121
+ ```
122
+
123
+ See [Architecture](wiki/Architecture.md) for details on each stage.
124
+
125
+ ---
126
+
127
+ ## Installation
128
+
129
+ Requires **Python 3.11+** (tested on 3.13).
130
+
131
+ ```bash
132
+ git clone <repository-url>
133
+ cd carnaval
134
+
135
+ python -m venv .venv
136
+ # Windows PowerShell
137
+ .\.venv\Scripts\Activate.ps1
138
+ # Linux / macOS
139
+ source .venv/bin/activate
140
+
141
+ pip install -r requirements.txt
142
+ ```
143
+
144
+ The neural recognizer (GLiNER) is included in `requirements.txt`. The model
145
+ (~500 MB) is downloaded automatically on first use; afterwards Carnaval works
146
+ fully offline. See the [Installation guide](wiki/Installation.md) for an
147
+ offline / air-gapped setup.
148
+
149
+ ### Configure the vault password
150
+
151
+ ```bash
152
+ cp .env.example .env
153
+ ```
154
+
155
+ Then edit `.env` and set a strong secret (16 characters minimum, 32+ recommended):
156
+
157
+ ```
158
+ CARNAVAL_VAULT_PASSWORD=a-strong-randomly-generated-secret
159
+ ```
160
+
161
+ ---
162
+
163
+ ## Quickstart - CLI
164
+
165
+ ```bash
166
+ # 1. Anonymize a document
167
+ python anonymize.py inbox/order.txt --profile acknowledge
168
+
169
+ # 2. Send outbox/txt/order_anonymise.txt to your LLM, collect a JSON response
170
+
171
+ # 3. Re-inject the real values into the LLM response
172
+ python reinject.py response.json --vault outbox/vault/order_vault.enc
173
+ ```
174
+
175
+ `anonymize.py` produces, in one pass, all 8 output files under `outbox/`
176
+ (`txt/`, `json/`, `jsonl/`, `xml/`, `conll/`, `html/`, `vault/`, `meta/`).
177
+
178
+ Useful flags: `--no-gliner` (regex + deny lists only, faster),
179
+ `--gliner-threshold 0.6`, `--profile invoice`, `--private my_client`,
180
+ `--console` (human-readable logs).
181
+
182
+ ---
183
+
184
+ ## Quickstart - Python API
185
+
186
+ ```python
187
+ from pathlib import Path
188
+ from carnaval.pipeline import run_anonymization
189
+
190
+ masked, written, config = run_anonymization(
191
+ input_path=Path("inbox/order.txt"),
192
+ outbox_dir=Path("outbox"),
193
+ vault_password="a-strong-randomly-generated-secret",
194
+ profile="acknowledge",
195
+ use_gliner=True,
196
+ )
197
+
198
+ print(masked.anonymized_text) # text with placeholders
199
+ print(masked.by_category) # {'PERSON': 2, 'ORGANIZATION': 1, ...}
200
+ print(written.json_path) # path to the JSON output
201
+ ```
202
+
203
+ Re-injecting an LLM response:
204
+
205
+ ```python
206
+ from carnaval.core.vault import Vault
207
+ from carnaval.stages.s7_reinject import reinject_json_data
208
+
209
+ vault = Vault(password="a-strong-randomly-generated-secret",
210
+ path="outbox/vault/order_vault.enc")
211
+ vault.load()
212
+
213
+ llm_response = {"supplier": "[ORG_1]", "contact": "[PERSON_1]"}
214
+ restored = reinject_json_data(llm_response, vault)
215
+ # {"supplier": "Globex Inc.", "contact": "Jane Doe"}
216
+ ```
217
+
218
+ See the [Quickstart](wiki/Quickstart.md) and [Reinjection](wiki/Reinjection.md)
219
+ wiki pages for more.
220
+
221
+ ---
222
+
223
+ ## Security
224
+
225
+ The placeholder ↔ value mapping is stored in an encrypted vault:
226
+
227
+ | Property | Value |
228
+ |---|---|
229
+ | Symmetric cipher | AES-256-GCM (authenticated encryption) |
230
+ | Key derivation | PBKDF2-HMAC-SHA256, 600,000 iterations |
231
+ | Salt | 16 random bytes per file |
232
+ | Nonce | 16 random bytes per file |
233
+ | Integrity tag | 16 bytes - any tampering is detected on read |
234
+
235
+ Without the password, the vault is unreadable. Carnaval makes **no outbound
236
+ network calls** once the GLiNER model has been downloaded, and its structured
237
+ logger redacts sensitive keys by default. It supports GDPR-style
238
+ **pseudonymization** (Article 4(5)). See [Vault and Security](wiki/Vault-and-Security.md).
239
+
240
+ ---
241
+
242
+ ## Supported languages
243
+
244
+ French (FR), English (EN), German (DE), Spanish (ES), Italian (IT) and
245
+ Portuguese (PT). The language is auto-detected; mixed-language documents are
246
+ handled via in-text linguistic markers. See [Multilingual](wiki/Multilingual.md).
247
+
248
+ ---
249
+
250
+ ## Project status
251
+
252
+ Carnaval is a **functional proof of concept**. Core anonymization,
253
+ re-injection, the encrypted vault and the 8 output formats are implemented and
254
+ covered by an extensive automated test suite.
255
+
256
+ ## Testing
257
+
258
+ ```bash
259
+ pytest # full suite (skips slow neural tests)
260
+ pytest -m slow # real GLiNER tests (downloads the model)
261
+ pytest --cov=src/carnaval # with coverage
262
+ ```
263
+
264
+ ---
265
+
266
+ ## Documentation
267
+
268
+ The complete reference lives in the **[project wiki](wiki/Home.md)**:
269
+
270
+ - [Home](wiki/Home.md) - overview and table of contents
271
+ - [Installation](wiki/Installation.md)
272
+ - [Quickstart](wiki/Quickstart.md)
273
+ - [Architecture](wiki/Architecture.md)
274
+ - [Vault and Security](wiki/Vault-and-Security.md)
275
+ - [Profiles](wiki/Profiles.md)
276
+ - [Recognizers](wiki/Recognizers.md)
277
+ - [Multilingual](wiki/Multilingual.md)
278
+ - [Output Formats](wiki/Output-Formats.md)
279
+ - [Reinjection](wiki/Reinjection.md)
280
+ - [Troubleshooting](wiki/Troubleshooting.md)
281
+ - [Contributing](wiki/Contributing.md)
282
+
283
+ The original design notes are kept under [`docs/`](docs/).
284
+
285
+ ---
286
+
287
+ ## Contributing
288
+
289
+ Contributions are welcome - see [CONTRIBUTING.md](CONTRIBUTING.md) and our
290
+ [Code of Conduct](CODE_OF_CONDUCT.md). Please use only fictitious entities
291
+ (Acme Corp, Globex, Jane Doe, Springfield...) in public fixtures and examples.
292
+
293
+ ## Contact & Security
294
+
295
+ - General questions, conduct reports: **carnaval.oss@gmail.com**
296
+ - Bug reports and feature requests: GitHub issues
297
+ - Security vulnerabilities: please **do not** open a public issue - see
298
+ [SECURITY.md](SECURITY.md) for responsible disclosure.
299
+
300
+ ## Citation
301
+
302
+ If you use Carnaval in your work, please cite it via its archived DOI:
303
+
304
+ > Patrice AUBERT. *Carnaval: a reversible PII anonymization framework.*
305
+ > 2026. DOI: [10.5281/zenodo.20219604](https://doi.org/10.5281/zenodo.20219604)
306
+
307
+ A machine-readable [`CITATION.cff`](CITATION.cff) is included - GitHub turns it
308
+ into a **"Cite this repository"** button.
309
+
310
+ ## License
311
+
312
+ Carnaval is released under the **Apache License 2.0**. See [LICENSE](LICENSE).