gdpr-pseudonymizer 1.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (59) hide show
  1. gdpr_pseudonymizer-1.0.0/LICENSE +21 -0
  2. gdpr_pseudonymizer-1.0.0/PKG-INFO +632 -0
  3. gdpr_pseudonymizer-1.0.0/README.md +593 -0
  4. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/__init__.py +0 -0
  5. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/__init__.py +1 -0
  6. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/__init__.py +1 -0
  7. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/batch.py +756 -0
  8. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/config_show.py +488 -0
  9. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/destroy_table.py +351 -0
  10. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/export.py +253 -0
  11. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/import_mappings.py +263 -0
  12. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/init.py +146 -0
  13. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/list_mappings.py +319 -0
  14. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/process.py +271 -0
  15. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/stats.py +246 -0
  16. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/commands/validate_mappings.py +289 -0
  17. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/config.py +295 -0
  18. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/formatters.py +298 -0
  19. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/main.py +578 -0
  20. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/passphrase.py +93 -0
  21. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/progress.py +311 -0
  22. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/validation_stub.py +83 -0
  23. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/cli/validators.py +329 -0
  24. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/core/__init__.py +1 -0
  25. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/core/document_processor.py +757 -0
  26. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/core/naive_processor.py +121 -0
  27. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/__init__.py +1 -0
  28. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/database.py +285 -0
  29. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/encryption.py +244 -0
  30. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/models.py +143 -0
  31. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/naive_data.py +40 -0
  32. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/repositories/__init__.py +1 -0
  33. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/repositories/audit_repository.py +408 -0
  34. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/repositories/mapping_repository.py +405 -0
  35. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/data/repositories/metadata_repository.py +173 -0
  36. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/exceptions.py +100 -0
  37. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/__init__.py +0 -0
  38. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/entity_detector.py +101 -0
  39. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/entity_grouping.py +320 -0
  40. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/hybrid_detector.py +398 -0
  41. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/name_dictionary.py +123 -0
  42. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/regex_matcher.py +274 -0
  43. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/spacy_detector.py +163 -0
  44. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/nlp/stanza_detector.py +172 -0
  45. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/pseudonym/__init__.py +1 -0
  46. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/pseudonym/assignment_engine.py +437 -0
  47. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/pseudonym/library_manager.py +795 -0
  48. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/utils/__init__.py +1 -0
  49. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/utils/config_manager.py +180 -0
  50. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/utils/file_handler.py +119 -0
  51. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/utils/french_patterns.py +72 -0
  52. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/utils/logger.py +120 -0
  53. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/__init__.py +35 -0
  54. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/actions.py +134 -0
  55. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/context_precomputer.py +96 -0
  56. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/models.py +422 -0
  57. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/ui.py +460 -0
  58. gdpr_pseudonymizer-1.0.0/gdpr_pseudonymizer/validation/workflow.py +518 -0
  59. gdpr_pseudonymizer-1.0.0/pyproject.toml +203 -0
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 GDPR Pseudonymizer Team
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
@@ -0,0 +1,632 @@
1
+ Metadata-Version: 2.4
2
+ Name: gdpr-pseudonymizer
3
+ Version: 1.0.0
4
+ Summary: CLI tool for GDPR-compliant pseudonymization of French text documents using NLP-based entity detection and reversible mapping
5
+ License: MIT
6
+ License-File: LICENSE
7
+ Keywords: gdpr,pseudonymization,nlp,privacy,data-protection
8
+ Author: GDPR Pseudonymizer Team
9
+ Requires-Python: >=3.10,<3.14
10
+ Classifier: Development Status :: 4 - Beta
11
+ Classifier: Intended Audience :: Developers
12
+ Classifier: Intended Audience :: Legal Industry
13
+ Classifier: Intended Audience :: Science/Research
14
+ Classifier: License :: OSI Approved :: MIT License
15
+ Classifier: Natural Language :: French
16
+ Classifier: Operating System :: OS Independent
17
+ Classifier: Programming Language :: Python :: 3
18
+ Classifier: Programming Language :: Python :: 3.10
19
+ Classifier: Programming Language :: Python :: 3.11
20
+ Classifier: Programming Language :: Python :: 3.12
21
+ Classifier: Programming Language :: Python :: 3.13
22
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
23
+ Classifier: Topic :: Security
24
+ Classifier: Topic :: Text Processing :: Linguistic
25
+ Requires-Dist: PyYAML (>=6.0,<7.0)
26
+ Requires-Dist: SQLAlchemy (>=2.0.0,<3.0.0)
27
+ Requires-Dist: cryptography (>=44.0.1,<45.0.0)
28
+ Requires-Dist: markdown-it-py (>=3.0.0,<4.0.0)
29
+ Requires-Dist: readchar (>=4.2.0,<5.0.0)
30
+ Requires-Dist: rich (>=13.7.0,<14.0.0)
31
+ Requires-Dist: spacy (>=3.7.0,<4.0.0)
32
+ Requires-Dist: structlog (>=23.2.0,<24.0.0)
33
+ Requires-Dist: typer (>=0.9.0,<0.10.0)
34
+ Project-URL: Documentation, https://liochandayo.github.io/RGPDpseudonymizer/
35
+ Project-URL: Homepage, https://github.com/LioChanDaYo/RGPDpseudonymizer
36
+ Project-URL: Repository, https://github.com/LioChanDaYo/RGPDpseudonymizer
37
+ Description-Content-Type: text/markdown
38
+
39
+ # GDPR Pseudonymizer
40
+
41
+ [![PyPI version](https://img.shields.io/pypi/v/gdpr-pseudonymizer)](https://pypi.org/project/gdpr-pseudonymizer/)
42
+ [![Python versions](https://img.shields.io/pypi/pyversions/gdpr-pseudonymizer)](https://pypi.org/project/gdpr-pseudonymizer/)
43
+ [![License: MIT](https://img.shields.io/badge/License-MIT-blue.svg)](LICENSE)
44
+ [![CI](https://github.com/LioChanDaYo/RGPDpseudonymizer/actions/workflows/ci.yaml/badge.svg)](https://github.com/LioChanDaYo/RGPDpseudonymizer/actions/workflows/ci.yaml)
45
+ [![Docs](https://img.shields.io/badge/docs-GitHub%20Pages-blue)](https://liochandayo.github.io/RGPDpseudonymizer/)
46
+
47
+ **AI-Assisted Pseudonymization for French Documents with Human Verification**
48
+
49
+ Transform sensitive French documents for safe AI analysis with local processing, mandatory human review, and GDPR compliance.
50
+
51
+ ---
52
+
53
+ ## 🎯 Overview
54
+
55
+ GDPR Pseudonymizer is a **privacy-first CLI tool** that combines AI efficiency with human accuracy to pseudonymize French text documents. Unlike fully automatic tools or cloud services, we prioritize **zero false negatives** and **legal defensibility** through mandatory validation workflows.
56
+
57
+ **Perfect for:**
58
+ - 🏛️ **Privacy-conscious organizations** needing GDPR-compliant AI analysis
59
+ - 🎓 **Academic researchers** with ethics board requirements
60
+ - ⚖️ **Legal/HR teams** requiring defensible pseudonymization
61
+ - 🤖 **LLM users** who want to analyze confidential documents safely
62
+
63
+ ---
64
+
65
+ ## ✨ Key Features
66
+
67
+ ### 🔒 **Privacy-First Architecture**
68
+ - ✅ **100% local processing** - Your data never leaves your machine
69
+ - ✅ **No cloud dependencies** - Works completely offline after installation
70
+ - ✅ **Encrypted mapping tables** - AES-256-SIV encryption with PBKDF2 key derivation (210K iterations), passphrase-protected reversible pseudonymization
71
+ - ✅ **Zero telemetry** - No analytics, crash reporting, or external communication
72
+
73
+ ### 🤝 **AI + Human Verification**
74
+ - ✅ **Hybrid detection** - AI pre-detects 40-50% of entities (NLP + regex patterns)
75
+ - ✅ **Mandatory validation** - You review and confirm all entities (ensures 100% accuracy)
76
+ - ✅ **Fast validation UI** - Rich CLI interface with keyboard shortcuts, <2 min per document
77
+ - ✅ **Smart workflow** - Entity-by-type grouping (PERSON → ORG → LOCATION) with context display
78
+ - ✅ **Entity variant grouping** - Related forms ("Marie Dubois", "Pr. Dubois", "Dubois") merged into one validation item with "Also appears as:" display
79
+ - ✅ **Batch actions** - Confirm/reject multiple entities efficiently
80
+
81
+ ### 📊 **Batch Processing**
82
+ - ✅ **Consistent pseudonyms** - Same entity = same pseudonym across 10-100+ documents
83
+ - ✅ **Compositional matching** - "Marie Dubois" → "Leia Organa", "Marie" alone → "Leia"
84
+ - ✅ **Smart name handling** - Title stripping ("Dr. Marie Dubois" = "Marie Dubois"), compound names ("Jean-Pierre" treated as atomic)
85
+ - ✅ **Selective entity processing** - `--entity-types` flag to filter by type (e.g., `--entity-types PERSON,LOCATION`)
86
+ - ✅ **50%+ time savings** vs manual redaction (AI pre-detection + validation)
87
+
88
+ ### 🎭 **Themed Pseudonyms**
89
+ - ✅ **Readable output** - Star Wars, LOTR, or generic French names
90
+ - ✅ **Maintains context** - LLM analysis preserves 85% document utility (validated: 4.27/5.0)
91
+ - ✅ **Gender-preserving** - When NER provides gender classification (PERSON entities)
92
+ - ✅ **Full entity support** - PERSON, LOCATION, and ORGANIZATION pseudonyms for all themes
93
+
94
+ ---
95
+
96
+ ## 🚀 Quick Start
97
+
98
+ **Status:** 🎉 **v1.0.0 — Public Release** (February 2026)
99
+
100
+ ### Realistic Expectations for v1.0
101
+
102
+ **What v1.0 delivers:**
103
+ - 🤖 **AI-assisted detection** - Hybrid NLP + regex detects ~40-50% of entities automatically
104
+ - ✅ **Mandatory human verification** - You review and confirm all entities (2-3 min per document)
105
+ - 🔒 **100% accuracy guarantee** - Human validation ensures zero false negatives
106
+ - ⚡ **50%+ faster than manual** - Pre-detection saves time vs pure manual redaction
107
+
108
+ **What v1.0 does NOT deliver:**
109
+ - ❌ Fully automatic "set and forget" processing
110
+ - ❌ 85%+ AI accuracy (current: 40-50% with hybrid approach)
111
+ - ❌ Optional validation mode (validation is mandatory)
112
+
113
+ ### Roadmap
114
+
115
+ **v1.0 (MVP - Q1 2026):** AI-assisted CLI with mandatory validation
116
+ - Target: Privacy-conscious early adopters who value human oversight
117
+ - 100% local processing, encrypted mapping tables, audit trails
118
+
119
+ **v1.1 (Q2-Q3 2026):** Quick wins & GDPR compliance
120
+ - GDPR Right to Erasure: selective entity deletion (`delete-mapping` command, Article 17)
121
+ - Gender-aware pseudonym assignment for French names
122
+ - Beta feedback bug fixes and UX improvements
123
+
124
+ **v2.0 (Q3-Q4 2026):** GUI & broader accessibility
125
+ - Desktop GUI wrapping CLI core (drag-and-drop, visual entity review)
126
+ - Standalone executables (.exe for Windows, .app for macOS) — no Python required
127
+ - French-first UI with i18n architecture (multi-language ready)
128
+ - WCAG AA accessibility for professional/academic contexts
129
+ - Target: Non-technical users (HR, legal, compliance teams)
130
+
131
+ **v3.0 (2027+):** NLP accuracy & automation
132
+ - Fine-tuned French NER model (70-85% F1 target, up from 40-50%)
133
+ - Optional `--no-validate` flag for high-confidence workflows
134
+ - Confidence-based auto-processing (85%+ F1 target)
135
+ - Multi-language support (English, Spanish, German)
136
+
137
+ ---
138
+
139
+ ## ⚙️ Installation
140
+
141
+ See [Installation Guide](https://liochandayo.github.io/RGPDpseudonymizer/installation/) for detailed platform-specific instructions.
142
+
143
+ ### Prerequisites
144
+ - Python 3.10-3.12 (validated in CI/CD; 3.13 is installable but not yet tested)
145
+
146
+ ### Install from PyPI (Recommended)
147
+
148
+ ```bash
149
+ pip install gdpr-pseudonymizer
150
+
151
+ # Download spaCy French model (required - 571MB)
152
+ python -m spacy download fr_core_news_lg
153
+
154
+ # Verify installation
155
+ gdpr-pseudo --help
156
+ ```
157
+
158
+ ### Install from Source (Developer)
159
+
160
+ ```bash
161
+ # Clone repository
162
+ git clone https://github.com/LioChanDaYo/RGPDpseudonymizer.git
163
+ cd RGPDpseudonymizer
164
+
165
+ # Install dependencies via Poetry
166
+ pip install "poetry>=1.7.0"
167
+ poetry install
168
+
169
+ # Install spaCy French model
170
+ poetry run python -m spacy download fr_core_news_lg
171
+
172
+ # Verify installation
173
+ poetry run gdpr-pseudo --help
174
+ ```
175
+
176
+ ### Quick Test
177
+
178
+ ```bash
179
+ # Test on sample document
180
+ echo "Marie Dubois travaille à Paris pour Acme SA." > test.txt
181
+ gdpr-pseudo process test.txt
182
+
183
+ # Or specify custom output file
184
+ gdpr-pseudo process test.txt -o output.txt
185
+ ```
186
+
187
+ Expected output: "Leia Organa travaille à Coruscant pour Rebel Alliance."
188
+
189
+ ### Configuration File (Optional)
190
+
191
+ Generate a config template to customize default settings:
192
+
193
+ ```bash
194
+ # Generate .gdpr-pseudo.yaml template in current directory (omit "poetry run" when installed from PyPI)
195
+ poetry run gdpr-pseudo config --init
196
+
197
+ # View current effective configuration
198
+ poetry run gdpr-pseudo config
199
+ ```
200
+
201
+ Example `.gdpr-pseudo.yaml`:
202
+ ```yaml
203
+ database:
204
+ path: mappings.db
205
+
206
+ pseudonymization:
207
+ theme: star_wars # neutral, star_wars, lotr
208
+ model: spacy
209
+
210
+ batch:
211
+ workers: 4 # 1-8 (use 1 for interactive validation)
212
+ output_dir: null
213
+
214
+ logging:
215
+ level: INFO
216
+ ```
217
+
218
+ **Note:** Passphrase is never stored in config files (security). Use `GDPR_PSEUDO_PASSPHRASE` env var or interactive prompt. Minimum 12 characters required (NFR12).
219
+
220
+ ---
221
+
222
+ ## 📖 Documentation
223
+
224
+ **Documentation Site:** [https://liochandayo.github.io/RGPDpseudonymizer/](https://liochandayo.github.io/RGPDpseudonymizer/)
225
+
226
+ **For Users:**
227
+ - 📘 [Installation Guide](docs/installation.md) - Platform-specific installation instructions
228
+ - 📗 [Usage Tutorial](docs/tutorial.md) - Step-by-step usage tutorials
229
+ - 📕 [CLI Reference](docs/CLI-REFERENCE.md) - Complete command documentation
230
+ - 📕 [Methodology & Academic Citation](docs/methodology.md) - Technical approach and GDPR compliance
231
+ - ❓ [FAQ](docs/faq.md) - Common questions and answers
232
+ - 🔧 [Troubleshooting](docs/troubleshooting.md) - Error reference and solutions
233
+
234
+ **For Developers:**
235
+ - 📚 [API Reference](docs/api-reference.md) - Module documentation and extension points
236
+ - 🏗️ [Architecture Documentation](docs/architecture/) - Technical design
237
+ - 📊 [NLP Benchmark Report](docs/nlp-benchmark-report.md) - NER accuracy analysis
238
+ - 📊 [Performance Report](docs/qa/performance-stability-report.md) - NFR performance validation results
239
+
240
+ **For Stakeholders:**
241
+ - 🎨 [Positioning & Messaging](docs/positioning-messaging-v2-assisted.md)
242
+ - 📋 [Deliverables Summary](docs/DELIVERABLES-SUMMARY-2026-01-16.md)
243
+
244
+ ---
245
+
246
+ ## 🔬 Technical Details
247
+
248
+ ### NLP Library Selection (Story 1.2 - Completed)
249
+
250
+ After comprehensive benchmarking on 25 French interview/business documents (1,855 entities):
251
+
252
+ | Library | F1 Score | Precision | Recall | Decision |
253
+ |---------|----------|-----------|--------|----------|
254
+ | **spaCy** `fr_core_news_lg` | **29.5%** | 27.0% | 32.7% | ✅ **Selected** |
255
+ | **Stanza** `fr_default` | 11.9% | 10.3% | 14.1% | ❌ Rejected |
256
+
257
+ **Why both failed the 85% target:**
258
+ - Pre-trained models optimized for news text (not interview/business docs)
259
+ - Domain-specific language patterns (conversational, mixed registers)
260
+ - ORG detection catastrophic (3.8% precision = 96% false positives)
261
+
262
+ **Approved Solution:**
263
+ - ✅ **Hybrid approach** (NLP + regex) targets 40-50% F1
264
+ - ✅ **Mandatory validation** ensures 100% final accuracy
265
+ - 📅 **Fine-tuning** deferred to v3.0 (70-85% F1 target, requires training data from v1.x/v2.x user validations)
266
+
267
+ See full analysis: [docs/nlp-benchmark-report.md](docs/nlp-benchmark-report.md)
268
+
269
+ ### Validation Workflow (Story 1.7 - Complete)
270
+
271
+ The validation UI provides an intuitive keyboard-driven interface for reviewing detected entities:
272
+
273
+ **Features:**
274
+ - ✅ **Entity-by-type grouping** - Review PERSON → ORG → LOCATION in logical order
275
+ - ✅ **Context display** - See 10 words before/after each entity with highlighting
276
+ - ✅ **Confidence scores** - Color-coded confidence from spaCy NER (green >80%, yellow 60-80%, red <60%)
277
+ - ✅ **Keyboard shortcuts** - Single-key actions: [Space] Confirm, [R] Reject, [E] Modify, [A] Add, [C] Change pseudonym
278
+ - ✅ **Batch operations** - Accept/reject all entities of a type at once (Shift+A/R)
279
+ - ✅ **Help overlay** - Press [H] for full command reference
280
+ - ✅ **Performance** - <2 minutes for typical 20-30 entity documents
281
+
282
+ **Workflow Steps:**
283
+ 1. Summary screen (entity counts by type)
284
+ 2. Review entities by type with context
285
+ 3. Flag ambiguous entities for careful review
286
+ 4. Final confirmation with summary of changes
287
+ 5. Process document with validated entities
288
+
289
+ **Deduplication Feature (Story 1.9):** Duplicate entities grouped together - validate once, apply to all occurrences (66% time reduction for large docs)
290
+
291
+ **Entity Variant Grouping (Story 4.6):** Related entity forms automatically merged into single validation items. "Marie Dubois", "Pr. Dubois", and "Dubois" appear as one item with "Also appears as:" showing variant forms. Prevents Union-Find transitive bridging for ambiguous surnames shared by different people.
292
+
293
+ ---
294
+
295
+ ### Technology Stack
296
+
297
+ | Component | Technology | Version | Purpose |
298
+ |-----------|------------|---------|---------|
299
+ | **Runtime** | Python | 3.10-3.12 | Validated in CI/CD (3.13+ not yet tested) |
300
+ | **NLP Library** | spaCy | 3.8.0 | French entity detection (fr_core_news_lg) |
301
+ | **CLI Framework** | Typer | 0.9+ | Command-line interface |
302
+ | **Database** | SQLite | 3.35+ | Local mapping table storage with WAL mode |
303
+ | **Encryption** | cryptography (AESSIV) | 44.0+ | AES-256-SIV encryption for sensitive fields (PBKDF2 key derivation, passphrase-protected) |
304
+ | **ORM** | SQLAlchemy | 2.0+ | Database abstraction and session management |
305
+ | **Validation UI** | rich | 13.7+ | Interactive CLI entity review |
306
+ | **Keyboard Input** | readchar | 4.2+ | Single-keypress capture for validation UI |
307
+ | **Testing** | pytest | 7.4+ | Unit & integration testing |
308
+ | **CI/CD** | GitHub Actions | N/A | Automated testing (Windows/Mac/Linux) |
309
+
310
+ ---
311
+
312
+ ## 🤔 Why AI-Assisted Instead of Automatic?
313
+
314
+ **Short answer:** Privacy and compliance require human oversight.
315
+
316
+ **Long answer:**
317
+ 1. **GDPR defensibility** - Human verification provides legal audit trail
318
+ 2. **Zero false negatives** - AI misses entities, humans catch them (100% coverage)
319
+ 3. **Current NLP limitations** - French models on interview/business docs: 29.5% F1 out-of-box
320
+ 4. **Better than alternatives:**
321
+ - ✅ **vs Manual redaction:** 50%+ faster (AI pre-detection)
322
+ - ✅ **vs Cloud services:** 100% local processing (no data leakage)
323
+ - ✅ **vs Fully automatic tools:** 100% accuracy (human verification)
324
+
325
+ **User Perspective:**
326
+ > "I WANT human review for compliance reasons. The AI saves me time by pre-flagging entities, but I control the final decision." - Compliance Officer
327
+
328
+ ---
329
+
330
+ ## 🎯 Use Cases
331
+
332
+ ### 1. **Research Ethics Compliance**
333
+ **Scenario:** Academic researcher with 50 interview transcripts needing IRB approval
334
+
335
+ **Without GDPR Pseudonymizer:**
336
+ - ❌ Manual redaction: 16-25 hours
337
+ - ❌ Destroys document coherence for analysis
338
+ - ❌ Error-prone (human fatigue)
339
+
340
+ **With GDPR Pseudonymizer:**
341
+ - ✅ AI pre-detection: ~30 min processing
342
+ - ✅ Human validation: ~90 min review (50 docs × ~2 min each)
343
+ - ✅ Total: **2-3 hours** (85%+ time savings)
344
+ - ✅ Audit trail for ethics board
345
+
346
+ ---
347
+
348
+ ### 2. **HR Document Analysis**
349
+ **Scenario:** HR team analyzing employee feedback with ChatGPT
350
+
351
+ **Without GDPR Pseudonymizer:**
352
+ - ❌ Can't use ChatGPT (GDPR violation - employee names exposed)
353
+ - ❌ Manual analysis only (slow, limited insights)
354
+
355
+ **With GDPR Pseudonymizer:**
356
+ - ✅ Pseudonymize locally (employee names → pseudonyms)
357
+ - ✅ Send to ChatGPT safely (names replaced by pseudonyms; no directly identifying data exposed)
358
+ - ✅ Get AI insights while staying GDPR-compliant
359
+
360
+ ---
361
+
362
+ ### 3. **Legal Document Preparation**
363
+ **Scenario:** Law firm preparing case materials for AI legal research
364
+
365
+ **Without GDPR Pseudonymizer:**
366
+ - ❌ Cloud pseudonymization service (third-party risk)
367
+ - ❌ Manual redaction (expensive billable hours)
368
+
369
+ **With GDPR Pseudonymizer:**
370
+ - ✅ 100% local processing (client confidentiality)
371
+ - ✅ Human-verified accuracy (legal defensibility)
372
+ - ✅ Reversible mappings (can de-pseudonymize if needed)
373
+
374
+ ---
375
+
376
+ ## ⚖️ GDPR Compliance
377
+
378
+ ### How GDPR Pseudonymizer Supports Compliance
379
+
380
+ | GDPR Requirement | Implementation |
381
+ |------------------|----------------|
382
+ | **Art. 25 - Data Protection by Design** | Local processing, no cloud dependencies, encrypted storage |
383
+ | **Art. 30 - Processing Records** | Comprehensive audit logs (Story 2.5): operations table tracks timestamp, files processed, entity count, model version, theme, success/failure, processing time; JSON/CSV export for compliance reporting |
384
+ | **Art. 32 - Security Measures** | AES-256-SIV encryption with PBKDF2 key derivation (210,000 iterations), passphrase-protected storage, column-level encryption for sensitive fields |
385
+ | **Art. 35 - Privacy Impact Assessment** | Transparent methodology, cite-able approach for DPIA documentation |
386
+ | **Recital 26 - Pseudonymization** | Consistent pseudonym mapping, reversibility with passphrase |
387
+
388
+ ### What Pseudonymization Means (Legally)
389
+
390
+ **According to GDPR Article 4(5):**
391
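+ > "Pseudonymization means the processing of personal data in such a manner that the personal data can no longer be attributed to a specific data subject **without the use of additional information**, provided that such additional information is kept separately and is subject to technical and organisational measures to ensure that the personal data are not attributed to an identified or identifiable natural person."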
392
+
393
+ **GDPR Pseudonymizer approach:**
394
+ - ✅ **Personal data replaced:** Names, locations, organizations → pseudonyms
395
+ - ✅ **Separate storage:** Mapping table encrypted with passphrase (separate from documents)
396
+ - ✅ **Reversibility:** Authorized users can de-pseudonymize with passphrase
397
+ - ⚠️ **Note:** Pseudonymization reduces risk but **does NOT make data anonymous**
398
+
399
+ **Recommendation:** Consult your Data Protection Officer (DPO) for specific compliance guidance.
400
+
401
+ ---
402
+
403
+ ## 🛠️ Development Status
404
+
405
+ **All 4 MVP Epics Complete** — v1.0.0 released February 2026.
406
+
407
+ - ✅ **Epic 1:** Foundation & NLP Validation (9 stories) — spaCy integration, validation UI, hybrid detection, entity deduplication
408
+ - ✅ **Epic 2:** Core Pseudonymization Engine (9 stories) — pseudonym libraries, encryption, audit logging, batch processing, GDPR 1:1 mapping
409
+ - ✅ **Epic 3:** CLI Interface & Batch Processing (7 stories) — 8 CLI commands, progress reporting, config files, parallel batch, UX polish
410
+ - ✅ **Epic 4:** Launch Readiness (8 stories) — LLM utility validation, cross-platform testing, documentation, NER accuracy suite, performance validation, beta feedback integration, codebase refactoring, launch preparation
411
+ - **Total:** 33 stories, 1077+ tests, 86%+ coverage, all quality gates green
412
+
413
+ ---
414
+
415
+ ## 🤝 Contributing
416
+
417
+ We welcome contributions! See [CONTRIBUTING.md](CONTRIBUTING.md) for details on:
418
+ - Bug reports and feature requests
419
+ - Development setup and code quality requirements
420
+ - PR process and commit message format
421
+
422
+ Please read our [Code of Conduct](CODE_OF_CONDUCT.md) before participating.
423
+
424
+ ---
425
+
426
+ ## 📧 Contact & Support
427
+
428
+ **Project Lead:** Lionel Deveaux - [@LioChanDaYo](https://github.com/LioChanDaYo)
429
+
430
+ **For questions and support:**
431
+ - 💬 [GitHub Discussions](https://github.com/LioChanDaYo/RGPDpseudonymizer/discussions) — General questions, use cases
432
+ - 🐛 [GitHub Issues](https://github.com/LioChanDaYo/RGPDpseudonymizer/issues) — Bug reports, feature requests
433
+ - 📖 [SUPPORT.md](SUPPORT.md) — Full support process and self-help checklist
434
+
435
+ ---
436
+
437
+ ## 📜 License
438
+
439
+ This project is licensed under the [MIT License](LICENSE).
440
+
441
+ ---
442
+
443
+ ## 🙏 Acknowledgments
444
+
445
+ **Built with:**
446
+ - [spaCy](https://spacy.io/) - Industrial-strength NLP library
447
+ - [Typer](https://typer.tiangolo.com/) - Modern CLI framework
448
+ - [rich](https://rich.readthedocs.io/) - Beautiful CLI formatting
449
+
450
+ **Inspired by:**
451
+ - GDPR privacy-by-design principles
452
+ - Academic research ethics requirements
453
+ - Real-world need for safe AI document analysis
454
+
455
+ **Methodology:**
456
+ - Developed using [BMAD-METHOD™](https://bmad.ai) framework
457
+ - Interactive elicitation and multi-perspective validation
458
+
459
+ ---
460
+
461
+ ## ⚠️ Disclaimer
462
+
463
+ **GDPR Pseudonymizer is a tool to assist with GDPR compliance. It does NOT provide legal advice.**
464
+
465
+ **Important notes:**
466
+ - ⚠️ Pseudonymization reduces risk but is NOT anonymization
467
+ - ⚠️ You remain the data controller under GDPR
468
+ - ⚠️ Consult your DPO or legal counsel for compliance guidance
469
+ - ⚠️ Human validation is MANDATORY - do not skip review steps
470
+ - ⚠️ Test thoroughly before production use
471
+
472
+ **v1.0 MVP limitations:**
473
+ - AI detection: 40-50% baseline (not 85%+)
474
+ - Validation required for ALL documents (not optional)
475
+ - French language only (English, Spanish, etc. in future versions)
476
+ - Text formats only (.txt, .md - no PDF/DOCX in v1.0)
477
+
478
+ ---
479
+
480
+ ## 🧪 Testing
481
+
482
+ ### Running Tests
483
+
484
+ The project includes comprehensive unit and integration tests covering the validation workflow, NLP detection, and core functionality.
485
+
486
+ **Note for Windows users:** Due to known spaCy access violations on Windows ([spaCy issue #12659](https://github.com/explosion/spaCy/issues/12659)), Windows CI runs non-spaCy tests only. Full test suite runs on Linux/macOS.
487
+
488
+ **Run all tests:**
489
+ ```bash
490
+ poetry run pytest -v
491
+ ```
492
+
493
+ **Run only unit tests:**
494
+ ```bash
495
+ poetry run pytest tests/unit/ -v
496
+ ```
497
+
498
+ **Run only integration tests:**
499
+ ```bash
500
+ poetry run pytest tests/integration/ -v
501
+ ```
502
+
503
+ **Run accuracy validation tests (requires spaCy model):**
504
+ ```bash
505
+ poetry run pytest tests/accuracy/ -v -m accuracy -s
506
+ ```
507
+
508
+ **Run performance & stability tests (requires spaCy model):**
509
+ ```bash
510
+ # All performance tests (stability, memory, startup, stress)
511
+ poetry run pytest tests/performance/ -v -s -p no:benchmark --timeout=600
512
+
513
+ # Benchmark tests only (pytest-benchmark)
514
+ poetry run pytest tests/performance/ --benchmark-only -v -s
515
+ ```
516
+
517
+ **Run with coverage report:**
518
+ ```bash
519
+ poetry run pytest --cov=gdpr_pseudonymizer --cov-report=term-missing --cov-report=html
520
+ ```
521
+
522
+ **Run validation workflow integration tests specifically:**
523
+ ```bash
524
+ poetry run pytest tests/integration/test_validation_workflow_integration.py -v
525
+ ```
526
+
527
+ **Run quality checks:**
528
+ ```bash
529
+ # Code formatting check
530
+ poetry run black --check gdpr_pseudonymizer tests
531
+
532
+ # Format code automatically
533
+ poetry run black gdpr_pseudonymizer tests
534
+
535
+ # Linting check
536
+ poetry run ruff check gdpr_pseudonymizer tests
537
+
538
+ # Type checking
539
+ poetry run mypy gdpr_pseudonymizer
540
+ ```
541
+
542
+ **Run Windows-safe tests only (excludes spaCy-dependent tests):**
543
+ ```bash
544
+ # Run non-spaCy unit tests (follows Windows CI pattern)
545
+ poetry run pytest tests/unit/test_benchmark_nlp.py tests/unit/test_config_manager.py tests/unit/test_data_models.py tests/unit/test_file_handler.py tests/unit/test_logger.py tests/unit/test_naive_processor.py tests/unit/test_name_dictionary.py tests/unit/test_process_command.py tests/unit/test_project_config.py tests/unit/test_regex_matcher.py tests/unit/test_validation_models.py tests/unit/test_validation_stub.py -v
546
+
547
+ # Run validation workflow integration tests (Windows-safe)
548
+ poetry run pytest tests/integration/test_validation_workflow_integration.py -v
549
+ ```
550
+
551
+ ### Test Coverage
552
+
553
+ - **Unit tests:** 777 tests covering validation models, UI components, encryption, database operations, audit logging, progress tracking, and core logic
554
+ - **Integration tests:** 90 tests for end-to-end workflows including validation (Story 2.0.1), encrypted database operations (Story 2.4), compositional logic, and hybrid detection
555
+ - **Accuracy tests:** 22 tests validating NER accuracy against 25-document ground-truth corpus (Story 4.4)
556
+ - **Performance tests:** 15 tests validating all NFR targets — single-document benchmarks (NFR1), batch performance (NFR2), memory profiling (NFR4), startup time (NFR5), stability/error rate (NFR6), stress testing (Story 4.5)
557
+ - **Current coverage:** 86%+ across all modules (100% for progress module, 91.41% for AuditRepository)
558
+ - **Total tests:** 1077+ tests (post-refactoring baseline)
559
+ - **CI/CD:** Tests run on Python 3.10-3.12 across Windows, macOS, and Linux
560
+ - **Quality gates:** All pass (Black, Ruff, mypy, pytest)
561
+
562
+ ### Key Integration Test Scenarios
563
+
564
+ The integration test suite covers:
565
+
566
+ **Validation Workflow (19 tests):**
567
+ - ✅ Full workflow: entity detection → summary → review → confirmation
568
+ - ✅ User actions: confirm (Space), reject (R), modify (E), add entity (A), change pseudonym (C), context cycling (X)
569
+ - ✅ State transitions: PENDING → CONFIRMED/REJECTED/MODIFIED
570
+ - ✅ Entity deduplication with grouped review
571
+ - ✅ Edge cases: empty documents, large documents (320+ entities), Ctrl+C interruption, invalid input
572
+ - ✅ Batch operations: Accept All Type (Shift+A), Reject All Type (Shift+R) with confirmation prompts
573
+ - ✅ Mock user input: Full simulation of keyboard interactions and prompts
574
+
575
+ **Encrypted Database (9 tests):**
576
+ - ✅ End-to-end workflow: init → open → save → query → close
577
+ - ✅ Cross-session consistency: Same passphrase retrieves same data
578
+ - ✅ Idempotency: Multiple queries return same results
579
+ - ✅ Encrypted data at rest: Sensitive fields stored encrypted in SQLite
580
+ - ✅ Compositional logic integration: Encrypted component queries
581
+ - ✅ Repository integration: All repositories (mapping, audit, metadata) work with encrypted session
582
+ - ✅ Concurrent reads: WAL mode enables multiple readers
583
+ - ✅ Database indexes: Query performance optimization verified
584
+ - ✅ Batch save rollback: Transaction integrity on errors
585
+
586
+ ---
587
+
588
+ ## 📊 Project Metrics (As of 2026-02-11)
589
+
590
+ | Metric | Value | Status |
591
+ |--------|-------|--------|
592
+ | **Development Progress** | v1.0.0 Released | ✅ All 4 Epics Complete |
593
+ | **Stories Complete** | 33 (Epics 1-4) | ✅ All stories including 4.6.1 refactoring + 4.7 launch prep |
594
+ | **LLM Utility (NFR10)** | 4.27/5.0 (85.4%) | ✅ PASSED (threshold: 80%) |
595
+ | **Installation Success (NFR3)** | 87.5% (7/8 platforms) | ✅ PASSED (threshold: 85%) |
596
+ | **First Pseudonymization (NFR14)** | 100% within 30 min | ✅ PASSED (threshold: 80%) |
597
+ | **Critical Bugs Found** | 1 (Story 2.8) | ✅ RESOLVED - Epic 3 Unblocked |
598
+ | **Test Corpus Size** | 25 docs, 1,855 entities | ✅ Complete |
599
+ | **NLP Accuracy (Baseline)** | 29.5% F1 (spaCy) | ✅ Measured |
600
+ | **Hybrid Accuracy (NLP+Regex)** | 35.3% F1 (+52.2% PERSON) | ✅ Story 1.8 Complete |
601
+ | **Final Accuracy (AI+Human)** | 100% (validated) | 🎯 By Design |
602
+ | **Pseudonym Libraries** | 3 themes (2,426 names + 240 locations + 588 orgs) | ✅ Stories 2.1, 3.0, 4.6 Complete |
603
+ | **Compositional Matching** | Operational (component reuse + title stripping + compound names) | ✅ Stories 2.2, 2.3 Complete |
604
+ | **Batch Processing** | Architecture validated (multiprocessing.Pool, 1.17x-2.5x speedup) | ✅ Story 2.7 Complete |
605
+ | **Encrypted Storage** | AES-256-SIV with passphrase protection (PBKDF2 210K iterations) | ✅ Story 2.4 Complete |
606
+ | **Audit Logging** | GDPR Article 30 compliance (operations table + JSON/CSV export) | ✅ Story 2.5 Complete |
607
+ | **Validation UI** | Operational with deduplication | ✅ Stories 1.7, 1.9 Complete |
608
+ | **Validation Time** | <2 min (20-30 entities), <5 min (100 entities) | ✅ Targets Met |
609
+ | **Single-Doc Performance (NFR1)** | ~6s mean for 3.5K words | ✅ PASSED (<30s threshold, 80% headroom) |
610
+ | **Batch Performance (NFR2)** | ~5 min for 50 docs | ✅ PASSED (<30min threshold, 83% headroom) |
611
+ | **Memory Usage (NFR4)** | ~1 GB Python-tracked peak | ✅ PASSED (<8GB threshold) |
612
+ | **CLI Startup (NFR5)** | 0.56s (help), 6.0s (cold start w/ model) | ✅ PASSED (<5s threshold applies to CLI startup; the 6.0s cold start includes model loading) |
613
+ | **Error Rate (NFR6)** | ~0% unexpected errors | ✅ PASSED (<10% threshold) |
614
+ | **Test Coverage** | 1077+ tests (post-refactoring baseline), 86%+ coverage | ✅ All Quality Checks Pass |
615
+ | **Quality Gates** | Black, Ruff, mypy, pytest | ✅ All Pass (0 issues) |
616
+ | **Supported Languages** | French | 🇫🇷 v1.0 only |
617
+ | **Supported Formats** | .txt, .md | 📝 v1.0 scope |
618
+
619
+ ---
620
+
621
+ ## 🔗 Quick Links
622
+
623
+ - 📘 [Full PRD](docs/.ignore/prd.md) - Complete product requirements
624
+ - 📊 [Benchmark Report](docs/nlp-benchmark-report.md) - NLP accuracy analysis
625
+ - 🎨 [Positioning Strategy](docs/positioning-messaging-v2-assisted.md) - Marketing & messaging
626
+ - 🏗️ [Architecture Docs](docs/architecture/) - Technical design
627
+ - 📋 [Approval Checklist](docs/PM-APPROVAL-CHECKLIST.md) - PM decision tracker
628
+
629
+ ---
630
+
631
+ **Last Updated:** 2026-02-11 (v1.0.0 Public Release — all 4 MVP epics complete)
632
+