datafog 4.1.0.dev0__tar.gz → 4.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- datafog-4.2.0/PKG-INFO +523 -0
- datafog-4.2.0/README.md +435 -0
- datafog-4.2.0/datafog/__about__.py +1 -0
- datafog-4.2.0/datafog/__init__.py +249 -0
- datafog-4.2.0/datafog/__init___lean.py +189 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/client.py +97 -22
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/config.py +2 -3
- datafog-4.2.0/datafog/core.py +177 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/exceptions.py +3 -1
- datafog-4.2.0/datafog/main.py +236 -0
- datafog-4.2.0/datafog/main_lean.py +190 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/anonymizer.py +15 -4
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/spacy_nlp.py +1 -1
- datafog-4.2.0/datafog/processing/image_processing/donut_processor.py +165 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/spark_processing/pyspark_udfs.py +2 -6
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/__init__.py +2 -0
- datafog-4.2.0/datafog/processing/text_processing/gliner_annotator.py +206 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/regex_annotator/regex_annotator.py +3 -3
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/spacy_pii_annotator.py +3 -1
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/services/__init__.py +2 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/services/image_service.py +33 -4
- datafog-4.2.0/datafog/services/spark_service.py +99 -0
- datafog-4.2.0/datafog/services/text_service.py +402 -0
- datafog-4.2.0/datafog/services/text_service_lean.py +190 -0
- datafog-4.1.0.dev0/datafog/services/text_service.py → datafog-4.2.0/datafog/services/text_service_original.py +4 -4
- datafog-4.2.0/datafog.egg-info/PKG-INFO +523 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/SOURCES.txt +13 -3
- datafog-4.2.0/datafog.egg-info/entry_points.txt +2 -0
- datafog-4.2.0/datafog.egg-info/requires.txt +59 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/top_level.txt +0 -1
- datafog-4.2.0/setup.py +117 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_anonymizer.py +12 -12
- datafog-4.2.0/tests/test_cli_smoke.py +116 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_client.py +68 -115
- datafog-4.2.0/tests/test_donut_lazy_import.py +66 -0
- datafog-4.2.0/tests/test_gliner_annotator.py +467 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_main.py +146 -24
- datafog-4.2.0/tests/test_ocr_integration.py +127 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_regex_annotator.py +11 -3
- datafog-4.2.0/tests/test_spark_integration.py +80 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_text_service.py +128 -7
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_text_service_integration.py +5 -6
- datafog-4.1.0.dev0/PKG-INFO +0 -551
- datafog-4.1.0.dev0/README.md +0 -474
- datafog-4.1.0.dev0/datafog/__about__.py +0 -1
- datafog-4.1.0.dev0/datafog/processing/image_processing/donut_processor.py +0 -112
- datafog-4.1.0.dev0/datafog/services/spark_service.py +0 -49
- datafog-4.1.0.dev0/datafog.egg-info/PKG-INFO +0 -551
- datafog-4.1.0.dev0/datafog.egg-info/entry_points.txt +0 -2
- datafog-4.1.0.dev0/datafog.egg-info/requires.txt +0 -31
- datafog-4.1.0.dev0/setup.py +0 -87
- datafog-4.1.0.dev0/tests/__init__.py +0 -0
- datafog-4.1.0.dev0/tests/benchmark_text_service.py +0 -221
- datafog-4.1.0.dev0/tests/debug_spacy_entities.py +0 -20
- {datafog-4.1.0.dev0 → datafog-4.2.0}/LICENSE +0 -0
- /datafog-4.1.0.dev0/datafog/__init__.py → /datafog-4.2.0/datafog/__init___original.py +0 -0
- /datafog-4.1.0.dev0/datafog/main.py → /datafog-4.2.0/datafog/main_original.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/__init__.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/annotator.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/common.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/__init__.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/__init__.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/image_downloader.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/pytesseract_processor.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/spark_processing/__init__.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/regex_annotator/__init__.py +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/dependency_links.txt +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/setup.cfg +0 -0
- {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_image_service.py +0 -0
datafog-4.2.0/PKG-INFO
ADDED
|
@@ -0,0 +1,523 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: datafog
|
|
3
|
+
Version: 4.2.0
|
|
4
|
+
Summary: Lightning-fast PII detection and anonymization library with 190x performance advantage
|
|
5
|
+
Author: Sid Mohan
|
|
6
|
+
Author-email: sid@datafog.ai
|
|
7
|
+
Project-URL: Homepage, https://datafog.ai
|
|
8
|
+
Project-URL: Documentation, https://docs.datafog.ai
|
|
9
|
+
Project-URL: Discord, https://discord.gg/bzDth394R4
|
|
10
|
+
Project-URL: Twitter, https://twitter.com/datafoginc
|
|
11
|
+
Project-URL: GitHub, https://github.com/datafog/datafog-python
|
|
12
|
+
Keywords: pii detection anonymization privacy regex performance
|
|
13
|
+
Classifier: Development Status :: 4 - Beta
|
|
14
|
+
Classifier: Intended Audience :: Developers
|
|
15
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
16
|
+
Classifier: Programming Language :: Python :: 3
|
|
17
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
18
|
+
Classifier: Programming Language :: Python :: 3.11
|
|
19
|
+
Classifier: Programming Language :: Python :: 3.12
|
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
|
21
|
+
Classifier: Topic :: Text Processing
|
|
22
|
+
Classifier: Topic :: Security
|
|
23
|
+
Requires-Python: >=3.10,<3.13
|
|
24
|
+
Description-Content-Type: text/markdown
|
|
25
|
+
License-File: LICENSE
|
|
26
|
+
Requires-Dist: pydantic<3.0,>=2.0
|
|
27
|
+
Requires-Dist: pydantic-settings>=2.0.0
|
|
28
|
+
Requires-Dist: typing-extensions>=4.0
|
|
29
|
+
Provides-Extra: nlp
|
|
30
|
+
Requires-Dist: spacy<4.0,>=3.7.0; extra == "nlp"
|
|
31
|
+
Provides-Extra: nlp-advanced
|
|
32
|
+
Requires-Dist: gliner>=0.2.5; extra == "nlp-advanced"
|
|
33
|
+
Requires-Dist: torch<2.7,>=2.1.0; extra == "nlp-advanced"
|
|
34
|
+
Requires-Dist: transformers>=4.20.0; extra == "nlp-advanced"
|
|
35
|
+
Requires-Dist: huggingface-hub>=0.16.0; extra == "nlp-advanced"
|
|
36
|
+
Provides-Extra: ocr
|
|
37
|
+
Requires-Dist: pytesseract>=0.3.0; extra == "ocr"
|
|
38
|
+
Requires-Dist: Pillow>=10.0.0; extra == "ocr"
|
|
39
|
+
Requires-Dist: sentencepiece>=0.2.0; extra == "ocr"
|
|
40
|
+
Requires-Dist: protobuf>=4.0.0; extra == "ocr"
|
|
41
|
+
Provides-Extra: distributed
|
|
42
|
+
Requires-Dist: pandas>=2.0.0; extra == "distributed"
|
|
43
|
+
Requires-Dist: numpy>=1.24.0; extra == "distributed"
|
|
44
|
+
Provides-Extra: web
|
|
45
|
+
Requires-Dist: fastapi>=0.100.0; extra == "web"
|
|
46
|
+
Requires-Dist: aiohttp>=3.8.0; extra == "web"
|
|
47
|
+
Requires-Dist: requests>=2.30.0; extra == "web"
|
|
48
|
+
Provides-Extra: cli
|
|
49
|
+
Requires-Dist: typer>=0.12.0; extra == "cli"
|
|
50
|
+
Requires-Dist: pydantic-settings>=2.0.0; extra == "cli"
|
|
51
|
+
Provides-Extra: crypto
|
|
52
|
+
Requires-Dist: cryptography>=40.0.0; extra == "crypto"
|
|
53
|
+
Provides-Extra: dev
|
|
54
|
+
Requires-Dist: pytest>=7.0.0; extra == "dev"
|
|
55
|
+
Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
|
|
56
|
+
Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
|
|
57
|
+
Requires-Dist: sphinx>=7.0.0; extra == "dev"
|
|
58
|
+
Provides-Extra: all
|
|
59
|
+
Requires-Dist: spacy<4.0,>=3.7.0; extra == "all"
|
|
60
|
+
Requires-Dist: gliner>=0.2.5; extra == "all"
|
|
61
|
+
Requires-Dist: torch<2.7,>=2.1.0; extra == "all"
|
|
62
|
+
Requires-Dist: transformers>=4.20.0; extra == "all"
|
|
63
|
+
Requires-Dist: huggingface-hub>=0.16.0; extra == "all"
|
|
64
|
+
Requires-Dist: pytesseract>=0.3.0; extra == "all"
|
|
65
|
+
Requires-Dist: Pillow>=10.0.0; extra == "all"
|
|
66
|
+
Requires-Dist: sentencepiece>=0.2.0; extra == "all"
|
|
67
|
+
Requires-Dist: protobuf>=4.0.0; extra == "all"
|
|
68
|
+
Requires-Dist: pandas>=2.0.0; extra == "all"
|
|
69
|
+
Requires-Dist: numpy>=1.24.0; extra == "all"
|
|
70
|
+
Requires-Dist: fastapi>=0.100.0; extra == "all"
|
|
71
|
+
Requires-Dist: aiohttp>=3.8.0; extra == "all"
|
|
72
|
+
Requires-Dist: requests>=2.30.0; extra == "all"
|
|
73
|
+
Requires-Dist: typer>=0.12.0; extra == "all"
|
|
74
|
+
Requires-Dist: pydantic-settings>=2.0.0; extra == "all"
|
|
75
|
+
Requires-Dist: cryptography>=40.0.0; extra == "all"
|
|
76
|
+
Dynamic: author
|
|
77
|
+
Dynamic: author-email
|
|
78
|
+
Dynamic: classifier
|
|
79
|
+
Dynamic: description
|
|
80
|
+
Dynamic: description-content-type
|
|
81
|
+
Dynamic: keywords
|
|
82
|
+
Dynamic: license-file
|
|
83
|
+
Dynamic: project-url
|
|
84
|
+
Dynamic: provides-extra
|
|
85
|
+
Dynamic: requires-dist
|
|
86
|
+
Dynamic: requires-python
|
|
87
|
+
Dynamic: summary
|
|
88
|
+
|
|
89
|
+
# DataFog: PII Detection & Anonymization
|
|
90
|
+
|
|
91
|
+
<p align="center">
|
|
92
|
+
<a href="https://www.datafog.ai"><img src="public/colorlogo.png" alt="DataFog logo" width="300"></a>
|
|
93
|
+
</p>
|
|
94
|
+
|
|
95
|
+
<p align="center">
|
|
96
|
+
<b>Fast processing • Production-ready • Simple configuration</b>
|
|
97
|
+
</p>
|
|
98
|
+
|
|
99
|
+
<p align="center">
|
|
100
|
+
<a href="https://pypi.org/project/datafog/"><img src="https://img.shields.io/pypi/v/datafog.svg?style=flat-square" alt="PyPI version"></a>
|
|
101
|
+
<a href="https://pypi.org/project/datafog/"><img src="https://img.shields.io/pypi/pyversions/datafog.svg?style=flat-square" alt="PyPI pyversions"></a>
|
|
102
|
+
<a href="https://github.com/datafog/datafog-python"><img src="https://img.shields.io/github/stars/datafog/datafog-python.svg?style=flat-square&logo=github&label=Stars&logoColor=white" alt="GitHub stars"></a>
|
|
103
|
+
<a href="https://pypistats.org/packages/datafog"><img src="https://img.shields.io/pypi/dm/datafog.svg?style=flat-square" alt="PyPI downloads"></a>
|
|
104
|
+
<a href="https://github.com/datafog/datafog-python/actions/workflows/tests.yml"><img src="https://github.com/datafog/datafog-python/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
|
|
105
|
+
<a href="https://github.com/datafog/datafog-python/actions/workflows/benchmark.yml"><img src="https://github.com/datafog/datafog-python/actions/workflows/benchmark.yml/badge.svg" alt="Benchmarks"></a>
|
|
106
|
+
</p>
|
|
107
|
+
|
|
108
|
+
---
|
|
109
|
+
|
|
110
|
+
## Overview
|
|
111
|
+
|
|
112
|
+
DataFog provides efficient PII detection using a pattern-first approach that processes text significantly faster than traditional NLP methods while maintaining high accuracy.
|
|
113
|
+
|
|
114
|
+
```python
|
|
115
|
+
# Basic usage example
|
|
116
|
+
from datafog import DataFog
|
|
117
|
+
results = DataFog().scan_text("John's email is john@example.com and SSN is 123-45-6789")
|
|
118
|
+
```
|
|
119
|
+
|
|
120
|
+
### Performance Comparison
|
|
121
|
+
|
|
122
|
+
| Engine | 10KB Text Processing | Relative Speed | Accuracy |
|
|
123
|
+
| -------------------- | -------------------- | --------------- | ----------------- |
|
|
124
|
+
| **DataFog (Regex)** | ~2.4ms | **190x faster** | High (structured) |
|
|
125
|
+
| **DataFog (GLiNER)** | ~15ms | **32x faster** | Very High |
|
|
126
|
+
| **DataFog (Smart)** | ~3-15ms | **60x faster** | Highest |
|
|
127
|
+
| spaCy | ~459ms | baseline | Good |
|
|
128
|
+
|
|
129
|
+
_Performance measured on a 13.3KB business document. GLiNER provides excellent accuracy for named entities while maintaining a speed advantage._
|
|
130
|
+
|
|
131
|
+
### Supported PII Types
|
|
132
|
+
|
|
133
|
+
| Type | Examples | Use Cases |
|
|
134
|
+
| ---------------- | ------------------- | ---------------------- |
|
|
135
|
+
| **Email** | john@company.com | Contact scrubbing |
|
|
136
|
+
| **Phone** | (555) 123-4567 | Call log anonymization |
|
|
137
|
+
| **SSN** | 123-45-6789 | HR data protection |
|
|
138
|
+
| **Credit Cards** | 4111-1111-1111-1111 | Payment processing |
|
|
139
|
+
| **IP Addresses** | 192.168.1.1 | Network log cleaning |
|
|
140
|
+
| **Dates** | 01/01/1990 | Birthdate removal |
|
|
141
|
+
| **ZIP Codes** | 12345-6789 | Location anonymization |
|
|
142
|
+
|
|
143
|
+
---
|
|
144
|
+
|
|
145
|
+
## Quick Start
|
|
146
|
+
|
|
147
|
+
### Installation
|
|
148
|
+
|
|
149
|
+
```bash
|
|
150
|
+
# Lightweight core (fast regex-based PII detection)
|
|
151
|
+
pip install datafog
|
|
152
|
+
|
|
153
|
+
# With advanced ML models for better accuracy
|
|
154
|
+
pip install datafog[nlp] # spaCy for advanced NLP
|
|
155
|
+
pip install datafog[nlp-advanced] # GLiNER for modern NER
|
|
156
|
+
pip install datafog[ocr] # Image processing with OCR
|
|
157
|
+
pip install datafog[all] # Everything included
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
### Basic Usage
|
|
161
|
+
|
|
162
|
+
**Detect PII in text:**
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from datafog import DataFog
|
|
166
|
+
|
|
167
|
+
# Simple detection (uses fast regex engine)
|
|
168
|
+
detector = DataFog()
|
|
169
|
+
text = "Contact John Doe at john.doe@company.com or (555) 123-4567"
|
|
170
|
+
results = detector.scan_text(text)
|
|
171
|
+
print(results)
|
|
172
|
+
# Finds: emails, phone numbers, and more
|
|
173
|
+
|
|
174
|
+
# Modern NER with GLiNER (requires: pip install datafog[nlp-advanced])
|
|
175
|
+
from datafog.services import TextService
|
|
176
|
+
gliner_service = TextService(engine="gliner")
|
|
177
|
+
result = gliner_service.annotate_text_sync("Dr. John Smith works at General Hospital")
|
|
178
|
+
# Detects: PERSON, ORGANIZATION with high accuracy
|
|
179
|
+
|
|
180
|
+
# Best of both worlds: Smart cascading (recommended for production)
|
|
181
|
+
smart_service = TextService(engine="smart")
|
|
182
|
+
result = smart_service.annotate_text_sync("Contact john@company.com or call (555) 123-4567")
|
|
183
|
+
# Uses regex for structured PII (fast), GLiNER for entities (accurate)
|
|
184
|
+
```
|
|
185
|
+
|
|
186
|
+
**Anonymize on the fly:**
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
# Redact sensitive data
|
|
190
|
+
redacted = DataFog(operations=["scan", "redact"]).process_text(
|
|
191
|
+
"My SSN is 123-45-6789 and email is john@example.com"
|
|
192
|
+
)
|
|
193
|
+
print(redacted)
|
|
194
|
+
# Output: "My SSN is [REDACTED] and email is [REDACTED]"
|
|
195
|
+
|
|
196
|
+
# Replace with fake data
|
|
197
|
+
replaced = DataFog(operations=["scan", "replace"]).process_text(
|
|
198
|
+
"Call me at (555) 123-4567"
|
|
199
|
+
)
|
|
200
|
+
print(replaced)
|
|
201
|
+
# Output: "Call me at [PHONE_A1B2C3]"
|
|
202
|
+
```
|
|
203
|
+
|
|
204
|
+
**Process images with OCR:**
|
|
205
|
+
|
|
206
|
+
```python
|
|
207
|
+
import asyncio
|
|
208
|
+
from datafog import DataFog
|
|
209
|
+
|
|
210
|
+
async def scan_document():
|
|
211
|
+
ocr_scanner = DataFog(operations=["extract", "scan"])
|
|
212
|
+
results = await ocr_scanner.run_ocr_pipeline([
|
|
213
|
+
"https://example.com/document.png"
|
|
214
|
+
])
|
|
215
|
+
return results
|
|
216
|
+
|
|
217
|
+
# Extract text and find PII in images
|
|
218
|
+
results = asyncio.run(scan_document())
|
|
219
|
+
```
|
|
220
|
+
|
|
221
|
+
---
|
|
222
|
+
|
|
223
|
+
## Advanced Features
|
|
224
|
+
|
|
225
|
+
### Engine Selection
|
|
226
|
+
|
|
227
|
+
Choose the appropriate engine for your needs:
|
|
228
|
+
|
|
229
|
+
```python
|
|
230
|
+
from datafog.services import TextService
|
|
231
|
+
|
|
232
|
+
# Regex: Fast, pattern-based (recommended for speed)
|
|
233
|
+
regex_service = TextService(engine="regex")
|
|
234
|
+
|
|
235
|
+
# spaCy: Traditional NLP with broad entity recognition
|
|
236
|
+
spacy_service = TextService(engine="spacy")
|
|
237
|
+
|
|
238
|
+
# GLiNER: Modern ML model optimized for NER (requires nlp-advanced extra)
|
|
239
|
+
gliner_service = TextService(engine="gliner")
|
|
240
|
+
|
|
241
|
+
# Smart: Cascading approach - regex → GLiNER → spaCy (best accuracy/speed balance)
|
|
242
|
+
smart_service = TextService(engine="smart")
|
|
243
|
+
|
|
244
|
+
# Auto: Regex → spaCy fallback (legacy)
|
|
245
|
+
auto_service = TextService(engine="auto")
|
|
246
|
+
```
|
|
247
|
+
|
|
248
|
+
**Performance & Accuracy Guide:**
|
|
249
|
+
|
|
250
|
+
| Engine | Speed | Accuracy | Use Case | Install Requirements |
|
|
251
|
+
| -------- | ----------- | -------- | ------------------------------- | ----------------------------------- |
|
|
252
|
+
| `regex` | 🚀 Fastest | Good | Structured PII (emails, phones) | Core only |
|
|
253
|
+
| `gliner` | ⚡ Fast | Better | Modern NER, custom entities | `pip install datafog[nlp-advanced]` |
|
|
254
|
+
| `spacy` | 🐌 Slower | Good | Traditional NLP entities | `pip install datafog[nlp]` |
|
|
255
|
+
| `smart` | ⚡ Balanced | Best | Combines all approaches | `pip install datafog[nlp-advanced]` |
|
|
256
|
+
|
|
257
|
+
**Model Management:**
|
|
258
|
+
|
|
259
|
+
```python
|
|
260
|
+
# Download specific GLiNER models
|
|
261
|
+
import subprocess
|
|
262
|
+
|
|
263
|
+
# PII-specialized model (recommended)
|
|
264
|
+
subprocess.run(["datafog", "download-model", "urchade/gliner_multi_pii-v1", "--engine", "gliner"])
|
|
265
|
+
|
|
266
|
+
# General-purpose model
|
|
267
|
+
subprocess.run(["datafog", "download-model", "urchade/gliner_base", "--engine", "gliner"])
|
|
268
|
+
|
|
269
|
+
# List available models
|
|
270
|
+
subprocess.run(["datafog", "list-models", "--engine", "gliner"])
|
|
271
|
+
```
|
|
272
|
+
|
|
273
|
+
### Anonymization Options
|
|
274
|
+
|
|
275
|
+
```python
|
|
276
|
+
from datafog import DataFog
|
|
277
|
+
from datafog.models.anonymizer import AnonymizerType, HashType
|
|
278
|
+
|
|
279
|
+
# Hash with different algorithms
|
|
280
|
+
hasher = DataFog(
|
|
281
|
+
operations=["scan", "hash"],
|
|
282
|
+
hash_type=HashType.SHA256 # or MD5, SHA3_256
|
|
283
|
+
)
|
|
284
|
+
|
|
285
|
+
# Target specific entity types only
|
|
286
|
+
selective = DataFog(
|
|
287
|
+
operations=["scan", "redact"],
|
|
288
|
+
entities=["EMAIL", "PHONE"] # Only process these types
|
|
289
|
+
)
|
|
290
|
+
```
|
|
291
|
+
|
|
292
|
+
### Batch Processing
|
|
293
|
+
|
|
294
|
+
```python
|
|
295
|
+
documents = [
|
|
296
|
+
"Document 1 with PII...",
|
|
297
|
+
"Document 2 with more data...",
|
|
298
|
+
"Document 3..."
|
|
299
|
+
]
|
|
300
|
+
|
|
301
|
+
# Process multiple documents efficiently
|
|
302
|
+
results = DataFog().batch_process(documents)
|
|
303
|
+
```
|
|
304
|
+
|
|
305
|
+
---
|
|
306
|
+
|
|
307
|
+
## Performance Benchmarks
|
|
308
|
+
|
|
309
|
+
Performance comparison with alternatives:
|
|
310
|
+
|
|
311
|
+
### Speed Comparison (10KB text)
|
|
312
|
+
|
|
313
|
+
```
|
|
314
|
+
DataFog Pattern: 4ms ████████████████████████████████ 123x faster
|
|
315
|
+
spaCy: 480ms ██ baseline
|
|
316
|
+
```
|
|
317
|
+
|
|
318
|
+
### Engine Selection Guide
|
|
319
|
+
|
|
320
|
+
| Scenario | Recommended Engine | Why |
|
|
321
|
+
| -------------------------- | ------------------ | ------------------------------------- |
|
|
322
|
+
| **High-volume processing** | `regex`            | Maximum speed, consistent performance |
|
|
323
|
+
| **Unknown entity types** | `spacy` | Broader entity recognition |
|
|
324
|
+
| **General purpose** | `auto` | Smart fallback, best of both worlds |
|
|
325
|
+
| **Real-time applications** | `regex`            | Sub-millisecond processing            |
|
|
326
|
+
|
|
327
|
+
---
|
|
328
|
+
|
|
329
|
+
## CLI Usage
|
|
330
|
+
|
|
331
|
+
DataFog includes a command-line interface:
|
|
332
|
+
|
|
333
|
+
```bash
|
|
334
|
+
# Scan text for PII
|
|
335
|
+
datafog scan-text "John's email is john@example.com"
|
|
336
|
+
|
|
337
|
+
# Process images
|
|
338
|
+
datafog scan-image document.png --operations extract,scan
|
|
339
|
+
|
|
340
|
+
# Anonymize data
|
|
341
|
+
datafog redact-text "My phone is (555) 123-4567"
|
|
342
|
+
datafog replace-text "SSN: 123-45-6789"
|
|
343
|
+
datafog hash-text "Email: john@company.com" --hash-type sha256
|
|
344
|
+
|
|
345
|
+
# Utility commands
|
|
346
|
+
datafog health
|
|
347
|
+
datafog list-entities
|
|
348
|
+
datafog show-config
|
|
349
|
+
```
|
|
350
|
+
|
|
351
|
+
---
|
|
352
|
+
|
|
353
|
+
## Features
|
|
354
|
+
|
|
355
|
+
### Security & Compliance
|
|
356
|
+
|
|
357
|
+
- Detection of regulated data types for GDPR/CCPA compliance
|
|
358
|
+
- Audit trails for tracking detection and anonymization
|
|
359
|
+
- Configurable detection thresholds
|
|
360
|
+
|
|
361
|
+
### Scalability
|
|
362
|
+
|
|
363
|
+
- Batch processing for handling multiple documents
|
|
364
|
+
- Memory-efficient processing for large files
|
|
365
|
+
- Async support for non-blocking operations
|
|
366
|
+
|
|
367
|
+
### Integration Example
|
|
368
|
+
|
|
369
|
+
```python
|
|
370
|
+
# FastAPI middleware example
|
|
371
|
+
from fastapi import FastAPI
|
|
372
|
+
from datafog import DataFog
|
|
373
|
+
|
|
374
|
+
app = FastAPI()
|
|
375
|
+
detector = DataFog()
|
|
376
|
+
|
|
377
|
+
@app.middleware("http")
|
|
378
|
+
async def redact_pii_middleware(request, call_next):
|
|
379
|
+
# Automatically scan/redact request data
|
|
380
|
+
pass
|
|
381
|
+
```
|
|
382
|
+
|
|
383
|
+
---
|
|
384
|
+
|
|
385
|
+
## Common Use Cases
|
|
386
|
+
|
|
387
|
+
### Enterprise
|
|
388
|
+
|
|
389
|
+
- Log sanitization
|
|
390
|
+
- Data migration with PII handling
|
|
391
|
+
- Compliance reporting and audits
|
|
392
|
+
|
|
393
|
+
### Data Science
|
|
394
|
+
|
|
395
|
+
- Dataset preparation and anonymization
|
|
396
|
+
- Privacy-preserving analytics
|
|
397
|
+
- Research compliance
|
|
398
|
+
|
|
399
|
+
### Development
|
|
400
|
+
|
|
401
|
+
- Test data generation
|
|
402
|
+
- Code review for PII detection
|
|
403
|
+
- API security validation
|
|
404
|
+
|
|
405
|
+
---
|
|
406
|
+
|
|
407
|
+
## Installation & Setup
|
|
408
|
+
|
|
409
|
+
### Basic Installation
|
|
410
|
+
|
|
411
|
+
```bash
|
|
412
|
+
pip install datafog
|
|
413
|
+
```
|
|
414
|
+
|
|
415
|
+
### Development Setup
|
|
416
|
+
|
|
417
|
+
```bash
|
|
418
|
+
git clone https://github.com/datafog/datafog-python
|
|
419
|
+
cd datafog-python
|
|
420
|
+
python -m venv .venv
|
|
421
|
+
source .venv/bin/activate # On Windows: .venv\Scripts\activate
|
|
422
|
+
pip install -r requirements-dev.txt
|
|
423
|
+
just setup
|
|
424
|
+
```
|
|
425
|
+
|
|
426
|
+
### Docker Usage
|
|
427
|
+
|
|
428
|
+
```dockerfile
|
|
429
|
+
FROM python:3.10-slim
|
|
430
|
+
RUN pip install datafog
|
|
431
|
+
COPY . .
|
|
432
|
+
CMD ["python", "your_script.py"]
|
|
433
|
+
```
|
|
434
|
+
|
|
435
|
+
---
|
|
436
|
+
|
|
437
|
+
## Contributing
|
|
438
|
+
|
|
439
|
+
Contributions are welcome in the form of:
|
|
440
|
+
|
|
441
|
+
- Bug reports
|
|
442
|
+
- Feature requests
|
|
443
|
+
- Documentation improvements
|
|
444
|
+
- New regex patterns for PII detection
|
|
445
|
+
- Performance improvements
|
|
446
|
+
|
|
447
|
+
### Quick Contribution Guide
|
|
448
|
+
|
|
449
|
+
```bash
|
|
450
|
+
# Setup development environment
|
|
451
|
+
git clone https://github.com/datafog/datafog-python
|
|
452
|
+
cd datafog-python
|
|
453
|
+
just setup
|
|
454
|
+
|
|
455
|
+
# Run tests
|
|
456
|
+
just test
|
|
457
|
+
|
|
458
|
+
# Format code
|
|
459
|
+
just format
|
|
460
|
+
|
|
461
|
+
# Submit PR
|
|
462
|
+
git checkout -b feature/your-improvement
|
|
463
|
+
# Make your changes
|
|
464
|
+
git commit -m "Add your improvement"
|
|
465
|
+
git push origin feature/your-improvement
|
|
466
|
+
```
|
|
467
|
+
|
|
468
|
+
See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
|
|
469
|
+
|
|
470
|
+
---
|
|
471
|
+
|
|
472
|
+
## Benchmarking & Performance
|
|
473
|
+
|
|
474
|
+
### Run Benchmarks Locally
|
|
475
|
+
|
|
476
|
+
```bash
|
|
477
|
+
# Install benchmark dependencies
|
|
478
|
+
pip install pytest-benchmark
|
|
479
|
+
|
|
480
|
+
# Run performance tests
|
|
481
|
+
pytest tests/benchmark_text_service.py -v
|
|
482
|
+
|
|
483
|
+
# Compare with baseline
|
|
484
|
+
bash scripts/run_benchmark_locally.sh
|
|
485
|
+
```
|
|
486
|
+
|
|
487
|
+
### Continuous Performance Monitoring
|
|
488
|
+
|
|
489
|
+
Our CI pipeline:
|
|
490
|
+
|
|
491
|
+
- Runs benchmarks on every PR
|
|
492
|
+
- Compares against baseline performance
|
|
493
|
+
- Fails builds if performance degrades >10%
|
|
494
|
+
- Tracks performance trends over time
|
|
495
|
+
|
|
496
|
+
---
|
|
497
|
+
|
|
498
|
+
## Documentation & Support
|
|
499
|
+
|
|
500
|
+
| Resource | Link |
|
|
501
|
+
| --------------------- | --------------------------------------------------------------------------- |
|
|
502
|
+
| **Documentation** | [docs.datafog.ai](https://docs.datafog.ai) |
|
|
503
|
+
| **Community Discord** | [Join here](https://discord.gg/bzDth394R4) |
|
|
504
|
+
| **Bug Reports** | [GitHub Issues](https://github.com/datafog/datafog-python/issues) |
|
|
505
|
+
| **Feature Requests** | [GitHub Discussions](https://github.com/datafog/datafog-python/discussions) |
|
|
506
|
+
| **Support** | [hi@datafog.ai](mailto:hi@datafog.ai) |
|
|
507
|
+
|
|
508
|
+
---
|
|
509
|
+
|
|
510
|
+
## License & Acknowledgments
|
|
511
|
+
|
|
512
|
+
DataFog is released under the [MIT License](LICENSE).
|
|
513
|
+
|
|
514
|
+
**Built with:**
|
|
515
|
+
|
|
516
|
+
- Pattern optimization for efficient processing
|
|
517
|
+
- spaCy integration for NLP capabilities
|
|
518
|
+
- Tesseract & Donut for OCR capabilities
|
|
519
|
+
- Pydantic for data validation
|
|
520
|
+
|
|
521
|
+
---
|
|
522
|
+
|
|
523
|
+
[GitHub](https://github.com/datafog/datafog-python) • [Documentation](https://docs.datafog.ai) • [Discord](https://discord.gg/bzDth394R4)
|