datafog 4.1.0.dev0__tar.gz → 4.2.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69) hide show
  1. datafog-4.2.0/PKG-INFO +523 -0
  2. datafog-4.2.0/README.md +435 -0
  3. datafog-4.2.0/datafog/__about__.py +1 -0
  4. datafog-4.2.0/datafog/__init__.py +249 -0
  5. datafog-4.2.0/datafog/__init___lean.py +189 -0
  6. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/client.py +97 -22
  7. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/config.py +2 -3
  8. datafog-4.2.0/datafog/core.py +177 -0
  9. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/exceptions.py +3 -1
  10. datafog-4.2.0/datafog/main.py +236 -0
  11. datafog-4.2.0/datafog/main_lean.py +190 -0
  12. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/anonymizer.py +15 -4
  13. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/spacy_nlp.py +1 -1
  14. datafog-4.2.0/datafog/processing/image_processing/donut_processor.py +165 -0
  15. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/spark_processing/pyspark_udfs.py +2 -6
  16. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/__init__.py +2 -0
  17. datafog-4.2.0/datafog/processing/text_processing/gliner_annotator.py +206 -0
  18. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/regex_annotator/regex_annotator.py +3 -3
  19. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/spacy_pii_annotator.py +3 -1
  20. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/services/__init__.py +2 -0
  21. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/services/image_service.py +33 -4
  22. datafog-4.2.0/datafog/services/spark_service.py +99 -0
  23. datafog-4.2.0/datafog/services/text_service.py +402 -0
  24. datafog-4.2.0/datafog/services/text_service_lean.py +190 -0
  25. datafog-4.1.0.dev0/datafog/services/text_service.py → datafog-4.2.0/datafog/services/text_service_original.py +4 -4
  26. datafog-4.2.0/datafog.egg-info/PKG-INFO +523 -0
  27. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/SOURCES.txt +13 -3
  28. datafog-4.2.0/datafog.egg-info/entry_points.txt +2 -0
  29. datafog-4.2.0/datafog.egg-info/requires.txt +59 -0
  30. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/top_level.txt +0 -1
  31. datafog-4.2.0/setup.py +117 -0
  32. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_anonymizer.py +12 -12
  33. datafog-4.2.0/tests/test_cli_smoke.py +116 -0
  34. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_client.py +68 -115
  35. datafog-4.2.0/tests/test_donut_lazy_import.py +66 -0
  36. datafog-4.2.0/tests/test_gliner_annotator.py +467 -0
  37. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_main.py +146 -24
  38. datafog-4.2.0/tests/test_ocr_integration.py +127 -0
  39. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_regex_annotator.py +11 -3
  40. datafog-4.2.0/tests/test_spark_integration.py +80 -0
  41. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_text_service.py +128 -7
  42. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_text_service_integration.py +5 -6
  43. datafog-4.1.0.dev0/PKG-INFO +0 -551
  44. datafog-4.1.0.dev0/README.md +0 -474
  45. datafog-4.1.0.dev0/datafog/__about__.py +0 -1
  46. datafog-4.1.0.dev0/datafog/processing/image_processing/donut_processor.py +0 -112
  47. datafog-4.1.0.dev0/datafog/services/spark_service.py +0 -49
  48. datafog-4.1.0.dev0/datafog.egg-info/PKG-INFO +0 -551
  49. datafog-4.1.0.dev0/datafog.egg-info/entry_points.txt +0 -2
  50. datafog-4.1.0.dev0/datafog.egg-info/requires.txt +0 -31
  51. datafog-4.1.0.dev0/setup.py +0 -87
  52. datafog-4.1.0.dev0/tests/__init__.py +0 -0
  53. datafog-4.1.0.dev0/tests/benchmark_text_service.py +0 -221
  54. datafog-4.1.0.dev0/tests/debug_spacy_entities.py +0 -20
  55. {datafog-4.1.0.dev0 → datafog-4.2.0}/LICENSE +0 -0
  56. /datafog-4.1.0.dev0/datafog/__init__.py → /datafog-4.2.0/datafog/__init___original.py +0 -0
  57. /datafog-4.1.0.dev0/datafog/main.py → /datafog-4.2.0/datafog/main_original.py +0 -0
  58. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/__init__.py +0 -0
  59. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/annotator.py +0 -0
  60. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/models/common.py +0 -0
  61. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/__init__.py +0 -0
  62. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/__init__.py +0 -0
  63. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/image_downloader.py +0 -0
  64. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/image_processing/pytesseract_processor.py +0 -0
  65. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/spark_processing/__init__.py +0 -0
  66. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog/processing/text_processing/regex_annotator/__init__.py +0 -0
  67. {datafog-4.1.0.dev0 → datafog-4.2.0}/datafog.egg-info/dependency_links.txt +0 -0
  68. {datafog-4.1.0.dev0 → datafog-4.2.0}/setup.cfg +0 -0
  69. {datafog-4.1.0.dev0 → datafog-4.2.0}/tests/test_image_service.py +0 -0
datafog-4.2.0/PKG-INFO ADDED
@@ -0,0 +1,523 @@
1
+ Metadata-Version: 2.4
2
+ Name: datafog
3
+ Version: 4.2.0
4
+ Summary: Lightning-fast PII detection and anonymization library with 190x performance advantage
5
+ Author: Sid Mohan
6
+ Author-email: sid@datafog.ai
7
+ Project-URL: Homepage, https://datafog.ai
8
+ Project-URL: Documentation, https://docs.datafog.ai
9
+ Project-URL: Discord, https://discord.gg/bzDth394R4
10
+ Project-URL: Twitter, https://twitter.com/datafoginc
11
+ Project-URL: GitHub, https://github.com/datafog/datafog-python
12
+ Keywords: pii detection anonymization privacy regex performance
13
+ Classifier: Development Status :: 4 - Beta
14
+ Classifier: Intended Audience :: Developers
15
+ Classifier: License :: OSI Approved :: Apache Software License
16
+ Classifier: Programming Language :: Python :: 3
17
+ Classifier: Programming Language :: Python :: 3.10
18
+ Classifier: Programming Language :: Python :: 3.11
19
+ Classifier: Programming Language :: Python :: 3.12
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing
22
+ Classifier: Topic :: Security
23
+ Requires-Python: >=3.10,<3.13
24
+ Description-Content-Type: text/markdown
25
+ License-File: LICENSE
26
+ Requires-Dist: pydantic<3.0,>=2.0
27
+ Requires-Dist: pydantic-settings>=2.0.0
28
+ Requires-Dist: typing-extensions>=4.0
29
+ Provides-Extra: nlp
30
+ Requires-Dist: spacy<4.0,>=3.7.0; extra == "nlp"
31
+ Provides-Extra: nlp-advanced
32
+ Requires-Dist: gliner>=0.2.5; extra == "nlp-advanced"
33
+ Requires-Dist: torch<2.7,>=2.1.0; extra == "nlp-advanced"
34
+ Requires-Dist: transformers>=4.20.0; extra == "nlp-advanced"
35
+ Requires-Dist: huggingface-hub>=0.16.0; extra == "nlp-advanced"
36
+ Provides-Extra: ocr
37
+ Requires-Dist: pytesseract>=0.3.0; extra == "ocr"
38
+ Requires-Dist: Pillow>=10.0.0; extra == "ocr"
39
+ Requires-Dist: sentencepiece>=0.2.0; extra == "ocr"
40
+ Requires-Dist: protobuf>=4.0.0; extra == "ocr"
41
+ Provides-Extra: distributed
42
+ Requires-Dist: pandas>=2.0.0; extra == "distributed"
43
+ Requires-Dist: numpy>=1.24.0; extra == "distributed"
44
+ Provides-Extra: web
45
+ Requires-Dist: fastapi>=0.100.0; extra == "web"
46
+ Requires-Dist: aiohttp>=3.8.0; extra == "web"
47
+ Requires-Dist: requests>=2.30.0; extra == "web"
48
+ Provides-Extra: cli
49
+ Requires-Dist: typer>=0.12.0; extra == "cli"
50
+ Requires-Dist: pydantic-settings>=2.0.0; extra == "cli"
51
+ Provides-Extra: crypto
52
+ Requires-Dist: cryptography>=40.0.0; extra == "crypto"
53
+ Provides-Extra: dev
54
+ Requires-Dist: pytest>=7.0.0; extra == "dev"
55
+ Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
56
+ Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
57
+ Requires-Dist: sphinx>=7.0.0; extra == "dev"
58
+ Provides-Extra: all
59
+ Requires-Dist: spacy<4.0,>=3.7.0; extra == "all"
60
+ Requires-Dist: gliner>=0.2.5; extra == "all"
61
+ Requires-Dist: torch<2.7,>=2.1.0; extra == "all"
62
+ Requires-Dist: transformers>=4.20.0; extra == "all"
63
+ Requires-Dist: huggingface-hub>=0.16.0; extra == "all"
64
+ Requires-Dist: pytesseract>=0.3.0; extra == "all"
65
+ Requires-Dist: Pillow>=10.0.0; extra == "all"
66
+ Requires-Dist: sentencepiece>=0.2.0; extra == "all"
67
+ Requires-Dist: protobuf>=4.0.0; extra == "all"
68
+ Requires-Dist: pandas>=2.0.0; extra == "all"
69
+ Requires-Dist: numpy>=1.24.0; extra == "all"
70
+ Requires-Dist: fastapi>=0.100.0; extra == "all"
71
+ Requires-Dist: aiohttp>=3.8.0; extra == "all"
72
+ Requires-Dist: requests>=2.30.0; extra == "all"
73
+ Requires-Dist: typer>=0.12.0; extra == "all"
74
+ Requires-Dist: pydantic-settings>=2.0.0; extra == "all"
75
+ Requires-Dist: cryptography>=40.0.0; extra == "all"
76
+ Dynamic: author
77
+ Dynamic: author-email
78
+ Dynamic: classifier
79
+ Dynamic: description
80
+ Dynamic: description-content-type
81
+ Dynamic: keywords
82
+ Dynamic: license-file
83
+ Dynamic: project-url
84
+ Dynamic: provides-extra
85
+ Dynamic: requires-dist
86
+ Dynamic: requires-python
87
+ Dynamic: summary
88
+
89
+ # DataFog: PII Detection & Anonymization
90
+
91
+ <p align="center">
92
+ <a href="https://www.datafog.ai"><img src="public/colorlogo.png" alt="DataFog logo" width="300"></a>
93
+ </p>
94
+
95
+ <p align="center">
96
+ <b>Fast processing • Production-ready • Simple configuration</b>
97
+ </p>
98
+
99
+ <p align="center">
100
+ <a href="https://pypi.org/project/datafog/"><img src="https://img.shields.io/pypi/v/datafog.svg?style=flat-square" alt="PyPI version"></a>
101
+ <a href="https://pypi.org/project/datafog/"><img src="https://img.shields.io/pypi/pyversions/datafog.svg?style=flat-square" alt="PyPI pyversions"></a>
102
+ <a href="https://github.com/datafog/datafog-python"><img src="https://img.shields.io/github/stars/datafog/datafog-python.svg?style=flat-square&logo=github&label=Stars&logoColor=white" alt="GitHub stars"></a>
103
+ <a href="https://pypistats.org/packages/datafog"><img src="https://img.shields.io/pypi/dm/datafog.svg?style=flat-square" alt="PyPI downloads"></a>
104
+ <a href="https://github.com/datafog/datafog-python/actions/workflows/tests.yml"><img src="https://github.com/datafog/datafog-python/actions/workflows/tests.yml/badge.svg" alt="Tests"></a>
105
+ <a href="https://github.com/datafog/datafog-python/actions/workflows/benchmark.yml"><img src="https://github.com/datafog/datafog-python/actions/workflows/benchmark.yml/badge.svg" alt="Benchmarks"></a>
106
+ </p>
107
+
108
+ ---
109
+
110
+ ## Overview
111
+
112
+ DataFog provides efficient PII detection using a pattern-first approach that processes text significantly faster than traditional NLP methods while maintaining high accuracy.
113
+
114
+ ```python
115
+ # Basic usage example
116
+ from datafog import DataFog
117
+ results = DataFog().scan_text("John's email is john@example.com and SSN is 123-45-6789")
118
+ ```
119
+
120
+ ### Performance Comparison
121
+
122
+ | Engine | 10KB Text Processing | Relative Speed | Accuracy |
123
+ | -------------------- | -------------------- | --------------- | ----------------- |
124
+ | **DataFog (Regex)** | ~2.4ms | **190x faster** | High (structured) |
125
+ | **DataFog (GLiNER)** | ~15ms | **32x faster** | Very High |
126
+ | **DataFog (Smart)** | ~3-15ms | **60x faster** | Highest |
127
+ | spaCy | ~459ms | baseline | Good |
128
+
129
+ _Performance measured on 13.3KB business document. GLiNER provides excellent accuracy for named entities while maintaining speed advantage._
130
+
131
+ ### Supported PII Types
132
+
133
+ | Type | Examples | Use Cases |
134
+ | ---------------- | ------------------- | ---------------------- |
135
+ | **Email** | john@company.com | Contact scrubbing |
136
+ | **Phone** | (555) 123-4567 | Call log anonymization |
137
+ | **SSN** | 123-45-6789 | HR data protection |
138
+ | **Credit Cards** | 4111-1111-1111-1111 | Payment processing |
139
+ | **IP Addresses** | 192.168.1.1 | Network log cleaning |
140
+ | **Dates** | 01/01/1990 | Birthdate removal |
141
+ | **ZIP Codes** | 12345-6789 | Location anonymization |
142
+
143
+ ---
144
+
145
+ ## Quick Start
146
+
147
+ ### Installation
148
+
149
+ ```bash
150
+ # Lightweight core (fast regex-based PII detection)
151
+ pip install datafog
152
+
153
+ # With advanced ML models for better accuracy
154
+ pip install datafog[nlp] # spaCy for advanced NLP
155
+ pip install datafog[nlp-advanced] # GLiNER for modern NER
156
+ pip install datafog[ocr] # Image processing with OCR
157
+ pip install datafog[all] # Everything included
158
+ ```
159
+
160
+ ### Basic Usage
161
+
162
+ **Detect PII in text:**
163
+
164
+ ```python
165
+ from datafog import DataFog
166
+
167
+ # Simple detection (uses fast regex engine)
168
+ detector = DataFog()
169
+ text = "Contact John Doe at john.doe@company.com or (555) 123-4567"
170
+ results = detector.scan_text(text)
171
+ print(results)
172
+ # Finds: emails, phone numbers, and more
173
+
174
+ # Modern NER with GLiNER (requires: pip install datafog[nlp-advanced])
175
+ from datafog.services import TextService
176
+ gliner_service = TextService(engine="gliner")
177
+ result = gliner_service.annotate_text_sync("Dr. John Smith works at General Hospital")
178
+ # Detects: PERSON, ORGANIZATION with high accuracy
179
+
180
+ # Best of both worlds: Smart cascading (recommended for production)
181
+ smart_service = TextService(engine="smart")
182
+ result = smart_service.annotate_text_sync("Contact john@company.com or call (555) 123-4567")
183
+ # Uses regex for structured PII (fast), GLiNER for entities (accurate)
184
+ ```
185
+
186
+ **Anonymize on the fly:**
187
+
188
+ ```python
189
+ # Redact sensitive data
190
+ redacted = DataFog(operations=["scan", "redact"]).process_text(
191
+ "My SSN is 123-45-6789 and email is john@example.com"
192
+ )
193
+ print(redacted)
194
+ # Output: "My SSN is [REDACTED] and email is [REDACTED]"
195
+
196
+ # Replace with fake data
197
+ replaced = DataFog(operations=["scan", "replace"]).process_text(
198
+ "Call me at (555) 123-4567"
199
+ )
200
+ print(replaced)
201
+ # Output: "Call me at [PHONE_A1B2C3]"
202
+ ```
203
+
204
+ **Process images with OCR:**
205
+
206
+ ```python
207
+ import asyncio
208
+ from datafog import DataFog
209
+
210
+ async def scan_document():
211
+     ocr_scanner = DataFog(operations=["extract", "scan"])
212
+     results = await ocr_scanner.run_ocr_pipeline([
213
+         "https://example.com/document.png"
214
+     ])
215
+     return results
216
+
217
+ # Extract text and find PII in images
218
+ results = asyncio.run(scan_document())
219
+ ```
220
+
221
+ ---
222
+
223
+ ## Advanced Features
224
+
225
+ ### Engine Selection
226
+
227
+ Choose the appropriate engine for your needs:
228
+
229
+ ```python
230
+ from datafog.services import TextService
231
+
232
+ # Regex: Fast, pattern-based (recommended for speed)
233
+ regex_service = TextService(engine="regex")
234
+
235
+ # spaCy: Traditional NLP with broad entity recognition
236
+ spacy_service = TextService(engine="spacy")
237
+
238
+ # GLiNER: Modern ML model optimized for NER (requires nlp-advanced extra)
239
+ gliner_service = TextService(engine="gliner")
240
+
241
+ # Smart: Cascading approach - regex → GLiNER → spaCy (best accuracy/speed balance)
242
+ smart_service = TextService(engine="smart")
243
+
244
+ # Auto: Regex → spaCy fallback (legacy)
245
+ auto_service = TextService(engine="auto")
246
+ ```
247
+
248
+ **Performance & Accuracy Guide:**
249
+
250
+ | Engine | Speed | Accuracy | Use Case | Install Requirements |
251
+ | -------- | ----------- | -------- | ------------------------------- | ----------------------------------- |
252
+ | `regex` | 🚀 Fastest | Good | Structured PII (emails, phones) | Core only |
253
+ | `gliner` | ⚡ Fast | Better | Modern NER, custom entities | `pip install datafog[nlp-advanced]` |
254
+ | `spacy` | 🐌 Slower | Good | Traditional NLP entities | `pip install datafog[nlp]` |
255
+ | `smart` | ⚡ Balanced | Best | Combines all approaches | `pip install datafog[nlp-advanced]` |
256
+
257
+ **Model Management:**
258
+
259
+ ```python
260
+ # Download specific GLiNER models
261
+ import subprocess
262
+
263
+ # PII-specialized model (recommended)
264
+ subprocess.run(["datafog", "download-model", "urchade/gliner_multi_pii-v1", "--engine", "gliner"])
265
+
266
+ # General-purpose model
267
+ subprocess.run(["datafog", "download-model", "urchade/gliner_base", "--engine", "gliner"])
268
+
269
+ # List available models
270
+ subprocess.run(["datafog", "list-models", "--engine", "gliner"])
271
+ ```
272
+
273
+ ### Anonymization Options
274
+
275
+ ```python
276
+ from datafog import DataFog
277
+ from datafog.models.anonymizer import AnonymizerType, HashType
278
+
279
+ # Hash with different algorithms
280
+ hasher = DataFog(
281
+ operations=["scan", "hash"],
282
+ hash_type=HashType.SHA256 # or MD5, SHA3_256
283
+ )
284
+
285
+ # Target specific entity types only
286
+ selective = DataFog(
287
+ operations=["scan", "redact"],
288
+ entities=["EMAIL", "PHONE"] # Only process these types
289
+ )
290
+ ```
291
+
292
+ ### Batch Processing
293
+
294
+ ```python
295
+ documents = [
296
+ "Document 1 with PII...",
297
+ "Document 2 with more data...",
298
+ "Document 3..."
299
+ ]
300
+
301
+ # Process multiple documents efficiently
302
+ results = DataFog().batch_process(documents)
303
+ ```
304
+
305
+ ---
306
+
307
+ ## Performance Benchmarks
308
+
309
+ Performance comparison with alternatives:
310
+
311
+ ### Speed Comparison (10KB text)
312
+
313
+ ```
314
+ DataFog Pattern: 4ms ████████████████████████████████ 123x faster
315
+ spaCy: 480ms ██ baseline
316
+ ```
317
+
318
+ ### Engine Selection Guide
319
+
320
+ | Scenario | Recommended Engine | Why |
321
+ | -------------------------- | ------------------ | ------------------------------------- |
322
+ | **High-volume processing** | `regex` | Maximum speed, consistent performance |
323
+ | **Unknown entity types** | `spacy` | Broader entity recognition |
324
+ | **General purpose** | `auto` | Smart fallback, best of both worlds |
325
+ | **Real-time applications** | `regex` | Sub-millisecond processing |
326
+
327
+ ---
328
+
329
+ ## CLI Usage
330
+
331
+ DataFog includes a command-line interface:
332
+
333
+ ```bash
334
+ # Scan text for PII
335
+ datafog scan-text "John's email is john@example.com"
336
+
337
+ # Process images
338
+ datafog scan-image document.png --operations extract,scan
339
+
340
+ # Anonymize data
341
+ datafog redact-text "My phone is (555) 123-4567"
342
+ datafog replace-text "SSN: 123-45-6789"
343
+ datafog hash-text "Email: john@company.com" --hash-type sha256
344
+
345
+ # Utility commands
346
+ datafog health
347
+ datafog list-entities
348
+ datafog show-config
349
+ ```
350
+
351
+ ---
352
+
353
+ ## Features
354
+
355
+ ### Security & Compliance
356
+
357
+ - Detection of regulated data types for GDPR/CCPA compliance
358
+ - Audit trails for tracking detection and anonymization
359
+ - Configurable detection thresholds
360
+
361
+ ### Scalability
362
+
363
+ - Batch processing for handling multiple documents
364
+ - Memory-efficient processing for large files
365
+ - Async support for non-blocking operations
366
+
367
+ ### Integration Example
368
+
369
+ ```python
370
+ # FastAPI middleware example
371
+ from fastapi import FastAPI
372
+ from datafog import DataFog
373
+
374
+ app = FastAPI()
375
+ detector = DataFog()
376
+
377
+ @app.middleware("http")
378
+ async def redact_pii_middleware(request, call_next):
379
+     # Automatically scan/redact request data
380
+     pass
381
+ ```
382
+
383
+ ---
384
+
385
+ ## Common Use Cases
386
+
387
+ ### Enterprise
388
+
389
+ - Log sanitization
390
+ - Data migration with PII handling
391
+ - Compliance reporting and audits
392
+
393
+ ### Data Science
394
+
395
+ - Dataset preparation and anonymization
396
+ - Privacy-preserving analytics
397
+ - Research compliance
398
+
399
+ ### Development
400
+
401
+ - Test data generation
402
+ - Code review for PII detection
403
+ - API security validation
404
+
405
+ ---
406
+
407
+ ## Installation & Setup
408
+
409
+ ### Basic Installation
410
+
411
+ ```bash
412
+ pip install datafog
413
+ ```
414
+
415
+ ### Development Setup
416
+
417
+ ```bash
418
+ git clone https://github.com/datafog/datafog-python
419
+ cd datafog-python
420
+ python -m venv .venv
421
+ source .venv/bin/activate # On Windows: .venv\Scripts\activate
422
+ pip install -r requirements-dev.txt
423
+ just setup
424
+ ```
425
+
426
+ ### Docker Usage
427
+
428
+ ```dockerfile
429
+ FROM python:3.10-slim
430
+ RUN pip install datafog
431
+ COPY . .
432
+ CMD ["python", "your_script.py"]
433
+ ```
434
+
435
+ ---
436
+
437
+ ## Contributing
438
+
439
+ Contributions are welcome in the form of:
440
+
441
+ - Bug reports
442
+ - Feature requests
443
+ - Documentation improvements
444
+ - New regex patterns for PII detection
445
+ - Performance improvements
446
+
447
+ ### Quick Contribution Guide
448
+
449
+ ```bash
450
+ # Setup development environment
451
+ git clone https://github.com/datafog/datafog-python
452
+ cd datafog-python
453
+ just setup
454
+
455
+ # Run tests
456
+ just test
457
+
458
+ # Format code
459
+ just format
460
+
461
+ # Submit PR
462
+ git checkout -b feature/your-improvement
463
+ # Make your changes
464
+ git commit -m "Add your improvement"
465
+ git push origin feature/your-improvement
466
+ ```
467
+
468
+ See [CONTRIBUTING.md](CONTRIBUTING.md) for detailed guidelines.
469
+
470
+ ---
471
+
472
+ ## Benchmarking & Performance
473
+
474
+ ### Run Benchmarks Locally
475
+
476
+ ```bash
477
+ # Install benchmark dependencies
478
+ pip install pytest-benchmark
479
+
480
+ # Run performance tests
481
+ pytest tests/benchmark_text_service.py -v
482
+
483
+ # Compare with baseline
484
+ bash scripts/run_benchmark_locally.sh
485
+ ```
486
+
487
+ ### Continuous Performance Monitoring
488
+
489
+ Our CI pipeline:
490
+
491
+ - Runs benchmarks on every PR
492
+ - Compares against baseline performance
493
+ - Fails builds if performance degrades >10%
494
+ - Tracks performance trends over time
495
+
496
+ ---
497
+
498
+ ## Documentation & Support
499
+
500
+ | Resource | Link |
501
+ | --------------------- | --------------------------------------------------------------------------- |
502
+ | **Documentation** | [docs.datafog.ai](https://docs.datafog.ai) |
503
+ | **Community Discord** | [Join here](https://discord.gg/bzDth394R4) |
504
+ | **Bug Reports** | [GitHub Issues](https://github.com/datafog/datafog-python/issues) |
505
+ | **Feature Requests** | [GitHub Discussions](https://github.com/datafog/datafog-python/discussions) |
506
+ | **Support** | [hi@datafog.ai](mailto:hi@datafog.ai) |
507
+
508
+ ---
509
+
510
+ ## License & Acknowledgments
511
+
512
+ DataFog is released under the [MIT License](LICENSE).
513
+
514
+ **Built with:**
515
+
516
+ - Pattern optimization for efficient processing
517
+ - spaCy integration for NLP capabilities
518
+ - Tesseract & Donut for OCR capabilities
519
+ - Pydantic for data validation
520
+
521
+ ---
522
+
523
+ [GitHub](https://github.com/datafog/datafog-python) • [Documentation](https://docs.datafog.ai) • [Discord](https://discord.gg/bzDth394R4)