datafog 4.4.0__tar.gz → 4.4.0a2__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {datafog-4.4.0 → datafog-4.4.0a2}/PKG-INFO +28 -13
- {datafog-4.4.0 → datafog-4.4.0a2}/README.md +1 -0
- datafog-4.4.0a2/datafog/__about__.py +1 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/client.py +1 -1
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/engine.py +8 -18
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/spacy_nlp.py +19 -8
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/donut_processor.py +31 -34
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/spark_processing/pyspark_udfs.py +16 -8
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/gliner_annotator.py +6 -2
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/spacy_pii_annotator.py +17 -22
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/spark_service.py +9 -20
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/telemetry.py +80 -81
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/PKG-INFO +28 -13
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/SOURCES.txt +3 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/requires.txt +29 -13
- {datafog-4.4.0 → datafog-4.4.0a2}/setup.py +75 -56
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_donut_lazy_import.py +2 -19
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_gliner_annotator.py +25 -15
- datafog-4.4.0a2/tests/test_install_profiles.py +67 -0
- datafog-4.4.0a2/tests/test_no_network_core.py +96 -0
- datafog-4.4.0a2/tests/test_runtime_dependency_safety.py +155 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_telemetry.py +129 -12
- datafog-4.4.0/datafog/__about__.py +0 -1
- {datafog-4.4.0 → datafog-4.4.0a2}/LICENSE +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init___lean.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init___original.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/agent.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/config.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/core.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/exceptions.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main_lean.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main_original.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/annotator.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/anonymizer.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/common.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/image_downloader.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/pytesseract_processor.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/spark_processing/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/regex_annotator/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/regex_annotator/regex_annotator.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/__init__.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/image_service.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service_lean.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service_original.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/dependency_links.txt +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/entry_points.txt +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/top_level.txt +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/setup.cfg +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_agent_api.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_anonymizer.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_cli_smoke.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_client.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_detection_accuracy.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_engine_api.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_image_service.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_main.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_ocr_integration.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_regex_annotator.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_spark_integration.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_text_service.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_text_service_integration.py +0 -0
- {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_v44_bridge_api.py +0 -0
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
Metadata-Version: 2.4
|
|
2
2
|
Name: datafog
|
|
3
|
-
Version: 4.4.0
|
|
3
|
+
Version: 4.4.0a2
|
|
4
4
|
Summary: Lightning-fast PII detection and anonymization library with 190x performance advantage
|
|
5
5
|
Author: Sid Mohan
|
|
6
6
|
Author-email: sid@datafog.ai
|
|
@@ -35,45 +35,59 @@ Requires-Dist: torch<2.7,>=2.1.0; extra == "nlp-advanced"
|
|
|
35
35
|
Requires-Dist: transformers>=4.20.0; extra == "nlp-advanced"
|
|
36
36
|
Requires-Dist: huggingface-hub>=0.16.0; extra == "nlp-advanced"
|
|
37
37
|
Provides-Extra: ocr
|
|
38
|
+
Requires-Dist: numpy>=1.24.0; extra == "ocr"
|
|
38
39
|
Requires-Dist: pytesseract>=0.3.0; extra == "ocr"
|
|
39
|
-
Requires-Dist: Pillow>=
|
|
40
|
+
Requires-Dist: Pillow>=12.2.0; extra == "ocr"
|
|
40
41
|
Requires-Dist: sentencepiece>=0.2.0; extra == "ocr"
|
|
41
42
|
Requires-Dist: protobuf>=4.0.0; extra == "ocr"
|
|
42
43
|
Provides-Extra: distributed
|
|
43
44
|
Requires-Dist: pandas>=2.0.0; extra == "distributed"
|
|
44
45
|
Requires-Dist: numpy>=1.24.0; extra == "distributed"
|
|
46
|
+
Requires-Dist: pyspark>=3.5.0; extra == "distributed"
|
|
45
47
|
Provides-Extra: web
|
|
46
48
|
Requires-Dist: fastapi>=0.100.0; extra == "web"
|
|
47
|
-
Requires-Dist: aiohttp>=3.
|
|
48
|
-
Requires-Dist:
|
|
49
|
+
Requires-Dist: aiohttp>=3.13.4; extra == "web"
|
|
50
|
+
Requires-Dist: certifi>=2025.4.26; extra == "web"
|
|
51
|
+
Requires-Dist: requests>=2.33.0; extra == "web"
|
|
49
52
|
Provides-Extra: cli
|
|
50
53
|
Requires-Dist: typer>=0.12.0; extra == "cli"
|
|
51
54
|
Requires-Dist: pydantic-settings>=2.0.0; extra == "cli"
|
|
52
55
|
Provides-Extra: crypto
|
|
53
|
-
Requires-Dist: cryptography>=
|
|
56
|
+
Requires-Dist: cryptography>=46.0.7; extra == "crypto"
|
|
57
|
+
Provides-Extra: test
|
|
58
|
+
Requires-Dist: pytest>=9.0.3; extra == "test"
|
|
59
|
+
Requires-Dist: pytest-asyncio>=1.3.0; extra == "test"
|
|
60
|
+
Requires-Dist: pytest-cov>=7.1.0; extra == "test"
|
|
61
|
+
Provides-Extra: docs
|
|
62
|
+
Requires-Dist: sphinx>=7.2.6; extra == "docs"
|
|
63
|
+
Provides-Extra: benchmark
|
|
64
|
+
Requires-Dist: pytest-benchmark>=4.0.0; extra == "benchmark"
|
|
54
65
|
Provides-Extra: dev
|
|
55
|
-
Requires-Dist: pytest>=
|
|
56
|
-
Requires-Dist: pytest-asyncio>=
|
|
57
|
-
Requires-Dist: pytest-cov>=
|
|
58
|
-
Requires-Dist: sphinx>=7.
|
|
66
|
+
Requires-Dist: pytest>=9.0.3; extra == "dev"
|
|
67
|
+
Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
|
|
68
|
+
Requires-Dist: pytest-cov>=7.1.0; extra == "dev"
|
|
69
|
+
Requires-Dist: sphinx>=7.2.6; extra == "dev"
|
|
59
70
|
Provides-Extra: all
|
|
60
71
|
Requires-Dist: spacy<4.0,>=3.7.0; extra == "all"
|
|
61
72
|
Requires-Dist: gliner>=0.2.5; extra == "all"
|
|
62
73
|
Requires-Dist: torch<2.7,>=2.1.0; extra == "all"
|
|
63
74
|
Requires-Dist: transformers>=4.20.0; extra == "all"
|
|
64
75
|
Requires-Dist: huggingface-hub>=0.16.0; extra == "all"
|
|
76
|
+
Requires-Dist: numpy>=1.24.0; extra == "all"
|
|
65
77
|
Requires-Dist: pytesseract>=0.3.0; extra == "all"
|
|
66
|
-
Requires-Dist: Pillow>=
|
|
78
|
+
Requires-Dist: Pillow>=12.2.0; extra == "all"
|
|
67
79
|
Requires-Dist: sentencepiece>=0.2.0; extra == "all"
|
|
68
80
|
Requires-Dist: protobuf>=4.0.0; extra == "all"
|
|
69
81
|
Requires-Dist: pandas>=2.0.0; extra == "all"
|
|
70
82
|
Requires-Dist: numpy>=1.24.0; extra == "all"
|
|
83
|
+
Requires-Dist: pyspark>=3.5.0; extra == "all"
|
|
71
84
|
Requires-Dist: fastapi>=0.100.0; extra == "all"
|
|
72
|
-
Requires-Dist: aiohttp>=3.
|
|
73
|
-
Requires-Dist:
|
|
85
|
+
Requires-Dist: aiohttp>=3.13.4; extra == "all"
|
|
86
|
+
Requires-Dist: certifi>=2025.4.26; extra == "all"
|
|
87
|
+
Requires-Dist: requests>=2.33.0; extra == "all"
|
|
74
88
|
Requires-Dist: typer>=0.12.0; extra == "all"
|
|
75
89
|
Requires-Dist: pydantic-settings>=2.0.0; extra == "all"
|
|
76
|
-
Requires-Dist: cryptography>=
|
|
90
|
+
Requires-Dist: cryptography>=46.0.7; extra == "all"
|
|
77
91
|
Dynamic: author
|
|
78
92
|
Dynamic: author-email
|
|
79
93
|
Dynamic: classifier
|
|
@@ -251,5 +265,6 @@ cd datafog-python
|
|
|
251
265
|
python -m venv .venv
|
|
252
266
|
source .venv/bin/activate # Windows: .venv\Scripts\activate
|
|
253
267
|
pip install -e ".[all,dev]"
|
|
268
|
+
pip install -r requirements-dev.txt
|
|
254
269
|
pytest tests/
|
|
255
270
|
```
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
__version__ = "4.4.0a2"
|
|
@@ -181,7 +181,7 @@ def download_model(
|
|
|
181
181
|
Download a model for specified engine.
|
|
182
182
|
|
|
183
183
|
Examples:
|
|
184
|
-
spaCy: datafog download-model
|
|
184
|
+
spaCy: datafog download-model en_core_web_lg --engine spacy
|
|
185
185
|
GLiNER: datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
|
|
186
186
|
"""
|
|
187
187
|
if engine == "spacy":
|
|
@@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]:
|
|
|
171
171
|
def _get_spacy_annotator():
|
|
172
172
|
try:
|
|
173
173
|
from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
|
|
174
|
-
except ImportError:
|
|
175
|
-
return _UnavailableAnnotator(
|
|
176
|
-
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
|
|
177
|
-
)
|
|
174
|
+
except ImportError as exc:
|
|
175
|
+
return _UnavailableAnnotator(str(exc))
|
|
178
176
|
|
|
179
177
|
try:
|
|
180
178
|
return SpacyPIIAnnotator.create()
|
|
181
|
-
except ImportError:
|
|
182
|
-
return _UnavailableAnnotator(
|
|
183
|
-
"SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
|
|
184
|
-
)
|
|
179
|
+
except ImportError as exc:
|
|
180
|
+
return _UnavailableAnnotator(str(exc))
|
|
185
181
|
except Exception as exc:
|
|
186
182
|
return _UnavailableAnnotator(
|
|
187
183
|
f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}"
|
|
@@ -192,19 +188,13 @@ def _get_spacy_annotator():
|
|
|
192
188
|
def _get_gliner_annotator():
|
|
193
189
|
try:
|
|
194
190
|
from .processing.text_processing.gliner_annotator import GLiNERAnnotator
|
|
195
|
-
except ImportError:
|
|
196
|
-
return _UnavailableAnnotator(
|
|
197
|
-
"GLiNER engine requires the nlp-advanced extra. "
|
|
198
|
-
"Install with: pip install datafog[nlp-advanced]"
|
|
199
|
-
)
|
|
191
|
+
except ImportError as exc:
|
|
192
|
+
return _UnavailableAnnotator(str(exc))
|
|
200
193
|
|
|
201
194
|
try:
|
|
202
195
|
annotator = GLiNERAnnotator.create()
|
|
203
|
-
except ImportError:
|
|
204
|
-
return _UnavailableAnnotator(
|
|
205
|
-
"GLiNER engine requires the nlp-advanced extra. "
|
|
206
|
-
"Install with: pip install datafog[nlp-advanced]"
|
|
207
|
-
)
|
|
196
|
+
except ImportError as exc:
|
|
197
|
+
return _UnavailableAnnotator(str(exc))
|
|
208
198
|
except Exception as exc:
|
|
209
199
|
return _UnavailableAnnotator(
|
|
210
200
|
f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}"
|
|
@@ -9,10 +9,11 @@ from typing import List
|
|
|
9
9
|
from uuid import uuid4
|
|
10
10
|
|
|
11
11
|
import spacy
|
|
12
|
-
from rich.progress import track
|
|
13
12
|
|
|
14
13
|
from .annotator import AnnotationResult, AnnotatorRequest
|
|
15
14
|
|
|
15
|
+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
|
|
16
|
+
|
|
16
17
|
|
|
17
18
|
class SpacyAnnotator:
|
|
18
19
|
"""
|
|
@@ -22,14 +23,18 @@ class SpacyAnnotator:
|
|
|
22
23
|
Supports various NLP tasks including entity recognition and model management.
|
|
23
24
|
"""
|
|
24
25
|
|
|
25
|
-
def __init__(self, model_name: str =
|
|
26
|
+
def __init__(self, model_name: str = DEFAULT_SPACY_MODEL):
|
|
26
27
|
self.model_name = model_name
|
|
27
28
|
self.nlp = None
|
|
28
29
|
|
|
29
30
|
def load_model(self):
|
|
30
|
-
|
|
31
|
-
spacy.
|
|
32
|
-
|
|
31
|
+
try:
|
|
32
|
+
self.nlp = spacy.load(self.model_name)
|
|
33
|
+
except OSError as exc:
|
|
34
|
+
raise ImportError(
|
|
35
|
+
f"spaCy model {self.model_name!r} is not installed. "
|
|
36
|
+
f"Download it explicitly with: datafog download-model {self.model_name} --engine spacy"
|
|
37
|
+
) from exc
|
|
33
38
|
|
|
34
39
|
def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResult]:
|
|
35
40
|
if not self.nlp:
|
|
@@ -47,7 +52,7 @@ class SpacyAnnotator:
|
|
|
47
52
|
)
|
|
48
53
|
doc = self.nlp(annotator_request.text)
|
|
49
54
|
results = []
|
|
50
|
-
for ent in
|
|
55
|
+
for ent in doc.ents:
|
|
51
56
|
result = AnnotationResult(
|
|
52
57
|
start=ent.start_char,
|
|
53
58
|
end=ent.end_char,
|
|
@@ -72,6 +77,12 @@ class SpacyAnnotator:
|
|
|
72
77
|
return spacy.util.get_installed_models()
|
|
73
78
|
|
|
74
79
|
@staticmethod
|
|
75
|
-
def list_entities() -> List[str]:
|
|
76
|
-
|
|
80
|
+
def list_entities(model_name: str = DEFAULT_SPACY_MODEL) -> List[str]:
|
|
81
|
+
try:
|
|
82
|
+
nlp = spacy.load(model_name)
|
|
83
|
+
except OSError as exc:
|
|
84
|
+
raise ImportError(
|
|
85
|
+
f"spaCy model {model_name!r} is not installed. "
|
|
86
|
+
f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
|
|
87
|
+
) from exc
|
|
77
88
|
return [ent for ent in nlp.pipe_labels["ner"]]
|
|
@@ -6,14 +6,10 @@ for document understanding tasks, particularly OCR and information extraction
|
|
|
6
6
|
from images of documents.
|
|
7
7
|
"""
|
|
8
8
|
|
|
9
|
-
import importlib
|
|
10
|
-
import importlib.util
|
|
11
9
|
import json
|
|
12
10
|
import logging
|
|
13
11
|
import os
|
|
14
12
|
import re
|
|
15
|
-
import subprocess
|
|
16
|
-
import sys
|
|
17
13
|
from typing import TYPE_CHECKING, Any
|
|
18
14
|
|
|
19
15
|
from .image_downloader import ImageDownloader
|
|
@@ -43,13 +39,12 @@ class DonutProcessor:
|
|
|
43
39
|
self.model_path = model_path
|
|
44
40
|
self.downloader = ImageDownloader()
|
|
45
41
|
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
)
|
|
42
|
+
@staticmethod
|
|
43
|
+
def _missing_dependency_message(package_name: str) -> str:
|
|
44
|
+
return (
|
|
45
|
+
f"Donut OCR requires {package_name}. "
|
|
46
|
+
"Install with: pip install datafog[nlp-advanced,ocr]"
|
|
47
|
+
)
|
|
53
48
|
|
|
54
49
|
def preprocess_image(self, image: "Image.Image") -> Any:
|
|
55
50
|
import numpy as np
|
|
@@ -86,40 +81,40 @@ class DonutProcessor:
|
|
|
86
81
|
"PYTEST_DONUT=yes is set, running actual OCR in test environment"
|
|
87
82
|
)
|
|
88
83
|
|
|
89
|
-
# Only import torch and transformers when actually needed and not in test environment
|
|
90
84
|
try:
|
|
91
|
-
# Check if torch is available before trying to import it
|
|
92
|
-
try:
|
|
93
|
-
# Try to find the module without importing it
|
|
94
|
-
spec = importlib.util.find_spec("torch")
|
|
95
|
-
if spec is None:
|
|
96
|
-
# If we're in a test that somehow bypassed the IN_TEST_ENV check,
|
|
97
|
-
# still return a mock result instead of failing
|
|
98
|
-
logging.warning("torch module not found, returning mock result")
|
|
99
|
-
return json.dumps({"text": "Mock OCR text (torch not available)"})
|
|
100
|
-
|
|
101
|
-
# Ensure dependencies are installed
|
|
102
|
-
self.ensure_installed("torch")
|
|
103
|
-
self.ensure_installed("transformers")
|
|
104
|
-
except ImportError:
|
|
105
|
-
# If importlib.util is not available, fall back to direct try/except
|
|
106
|
-
pass
|
|
107
|
-
|
|
108
|
-
# Import dependencies only when needed
|
|
109
85
|
try:
|
|
110
86
|
import torch
|
|
87
|
+
except ImportError as exc:
|
|
88
|
+
raise ImportError(self._missing_dependency_message("torch")) from exc
|
|
89
|
+
|
|
90
|
+
try:
|
|
111
91
|
from transformers import DonutProcessor as TransformersDonutProcessor
|
|
112
92
|
from transformers import VisionEncoderDecoderModel
|
|
113
93
|
except ImportError as e:
|
|
114
|
-
|
|
115
|
-
|
|
94
|
+
raise ImportError(
|
|
95
|
+
self._missing_dependency_message("transformers")
|
|
96
|
+
) from e
|
|
116
97
|
|
|
117
98
|
# Preprocess the image
|
|
118
99
|
image_np = self.preprocess_image(image)
|
|
119
100
|
|
|
120
101
|
# Initialize model components
|
|
121
|
-
|
|
122
|
-
|
|
102
|
+
try:
|
|
103
|
+
processor = TransformersDonutProcessor.from_pretrained(
|
|
104
|
+
self.model_path,
|
|
105
|
+
local_files_only=True,
|
|
106
|
+
)
|
|
107
|
+
model = VisionEncoderDecoderModel.from_pretrained(
|
|
108
|
+
self.model_path,
|
|
109
|
+
local_files_only=True,
|
|
110
|
+
)
|
|
111
|
+
except OSError as exc:
|
|
112
|
+
raise RuntimeError(
|
|
113
|
+
f"Donut model {self.model_path!r} is not available locally. "
|
|
114
|
+
"Download it explicitly before using Donut OCR, or pass a local "
|
|
115
|
+
"model path."
|
|
116
|
+
) from exc
|
|
117
|
+
|
|
123
118
|
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
124
119
|
model.to(device)
|
|
125
120
|
model.eval()
|
|
@@ -153,6 +148,8 @@ class DonutProcessor:
|
|
|
153
148
|
result = processor.token2json(sequence)
|
|
154
149
|
return json.dumps(result)
|
|
155
150
|
|
|
151
|
+
except (ImportError, RuntimeError):
|
|
152
|
+
raise
|
|
156
153
|
except Exception as e:
|
|
157
154
|
logging.error(f"Error in extract_text_from_image: {e}")
|
|
158
155
|
# Return a placeholder in case of error
|
|
@@ -2,17 +2,16 @@
|
|
|
2
2
|
PySpark UDFs for PII annotation and related utilities.
|
|
3
3
|
|
|
4
4
|
This module provides functions for PII (Personally Identifiable Information) annotation
|
|
5
|
-
using SpaCy models in a PySpark environment. It includes utilities for
|
|
6
|
-
dependencies, creating and broadcasting PII annotator UDFs, and performing PII
|
|
7
|
-
on text data.
|
|
5
|
+
using SpaCy models in a PySpark environment. It includes utilities for validating
|
|
6
|
+
dependencies, creating and broadcasting PII annotator UDFs, and performing PII
|
|
7
|
+
annotation on text data.
|
|
8
8
|
"""
|
|
9
9
|
|
|
10
10
|
import importlib
|
|
11
|
-
import subprocess
|
|
12
|
-
import sys
|
|
13
11
|
|
|
14
12
|
PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
|
|
15
13
|
MAXIMAL_STRING_SIZE = 1000000
|
|
14
|
+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
|
|
16
15
|
|
|
17
16
|
|
|
18
17
|
def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
|
|
@@ -45,7 +44,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
|
|
|
45
44
|
|
|
46
45
|
|
|
47
46
|
def broadcast_pii_annotator_udf(
|
|
48
|
-
spark_session=None, spacy_model: str =
|
|
47
|
+
spark_session=None, spacy_model: str = DEFAULT_SPACY_MODEL
|
|
49
48
|
):
|
|
50
49
|
"""Broadcast PII annotator across Spark cluster and create UDF"""
|
|
51
50
|
ensure_installed("pyspark")
|
|
@@ -69,5 +68,14 @@ def broadcast_pii_annotator_udf(
|
|
|
69
68
|
def ensure_installed(package_name):
|
|
70
69
|
try:
|
|
71
70
|
importlib.import_module(package_name)
|
|
72
|
-
except ImportError:
|
|
73
|
-
|
|
71
|
+
except ImportError as exc:
|
|
72
|
+
if package_name == "pyspark":
|
|
73
|
+
extra = "distributed"
|
|
74
|
+
elif package_name == "spacy":
|
|
75
|
+
extra = "nlp"
|
|
76
|
+
else:
|
|
77
|
+
extra = "all"
|
|
78
|
+
raise ImportError(
|
|
79
|
+
f"{package_name} is required for Spark PII UDF support. "
|
|
80
|
+
f"Install with: pip install datafog[{extra}]"
|
|
81
|
+
) from exc
|
|
@@ -79,14 +79,18 @@ class GLiNERAnnotator(BaseModel):
|
|
|
79
79
|
|
|
80
80
|
try:
|
|
81
81
|
# Load the GLiNER model
|
|
82
|
-
model = GLiNER.from_pretrained(model_name)
|
|
82
|
+
model = GLiNER.from_pretrained(model_name, local_files_only=True)
|
|
83
83
|
logging.info(f"Successfully loaded GLiNER model: {model_name}")
|
|
84
84
|
|
|
85
85
|
return cls(model=model, entity_types=entity_types, model_name=model_name)
|
|
86
86
|
|
|
87
87
|
except Exception as e:
|
|
88
88
|
logging.error(f"Failed to load GLiNER model {model_name}: {str(e)}")
|
|
89
|
-
raise
|
|
89
|
+
raise RuntimeError(
|
|
90
|
+
f"GLiNER model {model_name!r} is not available locally. "
|
|
91
|
+
"Download it explicitly with: "
|
|
92
|
+
f"datafog download-model {model_name} --engine gliner"
|
|
93
|
+
) from e
|
|
90
94
|
|
|
91
95
|
def annotate(self, text: str) -> Dict[str, List[str]]:
|
|
92
96
|
"""
|
|
@@ -24,39 +24,34 @@ PII_ANNOTATION_LABELS = [
|
|
|
24
24
|
"WORK_OF_ART",
|
|
25
25
|
]
|
|
26
26
|
MAXIMAL_STRING_SIZE = 1000000
|
|
27
|
+
DEFAULT_SPACY_MODEL = "en_core_web_lg"
|
|
27
28
|
|
|
28
29
|
|
|
29
30
|
class SpacyPIIAnnotator(BaseModel):
|
|
30
31
|
model_config = ConfigDict(arbitrary_types_allowed=True)
|
|
31
32
|
|
|
32
33
|
nlp: Any
|
|
34
|
+
model_name: str = DEFAULT_SPACY_MODEL
|
|
33
35
|
|
|
34
36
|
@classmethod
|
|
35
|
-
def create(cls) -> "SpacyPIIAnnotator":
|
|
36
|
-
import spacy
|
|
37
|
-
|
|
37
|
+
def create(cls, model_name: str = DEFAULT_SPACY_MODEL) -> "SpacyPIIAnnotator":
|
|
38
38
|
try:
|
|
39
|
-
|
|
40
|
-
except
|
|
41
|
-
|
|
42
|
-
|
|
39
|
+
import spacy
|
|
40
|
+
except ImportError as exc:
|
|
41
|
+
raise ImportError(
|
|
42
|
+
"SpaCy engine requires the nlp extra. "
|
|
43
|
+
"Install with: pip install datafog[nlp]"
|
|
44
|
+
) from exc
|
|
43
45
|
|
|
44
|
-
|
|
45
|
-
|
|
46
|
-
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
"--no-deps",
|
|
52
|
-
"--no-cache-dir",
|
|
53
|
-
"https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
|
|
54
|
-
],
|
|
55
|
-
check=True,
|
|
56
|
-
)
|
|
57
|
-
nlp = spacy.load("en_core_web_lg")
|
|
46
|
+
try:
|
|
47
|
+
nlp = spacy.load(model_name)
|
|
48
|
+
except OSError as exc:
|
|
49
|
+
raise ImportError(
|
|
50
|
+
f"spaCy model {model_name!r} is not installed. "
|
|
51
|
+
f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
|
|
52
|
+
) from exc
|
|
58
53
|
|
|
59
|
-
return cls(nlp=nlp)
|
|
54
|
+
return cls(nlp=nlp, model_name=model_name)
|
|
60
55
|
|
|
61
56
|
def annotate(self, text: str) -> Dict[str, List[str]]:
|
|
62
57
|
try:
|
|
@@ -1,14 +1,12 @@
|
|
|
1
1
|
"""
|
|
2
2
|
Spark service for data processing and analysis.
|
|
3
3
|
|
|
4
|
-
Provides a wrapper around PySpark functionality, including session creation
|
|
5
|
-
JSON reading
|
|
4
|
+
Provides a wrapper around PySpark functionality, including session creation and
|
|
5
|
+
JSON reading.
|
|
6
6
|
"""
|
|
7
7
|
|
|
8
8
|
import importlib
|
|
9
9
|
import os
|
|
10
|
-
import subprocess
|
|
11
|
-
import sys
|
|
12
10
|
from typing import List
|
|
13
11
|
|
|
14
12
|
|
|
@@ -16,14 +14,13 @@ class SparkService:
|
|
|
16
14
|
"""
|
|
17
15
|
Manages Spark operations and dependencies.
|
|
18
16
|
|
|
19
|
-
Initializes a Spark session, handles imports, and provides methods for
|
|
20
|
-
|
|
17
|
+
Initializes a Spark session, handles imports, and provides methods for data
|
|
18
|
+
reading.
|
|
21
19
|
"""
|
|
22
20
|
|
|
23
21
|
def __init__(self, master=None):
|
|
24
22
|
self.master = master
|
|
25
23
|
|
|
26
|
-
# Ensure pyspark is installed first
|
|
27
24
|
self.ensure_installed("pyspark")
|
|
28
25
|
|
|
29
26
|
# Now import necessary modules after ensuring pyspark is installed
|
|
@@ -84,16 +81,8 @@ class SparkService:
|
|
|
84
81
|
def ensure_installed(self, package_name):
|
|
85
82
|
try:
|
|
86
83
|
importlib.import_module(package_name)
|
|
87
|
-
except ImportError:
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
91
|
-
|
|
92
|
-
)
|
|
93
|
-
print(f"{package_name} installed successfully.")
|
|
94
|
-
except subprocess.CalledProcessError as e:
|
|
95
|
-
print(f"Failed to install {package_name}: {e}")
|
|
96
|
-
raise ImportError(
|
|
97
|
-
f"Could not install {package_name}. "
|
|
98
|
-
f"Please install it manually with 'pip install {package_name}'."
|
|
99
|
-
)
|
|
84
|
+
except ImportError as exc:
|
|
85
|
+
raise ImportError(
|
|
86
|
+
f"{package_name} is required for Spark support. "
|
|
87
|
+
"Install with: pip install datafog[distributed]"
|
|
88
|
+
) from exc
|