datafog 4.4.0__tar.gz → 4.4.0a2__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (69)
  1. {datafog-4.4.0 → datafog-4.4.0a2}/PKG-INFO +28 -13
  2. {datafog-4.4.0 → datafog-4.4.0a2}/README.md +1 -0
  3. datafog-4.4.0a2/datafog/__about__.py +1 -0
  4. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/client.py +1 -1
  5. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/engine.py +8 -18
  6. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/spacy_nlp.py +19 -8
  7. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/donut_processor.py +31 -34
  8. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/spark_processing/pyspark_udfs.py +16 -8
  9. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/gliner_annotator.py +6 -2
  10. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/spacy_pii_annotator.py +17 -22
  11. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/spark_service.py +9 -20
  12. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/telemetry.py +80 -81
  13. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/PKG-INFO +28 -13
  14. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/SOURCES.txt +3 -0
  15. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/requires.txt +29 -13
  16. {datafog-4.4.0 → datafog-4.4.0a2}/setup.py +75 -56
  17. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_donut_lazy_import.py +2 -19
  18. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_gliner_annotator.py +25 -15
  19. datafog-4.4.0a2/tests/test_install_profiles.py +67 -0
  20. datafog-4.4.0a2/tests/test_no_network_core.py +96 -0
  21. datafog-4.4.0a2/tests/test_runtime_dependency_safety.py +155 -0
  22. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_telemetry.py +129 -12
  23. datafog-4.4.0/datafog/__about__.py +0 -1
  24. {datafog-4.4.0 → datafog-4.4.0a2}/LICENSE +0 -0
  25. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init__.py +0 -0
  26. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init___lean.py +0 -0
  27. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/__init___original.py +0 -0
  28. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/agent.py +0 -0
  29. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/config.py +0 -0
  30. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/core.py +0 -0
  31. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/exceptions.py +0 -0
  32. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main.py +0 -0
  33. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main_lean.py +0 -0
  34. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/main_original.py +0 -0
  35. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/__init__.py +0 -0
  36. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/annotator.py +0 -0
  37. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/anonymizer.py +0 -0
  38. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/models/common.py +0 -0
  39. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/__init__.py +0 -0
  40. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/__init__.py +0 -0
  41. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/image_downloader.py +0 -0
  42. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/image_processing/pytesseract_processor.py +0 -0
  43. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/spark_processing/__init__.py +0 -0
  44. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/__init__.py +0 -0
  45. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/regex_annotator/__init__.py +0 -0
  46. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/processing/text_processing/regex_annotator/regex_annotator.py +0 -0
  47. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/__init__.py +0 -0
  48. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/image_service.py +0 -0
  49. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service.py +0 -0
  50. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service_lean.py +0 -0
  51. {datafog-4.4.0 → datafog-4.4.0a2}/datafog/services/text_service_original.py +0 -0
  52. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/dependency_links.txt +0 -0
  53. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/entry_points.txt +0 -0
  54. {datafog-4.4.0 → datafog-4.4.0a2}/datafog.egg-info/top_level.txt +0 -0
  55. {datafog-4.4.0 → datafog-4.4.0a2}/setup.cfg +0 -0
  56. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_agent_api.py +0 -0
  57. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_anonymizer.py +0 -0
  58. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_cli_smoke.py +0 -0
  59. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_client.py +0 -0
  60. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_detection_accuracy.py +0 -0
  61. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_engine_api.py +0 -0
  62. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_image_service.py +0 -0
  63. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_main.py +0 -0
  64. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_ocr_integration.py +0 -0
  65. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_regex_annotator.py +0 -0
  66. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_spark_integration.py +0 -0
  67. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_text_service.py +0 -0
  68. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_text_service_integration.py +0 -0
  69. {datafog-4.4.0 → datafog-4.4.0a2}/tests/test_v44_bridge_api.py +0 -0
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: datafog
3
- Version: 4.4.0
3
+ Version: 4.4.0a2
4
4
  Summary: Lightning-fast PII detection and anonymization library with 190x performance advantage
5
5
  Author: Sid Mohan
6
6
  Author-email: sid@datafog.ai
@@ -35,45 +35,59 @@ Requires-Dist: torch<2.7,>=2.1.0; extra == "nlp-advanced"
35
35
  Requires-Dist: transformers>=4.20.0; extra == "nlp-advanced"
36
36
  Requires-Dist: huggingface-hub>=0.16.0; extra == "nlp-advanced"
37
37
  Provides-Extra: ocr
38
+ Requires-Dist: numpy>=1.24.0; extra == "ocr"
38
39
  Requires-Dist: pytesseract>=0.3.0; extra == "ocr"
39
- Requires-Dist: Pillow>=10.0.0; extra == "ocr"
40
+ Requires-Dist: Pillow>=12.2.0; extra == "ocr"
40
41
  Requires-Dist: sentencepiece>=0.2.0; extra == "ocr"
41
42
  Requires-Dist: protobuf>=4.0.0; extra == "ocr"
42
43
  Provides-Extra: distributed
43
44
  Requires-Dist: pandas>=2.0.0; extra == "distributed"
44
45
  Requires-Dist: numpy>=1.24.0; extra == "distributed"
46
+ Requires-Dist: pyspark>=3.5.0; extra == "distributed"
45
47
  Provides-Extra: web
46
48
  Requires-Dist: fastapi>=0.100.0; extra == "web"
47
- Requires-Dist: aiohttp>=3.8.0; extra == "web"
48
- Requires-Dist: requests>=2.30.0; extra == "web"
49
+ Requires-Dist: aiohttp>=3.13.4; extra == "web"
50
+ Requires-Dist: certifi>=2025.4.26; extra == "web"
51
+ Requires-Dist: requests>=2.33.0; extra == "web"
49
52
  Provides-Extra: cli
50
53
  Requires-Dist: typer>=0.12.0; extra == "cli"
51
54
  Requires-Dist: pydantic-settings>=2.0.0; extra == "cli"
52
55
  Provides-Extra: crypto
53
- Requires-Dist: cryptography>=40.0.0; extra == "crypto"
56
+ Requires-Dist: cryptography>=46.0.7; extra == "crypto"
57
+ Provides-Extra: test
58
+ Requires-Dist: pytest>=9.0.3; extra == "test"
59
+ Requires-Dist: pytest-asyncio>=1.3.0; extra == "test"
60
+ Requires-Dist: pytest-cov>=7.1.0; extra == "test"
61
+ Provides-Extra: docs
62
+ Requires-Dist: sphinx>=7.2.6; extra == "docs"
63
+ Provides-Extra: benchmark
64
+ Requires-Dist: pytest-benchmark>=4.0.0; extra == "benchmark"
54
65
  Provides-Extra: dev
55
- Requires-Dist: pytest>=7.0.0; extra == "dev"
56
- Requires-Dist: pytest-asyncio>=0.21.0; extra == "dev"
57
- Requires-Dist: pytest-cov>=4.0.0; extra == "dev"
58
- Requires-Dist: sphinx>=7.0.0; extra == "dev"
66
+ Requires-Dist: pytest>=9.0.3; extra == "dev"
67
+ Requires-Dist: pytest-asyncio>=1.3.0; extra == "dev"
68
+ Requires-Dist: pytest-cov>=7.1.0; extra == "dev"
69
+ Requires-Dist: sphinx>=7.2.6; extra == "dev"
59
70
  Provides-Extra: all
60
71
  Requires-Dist: spacy<4.0,>=3.7.0; extra == "all"
61
72
  Requires-Dist: gliner>=0.2.5; extra == "all"
62
73
  Requires-Dist: torch<2.7,>=2.1.0; extra == "all"
63
74
  Requires-Dist: transformers>=4.20.0; extra == "all"
64
75
  Requires-Dist: huggingface-hub>=0.16.0; extra == "all"
76
+ Requires-Dist: numpy>=1.24.0; extra == "all"
65
77
  Requires-Dist: pytesseract>=0.3.0; extra == "all"
66
- Requires-Dist: Pillow>=10.0.0; extra == "all"
78
+ Requires-Dist: Pillow>=12.2.0; extra == "all"
67
79
  Requires-Dist: sentencepiece>=0.2.0; extra == "all"
68
80
  Requires-Dist: protobuf>=4.0.0; extra == "all"
69
81
  Requires-Dist: pandas>=2.0.0; extra == "all"
70
82
  Requires-Dist: numpy>=1.24.0; extra == "all"
83
+ Requires-Dist: pyspark>=3.5.0; extra == "all"
71
84
  Requires-Dist: fastapi>=0.100.0; extra == "all"
72
- Requires-Dist: aiohttp>=3.8.0; extra == "all"
73
- Requires-Dist: requests>=2.30.0; extra == "all"
85
+ Requires-Dist: aiohttp>=3.13.4; extra == "all"
86
+ Requires-Dist: certifi>=2025.4.26; extra == "all"
87
+ Requires-Dist: requests>=2.33.0; extra == "all"
74
88
  Requires-Dist: typer>=0.12.0; extra == "all"
75
89
  Requires-Dist: pydantic-settings>=2.0.0; extra == "all"
76
- Requires-Dist: cryptography>=40.0.0; extra == "all"
90
+ Requires-Dist: cryptography>=46.0.7; extra == "all"
77
91
  Dynamic: author
78
92
  Dynamic: author-email
79
93
  Dynamic: classifier
@@ -251,5 +265,6 @@ cd datafog-python
251
265
  python -m venv .venv
252
266
  source .venv/bin/activate # Windows: .venv\Scripts\activate
253
267
  pip install -e ".[all,dev]"
268
+ pip install -r requirements-dev.txt
254
269
  pytest tests/
255
270
  ```
@@ -162,5 +162,6 @@ cd datafog-python
162
162
  python -m venv .venv
163
163
  source .venv/bin/activate # Windows: .venv\Scripts\activate
164
164
  pip install -e ".[all,dev]"
165
+ pip install -r requirements-dev.txt
165
166
  pytest tests/
166
167
  ```
@@ -0,0 +1 @@
1
+ __version__ = "4.4.0a2"
@@ -181,7 +181,7 @@ def download_model(
181
181
  Download a model for specified engine.
182
182
 
183
183
  Examples:
184
- spaCy: datafog download-model en_core_web_sm --engine spacy
184
+ spaCy: datafog download-model en_core_web_lg --engine spacy
185
185
  GLiNER: datafog download-model urchade/gliner_multi_pii-v1 --engine gliner
186
186
  """
187
187
  if engine == "spacy":
@@ -171,17 +171,13 @@ def _gliner_entities(text: str) -> list[Entity]:
171
171
  def _get_spacy_annotator():
172
172
  try:
173
173
  from .processing.text_processing.spacy_pii_annotator import SpacyPIIAnnotator
174
- except ImportError:
175
- return _UnavailableAnnotator(
176
- "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
177
- )
174
+ except ImportError as exc:
175
+ return _UnavailableAnnotator(str(exc))
178
176
 
179
177
  try:
180
178
  return SpacyPIIAnnotator.create()
181
- except ImportError:
182
- return _UnavailableAnnotator(
183
- "SpaCy engine requires the nlp extra. Install with: pip install datafog[nlp]"
184
- )
179
+ except ImportError as exc:
180
+ return _UnavailableAnnotator(str(exc))
185
181
  except Exception as exc:
186
182
  return _UnavailableAnnotator(
187
183
  f"SpaCy engine initialization failed: {type(exc).__name__}: {exc}"
@@ -192,19 +188,13 @@ def _get_spacy_annotator():
192
188
  def _get_gliner_annotator():
193
189
  try:
194
190
  from .processing.text_processing.gliner_annotator import GLiNERAnnotator
195
- except ImportError:
196
- return _UnavailableAnnotator(
197
- "GLiNER engine requires the nlp-advanced extra. "
198
- "Install with: pip install datafog[nlp-advanced]"
199
- )
191
+ except ImportError as exc:
192
+ return _UnavailableAnnotator(str(exc))
200
193
 
201
194
  try:
202
195
  annotator = GLiNERAnnotator.create()
203
- except ImportError:
204
- return _UnavailableAnnotator(
205
- "GLiNER engine requires the nlp-advanced extra. "
206
- "Install with: pip install datafog[nlp-advanced]"
207
- )
196
+ except ImportError as exc:
197
+ return _UnavailableAnnotator(str(exc))
208
198
  except Exception as exc:
209
199
  return _UnavailableAnnotator(
210
200
  f"GLiNER engine initialization failed: {type(exc).__name__}: {exc}"
@@ -9,10 +9,11 @@ from typing import List
9
9
  from uuid import uuid4
10
10
 
11
11
  import spacy
12
- from rich.progress import track
13
12
 
14
13
  from .annotator import AnnotationResult, AnnotatorRequest
15
14
 
15
+ DEFAULT_SPACY_MODEL = "en_core_web_lg"
16
+
16
17
 
17
18
  class SpacyAnnotator:
18
19
  """
@@ -22,14 +23,18 @@ class SpacyAnnotator:
22
23
  Supports various NLP tasks including entity recognition and model management.
23
24
  """
24
25
 
25
- def __init__(self, model_name: str = "en_core_web_lg"):
26
+ def __init__(self, model_name: str = DEFAULT_SPACY_MODEL):
26
27
  self.model_name = model_name
27
28
  self.nlp = None
28
29
 
29
30
  def load_model(self):
30
- if not spacy.util.is_package(self.model_name):
31
- spacy.cli.download(self.model_name)
32
- self.nlp = spacy.load(self.model_name)
31
+ try:
32
+ self.nlp = spacy.load(self.model_name)
33
+ except OSError as exc:
34
+ raise ImportError(
35
+ f"spaCy model {self.model_name!r} is not installed. "
36
+ f"Download it explicitly with: datafog download-model {self.model_name} --engine spacy"
37
+ ) from exc
33
38
 
34
39
  def annotate_text(self, text: str, language: str = "en") -> List[AnnotationResult]:
35
40
  if not self.nlp:
@@ -47,7 +52,7 @@ class SpacyAnnotator:
47
52
  )
48
53
  doc = self.nlp(annotator_request.text)
49
54
  results = []
50
- for ent in track(doc.ents, description="Processing entities"):
55
+ for ent in doc.ents:
51
56
  result = AnnotationResult(
52
57
  start=ent.start_char,
53
58
  end=ent.end_char,
@@ -72,6 +77,12 @@ class SpacyAnnotator:
72
77
  return spacy.util.get_installed_models()
73
78
 
74
79
  @staticmethod
75
- def list_entities() -> List[str]:
76
- nlp = spacy.load("en_core_web_lg")
80
+ def list_entities(model_name: str = DEFAULT_SPACY_MODEL) -> List[str]:
81
+ try:
82
+ nlp = spacy.load(model_name)
83
+ except OSError as exc:
84
+ raise ImportError(
85
+ f"spaCy model {model_name!r} is not installed. "
86
+ f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
87
+ ) from exc
77
88
  return [ent for ent in nlp.pipe_labels["ner"]]
@@ -6,14 +6,10 @@ for document understanding tasks, particularly OCR and information extraction
6
6
  from images of documents.
7
7
  """
8
8
 
9
- import importlib
10
- import importlib.util
11
9
  import json
12
10
  import logging
13
11
  import os
14
12
  import re
15
- import subprocess
16
- import sys
17
13
  from typing import TYPE_CHECKING, Any
18
14
 
19
15
  from .image_downloader import ImageDownloader
@@ -43,13 +39,12 @@ class DonutProcessor:
43
39
  self.model_path = model_path
44
40
  self.downloader = ImageDownloader()
45
41
 
46
- def ensure_installed(self, package_name):
47
- try:
48
- importlib.import_module(package_name)
49
- except ImportError:
50
- subprocess.check_call(
51
- [sys.executable, "-m", "pip", "install", package_name]
52
- )
42
+ @staticmethod
43
+ def _missing_dependency_message(package_name: str) -> str:
44
+ return (
45
+ f"Donut OCR requires {package_name}. "
46
+ "Install with: pip install datafog[nlp-advanced,ocr]"
47
+ )
53
48
 
54
49
  def preprocess_image(self, image: "Image.Image") -> Any:
55
50
  import numpy as np
@@ -86,40 +81,40 @@ class DonutProcessor:
86
81
  "PYTEST_DONUT=yes is set, running actual OCR in test environment"
87
82
  )
88
83
 
89
- # Only import torch and transformers when actually needed and not in test environment
90
84
  try:
91
- # Check if torch is available before trying to import it
92
- try:
93
- # Try to find the module without importing it
94
- spec = importlib.util.find_spec("torch")
95
- if spec is None:
96
- # If we're in a test that somehow bypassed the IN_TEST_ENV check,
97
- # still return a mock result instead of failing
98
- logging.warning("torch module not found, returning mock result")
99
- return json.dumps({"text": "Mock OCR text (torch not available)"})
100
-
101
- # Ensure dependencies are installed
102
- self.ensure_installed("torch")
103
- self.ensure_installed("transformers")
104
- except ImportError:
105
- # If importlib.util is not available, fall back to direct try/except
106
- pass
107
-
108
- # Import dependencies only when needed
109
85
  try:
110
86
  import torch
87
+ except ImportError as exc:
88
+ raise ImportError(self._missing_dependency_message("torch")) from exc
89
+
90
+ try:
111
91
  from transformers import DonutProcessor as TransformersDonutProcessor
112
92
  from transformers import VisionEncoderDecoderModel
113
93
  except ImportError as e:
114
- logging.warning(f"Import error: {e}, returning mock result")
115
- return json.dumps({"text": f"Mock OCR text (import error: {e})"})
94
+ raise ImportError(
95
+ self._missing_dependency_message("transformers")
96
+ ) from e
116
97
 
117
98
  # Preprocess the image
118
99
  image_np = self.preprocess_image(image)
119
100
 
120
101
  # Initialize model components
121
- processor = TransformersDonutProcessor.from_pretrained(self.model_path)
122
- model = VisionEncoderDecoderModel.from_pretrained(self.model_path)
102
+ try:
103
+ processor = TransformersDonutProcessor.from_pretrained(
104
+ self.model_path,
105
+ local_files_only=True,
106
+ )
107
+ model = VisionEncoderDecoderModel.from_pretrained(
108
+ self.model_path,
109
+ local_files_only=True,
110
+ )
111
+ except OSError as exc:
112
+ raise RuntimeError(
113
+ f"Donut model {self.model_path!r} is not available locally. "
114
+ "Download it explicitly before using Donut OCR, or pass a local "
115
+ "model path."
116
+ ) from exc
117
+
123
118
  device = "cuda" if torch.cuda.is_available() else "cpu"
124
119
  model.to(device)
125
120
  model.eval()
@@ -153,6 +148,8 @@ class DonutProcessor:
153
148
  result = processor.token2json(sequence)
154
149
  return json.dumps(result)
155
150
 
151
+ except (ImportError, RuntimeError):
152
+ raise
156
153
  except Exception as e:
157
154
  logging.error(f"Error in extract_text_from_image: {e}")
158
155
  # Return a placeholder in case of error
@@ -2,17 +2,16 @@
2
2
  PySpark UDFs for PII annotation and related utilities.
3
3
 
4
4
  This module provides functions for PII (Personally Identifiable Information) annotation
5
- using SpaCy models in a PySpark environment. It includes utilities for installing
6
- dependencies, creating and broadcasting PII annotator UDFs, and performing PII annotation
7
- on text data.
5
+ using SpaCy models in a PySpark environment. It includes utilities for validating
6
+ dependencies, creating and broadcasting PII annotator UDFs, and performing PII
7
+ annotation on text data.
8
8
  """
9
9
 
10
10
  import importlib
11
- import subprocess
12
- import sys
13
11
 
14
12
  PII_ANNOTATION_LABELS = ["DATE_TIME", "LOC", "NRP", "ORG", "PER"]
15
13
  MAXIMAL_STRING_SIZE = 1000000
14
+ DEFAULT_SPACY_MODEL = "en_core_web_lg"
16
15
 
17
16
 
18
17
  def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
@@ -45,7 +44,7 @@ def pii_annotator(text: str, broadcasted_nlp) -> list[list[str]]:
45
44
 
46
45
 
47
46
  def broadcast_pii_annotator_udf(
48
- spark_session=None, spacy_model: str = "en_core_web_lg"
47
+ spark_session=None, spacy_model: str = DEFAULT_SPACY_MODEL
49
48
  ):
50
49
  """Broadcast PII annotator across Spark cluster and create UDF"""
51
50
  ensure_installed("pyspark")
@@ -69,5 +68,14 @@ def broadcast_pii_annotator_udf(
69
68
  def ensure_installed(package_name):
70
69
  try:
71
70
  importlib.import_module(package_name)
72
- except ImportError:
73
- subprocess.check_call([sys.executable, "-m", "pip", "install", package_name])
71
+ except ImportError as exc:
72
+ if package_name == "pyspark":
73
+ extra = "distributed"
74
+ elif package_name == "spacy":
75
+ extra = "nlp"
76
+ else:
77
+ extra = "all"
78
+ raise ImportError(
79
+ f"{package_name} is required for Spark PII UDF support. "
80
+ f"Install with: pip install datafog[{extra}]"
81
+ ) from exc
@@ -79,14 +79,18 @@ class GLiNERAnnotator(BaseModel):
79
79
 
80
80
  try:
81
81
  # Load the GLiNER model
82
- model = GLiNER.from_pretrained(model_name)
82
+ model = GLiNER.from_pretrained(model_name, local_files_only=True)
83
83
  logging.info(f"Successfully loaded GLiNER model: {model_name}")
84
84
 
85
85
  return cls(model=model, entity_types=entity_types, model_name=model_name)
86
86
 
87
87
  except Exception as e:
88
88
  logging.error(f"Failed to load GLiNER model {model_name}: {str(e)}")
89
- raise
89
+ raise RuntimeError(
90
+ f"GLiNER model {model_name!r} is not available locally. "
91
+ "Download it explicitly with: "
92
+ f"datafog download-model {model_name} --engine gliner"
93
+ ) from e
90
94
 
91
95
  def annotate(self, text: str) -> Dict[str, List[str]]:
92
96
  """
@@ -24,39 +24,34 @@ PII_ANNOTATION_LABELS = [
24
24
  "WORK_OF_ART",
25
25
  ]
26
26
  MAXIMAL_STRING_SIZE = 1000000
27
+ DEFAULT_SPACY_MODEL = "en_core_web_lg"
27
28
 
28
29
 
29
30
  class SpacyPIIAnnotator(BaseModel):
30
31
  model_config = ConfigDict(arbitrary_types_allowed=True)
31
32
 
32
33
  nlp: Any
34
+ model_name: str = DEFAULT_SPACY_MODEL
33
35
 
34
36
  @classmethod
35
- def create(cls) -> "SpacyPIIAnnotator":
36
- import spacy
37
-
37
+ def create(cls, model_name: str = DEFAULT_SPACY_MODEL) -> "SpacyPIIAnnotator":
38
38
  try:
39
- nlp = spacy.load("en_core_web_lg")
40
- except OSError:
41
- import subprocess
42
- import sys
39
+ import spacy
40
+ except ImportError as exc:
41
+ raise ImportError(
42
+ "SpaCy engine requires the nlp extra. "
43
+ "Install with: pip install datafog[nlp]"
44
+ ) from exc
43
45
 
44
- interpreter_location = sys.executable
45
- subprocess.run(
46
- [
47
- interpreter_location,
48
- "-m",
49
- "pip",
50
- "install",
51
- "--no-deps",
52
- "--no-cache-dir",
53
- "https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl",
54
- ],
55
- check=True,
56
- )
57
- nlp = spacy.load("en_core_web_lg")
46
+ try:
47
+ nlp = spacy.load(model_name)
48
+ except OSError as exc:
49
+ raise ImportError(
50
+ f"spaCy model {model_name!r} is not installed. "
51
+ f"Download it explicitly with: datafog download-model {model_name} --engine spacy"
52
+ ) from exc
58
53
 
59
- return cls(nlp=nlp)
54
+ return cls(nlp=nlp, model_name=model_name)
60
55
 
61
56
  def annotate(self, text: str) -> Dict[str, List[str]]:
62
57
  try:
@@ -1,14 +1,12 @@
1
1
  """
2
2
  Spark service for data processing and analysis.
3
3
 
4
- Provides a wrapper around PySpark functionality, including session creation,
5
- JSON reading, and package management.
4
+ Provides a wrapper around PySpark functionality, including session creation and
5
+ JSON reading.
6
6
  """
7
7
 
8
8
  import importlib
9
9
  import os
10
- import subprocess
11
- import sys
12
10
  from typing import List
13
11
 
14
12
 
@@ -16,14 +14,13 @@ class SparkService:
16
14
  """
17
15
  Manages Spark operations and dependencies.
18
16
 
19
- Initializes a Spark session, handles imports, and provides methods for
20
- data reading and package installation.
17
+ Initializes a Spark session, handles imports, and provides methods for data
18
+ reading.
21
19
  """
22
20
 
23
21
  def __init__(self, master=None):
24
22
  self.master = master
25
23
 
26
- # Ensure pyspark is installed first
27
24
  self.ensure_installed("pyspark")
28
25
 
29
26
  # Now import necessary modules after ensuring pyspark is installed
@@ -84,16 +81,8 @@ class SparkService:
84
81
  def ensure_installed(self, package_name):
85
82
  try:
86
83
  importlib.import_module(package_name)
87
- except ImportError:
88
- print(f"Installing {package_name}...")
89
- try:
90
- subprocess.check_call(
91
- [sys.executable, "-m", "pip", "install", package_name]
92
- )
93
- print(f"{package_name} installed successfully.")
94
- except subprocess.CalledProcessError as e:
95
- print(f"Failed to install {package_name}: {e}")
96
- raise ImportError(
97
- f"Could not install {package_name}. "
98
- f"Please install it manually with 'pip install {package_name}'."
99
- )
84
+ except ImportError as exc:
85
+ raise ImportError(
86
+ f"{package_name} is required for Spark support. "
87
+ "Install with: pip install datafog[distributed]"
88
+ ) from exc