kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46) hide show
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
@@ -1,5 +1,3 @@
1
- """General-purpose file-based caching layer for Kreuzberg."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import hashlib
@@ -14,6 +12,7 @@ from typing import Any, Generic, TypeVar
14
12
  from anyio import Path as AsyncPath
15
13
 
16
14
  from kreuzberg._types import ExtractionResult
15
+ from kreuzberg._utils._ref import Ref
17
16
  from kreuzberg._utils._serialization import deserialize, serialize
18
17
  from kreuzberg._utils._sync import run_sync
19
18
 
@@ -57,22 +56,12 @@ class KreuzbergCache(Generic[T]):
57
56
  self._lock = threading.Lock()
58
57
 
59
58
  def _get_cache_key(self, **kwargs: Any) -> str:
60
- """Generate cache key from kwargs.
61
-
62
- Args:
63
- **kwargs: Key-value pairs to generate cache key from
64
-
65
- Returns:
66
- Unique cache key string
67
- """
68
59
  if not kwargs:
69
60
  return "empty"
70
61
 
71
- # Build cache key using list + join (faster than StringIO)
72
62
  parts = []
73
63
  for key in sorted(kwargs):
74
64
  value = kwargs[key]
75
- # Convert common types efficiently
76
65
  if isinstance(value, (str, int, float, bool)):
77
66
  parts.append(f"{key}={value}")
78
67
  elif isinstance(value, bytes):
@@ -81,15 +70,12 @@ class KreuzbergCache(Generic[T]):
81
70
  parts.append(f"{key}={type(value).__name__}:{value!s}")
82
71
 
83
72
  cache_str = "&".join(parts)
84
- # SHA256 is secure and fast enough for cache keys
85
73
  return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
86
74
 
87
75
  def _get_cache_path(self, cache_key: str) -> Path:
88
- """Get cache file path for key."""
89
76
  return self.cache_dir / f"{cache_key}.msgpack"
90
77
 
91
78
  def _is_cache_valid(self, cache_path: Path) -> bool:
92
- """Check if cached result is still valid."""
93
79
  try:
94
80
  if not cache_path.exists():
95
81
  return False
@@ -102,18 +88,14 @@ class KreuzbergCache(Generic[T]):
102
88
  return False
103
89
 
104
90
  def _serialize_result(self, result: T) -> dict[str, Any]:
105
- """Serialize result for caching with metadata."""
106
- # Handle TableData objects that contain DataFrames
107
91
  if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
108
92
  serialized_data = []
109
93
  for item in result:
110
94
  if isinstance(item, dict) and "df" in item:
111
- # Build new dict without unnecessary copy
112
95
  serialized_item = {k: v for k, v in item.items() if k != "df"}
113
96
  if hasattr(item["df"], "to_csv"):
114
97
  serialized_item["df_csv"] = item["df"].to_csv(index=False)
115
98
  else:
116
- # Fallback for non-DataFrame objects
117
99
  serialized_item["df_csv"] = str(item["df"])
118
100
  serialized_data.append(serialized_item)
119
101
  else:
@@ -123,7 +105,6 @@ class KreuzbergCache(Generic[T]):
123
105
  return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
124
106
 
125
107
  def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
126
- """Deserialize cached result."""
127
108
  data = cached_data["data"]
128
109
 
129
110
  if cached_data.get("type") == "TableDataList" and isinstance(data, list):
@@ -132,7 +113,6 @@ class KreuzbergCache(Generic[T]):
132
113
  deserialized_data = []
133
114
  for item in data:
134
115
  if isinstance(item, dict) and "df_csv" in item:
135
- # Build new dict without unnecessary copy
136
116
  deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
137
117
  deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
138
118
  deserialized_data.append(deserialized_item)
@@ -146,7 +126,6 @@ class KreuzbergCache(Generic[T]):
146
126
  return data # type: ignore[no-any-return]
147
127
 
148
128
  def _cleanup_cache(self) -> None:
149
- """Clean up old and oversized cache entries."""
150
129
  try:
151
130
  cache_files = list(self.cache_dir.glob("*.msgpack"))
152
131
 
@@ -331,87 +310,106 @@ class KreuzbergCache(Generic[T]):
331
310
  }
332
311
 
333
312
 
334
- _ocr_cache: KreuzbergCache[ExtractionResult] | None = None
335
- _document_cache: KreuzbergCache[ExtractionResult] | None = None
336
- _table_cache: KreuzbergCache[Any] | None = None
337
- _mime_cache: KreuzbergCache[str] | None = None
313
+ def _create_ocr_cache() -> KreuzbergCache[ExtractionResult]:
314
+ cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
315
+ cache_dir: Path | None = None
316
+ if cache_dir_str:
317
+ cache_dir = Path(cache_dir_str) / "ocr"
318
+
319
+ return KreuzbergCache[ExtractionResult](
320
+ cache_type="ocr",
321
+ cache_dir=cache_dir,
322
+ max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
323
+ max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
324
+ )
325
+
326
+
327
+ _ocr_cache_ref = Ref("ocr_cache", _create_ocr_cache)
338
328
 
339
329
 
340
330
  def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
341
- """Get the global OCR cache instance."""
342
- global _ocr_cache
343
- if _ocr_cache is None:
344
- cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
345
- cache_dir: Path | None = None
346
- if cache_dir_str:
347
- cache_dir = Path(cache_dir_str) / "ocr"
348
-
349
- _ocr_cache = KreuzbergCache[ExtractionResult](
350
- cache_type="ocr",
351
- cache_dir=cache_dir,
352
- max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
353
- max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
354
- )
355
- return _ocr_cache
331
+ """Get the OCR cache instance."""
332
+ return _ocr_cache_ref.get()
333
+
334
+
335
+ def _create_document_cache() -> KreuzbergCache[ExtractionResult]:
336
+ cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
337
+ cache_dir: Path | None = None
338
+ if cache_dir_str:
339
+ cache_dir = Path(cache_dir_str) / "documents"
340
+
341
+ return KreuzbergCache[ExtractionResult](
342
+ cache_type="documents",
343
+ cache_dir=cache_dir,
344
+ max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
345
+ max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
346
+ )
347
+
348
+
349
+ _document_cache_ref = Ref("document_cache", _create_document_cache)
356
350
 
357
351
 
358
352
  def get_document_cache() -> KreuzbergCache[ExtractionResult]:
359
- """Get the global document cache instance."""
360
- global _document_cache
361
- if _document_cache is None:
362
- cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
363
- cache_dir: Path | None = None
364
- if cache_dir_str:
365
- cache_dir = Path(cache_dir_str) / "documents"
366
-
367
- _document_cache = KreuzbergCache[ExtractionResult](
368
- cache_type="documents",
369
- cache_dir=cache_dir,
370
- max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
371
- max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
372
- )
373
- return _document_cache
353
+ """Get the document cache instance."""
354
+ return _document_cache_ref.get()
355
+
356
+
357
+ def _create_table_cache() -> KreuzbergCache[Any]:
358
+ cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
359
+ cache_dir: Path | None = None
360
+ if cache_dir_str:
361
+ cache_dir = Path(cache_dir_str) / "tables"
362
+
363
+ return KreuzbergCache[Any](
364
+ cache_type="tables",
365
+ cache_dir=cache_dir,
366
+ max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
367
+ max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
368
+ )
369
+
370
+
371
+ _table_cache_ref = Ref("table_cache", _create_table_cache)
374
372
 
375
373
 
376
374
  def get_table_cache() -> KreuzbergCache[Any]:
377
- """Get the global table cache instance."""
378
- global _table_cache
379
- if _table_cache is None:
380
- cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
381
- cache_dir: Path | None = None
382
- if cache_dir_str:
383
- cache_dir = Path(cache_dir_str) / "tables"
384
-
385
- _table_cache = KreuzbergCache[Any](
386
- cache_type="tables",
387
- cache_dir=cache_dir,
388
- max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
389
- max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
390
- )
391
- return _table_cache
375
+ """Get the table cache instance."""
376
+ return _table_cache_ref.get()
377
+
378
+
379
+ def _create_mime_cache() -> KreuzbergCache[str]:
380
+ cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
381
+ cache_dir: Path | None = None
382
+ if cache_dir_str:
383
+ cache_dir = Path(cache_dir_str) / "mime"
384
+
385
+ return KreuzbergCache[str](
386
+ cache_type="mime",
387
+ cache_dir=cache_dir,
388
+ max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
389
+ max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
390
+ )
391
+
392
+
393
+ _mime_cache_ref = Ref("mime_cache", _create_mime_cache)
392
394
 
393
395
 
394
396
  def get_mime_cache() -> KreuzbergCache[str]:
395
- """Get the global MIME type cache instance."""
396
- global _mime_cache
397
- if _mime_cache is None:
398
- cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
399
- cache_dir: Path | None = None
400
- if cache_dir_str:
401
- cache_dir = Path(cache_dir_str) / "mime"
402
-
403
- _mime_cache = KreuzbergCache[str](
404
- cache_type="mime",
405
- cache_dir=cache_dir,
406
- max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
407
- max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
408
- )
409
- return _mime_cache
397
+ """Get the MIME type cache instance."""
398
+ return _mime_cache_ref.get()
410
399
 
411
400
 
412
401
  def clear_all_caches() -> None:
413
402
  """Clear all caches."""
414
- get_ocr_cache().clear()
415
- get_document_cache().clear()
416
- get_table_cache().clear()
417
- get_mime_cache().clear()
403
+ if _ocr_cache_ref.is_initialized():
404
+ get_ocr_cache().clear()
405
+ if _document_cache_ref.is_initialized():
406
+ get_document_cache().clear()
407
+ if _table_cache_ref.is_initialized():
408
+ get_table_cache().clear()
409
+ if _mime_cache_ref.is_initialized():
410
+ get_mime_cache().clear()
411
+
412
+ _ocr_cache_ref.clear()
413
+ _document_cache_ref.clear()
414
+ _table_cache_ref.clear()
415
+ _mime_cache_ref.clear()
@@ -1,4 +1,3 @@
1
- """Device detection and management utilities for GPU acceleration."""
2
1
  # ruff: noqa: BLE001 # ~keep
3
2
 
4
3
  from __future__ import annotations
@@ -35,7 +34,6 @@ def detect_available_devices() -> list[DeviceInfo]:
35
34
  Returns:
36
35
  List of available devices, with the most preferred device first.
37
36
  """
38
- # Build device lists efficiently using generators
39
37
  cpu_device = DeviceInfo(device_type="cpu", name="CPU")
40
38
 
41
39
  cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
@@ -43,7 +41,6 @@ def detect_available_devices() -> list[DeviceInfo]:
43
41
  mps_device = _get_mps_device() if _is_mps_available() else None
44
42
  mps_devices = [mps_device] if mps_device else []
45
43
 
46
- # Return GPU devices first, then CPU using itertools.chain
47
44
  gpu_devices = list(chain(cuda_devices, mps_devices))
48
45
  return [*gpu_devices, cpu_device]
49
46
 
@@ -139,7 +136,6 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | No
139
136
 
140
137
 
141
138
  def _is_cuda_available() -> bool:
142
- """Check if CUDA is available."""
143
139
  try:
144
140
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
145
141
 
@@ -149,7 +145,6 @@ def _is_cuda_available() -> bool:
149
145
 
150
146
 
151
147
  def _is_mps_available() -> bool:
152
- """Check if MPS (Apple Silicon) is available."""
153
148
  try:
154
149
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
155
150
 
@@ -159,7 +154,6 @@ def _is_mps_available() -> bool:
159
154
 
160
155
 
161
156
  def _get_cuda_devices() -> list[DeviceInfo]:
162
- """Get information about available CUDA devices."""
163
157
  devices: list[DeviceInfo] = []
164
158
 
165
159
  try:
@@ -197,7 +191,6 @@ def _get_cuda_devices() -> list[DeviceInfo]:
197
191
 
198
192
 
199
193
  def _get_mps_device() -> DeviceInfo | None:
200
- """Get information about the MPS device."""
201
194
  try:
202
195
  import torch # noqa: PLC0415
203
196
 
@@ -214,7 +207,6 @@ def _get_mps_device() -> DeviceInfo | None:
214
207
 
215
208
 
216
209
  def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
217
- """Get CUDA memory information for a specific device."""
218
210
  try:
219
211
  import torch # noqa: PLC0415
220
212
 
@@ -237,20 +229,10 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
237
229
 
238
230
 
239
231
  def _get_mps_memory_info() -> tuple[float | None, float | None]:
240
- """Get MPS memory information."""
241
232
  return None, None
242
233
 
243
234
 
244
235
  def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
245
- """Validate that a device has enough memory for the requested limit.
246
-
247
- Args:
248
- device: The device to validate.
249
- memory_limit: Required memory in GB.
250
-
251
- Raises:
252
- ValidationError: If the device doesn't have enough memory.
253
- """
254
236
  if device.device_type == "cpu":
255
237
  # CPU memory validation is complex and OS-dependent, skip for now # ~keep
256
238
  return
@@ -1,5 +1,3 @@
1
- """Document-level caching to prevent pypdfium2 issues with duplicate processing."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import hashlib
@@ -1,5 +1,3 @@
1
- """Enhanced error handling utilities."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import platform
@@ -12,7 +10,6 @@ import psutil
12
10
 
13
11
  from kreuzberg.exceptions import ValidationError
14
12
 
15
- # Define error keywords as frozensets for O(1) membership testing
16
13
  _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
17
14
  _TRANSIENT_ERROR_PATTERNS = frozenset(
18
15
  {
@@ -1,5 +1,3 @@
1
- """PDF processing lock utilities for thread-safe pypdfium2 operations."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import hashlib
@@ -1,5 +1,3 @@
1
- """Process pool utilities for CPU-intensive operations."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import io
@@ -13,6 +11,8 @@ import psutil
13
11
  import pypdfium2
14
12
  from typing_extensions import Self
15
13
 
14
+ from kreuzberg._utils._ref import Ref
15
+
16
16
  if TYPE_CHECKING:
17
17
  import types
18
18
  from collections.abc import Callable, Generator
@@ -20,27 +20,29 @@ if TYPE_CHECKING:
20
20
  T = TypeVar("T")
21
21
 
22
22
 
23
- _PROCESS_POOL: ProcessPoolExecutor | None = None
24
23
  _POOL_SIZE = max(1, mp.cpu_count() - 1)
25
24
 
26
25
 
27
- def _init_process_pool() -> ProcessPoolExecutor:
28
- """Initialize the global process pool."""
29
- global _PROCESS_POOL
30
- if _PROCESS_POOL is None:
31
- _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
32
- return _PROCESS_POOL
26
+ def _create_process_pool() -> ProcessPoolExecutor:
27
+ return ProcessPoolExecutor(max_workers=_POOL_SIZE)
28
+
29
+
30
+ _process_pool_ref = Ref("process_pool", _create_process_pool)
31
+
32
+
33
+ def _get_process_pool() -> ProcessPoolExecutor:
34
+ return _process_pool_ref.get()
33
35
 
34
36
 
35
37
  @contextmanager
36
38
  def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
37
- """Get the global process pool."""
38
- pool = _init_process_pool()
39
+ """Get the process pool."""
40
+ pool = _get_process_pool()
39
41
  try:
40
42
  yield pool
41
43
  except Exception: # noqa: BLE001
42
44
  shutdown_process_pool()
43
- pool = _init_process_pool()
45
+ pool = _get_process_pool()
44
46
  yield pool
45
47
 
46
48
 
@@ -52,15 +54,14 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) ->
52
54
 
53
55
 
54
56
  def shutdown_process_pool() -> None:
55
- """Shutdown the global process pool."""
56
- global _PROCESS_POOL
57
- if _PROCESS_POOL is not None:
58
- _PROCESS_POOL.shutdown(wait=True)
59
- _PROCESS_POOL = None
57
+ """Shutdown the process pool."""
58
+ if _process_pool_ref.is_initialized():
59
+ pool = _process_pool_ref.get()
60
+ pool.shutdown(wait=True)
61
+ _process_pool_ref.clear()
60
62
 
61
63
 
62
64
  def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
63
- """Worker function for extracting PDF text in a separate process."""
64
65
  pdf = None
65
66
  try:
66
67
  pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
80
81
 
81
82
 
82
83
  def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
83
- """Worker function for converting PDF to images in a separate process."""
84
84
  pdf = None
85
85
  try:
86
86
  pdf = pypdfium2.PdfDocument(pdf_path)
@@ -1,14 +1,10 @@
1
- """Quality post-processing utilities for extracted text."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import re
6
4
  from functools import reduce
7
5
  from typing import Any
8
6
 
9
- # Pre-compiled patterns for performance
10
7
  _OCR_ARTIFACTS = {
11
- # Common OCR misreads
12
8
  "scattered_chars": re.compile(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
13
9
  "repeated_punctuation": re.compile(r"[.]{3,}|[-]{3,}|[_]{3,}"),
14
10
  "isolated_punctuation": re.compile(r"\s[.,;:!?]\s"),
@@ -17,7 +13,6 @@ _OCR_ARTIFACTS = {
17
13
  "broken_sentences": re.compile(r"[a-z]\s{3,}[A-Z][a-z]"),
18
14
  }
19
15
 
20
- # Combined pattern for faster OCR penalty calculation
21
16
  _COMBINED_OCR_PATTERN = re.compile(
22
17
  r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)|"
23
18
  r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})|"
@@ -27,14 +22,12 @@ _COMBINED_OCR_PATTERN = re.compile(
27
22
  r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])"
28
23
  )
29
24
 
30
- # Pre-compiled patterns for text normalization
31
25
  _WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
32
26
  _NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
33
27
  _SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
34
28
  _PUNCTUATION_DETECT = re.compile(r"[.!?]")
35
29
 
36
30
  _SCRIPT_PATTERNS = {
37
- # JavaScript and CSS content
38
31
  "js_functions": re.compile(r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
39
32
  "css_rules": re.compile(r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
40
33
  "script_tags": re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
@@ -63,27 +56,21 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -
63
56
  if not text or not text.strip():
64
57
  return 0.0
65
58
 
66
- # Initialize score
67
59
  score = 1.0
68
60
  total_chars = len(text)
69
61
 
70
- # Penalize OCR artifacts
71
62
  ocr_penalty = _calculate_ocr_penalty(text, total_chars)
72
63
  score -= ocr_penalty * 0.3
73
64
 
74
- # Penalize script/style content
75
65
  script_penalty = _calculate_script_penalty(text, total_chars)
76
66
  score -= script_penalty * 0.2
77
67
 
78
- # Penalize navigation content
79
68
  nav_penalty = _calculate_navigation_penalty(text, total_chars)
80
69
  score -= nav_penalty * 0.1
81
70
 
82
- # Bonus for structure (sentences, paragraphs)
83
71
  structure_bonus = _calculate_structure_bonus(text)
84
72
  score += structure_bonus * 0.2
85
73
 
86
- # Bonus for metadata richness
87
74
  if metadata:
88
75
  metadata_bonus = _calculate_metadata_bonus(metadata)
89
76
  score += metadata_bonus * 0.1
@@ -103,16 +90,12 @@ def clean_extracted_text(text: str) -> str:
103
90
  if not text:
104
91
  return text
105
92
 
106
- # Remove script and style content using functools.reduce for single pass
107
93
  text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
108
94
 
109
- # Clean OCR artifacts
110
95
  text = _clean_ocr_artifacts(text)
111
96
 
112
- # Clean navigation elements
113
97
  text = _clean_navigation_elements(text)
114
98
 
115
- # Normalize whitespace using pre-compiled patterns
116
99
  text = _WHITESPACE_NORMALIZE.sub(" ", text)
117
100
  text = _NEWLINE_NORMALIZE.sub("\n\n", text)
118
101
 
@@ -120,72 +103,57 @@ def clean_extracted_text(text: str) -> str:
120
103
 
121
104
 
122
105
  def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
123
- """Calculate penalty for OCR artifacts."""
124
106
  if total_chars == 0:
125
107
  return 0.0
126
108
 
127
- # Use combined pattern for single-pass processing
128
109
  artifact_chars = sum(len(match.group()) for match in _COMBINED_OCR_PATTERN.finditer(text))
129
110
  return min(1.0, artifact_chars / total_chars)
130
111
 
131
112
 
132
113
  def _calculate_script_penalty(text: str, total_chars: int) -> float:
133
- """Calculate penalty for script/style content."""
134
114
  if total_chars == 0:
135
115
  return 0.0
136
116
 
137
- # Use sum with generator expression for single-pass calculation
138
117
  script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
139
118
 
140
119
  return min(1.0, script_chars / total_chars)
141
120
 
142
121
 
143
122
  def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
144
- """Calculate penalty for navigation content."""
145
123
  if total_chars == 0:
146
124
  return 0.0
147
125
 
148
- # Use sum with generator expression for single-pass calculation
149
126
  nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
150
127
 
151
128
  return min(1.0, nav_chars / total_chars)
152
129
 
153
130
 
154
131
  def _calculate_structure_bonus(text: str) -> float:
155
- """Calculate bonus for proper text structure."""
156
132
  if not text:
157
133
  return 0.0
158
134
 
159
- # Count sentences (rough heuristic)
160
135
  sentence_count = len(_SENTENCE_DETECT.findall(text))
161
136
 
162
- # Count paragraphs
163
137
  paragraph_count = len(text.split("\n\n"))
164
138
 
165
- # Calculate structure score
166
139
  words = len(text.split())
167
140
  if words == 0:
168
141
  return 0.0
169
142
 
170
- # Good structure: reasonable sentence and paragraph distribution
171
143
  avg_words_per_sentence = words / max(1, sentence_count)
172
144
  avg_words_per_paragraph = words / max(1, paragraph_count)
173
145
 
174
146
  structure_score = 0.0
175
147
 
176
- # Bonus for reasonable sentence length (10-30 words)
177
148
  if 10 <= avg_words_per_sentence <= 30:
178
149
  structure_score += 0.3
179
150
 
180
- # Bonus for reasonable paragraph length (50-300 words)
181
151
  if 50 <= avg_words_per_paragraph <= 300:
182
152
  structure_score += 0.3
183
153
 
184
- # Bonus for having multiple paragraphs
185
154
  if paragraph_count > 1:
186
155
  structure_score += 0.2
187
156
 
188
- # Bonus for having punctuation
189
157
  if _PUNCTUATION_DETECT.search(text):
190
158
  structure_score += 0.2
191
159
 
@@ -193,7 +161,6 @@ def _calculate_structure_bonus(text: str) -> float:
193
161
 
194
162
 
195
163
  def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
196
- """Calculate bonus for rich metadata."""
197
164
  if not metadata:
198
165
  return 0.0
199
166
 
@@ -204,30 +171,20 @@ def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
204
171
 
205
172
 
206
173
  def _clean_ocr_artifacts(text: str) -> str:
207
- """Remove common OCR artifacts from text."""
208
- # Fix scattered characters (likely OCR errors)
209
174
  text = _OCR_ARTIFACTS["scattered_chars"].sub(lambda m: m.group().replace(" ", ""), text)
210
175
 
211
- # Clean repeated punctuation
212
176
  text = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", text)
213
177
 
214
- # Fix isolated punctuation
215
178
  text = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", text)
216
179
 
217
- # Remove malformed words with numbers mixed in
218
180
  text = _OCR_ARTIFACTS["malformed_words"].sub(" ", text)
219
181
 
220
- # Normalize excessive whitespace
221
182
  return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", text)
222
183
 
223
184
 
224
185
  def _clean_navigation_elements(text: str) -> str:
225
- """Remove navigation elements from text."""
226
- # Remove navigation words
227
186
  text = _NAVIGATION_PATTERNS["nav_words"].sub(" ", text)
228
187
 
229
- # Remove breadcrumbs
230
188
  text = _NAVIGATION_PATTERNS["breadcrumbs"].sub(" ", text)
231
189
 
232
- # Remove pagination
233
190
  return _NAVIGATION_PATTERNS["pagination"].sub(" ", text)