kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.0-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_cache.py
CHANGED
@@ -1,5 +1,3 @@
-"""General-purpose file-based caching layer for Kreuzberg."""
-
 from __future__ import annotations
 
 import hashlib
@@ -14,6 +12,7 @@ from typing import Any, Generic, TypeVar
 from anyio import Path as AsyncPath
 
 from kreuzberg._types import ExtractionResult
+from kreuzberg._utils._ref import Ref
 from kreuzberg._utils._serialization import deserialize, serialize
 from kreuzberg._utils._sync import run_sync
 
@@ -57,22 +56,12 @@ class KreuzbergCache(Generic[T]):
         self._lock = threading.Lock()
 
     def _get_cache_key(self, **kwargs: Any) -> str:
-        """Generate cache key from kwargs.
-
-        Args:
-            **kwargs: Key-value pairs to generate cache key from
-
-        Returns:
-            Unique cache key string
-        """
         if not kwargs:
             return "empty"
 
-        # Build cache key using list + join (faster than StringIO)
         parts = []
         for key in sorted(kwargs):
             value = kwargs[key]
-            # Convert common types efficiently
             if isinstance(value, (str, int, float, bool)):
                 parts.append(f"{key}={value}")
             elif isinstance(value, bytes):
@@ -81,15 +70,12 @@ class KreuzbergCache(Generic[T]):
                 parts.append(f"{key}={type(value).__name__}:{value!s}")
 
         cache_str = "&".join(parts)
-        # SHA256 is secure and fast enough for cache keys
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _get_cache_path(self, cache_key: str) -> Path:
-        """Get cache file path for key."""
         return self.cache_dir / f"{cache_key}.msgpack"
 
     def _is_cache_valid(self, cache_path: Path) -> bool:
-        """Check if cached result is still valid."""
         try:
             if not cache_path.exists():
                 return False
@@ -102,18 +88,14 @@ class KreuzbergCache(Generic[T]):
             return False
 
     def _serialize_result(self, result: T) -> dict[str, Any]:
-        """Serialize result for caching with metadata."""
-        # Handle TableData objects that contain DataFrames
         if isinstance(result, list) and result and isinstance(result[0], dict) and "df" in result[0]:
             serialized_data = []
             for item in result:
                 if isinstance(item, dict) and "df" in item:
-                    # Build new dict without unnecessary copy
                     serialized_item = {k: v for k, v in item.items() if k != "df"}
                     if hasattr(item["df"], "to_csv"):
                         serialized_item["df_csv"] = item["df"].to_csv(index=False)
                     else:
-                        # Fallback for non-DataFrame objects
                         serialized_item["df_csv"] = str(item["df"])
                     serialized_data.append(serialized_item)
                 else:
@@ -123,7 +105,6 @@ class KreuzbergCache(Generic[T]):
         return {"type": type(result).__name__, "data": result, "cached_at": time.time()}
 
     def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
-        """Deserialize cached result."""
         data = cached_data["data"]
 
         if cached_data.get("type") == "TableDataList" and isinstance(data, list):
@@ -132,7 +113,6 @@ class KreuzbergCache(Generic[T]):
             deserialized_data = []
             for item in data:
                 if isinstance(item, dict) and "df_csv" in item:
-                    # Build new dict without unnecessary copy
                     deserialized_item = {k: v for k, v in item.items() if k != "df_csv"}
                     deserialized_item["df"] = pd.read_csv(StringIO(item["df_csv"]))
                     deserialized_data.append(deserialized_item)
@@ -146,7 +126,6 @@ class KreuzbergCache(Generic[T]):
         return data # type: ignore[no-any-return]
 
     def _cleanup_cache(self) -> None:
-        """Clean up old and oversized cache entries."""
         try:
             cache_files = list(self.cache_dir.glob("*.msgpack"))
 
@@ -331,87 +310,106 @@
         }
 
 
-
-
-
-
+def _create_ocr_cache() -> KreuzbergCache[ExtractionResult]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "ocr"
+
+    return KreuzbergCache[ExtractionResult](
+        cache_type="ocr",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
+        max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
+    )
+
+
+_ocr_cache_ref = Ref("ocr_cache", _create_ocr_cache)
 
 
 def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the OCR cache instance."""
+    return _ocr_cache_ref.get()
+
+
+def _create_document_cache() -> KreuzbergCache[ExtractionResult]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "documents"
+
+    return KreuzbergCache[ExtractionResult](
+        cache_type="documents",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
+        max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
+    )
+
+
+_document_cache_ref = Ref("document_cache", _create_document_cache)
 
 
 def get_document_cache() -> KreuzbergCache[ExtractionResult]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the document cache instance."""
+    return _document_cache_ref.get()
+
+
+def _create_table_cache() -> KreuzbergCache[Any]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "tables"
+
+    return KreuzbergCache[Any](
+        cache_type="tables",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
+        max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
+    )
+
+
+_table_cache_ref = Ref("table_cache", _create_table_cache)
 
 
 def get_table_cache() -> KreuzbergCache[Any]:
-    """Get the
-
-
-
-
-
-
-
-
-
-
-
-
-    )
-
+    """Get the table cache instance."""
+    return _table_cache_ref.get()
+
+
+def _create_mime_cache() -> KreuzbergCache[str]:
+    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
+    cache_dir: Path | None = None
+    if cache_dir_str:
+        cache_dir = Path(cache_dir_str) / "mime"
+
+    return KreuzbergCache[str](
+        cache_type="mime",
+        cache_dir=cache_dir,
+        max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
+        max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
+    )
+
+
+_mime_cache_ref = Ref("mime_cache", _create_mime_cache)
 
 
 def get_mime_cache() -> KreuzbergCache[str]:
-    """Get the
-
-    if _mime_cache is None:
-        cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
-        cache_dir: Path | None = None
-        if cache_dir_str:
-            cache_dir = Path(cache_dir_str) / "mime"
-
-        _mime_cache = KreuzbergCache[str](
-            cache_type="mime",
-            cache_dir=cache_dir,
-            max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
-            max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
-        )
-    return _mime_cache
+    """Get the MIME type cache instance."""
+    return _mime_cache_ref.get()
 
 
 def clear_all_caches() -> None:
     """Clear all caches."""
-
-
-
-
+    if _ocr_cache_ref.is_initialized():
+        get_ocr_cache().clear()
+    if _document_cache_ref.is_initialized():
+        get_document_cache().clear()
+    if _table_cache_ref.is_initialized():
+        get_table_cache().clear()
+    if _mime_cache_ref.is_initialized():
+        get_mime_cache().clear()
+
+    _ocr_cache_ref.clear()
+    _document_cache_ref.clear()
+    _table_cache_ref.clear()
+    _mime_cache_ref.clear()
kreuzberg/_utils/_device.py
CHANGED
@@ -1,4 +1,3 @@
-"""Device detection and management utilities for GPU acceleration."""
 # ruff: noqa: BLE001 # ~keep
 
 from __future__ import annotations
@@ -35,7 +34,6 @@ def detect_available_devices() -> list[DeviceInfo]:
     Returns:
         List of available devices, with the most preferred device first.
     """
-    # Build device lists efficiently using generators
     cpu_device = DeviceInfo(device_type="cpu", name="CPU")
 
     cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
@@ -43,7 +41,6 @@ def detect_available_devices() -> list[DeviceInfo]:
     mps_device = _get_mps_device() if _is_mps_available() else None
     mps_devices = [mps_device] if mps_device else []
 
-    # Return GPU devices first, then CPU using itertools.chain
     gpu_devices = list(chain(cuda_devices, mps_devices))
     return [*gpu_devices, cpu_device]
 
@@ -139,7 +136,6 @@ def get_device_memory_info(device: DeviceInfo) -> tuple[float | None, float | None]:
 
 
 def _is_cuda_available() -> bool:
-    """Check if CUDA is available."""
     try:
         import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
 
@@ -149,7 +145,6 @@ def _is_cuda_available() -> bool:
 
 
 def _is_mps_available() -> bool:
-    """Check if MPS (Apple Silicon) is available."""
     try:
         import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
 
@@ -159,7 +154,6 @@ def _is_mps_available() -> bool:
 
 
 def _get_cuda_devices() -> list[DeviceInfo]:
-    """Get information about available CUDA devices."""
     devices: list[DeviceInfo] = []
 
     try:
@@ -197,7 +191,6 @@ def _get_cuda_devices() -> list[DeviceInfo]:
 
 
 def _get_mps_device() -> DeviceInfo | None:
-    """Get information about the MPS device."""
     try:
         import torch # noqa: PLC0415
 
@@ -214,7 +207,6 @@ def _get_mps_device() -> DeviceInfo | None:
 
 
 def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
-    """Get CUDA memory information for a specific device."""
    try:
        import torch # noqa: PLC0415
 
@@ -237,20 +229,10 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
 
 
 def _get_mps_memory_info() -> tuple[float | None, float | None]:
-    """Get MPS memory information."""
     return None, None
 
 
 def _validate_memory_limit(device: DeviceInfo, memory_limit: float) -> None:
-    """Validate that a device has enough memory for the requested limit.
-
-    Args:
-        device: The device to validate.
-        memory_limit: Required memory in GB.
-
-    Raises:
-        ValidationError: If the device doesn't have enough memory.
-    """
     if device.device_type == "cpu":
         # CPU memory validation is complex and OS-dependent, skip for now # ~keep
         return
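
Only docstrings and inline comments are removed from this module; the detection logic itself is unchanged. For orientation, a usage sketch against the signatures visible above (`_device` is a private module, so importing it like this is purely illustrative):

from kreuzberg._utils._device import detect_available_devices, get_device_memory_info

devices = detect_available_devices()  # GPU devices first, CPU always last
best = devices[0]
print(best.device_type, best.name)    # e.g. "cuda" / a GPU name, or "cpu" / "CPU"

mem = get_device_memory_info(best)    # two optional figures in GB; (None, None) when unknown, e.g. on MPS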
kreuzberg/_utils/_errors.py
CHANGED
@@ -1,5 +1,3 @@
-"""Enhanced error handling utilities."""
-
 from __future__ import annotations
 
 import platform
@@ -12,7 +10,6 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
-# Define error keywords as frozensets for O(1) membership testing
 _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
 _TRANSIENT_ERROR_PATTERNS = frozenset(
     {
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -1,5 +1,3 @@
-"""Process pool utilities for CPU-intensive operations."""
-
 from __future__ import annotations
 
 import io
@@ -13,6 +11,8 @@ import psutil
 import pypdfium2
 from typing_extensions import Self
 
+from kreuzberg._utils._ref import Ref
+
 if TYPE_CHECKING:
     import types
     from collections.abc import Callable, Generator
@@ -20,27 +20,29 @@ if TYPE_CHECKING:
 T = TypeVar("T")
 
 
-_PROCESS_POOL: ProcessPoolExecutor | None = None
 _POOL_SIZE = max(1, mp.cpu_count() - 1)
 
 
-def
-
-
-
-
-
+def _create_process_pool() -> ProcessPoolExecutor:
+    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
+
+
+_process_pool_ref = Ref("process_pool", _create_process_pool)
+
+
+def _get_process_pool() -> ProcessPoolExecutor:
+    return _process_pool_ref.get()
 
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the
-    pool =
+    """Get the process pool."""
+    pool = _get_process_pool()
     try:
         yield pool
     except Exception: # noqa: BLE001
         shutdown_process_pool()
-        pool =
+        pool = _get_process_pool()
         yield pool
 
 
@@ -52,15 +54,14 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the
-
-
-
-
+    """Shutdown the process pool."""
+    if _process_pool_ref.is_initialized():
+        pool = _process_pool_ref.get()
+        pool.shutdown(wait=True)
+        _process_pool_ref.clear()
 
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
-    """Worker function for extracting PDF text in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +81,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
-    """Worker function for converting PDF to images in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
kreuzberg/_utils/_quality.py
CHANGED
@@ -1,14 +1,10 @@
-"""Quality post-processing utilities for extracted text."""
-
 from __future__ import annotations
 
 import re
 from functools import reduce
 from typing import Any
 
-# Pre-compiled patterns for performance
 _OCR_ARTIFACTS = {
-    # Common OCR misreads
     "scattered_chars": re.compile(r"\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b"),
     "repeated_punctuation": re.compile(r"[.]{3,}|[-]{3,}|[_]{3,}"),
     "isolated_punctuation": re.compile(r"\s[.,;:!?]\s"),
@@ -17,7 +13,6 @@ _OCR_ARTIFACTS = {
     "broken_sentences": re.compile(r"[a-z]\s{3,}[A-Z][a-z]"),
 }
 
-# Combined pattern for faster OCR penalty calculation
 _COMBINED_OCR_PATTERN = re.compile(
     r"(?P<scattered>\b[a-zA-Z]\s{2,}[a-zA-Z]\s{2,}[a-zA-Z]\b)|"
     r"(?P<repeated>[.]{3,}|[-]{3,}|[_]{3,})|"
@@ -27,14 +22,12 @@ _COMBINED_OCR_PATTERN = re.compile(
     r"(?P<broken>[a-z]\s{3,}[A-Z][a-z])"
 )
 
-# Pre-compiled patterns for text normalization
 _WHITESPACE_NORMALIZE = re.compile(r"[ \t\f\v\r\xa0\u2000-\u200b\u2028\u2029\u3000]+")
 _NEWLINE_NORMALIZE = re.compile(r"\n\s*\n\s*\n+")
 _SENTENCE_DETECT = re.compile(r"[.!?]\s+[A-Z]")
 _PUNCTUATION_DETECT = re.compile(r"[.!?]")
 
 _SCRIPT_PATTERNS = {
-    # JavaScript and CSS content
     "js_functions": re.compile(r"function\s+\w+\s*\([^)]*\)\s*\{[^}]*\}", re.IGNORECASE),
     "css_rules": re.compile(r"\.[a-zA-Z][\w-]*\s*\{[^}]*\}", re.IGNORECASE),
     "script_tags": re.compile(r"<script[^>]*>.*?</script>", re.DOTALL | re.IGNORECASE),
@@ -63,27 +56,21 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
     if not text or not text.strip():
         return 0.0
 
-    # Initialize score
     score = 1.0
     total_chars = len(text)
 
-    # Penalize OCR artifacts
     ocr_penalty = _calculate_ocr_penalty(text, total_chars)
     score -= ocr_penalty * 0.3
 
-    # Penalize script/style content
     script_penalty = _calculate_script_penalty(text, total_chars)
     score -= script_penalty * 0.2
 
-    # Penalize navigation content
     nav_penalty = _calculate_navigation_penalty(text, total_chars)
     score -= nav_penalty * 0.1
 
-    # Bonus for structure (sentences, paragraphs)
     structure_bonus = _calculate_structure_bonus(text)
     score += structure_bonus * 0.2
 
-    # Bonus for metadata richness
     if metadata:
         metadata_bonus = _calculate_metadata_bonus(metadata)
         score += metadata_bonus * 0.1
@@ -103,16 +90,12 @@ def clean_extracted_text(text: str) -> str:
     if not text:
         return text
 
-    # Remove script and style content using functools.reduce for single pass
     text = reduce(lambda t, pattern: pattern.sub(" ", t), _SCRIPT_PATTERNS.values(), text)
 
-    # Clean OCR artifacts
     text = _clean_ocr_artifacts(text)
 
-    # Clean navigation elements
     text = _clean_navigation_elements(text)
 
-    # Normalize whitespace using pre-compiled patterns
     text = _WHITESPACE_NORMALIZE.sub(" ", text)
     text = _NEWLINE_NORMALIZE.sub("\n\n", text)
 
@@ -120,72 +103,57 @@ def clean_extracted_text(text: str) -> str:
 
 
 def _calculate_ocr_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for OCR artifacts."""
     if total_chars == 0:
         return 0.0
 
-    # Use combined pattern for single-pass processing
     artifact_chars = sum(len(match.group()) for match in _COMBINED_OCR_PATTERN.finditer(text))
     return min(1.0, artifact_chars / total_chars)
 
 
 def _calculate_script_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for script/style content."""
     if total_chars == 0:
         return 0.0
 
-    # Use sum with generator expression for single-pass calculation
     script_chars = sum(len(match) for pattern in _SCRIPT_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, script_chars / total_chars)
 
 
 def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
-    """Calculate penalty for navigation content."""
     if total_chars == 0:
         return 0.0
 
-    # Use sum with generator expression for single-pass calculation
     nav_chars = sum(len(match) for pattern in _NAVIGATION_PATTERNS.values() for match in pattern.findall(text))
 
     return min(1.0, nav_chars / total_chars)
 
 
 def _calculate_structure_bonus(text: str) -> float:
-    """Calculate bonus for proper text structure."""
     if not text:
         return 0.0
 
-    # Count sentences (rough heuristic)
     sentence_count = len(_SENTENCE_DETECT.findall(text))
 
-    # Count paragraphs
     paragraph_count = len(text.split("\n\n"))
 
-    # Calculate structure score
     words = len(text.split())
     if words == 0:
         return 0.0
 
-    # Good structure: reasonable sentence and paragraph distribution
     avg_words_per_sentence = words / max(1, sentence_count)
     avg_words_per_paragraph = words / max(1, paragraph_count)
 
     structure_score = 0.0
 
-    # Bonus for reasonable sentence length (10-30 words)
     if 10 <= avg_words_per_sentence <= 30:
         structure_score += 0.3
 
-    # Bonus for reasonable paragraph length (50-300 words)
     if 50 <= avg_words_per_paragraph <= 300:
         structure_score += 0.3
 
-    # Bonus for having multiple paragraphs
     if paragraph_count > 1:
         structure_score += 0.2
 
-    # Bonus for having punctuation
     if _PUNCTUATION_DETECT.search(text):
         structure_score += 0.2
 
@@ -193,7 +161,6 @@ def _calculate_structure_bonus(text: str) -> float:
 
 
 def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
-    """Calculate bonus for rich metadata."""
     if not metadata:
         return 0.0
 
@@ -204,30 +171,20 @@ def _calculate_metadata_bonus(metadata: dict[str, Any]) -> float:
 
 
 def _clean_ocr_artifacts(text: str) -> str:
-    """Remove common OCR artifacts from text."""
-    # Fix scattered characters (likely OCR errors)
     text = _OCR_ARTIFACTS["scattered_chars"].sub(lambda m: m.group().replace(" ", ""), text)
 
-    # Clean repeated punctuation
     text = _OCR_ARTIFACTS["repeated_punctuation"].sub("...", text)
 
-    # Fix isolated punctuation
     text = _OCR_ARTIFACTS["isolated_punctuation"].sub(" ", text)
 
-    # Remove malformed words with numbers mixed in
     text = _OCR_ARTIFACTS["malformed_words"].sub(" ", text)
 
-    # Normalize excessive whitespace
     return _OCR_ARTIFACTS["excessive_whitespace"].sub(" ", text)
 
 
 def _clean_navigation_elements(text: str) -> str:
-    """Remove navigation elements from text."""
-    # Remove navigation words
     text = _NAVIGATION_PATTERNS["nav_words"].sub(" ", text)
 
-    # Remove breadcrumbs
     text = _NAVIGATION_PATTERNS["breadcrumbs"].sub(" ", text)
 
-    # Remove pagination
     return _NAVIGATION_PATTERNS["pagination"].sub(" ", text)
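
Only comments and docstrings are removed in this module; the scoring weights are unchanged. Pulled together from the hunks above: the score starts at 1.0, subtracts OCR (x0.3), script (x0.2), and navigation (x0.1) penalties, each clamped to [0, 1] via min(), and adds structure (x0.2) and metadata (x0.1) bonuses. A minimal usage sketch of the two public-facing helpers shown above:

from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

raw = "Home > Products > Docs ..... T  h  e quick brown fox jumped."
cleaned = clean_extracted_text(raw)       # strips script/nav content, OCR artifacts, extra whitespace
score = calculate_quality_score(cleaned)  # float: 1.0 minus penalties, plus structure/metadata bonuses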