kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
CHANGED
@@ -1,5 +1,3 @@
-"""Document-level caching to prevent pypdfium2 issues with duplicate processing."""
-
 from __future__ import annotations
 
 import hashlib
@@ -13,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -28,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -67,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -91,15 +64,6 @@ class DocumentCache:
         return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -113,13 +77,6 @@ class DocumentCache:
         return None
 
     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
         cache_key = self._get_cache_key(file_path, config)
         path = Path(file_path)
 
@@ -142,29 +99,11 @@ class DocumentCache:
         self._file_metadata[cache_key] = file_metadata
 
     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
        cache_key = self._get_cache_key(file_path, config)
        with self._lock:
            return cache_key in self._processing
 
     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -173,12 +112,6 @@ class DocumentCache:
         return self._processing[cache_key]
 
     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -187,17 +120,11 @@ class DocumentCache:
         event.set()
 
     def clear(self) -> None:
-        """Clear all cached results."""
         with self._lock:
             self._cache.clear()
             self._file_metadata.clear()
 
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
         with self._lock:
             return {
                 "cached_documents": len(self._cache),
@@ -212,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
     return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
    _document_cache.clear()
kreuzberg/_utils/_errors.py
CHANGED
@@ -1,5 +1,3 @@
-"""Enhanced error handling utilities."""
-
 from __future__ import annotations
 
 import platform
@@ -12,7 +10,6 @@ import psutil
 
 from kreuzberg.exceptions import ValidationError
 
-# Define error keywords as frozensets for O(1) membership testing
 _SYSTEM_ERROR_KEYWORDS = frozenset({"memory", "resource", "process", "thread"})
 _TRANSIENT_ERROR_PATTERNS = frozenset(
     {
@@ -56,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
     context: dict[str, Any] = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "operation": operation,
@@ -107,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
     transient_types = (
         OSError,
         PermissionError,
@@ -131,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
     error_str = str(error).lower()
     return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
     if attempt >= max_attempts:
         return False
 
@@ -164,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
     __slots__ = ("failed", "successful", "total_count")
 
     def __init__(self) -> None:
-        """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
         self.failed: list[tuple[int, dict[str, Any]]] = []
         self.total_count: int = 0
 
     def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
         self.successful.append((index, result))
 
     def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
         error_info = {
             "error": {
                 "type": type(error).__name__,
@@ -191,30 +146,25 @@ class BatchExtractionResult:
 
     @property
     def success_count(self) -> int:
-        """Number of successful operations."""
         return len(self.successful)
 
     @property
     def failure_count(self) -> int:
-        """Number of failed operations."""
         return len(self.failed)
 
     @property
     def success_rate(self) -> float:
-        """Success rate as a percentage."""
         if self.total_count == 0:
             return 0.0
         return (self.success_count / self.total_count) * 100
 
     def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
         results = [None] * self.total_count
         for index, result in self.successful:
            results[index] = result
         return results
 
     def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
         return {
             "total": self.total_count,
             "successful": self.success_count,
kreuzberg/_utils/_ocr_cache.py
ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
kreuzberg/_utils/_pdf_lock.py
CHANGED
@@ -1,5 +1,3 @@
-"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
-
 from __future__ import annotations
 
 import hashlib
@@ -21,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
     path_str = str(Path(file_path).resolve())
     return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
     file_key = _get_file_key(file_path)
 
     with _FILE_LOCKS_LOCK:
@@ -41,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
     with _PYPDFIUM_LOCK:
         yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
     lock = _get_file_lock(file_path)
     with lock:
         yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
     def wrapper(*args: Any, **kwargs: Any) -> Any:
         with pypdfium_lock():
             return func(*args, **kwargs)
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -1,5 +1,3 @@
-"""Process pool utilities for CPU-intensive operations."""
-
 from __future__ import annotations
 
 import io
@@ -13,6 +11,8 @@ import psutil
 import pypdfium2
 from typing_extensions import Self
 
+from kreuzberg._utils._ref import Ref
+
 if TYPE_CHECKING:
     import types
     from collections.abc import Callable, Generator
@@ -20,47 +20,45 @@ if TYPE_CHECKING:
 T = TypeVar("T")
 
 
-_PROCESS_POOL: ProcessPoolExecutor | None = None
 _POOL_SIZE = max(1, mp.cpu_count() - 1)
 
 
-def _init_process_pool() -> ProcessPoolExecutor:
-    global _PROCESS_POOL
-    if _PROCESS_POOL is None:
-        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
-    return _PROCESS_POOL
-
+def _create_process_pool() -> ProcessPoolExecutor:
+    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
+
+
+_process_pool_ref = Ref("process_pool", _create_process_pool)
+
+
+def _get_process_pool() -> ProcessPoolExecutor:
+    return _process_pool_ref.get()
 
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-
-    pool = _init_process_pool()
+    pool = _get_process_pool()
     try:
         yield pool
     except Exception:  # noqa: BLE001
         shutdown_process_pool()
-        pool = _init_process_pool()
+        pool = _get_process_pool()
         yield pool
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
     with process_pool() as pool:
         future = pool.submit(func, *args, **kwargs)
         return future.result()
 
 
 def shutdown_process_pool() -> None:
-    global _PROCESS_POOL
-
-    if _PROCESS_POOL is not None:
-        _PROCESS_POOL.shutdown(wait=True)
-        _PROCESS_POOL = None
+    if _process_pool_ref.is_initialized():
+        pool = _process_pool_ref.get()
+        pool.shutdown(wait=True)
+        _process_pool_ref.clear()
 
 
 def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
-    """Worker function for extracting PDF text in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -80,7 +78,6 @@ def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
 
 
 def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
-    """Worker function for converting PDF to images in a separate process."""
     pdf = None
     try:
         pdf = pypdfium2.PdfDocument(pdf_path)
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
         self.max_processes = max_processes or mp.cpu_count()
 
         if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
         self._active_tasks = 0
 
     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
         task_memory_bytes = task_memory_mb * 1024**2
         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
         return min(self.max_processes, memory_based_limit)
 
     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
             if self._executor is not None:
                 self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
         *args: Any,
         task_memory_mb: float = 100,
     ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
         workers = self.get_optimal_workers(task_memory_mb)
         self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
         task_memory_mb: float = 100,
         max_concurrent: int | None = None,
     ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
         if not arg_batches:
             return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
         return results
 
     def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
         memory = psutil.virtual_memory()
         cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
         }
 
     def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
         if self._executor is not None:
             self._executor.shutdown(wait=wait)
             self._executor = None
 
     def __enter__(self) -> Self:
-        """Context manager entry."""
         return self
 
     def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Context manager exit."""
         self.shutdown()
 
     async def __aenter__(self) -> Self:
-        """Async context manager entry."""
         return self
 
     async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Async context manager exit."""
         self.shutdown()