kreuzberg-3.13.0-py3-none-any.whl → kreuzberg-3.13.2-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +0 -124
- kreuzberg/_document_classification.py +20 -39
- kreuzberg/_entity_extraction.py +0 -29
- kreuzberg/_extractors/_base.py +4 -66
- kreuzberg/_extractors/_email.py +0 -4
- kreuzberg/_extractors/_image.py +0 -2
- kreuzberg/_extractors/_pandoc.py +0 -58
- kreuzberg/_extractors/_pdf.py +0 -3
- kreuzberg/_extractors/_presentation.py +0 -82
- kreuzberg/_extractors/_spread_sheet.py +0 -2
- kreuzberg/_gmft.py +0 -61
- kreuzberg/_language_detection.py +0 -14
- kreuzberg/_mime_types.py +0 -17
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +110 -85
- kreuzberg/_ocr/_paddleocr.py +146 -138
- kreuzberg/_ocr/_table_extractor.py +0 -76
- kreuzberg/_ocr/_tesseract.py +0 -206
- kreuzberg/_playa.py +0 -27
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +16 -119
- kreuzberg/_utils/_cache.py +0 -52
- kreuzberg/_utils/_device.py +0 -56
- kreuzberg/_utils/_document_cache.py +0 -73
- kreuzberg/_utils/_errors.py +0 -47
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -14
- kreuzberg/_utils/_process_pool.py +0 -47
- kreuzberg/_utils/_quality.py +0 -17
- kreuzberg/_utils/_ref.py +0 -16
- kreuzberg/_utils/_serialization.py +0 -25
- kreuzberg/_utils/_string.py +0 -20
- kreuzberg/_utils/_sync.py +0 -76
- kreuzberg/_utils/_table.py +0 -45
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +2 -2
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
- kreuzberg-3.13.2.dist-info/RECORD +57 -0
- kreuzberg-3.13.0.dist-info/RECORD +0 -56
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py
CHANGED
@@ -11,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -26,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -65,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -89,15 +64,6 @@ class DocumentCache:
             return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -111,13 +77,6 @@ class DocumentCache:
             return None
 
     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
         cache_key = self._get_cache_key(file_path, config)
         path = Path(file_path)
 
@@ -140,29 +99,11 @@ class DocumentCache:
         self._file_metadata[cache_key] = file_metadata
 
     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
         cache_key = self._get_cache_key(file_path, config)
         with self._lock:
             return cache_key in self._processing
 
     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -171,12 +112,6 @@ class DocumentCache:
         return self._processing[cache_key]
 
     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -185,17 +120,11 @@ class DocumentCache:
             event.set()
 
     def clear(self) -> None:
-        """Clear all cached results."""
         with self._lock:
             self._cache.clear()
             self._file_metadata.clear()
 
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
         with self._lock:
             return {
                 "cached_documents": len(self._cache),
@@ -210,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
     return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
    _document_cache.clear()
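The hunks above only remove docstrings; the DocumentCache API is unchanged. For orientation, here is a minimal sketch of the check/wait/compute pattern the class supports, assuming a hypothetical `run_extraction` callable in place of the real extractor:

```python
from kreuzberg._utils._document_cache import get_document_cache

def extract_once(file_path, config, run_extraction):
    cache = get_document_cache()

    cached = cache.get(file_path, config)
    if cached is not None:
        return cached

    if cache.is_processing(file_path, config):
        # Another thread is already extracting this document: wait on its
        # event, then re-check the cache.
        cache.mark_processing(file_path, config).wait()
        cached = cache.get(file_path, config)
        if cached is not None:
            return cached

    cache.mark_processing(file_path, config)
    try:
        result = run_extraction(file_path, config)  # hypothetical extractor
        cache.set(file_path, config, result)
        return result
    finally:
        cache.mark_complete(file_path, config)  # releases any waiters
```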
kreuzberg/_utils/_errors.py
CHANGED
@@ -53,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
     context: dict[str, Any] = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "operation": operation,
@@ -104,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
     transient_types = (
         OSError,
         PermissionError,
@@ -128,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
     error_str = str(error).lower()
     return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
     if attempt >= max_attempts:
         return False
 
@@ -161,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
     __slots__ = ("failed", "successful", "total_count")
 
     def __init__(self) -> None:
-        """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
         self.failed: list[tuple[int, dict[str, Any]]] = []
         self.total_count: int = 0
 
     def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
         self.successful.append((index, result))
 
     def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
         error_info = {
             "error": {
                 "type": type(error).__name__,
@@ -188,30 +146,25 @@ class BatchExtractionResult:
 
     @property
     def success_count(self) -> int:
-        """Number of successful operations."""
         return len(self.successful)
 
     @property
     def failure_count(self) -> int:
-        """Number of failed operations."""
         return len(self.failed)
 
     @property
     def success_rate(self) -> float:
-        """Success rate as a percentage."""
         if self.total_count == 0:
             return 0.0
         return (self.success_count / self.total_count) * 100
 
     def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
         results = [None] * self.total_count
         for index, result in self.successful:
             results[index] = result
         return results
 
     def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
         return {
             "total": self.total_count,
             "successful": self.success_count,
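The retry helpers and BatchExtractionResult shown above compose naturally. A hedged sketch, assuming an `extract` callable and an illustrative backoff delay (neither is part of the library):

```python
import time

from kreuzberg._utils._errors import (
    BatchExtractionResult,
    create_error_context,
    should_retry,
)

def run_batch(paths, extract):
    batch = BatchExtractionResult()
    batch.total_count = len(paths)

    for index, path in enumerate(paths):
        attempt = 1
        while True:
            try:
                batch.add_success(index, extract(path))
                break
            except Exception as error:  # noqa: BLE001
                if should_retry(error, attempt):
                    attempt += 1
                    time.sleep(0.5 * attempt)  # illustrative backoff
                    continue
                context = create_error_context("extract_file", file_path=path, error=error)
                batch.add_failure(index, error, context)
                break

    return batch.get_summary(), batch.get_ordered_results()
```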
kreuzberg/_utils/_ocr_cache.py
ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
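The new module implies a fixed call sequence: build the cache key, try the lookup (which may block on another worker, and which marks this worker as processing on a miss), then store and release. A sync sketch, with `run_ocr` standing in for a backend's real processing call:

```python
from kreuzberg._utils._ocr_cache import (
    build_cache_kwargs,
    cache_and_complete_sync,
    generate_image_hash,
    handle_cache_lookup_sync,
    mark_processing_complete,
)

def ocr_image_cached(image, backend_name, config_dict, run_ocr, use_cache=True):
    cache_kwargs = build_cache_kwargs(
        backend_name,
        config_dict,
        image_hash=generate_image_hash(image),
    )

    cached = handle_cache_lookup_sync(cache_kwargs)
    if cached is not None:
        return cached  # cache hit, or another worker finished while we waited

    # A miss leaves this worker marked as processing, so waiters must always
    # be released, even when OCR fails.
    try:
        result = run_ocr(image)  # hypothetical backend call
    except Exception:
        mark_processing_complete(cache_kwargs)
        raise
    cache_and_complete_sync(result, cache_kwargs, use_cache)
    return result
```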
kreuzberg/_utils/_pdf_lock.py
CHANGED
@@ -19,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
     path_str = str(Path(file_path).resolve())
     return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
     file_key = _get_file_key(file_path)
 
     with _FILE_LOCKS_LOCK:
@@ -39,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
     with _PYPDFIUM_LOCK:
         yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
     lock = _get_file_lock(file_path)
     with lock:
         yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
     def wrapper(*args: Any, **kwargs: Any) -> Any:
         with pypdfium_lock():
             return func(*args, **kwargs)
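The removed docstrings explained the intent: a global lock because pypdfium2 is not safe under concurrent use, plus a per-file lock so different files can still proceed in parallel. An illustrative caller follows; the pypdfium2 usage here is an assumption about call sites, not part of this diff:

```python
import pypdfium2 as pdfium

from kreuzberg._utils._pdf_lock import pypdfium_file_lock

def page_count(pdf_path: str) -> int:
    # Per-file lock: other files can be processed concurrently.
    with pypdfium_file_lock(pdf_path):
        document = pdfium.PdfDocument(pdf_path)
        try:
            return len(document)  # number of pages
        finally:
            document.close()
```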
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -36,7 +36,6 @@ def _get_process_pool() -> ProcessPoolExecutor:
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the process pool."""
     pool = _get_process_pool()
     try:
         yield pool
@@ -47,14 +46,12 @@ def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
     with process_pool() as pool:
         future = pool.submit(func, *args, **kwargs)
         return future.result()
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the process pool."""
     if _process_pool_ref.is_initialized():
         pool = _process_pool_ref.get()
         pool.shutdown(wait=True)
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
         self.max_processes = max_processes or mp.cpu_count()
 
         if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
         self._active_tasks = 0
 
     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
         task_memory_bytes = task_memory_mb * 1024**2
         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
         return min(self.max_processes, memory_based_limit)
 
     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
             if self._executor is not None:
                 self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
         *args: Any,
         task_memory_mb: float = 100,
     ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
         workers = self.get_optimal_workers(task_memory_mb)
         self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
         task_memory_mb: float = 100,
         max_concurrent: int | None = None,
     ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
         if not arg_batches:
             return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
         return results
 
     def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
         memory = psutil.virtual_memory()
         cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
         }
 
     def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
         if self._executor is not None:
             self._executor.shutdown(wait=wait)
             self._executor = None
 
     def __enter__(self) -> Self:
-        """Context manager entry."""
         return self
 
     def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Context manager exit."""
         self.shutdown()
 
     async def __aenter__(self) -> Self:
-        """Async context manager entry."""
         return self
 
     async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Async context manager exit."""
     self.shutdown()
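Only signatures survive the docstring removal, but they are enough for a small sketch of ProcessPoolManager that uses just the surface visible in this diff:

```python
from kreuzberg._utils._process_pool import ProcessPoolManager

with ProcessPoolManager(max_processes=4, memory_limit_gb=2.0) as manager:
    # Worker count is capped by both the CPU budget and estimated per-task
    # memory: 2 GB / 200 MB would allow 10 workers, so the CPU cap of 4 wins.
    workers = manager.get_optimal_workers(task_memory_mb=200)
    print(workers, manager.get_system_info())
```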
kreuzberg/_utils/_quality.py
CHANGED
@@ -44,15 +44,6 @@ _NAVIGATION_PATTERNS = {
 
 
 def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
-    """Calculate overall quality score for extracted text.
-
-    Args:
-        text: The extracted text content
-        metadata: Optional metadata for additional scoring
-
-    Returns:
-        Quality score between 0.0 and 1.0
-    """
     if not text or not text.strip():
         return 0.0
 
@@ -79,14 +70,6 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
 
 
 def clean_extracted_text(text: str) -> str:
-    """Clean extracted text by removing artifacts and improving quality.
-
-    Args:
-        text: The raw extracted text
-
-    Returns:
-        Cleaned text with artifacts removed
-    """
     if not text:
         return text
 
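Both helpers keep their signatures, so a short hedged example is possible; the 0.5 threshold below is an assumption, not a library default:

```python
from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text

raw = "Page 1 of 10\nIntroduction\nThis report covers the quarterly results..."
cleaned = clean_extracted_text(raw)
score = calculate_quality_score(cleaned, metadata={"source": "pdf"})
if score < 0.5:  # illustrative threshold
    print(f"low-quality extraction ({score:.2f}); consider an OCR fallback")
```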
kreuzberg/_utils/_ref.py
CHANGED
@@ -9,40 +9,24 @@ T = TypeVar("T")
 
 
 class Ref(Generic[T]):
-    """A reference container that manages singleton instances without global variables.
-
-    This provides a clean alternative to global variables by using a registry pattern
-    with type safety.
-    """
-
     _instances: ClassVar[dict[str, Any]] = {}
 
     def __init__(self, name: str, factory: Callable[[], T]) -> None:
-        """Initialize a reference container.
-
-        Args:
-            name: Unique name for this reference
-            factory: Factory function to create the instance when needed
-        """
         self.name = name
         self.factory = factory
 
     def get(self) -> T:
-        """Get the singleton instance, creating it if it doesn't exist."""
         if self.name not in self._instances:
             self._instances[self.name] = self.factory()
         return cast("T", self._instances[self.name])
 
     def clear(self) -> None:
-        """Clear the singleton instance."""
         if self.name in self._instances:
             del self._instances[self.name]
 
     def is_initialized(self) -> bool:
-        """Check if the singleton instance exists."""
         return self.name in self._instances
 
     @classmethod
     def clear_all(cls) -> None:
-        """Clear all singleton instances."""
         cls._instances.clear()
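The Ref container's behavior is fully visible above: lazy creation through a factory, one instance per name. A minimal sketch with a hypothetical `Settings` class:

```python
from kreuzberg._utils._ref import Ref

class Settings:
    def __init__(self) -> None:
        self.debug = False

_settings_ref = Ref("settings", Settings)  # factory runs lazily, at most once

settings = _settings_ref.get()          # created on first access
assert _settings_ref.get() is settings  # same instance afterwards
assert _settings_ref.is_initialized()
_settings_ref.clear()                   # next get() rebuilds via the factory
```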
kreuzberg/_utils/_serialization.py
CHANGED
@@ -22,7 +22,6 @@ _DICT_METHOD_NAMES = (
 
 
 def encode_hook(obj: Any) -> Any:
-    """Custom encoder for complex objects."""
     if callable(obj):
         return None
 
@@ -44,18 +43,6 @@ def encode_hook(obj: Any) -> Any:
 
 
 def deserialize(value: str | bytes, target_type: type[T]) -> T:
-    """Deserialize bytes/string to target type.
-
-    Args:
-        value: Serialized data
-        target_type: Type to deserialize to
-
-    Returns:
-        Deserialized object
-
-    Raises:
-        ValueError: If deserialization fails
-    """
     try:
         return decode(cast("bytes", value), type=target_type, strict=False)
     except MsgspecError as e:
@@ -63,18 +50,6 @@ def deserialize(value: str | bytes, target_type: type[T]) -> T:
 
 
 def serialize(value: Any, **kwargs: Any) -> bytes:
-    """Serialize value to bytes.
-
-    Args:
-        value: Object to serialize
-        **kwargs: Additional data to merge with value if it's a dict
-
-    Returns:
-        Serialized bytes
-
-    Raises:
-        ValueError: If serialization fails
-    """
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
 
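A round-trip sketch for the two helpers, assuming they wrap msgspec encode/decode as the visible `decode(...)` call suggests; the exact wire format is not shown in this diff:

```python
from kreuzberg._utils._serialization import deserialize, serialize

payload = serialize({"x": 1}, y=2)  # extra kwargs merge into a dict value
data = deserialize(payload, dict)   # decode back into the target type
assert data == {"x": 1, "y": 2}
```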