kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_image_preprocessing.py
CHANGED
@@ -4,6 +4,7 @@ from typing import TYPE_CHECKING, Any
 
 from PIL import Image
 
+from kreuzberg._constants import PDF_POINTS_PER_INCH
 from kreuzberg._types import ExtractionConfig, ImagePreprocessingMetadata
 
 if TYPE_CHECKING:
@@ -31,36 +32,30 @@ def calculate_optimal_dpi(
     Returns:
         Optimal DPI value that keeps image within max_dimension
     """
-    width_inches = page_width / 72.0
-    height_inches = page_height / 72.0
+    width_inches = page_width / PDF_POINTS_PER_INCH
+    height_inches = page_height / PDF_POINTS_PER_INCH
 
-    # Calculate pixel dimensions at target DPI
     target_width_pixels = int(width_inches * target_dpi)
     target_height_pixels = int(height_inches * target_dpi)
 
-    # Check if target DPI results in oversized image
     max_pixel_dimension = max(target_width_pixels, target_height_pixels)
 
     if max_pixel_dimension <= max_dimension:
-        # Target DPI is fine, clamp to min/max bounds
         return max(min_dpi, min(target_dpi, max_dpi))
 
-    # Calculate maximum DPI that keeps within dimension constraints
     max_dpi_for_width = max_dimension / width_inches if width_inches > 0 else max_dpi
     max_dpi_for_height = max_dimension / height_inches if height_inches > 0 else max_dpi
     constrained_dpi = int(min(max_dpi_for_width, max_dpi_for_height))
 
-    # Clamp to min/max bounds
     return max(min_dpi, min(constrained_dpi, max_dpi))
 
 
 def _extract_image_dpi(image: PILImage) -> tuple[tuple[float, float], float]:
     """Extract DPI information from image."""
-    current_dpi_info = image.info.get("dpi", (72.0, 72.0))
+    current_dpi_info = image.info.get("dpi", (PDF_POINTS_PER_INCH, PDF_POINTS_PER_INCH))
     if isinstance(current_dpi_info, (list, tuple)):
         original_dpi = (float(current_dpi_info[0]), float(current_dpi_info[1]))
-        current_dpi = float(current_dpi_info[0])
+        current_dpi = float(current_dpi_info[0])
     else:
         current_dpi = float(current_dpi_info)
         original_dpi = (current_dpi, current_dpi)
@@ -88,10 +83,8 @@ def _calculate_target_dpi(
     """Calculate target DPI and whether it was auto-adjusted."""
     calculated_dpi = None
     if config.auto_adjust_dpi:
-
-
-        approx_width_points = original_width * 72.0 / current_dpi
-        approx_height_points = original_height * 72.0 / current_dpi
+        approx_width_points = original_width * PDF_POINTS_PER_INCH / current_dpi
+        approx_height_points = original_height * PDF_POINTS_PER_INCH / current_dpi
 
         optimal_dpi = calculate_optimal_dpi(
             approx_width_points,
@@ -131,7 +124,6 @@ def normalize_image_dpi(
     original_width, original_height = image.size
     original_dpi, current_dpi = _extract_image_dpi(image)
 
-    # If no auto-adjustment and current DPI matches target and within limits, skip processing
     if _should_skip_processing(original_width, original_height, current_dpi, config):
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
@@ -143,15 +135,12 @@ def normalize_image_dpi(
             skipped_resize=True,
         )
 
-    # Calculate target DPI
     target_dpi, auto_adjusted, calculated_dpi = _calculate_target_dpi(
         original_width, original_height, current_dpi, config
     )
 
-    # Calculate scale factor based on DPI ratio
     scale_factor = target_dpi / current_dpi
 
-    # If scale factor is very close to 1.0, skip resizing
     if abs(scale_factor - 1.0) < 0.05:
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
@@ -164,11 +153,9 @@ def normalize_image_dpi(
             skipped_resize=True,
         )
 
-    # Calculate new dimensions
     new_width = int(original_width * scale_factor)
     new_height = int(original_height * scale_factor)
 
-    # Ensure we don't exceed max_dimension (safety check)
     dimension_clamped = False
     max_new_dimension = max(new_width, new_height)
     if max_new_dimension > config.max_image_dimension:
@@ -178,12 +165,8 @@ def normalize_image_dpi(
         scale_factor *= dimension_scale
         dimension_clamped = True
 
-    # Resize image
     try:
-        # Use LANCZOS for high-quality downscaling, BICUBIC for upscaling
-        # Handle different PIL versions
         try:
-            # Modern PIL version
            if scale_factor < 1.0:
                 resample_method = Image.Resampling.LANCZOS
                 resample_name = "LANCZOS"
@@ -191,7 +174,6 @@ def normalize_image_dpi(
                 resample_method = Image.Resampling.BICUBIC
                 resample_name = "BICUBIC"
         except AttributeError:
-            # Older PIL version
             if scale_factor < 1.0:
                 resample_method = getattr(Image, "LANCZOS", 1)  # type: ignore[arg-type]
                 resample_name = "LANCZOS"
@@ -201,7 +183,6 @@ def normalize_image_dpi(
 
         normalized_image = image.resize((new_width, new_height), resample_method)
 
-        # Update DPI info in the new image
         normalized_image.info["dpi"] = (target_dpi, target_dpi)
 
         return normalized_image, ImagePreprocessingMetadata(
@@ -218,7 +199,6 @@ def normalize_image_dpi(
         )
 
     except OSError as e:
-        # If resizing fails, return original image with error info
         return image, ImagePreprocessingMetadata(
             original_dimensions=(original_width, original_height),
             original_dpi=original_dpi,
@@ -261,7 +241,6 @@ def get_dpi_adjustment_heuristics(
         "recommendations": recommendations,
     }
 
-    # Calculate aspect ratio and size analysis
     aspect_ratio = width / height if height > 0 else 1.0
     total_pixels = width * height
    megapixels = total_pixels / 1_000_000
@@ -274,27 +253,23 @@ def get_dpi_adjustment_heuristics(
         "is_large": max(width, height) > max_dimension * 0.8,
     }
 
-    # Document-specific heuristics
     if content_type == "document":
         if aspect_ratio > 2.0 or aspect_ratio < 0.5:
-            # Very wide or very tall documents (like forms, receipts)
             recommendations.append("Consider higher DPI for narrow documents")
             if target_dpi < 200:
                 heuristics["recommended_dpi"] = min(200, target_dpi * 1.3)
 
-        if megapixels > 50:
+        if megapixels > 50:
             recommendations.append("Large document detected - consider DPI reduction")
             heuristics["performance_impact"] = "high"
             if target_dpi > 150:
                 heuristics["recommended_dpi"] = max(120, target_dpi * 0.8)
 
-
-    estimated_memory_mb = (width * height * 3) / (1024 * 1024)  # RGB bytes
+    estimated_memory_mb = (width * height * 3) / (1024 * 1024)
     if estimated_memory_mb > 200:
         heuristics["performance_impact"] = "high"
         recommendations.append(f"High memory usage expected (~{estimated_memory_mb:.0f}MB)")
 
-    # Quality vs performance tradeoffs
     scale_factor = target_dpi / current_dpi if current_dpi > 0 else 1.0
     if scale_factor < 0.7:
         heuristics["quality_impact"] = "high"
@@ -324,16 +299,14 @@ def estimate_processing_time(
     total_pixels = width * height
     megapixels = total_pixels / 1_000_000
 
-    # Base processing times per megapixel (rough estimates)
     base_times = {
-        "tesseract": 2.5,
-        "easyocr": 4.0,
-        "paddleocr": 3.5,
+        "tesseract": 2.5,
+        "easyocr": 4.0,
+        "paddleocr": 3.5,
     }
 
     base_time = base_times.get(ocr_backend, 3.0)
 
-    # Non-linear scaling for very large images
     scaling_factor = 1.0 + (megapixels - 10) * 0.1 if megapixels > 10 else 1.0
 
     estimated_time = base_time * megapixels * scaling_factor
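The clamping logic above is easy to sanity-check by hand. Below is a minimal standalone sketch of the same arithmetic (it mirrors calculate_optimal_dpi rather than importing it; 72 points per inch is the standard PDF unit):

# Standalone sketch of the DPI clamping in calculate_optimal_dpi.
PDF_POINTS_PER_INCH = 72.0

def clamp_dpi(page_width: float, page_height: float, target_dpi: int,
              max_dimension: int, min_dpi: int, max_dpi: int) -> int:
    width_in = page_width / PDF_POINTS_PER_INCH    # e.g. 612 pt -> 8.5 in
    height_in = page_height / PDF_POINTS_PER_INCH  # e.g. 792 pt -> 11 in
    # If the target DPI already fits, just clamp it to the allowed range.
    if max(int(width_in * target_dpi), int(height_in * target_dpi)) <= max_dimension:
        return max(min_dpi, min(target_dpi, max_dpi))
    # Otherwise find the largest DPI whose rendered size stays within bounds.
    constrained = int(min(max_dimension / width_in, max_dimension / height_in))
    return max(min_dpi, min(constrained, max_dpi))

# A US Letter page at 300 DPI would be 3300 px tall; with max_dimension=2048
# the DPI is constrained to int(2048 / 11) = 186.
print(clamp_dpi(612, 792, 300, 2048, 72, 600))  # -> 186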
kreuzberg/_utils/_process_pool.py
CHANGED
@@ -19,15 +19,9 @@ if TYPE_CHECKING:
 
 T = TypeVar("T")
 
+_POOL_SIZE = mp.cpu_count()
 
-
-
-
-def _create_process_pool() -> ProcessPoolExecutor:
-    return ProcessPoolExecutor(max_workers=_POOL_SIZE)
-
-
-_process_pool_ref = Ref("process_pool", _create_process_pool)
+_process_pool_ref = Ref("process_pool", lambda: ProcessPoolExecutor(max_workers=_POOL_SIZE))
 
 
 def _get_process_pool() -> ProcessPoolExecutor:
@@ -51,6 +45,33 @@ def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
     return future.result()
 
 
+def get_optimal_worker_count(num_tasks: int, cpu_intensive: bool = True) -> int:
+    """Calculate optimal worker count based on workload.
+
+    Optimized based on benchmarking results:
+    - For 1 task: Use 1 worker (avoid overhead)
+    - For 2-3 tasks: Use num_tasks workers
+    - For 4+ tasks: Use all CPU cores for CPU-intensive work
+    """
+    cpu_count = mp.cpu_count()
+
+    if num_tasks == 1:
+        return 1
+    if num_tasks <= 3:
+        return min(num_tasks, cpu_count)
+    if cpu_intensive:
+        return cpu_count
+    return min(cpu_count * 2, max(cpu_count, num_tasks))
+
+
+def warmup_process_pool() -> None:
+    """Warm up the process pool to reduce initialization overhead."""
+    with process_pool() as pool:
+        futures = [pool.submit(lambda: None) for _ in range(_POOL_SIZE)]
+        for future in futures:
+            future.result()
+
+
 def shutdown_process_pool() -> None:
     if _process_pool_ref.is_initialized():
         pool = _process_pool_ref.get()
kreuzberg/_utils/_quality.py
CHANGED
@@ -2,6 +2,7 @@ from __future__ import annotations
 
 import re
 from functools import reduce
+from itertools import chain
 from typing import Any
 
 _OCR_ARTIFACTS = {
@@ -97,7 +98,9 @@ def _calculate_script_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-    script_chars = sum(
+    script_chars = sum(
+        len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _SCRIPT_PATTERNS.values())
+    )
 
     return min(1.0, script_chars / total_chars)
 
@@ -106,7 +109,9 @@ def _calculate_navigation_penalty(text: str, total_chars: int) -> float:
     if total_chars == 0:
         return 0.0
 
-    nav_chars = sum(
+    nav_chars = sum(
+        len(match) for match in chain.from_iterable(pattern.findall(text) for pattern in _NAVIGATION_PATTERNS.values())
+    )
 
     return min(1.0, nav_chars / total_chars)
 
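The counting pattern behind both penalties is easy to demonstrate in isolation. A minimal sketch (the two patterns below are illustrative stand-ins for the real _SCRIPT_PATTERNS table):

import re
from itertools import chain

# Sum the lengths of all matches across several compiled patterns.
patterns = {
    "cyrillic": re.compile(r"[\u0400-\u04FF]+"),
    "greek": re.compile(r"[\u0370-\u03FF]+"),
}
text = "latin слово λόγος"
matched = sum(len(m) for m in chain.from_iterable(p.findall(text) for p in patterns.values()))
penalty = min(1.0, matched / len(text))
print(matched, round(penalty, 2))  # 10 matched chars out of 17 -> 0.59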
kreuzberg/_utils/_resource_managers.py
ADDED
@@ -0,0 +1,65 @@
+from __future__ import annotations
+
+import contextlib
+from typing import TYPE_CHECKING
+
+import pypdfium2
+
+from kreuzberg._utils._pdf_lock import pypdfium_file_lock
+from kreuzberg._utils._sync import run_sync
+
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import AsyncGenerator, Generator
+    from pathlib import Path
+
+
+@contextlib.asynccontextmanager
+async def pdf_document(file_path: Path) -> AsyncGenerator[pypdfium2.PdfDocument, None]:
+    """Async context manager for PyPDFium document resources."""
+    document = None
+    try:
+        with pypdfium_file_lock(file_path):
+            document = await run_sync(pypdfium2.PdfDocument, str(file_path))
+        yield document
+    finally:
+        if document:
+            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
+                await run_sync(document.close)
+
+
+@contextlib.contextmanager
+def pdf_document_sync(file_path: Path) -> Generator[pypdfium2.PdfDocument, None, None]:
+    """Sync context manager for PyPDFium document resources."""
+    document = None
+    try:
+        with pypdfium_file_lock(file_path):
+            document = pypdfium2.PdfDocument(str(file_path))
+        yield document
+    finally:
+        if document:
+            with pypdfium_file_lock(file_path), contextlib.suppress(Exception):
+                document.close()
+
+
+@contextlib.contextmanager
+def pdf_resources_sync(*resources: object) -> Generator[None, None, None]:
+    """Context manager for multiple PDF resources (pages, textpages, bitmaps)."""
+    try:
+        yield
+    finally:
+        for resource in resources:
+            with contextlib.suppress(Exception):
+                if hasattr(resource, "close"):
+                    resource.close()
+
+
+@contextlib.contextmanager
+def image_resources(*images: object) -> Generator[None, None, None]:
+    """Context manager for PIL Image resources."""
+    try:
+        yield
+    finally:
+        for image in images:
+            with contextlib.suppress(Exception):
+                if hasattr(image, "close"):
+                    image.close()
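These helpers centralize pypdfium2 handle cleanup behind context managers. A usage sketch of the sync variants (the filename and page operations are illustrative; closing is best-effort via contextlib.suppress, so cleanup runs even if the body raises):

from pathlib import Path

from kreuzberg._utils._resource_managers import pdf_document_sync, pdf_resources_sync

with pdf_document_sync(Path("sample.pdf")) as document:  # "sample.pdf" is a placeholder
    page = document[0]
    textpage = page.get_textpage()
    with pdf_resources_sync(textpage, page):  # both handles closed after the block
        print(textpage.get_text_bounded())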
kreuzberg/_utils/_serialization.py
CHANGED
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
 from dataclasses import is_dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar
 
 import msgspec
 from msgspec import MsgspecError
-from msgspec.msgpack import decode, encode
 
 T = TypeVar("T")
 
@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
     raise TypeError(f"Unsupported type: {type(obj)!r}")
 
 
-def deserialize(value: str | bytes, target_type: type[T]) -> T:
+def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
+    decoder = msgspec.json.decode if json else msgspec.msgpack.decode
+
+    if json:
+        data = value.encode() if isinstance(value, str) else value
+    else:
+        data = value.encode() if isinstance(value, str) else value
+
     try:
-        return decode(value, type=target_type, strict=False)
+        return decoder(data, type=target_type, strict=False)
     except MsgspecError as e:
         raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
 
 
-def serialize(value: Any, **kwargs: Any) -> bytes:
+def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
 
+    encoder = msgspec.json.encode if json else msgspec.msgpack.encode
     try:
-        return encode(value, enc_hook=encode_hook)
+        return encoder(value, enc_hook=encode_hook)
     except (MsgspecError, TypeError) as e:
         raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e
kreuzberg/_utils/_sync.py
CHANGED
@@ -2,17 +2,15 @@ from __future__ import annotations
 
 from functools import partial
 from inspect import isawaitable, iscoroutinefunction
-from typing import TYPE_CHECKING, Any, TypeVar, cast
+from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar, cast
 
 import anyio
-from anyio import create_task_group
+from anyio import CapacityLimiter, create_task_group
 from anyio.to_thread import run_sync as any_io_run_sync
 
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Awaitable, Callable
 
-    from typing import ParamSpec
-
 T = TypeVar("T")
 P = ParamSpec("P")
 
@@ -37,14 +35,45 @@ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
     return results
 
 
-async def run_taskgroup_batched(
-    [remainder of the previous implementation (5 lines) not rendered in this diff view]
+async def run_taskgroup_batched(
+    *async_tasks: Awaitable[Any],
+    batch_size: int,
+    use_semaphore: bool = True,
+) -> list[Any]:
+    """Run async tasks with controlled concurrency.
+
+    Args:
+        async_tasks: Tasks to execute
+        batch_size: Maximum concurrent tasks
+        use_semaphore: Use semaphore for concurrency control instead of sequential batches
+
+    Returns:
+        List of results in the same order as input tasks
+    """
+    if not async_tasks:
+        return []
+
+    if len(async_tasks) <= batch_size or not use_semaphore:
+        batch_results: list[Any] = []
+        for i in range(0, len(async_tasks), batch_size):
+            batch = async_tasks[i : i + batch_size]
+            batch_results.extend(await run_taskgroup(*batch))
+        return batch_results
+
+    limiter = CapacityLimiter(batch_size)
+    results: list[tuple[int, Any]] = []
+
+    async def run_with_semaphore(task: Awaitable[Any], index: int) -> None:
+        async with limiter:
+            result = await task
+            results.append((index, result))
+
+    async with create_task_group() as tg:
+        for i, task in enumerate(async_tasks):
+            tg.start_soon(run_with_semaphore, task, i)
+
+    results.sort(key=lambda x: x[0])
+    return [result for _, result in results]
 
 
 async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
kreuzberg/_utils/_tmp.py
CHANGED
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import contextlib
+import os
+import tempfile
 from contextlib import suppress
 from pathlib import Path
 from tempfile import NamedTemporaryFile
@@ -10,7 +13,7 @@ from anyio import Path as AsyncPath
 from kreuzberg._utils._sync import run_sync
 
 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Callable, Coroutine
+    from collections.abc import AsyncGenerator, Callable, Coroutine, Generator
 
 
 async def create_temp_file(
@@ -26,3 +29,36 @@ async def create_temp_file(
     await AsyncPath(file.name).unlink(missing_ok=True)
 
     return Path(file.name), unlink
+
+
+@contextlib.asynccontextmanager
+async def temporary_file(extension: str, content: bytes | None = None) -> AsyncGenerator[Path, None]:
+    """Async context manager for temporary files with automatic cleanup."""
+    file_path, unlink = await create_temp_file(extension, content)
+    try:
+        yield file_path
+    finally:
+        await unlink()
+
+
+@contextlib.contextmanager
+def temporary_file_sync(extension: str, content: bytes | None = None) -> Generator[Path, None, None]:
+    """Sync context manager for temporary files with automatic cleanup."""
+    fd, temp_path = tempfile.mkstemp(suffix=extension)
+    try:
+        if content:
+            with os.fdopen(fd, "wb") as f:
+                f.write(content)
+        else:
+            os.close(fd)
+        yield Path(temp_path)
+    finally:
+        with suppress(OSError, PermissionError):
+            Path(temp_path).unlink()
+
+
+@contextlib.contextmanager
+def temporary_directory() -> Generator[Path, None, None]:
+    """Context manager for temporary directories with automatic cleanup."""
+    with tempfile.TemporaryDirectory() as temp_dir:
+        yield Path(temp_dir)
kreuzberg/cli.py
CHANGED
@@ -122,32 +122,37 @@ def _build_cli_args(params: dict[str, Any]) -> dict[str, Any]:
         "force_ocr": params["force_ocr"] if params["force_ocr"] else None,
         "chunk_content": params["chunk_content"] if params["chunk_content"] else None,
         "extract_tables": params["extract_tables"] if params["extract_tables"] else None,
+        "extract_entities": params["extract_entities"] if params["extract_entities"] else None,
+        "extract_keywords": params["extract_keywords"] if params["extract_keywords"] else None,
+        "auto_detect_language": params["auto_detect_language"] if params["auto_detect_language"] else None,
+        "keyword_count": params["keyword_count"] if params["keyword_count"] != 10 else None,
         "max_chars": params["max_chars"] if params["max_chars"] != DEFAULT_MAX_CHARACTERS else None,
         "max_overlap": params["max_overlap"] if params["max_overlap"] != DEFAULT_MAX_OVERLAP else None,
         "ocr_backend": params["ocr_backend"],
     }
 
     ocr_backend = params["ocr_backend"]
-    [previous per-backend configuration block (20 lines) not rendered in this diff view]
+    match ocr_backend:
+        case "tesseract" if (
+            params["tesseract_lang"]
+            or params["tesseract_psm"] is not None
+            or params["tesseract_output_format"]
+            or params["enable_table_detection"]
+        ):
+            tesseract_config = {}
+            if params["tesseract_lang"]:
+                tesseract_config["language"] = params["tesseract_lang"]
+            if params["tesseract_psm"] is not None:
+                tesseract_config["psm"] = params["tesseract_psm"]
+            if params["tesseract_output_format"]:
+                tesseract_config["output_format"] = params["tesseract_output_format"]
+            if params["enable_table_detection"]:
+                tesseract_config["enable_table_detection"] = True
+            cli_args["tesseract_config"] = tesseract_config
+        case "easyocr" if params["easyocr_languages"]:
+            cli_args["easyocr_config"] = {"languages": params["easyocr_languages"].split(",")}
+        case "paddleocr" if params["paddleocr_languages"]:
+            cli_args["paddleocr_config"] = {"languages": params["paddleocr_languages"].split(",")}
 
     return cli_args
 
@@ -250,6 +255,9 @@ def cli(ctx: click.Context) -> None:
 @click.option("--force-ocr", is_flag=True, help="Force OCR processing")
 @click.option("--chunk-content", is_flag=True, help="Enable content chunking")
 @click.option("--extract-tables", is_flag=True, help="Enable table extraction")
+@click.option("--extract-entities", is_flag=True, help="Enable entity extraction")
+@click.option("--extract-keywords", is_flag=True, help="Enable keyword extraction")
+@click.option("--auto-detect-language", is_flag=True, help="Enable automatic language detection")
 @click.option(
     "--max-chars",
     type=int,
@@ -262,6 +270,12 @@ def cli(ctx: click.Context) -> None:
     default=DEFAULT_MAX_OVERLAP,
     help=f"Maximum overlap between chunks (default: {DEFAULT_MAX_OVERLAP})",
 )
+@click.option(
+    "--keyword-count",
+    type=int,
+    default=10,
+    help="Number of keywords to extract (default: 10)",
+)
 @click.option(
     "--ocr-backend", type=OcrBackendParamType(), help="OCR backend to use (tesseract, easyocr, paddleocr, none)"
 )