kreuzberg 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors/_base.py +40 -0
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +17 -18
- kreuzberg/_extractors/_pdf.py +68 -14
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +179 -4
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_multiprocessing/__init__.py +2 -3
- kreuzberg/_ocr/__init__.py +30 -0
- kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
- kreuzberg/_ocr/_sync.py +566 -0
- kreuzberg/_ocr/_tesseract.py +6 -2
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +131 -0
- kreuzberg/_utils/_cache.py +17 -2
- kreuzberg/_utils/_process_pool.py +178 -1
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +5 -2
- kreuzberg/_utils/_table.py +261 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +66 -50
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/RECORD +29 -28
- kreuzberg/_multiprocessing/process_manager.py +0 -189
- kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
- kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,12 @@ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
|
|
4
4
|
kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
|
5
5
|
kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
|
6
6
|
kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
|
7
|
-
kreuzberg/_gmft.py,sha256=
|
7
|
+
kreuzberg/_gmft.py,sha256=ZIEUu4Uy5zYNFEeDRbz1cLJhnCAStVsSzm1PQ3vDeO8,14828
|
8
8
|
kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
|
9
|
-
kreuzberg/_mime_types.py,sha256=
|
9
|
+
kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
|
10
10
|
kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
|
11
|
-
kreuzberg/_registry.py,sha256=
|
12
|
-
kreuzberg/_types.py,sha256=
|
11
|
+
kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
|
12
|
+
kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
|
13
13
|
kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
|
14
14
|
kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
|
15
15
|
kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
|
@@ -17,40 +17,41 @@ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
|
17
17
|
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
18
|
kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
|
19
19
|
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
20
|
-
kreuzberg/_extractors/_base.py,sha256=
|
21
|
-
kreuzberg/_extractors/
|
22
|
-
kreuzberg/_extractors/
|
20
|
+
kreuzberg/_extractors/_base.py,sha256=ECEwBpxnIy_J9kGZGuqsaPCgLFfxRn7kn4hIf11gDJ8,4478
|
21
|
+
kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
|
22
|
+
kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
|
23
|
+
kreuzberg/_extractors/_image.py,sha256=0kzOQTTeJacaA8I9833fFvVQSz6FtUe9Nuw1oy0ToD0,4939
|
23
24
|
kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
|
24
|
-
kreuzberg/_extractors/_pdf.py,sha256=
|
25
|
-
kreuzberg/_extractors/_presentation.py,sha256=
|
26
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=
|
25
|
+
kreuzberg/_extractors/_pdf.py,sha256=giYG3aEdmsxT0tGWKBaMzHDPz74-jVmK4HZARDEBhsM,17108
|
26
|
+
kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
|
27
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
|
28
|
+
kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
|
27
29
|
kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
|
28
30
|
kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
|
29
|
-
kreuzberg/_multiprocessing/__init__.py,sha256=
|
31
|
+
kreuzberg/_multiprocessing/__init__.py,sha256=X2BtgKmWhF1rl0JYg2gvoSUaozKExfsWh-RRNvzNoOs,202
|
30
32
|
kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
|
31
|
-
kreuzberg/
|
32
|
-
kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
|
33
|
-
kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
|
34
|
-
kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
|
35
|
-
kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
|
36
|
-
kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
|
33
|
+
kreuzberg/_ocr/__init__.py,sha256=CC9Ob1t_ltTYUamK1ZtmkswfCYdn1B-Z0kPemsQU0xU,1439
|
37
34
|
kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
|
38
35
|
kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
|
39
36
|
kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
|
40
|
-
kreuzberg/_ocr/
|
37
|
+
kreuzberg/_ocr/_pool.py,sha256=Yb0l_GxnPsIWn3NA2FuBYEC8ipIqgwaYglUt0ltqSvk,10948
|
38
|
+
kreuzberg/_ocr/_sync.py,sha256=cdLiH9hYqygzqW3LkibhrE6C8atin7mfTv_k3JJFE0k,18287
|
39
|
+
kreuzberg/_ocr/_tesseract.py,sha256=KtenEIGL63gRhdH2hxOEVM89locAETGo2bNjQMXjTwY,13266
|
41
40
|
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
42
|
-
kreuzberg/_utils/_cache.py,sha256=
|
41
|
+
kreuzberg/_utils/_cache.py,sha256=CtpSmEggWoIPDZ9_Nl0i5pr7wtPyci8EVT-ajYsARGI,13609
|
43
42
|
kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
|
44
43
|
kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
|
45
44
|
kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
|
46
45
|
kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
|
47
|
-
kreuzberg/_utils/_process_pool.py,sha256
|
48
|
-
kreuzberg/_utils/
|
49
|
-
kreuzberg/_utils/
|
50
|
-
kreuzberg/_utils/
|
46
|
+
kreuzberg/_utils/_process_pool.py,sha256=E3bHOO67TeoLUBjtw5HoY9gyFl621VaImYI-_itQ96c,8653
|
47
|
+
kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
|
48
|
+
kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
|
49
|
+
kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
|
50
|
+
kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
|
51
|
+
kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
|
51
52
|
kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
|
52
|
-
kreuzberg-3.
|
53
|
-
kreuzberg-3.
|
54
|
-
kreuzberg-3.
|
55
|
-
kreuzberg-3.
|
56
|
-
kreuzberg-3.
|
53
|
+
kreuzberg-3.8.0.dist-info/METADATA,sha256=d1N7v0EvJA-22g071Dctler5zF11WlKGTgLjGpsV8iw,11422
|
54
|
+
kreuzberg-3.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
55
|
+
kreuzberg-3.8.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
56
|
+
kreuzberg-3.8.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
57
|
+
kreuzberg-3.8.0.dist-info/RECORD,,
|
@@ -1,189 +0,0 @@
|
|
1
|
-
"""Process pool manager for resource-aware multiprocessing."""
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
import multiprocessing as mp
|
6
|
-
from concurrent.futures import ProcessPoolExecutor
|
7
|
-
from typing import TYPE_CHECKING, Any, TypeVar
|
8
|
-
|
9
|
-
import anyio
|
10
|
-
import psutil
|
11
|
-
from typing_extensions import Self
|
12
|
-
|
13
|
-
if TYPE_CHECKING:
|
14
|
-
import types
|
15
|
-
from collections.abc import Callable
|
16
|
-
|
17
|
-
T = TypeVar("T")
|
18
|
-
|
19
|
-
|
20
|
-
class ProcessPoolManager:
|
21
|
-
"""Resource-aware process pool manager for CPU-intensive tasks."""
|
22
|
-
|
23
|
-
def __init__(
|
24
|
-
self,
|
25
|
-
max_processes: int | None = None,
|
26
|
-
memory_limit_gb: float | None = None,
|
27
|
-
) -> None:
|
28
|
-
"""Initialize the process pool manager.
|
29
|
-
|
30
|
-
Args:
|
31
|
-
max_processes: Maximum number of processes. Defaults to CPU count.
|
32
|
-
memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
|
33
|
-
"""
|
34
|
-
self.max_processes = max_processes or mp.cpu_count()
|
35
|
-
|
36
|
-
if memory_limit_gb is None:
|
37
|
-
available_memory = psutil.virtual_memory().available
|
38
|
-
self.memory_limit_bytes = int(available_memory * 0.75) # Use 75% of available # ~keep
|
39
|
-
else:
|
40
|
-
self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
|
41
|
-
|
42
|
-
self._executor: ProcessPoolExecutor | None = None
|
43
|
-
self._active_tasks = 0
|
44
|
-
|
45
|
-
def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
|
46
|
-
"""Calculate optimal number of workers based on memory constraints.
|
47
|
-
|
48
|
-
Args:
|
49
|
-
task_memory_mb: Estimated memory usage per task in MB.
|
50
|
-
|
51
|
-
Returns:
|
52
|
-
Optimal number of workers.
|
53
|
-
"""
|
54
|
-
task_memory_bytes = task_memory_mb * 1024**2
|
55
|
-
memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
|
56
|
-
|
57
|
-
return min(self.max_processes, memory_based_limit)
|
58
|
-
|
59
|
-
def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
|
60
|
-
"""Ensure process pool executor is initialized."""
|
61
|
-
if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
|
62
|
-
if self._executor is not None:
|
63
|
-
self._executor.shutdown(wait=False)
|
64
|
-
|
65
|
-
workers = max_workers or self.max_processes
|
66
|
-
self._executor = ProcessPoolExecutor(max_workers=workers)
|
67
|
-
|
68
|
-
return self._executor
|
69
|
-
|
70
|
-
async def submit_task(
|
71
|
-
self,
|
72
|
-
func: Callable[..., T],
|
73
|
-
*args: Any,
|
74
|
-
task_memory_mb: float = 100,
|
75
|
-
) -> T:
|
76
|
-
"""Submit a task to the process pool.
|
77
|
-
|
78
|
-
Args:
|
79
|
-
func: Function to execute.
|
80
|
-
*args: Positional arguments for the function.
|
81
|
-
task_memory_mb: Estimated memory usage in MB.
|
82
|
-
|
83
|
-
Returns:
|
84
|
-
Result of the function execution.
|
85
|
-
"""
|
86
|
-
workers = self.get_optimal_workers(task_memory_mb)
|
87
|
-
self._ensure_executor(workers)
|
88
|
-
|
89
|
-
self._active_tasks += 1
|
90
|
-
|
91
|
-
try:
|
92
|
-
return await anyio.to_thread.run_sync(func, *args)
|
93
|
-
finally:
|
94
|
-
self._active_tasks -= 1
|
95
|
-
|
96
|
-
async def submit_batch(
|
97
|
-
self,
|
98
|
-
func: Callable[..., T],
|
99
|
-
arg_batches: list[tuple[Any, ...]],
|
100
|
-
task_memory_mb: float = 100,
|
101
|
-
max_concurrent: int | None = None,
|
102
|
-
) -> list[T]:
|
103
|
-
"""Submit a batch of tasks to the process pool.
|
104
|
-
|
105
|
-
Args:
|
106
|
-
func: Function to execute.
|
107
|
-
arg_batches: List of argument tuples for each task.
|
108
|
-
task_memory_mb: Estimated memory usage per task in MB.
|
109
|
-
max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
|
110
|
-
|
111
|
-
Returns:
|
112
|
-
List of results in the same order as input.
|
113
|
-
"""
|
114
|
-
if not arg_batches:
|
115
|
-
return []
|
116
|
-
|
117
|
-
workers = self.get_optimal_workers(task_memory_mb)
|
118
|
-
max_concurrent = max_concurrent or workers
|
119
|
-
|
120
|
-
self._ensure_executor(workers)
|
121
|
-
|
122
|
-
semaphore = anyio.CapacityLimiter(max_concurrent)
|
123
|
-
|
124
|
-
async def submit_single(args: tuple[Any, ...]) -> T:
|
125
|
-
async with semaphore:
|
126
|
-
self._active_tasks += 1
|
127
|
-
try:
|
128
|
-
return await anyio.to_thread.run_sync(func, *args)
|
129
|
-
finally:
|
130
|
-
self._active_tasks -= 1
|
131
|
-
|
132
|
-
async with anyio.create_task_group() as tg:
|
133
|
-
results: list[T] = [None] * len(arg_batches) # type: ignore[list-item]
|
134
|
-
|
135
|
-
async def run_task(idx: int, args: tuple[Any, ...]) -> None:
|
136
|
-
results[idx] = await submit_single(args)
|
137
|
-
|
138
|
-
for idx, args in enumerate(arg_batches):
|
139
|
-
tg.start_soon(run_task, idx, args)
|
140
|
-
|
141
|
-
return results
|
142
|
-
|
143
|
-
def get_system_info(self) -> dict[str, Any]:
|
144
|
-
"""Get current system resource information."""
|
145
|
-
memory = psutil.virtual_memory()
|
146
|
-
cpu_percent = psutil.cpu_percent(interval=1)
|
147
|
-
|
148
|
-
return {
|
149
|
-
"cpu_count": mp.cpu_count(),
|
150
|
-
"cpu_percent": cpu_percent,
|
151
|
-
"memory_total": memory.total,
|
152
|
-
"memory_available": memory.available,
|
153
|
-
"memory_percent": memory.percent,
|
154
|
-
"active_tasks": self._active_tasks,
|
155
|
-
"max_processes": self.max_processes,
|
156
|
-
"memory_limit": self.memory_limit_bytes,
|
157
|
-
}
|
158
|
-
|
159
|
-
def shutdown(self, wait: bool = True) -> None:
|
160
|
-
"""Shutdown the process pool."""
|
161
|
-
if self._executor is not None:
|
162
|
-
self._executor.shutdown(wait=wait)
|
163
|
-
self._executor = None
|
164
|
-
|
165
|
-
def __enter__(self) -> Self:
|
166
|
-
"""Context manager entry."""
|
167
|
-
return self
|
168
|
-
|
169
|
-
def __exit__(
|
170
|
-
self,
|
171
|
-
exc_type: type[BaseException] | None,
|
172
|
-
exc_val: BaseException | None,
|
173
|
-
exc_tb: types.TracebackType | None,
|
174
|
-
) -> None:
|
175
|
-
"""Context manager exit."""
|
176
|
-
self.shutdown()
|
177
|
-
|
178
|
-
async def __aenter__(self) -> Self:
|
179
|
-
"""Async context manager entry."""
|
180
|
-
return self
|
181
|
-
|
182
|
-
async def __aexit__(
|
183
|
-
self,
|
184
|
-
exc_type: type[BaseException] | None,
|
185
|
-
exc_val: BaseException | None,
|
186
|
-
exc_tb: types.TracebackType | None,
|
187
|
-
) -> None:
|
188
|
-
"""Async context manager exit."""
|
189
|
-
self.shutdown()
|
@@ -1,235 +0,0 @@
|
|
1
|
-
"""Pure synchronous EasyOCR without any async overhead."""
|
2
|
-
|
3
|
-
from __future__ import annotations
|
4
|
-
|
5
|
-
import tempfile
|
6
|
-
from pathlib import Path
|
7
|
-
from typing import Any
|
8
|
-
|
9
|
-
from PIL import Image
|
10
|
-
|
11
|
-
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
13
|
-
from kreuzberg._types import ExtractionResult
|
14
|
-
from kreuzberg._utils._string import normalize_spaces
|
15
|
-
from kreuzberg.exceptions import MissingDependencyError, OCRError
|
16
|
-
|
17
|
-
|
18
|
-
def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
|
19
|
-
"""Get an EasyOCR Reader instance with the given configuration."""
|
20
|
-
try:
|
21
|
-
import easyocr
|
22
|
-
except ImportError as e:
|
23
|
-
raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
|
24
|
-
|
25
|
-
gpu = False
|
26
|
-
if hasattr(config, "device"):
|
27
|
-
if config.device and config.device.lower() != "cpu":
|
28
|
-
gpu = True
|
29
|
-
elif hasattr(config, "use_gpu"):
|
30
|
-
gpu = config.use_gpu
|
31
|
-
|
32
|
-
language = config.language if hasattr(config, "language") else "en"
|
33
|
-
if isinstance(language, str):
|
34
|
-
lang_list = [lang.strip().lower() for lang in language.split(",")]
|
35
|
-
else:
|
36
|
-
lang_list = [lang.lower() for lang in language]
|
37
|
-
|
38
|
-
kwargs = {
|
39
|
-
"lang_list": lang_list,
|
40
|
-
"gpu": gpu,
|
41
|
-
"model_storage_directory": getattr(config, "model_storage_directory", None),
|
42
|
-
"user_network_directory": getattr(config, "user_network_directory", None),
|
43
|
-
"recog_network": getattr(config, "recog_network", None),
|
44
|
-
"detector": getattr(config, "detector", None),
|
45
|
-
"recognizer": getattr(config, "recognizer", None),
|
46
|
-
"verbose": False,
|
47
|
-
"quantize": getattr(config, "quantize", None),
|
48
|
-
"cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
|
49
|
-
}
|
50
|
-
|
51
|
-
kwargs = {k: v for k, v in kwargs.items() if v is not None}
|
52
|
-
|
53
|
-
return easyocr.Reader(**kwargs)
|
54
|
-
|
55
|
-
|
56
|
-
def process_image_sync_pure(
|
57
|
-
image_path: str | Path,
|
58
|
-
config: EasyOCRConfig | None = None,
|
59
|
-
) -> ExtractionResult:
|
60
|
-
"""Process an image with EasyOCR using pure sync implementation.
|
61
|
-
|
62
|
-
This bypasses all async overhead and calls EasyOCR directly.
|
63
|
-
|
64
|
-
Args:
|
65
|
-
image_path: Path to the image file.
|
66
|
-
config: EasyOCR configuration.
|
67
|
-
|
68
|
-
Returns:
|
69
|
-
Extraction result.
|
70
|
-
"""
|
71
|
-
cfg = config or EasyOCRConfig()
|
72
|
-
|
73
|
-
try:
|
74
|
-
reader = _get_easyocr_instance(cfg)
|
75
|
-
|
76
|
-
readtext_kwargs = {
|
77
|
-
"decoder": cfg.decoder,
|
78
|
-
"beamWidth": cfg.beam_width,
|
79
|
-
"batch_size": getattr(cfg, "batch_size", 1),
|
80
|
-
"workers": getattr(cfg, "workers", 0),
|
81
|
-
"allowlist": getattr(cfg, "allowlist", None),
|
82
|
-
"blocklist": getattr(cfg, "blocklist", None),
|
83
|
-
"detail": getattr(cfg, "detail", 1),
|
84
|
-
"rotation_info": cfg.rotation_info,
|
85
|
-
"paragraph": getattr(cfg, "paragraph", False),
|
86
|
-
"min_size": cfg.min_size,
|
87
|
-
"text_threshold": cfg.text_threshold,
|
88
|
-
"low_text": cfg.low_text,
|
89
|
-
"link_threshold": cfg.link_threshold,
|
90
|
-
"canvas_size": cfg.canvas_size,
|
91
|
-
"mag_ratio": cfg.mag_ratio,
|
92
|
-
"slope_ths": cfg.slope_ths,
|
93
|
-
"ycenter_ths": cfg.ycenter_ths,
|
94
|
-
"height_ths": cfg.height_ths,
|
95
|
-
"width_ths": cfg.width_ths,
|
96
|
-
"add_margin": cfg.add_margin,
|
97
|
-
"x_ths": cfg.x_ths,
|
98
|
-
"y_ths": cfg.y_ths,
|
99
|
-
}
|
100
|
-
|
101
|
-
readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
|
102
|
-
|
103
|
-
results = reader.readtext(str(image_path), **readtext_kwargs)
|
104
|
-
|
105
|
-
if not results:
|
106
|
-
return ExtractionResult(
|
107
|
-
content="",
|
108
|
-
mime_type=PLAIN_TEXT_MIME_TYPE,
|
109
|
-
metadata={},
|
110
|
-
chunks=[],
|
111
|
-
)
|
112
|
-
|
113
|
-
texts = []
|
114
|
-
confidences = []
|
115
|
-
|
116
|
-
detail_value = getattr(cfg, "detail", 1)
|
117
|
-
if detail_value:
|
118
|
-
for result in results:
|
119
|
-
min_result_length = 2
|
120
|
-
max_confidence_index = 2
|
121
|
-
if len(result) >= min_result_length:
|
122
|
-
_bbox, text = result[0], result[1]
|
123
|
-
confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
|
124
|
-
texts.append(text)
|
125
|
-
confidences.append(confidence)
|
126
|
-
else:
|
127
|
-
texts = results
|
128
|
-
confidences = [1.0] * len(texts)
|
129
|
-
|
130
|
-
content = "\n".join(texts)
|
131
|
-
content = normalize_spaces(content)
|
132
|
-
|
133
|
-
avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
|
134
|
-
|
135
|
-
metadata = {"confidence": avg_confidence} if confidences else {}
|
136
|
-
|
137
|
-
return ExtractionResult(
|
138
|
-
content=content,
|
139
|
-
mime_type=PLAIN_TEXT_MIME_TYPE,
|
140
|
-
metadata=metadata, # type: ignore[arg-type]
|
141
|
-
chunks=[],
|
142
|
-
)
|
143
|
-
|
144
|
-
except Exception as e:
|
145
|
-
raise OCRError(f"EasyOCR processing failed: {e}") from e
|
146
|
-
|
147
|
-
|
148
|
-
def process_image_bytes_sync_pure(
|
149
|
-
image_bytes: bytes,
|
150
|
-
config: EasyOCRConfig | None = None,
|
151
|
-
) -> ExtractionResult:
|
152
|
-
"""Process image bytes with EasyOCR using pure sync implementation.
|
153
|
-
|
154
|
-
Args:
|
155
|
-
image_bytes: Image data as bytes.
|
156
|
-
config: EasyOCR configuration.
|
157
|
-
|
158
|
-
Returns:
|
159
|
-
Extraction result.
|
160
|
-
"""
|
161
|
-
import io
|
162
|
-
|
163
|
-
with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
|
164
|
-
with Image.open(io.BytesIO(image_bytes)) as image:
|
165
|
-
image.save(tmp_image.name, format="PNG")
|
166
|
-
image_path = tmp_image.name
|
167
|
-
|
168
|
-
try:
|
169
|
-
return process_image_sync_pure(image_path, config)
|
170
|
-
finally:
|
171
|
-
image_file = Path(image_path)
|
172
|
-
if image_file.exists():
|
173
|
-
image_file.unlink()
|
174
|
-
|
175
|
-
|
176
|
-
def process_batch_images_sync_pure(
|
177
|
-
image_paths: list[str | Path],
|
178
|
-
config: EasyOCRConfig | None = None,
|
179
|
-
) -> list[ExtractionResult]:
|
180
|
-
"""Process a batch of images sequentially with pure sync implementation.
|
181
|
-
|
182
|
-
Args:
|
183
|
-
image_paths: List of image file paths.
|
184
|
-
config: EasyOCR configuration.
|
185
|
-
|
186
|
-
Returns:
|
187
|
-
List of extraction results.
|
188
|
-
"""
|
189
|
-
results = []
|
190
|
-
for image_path in image_paths:
|
191
|
-
result = process_image_sync_pure(image_path, config)
|
192
|
-
results.append(result)
|
193
|
-
return results
|
194
|
-
|
195
|
-
|
196
|
-
def process_batch_images_threaded(
|
197
|
-
image_paths: list[str | Path],
|
198
|
-
config: EasyOCRConfig | None = None,
|
199
|
-
max_workers: int | None = None,
|
200
|
-
) -> list[ExtractionResult]:
|
201
|
-
"""Process a batch of images using threading.
|
202
|
-
|
203
|
-
Args:
|
204
|
-
image_paths: List of image file paths.
|
205
|
-
config: EasyOCR configuration.
|
206
|
-
max_workers: Maximum number of threads.
|
207
|
-
|
208
|
-
Returns:
|
209
|
-
List of extraction results in same order as input.
|
210
|
-
"""
|
211
|
-
import multiprocessing as mp
|
212
|
-
from concurrent.futures import ThreadPoolExecutor, as_completed
|
213
|
-
|
214
|
-
if max_workers is None:
|
215
|
-
max_workers = min(len(image_paths), mp.cpu_count())
|
216
|
-
|
217
|
-
with ThreadPoolExecutor(max_workers=max_workers) as executor:
|
218
|
-
future_to_index = {
|
219
|
-
executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
|
220
|
-
}
|
221
|
-
|
222
|
-
results: list[ExtractionResult] = [None] * len(image_paths) # type: ignore[list-item]
|
223
|
-
for future in as_completed(future_to_index):
|
224
|
-
index = future_to_index[future]
|
225
|
-
try:
|
226
|
-
results[index] = future.result()
|
227
|
-
except Exception as e: # noqa: BLE001
|
228
|
-
results[index] = ExtractionResult(
|
229
|
-
content=f"Error: {e}",
|
230
|
-
mime_type=PLAIN_TEXT_MIME_TYPE,
|
231
|
-
metadata={"error": str(e)}, # type: ignore[typeddict-unknown-key]
|
232
|
-
chunks=[],
|
233
|
-
)
|
234
|
-
|
235
|
-
return results
|