kreuzberg 3.7.0__py3-none-any.whl → 3.8.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (33)
  1. kreuzberg/_extractors/_base.py +40 -0
  2. kreuzberg/_extractors/_email.py +149 -0
  3. kreuzberg/_extractors/_html.py +15 -3
  4. kreuzberg/_extractors/_image.py +17 -18
  5. kreuzberg/_extractors/_pdf.py +68 -14
  6. kreuzberg/_extractors/_presentation.py +62 -10
  7. kreuzberg/_extractors/_spread_sheet.py +179 -4
  8. kreuzberg/_extractors/_structured.py +148 -0
  9. kreuzberg/_gmft.py +2 -2
  10. kreuzberg/_mime_types.py +27 -1
  11. kreuzberg/_multiprocessing/__init__.py +2 -3
  12. kreuzberg/_ocr/__init__.py +30 -0
  13. kreuzberg/{_multiprocessing/tesseract_pool.py → _ocr/_pool.py} +3 -5
  14. kreuzberg/_ocr/_sync.py +566 -0
  15. kreuzberg/_ocr/_tesseract.py +6 -2
  16. kreuzberg/_registry.py +4 -0
  17. kreuzberg/_types.py +131 -0
  18. kreuzberg/_utils/_cache.py +17 -2
  19. kreuzberg/_utils/_process_pool.py +178 -1
  20. kreuzberg/_utils/_quality.py +237 -0
  21. kreuzberg/_utils/_serialization.py +4 -2
  22. kreuzberg/_utils/_string.py +153 -10
  23. kreuzberg/_utils/_sync.py +5 -2
  24. kreuzberg/_utils/_table.py +261 -0
  25. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/METADATA +66 -50
  26. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/RECORD +29 -28
  27. kreuzberg/_multiprocessing/process_manager.py +0 -189
  28. kreuzberg/_multiprocessing/sync_easyocr.py +0 -235
  29. kreuzberg/_multiprocessing/sync_paddleocr.py +0 -199
  30. kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
  31. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/WHEEL +0 -0
  32. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/entry_points.txt +0 -0
  33. {kreuzberg-3.7.0.dist-info → kreuzberg-3.8.0.dist-info}/licenses/LICENSE +0 -0
@@ -4,12 +4,12 @@ kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
4
4
  kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
6
  kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
7
- kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
7
+ kreuzberg/_gmft.py,sha256=ZIEUu4Uy5zYNFEeDRbz1cLJhnCAStVsSzm1PQ3vDeO8,14828
8
8
  kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
9
- kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
9
+ kreuzberg/_mime_types.py,sha256=OhJ6gEyyLHjyvRtkk37zyLFBsRcSd_QybBaV8TxinIg,8471
10
10
  kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
11
- kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
12
- kreuzberg/_types.py,sha256=U72a4SXS1e-zV8cXG0tiozMy9mX9wFM1ma6sVz7HpJo,9936
11
+ kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
12
+ kreuzberg/_types.py,sha256=R_0Xc2kq4nEwkruvkB3qfrLeJ996419hBQ_1C6Xrqjo,13388
13
13
  kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
14
14
  kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
15
15
  kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
@@ -17,40 +17,41 @@ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
17
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
19
19
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
20
- kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
21
- kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
22
- kreuzberg/_extractors/_image.py,sha256=pYfh3x9CkiIxOLvp0jkkZcmLbB_FpdfDo01klSc6OzQ,4819
20
+ kreuzberg/_extractors/_base.py,sha256=ECEwBpxnIy_J9kGZGuqsaPCgLFfxRn7kn4hIf11gDJ8,4478
21
+ kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
23
+ kreuzberg/_extractors/_image.py,sha256=0kzOQTTeJacaA8I9833fFvVQSz6FtUe9Nuw1oy0ToD0,4939
23
24
  kreuzberg/_extractors/_pandoc.py,sha256=oQ4DgQSPoX1LXjGAKh_A40JHqiKWb91LeRBYSS_6EUA,26750
24
- kreuzberg/_extractors/_pdf.py,sha256=R33ggTd0IU6NsEnzgHFTr9ScgcnM8nIIstDq7XMVcvg,14792
25
- kreuzberg/_extractors/_presentation.py,sha256=ZX-EKQppHwvKtyKk0-IQVF6QAqJi0SfGgCiiyqMQh0w,8701
26
- kreuzberg/_extractors/_spread_sheet.py,sha256=HOzCeYQc6kaMveAHfi80LrsF0yU7Kn74aKQ7lrMAlo8,6480
25
+ kreuzberg/_extractors/_pdf.py,sha256=giYG3aEdmsxT0tGWKBaMzHDPz74-jVmK4HZARDEBhsM,17108
26
+ kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
27
+ kreuzberg/_extractors/_spread_sheet.py,sha256=Nvyz7XT7C2ai4QeUashBeENQpuP5rs8SmKfumxEqlCg,13712
28
+ kreuzberg/_extractors/_structured.py,sha256=i3jAvhHZt_BsRGgZZfgcsUqlwAg_RNc8vsuecb04T0c,5581
27
29
  kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
28
30
  kreuzberg/_mcp/server.py,sha256=BQHeKI89aKf24BIE4n6m8r1rVA1Zgt6vM8Ki_OHuGnc,6780
29
- kreuzberg/_multiprocessing/__init__.py,sha256=nwYQpKH7ixHwzkQbTMFCstOCBKktmbNq5dTrwI2Mn94,203
31
+ kreuzberg/_multiprocessing/__init__.py,sha256=X2BtgKmWhF1rl0JYg2gvoSUaozKExfsWh-RRNvzNoOs,202
30
32
  kreuzberg/_multiprocessing/gmft_isolated.py,sha256=ZfbhiL5bhBEJnibUSls3WV-FECrnU9VvKfq5O2foHcc,11191
31
- kreuzberg/_multiprocessing/process_manager.py,sha256=_qtB8y9td2coJevlIl4z6F__jau320RdI1lqdyuaeD4,6061
32
- kreuzberg/_multiprocessing/sync_easyocr.py,sha256=-3_Ol0H8G6RhPxTbTPvoe8fTsTz3e-dg2QbHHnoJL48,7693
33
- kreuzberg/_multiprocessing/sync_paddleocr.py,sha256=5558iTjPXCyJWuyhZckmuJLadUwJDb5YVC8Cv-FOaWg,6090
34
- kreuzberg/_multiprocessing/sync_tesseract.py,sha256=Ck1PvHGWOMQWUcC7RyVrBt8K9VDFQ0lQcwFkwYzl3rE,8240
35
- kreuzberg/_multiprocessing/tesseract_pool.py,sha256=UN7BtS_ib1ux9xuR6d6AB3PY7UEUhd-5Ti1n1H0UnYw,10945
36
- kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
33
+ kreuzberg/_ocr/__init__.py,sha256=CC9Ob1t_ltTYUamK1ZtmkswfCYdn1B-Z0kPemsQU0xU,1439
37
34
  kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
38
35
  kreuzberg/_ocr/_easyocr.py,sha256=90Dv1xaLXbpG7EtmRQE5ykvnhqZJR3xSFXlxFMCSVSI,13740
39
36
  kreuzberg/_ocr/_paddleocr.py,sha256=UvugDdZd7RojHUiFeBaI8aqz36ecegPLj2v6oT6c42g,13776
40
- kreuzberg/_ocr/_tesseract.py,sha256=3s3MkZN9xA_Uedx4s2p5m4IEIMhGjs9gYHxan9Iz-2g,13044
37
+ kreuzberg/_ocr/_pool.py,sha256=Yb0l_GxnPsIWn3NA2FuBYEC8ipIqgwaYglUt0ltqSvk,10948
38
+ kreuzberg/_ocr/_sync.py,sha256=cdLiH9hYqygzqW3LkibhrE6C8atin7mfTv_k3JJFE0k,18287
39
+ kreuzberg/_ocr/_tesseract.py,sha256=KtenEIGL63gRhdH2hxOEVM89locAETGo2bNjQMXjTwY,13266
41
40
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
42
- kreuzberg/_utils/_cache.py,sha256=JGiwwcNBoD950IbsPUUAD5gAGS7byUuz0BqYSneVakc,13088
41
+ kreuzberg/_utils/_cache.py,sha256=CtpSmEggWoIPDZ9_Nl0i5pr7wtPyci8EVT-ajYsARGI,13609
43
42
  kreuzberg/_utils/_device.py,sha256=rnaSSB5ibf2wr7EDxrcmOUZ4Ocor0pHkwb3N1pC46EY,10276
44
43
  kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
45
44
  kreuzberg/_utils/_errors.py,sha256=AV3oaRQDgJxe1YUZd9pCQUysUv9KW8Ib37MvnyFOZ4o,6386
46
45
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
47
- kreuzberg/_utils/_process_pool.py,sha256=-0SNP01Qz21D7hgJmN0eHoqKusSygwPbi1U7IzJlPio,2895
48
- kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lTklO0g,2132
49
- kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
50
- kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
46
+ kreuzberg/_utils/_process_pool.py,sha256=E3bHOO67TeoLUBjtw5HoY9gyFl621VaImYI-_itQ96c,8653
47
+ kreuzberg/_utils/_quality.py,sha256=dgFLt40NSqB8Ciej5QcZQLiV4U7LcrGux0vXckiE31U,7568
48
+ kreuzberg/_utils/_serialization.py,sha256=Rt5zSkvzf1SVNDrI6F2Zvnkel24mQkD1QvP0WjgZUgk,2195
49
+ kreuzberg/_utils/_string.py,sha256=5YKu9EZlZQ-LkphXUq8fdwKQrX9jWACFEhMGfjIysf4,6381
50
+ kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
51
+ kreuzberg/_utils/_table.py,sha256=C2skLtcyczxDEH33Qw2dOwnR15SGillvNEP-NzBG3R8,8156
51
52
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
52
- kreuzberg-3.7.0.dist-info/METADATA,sha256=0rBXhtDYCdZ2AGpQdXTTZUQUX8T01OsKzwrm2nl14QA,11137
53
- kreuzberg-3.7.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
54
- kreuzberg-3.7.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
55
- kreuzberg-3.7.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
56
- kreuzberg-3.7.0.dist-info/RECORD,,
53
+ kreuzberg-3.8.0.dist-info/METADATA,sha256=d1N7v0EvJA-22g071Dctler5zF11WlKGTgLjGpsV8iw,11422
54
+ kreuzberg-3.8.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
55
+ kreuzberg-3.8.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
56
+ kreuzberg-3.8.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
57
+ kreuzberg-3.8.0.dist-info/RECORD,,
@@ -1,189 +0,0 @@
1
- """Process pool manager for resource-aware multiprocessing."""
2
-
3
- from __future__ import annotations
4
-
5
- import multiprocessing as mp
6
- from concurrent.futures import ProcessPoolExecutor
7
- from typing import TYPE_CHECKING, Any, TypeVar
8
-
9
- import anyio
10
- import psutil
11
- from typing_extensions import Self
12
-
13
- if TYPE_CHECKING:
14
- import types
15
- from collections.abc import Callable
16
-
17
- T = TypeVar("T")
18
-
19
-
20
- class ProcessPoolManager:
21
- """Resource-aware process pool manager for CPU-intensive tasks."""
22
-
23
- def __init__(
24
- self,
25
- max_processes: int | None = None,
26
- memory_limit_gb: float | None = None,
27
- ) -> None:
28
- """Initialize the process pool manager.
29
-
30
- Args:
31
- max_processes: Maximum number of processes. Defaults to CPU count.
32
- memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
33
- """
34
- self.max_processes = max_processes or mp.cpu_count()
35
-
36
- if memory_limit_gb is None:
37
- available_memory = psutil.virtual_memory().available
38
- self.memory_limit_bytes = int(available_memory * 0.75) # Use 75% of available # ~keep
39
- else:
40
- self.memory_limit_bytes = int(memory_limit_gb * 1024**3)
41
-
42
- self._executor: ProcessPoolExecutor | None = None
43
- self._active_tasks = 0
44
-
45
- def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
46
- """Calculate optimal number of workers based on memory constraints.
47
-
48
- Args:
49
- task_memory_mb: Estimated memory usage per task in MB.
50
-
51
- Returns:
52
- Optimal number of workers.
53
- """
54
- task_memory_bytes = task_memory_mb * 1024**2
55
- memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
56
-
57
- return min(self.max_processes, memory_based_limit)
58
-
59
- def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
60
- """Ensure process pool executor is initialized."""
61
- if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
62
- if self._executor is not None:
63
- self._executor.shutdown(wait=False)
64
-
65
- workers = max_workers or self.max_processes
66
- self._executor = ProcessPoolExecutor(max_workers=workers)
67
-
68
- return self._executor
69
-
70
- async def submit_task(
71
- self,
72
- func: Callable[..., T],
73
- *args: Any,
74
- task_memory_mb: float = 100,
75
- ) -> T:
76
- """Submit a task to the process pool.
77
-
78
- Args:
79
- func: Function to execute.
80
- *args: Positional arguments for the function.
81
- task_memory_mb: Estimated memory usage in MB.
82
-
83
- Returns:
84
- Result of the function execution.
85
- """
86
- workers = self.get_optimal_workers(task_memory_mb)
87
- self._ensure_executor(workers)
88
-
89
- self._active_tasks += 1
90
-
91
- try:
92
- return await anyio.to_thread.run_sync(func, *args)
93
- finally:
94
- self._active_tasks -= 1
95
-
96
- async def submit_batch(
97
- self,
98
- func: Callable[..., T],
99
- arg_batches: list[tuple[Any, ...]],
100
- task_memory_mb: float = 100,
101
- max_concurrent: int | None = None,
102
- ) -> list[T]:
103
- """Submit a batch of tasks to the process pool.
104
-
105
- Args:
106
- func: Function to execute.
107
- arg_batches: List of argument tuples for each task.
108
- task_memory_mb: Estimated memory usage per task in MB.
109
- max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
110
-
111
- Returns:
112
- List of results in the same order as input.
113
- """
114
- if not arg_batches:
115
- return []
116
-
117
- workers = self.get_optimal_workers(task_memory_mb)
118
- max_concurrent = max_concurrent or workers
119
-
120
- self._ensure_executor(workers)
121
-
122
- semaphore = anyio.CapacityLimiter(max_concurrent)
123
-
124
- async def submit_single(args: tuple[Any, ...]) -> T:
125
- async with semaphore:
126
- self._active_tasks += 1
127
- try:
128
- return await anyio.to_thread.run_sync(func, *args)
129
- finally:
130
- self._active_tasks -= 1
131
-
132
- async with anyio.create_task_group() as tg:
133
- results: list[T] = [None] * len(arg_batches) # type: ignore[list-item]
134
-
135
- async def run_task(idx: int, args: tuple[Any, ...]) -> None:
136
- results[idx] = await submit_single(args)
137
-
138
- for idx, args in enumerate(arg_batches):
139
- tg.start_soon(run_task, idx, args)
140
-
141
- return results
142
-
143
- def get_system_info(self) -> dict[str, Any]:
144
- """Get current system resource information."""
145
- memory = psutil.virtual_memory()
146
- cpu_percent = psutil.cpu_percent(interval=1)
147
-
148
- return {
149
- "cpu_count": mp.cpu_count(),
150
- "cpu_percent": cpu_percent,
151
- "memory_total": memory.total,
152
- "memory_available": memory.available,
153
- "memory_percent": memory.percent,
154
- "active_tasks": self._active_tasks,
155
- "max_processes": self.max_processes,
156
- "memory_limit": self.memory_limit_bytes,
157
- }
158
-
159
- def shutdown(self, wait: bool = True) -> None:
160
- """Shutdown the process pool."""
161
- if self._executor is not None:
162
- self._executor.shutdown(wait=wait)
163
- self._executor = None
164
-
165
- def __enter__(self) -> Self:
166
- """Context manager entry."""
167
- return self
168
-
169
- def __exit__(
170
- self,
171
- exc_type: type[BaseException] | None,
172
- exc_val: BaseException | None,
173
- exc_tb: types.TracebackType | None,
174
- ) -> None:
175
- """Context manager exit."""
176
- self.shutdown()
177
-
178
- async def __aenter__(self) -> Self:
179
- """Async context manager entry."""
180
- return self
181
-
182
- async def __aexit__(
183
- self,
184
- exc_type: type[BaseException] | None,
185
- exc_val: BaseException | None,
186
- exc_tb: types.TracebackType | None,
187
- ) -> None:
188
- """Async context manager exit."""
189
- self.shutdown()
@@ -1,235 +0,0 @@
1
- """Pure synchronous EasyOCR without any async overhead."""
2
-
3
- from __future__ import annotations
4
-
5
- import tempfile
6
- from pathlib import Path
7
- from typing import Any
8
-
9
- from PIL import Image
10
-
11
- from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
- from kreuzberg._ocr._easyocr import EasyOCRConfig
13
- from kreuzberg._types import ExtractionResult
14
- from kreuzberg._utils._string import normalize_spaces
15
- from kreuzberg.exceptions import MissingDependencyError, OCRError
16
-
17
-
18
- def _get_easyocr_instance(config: EasyOCRConfig) -> Any:
19
- """Get an EasyOCR Reader instance with the given configuration."""
20
- try:
21
- import easyocr
22
- except ImportError as e:
23
- raise MissingDependencyError("EasyOCR is not installed. Install it with: pip install easyocr") from e
24
-
25
- gpu = False
26
- if hasattr(config, "device"):
27
- if config.device and config.device.lower() != "cpu":
28
- gpu = True
29
- elif hasattr(config, "use_gpu"):
30
- gpu = config.use_gpu
31
-
32
- language = config.language if hasattr(config, "language") else "en"
33
- if isinstance(language, str):
34
- lang_list = [lang.strip().lower() for lang in language.split(",")]
35
- else:
36
- lang_list = [lang.lower() for lang in language]
37
-
38
- kwargs = {
39
- "lang_list": lang_list,
40
- "gpu": gpu,
41
- "model_storage_directory": getattr(config, "model_storage_directory", None),
42
- "user_network_directory": getattr(config, "user_network_directory", None),
43
- "recog_network": getattr(config, "recog_network", None),
44
- "detector": getattr(config, "detector", None),
45
- "recognizer": getattr(config, "recognizer", None),
46
- "verbose": False,
47
- "quantize": getattr(config, "quantize", None),
48
- "cudnn_benchmark": getattr(config, "cudnn_benchmark", None),
49
- }
50
-
51
- kwargs = {k: v for k, v in kwargs.items() if v is not None}
52
-
53
- return easyocr.Reader(**kwargs)
54
-
55
-
56
- def process_image_sync_pure(
57
- image_path: str | Path,
58
- config: EasyOCRConfig | None = None,
59
- ) -> ExtractionResult:
60
- """Process an image with EasyOCR using pure sync implementation.
61
-
62
- This bypasses all async overhead and calls EasyOCR directly.
63
-
64
- Args:
65
- image_path: Path to the image file.
66
- config: EasyOCR configuration.
67
-
68
- Returns:
69
- Extraction result.
70
- """
71
- cfg = config or EasyOCRConfig()
72
-
73
- try:
74
- reader = _get_easyocr_instance(cfg)
75
-
76
- readtext_kwargs = {
77
- "decoder": cfg.decoder,
78
- "beamWidth": cfg.beam_width,
79
- "batch_size": getattr(cfg, "batch_size", 1),
80
- "workers": getattr(cfg, "workers", 0),
81
- "allowlist": getattr(cfg, "allowlist", None),
82
- "blocklist": getattr(cfg, "blocklist", None),
83
- "detail": getattr(cfg, "detail", 1),
84
- "rotation_info": cfg.rotation_info,
85
- "paragraph": getattr(cfg, "paragraph", False),
86
- "min_size": cfg.min_size,
87
- "text_threshold": cfg.text_threshold,
88
- "low_text": cfg.low_text,
89
- "link_threshold": cfg.link_threshold,
90
- "canvas_size": cfg.canvas_size,
91
- "mag_ratio": cfg.mag_ratio,
92
- "slope_ths": cfg.slope_ths,
93
- "ycenter_ths": cfg.ycenter_ths,
94
- "height_ths": cfg.height_ths,
95
- "width_ths": cfg.width_ths,
96
- "add_margin": cfg.add_margin,
97
- "x_ths": cfg.x_ths,
98
- "y_ths": cfg.y_ths,
99
- }
100
-
101
- readtext_kwargs = {k: v for k, v in readtext_kwargs.items() if v is not None}
102
-
103
- results = reader.readtext(str(image_path), **readtext_kwargs)
104
-
105
- if not results:
106
- return ExtractionResult(
107
- content="",
108
- mime_type=PLAIN_TEXT_MIME_TYPE,
109
- metadata={},
110
- chunks=[],
111
- )
112
-
113
- texts = []
114
- confidences = []
115
-
116
- detail_value = getattr(cfg, "detail", 1)
117
- if detail_value:
118
- for result in results:
119
- min_result_length = 2
120
- max_confidence_index = 2
121
- if len(result) >= min_result_length:
122
- _bbox, text = result[0], result[1]
123
- confidence = result[max_confidence_index] if len(result) > max_confidence_index else 1.0
124
- texts.append(text)
125
- confidences.append(confidence)
126
- else:
127
- texts = results
128
- confidences = [1.0] * len(texts)
129
-
130
- content = "\n".join(texts)
131
- content = normalize_spaces(content)
132
-
133
- avg_confidence = sum(confidences) / len(confidences) if confidences else 0.0
134
-
135
- metadata = {"confidence": avg_confidence} if confidences else {}
136
-
137
- return ExtractionResult(
138
- content=content,
139
- mime_type=PLAIN_TEXT_MIME_TYPE,
140
- metadata=metadata, # type: ignore[arg-type]
141
- chunks=[],
142
- )
143
-
144
- except Exception as e:
145
- raise OCRError(f"EasyOCR processing failed: {e}") from e
146
-
147
-
148
- def process_image_bytes_sync_pure(
149
- image_bytes: bytes,
150
- config: EasyOCRConfig | None = None,
151
- ) -> ExtractionResult:
152
- """Process image bytes with EasyOCR using pure sync implementation.
153
-
154
- Args:
155
- image_bytes: Image data as bytes.
156
- config: EasyOCR configuration.
157
-
158
- Returns:
159
- Extraction result.
160
- """
161
- import io
162
-
163
- with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp_image:
164
- with Image.open(io.BytesIO(image_bytes)) as image:
165
- image.save(tmp_image.name, format="PNG")
166
- image_path = tmp_image.name
167
-
168
- try:
169
- return process_image_sync_pure(image_path, config)
170
- finally:
171
- image_file = Path(image_path)
172
- if image_file.exists():
173
- image_file.unlink()
174
-
175
-
176
- def process_batch_images_sync_pure(
177
- image_paths: list[str | Path],
178
- config: EasyOCRConfig | None = None,
179
- ) -> list[ExtractionResult]:
180
- """Process a batch of images sequentially with pure sync implementation.
181
-
182
- Args:
183
- image_paths: List of image file paths.
184
- config: EasyOCR configuration.
185
-
186
- Returns:
187
- List of extraction results.
188
- """
189
- results = []
190
- for image_path in image_paths:
191
- result = process_image_sync_pure(image_path, config)
192
- results.append(result)
193
- return results
194
-
195
-
196
- def process_batch_images_threaded(
197
- image_paths: list[str | Path],
198
- config: EasyOCRConfig | None = None,
199
- max_workers: int | None = None,
200
- ) -> list[ExtractionResult]:
201
- """Process a batch of images using threading.
202
-
203
- Args:
204
- image_paths: List of image file paths.
205
- config: EasyOCR configuration.
206
- max_workers: Maximum number of threads.
207
-
208
- Returns:
209
- List of extraction results in same order as input.
210
- """
211
- import multiprocessing as mp
212
- from concurrent.futures import ThreadPoolExecutor, as_completed
213
-
214
- if max_workers is None:
215
- max_workers = min(len(image_paths), mp.cpu_count())
216
-
217
- with ThreadPoolExecutor(max_workers=max_workers) as executor:
218
- future_to_index = {
219
- executor.submit(process_image_sync_pure, path, config): i for i, path in enumerate(image_paths)
220
- }
221
-
222
- results: list[ExtractionResult] = [None] * len(image_paths) # type: ignore[list-item]
223
- for future in as_completed(future_to_index):
224
- index = future_to_index[future]
225
- try:
226
- results[index] = future.result()
227
- except Exception as e: # noqa: BLE001
228
- results[index] = ExtractionResult(
229
- content=f"Error: {e}",
230
- mime_type=PLAIN_TEXT_MIME_TYPE,
231
- metadata={"error": str(e)}, # type: ignore[typeddict-unknown-key]
232
- chunks=[],
233
- )
234
-
235
- return results