kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py
CHANGED
@@ -7,16 +7,17 @@ import queue
|
|
7
7
|
import signal
|
8
8
|
import time
|
9
9
|
import traceback
|
10
|
-
from dataclasses import dataclass, field
|
11
10
|
from io import StringIO
|
12
11
|
from pathlib import Path
|
13
|
-
from typing import TYPE_CHECKING, Any
|
12
|
+
from typing import TYPE_CHECKING, Any
|
14
13
|
|
15
14
|
import anyio
|
16
15
|
import msgspec
|
16
|
+
import pandas as pd
|
17
17
|
from PIL import Image
|
18
18
|
|
19
|
-
from kreuzberg._types import TableData
|
19
|
+
from kreuzberg._types import GMFTConfig, TableData
|
20
|
+
from kreuzberg._utils._cache import get_table_cache
|
20
21
|
from kreuzberg._utils._sync import run_sync
|
21
22
|
from kreuzberg.exceptions import MissingDependencyError, ParsingError
|
22
23
|
|
@@ -27,117 +28,6 @@ if TYPE_CHECKING:
|
|
27
28
|
from pandas import DataFrame
|
28
29
|
|
29
30
|
|
30
|
-
@dataclass(unsafe_hash=True, slots=True)
|
31
|
-
class GMFTConfig:
|
32
|
-
"""Configuration options for GMFT.
|
33
|
-
|
34
|
-
This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
|
35
|
-
"""
|
36
|
-
|
37
|
-
verbosity: int = 0
|
38
|
-
"""
|
39
|
-
Verbosity level for logging.
|
40
|
-
|
41
|
-
0: errors only
|
42
|
-
1: print warnings
|
43
|
-
2: print warnings and info
|
44
|
-
3: print warnings, info, and debug
|
45
|
-
"""
|
46
|
-
formatter_base_threshold: float = 0.3
|
47
|
-
"""
|
48
|
-
Base threshold for the confidence demanded of a table feature (row/column).
|
49
|
-
|
50
|
-
Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
|
51
|
-
"""
|
52
|
-
cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
|
53
|
-
default_factory=lambda: {
|
54
|
-
0: 0.3,
|
55
|
-
1: 0.3,
|
56
|
-
2: 0.3,
|
57
|
-
3: 0.3,
|
58
|
-
4: 0.5,
|
59
|
-
5: 0.5,
|
60
|
-
6: 99,
|
61
|
-
},
|
62
|
-
hash=False,
|
63
|
-
)
|
64
|
-
"""
|
65
|
-
Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
|
66
|
-
|
67
|
-
But low confidences may be better than too high confidence (see formatter_base_threshold)
|
68
|
-
"""
|
69
|
-
detector_base_threshold: float = 0.9
|
70
|
-
"""Minimum confidence score required for a table"""
|
71
|
-
remove_null_rows: bool = True
|
72
|
-
"""
|
73
|
-
Flag to remove rows with no text.
|
74
|
-
"""
|
75
|
-
enable_multi_header: bool = False
|
76
|
-
"""
|
77
|
-
Enable multi-indices in the dataframe.
|
78
|
-
|
79
|
-
If false, then multiple headers will be merged column-wise.
|
80
|
-
"""
|
81
|
-
semantic_spanning_cells: bool = False
|
82
|
-
"""
|
83
|
-
[Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
|
84
|
-
"""
|
85
|
-
semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
|
86
|
-
"""
|
87
|
-
[Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
|
88
|
-
|
89
|
-
Possible values: 'algorithm', 'deep', None.
|
90
|
-
|
91
|
-
'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
|
92
|
-
'deep': merges headers according to the spanning cells detected by the Table Transformer.
|
93
|
-
None: headers are not duplicated.
|
94
|
-
"""
|
95
|
-
large_table_if_n_rows_removed: int = 8
|
96
|
-
"""
|
97
|
-
If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
|
98
|
-
"""
|
99
|
-
large_table_threshold: int = 10
|
100
|
-
"""
|
101
|
-
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
102
|
-
|
103
|
-
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
|
104
|
-
"""
|
105
|
-
large_table_row_overlap_threshold: float = 0.2
|
106
|
-
"""
|
107
|
-
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
108
|
-
|
109
|
-
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
|
110
|
-
"""
|
111
|
-
large_table_maximum_rows: int = 1000
|
112
|
-
"""
|
113
|
-
Maximum number of rows allowed for a large table.
|
114
|
-
"""
|
115
|
-
force_large_table_assumption: bool | None = None
|
116
|
-
"""
|
117
|
-
Force the large table assumption to be applied, regardless of the number of rows and overlap.
|
118
|
-
"""
|
119
|
-
total_overlap_reject_threshold: float = 0.9
|
120
|
-
"""
|
121
|
-
Reject if total overlap is > 90% of table area.
|
122
|
-
"""
|
123
|
-
total_overlap_warn_threshold: float = 0.1
|
124
|
-
"""
|
125
|
-
Warn if total overlap is > 10% of table area.
|
126
|
-
"""
|
127
|
-
nms_warn_threshold: int = 5
|
128
|
-
"""
|
129
|
-
Warn if non maxima suppression removes > 5 rows.
|
130
|
-
"""
|
131
|
-
iob_reject_threshold: float = 0.05
|
132
|
-
"""
|
133
|
-
Reject if iob between textbox and cell is < 5%.
|
134
|
-
"""
|
135
|
-
iob_warn_threshold: float = 0.5
|
136
|
-
"""
|
137
|
-
Warn if iob between textbox and cell is < 50%.
|
138
|
-
"""
|
139
|
-
|
140
|
-
|
141
31
|
async def extract_tables(
|
142
32
|
file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
|
143
33
|
) -> list[TableData]:
|
@@ -158,8 +48,6 @@ async def extract_tables(
|
|
158
48
|
Returns:
|
159
49
|
A list of table data dictionaries.
|
160
50
|
"""
|
161
|
-
from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
|
162
|
-
|
163
51
|
# Determine if we should use isolated process # ~keep
|
164
52
|
if use_isolated_process is None:
|
165
53
|
use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
|
@@ -211,15 +99,15 @@ async def extract_tables(
|
|
211
99
|
return result
|
212
100
|
|
213
101
|
try:
|
214
|
-
from gmft.auto import ( # type: ignore[attr-defined] # noqa: PLC0415
|
102
|
+
from gmft.auto import ( # type: ignore[attr-defined] # noqa: PLC0415
|
215
103
|
AutoTableDetector,
|
216
104
|
AutoTableFormatter,
|
217
105
|
)
|
218
106
|
from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined] # noqa: PLC0415
|
219
|
-
from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415
|
220
|
-
from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415
|
107
|
+
from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415
|
108
|
+
from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415
|
221
109
|
|
222
|
-
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
|
110
|
+
formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
|
223
111
|
config=TATRFormatConfig(
|
224
112
|
verbosity=config.verbosity,
|
225
113
|
formatter_base_threshold=config.formatter_base_threshold,
|
@@ -235,7 +123,7 @@ async def extract_tables(
|
|
235
123
|
force_large_table_assumption=config.force_large_table_assumption,
|
236
124
|
)
|
237
125
|
)
|
238
|
-
detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
|
126
|
+
detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
|
239
127
|
config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
|
240
128
|
)
|
241
129
|
doc = await run_sync(PyPDFium2Document, str(file_path))
|
@@ -287,8 +175,6 @@ def extract_tables_sync(
|
|
287
175
|
Returns:
|
288
176
|
A list of table data dictionaries.
|
289
177
|
"""
|
290
|
-
from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
|
291
|
-
|
292
178
|
# Determine if we should use isolated process # ~keep
|
293
179
|
if use_isolated_process is None:
|
294
180
|
use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
|
@@ -542,7 +428,6 @@ def _extract_tables_isolated(
|
|
542
428
|
tables = []
|
543
429
|
for table_dict in result:
|
544
430
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
545
|
-
import pandas as pd # noqa: PLC0415
|
546
431
|
|
547
432
|
if table_dict["df_csv"] is None:
|
548
433
|
df = pd.DataFrame(columns=table_dict["df_columns"])
|
@@ -620,38 +505,29 @@ async def _extract_tables_isolated_async(
|
|
620
505
|
|
621
506
|
try:
|
622
507
|
|
623
|
-
|
508
|
+
def get_result_sync() -> tuple[bool, Any]:
|
624
509
|
while True:
|
625
510
|
try:
|
626
|
-
return result_queue.
|
511
|
+
return result_queue.get(timeout=0.1) # type: ignore[no-any-return]
|
627
512
|
except queue.Empty: # noqa: PERF203
|
628
|
-
await anyio.sleep(0.1)
|
629
513
|
if not process.is_alive():
|
630
|
-
# Process died without putting result # ~keep
|
631
514
|
if process.exitcode == -signal.SIGSEGV:
|
632
515
|
raise ParsingError(
|
633
516
|
"GMFT process crashed with segmentation fault",
|
634
|
-
context={
|
635
|
-
"file_path": str(file_path),
|
636
|
-
"exit_code": process.exitcode,
|
637
|
-
},
|
517
|
+
context={"file_path": str(file_path), "exit_code": process.exitcode},
|
638
518
|
) from None
|
639
519
|
raise ParsingError(
|
640
520
|
f"GMFT process died unexpectedly with exit code {process.exitcode}",
|
641
|
-
context={
|
642
|
-
"file_path": str(file_path),
|
643
|
-
"exit_code": process.exitcode,
|
644
|
-
},
|
521
|
+
context={"file_path": str(file_path), "exit_code": process.exitcode},
|
645
522
|
) from None
|
646
523
|
|
647
524
|
with anyio.fail_after(timeout):
|
648
|
-
success, result = await
|
525
|
+
success, result = await anyio.to_thread.run_sync(get_result_sync)
|
649
526
|
|
650
527
|
if success:
|
651
528
|
tables = []
|
652
529
|
for table_dict in result:
|
653
530
|
img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
|
654
|
-
import pandas as pd # noqa: PLC0415
|
655
531
|
|
656
532
|
if table_dict["df_csv"] is None:
|
657
533
|
df = pd.DataFrame(columns=table_dict["df_columns"])
|
kreuzberg/_language_detection.py
CHANGED
@@ -1,9 +1,9 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from dataclasses import dataclass
|
4
3
|
from functools import lru_cache
|
5
4
|
from typing import TYPE_CHECKING, Any
|
6
5
|
|
6
|
+
from kreuzberg._types import LanguageDetectionConfig
|
7
7
|
from kreuzberg.exceptions import MissingDependencyError
|
8
8
|
|
9
9
|
if TYPE_CHECKING:
|
@@ -23,27 +23,6 @@ except ImportError: # pragma: no cover
|
|
23
23
|
_CACHE_SIZE = 128
|
24
24
|
|
25
25
|
|
26
|
-
@dataclass(frozen=True, slots=True)
|
27
|
-
class LanguageDetectionConfig:
|
28
|
-
"""Configuration for language detection.
|
29
|
-
|
30
|
-
Attributes:
|
31
|
-
low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
|
32
|
-
Defaults to True for better memory efficiency.
|
33
|
-
top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
|
34
|
-
multilingual: If True, uses multilingual detection to handle mixed-language text.
|
35
|
-
If False, uses single language detection. Defaults to False.
|
36
|
-
cache_dir: Custom directory for model cache. If None, uses system default.
|
37
|
-
allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
|
38
|
-
"""
|
39
|
-
|
40
|
-
low_memory: bool = True
|
41
|
-
top_k: int = 3
|
42
|
-
multilingual: bool = False
|
43
|
-
cache_dir: str | None = None
|
44
|
-
allow_fallback: bool = True
|
45
|
-
|
46
|
-
|
47
26
|
def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
|
48
27
|
"""Create FastLangDetectConfig from our config."""
|
49
28
|
if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
|
kreuzberg/_mcp/__init__.py
CHANGED
kreuzberg/_mcp/server.py
CHANGED
@@ -1,5 +1,3 @@
|
|
1
|
-
"""Kreuzberg MCP server implementation."""
|
2
|
-
|
3
1
|
from __future__ import annotations
|
4
2
|
|
5
3
|
import base64
|
@@ -10,11 +8,10 @@ import msgspec
|
|
10
8
|
from mcp.server import FastMCP
|
11
9
|
from mcp.types import TextContent
|
12
10
|
|
13
|
-
from kreuzberg._config import try_discover_config
|
11
|
+
from kreuzberg._config import discover_config
|
14
12
|
from kreuzberg._types import ExtractionConfig, OcrBackendType
|
15
13
|
from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
|
16
14
|
|
17
|
-
# Create the MCP server
|
18
15
|
mcp = FastMCP("Kreuzberg Text Extraction")
|
19
16
|
|
20
17
|
|
@@ -27,14 +24,11 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
|
|
27
24
|
Returns:
|
28
25
|
ExtractionConfig instance.
|
29
26
|
"""
|
30
|
-
|
31
|
-
base_config = try_discover_config()
|
27
|
+
base_config = discover_config()
|
32
28
|
|
33
29
|
if base_config is None:
|
34
|
-
# No config file found, use defaults
|
35
30
|
return ExtractionConfig(**kwargs)
|
36
31
|
|
37
|
-
# Merge discovered config with tool parameters (tool params take precedence)
|
38
32
|
config_dict: dict[str, Any] = {
|
39
33
|
"force_ocr": base_config.force_ocr,
|
40
34
|
"chunk_content": base_config.chunk_content,
|
@@ -50,7 +44,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
|
|
50
44
|
"gmft_config": base_config.gmft_config,
|
51
45
|
}
|
52
46
|
|
53
|
-
# Override with provided parameters
|
54
47
|
config_dict = config_dict | kwargs
|
55
48
|
|
56
49
|
return ExtractionConfig(**config_dict)
|
@@ -189,7 +182,7 @@ def get_default_config() -> str:
|
|
189
182
|
@mcp.resource("config://discovered")
|
190
183
|
def get_discovered_config() -> str:
|
191
184
|
"""Get the discovered configuration from config files."""
|
192
|
-
config = try_discover_config()
|
185
|
+
config = discover_config()
|
193
186
|
if config is None:
|
194
187
|
return "No configuration file found"
|
195
188
|
return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
|
kreuzberg/_mime_types.py
CHANGED
@@ -4,6 +4,7 @@ from mimetypes import guess_type
|
|
4
4
|
from pathlib import Path
|
5
5
|
from typing import TYPE_CHECKING, Final
|
6
6
|
|
7
|
+
from kreuzberg._utils._cache import get_mime_cache
|
7
8
|
from kreuzberg.exceptions import ValidationError
|
8
9
|
|
9
10
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -191,8 +192,6 @@ def validate_mime_type(
|
|
191
192
|
return _validate_explicit_mime_type(mime_type)
|
192
193
|
|
193
194
|
if file_path:
|
194
|
-
from kreuzberg._utils._cache import get_mime_cache # noqa: PLC0415
|
195
|
-
|
196
195
|
path = Path(file_path)
|
197
196
|
|
198
197
|
try:
|
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -1,15 +1,14 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import warnings
|
4
|
-
from dataclasses import dataclass
|
5
|
-
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
4
|
+
from typing import TYPE_CHECKING, Any, ClassVar, Final
|
6
5
|
|
7
6
|
from PIL import Image
|
8
7
|
|
9
8
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
10
9
|
from kreuzberg._ocr._base import OCRBackend
|
11
|
-
from kreuzberg._types import ExtractionResult, Metadata
|
12
|
-
from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
|
10
|
+
from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
|
11
|
+
from kreuzberg._utils._device import DeviceInfo, validate_device_request
|
13
12
|
from kreuzberg._utils._string import normalize_spaces
|
14
13
|
from kreuzberg._utils._sync import run_sync
|
15
14
|
from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
|
@@ -22,6 +21,18 @@ try: # pragma: no cover
|
|
22
21
|
except ImportError: # pragma: no cover
|
23
22
|
from typing_extensions import Unpack
|
24
23
|
|
24
|
+
try:
|
25
|
+
import easyocr
|
26
|
+
import numpy as np
|
27
|
+
import torch
|
28
|
+
|
29
|
+
HAS_EASYOCR = True
|
30
|
+
except ImportError:
|
31
|
+
HAS_EASYOCR = False
|
32
|
+
easyocr = None
|
33
|
+
np = None # type: ignore[assignment]
|
34
|
+
torch = None # type: ignore[assignment]
|
35
|
+
|
25
36
|
|
26
37
|
EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
27
38
|
"abq",
|
@@ -110,59 +121,6 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
|
|
110
121
|
}
|
111
122
|
|
112
123
|
|
113
|
-
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
114
|
-
class EasyOCRConfig:
|
115
|
-
"""Configuration options for EasyOCR."""
|
116
|
-
|
117
|
-
add_margin: float = 0.1
|
118
|
-
"""Extend bounding boxes in all directions."""
|
119
|
-
adjust_contrast: float = 0.5
|
120
|
-
"""Target contrast level for low contrast text."""
|
121
|
-
beam_width: int = 5
|
122
|
-
"""Beam width for beam search in recognition."""
|
123
|
-
canvas_size: int = 2560
|
124
|
-
"""Maximum image dimension for detection."""
|
125
|
-
contrast_ths: float = 0.1
|
126
|
-
"""Contrast threshold for preprocessing."""
|
127
|
-
decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
|
128
|
-
"""Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
|
129
|
-
height_ths: float = 0.5
|
130
|
-
"""Maximum difference in box height for merging."""
|
131
|
-
language: str | list[str] = "en"
|
132
|
-
"""Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
|
133
|
-
a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
|
134
|
-
link_threshold: float = 0.4
|
135
|
-
"""Link confidence threshold."""
|
136
|
-
low_text: float = 0.4
|
137
|
-
"""Text low-bound score."""
|
138
|
-
mag_ratio: float = 1.0
|
139
|
-
"""Image magnification ratio."""
|
140
|
-
min_size: int = 10
|
141
|
-
"""Minimum text box size in pixels."""
|
142
|
-
rotation_info: list[int] | None = None
|
143
|
-
"""List of angles to try for detection."""
|
144
|
-
slope_ths: float = 0.1
|
145
|
-
"""Maximum slope for merging text boxes."""
|
146
|
-
text_threshold: float = 0.7
|
147
|
-
"""Text confidence threshold."""
|
148
|
-
use_gpu: bool = False
|
149
|
-
"""Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
|
150
|
-
device: DeviceType = "auto"
|
151
|
-
"""Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
|
152
|
-
gpu_memory_limit: float | None = None
|
153
|
-
"""Maximum GPU memory to use in GB. None for no limit."""
|
154
|
-
fallback_to_cpu: bool = True
|
155
|
-
"""Whether to fallback to CPU if requested device is unavailable."""
|
156
|
-
width_ths: float = 0.5
|
157
|
-
"""Maximum horizontal distance for merging boxes."""
|
158
|
-
x_ths: float = 1.0
|
159
|
-
"""Maximum horizontal distance for paragraph merging."""
|
160
|
-
y_ths: float = 0.5
|
161
|
-
"""Maximum vertical distance for paragraph merging."""
|
162
|
-
ycenter_ths: float = 0.5
|
163
|
-
"""Maximum shift in y direction for merging."""
|
164
|
-
|
165
|
-
|
166
124
|
class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
167
125
|
_reader: ClassVar[Any] = None
|
168
126
|
|
@@ -179,8 +137,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
179
137
|
Raises:
|
180
138
|
OCRError: If OCR processing fails.
|
181
139
|
"""
|
182
|
-
import numpy as np # noqa: PLC0415
|
183
|
-
|
184
140
|
await self._init_easyocr(**kwargs)
|
185
141
|
|
186
142
|
beam_width = kwargs.pop("beam_width")
|
@@ -225,15 +181,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
225
181
|
|
226
182
|
@staticmethod
|
227
183
|
def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
|
228
|
-
"""Process EasyOCR result into an ExtractionResult with metadata.
|
229
|
-
|
230
|
-
Args:
|
231
|
-
result: The raw result from EasyOCR.
|
232
|
-
image: The original PIL image.
|
233
|
-
|
234
|
-
Returns:
|
235
|
-
ExtractionResult: The extraction result containing text content, mime type, and metadata.
|
236
|
-
"""
|
237
184
|
if not result:
|
238
185
|
return ExtractionResult(
|
239
186
|
content="",
|
@@ -314,38 +261,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
314
261
|
|
315
262
|
@classmethod
|
316
263
|
def _is_gpu_available(cls) -> bool:
|
317
|
-
|
318
|
-
|
319
|
-
Returns:
|
320
|
-
bool: True if GPU support is available.
|
321
|
-
"""
|
322
|
-
try:
|
323
|
-
import torch # noqa: PLC0415
|
324
|
-
|
325
|
-
return bool(torch.cuda.is_available())
|
326
|
-
except ImportError: # pragma: no cover
|
264
|
+
if not HAS_EASYOCR or torch is None:
|
327
265
|
return False
|
266
|
+
return bool(torch.cuda.is_available())
|
328
267
|
|
329
268
|
@classmethod
|
330
269
|
async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
|
331
|
-
"""Initialize EasyOCR with the provided configuration.
|
332
|
-
|
333
|
-
Args:
|
334
|
-
**kwargs: Configuration parameters for EasyOCR including language, etc.
|
335
|
-
|
336
|
-
Raises:
|
337
|
-
MissingDependencyError: If EasyOCR is not installed.
|
338
|
-
OCRError: If initialization fails.
|
339
|
-
"""
|
340
270
|
if cls._reader is not None:
|
341
271
|
return
|
342
272
|
|
343
|
-
try:
|
344
|
-
import easyocr # noqa: PLC0415
|
345
|
-
except ImportError as e: # pragma: no cover
|
273
|
+
if not HAS_EASYOCR or easyocr is None:
|
346
274
|
raise MissingDependencyError.create_for_package(
|
347
275
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
348
|
-
) from e
|
276
|
+
)
|
349
277
|
|
350
278
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
351
279
|
|
@@ -369,17 +297,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
369
297
|
|
370
298
|
@classmethod
|
371
299
|
def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
|
372
|
-
"""Resolve device configuration with backward compatibility.
|
373
|
-
|
374
|
-
Args:
|
375
|
-
**kwargs: Configuration parameters including device settings.
|
376
|
-
|
377
|
-
Returns:
|
378
|
-
DeviceInfo object for the selected device.
|
379
|
-
|
380
|
-
Raises:
|
381
|
-
ValidationError: If requested device is not available and fallback is disabled.
|
382
|
-
"""
|
383
300
|
use_gpu = kwargs.get("use_gpu", False)
|
384
301
|
device = kwargs.get("device", "auto")
|
385
302
|
memory_limit = kwargs.get("gpu_memory_limit")
|
@@ -457,8 +374,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
457
374
|
Raises:
|
458
375
|
OCRError: If OCR processing fails.
|
459
376
|
"""
|
460
|
-
import numpy as np # noqa: PLC0415
|
461
|
-
|
462
377
|
self._init_easyocr_sync(**kwargs)
|
463
378
|
|
464
379
|
beam_width = kwargs.pop("beam_width")
|
@@ -513,12 +428,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
|
|
513
428
|
if cls._reader is not None:
|
514
429
|
return
|
515
430
|
|
516
|
-
try:
|
517
|
-
import easyocr # noqa: PLC0415
|
518
|
-
except ImportError as e: # pragma: no cover
|
431
|
+
if not HAS_EASYOCR or easyocr is None:
|
519
432
|
raise MissingDependencyError.create_for_package(
|
520
433
|
dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
|
521
|
-
) from e
|
434
|
+
)
|
522
435
|
|
523
436
|
languages = cls._validate_language_code(kwargs.pop("language", "en"))
|
524
437
|
|