kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +54 -74
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py CHANGED
@@ -7,16 +7,17 @@ import queue
7
7
  import signal
8
8
  import time
9
9
  import traceback
10
- from dataclasses import dataclass, field
11
10
  from io import StringIO
12
11
  from pathlib import Path
13
- from typing import TYPE_CHECKING, Any, Literal
12
+ from typing import TYPE_CHECKING, Any
14
13
 
15
14
  import anyio
16
15
  import msgspec
16
+ import pandas as pd
17
17
  from PIL import Image
18
18
 
19
- from kreuzberg._types import TableData
19
+ from kreuzberg._types import GMFTConfig, TableData
20
+ from kreuzberg._utils._cache import get_table_cache
20
21
  from kreuzberg._utils._sync import run_sync
21
22
  from kreuzberg.exceptions import MissingDependencyError, ParsingError
22
23
 
@@ -27,117 +28,6 @@ if TYPE_CHECKING:
27
28
  from pandas import DataFrame
28
29
 
29
30
 
30
- @dataclass(unsafe_hash=True, slots=True)
31
- class GMFTConfig:
32
- """Configuration options for GMFT.
33
-
34
- This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
35
- """
36
-
37
- verbosity: int = 0
38
- """
39
- Verbosity level for logging.
40
-
41
- 0: errors only
42
- 1: print warnings
43
- 2: print warnings and info
44
- 3: print warnings, info, and debug
45
- """
46
- formatter_base_threshold: float = 0.3
47
- """
48
- Base threshold for the confidence demanded of a table feature (row/column).
49
-
50
- Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
51
- """
52
- cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
53
- default_factory=lambda: {
54
- 0: 0.3,
55
- 1: 0.3,
56
- 2: 0.3,
57
- 3: 0.3,
58
- 4: 0.5,
59
- 5: 0.5,
60
- 6: 99,
61
- },
62
- hash=False,
63
- )
64
- """
65
- Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
66
-
67
- But low confidences may be better than too high confidence (see formatter_base_threshold)
68
- """
69
- detector_base_threshold: float = 0.9
70
- """Minimum confidence score required for a table"""
71
- remove_null_rows: bool = True
72
- """
73
- Flag to remove rows with no text.
74
- """
75
- enable_multi_header: bool = False
76
- """
77
- Enable multi-indices in the dataframe.
78
-
79
- If false, then multiple headers will be merged column-wise.
80
- """
81
- semantic_spanning_cells: bool = False
82
- """
83
- [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
84
- """
85
- semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
86
- """
87
- [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
88
-
89
- Possible values: 'algorithm', 'deep', None.
90
-
91
- 'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
92
- 'deep': merges headers according to the spanning cells detected by the Table Transformer.
93
- None: headers are not duplicated.
94
- """
95
- large_table_if_n_rows_removed: int = 8
96
- """
97
- If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
98
- """
99
- large_table_threshold: int = 10
100
- """
101
- With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
102
-
103
- Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
104
- """
105
- large_table_row_overlap_threshold: float = 0.2
106
- """
107
- With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
108
-
109
- Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
110
- """
111
- large_table_maximum_rows: int = 1000
112
- """
113
- Maximum number of rows allowed for a large table.
114
- """
115
- force_large_table_assumption: bool | None = None
116
- """
117
- Force the large table assumption to be applied, regardless of the number of rows and overlap.
118
- """
119
- total_overlap_reject_threshold: float = 0.9
120
- """
121
- Reject if total overlap is > 90% of table area.
122
- """
123
- total_overlap_warn_threshold: float = 0.1
124
- """
125
- Warn if total overlap is > 10% of table area.
126
- """
127
- nms_warn_threshold: int = 5
128
- """
129
- Warn if non maxima suppression removes > 5 rows.
130
- """
131
- iob_reject_threshold: float = 0.05
132
- """
133
- Reject if iob between textbox and cell is < 5%.
134
- """
135
- iob_warn_threshold: float = 0.5
136
- """
137
- Warn if iob between textbox and cell is < 50%.
138
- """
139
-
140
-
141
31
  async def extract_tables(
142
32
  file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
143
33
  ) -> list[TableData]:
@@ -158,8 +48,6 @@ async def extract_tables(
158
48
  Returns:
159
49
  A list of table data dictionaries.
160
50
  """
161
- from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
162
-
163
51
  # Determine if we should use isolated process # ~keep
164
52
  if use_isolated_process is None:
165
53
  use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -211,15 +99,15 @@ async def extract_tables(
211
99
  return result
212
100
 
213
101
  try:
214
- from gmft.auto import ( # type: ignore[attr-defined] # noqa: PLC0415 # noqa: PLC0415
102
+ from gmft.auto import ( # type: ignore[attr-defined] # noqa: PLC0415
215
103
  AutoTableDetector,
216
104
  AutoTableFormatter,
217
105
  )
218
106
  from gmft.detectors.tatr import TATRDetectorConfig # type: ignore[attr-defined] # noqa: PLC0415
219
- from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415 # noqa: PLC0415
220
- from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415 # noqa: PLC0415
107
+ from gmft.formatters.tatr import TATRFormatConfig # noqa: PLC0415
108
+ from gmft.pdf_bindings.pdfium import PyPDFium2Document # noqa: PLC0415
221
109
 
222
- formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
110
+ formatter: Any = AutoTableFormatter( # type: ignore[no-untyped-call]
223
111
  config=TATRFormatConfig(
224
112
  verbosity=config.verbosity,
225
113
  formatter_base_threshold=config.formatter_base_threshold,
@@ -235,7 +123,7 @@ async def extract_tables(
235
123
  force_large_table_assumption=config.force_large_table_assumption,
236
124
  )
237
125
  )
238
- detector: Any = AutoTableDetector( # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
126
+ detector: Any = AutoTableDetector( # type: ignore[no-untyped-call]
239
127
  config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
240
128
  )
241
129
  doc = await run_sync(PyPDFium2Document, str(file_path))
@@ -287,8 +175,6 @@ def extract_tables_sync(
287
175
  Returns:
288
176
  A list of table data dictionaries.
289
177
  """
290
- from kreuzberg._utils._cache import get_table_cache # noqa: PLC0415
291
-
292
178
  # Determine if we should use isolated process # ~keep
293
179
  if use_isolated_process is None:
294
180
  use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -542,7 +428,6 @@ def _extract_tables_isolated(
542
428
  tables = []
543
429
  for table_dict in result:
544
430
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
545
- import pandas as pd # noqa: PLC0415
546
431
 
547
432
  if table_dict["df_csv"] is None:
548
433
  df = pd.DataFrame(columns=table_dict["df_columns"])
@@ -620,38 +505,29 @@ async def _extract_tables_isolated_async(
620
505
 
621
506
  try:
622
507
 
623
- async def wait_for_result() -> tuple[bool, Any]:
508
+ def get_result_sync() -> tuple[bool, Any]:
624
509
  while True:
625
510
  try:
626
- return result_queue.get_nowait() # type: ignore[no-any-return]
511
+ return result_queue.get(timeout=0.1) # type: ignore[no-any-return]
627
512
  except queue.Empty: # noqa: PERF203
628
- await anyio.sleep(0.1)
629
513
  if not process.is_alive():
630
- # Process died without putting result # ~keep
631
514
  if process.exitcode == -signal.SIGSEGV:
632
515
  raise ParsingError(
633
516
  "GMFT process crashed with segmentation fault",
634
- context={
635
- "file_path": str(file_path),
636
- "exit_code": process.exitcode,
637
- },
517
+ context={"file_path": str(file_path), "exit_code": process.exitcode},
638
518
  ) from None
639
519
  raise ParsingError(
640
520
  f"GMFT process died unexpectedly with exit code {process.exitcode}",
641
- context={
642
- "file_path": str(file_path),
643
- "exit_code": process.exitcode,
644
- },
521
+ context={"file_path": str(file_path), "exit_code": process.exitcode},
645
522
  ) from None
646
523
 
647
524
  with anyio.fail_after(timeout):
648
- success, result = await wait_for_result()
525
+ success, result = await anyio.to_thread.run_sync(get_result_sync)
649
526
 
650
527
  if success:
651
528
  tables = []
652
529
  for table_dict in result:
653
530
  img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
654
- import pandas as pd # noqa: PLC0415
655
531
 
656
532
  if table_dict["df_csv"] is None:
657
533
  df = pd.DataFrame(columns=table_dict["df_columns"])
kreuzberg/_language_detection.py CHANGED
@@ -1,9 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- from dataclasses import dataclass
4
3
  from functools import lru_cache
5
4
  from typing import TYPE_CHECKING, Any
6
5
 
6
+ from kreuzberg._types import LanguageDetectionConfig
7
7
  from kreuzberg.exceptions import MissingDependencyError
8
8
 
9
9
  if TYPE_CHECKING:
@@ -23,27 +23,6 @@ except ImportError: # pragma: no cover
23
23
  _CACHE_SIZE = 128
24
24
 
25
25
 
26
- @dataclass(frozen=True, slots=True)
27
- class LanguageDetectionConfig:
28
- """Configuration for language detection.
29
-
30
- Attributes:
31
- low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
32
- Defaults to True for better memory efficiency.
33
- top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
34
- multilingual: If True, uses multilingual detection to handle mixed-language text.
35
- If False, uses single language detection. Defaults to False.
36
- cache_dir: Custom directory for model cache. If None, uses system default.
37
- allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
38
- """
39
-
40
- low_memory: bool = True
41
- top_k: int = 3
42
- multilingual: bool = False
43
- cache_dir: str | None = None
44
- allow_fallback: bool = True
45
-
46
-
47
26
  def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
48
27
  """Create FastLangDetectConfig from our config."""
49
28
  if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
kreuzberg/_mcp/__init__.py CHANGED
@@ -1,5 +1,3 @@
1
- """MCP server for Kreuzberg text extraction."""
2
-
3
1
  from .server import mcp
4
2
 
5
3
  __all__ = ["mcp"]
kreuzberg/_mcp/server.py CHANGED
@@ -1,5 +1,3 @@
1
- """Kreuzberg MCP server implementation."""
2
-
3
1
  from __future__ import annotations
4
2
 
5
3
  import base64
@@ -10,11 +8,10 @@ import msgspec
10
8
  from mcp.server import FastMCP
11
9
  from mcp.types import TextContent
12
10
 
13
- from kreuzberg._config import try_discover_config
11
+ from kreuzberg._config import discover_config
14
12
  from kreuzberg._types import ExtractionConfig, OcrBackendType
15
13
  from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
16
14
 
17
- # Create the MCP server
18
15
  mcp = FastMCP("Kreuzberg Text Extraction")
19
16
 
20
17
 
@@ -27,14 +24,11 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
27
24
  Returns:
28
25
  ExtractionConfig instance.
29
26
  """
30
- # Try to discover configuration from files
31
- base_config = try_discover_config()
27
+ base_config = discover_config()
32
28
 
33
29
  if base_config is None:
34
- # No config file found, use defaults
35
30
  return ExtractionConfig(**kwargs)
36
31
 
37
- # Merge discovered config with tool parameters (tool params take precedence)
38
32
  config_dict: dict[str, Any] = {
39
33
  "force_ocr": base_config.force_ocr,
40
34
  "chunk_content": base_config.chunk_content,
@@ -50,7 +44,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
50
44
  "gmft_config": base_config.gmft_config,
51
45
  }
52
46
 
53
- # Override with provided parameters
54
47
  config_dict = config_dict | kwargs
55
48
 
56
49
  return ExtractionConfig(**config_dict)
@@ -189,7 +182,7 @@ def get_default_config() -> str:
189
182
  @mcp.resource("config://discovered")
190
183
  def get_discovered_config() -> str:
191
184
  """Get the discovered configuration from config files."""
192
- config = try_discover_config()
185
+ config = discover_config()
193
186
  if config is None:
194
187
  return "No configuration file found"
195
188
  return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
kreuzberg/_mime_types.py CHANGED
@@ -4,6 +4,7 @@ from mimetypes import guess_type
4
4
  from pathlib import Path
5
5
  from typing import TYPE_CHECKING, Final
6
6
 
7
+ from kreuzberg._utils._cache import get_mime_cache
7
8
  from kreuzberg.exceptions import ValidationError
8
9
 
9
10
  if TYPE_CHECKING: # pragma: no cover
@@ -191,8 +192,6 @@ def validate_mime_type(
191
192
  return _validate_explicit_mime_type(mime_type)
192
193
 
193
194
  if file_path:
194
- from kreuzberg._utils._cache import get_mime_cache # noqa: PLC0415
195
-
196
195
  path = Path(file_path)
197
196
 
198
197
  try:
kreuzberg/_ocr/_easyocr.py CHANGED
@@ -1,15 +1,14 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import warnings
4
- from dataclasses import dataclass
5
- from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
4
+ from typing import TYPE_CHECKING, Any, ClassVar, Final
6
5
 
7
6
  from PIL import Image
8
7
 
9
8
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
10
9
  from kreuzberg._ocr._base import OCRBackend
11
- from kreuzberg._types import ExtractionResult, Metadata
12
- from kreuzberg._utils._device import DeviceInfo, DeviceType, validate_device_request
10
+ from kreuzberg._types import EasyOCRConfig, ExtractionResult, Metadata
11
+ from kreuzberg._utils._device import DeviceInfo, validate_device_request
13
12
  from kreuzberg._utils._string import normalize_spaces
14
13
  from kreuzberg._utils._sync import run_sync
15
14
  from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
@@ -22,6 +21,18 @@ try: # pragma: no cover
22
21
  except ImportError: # pragma: no cover
23
22
  from typing_extensions import Unpack
24
23
 
24
+ try:
25
+ import easyocr
26
+ import numpy as np
27
+ import torch
28
+
29
+ HAS_EASYOCR = True
30
+ except ImportError:
31
+ HAS_EASYOCR = False
32
+ easyocr = None
33
+ np = None # type: ignore[assignment]
34
+ torch = None # type: ignore[assignment]
35
+
25
36
 
26
37
  EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
27
38
  "abq",
@@ -110,59 +121,6 @@ EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
110
121
  }
111
122
 
112
123
 
113
- @dataclass(unsafe_hash=True, frozen=True, slots=True)
114
- class EasyOCRConfig:
115
- """Configuration options for EasyOCR."""
116
-
117
- add_margin: float = 0.1
118
- """Extend bounding boxes in all directions."""
119
- adjust_contrast: float = 0.5
120
- """Target contrast level for low contrast text."""
121
- beam_width: int = 5
122
- """Beam width for beam search in recognition."""
123
- canvas_size: int = 2560
124
- """Maximum image dimension for detection."""
125
- contrast_ths: float = 0.1
126
- """Contrast threshold for preprocessing."""
127
- decoder: Literal["greedy", "beamsearch", "wordbeamsearch"] = "greedy"
128
- """Decoder method. Options: 'greedy', 'beamsearch', 'wordbeamsearch'."""
129
- height_ths: float = 0.5
130
- """Maximum difference in box height for merging."""
131
- language: str | list[str] = "en"
132
- """Language or languages to use for OCR. Can be a single language code (e.g., 'en'),
133
- a comma-separated string of language codes (e.g., 'en,ch_sim'), or a list of language codes."""
134
- link_threshold: float = 0.4
135
- """Link confidence threshold."""
136
- low_text: float = 0.4
137
- """Text low-bound score."""
138
- mag_ratio: float = 1.0
139
- """Image magnification ratio."""
140
- min_size: int = 10
141
- """Minimum text box size in pixels."""
142
- rotation_info: list[int] | None = None
143
- """List of angles to try for detection."""
144
- slope_ths: float = 0.1
145
- """Maximum slope for merging text boxes."""
146
- text_threshold: float = 0.7
147
- """Text confidence threshold."""
148
- use_gpu: bool = False
149
- """Whether to use GPU for inference. DEPRECATED: Use 'device' parameter instead."""
150
- device: DeviceType = "auto"
151
- """Device to use for inference. Options: 'cpu', 'cuda', 'mps', 'auto'."""
152
- gpu_memory_limit: float | None = None
153
- """Maximum GPU memory to use in GB. None for no limit."""
154
- fallback_to_cpu: bool = True
155
- """Whether to fallback to CPU if requested device is unavailable."""
156
- width_ths: float = 0.5
157
- """Maximum horizontal distance for merging boxes."""
158
- x_ths: float = 1.0
159
- """Maximum horizontal distance for paragraph merging."""
160
- y_ths: float = 0.5
161
- """Maximum vertical distance for paragraph merging."""
162
- ycenter_ths: float = 0.5
163
- """Maximum shift in y direction for merging."""
164
-
165
-
166
124
  class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
167
125
  _reader: ClassVar[Any] = None
168
126
 
@@ -179,8 +137,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
179
137
  Raises:
180
138
  OCRError: If OCR processing fails.
181
139
  """
182
- import numpy as np # noqa: PLC0415
183
-
184
140
  await self._init_easyocr(**kwargs)
185
141
 
186
142
  beam_width = kwargs.pop("beam_width")
@@ -225,15 +181,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
225
181
 
226
182
  @staticmethod
227
183
  def _process_easyocr_result(result: list[Any], image: Image.Image) -> ExtractionResult:
228
- """Process EasyOCR result into an ExtractionResult with metadata.
229
-
230
- Args:
231
- result: The raw result from EasyOCR.
232
- image: The original PIL image.
233
-
234
- Returns:
235
- ExtractionResult: The extraction result containing text content, mime type, and metadata.
236
- """
237
184
  if not result:
238
185
  return ExtractionResult(
239
186
  content="",
@@ -314,38 +261,19 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
314
261
 
315
262
  @classmethod
316
263
  def _is_gpu_available(cls) -> bool:
317
- """Check if GPU is available for EasyOCR.
318
-
319
- Returns:
320
- bool: True if GPU support is available.
321
- """
322
- try:
323
- import torch # noqa: PLC0415
324
-
325
- return bool(torch.cuda.is_available())
326
- except ImportError: # pragma: no cover
264
+ if not HAS_EASYOCR or torch is None:
327
265
  return False
266
+ return bool(torch.cuda.is_available())
328
267
 
329
268
  @classmethod
330
269
  async def _init_easyocr(cls, **kwargs: Unpack[EasyOCRConfig]) -> None:
331
- """Initialize EasyOCR with the provided configuration.
332
-
333
- Args:
334
- **kwargs: Configuration parameters for EasyOCR including language, etc.
335
-
336
- Raises:
337
- MissingDependencyError: If EasyOCR is not installed.
338
- OCRError: If initialization fails.
339
- """
340
270
  if cls._reader is not None:
341
271
  return
342
272
 
343
- try:
344
- import easyocr # noqa: PLC0415
345
- except ImportError as e: # pragma: no cover
273
+ if not HAS_EASYOCR or easyocr is None:
346
274
  raise MissingDependencyError.create_for_package(
347
275
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
348
- ) from e
276
+ )
349
277
 
350
278
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
351
279
 
@@ -369,17 +297,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
369
297
 
370
298
  @classmethod
371
299
  def _resolve_device_config(cls, **kwargs: Unpack[EasyOCRConfig]) -> DeviceInfo:
372
- """Resolve device configuration with backward compatibility.
373
-
374
- Args:
375
- **kwargs: Configuration parameters including device settings.
376
-
377
- Returns:
378
- DeviceInfo object for the selected device.
379
-
380
- Raises:
381
- ValidationError: If requested device is not available and fallback is disabled.
382
- """
383
300
  use_gpu = kwargs.get("use_gpu", False)
384
301
  device = kwargs.get("device", "auto")
385
302
  memory_limit = kwargs.get("gpu_memory_limit")
@@ -457,8 +374,6 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
457
374
  Raises:
458
375
  OCRError: If OCR processing fails.
459
376
  """
460
- import numpy as np # noqa: PLC0415
461
-
462
377
  self._init_easyocr_sync(**kwargs)
463
378
 
464
379
  beam_width = kwargs.pop("beam_width")
@@ -513,12 +428,10 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
513
428
  if cls._reader is not None:
514
429
  return
515
430
 
516
- try:
517
- import easyocr # noqa: PLC0415
518
- except ImportError as e: # pragma: no cover
431
+ if not HAS_EASYOCR or easyocr is None:
519
432
  raise MissingDependencyError.create_for_package(
520
433
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
521
- ) from e
434
+ )
522
435
 
523
436
  languages = cls._validate_language_code(kwargs.pop("language", "en"))
524
437