kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_gmft.py CHANGED
@@ -7,16 +7,17 @@ import queue
 import signal
 import time
 import traceback
-from dataclasses import dataclass, field
 from io import StringIO
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Literal
+from typing import TYPE_CHECKING, Any
 
 import anyio
 import msgspec
+import pandas as pd
 from PIL import Image
 
-from kreuzberg._types import TableData
+from kreuzberg._types import GMFTConfig, TableData
+from kreuzberg._utils._cache import get_table_cache
 from kreuzberg._utils._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError
 
@@ -27,139 +28,9 @@ if TYPE_CHECKING:
     from pandas import DataFrame
 
 
-@dataclass(unsafe_hash=True, slots=True)
-class GMFTConfig:
-    """Configuration options for GMFT.
-
-    This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
-    """
-
-    verbosity: int = 0
-    """
-    Verbosity level for logging.
-
-    0: errors only
-    1: print warnings
-    2: print warnings and info
-    3: print warnings, info, and debug
-    """
-    formatter_base_threshold: float = 0.3
-    """
-    Base threshold for the confidence demanded of a table feature (row/column).
-
-    Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
-    """
-    cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
-        default_factory=lambda: {
-            0: 0.3,
-            1: 0.3,
-            2: 0.3,
-            3: 0.3,
-            4: 0.5,
-            5: 0.5,
-            6: 99,
-        },
-        hash=False,
-    )
-    """
-    Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
-
-    But low confidences may be better than too high confidence (see formatter_base_threshold)
-    """
-    detector_base_threshold: float = 0.9
-    """Minimum confidence score required for a table"""
-    remove_null_rows: bool = True
-    """
-    Flag to remove rows with no text.
-    """
-    enable_multi_header: bool = False
-    """
-    Enable multi-indices in the dataframe.
-
-    If false, then multiple headers will be merged column-wise.
-    """
-    semantic_spanning_cells: bool = False
-    """
-    [Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
-    """
-    semantic_hierarchical_left_fill: Literal["algorithm", "deep"] | None = "algorithm"
-    """
-    [Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
-
-    Possible values: 'algorithm', 'deep', None.
-
-    'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
-    'deep': merges headers according to the spanning cells detected by the Table Transformer.
-    None: headers are not duplicated.
-    """
-    large_table_if_n_rows_removed: int = 8
-    """
-    If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
-    """
-    large_table_threshold: int = 10
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
-    """
-    large_table_row_overlap_threshold: float = 0.2
-    """
-    With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
-
-    Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
-    """
-    large_table_maximum_rows: int = 1000
-    """
-    Maximum number of rows allowed for a large table.
-    """
-    force_large_table_assumption: bool | None = None
-    """
-    Force the large table assumption to be applied, regardless of the number of rows and overlap.
-    """
-    total_overlap_reject_threshold: float = 0.9
-    """
-    Reject if total overlap is > 90% of table area.
-    """
-    total_overlap_warn_threshold: float = 0.1
-    """
-    Warn if total overlap is > 10% of table area.
-    """
-    nms_warn_threshold: int = 5
-    """
-    Warn if non maxima suppression removes > 5 rows.
-    """
-    iob_reject_threshold: float = 0.05
-    """
-    Reject if iob between textbox and cell is < 5%.
-    """
-    iob_warn_threshold: float = 0.5
-    """
-    Warn if iob between textbox and cell is < 50%.
-    """
-
-
 async def extract_tables(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Extracts tables from a PDF file.
-
-    This function takes a file path to a PDF file, and an optional configuration object.
-    It returns a list of strings, where each string is a markdown-formatted table.
-
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-
-    Raises:
-        MissingDependencyError: Raised when the required dependencies are not installed.
-
-    Returns:
-        A list of table data dictionaries.
-    """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
-
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
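Note that GMFTConfig is not removed, it has moved to kreuzberg._types (see the import hunk above), and the table-cache import moved to module scope. A minimal usage sketch for 3.13, assuming only what this diff shows: the field names from the removed dataclass, the extract_tables_sync signature, and the KREUZBERG_GMFT_ISOLATED default.

from kreuzberg._gmft import extract_tables_sync
from kreuzberg._types import GMFTConfig

# Field names carried over unchanged from the dataclass deleted above.
config = GMFTConfig(
    detector_base_threshold=0.85,  # minimum confidence for a detected table
    remove_null_rows=True,         # drop rows with no text
    enable_multi_header=False,     # merge multiple headers column-wise
)

# use_isolated_process=None defers to KREUZBERG_GMFT_ISOLATED (default "true"),
# matching the os.environ check in extract_tables / extract_tables_sync.
tables = extract_tables_sync("report.pdf", config=config, use_isolated_process=None)
print(f"extracted {len(tables)} tables")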
@@ -211,15 +82,15 @@ async def extract_tables(
         return result
 
     try:
-        from gmft.auto import (  # type: ignore[attr-defined] # noqa: PLC0415 # noqa: PLC0415
+        from gmft.auto import (  # type: ignore[attr-defined] # noqa: PLC0415
             AutoTableDetector,
             AutoTableFormatter,
         )
         from gmft.detectors.tatr import TATRDetectorConfig  # type: ignore[attr-defined] # noqa: PLC0415
-        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415 # noqa: PLC0415
-        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415 # noqa: PLC0415
+        from gmft.formatters.tatr import TATRFormatConfig  # noqa: PLC0415
+        from gmft.pdf_bindings.pdfium import PyPDFium2Document  # noqa: PLC0415
 
-        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
+        formatter: Any = AutoTableFormatter(  # type: ignore[no-untyped-call]
             config=TATRFormatConfig(
                 verbosity=config.verbosity,
                 formatter_base_threshold=config.formatter_base_threshold,
@@ -235,7 +106,7 @@ async def extract_tables(
                 force_large_table_assumption=config.force_large_table_assumption,
             )
         )
-        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call] # type: ignore[no-untyped-call]
+        detector: Any = AutoTableDetector(  # type: ignore[no-untyped-call]
             config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
         )
         doc = await run_sync(PyPDFium2Document, str(file_path))
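The hunks above (which only deduplicate repeated noqa/type-ignore comments) also make the config hand-off visible: kreuzberg's GMFTConfig fields are copied one-to-one onto gmft's TATR config objects. A condensed sketch limited to the fields shown in this diff:

from gmft.auto import AutoTableDetector, AutoTableFormatter
from gmft.detectors.tatr import TATRDetectorConfig
from gmft.formatters.tatr import TATRFormatConfig

from kreuzberg._types import GMFTConfig

config = GMFTConfig()
# Formatter and detector are built by copying GMFTConfig fields across.
formatter = AutoTableFormatter(
    config=TATRFormatConfig(
        verbosity=config.verbosity,
        formatter_base_threshold=config.formatter_base_threshold,
    )
)
detector = AutoTableDetector(
    config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold)
)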
@@ -276,19 +147,6 @@ async def extract_tables(
 def extract_tables_sync(
     file_path: str | PathLike[str], config: GMFTConfig | None = None, use_isolated_process: bool | None = None
 ) -> list[TableData]:
-    """Synchronous wrapper for extract_tables.
-
-    Args:
-        file_path: The path to the PDF file.
-        config: An optional configuration object.
-        use_isolated_process: Whether to use an isolated process for extraction.
-            If None, uses environment variable KREUZBERG_GMFT_ISOLATED (default: True).
-
-    Returns:
-        A list of table data dictionaries.
-    """
-    from kreuzberg._utils._cache import get_table_cache  # noqa: PLC0415
-
     # Determine if we should use isolated process  # ~keep
     if use_isolated_process is None:
         use_isolated_process = os.environ.get("KREUZBERG_GMFT_ISOLATED", "true").lower() in ("true", "1", "yes")
@@ -390,13 +248,6 @@ def _extract_tables_in_process(
     config_dict: dict[str, Any],
     result_queue: queue.Queue[tuple[bool, Any]],
 ) -> None:
-    """Extract tables in an isolated process to handle potential segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config_dict: Serialized GMFTConfig as a dict
-        result_queue: Queue to put results or errors
-    """
     signal.signal(signal.SIGINT, signal.SIG_IGN)
 
     try:
@@ -480,19 +331,6 @@ def _extract_tables_isolated(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,
 ) -> list[TableData]:
-    """Extract tables using an isolated process to handle segfaults.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
     config = config or GMFTConfig()
     config_dict = msgspec.to_builtins(config)
 
@@ -542,7 +380,6 @@ def _extract_tables_isolated(
         tables = []
         for table_dict in result:
             img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-            import pandas as pd  # noqa: PLC0415
 
             if table_dict["df_csv"] is None:
                 df = pd.DataFrame(columns=table_dict["df_columns"])
@@ -592,19 +429,6 @@ async def _extract_tables_isolated_async(
     config: GMFTConfig | None = None,
     timeout: float = 300.0,  # noqa: ASYNC109
 ) -> list[TableData]:
-    """Async version of extract_tables_isolated using asyncio.
-
-    Args:
-        file_path: Path to the PDF file
-        config: GMFT configuration
-        timeout: Maximum time to wait for extraction
-
-    Returns:
-        List of extracted tables
-
-    Raises:
-        RuntimeError: If extraction fails or times out
-    """
    config = config or GMFTConfig()
    config_dict = msgspec.to_builtins(config)
 
@@ -620,38 +444,29 @@ async def _extract_tables_isolated_async(
 
     try:
 
-        async def wait_for_result() -> tuple[bool, Any]:
+        def get_result_sync() -> tuple[bool, Any]:
             while True:
                 try:
-                    return result_queue.get_nowait()  # type: ignore[no-any-return]
+                    return result_queue.get(timeout=0.1)  # type: ignore[no-any-return]
                 except queue.Empty:  # noqa: PERF203
-                    await anyio.sleep(0.1)
                     if not process.is_alive():
-                        # Process died without putting result  # ~keep
                         if process.exitcode == -signal.SIGSEGV:
                             raise ParsingError(
                                 "GMFT process crashed with segmentation fault",
-                                context={
-                                    "file_path": str(file_path),
-                                    "exit_code": process.exitcode,
-                                },
+                                context={"file_path": str(file_path), "exit_code": process.exitcode},
                             ) from None
                         raise ParsingError(
                             f"GMFT process died unexpectedly with exit code {process.exitcode}",
-                            context={
-                                "file_path": str(file_path),
-                                "exit_code": process.exitcode,
-                            },
+                            context={"file_path": str(file_path), "exit_code": process.exitcode},
                         ) from None
 
         with anyio.fail_after(timeout):
-            success, result = await wait_for_result()
+            success, result = await anyio.to_thread.run_sync(get_result_sync)
 
         if success:
             tables = []
             for table_dict in result:
                 img = Image.open(io.BytesIO(table_dict["cropped_image_bytes"]))
-                import pandas as pd  # noqa: PLC0415
 
                 if table_dict["df_csv"] is None:
                     df = pd.DataFrame(columns=table_dict["df_columns"])
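The last hunk replaces a busy-wait (get_nowait() plus anyio.sleep every 100 ms) with a blocking queue read pushed onto a worker thread, keeping the event loop idle while waiting. A standalone sketch of the pattern, with illustrative names (poll_queue and wait_with_deadline are not kreuzberg API):

import multiprocessing
import queue

import anyio
import anyio.to_thread


def poll_queue(result_queue: multiprocessing.Queue) -> object:
    # Blocking get with a short timeout so a dead producer can be noticed;
    # the real code checks process.is_alive() here and raises ParsingError.
    while True:
        try:
            return result_queue.get(timeout=0.1)
        except queue.Empty:
            continue


async def wait_with_deadline(result_queue: multiprocessing.Queue, timeout: float) -> object:
    # The overall deadline is still enforced by anyio.fail_after; only the
    # blocking get() moves off the event loop onto a worker thread.
    with anyio.fail_after(timeout):
        return await anyio.to_thread.run_sync(poll_queue, result_queue)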
kreuzberg/_language_detection.py CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
+from kreuzberg._types import LanguageDetectionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
@@ -23,29 +23,7 @@ except ImportError:  # pragma: no cover
 _CACHE_SIZE = 128
 
 
-@dataclass(frozen=True, slots=True)
-class LanguageDetectionConfig:
-    """Configuration for language detection.
-
-    Attributes:
-        low_memory: If True, uses a smaller model (~200MB). If False, uses a larger, more accurate model.
-            Defaults to True for better memory efficiency.
-        top_k: Maximum number of languages to return for multilingual detection. Defaults to 3.
-        multilingual: If True, uses multilingual detection to handle mixed-language text.
-            If False, uses single language detection. Defaults to False.
-        cache_dir: Custom directory for model cache. If None, uses system default.
-        allow_fallback: If True, falls back to small model if large model fails. Defaults to True.
-    """
-
-    low_memory: bool = True
-    top_k: int = 3
-    multilingual: bool = False
-    cache_dir: str | None = None
-    allow_fallback: bool = True
-
-
 def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangDetectConfig | None:
-    """Create FastLangDetectConfig from our config."""
     if not HAS_FAST_LANGDETECT or FastLangDetectConfig is None:
         return None
 
@@ -60,19 +38,6 @@ def _create_fast_langdetect_config(config: LanguageDetectionConfig) -> FastLangD
 
 @lru_cache(maxsize=_CACHE_SIZE)
 def detect_languages(text: str, config: LanguageDetectionConfig | None = None) -> list[str] | None:
-    """Detect the most probable languages in the given text using fast-langdetect.
-
-    Args:
-        text: The text to analyze.
-        config: Configuration for language detection. If None, uses defaults.
-
-    Returns:
-        A list of detected language codes in lowercase (e.g., ['en', 'de', 'fr']),
-        or None if detection fails.
-
-    Raises:
-        MissingDependencyError: If fast-langdetect is not installed.
-    """
     if not HAS_FAST_LANGDETECT or detect is None or detect_multilingual is None:
         raise MissingDependencyError.create_for_package(
             dependency_group="langdetect", functionality="language detection", package_name="fast-langdetect"
kreuzberg/_mcp/__init__.py CHANGED
@@ -1,5 +1,3 @@
-"""MCP server for Kreuzberg text extraction."""
-
 from .server import mcp
 
 __all__ = ["mcp"]
kreuzberg/_mcp/server.py CHANGED
@@ -1,5 +1,3 @@
-"""Kreuzberg MCP server implementation."""
-
 from __future__ import annotations
 
 import base64
@@ -10,11 +8,10 @@ import msgspec
 from mcp.server import FastMCP
 from mcp.types import TextContent
 
-from kreuzberg._config import try_discover_config
+from kreuzberg._config import discover_config
 from kreuzberg._types import ExtractionConfig, OcrBackendType
 from kreuzberg.extraction import extract_bytes_sync, extract_file_sync
 
-# Create the MCP server
 mcp = FastMCP("Kreuzberg Text Extraction")
 
 
@@ -27,14 +24,11 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     Returns:
         ExtractionConfig instance.
     """
-    # Try to discover configuration from files
-    base_config = try_discover_config()
+    base_config = discover_config()
 
     if base_config is None:
-        # No config file found, use defaults
         return ExtractionConfig(**kwargs)
 
-    # Merge discovered config with tool parameters (tool params take precedence)
     config_dict: dict[str, Any] = {
         "force_ocr": base_config.force_ocr,
         "chunk_content": base_config.chunk_content,
@@ -50,7 +44,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
         "gmft_config": base_config.gmft_config,
     }
 
-    # Override with provided parameters
     config_dict = config_dict | kwargs
 
     return ExtractionConfig(**config_dict)
@@ -189,7 +182,7 @@ def get_default_config() -> str:
 @mcp.resource("config://discovered")
 def get_discovered_config() -> str:
     """Get the discovered configuration from config files."""
-    config = try_discover_config()
+    config = discover_config()
     if config is None:
         return "No configuration file found"
     return json.dumps(msgspec.to_builtins(config, order="deterministic"), indent=2)
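try_discover_config is renamed to discover_config, while the override semantics in _create_config_with_overrides are unchanged: a plain dict union in which explicit tool parameters win. A sketch of just the merge rule, with placeholder values:

from typing import Any

# Base values as they would come from a discovered config file.
base: dict[str, Any] = {"force_ocr": False, "chunk_content": True}
# Explicit tool parameters passed by the MCP caller.
overrides: dict[str, Any] = {"force_ocr": True}

# PEP 584 dict union: the right-hand operand takes precedence on key conflicts.
merged = base | overrides
assert merged == {"force_ocr": True, "chunk_content": True}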
kreuzberg/_mime_types.py CHANGED
@@ -4,6 +4,7 @@ from mimetypes import guess_type
 from pathlib import Path
 from typing import TYPE_CHECKING, Final
 
+from kreuzberg._utils._cache import get_mime_cache
 from kreuzberg.exceptions import ValidationError
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -172,27 +173,10 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
 def validate_mime_type(
     *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
 ) -> str:
-    """Validate and detect the MIME type for a given file.
-
-    Args:
-        file_path: The path to the file.
-        mime_type: Optional explicit MIME type. If provided, this will be validated.
-            If not provided, the function will attempt to detect the MIME type.
-        check_file_exists: Whether to check if the file exists. Default is True.
-            Set to False in tests where you want to validate a mime type without an actual file.
-
-    Raises:
-        ValidationError: If the MIME type is not supported or cannot be determined.
-
-    Returns:
-        The validated MIME type.
-    """
     if mime_type:
         return _validate_explicit_mime_type(mime_type)
 
     if file_path:
-        from kreuzberg._utils._cache import get_mime_cache  # noqa: PLC0415
-
         path = Path(file_path)
 
         try:
@@ -228,7 +212,6 @@ def validate_mime_type(
 
 
 def _validate_explicit_mime_type(mime_type: str) -> str:
-    """Validate an explicitly provided MIME type."""
     if mime_type in SUPPORTED_MIME_TYPES:
         return mime_type
 
@@ -243,7 +226,6 @@ def _validate_explicit_mime_type(mime_type: str) -> str:
 
 
 def _detect_mime_type_uncached(file_path: PathLike[str] | str | None = None, check_file_exists: bool = True) -> str:
-    """Detect MIME type without caching (internal function)."""
    if file_path and check_file_exists:
        path = Path(file_path)
        if not path.exists():
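The cache import inside validate_mime_type moved to module scope, but the behavior is unchanged. A usage sketch based on the signature shown above ("document.pdf" is a placeholder path):

from kreuzberg._mime_types import validate_mime_type
from kreuzberg.exceptions import ValidationError

# Explicit MIME type: validated against SUPPORTED_MIME_TYPES, no file access.
mime = validate_mime_type(mime_type="application/pdf", check_file_exists=False)

# Detection from a path: results are memoized via the mime cache (get_mime_cache).
try:
    mime = validate_mime_type(file_path="document.pdf")
except ValidationError:
    pass  # unsupported or undetectable MIME type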
kreuzberg/_ocr/_base.py CHANGED
@@ -16,98 +16,26 @@ T = TypeVar("T")
 
 
 class OCRBackend(ABC, Generic[T]):
-    """Abstract base class for Optical Character Recognition (OCR) backend implementations.
-
-    This class provides the blueprint for OCR backend implementations,
-    offering both synchronous and asynchronous methods to process images
-    and files for text extraction.
-    """
-
     @abstractmethod
-    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
 
     @abstractmethod
-    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Asynchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
 
     @abstractmethod
-    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process an image and extract its text and metadata.
-
-        Args:
-            image: An instance of PIL.Image representing the input image.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_image_sync(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult: ...
 
     @abstractmethod
-    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
-        """Synchronously process a file and extract its text and metadata.
-
-        Args:
-            path: A Path object representing the file to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            The extraction result object
-        """
-        ...
+    def process_file_sync(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult: ...
 
     def process_batch_sync(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Synchronously process a batch of files and extract their text and metadata.
-
-        Default implementation processes files sequentially. Backends can override
-        for more efficient batch processing.
-
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         return [self.process_file_sync(path, **kwargs) for path in paths]  # pragma: no cover
 
     async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
-        """Asynchronously process a batch of files and extract their text and metadata.
-
-        Default implementation processes files concurrently. Backends can override
-        for more efficient batch processing.
-
-        Args:
-            paths: List of Path objects representing files to be processed.
-            **kwargs: Any kwargs related to the given backend
-
-        Returns:
-            List of extraction result objects in the same order as input paths
-        """
         from kreuzberg._utils._sync import run_taskgroup  # noqa: PLC0415
 
         tasks = [self.process_file(path, **kwargs) for path in paths]
         return await run_taskgroup(*tasks)  # pragma: no cover
 
     def __hash__(self) -> int:
-        """Hash function for allowing caching."""
         return hash(type(self).__name__)  # pragma: no cover
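The abstract methods keep their signatures; only the docstrings were stripped, so subclassing is unchanged. A minimal backend sketch: DummyConfig and DummyOCR are illustrative, and the ExtractionResult constructor fields shown are an assumption, not taken from this diff.

from pathlib import Path
from typing import TypedDict

from PIL.Image import Image
from typing_extensions import Unpack

from kreuzberg._ocr._base import OCRBackend
from kreuzberg._types import ExtractionResult


class DummyConfig(TypedDict, total=False):
    language: str


class DummyOCR(OCRBackend[DummyConfig]):
    async def process_image(self, image: Image, **kwargs: Unpack[DummyConfig]) -> ExtractionResult:
        return self.process_image_sync(image, **kwargs)

    async def process_file(self, path: Path, **kwargs: Unpack[DummyConfig]) -> ExtractionResult:
        return self.process_file_sync(path, **kwargs)

    def process_image_sync(self, image: Image, **kwargs: Unpack[DummyConfig]) -> ExtractionResult:
        # Assumed constructor fields; check kreuzberg._types.ExtractionResult.
        return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])

    def process_file_sync(self, path: Path, **kwargs: Unpack[DummyConfig]) -> ExtractionResult:
        return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])

The batch helpers (process_batch_sync, process_batch) and __hash__ are inherited unchanged from the base class.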