kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_types.py ADDED
@@ -0,0 +1,71 @@
1
+ from __future__ import annotations
2
+
3
+ import sys
4
+ from typing import NamedTuple, TypedDict
5
+
6
+ if sys.version_info < (3, 11): # pragma: no cover
7
+ from typing_extensions import NotRequired
8
+ else: # pragma: no cover
9
+ from typing import NotRequired
10
+
11
+
12
+ class Metadata(TypedDict, total=False):
13
+ """Document metadata.
14
+
15
+ All fields are optional but will only be included if they contain non-empty values.
16
+ Any field that would be empty or None is omitted from the dictionary.
17
+
18
+ Different documents and extraction methods will yield different metadata.
19
+ """
20
+
21
+ title: NotRequired[str]
22
+ """Document title."""
23
+ subtitle: NotRequired[str]
24
+ """Document subtitle."""
25
+ abstract: NotRequired[str | list[str]]
26
+ """Document abstract, summary or description."""
27
+ authors: NotRequired[list[str]]
28
+ """List of document authors."""
29
+ date: NotRequired[str]
30
+ """Document date as string to preserve original format."""
31
+ subject: NotRequired[str]
32
+ """Document subject or topic."""
33
+ description: NotRequired[str]
34
+ """Extended description."""
35
+ keywords: NotRequired[list[str]]
36
+ """Keywords or tags."""
37
+ categories: NotRequired[list[str]]
38
+ """Categories or classifications."""
39
+ version: NotRequired[str]
40
+ """Version identifier."""
41
+ language: NotRequired[str]
42
+ """Document language code."""
43
+ references: NotRequired[list[str]]
44
+ """Reference entries."""
45
+ citations: NotRequired[list[str]]
46
+ """Citation identifiers."""
47
+ copyright: NotRequired[str]
48
+ """Copyright information."""
49
+ license: NotRequired[str]
50
+ """License information."""
51
+ identifier: NotRequired[str]
52
+ """Document identifier."""
53
+ publisher: NotRequired[str]
54
+ """Publisher name."""
55
+ contributors: NotRequired[list[str]]
56
+ """Additional contributors."""
57
+ creator: NotRequired[str]
58
+ """Document creator."""
59
+ institute: NotRequired[str | list[str]]
60
+ """Institute or organization."""
61
+
62
+
63
class ExtractionResult(NamedTuple):
    """Outcome of a single extraction: content, its mime type and its metadata."""

    content: str
    """Extracted content."""
    mime_type: str
    """Mime type of the content."""
    metadata: Metadata
    """Metadata of the content."""
kreuzberg/_xlsx.py ADDED
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ from io import StringIO
5
+ from typing import TYPE_CHECKING, cast
6
+
7
+ from anyio import Path as AsyncPath
8
+ from anyio import create_task_group
9
+ from python_calamine import CalamineWorkbook
10
+
11
+ from kreuzberg import ExtractionResult, ParsingError
12
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
13
+ from kreuzberg._pandoc import process_file_with_pandoc
14
+ from kreuzberg._string import normalize_spaces
15
+ from kreuzberg._sync import run_sync
16
+ from kreuzberg._tmp import create_temp_file
17
+
18
+ if TYPE_CHECKING: # pragma: no cover
19
+ from pathlib import Path
20
+
21
+
22
async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
    """Extract text from an XLSX file by converting each sheet to CSV and then to markdown.

    All sheets are converted concurrently. Each converted sheet is placed under a
    ``## <sheet name>`` heading and the sections are joined in workbook order.

    Args:
        input_file: The path to the XLSX file.

    Returns:
        The extraction result, with markdown content and empty metadata.

    Raises:
        ParsingError: If the XLSX file could not be parsed.
    """
    # Local import: needed only for shielding the temp-file cleanup below.
    from anyio import CancelScope

    try:
        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
        sheet_names = workbook.sheet_names

        # One slot per sheet; each task fills its own slot so the output keeps workbook order.
        results: list[str] = [""] * len(sheet_names)

        async def convert_sheet_to_text(index: int, sheet_name: str) -> None:
            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)

            with StringIO() as csv_buffer:
                csv.writer(csv_buffer).writerows(values)
                csv_data = csv_buffer.getvalue()

            csv_path, unlink = await create_temp_file(".csv")
            try:
                # Pandoc reads its input as UTF-8, so do not rely on the platform default encoding.
                await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
                result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
            finally:
                # Always remove the temporary file, even when conversion fails or a sibling
                # task's failure cancels this one; the shield lets the cleanup finish.
                with CancelScope(shield=True):
                    await unlink()

            results[index] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"

        async with create_task_group() as tg:
            for index, sheet_name in enumerate(sheet_names):
                tg.start_soon(convert_sheet_to_text, index, sheet_name)

        return ExtractionResult(
            content="\n\n".join(results),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata={},
        )
    except Exception as e:
        raise ParsingError(
            "Could not extract text from XLSX",
            context={
                "error": str(e),
            },
        ) from e
76
+
77
+
78
async def extract_xlsx_content(content: bytes) -> ExtractionResult:
    """Extract text from the raw bytes of an XLSX file.

    The bytes are written to a temporary ``.xlsx`` file, which is then handed to
    ``extract_xlsx_file``. The temporary file is removed afterwards, whether or
    not the extraction succeeded.

    Args:
        content: The XLSX file content.

    Returns:
        The extracted text content.

    Raises:
        ParsingError: If the XLSX content could not be parsed.
    """
    # Local import: needed only for shielding the temp-file cleanup below.
    from anyio import CancelScope

    xlsx_path, unlink = await create_temp_file(".xlsx")
    try:
        await AsyncPath(xlsx_path).write_bytes(content)
        return await extract_xlsx_file(xlsx_path)
    finally:
        # Run the cleanup even if this task is being cancelled.
        with CancelScope(shield=True):
            await unlink()
kreuzberg/extraction.py CHANGED
@@ -9,54 +9,62 @@ It includes vendored code:
9
9
 
10
10
  from __future__ import annotations
11
11
 
12
- from mimetypes import guess_type
12
+ from functools import partial
13
+ from io import BytesIO
13
14
  from pathlib import Path
14
- from tempfile import NamedTemporaryFile
15
- from typing import NamedTuple
15
+ from typing import TYPE_CHECKING, cast
16
16
 
17
+ import anyio
17
18
  from anyio import Path as AsyncPath
19
+ from PIL.Image import open as open_image
18
20
 
19
- from kreuzberg._extractors import (
20
- extract_content_with_pandoc,
21
- extract_file_with_pandoc,
22
- extract_html_string,
23
- extract_pdf_file,
24
- extract_pptx_file,
25
- extract_xlsx_file,
26
- )
21
+ from kreuzberg import ExtractionResult
22
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES
23
+ from kreuzberg._html import extract_html_string
27
24
  from kreuzberg._mime_types import (
28
25
  EXCEL_MIME_TYPE,
29
26
  HTML_MIME_TYPE,
30
- IMAGE_MIME_TYPE_EXT_MAP,
31
27
  IMAGE_MIME_TYPES,
32
- MARKDOWN_MIME_TYPE,
33
28
  PANDOC_SUPPORTED_MIME_TYPES,
34
29
  PDF_MIME_TYPE,
35
- PLAIN_TEXT_MIME_TYPE,
36
30
  POWER_POINT_MIME_TYPE,
37
31
  SUPPORTED_MIME_TYPES,
32
+ validate_mime_type,
38
33
  )
34
+ from kreuzberg._pandoc import process_content_with_pandoc, process_file_with_pandoc
35
+ from kreuzberg._pdf import (
36
+ extract_pdf_content,
37
+ extract_pdf_file,
38
+ )
39
+ from kreuzberg._pptx import extract_pptx_file_content
39
40
  from kreuzberg._string import safe_decode
40
- from kreuzberg._tesseract import process_image_with_tesseract
41
+ from kreuzberg._tesseract import PSMMode, SupportedLanguage, process_image_with_tesseract
42
+ from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
41
43
  from kreuzberg.exceptions import ValidationError
42
44
 
45
+ if TYPE_CHECKING:
46
+ from collections.abc import Sequence
47
+ from os import PathLike
43
48
 
44
- class ExtractionResult(NamedTuple):
45
- """The result of a file extraction."""
46
-
47
- content: str
48
- """The extracted content."""
49
- mime_type: str
50
- """The mime type of the content."""
51
49
 
52
-
53
- async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
50
+ async def extract_bytes(
51
+ content: bytes,
52
+ mime_type: str,
53
+ *,
54
+ force_ocr: bool = False,
55
+ language: SupportedLanguage = "eng",
56
+ max_processes: int = DEFAULT_MAX_PROCESSES,
57
+ psm: PSMMode = PSMMode.AUTO,
58
+ ) -> ExtractionResult:
54
59
  """Extract the textual content from a given byte string representing a file's contents.
55
60
 
56
61
  Args:
57
62
  content: The content to extract.
58
63
  mime_type: The mime type of the content.
59
- force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
64
+ force_ocr: Whether to force OCR on PDF files that have a text layer.
65
+ language: The language code for OCR. Defaults to "eng".
66
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
67
+ psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
60
68
 
61
69
  Raises:
62
70
  ValidationError: If the mime type is not supported.
@@ -71,50 +79,54 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
71
79
  )
72
80
 
73
81
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
74
- with NamedTemporaryFile(suffix=".pdf") as temp_file:
75
- temp_file.write(content)
76
- return ExtractionResult(
77
- content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
78
- )
82
+ return await extract_pdf_content(
83
+ content, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
84
+ )
79
85
 
80
86
  if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
81
- return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
87
+ return await extract_xlsx_content(content)
82
88
 
83
89
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
84
- with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
85
- temp_file.write(content)
86
- return ExtractionResult(
87
- content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
88
- )
90
+ return await process_image_with_tesseract(
91
+ open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
92
+ )
89
93
 
90
94
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
91
95
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
92
96
  ):
93
- return ExtractionResult(
94
- content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
95
- )
97
+ return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
96
98
 
97
99
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
98
- return ExtractionResult(content=await extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
100
+ return await extract_pptx_file_content(content)
99
101
 
100
102
  if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
101
- return ExtractionResult(content=await extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
103
+ return await extract_html_string(content)
102
104
 
103
105
  return ExtractionResult(
104
106
  content=safe_decode(content),
105
107
  mime_type=mime_type,
108
+ metadata={},
106
109
  )
107
110
 
108
111
 
109
112
  async def extract_file(
110
- file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
113
+ file_path: PathLike[str] | str,
114
+ mime_type: str | None = None,
115
+ *,
116
+ force_ocr: bool = False,
117
+ language: SupportedLanguage = "eng",
118
+ max_processes: int = DEFAULT_MAX_PROCESSES,
119
+ psm: PSMMode = PSMMode.AUTO,
111
120
  ) -> ExtractionResult:
112
121
  """Extract the textual content from a given file.
113
122
 
114
123
  Args:
115
124
  file_path: The path to the file.
116
- mime_type: The mime type of the file.
117
- force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
125
+ mime_type: The mime type of the content.
126
+ force_ocr: Whether to force OCR on PDF files that have a text layer.
127
+ language: The language code for OCR. Defaults to "eng".
128
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
129
+ psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
118
130
 
119
131
  Raises:
120
132
  ValidationError: If the mime type is not supported.
@@ -122,40 +134,233 @@ async def extract_file(
122
134
  Returns:
123
135
  The extracted content and the mime type of the content.
124
136
  """
125
- file_path = Path(file_path)
126
- mime_type = mime_type or guess_type(file_path.name)[0]
127
- if not mime_type: # pragma: no cover
128
- raise ValidationError("Could not determine the mime type of the file.", context={"file_path": str(file_path)})
137
+ input_file = await AsyncPath(file_path).resolve()
129
138
 
130
- if mime_type not in SUPPORTED_MIME_TYPES or not any(mime_type.startswith(value) for value in SUPPORTED_MIME_TYPES):
131
- raise ValidationError(
132
- f"Unsupported mime type: {mime_type}",
133
- context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
134
- )
139
+ mime_type = validate_mime_type(input_file, mime_type)
135
140
 
136
- if not await AsyncPath(file_path).exists():
137
- raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
141
+ if not await input_file.exists():
142
+ raise ValidationError("The file does not exist.", context={"input_file": str(input_file)})
138
143
 
139
144
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
140
- return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
145
+ return await extract_pdf_file(
146
+ Path(input_file), force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
147
+ )
141
148
 
142
149
  if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
143
- return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
150
+ return await extract_xlsx_file(Path(input_file))
144
151
 
145
152
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
146
- return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
153
+ return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
147
154
 
148
155
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
149
156
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
150
157
  ):
151
- return ExtractionResult(
152
- content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
153
- )
158
+ return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
154
159
 
155
160
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
156
- return ExtractionResult(content=await extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
161
+ return await extract_pptx_file_content(Path(input_file))
157
162
 
158
163
  if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
159
- return ExtractionResult(content=await extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
164
+ return await extract_html_string(Path(input_file))
165
+
166
+ return ExtractionResult(content=safe_decode(await input_file.read_bytes()), mime_type=mime_type, metadata={})
167
+
168
+
169
async def batch_extract_file(
    file_paths: Sequence[PathLike[str] | str],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Run ``extract_file`` over several files concurrently.

    Args:
        file_paths: A sequence of paths to files to extract text from.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input paths.
    """
    # Results are keyed by input position because the tasks may finish in any order.
    extracted: dict[int, ExtractionResult] = {}

    async def _extract_file(position: int, target: PathLike[str] | str) -> None:
        extracted[position] = await extract_file(
            target,
            force_ocr=force_ocr,
            max_processes=max_processes,
            psm=psm,
            language=language,
        )

    async with anyio.create_task_group() as task_group:
        for position, target in enumerate(file_paths):
            task_group.start_soon(_extract_file, position, target)

    return [extracted[position] for position in range(len(file_paths))]
206
+
207
+
208
async def batch_extract_bytes(
    contents: Sequence[tuple[bytes, str]],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Run ``extract_bytes`` over several (content, mime_type) pairs concurrently.

    Args:
        contents: A sequence of tuples containing (content, mime_type) pairs.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input contents.
    """
    # Results are keyed by input position because the tasks may finish in any order.
    extracted: dict[int, ExtractionResult] = {}

    async def _extract_bytes(position: int, payload: bytes, payload_mime_type: str) -> None:
        extracted[position] = await extract_bytes(
            payload,
            payload_mime_type,
            force_ocr=force_ocr,
            max_processes=max_processes,
            psm=psm,
            language=language,
        )

    async with anyio.create_task_group() as task_group:
        for position, (payload, payload_mime_type) in enumerate(contents):
            task_group.start_soon(_extract_bytes, position, payload, payload_mime_type)

    return [extracted[position] for position in range(len(contents))]
246
+
160
247
 
161
- return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
248
+ ### Sync proxies
249
+
250
+
251
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Blocking counterpart of ``extract_bytes``.

    Args:
        content: The content to extract.
        mime_type: The mime type of the content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted content and the mime type of the content.
    """

    async def _run() -> ExtractionResult:
        # anyio.run forwards positional arguments only, so the keyword options are bound here.
        return await extract_bytes(
            content,
            mime_type,
            max_processes=max_processes,
            force_ocr=force_ocr,
            language=language,
            psm=psm,
        )

    return anyio.run(_run)
277
+
278
+
279
def extract_file_sync(
    file_path: PathLike[str] | str,
    mime_type: str | None = None,
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Synchronous version of extract_file.

    Accepts the same path types as ``extract_file`` (any ``PathLike[str]`` or a
    plain string) and runs the coroutine to completion with ``anyio.run``.

    Args:
        file_path: The path to the file.
        mime_type: The mime type of the content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted content and the mime type of the content.
    """
    # anyio.run forwards positional arguments only, so the keyword options are bound with partial.
    handler = partial(
        extract_file,
        file_path,
        mime_type,
        max_processes=max_processes,
        force_ocr=force_ocr,
        language=language,
        psm=psm,
    )
    return anyio.run(handler)
305
+
306
+
307
def batch_extract_file_sync(
    file_paths: Sequence[PathLike[str] | str],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Blocking counterpart of ``batch_extract_file``.

    Args:
        file_paths: A sequence of paths to files to extract text from.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input paths.
    """

    async def _run() -> list[ExtractionResult]:
        # anyio.run forwards positional arguments only, so the keyword options are bound here.
        return await batch_extract_file(
            file_paths,
            force_ocr=force_ocr,
            max_processes=max_processes,
            language=language,
            psm=psm,
        )

    return anyio.run(_run)
336
+
337
+
338
def batch_extract_bytes_sync(
    contents: Sequence[tuple[bytes, str]],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Blocking counterpart of ``batch_extract_bytes``.

    Args:
        contents: A sequence of tuples containing (content, mime_type) pairs.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input contents.
    """

    async def _run() -> list[ExtractionResult]:
        # anyio.run forwards positional arguments only, so the keyword options are bound here.
        return await batch_extract_bytes(
            contents,
            force_ocr=force_ocr,
            max_processes=max_processes,
            language=language,
            psm=psm,
        )

    return anyio.run(_run)