kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_pdf.py ADDED
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ from re import Pattern
4
+ from re import compile as compile_regex
5
+ from typing import TYPE_CHECKING, Final, cast
6
+
7
+ import pypdfium2
8
+ from anyio import Path as AsyncPath
9
+
10
+ from kreuzberg import ExtractionResult
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._string import normalize_spaces
13
+ from kreuzberg._sync import run_sync
14
+ from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
15
+ from kreuzberg.exceptions import ParsingError
16
+
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from pathlib import Path
19
+
20
+ from PIL.Image import Image
21
+
22
+
23
+ # Pattern to detect common PDF text extraction corruption:
24
+ # - Control and non-printable characters
25
+ # - Unicode replacement and invalid characters
26
+ # - Zero-width spaces and other invisible characters
27
+ CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
28
+ r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
29
+ )
30
+
31
+
32
+ def _validate_extracted_text(text: str) -> bool:
33
+ """Check if text extracted from PDF is valid or corrupted.
34
+
35
+ This checks for common indicators of corrupted PDF text extraction:
36
+ 1. Empty or whitespace-only text
37
+ 2. Control characters and other non-printable characters
38
+ 3. Unicode replacement characters
39
+ 4. Zero-width spaces and other invisible characters
40
+
41
+ Args:
42
+ text: The extracted text to validate
43
+
44
+ Returns:
45
+ True if the text appears valid, False if it seems corrupted
46
+ """
47
+ # Check for empty or whitespace-only text
48
+ if not text or not text.strip():
49
+ return False
50
+
51
+ # Check for corruption indicators
52
+ return not bool(CORRUPTED_PATTERN.search(text))
53
+
54
+
55
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
    """Render each page of a PDF file to a Pillow image.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If pypdfium2 raises a PdfiumError while opening or rendering the document.

    Returns:
        A list of Pillow Images, one per page, rendered at scale 2.0.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Compare against None explicitly: PdfDocument defines __len__, so a
        # document with zero pages is falsy and would otherwise never be closed.
        if document is not None:
            await run_sync(document.close)
78
+
79
+
80
async def _extract_pdf_text_with_ocr(
    input_file: Path,
    *,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """OCR a PDF by rendering its pages to images and running Tesseract on them.

    Args:
        input_file: The path to the PDF file.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes, forwarded to the batch OCR call.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        An ExtractionResult with the plain-text MIME type whose content is the
        OCR text of each page image joined with newlines.
    """
    page_images = await _convert_pdf_to_images(input_file)
    page_results = await batch_process_images(
        page_images, max_processes=max_processes, psm=psm, language=language
    )
    joined_text = "\n".join(page.content for page in page_results)
    return ExtractionResult(content=joined_text, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
103
+
104
+
105
async def _extract_pdf_searchable_text(input_file: Path) -> str:
    """Extract the embedded text layer of a PDF file using pypdfium2.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If pypdfium2 raises a PdfiumError while opening or reading the document.

    Returns:
        The text of all pages joined with newlines and passed through ``normalize_spaces``.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        text = "\n".join(page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document))
        return normalize_spaces(text)
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Compare against None explicitly: PdfDocument defines __len__, so a
        # document with zero pages is falsy and would otherwise never be closed.
        if document is not None:
            await run_sync(document.close)
129
+
130
+
131
async def extract_pdf_file(
    input_file: Path,
    *,
    force_ocr: bool,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file, preferring its text layer over OCR.

    The embedded text layer is used when OCR is not forced and the extracted
    text is non-empty and passes ``_validate_extracted_text``. In every other
    case the document is OCRed.

    Args:
        input_file: The path to the PDF file.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes, forwarded to the OCR step.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        An ExtractionResult with the plain-text MIME type.
    """
    if not force_ocr:
        text_layer = await _extract_pdf_searchable_text(input_file)
        if text_layer and _validate_extracted_text(text_layer):
            return ExtractionResult(content=text_layer, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

    # Either OCR was requested or the text layer was missing/corrupted.
    return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
+
160
+
161
async def extract_pdf_content(
    content: bytes,
    *,
    force_ocr: bool,
    language: SupportedLanguage = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from in-memory PDF content.

    The bytes are written to a temporary ``.pdf`` file, which is then handed to
    ``extract_pdf_file``. The temporary file is removed afterwards, including
    when writing or extraction raises.

    Args:
        content: The PDF file content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes, forwarded to the OCR step.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The ExtractionResult produced by ``extract_pdf_file``.
    """
    from kreuzberg._tmp import create_temp_file

    file_path, unlink = await create_temp_file(".pdf")
    try:
        await AsyncPath(file_path).write_bytes(content)
        return await extract_pdf_file(
            file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
        )
    finally:
        # Always clean up, otherwise a failed extraction leaks the temp file.
        await unlink()
kreuzberg/_pptx.py ADDED
@@ -0,0 +1,88 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from contextlib import suppress
5
+ from html import escape
6
+ from io import BytesIO
7
+ from typing import TYPE_CHECKING
8
+
9
+ import pptx
10
+ from anyio import Path as AsyncPath
11
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
12
+
13
+ from kreuzberg import ExtractionResult
14
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
15
+ from kreuzberg._string import normalize_spaces
16
+
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from pathlib import Path
19
+
20
+
21
async def extract_pptx_file_content(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Convert a PPTX presentation to Markdown-flavoured text.

    Each slide is introduced by an HTML comment carrying its 1-based number.
    Pictures become Markdown image references, tables become HTML tables,
    the slide title becomes a level-1 heading, other text frames are emitted
    as-is, and slide notes are appended under a "### Notes:" heading.

    Notes:
        This function is based on code vendored from `markitdown`, which has an MIT license as well.

    Args:
        file_path_or_contents: The path to the PPTX file or its contents as bytes.

    Returns:
        An ExtractionResult with the Markdown MIME type and the space-normalized content.
    """
    if isinstance(file_path_or_contents, bytes):
        raw_bytes = file_path_or_contents
    else:
        raw_bytes = await AsyncPath(file_path_or_contents).read_bytes()

    deck = pptx.Presentation(BytesIO(raw_bytes))
    markdown = ""

    for slide_number, slide in enumerate(deck.slides, start=1):
        markdown += f"\n\n<!-- Slide number: {slide_number} -->\n"
        title_shape = slide.shapes.title

        for shape in slide.shapes:
            is_picture = shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
            )

            if is_picture:
                description = ""
                with suppress(AttributeError):
                    # The alt text lives in the shape's non-visual properties.
                    description = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                image_name = re.sub(r"\W", "", shape.name) + ".jpg"
                label = description if description else shape.name
                markdown += f"\n![{label}]({image_name})\n"

            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                row_markup = []
                for row_index, row in enumerate(shape.table.rows):
                    # Only the first row is rendered with header cells.
                    cell_tag = "th" if row_index == 0 else "td"
                    cells = "".join(f"<{cell_tag}>{escape(cell.text)}</{cell_tag}>" for cell in row.cells)
                    row_markup.append(f"<tr>{cells}</tr>")

                markdown += "\n<table>" + "".join(row_markup) + "</table>\n"

            elif shape.has_text_frame:
                if shape == title_shape:
                    markdown += "# " + shape.text.lstrip() + "\n"
                else:
                    markdown += shape.text + "\n"

        markdown = markdown.strip()
        if slide.has_notes_slide:
            markdown += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame

            if notes_frame is not None:  # pragma: no branch
                markdown += notes_frame.text

            markdown = markdown.strip()

    return ExtractionResult(content=normalize_spaces(markdown), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_string.py CHANGED
@@ -4,8 +4,6 @@ from contextlib import suppress
4
4
 
5
5
  from charset_normalizer import detect
6
6
 
7
- from kreuzberg.exceptions import ParsingError
8
-
9
7
 
10
8
  def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
11
9
  """Decode a byte string safely, removing invalid sequences.
@@ -14,22 +12,21 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
14
12
  byte_data: The byte string to decode.
15
13
  encoding: The encoding to use when decoding the byte string.
16
14
 
17
- Raises:
18
- ParsingError: If the byte string could not be decoded.
19
-
20
15
  Returns:
21
16
  The decoded string.
22
17
  """
23
18
  if not byte_data:
24
19
  return ""
25
20
 
26
- encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]
21
+ # We try each encoding in order until one works
22
+ encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
27
23
 
28
- for enc in [e for e in encodings if e]:
24
+ for enc in [e for e in encodings if e]: # pragma: no cover
29
25
  with suppress(UnicodeDecodeError):
30
26
  return byte_data.decode(enc)
31
27
 
32
- raise ParsingError("Could not decode byte string. Please provide an encoding.")
28
+ # If all encodings fail, fall back to latin-1 which can handle any byte
29
+ return byte_data.decode("latin-1", errors="replace")
33
30
 
34
31
 
35
32
  def normalize_spaces(text: str) -> str:
kreuzberg/_sync.py CHANGED
@@ -1,14 +1,19 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import sys
3
4
  from functools import partial
4
5
  from typing import TYPE_CHECKING, TypeVar, cast
5
6
 
6
7
  from anyio.to_thread import run_sync as any_io_run_sync
7
- from typing_extensions import ParamSpec
8
8
 
9
9
  if TYPE_CHECKING: # pragma: no cover
10
10
  from collections.abc import Callable
11
11
 
12
+ if sys.version_info >= (3, 10):
13
+ from typing import ParamSpec
14
+ else: # pragma: no cover
15
+ from typing_extensions import ParamSpec
16
+
12
17
  T = TypeVar("T")
13
18
  P = ParamSpec("P")
14
19
 
kreuzberg/_tesseract.py CHANGED
@@ -2,23 +2,34 @@ from __future__ import annotations
2
2
 
3
3
  import re
4
4
  import subprocess
5
- from asyncio import gather
5
+ import sys
6
6
  from enum import Enum
7
+ from functools import partial
7
8
  from os import PathLike
8
- from tempfile import NamedTemporaryFile
9
- from typing import Any, Literal, TypeVar, Union
9
+ from typing import Final, Literal, TypeVar, Union, cast
10
10
 
11
+ from anyio import CapacityLimiter, create_task_group, to_process
11
12
  from anyio import Path as AsyncPath
12
13
  from PIL.Image import Image
13
14
 
15
+ from kreuzberg import ExtractionResult, ParsingError
16
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES
17
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
18
+ from kreuzberg._string import normalize_spaces
14
19
  from kreuzberg._sync import run_sync
20
+ from kreuzberg._tmp import create_temp_file
15
21
  from kreuzberg.exceptions import MissingDependencyError, OCRError
16
22
 
23
+ if sys.version_info < (3, 11): # pragma: no cover
24
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
25
+
26
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
27
+
17
28
  version_ref = {"checked": False}
18
29
 
19
30
  T = TypeVar("T", bound=Union[Image, PathLike[str], str])
20
31
 
21
- SupportedLanguages = Literal[
32
+ SupportedLanguage = Literal[
22
33
  "afr",
23
34
  "amh",
24
35
  "ara",
@@ -186,9 +197,10 @@ async def validate_tesseract_version() -> None:
186
197
  if version_ref["checked"]:
187
198
  return
188
199
 
189
- result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
190
- version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
191
- if not version_match or int(version_match.group(1)) < 5:
200
+ command = ["tesseract", "--version"]
201
+ result = await run_sync(subprocess.run, command, capture_output=True)
202
+ version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
203
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
192
204
  raise MissingDependencyError("Tesseract version 5 or above is required.")
193
205
 
194
206
  version_ref["checked"] = True
@@ -197,85 +209,96 @@ async def validate_tesseract_version() -> None:
197
209
 
198
210
 
199
211
async def process_file(
    input_file: str | PathLike[str],
    *,
    language: SupportedLanguage,
    psm: PSMMode,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> ExtractionResult:
    """Process a single image file using Tesseract OCR.

    Tesseract is run in a worker process and writes its output to a temporary
    text file, which is read back and removed before returning.

    Args:
        input_file: The path to the image file to process.
        language: The language code for OCR.
        psm: Page segmentation mode.
        max_processes: Capacity of the limiter passed to the worker-process call.
            Defaults to DEFAULT_MAX_PROCESSES.

    Raises:
        OCRError: If Tesseract exits with a non-zero return code, or if a
            RuntimeError/OSError occurs while running it or reading its output.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    output_path, unlink = await create_temp_file(".txt")
    try:
        # Tesseract appends ".txt" to the output base on its own, so pass the
        # path without its suffix. with_suffix("") strips only the final
        # extension, whereas str.replace would also mangle any ".txt" that
        # happens to occur elsewhere in the path.
        output_base = str(output_path.with_suffix(""))
        command = [
            "tesseract",
            str(input_file),
            output_base,
            "-l",
            language,
            "--psm",
            str(psm.value),
        ]

        # NOTE(review): a fresh CapacityLimiter is created on every call, so it
        # presumably does not bound concurrency across separate calls - confirm intent.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            limiter=CapacityLimiter(max_processes),
            cancellable=True,
        )

        if result.returncode != 0:
            raise OCRError("OCR failed with a non-0 return code.")

        output = await AsyncPath(output_path).read_text("utf-8")
        return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
    except (RuntimeError, OSError) as e:
        raise OCRError("Failed to OCR using tesseract") from e
    finally:
        await unlink()
261
+
262
+
263
async def process_image(
    image: Image,
    *,
    language: SupportedLanguage,
    psm: PSMMode,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> ExtractionResult:
    """Process a single Pillow Image using Tesseract OCR.

    The image is saved as a temporary PNG file, OCRed through ``process_file``,
    and the temporary file is removed afterwards, including when saving or
    OCR raises.

    Args:
        image: The Pillow Image to process.
        language: The language code for OCR.
        psm: Page segmentation mode.
        max_processes: Forwarded to ``process_file``. Defaults to DEFAULT_MAX_PROCESSES.

    Returns:
        ExtractionResult: The extracted text from the image.
    """
    image_path, unlink = await create_temp_file(".png")
    try:
        await run_sync(image.save, str(image_path), format="PNG")
        return await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
    finally:
        # Always clean up, otherwise a failed save/OCR leaks the temporary PNG.
        await unlink()
263
286
 
264
287
 
265
288
  async def process_image_with_tesseract(
266
289
  image: Image | PathLike[str] | str,
267
290
  *,
268
- language: SupportedLanguages = "eng",
291
+ language: SupportedLanguage = "eng",
269
292
  psm: PSMMode = PSMMode.AUTO,
270
- **kwargs: Any,
271
- ) -> str:
293
+ max_processes: int = DEFAULT_MAX_PROCESSES,
294
+ ) -> ExtractionResult:
272
295
  """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
273
296
 
274
297
  Args:
275
298
  image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
276
299
  language: The language code for OCR (default: "eng").
277
300
  psm: Page segmentation mode (default: PSMMode.AUTO).
278
- **kwargs: Additional Tesseract configuration options as key-value pairs.
301
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
279
302
 
280
303
  Raises:
281
304
  ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -286,10 +309,10 @@ async def process_image_with_tesseract(
286
309
  await validate_tesseract_version()
287
310
 
288
311
  if isinstance(image, Image):
289
- return await process_image(image, language=language, psm=psm, **kwargs)
312
+ return await process_image(image, language=language, psm=psm, max_processes=max_processes)
290
313
 
291
314
  if isinstance(image, (PathLike, str)):
292
- return await process_file(image, language=language, psm=psm, **kwargs)
315
+ return await process_file(image, language=language, psm=psm, max_processes=max_processes)
293
316
 
294
317
  raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
295
318
 
@@ -297,22 +320,36 @@ async def process_image_with_tesseract(
297
320
async def batch_process_images(
    images: list[T],
    *,
    language: SupportedLanguage = "eng",
    psm: PSMMode = PSMMode.AUTO,
    max_processes: int = DEFAULT_MAX_PROCESSES,
) -> list[ExtractionResult]:
    """Run Tesseract OCR on multiple images concurrently, preserving input order.

    One task per image is started inside a task group; each task stores its
    result at the index of its input image.

    Args:
        images: A list of Pillow Images, paths or strings to process.
        language: The language code for OCR (default: "eng").
        psm: Page segmentation mode (default: PSMMode.AUTO).
        max_processes: Forwarded to the per-image OCR call. Defaults to DEFAULT_MAX_PROCESSES.

    Raises:
        ParsingError: If the task group raises an ExceptionGroup while processing the images.

    Returns:
        List of ExtractionResult objects, one per input image.
    """
    await validate_tesseract_version()

    # One slot per input; every slot is overwritten by its task before returning.
    slots: list[ExtractionResult | None] = [None] * len(images)

    async def _ocr_into_slot(position: int, item: T) -> None:
        slots[position] = await process_image_with_tesseract(
            item, language=language, psm=psm, max_processes=max_processes
        )

    try:
        async with create_task_group() as task_group:
            for position, item in enumerate(images):
                task_group.start_soon(_ocr_into_slot, position, item)
    except ExceptionGroup as eg:
        raise ParsingError("Failed to process images with Tesseract") from eg

    return cast(list[ExtractionResult], slots)
kreuzberg/_tmp.py ADDED
@@ -0,0 +1,37 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import suppress
4
+ from pathlib import Path
5
+ from tempfile import NamedTemporaryFile
6
+ from typing import TYPE_CHECKING, Callable
7
+
8
+ from anyio import Path as AsyncPath
9
+
10
+ from kreuzberg._sync import run_sync
11
+
12
+ if TYPE_CHECKING: # pragma: no cover
13
+ from collections.abc import Coroutine
14
+
15
+
16
async def create_temp_file(
    extension: str, content: bytes | None = None
) -> tuple[Path, Callable[[], Coroutine[None, None, None]]]:
    """Create a closed, named temporary file and a callback that removes it.

    The file is created with ``delete=False``, so it stays on disk after its
    handle is closed; the caller is responsible for awaiting the returned
    callback.

    Args:
        extension: The file extension, used as the temporary file's suffix.
        content: Optional bytes to write to the file; skipped when None or empty.

    Returns:
        A tuple of the temporary file path and an async ``unlink`` callable
        that deletes the file, ignoring OS-level errors.
    """
    handle = await run_sync(NamedTemporaryFile, suffix=extension, delete=False)
    temp_name = handle.name

    if content:
        await AsyncPath(temp_name).write_bytes(content)
    await run_sync(handle.close)

    async def unlink() -> None:
        # Best-effort removal: PermissionError is a subclass of OSError, so
        # suppressing OSError covers both.
        with suppress(OSError):
            await AsyncPath(temp_name).unlink(missing_ok=True)

    return Path(temp_name), unlink