kreuzberg 2.0.1__py3-none-any.whl → 2.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ from ._tesseract import PSMMode
1
2
  from ._types import ExtractionResult, Metadata
2
3
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
4
  from .extraction import (
@@ -15,6 +16,7 @@ __all__ = [
15
16
  "Metadata",
16
17
  "MissingDependencyError",
17
18
  "OCRError",
19
+ "PSMMode",
18
20
  "ParsingError",
19
21
  "ValidationError",
20
22
  "batch_extract_bytes",
kreuzberg/_constants.py CHANGED
@@ -3,4 +3,6 @@ from __future__ import annotations
3
3
  from multiprocessing import cpu_count
4
4
  from typing import Final
5
5
 
6
- DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
6
+ DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
7
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
8
+ MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
kreuzberg/_html.py CHANGED
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg import ExtractionResult
9
9
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._string import normalize_spaces, safe_decode
11
- from kreuzberg._sync import run_sync
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
28
27
  if isinstance(file_path_or_contents, bytes)
29
28
  else await AsyncPath(file_path_or_contents).read_text()
30
29
  )
31
- result = await run_sync(html_to_markdown.convert_to_markdown, content)
30
+ result = html_to_markdown.convert_to_markdown(content)
32
31
  return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_pandoc.py CHANGED
@@ -1,21 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- import subprocess
3
+ import re
4
4
  import sys
5
5
  from functools import partial
6
6
  from json import JSONDecodeError, loads
7
7
  from typing import TYPE_CHECKING, Any, Final, Literal, cast
8
8
 
9
- from anyio import CapacityLimiter, create_task_group, to_process
10
9
  from anyio import Path as AsyncPath
10
+ from anyio import run_process
11
11
 
12
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
12
+ from kreuzberg import ValidationError
13
+ from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
14
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
14
15
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_taskgroup
16
17
  from kreuzberg._tmp import create_temp_file
17
18
  from kreuzberg._types import ExtractionResult, Metadata
18
- from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
19
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError
19
20
 
20
21
  if TYPE_CHECKING: # pragma: no cover
21
22
  from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING: # pragma: no cover
24
25
  if sys.version_info < (3, 11): # pragma: no cover
25
26
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
27
 
27
-
28
28
  version_ref: Final[dict[str, bool]] = {"checked": False}
29
29
 
30
-
31
30
  # Block-level node types in Pandoc AST
32
31
  BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
33
32
  BLOCK_PARA: Final = "Para" # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
229
228
 
230
229
 
231
230
  def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
232
- if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
233
- mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
234
- ):
235
- raise ValidationError(
236
- f"Unsupported mime type: {mime_type}",
237
- context={
238
- "mime_type": mime_type,
239
- "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
240
- },
231
+ if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
232
+ return pandoc_type
233
+
234
+ if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
235
+ return next(
236
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
241
237
  )
242
238
 
243
- return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
244
- MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
245
- )
239
+ raise ValidationError(f"Unsupported mime type: {mime_type}")
246
240
 
247
241
 
248
242
  async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
251
245
  return
252
246
 
253
247
  command = ["pandoc", "--version"]
254
- result = await run_sync(subprocess.run, command, capture_output=True)
255
- version = result.stdout.decode().split("\n")[0].split()[1]
256
- if not version.startswith("3."):
257
- raise MissingDependencyError("Pandoc version 3 or above is required.")
248
+ result = await run_process(command)
249
+
250
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
251
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
252
+ raise MissingDependencyError("Pandoc version 2 or above is required")
258
253
 
259
254
  version_ref["checked"] = True
260
255
 
261
256
  except FileNotFoundError as e:
262
- raise MissingDependencyError("Pandoc is not installed.") from e
257
+ raise MissingDependencyError("Pandoc is not installed") from e
263
258
 
264
259
 
265
- async def _handle_extract_metadata(
266
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
- ) -> Metadata:
260
+ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
268
261
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
262
  metadata_file, unlink = await create_temp_file(".json")
270
263
  try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
276
269
  "--standalone",
277
270
  "--quiet",
278
271
  "--output",
279
- metadata_file,
272
+ str(metadata_file),
280
273
  ]
281
274
 
282
- result = await to_process.run_sync(
283
- partial(subprocess.run, capture_output=True),
284
- command,
285
- cancellable=True,
286
- limiter=CapacityLimiter(max_processes),
287
- )
275
+ result = await run_process(command)
288
276
 
289
277
  if result.returncode != 0:
290
278
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@ async def _handle_extract_metadata(
297
285
  await unlink()
298
286
 
299
287
 
300
- async def _handle_extract_file(
301
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
302
- ) -> str:
288
+ async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
303
289
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
290
  output_path, unlink = await create_temp_file(".md")
305
291
  try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(
315
301
 
316
302
  command.extend(["--output", str(output_path)])
317
303
 
318
- result = await to_process.run_sync(
319
- partial(subprocess.run, capture_output=True),
320
- command,
321
- cancellable=True,
322
- limiter=CapacityLimiter(max_processes),
323
- )
304
+ result = await run_process(command)
324
305
 
325
306
  if result.returncode != 0:
326
307
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
334
315
  await unlink()
335
316
 
336
317
 
337
- async def process_file_with_pandoc(
338
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
- ) -> ExtractionResult:
318
+ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
340
319
  """Process a single file using Pandoc and convert to markdown.
341
320
 
342
321
  Args:
343
322
  input_file: The path to the file to process.
344
323
  mime_type: The mime type of the file.
345
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
324
 
347
325
  Raises:
348
326
  ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(
354
332
 
355
333
  _get_pandoc_type_from_mime_type(mime_type)
356
334
 
357
- metadata: Metadata = {}
358
- content: str = ""
359
-
360
335
  try:
361
- async with create_task_group() as tg:
362
-
363
- async def _get_metadata() -> None:
364
- nonlocal metadata
365
- metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
-
367
- async def _get_content() -> None:
368
- nonlocal content
369
- content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
336
+ metadata, content = await run_taskgroup(
337
+ partial(_handle_extract_metadata, input_file, mime_type=mime_type),
338
+ partial(_handle_extract_file, input_file, mime_type=mime_type),
339
+ )
370
340
 
371
- tg.start_soon(_get_metadata)
372
- tg.start_soon(_get_content)
341
+ return ExtractionResult(
342
+ content=normalize_spaces(cast(str, content)),
343
+ metadata=cast(Metadata, metadata),
344
+ mime_type=MARKDOWN_MIME_TYPE,
345
+ )
373
346
  except ExceptionGroup as eg:
374
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
-
376
- return ExtractionResult(
377
- content=normalize_spaces(content),
378
- metadata=metadata,
379
- mime_type=MARKDOWN_MIME_TYPE,
380
- )
347
+ raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
381
348
 
382
349
 
383
- async def process_content_with_pandoc(
384
- content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
- ) -> ExtractionResult:
350
+ async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
386
351
  """Process content using Pandoc and convert to markdown.
387
352
 
388
353
  Args:
389
354
  content: The content to process.
390
355
  mime_type: The mime type of the content.
391
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
392
356
 
393
357
  Returns:
394
358
  ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
397
361
  input_file, unlink = await create_temp_file(f".{extension}")
398
362
 
399
363
  await AsyncPath(input_file).write_bytes(content)
400
- result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
364
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type)
401
365
 
402
366
  await unlink()
403
367
  return result
kreuzberg/_pdf.py CHANGED
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
67
67
  document: pypdfium2.PdfDocument | None = None
68
68
  try:
69
69
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
70
- return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
70
+ return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
71
71
  except pypdfium2.PdfiumError as e:
72
72
  raise ParsingError(
73
73
  "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -154,7 +154,6 @@ async def extract_pdf_file(
154
154
  and _validate_extracted_text(content)
155
155
  ):
156
156
  return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
157
-
158
157
  return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
158
 
160
159
 
kreuzberg/_string.py CHANGED
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
22
22
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
23
23
 
24
24
  for enc in [e for e in encodings if e]: # pragma: no cover
25
- with suppress(UnicodeDecodeError):
25
+ with suppress(UnicodeDecodeError, LookupError):
26
26
  return byte_data.decode(enc)
27
27
 
28
28
  # If all encodings fail, fall back to latin-1 which can handle any byte
kreuzberg/_sync.py CHANGED
@@ -4,10 +4,11 @@ import sys
4
4
  from functools import partial
5
5
  from typing import TYPE_CHECKING, TypeVar, cast
6
6
 
7
+ from anyio import create_task_group
7
8
  from anyio.to_thread import run_sync as any_io_run_sync
8
9
 
9
10
  if TYPE_CHECKING: # pragma: no cover
10
- from collections.abc import Callable
11
+ from collections.abc import Callable, Coroutine
11
12
 
12
13
  if sys.version_info >= (3, 10):
13
14
  from typing import ParamSpec
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
30
31
  The result of the synchronous function.
31
32
  """
32
33
  handler = partial(sync_fn, **kwargs)
33
- return cast(T, await any_io_run_sync(handler, *args)) # pyright: ignore [reportCallIssue]
34
+ return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
35
+
36
+
37
+ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
38
+ """Run a list of coroutines concurrently.
39
+
40
+ Args:
41
+ *async_tasks: The list of coroutines to run.
42
+
43
+ Returns:
44
+ The results of the coroutines.
45
+ """
46
+ results = cast(list[T], [None] * len(async_tasks))
47
+
48
+ async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
49
+ results[index] = await task()
50
+
51
+ async with create_task_group() as tg:
52
+ for i, t in enumerate(async_tasks):
53
+ tg.start_soon(run_task, i, t)
54
+
55
+ return results
56
+
57
+
58
+ async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
59
+ """Run a list of coroutines concurrently in batches.
60
+
61
+ Args:
62
+ *async_tasks: The list of coroutines to run.
63
+ batch_size: The size of each batch.
64
+
65
+ Returns:
66
+ The results of the coroutines.
67
+ """
68
+ results: list[T] = []
69
+
70
+ for i in range(0, len(async_tasks), batch_size):
71
+ batch = async_tasks[i : i + batch_size]
72
+ results.extend(await run_taskgroup(*batch))
73
+
74
+ return results
kreuzberg/_tesseract.py CHANGED
@@ -1,30 +1,27 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- import subprocess
5
4
  import sys
6
5
  from enum import Enum
7
6
  from functools import partial
8
7
  from os import PathLike
9
- from typing import Final, TypeVar, Union, cast
8
+ from typing import Any, TypeVar, Union
10
9
 
11
- from anyio import CapacityLimiter, create_task_group, to_process
12
10
  from anyio import Path as AsyncPath
11
+ from anyio import run_process
13
12
  from PIL.Image import Image
14
13
 
15
- from kreuzberg import ExtractionResult, ParsingError
16
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
14
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
17
15
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
18
16
  from kreuzberg._string import normalize_spaces
19
- from kreuzberg._sync import run_sync
17
+ from kreuzberg._sync import run_sync, run_taskgroup_batched
20
18
  from kreuzberg._tmp import create_temp_file
21
- from kreuzberg.exceptions import MissingDependencyError, OCRError
19
+ from kreuzberg._types import ExtractionResult
20
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
22
21
 
23
22
  if sys.version_info < (3, 11): # pragma: no cover
24
23
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
25
24
 
26
- MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
27
-
28
25
  version_ref = {"checked": False}
29
26
 
30
27
  T = TypeVar("T", bound=Union[Image, PathLike[str], str])
@@ -68,14 +65,16 @@ async def validate_tesseract_version() -> None:
68
65
  return
69
66
 
70
67
  command = ["tesseract", "--version"]
71
- result = await run_sync(subprocess.run, command, capture_output=True)
72
- version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
68
+ result = await run_process(command)
69
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
73
70
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
74
71
  raise MissingDependencyError("Tesseract version 5 or above is required.")
75
72
 
76
73
  version_ref["checked"] = True
77
74
  except FileNotFoundError as e:
78
- raise MissingDependencyError("Tesseract is not installed.") from e
75
+ raise MissingDependencyError(
76
+ "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
77
+ ) from e
79
78
 
80
79
 
81
80
  async def process_file(
@@ -83,7 +82,6 @@ async def process_file(
83
82
  *,
84
83
  language: str,
85
84
  psm: PSMMode,
86
- max_processes: int = DEFAULT_MAX_PROCESSES,
87
85
  ) -> ExtractionResult:
88
86
  """Process a single image file using Tesseract OCR.
89
87
 
@@ -91,7 +89,6 @@ async def process_file(
91
89
  input_file: The path to the image file to process.
92
90
  language: The language code for OCR.
93
91
  psm: Page segmentation mode.
94
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
95
92
 
96
93
  Raises:
97
94
  OCRError: If OCR fails to extract text from the image.
@@ -102,6 +99,7 @@ async def process_file(
102
99
  output_path, unlink = await create_temp_file(".txt")
103
100
  try:
104
101
  output_base = str(output_path).replace(".txt", "")
102
+
105
103
  command = [
106
104
  "tesseract",
107
105
  str(input_file),
@@ -110,22 +108,44 @@ async def process_file(
110
108
  language,
111
109
  "--psm",
112
110
  str(psm.value),
111
+ "--oem",
112
+ "1",
113
+ "--loglevel",
114
+ "OFF",
115
+ "-c",
116
+ "thresholding_method=1",
117
+ "-c",
118
+ "tessedit_enable_dict_correction=1",
119
+ "-c",
120
+ "language_model_ngram_on=1",
121
+ "-c",
122
+ "textord_space_size_is_variable=1",
123
+ "-c",
124
+ "classify_use_pre_adapted_templates=1",
125
+ "-c",
126
+ "tessedit_dont_blkrej_good_wds=1",
127
+ "-c",
128
+ "tessedit_dont_rowrej_good_wds=1",
129
+ "-c",
130
+ "tessedit_use_primary_params_model=1",
113
131
  ]
114
132
 
115
- result = await to_process.run_sync(
116
- partial(subprocess.run, capture_output=True),
117
- command,
118
- limiter=CapacityLimiter(max_processes),
119
- cancellable=True,
120
- )
133
+ env: dict[str, Any] | None = None
134
+ if sys.platform.startswith("linux"):
135
+ env = {"OMP_THREAD_LIMIT": "1"}
136
+
137
+ result = await run_process(command, env=env)
121
138
 
122
139
  if not result.returncode == 0:
123
- raise OCRError("OCR failed with a non-0 return code.")
140
+ raise OCRError(
141
+ "OCR failed with a non-0 return code.",
142
+ context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
143
+ )
124
144
 
125
145
  output = await AsyncPath(output_path).read_text("utf-8")
126
146
  return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
127
147
  except (RuntimeError, OSError) as e:
128
- raise OCRError("Failed to OCR using tesseract") from e
148
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
129
149
  finally:
130
150
  await unlink()
131
151
 
@@ -135,7 +155,6 @@ async def process_image(
135
155
  *,
136
156
  language: str,
137
157
  psm: PSMMode,
138
- max_processes: int = DEFAULT_MAX_PROCESSES,
139
158
  ) -> ExtractionResult:
140
159
  """Process a single Pillow Image using Tesseract OCR.
141
160
 
@@ -143,14 +162,13 @@ async def process_image(
143
162
  image: The Pillow Image to process.
144
163
  language: The language code for OCR.
145
164
  psm: Page segmentation mode.
146
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
147
165
 
148
166
  Returns:
149
167
  ExtractionResult: The extracted text from the image.
150
168
  """
151
169
  image_path, unlink = await create_temp_file(".png")
152
170
  await run_sync(image.save, str(image_path), format="PNG")
153
- result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
171
+ result = await process_file(image_path, language=language, psm=psm)
154
172
  await unlink()
155
173
  return result
156
174
 
@@ -160,7 +178,6 @@ async def process_image_with_tesseract(
160
178
  *,
161
179
  language: str = "eng",
162
180
  psm: PSMMode = PSMMode.AUTO,
163
- max_processes: int = DEFAULT_MAX_PROCESSES,
164
181
  ) -> ExtractionResult:
165
182
  """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
166
183
 
@@ -168,7 +185,6 @@ async def process_image_with_tesseract(
168
185
  image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
169
186
  language: The language code for OCR (default: "eng").
170
187
  psm: Page segmentation mode (default: PSMMode.AUTO).
171
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
172
188
 
173
189
  Raises:
174
190
  ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -179,10 +195,10 @@ async def process_image_with_tesseract(
179
195
  await validate_tesseract_version()
180
196
 
181
197
  if isinstance(image, Image):
182
- return await process_image(image, language=language, psm=psm, max_processes=max_processes)
198
+ return await process_image(image, language=language, psm=psm)
183
199
 
184
200
  if isinstance(image, (PathLike, str)):
185
- return await process_file(image, language=language, psm=psm, max_processes=max_processes)
201
+ return await process_file(image, language=language, psm=psm)
186
202
 
187
203
  raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
188
204
 
@@ -200,7 +216,7 @@ async def batch_process_images(
200
216
  images: A list of Pillow Images, paths or strings to process.
201
217
  language: The language code for OCR (default: "eng").
202
218
  psm: Page segmentation mode (default: PSMMode.AUTO).
203
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
219
+ max_processes: Maximum number of concurrent processes (default: CPU count).
204
220
 
205
221
  Raises:
206
222
  ParsingError: If OCR fails to extract text from any of the images.
@@ -209,17 +225,10 @@ async def batch_process_images(
209
225
  List of ExtractionResult objects, one per input image.
210
226
  """
211
227
  await validate_tesseract_version()
212
- results = cast(list[ExtractionResult], list(range(len(images))))
213
-
214
- async def _process_image(index: int, image: T) -> None:
215
- results[index] = await process_image_with_tesseract(
216
- image, language=language, psm=psm, max_processes=max_processes
217
- )
218
-
219
228
  try:
220
- async with create_task_group() as tg:
221
- for i, image in enumerate(images):
222
- tg.start_soon(_process_image, i, image)
223
- return results
229
+ return await run_taskgroup_batched(
230
+ *[partial(process_image_with_tesseract, image, language=language, psm=psm) for image in images],
231
+ batch_size=max_processes,
232
+ )
224
233
  except ExceptionGroup as eg:
225
- raise ParsingError("Failed to process images with Tesseract") from eg
234
+ raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
kreuzberg/_xlsx.py CHANGED
@@ -1,23 +1,47 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import csv
4
+ import sys
5
+ from functools import partial
4
6
  from io import StringIO
5
- from typing import TYPE_CHECKING, cast
7
+ from typing import TYPE_CHECKING
6
8
 
7
9
  from anyio import Path as AsyncPath
8
- from anyio import create_task_group
9
10
  from python_calamine import CalamineWorkbook
10
11
 
11
12
  from kreuzberg import ExtractionResult, ParsingError
12
13
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
13
14
  from kreuzberg._pandoc import process_file_with_pandoc
14
15
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_sync, run_taskgroup
16
17
  from kreuzberg._tmp import create_temp_file
17
18
 
18
19
  if TYPE_CHECKING: # pragma: no cover
19
20
  from pathlib import Path
20
21
 
22
+ if sys.version_info < (3, 11): # pragma: no cover
23
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
24
+
25
+
26
+ async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
27
+ values = workbook.get_sheet_by_name(sheet_name).to_python()
28
+
29
+ csv_buffer = StringIO()
30
+ writer = csv.writer(csv_buffer)
31
+
32
+ for row in values:
33
+ writer.writerow(row)
34
+
35
+ csv_data = csv_buffer.getvalue()
36
+ csv_buffer.close()
37
+
38
+ csv_path, unlink = await create_temp_file(".csv")
39
+ await AsyncPath(csv_path).write_text(csv_data)
40
+
41
+ result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
42
+ await unlink()
43
+ return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
44
+
21
45
 
22
46
  async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
23
47
  """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +57,20 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
33
57
  """
34
58
  try:
35
59
  workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
36
-
37
- results = cast(list[str], [None] * len(workbook.sheet_names))
38
-
39
- async def convert_sheet_to_text(sheet_name: str) -> None:
40
- nonlocal results
41
- values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
42
-
43
- csv_buffer = StringIO()
44
- writer = csv.writer(csv_buffer)
45
-
46
- for row in values:
47
- writer.writerow(row)
48
-
49
- csv_data = csv_buffer.getvalue()
50
- csv_buffer.close()
51
-
52
- from kreuzberg._tmp import create_temp_file
53
-
54
- csv_path, unlink = await create_temp_file(".csv")
55
- await AsyncPath(csv_path).write_text(csv_data)
56
- result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
57
- results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
58
- await unlink()
59
-
60
- async with create_task_group() as tg:
61
- for sheet_name in workbook.sheet_names:
62
- tg.start_soon(convert_sheet_to_text, sheet_name)
60
+ results = await run_taskgroup(
61
+ *[partial(convert_sheet_to_text, workbook, sheet_name) for sheet_name in workbook.sheet_names]
62
+ )
63
63
 
64
64
  return ExtractionResult(
65
65
  content="\n\n".join(results),
66
66
  mime_type=MARKDOWN_MIME_TYPE,
67
67
  metadata={},
68
68
  )
69
- except Exception as e:
69
+ except ExceptionGroup as eg:
70
70
  raise ParsingError(
71
- "Could not extract text from XLSX",
72
- context={
73
- "error": str(e),
74
- },
75
- ) from e
71
+ "Failed to extract file data",
72
+ context={"file": str(input_file), "errors": eg.exceptions},
73
+ ) from eg
76
74
 
77
75
 
78
76
  async def extract_xlsx_content(content: bytes) -> ExtractionResult:
kreuzberg/exceptions.py CHANGED
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
14
14
  self.context = context
15
15
  super().__init__(message)
16
16
 
17
+ def _serialize_context(self, obj: Any) -> Any:
18
+ """Recursively serialize context objects to ensure JSON compatibility."""
19
+ if isinstance(obj, bytes):
20
+ return obj.decode("utf-8", errors="replace")
21
+ if isinstance(obj, dict):
22
+ return {k: self._serialize_context(v) for k, v in obj.items()}
23
+ if isinstance(obj, (list, tuple)):
24
+ return [self._serialize_context(x) for x in obj]
25
+ if isinstance(obj, Exception):
26
+ return {
27
+ "type": obj.__class__.__name__,
28
+ "message": str(obj),
29
+ }
30
+ return obj
31
+
17
32
  def __str__(self) -> str:
18
33
  """Return a string representation of the exception."""
19
- ctx = f"\n\nContext: {dumps(self.context)}" if self.context else ""
34
+ if self.context:
35
+ serialized_context = self._serialize_context(self.context)
36
+ ctx = f"\n\nContext: {dumps(serialized_context)}"
37
+ else:
38
+ ctx = ""
20
39
 
21
40
  return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
22
41
 
kreuzberg/extraction.py CHANGED
@@ -87,14 +87,12 @@ async def extract_bytes(
87
87
  return await extract_xlsx_content(content)
88
88
 
89
89
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
90
- return await process_image_with_tesseract(
91
- open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
92
- )
90
+ return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
93
91
 
94
92
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
95
93
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
96
94
  ):
97
- return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
95
+ return await process_content_with_pandoc(content=content, mime_type=mime_type)
98
96
 
99
97
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
100
98
  return await extract_pptx_file_content(content)
@@ -150,12 +148,12 @@ async def extract_file(
150
148
  return await extract_xlsx_file(Path(input_file))
151
149
 
152
150
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
153
- return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
151
+ return await process_image_with_tesseract(input_file, psm=psm, language=language)
154
152
 
155
153
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
156
154
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
157
155
  ):
158
- return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
156
+ return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
159
157
 
160
158
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
161
159
  return await extract_pptx_file_content(Path(input_file))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.0.1
3
+ Version: 2.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
42
42
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
43
  - **Local Processing**: No external API calls or cloud dependencies required
44
44
  - **Resource Efficient**: Lightweight processing without GPU requirements
45
- - **Lightweight**: Has few curated dependencies and a minimal footprint
45
+ - **Small Package Size**: Has few curated dependencies and a minimal footprint
46
46
  - **Format Support**: Comprehensive support for documents, images, and text formats
47
47
  - **Modern Python**: Built with async/await, type hints, and functional first approach
48
48
  - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,8 +61,8 @@ pip install kreuzberg
61
61
 
62
62
  Kreuzberg requires two system level dependencies:
63
63
 
64
- - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.
66
66
 
67
67
  You can install these with:
68
68
 
@@ -75,7 +75,7 @@ sudo apt-get install pandoc tesseract-ocr
75
75
  #### MacOS
76
76
 
77
77
  ```shell
78
- # MacOS
78
+ #
79
79
  brew install tesseract pandoc
80
80
  ```
81
81
 
@@ -191,19 +191,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
191
191
 
192
192
  #### Processing Configuration
193
193
 
194
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
195
-
196
- Notes:
197
-
198
- - Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
194
+ - `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
199
195
 
200
196
  ### Quick Start
201
197
 
202
198
  ```python
203
199
  from pathlib import Path
204
200
  from kreuzberg import extract_file
205
- from kreuzberg.extraction import ExtractionResult
206
- from kreuzberg._tesseract import PSMMode
201
+ from kreuzberg import ExtractionResult
202
+ from kreuzberg import PSMMode
207
203
 
208
204
 
209
205
  # Basic file extraction
@@ -232,7 +228,7 @@ async def extract_document():
232
228
 
233
229
  ```python
234
230
  from kreuzberg import extract_bytes
235
- from kreuzberg.extraction import ExtractionResult
231
+ from kreuzberg import ExtractionResult
236
232
 
237
233
 
238
234
  async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -378,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
378
374
  Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
379
375
 
380
376
  ```python
381
- from kreuzberg import extract_file
382
- from kreuzberg.exceptions import (
377
+ from kreuzberg import (
378
+ extract_file,
383
379
  ValidationError,
384
380
  ParsingError,
385
381
  OCRError,
@@ -0,0 +1,21 @@
1
+ kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
2
+ kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
3
+ kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
4
+ kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
+ kreuzberg/_pandoc.py,sha256=lUqG1GQqezz011fLn12AUKJ_xw9gElj-S7xRO5g-Rlw,12513
6
+ kreuzberg/_pdf.py,sha256=BI7ooYvvLPEX3y7lKyri4r0k6bW4pj_cmBQW1UqZiF8,6227
7
+ kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
+ kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
9
+ kreuzberg/_sync.py,sha256=DepezWTfsyyeEq7VYjhWD6XFRiaEz-uCvXFUYkQMswQ,2191
10
+ kreuzberg/_tesseract.py,sha256=gKGyZpa_MLLsMTpzi_VvSXFAmLxagRE-sfqH2oKFmPM,7662
11
+ kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
+ kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
+ kreuzberg/_xlsx.py,sha256=JcQTdV38uiNdyRmHQ1DI6khN8ng4W38tIRaxonIoaHs,2703
14
+ kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
15
+ kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg-2.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
+ kreuzberg-2.1.0.dist-info/METADATA,sha256=t1NeglNqJFjWpr6WeIp-d33OikT_HIrS8FrEMGSk1hA,14844
19
+ kreuzberg-2.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
+ kreuzberg-2.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
+ kreuzberg-2.1.0.dist-info/RECORD,,
@@ -1,21 +0,0 @@
1
- kreuzberg/__init__.py,sha256=CBRHXPhjdslaSXaUjZO5V0k57uz5_x12cwo0HTtxOcU,647
2
- kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
3
- kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
4
- kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
- kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
6
- kreuzberg/_pdf.py,sha256=9YErIrRvMMFXKHckXzBDCEMzDAEnC0JVOR38gFhvHKQ,6227
7
- kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
- kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
9
- kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
10
- kreuzberg/_tesseract.py,sha256=SZsv0gFWvzR8iLaMyGr4Oc0lXE7atCR3sNxXR7TQzEE,7686
11
- kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
- kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
- kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
14
- kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
15
- kreuzberg/extraction.py,sha256=kuEKvOGhPBRcFeGX7eKmup9BukX6o55740F_KdZ15qQ,13214
16
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- kreuzberg-2.0.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
- kreuzberg-2.0.1.dist-info/METADATA,sha256=KmKLubQ89i0_JwpK96kYbhuq1MuucrqHe2bCLNcbyic,15023
19
- kreuzberg-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- kreuzberg-2.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
- kreuzberg-2.0.1.dist-info/RECORD,,