kreuzberg 2.0.0-py3-none-any.whl → 2.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,6 +1,14 @@
1
+ from ._tesseract import PSMMode
1
2
  from ._types import ExtractionResult, Metadata
2
3
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
- from .extraction import extract_bytes, extract_file
4
+ from .extraction import (
5
+ batch_extract_bytes,
6
+ batch_extract_bytes_sync,
7
+ batch_extract_file,
8
+ batch_extract_file_sync,
9
+ extract_bytes,
10
+ extract_file,
11
+ )
4
12
 
5
13
  __all__ = [
6
14
  "ExtractionResult",
@@ -8,8 +16,13 @@ __all__ = [
8
16
  "Metadata",
9
17
  "MissingDependencyError",
10
18
  "OCRError",
19
+ "PSMMode",
11
20
  "ParsingError",
12
21
  "ValidationError",
22
+ "batch_extract_bytes",
23
+ "batch_extract_bytes_sync",
24
+ "batch_extract_file",
25
+ "batch_extract_file_sync",
13
26
  "extract_bytes",
14
27
  "extract_file",
15
28
  ]
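
The package root now re-exports the batch helpers and `PSMMode` directly. A minimal usage sketch, assuming the file paths are placeholders and taking the call signatures from the `extraction.py` changes shown later in this diff:

```python
import anyio

from kreuzberg import PSMMode, batch_extract_file, batch_extract_file_sync


async def main() -> None:
    # Extract several documents concurrently; language and psm are forwarded to Tesseract.
    results = await batch_extract_file(
        ["report.pdf", "scan.png"],  # hypothetical paths
        language="eng",
        psm=PSMMode.AUTO,
    )
    for result in results:
        print(result.mime_type, len(result.content))


if __name__ == "__main__":
    anyio.run(main)
    # Synchronous variant for non-async code:
    # results = batch_extract_file_sync(["report.pdf", "scan.png"])
```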
kreuzberg/_constants.py CHANGED
@@ -3,4 +3,6 @@ from __future__ import annotations
3
3
  from multiprocessing import cpu_count
4
4
  from typing import Final
5
5
 
6
- DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
6
+ DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
7
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
8
+ MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
kreuzberg/_html.py CHANGED
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg import ExtractionResult
9
9
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._string import normalize_spaces, safe_decode
11
- from kreuzberg._sync import run_sync
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
28
27
  if isinstance(file_path_or_contents, bytes)
29
28
  else await AsyncPath(file_path_or_contents).read_text()
30
29
  )
31
- result = await run_sync(html_to_markdown.convert_to_markdown, content)
30
+ result = html_to_markdown.convert_to_markdown(content)
32
31
  return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_pandoc.py CHANGED
@@ -1,21 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
- import subprocess
3
+ import re
4
4
  import sys
5
5
  from functools import partial
6
6
  from json import JSONDecodeError, loads
7
7
  from typing import TYPE_CHECKING, Any, Final, Literal, cast
8
8
 
9
- from anyio import CapacityLimiter, create_task_group, to_process
10
9
  from anyio import Path as AsyncPath
10
+ from anyio import run_process
11
11
 
12
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
12
+ from kreuzberg import ValidationError
13
+ from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
14
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
14
15
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_taskgroup
16
17
  from kreuzberg._tmp import create_temp_file
17
18
  from kreuzberg._types import ExtractionResult, Metadata
18
- from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
19
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError
19
20
 
20
21
  if TYPE_CHECKING: # pragma: no cover
21
22
  from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING: # pragma: no cover
24
25
  if sys.version_info < (3, 11): # pragma: no cover
25
26
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
27
 
27
-
28
28
  version_ref: Final[dict[str, bool]] = {"checked": False}
29
29
 
30
-
31
30
  # Block-level node types in Pandoc AST
32
31
  BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
33
32
  BLOCK_PARA: Final = "Para" # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
229
228
 
230
229
 
231
230
  def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
232
- if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
233
- mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
234
- ):
235
- raise ValidationError(
236
- f"Unsupported mime type: {mime_type}",
237
- context={
238
- "mime_type": mime_type,
239
- "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
240
- },
231
+ if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
232
+ return pandoc_type
233
+
234
+ if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
235
+ return next(
236
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
241
237
  )
242
238
 
243
- return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
244
- MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
245
- )
239
+ raise ValidationError(f"Unsupported mime type: {mime_type}")
246
240
 
247
241
 
248
242
  async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
251
245
  return
252
246
 
253
247
  command = ["pandoc", "--version"]
254
- result = await run_sync(subprocess.run, command, capture_output=True)
255
- version = result.stdout.decode().split("\n")[0].split()[1]
256
- if not version.startswith("3."):
257
- raise MissingDependencyError("Pandoc version 3 or above is required.")
248
+ result = await run_process(command)
249
+
250
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
251
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
252
+ raise MissingDependencyError("Pandoc version 2 or above is required")
258
253
 
259
254
  version_ref["checked"] = True
260
255
 
261
256
  except FileNotFoundError as e:
262
- raise MissingDependencyError("Pandoc is not installed.") from e
257
+ raise MissingDependencyError("Pandoc is not installed") from e
263
258
 
264
259
 
265
- async def _handle_extract_metadata(
266
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
- ) -> Metadata:
260
+ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
268
261
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
262
  metadata_file, unlink = await create_temp_file(".json")
270
263
  try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
276
269
  "--standalone",
277
270
  "--quiet",
278
271
  "--output",
279
- metadata_file,
272
+ str(metadata_file),
280
273
  ]
281
274
 
282
- result = await to_process.run_sync(
283
- partial(subprocess.run, capture_output=True),
284
- command,
285
- cancellable=True,
286
- limiter=CapacityLimiter(max_processes),
287
- )
275
+ result = await run_process(command)
288
276
 
289
277
  if result.returncode != 0:
290
278
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@ async def _handle_extract_metadata(
297
285
  await unlink()
298
286
 
299
287
 
300
- async def _handle_extract_file(
301
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
302
- ) -> str:
288
+ async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
303
289
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
290
  output_path, unlink = await create_temp_file(".md")
305
291
  try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(
315
301
 
316
302
  command.extend(["--output", str(output_path)])
317
303
 
318
- result = await to_process.run_sync(
319
- partial(subprocess.run, capture_output=True),
320
- command,
321
- cancellable=True,
322
- limiter=CapacityLimiter(max_processes),
323
- )
304
+ result = await run_process(command)
324
305
 
325
306
  if result.returncode != 0:
326
307
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
334
315
  await unlink()
335
316
 
336
317
 
337
- async def process_file_with_pandoc(
338
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
- ) -> ExtractionResult:
318
+ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
340
319
  """Process a single file using Pandoc and convert to markdown.
341
320
 
342
321
  Args:
343
322
  input_file: The path to the file to process.
344
323
  mime_type: The mime type of the file.
345
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
324
 
347
325
  Raises:
348
326
  ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(
354
332
 
355
333
  _get_pandoc_type_from_mime_type(mime_type)
356
334
 
357
- metadata: Metadata = {}
358
- content: str = ""
359
-
360
335
  try:
361
- async with create_task_group() as tg:
362
-
363
- async def _get_metadata() -> None:
364
- nonlocal metadata
365
- metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
-
367
- async def _get_content() -> None:
368
- nonlocal content
369
- content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
336
+ metadata, content = await run_taskgroup(
337
+ partial(_handle_extract_metadata, input_file, mime_type=mime_type),
338
+ partial(_handle_extract_file, input_file, mime_type=mime_type),
339
+ )
370
340
 
371
- tg.start_soon(_get_metadata)
372
- tg.start_soon(_get_content)
341
+ return ExtractionResult(
342
+ content=normalize_spaces(cast(str, content)),
343
+ metadata=cast(Metadata, metadata),
344
+ mime_type=MARKDOWN_MIME_TYPE,
345
+ )
373
346
  except ExceptionGroup as eg:
374
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
-
376
- return ExtractionResult(
377
- content=normalize_spaces(content),
378
- metadata=metadata,
379
- mime_type=MARKDOWN_MIME_TYPE,
380
- )
347
+ raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
381
348
 
382
349
 
383
- async def process_content_with_pandoc(
384
- content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
- ) -> ExtractionResult:
350
+ async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
386
351
  """Process content using Pandoc and convert to markdown.
387
352
 
388
353
  Args:
389
354
  content: The content to process.
390
355
  mime_type: The mime type of the content.
391
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
392
356
 
393
357
  Returns:
394
358
  ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
397
361
  input_file, unlink = await create_temp_file(f".{extension}")
398
362
 
399
363
  await AsyncPath(input_file).write_bytes(content)
400
- result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
364
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type)
401
365
 
402
366
  await unlink()
403
367
  return result
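
With the `max_processes` parameter removed, the pandoc entry points take only the input and the mime type. A hedged sketch of a call, assuming DOCX is among the mime types in `MIMETYPE_TO_PANDOC_TYPE_MAPPING` (the file name is hypothetical):

```python
import anyio

from kreuzberg._pandoc import process_file_with_pandoc

DOCX = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"


async def main() -> None:
    # Metadata and content are now extracted concurrently via run_taskgroup.
    result = await process_file_with_pandoc("notes.docx", mime_type=DOCX)
    print(result.mime_type)  # MARKDOWN_MIME_TYPE
    print(result.metadata)


anyio.run(main)
```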
kreuzberg/_pdf.py CHANGED
@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
11
11
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
12
  from kreuzberg._string import normalize_spaces
13
13
  from kreuzberg._sync import run_sync
14
- from kreuzberg._tesseract import PSMMode, SupportedLanguage, batch_process_images
14
+ from kreuzberg._tesseract import PSMMode, batch_process_images
15
15
  from kreuzberg.exceptions import ParsingError
16
16
 
17
17
  if TYPE_CHECKING: # pragma: no cover
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
67
67
  document: pypdfium2.PdfDocument | None = None
68
68
  try:
69
69
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
70
- return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
70
+ return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
71
71
  except pypdfium2.PdfiumError as e:
72
72
  raise ParsingError(
73
73
  "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
80
80
  async def _extract_pdf_text_with_ocr(
81
81
  input_file: Path,
82
82
  *,
83
- language: SupportedLanguage = "eng",
83
+ language: str = "eng",
84
84
  max_processes: int,
85
85
  psm: PSMMode = PSMMode.AUTO,
86
86
  ) -> ExtractionResult:
@@ -132,7 +132,7 @@ async def extract_pdf_file(
132
132
  input_file: Path,
133
133
  *,
134
134
  force_ocr: bool,
135
- language: SupportedLanguage = "eng",
135
+ language: str = "eng",
136
136
  max_processes: int,
137
137
  psm: PSMMode = PSMMode.AUTO,
138
138
  ) -> ExtractionResult:
@@ -154,7 +154,6 @@ async def extract_pdf_file(
154
154
  and _validate_extracted_text(content)
155
155
  ):
156
156
  return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
157
-
158
157
  return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
158
 
160
159
 
@@ -162,7 +161,7 @@ async def extract_pdf_content(
162
161
  content: bytes,
163
162
  *,
164
163
  force_ocr: bool,
165
- language: SupportedLanguage = "eng",
164
+ language: str = "eng",
166
165
  max_processes: int,
167
166
  psm: PSMMode = PSMMode.AUTO,
168
167
  ) -> ExtractionResult:
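
A short sketch of the PDF entry point after `SupportedLanguage` was replaced by a plain string; the file name is hypothetical and the keyword arguments mirror the signature shown above:

```python
import anyio
from pathlib import Path

from kreuzberg._pdf import extract_pdf_file
from kreuzberg._tesseract import PSMMode


async def main() -> None:
    result = await extract_pdf_file(
        Path("scanned.pdf"),
        force_ocr=True,       # skip the searchable-text fast path and always OCR
        language="eng+deu",   # any Tesseract language string is now accepted
        max_processes=4,
        psm=PSMMode.AUTO,
    )
    print(result.content[:200])


anyio.run(main)
```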
kreuzberg/_string.py CHANGED
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
22
22
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
23
23
 
24
24
  for enc in [e for e in encodings if e]: # pragma: no cover
25
- with suppress(UnicodeDecodeError):
25
+ with suppress(UnicodeDecodeError, LookupError):
26
26
  return byte_data.decode(enc)
27
27
 
28
28
  # If all encodings fail, fall back to latin-1 which can handle any byte
kreuzberg/_sync.py CHANGED
@@ -4,10 +4,11 @@ import sys
4
4
  from functools import partial
5
5
  from typing import TYPE_CHECKING, TypeVar, cast
6
6
 
7
+ from anyio import create_task_group
7
8
  from anyio.to_thread import run_sync as any_io_run_sync
8
9
 
9
10
  if TYPE_CHECKING: # pragma: no cover
10
- from collections.abc import Callable
11
+ from collections.abc import Callable, Coroutine
11
12
 
12
13
  if sys.version_info >= (3, 10):
13
14
  from typing import ParamSpec
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
30
31
  The result of the synchronous function.
31
32
  """
32
33
  handler = partial(sync_fn, **kwargs)
33
- return cast(T, await any_io_run_sync(handler, *args)) # pyright: ignore [reportCallIssue]
34
+ return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
35
+
36
+
37
+ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
38
+ """Run a list of coroutines concurrently.
39
+
40
+ Args:
41
+ *async_tasks: The list of coroutines to run.
42
+
43
+ Returns:
44
+ The results of the coroutines.
45
+ """
46
+ results = cast(list[T], [None] * len(async_tasks))
47
+
48
+ async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
49
+ results[index] = await task()
50
+
51
+ async with create_task_group() as tg:
52
+ for i, t in enumerate(async_tasks):
53
+ tg.start_soon(run_task, i, t)
54
+
55
+ return results
56
+
57
+
58
+ async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
59
+ """Run a list of coroutines concurrently in batches.
60
+
61
+ Args:
62
+ *async_tasks: The list of coroutines to run.
63
+ batch_size: The size of each batch.
64
+
65
+ Returns:
66
+ The results of the coroutines.
67
+ """
68
+ results: list[T] = []
69
+
70
+ for i in range(0, len(async_tasks), batch_size):
71
+ batch = async_tasks[i : i + batch_size]
72
+ results.extend(await run_taskgroup(*batch))
73
+
74
+ return results
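
A minimal sketch of how the new task-group helpers are meant to be used, mirroring the `partial(...)` pattern adopted in `_pandoc.py` and `_tesseract.py`; the `fetch` coroutine is a placeholder for real work:

```python
from functools import partial

import anyio

from kreuzberg._sync import run_taskgroup, run_taskgroup_batched


async def fetch(page: int) -> str:
    await anyio.sleep(0.01)  # stand-in for real async work
    return f"page-{page}"


async def main() -> None:
    # Run every task at once...
    all_results = await run_taskgroup(*[partial(fetch, i) for i in range(4)])
    # ...or cap concurrency, as batch_process_images does via max_processes.
    batched = await run_taskgroup_batched(*[partial(fetch, i) for i in range(10)], batch_size=3)
    print(all_results, len(batched))


anyio.run(main)
```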
kreuzberg/_tesseract.py CHANGED
@@ -1,164 +1,31 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- import subprocess
5
4
  import sys
6
5
  from enum import Enum
7
6
  from functools import partial
8
7
  from os import PathLike
9
- from typing import Final, Literal, TypeVar, Union, cast
8
+ from typing import Any, TypeVar, Union
10
9
 
11
- from anyio import CapacityLimiter, create_task_group, to_process
12
10
  from anyio import Path as AsyncPath
11
+ from anyio import run_process
13
12
  from PIL.Image import Image
14
13
 
15
- from kreuzberg import ExtractionResult, ParsingError
16
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
14
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
17
15
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
18
16
  from kreuzberg._string import normalize_spaces
19
- from kreuzberg._sync import run_sync
17
+ from kreuzberg._sync import run_sync, run_taskgroup_batched
20
18
  from kreuzberg._tmp import create_temp_file
21
- from kreuzberg.exceptions import MissingDependencyError, OCRError
19
+ from kreuzberg._types import ExtractionResult
20
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
22
21
 
23
22
  if sys.version_info < (3, 11): # pragma: no cover
24
23
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
25
24
 
26
- MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
27
-
28
25
  version_ref = {"checked": False}
29
26
 
30
27
  T = TypeVar("T", bound=Union[Image, PathLike[str], str])
31
28
 
32
- SupportedLanguage = Literal[
33
- "afr",
34
- "amh",
35
- "ara",
36
- "asm",
37
- "aze",
38
- "aze_cyrl",
39
- "bel",
40
- "ben",
41
- "bod",
42
- "bos",
43
- "bre",
44
- "bul",
45
- "cat",
46
- "ceb",
47
- "ces",
48
- "chi_sim",
49
- "chi_tra",
50
- "chr",
51
- "cos",
52
- "cym",
53
- "dan",
54
- "dan_frak",
55
- "deu",
56
- "deu_frak",
57
- "deu_latf",
58
- "dzo",
59
- "ell",
60
- "eng",
61
- "enm",
62
- "epo",
63
- "equ",
64
- "est",
65
- "eus",
66
- "fao",
67
- "fas",
68
- "fil",
69
- "fin",
70
- "fra",
71
- "frk",
72
- "frm",
73
- "fry",
74
- "gla",
75
- "gle",
76
- "glg",
77
- "grc",
78
- "guj",
79
- "hat",
80
- "heb",
81
- "hin",
82
- "hrv",
83
- "hun",
84
- "hye",
85
- "iku",
86
- "ind",
87
- "isl",
88
- "ita",
89
- "ita_old",
90
- "jav",
91
- "jpn",
92
- "kan",
93
- "kat",
94
- "kat_old",
95
- "kaz",
96
- "khm",
97
- "kir",
98
- "kmr",
99
- "kor",
100
- "kor_vert",
101
- "kur",
102
- "lao",
103
- "lat",
104
- "lav",
105
- "lit",
106
- "ltz",
107
- "mal",
108
- "mar",
109
- "mkd",
110
- "mlt",
111
- "mon",
112
- "mri",
113
- "msa",
114
- "mya",
115
- "nep",
116
- "nld",
117
- "nor",
118
- "oci",
119
- "ori",
120
- "osd",
121
- "pan",
122
- "pol",
123
- "por",
124
- "pus",
125
- "que",
126
- "ron",
127
- "rus",
128
- "san",
129
- "sin",
130
- "slk",
131
- "slk_frak",
132
- "slv",
133
- "snd",
134
- "spa",
135
- "spa_old",
136
- "sqi",
137
- "srp",
138
- "srp_latn",
139
- "sun",
140
- "swa",
141
- "swe",
142
- "syr",
143
- "tam",
144
- "tat",
145
- "tel",
146
- "tgk",
147
- "tgl",
148
- "tha",
149
- "tir",
150
- "ton",
151
- "tur",
152
- "uig",
153
- "ukr",
154
- "urd",
155
- "uzb",
156
- "uzb_cyrl",
157
- "vie",
158
- "yid",
159
- "yor",
160
- ]
161
-
162
29
 
163
30
  class PSMMode(Enum):
164
31
  """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
@@ -198,22 +65,23 @@ async def validate_tesseract_version() -> None:
198
65
  return
199
66
 
200
67
  command = ["tesseract", "--version"]
201
- result = await run_sync(subprocess.run, command, capture_output=True)
202
- version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
68
+ result = await run_process(command)
69
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
203
70
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
204
71
  raise MissingDependencyError("Tesseract version 5 or above is required.")
205
72
 
206
73
  version_ref["checked"] = True
207
74
  except FileNotFoundError as e:
208
- raise MissingDependencyError("Tesseract is not installed.") from e
75
+ raise MissingDependencyError(
76
+ "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
77
+ ) from e
209
78
 
210
79
 
211
80
  async def process_file(
212
81
  input_file: str | PathLike[str],
213
82
  *,
214
- language: SupportedLanguage,
83
+ language: str,
215
84
  psm: PSMMode,
216
- max_processes: int = DEFAULT_MAX_PROCESSES,
217
85
  ) -> ExtractionResult:
218
86
  """Process a single image file using Tesseract OCR.
219
87
 
@@ -221,7 +89,6 @@ async def process_file(
221
89
  input_file: The path to the image file to process.
222
90
  language: The language code for OCR.
223
91
  psm: Page segmentation mode.
224
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
225
92
 
226
93
  Raises:
227
94
  OCRError: If OCR fails to extract text from the image.
@@ -232,6 +99,7 @@ async def process_file(
232
99
  output_path, unlink = await create_temp_file(".txt")
233
100
  try:
234
101
  output_base = str(output_path).replace(".txt", "")
102
+
235
103
  command = [
236
104
  "tesseract",
237
105
  str(input_file),
@@ -240,22 +108,44 @@ async def process_file(
240
108
  language,
241
109
  "--psm",
242
110
  str(psm.value),
111
+ "--oem",
112
+ "1",
113
+ "--loglevel",
114
+ "OFF",
115
+ "-c",
116
+ "thresholding_method=1",
117
+ "-c",
118
+ "tessedit_enable_dict_correction=1",
119
+ "-c",
120
+ "language_model_ngram_on=1",
121
+ "-c",
122
+ "textord_space_size_is_variable=1",
123
+ "-c",
124
+ "classify_use_pre_adapted_templates=1",
125
+ "-c",
126
+ "tessedit_dont_blkrej_good_wds=1",
127
+ "-c",
128
+ "tessedit_dont_rowrej_good_wds=1",
129
+ "-c",
130
+ "tessedit_use_primary_params_model=1",
243
131
  ]
244
132
 
245
- result = await to_process.run_sync(
246
- partial(subprocess.run, capture_output=True),
247
- command,
248
- limiter=CapacityLimiter(max_processes),
249
- cancellable=True,
250
- )
133
+ env: dict[str, Any] | None = None
134
+ if sys.platform.startswith("linux"):
135
+ env = {"OMP_THREAD_LIMIT": "1"}
136
+
137
+ result = await run_process(command, env=env)
251
138
 
252
139
  if not result.returncode == 0:
253
- raise OCRError("OCR failed with a non-0 return code.")
140
+ raise OCRError(
141
+ "OCR failed with a non-0 return code.",
142
+ context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
143
+ )
254
144
 
255
145
  output = await AsyncPath(output_path).read_text("utf-8")
256
146
  return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
257
147
  except (RuntimeError, OSError) as e:
258
- raise OCRError("Failed to OCR using tesseract") from e
148
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
259
149
  finally:
260
150
  await unlink()
261
151
 
@@ -263,9 +153,8 @@ async def process_file(
263
153
  async def process_image(
264
154
  image: Image,
265
155
  *,
266
- language: SupportedLanguage,
156
+ language: str,
267
157
  psm: PSMMode,
268
- max_processes: int = DEFAULT_MAX_PROCESSES,
269
158
  ) -> ExtractionResult:
270
159
  """Process a single Pillow Image using Tesseract OCR.
271
160
 
@@ -273,14 +162,13 @@ async def process_image(
273
162
  image: The Pillow Image to process.
274
163
  language: The language code for OCR.
275
164
  psm: Page segmentation mode.
276
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
277
165
 
278
166
  Returns:
279
167
  ExtractionResult: The extracted text from the image.
280
168
  """
281
169
  image_path, unlink = await create_temp_file(".png")
282
170
  await run_sync(image.save, str(image_path), format="PNG")
283
- result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
171
+ result = await process_file(image_path, language=language, psm=psm)
284
172
  await unlink()
285
173
  return result
286
174
 
@@ -288,9 +176,8 @@ async def process_image(
288
176
  async def process_image_with_tesseract(
289
177
  image: Image | PathLike[str] | str,
290
178
  *,
291
- language: SupportedLanguage = "eng",
179
+ language: str = "eng",
292
180
  psm: PSMMode = PSMMode.AUTO,
293
- max_processes: int = DEFAULT_MAX_PROCESSES,
294
181
  ) -> ExtractionResult:
295
182
  """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
296
183
 
@@ -298,7 +185,6 @@ async def process_image_with_tesseract(
298
185
  image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
299
186
  language: The language code for OCR (default: "eng").
300
187
  psm: Page segmentation mode (default: PSMMode.AUTO).
301
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
302
188
 
303
189
  Raises:
304
190
  ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -309,10 +195,10 @@ async def process_image_with_tesseract(
309
195
  await validate_tesseract_version()
310
196
 
311
197
  if isinstance(image, Image):
312
- return await process_image(image, language=language, psm=psm, max_processes=max_processes)
198
+ return await process_image(image, language=language, psm=psm)
313
199
 
314
200
  if isinstance(image, (PathLike, str)):
315
- return await process_file(image, language=language, psm=psm, max_processes=max_processes)
201
+ return await process_file(image, language=language, psm=psm)
316
202
 
317
203
  raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
318
204
 
@@ -320,7 +206,7 @@ async def process_image_with_tesseract(
320
206
  async def batch_process_images(
321
207
  images: list[T],
322
208
  *,
323
- language: SupportedLanguage = "eng",
209
+ language: str = "eng",
324
210
  psm: PSMMode = PSMMode.AUTO,
325
211
  max_processes: int = DEFAULT_MAX_PROCESSES,
326
212
  ) -> list[ExtractionResult]:
@@ -330,7 +216,7 @@ async def batch_process_images(
330
216
  images: A list of Pillow Images, paths or strings to process.
331
217
  language: The language code for OCR (default: "eng").
332
218
  psm: Page segmentation mode (default: PSMMode.AUTO).
333
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
219
+ max_processes: Maximum number of concurrent processes (default: CPU count).
334
220
 
335
221
  Raises:
336
222
  ParsingError: If OCR fails to extract text from any of the images.
@@ -339,17 +225,10 @@ async def batch_process_images(
339
225
  List of ExtractionResult objects, one per input image.
340
226
  """
341
227
  await validate_tesseract_version()
342
- results = cast(list[ExtractionResult], list(range(len(images))))
343
-
344
- async def _process_image(index: int, image: T) -> None:
345
- results[index] = await process_image_with_tesseract(
346
- image, language=language, psm=psm, max_processes=max_processes
347
- )
348
-
349
228
  try:
350
- async with create_task_group() as tg:
351
- for i, image in enumerate(images):
352
- tg.start_soon(_process_image, i, image)
353
- return results
229
+ return await run_taskgroup_batched(
230
+ *[partial(process_image_with_tesseract, image, language=language, psm=psm) for image in images],
231
+ batch_size=max_processes,
232
+ )
354
233
  except ExceptionGroup as eg:
355
- raise ParsingError("Failed to process images with Tesseract") from eg
234
+ raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
kreuzberg/_xlsx.py CHANGED
@@ -1,23 +1,47 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import csv
4
+ import sys
5
+ from functools import partial
4
6
  from io import StringIO
5
- from typing import TYPE_CHECKING, cast
7
+ from typing import TYPE_CHECKING
6
8
 
7
9
  from anyio import Path as AsyncPath
8
- from anyio import create_task_group
9
10
  from python_calamine import CalamineWorkbook
10
11
 
11
12
  from kreuzberg import ExtractionResult, ParsingError
12
13
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
13
14
  from kreuzberg._pandoc import process_file_with_pandoc
14
15
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_sync, run_taskgroup
16
17
  from kreuzberg._tmp import create_temp_file
17
18
 
18
19
  if TYPE_CHECKING: # pragma: no cover
19
20
  from pathlib import Path
20
21
 
22
+ if sys.version_info < (3, 11): # pragma: no cover
23
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
24
+
25
+
26
+ async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
27
+ values = workbook.get_sheet_by_name(sheet_name).to_python()
28
+
29
+ csv_buffer = StringIO()
30
+ writer = csv.writer(csv_buffer)
31
+
32
+ for row in values:
33
+ writer.writerow(row)
34
+
35
+ csv_data = csv_buffer.getvalue()
36
+ csv_buffer.close()
37
+
38
+ csv_path, unlink = await create_temp_file(".csv")
39
+ await AsyncPath(csv_path).write_text(csv_data)
40
+
41
+ result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
42
+ await unlink()
43
+ return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
44
+
21
45
 
22
46
  async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
23
47
  """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +57,20 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
33
57
  """
34
58
  try:
35
59
  workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
36
-
37
- results = cast(list[str], [None] * len(workbook.sheet_names))
38
-
39
- async def convert_sheet_to_text(sheet_name: str) -> None:
40
- nonlocal results
41
- values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
42
-
43
- csv_buffer = StringIO()
44
- writer = csv.writer(csv_buffer)
45
-
46
- for row in values:
47
- writer.writerow(row)
48
-
49
- csv_data = csv_buffer.getvalue()
50
- csv_buffer.close()
51
-
52
- from kreuzberg._tmp import create_temp_file
53
-
54
- csv_path, unlink = await create_temp_file(".csv")
55
- await AsyncPath(csv_path).write_text(csv_data)
56
- result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
57
- results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
58
- await unlink()
59
-
60
- async with create_task_group() as tg:
61
- for sheet_name in workbook.sheet_names:
62
- tg.start_soon(convert_sheet_to_text, sheet_name)
60
+ results = await run_taskgroup(
61
+ *[partial(convert_sheet_to_text, workbook, sheet_name) for sheet_name in workbook.sheet_names]
62
+ )
63
63
 
64
64
  return ExtractionResult(
65
65
  content="\n\n".join(results),
66
66
  mime_type=MARKDOWN_MIME_TYPE,
67
67
  metadata={},
68
68
  )
69
- except Exception as e:
69
+ except ExceptionGroup as eg:
70
70
  raise ParsingError(
71
- "Could not extract text from XLSX",
72
- context={
73
- "error": str(e),
74
- },
75
- ) from e
71
+ "Failed to extract file data",
72
+ context={"file": str(input_file), "errors": eg.exceptions},
73
+ ) from eg
76
74
 
77
75
 
78
76
  async def extract_xlsx_content(content: bytes) -> ExtractionResult:
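
Sketch of workbook extraction after the per-sheet conversion became a module-level coroutine driven by `run_taskgroup` (the workbook name is hypothetical):

```python
import anyio
from pathlib import Path

from kreuzberg._xlsx import extract_xlsx_file


async def main() -> None:
    result = await extract_xlsx_file(Path("book.xlsx"))
    # One "## <sheet name>" markdown section per sheet, joined by blank lines.
    print(result.content)


anyio.run(main)
```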
kreuzberg/exceptions.py CHANGED
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
14
14
  self.context = context
15
15
  super().__init__(message)
16
16
 
17
+ def _serialize_context(self, obj: Any) -> Any:
18
+ """Recursively serialize context objects to ensure JSON compatibility."""
19
+ if isinstance(obj, bytes):
20
+ return obj.decode("utf-8", errors="replace")
21
+ if isinstance(obj, dict):
22
+ return {k: self._serialize_context(v) for k, v in obj.items()}
23
+ if isinstance(obj, (list, tuple)):
24
+ return [self._serialize_context(x) for x in obj]
25
+ if isinstance(obj, Exception):
26
+ return {
27
+ "type": obj.__class__.__name__,
28
+ "message": str(obj),
29
+ }
30
+ return obj
31
+
17
32
  def __str__(self) -> str:
18
33
  """Return a string representation of the exception."""
19
- ctx = f"\n\nContext: {dumps(self.context)}" if self.context else ""
34
+ if self.context:
35
+ serialized_context = self._serialize_context(self.context)
36
+ ctx = f"\n\nContext: {dumps(serialized_context)}"
37
+ else:
38
+ ctx = ""
20
39
 
21
40
  return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
22
41
 
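An illustrative sketch (not part of the diff) of what the new context serialization buys: bytes and nested exceptions in `context` no longer break `dumps()` when the error is printed:

```python
from kreuzberg.exceptions import ParsingError

err = ParsingError(
    "Failed to extract file data",
    context={"stderr": b"\xff broken output", "errors": [ValueError("bad sheet")]},
)
# bytes are decoded with errors="replace" and exceptions become {"type": ..., "message": ...},
# so the context renders as JSON instead of raising inside __str__.
print(err)
```
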
kreuzberg/extraction.py CHANGED
@@ -38,7 +38,7 @@ from kreuzberg._pdf import (
38
38
  )
39
39
  from kreuzberg._pptx import extract_pptx_file_content
40
40
  from kreuzberg._string import safe_decode
41
- from kreuzberg._tesseract import PSMMode, SupportedLanguage, process_image_with_tesseract
41
+ from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
42
42
  from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
43
43
  from kreuzberg.exceptions import ValidationError
44
44
 
@@ -52,7 +52,7 @@ async def extract_bytes(
52
52
  mime_type: str,
53
53
  *,
54
54
  force_ocr: bool = False,
55
- language: SupportedLanguage = "eng",
55
+ language: str = "eng",
56
56
  max_processes: int = DEFAULT_MAX_PROCESSES,
57
57
  psm: PSMMode = PSMMode.AUTO,
58
58
  ) -> ExtractionResult:
@@ -87,14 +87,12 @@ async def extract_bytes(
87
87
  return await extract_xlsx_content(content)
88
88
 
89
89
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
90
- return await process_image_with_tesseract(
91
- open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
92
- )
90
+ return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
93
91
 
94
92
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
95
93
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
96
94
  ):
97
- return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
95
+ return await process_content_with_pandoc(content=content, mime_type=mime_type)
98
96
 
99
97
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
100
98
  return await extract_pptx_file_content(content)
@@ -114,7 +112,7 @@ async def extract_file(
114
112
  mime_type: str | None = None,
115
113
  *,
116
114
  force_ocr: bool = False,
117
- language: SupportedLanguage = "eng",
115
+ language: str = "eng",
118
116
  max_processes: int = DEFAULT_MAX_PROCESSES,
119
117
  psm: PSMMode = PSMMode.AUTO,
120
118
  ) -> ExtractionResult:
@@ -150,12 +148,12 @@ async def extract_file(
150
148
  return await extract_xlsx_file(Path(input_file))
151
149
 
152
150
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
153
- return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
151
+ return await process_image_with_tesseract(input_file, psm=psm, language=language)
154
152
 
155
153
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
156
154
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
157
155
  ):
158
- return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
156
+ return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
159
157
 
160
158
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
161
159
  return await extract_pptx_file_content(Path(input_file))
@@ -170,7 +168,7 @@ async def batch_extract_file(
170
168
  file_paths: Sequence[PathLike[str] | str],
171
169
  *,
172
170
  force_ocr: bool = False,
173
- language: SupportedLanguage = "eng",
171
+ language: str = "eng",
174
172
  max_processes: int = DEFAULT_MAX_PROCESSES,
175
173
  psm: PSMMode = PSMMode.AUTO,
176
174
  ) -> list[ExtractionResult]:
@@ -209,7 +207,7 @@ async def batch_extract_bytes(
209
207
  contents: Sequence[tuple[bytes, str]],
210
208
  *,
211
209
  force_ocr: bool = False,
212
- language: SupportedLanguage = "eng",
210
+ language: str = "eng",
213
211
  max_processes: int = DEFAULT_MAX_PROCESSES,
214
212
  psm: PSMMode = PSMMode.AUTO,
215
213
  ) -> list[ExtractionResult]:
@@ -253,7 +251,7 @@ def extract_bytes_sync(
253
251
  mime_type: str,
254
252
  *,
255
253
  force_ocr: bool = False,
256
- language: SupportedLanguage = "eng",
254
+ language: str = "eng",
257
255
  max_processes: int = DEFAULT_MAX_PROCESSES,
258
256
  psm: PSMMode = PSMMode.AUTO,
259
257
  ) -> ExtractionResult:
@@ -281,7 +279,7 @@ def extract_file_sync(
281
279
  mime_type: str | None = None,
282
280
  *,
283
281
  force_ocr: bool = False,
284
- language: SupportedLanguage = "eng",
282
+ language: str = "eng",
285
283
  max_processes: int = DEFAULT_MAX_PROCESSES,
286
284
  psm: PSMMode = PSMMode.AUTO,
287
285
  ) -> ExtractionResult:
@@ -308,7 +306,7 @@ def batch_extract_file_sync(
308
306
  file_paths: Sequence[PathLike[str] | str],
309
307
  *,
310
308
  force_ocr: bool = False,
311
- language: SupportedLanguage = "eng",
309
+ language: str = "eng",
312
310
  max_processes: int = DEFAULT_MAX_PROCESSES,
313
311
  psm: PSMMode = PSMMode.AUTO,
314
312
  ) -> list[ExtractionResult]:
@@ -339,7 +337,7 @@ def batch_extract_bytes_sync(
339
337
  contents: Sequence[tuple[bytes, str]],
340
338
  *,
341
339
  force_ocr: bool = False,
342
- language: SupportedLanguage = "eng",
340
+ language: str = "eng",
343
341
  max_processes: int = DEFAULT_MAX_PROCESSES,
344
342
  psm: PSMMode = PSMMode.AUTO,
345
343
  ) -> list[ExtractionResult]:
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.0.0
3
+ Version: 2.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
42
42
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
43
  - **Local Processing**: No external API calls or cloud dependencies required
44
44
  - **Resource Efficient**: Lightweight processing without GPU requirements
45
- - **Lightweight**: Has few curated dependencies and a minimal footprint
45
+ - **Small Package Size**: Has few curated dependencies and a minimal footprint
46
46
  - **Format Support**: Comprehensive support for documents, images, and text formats
47
47
  - **Modern Python**: Built with async/await, type hints, and functional first approach
48
48
  - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,10 +61,34 @@ pip install kreuzberg
61
61
 
62
62
  Kreuzberg requires two system level dependencies:
63
63
 
64
- - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
66
66
 
67
- Please install these using their respective installation guides.
67
+ You can install these with:
68
+
69
+ #### Linux (Ubuntu)
70
+
71
+ ```shell
72
+ sudo apt-get install pandoc tesseract-ocr
73
+ ```
74
+
75
+ #### macOS
76
+
77
+ ```shell
78
+ #
79
+ brew install tesseract pandoc
80
+ ```
81
+
82
+ #### Windows
83
+
84
+ ```shell
85
+ choco install -y tesseract pandoc
86
+ ```
87
+
88
+ Notes:
89
+
90
+ - In most distributions the tesseract-ocr package is split into multiple packages; you may need to install language models other than English separately.
91
+ - Please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
68
92
 
69
93
  ## Architecture
70
94
 
@@ -152,26 +176,30 @@ All extraction functions accept the following optional parameters for configurin
152
176
 
153
177
  #### OCR Configuration
154
178
 
155
- - `language` (default: "eng"): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for non-English documents. Examples:
156
- - "eng" for English
157
- - "deu" for German
158
- - "fra" for French
179
+ - `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
180
+ - `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
181
+
182
+ - `eng` for English
183
+ - `deu` for German
184
+ - `eng+deu` for English and German
159
185
 
160
- Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information.
186
+ Note: the order of languages affects processing time; the first language is the primary language, the second is the secondary language, and so on.
161
187
 
162
- - `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
188
+ - `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
163
189
 
164
- #### Performance Configuration
190
+ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
165
191
 
166
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc. Higher values can lead to performance improvements, but may cause resource exhaustion and deadlocks (especially for tesseract).
192
+ #### Processing Configuration
193
+
194
+ - `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
167
195
 
168
196
  ### Quick Start
169
197
 
170
198
  ```python
171
199
  from pathlib import Path
172
200
  from kreuzberg import extract_file
173
- from kreuzberg.extraction import ExtractionResult
174
- from kreuzberg._tesseract import PSMMode, SupportedLanguage
201
+ from kreuzberg import ExtractionResult
202
+ from kreuzberg import PSMMode
175
203
 
176
204
 
177
205
  # Basic file extraction
@@ -193,14 +221,14 @@ async def extract_document():
193
221
  docx_result = await extract_file(Path("document.docx"))
194
222
  if docx_result.metadata:
195
223
  print(f"Title: {docx_result.metadata.get('title')}")
196
- print(f"Author: {docx_result.metadata.get('author')}")
224
+ print(f"Author: {docx_result.metadata.get('creator')}")
197
225
  ```
198
226
 
199
227
  ### Extracting Bytes
200
228
 
201
229
  ```python
202
230
  from kreuzberg import extract_bytes
203
- from kreuzberg.extraction import ExtractionResult
231
+ from kreuzberg import ExtractionResult
204
232
 
205
233
 
206
234
  async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -236,7 +264,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
236
264
 
237
265
  ```python
238
266
  from pathlib import Path
239
- from kreuzberg import batch_extract_file, batch_extract_bytes
267
+ from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
240
268
 
241
269
 
242
270
  # Process multiple files concurrently
@@ -346,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
346
374
  Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
347
375
 
348
376
  ```python
349
- from kreuzberg import extract_file
350
- from kreuzberg.exceptions import (
377
+ from kreuzberg import (
378
+ extract_file,
351
379
  ValidationError,
352
380
  ParsingError,
353
381
  OCRError,
@@ -0,0 +1,21 @@
1
+ kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
2
+ kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
3
+ kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
4
+ kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
+ kreuzberg/_pandoc.py,sha256=lUqG1GQqezz011fLn12AUKJ_xw9gElj-S7xRO5g-Rlw,12513
6
+ kreuzberg/_pdf.py,sha256=BI7ooYvvLPEX3y7lKyri4r0k6bW4pj_cmBQW1UqZiF8,6227
7
+ kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
+ kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
9
+ kreuzberg/_sync.py,sha256=DepezWTfsyyeEq7VYjhWD6XFRiaEz-uCvXFUYkQMswQ,2191
10
+ kreuzberg/_tesseract.py,sha256=gKGyZpa_MLLsMTpzi_VvSXFAmLxagRE-sfqH2oKFmPM,7662
11
+ kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
+ kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
+ kreuzberg/_xlsx.py,sha256=JcQTdV38uiNdyRmHQ1DI6khN8ng4W38tIRaxonIoaHs,2703
14
+ kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
15
+ kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg-2.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
+ kreuzberg-2.1.0.dist-info/METADATA,sha256=t1NeglNqJFjWpr6WeIp-d33OikT_HIrS8FrEMGSk1hA,14844
19
+ kreuzberg-2.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
+ kreuzberg-2.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
+ kreuzberg-2.1.0.dist-info/RECORD,,
@@ -1,21 +0,0 @@
1
- kreuzberg/__init__.py,sha256=3opnj4Q8Ci151QuVqPaM3sCb8mpFIRhZbZUgBmp1LI0,410
2
- kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
3
- kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
4
- kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
- kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
6
- kreuzberg/_pdf.py,sha256=V1TVwPpGyrE0YJqnmW_5kh4Y1qWwZI5SSF-lwT_Bbac,6288
7
- kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
- kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
9
- kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
10
- kreuzberg/_tesseract.py,sha256=xt_4MU7PfN1nZWlWBVQF6zmJnMs9pJq8yWTzPUxTqm0,9240
11
- kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
- kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
- kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
14
- kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
15
- kreuzberg/extraction.py,sha256=1RIs7YaUK0wcOpY1eDcIqh3n-UlJY7ZeulZPdaAxdvo,13345
16
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- kreuzberg-2.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
- kreuzberg-2.0.0.dist-info/METADATA,sha256=cvD9ypz004yHqePKuw8eZZcuZ2lanyN1y2jlB5FMG0Q,14201
19
- kreuzberg-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- kreuzberg-2.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
- kreuzberg-2.0.0.dist-info/RECORD,,