kreuzberg 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those changes as the versions appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,3 +1,4 @@
1
+ from ._tesseract import PSMMode
1
2
  from ._types import ExtractionResult, Metadata
2
3
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
4
  from .extraction import (
@@ -15,6 +16,7 @@ __all__ = [
15
16
  "Metadata",
16
17
  "MissingDependencyError",
17
18
  "OCRError",
19
+ "PSMMode",
18
20
  "ParsingError",
19
21
  "ValidationError",
20
22
  "batch_extract_bytes",
kreuzberg/_constants.py CHANGED
@@ -3,4 +3,6 @@ from __future__ import annotations
3
3
  from multiprocessing import cpu_count
4
4
  from typing import Final
5
5
 
6
- DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
6
+ DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
7
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
8
+ MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
kreuzberg/_html.py CHANGED
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
8
8
  from kreuzberg import ExtractionResult
9
9
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
10
  from kreuzberg._string import normalize_spaces, safe_decode
11
- from kreuzberg._sync import run_sync
12
11
 
13
12
  if TYPE_CHECKING:
14
13
  from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
28
27
  if isinstance(file_path_or_contents, bytes)
29
28
  else await AsyncPath(file_path_or_contents).read_text()
30
29
  )
31
- result = await run_sync(html_to_markdown.convert_to_markdown, content)
30
+ result = html_to_markdown.convert_to_markdown(content)
32
31
  return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_mime_types.py CHANGED
@@ -15,6 +15,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
15
15
  PDF_MIME_TYPE: Final = "application/pdf"
16
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
17
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
+ DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
18
19
  # Excel formats
19
20
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
20
21
  EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
@@ -73,7 +74,7 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
73
74
  "application/epub+zip",
74
75
  "application/rtf",
75
76
  "application/vnd.oasis.opendocument.text",
76
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
77
+ DOCX_MIME_TYPE,
77
78
  "application/x-biblatex",
78
79
  "application/x-bibtex",
79
80
  "application/x-endnote+xml",
@@ -146,7 +147,7 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
146
147
  ".epub": "application/epub+zip",
147
148
  ".rtf": "application/rtf",
148
149
  ".odt": "application/vnd.oasis.opendocument.text",
149
- ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
150
+ ".docx": DOCX_MIME_TYPE,
150
151
  ".bib": "application/x-bibtex",
151
152
  ".ipynb": "application/x-ipynb+json",
152
153
  ".tex": "application/x-latex",
kreuzberg/_pandoc.py CHANGED
@@ -1,21 +1,21 @@
1
1
  from __future__ import annotations
2
2
 
3
- import subprocess
3
+ import re
4
4
  import sys
5
- from functools import partial
6
5
  from json import JSONDecodeError, loads
7
6
  from typing import TYPE_CHECKING, Any, Final, Literal, cast
8
7
 
9
- from anyio import CapacityLimiter, create_task_group, to_process
10
8
  from anyio import Path as AsyncPath
9
+ from anyio import run_process
11
10
 
12
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
11
+ from kreuzberg import ValidationError
12
+ from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
13
13
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
14
14
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
15
+ from kreuzberg._sync import run_taskgroup
16
16
  from kreuzberg._tmp import create_temp_file
17
17
  from kreuzberg._types import ExtractionResult, Metadata
18
- from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
18
+ from kreuzberg.exceptions import MissingDependencyError, ParsingError
19
19
 
20
20
  if TYPE_CHECKING: # pragma: no cover
21
21
  from collections.abc import Mapping
@@ -24,10 +24,8 @@ if TYPE_CHECKING: # pragma: no cover
24
24
  if sys.version_info < (3, 11): # pragma: no cover
25
25
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
26
 
27
-
28
27
  version_ref: Final[dict[str, bool]] = {"checked": False}
29
28
 
30
-
31
29
  # Block-level node types in Pandoc AST
32
30
  BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
33
31
  BLOCK_PARA: Final = "Para" # Paragraph containing inline content
@@ -229,20 +227,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
229
227
 
230
228
 
231
229
  def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
232
- if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
233
- mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
234
- ):
235
- raise ValidationError(
236
- f"Unsupported mime type: {mime_type}",
237
- context={
238
- "mime_type": mime_type,
239
- "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
240
- },
230
+ if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
231
+ return pandoc_type
232
+
233
+ if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
234
+ return next(
235
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
241
236
  )
242
237
 
243
- return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
244
- MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
245
- )
238
+ raise ValidationError(f"Unsupported mime type: {mime_type}")
246
239
 
247
240
 
248
241
  async def _validate_pandoc_version() -> None:
@@ -251,20 +244,19 @@ async def _validate_pandoc_version() -> None:
251
244
  return
252
245
 
253
246
  command = ["pandoc", "--version"]
254
- result = await run_sync(subprocess.run, command, capture_output=True)
255
- version = result.stdout.decode().split("\n")[0].split()[1]
256
- if not version.startswith("3."):
257
- raise MissingDependencyError("Pandoc version 3 or above is required.")
247
+ result = await run_process(command)
248
+
249
+ version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
250
+ if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
251
+ raise MissingDependencyError("Pandoc version 2 or above is required")
258
252
 
259
253
  version_ref["checked"] = True
260
254
 
261
255
  except FileNotFoundError as e:
262
- raise MissingDependencyError("Pandoc is not installed.") from e
256
+ raise MissingDependencyError("Pandoc is not installed") from e
263
257
 
264
258
 
265
- async def _handle_extract_metadata(
266
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
- ) -> Metadata:
259
+ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
268
260
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
261
  metadata_file, unlink = await create_temp_file(".json")
270
262
  try:
@@ -276,15 +268,10 @@ async def _handle_extract_metadata(
276
268
  "--standalone",
277
269
  "--quiet",
278
270
  "--output",
279
- metadata_file,
271
+ str(metadata_file),
280
272
  ]
281
273
 
282
- result = await to_process.run_sync(
283
- partial(subprocess.run, capture_output=True),
284
- command,
285
- cancellable=True,
286
- limiter=CapacityLimiter(max_processes),
287
- )
274
+ result = await run_process(command)
288
275
 
289
276
  if result.returncode != 0:
290
277
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +284,7 @@ async def _handle_extract_metadata(
297
284
  await unlink()
298
285
 
299
286
 
300
- async def _handle_extract_file(
301
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
302
- ) -> str:
287
+ async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
303
288
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
289
  output_path, unlink = await create_temp_file(".md")
305
290
  try:
@@ -315,12 +300,7 @@ async def _handle_extract_file(
315
300
 
316
301
  command.extend(["--output", str(output_path)])
317
302
 
318
- result = await to_process.run_sync(
319
- partial(subprocess.run, capture_output=True),
320
- command,
321
- cancellable=True,
322
- limiter=CapacityLimiter(max_processes),
323
- )
303
+ result = await run_process(command)
324
304
 
325
305
  if result.returncode != 0:
326
306
  raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +314,12 @@ async def _handle_extract_file(
334
314
  await unlink()
335
315
 
336
316
 
337
- async def process_file_with_pandoc(
338
- input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
- ) -> ExtractionResult:
317
+ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
340
318
  """Process a single file using Pandoc and convert to markdown.
341
319
 
342
320
  Args:
343
321
  input_file: The path to the file to process.
344
322
  mime_type: The mime type of the file.
345
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
323
 
347
324
  Raises:
348
325
  ParsingError: If the file data could not be extracted.
@@ -354,41 +331,27 @@ async def process_file_with_pandoc(
354
331
 
355
332
  _get_pandoc_type_from_mime_type(mime_type)
356
333
 
357
- metadata: Metadata = {}
358
- content: str = ""
359
-
360
334
  try:
361
- async with create_task_group() as tg:
362
-
363
- async def _get_metadata() -> None:
364
- nonlocal metadata
365
- metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
-
367
- async def _get_content() -> None:
368
- nonlocal content
369
- content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
370
-
371
- tg.start_soon(_get_metadata)
372
- tg.start_soon(_get_content)
335
+ metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
336
+ content_task = _handle_extract_file(input_file, mime_type=mime_type)
337
+ results = await run_taskgroup(metadata_task, content_task)
338
+ metadata, content = cast(tuple[Metadata, str], results)
339
+
340
+ return ExtractionResult(
341
+ content=normalize_spaces(content),
342
+ metadata=metadata,
343
+ mime_type=MARKDOWN_MIME_TYPE,
344
+ )
373
345
  except ExceptionGroup as eg:
374
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
-
376
- return ExtractionResult(
377
- content=normalize_spaces(content),
378
- metadata=metadata,
379
- mime_type=MARKDOWN_MIME_TYPE,
380
- )
346
+ raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
381
347
 
382
348
 
383
- async def process_content_with_pandoc(
384
- content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
- ) -> ExtractionResult:
349
+ async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
386
350
  """Process content using Pandoc and convert to markdown.
387
351
 
388
352
  Args:
389
353
  content: The content to process.
390
354
  mime_type: The mime type of the content.
391
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
392
355
 
393
356
  Returns:
394
357
  ExtractionResult
@@ -397,7 +360,7 @@ async def process_content_with_pandoc(
397
360
  input_file, unlink = await create_temp_file(f".{extension}")
398
361
 
399
362
  await AsyncPath(input_file).write_bytes(content)
400
- result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
363
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type)
401
364
 
402
365
  await unlink()
403
366
  return result
kreuzberg/_pdf.py CHANGED
@@ -24,32 +24,36 @@ if TYPE_CHECKING: # pragma: no cover
24
24
  # - Control and non-printable characters
25
25
  # - Unicode replacement and invalid characters
26
26
  # - Zero-width spaces and other invisible characters
27
- CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
28
- r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
29
- )
27
+ CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
28
+ SHORT_TEXT_THRESHOLD: Final[int] = 50
29
+ MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
30
30
 
31
31
 
32
- def _validate_extracted_text(text: str) -> bool:
32
+ def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
33
33
  """Check if text extracted from PDF is valid or corrupted.
34
34
 
35
- This checks for common indicators of corrupted PDF text extraction:
35
+ This checks for indicators of corrupted PDF text extraction:
36
36
  1. Empty or whitespace-only text
37
- 2. Control characters and other non-printable characters
38
- 3. Unicode replacement characters
39
- 4. Zero-width spaces and other invisible characters
37
+ 2. High concentration of control characters and null bytes
38
+ 3. High concentration of Unicode replacement characters
40
39
 
41
40
  Args:
42
41
  text: The extracted text to validate
42
+ corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
43
+ characters (default: 0.05 or 5%)
43
44
 
44
45
  Returns:
45
46
  True if the text appears valid, False if it seems corrupted
46
47
  """
47
- # Check for empty or whitespace-only text
48
48
  if not text or not text.strip():
49
49
  return False
50
50
 
51
- # Check for corruption indicators
52
- return not bool(CORRUPTED_PATTERN.search(text))
51
+ corruption_matches = CORRUPTED_PATTERN.findall(text)
52
+
53
+ if len(text) < SHORT_TEXT_THRESHOLD:
54
+ return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
55
+
56
+ return (len(corruption_matches) / len(text)) < corruption_threshold
53
57
 
54
58
 
55
59
  async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
@@ -67,7 +71,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
67
71
  document: pypdfium2.PdfDocument | None = None
68
72
  try:
69
73
  document = await run_sync(pypdfium2.PdfDocument, str(input_file))
70
- return [page.render(scale=2.0).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
74
+ return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
71
75
  except pypdfium2.PdfiumError as e:
72
76
  raise ParsingError(
73
77
  "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -148,13 +152,10 @@ async def extract_pdf_file(
148
152
  Returns:
149
153
  The extracted text.
150
154
  """
151
- if (
152
- not force_ocr
153
- and (content := await _extract_pdf_searchable_text(input_file))
154
- and _validate_extracted_text(content)
155
- ):
156
- return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
157
-
155
+ if not force_ocr:
156
+ content = await _extract_pdf_searchable_text(input_file)
157
+ if _validate_extracted_text(content):
158
+ return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
158
159
  return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
160
 
160
161
 
kreuzberg/_string.py CHANGED
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
22
22
  encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
23
23
 
24
24
  for enc in [e for e in encodings if e]: # pragma: no cover
25
- with suppress(UnicodeDecodeError):
25
+ with suppress(UnicodeDecodeError, LookupError):
26
26
  return byte_data.decode(enc)
27
27
 
28
28
  # If all encodings fail, fall back to latin-1 which can handle any byte
kreuzberg/_sync.py CHANGED
@@ -2,12 +2,13 @@ from __future__ import annotations
2
2
 
3
3
  import sys
4
4
  from functools import partial
5
- from typing import TYPE_CHECKING, TypeVar, cast
5
+ from typing import TYPE_CHECKING, Any, TypeVar, cast
6
6
 
7
+ from anyio import create_task_group
7
8
  from anyio.to_thread import run_sync as any_io_run_sync
8
9
 
9
10
  if TYPE_CHECKING: # pragma: no cover
10
- from collections.abc import Callable
11
+ from collections.abc import Awaitable, Callable
11
12
 
12
13
  if sys.version_info >= (3, 10):
13
14
  from typing import ParamSpec
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
30
31
  The result of the synchronous function.
31
32
  """
32
33
  handler = partial(sync_fn, **kwargs)
33
- return cast(T, await any_io_run_sync(handler, *args)) # pyright: ignore [reportCallIssue]
34
+ return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
35
+
36
+
37
+ async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
38
+ """Run a list of coroutines concurrently.
39
+
40
+ Args:
41
+ *async_tasks: The list of coroutines to run.
42
+
43
+ Returns:
44
+ The results of the coroutines.
45
+ """
46
+ results: list[Any] = [None] * len(async_tasks)
47
+
48
+ async def run_task(index: int, task: Awaitable[T]) -> None:
49
+ results[index] = await task
50
+
51
+ async with create_task_group() as tg:
52
+ for i, t in enumerate(async_tasks):
53
+ tg.start_soon(run_task, i, t)
54
+
55
+ return results
56
+
57
+
58
+ async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
59
+ """Run a list of coroutines concurrently in batches.
60
+
61
+ Args:
62
+ *async_tasks: The list of coroutines to run.
63
+ batch_size: The size of each batch.
64
+
65
+ Returns:
66
+ The results of the coroutines.
67
+ """
68
+ results: list[Any] = []
69
+
70
+ for i in range(0, len(async_tasks), batch_size):
71
+ batch = async_tasks[i : i + batch_size]
72
+ results.extend(await run_taskgroup(*batch))
73
+
74
+ return results
kreuzberg/_tesseract.py CHANGED
@@ -1,30 +1,26 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
- import subprocess
5
4
  import sys
6
5
  from enum import Enum
7
- from functools import partial
8
6
  from os import PathLike
9
- from typing import Final, TypeVar, Union, cast
7
+ from typing import Any, TypeVar, Union
10
8
 
11
- from anyio import CapacityLimiter, create_task_group, to_process
12
9
  from anyio import Path as AsyncPath
10
+ from anyio import run_process
13
11
  from PIL.Image import Image
14
12
 
15
- from kreuzberg import ExtractionResult, ParsingError
16
- from kreuzberg._constants import DEFAULT_MAX_PROCESSES
13
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
17
14
  from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
18
15
  from kreuzberg._string import normalize_spaces
19
- from kreuzberg._sync import run_sync
16
+ from kreuzberg._sync import run_sync, run_taskgroup_batched
20
17
  from kreuzberg._tmp import create_temp_file
21
- from kreuzberg.exceptions import MissingDependencyError, OCRError
18
+ from kreuzberg._types import ExtractionResult
19
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
22
20
 
23
21
  if sys.version_info < (3, 11): # pragma: no cover
24
22
  from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
25
23
 
26
- MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
27
-
28
24
  version_ref = {"checked": False}
29
25
 
30
26
  T = TypeVar("T", bound=Union[Image, PathLike[str], str])
@@ -68,14 +64,16 @@ async def validate_tesseract_version() -> None:
68
64
  return
69
65
 
70
66
  command = ["tesseract", "--version"]
71
- result = await run_sync(subprocess.run, command, capture_output=True)
72
- version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
67
+ result = await run_process(command)
68
+ version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
73
69
  if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
74
70
  raise MissingDependencyError("Tesseract version 5 or above is required.")
75
71
 
76
72
  version_ref["checked"] = True
77
73
  except FileNotFoundError as e:
78
- raise MissingDependencyError("Tesseract is not installed.") from e
74
+ raise MissingDependencyError(
75
+ "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
76
+ ) from e
79
77
 
80
78
 
81
79
  async def process_file(
@@ -83,7 +81,6 @@ async def process_file(
83
81
  *,
84
82
  language: str,
85
83
  psm: PSMMode,
86
- max_processes: int = DEFAULT_MAX_PROCESSES,
87
84
  ) -> ExtractionResult:
88
85
  """Process a single image file using Tesseract OCR.
89
86
 
@@ -91,7 +88,6 @@ async def process_file(
91
88
  input_file: The path to the image file to process.
92
89
  language: The language code for OCR.
93
90
  psm: Page segmentation mode.
94
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
95
91
 
96
92
  Raises:
97
93
  OCRError: If OCR fails to extract text from the image.
@@ -102,6 +98,7 @@ async def process_file(
102
98
  output_path, unlink = await create_temp_file(".txt")
103
99
  try:
104
100
  output_base = str(output_path).replace(".txt", "")
101
+
105
102
  command = [
106
103
  "tesseract",
107
104
  str(input_file),
@@ -110,22 +107,44 @@ async def process_file(
110
107
  language,
111
108
  "--psm",
112
109
  str(psm.value),
110
+ "--oem",
111
+ "1",
112
+ "--loglevel",
113
+ "OFF",
114
+ "-c",
115
+ "thresholding_method=1",
116
+ "-c",
117
+ "tessedit_enable_dict_correction=1",
118
+ "-c",
119
+ "language_model_ngram_on=1",
120
+ "-c",
121
+ "textord_space_size_is_variable=1",
122
+ "-c",
123
+ "classify_use_pre_adapted_templates=1",
124
+ "-c",
125
+ "tessedit_dont_blkrej_good_wds=1",
126
+ "-c",
127
+ "tessedit_dont_rowrej_good_wds=1",
128
+ "-c",
129
+ "tessedit_use_primary_params_model=1",
113
130
  ]
114
131
 
115
- result = await to_process.run_sync(
116
- partial(subprocess.run, capture_output=True),
117
- command,
118
- limiter=CapacityLimiter(max_processes),
119
- cancellable=True,
120
- )
132
+ env: dict[str, Any] | None = None
133
+ if sys.platform.startswith("linux"):
134
+ env = {"OMP_THREAD_LIMIT": "1"}
135
+
136
+ result = await run_process(command, env=env)
121
137
 
122
138
  if not result.returncode == 0:
123
- raise OCRError("OCR failed with a non-0 return code.")
139
+ raise OCRError(
140
+ "OCR failed with a non-0 return code.",
141
+ context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
142
+ )
124
143
 
125
144
  output = await AsyncPath(output_path).read_text("utf-8")
126
145
  return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
127
146
  except (RuntimeError, OSError) as e:
128
- raise OCRError("Failed to OCR using tesseract") from e
147
+ raise OCRError(f"Failed to OCR using tesseract: {e}") from e
129
148
  finally:
130
149
  await unlink()
131
150
 
@@ -135,7 +154,6 @@ async def process_image(
135
154
  *,
136
155
  language: str,
137
156
  psm: PSMMode,
138
- max_processes: int = DEFAULT_MAX_PROCESSES,
139
157
  ) -> ExtractionResult:
140
158
  """Process a single Pillow Image using Tesseract OCR.
141
159
 
@@ -143,14 +161,13 @@ async def process_image(
143
161
  image: The Pillow Image to process.
144
162
  language: The language code for OCR.
145
163
  psm: Page segmentation mode.
146
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
147
164
 
148
165
  Returns:
149
166
  ExtractionResult: The extracted text from the image.
150
167
  """
151
168
  image_path, unlink = await create_temp_file(".png")
152
169
  await run_sync(image.save, str(image_path), format="PNG")
153
- result = await process_file(image_path, language=language, psm=psm, max_processes=max_processes)
170
+ result = await process_file(image_path, language=language, psm=psm)
154
171
  await unlink()
155
172
  return result
156
173
 
@@ -160,7 +177,6 @@ async def process_image_with_tesseract(
160
177
  *,
161
178
  language: str = "eng",
162
179
  psm: PSMMode = PSMMode.AUTO,
163
- max_processes: int = DEFAULT_MAX_PROCESSES,
164
180
  ) -> ExtractionResult:
165
181
  """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
166
182
 
@@ -168,7 +184,6 @@ async def process_image_with_tesseract(
168
184
  image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
169
185
  language: The language code for OCR (default: "eng").
170
186
  psm: Page segmentation mode (default: PSMMode.AUTO).
171
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
172
187
 
173
188
  Raises:
174
189
  ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -179,10 +194,10 @@ async def process_image_with_tesseract(
179
194
  await validate_tesseract_version()
180
195
 
181
196
  if isinstance(image, Image):
182
- return await process_image(image, language=language, psm=psm, max_processes=max_processes)
197
+ return await process_image(image, language=language, psm=psm)
183
198
 
184
199
  if isinstance(image, (PathLike, str)):
185
- return await process_file(image, language=language, psm=psm, max_processes=max_processes)
200
+ return await process_file(image, language=language, psm=psm)
186
201
 
187
202
  raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
188
203
 
@@ -200,7 +215,7 @@ async def batch_process_images(
200
215
  images: A list of Pillow Images, paths or strings to process.
201
216
  language: The language code for OCR (default: "eng").
202
217
  psm: Page segmentation mode (default: PSMMode.AUTO).
203
- max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
218
+ max_processes: Maximum number of concurrent processes (default: CPU count / 2).
204
219
 
205
220
  Raises:
206
221
  ParsingError: If OCR fails to extract text from any of the images.
@@ -209,17 +224,8 @@ async def batch_process_images(
209
224
  List of ExtractionResult objects, one per input image.
210
225
  """
211
226
  await validate_tesseract_version()
212
- results = cast(list[ExtractionResult], list(range(len(images))))
213
-
214
- async def _process_image(index: int, image: T) -> None:
215
- results[index] = await process_image_with_tesseract(
216
- image, language=language, psm=psm, max_processes=max_processes
217
- )
218
-
219
227
  try:
220
- async with create_task_group() as tg:
221
- for i, image in enumerate(images):
222
- tg.start_soon(_process_image, i, image)
223
- return results
228
+ tasks = [process_image_with_tesseract(image, language=language, psm=psm) for image in images]
229
+ return await run_taskgroup_batched(*tasks, batch_size=max_processes)
224
230
  except ExceptionGroup as eg:
225
- raise ParsingError("Failed to process images with Tesseract") from eg
231
+ raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
kreuzberg/_xlsx.py CHANGED
@@ -1,23 +1,46 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import csv
4
+ import sys
4
5
  from io import StringIO
5
- from typing import TYPE_CHECKING, cast
6
+ from typing import TYPE_CHECKING
6
7
 
7
8
  from anyio import Path as AsyncPath
8
- from anyio import create_task_group
9
9
  from python_calamine import CalamineWorkbook
10
10
 
11
11
  from kreuzberg import ExtractionResult, ParsingError
12
12
  from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
13
13
  from kreuzberg._pandoc import process_file_with_pandoc
14
14
  from kreuzberg._string import normalize_spaces
15
- from kreuzberg._sync import run_sync
15
+ from kreuzberg._sync import run_sync, run_taskgroup
16
16
  from kreuzberg._tmp import create_temp_file
17
17
 
18
18
  if TYPE_CHECKING: # pragma: no cover
19
19
  from pathlib import Path
20
20
 
21
+ if sys.version_info < (3, 11): # pragma: no cover
22
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
23
+
24
+
25
+ async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
26
+ values = workbook.get_sheet_by_name(sheet_name).to_python()
27
+
28
+ csv_buffer = StringIO()
29
+ writer = csv.writer(csv_buffer)
30
+
31
+ for row in values:
32
+ writer.writerow(row)
33
+
34
+ csv_data = csv_buffer.getvalue()
35
+ csv_buffer.close()
36
+
37
+ csv_path, unlink = await create_temp_file(".csv")
38
+ await AsyncPath(csv_path).write_text(csv_data)
39
+
40
+ result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
41
+ await unlink()
42
+ return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
43
+
21
44
 
22
45
  async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
23
46
  """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +56,19 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
33
56
  """
34
57
  try:
35
58
  workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
36
-
37
- results = cast(list[str], [None] * len(workbook.sheet_names))
38
-
39
- async def convert_sheet_to_text(sheet_name: str) -> None:
40
- nonlocal results
41
- values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
42
-
43
- csv_buffer = StringIO()
44
- writer = csv.writer(csv_buffer)
45
-
46
- for row in values:
47
- writer.writerow(row)
48
-
49
- csv_data = csv_buffer.getvalue()
50
- csv_buffer.close()
51
-
52
- from kreuzberg._tmp import create_temp_file
53
-
54
- csv_path, unlink = await create_temp_file(".csv")
55
- await AsyncPath(csv_path).write_text(csv_data)
56
- result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
57
- results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
58
- await unlink()
59
-
60
- async with create_task_group() as tg:
61
- for sheet_name in workbook.sheet_names:
62
- tg.start_soon(convert_sheet_to_text, sheet_name)
59
+ tasks = [convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]
60
+ results: list[str] = await run_taskgroup(*tasks)
63
61
 
64
62
  return ExtractionResult(
65
63
  content="\n\n".join(results),
66
64
  mime_type=MARKDOWN_MIME_TYPE,
67
65
  metadata={},
68
66
  )
69
- except Exception as e:
67
+ except ExceptionGroup as eg:
70
68
  raise ParsingError(
71
- "Could not extract text from XLSX",
72
- context={
73
- "error": str(e),
74
- },
75
- ) from e
69
+ "Failed to extract file data",
70
+ context={"file": str(input_file), "errors": eg.exceptions},
71
+ ) from eg
76
72
 
77
73
 
78
74
  async def extract_xlsx_content(content: bytes) -> ExtractionResult:
kreuzberg/exceptions.py CHANGED
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
14
14
  self.context = context
15
15
  super().__init__(message)
16
16
 
17
+ def _serialize_context(self, obj: Any) -> Any:
18
+ """Recursively serialize context objects to ensure JSON compatibility."""
19
+ if isinstance(obj, bytes):
20
+ return obj.decode("utf-8", errors="replace")
21
+ if isinstance(obj, dict):
22
+ return {k: self._serialize_context(v) for k, v in obj.items()}
23
+ if isinstance(obj, (list, tuple)):
24
+ return [self._serialize_context(x) for x in obj]
25
+ if isinstance(obj, Exception):
26
+ return {
27
+ "type": obj.__class__.__name__,
28
+ "message": str(obj),
29
+ }
30
+ return obj
31
+
17
32
  def __str__(self) -> str:
18
33
  """Return a string representation of the exception."""
19
- ctx = f"\n\nContext: {dumps(self.context)}" if self.context else ""
34
+ if self.context:
35
+ serialized_context = self._serialize_context(self.context)
36
+ ctx = f"\n\nContext: {dumps(serialized_context)}"
37
+ else:
38
+ ctx = ""
20
39
 
21
40
  return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
22
41
 
kreuzberg/extraction.py CHANGED
@@ -87,14 +87,12 @@ async def extract_bytes(
87
87
  return await extract_xlsx_content(content)
88
88
 
89
89
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
90
- return await process_image_with_tesseract(
91
- open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
92
- )
90
+ return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
93
91
 
94
92
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
95
93
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
96
94
  ):
97
- return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
95
+ return await process_content_with_pandoc(content=content, mime_type=mime_type)
98
96
 
99
97
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
100
98
  return await extract_pptx_file_content(content)
@@ -150,12 +148,12 @@ async def extract_file(
150
148
  return await extract_xlsx_file(Path(input_file))
151
149
 
152
150
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
153
- return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
151
+ return await process_image_with_tesseract(input_file, psm=psm, language=language)
154
152
 
155
153
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
156
154
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
157
155
  ):
158
- return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
156
+ return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
159
157
 
160
158
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
161
159
  return await extract_pptx_file_content(Path(input_file))
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 2.0.1
3
+ Version: 2.1.1
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -31,7 +31,7 @@ Requires-Dist: html-to-markdown>=1.2.0
31
31
  Requires-Dist: pypdfium2>=4.30.1
32
32
  Requires-Dist: python-calamine>=0.3.1
33
33
  Requires-Dist: python-pptx>=1.0.2
34
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.11"
34
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
35
35
 
36
36
  # Kreuzberg
37
37
 
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
42
42
  - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
43
43
  - **Local Processing**: No external API calls or cloud dependencies required
44
44
  - **Resource Efficient**: Lightweight processing without GPU requirements
45
- - **Lightweight**: Has few curated dependencies and a minimal footprint
45
+ - **Small Package Size**: Has few curated dependencies and a minimal footprint
46
46
  - **Format Support**: Comprehensive support for documents, images, and text formats
47
47
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
48
48
  - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,8 +61,8 @@ pip install kreuzberg
61
61
 
62
62
  Kreuzberg requires two system-level dependencies:
63
63
 
64
- - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
65
- - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
64
+ - [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
65
+ - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
66
66
 
67
67
  You can install these with:
68
68
 
@@ -75,7 +75,6 @@ sudo apt-get install pandoc tesseract-ocr
75
75
  #### MacOS
76
76
 
77
77
  ```shell
78
- # MacOS
79
78
  brew install tesseract pandoc
80
79
  ```
81
80
 
@@ -191,19 +190,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
191
190
 
192
191
  #### Processing Configuration
193
192
 
194
- - `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
195
-
196
- Notes:
197
-
198
- - Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
193
+ - `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
199
194
 
200
195
  ### Quick Start
201
196
 
202
197
  ```python
203
198
  from pathlib import Path
204
199
  from kreuzberg import extract_file
205
- from kreuzberg.extraction import ExtractionResult
206
- from kreuzberg._tesseract import PSMMode
200
+ from kreuzberg import ExtractionResult
201
+ from kreuzberg import PSMMode
207
202
 
208
203
 
209
204
  # Basic file extraction
@@ -232,7 +227,7 @@ async def extract_document():
232
227
 
233
228
  ```python
234
229
  from kreuzberg import extract_bytes
235
- from kreuzberg.extraction import ExtractionResult
230
+ from kreuzberg import ExtractionResult
236
231
 
237
232
 
238
233
  async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -378,8 +373,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
378
373
  Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
379
374
 
380
375
  ```python
381
- from kreuzberg import extract_file
382
- from kreuzberg.exceptions import (
376
+ from kreuzberg import (
377
+ extract_file,
383
378
  ValidationError,
384
379
  ParsingError,
385
380
  OCRError,
@@ -0,0 +1,21 @@
1
+ kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
2
+ kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
3
+ kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
4
+ kreuzberg/_mime_types.py,sha256=Kuu0yWY4p0Eck8b_vdp9oamqRZc1RJaS_ZKikVD2Z2o,6431
5
+ kreuzberg/_pandoc.py,sha256=YIXaFC11N2tgVHjBd3JD_21GZ6OOVQ0UY3aKrWNfK-I,12531
6
+ kreuzberg/_pdf.py,sha256=AIwxlydZkJOU4878SaeF9cKUmzSN7o3X40Hye7z017U,6479
7
+ kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
+ kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
9
+ kreuzberg/_sync.py,sha256=sDVH4GrpYW9SOnmu3BqKPL76xl0hxzHjTAC78aovbQA,2122
10
+ kreuzberg/_tesseract.py,sha256=0BkguZJIKlOFHkrN2mjVgaycWwolmuEv6DwpQY7n7Os,7610
11
+ kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
+ kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
+ kreuzberg/_xlsx.py,sha256=kSH7PJ33vdLgoh5LmL_bqbc4I0VgZlZUeF4ckKl6NJM,2675
14
+ kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
15
+ kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
16
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
+ kreuzberg-2.1.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
+ kreuzberg-2.1.1.dist-info/METADATA,sha256=tWRsv1bx9os2dQnU5KrQpUd4fNeQ4x-J2fXWKdcuQAA,14842
19
+ kreuzberg-2.1.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
20
+ kreuzberg-2.1.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
+ kreuzberg-2.1.1.dist-info/RECORD,,
@@ -1,5 +1,5 @@
1
1
  Wheel-Version: 1.0
2
- Generator: setuptools (75.8.0)
2
+ Generator: setuptools (75.8.2)
3
3
  Root-Is-Purelib: true
4
4
  Tag: py3-none-any
5
5
 
@@ -1,21 +0,0 @@
1
- kreuzberg/__init__.py,sha256=CBRHXPhjdslaSXaUjZO5V0k57uz5_x12cwo0HTtxOcU,647
2
- kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
3
- kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
4
- kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
5
- kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
6
- kreuzberg/_pdf.py,sha256=9YErIrRvMMFXKHckXzBDCEMzDAEnC0JVOR38gFhvHKQ,6227
7
- kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
8
- kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
9
- kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
10
- kreuzberg/_tesseract.py,sha256=SZsv0gFWvzR8iLaMyGr4Oc0lXE7atCR3sNxXR7TQzEE,7686
11
- kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
12
- kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
13
- kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
14
- kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
15
- kreuzberg/extraction.py,sha256=kuEKvOGhPBRcFeGX7eKmup9BukX6o55740F_KdZ15qQ,13214
16
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
17
- kreuzberg-2.0.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
18
- kreuzberg-2.0.1.dist-info/METADATA,sha256=KmKLubQ89i0_JwpK96kYbhuq1MuucrqHe2bCLNcbyic,15023
19
- kreuzberg-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
20
- kreuzberg-2.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
21
- kreuzberg-2.0.1.dist-info/RECORD,,