kreuzberg 2.0.1__py3-none-any.whl → 2.1.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +2 -0
- kreuzberg/_constants.py +3 -1
- kreuzberg/_html.py +1 -2
- kreuzberg/_mime_types.py +3 -2
- kreuzberg/_pandoc.py +38 -75
- kreuzberg/_pdf.py +20 -19
- kreuzberg/_string.py +1 -1
- kreuzberg/_sync.py +44 -3
- kreuzberg/_tesseract.py +49 -43
- kreuzberg/_xlsx.py +32 -36
- kreuzberg/exceptions.py +20 -1
- kreuzberg/extraction.py +4 -6
- {kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/METADATA +11 -16
- kreuzberg-2.1.1.dist-info/RECORD +21 -0
- {kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/WHEEL +1 -1
- kreuzberg-2.0.1.dist-info/RECORD +0 -21
- {kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/LICENSE +0 -0
- {kreuzberg-2.0.1.dist-info → kreuzberg-2.1.1.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,3 +1,4 @@
|
|
1
|
+
from ._tesseract import PSMMode
|
1
2
|
from ._types import ExtractionResult, Metadata
|
2
3
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
3
4
|
from .extraction import (
|
@@ -15,6 +16,7 @@ __all__ = [
|
|
15
16
|
"Metadata",
|
16
17
|
"MissingDependencyError",
|
17
18
|
"OCRError",
|
19
|
+
"PSMMode",
|
18
20
|
"ParsingError",
|
19
21
|
"ValidationError",
|
20
22
|
"batch_extract_bytes",
|
kreuzberg/_constants.py
CHANGED
@@ -3,4 +3,6 @@ from __future__ import annotations
|
|
3
3
|
from multiprocessing import cpu_count
|
4
4
|
from typing import Final
|
5
5
|
|
6
|
-
DEFAULT_MAX_PROCESSES: Final[int] =
|
6
|
+
DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
|
7
|
+
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
8
|
+
MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
|
kreuzberg/_html.py
CHANGED
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
|
|
8
8
|
from kreuzberg import ExtractionResult
|
9
9
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
10
10
|
from kreuzberg._string import normalize_spaces, safe_decode
|
11
|
-
from kreuzberg._sync import run_sync
|
12
11
|
|
13
12
|
if TYPE_CHECKING:
|
14
13
|
from pathlib import Path
|
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
|
|
28
27
|
if isinstance(file_path_or_contents, bytes)
|
29
28
|
else await AsyncPath(file_path_or_contents).read_text()
|
30
29
|
)
|
31
|
-
result =
|
30
|
+
result = html_to_markdown.convert_to_markdown(content)
|
32
31
|
return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
kreuzberg/_mime_types.py
CHANGED
@@ -15,6 +15,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
|
15
15
|
PDF_MIME_TYPE: Final = "application/pdf"
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
|
+
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
18
19
|
# Excel formats
|
19
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
20
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
@@ -73,7 +74,7 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
|
73
74
|
"application/epub+zip",
|
74
75
|
"application/rtf",
|
75
76
|
"application/vnd.oasis.opendocument.text",
|
76
|
-
|
77
|
+
DOCX_MIME_TYPE,
|
77
78
|
"application/x-biblatex",
|
78
79
|
"application/x-bibtex",
|
79
80
|
"application/x-endnote+xml",
|
@@ -146,7 +147,7 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
|
|
146
147
|
".epub": "application/epub+zip",
|
147
148
|
".rtf": "application/rtf",
|
148
149
|
".odt": "application/vnd.oasis.opendocument.text",
|
149
|
-
".docx":
|
150
|
+
".docx": DOCX_MIME_TYPE,
|
150
151
|
".bib": "application/x-bibtex",
|
151
152
|
".ipynb": "application/x-ipynb+json",
|
152
153
|
".tex": "application/x-latex",
|
kreuzberg/_pandoc.py
CHANGED
@@ -1,21 +1,21 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import
|
3
|
+
import re
|
4
4
|
import sys
|
5
|
-
from functools import partial
|
6
5
|
from json import JSONDecodeError, loads
|
7
6
|
from typing import TYPE_CHECKING, Any, Final, Literal, cast
|
8
7
|
|
9
|
-
from anyio import CapacityLimiter, create_task_group, to_process
|
10
8
|
from anyio import Path as AsyncPath
|
9
|
+
from anyio import run_process
|
11
10
|
|
12
|
-
from kreuzberg
|
11
|
+
from kreuzberg import ValidationError
|
12
|
+
from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
|
13
13
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
14
14
|
from kreuzberg._string import normalize_spaces
|
15
|
-
from kreuzberg._sync import
|
15
|
+
from kreuzberg._sync import run_taskgroup
|
16
16
|
from kreuzberg._tmp import create_temp_file
|
17
17
|
from kreuzberg._types import ExtractionResult, Metadata
|
18
|
-
from kreuzberg.exceptions import MissingDependencyError, ParsingError
|
18
|
+
from kreuzberg.exceptions import MissingDependencyError, ParsingError
|
19
19
|
|
20
20
|
if TYPE_CHECKING: # pragma: no cover
|
21
21
|
from collections.abc import Mapping
|
@@ -24,10 +24,8 @@ if TYPE_CHECKING: # pragma: no cover
|
|
24
24
|
if sys.version_info < (3, 11): # pragma: no cover
|
25
25
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
26
26
|
|
27
|
-
|
28
27
|
version_ref: Final[dict[str, bool]] = {"checked": False}
|
29
28
|
|
30
|
-
|
31
29
|
# Block-level node types in Pandoc AST
|
32
30
|
BLOCK_HEADER: Final = "Header" # Header with level, attributes and inline content
|
33
31
|
BLOCK_PARA: Final = "Para" # Paragraph containing inline content
|
@@ -229,20 +227,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
|
229
227
|
|
230
228
|
|
231
229
|
def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
    """Resolve the Pandoc input format name for a mime type.

    The lookup is attempted in this order:

    1. Exact match in ``MIMETYPE_TO_PANDOC_TYPE_MAPPING``.
    2. Exact match after dropping mime type parameters, so that a value such as
       ``"text/csv; charset=utf-8"`` resolves the same way as ``"text/csv"``.
    3. The first known mime type that starts with ``mime_type`` (the original
       prefix match, kept for backward compatibility).

    Args:
        mime_type: The mime type to resolve.

    Raises:
        ValidationError: If no Pandoc format is known for the mime type.

    Returns:
        The Pandoc format name mapped to the mime type.
    """
    if pandoc_type := MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, ""):
        return pandoc_type

    # Callers dispatch with `mime_type.startswith(supported)`, so values that carry
    # parameters reach this function; resolve them through their bare type.
    base_type = mime_type.split(";", 1)[0].strip()
    if base_type != mime_type and (pandoc_type := MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(base_type, "")):
        return pandoc_type

    # Single pass instead of an `any(...)` scan followed by an identical `next(...)` scan.
    prefix_match = next(
        (value for key, value in MIMETYPE_TO_PANDOC_TYPE_MAPPING.items() if key.startswith(mime_type)),
        None,
    )
    if prefix_match is not None:
        return prefix_match

    raise ValidationError(f"Unsupported mime type: {mime_type}")
|
246
239
|
|
247
240
|
|
248
241
|
async def _validate_pandoc_version() -> None:
|
@@ -251,20 +244,19 @@ async def _validate_pandoc_version() -> None:
|
|
251
244
|
return
|
252
245
|
|
253
246
|
command = ["pandoc", "--version"]
|
254
|
-
result = await
|
255
|
-
|
256
|
-
|
257
|
-
|
247
|
+
result = await run_process(command)
|
248
|
+
|
249
|
+
version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
|
250
|
+
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
|
251
|
+
raise MissingDependencyError("Pandoc version 2 or above is required")
|
258
252
|
|
259
253
|
version_ref["checked"] = True
|
260
254
|
|
261
255
|
except FileNotFoundError as e:
|
262
|
-
raise MissingDependencyError("Pandoc is not installed
|
256
|
+
raise MissingDependencyError("Pandoc is not installed") from e
|
263
257
|
|
264
258
|
|
265
|
-
async def _handle_extract_metadata(
|
266
|
-
input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
|
267
|
-
) -> Metadata:
|
259
|
+
async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
|
268
260
|
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
|
269
261
|
metadata_file, unlink = await create_temp_file(".json")
|
270
262
|
try:
|
@@ -276,15 +268,10 @@ async def _handle_extract_metadata(
|
|
276
268
|
"--standalone",
|
277
269
|
"--quiet",
|
278
270
|
"--output",
|
279
|
-
metadata_file,
|
271
|
+
str(metadata_file),
|
280
272
|
]
|
281
273
|
|
282
|
-
result = await
|
283
|
-
partial(subprocess.run, capture_output=True),
|
284
|
-
command,
|
285
|
-
cancellable=True,
|
286
|
-
limiter=CapacityLimiter(max_processes),
|
287
|
-
)
|
274
|
+
result = await run_process(command)
|
288
275
|
|
289
276
|
if result.returncode != 0:
|
290
277
|
raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
|
@@ -297,9 +284,7 @@ async def _handle_extract_metadata(
|
|
297
284
|
await unlink()
|
298
285
|
|
299
286
|
|
300
|
-
async def _handle_extract_file(
|
301
|
-
input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
|
302
|
-
) -> str:
|
287
|
+
async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
|
303
288
|
pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
|
304
289
|
output_path, unlink = await create_temp_file(".md")
|
305
290
|
try:
|
@@ -315,12 +300,7 @@ async def _handle_extract_file(
|
|
315
300
|
|
316
301
|
command.extend(["--output", str(output_path)])
|
317
302
|
|
318
|
-
result = await
|
319
|
-
partial(subprocess.run, capture_output=True),
|
320
|
-
command,
|
321
|
-
cancellable=True,
|
322
|
-
limiter=CapacityLimiter(max_processes),
|
323
|
-
)
|
303
|
+
result = await run_process(command)
|
324
304
|
|
325
305
|
if result.returncode != 0:
|
326
306
|
raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
|
@@ -334,15 +314,12 @@ async def _handle_extract_file(
|
|
334
314
|
await unlink()
|
335
315
|
|
336
316
|
|
337
|
-
async def process_file_with_pandoc(
|
338
|
-
input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
|
339
|
-
) -> ExtractionResult:
|
317
|
+
async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
|
340
318
|
"""Process a single file using Pandoc and convert to markdown.
|
341
319
|
|
342
320
|
Args:
|
343
321
|
input_file: The path to the file to process.
|
344
322
|
mime_type: The mime type of the file.
|
345
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
346
323
|
|
347
324
|
Raises:
|
348
325
|
ParsingError: If the file data could not be extracted.
|
@@ -354,41 +331,27 @@ async def process_file_with_pandoc(
|
|
354
331
|
|
355
332
|
_get_pandoc_type_from_mime_type(mime_type)
|
356
333
|
|
357
|
-
metadata: Metadata = {}
|
358
|
-
content: str = ""
|
359
|
-
|
360
334
|
try:
|
361
|
-
|
362
|
-
|
363
|
-
|
364
|
-
|
365
|
-
|
366
|
-
|
367
|
-
|
368
|
-
|
369
|
-
|
370
|
-
|
371
|
-
tg.start_soon(_get_metadata)
|
372
|
-
tg.start_soon(_get_content)
|
335
|
+
metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
|
336
|
+
content_task = _handle_extract_file(input_file, mime_type=mime_type)
|
337
|
+
results = await run_taskgroup(metadata_task, content_task)
|
338
|
+
metadata, content = cast(tuple[Metadata, str], results)
|
339
|
+
|
340
|
+
return ExtractionResult(
|
341
|
+
content=normalize_spaces(content),
|
342
|
+
metadata=metadata,
|
343
|
+
mime_type=MARKDOWN_MIME_TYPE,
|
344
|
+
)
|
373
345
|
except ExceptionGroup as eg:
|
374
|
-
raise ParsingError("Failed to
|
375
|
-
|
376
|
-
return ExtractionResult(
|
377
|
-
content=normalize_spaces(content),
|
378
|
-
metadata=metadata,
|
379
|
-
mime_type=MARKDOWN_MIME_TYPE,
|
380
|
-
)
|
346
|
+
raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
|
381
347
|
|
382
348
|
|
383
|
-
async def process_content_with_pandoc(
|
384
|
-
content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
|
385
|
-
) -> ExtractionResult:
|
349
|
+
async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
|
386
350
|
"""Process content using Pandoc and convert to markdown.
|
387
351
|
|
388
352
|
Args:
|
389
353
|
content: The content to process.
|
390
354
|
mime_type: The mime type of the content.
|
391
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
392
355
|
|
393
356
|
Returns:
|
394
357
|
ExtractionResult
|
@@ -397,7 +360,7 @@ async def process_content_with_pandoc(
|
|
397
360
|
input_file, unlink = await create_temp_file(f".{extension}")
|
398
361
|
|
399
362
|
await AsyncPath(input_file).write_bytes(content)
|
400
|
-
result = await process_file_with_pandoc(input_file, mime_type=mime_type
|
363
|
+
result = await process_file_with_pandoc(input_file, mime_type=mime_type)
|
401
364
|
|
402
365
|
await unlink()
|
403
366
|
return result
|
kreuzberg/_pdf.py
CHANGED
@@ -24,32 +24,36 @@ if TYPE_CHECKING: # pragma: no cover
|
|
24
24
|
# - Control and non-printable characters
|
25
25
|
# - Unicode replacement and invalid characters
|
26
26
|
# - Zero-width spaces and other invisible characters
|
27
|
-
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
|
28
|
-
|
29
|
-
|
27
|
+
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
|
28
|
+
SHORT_TEXT_THRESHOLD: Final[int] = 50
|
29
|
+
MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
|
30
30
|
|
31
31
|
|
32
|
-
def _validate_extracted_text(text: str) -> bool:
|
32
|
+
def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
    """Decide whether text pulled out of a PDF looks usable.

    The text is rejected when it is empty or whitespace-only, or when too many
    characters match ``CORRUPTED_PATTERN``. Texts shorter than
    ``SHORT_TEXT_THRESHOLD`` characters are judged by an absolute count
    (at most ``MINIMUM_CORRUPTED_RESULTS`` matches); longer texts are judged by
    the ratio of matches to total length.

    Args:
        text: The extracted text to validate.
        corruption_threshold: Fraction (0.0-1.0) of corrupted characters at or
            above which a long text is rejected (default: 0.05).

    Returns:
        True if the text appears valid, False if it seems corrupted.
    """
    if not (text and text.strip()):
        return False

    total_chars = len(text)
    corrupted_count = len(CORRUPTED_PATTERN.findall(text))

    if total_chars >= SHORT_TEXT_THRESHOLD:
        # Long text: compare the corrupted share against the threshold.
        return (corrupted_count / total_chars) < corruption_threshold

    # Short text: a ratio would be too noisy, so use an absolute cap.
    return corrupted_count <= MINIMUM_CORRUPTED_RESULTS
|
53
57
|
|
54
58
|
|
55
59
|
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
|
@@ -67,7 +71,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
|
|
67
71
|
document: pypdfium2.PdfDocument | None = None
|
68
72
|
try:
|
69
73
|
document = await run_sync(pypdfium2.PdfDocument, str(input_file))
|
70
|
-
return [page.render(scale=
|
74
|
+
return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
|
71
75
|
except pypdfium2.PdfiumError as e:
|
72
76
|
raise ParsingError(
|
73
77
|
"Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
|
@@ -148,13 +152,10 @@ async def extract_pdf_file(
|
|
148
152
|
Returns:
|
149
153
|
The extracted text.
|
150
154
|
"""
|
151
|
-
if
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
):
|
156
|
-
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
157
|
-
|
155
|
+
if not force_ocr:
|
156
|
+
content = await _extract_pdf_searchable_text(input_file)
|
157
|
+
if _validate_extracted_text(content):
|
158
|
+
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
158
159
|
return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
|
159
160
|
|
160
161
|
|
kreuzberg/_string.py
CHANGED
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
22
22
|
encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
|
23
23
|
|
24
24
|
for enc in [e for e in encodings if e]: # pragma: no cover
|
25
|
-
with suppress(UnicodeDecodeError):
|
25
|
+
with suppress(UnicodeDecodeError, LookupError):
|
26
26
|
return byte_data.decode(enc)
|
27
27
|
|
28
28
|
# If all encodings fail, fall back to latin-1 which can handle any byte
|
kreuzberg/_sync.py
CHANGED
@@ -2,12 +2,13 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import sys
|
4
4
|
from functools import partial
|
5
|
-
from typing import TYPE_CHECKING, TypeVar, cast
|
5
|
+
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
6
6
|
|
7
|
+
from anyio import create_task_group
|
7
8
|
from anyio.to_thread import run_sync as any_io_run_sync
|
8
9
|
|
9
10
|
if TYPE_CHECKING: # pragma: no cover
|
10
|
-
from collections.abc import Callable
|
11
|
+
from collections.abc import Awaitable, Callable
|
11
12
|
|
12
13
|
if sys.version_info >= (3, 10):
|
13
14
|
from typing import ParamSpec
|
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
|
|
30
31
|
The result of the synchronous function.
|
31
32
|
"""
|
32
33
|
handler = partial(sync_fn, **kwargs)
|
33
|
-
return cast(T, await any_io_run_sync(handler, *args)) # pyright: ignore [reportCallIssue]
|
34
|
+
return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
35
|
+
|
36
|
+
|
37
|
+
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
    """Await several awaitables concurrently inside one anyio task group.

    Args:
        *async_tasks: The awaitables to run.

    Returns:
        The awaited results, in the same order as the arguments.
    """
    # Pre-sized so each child task can store its result by position,
    # regardless of completion order.
    collected: list[Any] = [None for _ in async_tasks]

    async def _store_result(position: int, awaitable: Awaitable[T]) -> None:
        collected[position] = await awaitable

    async with create_task_group() as task_group:
        for position, awaitable in enumerate(async_tasks):
            task_group.start_soon(_store_result, position, awaitable)

    return collected
|
56
|
+
|
57
|
+
|
58
|
+
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
    """Run a list of coroutines concurrently in batches.

    Each batch is awaited to completion via ``run_taskgroup`` before the next
    one starts, so at most ``batch_size`` tasks run at the same time.

    Args:
        *async_tasks: The list of coroutines to run.
        batch_size: The size of each batch. Must be at least 1.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.

    Returns:
        The results of the coroutines, in the same order as the arguments.
    """
    # A negative step would make range() empty and silently drop every task;
    # a zero step raises an unrelated-looking range() error. Fail explicitly.
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    results: list[Any] = []

    for start in range(0, len(async_tasks), batch_size):
        batch = async_tasks[start : start + batch_size]
        results.extend(await run_taskgroup(*batch))

    return results
|
kreuzberg/_tesseract.py
CHANGED
@@ -1,30 +1,26 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import re
|
4
|
-
import subprocess
|
5
4
|
import sys
|
6
5
|
from enum import Enum
|
7
|
-
from functools import partial
|
8
6
|
from os import PathLike
|
9
|
-
from typing import
|
7
|
+
from typing import Any, TypeVar, Union
|
10
8
|
|
11
|
-
from anyio import CapacityLimiter, create_task_group, to_process
|
12
9
|
from anyio import Path as AsyncPath
|
10
|
+
from anyio import run_process
|
13
11
|
from PIL.Image import Image
|
14
12
|
|
15
|
-
from kreuzberg import
|
16
|
-
from kreuzberg._constants import DEFAULT_MAX_PROCESSES
|
13
|
+
from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
|
17
14
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
18
15
|
from kreuzberg._string import normalize_spaces
|
19
|
-
from kreuzberg._sync import run_sync
|
16
|
+
from kreuzberg._sync import run_sync, run_taskgroup_batched
|
20
17
|
from kreuzberg._tmp import create_temp_file
|
21
|
-
from kreuzberg.
|
18
|
+
from kreuzberg._types import ExtractionResult
|
19
|
+
from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError
|
22
20
|
|
23
21
|
if sys.version_info < (3, 11): # pragma: no cover
|
24
22
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
25
23
|
|
26
|
-
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
27
|
-
|
28
24
|
version_ref = {"checked": False}
|
29
25
|
|
30
26
|
T = TypeVar("T", bound=Union[Image, PathLike[str], str])
|
@@ -68,14 +64,16 @@ async def validate_tesseract_version() -> None:
|
|
68
64
|
return
|
69
65
|
|
70
66
|
command = ["tesseract", "--version"]
|
71
|
-
result = await
|
72
|
-
version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
|
67
|
+
result = await run_process(command)
|
68
|
+
version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
|
73
69
|
if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
|
74
70
|
raise MissingDependencyError("Tesseract version 5 or above is required.")
|
75
71
|
|
76
72
|
version_ref["checked"] = True
|
77
73
|
except FileNotFoundError as e:
|
78
|
-
raise MissingDependencyError(
|
74
|
+
raise MissingDependencyError(
|
75
|
+
"Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
|
76
|
+
) from e
|
79
77
|
|
80
78
|
|
81
79
|
async def process_file(
|
@@ -83,7 +81,6 @@ async def process_file(
|
|
83
81
|
*,
|
84
82
|
language: str,
|
85
83
|
psm: PSMMode,
|
86
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
87
84
|
) -> ExtractionResult:
|
88
85
|
"""Process a single image file using Tesseract OCR.
|
89
86
|
|
@@ -91,7 +88,6 @@ async def process_file(
|
|
91
88
|
input_file: The path to the image file to process.
|
92
89
|
language: The language code for OCR.
|
93
90
|
psm: Page segmentation mode.
|
94
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
95
91
|
|
96
92
|
Raises:
|
97
93
|
OCRError: If OCR fails to extract text from the image.
|
@@ -102,6 +98,7 @@ async def process_file(
|
|
102
98
|
output_path, unlink = await create_temp_file(".txt")
|
103
99
|
try:
|
104
100
|
output_base = str(output_path).replace(".txt", "")
|
101
|
+
|
105
102
|
command = [
|
106
103
|
"tesseract",
|
107
104
|
str(input_file),
|
@@ -110,22 +107,44 @@ async def process_file(
|
|
110
107
|
language,
|
111
108
|
"--psm",
|
112
109
|
str(psm.value),
|
110
|
+
"--oem",
|
111
|
+
"1",
|
112
|
+
"--loglevel",
|
113
|
+
"OFF",
|
114
|
+
"-c",
|
115
|
+
"thresholding_method=1",
|
116
|
+
"-c",
|
117
|
+
"tessedit_enable_dict_correction=1",
|
118
|
+
"-c",
|
119
|
+
"language_model_ngram_on=1",
|
120
|
+
"-c",
|
121
|
+
"textord_space_size_is_variable=1",
|
122
|
+
"-c",
|
123
|
+
"classify_use_pre_adapted_templates=1",
|
124
|
+
"-c",
|
125
|
+
"tessedit_dont_blkrej_good_wds=1",
|
126
|
+
"-c",
|
127
|
+
"tessedit_dont_rowrej_good_wds=1",
|
128
|
+
"-c",
|
129
|
+
"tessedit_use_primary_params_model=1",
|
113
130
|
]
|
114
131
|
|
115
|
-
|
116
|
-
|
117
|
-
|
118
|
-
|
119
|
-
|
120
|
-
)
|
132
|
+
env: dict[str, Any] | None = None
|
133
|
+
if sys.platform.startswith("linux"):
|
134
|
+
env = {"OMP_THREAD_LIMIT": "1"}
|
135
|
+
|
136
|
+
result = await run_process(command, env=env)
|
121
137
|
|
122
138
|
if not result.returncode == 0:
|
123
|
-
raise OCRError(
|
139
|
+
raise OCRError(
|
140
|
+
"OCR failed with a non-0 return code.",
|
141
|
+
context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
|
142
|
+
)
|
124
143
|
|
125
144
|
output = await AsyncPath(output_path).read_text("utf-8")
|
126
145
|
return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
127
146
|
except (RuntimeError, OSError) as e:
|
128
|
-
raise OCRError("Failed to OCR using tesseract") from e
|
147
|
+
raise OCRError(f"Failed to OCR using tesseract: {e}") from e
|
129
148
|
finally:
|
130
149
|
await unlink()
|
131
150
|
|
@@ -135,7 +154,6 @@ async def process_image(
|
|
135
154
|
*,
|
136
155
|
language: str,
|
137
156
|
psm: PSMMode,
|
138
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
139
157
|
) -> ExtractionResult:
|
140
158
|
"""Process a single Pillow Image using Tesseract OCR.
|
141
159
|
|
@@ -143,14 +161,13 @@ async def process_image(
|
|
143
161
|
image: The Pillow Image to process.
|
144
162
|
language: The language code for OCR.
|
145
163
|
psm: Page segmentation mode.
|
146
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
147
164
|
|
148
165
|
Returns:
|
149
166
|
ExtractionResult: The extracted text from the image.
|
150
167
|
"""
|
151
168
|
image_path, unlink = await create_temp_file(".png")
|
152
169
|
await run_sync(image.save, str(image_path), format="PNG")
|
153
|
-
result = await process_file(image_path, language=language, psm=psm
|
170
|
+
result = await process_file(image_path, language=language, psm=psm)
|
154
171
|
await unlink()
|
155
172
|
return result
|
156
173
|
|
@@ -160,7 +177,6 @@ async def process_image_with_tesseract(
|
|
160
177
|
*,
|
161
178
|
language: str = "eng",
|
162
179
|
psm: PSMMode = PSMMode.AUTO,
|
163
|
-
max_processes: int = DEFAULT_MAX_PROCESSES,
|
164
180
|
) -> ExtractionResult:
|
165
181
|
"""Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
|
166
182
|
|
@@ -168,7 +184,6 @@ async def process_image_with_tesseract(
|
|
168
184
|
image: A single Pillow Image, a path-like object or a string path of the image to process.
|
169
185
|
language: The language code for OCR (default: "eng").
|
170
186
|
psm: Page segmentation mode (default: PSMMode.AUTO).
|
171
|
-
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
172
187
|
|
173
188
|
Raises:
|
174
189
|
ValueError: If the input is not a Pillow Image, a path-like object or a string.
|
@@ -179,10 +194,10 @@ async def process_image_with_tesseract(
|
|
179
194
|
await validate_tesseract_version()
|
180
195
|
|
181
196
|
if isinstance(image, Image):
|
182
|
-
return await process_image(image, language=language, psm=psm
|
197
|
+
return await process_image(image, language=language, psm=psm)
|
183
198
|
|
184
199
|
if isinstance(image, (PathLike, str)):
|
185
|
-
return await process_file(image, language=language, psm=psm
|
200
|
+
return await process_file(image, language=language, psm=psm)
|
186
201
|
|
187
202
|
raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
|
188
203
|
|
@@ -200,7 +215,7 @@ async def batch_process_images(
|
|
200
215
|
images: A list of Pillow Images, paths or strings to process.
|
201
216
|
language: The language code for OCR (default: "eng").
|
202
217
|
psm: Page segmentation mode (default: PSMMode.AUTO).
|
203
|
-
max_processes: Maximum number of concurrent processes
|
218
|
+
max_processes: Maximum number of concurrent processes (default: CPU count / 2).
|
204
219
|
|
205
220
|
Raises:
|
206
221
|
ParsingError: If OCR fails to extract text from any of the images.
|
@@ -209,17 +224,8 @@ async def batch_process_images(
|
|
209
224
|
List of ExtractionResult objects, one per input image.
|
210
225
|
"""
|
211
226
|
await validate_tesseract_version()
|
212
|
-
results = cast(list[ExtractionResult], list(range(len(images))))
|
213
|
-
|
214
|
-
async def _process_image(index: int, image: T) -> None:
|
215
|
-
results[index] = await process_image_with_tesseract(
|
216
|
-
image, language=language, psm=psm, max_processes=max_processes
|
217
|
-
)
|
218
|
-
|
219
227
|
try:
|
220
|
-
|
221
|
-
|
222
|
-
tg.start_soon(_process_image, i, image)
|
223
|
-
return results
|
228
|
+
tasks = [process_image_with_tesseract(image, language=language, psm=psm) for image in images]
|
229
|
+
return await run_taskgroup_batched(*tasks, batch_size=max_processes)
|
224
230
|
except ExceptionGroup as eg:
|
225
|
-
raise ParsingError("Failed to process images with Tesseract") from eg
|
231
|
+
raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
|
kreuzberg/_xlsx.py
CHANGED
@@ -1,23 +1,46 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import csv
|
4
|
+
import sys
|
4
5
|
from io import StringIO
|
5
|
-
from typing import TYPE_CHECKING
|
6
|
+
from typing import TYPE_CHECKING
|
6
7
|
|
7
8
|
from anyio import Path as AsyncPath
|
8
|
-
from anyio import create_task_group
|
9
9
|
from python_calamine import CalamineWorkbook
|
10
10
|
|
11
11
|
from kreuzberg import ExtractionResult, ParsingError
|
12
12
|
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
13
13
|
from kreuzberg._pandoc import process_file_with_pandoc
|
14
14
|
from kreuzberg._string import normalize_spaces
|
15
|
-
from kreuzberg._sync import run_sync
|
15
|
+
from kreuzberg._sync import run_sync, run_taskgroup
|
16
16
|
from kreuzberg._tmp import create_temp_file
|
17
17
|
|
18
18
|
if TYPE_CHECKING: # pragma: no cover
|
19
19
|
from pathlib import Path
|
20
20
|
|
21
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
22
|
+
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
23
|
+
|
24
|
+
|
25
|
+
async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
    """Convert one workbook sheet to a markdown section.

    The sheet rows are serialized to CSV, written to a temporary file and
    converted to markdown with Pandoc.

    Args:
        workbook: The workbook that contains the sheet.
        sheet_name: The name of the sheet to convert.

    Returns:
        A markdown string consisting of a ``## <sheet_name>`` heading followed by
        the converted sheet content.
    """
    values = workbook.get_sheet_by_name(sheet_name).to_python()

    with StringIO() as csv_buffer:
        csv.writer(csv_buffer).writerows(values)
        csv_data = csv_buffer.getvalue()

    csv_path, unlink = await create_temp_file(".csv")
    try:
        # Pandoc reads its input as UTF-8, so do not rely on the locale encoding.
        await AsyncPath(csv_path).write_text(csv_data, encoding="utf-8")
        result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
    finally:
        # Always remove the temporary CSV, even when writing or conversion fails.
        await unlink()

    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
|
43
|
+
|
21
44
|
|
22
45
|
async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
|
23
46
|
"""Extract text from an XLSX file by converting it to CSV and then to markdown.
|
@@ -33,46 +56,19 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
|
|
33
56
|
"""
|
34
57
|
try:
|
35
58
|
workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
|
36
|
-
|
37
|
-
results
|
38
|
-
|
39
|
-
async def convert_sheet_to_text(sheet_name: str) -> None:
|
40
|
-
nonlocal results
|
41
|
-
values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
|
42
|
-
|
43
|
-
csv_buffer = StringIO()
|
44
|
-
writer = csv.writer(csv_buffer)
|
45
|
-
|
46
|
-
for row in values:
|
47
|
-
writer.writerow(row)
|
48
|
-
|
49
|
-
csv_data = csv_buffer.getvalue()
|
50
|
-
csv_buffer.close()
|
51
|
-
|
52
|
-
from kreuzberg._tmp import create_temp_file
|
53
|
-
|
54
|
-
csv_path, unlink = await create_temp_file(".csv")
|
55
|
-
await AsyncPath(csv_path).write_text(csv_data)
|
56
|
-
result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
|
57
|
-
results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
|
58
|
-
await unlink()
|
59
|
-
|
60
|
-
async with create_task_group() as tg:
|
61
|
-
for sheet_name in workbook.sheet_names:
|
62
|
-
tg.start_soon(convert_sheet_to_text, sheet_name)
|
59
|
+
tasks = [convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]
|
60
|
+
results: list[str] = await run_taskgroup(*tasks)
|
63
61
|
|
64
62
|
return ExtractionResult(
|
65
63
|
content="\n\n".join(results),
|
66
64
|
mime_type=MARKDOWN_MIME_TYPE,
|
67
65
|
metadata={},
|
68
66
|
)
|
69
|
-
except
|
67
|
+
except ExceptionGroup as eg:
|
70
68
|
raise ParsingError(
|
71
|
-
"
|
72
|
-
context={
|
73
|
-
|
74
|
-
},
|
75
|
-
) from e
|
69
|
+
"Failed to extract file data",
|
70
|
+
context={"file": str(input_file), "errors": eg.exceptions},
|
71
|
+
) from eg
|
76
72
|
|
77
73
|
|
78
74
|
async def extract_xlsx_content(content: bytes) -> ExtractionResult:
|
kreuzberg/exceptions.py
CHANGED
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
|
|
14
14
|
self.context = context
|
15
15
|
super().__init__(message)
|
16
16
|
|
17
|
+
def _serialize_context(self, obj: Any) -> Any:
|
18
|
+
"""Recursively serialize context objects to ensure JSON compatibility."""
|
19
|
+
if isinstance(obj, bytes):
|
20
|
+
return obj.decode("utf-8", errors="replace")
|
21
|
+
if isinstance(obj, dict):
|
22
|
+
return {k: self._serialize_context(v) for k, v in obj.items()}
|
23
|
+
if isinstance(obj, (list, tuple)):
|
24
|
+
return [self._serialize_context(x) for x in obj]
|
25
|
+
if isinstance(obj, Exception):
|
26
|
+
return {
|
27
|
+
"type": obj.__class__.__name__,
|
28
|
+
"message": str(obj),
|
29
|
+
}
|
30
|
+
return obj
|
31
|
+
|
17
32
|
def __str__(self) -> str:
|
18
33
|
"""Return a string representation of the exception."""
|
19
|
-
|
34
|
+
if self.context:
|
35
|
+
serialized_context = self._serialize_context(self.context)
|
36
|
+
ctx = f"\n\nContext: {dumps(serialized_context)}"
|
37
|
+
else:
|
38
|
+
ctx = ""
|
20
39
|
|
21
40
|
return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
|
22
41
|
|
kreuzberg/extraction.py
CHANGED
@@ -87,14 +87,12 @@ async def extract_bytes(
|
|
87
87
|
return await extract_xlsx_content(content)
|
88
88
|
|
89
89
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
90
|
-
return await process_image_with_tesseract(
|
91
|
-
open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
|
92
|
-
)
|
90
|
+
return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)
|
93
91
|
|
94
92
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
95
93
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
96
94
|
):
|
97
|
-
return await process_content_with_pandoc(content=content, mime_type=mime_type
|
95
|
+
return await process_content_with_pandoc(content=content, mime_type=mime_type)
|
98
96
|
|
99
97
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
100
98
|
return await extract_pptx_file_content(content)
|
@@ -150,12 +148,12 @@ async def extract_file(
|
|
150
148
|
return await extract_xlsx_file(Path(input_file))
|
151
149
|
|
152
150
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
153
|
-
return await process_image_with_tesseract(input_file,
|
151
|
+
return await process_image_with_tesseract(input_file, psm=psm, language=language)
|
154
152
|
|
155
153
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
156
154
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
157
155
|
):
|
158
|
-
return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type
|
156
|
+
return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)
|
159
157
|
|
160
158
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
161
159
|
return await extract_pptx_file_content(Path(input_file))
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 2.
|
3
|
+
Version: 2.1.1
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -31,7 +31,7 @@ Requires-Dist: html-to-markdown>=1.2.0
|
|
31
31
|
Requires-Dist: pypdfium2>=4.30.1
|
32
32
|
Requires-Dist: python-calamine>=0.3.1
|
33
33
|
Requires-Dist: python-pptx>=1.0.2
|
34
|
-
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.
|
34
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
35
35
|
|
36
36
|
# Kreuzberg
|
37
37
|
|
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
42
42
|
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
43
43
|
- **Local Processing**: No external API calls or cloud dependencies required
|
44
44
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
45
|
-
- **
|
45
|
+
- **Small Package Size**: Has few curated dependencies and a minimal footprint
|
46
46
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
47
47
|
- **Modern Python**: Built with async/await, type hints, and functional first approach
|
48
48
|
- **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
|
@@ -61,8 +61,8 @@ pip install kreuzberg
|
|
61
61
|
|
62
62
|
Kreuzberg requires two system level dependencies:
|
63
63
|
|
64
|
-
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
65
|
-
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
64
|
+
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
|
65
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
|
66
66
|
|
67
67
|
You can install these with:
|
68
68
|
|
@@ -75,7 +75,6 @@ sudo apt-get install pandoc tesseract-ocr
|
|
75
75
|
#### MacOS
|
76
76
|
|
77
77
|
```shell
|
78
|
-
# MacOS
|
79
78
|
brew install tesseract pandoc
|
80
79
|
```
|
81
80
|
|
@@ -191,19 +190,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
|
|
191
190
|
|
192
191
|
#### Processing Configuration
|
193
192
|
|
194
|
-
- `max_processes` (default: CPU count
|
195
|
-
|
196
|
-
Notes:
|
197
|
-
|
198
|
-
- Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
|
193
|
+
- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
|
199
194
|
|
200
195
|
### Quick Start
|
201
196
|
|
202
197
|
```python
|
203
198
|
from pathlib import Path
|
204
199
|
from kreuzberg import extract_file
|
205
|
-
from kreuzberg
|
206
|
-
from kreuzberg
|
200
|
+
from kreuzberg import ExtractionResult
|
201
|
+
from kreuzberg import PSMMode
|
207
202
|
|
208
203
|
|
209
204
|
# Basic file extraction
|
@@ -232,7 +227,7 @@ async def extract_document():
|
|
232
227
|
|
233
228
|
```python
|
234
229
|
from kreuzberg import extract_bytes
|
235
|
-
from kreuzberg
|
230
|
+
from kreuzberg import ExtractionResult
|
236
231
|
|
237
232
|
|
238
233
|
async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
|
@@ -378,8 +373,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
|
|
378
373
|
Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
|
379
374
|
|
380
375
|
```python
|
381
|
-
from kreuzberg import
|
382
|
-
|
376
|
+
from kreuzberg import (
|
377
|
+
extract_file,
|
383
378
|
ValidationError,
|
384
379
|
ParsingError,
|
385
380
|
OCRError,
|
@@ -0,0 +1,21 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
|
2
|
+
kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
|
3
|
+
kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
|
4
|
+
kreuzberg/_mime_types.py,sha256=Kuu0yWY4p0Eck8b_vdp9oamqRZc1RJaS_ZKikVD2Z2o,6431
|
5
|
+
kreuzberg/_pandoc.py,sha256=YIXaFC11N2tgVHjBd3JD_21GZ6OOVQ0UY3aKrWNfK-I,12531
|
6
|
+
kreuzberg/_pdf.py,sha256=AIwxlydZkJOU4878SaeF9cKUmzSN7o3X40Hye7z017U,6479
|
7
|
+
kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
|
8
|
+
kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
|
9
|
+
kreuzberg/_sync.py,sha256=sDVH4GrpYW9SOnmu3BqKPL76xl0hxzHjTAC78aovbQA,2122
|
10
|
+
kreuzberg/_tesseract.py,sha256=0BkguZJIKlOFHkrN2mjVgaycWwolmuEv6DwpQY7n7Os,7610
|
11
|
+
kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
|
12
|
+
kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
|
13
|
+
kreuzberg/_xlsx.py,sha256=kSH7PJ33vdLgoh5LmL_bqbc4I0VgZlZUeF4ckKl6NJM,2675
|
14
|
+
kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
|
15
|
+
kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
|
16
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
+
kreuzberg-2.1.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
18
|
+
kreuzberg-2.1.1.dist-info/METADATA,sha256=tWRsv1bx9os2dQnU5KrQpUd4fNeQ4x-J2fXWKdcuQAA,14842
|
19
|
+
kreuzberg-2.1.1.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
20
|
+
kreuzberg-2.1.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
21
|
+
kreuzberg-2.1.1.dist-info/RECORD,,
|
kreuzberg-2.0.1.dist-info/RECORD
DELETED
@@ -1,21 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=CBRHXPhjdslaSXaUjZO5V0k57uz5_x12cwo0HTtxOcU,647
|
2
|
-
kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
|
3
|
-
kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
|
4
|
-
kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
|
5
|
-
kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
|
6
|
-
kreuzberg/_pdf.py,sha256=9YErIrRvMMFXKHckXzBDCEMzDAEnC0JVOR38gFhvHKQ,6227
|
7
|
-
kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
|
8
|
-
kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
|
9
|
-
kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
|
10
|
-
kreuzberg/_tesseract.py,sha256=SZsv0gFWvzR8iLaMyGr4Oc0lXE7atCR3sNxXR7TQzEE,7686
|
11
|
-
kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
|
12
|
-
kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
|
13
|
-
kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
|
14
|
-
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
|
15
|
-
kreuzberg/extraction.py,sha256=kuEKvOGhPBRcFeGX7eKmup9BukX6o55740F_KdZ15qQ,13214
|
16
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
kreuzberg-2.0.1.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
18
|
-
kreuzberg-2.0.1.dist-info/METADATA,sha256=KmKLubQ89i0_JwpK96kYbhuq1MuucrqHe2bCLNcbyic,15023
|
19
|
-
kreuzberg-2.0.1.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
20
|
-
kreuzberg-2.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
21
|
-
kreuzberg-2.0.1.dist-info/RECORD,,
|
File without changes
|
File without changes
|