kreuzberg 2.0.0__py3-none-any.whl → 2.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -1
- kreuzberg/_constants.py +3 -1
- kreuzberg/_html.py +1 -2
- kreuzberg/_pandoc.py +37 -73
- kreuzberg/_pdf.py +5 -6
- kreuzberg/_string.py +1 -1
- kreuzberg/_sync.py +43 -2
- kreuzberg/_tesseract.py +55 -176
- kreuzberg/_xlsx.py +34 -36
- kreuzberg/exceptions.py +20 -1
- kreuzberg/extraction.py +13 -15
- {kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/METADATA +48 -20
- kreuzberg-2.1.0.dist-info/RECORD +21 -0
- kreuzberg-2.0.0.dist-info/RECORD +0 -21
- {kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/LICENSE +0 -0
- {kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/WHEEL +0 -0
- {kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,6 +1,14 @@
+from ._tesseract import PSMMode
 from ._types import ExtractionResult, Metadata
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
-from .extraction import
+from .extraction import (
+    batch_extract_bytes,
+    batch_extract_bytes_sync,
+    batch_extract_file,
+    batch_extract_file_sync,
+    extract_bytes,
+    extract_file,
+)

 __all__ = [
     "ExtractionResult",
@@ -8,8 +16,13 @@ __all__ = [
     "Metadata",
     "MissingDependencyError",
     "OCRError",
+    "PSMMode",
     "ParsingError",
     "ValidationError",
+    "batch_extract_bytes",
+    "batch_extract_bytes_sync",
+    "batch_extract_file",
+    "batch_extract_file_sync",
     "extract_bytes",
     "extract_file",
 ]
kreuzberg/_constants.py
CHANGED
@@ -3,4 +3,6 @@ from __future__ import annotations
 from multiprocessing import cpu_count
 from typing import Final

-DEFAULT_MAX_PROCESSES: Final[int] =
+DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
+MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
+MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
kreuzberg/_html.py
CHANGED
@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
 from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync

 if TYPE_CHECKING:
     from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    result =
+    result = html_to_markdown.convert_to_markdown(content)
     return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_pandoc.py
CHANGED
@@ -1,21 +1,22 @@
 from __future__ import annotations

-import
+import re
 import sys
 from functools import partial
 from json import JSONDecodeError, loads
 from typing import TYPE_CHECKING, Any, Final, Literal, cast

-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process

-from kreuzberg
+from kreuzberg import ValidationError
+from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import
+from kreuzberg._sync import run_taskgroup
 from kreuzberg._tmp import create_temp_file
 from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg.exceptions import MissingDependencyError, ParsingError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError

 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING:  # pragma: no cover
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

-
 version_ref: Final[dict[str, bool]] = {"checked": False}

-
 # Block-level node types in Pandoc AST
 BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
 BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:


 def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
-    if
-            "mime_type": mime_type,
-            "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
-        },
+    if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
+        return pandoc_type
+
+    if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
+        return next(
+            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
         )

-            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
-        )
+    raise ValidationError(f"Unsupported mime type: {mime_type}")


 async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
             return

         command = ["pandoc", "--version"]
-        result = await
+        result = await run_process(command)
+
+        version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
+            raise MissingDependencyError("Pandoc version 2 or above is required")

         version_ref["checked"] = True

     except FileNotFoundError as e:
-        raise MissingDependencyError("Pandoc is not installed
+        raise MissingDependencyError("Pandoc is not installed") from e


-async def _handle_extract_metadata(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> Metadata:
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     metadata_file, unlink = await create_temp_file(".json")
     try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
             "--standalone",
             "--quiet",
             "--output",
-            metadata_file,
+            str(metadata_file),
         ]

-        result = await
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)

         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@ async def _handle_extract_metadata(
         await unlink()


-async def _handle_extract_file(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> str:
+async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     output_path, unlink = await create_temp_file(".md")
     try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(
         command.extend(["--output", str(output_path)])

-        result = await
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)

         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
         await unlink()


-async def process_file_with_pandoc(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
     """Process a single file using Pandoc and convert to markdown.

     Args:
         input_file: The path to the file to process.
         mime_type: The mime type of the file.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(
     _get_pandoc_type_from_mime_type(mime_type)

-    metadata: Metadata = {}
-    content: str = ""
-
     try:
-        metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
-
-        async def _get_content() -> None:
-            nonlocal content
-            content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
+        metadata, content = await run_taskgroup(
+            partial(_handle_extract_metadata, input_file, mime_type=mime_type),
+            partial(_handle_extract_file, input_file, mime_type=mime_type),
+        )

+        return ExtractionResult(
+            content=normalize_spaces(cast(str, content)),
+            metadata=cast(Metadata, metadata),
+            mime_type=MARKDOWN_MIME_TYPE,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to
-
-    return ExtractionResult(
-        content=normalize_spaces(content),
-        metadata=metadata,
-        mime_type=MARKDOWN_MIME_TYPE,
-    )
+        raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg


-async def process_content_with_pandoc(
-    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
     """Process content using Pandoc and convert to markdown.

     Args:
         content: The content to process.
         mime_type: The mime type of the content.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Returns:
         ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
     input_file, unlink = await create_temp_file(f".{extension}")

     await AsyncPath(input_file).write_bytes(content)
-    result = await process_file_with_pandoc(input_file, mime_type=mime_type
+    result = await process_file_with_pandoc(input_file, mime_type=mime_type)

     await unlink()
     return result
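A quick usage sketch of the updated Pandoc helper after this change (not part of the diff): the `max_processes` parameter is gone and Pandoc now runs through `anyio.run_process`. The example uses the `text/csv` mime type, which the XLSX extractor in this same release relies on, and assumes Pandoc 2+ and kreuzberg 2.1.0 are installed.

```python
import asyncio

from kreuzberg._pandoc import process_content_with_pandoc


async def main() -> None:
    # "text/csv" is one of the Pandoc-mapped mime types; no max_processes argument anymore.
    csv_bytes = b"name,score\nalice,10\nbob,7\n"
    result = await process_content_with_pandoc(csv_bytes, mime_type="text/csv")
    print(result.mime_type)  # text/markdown
    print(result.content)


asyncio.run(main())
```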
kreuzberg/_pdf.py
CHANGED
@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
-from kreuzberg._tesseract import PSMMode,
+from kreuzberg._tesseract import PSMMode, batch_process_images
 from kreuzberg.exceptions import ParsingError

 if TYPE_CHECKING:  # pragma: no cover
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
     document: pypdfium2.PdfDocument | None = None
     try:
         document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=
+        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
 async def _extract_pdf_text_with_ocr(
     input_file: Path,
     *,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -132,7 +132,7 @@ async def extract_pdf_file(
     input_file: Path,
     *,
     force_ocr: bool,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -154,7 +154,6 @@ async def extract_pdf_file(
         and _validate_extracted_text(content)
     ):
         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
-
     return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)


@@ -162,7 +161,7 @@ async def extract_pdf_content(
     content: bytes,
     *,
     force_ocr: bool,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
kreuzberg/_string.py
CHANGED
@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]

     for enc in [e for e in encodings if e]:  # pragma: no cover
-        with suppress(UnicodeDecodeError):
+        with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)

     # If all encodings fail, fall back to latin-1 which can handle any byte
kreuzberg/_sync.py
CHANGED
@@ -4,10 +4,11 @@ import sys
 from functools import partial
 from typing import TYPE_CHECKING, TypeVar, cast

+from anyio import create_task_group
 from anyio.to_thread import run_sync as any_io_run_sync

 if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Callable
+    from collections.abc import Callable, Coroutine

 if sys.version_info >= (3, 10):
     from typing import ParamSpec
@@ -30,4 +31,44 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
         The result of the synchronous function.
     """
     handler = partial(sync_fn, **kwargs)
-    return cast(T, await any_io_run_sync(handler, *args))  # pyright: ignore [reportCallIssue]
+    return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+
+
+async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
+    """Run a list of coroutines concurrently.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results = cast(list[T], [None] * len(async_tasks))
+
+    async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
+        results[index] = await task()
+
+    async with create_task_group() as tg:
+        for i, t in enumerate(async_tasks):
+            tg.start_soon(run_task, i, t)
+
+    return results
+
+
+async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
+    """Run a list of coroutines concurrently in batches.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+        batch_size: The size of each batch.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results: list[T] = []
+
+    for i in range(0, len(async_tasks), batch_size):
+        batch = async_tasks[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+
+    return results
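These two helpers replace the ad-hoc `create_task_group`/`nonlocal` pattern that the other modules in this diff previously used. A minimal usage sketch (not part of the diff), assuming kreuzberg 2.1.0 is installed and running on the default asyncio backend:

```python
import asyncio
from functools import partial

from kreuzberg._sync import run_taskgroup, run_taskgroup_batched


async def double(value: int) -> int:
    await asyncio.sleep(0)  # stand-in for real async work
    return value * 2


async def main() -> None:
    # Run every task concurrently; results come back in submission order.
    doubled = await run_taskgroup(*[partial(double, n) for n in range(5)])
    print(doubled)  # [0, 2, 4, 6, 8]

    # Same, but at most two tasks are in flight per batch.
    batched = await run_taskgroup_batched(*[partial(double, n) for n in range(5)], batch_size=2)
    print(batched)


asyncio.run(main())
```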
kreuzberg/_tesseract.py
CHANGED
@@ -1,164 +1,31 @@
 from __future__ import annotations

 import re
-import subprocess
 import sys
 from enum import Enum
 from functools import partial
 from os import PathLike
-from typing import
+from typing import Any, TypeVar, Union

-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process
 from PIL.Image import Image

-from kreuzberg import
-from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup_batched
 from kreuzberg._tmp import create_temp_file
-from kreuzberg.
+from kreuzberg._types import ExtractionResult
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError

 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

-MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
-
 version_ref = {"checked": False}

 T = TypeVar("T", bound=Union[Image, PathLike[str], str])

-SupportedLanguage = Literal[
-    "afr", "amh", "ara", "asm", "aze", "aze_cyrl", "bel", "ben", "bod", "bos", "bre", "bul", "cat", "ceb", "ces",
-    "chi_sim", "chi_tra", "chr", "cos", "cym", "dan", "dan_frak", "deu", "deu_frak", "deu_latf", "dzo", "ell", "eng",
-    "enm", "epo", "equ", "est", "eus", "fao", "fas", "fil", "fin", "fra", "frk", "frm", "fry", "gla", "gle", "glg",
-    "grc", "guj", "hat", "heb", "hin", "hrv", "hun", "hye", "iku", "ind", "isl", "ita", "ita_old", "jav", "jpn",
-    "kan", "kat", "kat_old", "kaz", "khm", "kir", "kmr", "kor", "kor_vert", "kur", "lao", "lat", "lav", "lit", "ltz",
-    "mal", "mar", "mkd", "mlt", "mon", "mri", "msa", "mya", "nep", "nld", "nor", "oci", "ori", "osd", "pan", "pol",
-    "por", "pus", "que", "ron", "rus", "san", "sin", "slk", "slk_frak", "slv", "snd", "spa", "spa_old", "sqi", "srp",
-    "srp_latn", "sun", "swa", "swe", "syr", "tam", "tat", "tel", "tgk", "tgl", "tha", "tir", "ton", "tur", "uig",
-    "ukr", "urd", "uzb", "uzb_cyrl", "vie", "yid", "yor",
-]
-

 class PSMMode(Enum):
     """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
@@ -198,22 +65,23 @@ async def validate_tesseract_version() -> None:
             return

         command = ["tesseract", "--version"]
-        result = await
-        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
+        result = await run_process(command)
+        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError("Tesseract version 5 or above is required.")

         version_ref["checked"] = True
     except FileNotFoundError as e:
-        raise MissingDependencyError(
+        raise MissingDependencyError(
+            "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
+        ) from e


 async def process_file(
     input_file: str | PathLike[str],
     *,
-    language:
+    language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single image file using Tesseract OCR.

@@ -221,7 +89,6 @@ async def process_file(
         input_file: The path to the image file to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         OCRError: If OCR fails to extract text from the image.
@@ -232,6 +99,7 @@ async def process_file(
     output_path, unlink = await create_temp_file(".txt")
     try:
         output_base = str(output_path).replace(".txt", "")
+
         command = [
             "tesseract",
             str(input_file),
@@ -240,22 +108,44 @@ async def process_file(
             language,
             "--psm",
             str(psm.value),
+            "--oem",
+            "1",
+            "--loglevel",
+            "OFF",
+            "-c",
+            "thresholding_method=1",
+            "-c",
+            "tessedit_enable_dict_correction=1",
+            "-c",
+            "language_model_ngram_on=1",
+            "-c",
+            "textord_space_size_is_variable=1",
+            "-c",
+            "classify_use_pre_adapted_templates=1",
+            "-c",
+            "tessedit_dont_blkrej_good_wds=1",
+            "-c",
+            "tessedit_dont_rowrej_good_wds=1",
+            "-c",
+            "tessedit_use_primary_params_model=1",
         ]

-        )
+        env: dict[str, Any] | None = None
+        if sys.platform.startswith("linux"):
+            env = {"OMP_THREAD_LIMIT": "1"}
+
+        result = await run_process(command, env=env)

         if not result.returncode == 0:
-            raise OCRError(
+            raise OCRError(
+                "OCR failed with a non-0 return code.",
+                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+            )

         output = await AsyncPath(output_path).read_text("utf-8")
         return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     except (RuntimeError, OSError) as e:
-        raise OCRError("Failed to OCR using tesseract") from e
+        raise OCRError(f"Failed to OCR using tesseract: {e}") from e
     finally:
         await unlink()

@@ -263,9 +153,8 @@ async def process_file(
 async def process_image(
     image: Image,
     *,
-    language:
+    language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single Pillow Image using Tesseract OCR.

@@ -273,14 +162,13 @@ async def process_image(
         image: The Pillow Image to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Returns:
         ExtractionResult: The extracted text from the image.
     """
     image_path, unlink = await create_temp_file(".png")
     await run_sync(image.save, str(image_path), format="PNG")
-    result = await process_file(image_path, language=language, psm=psm
+    result = await process_file(image_path, language=language, psm=psm)
     await unlink()
     return result

@@ -288,9 +176,8 @@ async def process_image(
 async def process_image_with_tesseract(
     image: Image | PathLike[str] | str,
     *,
-    language:
+    language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.

@@ -298,7 +185,6 @@ async def process_image_with_tesseract(
         image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -309,10 +195,10 @@ async def process_image_with_tesseract(
     await validate_tesseract_version()

     if isinstance(image, Image):
-        return await process_image(image, language=language, psm=psm
+        return await process_image(image, language=language, psm=psm)

     if isinstance(image, (PathLike, str)):
-        return await process_file(image, language=language, psm=psm
+        return await process_file(image, language=language, psm=psm)

     raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")

@@ -320,7 +206,7 @@ async def process_image_with_tesseract(
 async def batch_process_images(
     images: list[T],
     *,
-    language:
+    language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
     max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> list[ExtractionResult]:
@@ -330,7 +216,7 @@ async def batch_process_images(
         images: A list of Pillow Images, paths or strings to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes
+        max_processes: Maximum number of concurrent processes (default: CPU count / 2).

     Raises:
         ParsingError: If OCR fails to extract text from any of the images.
@@ -339,17 +225,10 @@ async def batch_process_images(
         List of ExtractionResult objects, one per input image.
     """
     await validate_tesseract_version()
-    results = cast(list[ExtractionResult], list(range(len(images))))
-
-    async def _process_image(index: int, image: T) -> None:
-        results[index] = await process_image_with_tesseract(
-            image, language=language, psm=psm, max_processes=max_processes
-        )
-
     try:
+        return await run_taskgroup_batched(
+            *[partial(process_image_with_tesseract, image, language=language, psm=psm) for image in images],
+            batch_size=max_processes,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to process images with Tesseract") from eg
+        raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
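With this change, `batch_process_images` fans the per-image OCR calls out through `run_taskgroup_batched`, so `max_processes` now caps how many Tesseract subprocesses run per batch. A small usage sketch (not part of the diff), assuming Tesseract 5+ is installed; the file names are placeholders:

```python
import asyncio

from kreuzberg import PSMMode
from kreuzberg._tesseract import batch_process_images


async def main() -> None:
    # Paths, strings, or Pillow Images are accepted; at most two tesseract
    # processes run at a time with batch size (max_processes) set to 2.
    results = await batch_process_images(
        ["scan-page-1.png", "scan-page-2.png"],
        language="eng",
        psm=PSMMode.AUTO,
        max_processes=2,
    )
    for result in results:
        print(result.mime_type, len(result.content))


asyncio.run(main())
```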
kreuzberg/_xlsx.py
CHANGED
@@ -1,23 +1,47 @@
 from __future__ import annotations

 import csv
+import sys
+from functools import partial
 from io import StringIO
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING

 from anyio import Path as AsyncPath
-from anyio import create_task_group
 from python_calamine import CalamineWorkbook

 from kreuzberg import ExtractionResult, ParsingError
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._pandoc import process_file_with_pandoc
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup
 from kreuzberg._tmp import create_temp_file

 if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path

+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
+
+
+async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
+    values = workbook.get_sheet_by_name(sheet_name).to_python()
+
+    csv_buffer = StringIO()
+    writer = csv.writer(csv_buffer)
+
+    for row in values:
+        writer.writerow(row)
+
+    csv_data = csv_buffer.getvalue()
+    csv_buffer.close()
+
+    csv_path, unlink = await create_temp_file(".csv")
+    await AsyncPath(csv_path).write_text(csv_data)
+
+    result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
+    await unlink()
+    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
+

 async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +57,20 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """
     try:
         workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
-
-        async def convert_sheet_to_text(sheet_name: str) -> None:
-            nonlocal results
-            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
-
-            csv_buffer = StringIO()
-            writer = csv.writer(csv_buffer)
-
-            for row in values:
-                writer.writerow(row)
-
-            csv_data = csv_buffer.getvalue()
-            csv_buffer.close()
-
-            from kreuzberg._tmp import create_temp_file
-
-            csv_path, unlink = await create_temp_file(".csv")
-            await AsyncPath(csv_path).write_text(csv_data)
-            result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
-            results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
-            await unlink()
-
-        async with create_task_group() as tg:
-            for sheet_name in workbook.sheet_names:
-                tg.start_soon(convert_sheet_to_text, sheet_name)
+        results = await run_taskgroup(
+            *[partial(convert_sheet_to_text, workbook, sheet_name) for sheet_name in workbook.sheet_names]
+        )

         return ExtractionResult(
             content="\n\n".join(results),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata={},
         )
-    except
+    except ExceptionGroup as eg:
         raise ParsingError(
-            "
-            context={
-            },
-        ) from e
+            "Failed to extract file data",
+            context={"file": str(input_file), "errors": eg.exceptions},
+        ) from eg


 async def extract_xlsx_content(content: bytes) -> ExtractionResult:
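After this refactor, `convert_sheet_to_text` is a module-level coroutine and all sheets are converted concurrently via `run_taskgroup`, with each sheet rendered as its own `## <sheet name>` markdown section. A brief usage sketch (not part of the diff), assuming Pandoc 2+ is installed and "numbers.xlsx" is a placeholder workbook:

```python
import asyncio
from pathlib import Path

from kreuzberg._xlsx import extract_xlsx_file


async def main() -> None:
    result = await extract_xlsx_file(Path("numbers.xlsx"))
    print(result.mime_type)                  # text/markdown
    print(result.content.split("\n\n")[0])   # first "## <sheet name>" heading


asyncio.run(main())
```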
kreuzberg/exceptions.py
CHANGED
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
         self.context = context
         super().__init__(message)

+    def _serialize_context(self, obj: Any) -> Any:
+        """Recursively serialize context objects to ensure JSON compatibility."""
+        if isinstance(obj, bytes):
+            return obj.decode("utf-8", errors="replace")
+        if isinstance(obj, dict):
+            return {k: self._serialize_context(v) for k, v in obj.items()}
+        if isinstance(obj, (list, tuple)):
+            return [self._serialize_context(x) for x in obj]
+        if isinstance(obj, Exception):
+            return {
+                "type": obj.__class__.__name__,
+                "message": str(obj),
+            }
+        return obj
+
     def __str__(self) -> str:
         """Return a string representation of the exception."""
-
+        if self.context:
+            serialized_context = self._serialize_context(self.context)
+            ctx = f"\n\nContext: {dumps(serialized_context)}"
+        else:
+            ctx = ""

         return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
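The new `_serialize_context` hook means exception context containing bytes or nested exceptions (as several modules now attach via `context={"errors": eg.exceptions}`) can be JSON-dumped when the error is printed. A small sketch of the resulting behavior (not part of the diff; output shown approximately):

```python
from kreuzberg.exceptions import ParsingError

# bytes are decoded and exception objects reduced to {"type", "message"}
# before json.dumps runs in KreuzbergError.__str__.
error = ParsingError(
    "Failed to extract file data",
    context={"stderr": b"pandoc: not found", "errors": [ValueError("bad cell")]},
)
print(error)
# ParsingError: Failed to extract file data
#
# Context: {"stderr": "pandoc: not found", "errors": [{"type": "ValueError", "message": "bad cell"}]}
```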
kreuzberg/extraction.py
CHANGED
@@ -38,7 +38,7 @@ from kreuzberg._pdf import (
 )
 from kreuzberg._pptx import extract_pptx_file_content
 from kreuzberg._string import safe_decode
-from kreuzberg._tesseract import PSMMode,
+from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
 from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
 from kreuzberg.exceptions import ValidationError

@@ -52,7 +52,7 @@ async def extract_bytes(
     mime_type: str,
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -87,14 +87,12 @@ async def extract_bytes(
         return await extract_xlsx_content(content)

     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(
-            open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
-        )
+        return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)

     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_content_with_pandoc(content=content, mime_type=mime_type
+        return await process_content_with_pandoc(content=content, mime_type=mime_type)

     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(content)
@@ -114,7 +112,7 @@ async def extract_file(
     mime_type: str | None = None,
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -150,12 +148,12 @@ async def extract_file(
         return await extract_xlsx_file(Path(input_file))

     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(input_file,
+        return await process_image_with_tesseract(input_file, psm=psm, language=language)

     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type
+        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)

     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(Path(input_file))
@@ -170,7 +168,7 @@ async def batch_extract_file(
     file_paths: Sequence[PathLike[str] | str],
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> list[ExtractionResult]:
@@ -209,7 +207,7 @@ async def batch_extract_bytes(
     contents: Sequence[tuple[bytes, str]],
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> list[ExtractionResult]:
@@ -253,7 +251,7 @@ def extract_bytes_sync(
     mime_type: str,
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -281,7 +279,7 @@ def extract_file_sync(
     mime_type: str | None = None,
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -308,7 +306,7 @@ def batch_extract_file_sync(
     file_paths: Sequence[PathLike[str] | str],
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> list[ExtractionResult]:
@@ -339,7 +337,7 @@ def batch_extract_bytes_sync(
     contents: Sequence[tuple[bytes, str]],
     *,
     force_ocr: bool = False,
-    language:
+    language: str = "eng",
     max_processes: int = DEFAULT_MAX_PROCESSES,
     psm: PSMMode = PSMMode.AUTO,
 ) -> list[ExtractionResult]:
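Across the public API, `language` is now a plain string (for example `"eng"` or `"eng+deu"`) rather than the removed `SupportedLanguage` literal, and the sync batch helpers are exported from the package root. A short sketch of the updated calling convention (not part of the diff; file names are placeholders):

```python
from pathlib import Path

from kreuzberg import PSMMode, batch_extract_file_sync

# language accepts any Tesseract language string, including combined models.
results = batch_extract_file_sync(
    [Path("scan-a.pdf"), Path("scan-b.pdf")],
    force_ocr=True,
    language="eng+deu",
    psm=PSMMode.AUTO,
)
for result in results:
    print(result.mime_type, len(result.content))
```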
{kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 2.0.0
+Version: 2.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,10 +61,34 @@ pip install kreuzberg
 Kreuzberg requires two system level dependencies:

-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.

+You can install these with:
+
+#### Linux (Ubuntu)
+
+```shell
+sudo apt-get install pandoc tesseract-ocr
+```
+
+#### MacOS
+
+```shell
+#
+brew install tesseract pandoc
+```
+
+#### Windows
+
+```shell
+choco install -y tesseract pandoc
+```
+
+Notes:
+
+- in most distributions the tesseract-ocr package is split into multiple packages, you may need to install any language models you need aside from English separately.
+- please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.

 ## Architecture

@@ -152,26 +176,30 @@ All extraction functions accept the following optional parameters for configuring extraction:

 #### OCR Configuration

-- `
-
--
+- `force_ocr`(default: `False`): Forces OCR processing even for searchable PDFs.
+- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
+
+  - `eng` for English
+  - `deu` for German
+  - `eng+deu` for English and German

+  Notes: - the order of languages effect processing time, the first language is the primary language and the second language is the secondary language etc.

-- `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
+- `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.

+Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.

+#### Processing Configuration
+
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.

 ### Quick Start

 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg
-from kreuzberg
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode


 # Basic file extraction
@@ -193,14 +221,14 @@ async def extract_document():
     docx_result = await extract_file(Path("document.docx"))
     if docx_result.metadata:
         print(f"Title: {docx_result.metadata.get('title')}")
-        print(f"Author: {docx_result.metadata.get('
+        print(f"Author: {docx_result.metadata.get('creator')}")
 ```

 ### Extracting Bytes

 ```python
 from kreuzberg import extract_bytes
-from kreuzberg
+from kreuzberg import ExtractionResult
@@ -236,7 +264,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents:

 ```python
 from pathlib import Path
-from kreuzberg import batch_extract_file, batch_extract_bytes
+from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync


 # Process multiple files concurrently
@@ -346,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.

 ```python
-from kreuzberg import
-
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,
kreuzberg-2.1.0.dist-info/RECORD
ADDED
@@ -0,0 +1,21 @@
+kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
+kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
+kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
+kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
+kreuzberg/_pandoc.py,sha256=lUqG1GQqezz011fLn12AUKJ_xw9gElj-S7xRO5g-Rlw,12513
+kreuzberg/_pdf.py,sha256=BI7ooYvvLPEX3y7lKyri4r0k6bW4pj_cmBQW1UqZiF8,6227
+kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
+kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
+kreuzberg/_sync.py,sha256=DepezWTfsyyeEq7VYjhWD6XFRiaEz-uCvXFUYkQMswQ,2191
+kreuzberg/_tesseract.py,sha256=gKGyZpa_MLLsMTpzi_VvSXFAmLxagRE-sfqH2oKFmPM,7662
+kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
+kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
+kreuzberg/_xlsx.py,sha256=JcQTdV38uiNdyRmHQ1DI6khN8ng4W38tIRaxonIoaHs,2703
+kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
+kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-2.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-2.1.0.dist-info/METADATA,sha256=t1NeglNqJFjWpr6WeIp-d33OikT_HIrS8FrEMGSk1hA,14844
+kreuzberg-2.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-2.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-2.1.0.dist-info/RECORD,,
kreuzberg-2.0.0.dist-info/RECORD
DELETED
@@ -1,21 +0,0 @@
-kreuzberg/__init__.py,sha256=3opnj4Q8Ci151QuVqPaM3sCb8mpFIRhZbZUgBmp1LI0,410
-kreuzberg/_constants.py,sha256=BXICWxbtN-22BEQDcGwCH5rLk3HZObtc9fJim1fXzDA,161
-kreuzberg/_html.py,sha256=lj4GjvCGiUeDcBjotKZuMNNsG3wOuFwP1-bJLsI99YQ,978
-kreuzberg/_mime_types.py,sha256=VFaOozh8o7Xv1d3pa9mObkz7DM8tVsZC_W8hxsMUeb4,6451
-kreuzberg/_pandoc.py,sha256=8sggl4nE-BWLKBecGGPnUX-gfNjnKxX-2SInuWmtWKQ,13763
-kreuzberg/_pdf.py,sha256=V1TVwPpGyrE0YJqnmW_5kh4Y1qWwZI5SSF-lwT_Bbac,6288
-kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
-kreuzberg/_string.py,sha256=Z1c53A1-9JtzNthsnrPENxUaMyPS1VD6Oj3SNagSNgg,1093
-kreuzberg/_sync.py,sha256=3biXw0UDwcaxz-PGmfjWV5JaDE7olFpGKZdG12onxO0,981
-kreuzberg/_tesseract.py,sha256=xt_4MU7PfN1nZWlWBVQF6zmJnMs9pJq8yWTzPUxTqm0,9240
-kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
-kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
-kreuzberg/_xlsx.py,sha256=dDsNwJ_AGjUU5CQ8ExDFbiIYBauc3cEYAD-7zcP3Op0,2850
-kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=1RIs7YaUK0wcOpY1eDcIqh3n-UlJY7ZeulZPdaAxdvo,13345
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-2.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-2.0.0.dist-info/METADATA,sha256=cvD9ypz004yHqePKuw8eZZcuZ2lanyN1y2jlB5FMG0Q,14201
-kreuzberg-2.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-2.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-2.0.0.dist-info/RECORD,,
{kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/LICENSE
File without changes

{kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/WHEEL
File without changes

{kreuzberg-2.0.0.dist-info → kreuzberg-2.1.0.dist-info}/top_level.txt
File without changes