kreuzberg 2.1.0__py3-none-any.whl → 2.1.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_mime_types.py +3 -2
- kreuzberg/_pandoc.py +6 -7
- kreuzberg/_pdf.py +19 -17
- kreuzberg/_sync.py +8 -8
- kreuzberg/_tesseract.py +2 -5
- kreuzberg/_xlsx.py +2 -4
- {kreuzberg-2.1.0.dist-info → kreuzberg-2.1.2.dist-info}/METADATA +4 -5
- {kreuzberg-2.1.0.dist-info → kreuzberg-2.1.2.dist-info}/RECORD +11 -11
- {kreuzberg-2.1.0.dist-info → kreuzberg-2.1.2.dist-info}/WHEEL +1 -1
- {kreuzberg-2.1.0.dist-info → kreuzberg-2.1.2.dist-info}/LICENSE +0 -0
- {kreuzberg-2.1.0.dist-info → kreuzberg-2.1.2.dist-info}/top_level.txt +0 -0
kreuzberg/_mime_types.py
CHANGED
@@ -15,6 +15,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
|
15
15
|
PDF_MIME_TYPE: Final = "application/pdf"
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
|
+
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
18
19
|
# Excel formats
|
19
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
20
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
@@ -73,7 +74,7 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
|
73
74
|
"application/epub+zip",
|
74
75
|
"application/rtf",
|
75
76
|
"application/vnd.oasis.opendocument.text",
|
76
|
-
|
77
|
+
DOCX_MIME_TYPE,
|
77
78
|
"application/x-biblatex",
|
78
79
|
"application/x-bibtex",
|
79
80
|
"application/x-endnote+xml",
|
@@ -146,7 +147,7 @@ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
|
|
146
147
|
".epub": "application/epub+zip",
|
147
148
|
".rtf": "application/rtf",
|
148
149
|
".odt": "application/vnd.oasis.opendocument.text",
|
149
|
-
".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
|
150
|
+
".docx": DOCX_MIME_TYPE,
|
150
151
|
".bib": "application/x-bibtex",
|
151
152
|
".ipynb": "application/x-ipynb+json",
|
152
153
|
".tex": "application/x-latex",
|
kreuzberg/_pandoc.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import re
|
4
4
|
import sys
|
5
|
-
from functools import partial
|
6
5
|
from json import JSONDecodeError, loads
|
7
6
|
from typing import TYPE_CHECKING, Any, Final, Literal, cast
|
8
7
|
|
@@ -333,14 +332,14 @@ async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type
|
|
333
332
|
_get_pandoc_type_from_mime_type(mime_type)
|
334
333
|
|
335
334
|
try:
|
336
|
-
|
337
|
-
|
338
|
-
|
339
|
-
)
|
335
|
+
metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
|
336
|
+
content_task = _handle_extract_file(input_file, mime_type=mime_type)
|
337
|
+
results = await run_taskgroup(metadata_task, content_task)
|
338
|
+
metadata, content = cast(tuple[Metadata, str], results)
|
340
339
|
|
341
340
|
return ExtractionResult(
|
342
|
-
content=normalize_spaces(
|
343
|
-
metadata=
|
341
|
+
content=normalize_spaces(content),
|
342
|
+
metadata=metadata,
|
344
343
|
mime_type=MARKDOWN_MIME_TYPE,
|
345
344
|
)
|
346
345
|
except ExceptionGroup as eg:
|
kreuzberg/_pdf.py
CHANGED
@@ -24,32 +24,36 @@ if TYPE_CHECKING: # pragma: no cover
|
|
24
24
|
# - Control and non-printable characters
|
25
25
|
# - Unicode replacement and invalid characters
|
26
26
|
# - Zero-width spaces and other invisible characters
|
27
|
-
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
|
28
|
-
|
29
|
-
|
27
|
+
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
|
28
|
+
SHORT_TEXT_THRESHOLD: Final[int] = 50
|
29
|
+
MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
|
30
30
|
|
31
31
|
|
32
|
-
def _validate_extracted_text(text: str) -> bool:
|
32
|
+
def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
|
33
33
|
"""Check if text extracted from PDF is valid or corrupted.
|
34
34
|
|
35
|
-
This checks for
|
35
|
+
This checks for indicators of corrupted PDF text extraction:
|
36
36
|
1. Empty or whitespace-only text
|
37
|
-
2.
|
38
|
-
3. Unicode replacement characters
|
39
|
-
4. Zero-width spaces and other invisible characters
|
37
|
+
2. High concentration of control characters and null bytes
|
38
|
+
3. High concentration of Unicode replacement characters
|
40
39
|
|
41
40
|
Args:
|
42
41
|
text: The extracted text to validate
|
42
|
+
corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
|
43
|
+
characters (default: 0.05 or 5%)
|
43
44
|
|
44
45
|
Returns:
|
45
46
|
True if the text appears valid, False if it seems corrupted
|
46
47
|
"""
|
47
|
-
# Check for empty or whitespace-only text
|
48
48
|
if not text or not text.strip():
|
49
49
|
return False
|
50
50
|
|
51
|
-
|
52
|
-
|
51
|
+
corruption_matches = CORRUPTED_PATTERN.findall(text)
|
52
|
+
|
53
|
+
if len(text) < SHORT_TEXT_THRESHOLD:
|
54
|
+
return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
|
55
|
+
|
56
|
+
return (len(corruption_matches) / len(text)) < corruption_threshold
|
53
57
|
|
54
58
|
|
55
59
|
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
|
@@ -148,12 +152,10 @@ async def extract_pdf_file(
|
|
148
152
|
Returns:
|
149
153
|
The extracted text.
|
150
154
|
"""
|
151
|
-
if
|
152
|
-
|
153
|
-
|
154
|
-
|
155
|
-
):
|
156
|
-
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
155
|
+
if not force_ocr:
|
156
|
+
content = await _extract_pdf_searchable_text(input_file)
|
157
|
+
if _validate_extracted_text(content):
|
158
|
+
return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
157
159
|
return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
|
158
160
|
|
159
161
|
|
kreuzberg/_sync.py
CHANGED
@@ -2,13 +2,13 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import sys
|
4
4
|
from functools import partial
|
5
|
-
from typing import TYPE_CHECKING, TypeVar, cast
|
5
|
+
from typing import TYPE_CHECKING, Any, TypeVar, cast
|
6
6
|
|
7
7
|
from anyio import create_task_group
|
8
8
|
from anyio.to_thread import run_sync as any_io_run_sync
|
9
9
|
|
10
10
|
if TYPE_CHECKING: # pragma: no cover
|
11
|
-
from collections.abc import
|
11
|
+
from collections.abc import Awaitable, Callable
|
12
12
|
|
13
13
|
if sys.version_info >= (3, 10):
|
14
14
|
from typing import ParamSpec
|
@@ -34,7 +34,7 @@ async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -
|
|
34
34
|
return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True)) # pyright: ignore [reportCallIssue]
|
35
35
|
|
36
36
|
|
37
|
-
async def run_taskgroup(*async_tasks:
|
37
|
+
async def run_taskgroup(*async_tasks: Awaitable[Any]) -> list[Any]:
|
38
38
|
"""Run a list of coroutines concurrently.
|
39
39
|
|
40
40
|
Args:
|
@@ -43,10 +43,10 @@ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) ->
|
|
43
43
|
Returns:
|
44
44
|
The results of the coroutines.
|
45
45
|
"""
|
46
|
-
results
|
46
|
+
results: list[Any] = [None] * len(async_tasks)
|
47
47
|
|
48
|
-
async def run_task(index: int, task:
|
49
|
-
results[index] = await task
|
48
|
+
async def run_task(index: int, task: Awaitable[T]) -> None:
|
49
|
+
results[index] = await task
|
50
50
|
|
51
51
|
async with create_task_group() as tg:
|
52
52
|
for i, t in enumerate(async_tasks):
|
@@ -55,7 +55,7 @@ async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) ->
|
|
55
55
|
return results
|
56
56
|
|
57
57
|
|
58
|
-
async def run_taskgroup_batched(*async_tasks:
|
58
|
+
async def run_taskgroup_batched(*async_tasks: Awaitable[Any], batch_size: int) -> list[Any]:
|
59
59
|
"""Run a list of coroutines concurrently in batches.
|
60
60
|
|
61
61
|
Args:
|
@@ -65,7 +65,7 @@ async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None,
|
|
65
65
|
Returns:
|
66
66
|
The results of the coroutines.
|
67
67
|
"""
|
68
|
-
results: list[
|
68
|
+
results: list[Any] = []
|
69
69
|
|
70
70
|
for i in range(0, len(async_tasks), batch_size):
|
71
71
|
batch = async_tasks[i : i + batch_size]
|
kreuzberg/_tesseract.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|
3
3
|
import re
|
4
4
|
import sys
|
5
5
|
from enum import Enum
|
6
|
-
from functools import partial
|
7
6
|
from os import PathLike
|
8
7
|
from typing import Any, TypeVar, Union
|
9
8
|
|
@@ -226,9 +225,7 @@ async def batch_process_images(
|
|
226
225
|
"""
|
227
226
|
await validate_tesseract_version()
|
228
227
|
try:
|
229
|
-
|
230
|
-
|
231
|
-
batch_size=max_processes,
|
232
|
-
)
|
228
|
+
tasks = [process_image_with_tesseract(image, language=language, psm=psm) for image in images]
|
229
|
+
return await run_taskgroup_batched(*tasks, batch_size=max_processes)
|
233
230
|
except ExceptionGroup as eg:
|
234
231
|
raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
|
kreuzberg/_xlsx.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import csv
|
4
4
|
import sys
|
5
|
-
from functools import partial
|
6
5
|
from io import StringIO
|
7
6
|
from typing import TYPE_CHECKING
|
8
7
|
|
@@ -57,9 +56,8 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
|
|
57
56
|
"""
|
58
57
|
try:
|
59
58
|
workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
|
60
|
-
|
61
|
-
|
62
|
-
)
|
59
|
+
tasks = [convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]
|
60
|
+
results: list[str] = await run_taskgroup(*tasks)
|
63
61
|
|
64
62
|
return ExtractionResult(
|
65
63
|
content="\n\n".join(results),
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 2.1.0
|
3
|
+
Version: 2.1.2
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -28,10 +28,10 @@ Requires-Dist: anyio>=4.8.0
|
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
30
|
Requires-Dist: html-to-markdown>=1.2.0
|
31
|
-
Requires-Dist: pypdfium2
|
31
|
+
Requires-Dist: pypdfium2==4.30.0
|
32
32
|
Requires-Dist: python-calamine>=0.3.1
|
33
33
|
Requires-Dist: python-pptx>=1.0.2
|
34
|
-
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.
|
34
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
35
35
|
|
36
36
|
# Kreuzberg
|
37
37
|
|
@@ -62,7 +62,7 @@ pip install kreuzberg
|
|
62
62
|
Kreuzberg requires two system level dependencies:
|
63
63
|
|
64
64
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
|
65
|
-
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract
|
65
|
+
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 5.
|
66
66
|
|
67
67
|
You can install these with:
|
68
68
|
|
@@ -75,7 +75,6 @@ sudo apt-get install pandoc tesseract-ocr
|
|
75
75
|
#### MacOS
|
76
76
|
|
77
77
|
```shell
|
78
|
-
#
|
79
78
|
brew install tesseract pandoc
|
80
79
|
```
|
81
80
|
|
@@ -1,21 +1,21 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=WgGo3x09JKCk89htZuodbnYysu0ZYpkAP29dcRl5Sg0,694
|
2
2
|
kreuzberg/_constants.py,sha256=N61ZF8xuEso8GzRGiVpqIv5yfMkQmLeH_EN9fVARYV0,249
|
3
3
|
kreuzberg/_html.py,sha256=yM78bPjyKRaXqMp5QW9xOYe0CBd9uUhDZfjnFB1tZOY,925
|
4
|
-
kreuzberg/_mime_types.py,sha256=
|
5
|
-
kreuzberg/_pandoc.py,sha256=
|
6
|
-
kreuzberg/_pdf.py,sha256=
|
4
|
+
kreuzberg/_mime_types.py,sha256=Kuu0yWY4p0Eck8b_vdp9oamqRZc1RJaS_ZKikVD2Z2o,6431
|
5
|
+
kreuzberg/_pandoc.py,sha256=YIXaFC11N2tgVHjBd3JD_21GZ6OOVQ0UY3aKrWNfK-I,12531
|
6
|
+
kreuzberg/_pdf.py,sha256=AIwxlydZkJOU4878SaeF9cKUmzSN7o3X40Hye7z017U,6479
|
7
7
|
kreuzberg/_pptx.py,sha256=oX1WYabKQ02Hla2jYnkEBjJXCPvrcRnzLi3MeY86TN0,3028
|
8
8
|
kreuzberg/_string.py,sha256=pE92BF2E7BXrQ5if3uATM2enwH82ntViBpshxK-797E,1106
|
9
|
-
kreuzberg/_sync.py,sha256=
|
10
|
-
kreuzberg/_tesseract.py,sha256=
|
9
|
+
kreuzberg/_sync.py,sha256=sDVH4GrpYW9SOnmu3BqKPL76xl0hxzHjTAC78aovbQA,2122
|
10
|
+
kreuzberg/_tesseract.py,sha256=0BkguZJIKlOFHkrN2mjVgaycWwolmuEv6DwpQY7n7Os,7610
|
11
11
|
kreuzberg/_tmp.py,sha256=y0PxKJXsRsDCwpFqtJAMl05lMNu3N_E2yaUVL93h7g0,1037
|
12
12
|
kreuzberg/_types.py,sha256=Qxlk6qfdtvEsCfjsXU57qgZiONfwF7wUgbCJK8QXNZ4,2195
|
13
|
-
kreuzberg/_xlsx.py,sha256=
|
13
|
+
kreuzberg/_xlsx.py,sha256=kSH7PJ33vdLgoh5LmL_bqbc4I0VgZlZUeF4ckKl6NJM,2675
|
14
14
|
kreuzberg/exceptions.py,sha256=syDCjy8PNqVMGhD-zAuhkurLMg9bk1j1yJtvJN8cN9A,1679
|
15
15
|
kreuzberg/extraction.py,sha256=7oc2C1_bIxrLx2r4NEyGrL9Jt6YpPxfQKMRJm6QQayo,13076
|
16
16
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
17
|
-
kreuzberg-2.1.
|
18
|
-
kreuzberg-2.1.
|
19
|
-
kreuzberg-2.1.
|
20
|
-
kreuzberg-2.1.
|
21
|
-
kreuzberg-2.1.
|
17
|
+
kreuzberg-2.1.2.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
18
|
+
kreuzberg-2.1.2.dist-info/METADATA,sha256=0MEegHP8F5ur-wafeprL9UEN6Utipml1SuCF_xF6daA,14842
|
19
|
+
kreuzberg-2.1.2.dist-info/WHEEL,sha256=jB7zZ3N9hIM9adW7qlTAyycLYW9npaWKLRzaoVcLKcM,91
|
20
|
+
kreuzberg-2.1.2.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
21
|
+
kreuzberg-2.1.2.dist-info/RECORD,,
|
File without changes
|
File without changes
|