kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +17 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +122 -169
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +97 -200
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.1.dist-info/METADATA +451 -0
- kreuzberg-2.0.1.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -280
- kreuzberg-1.7.0.dist-info/METADATA +0 -342
- kreuzberg-1.7.0.dist-info/RECORD +0 -15
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/LICENSE +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/WHEEL +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.1.dist-info}/top_level.txt +0 -0
kreuzberg/_pdf.py
ADDED
@@ -0,0 +1,189 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from re import Pattern
|
4
|
+
from re import compile as compile_regex
|
5
|
+
from typing import TYPE_CHECKING, Final, cast
|
6
|
+
|
7
|
+
import pypdfium2
|
8
|
+
from anyio import Path as AsyncPath
|
9
|
+
|
10
|
+
from kreuzberg import ExtractionResult
|
11
|
+
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
|
+
from kreuzberg._string import normalize_spaces
|
13
|
+
from kreuzberg._sync import run_sync
|
14
|
+
from kreuzberg._tesseract import PSMMode, batch_process_images
|
15
|
+
from kreuzberg.exceptions import ParsingError
|
16
|
+
|
17
|
+
if TYPE_CHECKING: # pragma: no cover
|
18
|
+
from pathlib import Path
|
19
|
+
|
20
|
+
from PIL.Image import Image
|
21
|
+
|
22
|
+
|
23
|
+
# Heuristic for spotting corrupted PDF text-layer extraction output:
# control / non-printable characters, the Unicode replacement character,
# and zero-width or other invisible separators all indicate a broken layer.
CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
    r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
)


def _validate_extracted_text(text: str) -> bool:
    """Decide whether text pulled from a PDF's text layer looks usable.

    The text is considered corrupted when it is empty, whitespace-only,
    or contains control characters, Unicode replacement characters, or
    zero-width/invisible characters.

    Args:
        text: The extracted text to validate.

    Returns:
        True if the text appears valid, False if it seems corrupted.
    """
    stripped = text.strip() if text else ""
    if not stripped:
        # Empty or whitespace-only extraction is useless.
        return False

    # Valid only when no corruption indicator is present anywhere.
    return CORRUPTED_PATTERN.search(text) is None
|
53
|
+
|
54
|
+
|
55
|
+
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
    """Render every page of a PDF file to a Pillow image.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the PDF file could not be converted to images.

    Returns:
        A list of Pillow Images, one per page, rendered at 2x scale.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        # Opening the document is blocking I/O, so run it in a worker thread.
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        rendered: list[Image] = []
        for page in cast(pypdfium2.PdfDocument, document):
            rendered.append(page.render(scale=2.0).to_pil())
        return rendered
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Close the document even when rendering fails.
        if document:
            await run_sync(document.close)
|
78
|
+
|
79
|
+
|
80
|
+
async def _extract_pdf_text_with_ocr(
    input_file: Path,
    *,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Run Tesseract OCR over every page of a scanned PDF and join the results.

    Args:
        input_file: The path to the PDF file.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text as a plain-text extraction result.
    """
    page_images = await _convert_pdf_to_images(input_file)
    results = await batch_process_images(page_images, max_processes=max_processes, psm=psm, language=language)
    # One OCR result per page; join page texts with newlines.
    joined = "\n".join(result.content for result in results)
    return ExtractionResult(content=joined, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
|
103
|
+
|
104
|
+
|
105
|
+
async def _extract_pdf_searchable_text(input_file: Path) -> str:
    """Read the embedded text layer of a searchable PDF via pypdfium2.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the text could not be extracted from the PDF file.

    Returns:
        The extracted text with whitespace normalized.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        # Pull the bounded text of each page, then join pages with newlines.
        page_texts = [page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document)]
        return normalize_spaces("\n".join(page_texts))
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Close the document even when text extraction fails.
        if document:
            await run_sync(document.close)
|
129
|
+
|
130
|
+
|
131
|
+
async def extract_pdf_file(
    input_file: Path,
    *,
    force_ocr: bool,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file.

    Prefers the PDF's embedded text layer when it exists and looks valid;
    otherwise (or when forced) falls back to OCR.

    Args:
        input_file: The path to the PDF file.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text.
    """
    if not force_ocr:
        content = await _extract_pdf_searchable_text(input_file)
        # Use the text layer only when it is non-empty and not corrupted.
        if content and _validate_extracted_text(content):
            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

    return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
|
159
|
+
|
160
|
+
|
161
|
+
async def extract_pdf_content(
    content: bytes,
    *,
    force_ocr: bool,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from PDF file content given as bytes.

    The bytes are written to a temporary file which is always removed,
    even when extraction raises.

    Args:
        content: The PDF file content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text.
    """
    # Local import to avoid a circular import at module load time.
    from kreuzberg._tmp import create_temp_file

    file_path, unlink = await create_temp_file(".pdf")
    try:
        await AsyncPath(file_path).write_bytes(content)
        result = await extract_pdf_file(
            file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
        )
    finally:
        # Fix: previously the temp file leaked if extraction raised;
        # always unlink it, success or failure.
        await unlink()
    return result
|
kreuzberg/_pptx.py
ADDED
@@ -0,0 +1,88 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import re
|
4
|
+
from contextlib import suppress
|
5
|
+
from html import escape
|
6
|
+
from io import BytesIO
|
7
|
+
from typing import TYPE_CHECKING
|
8
|
+
|
9
|
+
import pptx
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
12
|
+
|
13
|
+
from kreuzberg import ExtractionResult
|
14
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
15
|
+
from kreuzberg._string import normalize_spaces
|
16
|
+
|
17
|
+
if TYPE_CHECKING: # pragma: no cover
|
18
|
+
from pathlib import Path
|
19
|
+
|
20
|
+
|
21
|
+
async def extract_pptx_file_content(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Extract text from a PPTX file as Markdown.

    Notes:
        This function is based on code vendored from `markitdown`, which has an MIT license as well.

    Args:
        file_path_or_contents: The path to the PPTX file or its contents as bytes.

    Returns:
        The extracted text content
    """
    md_content = ""
    file_contents = (
        file_path_or_contents
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_bytes()
    )
    presentation = pptx.Presentation(BytesIO(file_contents))

    for index, slide in enumerate(presentation.slides):
        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"

        title = slide.shapes.title

        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
            ):
                alt_text = ""
                with suppress(AttributeError):
                    # access non-visual properties
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                # Fix: the computed alt text and filename were previously unused and an
                # empty string was appended; emit a proper Markdown image reference.
                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                # Render tables as inline HTML; the first row becomes the header row.
                html_table = "<table>"
                first_row = True

                for row in shape.table.rows:
                    html_table += "<tr>"

                    for cell in row.cells:
                        tag = "th" if first_row else "td"
                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"

                    html_table += "</tr>"
                    first_row = False

                html_table += "</table>"
                md_content += "\n" + html_table + "\n"

            elif shape.has_text_frame:
                # The slide title becomes an H1 heading; other text is emitted verbatim.
                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"

        md_content = md_content.strip()
        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame

            if notes_frame is not None:  # pragma: no branch
                md_content += notes_frame.text

            md_content = md_content.strip()

    return ExtractionResult(content=normalize_spaces(md_content), mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
kreuzberg/_string.py
CHANGED
@@ -4,8 +4,6 @@ from contextlib import suppress
|
|
4
4
|
|
5
5
|
from charset_normalizer import detect
|
6
6
|
|
7
|
-
from kreuzberg.exceptions import ParsingError
|
8
|
-
|
9
7
|
|
10
8
|
def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
    """Decode a byte string safely, removing invalid sequences.

    Tries the caller-supplied encoding first, then an auto-detected one,
    then UTF-8; finally falls back to latin-1, which accepts any byte.

    Args:
        byte_data: The byte string to decode.
        encoding: The encoding to use when decoding the byte string.

    Returns:
        The decoded string.
    """
    if not byte_data:
        return ""

    # Candidate encodings in priority order; falsy entries are skipped below.
    candidates = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]

    for enc in filter(None, candidates):  # pragma: no cover
        with suppress(UnicodeDecodeError):
            return byte_data.decode(enc)

    # If all encodings fail, fall back to latin-1 which can handle any byte
    return byte_data.decode("latin-1", errors="replace")
|
33
30
|
|
34
31
|
|
35
32
|
def normalize_spaces(text: str) -> str:
|
kreuzberg/_sync.py
CHANGED
@@ -1,14 +1,19 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import sys
|
3
4
|
from functools import partial
|
4
5
|
from typing import TYPE_CHECKING, TypeVar, cast
|
5
6
|
|
6
7
|
from anyio.to_thread import run_sync as any_io_run_sync
|
7
|
-
from typing_extensions import ParamSpec
|
8
8
|
|
9
9
|
if TYPE_CHECKING: # pragma: no cover
|
10
10
|
from collections.abc import Callable
|
11
11
|
|
12
|
+
if sys.version_info >= (3, 10):
|
13
|
+
from typing import ParamSpec
|
14
|
+
else: # pragma: no cover
|
15
|
+
from typing_extensions import ParamSpec
|
16
|
+
|
12
17
|
T = TypeVar("T")
|
13
18
|
P = ParamSpec("P")
|
14
19
|
|