kreuzberg 2.1.2-py3-none-any.whl → 3.0.1-py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +163 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +291 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/{_string.py → _utils/_string.py} +0 -2
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.1.dist-info/METADATA +178 -0
- kreuzberg-3.0.1.dist-info/RECORD +32 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
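The listing alone tells the 3.0 story: the flat 2.x modules (`_html.py`, `_pdf.py`, `_pptx.py`, `_xlsx.py`, `_pandoc.py`, `_tesseract.py`) are replaced by an `_extractors/` package, an `_ocr/` package with three backends (Tesseract, EasyOCR, PaddleOCR), an extractor `_registry.py`, and a new `_chunker.py`. As a rough migration sketch — assuming the top-level `extract_file` coroutine in `kreuzberg/extraction.py` kept its name across the major bump, which this diff does not show:

```python
# Hypothetical 3.x call; extract_file and its return shape are assumptions
# inferred from extraction.py and _types.py, not confirmed by this diff.
import anyio

from kreuzberg import extract_file


async def main() -> None:
    result = await extract_file("report.pdf")  # ExtractionResult
    print(result.mime_type)
    print(result.content[:200])


anyio.run(main)
```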
kreuzberg-3.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,32 @@
+kreuzberg/__init__.py,sha256=KZ_y21m64cafWL7goGeG3EIDutM184st28n4UGajADs,1131
+kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
+kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
+kreuzberg/_mime_types.py,sha256=pKtxBPDoye2knyou_VODDMPIt3eXotP-ak4MAKFI2SU,6310
+kreuzberg/_playa.py,sha256=agHdhKfKLNtiP37XdNncbCP65v3Qv3m1Gn2KTRUkVx8,10396
+kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
+kreuzberg/_types.py,sha256=sZMxjRZQ1c_MzxdumhYSWghW6yXBwohTUIBa5eR-FKA,6582
+kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
+kreuzberg/extraction.py,sha256=0sjvbunx5srbR5lzjOAQjGK5JY3bCUHw-dRFmHjFz7o,8671
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_extractors/_base.py,sha256=YUr6A2n34LlFzbYQkiKqhXAphL9RYrvAls5SlkoQqNg,3028
+kreuzberg/_extractors/_html.py,sha256=L_vcKyv1ObI6irPaD3-WTKqxeRfZA4Rhsl3zUiAe_ws,1312
+kreuzberg/_extractors/_image.py,sha256=VQgSFSzXIMX3A52-DyvuKgfTRXUJIjYn6IX4-sQWWdg,2626
+kreuzberg/_extractors/_pandoc.py,sha256=a6cYQxoh5G9EMrDWVcQhrTkE4Mar24sNiGCY0zOOzw4,20121
+kreuzberg/_extractors/_pdf.py,sha256=dcSAXyqH8SZ-z45OUAjjwdboSEbrli0YekS8PxCaVGA,6384
+kreuzberg/_extractors/_presentation.py,sha256=K4ALrpmZ0EWyp2O-3oEmTRCS7yAET9xjinrzo13rpWo,8764
+kreuzberg/_extractors/_spread_sheet.py,sha256=1ejRZk8AE1dXS1tRIdg2S0J9Vo0wG81iKkW2IF6PjlE,4445
+kreuzberg/_ocr/__init__.py,sha256=VTqwKDlIRbjve71Y11Ztygyhv5aWG9LWTj8iX66ANxE,533
+kreuzberg/_ocr/_base.py,sha256=lNT0Tin4hzbmaamqqySxvYEwNtrJB5gGlStrANQQcyc,1637
+kreuzberg/_ocr/_easyocr.py,sha256=VfYW66SkB2Bigbrtd7WEeJ6QZ_1Y5d8Z_rZYBPMsuk0,11037
+kreuzberg/_ocr/_paddleocr.py,sha256=X5es69QMl0P6DZuuRNKWHaRtLi1OJqFs-mWHR_gVKvY,10837
+kreuzberg/_ocr/_tesseract.py,sha256=cdnVxNpaKjxtBN4xy0Timz-uYtPA9wq9kc6kyYVeDug,9779
+kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg/_utils/_string.py,sha256=oNO0cmwjVNG0jAzaqNCjYtzvM_nxH5TW2KV-Uh3oEUU,978
+kreuzberg/_utils/_sync.py,sha256=lycobEMXk0tBMWLwkuMdOuNMStDwPKMC0V1Qgp_oi6k,4071
+kreuzberg/_utils/_tmp.py,sha256=5rqG_Nlb9xweaLqJA8Kc5csHDase9_eY_Fq93rNQGWc,1044
+kreuzberg-3.0.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.0.1.dist-info/METADATA,sha256=5Kt0w9rFBAina8SzbO-m2umEMRJQL-4mcPGAQASko_k,6545
+kreuzberg-3.0.1.dist-info/WHEEL,sha256=CmyFI0kx5cdEMTLiONQRbGQwjIoR1aIYB7eCAQ4KPJ0,91
+kreuzberg-3.0.1.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-3.0.1.dist-info/RECORD,,
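Each RECORD row is `path,hash,size`: the hash is the urlsafe-base64 encoding of the raw SHA-256 digest with padding stripped, per the wheel spec (PEP 376/427). That is why the three zero-byte files (`py.typed` and the two package `__init__.py` stubs) share the same `47DEQpj8...` value — it is the hash of the empty byte string:

```python
# Recompute a wheel RECORD hash: urlsafe base64 of the raw SHA-256
# digest, "=" padding stripped.
import base64
import hashlib


def record_hash(data: bytes) -> str:
    digest = hashlib.sha256(data).digest()
    return "sha256=" + base64.urlsafe_b64encode(digest).rstrip(b"=").decode()


print(record_hash(b""))  # sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU
```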
kreuzberg/_html.py
DELETED
@@ -1,31 +0,0 @@
-from __future__ import annotations
-
-from typing import TYPE_CHECKING
-
-import html_to_markdown
-from anyio import Path as AsyncPath
-
-from kreuzberg import ExtractionResult
-from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
-from kreuzberg._string import normalize_spaces, safe_decode
-
-if TYPE_CHECKING:
-    from pathlib import Path
-
-
-async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
-    """Extract text from an HTML string.
-
-    Args:
-        file_path_or_contents: The HTML content.
-
-    Returns:
-        The extracted text content.
-    """
-    content = (
-        safe_decode(file_path_or_contents)
-        if isinstance(file_path_or_contents, bytes)
-        else await AsyncPath(file_path_or_contents).read_text()
-    )
-    result = html_to_markdown.convert_to_markdown(content)
-    return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
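The deleted helper is small enough to exercise directly; a sketch assuming `html_to_markdown`, `anyio`, and the 2.1.2 layout are installed (the exact markdown output depends on the `html_to_markdown` version):

```python
# Driving the deleted 2.x helper above; printed output is illustrative.
import anyio

from kreuzberg._html import extract_html_string


async def main() -> None:
    result = await extract_html_string(b"<h1>Title</h1><p>Some body text.</p>")
    print(result.mime_type)  # MARKDOWN_MIME_TYPE, i.e. text/markdown
    print(result.content)    # e.g. "# Title" followed by the body paragraph


anyio.run(main)
```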
kreuzberg/_pandoc.py
DELETED
@@ -1,366 +0,0 @@
-from __future__ import annotations
-
-import re
-import sys
-from json import JSONDecodeError, loads
-from typing import TYPE_CHECKING, Any, Final, Literal, cast
-
-from anyio import Path as AsyncPath
-from anyio import run_process
-
-from kreuzberg import ValidationError
-from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
-from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
-from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_taskgroup
-from kreuzberg._tmp import create_temp_file
-from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg.exceptions import MissingDependencyError, ParsingError
-
-if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Mapping
-    from os import PathLike
-
-if sys.version_info < (3, 11):  # pragma: no cover
-    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
-
-version_ref: Final[dict[str, bool]] = {"checked": False}
-
-# Block-level node types in Pandoc AST
-BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
-BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
-BLOCK_CODE: Final = "CodeBlock"  # Code block with attributes and string content
-BLOCK_QUOTE: Final = "BlockQuote"  # Block quote containing blocks
-BLOCK_LIST: Final = "BulletList"  # Bullet list containing items (blocks)
-BLOCK_ORDERED: Final = "OrderedList"  # Numbered list with attrs and items
-
-# Inline-level node types in Pandoc AST
-INLINE_STR: Final = "Str"  # Plain text string
-INLINE_SPACE: Final = "Space"  # Single space
-INLINE_EMPH: Final = "Emph"  # Emphasized text (contains inlines)
-INLINE_STRONG: Final = "Strong"  # Strong/bold text (contains inlines)
-INLINE_LINK: Final = "Link"  # Link with text and target
-INLINE_IMAGE: Final = "Image"  # Image with alt text and source
-INLINE_CODE: Final = "Code"  # Inline code span
-INLINE_MATH: Final = "Math"  # Math expression
-
-# Metadata node types in Pandoc AST
-META_MAP: Final = "MetaMap"  # Key-value mapping of metadata
-META_LIST: Final = "MetaList"  # List of metadata values
-META_INLINES: Final = "MetaInlines"  # Inline content in metadata
-META_STRING: Final = "MetaString"  # Plain string in metadata
-META_BLOCKS: Final = "MetaBlocks"  # Block content in metadata
-
-# Node content field name
-CONTENT_FIELD: Final = "c"
-TYPE_FIELD: Final = "t"
-
-# Valid node types
-NodeType = Literal[
-    # Block types
-    "Header",
-    "Para",
-    "CodeBlock",
-    "BlockQuote",
-    "BulletList",
-    "OrderedList",
-    # Inline types
-    "Str",
-    "Space",
-    "Emph",
-    "Strong",
-    "Link",
-    "Image",
-    "Code",
-    "Math",
-    # Meta types
-    "MetaMap",
-    "MetaList",
-    "MetaInlines",
-    "MetaString",
-    "MetaBlocks",
-]
-
-MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
-    "application/csl+json": "csljson",
-    "application/docbook+xml": "docbook",
-    "application/epub+zip": "epub",
-    "application/rtf": "rtf",
-    "application/vnd.oasis.opendocument.text": "odt",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-    "application/x-biblatex": "biblatex",
-    "application/x-bibtex": "bibtex",
-    "application/x-endnote+xml": "endnotexml",
-    "application/x-fictionbook+xml": "fb2",
-    "application/x-ipynb+json": "ipynb",
-    "application/x-jats+xml": "jats",
-    "application/x-latex": "latex",
-    "application/x-opml+xml": "opml",
-    "application/x-research-info-systems": "ris",
-    "application/x-typst": "typst",
-    "text/csv": "csv",
-    "text/tab-separated-values": "tsv",
-    "text/troff": "man",
-    "text/x-commonmark": "commonmark",
-    "text/x-dokuwiki": "dokuwiki",
-    "text/x-gfm": "gfm",
-    "text/x-markdown": "markdown",
-    "text/x-markdown-extra": "markdown_phpextra",
-    "text/x-mdoc": "mdoc",
-    "text/x-multimarkdown": "markdown_mmd",
-    "text/x-org": "org",
-    "text/x-pod": "pod",
-    "text/x-rst": "rst",
-}
-
-MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
-    "application/csl+json": "json",
-    "application/docbook+xml": "xml",
-    "application/epub+zip": "epub",
-    "application/rtf": "rtf",
-    "application/vnd.oasis.opendocument.text": "odt",
-    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
-    "application/x-biblatex": "bib",
-    "application/x-bibtex": "bib",
-    "application/x-endnote+xml": "xml",
-    "application/x-fictionbook+xml": "fb2",
-    "application/x-ipynb+json": "ipynb",
-    "application/x-jats+xml": "xml",
-    "application/x-latex": "tex",
-    "application/x-opml+xml": "opml",
-    "application/x-research-info-systems": "ris",
-    "application/x-typst": "typst",
-    "text/csv": "csv",
-    "text/tab-separated-values": "tsv",
-    "text/troff": "1",
-    "text/x-commonmark": "md",
-    "text/x-dokuwiki": "wiki",
-    "text/x-gfm": "md",
-    "text/x-markdown": "md",
-    "text/x-markdown-extra": "md",
-    "text/x-mdoc": "md",
-    "text/x-multimarkdown": "md",
-    "text/x-org": "org",
-    "text/x-pod": "pod",
-    "text/x-rst": "rst",
-}
-
-
-def _extract_inline_text(node: dict[str, Any]) -> str | None:
-    if node_type := node.get(TYPE_FIELD):
-        if node_type == INLINE_STR:
-            return node.get(CONTENT_FIELD)
-        if node_type == INLINE_SPACE:
-            return " "
-        if node_type in (INLINE_EMPH, INLINE_STRONG):
-            return _extract_inlines(node.get(CONTENT_FIELD, []))
-    return None  # pragma: no cover
-
-
-def _extract_inlines(nodes: list[dict[str, Any]]) -> str | None:
-    texts = [text for node in nodes if (text := _extract_inline_text(node))]
-    result = "".join(texts).strip()
-    return result if result else None
-
-
-def _extract_meta_value(node: Any) -> str | list[str] | None:
-    if not isinstance(node, dict) or CONTENT_FIELD not in node or TYPE_FIELD not in node:
-        return None
-
-    content = node[CONTENT_FIELD]
-    node_type = node[TYPE_FIELD]
-
-    if not content or node_type not in {
-        META_STRING,
-        META_INLINES,
-        META_LIST,
-        META_BLOCKS,
-    }:
-        return None
-
-    if node_type == META_STRING and isinstance(content, str):
-        return content
-
-    if isinstance(content, list) and (content := [v for v in content if isinstance(v, dict)]):
-        if node_type == META_INLINES:
-            return _extract_inlines(cast(list[dict[str, Any]], content))
-
-        if node_type == META_LIST:
-            results = []
-            for value in [value for item in content if (value := _extract_meta_value(item))]:
-                if isinstance(value, list):  # pragma: no cover
-                    results.extend(value)
-                else:
-                    results.append(value)
-            return results
-
-        # This branch is only taken for complex metadata blocks which we don't use
-        if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:  # pragma: no cover
-            block_texts = []
-            for block in blocks:
-                block_content = block.get(CONTENT_FIELD, [])
-                if isinstance(block_content, list) and (text := _extract_inlines(block_content)):
-                    block_texts.append(text)
-            return block_texts if block_texts else None
-
-    return None
-
-
-def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
-    meta: Metadata = {}
-
-    for key, value in raw_meta.items():
-        if extracted := _extract_meta_value(value):
-            meta[key] = extracted  # type: ignore[literal-required]
-
-    citations = [
-        cite["citationId"]
-        for block in raw_meta.get("blocks", [])
-        if block.get(TYPE_FIELD) == "Cite"
-        for cite in block.get(CONTENT_FIELD, [[{}]])[0]
-        if isinstance(cite, dict)
-    ]
-    if citations:
-        meta["citations"] = citations
-
-    return meta
-
-
-def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
-    if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
-        return pandoc_type
-
-    if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
-        return next(
-            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
-        )
-
-    raise ValidationError(f"Unsupported mime type: {mime_type}")
-
-
-async def _validate_pandoc_version() -> None:
-    try:
-        if version_ref["checked"]:
-            return
-
-        command = ["pandoc", "--version"]
-        result = await run_process(command)
-
-        version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
-        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
-            raise MissingDependencyError("Pandoc version 2 or above is required")
-
-        version_ref["checked"] = True
-
-    except FileNotFoundError as e:
-        raise MissingDependencyError("Pandoc is not installed") from e
-
-
-async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
-    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
-    metadata_file, unlink = await create_temp_file(".json")
-    try:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={pandoc_type}",
-            "--to=json",
-            "--standalone",
-            "--quiet",
-            "--output",
-            str(metadata_file),
-        ]
-
-        result = await run_process(command)
-
-        if result.returncode != 0:
-            raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
-
-        json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
-        return _extract_metadata(json_data)
-    except (RuntimeError, OSError, JSONDecodeError) as e:
-        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
-    finally:
-        await unlink()
-
-
-async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
-    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
-    output_path, unlink = await create_temp_file(".md")
-    try:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={pandoc_type}",
-            "--to=markdown",
-            "--standalone",
-            "--wrap=preserve",
-            "--quiet",
-        ]
-
-        command.extend(["--output", str(output_path)])
-
-        result = await run_process(command)
-
-        if result.returncode != 0:
-            raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
-
-        text = await AsyncPath(output_path).read_text("utf-8")
-
-        return normalize_spaces(text)
-    except (RuntimeError, OSError) as e:
-        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
-    finally:
-        await unlink()
-
-
-async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
-    """Process a single file using Pandoc and convert to markdown.
-
-    Args:
-        input_file: The path to the file to process.
-        mime_type: The mime type of the file.
-
-    Raises:
-        ParsingError: If the file data could not be extracted.
-
-    Returns:
-        ExtractionResult
-    """
-    await _validate_pandoc_version()
-
-    _get_pandoc_type_from_mime_type(mime_type)
-
-    try:
-        metadata_task = _handle_extract_metadata(input_file, mime_type=mime_type)
-        content_task = _handle_extract_file(input_file, mime_type=mime_type)
-        results = await run_taskgroup(metadata_task, content_task)
-        metadata, content = cast(tuple[Metadata, str], results)
-
-        return ExtractionResult(
-            content=normalize_spaces(content),
-            metadata=metadata,
-            mime_type=MARKDOWN_MIME_TYPE,
-        )
-    except ExceptionGroup as eg:
-        raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
-
-
-async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
-    """Process content using Pandoc and convert to markdown.
-
-    Args:
-        content: The content to process.
-        mime_type: The mime type of the content.
-
-    Returns:
-        ExtractionResult
-    """
-    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
-    input_file, unlink = await create_temp_file(f".{extension}")
-
-    await AsyncPath(input_file).write_bytes(content)
-    result = await process_file_with_pandoc(input_file, mime_type=mime_type)
-
-    await unlink()
-    return result
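The `t`/`c` fields these helpers walk are Pandoc's JSON AST: `pandoc --to=json` emits `{"t": <node type>, "c": <content>}` objects. A worked example of what `_extract_meta_value` does with a typical `title` metadata entry, grounded in the deleted code above:

```python
# A minimal Pandoc JSON AST metadata node; "t"/"c" are the
# TYPE_FIELD/CONTENT_FIELD pair the deleted helpers read.
meta_title = {
    "t": "MetaInlines",
    "c": [
        {"t": "Str", "c": "Hello"},
        {"t": "Space"},  # Space nodes carry no "c"; they render as " "
        {"t": "Str", "c": "World"},
    ],
}

# MetaInlines -> _extract_inlines -> "Hello" + " " + "World"
assert _extract_meta_value(meta_title) == "Hello World"
```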
kreuzberg/_pdf.py
DELETED
@@ -1,190 +0,0 @@
-from __future__ import annotations
-
-from re import Pattern
-from re import compile as compile_regex
-from typing import TYPE_CHECKING, Final, cast
-
-import pypdfium2
-from anyio import Path as AsyncPath
-
-from kreuzberg import ExtractionResult
-from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
-from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
-from kreuzberg._tesseract import PSMMode, batch_process_images
-from kreuzberg.exceptions import ParsingError
-
-if TYPE_CHECKING:  # pragma: no cover
-    from pathlib import Path
-
-    from PIL.Image import Image
-
-
-# Pattern to detect common PDF text extraction corruption:
-# - Control and non-printable characters
-# - Unicode replacement and invalid characters
-# - Zero-width spaces and other invisible characters
-CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
-SHORT_TEXT_THRESHOLD: Final[int] = 50
-MINIMUM_CORRUPTED_RESULTS: Final[int] = 2
-
-
-def _validate_extracted_text(text: str, corruption_threshold: float = 0.05) -> bool:
-    """Check if text extracted from PDF is valid or corrupted.
-
-    This checks for indicators of corrupted PDF text extraction:
-    1. Empty or whitespace-only text
-    2. High concentration of control characters and null bytes
-    3. High concentration of Unicode replacement characters
-
-    Args:
-        text: The extracted text to validate
-        corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
-            characters (default: 0.05 or 5%)
-
-    Returns:
-        True if the text appears valid, False if it seems corrupted
-    """
-    if not text or not text.strip():
-        return False
-
-    corruption_matches = CORRUPTED_PATTERN.findall(text)
-
-    if len(text) < SHORT_TEXT_THRESHOLD:
-        return len(corruption_matches) <= MINIMUM_CORRUPTED_RESULTS
-
-    return (len(corruption_matches) / len(text)) < corruption_threshold
-
-
-async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
-    """Convert a PDF file to images.
-
-    Args:
-        input_file: The path to the PDF file.
-
-    Raises:
-        ParsingError: If the PDF file could not be converted to images.
-
-    Returns:
-        A list of Pillow Images.
-    """
-    document: pypdfium2.PdfDocument | None = None
-    try:
-        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
-    except pypdfium2.PdfiumError as e:
-        raise ParsingError(
-            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
-        ) from e
-    finally:
-        if document:
-            await run_sync(document.close)
-
-
-async def _extract_pdf_text_with_ocr(
-    input_file: Path,
-    *,
-    language: str = "eng",
-    max_processes: int,
-    psm: PSMMode = PSMMode.AUTO,
-) -> ExtractionResult:
-    """Extract text from a scanned PDF file using pytesseract.
-
-    Args:
-        input_file: The path to the PDF file.
-        language: The language code for OCR. Defaults to "eng".
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
-        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
-
-    Returns:
-        The extracted text.
-    """
-    images = await _convert_pdf_to_images(input_file)
-    ocr_results = await batch_process_images(images, max_processes=max_processes, psm=psm, language=language)
-    return ExtractionResult(
-        content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}
-    )
-
-
-async def _extract_pdf_searchable_text(input_file: Path) -> str:
-    """Extract text from a searchable PDF file using pypdfium2.
-
-    Args:
-        input_file: The path to the PDF file.
-
-    Raises:
-        ParsingError: If the text could not be extracted from the PDF file.
-
-    Returns:
-        The extracted text.
-    """
-    document: pypdfium2.PdfDocument | None = None
-    try:
-        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        text = "\n".join(page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document))
-        return normalize_spaces(text)
-    except pypdfium2.PdfiumError as e:
-        raise ParsingError(
-            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
-        ) from e
-    finally:
-        if document:
-            await run_sync(document.close)
-
-
-async def extract_pdf_file(
-    input_file: Path,
-    *,
-    force_ocr: bool,
-    language: str = "eng",
-    max_processes: int,
-    psm: PSMMode = PSMMode.AUTO,
-) -> ExtractionResult:
-    """Extract text from a PDF file.
-
-    Args:
-        input_file: The path to the PDF file.
-        force_ocr: Whether to force OCR on PDF files that have a text layer.
-        language: The language code for OCR. Defaults to "eng".
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
-        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
-
-    Returns:
-        The extracted text.
-    """
-    if not force_ocr:
-        content = await _extract_pdf_searchable_text(input_file)
-        if _validate_extracted_text(content):
-            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
-    return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
-
-
-async def extract_pdf_content(
-    content: bytes,
-    *,
-    force_ocr: bool,
-    language: str = "eng",
-    max_processes: int,
-    psm: PSMMode = PSMMode.AUTO,
-) -> ExtractionResult:
-    """Extract text from a PDF file content.
-
-    Args:
-        content: The PDF file content.
-        force_ocr: Whether to force OCR on PDF files that have a text layer.
-        language: The language code for OCR. Defaults to "eng".
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
-        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
-
-    Returns:
-        The extracted text.
-    """
-    from kreuzberg._tmp import create_temp_file
-
-    file_path, unlink = await create_temp_file(".pdf")
-    await AsyncPath(file_path).write_bytes(content)
-    result = await extract_pdf_file(
-        file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
-    )
-    await unlink()
-    return result
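The heart of the deleted module is the heuristic that decided whether to trust a PDF's text layer or fall back to OCR: text under 50 characters tolerates at most two control/replacement characters, longer text at most 5%. A quick check against the code above:

```python
# Exercising _validate_extracted_text from the deleted module above.
clean = "A normal, readable sentence pulled from a PDF text layer."
garbled = "\ufffd" * 10 + "x" * 90  # 10% replacement characters

assert _validate_extracted_text(clean) is True     # no corrupted characters
assert _validate_extracted_text(garbled) is False  # 0.10 >= 0.05 threshold
```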