kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +154 -167
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +101 -64
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.0.dist-info/METADATA +419 -0
- kreuzberg-2.0.0.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -247
- kreuzberg-1.6.0.dist-info/METADATA +0 -317
- kreuzberg-1.6.0.dist-info/RECORD +0 -15
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.6.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,9 +1,13 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from ._types import ExtractionResult, Metadata
|
2
|
+
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
3
|
+
from .extraction import extract_bytes, extract_file
|
3
4
|
|
4
5
|
__all__ = [
|
5
6
|
"ExtractionResult",
|
6
7
|
"KreuzbergError",
|
8
|
+
"Metadata",
|
9
|
+
"MissingDependencyError",
|
10
|
+
"OCRError",
|
7
11
|
"ParsingError",
|
8
12
|
"ValidationError",
|
9
13
|
"extract_bytes",
|
kreuzberg/_constants.py
ADDED
kreuzberg/_html.py
ADDED
@@ -0,0 +1,32 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
import html_to_markdown
|
6
|
+
from anyio import Path as AsyncPath
|
7
|
+
|
8
|
+
from kreuzberg import ExtractionResult
|
9
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
10
|
+
from kreuzberg._string import normalize_spaces, safe_decode
|
11
|
+
from kreuzberg._sync import run_sync
|
12
|
+
|
13
|
+
if TYPE_CHECKING:
|
14
|
+
from pathlib import Path
|
15
|
+
|
16
|
+
|
17
|
+
async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Convert an HTML document to markdown.

    Args:
        file_path_or_contents: Either the raw HTML as bytes, or the path of an HTML file to read.

    Returns:
        An ExtractionResult carrying the space-normalized markdown, the markdown MIME type
        and an empty metadata mapping.
    """
    # Files are read as bytes and decoded with the same helper used for in-memory input, so both
    # input forms are decoded identically instead of the file path depending on the platform's
    # default text encoding.
    # NOTE(review): assumes safe_decode copes with arbitrary file bytes - confirm in kreuzberg._string.
    raw_html = (
        file_path_or_contents
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_bytes()
    )
    # The converter is synchronous; run_sync keeps it off the event loop.
    markdown = await run_sync(html_to_markdown.convert_to_markdown, safe_decode(raw_html))
    return ExtractionResult(content=normalize_spaces(markdown), mime_type=MARKDOWN_MIME_TYPE, metadata={})
|
kreuzberg/_mime_types.py
CHANGED
@@ -1,16 +1,30 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from mimetypes import guess_type
|
4
|
+
from pathlib import Path
|
3
5
|
from typing import TYPE_CHECKING, Final
|
4
6
|
|
7
|
+
from kreuzberg.exceptions import ValidationError
|
8
|
+
|
5
9
|
if TYPE_CHECKING: # pragma: no cover
|
6
10
|
from collections.abc import Mapping
|
11
|
+
from os import PathLike
|
7
12
|
|
8
13
|
HTML_MIME_TYPE: Final = "text/html"
|
9
14
|
MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
10
15
|
PDF_MIME_TYPE: Final = "application/pdf"
|
11
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
|
+
# Excel formats
|
13
19
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
20
|
+
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
21
|
+
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
22
|
+
EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"
|
23
|
+
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
24
|
+
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
25
|
+
|
26
|
+
# OpenDocument spreadsheet format
|
27
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
|
14
28
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
15
29
|
|
16
30
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -85,9 +99,103 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
|
85
99
|
"text/x-rst",
|
86
100
|
}
|
87
101
|
|
102
|
+
SPREADSHEET_MIME_TYPES: Final[set[str]] = {
|
103
|
+
EXCEL_MIME_TYPE,
|
104
|
+
EXCEL_BINARY_MIME_TYPE,
|
105
|
+
EXCEL_MACRO_MIME_TYPE,
|
106
|
+
EXCEL_BINARY_2007_MIME_TYPE,
|
107
|
+
EXCEL_ADDON_MIME_TYPE,
|
108
|
+
EXCEL_TEMPLATE_MIME_TYPE,
|
109
|
+
OPENDOC_SPREADSHEET_MIME_TYPE,
|
110
|
+
}
|
111
|
+
|
112
|
+
# Lower-case file extension (including the leading dot) -> MIME type.
EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
    # Text and markup
    ".txt": PLAIN_TEXT_MIME_TYPE,
    ".md": MARKDOWN_MIME_TYPE,
    ".pdf": PDF_MIME_TYPE,
    ".html": HTML_MIME_TYPE,
    ".htm": HTML_MIME_TYPE,
    # Spreadsheets
    ".xlsx": EXCEL_MIME_TYPE,
    ".xls": EXCEL_BINARY_MIME_TYPE,
    ".xlsm": EXCEL_MACRO_MIME_TYPE,
    ".xlsb": EXCEL_BINARY_2007_MIME_TYPE,
    ".xlam": EXCEL_ADDON_MIME_TYPE,
    # Legacy ".xla" add-ins are registered as plain "application/vnd.ms-excel"; the macro-enabled
    # template MIME type belongs to ".xltm".
    ".xla": EXCEL_BINARY_MIME_TYPE,
    ".xltm": EXCEL_TEMPLATE_MIME_TYPE,
    ".ods": OPENDOC_SPREADSHEET_MIME_TYPE,
    # Presentations
    ".pptx": POWER_POINT_MIME_TYPE,
    # Images
    ".bmp": "image/bmp",
    ".gif": "image/gif",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".png": "image/png",
    ".tiff": "image/tiff",
    ".tif": "image/tiff",
    ".webp": "image/webp",
    ".jp2": "image/jp2",
    ".jpx": "image/jpx",
    ".jpm": "image/jpm",
    ".mj2": "image/mj2",
    ".pnm": "image/x-portable-anymap",
    ".pbm": "image/x-portable-bitmap",
    ".pgm": "image/x-portable-graymap",
    ".ppm": "image/x-portable-pixmap",
    # Document formats handled through their own MIME strings
    ".csv": "text/csv",
    ".tsv": "text/tab-separated-values",
    ".rst": "text/x-rst",
    ".org": "text/x-org",
    ".epub": "application/epub+zip",
    ".rtf": "application/rtf",
    ".odt": "application/vnd.oasis.opendocument.text",
    ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
    ".bib": "application/x-bibtex",
    ".ipynb": "application/x-ipynb+json",
    ".tex": "application/x-latex",
}
|
154
|
+
|
88
155
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
89
156
|
PLAIN_TEXT_MIME_TYPES
|
90
157
|
| IMAGE_MIME_TYPES
|
91
158
|
| PANDOC_SUPPORTED_MIME_TYPES
|
92
|
-
|
|
159
|
+
| SPREADSHEET_MIME_TYPES
|
160
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
93
161
|
)
|
162
|
+
|
163
|
+
|
164
|
+
def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
    """Validate an explicit MIME type, or detect one from the file name.

    Only the name of ``file_path`` is inspected (suffix and base name); the file is not opened.

    Args:
        file_path: The path to the file.
        mime_type: Optional explicit MIME type. If provided, it is validated as given.
            If empty or None, the type is looked up from the file extension, falling back to
            the standard library's ``mimetypes.guess_type``.

    Raises:
        ValidationError: If the MIME type is not supported or cannot be determined.

    Returns:
        The validated MIME type. When the given or detected value is not itself supported but
        starts with a supported type (for example a type followed by parameters), the longest
        such supported type is returned.
    """
    path = Path(file_path)

    if not mime_type:
        # The package's own extension table takes priority over the stdlib guess.
        ext = path.suffix.lower()
        mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]

        if not mime_type:  # pragma: no cover
            raise ValidationError(
                "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
                context={"input_file": str(path), "extension": ext},
            )

    if mime_type in SUPPORTED_MIME_TYPES:
        return mime_type

    # SUPPORTED_MIME_TYPES is a set, so its iteration order is not stable. Several supported types
    # can be prefixes of the same input (e.g. "text/x-markdown" and "text/x-markdown-extra"), so
    # choose the longest, most specific one rather than whichever is iterated first. Two distinct
    # prefixes of one string never share a length, which makes the choice deterministic.
    prefix_matches = [supported for supported in SUPPORTED_MIME_TYPES if mime_type.startswith(supported)]
    if prefix_matches:
        return max(prefix_matches, key=len)

    raise ValidationError(
        f"Unsupported mime type: {mime_type}",
        context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
    )
|
kreuzberg/_pandoc.py
CHANGED
@@ -1,26 +1,29 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
import json
|
4
3
|
import subprocess
|
5
|
-
|
6
|
-
from
|
7
|
-
from
|
8
|
-
from typing import TYPE_CHECKING, Any, Final, Literal,
|
4
|
+
import sys
|
5
|
+
from functools import partial
|
6
|
+
from json import JSONDecodeError, loads
|
7
|
+
from typing import TYPE_CHECKING, Any, Final, Literal, cast
|
9
8
|
|
9
|
+
from anyio import CapacityLimiter, create_task_group, to_process
|
10
10
|
from anyio import Path as AsyncPath
|
11
11
|
|
12
|
+
from kreuzberg._constants import DEFAULT_MAX_PROCESSES
|
13
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
12
14
|
from kreuzberg._string import normalize_spaces
|
13
15
|
from kreuzberg._sync import run_sync
|
16
|
+
from kreuzberg._tmp import create_temp_file
|
17
|
+
from kreuzberg._types import ExtractionResult, Metadata
|
14
18
|
from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
|
15
19
|
|
16
|
-
if TYPE_CHECKING:
|
20
|
+
if TYPE_CHECKING: # pragma: no cover
|
17
21
|
from collections.abc import Mapping
|
18
22
|
from os import PathLike
|
19
23
|
|
20
|
-
|
21
|
-
from
|
22
|
-
|
23
|
-
from typing_extensions import NotRequired
|
24
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
25
|
+
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
26
|
+
|
24
27
|
|
25
28
|
version_ref: Final[dict[str, bool]] = {"checked": False}
|
26
29
|
|
@@ -80,7 +83,7 @@ NodeType = Literal[
|
|
80
83
|
"MetaBlocks",
|
81
84
|
]
|
82
85
|
|
83
|
-
|
86
|
+
MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
|
84
87
|
"application/csl+json": "csljson",
|
85
88
|
"application/docbook+xml": "docbook",
|
86
89
|
"application/epub+zip": "epub",
|
@@ -112,64 +115,37 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
|
|
112
115
|
"text/x-rst": "rst",
|
113
116
|
}
|
114
117
|
|
115
|
-
|
116
|
-
|
117
|
-
"""
|
118
|
-
|
119
|
-
|
120
|
-
|
121
|
-
"""
|
122
|
-
|
123
|
-
|
124
|
-
""
|
125
|
-
|
126
|
-
""
|
127
|
-
|
128
|
-
""
|
129
|
-
|
130
|
-
""
|
131
|
-
|
132
|
-
""
|
133
|
-
|
134
|
-
""
|
135
|
-
|
136
|
-
""
|
137
|
-
|
138
|
-
""
|
139
|
-
|
140
|
-
""
|
141
|
-
|
142
|
-
""
|
143
|
-
|
144
|
-
""
|
145
|
-
|
146
|
-
"""Reference entries."""
|
147
|
-
citations: NotRequired[list[str]]
|
148
|
-
"""Citation identifiers."""
|
149
|
-
copyright: NotRequired[str]
|
150
|
-
"""Copyright information."""
|
151
|
-
license: NotRequired[str]
|
152
|
-
"""License information."""
|
153
|
-
identifier: NotRequired[str]
|
154
|
-
"""Document identifier."""
|
155
|
-
publisher: NotRequired[str]
|
156
|
-
"""Publisher name."""
|
157
|
-
contributors: NotRequired[list[str]]
|
158
|
-
"""Additional contributors."""
|
159
|
-
creator: NotRequired[str]
|
160
|
-
"""Document creator."""
|
161
|
-
institute: NotRequired[str | list[str]]
|
162
|
-
"""Institute or organization."""
|
163
|
-
|
164
|
-
|
165
|
-
@dataclass
|
166
|
-
class PandocResult:
|
167
|
-
"""Result of a pandoc conversion including content and metadata."""
|
168
|
-
|
169
|
-
content: str
|
170
|
-
"""The processed markdown content."""
|
171
|
-
metadata: Metadata
|
172
|
-
"""Document metadata extracted from the source."""
|
118
|
+
MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
|
119
|
+
"application/csl+json": "json",
|
120
|
+
"application/docbook+xml": "xml",
|
121
|
+
"application/epub+zip": "epub",
|
122
|
+
"application/rtf": "rtf",
|
123
|
+
"application/vnd.oasis.opendocument.text": "odt",
|
124
|
+
"application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
|
125
|
+
"application/x-biblatex": "bib",
|
126
|
+
"application/x-bibtex": "bib",
|
127
|
+
"application/x-endnote+xml": "xml",
|
128
|
+
"application/x-fictionbook+xml": "fb2",
|
129
|
+
"application/x-ipynb+json": "ipynb",
|
130
|
+
"application/x-jats+xml": "xml",
|
131
|
+
"application/x-latex": "tex",
|
132
|
+
"application/x-opml+xml": "opml",
|
133
|
+
"application/x-research-info-systems": "ris",
|
134
|
+
"application/x-typst": "typst",
|
135
|
+
"text/csv": "csv",
|
136
|
+
"text/tab-separated-values": "tsv",
|
137
|
+
"text/troff": "1",
|
138
|
+
"text/x-commonmark": "md",
|
139
|
+
"text/x-dokuwiki": "wiki",
|
140
|
+
"text/x-gfm": "md",
|
141
|
+
"text/x-markdown": "md",
|
142
|
+
"text/x-markdown-extra": "md",
|
143
|
+
"text/x-mdoc": "md",
|
144
|
+
"text/x-multimarkdown": "md",
|
145
|
+
"text/x-org": "org",
|
146
|
+
"text/x-pod": "pod",
|
147
|
+
"text/x-rst": "rst",
|
148
|
+
}
|
173
149
|
|
174
150
|
|
175
151
|
def _extract_inline_text(node: dict[str, Any]) -> str | None:
|
@@ -214,13 +190,14 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
|
|
214
190
|
if node_type == META_LIST:
|
215
191
|
results = []
|
216
192
|
for value in [value for item in content if (value := _extract_meta_value(item))]:
|
217
|
-
if isinstance(value, list):
|
218
|
-
results.extend(value)
|
193
|
+
if isinstance(value, list): # pragma: no cover
|
194
|
+
results.extend(value)
|
219
195
|
else:
|
220
196
|
results.append(value)
|
221
197
|
return results
|
222
198
|
|
223
|
-
|
199
|
+
# This branch is only taken for complex metadata blocks which we don't use
|
200
|
+
if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]: # pragma: no cover
|
224
201
|
block_texts = []
|
225
202
|
for block in blocks:
|
226
203
|
block_content = block.get(CONTENT_FIELD, [])
|
@@ -232,7 +209,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
|
|
232
209
|
|
233
210
|
|
234
211
|
def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
235
|
-
"""Extract all non-empty metadata values from Pandoc AST metadata."""
|
236
212
|
meta: Metadata = {}
|
237
213
|
|
238
214
|
for key, value in raw_meta.items():
|
@@ -252,34 +228,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
|
|
252
228
|
return meta
|
253
229
|
|
254
230
|
|
255
|
-
def
|
256
|
-
if mime_type not in
|
257
|
-
mime_type.startswith(value) for value in
|
231
|
+
def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
    """Resolve the pandoc input format name for a MIME type.

    An exact entry in MIMETYPE_TO_PANDOC_TYPE_MAPPING wins. Otherwise a MIME type that merely
    starts with a known key (for example one carrying trailing parameters) resolves to the
    longest matching key.

    Args:
        mime_type: The MIME type to resolve.

    Raises:
        ValidationError: If neither an exact nor a prefix match exists.

    Returns:
        The value to pass to pandoc's ``--from`` option.
    """
    if pandoc_type := MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type):
        return pandoc_type

    # The previous guard combined the membership test and the prefix test with "or", which rejected
    # every non-exact MIME type before the prefix fallback could run, and the fallback compared in
    # the wrong direction (key.startswith(mime_type)). The input must start with the known key.
    prefix_matches = [known for known in MIMETYPE_TO_PANDOC_TYPE_MAPPING if mime_type.startswith(known)]
    if prefix_matches:
        # Longest key = most specific format (e.g. "text/x-markdown-extra" over "text/x-markdown").
        return MIMETYPE_TO_PANDOC_TYPE_MAPPING[max(prefix_matches, key=len)]

    raise ValidationError(
        f"Unsupported mime type: {mime_type}",
        context={
            "mime_type": mime_type,
            "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
        },
    )
|
270
246
|
|
271
247
|
|
272
|
-
async def
|
273
|
-
"""Validate that Pandoc is installed and is version 3 or above.
|
274
|
-
|
275
|
-
Raises:
|
276
|
-
MissingDependencyError: If Pandoc is not installed or is below version 3.
|
277
|
-
"""
|
248
|
+
async def _validate_pandoc_version() -> None:
|
278
249
|
try:
|
279
250
|
if version_ref["checked"]:
|
280
251
|
return
|
281
252
|
|
282
|
-
|
253
|
+
command = ["pandoc", "--version"]
|
254
|
+
result = await run_sync(subprocess.run, command, capture_output=True)
|
283
255
|
version = result.stdout.decode().split("\n")[0].split()[1]
|
284
256
|
if not version.startswith("3."):
|
285
257
|
raise MissingDependencyError("Pandoc version 3 or above is required.")
|
@@ -290,127 +262,142 @@ async def validate_pandoc_version() -> None:
|
|
290
262
|
raise MissingDependencyError("Pandoc is not installed.") from e
|
291
263
|
|
292
264
|
|
293
|
-
async def
|
294
|
-
|
265
|
+
async def _handle_extract_metadata(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> Metadata:
    """Run pandoc to dump a file as JSON and extract its metadata.

    Args:
        input_file: The path of the file to inspect.
        mime_type: The MIME type of the file; it selects the pandoc input format.
        max_processes: Capacity of the limiter passed to the worker-process call.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if running it, reading its
            output or decoding the JSON fails.

    Returns:
        The metadata extracted from pandoc's JSON output.
    """
    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
    metadata_file, unlink = await create_temp_file(".json")
    try:
        command = [
            "pandoc",
            str(input_file),
            f"--from={pandoc_type}",
            "--to=json",
            "--standalone",
            "--quiet",
            "--output",
            # Stringified like the sibling content extractor, so argv holds only strings.
            str(metadata_file),
        ]

        # NOTE(review): a new CapacityLimiter is built on every call, so it cannot bound
        # concurrency across calls - confirm whether a shared limiter was intended.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            cancellable=True,
            limiter=CapacityLimiter(max_processes),
        )

        if result.returncode != 0:
            # capture_output without text mode yields bytes; decode so the context is readable.
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(input_file), "error": result.stderr.decode(errors="replace")},
            )

        json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
        return _extract_metadata(json_data)
    except (RuntimeError, OSError, JSONDecodeError) as e:
        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
    finally:
        # Always remove the temporary JSON file, on success and on failure.
        await unlink()
|
299
298
|
|
300
|
-
Raises:
|
301
|
-
ParsingError: If Pandoc fails to extract metadata.
|
302
299
|
|
303
|
-
|
304
|
-
|
305
|
-
|
306
|
-
|
307
|
-
|
308
|
-
|
309
|
-
try:
|
310
|
-
command = [
|
311
|
-
"pandoc",
|
312
|
-
str(input_file),
|
313
|
-
f"--from={extension}",
|
314
|
-
"--to=json",
|
315
|
-
"--standalone",
|
316
|
-
"--quiet",
|
317
|
-
"--output",
|
318
|
-
metadata_file.name,
|
319
|
-
]
|
320
|
-
|
321
|
-
result = await run_sync(
|
322
|
-
subprocess.run,
|
323
|
-
command,
|
324
|
-
capture_output=True,
|
325
|
-
)
|
326
|
-
|
327
|
-
if result.returncode != 0:
|
328
|
-
raise ParsingError(
|
329
|
-
"Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
|
330
|
-
)
|
331
|
-
|
332
|
-
json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
|
333
|
-
return _extract_metadata(json_data)
|
334
|
-
|
335
|
-
except (RuntimeError, OSError, json.JSONDecodeError) as e:
|
336
|
-
raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
|
337
|
-
|
338
|
-
|
339
|
-
async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
|
340
|
-
extension = _get_extension_from_mime_type(mime_type)
|
341
|
-
|
342
|
-
with NamedTemporaryFile(suffix=".md") as output_file:
|
300
|
+
async def _handle_extract_file(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> str:
    """Run pandoc to convert a file to markdown and return the text.

    Args:
        input_file: The path of the file to convert.
        mime_type: The MIME type of the file; it selects the pandoc input format.
        max_processes: Capacity of the limiter passed to the worker-process call.

    Raises:
        ParsingError: If pandoc exits with a non-zero status, or if running it or reading
            its output fails.

    Returns:
        The markdown produced by pandoc, with spaces normalized.
    """
    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
    output_path, unlink = await create_temp_file(".md")
    try:
        command = [
            "pandoc",
            str(input_file),
            f"--from={pandoc_type}",
            "--to=markdown",
            "--standalone",
            "--wrap=preserve",
            "--quiet",
            "--output",
            str(output_path),
        ]

        # NOTE(review): a new CapacityLimiter is built on every call, so it cannot bound
        # concurrency across calls - confirm whether a shared limiter was intended.
        result = await to_process.run_sync(
            partial(subprocess.run, capture_output=True),
            command,
            cancellable=True,
            limiter=CapacityLimiter(max_processes),
        )

        if result.returncode != 0:
            # capture_output without text mode yields bytes; decode so the context is readable.
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(input_file), "error": result.stderr.decode(errors="replace")},
            )

        text = await AsyncPath(output_path).read_text("utf-8")

        return normalize_spaces(text)
    except (RuntimeError, OSError) as e:
        raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
    finally:
        # Always remove the temporary markdown file, on success and on failure.
        await unlink()
|
372
335
|
|
373
336
|
|
374
|
-
async def
|
375
|
-
input_file: str | PathLike[str], *, mime_type: str,
|
376
|
-
) ->
|
337
|
+
async def process_file_with_pandoc(
    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> ExtractionResult:
    """Process a single file using Pandoc and convert to markdown.

    Metadata extraction and content conversion are started as two tasks in one task group, so
    the two pandoc invocations can overlap.

    Args:
        input_file: The path to the file to process.
        mime_type: The mime type of the file.
        max_processes: Limiter capacity forwarded to each pandoc invocation.
            Defaults to DEFAULT_MAX_PROCESSES.

    Raises:
        ParsingError: If the file data could not be extracted.

    Returns:
        ExtractionResult
    """
    await _validate_pandoc_version()

    # Called only for its validation side effect: an unsupported mime type is rejected here,
    # before any task is started.
    _get_pandoc_type_from_mime_type(mime_type)

    metadata: Metadata = {}
    content: str = ""

    try:
        async with create_task_group() as tg:

            async def _get_metadata() -> None:
                nonlocal metadata
                metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)

            async def _get_content() -> None:
                nonlocal content
                content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)

            tg.start_soon(_get_metadata)
            tg.start_soon(_get_content)
    except ExceptionGroup as eg:
        # Both tasks may fail. Keep every failure message in the context so none is lost;
        # the first exception is still the chained cause.
        raise ParsingError(
            "Failed to extract file data",
            context={"file": str(input_file), "errors": "; ".join(str(exc) for exc in eg.exceptions)},
        ) from eg.exceptions[0]

    return ExtractionResult(
        content=normalize_spaces(content),
        metadata=metadata,
        mime_type=MARKDOWN_MIME_TYPE,
    )
|
399
381
|
|
400
382
|
|
401
|
-
async def
|
383
|
+
async def process_content_with_pandoc(
    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
) -> ExtractionResult:
    """Process content using Pandoc and convert to markdown.

    The bytes are written to a temporary file, which is then handed to
    ``process_file_with_pandoc``.

    Args:
        content: The content to process.
        mime_type: The mime type of the content.
        max_processes: Limiter capacity forwarded to each pandoc invocation.
            Defaults to DEFAULT_MAX_PROCESSES.

    Returns:
        ExtractionResult
    """
    # The suffix is taken from the MIME type where known; "md" is the fallback.
    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
    input_file, unlink = await create_temp_file(f".{extension}")

    try:
        await AsyncPath(input_file).write_bytes(content)
        return await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
    finally:
        # Remove the temporary input even when writing or processing raises; previously the
        # file was left behind on any error.
        await unlink()
|