kreuzberg 1.6.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,9 +1,13 @@
1
- from .exceptions import KreuzbergError, ParsingError, ValidationError
2
- from .extraction import ExtractionResult, extract_bytes, extract_file
1
+ from ._types import ExtractionResult, Metadata
2
+ from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
+ from .extraction import extract_bytes, extract_file
3
4
 
4
5
  __all__ = [
5
6
  "ExtractionResult",
6
7
  "KreuzbergError",
8
+ "Metadata",
9
+ "MissingDependencyError",
10
+ "OCRError",
7
11
  "ParsingError",
8
12
  "ValidationError",
9
13
  "extract_bytes",
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from multiprocessing import cpu_count
4
+ from typing import Final
5
+
6
+ DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
kreuzberg/_html.py ADDED
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import html_to_markdown
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg import ExtractionResult
9
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
+ from kreuzberg._string import normalize_spaces, safe_decode
11
+ from kreuzberg._sync import run_sync
12
+
13
+ if TYPE_CHECKING:
14
+ from pathlib import Path
15
+
16
+
17
+ async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
18
+ """Extract text from an HTML string.
19
+
20
+ Args:
21
+ file_path_or_contents: The HTML content.
22
+
23
+ Returns:
24
+ The extracted text content.
25
+ """
26
+ content = (
27
+ safe_decode(file_path_or_contents)
28
+ if isinstance(file_path_or_contents, bytes)
29
+ else await AsyncPath(file_path_or_contents).read_text()
30
+ )
31
+ result = await run_sync(html_to_markdown.convert_to_markdown, content)
32
+ return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_mime_types.py CHANGED
@@ -1,16 +1,30 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from mimetypes import guess_type
4
+ from pathlib import Path
3
5
  from typing import TYPE_CHECKING, Final
4
6
 
7
+ from kreuzberg.exceptions import ValidationError
8
+
5
9
  if TYPE_CHECKING: # pragma: no cover
6
10
  from collections.abc import Mapping
11
+ from os import PathLike
7
12
 
8
13
  HTML_MIME_TYPE: Final = "text/html"
9
14
  MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
15
  PDF_MIME_TYPE: Final = "application/pdf"
11
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
+ # Excel formats
13
19
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
20
+ EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
21
+ EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
22
+ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"
23
+ EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
24
+ EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
25
+
26
+ # OpenDocument spreadsheet format
27
+ OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
14
28
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
15
29
 
16
30
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -85,9 +99,103 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
85
99
  "text/x-rst",
86
100
  }
87
101
 
102
+ SPREADSHEET_MIME_TYPES: Final[set[str]] = {
103
+ EXCEL_MIME_TYPE,
104
+ EXCEL_BINARY_MIME_TYPE,
105
+ EXCEL_MACRO_MIME_TYPE,
106
+ EXCEL_BINARY_2007_MIME_TYPE,
107
+ EXCEL_ADDON_MIME_TYPE,
108
+ EXCEL_TEMPLATE_MIME_TYPE,
109
+ OPENDOC_SPREADSHEET_MIME_TYPE,
110
+ }
111
+
112
+ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
113
+ ".txt": PLAIN_TEXT_MIME_TYPE,
114
+ ".md": MARKDOWN_MIME_TYPE,
115
+ ".pdf": PDF_MIME_TYPE,
116
+ ".html": HTML_MIME_TYPE,
117
+ ".htm": HTML_MIME_TYPE,
118
+ ".xlsx": EXCEL_MIME_TYPE,
119
+ ".xls": EXCEL_BINARY_MIME_TYPE,
120
+ ".xlsm": EXCEL_MACRO_MIME_TYPE,
121
+ ".xlsb": EXCEL_BINARY_2007_MIME_TYPE,
122
+ ".xlam": EXCEL_ADDON_MIME_TYPE,
123
+ ".xla": EXCEL_TEMPLATE_MIME_TYPE,
124
+ ".ods": OPENDOC_SPREADSHEET_MIME_TYPE,
125
+ ".pptx": POWER_POINT_MIME_TYPE,
126
+ ".bmp": "image/bmp",
127
+ ".gif": "image/gif",
128
+ ".jpg": "image/jpeg",
129
+ ".jpeg": "image/jpeg",
130
+ ".png": "image/png",
131
+ ".tiff": "image/tiff",
132
+ ".tif": "image/tiff",
133
+ ".webp": "image/webp",
134
+ ".jp2": "image/jp2",
135
+ ".jpx": "image/jpx",
136
+ ".jpm": "image/jpm",
137
+ ".mj2": "image/mj2",
138
+ ".pnm": "image/x-portable-anymap",
139
+ ".pbm": "image/x-portable-bitmap",
140
+ ".pgm": "image/x-portable-graymap",
141
+ ".ppm": "image/x-portable-pixmap",
142
+ ".csv": "text/csv",
143
+ ".tsv": "text/tab-separated-values",
144
+ ".rst": "text/x-rst",
145
+ ".org": "text/x-org",
146
+ ".epub": "application/epub+zip",
147
+ ".rtf": "application/rtf",
148
+ ".odt": "application/vnd.oasis.opendocument.text",
149
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
150
+ ".bib": "application/x-bibtex",
151
+ ".ipynb": "application/x-ipynb+json",
152
+ ".tex": "application/x-latex",
153
+ }
154
+
88
155
  SUPPORTED_MIME_TYPES: Final[set[str]] = (
89
156
  PLAIN_TEXT_MIME_TYPES
90
157
  | IMAGE_MIME_TYPES
91
158
  | PANDOC_SUPPORTED_MIME_TYPES
92
- | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
159
+ | SPREADSHEET_MIME_TYPES
160
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
93
161
  )
162
+
163
+
164
+ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
165
+ """Validate and detect the MIME type for a given file.
166
+
167
+ Args:
168
+ file_path: The path to the file.
169
+ mime_type: Optional explicit MIME type. If provided, this will be validated.
170
+ If not provided, the function will attempt to detect the MIME type.
171
+
172
+ Raises:
173
+ ValidationError: If the MIME type is not supported or cannot be determined.
174
+
175
+ Returns:
176
+ The validated MIME type.
177
+ """
178
+ path = Path(file_path)
179
+
180
+ if not mime_type:
181
+ # Try to determine MIME type from file extension first
182
+ ext = path.suffix.lower()
183
+ mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
184
+
185
+ if not mime_type: # pragma: no cover
186
+ raise ValidationError(
187
+ "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
188
+ context={"input_file": str(path), "extension": ext},
189
+ )
190
+
191
+ if mime_type in SUPPORTED_MIME_TYPES:
192
+ return mime_type
193
+
194
+ for supported_mime_type in SUPPORTED_MIME_TYPES:
195
+ if mime_type.startswith(supported_mime_type):
196
+ return supported_mime_type
197
+
198
+ raise ValidationError(
199
+ f"Unsupported mime type: {mime_type}",
200
+ context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
201
+ )
kreuzberg/_pandoc.py CHANGED
@@ -1,26 +1,29 @@
1
1
  from __future__ import annotations
2
2
 
3
- import json
4
3
  import subprocess
5
- from asyncio import gather
6
- from dataclasses import dataclass
7
- from tempfile import NamedTemporaryFile
8
- from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
4
+ import sys
5
+ from functools import partial
6
+ from json import JSONDecodeError, loads
7
+ from typing import TYPE_CHECKING, Any, Final, Literal, cast
9
8
 
9
+ from anyio import CapacityLimiter, create_task_group, to_process
10
10
  from anyio import Path as AsyncPath
11
11
 
12
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES
13
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
12
14
  from kreuzberg._string import normalize_spaces
13
15
  from kreuzberg._sync import run_sync
16
+ from kreuzberg._tmp import create_temp_file
17
+ from kreuzberg._types import ExtractionResult, Metadata
14
18
  from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
15
19
 
16
- if TYPE_CHECKING:
20
+ if TYPE_CHECKING: # pragma: no cover
17
21
  from collections.abc import Mapping
18
22
  from os import PathLike
19
23
 
20
- try: # pragma: no cover
21
- from typing import NotRequired # type: ignore[attr-defined]
22
- except ImportError: # pragma: no cover
23
- from typing_extensions import NotRequired
24
+ if sys.version_info < (3, 11): # pragma: no cover
25
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
+
24
27
 
25
28
  version_ref: Final[dict[str, bool]] = {"checked": False}
26
29
 
@@ -80,7 +83,7 @@ NodeType = Literal[
80
83
  "MetaBlocks",
81
84
  ]
82
85
 
83
- PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
86
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
84
87
  "application/csl+json": "csljson",
85
88
  "application/docbook+xml": "docbook",
86
89
  "application/epub+zip": "epub",
@@ -112,64 +115,37 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
112
115
  "text/x-rst": "rst",
113
116
  }
114
117
 
115
-
116
- class Metadata(TypedDict, total=False):
117
- """Document metadata extracted from Pandoc document.
118
-
119
- All fields are optional but will only be included if they contain non-empty values.
120
- Any field that would be empty or None is omitted from the dictionary.
121
- """
122
-
123
- title: NotRequired[str]
124
- """Document title."""
125
- subtitle: NotRequired[str]
126
- """Document subtitle."""
127
- abstract: NotRequired[str | list[str]]
128
- """Document abstract, summary or description."""
129
- authors: NotRequired[list[str]]
130
- """List of document authors."""
131
- date: NotRequired[str]
132
- """Document date as string to preserve original format."""
133
- subject: NotRequired[str]
134
- """Document subject or topic."""
135
- description: NotRequired[str]
136
- """Extended description."""
137
- keywords: NotRequired[list[str]]
138
- """Keywords or tags."""
139
- categories: NotRequired[list[str]]
140
- """Categories or classifications."""
141
- version: NotRequired[str]
142
- """Version identifier."""
143
- language: NotRequired[str]
144
- """Document language code."""
145
- references: NotRequired[list[str]]
146
- """Reference entries."""
147
- citations: NotRequired[list[str]]
148
- """Citation identifiers."""
149
- copyright: NotRequired[str]
150
- """Copyright information."""
151
- license: NotRequired[str]
152
- """License information."""
153
- identifier: NotRequired[str]
154
- """Document identifier."""
155
- publisher: NotRequired[str]
156
- """Publisher name."""
157
- contributors: NotRequired[list[str]]
158
- """Additional contributors."""
159
- creator: NotRequired[str]
160
- """Document creator."""
161
- institute: NotRequired[str | list[str]]
162
- """Institute or organization."""
163
-
164
-
165
- @dataclass
166
- class PandocResult:
167
- """Result of a pandoc conversion including content and metadata."""
168
-
169
- content: str
170
- """The processed markdown content."""
171
- metadata: Metadata
172
- """Document metadata extracted from the source."""
118
+ MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
119
+ "application/csl+json": "json",
120
+ "application/docbook+xml": "xml",
121
+ "application/epub+zip": "epub",
122
+ "application/rtf": "rtf",
123
+ "application/vnd.oasis.opendocument.text": "odt",
124
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
125
+ "application/x-biblatex": "bib",
126
+ "application/x-bibtex": "bib",
127
+ "application/x-endnote+xml": "xml",
128
+ "application/x-fictionbook+xml": "fb2",
129
+ "application/x-ipynb+json": "ipynb",
130
+ "application/x-jats+xml": "xml",
131
+ "application/x-latex": "tex",
132
+ "application/x-opml+xml": "opml",
133
+ "application/x-research-info-systems": "ris",
134
+ "application/x-typst": "typst",
135
+ "text/csv": "csv",
136
+ "text/tab-separated-values": "tsv",
137
+ "text/troff": "1",
138
+ "text/x-commonmark": "md",
139
+ "text/x-dokuwiki": "wiki",
140
+ "text/x-gfm": "md",
141
+ "text/x-markdown": "md",
142
+ "text/x-markdown-extra": "md",
143
+ "text/x-mdoc": "md",
144
+ "text/x-multimarkdown": "md",
145
+ "text/x-org": "org",
146
+ "text/x-pod": "pod",
147
+ "text/x-rst": "rst",
148
+ }
173
149
 
174
150
 
175
151
  def _extract_inline_text(node: dict[str, Any]) -> str | None:
@@ -214,13 +190,14 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
214
190
  if node_type == META_LIST:
215
191
  results = []
216
192
  for value in [value for item in content if (value := _extract_meta_value(item))]:
217
- if isinstance(value, list):
218
- results.extend(value) # pragma: no cover
193
+ if isinstance(value, list): # pragma: no cover
194
+ results.extend(value)
219
195
  else:
220
196
  results.append(value)
221
197
  return results
222
198
 
223
- if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
199
+ # This branch is only taken for complex metadata blocks which we don't use
200
+ if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]: # pragma: no cover
224
201
  block_texts = []
225
202
  for block in blocks:
226
203
  block_content = block.get(CONTENT_FIELD, [])
@@ -232,7 +209,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
232
209
 
233
210
 
234
211
  def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
235
- """Extract all non-empty metadata values from Pandoc AST metadata."""
236
212
  meta: Metadata = {}
237
213
 
238
214
  for key, value in raw_meta.items():
@@ -252,34 +228,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
252
228
  return meta
253
229
 
254
230
 
255
- def _get_extension_from_mime_type(mime_type: str) -> str:
256
- if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
257
- mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
231
+ def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
232
+ if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
233
+ mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
258
234
  ):
259
235
  raise ValidationError(
260
236
  f"Unsupported mime type: {mime_type}",
261
237
  context={
262
238
  "mime_type": mime_type,
263
- "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
239
+ "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
264
240
  },
265
241
  )
266
242
 
267
- return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
268
- PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
243
+ return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
244
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
269
245
  )
270
246
 
271
247
 
272
- async def validate_pandoc_version() -> None:
273
- """Validate that Pandoc is installed and is version 3 or above.
274
-
275
- Raises:
276
- MissingDependencyError: If Pandoc is not installed or is below version 3.
277
- """
248
+ async def _validate_pandoc_version() -> None:
278
249
  try:
279
250
  if version_ref["checked"]:
280
251
  return
281
252
 
282
- result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
253
+ command = ["pandoc", "--version"]
254
+ result = await run_sync(subprocess.run, command, capture_output=True)
283
255
  version = result.stdout.decode().split("\n")[0].split()[1]
284
256
  if not version.startswith("3."):
285
257
  raise MissingDependencyError("Pandoc version 3 or above is required.")
@@ -290,127 +262,142 @@ async def validate_pandoc_version() -> None:
290
262
  raise MissingDependencyError("Pandoc is not installed.") from e
291
263
 
292
264
 
293
- async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
294
- """Extract metadata from a document using pandoc.
265
+ async def _handle_extract_metadata(
266
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
+ ) -> Metadata:
268
+ pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
+ metadata_file, unlink = await create_temp_file(".json")
270
+ try:
271
+ command = [
272
+ "pandoc",
273
+ str(input_file),
274
+ f"--from={pandoc_type}",
275
+ "--to=json",
276
+ "--standalone",
277
+ "--quiet",
278
+ "--output",
279
+ metadata_file,
280
+ ]
281
+
282
+ result = await to_process.run_sync(
283
+ partial(subprocess.run, capture_output=True),
284
+ command,
285
+ cancellable=True,
286
+ limiter=CapacityLimiter(max_processes),
287
+ )
288
+
289
+ if result.returncode != 0:
290
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
295
291
 
296
- Args:
297
- input_file: The path to the file to process.
298
- mime_type: The mime type of the file.
292
+ json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
293
+ return _extract_metadata(json_data)
294
+ except (RuntimeError, OSError, JSONDecodeError) as e:
295
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
296
+ finally:
297
+ await unlink()
299
298
 
300
- Raises:
301
- ParsingError: If Pandoc fails to extract metadata.
302
299
 
303
- Returns:
304
- Dictionary containing document metadata.
305
- """
306
- extension = _get_extension_from_mime_type(mime_type)
307
-
308
- with NamedTemporaryFile(suffix=".json") as metadata_file:
309
- try:
310
- command = [
311
- "pandoc",
312
- str(input_file),
313
- f"--from={extension}",
314
- "--to=json",
315
- "--standalone",
316
- "--quiet",
317
- "--output",
318
- metadata_file.name,
319
- ]
320
-
321
- result = await run_sync(
322
- subprocess.run,
323
- command,
324
- capture_output=True,
325
- )
326
-
327
- if result.returncode != 0:
328
- raise ParsingError(
329
- "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
330
- )
331
-
332
- json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
333
- return _extract_metadata(json_data)
334
-
335
- except (RuntimeError, OSError, json.JSONDecodeError) as e:
336
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
337
-
338
-
339
- async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
340
- extension = _get_extension_from_mime_type(mime_type)
341
-
342
- with NamedTemporaryFile(suffix=".md") as output_file:
300
+ async def _handle_extract_file(
301
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
302
+ ) -> str:
303
+ pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
+ output_path, unlink = await create_temp_file(".md")
305
+ try:
343
306
  command = [
344
307
  "pandoc",
345
308
  str(input_file),
346
- f"--from={extension}",
309
+ f"--from={pandoc_type}",
347
310
  "--to=markdown",
348
311
  "--standalone",
349
312
  "--wrap=preserve",
350
313
  "--quiet",
351
- "--output",
352
- output_file.name,
353
314
  ]
354
315
 
355
- if extra_args:
356
- command.extend(extra_args)
316
+ command.extend(["--output", str(output_path)])
357
317
 
358
- result = await run_sync(
359
- subprocess.run,
318
+ result = await to_process.run_sync(
319
+ partial(subprocess.run, capture_output=True),
360
320
  command,
361
- capture_output=True,
321
+ cancellable=True,
322
+ limiter=CapacityLimiter(max_processes),
362
323
  )
363
324
 
364
325
  if result.returncode != 0:
365
- raise ParsingError(
366
- "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
367
- )
326
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
368
327
 
369
- text = await AsyncPath(output_file.name).read_text()
328
+ text = await AsyncPath(output_path).read_text("utf-8")
370
329
 
371
330
  return normalize_spaces(text)
331
+ except (RuntimeError, OSError) as e:
332
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
333
+ finally:
334
+ await unlink()
372
335
 
373
336
 
374
- async def process_file(
375
- input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
376
- ) -> PandocResult:
337
+ async def process_file_with_pandoc(
338
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
+ ) -> ExtractionResult:
377
340
  """Process a single file using Pandoc and convert to markdown.
378
341
 
379
342
  Args:
380
343
  input_file: The path to the file to process.
381
344
  mime_type: The mime type of the file.
382
- extra_args: Additional Pandoc command line arguments.
345
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
+
347
+ Raises:
348
+ ParsingError: If the file data could not be extracted.
383
349
 
384
350
  Returns:
385
- PandocResult containing processed content and metadata.
351
+ ExtractionResult
386
352
  """
387
- await validate_pandoc_version()
353
+ await _validate_pandoc_version()
388
354
 
389
- metadata, content = await gather(
390
- *[
391
- extract_metadata(input_file, mime_type=mime_type),
392
- _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
393
- ]
394
- )
395
- return PandocResult(
396
- content=content, # type: ignore[arg-type]
397
- metadata=metadata, # type: ignore[arg-type]
355
+ _get_pandoc_type_from_mime_type(mime_type)
356
+
357
+ metadata: Metadata = {}
358
+ content: str = ""
359
+
360
+ try:
361
+ async with create_task_group() as tg:
362
+
363
+ async def _get_metadata() -> None:
364
+ nonlocal metadata
365
+ metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
+
367
+ async def _get_content() -> None:
368
+ nonlocal content
369
+ content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
370
+
371
+ tg.start_soon(_get_metadata)
372
+ tg.start_soon(_get_content)
373
+ except ExceptionGroup as eg:
374
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
+
376
+ return ExtractionResult(
377
+ content=normalize_spaces(content),
378
+ metadata=metadata,
379
+ mime_type=MARKDOWN_MIME_TYPE,
398
380
  )
399
381
 
400
382
 
401
- async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
383
+ async def process_content_with_pandoc(
384
+ content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
+ ) -> ExtractionResult:
402
386
  """Process content using Pandoc and convert to markdown.
403
387
 
404
388
  Args:
405
389
  content: The content to process.
406
390
  mime_type: The mime type of the content.
407
- extra_args: Additional Pandoc command line arguments.
391
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
408
392
 
409
393
  Returns:
410
- PandocResult containing processed content and metadata.
394
+ ExtractionResult
411
395
  """
412
- extension = _get_extension_from_mime_type(mime_type)
396
+ extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
397
+ input_file, unlink = await create_temp_file(f".{extension}")
398
+
399
+ await AsyncPath(input_file).write_bytes(content)
400
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
413
401
 
414
- with NamedTemporaryFile(suffix=f".{extension}") as input_file:
415
- await AsyncPath(input_file.name).write_bytes(content)
416
- return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
402
+ await unlink()
403
+ return result