kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/__init__.py CHANGED
@@ -1,9 +1,13 @@
1
- from .exceptions import KreuzbergError, ParsingError, ValidationError
2
- from .extraction import ExtractionResult, extract_bytes, extract_file
1
+ from ._types import ExtractionResult, Metadata
2
+ from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
3
+ from .extraction import extract_bytes, extract_file
3
4
 
4
5
  __all__ = [
5
6
  "ExtractionResult",
6
7
  "KreuzbergError",
8
+ "Metadata",
9
+ "MissingDependencyError",
10
+ "OCRError",
7
11
  "ParsingError",
8
12
  "ValidationError",
9
13
  "extract_bytes",
kreuzberg/_constants.py ADDED
@@ -0,0 +1,6 @@
1
+ from __future__ import annotations
2
+
3
+ from multiprocessing import cpu_count
4
+ from typing import Final
5
+
6
+ DEFAULT_MAX_PROCESSES: Final[int] = max(cpu_count() // 2, 1)
kreuzberg/_html.py ADDED
@@ -0,0 +1,32 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ import html_to_markdown
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg import ExtractionResult
9
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
10
+ from kreuzberg._string import normalize_spaces, safe_decode
11
+ from kreuzberg._sync import run_sync
12
+
13
+ if TYPE_CHECKING:
14
+ from pathlib import Path
15
+
16
+
17
+ async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
18
+ """Extract text from an HTML string.
19
+
20
+ Args:
21
+ file_path_or_contents: The HTML content.
22
+
23
+ Returns:
24
+ The extracted text content.
25
+ """
26
+ content = (
27
+ safe_decode(file_path_or_contents)
28
+ if isinstance(file_path_or_contents, bytes)
29
+ else await AsyncPath(file_path_or_contents).read_text()
30
+ )
31
+ result = await run_sync(html_to_markdown.convert_to_markdown, content)
32
+ return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_mime_types.py CHANGED
@@ -1,16 +1,30 @@
1
1
  from __future__ import annotations
2
2
 
3
+ from mimetypes import guess_type
4
+ from pathlib import Path
3
5
  from typing import TYPE_CHECKING, Final
4
6
 
7
+ from kreuzberg.exceptions import ValidationError
8
+
5
9
  if TYPE_CHECKING: # pragma: no cover
6
10
  from collections.abc import Mapping
11
+ from os import PathLike
7
12
 
8
13
  HTML_MIME_TYPE: Final = "text/html"
9
14
  MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
15
  PDF_MIME_TYPE: Final = "application/pdf"
11
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
+ # Excel formats
13
19
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
20
+ EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
21
+ EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
22
+ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macroEnabled.12"
23
+ EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
24
+ EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
25
+
26
+ # OpenDocument spreadsheet format
27
+ OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
14
28
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
15
29
 
16
30
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -85,9 +99,103 @@ PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
85
99
  "text/x-rst",
86
100
  }
87
101
 
102
+ SPREADSHEET_MIME_TYPES: Final[set[str]] = {
103
+ EXCEL_MIME_TYPE,
104
+ EXCEL_BINARY_MIME_TYPE,
105
+ EXCEL_MACRO_MIME_TYPE,
106
+ EXCEL_BINARY_2007_MIME_TYPE,
107
+ EXCEL_ADDON_MIME_TYPE,
108
+ EXCEL_TEMPLATE_MIME_TYPE,
109
+ OPENDOC_SPREADSHEET_MIME_TYPE,
110
+ }
111
+
112
+ EXT_TO_MIME_TYPE: Final[Mapping[str, str]] = {
113
+ ".txt": PLAIN_TEXT_MIME_TYPE,
114
+ ".md": MARKDOWN_MIME_TYPE,
115
+ ".pdf": PDF_MIME_TYPE,
116
+ ".html": HTML_MIME_TYPE,
117
+ ".htm": HTML_MIME_TYPE,
118
+ ".xlsx": EXCEL_MIME_TYPE,
119
+ ".xls": EXCEL_BINARY_MIME_TYPE,
120
+ ".xlsm": EXCEL_MACRO_MIME_TYPE,
121
+ ".xlsb": EXCEL_BINARY_2007_MIME_TYPE,
122
+ ".xlam": EXCEL_ADDON_MIME_TYPE,
123
+ ".xla": EXCEL_TEMPLATE_MIME_TYPE,
124
+ ".ods": OPENDOC_SPREADSHEET_MIME_TYPE,
125
+ ".pptx": POWER_POINT_MIME_TYPE,
126
+ ".bmp": "image/bmp",
127
+ ".gif": "image/gif",
128
+ ".jpg": "image/jpeg",
129
+ ".jpeg": "image/jpeg",
130
+ ".png": "image/png",
131
+ ".tiff": "image/tiff",
132
+ ".tif": "image/tiff",
133
+ ".webp": "image/webp",
134
+ ".jp2": "image/jp2",
135
+ ".jpx": "image/jpx",
136
+ ".jpm": "image/jpm",
137
+ ".mj2": "image/mj2",
138
+ ".pnm": "image/x-portable-anymap",
139
+ ".pbm": "image/x-portable-bitmap",
140
+ ".pgm": "image/x-portable-graymap",
141
+ ".ppm": "image/x-portable-pixmap",
142
+ ".csv": "text/csv",
143
+ ".tsv": "text/tab-separated-values",
144
+ ".rst": "text/x-rst",
145
+ ".org": "text/x-org",
146
+ ".epub": "application/epub+zip",
147
+ ".rtf": "application/rtf",
148
+ ".odt": "application/vnd.oasis.opendocument.text",
149
+ ".docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
150
+ ".bib": "application/x-bibtex",
151
+ ".ipynb": "application/x-ipynb+json",
152
+ ".tex": "application/x-latex",
153
+ }
154
+
88
155
  SUPPORTED_MIME_TYPES: Final[set[str]] = (
89
156
  PLAIN_TEXT_MIME_TYPES
90
157
  | IMAGE_MIME_TYPES
91
158
  | PANDOC_SUPPORTED_MIME_TYPES
92
- | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
159
+ | SPREADSHEET_MIME_TYPES
160
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
93
161
  )
162
+
163
+
164
+ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
165
+ """Validate and detect the MIME type for a given file.
166
+
167
+ Args:
168
+ file_path: The path to the file.
169
+ mime_type: Optional explicit MIME type. If provided, this will be validated.
170
+ If not provided, the function will attempt to detect the MIME type.
171
+
172
+ Raises:
173
+ ValidationError: If the MIME type is not supported or cannot be determined.
174
+
175
+ Returns:
176
+ The validated MIME type.
177
+ """
178
+ path = Path(file_path)
179
+
180
+ if not mime_type:
181
+ # Try to determine MIME type from file extension first
182
+ ext = path.suffix.lower()
183
+ mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
184
+
185
+ if not mime_type: # pragma: no cover
186
+ raise ValidationError(
187
+ "Could not determine the mime type of the file. Please specify the mime_type parameter explicitly.",
188
+ context={"input_file": str(path), "extension": ext},
189
+ )
190
+
191
+ if mime_type in SUPPORTED_MIME_TYPES:
192
+ return mime_type
193
+
194
+ for supported_mime_type in SUPPORTED_MIME_TYPES:
195
+ if mime_type.startswith(supported_mime_type):
196
+ return supported_mime_type
197
+
198
+ raise ValidationError(
199
+ f"Unsupported mime type: {mime_type}",
200
+ context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
201
+ )
kreuzberg/_pandoc.py CHANGED
@@ -1,26 +1,29 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import subprocess
4
- from asyncio import gather
5
- from dataclasses import dataclass
4
+ import sys
5
+ from functools import partial
6
6
  from json import JSONDecodeError, loads
7
- from tempfile import NamedTemporaryFile
8
- from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
7
+ from typing import TYPE_CHECKING, Any, Final, Literal, cast
9
8
 
9
+ from anyio import CapacityLimiter, create_task_group, to_process
10
10
  from anyio import Path as AsyncPath
11
11
 
12
+ from kreuzberg._constants import DEFAULT_MAX_PROCESSES
13
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
12
14
  from kreuzberg._string import normalize_spaces
13
15
  from kreuzberg._sync import run_sync
16
+ from kreuzberg._tmp import create_temp_file
17
+ from kreuzberg._types import ExtractionResult, Metadata
14
18
  from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
15
19
 
16
20
  if TYPE_CHECKING: # pragma: no cover
17
21
  from collections.abc import Mapping
18
22
  from os import PathLike
19
23
 
20
- try: # pragma: no cover
21
- from typing import NotRequired # type: ignore[attr-defined]
22
- except ImportError: # pragma: no cover
23
- from typing_extensions import NotRequired
24
+ if sys.version_info < (3, 11): # pragma: no cover
25
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
+
24
27
 
25
28
  version_ref: Final[dict[str, bool]] = {"checked": False}
26
29
 
@@ -145,65 +148,6 @@ MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
145
148
  }
146
149
 
147
150
 
148
- class Metadata(TypedDict, total=False):
149
- """Document metadata extracted from Pandoc document.
150
-
151
- All fields are optional but will only be included if they contain non-empty values.
152
- Any field that would be empty or None is omitted from the dictionary.
153
- """
154
-
155
- title: NotRequired[str]
156
- """Document title."""
157
- subtitle: NotRequired[str]
158
- """Document subtitle."""
159
- abstract: NotRequired[str | list[str]]
160
- """Document abstract, summary or description."""
161
- authors: NotRequired[list[str]]
162
- """List of document authors."""
163
- date: NotRequired[str]
164
- """Document date as string to preserve original format."""
165
- subject: NotRequired[str]
166
- """Document subject or topic."""
167
- description: NotRequired[str]
168
- """Extended description."""
169
- keywords: NotRequired[list[str]]
170
- """Keywords or tags."""
171
- categories: NotRequired[list[str]]
172
- """Categories or classifications."""
173
- version: NotRequired[str]
174
- """Version identifier."""
175
- language: NotRequired[str]
176
- """Document language code."""
177
- references: NotRequired[list[str]]
178
- """Reference entries."""
179
- citations: NotRequired[list[str]]
180
- """Citation identifiers."""
181
- copyright: NotRequired[str]
182
- """Copyright information."""
183
- license: NotRequired[str]
184
- """License information."""
185
- identifier: NotRequired[str]
186
- """Document identifier."""
187
- publisher: NotRequired[str]
188
- """Publisher name."""
189
- contributors: NotRequired[list[str]]
190
- """Additional contributors."""
191
- creator: NotRequired[str]
192
- """Document creator."""
193
- institute: NotRequired[str | list[str]]
194
- """Institute or organization."""
195
-
196
-
197
- @dataclass
198
- class PandocResult:
199
- """Result of a pandoc conversion including content and metadata."""
200
-
201
- content: str
202
- """The processed markdown content."""
203
- metadata: Metadata
204
- """Document metadata extracted from the source."""
205
-
206
-
207
151
  def _extract_inline_text(node: dict[str, Any]) -> str | None:
208
152
  if node_type := node.get(TYPE_FIELD):
209
153
  if node_type == INLINE_STR:
@@ -246,13 +190,14 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
246
190
  if node_type == META_LIST:
247
191
  results = []
248
192
  for value in [value for item in content if (value := _extract_meta_value(item))]:
249
- if isinstance(value, list):
250
- results.extend(value) # pragma: no cover
193
+ if isinstance(value, list): # pragma: no cover
194
+ results.extend(value)
251
195
  else:
252
196
  results.append(value)
253
197
  return results
254
198
 
255
- if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]:
199
+ # This branch is only taken for complex metadata blocks which we don't use
200
+ if blocks := [block for block in content if block.get(TYPE_FIELD) == BLOCK_PARA]: # pragma: no cover
256
201
  block_texts = []
257
202
  for block in blocks:
258
203
  block_content = block.get(CONTENT_FIELD, [])
@@ -317,134 +262,142 @@ async def _validate_pandoc_version() -> None:
317
262
  raise MissingDependencyError("Pandoc is not installed.") from e
318
263
 
319
264
 
320
- async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
265
+ async def _handle_extract_metadata(
266
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
267
+ ) -> Metadata:
321
268
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
269
+ metadata_file, unlink = await create_temp_file(".json")
270
+ try:
271
+ command = [
272
+ "pandoc",
273
+ str(input_file),
274
+ f"--from={pandoc_type}",
275
+ "--to=json",
276
+ "--standalone",
277
+ "--quiet",
278
+ "--output",
279
+ metadata_file,
280
+ ]
322
281
 
323
- with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
324
- try:
325
- command = [
326
- "pandoc",
327
- str(input_file),
328
- f"--from={pandoc_type}",
329
- "--to=json",
330
- "--standalone",
331
- "--quiet",
332
- "--output",
333
- metadata_file.name,
334
- ]
335
-
336
- result = await run_sync(
337
- subprocess.run,
338
- command,
339
- capture_output=True,
340
- )
341
-
342
- if result.returncode != 0:
343
- raise ParsingError(
344
- "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
345
- )
346
-
347
- json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
348
- return _extract_metadata(json_data)
349
-
350
- except (RuntimeError, OSError, JSONDecodeError) as e:
351
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
352
-
353
- finally:
354
- metadata_file.close()
355
- await AsyncPath(metadata_file.name).unlink()
282
+ result = await to_process.run_sync(
283
+ partial(subprocess.run, capture_output=True),
284
+ command,
285
+ cancellable=True,
286
+ limiter=CapacityLimiter(max_processes),
287
+ )
288
+
289
+ if result.returncode != 0:
290
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
291
+
292
+ json_data = loads(await AsyncPath(metadata_file).read_text("utf-8"))
293
+ return _extract_metadata(json_data)
294
+ except (RuntimeError, OSError, JSONDecodeError) as e:
295
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
296
+ finally:
297
+ await unlink()
356
298
 
357
299
 
358
300
  async def _handle_extract_file(
359
- input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
301
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
360
302
  ) -> str:
361
303
  pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
304
+ output_path, unlink = await create_temp_file(".md")
305
+ try:
306
+ command = [
307
+ "pandoc",
308
+ str(input_file),
309
+ f"--from={pandoc_type}",
310
+ "--to=markdown",
311
+ "--standalone",
312
+ "--wrap=preserve",
313
+ "--quiet",
314
+ ]
315
+
316
+ command.extend(["--output", str(output_path)])
317
+
318
+ result = await to_process.run_sync(
319
+ partial(subprocess.run, capture_output=True),
320
+ command,
321
+ cancellable=True,
322
+ limiter=CapacityLimiter(max_processes),
323
+ )
324
+
325
+ if result.returncode != 0:
326
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
327
+
328
+ text = await AsyncPath(output_path).read_text("utf-8")
329
+
330
+ return normalize_spaces(text)
331
+ except (RuntimeError, OSError) as e:
332
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
333
+ finally:
334
+ await unlink()
362
335
 
363
- with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
364
- try:
365
- command = [
366
- "pandoc",
367
- str(input_file),
368
- f"--from={pandoc_type}",
369
- "--to=markdown",
370
- "--standalone",
371
- "--wrap=preserve",
372
- "--quiet",
373
- "--output",
374
- output_file.name,
375
- ]
376
-
377
- if extra_args:
378
- command.extend(extra_args)
379
-
380
- result = await run_sync(
381
- subprocess.run,
382
- command,
383
- capture_output=True,
384
- )
385
-
386
- if result.returncode != 0:
387
- raise ParsingError(
388
- "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
389
- )
390
-
391
- text = await AsyncPath(output_file.name).read_text("utf-8")
392
-
393
- return normalize_spaces(text)
394
-
395
- except (RuntimeError, OSError) as e:
396
- raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
397
-
398
- finally:
399
- output_file.close()
400
- await AsyncPath(output_file.name).unlink()
401
-
402
-
403
- async def process_file(
404
- input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
405
- ) -> PandocResult:
336
+
337
+ async def process_file_with_pandoc(
338
+ input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
339
+ ) -> ExtractionResult:
406
340
  """Process a single file using Pandoc and convert to markdown.
407
341
 
408
342
  Args:
409
343
  input_file: The path to the file to process.
410
344
  mime_type: The mime type of the file.
411
- extra_args: Additional Pandoc command line arguments.
345
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
346
+
347
+ Raises:
348
+ ParsingError: If the file data could not be extracted.
412
349
 
413
350
  Returns:
414
- PandocResult containing processed content and metadata.
351
+ ExtractionResult
415
352
  """
416
353
  await _validate_pandoc_version()
417
354
 
418
- metadata, content = await gather(
419
- *[
420
- _handle_extract_metadata(input_file, mime_type=mime_type),
421
- _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
422
- ]
423
- )
424
- return PandocResult(
425
- content=content, # type: ignore[arg-type]
426
- metadata=metadata, # type: ignore[arg-type]
355
+ _get_pandoc_type_from_mime_type(mime_type)
356
+
357
+ metadata: Metadata = {}
358
+ content: str = ""
359
+
360
+ try:
361
+ async with create_task_group() as tg:
362
+
363
+ async def _get_metadata() -> None:
364
+ nonlocal metadata
365
+ metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
366
+
367
+ async def _get_content() -> None:
368
+ nonlocal content
369
+ content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
370
+
371
+ tg.start_soon(_get_metadata)
372
+ tg.start_soon(_get_content)
373
+ except ExceptionGroup as eg:
374
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from eg.exceptions[0]
375
+
376
+ return ExtractionResult(
377
+ content=normalize_spaces(content),
378
+ metadata=metadata,
379
+ mime_type=MARKDOWN_MIME_TYPE,
427
380
  )
428
381
 
429
382
 
430
- async def process_content(content: bytes, *, mime_type: str, extra_args: list[str] | None = None) -> PandocResult:
383
+ async def process_content_with_pandoc(
384
+ content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
385
+ ) -> ExtractionResult:
431
386
  """Process content using Pandoc and convert to markdown.
432
387
 
433
388
  Args:
434
389
  content: The content to process.
435
390
  mime_type: The mime type of the content.
436
- extra_args: Additional Pandoc command line arguments.
391
+ max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
437
392
 
438
393
  Returns:
439
- PandocResult containing processed content and metadata.
394
+ ExtractionResult
440
395
  """
441
396
  extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
397
+ input_file, unlink = await create_temp_file(f".{extension}")
442
398
 
443
- with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
444
- try:
445
- await AsyncPath(input_file.name).write_bytes(content)
446
- return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
399
+ await AsyncPath(input_file).write_bytes(content)
400
+ result = await process_file_with_pandoc(input_file, mime_type=mime_type, max_processes=max_processes)
447
401
 
448
- finally:
449
- input_file.close()
450
- await AsyncPath(input_file.name).unlink()
402
+ await unlink()
403
+ return result