kreuzberg 1.6.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the contents of publicly released versions of the package as they appear in their public registries. It is provided for informational purposes only and reflects only the changes between the two published versions.
kreuzberg/_extractors.py CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import re
+from asyncio import gather
 from contextlib import suppress
 from html import escape
 from io import BytesIO
@@ -37,13 +38,18 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
     Returns:
         A list of Pillow Images.
     """
+    pdf = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
+        pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
         return [page.render(scale=2.0).to_pil() for page in pdf]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if pdf is not None:
+            pdf.close()
 
 
 async def extract_pdf_with_tesseract(file_path: Path) -> str:
@@ -72,30 +78,49 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
     Returns:
         The extracted text.
     """
+    document = None
+    resolved_path = str(await AsyncPath(file_path).resolve())
     try:
-        document = await run_sync(pypdfium2.PdfDocument, file_path)
-        text = "\n".join(page.get_textpage().get_text_range() for page in document)
+        document = await run_sync(pypdfium2.PdfDocument, resolved_path)
+        text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
         return normalize_spaces(text)
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
+    finally:
+        if document is not None:
+            document.close()
 
 
-async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
+async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
 
     Args:
-        file_path: The path to the PDF file.
+        file_path_or_contents: The path to the PDF file or its contents as bytes.
        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
 
     Returns:
         The extracted text.
     """
-    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+    if isinstance(file_path_or_contents, bytes):
+        with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
+            try:
+                file_path = Path(pdf_file.name)
+                await AsyncPath(file_path).write_bytes(file_path_or_contents)
+
+                if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
+                    return normalize_spaces(content)
+
+                return await extract_pdf_with_tesseract(file_path)
+            finally:
+                pdf_file.close()
+                await AsyncPath(pdf_file.name).unlink()
+
+    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
         return normalize_spaces(content)
 
-    return await extract_pdf_with_tesseract(file_path)
+    return await extract_pdf_with_tesseract(file_path_or_contents)
 
 
 async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
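The new bytes-handling branch of `extract_pdf` spools the content into a named temporary file created with `delete=False`, runs the normal path-based extraction, and removes the file in a `finally` block without blocking the event loop. A minimal, self-contained sketch of that pattern is below; `extract_from_path` is a placeholder, and `asyncio.to_thread` stands in for the library's own async path and sync-offloading helpers:

```python
import asyncio
from pathlib import Path
from tempfile import NamedTemporaryFile


async def extract_from_path(path: Path) -> str:
    """Placeholder for a path-based extractor such as the pdfium/tesseract pipeline."""
    return path.name


async def extract_from_bytes(contents: bytes) -> str:
    # delete=False keeps the file on disk after the handle is closed, so another
    # process (pdfium, tesseract, pandoc) can open it by name on any platform.
    with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
        try:
            pdf_file.write(contents)
            pdf_file.flush()
            return await extract_from_path(Path(pdf_file.name))
        finally:
            pdf_file.close()
            # remove the scratch file without blocking the event loop
            await asyncio.to_thread(Path(pdf_file.name).unlink)
```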
@@ -122,7 +147,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
     Returns:
         The extracted text.
     """
-    result = await process_file(file_path, mime_type=mime_type)
+    resolved_path = str(await AsyncPath(file_path).resolve())
+    result = await process_file(resolved_path, mime_type=mime_type)
     return normalize_spaces(result.content)
 
 
@@ -208,26 +234,33 @@ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
     Raises:
         ParsingError: If the XLSX file could not be parsed.
     """
-    try:
-        with NamedTemporaryFile(suffix=".xlsx") as xlsx_file, NamedTemporaryFile(suffix=".csv") as csv_file:
+    with (
+        NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
+        NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
+    ):
+        try:
             if isinstance(file_path_or_contents, bytes):
                 xlsx_file.write(file_path_or_contents)
                 xlsx_file.flush()
                 xlsx_path = xlsx_file.name
             else:
-                xlsx_path = str(file_path_or_contents)
+                xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
 
             await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
             result = await process_file(csv_file.name, mime_type="text/csv")
             return normalize_spaces(result.content)
-    except Exception as e:
-        raise ParsingError(
-            "Could not extract text from XLSX file",
-            context={
-                "error": str(e),
-                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
-            },
-        ) from e
+        except Exception as e:
+            raise ParsingError(
+                "Could not extract text from XLSX file",
+                context={
+                    "error": str(e),
+                    "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
+                },
+            ) from e
+        finally:
+            xlsx_file.close()
+            csv_file.close()
+            await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
 
 
 async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
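For reference, the XLSX branch still converts the workbook to CSV with xlsx2csv and then hands the CSV to the Pandoc-based `process_file`; the standalone conversion step, with hypothetical file names, is just:

```python
from xlsx2csv import Xlsx2csv

# Convert a workbook to a CSV file on disk; the diff wraps this call in run_sync
# and points it at the two delete=False temporary files shown above.
Xlsx2csv("workbook.xlsx").convert("workbook.csv")
```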
kreuzberg/_pandoc.py CHANGED
@@ -1,9 +1,9 @@
 from __future__ import annotations
 
-import json
 import subprocess
 from asyncio import gather
 from dataclasses import dataclass
+from json import JSONDecodeError, loads
 from tempfile import NamedTemporaryFile
 from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
 
@@ -13,7 +13,7 @@ from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
 from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
 
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
     from os import PathLike
 
@@ -80,7 +80,7 @@ NodeType = Literal[
     "MetaBlocks",
 ]
 
-PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
+MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
     "application/csl+json": "csljson",
     "application/docbook+xml": "docbook",
     "application/epub+zip": "epub",
@@ -112,6 +112,38 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
     "text/x-rst": "rst",
 }
 
+MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
+    "application/csl+json": "json",
+    "application/docbook+xml": "xml",
+    "application/epub+zip": "epub",
+    "application/rtf": "rtf",
+    "application/vnd.oasis.opendocument.text": "odt",
+    "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
+    "application/x-biblatex": "bib",
+    "application/x-bibtex": "bib",
+    "application/x-endnote+xml": "xml",
+    "application/x-fictionbook+xml": "fb2",
+    "application/x-ipynb+json": "ipynb",
+    "application/x-jats+xml": "xml",
+    "application/x-latex": "tex",
+    "application/x-opml+xml": "opml",
+    "application/x-research-info-systems": "ris",
+    "application/x-typst": "typst",
+    "text/csv": "csv",
+    "text/tab-separated-values": "tsv",
+    "text/troff": "1",
+    "text/x-commonmark": "md",
+    "text/x-dokuwiki": "wiki",
+    "text/x-gfm": "md",
+    "text/x-markdown": "md",
+    "text/x-markdown-extra": "md",
+    "text/x-mdoc": "md",
+    "text/x-multimarkdown": "md",
+    "text/x-org": "org",
+    "text/x-pod": "pod",
+    "text/x-rst": "rst",
+}
+
 
 class Metadata(TypedDict, total=False):
     """Document metadata extracted from Pandoc document.
@@ -232,7 +264,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
 
 
 def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
-    """Extract all non-empty metadata values from Pandoc AST metadata."""
     meta: Metadata = {}
 
     for key, value in raw_meta.items():
@@ -252,34 +283,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
     return meta
 
 
-def _get_extension_from_mime_type(mime_type: str) -> str:
-    if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
-        mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
+def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
+    if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
+        mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
     ):
         raise ValidationError(
             f"Unsupported mime type: {mime_type}",
             context={
                 "mime_type": mime_type,
-                "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
+                "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
             },
         )
 
-    return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
-        PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
+    return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
+        MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
     )
 
 
-async def validate_pandoc_version() -> None:
-    """Validate that Pandoc is installed and is version 3 or above.
-
-    Raises:
-        MissingDependencyError: If Pandoc is not installed or is below version 3.
-    """
+async def _validate_pandoc_version() -> None:
     try:
         if version_ref["checked"]:
             return
 
-        result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
+        command = ["pandoc", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
         version = result.stdout.decode().split("\n")[0].split()[1]
         if not version.startswith("3."):
             raise MissingDependencyError("Pandoc version 3 or above is required.")
@@ -290,27 +317,15 @@ async def validate_pandoc_version() -> None:
         raise MissingDependencyError("Pandoc is not installed.") from e
 
 
-async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
-    """Extract metadata from a document using pandoc.
-
-    Args:
-        input_file: The path to the file to process.
-        mime_type: The mime type of the file.
-
-    Raises:
-        ParsingError: If Pandoc fails to extract metadata.
-
-    Returns:
-        Dictionary containing document metadata.
-    """
-    extension = _get_extension_from_mime_type(mime_type)
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-    with NamedTemporaryFile(suffix=".json") as metadata_file:
+    with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
         try:
             command = [
                 "pandoc",
                 str(input_file),
-                f"--from={extension}",
+                f"--from={pandoc_type}",
                 "--to=json",
                 "--standalone",
                 "--quiet",
@@ -329,46 +344,60 @@ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -
                     "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
                 )
 
-            json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
+            json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
             return _extract_metadata(json_data)
 
-        except (RuntimeError, OSError, json.JSONDecodeError) as e:
+        except (RuntimeError, OSError, JSONDecodeError) as e:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
 
+        finally:
+            metadata_file.close()
+            await AsyncPath(metadata_file.name).unlink()
 
-async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
-    extension = _get_extension_from_mime_type(mime_type)
-
-    with NamedTemporaryFile(suffix=".md") as output_file:
-        command = [
-            "pandoc",
-            str(input_file),
-            f"--from={extension}",
-            "--to=markdown",
-            "--standalone",
-            "--wrap=preserve",
-            "--quiet",
-            "--output",
-            output_file.name,
-        ]
 
-        if extra_args:
-            command.extend(extra_args)
+async def _handle_extract_file(
+    input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
+) -> str:
+    pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
 
-        result = await run_sync(
-            subprocess.run,
-            command,
-            capture_output=True,
-        )
+    with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
+        try:
+            command = [
+                "pandoc",
+                str(input_file),
+                f"--from={pandoc_type}",
+                "--to=markdown",
+                "--standalone",
+                "--wrap=preserve",
+                "--quiet",
+                "--output",
+                output_file.name,
+            ]
+
+            if extra_args:
+                command.extend(extra_args)
 
-        if result.returncode != 0:
-            raise ParsingError(
-                "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+            result = await run_sync(
+                subprocess.run,
+                command,
+                capture_output=True,
             )
 
-        text = await AsyncPath(output_file.name).read_text()
+            if result.returncode != 0:
+                raise ParsingError(
+                    "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
+                )
 
-        return normalize_spaces(text)
+            text = await AsyncPath(output_file.name).read_text("utf-8")
+
+            return normalize_spaces(text)
+
+        except (RuntimeError, OSError) as e:
+            raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
+
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
 
 
 async def process_file(
@@ -384,12 +413,12 @@ async def process_file(
     Returns:
         PandocResult containing processed content and metadata.
     """
-    await validate_pandoc_version()
+    await _validate_pandoc_version()
 
     metadata, content = await gather(
         *[
-            extract_metadata(input_file, mime_type=mime_type),
-            _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
+            _handle_extract_metadata(input_file, mime_type=mime_type),
+            _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
         ]
     )
     return PandocResult(
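`process_file` still runs the metadata pass and the markdown conversion concurrently; the underlying pattern is a plain `asyncio.gather` over two coroutines, roughly as in this sketch (both helpers are placeholders for the Pandoc invocations):

```python
import asyncio


async def get_metadata() -> dict:
    return {"title": "example"}  # placeholder for the JSON/metadata Pandoc run


async def get_markdown() -> str:
    return "# example"  # placeholder for the markdown Pandoc run


async def process() -> tuple[dict, str]:
    # Both subprocess-backed steps are awaited together, not sequentially.
    metadata, content = await asyncio.gather(get_metadata(), get_markdown())
    return metadata, content


print(asyncio.run(process()))
```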
@@ -409,8 +438,13 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
     Returns:
         PandocResult containing processed content and metadata.
     """
-    extension = _get_extension_from_mime_type(mime_type)
+    extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
+
+    with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
+        try:
+            await AsyncPath(input_file.name).write_bytes(content)
+            return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
 
-    with NamedTemporaryFile(suffix=f".{extension}") as input_file:
-        await AsyncPath(input_file.name).write_bytes(content)
-        return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
+        finally:
+            input_file.close()
+            await AsyncPath(input_file.name).unlink()
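The suffix chosen here comes from the new `MIMETYPE_TO_FILE_EXTENSION_MAPPING` table introduced earlier in this file; a rough standalone sketch of that lookup, using an abbreviated copy of the table for illustration only:

```python
from collections.abc import Mapping
from typing import Final

# Abbreviated copy of the new table, for illustration only.
EXTENSIONS: Final[Mapping[str, str]] = {
    "application/epub+zip": "epub",
    "text/csv": "csv",
    "text/x-rst": "rst",
}


def temp_suffix_for(mime_type: str) -> str:
    """Pick a scratch-file suffix, defaulting to Markdown for unmapped types."""
    extension = EXTENSIONS.get(mime_type) or "md"
    return f".{extension}"


assert temp_suffix_for("text/csv") == ".csv"
assert temp_suffix_for("application/unknown") == ".md"
```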
kreuzberg/_tesseract.py CHANGED
@@ -186,8 +186,9 @@ async def validate_tesseract_version() -> None:
         if version_ref["checked"]:
             return
 
-        result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
-        version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
+        command = ["tesseract", "--version"]
+        result = await run_sync(subprocess.run, command, capture_output=True)
+        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < 5:
             raise MissingDependencyError("Tesseract version 5 or above is required.")
 
@@ -213,10 +214,10 @@ async def process_file(
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".txt") as output_file:
+    with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
         # this is needed because tesseract adds .txt to the output file
-        output_file_name = output_file.name.replace(".txt", "")
         try:
+            output_file_name = output_file.name.replace(".txt", "")
             command = [
                 "tesseract",
                 str(input_file),
@@ -239,11 +240,15 @@ async def process_file(
             if not result.returncode == 0:
                 raise OCRError("OCR failed with a non-0 return code.")
 
-            output = await AsyncPath(output_file.name).read_text()
+            output = await AsyncPath(output_file.name).read_text("utf-8")
             return output.strip()
         except (RuntimeError, OSError) as e:
             raise OCRError("Failed to OCR using tesseract") from e
 
+        finally:
+            output_file.close()
+            await AsyncPath(output_file.name).unlink()
+
 
 
 async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
@@ -257,9 +262,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
     Returns:
         str: Extracted text from the image.
     """
-    with NamedTemporaryFile(suffix=".png") as image_file:
-        await run_sync(image.save, image_file.name, format="PNG")
-        return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+    with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
+        try:
+            await run_sync(image.save, image_file.name, format="PNG")
+            return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+
+        finally:
+            image_file.close()
+            await AsyncPath(image_file.name).unlink()
 
 
 async def process_image_with_tesseract(
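Worth noting from the version-check hunk above: the loosened pattern `tesseract\s+v?(\d+)` now also matches banners that report the version with a leading `v`. A quick check of what it accepts:

```python
import re

PATTERN = r"tesseract\s+v?(\d+)"

for banner in ("tesseract 5.3.0", "tesseract v5.3.4", "tesseract 4.1.1"):
    match = re.search(PATTERN, banner)
    major = int(match.group(1)) if match else None
    # validate_tesseract_version still requires major >= 5, so 4.x is rejected
    print(banner, "->", major)
```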
kreuzberg/extraction.py CHANGED
@@ -20,7 +20,7 @@ from kreuzberg._extractors import (
     extract_content_with_pandoc,
     extract_file_with_pandoc,
     extract_html_string,
-    extract_pdf_file,
+    extract_pdf,
     extract_pptx_file,
     extract_xlsx_file,
 )
@@ -71,21 +71,21 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
         )
 
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        with NamedTemporaryFile(suffix=".pdf") as temp_file:
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
         return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
 
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
-            temp_file.write(content)
-            return ExtractionResult(
-                content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
-            )
+        with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
+            try:
+                await AsyncPath(temp_file.name).write_bytes(content)
+                return ExtractionResult(
+                    content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
+                )
+            finally:
+                temp_file.close()
+                await AsyncPath(temp_file.name).unlink()
 
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
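With `extract_pdf` now accepting bytes directly, `extract_bytes` no longer spools PDFs to a temporary file itself; the public call site is unchanged. A usage sketch follows (the file name is hypothetical, and it assumes `extract_bytes` is exported from the package root alongside `extract_file`):

```python
import asyncio
from pathlib import Path

from kreuzberg import extract_bytes


async def main() -> None:
    pdf_bytes = Path("report.pdf").read_bytes()  # hypothetical input file
    result = await extract_bytes(pdf_bytes, mime_type="application/pdf")
    print(result.mime_type)      # plain-text mime type for PDF extraction
    print(result.content[:200])  # first 200 characters of the extracted text


asyncio.run(main())
```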
@@ -137,7 +137,7 @@ async def extract_file(
         raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
 
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
         return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
kreuzberg-1.6.0.dist-info/METADATA → kreuzberg-1.7.0.dist-info/METADATA CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.6.0
+Version: 1.7.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -29,7 +29,7 @@ Requires-Dist: charset-normalizer>=3.4.1
 Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypdfium2>=4.30.1
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
+Requires-Dist: typing-extensions>=4.12.2
 Requires-Dist: xlsx2csv>=0.8.4
 
 # Kreuzberg
@@ -231,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
 
 ### Error Handling
 
-Kreuzberg provides detailed error handling with two main exception types:
+Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 
 ```python
 from kreuzberg import extract_file
-from kreuzberg.exceptions import ValidationError, ParsingError
+from kreuzberg.exceptions import (
+    ValidationError,
+    ParsingError,
+    OCRError,
+    MissingDependencyError
+)
 
 async def safe_extract(path: str) -> str:
     try:
@@ -243,20 +248,31 @@ async def safe_extract(path: str) -> str:
         return result.content
 
     except ValidationError as e:
-        # Handles input validation issues:
-        # - Unsupported file types
+        # Input validation issues
+        # - Unsupported or undetectable MIME types
         # - Missing files
-        # - Invalid MIME types
-        print(f"Invalid input: {e.message}")
-        print(f"Details: {e.context}")
+        # - Invalid input parameters
+        print(f"Validation failed: {e}")
+
+    except OCRError as e:
+        # OCR-specific issues
+        # - Tesseract processing failures
+        # - Image conversion problems
+        print(f"OCR failed: {e}")
+
+    except MissingDependencyError as e:
+        # System dependency issues
+        # - Missing Tesseract OCR
+        # - Missing Pandoc
+        # - Incompatible versions
+        print(f"Dependency missing: {e}")
 
     except ParsingError as e:
-        # Handles processing errors:
+        # General processing errors
         # - PDF parsing failures
-        # - OCR errors
         # - Format conversion issues
-        print(f"Processing failed: {e.message}")
-        print(f"Details: {e.context}")
+        # - Encoding problems
+        print(f"Processing failed: {e}")
 
     return ""
 
@@ -264,24 +280,33 @@ async def safe_extract(path: str) -> str:
 try:
     result = await extract_file("document.xyz")
 except ValidationError as e:
-    # e.context might contain:
-    # {
+    # Error will include context:
+    # ValidationError: Unsupported mime type
+    # Context: {
     #     "file_path": "document.xyz",
-    #     "error": "Unsupported file type",
-    #     "supported_types": ["pdf", "docx", ...]
+    #     "supported_mimetypes": ["application/pdf", ...]
     # }
+    print(e)
 
 try:
-    result = await extract_file("scan.pdf")
-except ParsingError as e:
-    # e.context might contain:
-    # {
-    #     "file_path": "scan.pdf",
-    #     "error": "OCR processing failed",
-    #     "details": "Tesseract error: Unable to process image"
+    result = await extract_file("scan.jpg")
+except OCRError as e:
+    # Error will include context:
+    # OCRError: OCR failed with a non-0 return code
+    # Context: {
+    #     "file_path": "scan.jpg",
+    #     "tesseract_version": "5.3.0"
     # }
+    print(e)
 ```
 
+All exceptions provide:
+
+- A descriptive error message
+- Relevant context in the `context` attribute
+- String representation with both message and context
+- Proper exception chaining for debugging
+
 ## Roadmap
 
 V1:
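Because the updated README notes that every exception derives from `KreuzbergError`, callers that only need coarse-grained handling can catch the base class instead of the four specific types; a brief sketch:

```python
from kreuzberg import extract_file
from kreuzberg.exceptions import KreuzbergError


async def extract_or_empty(path: str) -> str:
    try:
        result = await extract_file(path)
        return result.content
    except KreuzbergError as e:
        # str(e) carries both the message and the context dictionary
        print(f"Extraction failed: {e}")
        return ""
```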
kreuzberg-1.7.0.dist-info/RECORD ADDED
@@ -0,0 +1,15 @@
+kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
+kreuzberg/_extractors.py,sha256=3VP7oBz0VpmkkhlbKDPjRmnZdHBv4K_xqcyMeeDaetM,9283
+kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
+kreuzberg/_pandoc.py,sha256=zhNJ8_92JMs4gG_Fj-IVwdpZwWsyaK-VTrbLke6NGyU,15097
+kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
+kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
+kreuzberg/_tesseract.py,sha256=Yya15OxB4PBi2QqmrGXF70_SHBD7Luii9sBXzMJlCpU,8168
+kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
+kreuzberg/extraction.py,sha256=_vJ9O8t50a3p4co3hY8b3BdBIXV5S7XOUNl_kD9_FvM,6599
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-1.7.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-1.7.0.dist-info/METADATA,sha256=3wKe7X5G1IQfSPNzD0wnS0t81MqoWtQ-cgR-6MBoyec,10355
+kreuzberg-1.7.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-1.7.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-1.7.0.dist-info/RECORD,,
kreuzberg-1.6.0.dist-info/RECORD REMOVED
@@ -1,15 +0,0 @@
-kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
-kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
-kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
-kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
-kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
-kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
-kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
-kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
-kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.6.0.dist-info/RECORD,,