kreuzberg 1.5.0__py3-none-any.whl → 1.7.0__py3-none-any.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -1,9 +1,12 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import re
4
+ from asyncio import gather
4
5
  from contextlib import suppress
5
6
  from html import escape
6
7
  from io import BytesIO
8
+ from pathlib import Path
9
+ from tempfile import NamedTemporaryFile
7
10
  from typing import TYPE_CHECKING
8
11
 
9
12
  import html_to_markdown
@@ -11,6 +14,7 @@ import pptx
11
14
  import pypdfium2
12
15
  from anyio import Path as AsyncPath
13
16
  from pptx.enum.shapes import MSO_SHAPE_TYPE
17
+ from xlsx2csv import Xlsx2csv
14
18
 
15
19
  from kreuzberg._pandoc import process_content, process_file
16
20
  from kreuzberg._string import normalize_spaces, safe_decode
@@ -19,8 +23,6 @@ from kreuzberg._tesseract import batch_process_images
19
23
  from kreuzberg.exceptions import ParsingError
20
24
 
21
25
  if TYPE_CHECKING: # pragma: no cover
22
- from pathlib import Path
23
-
24
26
  from PIL.Image import Image
25
27
 
26
28
 
@@ -36,13 +38,18 @@ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
36
38
  Returns:
37
39
  A list of Pillow Images.
38
40
  """
41
+ pdf = None
42
+ resolved_path = str(await AsyncPath(file_path).resolve())
39
43
  try:
40
- pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
44
+ pdf = await run_sync(pypdfium2.PdfDocument, resolved_path)
41
45
  return [page.render(scale=2.0).to_pil() for page in pdf]
42
46
  except pypdfium2.PdfiumError as e:
43
47
  raise ParsingError(
44
48
  "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
45
49
  ) from e
50
+ finally:
51
+ if pdf is not None:
52
+ pdf.close()
46
53
 
47
54
 
48
55
  async def extract_pdf_with_tesseract(file_path: Path) -> str:
@@ -71,30 +78,49 @@ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
71
78
  Returns:
72
79
  The extracted text.
73
80
  """
81
+ document = None
82
+ resolved_path = str(await AsyncPath(file_path).resolve())
74
83
  try:
75
- document = await run_sync(pypdfium2.PdfDocument, file_path)
76
- text = "\n".join(page.get_textpage().get_text_range() for page in document)
84
+ document = await run_sync(pypdfium2.PdfDocument, resolved_path)
85
+ text = "\n".join(page.get_textpage().get_text_bounded() for page in document)
77
86
  return normalize_spaces(text)
78
87
  except pypdfium2.PdfiumError as e:
79
88
  raise ParsingError(
80
89
  "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
81
90
  ) from e
91
+ finally:
92
+ if document is not None:
93
+ document.close()
82
94
 
83
95
 
84
- async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
96
+ async def extract_pdf(file_path_or_contents: Path | bytes, force_ocr: bool = False) -> str:
85
97
  """Extract text from a PDF file.
86
98
 
87
99
  Args:
88
- file_path: The path to the PDF file.
100
+ file_path_or_contents: The path to the PDF file or its contents as bytes.
89
101
  force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
90
102
 
91
103
  Returns:
92
104
  The extracted text.
93
105
  """
94
- if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
106
+ if isinstance(file_path_or_contents, bytes):
107
+ with NamedTemporaryFile(suffix=".pdf", delete=False) as pdf_file:
108
+ try:
109
+ file_path = Path(pdf_file.name)
110
+ await AsyncPath(file_path).write_bytes(file_path_or_contents)
111
+
112
+ if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
113
+ return normalize_spaces(content)
114
+
115
+ return await extract_pdf_with_tesseract(file_path)
116
+ finally:
117
+ pdf_file.close()
118
+ await AsyncPath(pdf_file.name).unlink()
119
+
120
+ if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path_or_contents)):
95
121
  return normalize_spaces(content)
96
122
 
97
- return await extract_pdf_with_tesseract(file_path)
123
+ return await extract_pdf_with_tesseract(file_path_or_contents)
98
124
 
99
125
 
100
126
  async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
@@ -121,7 +147,8 @@ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str
121
147
  Returns:
122
148
  The extracted text.
123
149
  """
124
- result = await process_file(file_path, mime_type=mime_type)
150
+ resolved_path = str(await AsyncPath(file_path).resolve())
151
+ result = await process_file(resolved_path, mime_type=mime_type)
125
152
  return normalize_spaces(result.content)
126
153
 
127
154
 
@@ -195,6 +222,47 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
195
222
  return normalize_spaces(md_content)
196
223
 
197
224
 
225
+ async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
226
+ """Extract text from an XLSX file by converting it to CSV and then to markdown.
227
+
228
+ Args:
229
+ file_path_or_contents: The path to the XLSX file or its contents as bytes.
230
+
231
+ Returns:
232
+ The extracted text content.
233
+
234
+ Raises:
235
+ ParsingError: If the XLSX file could not be parsed.
236
+ """
237
+ with (
238
+ NamedTemporaryFile(suffix=".xlsx", delete=False) as xlsx_file,
239
+ NamedTemporaryFile(suffix=".csv", delete=False) as csv_file,
240
+ ):
241
+ try:
242
+ if isinstance(file_path_or_contents, bytes):
243
+ xlsx_file.write(file_path_or_contents)
244
+ xlsx_file.flush()
245
+ xlsx_path = xlsx_file.name
246
+ else:
247
+ xlsx_path = str(await AsyncPath(file_path_or_contents).resolve())
248
+
249
+ await run_sync(Xlsx2csv(xlsx_path).convert, csv_file.name)
250
+ result = await process_file(csv_file.name, mime_type="text/csv")
251
+ return normalize_spaces(result.content)
252
+ except Exception as e:
253
+ raise ParsingError(
254
+ "Could not extract text from XLSX file",
255
+ context={
256
+ "error": str(e),
257
+ "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
258
+ },
259
+ ) from e
260
+ finally:
261
+ xlsx_file.close()
262
+ csv_file.close()
263
+ await gather(AsyncPath(xlsx_file.name).unlink(), AsyncPath(csv_file.name).unlink())
264
+
265
+
198
266
  async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
199
267
  """Extract text from an HTML string.
200
268
 
kreuzberg/_mime_types.py CHANGED
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
10
  PDF_MIME_TYPE: Final = "application/pdf"
11
11
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
12
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
13
-
13
+ EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
14
14
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
15
15
 
16
16
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
89
89
  PLAIN_TEXT_MIME_TYPES
90
90
  | IMAGE_MIME_TYPES
91
91
  | PANDOC_SUPPORTED_MIME_TYPES
92
- | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
92
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
93
93
  )
kreuzberg/_pandoc.py CHANGED
@@ -1,9 +1,9 @@
1
1
  from __future__ import annotations
2
2
 
3
- import json
4
3
  import subprocess
5
4
  from asyncio import gather
6
5
  from dataclasses import dataclass
6
+ from json import JSONDecodeError, loads
7
7
  from tempfile import NamedTemporaryFile
8
8
  from typing import TYPE_CHECKING, Any, Final, Literal, TypedDict, cast
9
9
 
@@ -13,7 +13,7 @@ from kreuzberg._string import normalize_spaces
13
13
  from kreuzberg._sync import run_sync
14
14
  from kreuzberg.exceptions import MissingDependencyError, ParsingError, ValidationError
15
15
 
16
- if TYPE_CHECKING:
16
+ if TYPE_CHECKING: # pragma: no cover
17
17
  from collections.abc import Mapping
18
18
  from os import PathLike
19
19
 
@@ -80,7 +80,7 @@ NodeType = Literal[
80
80
  "MetaBlocks",
81
81
  ]
82
82
 
83
- PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
83
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING: Final[Mapping[str, str]] = {
84
84
  "application/csl+json": "csljson",
85
85
  "application/docbook+xml": "docbook",
86
86
  "application/epub+zip": "epub",
@@ -112,6 +112,38 @@ PANDOC_MIMETYPE_TO_FORMAT_MAPPING: Final[Mapping[str, str]] = {
112
112
  "text/x-rst": "rst",
113
113
  }
114
114
 
115
+ MIMETYPE_TO_FILE_EXTENSION_MAPPING: Final[Mapping[str, str]] = {
116
+ "application/csl+json": "json",
117
+ "application/docbook+xml": "xml",
118
+ "application/epub+zip": "epub",
119
+ "application/rtf": "rtf",
120
+ "application/vnd.oasis.opendocument.text": "odt",
121
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
122
+ "application/x-biblatex": "bib",
123
+ "application/x-bibtex": "bib",
124
+ "application/x-endnote+xml": "xml",
125
+ "application/x-fictionbook+xml": "fb2",
126
+ "application/x-ipynb+json": "ipynb",
127
+ "application/x-jats+xml": "xml",
128
+ "application/x-latex": "tex",
129
+ "application/x-opml+xml": "opml",
130
+ "application/x-research-info-systems": "ris",
131
+ "application/x-typst": "typst",
132
+ "text/csv": "csv",
133
+ "text/tab-separated-values": "tsv",
134
+ "text/troff": "1",
135
+ "text/x-commonmark": "md",
136
+ "text/x-dokuwiki": "wiki",
137
+ "text/x-gfm": "md",
138
+ "text/x-markdown": "md",
139
+ "text/x-markdown-extra": "md",
140
+ "text/x-mdoc": "md",
141
+ "text/x-multimarkdown": "md",
142
+ "text/x-org": "org",
143
+ "text/x-pod": "pod",
144
+ "text/x-rst": "rst",
145
+ }
146
+
115
147
 
116
148
  class Metadata(TypedDict, total=False):
117
149
  """Document metadata extracted from Pandoc document.
@@ -232,7 +264,6 @@ def _extract_meta_value(node: Any) -> str | list[str] | None:
232
264
 
233
265
 
234
266
  def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
235
- """Extract all non-empty metadata values from Pandoc AST metadata."""
236
267
  meta: Metadata = {}
237
268
 
238
269
  for key, value in raw_meta.items():
@@ -252,34 +283,30 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
252
283
  return meta
253
284
 
254
285
 
255
- def _get_extension_from_mime_type(mime_type: str) -> str:
256
- if mime_type not in PANDOC_MIMETYPE_TO_FORMAT_MAPPING or not any(
257
- mime_type.startswith(value) for value in PANDOC_MIMETYPE_TO_FORMAT_MAPPING
286
+ def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
287
+ if mime_type not in MIMETYPE_TO_PANDOC_TYPE_MAPPING or not any(
288
+ mime_type.startswith(value) for value in MIMETYPE_TO_PANDOC_TYPE_MAPPING
258
289
  ):
259
290
  raise ValidationError(
260
291
  f"Unsupported mime type: {mime_type}",
261
292
  context={
262
293
  "mime_type": mime_type,
263
- "supported_mimetypes": ",".join(sorted(PANDOC_MIMETYPE_TO_FORMAT_MAPPING)),
294
+ "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
264
295
  },
265
296
  )
266
297
 
267
- return PANDOC_MIMETYPE_TO_FORMAT_MAPPING.get(mime_type) or next(
268
- PANDOC_MIMETYPE_TO_FORMAT_MAPPING[k] for k in PANDOC_MIMETYPE_TO_FORMAT_MAPPING if k.startswith(mime_type)
298
+ return MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type) or next(
299
+ MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
269
300
  )
270
301
 
271
302
 
272
- async def validate_pandoc_version() -> None:
273
- """Validate that Pandoc is installed and is version 3 or above.
274
-
275
- Raises:
276
- MissingDependencyError: If Pandoc is not installed or is below version 3.
277
- """
303
+ async def _validate_pandoc_version() -> None:
278
304
  try:
279
305
  if version_ref["checked"]:
280
306
  return
281
307
 
282
- result = await run_sync(subprocess.run, ["pandoc", "--version"], capture_output=True)
308
+ command = ["pandoc", "--version"]
309
+ result = await run_sync(subprocess.run, command, capture_output=True)
283
310
  version = result.stdout.decode().split("\n")[0].split()[1]
284
311
  if not version.startswith("3."):
285
312
  raise MissingDependencyError("Pandoc version 3 or above is required.")
@@ -290,27 +317,15 @@ async def validate_pandoc_version() -> None:
290
317
  raise MissingDependencyError("Pandoc is not installed.") from e
291
318
 
292
319
 
293
- async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
294
- """Extract metadata from a document using pandoc.
295
-
296
- Args:
297
- input_file: The path to the file to process.
298
- mime_type: The mime type of the file.
299
-
300
- Raises:
301
- ParsingError: If Pandoc fails to extract metadata.
302
-
303
- Returns:
304
- Dictionary containing document metadata.
305
- """
306
- extension = _get_extension_from_mime_type(mime_type)
320
+ async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
321
+ pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
307
322
 
308
- with NamedTemporaryFile(suffix=".json") as metadata_file:
323
+ with NamedTemporaryFile(suffix=".json", delete=False) as metadata_file:
309
324
  try:
310
325
  command = [
311
326
  "pandoc",
312
327
  str(input_file),
313
- f"--from={extension}",
328
+ f"--from={pandoc_type}",
314
329
  "--to=json",
315
330
  "--standalone",
316
331
  "--quiet",
@@ -329,46 +344,60 @@ async def extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -
329
344
  "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
330
345
  )
331
346
 
332
- json_data = json.loads(await AsyncPath(metadata_file.name).read_text())
347
+ json_data = loads(await AsyncPath(metadata_file.name).read_text("utf-8"))
333
348
  return _extract_metadata(json_data)
334
349
 
335
- except (RuntimeError, OSError, json.JSONDecodeError) as e:
350
+ except (RuntimeError, OSError, JSONDecodeError) as e:
336
351
  raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
337
352
 
353
+ finally:
354
+ metadata_file.close()
355
+ await AsyncPath(metadata_file.name).unlink()
338
356
 
339
- async def _extract_file(input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None) -> str:
340
- extension = _get_extension_from_mime_type(mime_type)
341
-
342
- with NamedTemporaryFile(suffix=".md") as output_file:
343
- command = [
344
- "pandoc",
345
- str(input_file),
346
- f"--from={extension}",
347
- "--to=markdown",
348
- "--standalone",
349
- "--wrap=preserve",
350
- "--quiet",
351
- "--output",
352
- output_file.name,
353
- ]
354
357
 
355
- if extra_args:
356
- command.extend(extra_args)
358
+ async def _handle_extract_file(
359
+ input_file: str | PathLike[str], *, mime_type: str, extra_args: list[str] | None = None
360
+ ) -> str:
361
+ pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
357
362
 
358
- result = await run_sync(
359
- subprocess.run,
360
- command,
361
- capture_output=True,
362
- )
363
+ with NamedTemporaryFile(suffix=".md", delete=False) as output_file:
364
+ try:
365
+ command = [
366
+ "pandoc",
367
+ str(input_file),
368
+ f"--from={pandoc_type}",
369
+ "--to=markdown",
370
+ "--standalone",
371
+ "--wrap=preserve",
372
+ "--quiet",
373
+ "--output",
374
+ output_file.name,
375
+ ]
376
+
377
+ if extra_args:
378
+ command.extend(extra_args)
363
379
 
364
- if result.returncode != 0:
365
- raise ParsingError(
366
- "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
380
+ result = await run_sync(
381
+ subprocess.run,
382
+ command,
383
+ capture_output=True,
367
384
  )
368
385
 
369
- text = await AsyncPath(output_file.name).read_text()
386
+ if result.returncode != 0:
387
+ raise ParsingError(
388
+ "Failed to extract file data", context={"file": str(input_file), "error": result.stderr.decode()}
389
+ )
370
390
 
371
- return normalize_spaces(text)
391
+ text = await AsyncPath(output_file.name).read_text("utf-8")
392
+
393
+ return normalize_spaces(text)
394
+
395
+ except (RuntimeError, OSError) as e:
396
+ raise ParsingError("Failed to extract file data", context={"file": str(input_file)}) from e
397
+
398
+ finally:
399
+ output_file.close()
400
+ await AsyncPath(output_file.name).unlink()
372
401
 
373
402
 
374
403
  async def process_file(
@@ -384,12 +413,12 @@ async def process_file(
384
413
  Returns:
385
414
  PandocResult containing processed content and metadata.
386
415
  """
387
- await validate_pandoc_version()
416
+ await _validate_pandoc_version()
388
417
 
389
418
  metadata, content = await gather(
390
419
  *[
391
- extract_metadata(input_file, mime_type=mime_type),
392
- _extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
420
+ _handle_extract_metadata(input_file, mime_type=mime_type),
421
+ _handle_extract_file(input_file, mime_type=mime_type, extra_args=extra_args),
393
422
  ]
394
423
  )
395
424
  return PandocResult(
@@ -409,8 +438,13 @@ async def process_content(content: bytes, *, mime_type: str, extra_args: list[st
409
438
  Returns:
410
439
  PandocResult containing processed content and metadata.
411
440
  """
412
- extension = _get_extension_from_mime_type(mime_type)
441
+ extension = MIMETYPE_TO_FILE_EXTENSION_MAPPING.get(mime_type) or "md"
442
+
443
+ with NamedTemporaryFile(suffix=f".{extension}", delete=False) as input_file:
444
+ try:
445
+ await AsyncPath(input_file.name).write_bytes(content)
446
+ return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
413
447
 
414
- with NamedTemporaryFile(suffix=f".{extension}") as input_file:
415
- await AsyncPath(input_file.name).write_bytes(content)
416
- return await process_file(input_file.name, mime_type=mime_type, extra_args=extra_args)
448
+ finally:
449
+ input_file.close()
450
+ await AsyncPath(input_file.name).unlink()
kreuzberg/_tesseract.py CHANGED
@@ -186,8 +186,9 @@ async def validate_tesseract_version() -> None:
186
186
  if version_ref["checked"]:
187
187
  return
188
188
 
189
- result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
190
- version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
189
+ command = ["tesseract", "--version"]
190
+ result = await run_sync(subprocess.run, command, capture_output=True)
191
+ version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
191
192
  if not version_match or int(version_match.group(1)) < 5:
192
193
  raise MissingDependencyError("Tesseract version 5 or above is required.")
193
194
 
@@ -213,10 +214,10 @@ async def process_file(
213
214
  Returns:
214
215
  str: Extracted text from the image.
215
216
  """
216
- with NamedTemporaryFile(suffix=".txt") as output_file:
217
+ with NamedTemporaryFile(suffix=".txt", delete=False) as output_file:
217
218
  # this is needed because tesseract adds .txt to the output file
218
- output_file_name = output_file.name.replace(".txt", "")
219
219
  try:
220
+ output_file_name = output_file.name.replace(".txt", "")
220
221
  command = [
221
222
  "tesseract",
222
223
  str(input_file),
@@ -239,11 +240,15 @@ async def process_file(
239
240
  if not result.returncode == 0:
240
241
  raise OCRError("OCR failed with a non-0 return code.")
241
242
 
242
- output = await AsyncPath(output_file.name).read_text()
243
+ output = await AsyncPath(output_file.name).read_text("utf-8")
243
244
  return output.strip()
244
245
  except (RuntimeError, OSError) as e:
245
246
  raise OCRError("Failed to OCR using tesseract") from e
246
247
 
248
+ finally:
249
+ output_file.close()
250
+ await AsyncPath(output_file.name).unlink()
251
+
247
252
 
248
253
  async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
249
254
  """Process a single Pillow Image using Tesseract OCR.
@@ -257,9 +262,14 @@ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMM
257
262
  Returns:
258
263
  str: Extracted text from the image.
259
264
  """
260
- with NamedTemporaryFile(suffix=".png") as image_file:
261
- await run_sync(image.save, image_file.name, format="PNG")
262
- return await process_file(image_file.name, language=language, psm=psm, **kwargs)
265
+ with NamedTemporaryFile(suffix=".png", delete=False) as image_file:
266
+ try:
267
+ await run_sync(image.save, image_file.name, format="PNG")
268
+ return await process_file(image_file.name, language=language, psm=psm, **kwargs)
269
+
270
+ finally:
271
+ image_file.close()
272
+ await AsyncPath(image_file.name).unlink()
263
273
 
264
274
 
265
275
  async def process_image_with_tesseract(
kreuzberg/extraction.py CHANGED
@@ -20,10 +20,12 @@ from kreuzberg._extractors import (
20
20
  extract_content_with_pandoc,
21
21
  extract_file_with_pandoc,
22
22
  extract_html_string,
23
- extract_pdf_file,
23
+ extract_pdf,
24
24
  extract_pptx_file,
25
+ extract_xlsx_file,
25
26
  )
26
27
  from kreuzberg._mime_types import (
28
+ EXCEL_MIME_TYPE,
27
29
  HTML_MIME_TYPE,
28
30
  IMAGE_MIME_TYPE_EXT_MAP,
29
31
  IMAGE_MIME_TYPES,
@@ -69,18 +71,21 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
69
71
  )
70
72
 
71
73
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
72
- with NamedTemporaryFile(suffix=".pdf") as temp_file:
73
- temp_file.write(content)
74
- return ExtractionResult(
75
- content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
76
- )
74
+ return ExtractionResult(content=await extract_pdf(content, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
75
+
76
+ if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
77
+ return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
77
78
 
78
79
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
79
- with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
80
- temp_file.write(content)
81
- return ExtractionResult(
82
- content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
83
- )
80
+ with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type], delete=False) as temp_file:
81
+ try:
82
+ await AsyncPath(temp_file.name).write_bytes(content)
83
+ return ExtractionResult(
84
+ content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
85
+ )
86
+ finally:
87
+ temp_file.close()
88
+ await AsyncPath(temp_file.name).unlink()
84
89
 
85
90
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
86
91
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
@@ -132,7 +137,10 @@ async def extract_file(
132
137
  raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
133
138
 
134
139
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
135
- return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
140
+ return ExtractionResult(content=await extract_pdf(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
141
+
142
+ if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
143
+ return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
136
144
 
137
145
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
138
146
  return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.5.0
3
+ Version: 1.7.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -29,7 +29,8 @@ Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: html-to-markdown>=1.2.0
30
30
  Requires-Dist: pypdfium2>=4.30.1
31
31
  Requires-Dist: python-pptx>=1.0.2
32
- Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
32
+ Requires-Dist: typing-extensions>=4.12.2
33
+ Requires-Dist: xlsx2csv>=0.8.4
33
34
 
34
35
  # Kreuzberg
35
36
 
@@ -68,16 +69,12 @@ pip install kreuzberg
68
69
 
69
70
  ### 2. Install System Dependencies
70
71
 
71
- Kreuzberg requires two open-source tools:
72
+ Kreuzberg requires two system level dependencies:
72
73
 
73
74
  - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
74
-
75
- - GPL v2.0 licensed (used via CLI only)
76
- - Handles office documents and markup formats
77
-
78
75
  - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
79
- - Apache License
80
- - Required for scanned documents and images
76
+
77
+ Please install these using their respective installation guides.
81
78
 
82
79
  ## Architecture
83
80
 
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
87
84
  - `pdfium2` for searchable PDFs
88
85
  - Tesseract OCR for scanned content
89
86
  - **Document Conversion**:
90
- - Pandoc for office documents and markup
87
+ - Pandoc for many document and markup formats
91
88
  - `python-pptx` for PowerPoint files
92
89
  - `html-to-markdown` for HTML content
90
+ - `xlsx2csv` for Excel spreadsheets
93
91
  - **Text Processing**:
94
92
  - Smart encoding detection
95
93
  - Markdown and plain text handling
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
121
119
 
122
120
  #### Data and Research Formats
123
121
 
122
+ - Excel spreadsheets (`.xlsx`)
124
123
  - CSV (`.csv`) and TSV (`.tsv`) files
125
124
  - Jupyter Notebooks (`.ipynb`)
126
125
  - BibTeX (`.bib`) and BibLaTeX (`.bib`)
@@ -232,11 +231,16 @@ async def process_document(path: str) -> tuple[str, str]:
232
231
 
233
232
  ### Error Handling
234
233
 
235
- Kreuzberg provides detailed error handling with two main exception types:
234
+ Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
236
235
 
237
236
  ```python
238
237
  from kreuzberg import extract_file
239
- from kreuzberg.exceptions import ValidationError, ParsingError
238
+ from kreuzberg.exceptions import (
239
+ ValidationError,
240
+ ParsingError,
241
+ OCRError,
242
+ MissingDependencyError
243
+ )
240
244
 
241
245
  async def safe_extract(path: str) -> str:
242
246
  try:
@@ -244,20 +248,31 @@ async def safe_extract(path: str) -> str:
244
248
  return result.content
245
249
 
246
250
  except ValidationError as e:
247
- # Handles input validation issues:
248
- # - Unsupported file types
251
+ # Input validation issues
252
+ # - Unsupported or undetectable MIME types
249
253
  # - Missing files
250
- # - Invalid MIME types
251
- print(f"Invalid input: {e.message}")
252
- print(f"Details: {e.context}")
254
+ # - Invalid input parameters
255
+ print(f"Validation failed: {e}")
256
+
257
+ except OCRError as e:
258
+ # OCR-specific issues
259
+ # - Tesseract processing failures
260
+ # - Image conversion problems
261
+ print(f"OCR failed: {e}")
262
+
263
+ except MissingDependencyError as e:
264
+ # System dependency issues
265
+ # - Missing Tesseract OCR
266
+ # - Missing Pandoc
267
+ # - Incompatible versions
268
+ print(f"Dependency missing: {e}")
253
269
 
254
270
  except ParsingError as e:
255
- # Handles processing errors:
271
+ # General processing errors
256
272
  # - PDF parsing failures
257
- # - OCR errors
258
273
  # - Format conversion issues
259
- print(f"Processing failed: {e.message}")
260
- print(f"Details: {e.context}")
274
+ # - Encoding problems
275
+ print(f"Processing failed: {e}")
261
276
 
262
277
  return ""
263
278
 
@@ -265,24 +280,33 @@ async def safe_extract(path: str) -> str:
265
280
  try:
266
281
  result = await extract_file("document.xyz")
267
282
  except ValidationError as e:
268
- # e.context might contain:
269
- # {
283
+ # Error will include context:
284
+ # ValidationError: Unsupported mime type
285
+ # Context: {
270
286
  # "file_path": "document.xyz",
271
- # "error": "Unsupported file type",
272
- # "supported_types": ["pdf", "docx", ...]
287
+ # "supported_mimetypes": ["application/pdf", ...]
273
288
  # }
289
+ print(e)
274
290
 
275
291
  try:
276
- result = await extract_file("scan.pdf")
277
- except ParsingError as e:
278
- # e.context might contain:
279
- # {
280
- # "file_path": "scan.pdf",
281
- # "error": "OCR processing failed",
282
- # "details": "Tesseract error: Unable to process image"
292
+ result = await extract_file("scan.jpg")
293
+ except OCRError as e:
294
+ # Error will include context:
295
+ # OCRError: OCR failed with a non-0 return code
296
+ # Context: {
297
+ # "file_path": "scan.jpg",
298
+ # "tesseract_version": "5.3.0"
283
299
  # }
300
+ print(e)
284
301
  ```
285
302
 
303
+ All exceptions provide:
304
+
305
+ - A descriptive error message
306
+ - Relevant context in the `context` attribute
307
+ - String representation with both message and context
308
+ - Proper exception chaining for debugging
309
+
286
310
  ## Roadmap
287
311
 
288
312
  V1:
@@ -0,0 +1,15 @@
1
+ kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
+ kreuzberg/_extractors.py,sha256=3VP7oBz0VpmkkhlbKDPjRmnZdHBv4K_xqcyMeeDaetM,9283
3
+ kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
4
+ kreuzberg/_pandoc.py,sha256=zhNJ8_92JMs4gG_Fj-IVwdpZwWsyaK-VTrbLke6NGyU,15097
5
+ kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
6
+ kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
7
+ kreuzberg/_tesseract.py,sha256=Yya15OxB4PBi2QqmrGXF70_SHBD7Luii9sBXzMJlCpU,8168
8
+ kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
9
+ kreuzberg/extraction.py,sha256=_vJ9O8t50a3p4co3hY8b3BdBIXV5S7XOUNl_kD9_FvM,6599
10
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
+ kreuzberg-1.7.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
+ kreuzberg-1.7.0.dist-info/METADATA,sha256=3wKe7X5G1IQfSPNzD0wnS0t81MqoWtQ-cgR-6MBoyec,10355
13
+ kreuzberg-1.7.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
14
+ kreuzberg-1.7.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
+ kreuzberg-1.7.0.dist-info/RECORD,,
@@ -1,15 +0,0 @@
1
- kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=k6xO_2ItaftPmlqzfXyxTn8rdaWdwrJHGziBbo7gCio,6599
3
- kreuzberg/_mime_types.py,sha256=0ZYtRrMAaKpCMDkhpTbWAXHCsVob5MFRMGlbni8iYSA,2573
4
- kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
5
- kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
6
- kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
7
- kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
8
- kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
9
- kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
10
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- kreuzberg-1.5.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
- kreuzberg-1.5.0.dist-info/METADATA,sha256=O462ss7M6Cb8cO6fJXwqsOdzkzaZekqa1oGwb7Vrgx8,9641
13
- kreuzberg-1.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
14
- kreuzberg-1.5.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
- kreuzberg-1.5.0.dist-info/RECORD,,