kreuzberg 1.3.0__py3-none-any.whl → 1.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -4,53 +4,62 @@ import re
4
4
  from contextlib import suppress
5
5
  from html import escape
6
6
  from io import BytesIO
7
- from typing import TYPE_CHECKING, cast
7
+ from typing import TYPE_CHECKING
8
8
 
9
+ import html_to_markdown
10
+ import pptx
11
+ import pypdfium2
9
12
  from anyio import Path as AsyncPath
10
- from charset_normalizer import detect
11
- from html_to_markdown import convert_to_markdown
12
- from pptx import Presentation
13
13
  from pptx.enum.shapes import MSO_SHAPE_TYPE
14
- from pypandoc import convert_file, convert_text
15
- from pypdfium2 import PdfDocument, PdfiumError
16
- from pytesseract import TesseractError, image_to_string
17
14
 
18
- from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
15
+ from kreuzberg._pandoc import process_content, process_file
19
16
  from kreuzberg._string import normalize_spaces, safe_decode
20
17
  from kreuzberg._sync import run_sync
18
+ from kreuzberg._tesseract import batch_process_images
21
19
  from kreuzberg.exceptions import ParsingError
22
20
 
23
21
  if TYPE_CHECKING: # pragma: no cover
24
22
  from pathlib import Path
25
23
 
24
+ from PIL.Image import Image
26
25
 
27
- def _extract_pdf_with_tesseract(file_path: Path) -> str:
28
- """Extract text from a scanned PDF file using pytesseract.
26
+
27
+ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
28
+ """Convert a PDF file to images.
29
29
 
30
30
  Args:
31
31
  file_path: The path to the PDF file.
32
32
 
33
33
  Raises:
34
- ParsingError: If the text could not be extracted from the PDF file.
34
+ ParsingError: If the PDF file could not be converted to images.
35
35
 
36
36
  Returns:
37
- The extracted text.
37
+ A list of Pillow Images.
38
38
  """
39
39
  try:
40
- # make it into an image here:
41
- pdf = PdfDocument(str(file_path))
42
- images = [page.render(scale=2.0).to_pil() for page in pdf]
43
-
44
- text = "\n".join(image_to_string(img) for img in images)
45
- return normalize_spaces(text)
46
- except (PdfiumError, TesseractError) as e:
47
- # TODO: add test case
40
+ pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
41
+ return [page.render(scale=2.0).to_pil() for page in pdf]
42
+ except pypdfium2.PdfiumError as e:
48
43
  raise ParsingError(
49
- "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
44
+ "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
50
45
  ) from e
51
46
 
52
47
 
53
- def _extract_pdf_with_pdfium2(file_path: Path) -> str:
48
+ async def extract_pdf_with_tesseract(file_path: Path) -> str:
49
+ """Extract text from a scanned PDF file using pytesseract.
50
+
51
+ Args:
52
+ file_path: The path to the PDF file.
53
+
54
+ Returns:
55
+ The extracted text.
56
+ """
57
+ images = await convert_pdf_to_images(file_path)
58
+ ocr_results = await batch_process_images(images)
59
+ return normalize_spaces("\n".join(ocr_results))
60
+
61
+
62
+ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
54
63
  """Extract text from a searchable PDF file using pypdfium2.
55
64
 
56
65
  Args:
@@ -63,17 +72,16 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
63
72
  The extracted text.
64
73
  """
65
74
  try:
66
- document = PdfDocument(file_path)
75
+ document = await run_sync(pypdfium2.PdfDocument, file_path)
67
76
  text = "\n".join(page.get_textpage().get_text_range() for page in document)
68
77
  return normalize_spaces(text)
69
- except PdfiumError as e:
70
- # TODO: add test case
78
+ except pypdfium2.PdfiumError as e:
71
79
  raise ParsingError(
72
80
  "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
73
81
  ) from e
74
82
 
75
83
 
76
- async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
84
+ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
77
85
  """Extract text from a PDF file.
78
86
 
79
87
  Args:
@@ -83,84 +91,41 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
83
91
  Returns:
84
92
  The extracted text.
85
93
  """
86
- if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
94
+ if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
87
95
  return normalize_spaces(content)
88
96
 
89
- return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
97
+ return await extract_pdf_with_tesseract(file_path)
90
98
 
91
99
 
92
- async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
100
+ async def extract_content_with_pandoc(file_data: bytes, mime_type: str) -> str:
93
101
  """Extract text using pandoc.
94
102
 
95
103
  Args:
96
104
  file_data: The content of the file.
97
105
  mime_type: The mime type of the file.
98
- encoding: An optional encoding to use when decoding the string.
99
-
100
- Raises:
101
- ParsingError: If the text could not be extracted from the file using pandoc.
102
106
 
103
107
  Returns:
104
108
  The extracted text.
105
109
  """
106
- ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
107
- encoding = encoding or detect(file_data)["encoding"] or "utf-8"
108
- try:
109
- return normalize_spaces(
110
- cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
111
- )
112
- except RuntimeError as e:
113
- # TODO: add test case
114
- raise ParsingError(
115
- f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
116
- context={"error": str(e)},
117
- ) from e
110
+ result = await process_content(file_data, mime_type=mime_type)
111
+ return normalize_spaces(result.content)
118
112
 
119
113
 
120
- async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
114
+ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
121
115
  """Extract text using pandoc.
122
116
 
123
117
  Args:
124
118
  file_path: The path to the file.
125
119
  mime_type: The mime type of the file.
126
120
 
127
- Raises:
128
- ParsingError: If the text could not be extracted from the file using pandoc.
129
-
130
121
  Returns:
131
122
  The extracted text.
132
123
  """
133
- ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
134
- try:
135
- return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
136
- except RuntimeError as e:
137
- raise ParsingError(
138
- f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
139
- context={"file_path": str(file_path), "error": str(e)},
140
- ) from e
141
-
142
-
143
- async def _extract_image_with_tesseract(file_path: Path | str) -> str:
144
- """Extract text from an image file.
145
-
146
- Args:
147
- file_path: The path to the image file.
148
-
149
- Raises:
150
- ParsingError: If the text could not be extracted from the image file.
151
-
152
- Returns:
153
- The extracted content.
154
- """
155
- try:
156
- return normalize_spaces(cast(str, image_to_string(str(file_path))))
157
- except TesseractError as e:
158
- raise ParsingError(
159
- "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
160
- ) from e
124
+ result = await process_file(file_path, mime_type=mime_type)
125
+ return normalize_spaces(result.content)
161
126
 
162
127
 
163
- async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
128
+ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
164
129
  """Extract text from a PPTX file.
165
130
 
166
131
  Notes:
@@ -178,7 +143,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
178
143
  if isinstance(file_path_or_contents, bytes)
179
144
  else await AsyncPath(file_path_or_contents).read_bytes()
180
145
  )
181
- presentation = Presentation(BytesIO(file_contents))
146
+ presentation = pptx.Presentation(BytesIO(file_contents))
182
147
 
183
148
  for index, slide in enumerate(presentation.slides):
184
149
  md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
@@ -230,7 +195,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
230
195
  return normalize_spaces(md_content)
231
196
 
232
197
 
233
- async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
198
+ async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
234
199
  """Extract text from an HTML string.
235
200
 
236
201
  Args:
@@ -244,4 +209,4 @@ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
244
209
  if isinstance(file_path_or_contents, bytes)
245
210
  else await AsyncPath(file_path_or_contents).read_text()
246
211
  )
247
- return normalize_spaces(await run_sync(convert_to_markdown, content))
212
+ return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))
kreuzberg/_mime_types.py CHANGED
@@ -54,44 +54,35 @@ IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
54
54
  "image/x-portable-pixmap": "ppm",
55
55
  }
56
56
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
57
- "application/csv",
58
- "application/latex",
57
+ "application/csl+json",
58
+ "application/docbook+xml",
59
+ "application/epub+zip",
59
60
  "application/rtf",
60
61
  "application/vnd.oasis.opendocument.text",
61
62
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
62
- "application/x-csv",
63
+ "application/x-biblatex",
64
+ "application/x-bibtex",
65
+ "application/x-endnote+xml",
66
+ "application/x-fictionbook+xml",
67
+ "application/x-ipynb+json",
68
+ "application/x-jats+xml",
63
69
  "application/x-latex",
64
- "application/x-rtf",
65
- "application/x-vnd.oasis.opendocument.text",
70
+ "application/x-opml+xml",
71
+ "application/x-research-info-systems",
72
+ "application/x-typst",
66
73
  "text/csv",
67
- "text/latex",
68
- "text/rst",
69
- "text/rtf",
70
74
  "text/tab-separated-values",
71
- "text/x-csv",
72
- "text/x-latex",
75
+ "text/troff",
76
+ "text/x-commonmark",
77
+ "text/x-dokuwiki",
78
+ "text/x-gfm",
79
+ "text/x-markdown",
80
+ "text/x-markdown-extra",
81
+ "text/x-mdoc",
82
+ "text/x-multimarkdown",
83
+ "text/x-org",
84
+ "text/x-pod",
73
85
  "text/x-rst",
74
- "text/x-tsv",
75
- }
76
- PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
77
- "application/csv": "csv",
78
- "application/latex": "latex",
79
- "application/rtf": "rtf",
80
- "application/vnd.oasis.opendocument.text": "odt",
81
- "application/vnd.openxmlformats-officedocument.wordprocessingml.document": "docx",
82
- "application/x-csv": "csv",
83
- "application/x-latex": "latex",
84
- "application/x-rtf": "rtf",
85
- "application/x-vnd.oasis.opendocument.text": "odt",
86
- "text/csv": "csv",
87
- "text/latex": "latex",
88
- "text/rst": "rst",
89
- "text/rtf": "rtf",
90
- "text/tab-separated-values": "tsv",
91
- "text/x-csv": "csv",
92
- "text/x-latex": "latex",
93
- "text/x-rst": "rst",
94
- "text/x-tsv": "tsv",
95
86
  }
96
87
 
97
88
  SUPPORTED_MIME_TYPES: Final[set[str]] = (