kreuzberg 1.3.0__py3-none-any.whl → 1.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -6,51 +6,61 @@ from html import escape
6
6
  from io import BytesIO
7
7
  from typing import TYPE_CHECKING, cast
8
8
 
9
+ import html_to_markdown
10
+ import pptx
11
+ import pypandoc
12
+ import pypdfium2
9
13
  from anyio import Path as AsyncPath
10
14
  from charset_normalizer import detect
11
- from html_to_markdown import convert_to_markdown
12
- from pptx import Presentation
13
- from pptx.enum.shapes import MSO_SHAPE_TYPE
14
- from pypandoc import convert_file, convert_text
15
- from pypdfium2 import PdfDocument, PdfiumError
16
- from pytesseract import TesseractError, image_to_string
17
15
 
18
16
  from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
19
17
  from kreuzberg._string import normalize_spaces, safe_decode
20
18
  from kreuzberg._sync import run_sync
19
+ from kreuzberg._tesseract import batch_process_images
21
20
  from kreuzberg.exceptions import ParsingError
22
21
 
23
22
  if TYPE_CHECKING: # pragma: no cover
24
23
  from pathlib import Path
25
24
 
25
+ from PIL.Image import Image
26
26
 
27
- def _extract_pdf_with_tesseract(file_path: Path) -> str:
28
- """Extract text from a scanned PDF file using pytesseract.
27
+
28
+ async def convert_pdf_to_images(file_path: Path) -> list[Image]:
29
+ """Convert a PDF file to images.
29
30
 
30
31
  Args:
31
32
  file_path: The path to the PDF file.
32
33
 
33
34
  Raises:
34
- ParsingError: If the text could not be extracted from the PDF file.
35
+ ParsingError: If the PDF file could not be converted to images.
35
36
 
36
37
  Returns:
37
- The extracted text.
38
+ A list of Pillow Images.
38
39
  """
39
40
  try:
40
- # make it into an image here:
41
- pdf = PdfDocument(str(file_path))
42
- images = [page.render(scale=2.0).to_pil() for page in pdf]
43
-
44
- text = "\n".join(image_to_string(img) for img in images)
45
- return normalize_spaces(text)
46
- except (PdfiumError, TesseractError) as e:
47
- # TODO: add test case
41
+ pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
42
+ return [page.render(scale=2.0).to_pil() for page in pdf]
43
+ except pypdfium2.PdfiumError as e:
48
44
  raise ParsingError(
49
- "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
45
+ "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
50
46
  ) from e
51
47
 
52
48
 
53
- def _extract_pdf_with_pdfium2(file_path: Path) -> str:
49
+ async def extract_pdf_with_tesseract(file_path: Path) -> str:
50
+ """Extract text from a scanned PDF file using pytesseract.
51
+
52
+ Args:
53
+ file_path: The path to the PDF file.
54
+
55
+ Returns:
56
+ The extracted text.
57
+ """
58
+ images = await convert_pdf_to_images(file_path)
59
+ ocr_results = await batch_process_images(images)
60
+ return normalize_spaces("\n".join(ocr_results))
61
+
62
+
63
+ async def extract_pdf_with_pdfium2(file_path: Path) -> str:
54
64
  """Extract text from a searchable PDF file using pypdfium2.
55
65
 
56
66
  Args:
@@ -63,17 +73,16 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
63
73
  The extracted text.
64
74
  """
65
75
  try:
66
- document = PdfDocument(file_path)
76
+ document = await run_sync(pypdfium2.PdfDocument, file_path)
67
77
  text = "\n".join(page.get_textpage().get_text_range() for page in document)
68
78
  return normalize_spaces(text)
69
- except PdfiumError as e:
70
- # TODO: add test case
79
+ except pypdfium2.PdfiumError as e:
71
80
  raise ParsingError(
72
81
  "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
73
82
  ) from e
74
83
 
75
84
 
76
- async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
85
+ async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
77
86
  """Extract text from a PDF file.
78
87
 
79
88
  Args:
@@ -83,13 +92,13 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
83
92
  Returns:
84
93
  The extracted text.
85
94
  """
86
- if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
95
+ if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
87
96
  return normalize_spaces(content)
88
97
 
89
- return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
98
+ return await extract_pdf_with_tesseract(file_path)
90
99
 
91
100
 
92
- async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
101
+ async def extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
93
102
  """Extract text using pandoc.
94
103
 
95
104
  Args:
@@ -107,7 +116,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
107
116
  encoding = encoding or detect(file_data)["encoding"] or "utf-8"
108
117
  try:
109
118
  return normalize_spaces(
110
- cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
119
+ cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
111
120
  )
112
121
  except RuntimeError as e:
113
122
  # TODO: add test case
@@ -117,7 +126,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
117
126
  ) from e
118
127
 
119
128
 
120
- async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
129
+ async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
121
130
  """Extract text using pandoc.
122
131
 
123
132
  Args:
@@ -132,7 +141,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
132
141
  """
133
142
  ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
134
143
  try:
135
- return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
144
+ return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
136
145
  except RuntimeError as e:
137
146
  raise ParsingError(
138
147
  f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
@@ -140,27 +149,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
140
149
  ) from e
141
150
 
142
151
 
143
- async def _extract_image_with_tesseract(file_path: Path | str) -> str:
144
- """Extract text from an image file.
145
-
146
- Args:
147
- file_path: The path to the image file.
148
-
149
- Raises:
150
- ParsingError: If the text could not be extracted from the image file.
151
-
152
- Returns:
153
- The extracted content.
154
- """
155
- try:
156
- return normalize_spaces(cast(str, image_to_string(str(file_path))))
157
- except TesseractError as e:
158
- raise ParsingError(
159
- "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
160
- ) from e
161
-
162
-
163
- async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
152
+ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
164
153
  """Extract text from a PPTX file.
165
154
 
166
155
  Notes:
@@ -172,13 +161,15 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
172
161
  Returns:
173
162
  The extracted text content
174
163
  """
164
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
165
+
175
166
  md_content = ""
176
167
  file_contents = (
177
168
  file_path_or_contents
178
169
  if isinstance(file_path_or_contents, bytes)
179
170
  else await AsyncPath(file_path_or_contents).read_bytes()
180
171
  )
181
- presentation = Presentation(BytesIO(file_contents))
172
+ presentation = pptx.Presentation(BytesIO(file_contents))
182
173
 
183
174
  for index, slide in enumerate(presentation.slides):
184
175
  md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
@@ -230,7 +221,7 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
230
221
  return normalize_spaces(md_content)
231
222
 
232
223
 
233
- async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
224
+ async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
234
225
  """Extract text from an HTML string.
235
226
 
236
227
  Args:
@@ -244,4 +235,4 @@ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
244
235
  if isinstance(file_path_or_contents, bytes)
245
236
  else await AsyncPath(file_path_or_contents).read_text()
246
237
  )
247
- return normalize_spaces(await run_sync(convert_to_markdown, content))
238
+ return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))
kreuzberg/_string.py CHANGED
@@ -4,6 +4,8 @@ from contextlib import suppress
4
4
 
5
5
  from charset_normalizer import detect
6
6
 
7
+ from kreuzberg.exceptions import ParsingError
8
+
7
9
 
8
10
  def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
9
11
  """Decode a byte string safely, removing invalid sequences.
@@ -12,27 +14,22 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
12
14
  byte_data: The byte string to decode.
13
15
  encoding: The encoding to use when decoding the byte string.
14
16
 
17
+ Raises:
18
+ ParsingError: If the byte string could not be decoded.
19
+
15
20
  Returns:
16
21
  The decoded string.
17
22
  """
18
23
  if not byte_data:
19
24
  return ""
20
25
 
21
- encodings = ["utf-8", "latin-1"]
22
-
23
- if encoding:
24
- with suppress(UnicodeDecodeError):
25
- return byte_data.decode(encoding, errors="ignore")
26
-
27
- if encoding := detect(byte_data).get("encoding"):
28
- encodings.append(encoding)
26
+ encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]
29
27
 
30
- for encoding in encodings:
28
+ for enc in [e for e in encodings if e]:
31
29
  with suppress(UnicodeDecodeError):
32
- return byte_data.decode(encoding, errors="ignore")
30
+ return byte_data.decode(enc)
33
31
 
34
- # TODO: add test case
35
- return byte_data.decode("latin-1", errors="replace")
32
+ raise ParsingError("Could not decode byte string. Please provide an encoding.")
36
33
 
37
34
 
38
35
  def normalize_spaces(text: str) -> str:
kreuzberg/_tesseract.py ADDED
@@ -0,0 +1,318 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ import subprocess
5
+ from asyncio import gather
6
+ from enum import Enum
7
+ from os import PathLike
8
+ from tempfile import NamedTemporaryFile
9
+ from typing import Any, Literal, TypeVar, Union
10
+
11
+ from anyio import Path as AsyncPath
12
+ from PIL.Image import Image
13
+
14
+ from kreuzberg._sync import run_sync
15
+ from kreuzberg.exceptions import MissingDependencyError, OCRError
16
+
17
+ version_ref = {"checked": False}
18
+
19
+ T = TypeVar("T", bound=Union[Image, PathLike[str], str])
20
+
21
+ SupportedLanguages = Literal[
22
+ "afr",
23
+ "amh",
24
+ "ara",
25
+ "asm",
26
+ "aze",
27
+ "aze_cyrl",
28
+ "bel",
29
+ "ben",
30
+ "bod",
31
+ "bos",
32
+ "bre",
33
+ "bul",
34
+ "cat",
35
+ "ceb",
36
+ "ces",
37
+ "chi_sim",
38
+ "chi_tra",
39
+ "chr",
40
+ "cos",
41
+ "cym",
42
+ "dan",
43
+ "dan_frak",
44
+ "deu",
45
+ "deu_frak",
46
+ "deu_latf",
47
+ "dzo",
48
+ "ell",
49
+ "eng",
50
+ "enm",
51
+ "epo",
52
+ "equ",
53
+ "est",
54
+ "eus",
55
+ "fao",
56
+ "fas",
57
+ "fil",
58
+ "fin",
59
+ "fra",
60
+ "frk",
61
+ "frm",
62
+ "fry",
63
+ "gla",
64
+ "gle",
65
+ "glg",
66
+ "grc",
67
+ "guj",
68
+ "hat",
69
+ "heb",
70
+ "hin",
71
+ "hrv",
72
+ "hun",
73
+ "hye",
74
+ "iku",
75
+ "ind",
76
+ "isl",
77
+ "ita",
78
+ "ita_old",
79
+ "jav",
80
+ "jpn",
81
+ "kan",
82
+ "kat",
83
+ "kat_old",
84
+ "kaz",
85
+ "khm",
86
+ "kir",
87
+ "kmr",
88
+ "kor",
89
+ "kor_vert",
90
+ "kur",
91
+ "lao",
92
+ "lat",
93
+ "lav",
94
+ "lit",
95
+ "ltz",
96
+ "mal",
97
+ "mar",
98
+ "mkd",
99
+ "mlt",
100
+ "mon",
101
+ "mri",
102
+ "msa",
103
+ "mya",
104
+ "nep",
105
+ "nld",
106
+ "nor",
107
+ "oci",
108
+ "ori",
109
+ "osd",
110
+ "pan",
111
+ "pol",
112
+ "por",
113
+ "pus",
114
+ "que",
115
+ "ron",
116
+ "rus",
117
+ "san",
118
+ "sin",
119
+ "slk",
120
+ "slk_frak",
121
+ "slv",
122
+ "snd",
123
+ "spa",
124
+ "spa_old",
125
+ "sqi",
126
+ "srp",
127
+ "srp_latn",
128
+ "sun",
129
+ "swa",
130
+ "swe",
131
+ "syr",
132
+ "tam",
133
+ "tat",
134
+ "tel",
135
+ "tgk",
136
+ "tgl",
137
+ "tha",
138
+ "tir",
139
+ "ton",
140
+ "tur",
141
+ "uig",
142
+ "ukr",
143
+ "urd",
144
+ "uzb",
145
+ "uzb_cyrl",
146
+ "vie",
147
+ "yid",
148
+ "yor",
149
+ ]
150
+
151
+
152
+ class PSMMode(Enum):
153
+ """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
154
+
155
+ OSD_ONLY = 0
156
+ """Orientation and script detection only."""
157
+ AUTO_OSD = 1
158
+ """Automatic page segmentation with orientation and script detection."""
159
+ AUTO_ONLY = 2
160
+ """Automatic page segmentation without OSD."""
161
+ AUTO = 3
162
+ """Fully automatic page segmentation (default)."""
163
+ SINGLE_COLUMN = 4
164
+ """Assume a single column of text."""
165
+ SINGLE_BLOCK_VERTICAL = 5
166
+ """Assume a single uniform block of vertically aligned text."""
167
+ SINGLE_BLOCK = 6
168
+ """Assume a single uniform block of text."""
169
+ SINGLE_LINE = 7
170
+ """Treat the image as a single text line."""
171
+ SINGLE_WORD = 8
172
+ """Treat the image as a single word."""
173
+ CIRCLE_WORD = 9
174
+ """Treat the image as a single word in a circle."""
175
+ SINGLE_CHAR = 10
176
+ """Treat the image as a single character."""
177
+
178
+
179
+ async def validate_tesseract_version() -> None:
180
+ """Validate that Tesseract is installed and is version 5 or above.
181
+
182
+ Raises:
183
+ MissingDependencyError: If Tesseract is not installed or is below version 5.
184
+ """
185
+ try:
186
+ if version_ref["checked"]:
187
+ return
188
+
189
+ result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
190
+ version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
191
+ if not version_match or int(version_match.group(1)) < 5:
192
+ raise MissingDependencyError("Tesseract version 5 or above is required.")
193
+
194
+ version_ref["checked"] = True
195
+ except FileNotFoundError as e:
196
+ raise MissingDependencyError("Tesseract is not installed.") from e
197
+
198
+
199
+ async def process_file(
200
+ input_file: str | PathLike[str], *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any
201
+ ) -> str:
202
+ """Process a single image file using Tesseract OCR.
203
+
204
+ Args:
205
+ input_file: The path to the image file to process.
206
+ language: The language code for OCR.
207
+ psm: Page segmentation mode.
208
+ **kwargs: Additional Tesseract configuration options as key-value pairs.
209
+
210
+ Raises:
211
+ OCRError: If OCR fails to extract text from the image.
212
+
213
+ Returns:
214
+ str: Extracted text from the image.
215
+ """
216
+ with NamedTemporaryFile(suffix=".txt") as output_file:
217
+ # this is needed because tesseract adds .txt to the output file
218
+ output_file_name = output_file.name.replace(".txt", "")
219
+ try:
220
+ command = [
221
+ "tesseract",
222
+ str(input_file),
223
+ output_file_name,
224
+ "-l",
225
+ language,
226
+ "--psm",
227
+ str(psm.value),
228
+ ]
229
+
230
+ for key, value in kwargs.items():
231
+ command.extend(["-c", f"{key}={value}"])
232
+
233
+ result = await run_sync(
234
+ subprocess.run,
235
+ command,
236
+ capture_output=True,
237
+ )
238
+
239
+ if not result.returncode == 0:
240
+ raise OCRError("OCR failed with a non-0 return code.")
241
+
242
+ output = await AsyncPath(output_file.name).read_text()
243
+ return output.strip()
244
+ except (RuntimeError, OSError) as e:
245
+ raise OCRError("Failed to OCR using tesseract") from e
246
+
247
+
248
+ async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
249
+ """Process a single Pillow Image using Tesseract OCR.
250
+
251
+ Args:
252
+ image: The Pillow Image to process.
253
+ language: The language code for OCR.
254
+ psm: Page segmentation mode.
255
+ **kwargs: Additional Tesseract configuration options as key-value pairs.
256
+
257
+ Returns:
258
+ str: Extracted text from the image.
259
+ """
260
+ with NamedTemporaryFile(suffix=".png") as image_file:
261
+ await run_sync(image.save, image_file.name, format="PNG")
262
+ return await process_file(image_file.name, language=language, psm=psm, **kwargs)
263
+
264
+
265
+ async def process_image_with_tesseract(
266
+ image: Image | PathLike[str] | str,
267
+ *,
268
+ language: SupportedLanguages = "eng",
269
+ psm: PSMMode = PSMMode.AUTO,
270
+ **kwargs: Any,
271
+ ) -> str:
272
+ """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
273
+
274
+ Args:
275
+ image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
276
+ language: The language code for OCR (default: "eng").
277
+ psm: Page segmentation mode (default: PSMMode.AUTO).
278
+ **kwargs: Additional Tesseract configuration options as key-value pairs.
279
+
280
+ Raises:
281
+ ValueError: If the input is not a Pillow Image or a list of Pillow Images.
282
+
283
+ Returns:
284
+ Extracted text as a string
285
+ """
286
+ await validate_tesseract_version()
287
+
288
+ if isinstance(image, Image):
289
+ return await process_image(image, language=language, psm=psm, **kwargs)
290
+
291
+ if isinstance(image, (PathLike, str)):
292
+ return await process_file(image, language=language, psm=psm, **kwargs)
293
+
294
+ raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
295
+
296
+
297
+ async def batch_process_images(
298
+ images: list[T],
299
+ *,
300
+ language: SupportedLanguages = "eng",
301
+ psm: PSMMode = PSMMode.AUTO,
302
+ **kwargs: Any,
303
+ ) -> list[str]:
304
+ """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
305
+
306
+ Args:
307
+ images: A list of Pillow Images, paths or strings to process.
308
+ language: The language code for OCR (default: "eng").
309
+ psm: Page segmentation mode (default: PSMMode.AUTO).
310
+ **kwargs: Additional Tesseract configuration options as key-value pairs.
311
+
312
+ Returns:
313
+ Extracted text as a string (for single image) or a list of strings (for multiple images).
314
+ """
315
+ await validate_tesseract_version()
316
+ return await gather(
317
+ *[process_image_with_tesseract(image, language=language, psm=psm, **kwargs) for image in images]
318
+ )
kreuzberg/exceptions.py CHANGED
@@ -10,7 +10,7 @@ class KreuzbergError(Exception):
10
10
  context: Any
11
11
  """The context of the error."""
12
12
 
13
- def __init__(self, message: str, context: Any = None) -> None:
13
+ def __init__(self, message: str, *, context: Any = None) -> None:
14
14
  self.context = context
15
15
  super().__init__(message)
16
16
 
@@ -27,3 +27,11 @@ class ParsingError(KreuzbergError):
27
27
 
28
28
  class ValidationError(KreuzbergError):
29
29
  """Raised when a validation error occurs."""
30
+
31
+
32
+ class MissingDependencyError(KreuzbergError):
33
+ """Raised when a dependency is missing."""
34
+
35
+
36
+ class OCRError(KreuzbergError):
37
+ """Raised when an OCR error occurs."""
kreuzberg/extraction.py CHANGED
@@ -17,12 +17,11 @@ from typing import NamedTuple
17
17
  from anyio import Path as AsyncPath
18
18
 
19
19
  from kreuzberg._extractors import (
20
- _extract_content_with_pandoc,
21
- _extract_file_with_pandoc,
22
- _extract_html_string,
23
- _extract_image_with_tesseract,
24
- _extract_pdf_file,
25
- _extract_pptx_file,
20
+ extract_content_with_pandoc,
21
+ extract_file_with_pandoc,
22
+ extract_html_string,
23
+ extract_pdf_file,
24
+ extract_pptx_file,
26
25
  )
27
26
  from kreuzberg._mime_types import (
28
27
  HTML_MIME_TYPE,
@@ -36,6 +35,7 @@ from kreuzberg._mime_types import (
36
35
  SUPPORTED_MIME_TYPES,
37
36
  )
38
37
  from kreuzberg._string import safe_decode
38
+ from kreuzberg._tesseract import process_image_with_tesseract
39
39
  from kreuzberg.exceptions import ValidationError
40
40
 
41
41
 
@@ -72,28 +72,28 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
72
72
  with NamedTemporaryFile(suffix=".pdf") as temp_file:
73
73
  temp_file.write(content)
74
74
  return ExtractionResult(
75
- content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
75
+ content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
76
76
  )
77
77
 
78
78
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
79
79
  with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
80
80
  temp_file.write(content)
81
81
  return ExtractionResult(
82
- content=await _extract_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
82
+ content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
83
83
  )
84
84
 
85
85
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
86
86
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
87
87
  ):
88
88
  return ExtractionResult(
89
- content=await _extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
89
+ content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
90
90
  )
91
91
 
92
92
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
93
- return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
93
+ return ExtractionResult(content=await extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
94
94
 
95
95
  if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
96
- return ExtractionResult(content=await _extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
96
+ return ExtractionResult(content=await extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
97
97
 
98
98
  return ExtractionResult(
99
99
  content=safe_decode(content),
@@ -132,22 +132,22 @@ async def extract_file(
132
132
  raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
133
133
 
134
134
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
135
- return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
135
+ return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
136
136
 
137
137
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
138
- return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
138
+ return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
139
139
 
140
140
  if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
141
141
  mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
142
142
  ):
143
143
  return ExtractionResult(
144
- content=await _extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
144
+ content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
145
145
  )
146
146
 
147
147
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
148
- return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
148
+ return ExtractionResult(content=await extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
149
149
 
150
150
  if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
151
- return ExtractionResult(content=await _extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
151
+ return ExtractionResult(content=await extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
152
152
 
153
153
  return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
{kreuzberg-1.3.0.dist-info → kreuzberg-1.4.0.dist-info}/METADATA RENAMED
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.3.0
3
+ Version: 1.4.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -28,9 +28,7 @@ Requires-Dist: charset-normalizer>=3.4.1
28
28
  Requires-Dist: html-to-markdown>=1.2.0
29
29
  Requires-Dist: pypandoc>=1.15
30
30
  Requires-Dist: pypdfium2>=4.30.1
31
- Requires-Dist: pytesseract>=0.3.13
32
31
  Requires-Dist: python-pptx>=1.0.2
33
- Requires-Dist: typing-extensions>=4.12.2
34
32
 
35
33
  # Kreuzberg
36
34
 
kreuzberg-1.4.0.dist-info/RECORD ADDED
@@ -0,0 +1,14 @@
1
+ kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
+ kreuzberg/_extractors.py,sha256=Z6fxNMODsiNGPBv8gYpZ0jrc2hPbX-56xdrVPJ-6SQ4,7658
3
+ kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
4
+ kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
5
+ kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
+ kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
7
+ kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
8
+ kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
9
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
10
+ kreuzberg-1.4.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
11
+ kreuzberg-1.4.0.dist-info/METADATA,sha256=ul0iSWSu_1i029aq8X4T4ZboOzWpKK8wZRuvvLVqAoQ,8503
12
+ kreuzberg-1.4.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
13
+ kreuzberg-1.4.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
14
+ kreuzberg-1.4.0.dist-info/RECORD,,
kreuzberg-1.3.0.dist-info/RECORD REMOVED
@@ -1,13 +0,0 @@
1
- kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=eiWPpjnZOZFDwlQL4XsgavJEWqxGtzLVvS8YU28RBAo,8095
3
- kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
4
- kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
5
- kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
- kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
7
- kreuzberg/extraction.py,sha256=cgX8uoCVXf-Va30g8T8DwrZUqsSPHIzmPfDgnWOqNNU,6148
8
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- kreuzberg-1.3.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
10
- kreuzberg-1.3.0.dist-info/METADATA,sha256=3wiaAuaiA865lg5oCjwlAKaZqRQn1w8VqaQXeoEdip4,8579
11
- kreuzberg-1.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
- kreuzberg-1.3.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
13
- kreuzberg-1.3.0.dist-info/RECORD,,