kreuzberg 1.7.0__py3-none-any.whl → 2.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_pdf.py ADDED
@@ -0,0 +1,189 @@
1
+ from __future__ import annotations
2
+
3
+ from re import Pattern
4
+ from re import compile as compile_regex
5
+ from typing import TYPE_CHECKING, Final, cast
6
+
7
+ import pypdfium2
8
+ from anyio import Path as AsyncPath
9
+
10
+ from kreuzberg import ExtractionResult
11
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
12
+ from kreuzberg._string import normalize_spaces
13
+ from kreuzberg._sync import run_sync
14
+ from kreuzberg._tesseract import PSMMode, batch_process_images
15
+ from kreuzberg.exceptions import ParsingError
16
+
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from pathlib import Path
19
+
20
+ from PIL.Image import Image
21
+
22
+
23
+ # Pattern to detect common PDF text extraction corruption:
24
+ # - Control and non-printable characters
25
+ # - Unicode replacement and invalid characters
26
+ # - Zero-width spaces and other invisible characters
27
+ CORRUPTED_PATTERN: Final[Pattern[str]] = compile_regex(
28
+ r"[\x00-\x08\x0B-\x1F\x7F-\x9F]|\uFFFD|[\u200B-\u200F\u2028-\u202F]"
29
+ )
30
+
31
+
32
+ def _validate_extracted_text(text: str) -> bool:
33
+ """Check if text extracted from PDF is valid or corrupted.
34
+
35
+ This checks for common indicators of corrupted PDF text extraction:
36
+ 1. Empty or whitespace-only text
37
+ 2. Control characters and other non-printable characters
38
+ 3. Unicode replacement characters
39
+ 4. Zero-width spaces and other invisible characters
40
+
41
+ Args:
42
+ text: The extracted text to validate
43
+
44
+ Returns:
45
+ True if the text appears valid, False if it seems corrupted
46
+ """
47
+ # Check for empty or whitespace-only text
48
+ if not text or not text.strip():
49
+ return False
50
+
51
+ # Check for corruption indicators
52
+ return not bool(CORRUPTED_PATTERN.search(text))
53
+
54
+
55
async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
    """Convert a PDF file to images.

    Each page is rendered at 2x scale and converted to a Pillow image.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the PDF file could not be converted to images.

    Returns:
        A list of Pillow Images.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        rendered: list[Image] = []
        for page in cast(pypdfium2.PdfDocument, document):
            rendered.append(page.render(scale=2.0).to_pil())
        return rendered
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Release the underlying pdfium handle whether or not rendering succeeded.
        if document:
            await run_sync(document.close)
78
+
79
+
80
async def _extract_pdf_text_with_ocr(
    input_file: Path,
    *,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a scanned PDF file using pytesseract.

    The PDF is rendered page-by-page to images, which are then OCR'd in a batch.

    Args:
        input_file: The path to the PDF file.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text.
    """
    page_images = await _convert_pdf_to_images(input_file)
    ocr_results = await batch_process_images(page_images, max_processes=max_processes, psm=psm, language=language)
    joined = "\n".join(result.content for result in ocr_results)
    return ExtractionResult(content=joined, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
103
+
104
+
105
async def _extract_pdf_searchable_text(input_file: Path) -> str:
    """Extract text from a searchable PDF file using pypdfium2.

    Args:
        input_file: The path to the PDF file.

    Raises:
        ParsingError: If the text could not be extracted from the PDF file.

    Returns:
        The extracted text.
    """
    document: pypdfium2.PdfDocument | None = None
    try:
        document = await run_sync(pypdfium2.PdfDocument, str(input_file))
        page_texts = [
            page.get_textpage().get_text_bounded() for page in cast(pypdfium2.PdfDocument, document)
        ]
        return normalize_spaces("\n".join(page_texts))
    except pypdfium2.PdfiumError as e:
        raise ParsingError(
            "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
        ) from e
    finally:
        # Release the underlying pdfium handle whether or not extraction succeeded.
        if document:
            await run_sync(document.close)
129
+
130
+
131
async def extract_pdf_file(
    input_file: Path,
    *,
    force_ocr: bool,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file.

    Tries the searchable text layer first (unless OCR is forced); falls back to
    OCR when the text layer is absent or looks corrupted.

    Args:
        input_file: The path to the PDF file.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text.
    """
    if not force_ocr:
        content = await _extract_pdf_searchable_text(input_file)
        if content and _validate_extracted_text(content):
            return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})

    return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
159
+
160
+
161
async def extract_pdf_content(
    content: bytes,
    *,
    force_ocr: bool,
    language: str = "eng",
    max_processes: int,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Extract text from a PDF file content.

    The bytes are written to a temporary ``.pdf`` file which is then processed
    by :func:`extract_pdf_file`.

    Args:
        content: The PDF file content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted text.
    """
    from kreuzberg._tmp import create_temp_file

    file_path, unlink = await create_temp_file(".pdf")
    try:
        await AsyncPath(file_path).write_bytes(content)
        return await extract_pdf_file(
            file_path, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
        )
    finally:
        # Always remove the temp file, even when extraction raises (e.g. ParsingError);
        # previously a failure left the temporary file behind.
        await unlink()
kreuzberg/_pptx.py ADDED
@@ -0,0 +1,88 @@
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from contextlib import suppress
5
+ from html import escape
6
+ from io import BytesIO
7
+ from typing import TYPE_CHECKING
8
+
9
+ import pptx
10
+ from anyio import Path as AsyncPath
11
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
12
+
13
+ from kreuzberg import ExtractionResult
14
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
15
+ from kreuzberg._string import normalize_spaces
16
+
17
+ if TYPE_CHECKING: # pragma: no cover
18
+ from pathlib import Path
19
+
20
+
21
async def extract_pptx_file_content(file_path_or_contents: Path | bytes) -> ExtractionResult:
    """Extract text from a PPTX file and render it as Markdown.

    Slides are emitted in order, each preceded by an HTML comment marker.
    Pictures become Markdown image links, tables become inline HTML tables,
    titled text frames become level-1 headings, and speaker notes are appended
    under a "Notes" heading.

    Notes:
        This function is based on code vendored from `markitdown`, which has an MIT license as well.

    Args:
        file_path_or_contents: The path to the PPTX file or its contents as bytes.

    Returns:
        The extracted text content
    """
    md_content = ""
    file_contents = (
        file_path_or_contents
        if isinstance(file_path_or_contents, bytes)
        else await AsyncPath(file_path_or_contents).read_bytes()
    )
    presentation = pptx.Presentation(BytesIO(file_contents))

    for index, slide in enumerate(presentation.slides):
        md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"

        title = slide.shapes.title

        for shape in slide.shapes:
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
            ):
                alt_text = ""
                with suppress(AttributeError):
                    # Alt text lives in the shape's non-visual properties element.
                    alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                # Derive a link target from the shape name. Bug fix: `filename` was
                # computed but unused and the image link pointed at a literal
                # placeholder instead of the filename.
                filename = re.sub(r"\W", "", shape.name) + ".jpg"
                md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

            elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                html_table = "<table>"
                first_row = True

                for row in shape.table.rows:
                    html_table += "<tr>"

                    for cell in row.cells:
                        # Header cells for the first row only; cell text is HTML-escaped.
                        tag = "th" if first_row else "td"
                        html_table += f"<{tag}>{escape(cell.text)}</{tag}>"

                    html_table += "</tr>"
                    first_row = False

                html_table += "</table>"
                md_content += "\n" + html_table + "\n"

            elif shape.has_text_frame:
                # The slide title becomes a level-1 heading; other frames are plain text.
                md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"

        md_content = md_content.strip()
        if slide.has_notes_slide:
            md_content += "\n\n### Notes:\n"
            notes_frame = slide.notes_slide.notes_text_frame

            if notes_frame is not None:  # pragma: no branch
                md_content += notes_frame.text

        md_content = md_content.strip()

    return ExtractionResult(content=normalize_spaces(md_content), mime_type=MARKDOWN_MIME_TYPE, metadata={})
kreuzberg/_string.py CHANGED
@@ -4,8 +4,6 @@ from contextlib import suppress
4
4
 
5
5
  from charset_normalizer import detect
6
6
 
7
- from kreuzberg.exceptions import ParsingError
8
-
9
7
 
10
8
  def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
11
9
  """Decode a byte string safely, removing invalid sequences.
@@ -14,22 +12,21 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
14
12
  byte_data: The byte string to decode.
15
13
  encoding: The encoding to use when decoding the byte string.
16
14
 
17
- Raises:
18
- ParsingError: If the byte string could not be decoded.
19
-
20
15
  Returns:
21
16
  The decoded string.
22
17
  """
23
18
  if not byte_data:
24
19
  return ""
25
20
 
26
- encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]
21
+ # We try each encoding in order until one works
22
+ encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
27
23
 
28
- for enc in [e for e in encodings if e]:
24
+ for enc in [e for e in encodings if e]: # pragma: no cover
29
25
  with suppress(UnicodeDecodeError):
30
26
  return byte_data.decode(enc)
31
27
 
32
- raise ParsingError("Could not decode byte string. Please provide an encoding.")
28
+ # If all encodings fail, fall back to latin-1 which can handle any byte
29
+ return byte_data.decode("latin-1", errors="replace")
33
30
 
34
31
 
35
32
  def normalize_spaces(text: str) -> str:
kreuzberg/_sync.py CHANGED
@@ -1,14 +1,19 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import sys
3
4
  from functools import partial
4
5
  from typing import TYPE_CHECKING, TypeVar, cast
5
6
 
6
7
  from anyio.to_thread import run_sync as any_io_run_sync
7
- from typing_extensions import ParamSpec
8
8
 
9
9
  if TYPE_CHECKING: # pragma: no cover
10
10
  from collections.abc import Callable
11
11
 
12
+ if sys.version_info >= (3, 10):
13
+ from typing import ParamSpec
14
+ else: # pragma: no cover
15
+ from typing_extensions import ParamSpec
16
+
12
17
  T = TypeVar("T")
13
18
  P = ParamSpec("P")
14
19