kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_extractors/__init__.py +0 -0
- kreuzberg/_extractors/_base.py +92 -0
- kreuzberg/_extractors/_html.py +34 -0
- kreuzberg/_extractors/_image.py +74 -0
- kreuzberg/_extractors/_pandoc.py +613 -0
- kreuzberg/_extractors/_pdf.py +163 -0
- kreuzberg/_extractors/_presentation.py +233 -0
- kreuzberg/_extractors/_spread_sheet.py +125 -0
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_ocr/__init__.py +17 -0
- kreuzberg/_ocr/_base.py +54 -0
- kreuzberg/_ocr/_easyocr.py +376 -0
- kreuzberg/_ocr/_paddleocr.py +291 -0
- kreuzberg/_ocr/_tesseract.py +342 -0
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/_utils/__init__.py +0 -0
- kreuzberg/{_string.py → _utils/_string.py} +0 -2
- kreuzberg/_utils/_sync.py +121 -0
- kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.1.dist-info/METADATA +178 -0
- kreuzberg-3.0.1.dist-info/RECORD +32 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.2.dist-info/METADATA +0 -446
- kreuzberg-2.1.2.dist-info/RECORD +0 -21
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,163 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from multiprocessing import cpu_count
|
4
|
+
from re import Pattern
|
5
|
+
from re import compile as compile_regex
|
6
|
+
from typing import TYPE_CHECKING, ClassVar, cast
|
7
|
+
|
8
|
+
import anyio
|
9
|
+
import pypdfium2
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
|
12
|
+
from kreuzberg._extractors._base import Extractor
|
13
|
+
from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
|
14
|
+
from kreuzberg._ocr import get_ocr_backend
|
15
|
+
from kreuzberg._playa import extract_pdf_metadata
|
16
|
+
from kreuzberg._types import ExtractionResult, OcrBackendType
|
17
|
+
from kreuzberg._utils._string import normalize_spaces
|
18
|
+
from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
|
19
|
+
from kreuzberg._utils._tmp import create_temp_file
|
20
|
+
from kreuzberg.exceptions import ParsingError
|
21
|
+
|
22
|
+
if TYPE_CHECKING: # pragma: no cover
|
23
|
+
from pathlib import Path
|
24
|
+
|
25
|
+
from PIL.Image import Image
|
26
|
+
|
27
|
+
|
28
|
+
class PDFExtractor(Extractor):
    """Extractor for PDF documents.

    Strategy: unless ``force_ocr`` is set, first try the PDF's searchable
    text layer (via pypdfium2). If that text is missing or looks corrupted,
    fall back to OCR over rendered page images when an OCR backend is
    configured; otherwise return an empty result.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
    # Control characters (excluding \t, \n, \r) and the Unicode replacement
    # character are strong indicators of a corrupted text layer.
    CORRUPTED_PATTERN: ClassVar[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
    # Texts shorter than this are judged by an absolute corruption count
    # instead of a ratio, since a ratio is unstable for tiny samples.
    SHORT_TEXT_THRESHOLD: ClassVar[int] = 50
    MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract text from PDF bytes.

        Spools the bytes to a temporary file and delegates to
        ``extract_path_async``, which extracts and attaches the PDF metadata
        itself. (The previous implementation additionally parsed the metadata
        from ``content`` here and overwrote the identical metadata on the
        result — a redundant second full metadata parse per call.)

        Args:
            content: The raw bytes of the PDF document.

        Returns:
            The extraction result, including PDF metadata.
        """
        file_path, unlink = await create_temp_file(".pdf")
        await AsyncPath(file_path).write_bytes(content)
        try:
            return await self.extract_path_async(file_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract text from a PDF file on disk.

        Args:
            path: Path to the PDF file.

        Returns:
            The extraction result, including PDF metadata. Content is empty
            when no usable text layer exists and no OCR backend is configured.
        """
        content_bytes = await AsyncPath(path).read_bytes()
        metadata = await extract_pdf_metadata(content_bytes)

        if not self.config.force_ocr:
            content = await self._extract_pdf_searchable_text(path)
            if self._validate_extracted_text(content):
                return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])

        if self.config.ocr_backend is not None:
            result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
            result.metadata = metadata
            return result

        # No valid text layer and no OCR backend available.
        return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper around ``extract_bytes_async``."""
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper around ``extract_path_async``."""
        return anyio.run(self.extract_path_async, path)

    def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
        """Check if text extracted from PDF is valid or corrupted.

        This checks for indicators of corrupted PDF text extraction:
        1. Empty or whitespace-only text
        2. High concentration of control characters and null bytes
        3. High concentration of Unicode replacement characters

        Args:
            text: The extracted text to validate.
            corruption_threshold: Maximum allowed fraction (0.0-1.0) of
                corrupted characters (default: 0.05, i.e. 5%).

        Returns:
            True if the text appears valid, False if it seems corrupted.
        """
        if not text or not text.strip():
            return False

        corruption_matches = self.CORRUPTED_PATTERN.findall(text)

        # Short texts: use an absolute count rather than a ratio.
        if len(text) < self.SHORT_TEXT_THRESHOLD:
            return len(corruption_matches) <= self.MINIMUM_CORRUPTED_RESULTS

        return (len(corruption_matches) / len(text)) < corruption_threshold

    async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
        """Convert a PDF file to images.

        Args:
            input_file: The path to the PDF file.

        Raises:
            ParsingError: If the PDF file could not be converted to images.

        Returns:
            A list of Pillow Images, one per page.
        """
        document: pypdfium2.PdfDocument | None = None
        try:
            document = await run_sync(pypdfium2.PdfDocument, str(input_file))
            # scale=4.25 renders at high resolution to improve OCR accuracy.
            return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
        except pypdfium2.PdfiumError as e:
            raise ParsingError(
                "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
            ) from e
        finally:
            if document:
                await run_sync(document.close)

    async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
        """Extract text from a scanned PDF file using OCR.

        Args:
            input_file: The path to the PDF file.
            ocr_backend: The OCR backend to use.

        Returns:
            The extraction result with the OCR'd text of all pages joined by
            newlines; metadata is left empty for the caller to fill in.
        """
        images = await self._convert_pdf_to_images(input_file)
        backend = get_ocr_backend(ocr_backend)
        # Batch by CPU count to bound the number of concurrent OCR tasks.
        ocr_results = await run_taskgroup_batched(
            *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
            batch_size=cpu_count(),
        )
        return ExtractionResult(
            content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
        )

    @staticmethod
    async def _extract_pdf_searchable_text(input_file: Path) -> str:
        """Extract text from a searchable PDF file using pypdfium2.

        Args:
            input_file: The path to the PDF file.

        Raises:
            ParsingError: If the text could not be extracted from the PDF file.

        Returns:
            The extracted text with whitespace normalized.
        """
        document: pypdfium2.PdfDocument | None = None
        try:
            document = await run_sync(pypdfium2.PdfDocument, str(input_file))
            text = "\n".join(page.get_textpage().get_text_bounded() for page in cast("pypdfium2.PdfDocument", document))
            return normalize_spaces(text)
        except pypdfium2.PdfiumError as e:
            raise ParsingError(
                "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
            ) from e
        finally:
            if document:
                await run_sync(document.close)
|
@@ -0,0 +1,233 @@
|
|
1
|
+
"""This module provides functions to extract textual content from files.
|
2
|
+
|
3
|
+
It includes vendored code:
|
4
|
+
|
5
|
+
- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
|
6
|
+
See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
|
7
|
+
Refer to the markitdown repository for its license (MIT).
|
8
|
+
"""
|
9
|
+
|
10
|
+
from __future__ import annotations
|
11
|
+
|
12
|
+
import re
|
13
|
+
from contextlib import suppress
|
14
|
+
from html import escape
|
15
|
+
from io import BytesIO
|
16
|
+
from pathlib import Path
|
17
|
+
from typing import TYPE_CHECKING, ClassVar
|
18
|
+
|
19
|
+
import pptx
|
20
|
+
from anyio import Path as AsyncPath
|
21
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
22
|
+
|
23
|
+
from kreuzberg._extractors._base import Extractor
|
24
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
|
25
|
+
from kreuzberg._types import ExtractionResult
|
26
|
+
from kreuzberg._utils._string import normalize_spaces
|
27
|
+
|
28
|
+
if TYPE_CHECKING: # pragma: no cover
|
29
|
+
from pptx.presentation import Presentation
|
30
|
+
|
31
|
+
from kreuzberg._types import Metadata
|
32
|
+
|
33
|
+
|
34
|
+
class PresentationExtractor(Extractor):
    """Extractor for PowerPoint (.pptx) files.

    This extractor processes PowerPoint presentations and converts their content into Markdown format.
    It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
    of the presentation in a readable text format.

    The extractor provides both synchronous and asynchronous methods for processing files either
    from disk or from bytes in memory.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Asynchronously extract content from PowerPoint file bytes.

        Args:
            content: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        return self._extract_pptx(content)

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Asynchronously extract content from a PowerPoint file on disk.

        Args:
            path: Path to the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        content = await AsyncPath(path).read_bytes()
        return self._extract_pptx(content)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronously extract content from PowerPoint file bytes.

        Args:
            content: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        return self._extract_pptx(content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronously extract content from a PowerPoint file on disk.

        Args:
            path: Path to the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        content = Path(path).read_bytes()
        return self._extract_pptx(content)

    def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
        """Process PowerPoint file contents and convert to Markdown.

        This method handles the core logic of extracting content from a PowerPoint file.
        It processes:
        - Slide titles and content
        - Images (with alt text if available)
        - Tables (converted to HTML format)
        - Text frames
        - Slide notes

        Args:
            file_contents: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.

        Notes:
            The extraction preserves the following elements:
            - Slide numbers (as HTML comments)
            - Images (converted to Markdown image syntax with alt text)
            - Tables (converted to HTML table syntax)
            - Text content (with titles properly formatted)
            - Slide notes (under a dedicated section for each slide)
        """
        md_content = ""
        presentation = pptx.Presentation(BytesIO(file_contents))

        for index, slide in enumerate(presentation.slides):
            md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"

            title = None
            if hasattr(slide.shapes, "title"):
                title = slide.shapes.title

            for shape in slide.shapes:
                if not hasattr(shape, "shape_type"):
                    continue

                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                    shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
                ):
                    alt_text = ""
                    with suppress(AttributeError):
                        # python-pptx has no public alt-text API; read it from
                        # the underlying XML element.
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    # Emit a Markdown image reference. Previously alt_text and
                    # filename were computed but never used because an empty
                    # f-string was appended instead.
                    md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"

                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    html_table = "<table>"
                    first_row = True

                    for row in shape.table.rows:
                        html_table += "<tr>"

                        for cell in row.cells:
                            # Header cells for the first row only.
                            tag = "th" if first_row else "td"
                            html_table += f"<{tag}>{escape(cell.text)}</{tag}>"

                        html_table += "</tr>"
                        first_row = False

                    html_table += "</table>"
                    md_content += "\n" + html_table + "\n"

                elif shape.has_text_frame:
                    # The slide title becomes a top-level Markdown heading.
                    md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"

            md_content = md_content.strip()
            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame

                if notes_frame is not None:  # pragma: no branch
                    md_content += notes_frame.text

                md_content = md_content.strip()

        return ExtractionResult(
            content=normalize_spaces(md_content),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata=self._extract_presentation_metadata(presentation),
            chunks=[],
        )

    @staticmethod
    def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
        """Extract metadata from a presentation instance.

        Args:
            presentation: A `Presentation` object representing the PowerPoint file.

        Returns:
            Metadata: Object containing presentation-specific metadata fields.
        """
        metadata: Metadata = {}

        for metadata_key, core_property_key in [
            ("authors", "author"),
            ("comments", "comments"),
            ("status", "content_status"),
            ("created_by", "created"),
            ("identifier", "identifier"),
            ("keywords", "keywords"),
            ("modified_by", "last_modified_by"),
            ("modified_at", "modified"),
            ("version", "revision"),  # if version and revision are given, version overwrites ~keep
            ("subject", "subject"),
            ("title", "title"),
            ("version", "version"),
        ]:
            if core_property := getattr(presentation.core_properties, core_property_key, None):
                metadata[metadata_key] = core_property  # type: ignore[literal-required]

        if presentation.core_properties.language:
            metadata["languages"] = [presentation.core_properties.language]

        if presentation.core_properties.category:
            metadata["categories"] = [presentation.core_properties.category]

        # Collect the set of distinct font names used by any text run.
        fonts = set()
        for slide in presentation.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text_frame"):
                    continue

                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        if hasattr(run, "font") and run.font.name:
                            fonts.add(run.font.name)

        if fonts:
            metadata["fonts"] = list(fonts)

        return metadata
|
@@ -0,0 +1,125 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import csv
|
4
|
+
import sys
|
5
|
+
from datetime import date, datetime, time, timedelta
|
6
|
+
from io import StringIO
|
7
|
+
from typing import TYPE_CHECKING, Any, Union
|
8
|
+
|
9
|
+
import anyio
|
10
|
+
from anyio import Path as AsyncPath
|
11
|
+
from python_calamine import CalamineWorkbook
|
12
|
+
|
13
|
+
from kreuzberg._extractors._base import Extractor
|
14
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
|
15
|
+
from kreuzberg._types import ExtractionResult
|
16
|
+
from kreuzberg._utils._string import normalize_spaces
|
17
|
+
from kreuzberg._utils._sync import run_sync, run_taskgroup
|
18
|
+
from kreuzberg._utils._tmp import create_temp_file
|
19
|
+
from kreuzberg.exceptions import ParsingError
|
20
|
+
|
21
|
+
if TYPE_CHECKING: # pragma: no cover
|
22
|
+
from pathlib import Path
|
23
|
+
|
24
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
25
|
+
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
26
|
+
|
27
|
+
|
28
|
+
CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
|
29
|
+
|
30
|
+
|
31
|
+
class SpreadSheetExtractor(Extractor):
    """Extractor for spreadsheet files (xlsx and friends).

    Each worksheet is converted to a Markdown table (one section per sheet)
    using python-calamine to read cell values.
    """

    SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract content from spreadsheet bytes by spooling to a temp file.

        Args:
            content: Raw bytes of the spreadsheet.

        Returns:
            The extraction result in Markdown format.
        """
        xlsx_path, unlink = await create_temp_file(".xlsx")
        await AsyncPath(xlsx_path).write_bytes(content)
        try:
            return await self.extract_path_async(xlsx_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract content from a spreadsheet file on disk.

        All sheets are converted concurrently and joined with blank lines.

        Args:
            path: Path to the spreadsheet file.

        Raises:
            ParsingError: If the workbook or any sheet could not be processed.

        Returns:
            The extraction result in Markdown format.
        """
        try:
            workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(path))
            tasks = [self._convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]

            try:
                results: list[str] = await run_taskgroup(*tasks)

                return ExtractionResult(
                    content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[]
                )
            except ExceptionGroup as eg:
                raise ParsingError(
                    "Failed to extract file data",
                    context={"file": str(path), "errors": eg.exceptions},
                ) from eg
        except Exception as e:
            # Re-raise our own error untouched; wrap anything else.
            if isinstance(e, ParsingError):
                raise
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(path), "error": str(e)},
            ) from e

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper around ``extract_bytes_async``."""
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper around ``extract_path_async``."""
        return anyio.run(self.extract_path_async, path)

    @staticmethod
    def _convert_cell_to_str(value: Any) -> str:
        """Convert a cell value to string representation.

        Args:
            value: The cell value to convert.

        Returns:
            String representation of the cell value.
        """
        if value is None:
            return ""
        if isinstance(value, bool):
            # Spreadsheet convention: lowercase true/false.
            return str(value).lower()
        if isinstance(value, (datetime, date, time)):
            return value.isoformat()
        if isinstance(value, timedelta):
            return f"{value.total_seconds()} seconds"
        return str(value)

    async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
        """Render a single worksheet as a Markdown table headed by the sheet name.

        The first row is treated as the header; shorter data rows are padded
        with empty cells to the header width.

        (The previous implementation wrote the data to a temporary CSV file
        that was never read back, and round-tripped every row through a csv
        writer/reader pair — an identity transform on the converted strings.
        Both have been removed.)

        Args:
            workbook: The open workbook to read from.
            sheet_name: Name of the sheet to convert.

        Returns:
            A Markdown section containing the sheet's contents as a table.
        """
        values = workbook.get_sheet_by_name(sheet_name).to_python()
        rows = [[self._convert_cell_to_str(cell) for cell in row] for row in values]

        result = ""
        if rows:
            header = rows[0]
            markdown_lines: list[str] = [
                "| " + " | ".join(header) + " |",
                "| " + " | ".join(["---" for _ in header]) + " |",
            ]

            for row in rows[1:]:
                # Pad short rows to the header width (no-op for longer rows).
                padded = row + [""] * (len(header) - len(row))
                markdown_lines.append("| " + " | ".join(padded) + " |")

            result = "\n".join(markdown_lines)

        return f"## {sheet_name}\n\n{normalize_spaces(result)}"
|
kreuzberg/_mime_types.py
CHANGED
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
|
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
18
|
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
19
|
-
|
19
|
+
|
20
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
21
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
22
22
|
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
|
|
24
24
|
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
25
25
|
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
26
26
|
|
27
|
-
|
28
|
-
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
27
|
+
|
28
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
29
29
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
30
30
|
|
31
31
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
|
|
48
48
|
"image/x-portable-pixmap",
|
49
49
|
"image/x-tiff",
|
50
50
|
}
|
51
|
-
|
52
|
-
"image/bmp": "bmp",
|
53
|
-
"image/x-bmp": "bmp",
|
54
|
-
"image/x-ms-bmp": "bmp",
|
55
|
-
"image/gif": "gif",
|
56
|
-
"image/jpeg": "jpg",
|
57
|
-
"image/pjpeg": "jpg",
|
58
|
-
"image/png": "png",
|
59
|
-
"image/tiff": "tiff",
|
60
|
-
"image/x-tiff": "tiff",
|
61
|
-
"image/jp2": "jp2",
|
62
|
-
"image/jpx": "jpx",
|
63
|
-
"image/jpm": "jpm",
|
64
|
-
"image/mj2": "mj2",
|
65
|
-
"image/webp": "webp",
|
66
|
-
"image/x-portable-anymap": "pnm",
|
67
|
-
"image/x-portable-bitmap": "pbm",
|
68
|
-
"image/x-portable-graymap": "pgm",
|
69
|
-
"image/x-portable-pixmap": "ppm",
|
70
|
-
}
|
51
|
+
|
71
52
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
72
53
|
"application/csl+json",
|
73
54
|
"application/docbook+xml",
|
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
162
143
|
)
|
163
144
|
|
164
145
|
|
165
|
-
def validate_mime_type(
|
146
|
+
def validate_mime_type(
|
147
|
+
*, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
|
148
|
+
) -> str:
|
166
149
|
"""Validate and detect the MIME type for a given file.
|
167
150
|
|
168
151
|
Args:
|
169
152
|
file_path: The path to the file.
|
170
153
|
mime_type: Optional explicit MIME type. If provided, this will be validated.
|
171
154
|
If not provided, the function will attempt to detect the MIME type.
|
155
|
+
check_file_exists: Whether to check if the file exists. Default is True.
|
156
|
+
Set to False in tests where you want to validate a mime type without an actual file.
|
172
157
|
|
173
158
|
Raises:
|
174
159
|
ValidationError: If the MIME type is not supported or cannot be determined.
|
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
|
|
176
161
|
Returns:
|
177
162
|
The validated MIME type.
|
178
163
|
"""
|
179
|
-
|
164
|
+
if file_path and check_file_exists:
|
165
|
+
path = Path(file_path)
|
166
|
+
if not path.exists():
|
167
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
180
168
|
|
181
169
|
if not mime_type:
|
182
|
-
|
170
|
+
if not file_path:
|
171
|
+
raise ValidationError(
|
172
|
+
"Could not determine mime type.",
|
173
|
+
)
|
174
|
+
path = Path(file_path)
|
175
|
+
|
183
176
|
ext = path.suffix.lower()
|
184
177
|
mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
|
185
178
|
|
@@ -0,0 +1,17 @@
|
|
1
|
+
from functools import lru_cache
|
2
|
+
from typing import Any
|
3
|
+
|
4
|
+
from kreuzberg._ocr._base import OCRBackend
|
5
|
+
from kreuzberg._ocr._easyocr import EasyOCRBackend
|
6
|
+
from kreuzberg._ocr._paddleocr import PaddleBackend
|
7
|
+
from kreuzberg._ocr._tesseract import TesseractBackend
|
8
|
+
from kreuzberg._types import OcrBackendType
|
9
|
+
|
10
|
+
|
11
|
+
@lru_cache
def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:
    """Resolve an OCR backend name to a cached backend instance.

    Args:
        backend: The backend identifier ("easyocr", "paddleocr", or "tesseract").

    Returns:
        A memoized OCRBackend instance; Tesseract is used for any value
        other than "easyocr" or "paddleocr".
    """
    if backend == "easyocr":
        chosen: OCRBackend[Any] = EasyOCRBackend()
    elif backend == "paddleocr":
        chosen = PaddleBackend()
    else:
        chosen = TesseractBackend()
    return chosen
|
kreuzberg/_ocr/_base.py
ADDED
@@ -0,0 +1,54 @@
|
|
1
|
+
from abc import ABC, abstractmethod
|
2
|
+
from pathlib import Path
|
3
|
+
from typing import Generic, TypeVar
|
4
|
+
|
5
|
+
from PIL.Image import Image
|
6
|
+
|
7
|
+
from kreuzberg._types import ExtractionResult
|
8
|
+
|
9
|
+
try: # pragma: no cover
|
10
|
+
from typing import Unpack # type: ignore[attr-defined]
|
11
|
+
except ImportError: # pragma: no cover
|
12
|
+
from typing_extensions import Unpack
|
13
|
+
|
14
|
+
|
15
|
+
T = TypeVar("T")
|
16
|
+
|
17
|
+
|
18
|
+
class OCRBackend(ABC, Generic[T]):
    """Common interface that every OCR engine implementation must satisfy.

    Concrete backends supply asynchronous entry points for recognizing text
    either in an in-memory image or in a file on disk, each returning an
    ExtractionResult.
    """

    @abstractmethod
    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
        """Run OCR over a PIL image and return the extracted text and metadata.

        Args:
            image: The PIL image to recognize text in.
            **kwargs: Backend-specific configuration options.

        Returns:
            The extraction result object.
        """
        ...

    @abstractmethod
    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
        """Run OCR over a file on disk and return the extracted text and metadata.

        Args:
            path: Path of the file to recognize text in.
            **kwargs: Backend-specific configuration options.

        Returns:
            The extraction result object.
        """
        ...

    def __hash__(self) -> int:
        """Hash by class name so backend instances can be used in caches."""
        return hash(self.__class__.__name__)
|