kreuzberg 2.1.2__py3-none-any.whl → 3.0.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. kreuzberg/__init__.py +16 -2
  2. kreuzberg/_chunker.py +51 -0
  3. kreuzberg/_constants.py +2 -3
  4. kreuzberg/_extractors/__init__.py +0 -0
  5. kreuzberg/_extractors/_base.py +92 -0
  6. kreuzberg/_extractors/_html.py +34 -0
  7. kreuzberg/_extractors/_image.py +74 -0
  8. kreuzberg/_extractors/_pandoc.py +613 -0
  9. kreuzberg/_extractors/_pdf.py +163 -0
  10. kreuzberg/_extractors/_presentation.py +233 -0
  11. kreuzberg/_extractors/_spread_sheet.py +125 -0
  12. kreuzberg/_mime_types.py +19 -26
  13. kreuzberg/_ocr/__init__.py +17 -0
  14. kreuzberg/_ocr/_base.py +54 -0
  15. kreuzberg/_ocr/_easyocr.py +376 -0
  16. kreuzberg/_ocr/_paddleocr.py +291 -0
  17. kreuzberg/_ocr/_tesseract.py +342 -0
  18. kreuzberg/_playa.py +276 -0
  19. kreuzberg/_registry.py +108 -0
  20. kreuzberg/_types.py +133 -36
  21. kreuzberg/_utils/__init__.py +0 -0
  22. kreuzberg/{_string.py → _utils/_string.py} +0 -2
  23. kreuzberg/_utils/_sync.py +121 -0
  24. kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
  25. kreuzberg/exceptions.py +25 -0
  26. kreuzberg/extraction.py +114 -227
  27. kreuzberg-3.0.1.dist-info/METADATA +178 -0
  28. kreuzberg-3.0.1.dist-info/RECORD +32 -0
  29. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
  30. kreuzberg/_html.py +0 -31
  31. kreuzberg/_pandoc.py +0 -366
  32. kreuzberg/_pdf.py +0 -190
  33. kreuzberg/_pptx.py +0 -88
  34. kreuzberg/_sync.py +0 -74
  35. kreuzberg/_tesseract.py +0 -231
  36. kreuzberg/_xlsx.py +0 -88
  37. kreuzberg-2.1.2.dist-info/METADATA +0 -446
  38. kreuzberg-2.1.2.dist-info/RECORD +0 -21
  39. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
  40. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,163 @@
1
+ from __future__ import annotations
2
+
3
+ from multiprocessing import cpu_count
4
+ from re import Pattern
5
+ from re import compile as compile_regex
6
+ from typing import TYPE_CHECKING, ClassVar, cast
7
+
8
+ import anyio
9
+ import pypdfium2
10
+ from anyio import Path as AsyncPath
11
+
12
+ from kreuzberg._extractors._base import Extractor
13
+ from kreuzberg._mime_types import PDF_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
14
+ from kreuzberg._ocr import get_ocr_backend
15
+ from kreuzberg._playa import extract_pdf_metadata
16
+ from kreuzberg._types import ExtractionResult, OcrBackendType
17
+ from kreuzberg._utils._string import normalize_spaces
18
+ from kreuzberg._utils._sync import run_sync, run_taskgroup_batched
19
+ from kreuzberg._utils._tmp import create_temp_file
20
+ from kreuzberg.exceptions import ParsingError
21
+
22
+ if TYPE_CHECKING: # pragma: no cover
23
+ from pathlib import Path
24
+
25
+ from PIL.Image import Image
26
+
27
+
28
class PDFExtractor(Extractor):
    """Extractor for PDF documents.

    The embedded text layer is read with pypdfium2 first. When ``force_ocr`` is set in
    the config, or the text layer is empty or looks corrupted, the pages are rendered
    to images and handed to the configured OCR backend instead.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {PDF_MIME_TYPE}
    # Control characters other than tab, line feed and carriage return, plus U+FFFD.
    CORRUPTED_PATTERN: ClassVar[Pattern[str]] = compile_regex(r"[\x00-\x08\x0B-\x0C\x0E-\x1F]|\uFFFD")
    # Texts shorter than this are judged by an absolute match count instead of a ratio.
    SHORT_TEXT_THRESHOLD: ClassVar[int] = 50
    # Maximum number of corrupted matches tolerated in a short text.
    MINIMUM_CORRUPTED_RESULTS: ClassVar[int] = 2

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract text and metadata from in-memory PDF bytes.

        The bytes are written to a temporary ``.pdf`` file which is then processed by
        ``extract_path_async``; that method already attaches the PDF metadata to every
        result it returns, so no separate metadata pass is made here.

        Args:
            content: Raw bytes of the PDF file.

        Returns:
            The extraction result for the document.
        """
        file_path, unlink = await create_temp_file(".pdf")
        try:
            # Writing inside the ``try`` guarantees the temp file is removed even when the write fails.
            await AsyncPath(file_path).write_bytes(content)
            return await self.extract_path_async(file_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract text and metadata from a PDF file on disk.

        Args:
            path: Path to the PDF file.

        Returns:
            The extraction result. Its content is the searchable text when that is
            usable, otherwise the OCR output when an OCR backend is configured,
            otherwise an empty string.
        """
        content_bytes = await AsyncPath(path).read_bytes()
        metadata = await extract_pdf_metadata(content_bytes)

        if not self.config.force_ocr:
            content = await self._extract_pdf_searchable_text(path)
            if self._validate_extracted_text(content):
                return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])

        if self.config.ocr_backend is not None:
            result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)

            # The OCR helper returns empty metadata; replace it with the PDF's own.
            result.metadata = metadata
            return result

        return ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata=metadata, chunks=[])

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_bytes_async`` via ``anyio.run``.

        Args:
            content: Raw bytes of the PDF file.

        Returns:
            The extraction result for the document.
        """
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_path_async`` via ``anyio.run``.

        Args:
            path: Path to the PDF file.

        Returns:
            The extraction result for the document.
        """
        return anyio.run(self.extract_path_async, path)

    def _validate_extracted_text(self, text: str, corruption_threshold: float = 0.05) -> bool:
        """Check if text extracted from PDF is valid or corrupted.

        This checks for indicators of corrupted PDF text extraction:
        1. Empty or whitespace-only text
        2. High concentration of control characters and null bytes
        3. High concentration of Unicode replacement characters

        Args:
            text: The extracted text to validate
            corruption_threshold: Maximum allowed percentage (0.0-1.0) of corrupted
                characters (default: 0.05 or 5%)

        Returns:
            True if the text appears valid, False if it seems corrupted
        """
        if not text or not text.strip():
            return False

        corruption_matches = self.CORRUPTED_PATTERN.findall(text)

        # A ratio is meaningless for very short texts, so use an absolute cap there.
        if len(text) < self.SHORT_TEXT_THRESHOLD:
            return len(corruption_matches) <= self.MINIMUM_CORRUPTED_RESULTS

        return (len(corruption_matches) / len(text)) < corruption_threshold

    async def _convert_pdf_to_images(self, input_file: Path) -> list[Image]:
        """Convert a PDF file to images.

        Args:
            input_file: The path to the PDF file.

        Raises:
            ParsingError: If the PDF file could not be converted to images.

        Returns:
            A list of Pillow Images.
        """
        document: pypdfium2.PdfDocument | None = None
        try:
            document = await run_sync(pypdfium2.PdfDocument, str(input_file))
            return [page.render(scale=4.25).to_pil() for page in cast("pypdfium2.PdfDocument", document)]
        except pypdfium2.PdfiumError as e:
            raise ParsingError(
                "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
            ) from e
        finally:
            # Compare against None explicitly: PdfDocument defines __len__, so a
            # document with zero pages is falsy and would otherwise never be closed.
            if document is not None:
                await run_sync(document.close)

    async def _extract_pdf_text_with_ocr(self, input_file: Path, ocr_backend: OcrBackendType) -> ExtractionResult:
        """Extract text from a scanned PDF file using OCR.

        Args:
            input_file: The path to the PDF file.
            ocr_backend: The OCR backend to use.

        Returns:
            The extraction result with the per-page OCR text joined by newlines and
            empty metadata.
        """
        images = await self._convert_pdf_to_images(input_file)
        backend = get_ocr_backend(ocr_backend)
        # One OCR coroutine per page image, each receiving the keyword arguments from the config.
        ocr_results = await run_taskgroup_batched(
            *[backend.process_image(image, **self.config.get_config_dict()) for image in images],
            batch_size=cpu_count(),
        )
        return ExtractionResult(
            content="\n".join([v.content for v in ocr_results]), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
        )

    @staticmethod
    async def _extract_pdf_searchable_text(input_file: Path) -> str:
        """Extract text from a searchable PDF file using pypdfium2.

        Args:
            input_file: The path to the PDF file.

        Raises:
            ParsingError: If the text could not be extracted from the PDF file.

        Returns:
            The extracted text.
        """
        document: pypdfium2.PdfDocument | None = None
        try:
            document = await run_sync(pypdfium2.PdfDocument, str(input_file))
            text = "\n".join(page.get_textpage().get_text_bounded() for page in cast("pypdfium2.PdfDocument", document))
            return normalize_spaces(text)
        except pypdfium2.PdfiumError as e:
            raise ParsingError(
                "Could not extract text from PDF file", context={"file_path": str(input_file), "error": str(e)}
            ) from e
        finally:
            # Compare against None explicitly: PdfDocument defines __len__, so a
            # document with zero pages is falsy and would otherwise never be closed.
            if document is not None:
                await run_sync(document.close)
@@ -0,0 +1,233 @@
1
+ """This module provides functions to extract textual content from files.
2
+
3
+ It includes vendored code:
4
+
5
+ - The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
6
+ See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
7
+ Refer to the markitdown repository for its license (MIT).
8
+ """
9
+
10
+ from __future__ import annotations
11
+
12
+ import re
13
+ from contextlib import suppress
14
+ from html import escape
15
+ from io import BytesIO
16
+ from pathlib import Path
17
+ from typing import TYPE_CHECKING, ClassVar
18
+
19
+ import pptx
20
+ from anyio import Path as AsyncPath
21
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
22
+
23
+ from kreuzberg._extractors._base import Extractor
24
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, POWER_POINT_MIME_TYPE
25
+ from kreuzberg._types import ExtractionResult
26
+ from kreuzberg._utils._string import normalize_spaces
27
+
28
+ if TYPE_CHECKING: # pragma: no cover
29
+ from pptx.presentation import Presentation
30
+
31
+ from kreuzberg._types import Metadata
32
+
33
+
34
class PresentationExtractor(Extractor):
    """Extractor for PowerPoint (.pptx) files.

    This extractor processes PowerPoint presentations and converts their content into Markdown format.
    It handles slides, shapes, images, tables, and slide notes, preserving the structure and content
    of the presentation in a readable text format.

    The extractor provides both synchronous and asynchronous methods for processing files either
    from disk or from bytes in memory.
    """

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {POWER_POINT_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Asynchronously extract content from PowerPoint file bytes.

        Args:
            content: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        return self._extract_pptx(content)

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Asynchronously extract content from a PowerPoint file on disk.

        Args:
            path: Path to the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        content = await AsyncPath(path).read_bytes()
        return self._extract_pptx(content)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronously extract content from PowerPoint file bytes.

        Args:
            content: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        return self._extract_pptx(content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronously extract content from a PowerPoint file on disk.

        Args:
            path: Path to the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.
        """
        content = Path(path).read_bytes()
        return self._extract_pptx(content)

    def _extract_pptx(self, file_contents: bytes) -> ExtractionResult:
        """Process PowerPoint file contents and convert to Markdown.

        This method handles the core logic of extracting content from a PowerPoint file.
        It processes:
        - Slide titles and content
        - Images (with alt text if available)
        - Tables (converted to HTML format)
        - Text frames
        - Slide notes

        Args:
            file_contents: Raw bytes of the PowerPoint file to process.

        Returns:
            ExtractionResult: Contains the extracted content in Markdown format,
                the MIME type, and any additional metadata.

        Notes:
            The extraction preserves the following elements:
            - Slide numbers (as HTML comments)
            - Images (converted to Markdown image syntax with alt text)
            - Tables (converted to HTML table syntax)
            - Text content (with titles properly formatted)
            - Slide notes (under a dedicated section for each slide)
        """
        md_content = ""
        presentation = pptx.Presentation(BytesIO(file_contents))

        for slide_number, slide in enumerate(presentation.slides, start=1):
            md_content += f"\n\n<!-- Slide number: {slide_number} -->\n"

            title = None
            if hasattr(slide.shapes, "title"):
                title = slide.shapes.title

            for shape in slide.shapes:
                if not hasattr(shape, "shape_type"):
                    continue

                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
                    shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
                ):
                    # The alt text lives on a private element; fall back to the shape name when absent.
                    alt_text = ""
                    with suppress(AttributeError):
                        alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "")  # noqa: SLF001

                    filename = re.sub(r"\W", "", shape.name) + ".jpg"
                    md_content += f"\n![{alt_text or shape.name}]({filename})\n"

                elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
                    # The first row is rendered as header cells, all others as data cells.
                    html_rows = []
                    for row_index, row in enumerate(shape.table.rows):
                        tag = "th" if row_index == 0 else "td"
                        cells = "".join(f"<{tag}>{escape(cell.text)}</{tag}>" for cell in row.cells)
                        html_rows.append(f"<tr>{cells}</tr>")

                    md_content += "\n<table>" + "".join(html_rows) + "</table>\n"

                elif shape.has_text_frame:
                    if shape == title:
                        md_content += "# " + shape.text.lstrip() + "\n"
                    else:
                        md_content += shape.text + "\n"

            md_content = md_content.strip()
            if slide.has_notes_slide:
                md_content += "\n\n### Notes:\n"
                notes_frame = slide.notes_slide.notes_text_frame

                if notes_frame is not None:  # pragma: no branch
                    md_content += notes_frame.text

                md_content = md_content.strip()

        return ExtractionResult(
            content=normalize_spaces(md_content),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata=self._extract_presentation_metadata(presentation),
            chunks=[],
        )

    @staticmethod
    def _extract_presentation_metadata(presentation: Presentation) -> Metadata:
        """Extract metadata from a presentation instance.

        Core document properties that have a truthy value are copied into the result,
        followed by the language, the category and the names of all fonts used in text runs.

        Args:
            presentation: A `Presentation` object representing the PowerPoint file.

        Returns:
            Metadata: Mapping with the presentation metadata fields that were found.
        """
        metadata: Metadata = {}

        # NOTE(review): "created" is stored under "created_by"; it looks like a creation
        # timestamp rather than a person - confirm the intended metadata key.
        for metadata_key, core_property_key in [
            ("authors", "author"),
            ("comments", "comments"),
            ("status", "content_status"),
            ("created_by", "created"),
            ("identifier", "identifier"),
            ("keywords", "keywords"),
            ("modified_by", "last_modified_by"),
            ("modified_at", "modified"),
            ("version", "revision"),  # if version and revision are given, version overwrites ~keep
            ("subject", "subject"),
            ("title", "title"),
            ("version", "version"),
        ]:
            if core_property := getattr(presentation.core_properties, core_property_key, None):
                metadata[metadata_key] = core_property  # type: ignore[literal-required]

        if presentation.core_properties.language:
            metadata["languages"] = [presentation.core_properties.language]

        if presentation.core_properties.category:
            metadata["categories"] = [presentation.core_properties.category]

        fonts: set[str] = set()
        for slide in presentation.slides:
            for shape in slide.shapes:
                if not hasattr(shape, "text_frame"):
                    continue

                for paragraph in shape.text_frame.paragraphs:
                    for run in paragraph.runs:
                        if hasattr(run, "font") and run.font.name:
                            fonts.add(run.font.name)

        if fonts:
            # Sort so the list order is stable; set iteration order varies between runs.
            metadata["fonts"] = sorted(fonts)

        return metadata
@@ -0,0 +1,125 @@
1
+ from __future__ import annotations
2
+
3
+ import csv
4
+ import sys
5
+ from datetime import date, datetime, time, timedelta
6
+ from io import StringIO
7
+ from typing import TYPE_CHECKING, Any, Union
8
+
9
+ import anyio
10
+ from anyio import Path as AsyncPath
11
+ from python_calamine import CalamineWorkbook
12
+
13
+ from kreuzberg._extractors._base import Extractor
14
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE, SPREADSHEET_MIME_TYPES
15
+ from kreuzberg._types import ExtractionResult
16
+ from kreuzberg._utils._string import normalize_spaces
17
+ from kreuzberg._utils._sync import run_sync, run_taskgroup
18
+ from kreuzberg._utils._tmp import create_temp_file
19
+ from kreuzberg.exceptions import ParsingError
20
+
21
+ if TYPE_CHECKING: # pragma: no cover
22
+ from pathlib import Path
23
+
24
+ if sys.version_info < (3, 11): # pragma: no cover
25
+ from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
26
+
27
+
28
+ CellValue = Union[int, float, str, bool, time, date, datetime, timedelta]
29
+
30
+
31
class SpreadSheetExtractor(Extractor):
    """Extractor for spreadsheet files.

    Each sheet of the workbook is converted to a Markdown table placed under a
    ``## <sheet name>`` heading; the sheets are joined with blank lines.
    """

    SUPPORTED_MIME_TYPES = SPREADSHEET_MIME_TYPES

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract the sheets of an in-memory spreadsheet.

        The bytes are written to a temporary ``.xlsx`` file that is processed by
        ``extract_path_async`` and removed afterwards.

        Args:
            content: Raw bytes of the spreadsheet file.

        Returns:
            The extraction result with Markdown content.
        """
        xlsx_path, unlink = await create_temp_file(".xlsx")
        try:
            # Writing inside the ``try`` guarantees the temp file is removed even when the write fails.
            await AsyncPath(xlsx_path).write_bytes(content)
            return await self.extract_path_async(xlsx_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract the sheets of a spreadsheet file on disk.

        Args:
            path: Path to the spreadsheet file.

        Raises:
            ParsingError: If the workbook cannot be opened or a sheet cannot be converted.

        Returns:
            The extraction result with Markdown content and empty metadata.
        """
        try:
            workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(path))
            tasks = [self._convert_sheet_to_text(workbook, sheet_name) for sheet_name in workbook.sheet_names]

            try:
                results: list[str] = await run_taskgroup(*tasks)

                return ExtractionResult(
                    content="\n\n".join(results), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[]
                )
            except ExceptionGroup as eg:
                raise ParsingError(
                    "Failed to extract file data",
                    context={"file": str(path), "errors": eg.exceptions},
                ) from eg
        except ParsingError:
            # Already in the expected shape (e.g. raised for an ExceptionGroup above).
            raise
        except Exception as e:
            raise ParsingError(
                "Failed to extract file data",
                context={"file": str(path), "error": str(e)},
            ) from e

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_bytes_async`` via ``anyio.run``.

        Args:
            content: Raw bytes of the spreadsheet file.

        Returns:
            The extraction result with Markdown content.
        """
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper that runs ``extract_path_async`` via ``anyio.run``.

        Args:
            path: Path to the spreadsheet file.

        Returns:
            The extraction result with Markdown content.
        """
        return anyio.run(self.extract_path_async, path)

    @staticmethod
    def _convert_cell_to_str(value: Any) -> str:
        """Convert a cell value to string representation.

        ``None`` becomes an empty string, booleans become ``"true"``/``"false"``,
        date and time values use ISO format, and durations are written as seconds.

        Args:
            value: The cell value to convert.

        Returns:
            String representation of the cell value.
        """
        if value is None:
            return ""
        # bool must be tested before the generic fallback so it is lower-cased.
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, (datetime, date, time)):
            return value.isoformat()
        if isinstance(value, timedelta):
            return f"{value.total_seconds()} seconds"
        return str(value)

    async def _convert_sheet_to_text(self, workbook: CalamineWorkbook, sheet_name: str) -> str:
        """Render one sheet of the workbook as a Markdown table.

        The first row is used as the table header; later rows that are shorter than
        the header are padded with empty cells.

        Args:
            workbook: The opened workbook.
            sheet_name: Name of the sheet to convert.

        Returns:
            A ``## <sheet name>`` heading followed by the Markdown table (empty when
            the sheet has no rows).
        """
        values = workbook.get_sheet_by_name(sheet_name).to_python()

        # Serialise the stringified cells through the csv module and parse them back.
        csv_buffer = StringIO()
        writer = csv.writer(csv_buffer)
        for sheet_row in values:
            writer.writerow([self._convert_cell_to_str(cell) for cell in sheet_row])
        csv_data = csv_buffer.getvalue()
        csv_buffer.close()

        rows = list(csv.reader(StringIO(csv_data)))

        result = ""
        if rows:
            header = rows[0]
            markdown_lines: list[str] = [
                "| " + " | ".join(header) + " |",
                "| " + " | ".join(["---" for _ in header]) + " |",
            ]

            for cells in rows[1:]:
                # Pad short rows so every line has at least as many cells as the header.
                padded = cells + [""] * (len(header) - len(cells))
                markdown_lines.append("| " + " | ".join(padded) + " |")

            result = "\n".join(markdown_lines)

        return f"## {sheet_name}\n\n{normalize_spaces(result)}"
kreuzberg/_mime_types.py CHANGED
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
16
16
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
17
17
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
18
18
  DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
19
- # Excel formats
19
+
20
20
  EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
21
21
  EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
22
22
  EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
24
24
  EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
25
25
  EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
26
26
 
27
- # OpenDocument spreadsheet format
28
- OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet" # ods
27
+
28
+ OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
29
29
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
30
30
 
31
31
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
48
48
  "image/x-portable-pixmap",
49
49
  "image/x-tiff",
50
50
  }
51
- IMAGE_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
52
- "image/bmp": "bmp",
53
- "image/x-bmp": "bmp",
54
- "image/x-ms-bmp": "bmp",
55
- "image/gif": "gif",
56
- "image/jpeg": "jpg",
57
- "image/pjpeg": "jpg",
58
- "image/png": "png",
59
- "image/tiff": "tiff",
60
- "image/x-tiff": "tiff",
61
- "image/jp2": "jp2",
62
- "image/jpx": "jpx",
63
- "image/jpm": "jpm",
64
- "image/mj2": "mj2",
65
- "image/webp": "webp",
66
- "image/x-portable-anymap": "pnm",
67
- "image/x-portable-bitmap": "pbm",
68
- "image/x-portable-graymap": "pgm",
69
- "image/x-portable-pixmap": "ppm",
70
- }
51
+
71
52
  PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
72
53
  "application/csl+json",
73
54
  "application/docbook+xml",
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
162
143
  )
163
144
 
164
145
 
165
- def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = None) -> str:
146
+ def validate_mime_type(
147
+ *, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
148
+ ) -> str:
166
149
  """Validate and detect the MIME type for a given file.
167
150
 
168
151
  Args:
169
152
  file_path: The path to the file.
170
153
  mime_type: Optional explicit MIME type. If provided, this will be validated.
171
154
  If not provided, the function will attempt to detect the MIME type.
155
+ check_file_exists: Whether to check if the file exists. Default is True.
156
+ Set to False in tests where you want to validate a mime type without an actual file.
172
157
 
173
158
  Raises:
174
159
  ValidationError: If the MIME type is not supported or cannot be determined.
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
176
161
  Returns:
177
162
  The validated MIME type.
178
163
  """
179
- path = Path(file_path)
164
+ if file_path and check_file_exists:
165
+ path = Path(file_path)
166
+ if not path.exists():
167
+ raise ValidationError("The file does not exist", context={"file_path": str(path)})
180
168
 
181
169
  if not mime_type:
182
- # Try to determine MIME type from file extension first
170
+ if not file_path:
171
+ raise ValidationError(
172
+ "Could not determine mime type.",
173
+ )
174
+ path = Path(file_path)
175
+
183
176
  ext = path.suffix.lower()
184
177
  mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
185
178
 
@@ -0,0 +1,17 @@
1
+ from functools import lru_cache
2
+ from typing import Any
3
+
4
+ from kreuzberg._ocr._base import OCRBackend
5
+ from kreuzberg._ocr._easyocr import EasyOCRBackend
6
+ from kreuzberg._ocr._paddleocr import PaddleBackend
7
+ from kreuzberg._ocr._tesseract import TesseractBackend
8
+ from kreuzberg._types import OcrBackendType
9
+
10
+
11
@lru_cache
def get_ocr_backend(backend: OcrBackendType) -> OCRBackend[Any]:
    """Return an OCR backend instance for the given backend name.

    ``"easyocr"`` and ``"paddleocr"`` select their respective backends; any other
    value yields the Tesseract backend. Results are cached by ``lru_cache``.

    Args:
        backend: Name of the OCR backend to use.

    Returns:
        An instance of the selected OCR backend.
    """
    backend_class: type[OCRBackend[Any]] = TesseractBackend
    if backend == "easyocr":
        backend_class = EasyOCRBackend
    elif backend == "paddleocr":
        backend_class = PaddleBackend
    return backend_class()
@@ -0,0 +1,54 @@
1
+ from abc import ABC, abstractmethod
2
+ from pathlib import Path
3
+ from typing import Generic, TypeVar
4
+
5
+ from PIL.Image import Image
6
+
7
+ from kreuzberg._types import ExtractionResult
8
+
9
+ try: # pragma: no cover
10
+ from typing import Unpack # type: ignore[attr-defined]
11
+ except ImportError: # pragma: no cover
12
+ from typing_extensions import Unpack
13
+
14
+
15
+ T = TypeVar("T")
16
+
17
+
18
class OCRBackend(ABC, Generic[T]):
    """Abstract base class for Optical Character Recognition (OCR) backends.

    Concrete backends implement two asynchronous entry points: one that processes
    an in-memory image and one that processes a file on disk. ``T`` describes the
    keyword arguments accepted by the backend.
    """

    @abstractmethod
    async def process_image(self, image: Image, **kwargs: Unpack[T]) -> ExtractionResult:
        """Run OCR on an in-memory image.

        Args:
            image: The PIL image to process.
            **kwargs: Backend-specific keyword arguments.

        Returns:
            The extraction result object.
        """
        ...

    @abstractmethod
    async def process_file(self, path: Path, **kwargs: Unpack[T]) -> ExtractionResult:
        """Run OCR on a file.

        Args:
            path: Path of the file to process.
            **kwargs: Backend-specific keyword arguments.

        Returns:
            The extraction result object.
        """
        ...

    def __hash__(self) -> int:
        """Hash by class name so that backend instances can be used with caching."""
        class_name = self.__class__.__name__
        return hash(class_name)