kreuzberg 2.1.2__tar.gz → 3.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (52) hide show
  1. kreuzberg-3.0.1/PKG-INFO +178 -0
  2. kreuzberg-3.0.1/README.md +125 -0
  3. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/kreuzberg/__init__.py +16 -2
  4. kreuzberg-3.0.1/kreuzberg/_chunker.py +51 -0
  5. kreuzberg-3.0.1/kreuzberg/_constants.py +7 -0
  6. kreuzberg-3.0.1/kreuzberg/_extractors/_base.py +92 -0
  7. kreuzberg-3.0.1/kreuzberg/_extractors/_html.py +34 -0
  8. kreuzberg-3.0.1/kreuzberg/_extractors/_image.py +74 -0
  9. kreuzberg-3.0.1/kreuzberg/_extractors/_pandoc.py +613 -0
  10. kreuzberg-3.0.1/kreuzberg/_extractors/_pdf.py +163 -0
  11. kreuzberg-3.0.1/kreuzberg/_extractors/_presentation.py +233 -0
  12. kreuzberg-3.0.1/kreuzberg/_extractors/_spread_sheet.py +125 -0
  13. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/kreuzberg/_mime_types.py +19 -26
  14. kreuzberg-3.0.1/kreuzberg/_ocr/__init__.py +17 -0
  15. kreuzberg-3.0.1/kreuzberg/_ocr/_base.py +54 -0
  16. kreuzberg-3.0.1/kreuzberg/_ocr/_easyocr.py +376 -0
  17. kreuzberg-3.0.1/kreuzberg/_ocr/_paddleocr.py +291 -0
  18. kreuzberg-3.0.1/kreuzberg/_ocr/_tesseract.py +342 -0
  19. kreuzberg-3.0.1/kreuzberg/_playa.py +276 -0
  20. kreuzberg-3.0.1/kreuzberg/_registry.py +108 -0
  21. kreuzberg-3.0.1/kreuzberg/_types.py +168 -0
  22. kreuzberg-3.0.1/kreuzberg/_utils/__init__.py +0 -0
  23. {kreuzberg-2.1.2/kreuzberg → kreuzberg-3.0.1/kreuzberg/_utils}/_string.py +0 -2
  24. kreuzberg-3.0.1/kreuzberg/_utils/_sync.py +121 -0
  25. {kreuzberg-2.1.2/kreuzberg → kreuzberg-3.0.1/kreuzberg/_utils}/_tmp.py +1 -1
  26. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/kreuzberg/exceptions.py +25 -0
  27. kreuzberg-3.0.1/kreuzberg/extraction.py +251 -0
  28. kreuzberg-3.0.1/kreuzberg/py.typed +0 -0
  29. kreuzberg-3.0.1/kreuzberg.egg-info/PKG-INFO +178 -0
  30. kreuzberg-3.0.1/kreuzberg.egg-info/SOURCES.txt +35 -0
  31. kreuzberg-3.0.1/kreuzberg.egg-info/requires.txt +37 -0
  32. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/pyproject.toml +44 -12
  33. kreuzberg-2.1.2/PKG-INFO +0 -446
  34. kreuzberg-2.1.2/README.md +0 -411
  35. kreuzberg-2.1.2/kreuzberg/_constants.py +0 -8
  36. kreuzberg-2.1.2/kreuzberg/_html.py +0 -31
  37. kreuzberg-2.1.2/kreuzberg/_pandoc.py +0 -366
  38. kreuzberg-2.1.2/kreuzberg/_pdf.py +0 -190
  39. kreuzberg-2.1.2/kreuzberg/_pptx.py +0 -88
  40. kreuzberg-2.1.2/kreuzberg/_sync.py +0 -74
  41. kreuzberg-2.1.2/kreuzberg/_tesseract.py +0 -231
  42. kreuzberg-2.1.2/kreuzberg/_types.py +0 -71
  43. kreuzberg-2.1.2/kreuzberg/_xlsx.py +0 -88
  44. kreuzberg-2.1.2/kreuzberg/extraction.py +0 -364
  45. kreuzberg-2.1.2/kreuzberg.egg-info/PKG-INFO +0 -446
  46. kreuzberg-2.1.2/kreuzberg.egg-info/SOURCES.txt +0 -24
  47. kreuzberg-2.1.2/kreuzberg.egg-info/requires.txt +0 -12
  48. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/LICENSE +0 -0
  49. /kreuzberg-2.1.2/kreuzberg/py.typed → /kreuzberg-3.0.1/kreuzberg/_extractors/__init__.py +0 -0
  50. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/kreuzberg.egg-info/dependency_links.txt +0 -0
  51. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/kreuzberg.egg-info/top_level.txt +0 -0
  52. {kreuzberg-2.1.2 → kreuzberg-3.0.1}/setup.cfg +0 -0
@@ -0,0 +1,178 @@
1
+ Metadata-Version: 2.4
2
+ Name: kreuzberg
3
+ Version: 3.0.1
4
+ Summary: A text extraction library supporting PDFs, images, office documents and more
5
+ Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
+ License: MIT
7
+ Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
9
+ Classifier: Development Status :: 4 - Beta
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: License :: OSI Approved :: MIT License
12
+ Classifier: Operating System :: OS Independent
13
+ Classifier: Programming Language :: Python :: 3 :: Only
14
+ Classifier: Programming Language :: Python :: 3.9
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
20
+ Classifier: Topic :: Software Development :: Libraries :: Python Modules
21
+ Classifier: Topic :: Text Processing :: General
22
+ Classifier: Topic :: Utilities
23
+ Classifier: Typing :: Typed
24
+ Requires-Python: >=3.9
25
+ Description-Content-Type: text/markdown
26
+ License-File: LICENSE
27
+ Requires-Dist: anyio>=4.9.0
28
+ Requires-Dist: charset-normalizer>=3.4.1
29
+ Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
+ Requires-Dist: html-to-markdown>=1.2.0
31
+ Requires-Dist: playa-pdf>=0.4.1
32
+ Requires-Dist: pypdfium2==4.30.0
33
+ Requires-Dist: python-calamine>=0.3.1
34
+ Requires-Dist: python-pptx>=1.0.2
35
+ Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
36
+ Provides-Extra: all
37
+ Requires-Dist: easyocr>=1.7.2; extra == "all"
38
+ Requires-Dist: numpy>=2.0.2; extra == "all"
39
+ Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
+ Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
41
+ Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
42
+ Requires-Dist: setuptools>=76.0.0; extra == "all"
43
+ Provides-Extra: chunking
44
+ Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
45
+ Provides-Extra: easyocr
46
+ Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
47
+ Provides-Extra: paddleocr
48
+ Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
49
+ Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
50
+ Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
51
+ Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
+ Dynamic: license-file
53
+
54
+ # Kreuzberg
55
+
56
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
57
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
58
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
59
+
60
+ Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
61
+
62
+ ## Why Kreuzberg?
63
+
64
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
65
+ - **Local Processing**: No external API calls or cloud dependencies required
66
+ - **Resource Efficient**: Lightweight processing without GPU requirements
67
+ - **Format Support**: Comprehensive support for documents, images, and text formats
68
+ - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
69
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
70
+ - **Permissive OSS**: MIT licensed with permissively licensed dependencies
71
+
72
+ ## Quick Start
73
+
74
+ ```bash
75
+ pip install kreuzberg
76
+ ```
77
+
78
+ Install the system dependencies (Tesseract OCR and Pandoc):
79
+
80
+ ```bash
81
+ # Ubuntu/Debian
82
+ sudo apt-get install tesseract-ocr pandoc
83
+
84
+ # macOS
85
+ brew install tesseract pandoc
86
+
87
+ # Windows
88
+ choco install -y tesseract pandoc
89
+ ```
90
+
91
+ Tesseract is the default OCR engine. You can opt out of it and either use one of the two alternative OCR engines or run without OCR at all.
92
+
93
+ ### Alternative OCR engines
94
+
95
+ ```bash
96
+ # Install with EasyOCR support
97
+ pip install "kreuzberg[easyocr]"
98
+
99
+ # Install with PaddleOCR support
100
+ pip install "kreuzberg[paddleocr]"
101
+ ```
102
+
103
+ ## Quick Example
104
+
105
+ ```python
106
+ import asyncio
107
+ from kreuzberg import extract_file
108
+
109
+ async def main():
110
+ # Extract text from a PDF
111
+ result = await extract_file("document.pdf")
112
+ print(result.content)
113
+
114
+ # Extract text from an image
115
+ result = await extract_file("scan.jpg")
116
+ print(result.content)
117
+
118
+ # Extract text from a Word document
119
+ result = await extract_file("report.docx")
120
+ print(result.content)
121
+
122
+ asyncio.run(main())
123
+ ```
124
+
125
+ ## Documentation
126
+
127
+ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
128
+
129
+ - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
130
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
131
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
132
+ - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
133
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
134
+ - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
135
+
136
+ ## Supported Formats
137
+
138
+ Kreuzberg supports a wide range of document formats:
139
+
140
+ - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
141
+ - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
142
+ - **Spreadsheets**: XLSX, XLS, CSV, etc.
143
+ - **Presentations**: PPTX, PPT, etc.
144
+ - **Web Content**: HTML, XML, etc.
145
+
146
+ ## OCR Engines
147
+
148
+ Kreuzberg supports multiple OCR engines:
149
+
150
+ - **Tesseract** (Default): Lightweight, fast startup, requires system installation
151
+ - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
152
+ - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
153
+
154
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
155
+
156
+ ## Contribution
157
+
158
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
159
+
160
+ ### Local Development
161
+
162
+ 1. Clone the repo
163
+
164
+ 1. Install the system dependencies
165
+
166
+ 1. Install the full dependencies with `uv sync`
167
+
168
+ 1. Install the pre-commit hooks with:
169
+
170
+ ```shell
171
+ pre-commit install && pre-commit install --hook-type commit-msg
172
+ ```
173
+
174
+ 1. Make your changes and submit a PR
175
+
176
+ ## License
177
+
178
+ This library is released under the MIT license.
@@ -0,0 +1,125 @@
1
+ # Kreuzberg
2
+
3
+ [![PyPI version](https://badge.fury.io/py/kreuzberg.svg)](https://badge.fury.io/py/kreuzberg)
4
+ [![Documentation](https://img.shields.io/badge/docs-GitHub_Pages-blue)](https://goldziher.github.io/kreuzberg/)
5
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
6
+
7
+ Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
8
+
9
+ ## Why Kreuzberg?
10
+
11
+ - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
12
+ - **Local Processing**: No external API calls or cloud dependencies required
13
+ - **Resource Efficient**: Lightweight processing without GPU requirements
14
+ - **Format Support**: Comprehensive support for documents, images, and text formats
15
+ - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
16
+ - **Modern Python**: Built with async/await, type hints, and a functional-first approach
17
+ - **Permissive OSS**: MIT licensed with permissively licensed dependencies
18
+
19
+ ## Quick Start
20
+
21
+ ```bash
22
+ pip install kreuzberg
23
+ ```
24
+
25
+ Install the system dependencies (Tesseract OCR and Pandoc):
26
+
27
+ ```bash
28
+ # Ubuntu/Debian
29
+ sudo apt-get install tesseract-ocr pandoc
30
+
31
+ # macOS
32
+ brew install tesseract pandoc
33
+
34
+ # Windows
35
+ choco install -y tesseract pandoc
36
+ ```
37
+
38
+ Tesseract is the default OCR engine. You can opt out of it and either use one of the two alternative OCR engines or run without OCR at all.
39
+
40
+ ### Alternative OCR engines
41
+
42
+ ```bash
43
+ # Install with EasyOCR support
44
+ pip install "kreuzberg[easyocr]"
45
+
46
+ # Install with PaddleOCR support
47
+ pip install "kreuzberg[paddleocr]"
48
+ ```
49
+
50
+ ## Quick Example
51
+
52
+ ```python
53
+ import asyncio
54
+ from kreuzberg import extract_file
55
+
56
+ async def main():
57
+ # Extract text from a PDF
58
+ result = await extract_file("document.pdf")
59
+ print(result.content)
60
+
61
+ # Extract text from an image
62
+ result = await extract_file("scan.jpg")
63
+ print(result.content)
64
+
65
+ # Extract text from a Word document
66
+ result = await extract_file("report.docx")
67
+ print(result.content)
68
+
69
+ asyncio.run(main())
70
+ ```
71
+
72
+ ## Documentation
73
+
74
+ For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
75
+
76
+ - [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
77
+ - [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
78
+ - [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
79
+ - [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
80
+ - [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
81
+ - [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
82
+
83
+ ## Supported Formats
84
+
85
+ Kreuzberg supports a wide range of document formats:
86
+
87
+ - **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
88
+ - **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
89
+ - **Spreadsheets**: XLSX, XLS, CSV, etc.
90
+ - **Presentations**: PPTX, PPT, etc.
91
+ - **Web Content**: HTML, XML, etc.
92
+
93
+ ## OCR Engines
94
+
95
+ Kreuzberg supports multiple OCR engines:
96
+
97
+ - **Tesseract** (Default): Lightweight, fast startup, requires system installation
98
+ - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
99
+ - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
100
+
101
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
102
+
103
+ ## Contribution
104
+
105
+ This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
106
+
107
+ ### Local Development
108
+
109
+ 1. Clone the repo
110
+
111
+ 1. Install the system dependencies
112
+
113
+ 1. Install the full dependencies with `uv sync`
114
+
115
+ 1. Install the pre-commit hooks with:
116
+
117
+ ```shell
118
+ pre-commit install && pre-commit install --hook-type commit-msg
119
+ ```
120
+
121
+ 1. Make your changes and submit a PR
122
+
123
+ ## License
124
+
125
+ This library is released under the MIT license.
@@ -1,5 +1,10 @@
1
- from ._tesseract import PSMMode
2
- from ._types import ExtractionResult, Metadata
1
+ from kreuzberg._ocr._easyocr import EasyOCRConfig
2
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig
3
+ from kreuzberg._ocr._tesseract import TesseractConfig
4
+
5
+ from ._ocr._tesseract import PSMMode
6
+ from ._registry import ExtractorRegistry
7
+ from ._types import ExtractionConfig, ExtractionResult, Metadata
3
8
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
4
9
  from .extraction import (
5
10
  batch_extract_bytes,
@@ -7,22 +12,31 @@ from .extraction import (
7
12
  batch_extract_file,
8
13
  batch_extract_file_sync,
9
14
  extract_bytes,
15
+ extract_bytes_sync,
10
16
  extract_file,
17
+ extract_file_sync,
11
18
  )
12
19
 
13
20
  __all__ = [
21
+ "EasyOCRConfig",
22
+ "ExtractionConfig",
14
23
  "ExtractionResult",
24
+ "ExtractorRegistry",
15
25
  "KreuzbergError",
16
26
  "Metadata",
17
27
  "MissingDependencyError",
18
28
  "OCRError",
19
29
  "PSMMode",
30
+ "PaddleOCRConfig",
20
31
  "ParsingError",
32
+ "TesseractConfig",
21
33
  "ValidationError",
22
34
  "batch_extract_bytes",
23
35
  "batch_extract_bytes_sync",
24
36
  "batch_extract_file",
25
37
  "batch_extract_file_sync",
26
38
  "extract_bytes",
39
+ "extract_bytes_sync",
27
40
  "extract_file",
41
+ "extract_file_sync",
28
42
  ]
@@ -0,0 +1,51 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING
4
+
5
+ from kreuzberg import MissingDependencyError
6
+ from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
7
+ from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
8
+
9
+ if TYPE_CHECKING:
10
+ from semantic_text_splitter import MarkdownSplitter, TextSplitter
11
+
12
+ _chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
13
+
14
+
15
def get_chunker(
    mime_type: str,
    max_characters: int = DEFAULT_MAX_CHARACTERS,
    overlap_characters: int = DEFAULT_MAX_OVERLAP,
) -> MarkdownSplitter | TextSplitter:
    """Return the text splitter for a MIME type and chunk sizing, creating it on first use.

    Splitters are stored in the module-level ``_chunkers`` mapping under the key
    ``(max_characters, overlap_characters, mime_type)``, so repeated calls with the
    same arguments return the same instance.

    Args:
        mime_type: The mime type of the content. Markdown content gets a
            ``MarkdownSplitter``; every other mime type gets a ``TextSplitter``.
        max_characters: Maximum number of characters allowed in each chunk.
        overlap_characters: Number of characters overlapping between two consecutive chunks.

    Raises:
        MissingDependencyError: if semantic-text-splitter is not installed.

    Returns:
        The splitter stored for the requested configuration.
    """
    cache_key = (max_characters, overlap_characters, mime_type)

    # Fast path: a splitter for this exact configuration already exists.
    if cache_key in _chunkers:
        return _chunkers[cache_key]

    try:
        # The splitter classes are imported lazily, only when a new splitter is needed.
        if mime_type == MARKDOWN_MIME_TYPE:
            from semantic_text_splitter import MarkdownSplitter

            _chunkers[cache_key] = MarkdownSplitter(max_characters, overlap_characters)
        else:
            from semantic_text_splitter import TextSplitter

            _chunkers[cache_key] = TextSplitter(max_characters, overlap_characters)
    except ImportError as e:
        raise MissingDependencyError.create_for_package(
            dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
        ) from e

    return _chunkers[cache_key]
@@ -0,0 +1,7 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Final
4
+
5
+ MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
6
+ DEFAULT_MAX_CHARACTERS: Final[int] = 2000
7
+ DEFAULT_MAX_OVERLAP: Final[int] = 100
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, ClassVar
5
+
6
+ if TYPE_CHECKING:
7
+ from pathlib import Path
8
+
9
+ from kreuzberg import ExtractionResult
10
+ from kreuzberg._types import ExtractionConfig
11
+
12
+
13
class Extractor(ABC):
    """Abstract interface shared by all content extractors.

    Concrete extractors implement the four ``extract_*`` methods (bytes/path,
    async/sync) and declare which MIME types they handle.

    Attributes:
        SUPPORTED_MIME_TYPES: The set of supported mime types - every non-abstract
            extractor must define this.

    Args:
        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
        config: Configuration options for the extraction process.
    """

    __slots__ = ("config", "mime_type")

    SUPPORTED_MIME_TYPES: ClassVar[set[str]]

    def __init__(self, mime_type: str, config: ExtractionConfig) -> None:
        self.mime_type = mime_type
        self.config = config

    @abstractmethod
    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Extract content from in-memory bytes, asynchronously.

        Args:
            content: The byte content to extract.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Extract content from the file at ``path``, asynchronously.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Extract content from in-memory bytes, synchronously.

        Args:
            content: The byte content to extract.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @abstractmethod
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Extract content from the file at ``path``, synchronously.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult: The extracted content along with metadata about the extraction.
        """

    @classmethod
    def supports_mimetype(cls, mime_type: str) -> bool:
        """Report whether this extractor class handles ``mime_type``.

        A MIME type is supported when it equals, or begins with, any entry of
        ``SUPPORTED_MIME_TYPES``.

        Args:
            mime_type: The MIME type to check (e.g., "application/pdf").

        Returns:
            bool: True if the MIME type is supported, False otherwise.
        """
        # str.startswith accepts a tuple of prefixes; an exact match is a prefix match too.
        accepted_prefixes = tuple(cls.SUPPORTED_MIME_TYPES)
        return mime_type.startswith(accepted_prefixes)
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, ClassVar
4
+
5
+ import html_to_markdown
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg._extractors._base import Extractor
9
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
10
+ from kreuzberg._types import ExtractionResult
11
+ from kreuzberg._utils._string import normalize_spaces, safe_decode
12
+ from kreuzberg._utils._sync import run_sync
13
+
14
+ if TYPE_CHECKING:
15
+ from pathlib import Path
16
+
17
+
18
class HTMLExtractor(Extractor):
    """Extractor that converts HTML content to Markdown."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Convert HTML bytes to Markdown, running the sync conversion through ``run_sync``.

        Args:
            content: The raw HTML bytes.

        Returns:
            The Markdown extraction result.
        """
        return await run_sync(self.extract_bytes_sync, content)

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Read the HTML file at ``path`` asynchronously and convert it to Markdown.

        Args:
            path: The path of the HTML file.

        Returns:
            The Markdown extraction result.
        """
        raw_html = await AsyncPath(path).read_bytes()
        return await run_sync(self.extract_bytes_sync, raw_html)

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Decode HTML bytes, convert them to Markdown and normalize the spacing.

        Args:
            content: The raw HTML bytes.

        Returns:
            A result whose mime type is Markdown, with empty metadata and no chunks.
        """
        html_text = safe_decode(content)
        markdown = html_to_markdown.convert_to_markdown(html_text)
        return ExtractionResult(
            content=normalize_spaces(markdown),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata={},
            chunks=[],
        )

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Read the HTML file at ``path`` and convert it to Markdown.

        Args:
            path: The path of the HTML file.

        Returns:
            The Markdown extraction result.
        """
        return self.extract_bytes_sync(path.read_bytes())
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, ClassVar
4
+
5
+ import anyio
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg._extractors._base import Extractor
9
+ from kreuzberg._mime_types import IMAGE_MIME_TYPES
10
+ from kreuzberg._ocr import get_ocr_backend
11
+ from kreuzberg._utils._tmp import create_temp_file
12
+ from kreuzberg.exceptions import ValidationError
13
+
14
+ if TYPE_CHECKING: # pragma: no cover
15
+ from collections.abc import Mapping
16
+ from pathlib import Path
17
+
18
+ from kreuzberg._types import ExtractionResult
19
+
20
+
21
class ImageExtractor(Extractor):
    """Extractor that runs the configured OCR backend on image content."""

    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES

    # File extension used for the temporary file, keyed by image MIME type.
    IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = {
        "image/bmp": "bmp",
        "image/x-bmp": "bmp",
        "image/x-ms-bmp": "bmp",
        "image/gif": "gif",
        "image/jpeg": "jpg",
        "image/pjpeg": "jpg",
        "image/png": "png",
        "image/tiff": "tiff",
        "image/x-tiff": "tiff",
        "image/jp2": "jp2",
        "image/jpx": "jpx",
        "image/jpm": "jpm",
        "image/mj2": "mj2",
        "image/webp": "webp",
        "image/x-portable-anymap": "pnm",
        "image/x-portable-bitmap": "pbm",
        "image/x-portable-graymap": "pgm",
        "image/x-portable-pixmap": "ppm",
    }

    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
        """Write the image bytes to a temporary file and run OCR on that file.

        Args:
            content: The raw image bytes.

        Raises:
            ValidationError: If the extractor's MIME type has no known file extension,
                or if no OCR backend is configured.

        Returns:
            The OCR result for the image.
        """
        extension = self._get_extension_from_mime_type(self.mime_type)
        file_path, unlink = await create_temp_file(f".{extension}")
        try:
            # The write sits inside the try so the temporary file is removed even
            # when writing fails part-way.
            await AsyncPath(file_path).write_bytes(content)
            return await self.extract_path_async(file_path)
        finally:
            await unlink()

    async def extract_path_async(self, path: Path) -> ExtractionResult:
        """Run the configured OCR backend on the image file at ``path``.

        Args:
            path: The path of the image file.

        Raises:
            ValidationError: If ``config.ocr_backend`` is None.

        Returns:
            The OCR result for the image.
        """
        if self.config.ocr_backend is None:
            raise ValidationError("ocr_backend is None, cannot perform OCR")

        return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Synchronous wrapper that drives ``extract_bytes_async`` with ``anyio.run``.

        Args:
            content: The raw image bytes.

        Returns:
            The OCR result for the image.
        """
        # NOTE(review): anyio.run starts its own event loop, so this presumably must
        # not be called from code already running inside one - confirm with callers.
        return anyio.run(self.extract_bytes_async, content)

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Synchronous wrapper that drives ``extract_path_async`` with ``anyio.run``.

        Args:
            path: The path of the image file.

        Returns:
            The OCR result for the image.
        """
        return anyio.run(self.extract_path_async, path)

    def _get_extension_from_mime_type(self, mime_type: str) -> str:
        """Resolve the file extension for an image MIME type.

        Lookup order: exact match, then a known type that ``mime_type`` starts with
        (the same direction ``Extractor.supports_mimetype`` uses, so a type carrying
        a suffix such as parameters still resolves), then the legacy match where a
        known type starts with ``mime_type``.

        Args:
            mime_type: The MIME type to resolve.

        Raises:
            ValidationError: If no entry of ``IMAGE_MIME_TYPE_EXT_MAP`` matches.

        Returns:
            The file extension, without a leading dot.
        """
        extension = self.IMAGE_MIME_TYPE_EXT_MAP.get(mime_type)
        if extension is not None:
            return extension

        # Forward match: the given type extends a known type.
        for known_type, known_extension in self.IMAGE_MIME_TYPE_EXT_MAP.items():
            if mime_type.startswith(known_type):
                return known_extension

        # Legacy match kept for backward compatibility: the given type is a prefix
        # of a known type.
        for known_type, known_extension in self.IMAGE_MIME_TYPE_EXT_MAP.items():
            if known_type.startswith(mime_type):
                return known_extension

        raise ValidationError("unsupported mimetype", context={"mime_type": mime_type})