kreuzberg 2.1.1__tar.gz → 3.0.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg-3.0.0/PKG-INFO +178 -0
- kreuzberg-3.0.0/README.md +125 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg/__init__.py +16 -2
- kreuzberg-3.0.0/kreuzberg/_chunker.py +51 -0
- kreuzberg-3.0.0/kreuzberg/_constants.py +7 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg/_mime_types.py +19 -26
- kreuzberg-3.0.0/kreuzberg/_playa.py +276 -0
- kreuzberg-3.0.0/kreuzberg/_registry.py +108 -0
- kreuzberg-3.0.0/kreuzberg/_types.py +168 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg/exceptions.py +25 -0
- kreuzberg-3.0.0/kreuzberg/extraction.py +251 -0
- kreuzberg-3.0.0/kreuzberg.egg-info/PKG-INFO +178 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg.egg-info/SOURCES.txt +3 -9
- kreuzberg-3.0.0/kreuzberg.egg-info/requires.txt +37 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/pyproject.toml +45 -10
- kreuzberg-2.1.1/PKG-INFO +0 -446
- kreuzberg-2.1.1/README.md +0 -411
- kreuzberg-2.1.1/kreuzberg/_constants.py +0 -8
- kreuzberg-2.1.1/kreuzberg/_html.py +0 -31
- kreuzberg-2.1.1/kreuzberg/_pandoc.py +0 -366
- kreuzberg-2.1.1/kreuzberg/_pdf.py +0 -190
- kreuzberg-2.1.1/kreuzberg/_pptx.py +0 -88
- kreuzberg-2.1.1/kreuzberg/_string.py +0 -41
- kreuzberg-2.1.1/kreuzberg/_sync.py +0 -74
- kreuzberg-2.1.1/kreuzberg/_tesseract.py +0 -231
- kreuzberg-2.1.1/kreuzberg/_tmp.py +0 -37
- kreuzberg-2.1.1/kreuzberg/_types.py +0 -71
- kreuzberg-2.1.1/kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.1/kreuzberg/extraction.py +0 -364
- kreuzberg-2.1.1/kreuzberg.egg-info/PKG-INFO +0 -446
- kreuzberg-2.1.1/kreuzberg.egg-info/requires.txt +0 -12
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/LICENSE +0 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-2.1.1 → kreuzberg-3.0.0}/setup.cfg +0 -0
kreuzberg-3.0.0/PKG-INFO
ADDED
@@ -0,0 +1,178 @@
|
|
1
|
+
Metadata-Version: 2.4
|
2
|
+
Name: kreuzberg
|
3
|
+
Version: 3.0.0
|
4
|
+
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
|
+
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
|
+
License: MIT
|
7
|
+
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
9
|
+
Classifier: Development Status :: 4 - Beta
|
10
|
+
Classifier: Intended Audience :: Developers
|
11
|
+
Classifier: License :: OSI Approved :: MIT License
|
12
|
+
Classifier: Operating System :: OS Independent
|
13
|
+
Classifier: Programming Language :: Python :: 3 :: Only
|
14
|
+
Classifier: Programming Language :: Python :: 3.9
|
15
|
+
Classifier: Programming Language :: Python :: 3.10
|
16
|
+
Classifier: Programming Language :: Python :: 3.11
|
17
|
+
Classifier: Programming Language :: Python :: 3.12
|
18
|
+
Classifier: Programming Language :: Python :: 3.13
|
19
|
+
Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
|
20
|
+
Classifier: Topic :: Software Development :: Libraries :: Python Modules
|
21
|
+
Classifier: Topic :: Text Processing :: General
|
22
|
+
Classifier: Topic :: Utilities
|
23
|
+
Classifier: Typing :: Typed
|
24
|
+
Requires-Python: >=3.9
|
25
|
+
Description-Content-Type: text/markdown
|
26
|
+
License-File: LICENSE
|
27
|
+
Requires-Dist: anyio>=4.9.0
|
28
|
+
Requires-Dist: charset-normalizer>=3.4.1
|
29
|
+
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
31
|
+
Requires-Dist: playa-pdf>=0.4.1
|
32
|
+
Requires-Dist: pypdfium2==4.30.0
|
33
|
+
Requires-Dist: python-calamine>=0.3.1
|
34
|
+
Requires-Dist: python-pptx>=1.0.2
|
35
|
+
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
|
36
|
+
Provides-Extra: all
|
37
|
+
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
|
+
Requires-Dist: numpy>=2.0.2; extra == "all"
|
39
|
+
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
|
+
Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
|
41
|
+
Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
|
42
|
+
Requires-Dist: setuptools>=76.0.0; extra == "all"
|
43
|
+
Provides-Extra: chunking
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
|
45
|
+
Provides-Extra: easyocr
|
46
|
+
Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
|
47
|
+
Provides-Extra: paddleocr
|
48
|
+
Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
|
49
|
+
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
50
|
+
Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
|
51
|
+
Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
|
52
|
+
Dynamic: license-file
|
53
|
+
|
54
|
+
# Kreuzberg
|
55
|
+
|
56
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
57
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
58
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
59
|
+
|
60
|
+
Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
|
61
|
+
|
62
|
+
## Why Kreuzberg?
|
63
|
+
|
64
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
65
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
66
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
67
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
68
|
+
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
69
|
+
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
70
|
+
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
71
|
+
|
72
|
+
## Quick Start
|
73
|
+
|
74
|
+
```bash
|
75
|
+
pip install kreuzberg
|
76
|
+
```
|
77
|
+
|
78
|
+
Install the system dependencies (Tesseract OCR and pandoc):
|
79
|
+
|
80
|
+
```bash
|
81
|
+
# Ubuntu/Debian
|
82
|
+
sudo apt-get install tesseract-ocr pandoc
|
83
|
+
|
84
|
+
# macOS
|
85
|
+
brew install tesseract pandoc
|
86
|
+
|
87
|
+
# Windows
|
88
|
+
choco install -y tesseract pandoc
|
89
|
+
```
|
90
|
+
|
91
|
+
Tesseract is the default OCR engine. If you prefer not to use it, you can either use one of the two alternative OCR engines or run without OCR entirely.
|
92
|
+
|
93
|
+
### Alternative OCR engines
|
94
|
+
|
95
|
+
```bash
|
96
|
+
# Install with EasyOCR support
|
97
|
+
pip install "kreuzberg[easyocr]"
|
98
|
+
|
99
|
+
# Install with PaddleOCR support
|
100
|
+
pip install "kreuzberg[paddleocr]"
|
101
|
+
```
|
102
|
+
|
103
|
+
## Quick Example
|
104
|
+
|
105
|
+
```python
|
106
|
+
import asyncio
|
107
|
+
from kreuzberg import extract_file
|
108
|
+
|
109
|
+
async def main():
|
110
|
+
# Extract text from a PDF
|
111
|
+
result = await extract_file("document.pdf")
|
112
|
+
print(result.content)
|
113
|
+
|
114
|
+
# Extract text from an image
|
115
|
+
result = await extract_file("scan.jpg")
|
116
|
+
print(result.content)
|
117
|
+
|
118
|
+
# Extract text from a Word document
|
119
|
+
result = await extract_file("report.docx")
|
120
|
+
print(result.content)
|
121
|
+
|
122
|
+
asyncio.run(main())
|
123
|
+
```
|
124
|
+
|
125
|
+
## Documentation
|
126
|
+
|
127
|
+
For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
|
128
|
+
|
129
|
+
- [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
|
130
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
|
131
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
|
132
|
+
- [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
|
133
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
|
134
|
+
- [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
|
135
|
+
|
136
|
+
## Supported Formats
|
137
|
+
|
138
|
+
Kreuzberg supports a wide range of document formats:
|
139
|
+
|
140
|
+
- **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
|
141
|
+
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
142
|
+
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
143
|
+
- **Presentations**: PPTX, PPT, etc.
|
144
|
+
- **Web Content**: HTML, XML, etc.
|
145
|
+
|
146
|
+
## OCR Engines
|
147
|
+
|
148
|
+
Kreuzberg supports multiple OCR engines:
|
149
|
+
|
150
|
+
- **Tesseract** (Default): Lightweight, fast startup, requires system installation
|
151
|
+
- **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
|
152
|
+
- **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
|
153
|
+
|
154
|
+
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
155
|
+
|
156
|
+
## Contribution
|
157
|
+
|
158
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
|
159
|
+
|
160
|
+
### Local Development
|
161
|
+
|
162
|
+
1. Clone the repo
|
163
|
+
|
164
|
+
1. Install the system dependencies
|
165
|
+
|
166
|
+
1. Install the full dependencies with `uv sync`
|
167
|
+
|
168
|
+
1. Install the pre-commit hooks with:
|
169
|
+
|
170
|
+
```shell
|
171
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
172
|
+
```
|
173
|
+
|
174
|
+
1. Make your changes and submit a PR
|
175
|
+
|
176
|
+
## License
|
177
|
+
|
178
|
+
This library is released under the MIT license.
|
@@ -0,0 +1,125 @@
|
|
1
|
+
# Kreuzberg
|
2
|
+
|
3
|
+
[PyPI version](https://badge.fury.io/py/kreuzberg)
|
4
|
+
[Documentation](https://goldziher.github.io/kreuzberg/)
|
5
|
+
[License: MIT](https://opensource.org/licenses/MIT)
|
6
|
+
|
7
|
+
Kreuzberg is a Python library for text extraction from documents. It provides a unified interface for extracting text from PDFs, images, office documents, and more, with both async and sync APIs.
|
8
|
+
|
9
|
+
## Why Kreuzberg?
|
10
|
+
|
11
|
+
- **Simple and Hassle-Free**: Clean API that just works, without complex configuration
|
12
|
+
- **Local Processing**: No external API calls or cloud dependencies required
|
13
|
+
- **Resource Efficient**: Lightweight processing without GPU requirements
|
14
|
+
- **Format Support**: Comprehensive support for documents, images, and text formats
|
15
|
+
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
16
|
+
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
17
|
+
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
18
|
+
|
19
|
+
## Quick Start
|
20
|
+
|
21
|
+
```bash
|
22
|
+
pip install kreuzberg
|
23
|
+
```
|
24
|
+
|
25
|
+
Install the system dependencies (Tesseract OCR and pandoc):
|
26
|
+
|
27
|
+
```bash
|
28
|
+
# Ubuntu/Debian
|
29
|
+
sudo apt-get install tesseract-ocr pandoc
|
30
|
+
|
31
|
+
# macOS
|
32
|
+
brew install tesseract pandoc
|
33
|
+
|
34
|
+
# Windows
|
35
|
+
choco install -y tesseract pandoc
|
36
|
+
```
|
37
|
+
|
38
|
+
Tesseract is the default OCR engine. If you prefer not to use it, you can either use one of the two alternative OCR engines or run without OCR entirely.
|
39
|
+
|
40
|
+
### Alternative OCR engines
|
41
|
+
|
42
|
+
```bash
|
43
|
+
# Install with EasyOCR support
|
44
|
+
pip install "kreuzberg[easyocr]"
|
45
|
+
|
46
|
+
# Install with PaddleOCR support
|
47
|
+
pip install "kreuzberg[paddleocr]"
|
48
|
+
```
|
49
|
+
|
50
|
+
## Quick Example
|
51
|
+
|
52
|
+
```python
|
53
|
+
import asyncio
|
54
|
+
from kreuzberg import extract_file
|
55
|
+
|
56
|
+
async def main():
|
57
|
+
# Extract text from a PDF
|
58
|
+
result = await extract_file("document.pdf")
|
59
|
+
print(result.content)
|
60
|
+
|
61
|
+
# Extract text from an image
|
62
|
+
result = await extract_file("scan.jpg")
|
63
|
+
print(result.content)
|
64
|
+
|
65
|
+
# Extract text from a Word document
|
66
|
+
result = await extract_file("report.docx")
|
67
|
+
print(result.content)
|
68
|
+
|
69
|
+
asyncio.run(main())
|
70
|
+
```
|
71
|
+
|
72
|
+
## Documentation
|
73
|
+
|
74
|
+
For comprehensive documentation, visit our [GitHub Pages](https://goldziher.github.io/kreuzberg/):
|
75
|
+
|
76
|
+
- [Getting Started](https://goldziher.github.io/kreuzberg/getting-started/) - Installation and basic usage
|
77
|
+
- [User Guide](https://goldziher.github.io/kreuzberg/user-guide/) - In-depth usage information
|
78
|
+
- [API Reference](https://goldziher.github.io/kreuzberg/api-reference/) - Detailed API documentation
|
79
|
+
- [Examples](https://goldziher.github.io/kreuzberg/examples/) - Code examples for common use cases
|
80
|
+
- [OCR Configuration](https://goldziher.github.io/kreuzberg/user-guide/ocr-configuration/) - Configure OCR engines
|
81
|
+
- [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) - Choose the right OCR engine
|
82
|
+
|
83
|
+
## Supported Formats
|
84
|
+
|
85
|
+
Kreuzberg supports a wide range of document formats:
|
86
|
+
|
87
|
+
- **Documents**: PDF, DOCX, DOC, RTF, TXT, EPUB, etc.
|
88
|
+
- **Images**: JPG, PNG, TIFF, BMP, GIF, etc.
|
89
|
+
- **Spreadsheets**: XLSX, XLS, CSV, etc.
|
90
|
+
- **Presentations**: PPTX, PPT, etc.
|
91
|
+
- **Web Content**: HTML, XML, etc.
|
92
|
+
|
93
|
+
## OCR Engines
|
94
|
+
|
95
|
+
Kreuzberg supports multiple OCR engines:
|
96
|
+
|
97
|
+
- **Tesseract** (Default): Lightweight, fast startup, requires system installation
|
98
|
+
- **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
|
99
|
+
- **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
|
100
|
+
|
101
|
+
For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
|
102
|
+
|
103
|
+
## Contribution
|
104
|
+
|
105
|
+
This library is open to contribution. Feel free to open issues or submit PRs. It's better to discuss issues before submitting PRs to avoid disappointment.
|
106
|
+
|
107
|
+
### Local Development
|
108
|
+
|
109
|
+
1. Clone the repo
|
110
|
+
|
111
|
+
1. Install the system dependencies
|
112
|
+
|
113
|
+
1. Install the full dependencies with `uv sync`
|
114
|
+
|
115
|
+
1. Install the pre-commit hooks with:
|
116
|
+
|
117
|
+
```shell
|
118
|
+
pre-commit install && pre-commit install --hook-type commit-msg
|
119
|
+
```
|
120
|
+
|
121
|
+
1. Make your changes and submit a PR
|
122
|
+
|
123
|
+
## License
|
124
|
+
|
125
|
+
This library is released under the MIT license.
|
@@ -1,5 +1,10 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
2
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
3
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
4
|
+
|
5
|
+
from ._ocr._tesseract import PSMMode
|
6
|
+
from ._registry import ExtractorRegistry
|
7
|
+
from ._types import ExtractionConfig, ExtractionResult, Metadata
|
3
8
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
4
9
|
from .extraction import (
|
5
10
|
batch_extract_bytes,
|
@@ -7,22 +12,31 @@ from .extraction import (
|
|
7
12
|
batch_extract_file,
|
8
13
|
batch_extract_file_sync,
|
9
14
|
extract_bytes,
|
15
|
+
extract_bytes_sync,
|
10
16
|
extract_file,
|
17
|
+
extract_file_sync,
|
11
18
|
)
|
12
19
|
|
13
20
|
__all__ = [
|
21
|
+
"EasyOCRConfig",
|
22
|
+
"ExtractionConfig",
|
14
23
|
"ExtractionResult",
|
24
|
+
"ExtractorRegistry",
|
15
25
|
"KreuzbergError",
|
16
26
|
"Metadata",
|
17
27
|
"MissingDependencyError",
|
18
28
|
"OCRError",
|
19
29
|
"PSMMode",
|
30
|
+
"PaddleOCRConfig",
|
20
31
|
"ParsingError",
|
32
|
+
"TesseractConfig",
|
21
33
|
"ValidationError",
|
22
34
|
"batch_extract_bytes",
|
23
35
|
"batch_extract_bytes_sync",
|
24
36
|
"batch_extract_file",
|
25
37
|
"batch_extract_file_sync",
|
26
38
|
"extract_bytes",
|
39
|
+
"extract_bytes_sync",
|
27
40
|
"extract_file",
|
41
|
+
"extract_file_sync",
|
28
42
|
]
|
@@ -0,0 +1,51 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
from kreuzberg import MissingDependencyError
|
6
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
11
|
+
|
12
|
+
_chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
|
13
|
+
|
14
|
+
|
15
|
+
def get_chunker(
|
16
|
+
mime_type: str,
|
17
|
+
max_characters: int = DEFAULT_MAX_CHARACTERS,
|
18
|
+
overlap_characters: int = DEFAULT_MAX_OVERLAP,
|
19
|
+
) -> MarkdownSplitter | TextSplitter:
|
20
|
+
"""Creates and returns a Chunker object configured with the given maximum
|
21
|
+
characters per chunk and overlap between chunks.
|
22
|
+
|
23
|
+
Args:
|
24
|
+
mime_type: The mime type of the content.
|
25
|
+
max_characters: Maximum number of characters allowed in each chunk.
|
26
|
+
overlap_characters: Number of characters overlapping between two consecutive chunks.
|
27
|
+
|
28
|
+
Raises:
|
29
|
+
MissingDependencyError: if semantic-text-splitter is not installed.
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
Chunker: A Chunker object configured with the specified maximum
|
33
|
+
characters and overlap.
|
34
|
+
"""
|
35
|
+
key = (max_characters, overlap_characters, mime_type)
|
36
|
+
if key not in _chunkers:
|
37
|
+
try:
|
38
|
+
if mime_type == MARKDOWN_MIME_TYPE:
|
39
|
+
from semantic_text_splitter import MarkdownSplitter
|
40
|
+
|
41
|
+
_chunkers[key] = MarkdownSplitter(max_characters, overlap_characters)
|
42
|
+
else:
|
43
|
+
from semantic_text_splitter import TextSplitter
|
44
|
+
|
45
|
+
_chunkers[key] = TextSplitter(max_characters, overlap_characters)
|
46
|
+
except ImportError as e:
|
47
|
+
raise MissingDependencyError.create_for_package(
|
48
|
+
dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
|
49
|
+
) from e
|
50
|
+
|
51
|
+
return _chunkers[key]
|
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
|
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
18
|
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
19
|
-
|
19
|
+
|
20
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
21
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
22
22
|
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
|
|
24
24
|
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
25
25
|
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
26
26
|
|
27
|
-
|
28
|
-
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
27
|
+
|
28
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
29
29
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
30
30
|
|
31
31
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
|
|
48
48
|
"image/x-portable-pixmap",
|
49
49
|
"image/x-tiff",
|
50
50
|
}
|
51
|
-
|
52
|
-
"image/bmp": "bmp",
|
53
|
-
"image/x-bmp": "bmp",
|
54
|
-
"image/x-ms-bmp": "bmp",
|
55
|
-
"image/gif": "gif",
|
56
|
-
"image/jpeg": "jpg",
|
57
|
-
"image/pjpeg": "jpg",
|
58
|
-
"image/png": "png",
|
59
|
-
"image/tiff": "tiff",
|
60
|
-
"image/x-tiff": "tiff",
|
61
|
-
"image/jp2": "jp2",
|
62
|
-
"image/jpx": "jpx",
|
63
|
-
"image/jpm": "jpm",
|
64
|
-
"image/mj2": "mj2",
|
65
|
-
"image/webp": "webp",
|
66
|
-
"image/x-portable-anymap": "pnm",
|
67
|
-
"image/x-portable-bitmap": "pbm",
|
68
|
-
"image/x-portable-graymap": "pgm",
|
69
|
-
"image/x-portable-pixmap": "ppm",
|
70
|
-
}
|
51
|
+
|
71
52
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
72
53
|
"application/csl+json",
|
73
54
|
"application/docbook+xml",
|
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
162
143
|
)
|
163
144
|
|
164
145
|
|
165
|
-
def validate_mime_type(
|
146
|
+
def validate_mime_type(
|
147
|
+
*, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
|
148
|
+
) -> str:
|
166
149
|
"""Validate and detect the MIME type for a given file.
|
167
150
|
|
168
151
|
Args:
|
169
152
|
file_path: The path to the file.
|
170
153
|
mime_type: Optional explicit MIME type. If provided, this will be validated.
|
171
154
|
If not provided, the function will attempt to detect the MIME type.
|
155
|
+
check_file_exists: Whether to check if the file exists. Default is True.
|
156
|
+
Set to False in tests where you want to validate a mime type without an actual file.
|
172
157
|
|
173
158
|
Raises:
|
174
159
|
ValidationError: If the MIME type is not supported or cannot be determined.
|
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
|
|
176
161
|
Returns:
|
177
162
|
The validated MIME type.
|
178
163
|
"""
|
179
|
-
|
164
|
+
if file_path and check_file_exists:
|
165
|
+
path = Path(file_path)
|
166
|
+
if not path.exists():
|
167
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
180
168
|
|
181
169
|
if not mime_type:
|
182
|
-
|
170
|
+
if not file_path:
|
171
|
+
raise ValidationError(
|
172
|
+
"Could not determine mime type.",
|
173
|
+
)
|
174
|
+
path = Path(file_path)
|
175
|
+
|
183
176
|
ext = path.suffix.lower()
|
184
177
|
mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
|
185
178
|
|