kreuzberg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +7 -3
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_string.py +7 -7
- kreuzberg/_sync.py +1 -1
- kreuzberg/extraction.py +8 -4
- {kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/METADATA +20 -5
- kreuzberg-1.1.0.dist-info/RECORD +13 -0
- kreuzberg-1.0.0.dist-info/RECORD +0 -13
- {kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -11,7 +11,7 @@ from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
|
|
11
11
|
from kreuzberg._sync import run_sync
|
12
12
|
from kreuzberg.exceptions import ParsingError
|
13
13
|
|
14
|
-
if TYPE_CHECKING:
|
14
|
+
if TYPE_CHECKING: # pragma: no cover
|
15
15
|
from pathlib import Path
|
16
16
|
|
17
17
|
|
@@ -35,6 +35,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
|
|
35
35
|
text = "\n".join(image_to_string(img) for img in images)
|
36
36
|
return text.strip()
|
37
37
|
except (PdfiumError, TesseractError) as e:
|
38
|
+
# TODO: add test case
|
38
39
|
raise ParsingError(
|
39
40
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
40
41
|
) from e
|
@@ -57,21 +58,23 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
|
|
57
58
|
text = "\n".join(page.get_textpage().get_text_range() for page in document)
|
58
59
|
return text.strip()
|
59
60
|
except PdfiumError as e:
|
61
|
+
# TODO: add test case
|
60
62
|
raise ParsingError(
|
61
63
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
62
64
|
) from e
|
63
65
|
|
64
66
|
|
65
|
-
async def _extract_pdf_file(file_path: Path) -> str:
|
67
|
+
async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
66
68
|
"""Extract text from a PDF file.
|
67
69
|
|
68
70
|
Args:
|
69
71
|
file_path: The path to the PDF file.
|
72
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
70
73
|
|
71
74
|
Returns:
|
72
75
|
The extracted text.
|
73
76
|
"""
|
74
|
-
if content := await run_sync(_extract_pdf_with_pdfium2, file_path):
|
77
|
+
if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
|
75
78
|
return content
|
76
79
|
|
77
80
|
return await run_sync(_extract_pdf_with_tesseract, file_path)
|
@@ -96,6 +99,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
|
|
96
99
|
try:
|
97
100
|
return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
|
98
101
|
except RuntimeError as e:
|
102
|
+
# TODO: add test case
|
99
103
|
raise ParsingError(
|
100
104
|
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
101
105
|
context={"error": str(e)},
|
kreuzberg/_mime_types.py
CHANGED
kreuzberg/_string.py
CHANGED
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from contextlib import suppress
|
4
|
+
|
3
5
|
from charset_normalizer import detect
|
4
6
|
|
5
7
|
|
@@ -16,20 +18,18 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
16
18
|
if not byte_data:
|
17
19
|
return ""
|
18
20
|
|
21
|
+
encodings = ["utf-8", "latin-1"]
|
22
|
+
|
19
23
|
if encoding:
|
20
|
-
|
24
|
+
with suppress(UnicodeDecodeError):
|
21
25
|
return byte_data.decode(encoding, errors="ignore")
|
22
|
-
except UnicodeDecodeError: # pragma: no cover
|
23
|
-
pass
|
24
26
|
|
25
|
-
encodings = ["utf-8", "latin-1"]
|
26
27
|
if encoding := detect(byte_data).get("encoding"):
|
27
28
|
encodings.append(encoding)
|
28
29
|
|
29
30
|
for encoding in encodings:
|
30
|
-
|
31
|
+
with suppress(UnicodeDecodeError):
|
31
32
|
return byte_data.decode(encoding, errors="ignore")
|
32
|
-
except UnicodeDecodeError: # pragma: no cover # noqa: PERF203
|
33
|
-
pass
|
34
33
|
|
34
|
+
# TODO: add test case
|
35
35
|
return byte_data.decode("latin-1", errors="replace")
|
kreuzberg/_sync.py
CHANGED
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, TypeVar, cast
|
|
6
6
|
from anyio.to_thread import run_sync as any_io_run_sync
|
7
7
|
from typing_extensions import ParamSpec
|
8
8
|
|
9
|
-
if TYPE_CHECKING:
|
9
|
+
if TYPE_CHECKING: # pragma: no cover
|
10
10
|
from collections.abc import Callable
|
11
11
|
|
12
12
|
T = TypeVar("T")
|
kreuzberg/extraction.py
CHANGED
@@ -35,12 +35,13 @@ class ExtractionResult(NamedTuple):
|
|
35
35
|
"""The mime type of the content."""
|
36
36
|
|
37
37
|
|
38
|
-
async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
38
|
+
async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
|
39
39
|
"""Extract the textual content from a given byte string representing a file's contents.
|
40
40
|
|
41
41
|
Args:
|
42
42
|
content: The content to extract.
|
43
43
|
mime_type: The mime type of the content.
|
44
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
44
45
|
|
45
46
|
Raises:
|
46
47
|
ValidationError: If the mime type is not supported.
|
@@ -58,7 +59,7 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
58
59
|
with NamedTemporaryFile(suffix=".pdf") as temp_file:
|
59
60
|
temp_file.write(content)
|
60
61
|
return ExtractionResult(
|
61
|
-
content=await _extract_pdf_file(Path(temp_file.name)), mime_type=PLAIN_TEXT_MIME_TYPE
|
62
|
+
content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
62
63
|
)
|
63
64
|
|
64
65
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
@@ -81,12 +82,15 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
81
82
|
)
|
82
83
|
|
83
84
|
|
84
|
-
async def extract_file(
|
85
|
+
async def extract_file(
|
86
|
+
file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
|
87
|
+
) -> ExtractionResult:
|
85
88
|
"""Extract the textual content from a given file.
|
86
89
|
|
87
90
|
Args:
|
88
91
|
file_path: The path to the file.
|
89
92
|
mime_type: The mime type of the file.
|
93
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
90
94
|
|
91
95
|
Raises:
|
92
96
|
ValidationError: If the mime type is not supported.
|
@@ -109,7 +113,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
|
|
109
113
|
raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
|
110
114
|
|
111
115
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
112
|
-
return ExtractionResult(content=await _extract_pdf_file(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
116
|
+
return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
113
117
|
|
114
118
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
115
119
|
return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -46,7 +46,7 @@ Hence, this library.
|
|
46
46
|
|
47
47
|
## Features
|
48
48
|
|
49
|
-
- Extract text from PDFs, images, and
|
49
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
50
|
- Use modern Python with async (via `anyio`) and proper type hints
|
51
51
|
- Extensive error handling for easy debugging
|
52
52
|
|
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
164
164
|
return result.content
|
165
165
|
```
|
166
166
|
|
167
|
+
### Forcing OCR
|
168
|
+
|
169
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that contain text which should be extracted.
|
170
|
+
You can do this by passing `force_ocr=True`:
|
171
|
+
|
172
|
+
```python
|
173
|
+
from kreuzberg import extract_bytes
|
174
|
+
|
175
|
+
|
176
|
+
# Extract text from PDF bytes and force OCR
|
177
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
178
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
179
|
+
return result.content
|
180
|
+
```
|
181
|
+
|
167
182
|
### Error Handling
|
168
183
|
|
169
184
|
Kreuzberg raises two exception types:
|
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
|
|
173
188
|
Raised when there are issues with input validation:
|
174
189
|
|
175
190
|
- Unsupported mime types
|
176
|
-
- Non-existent files
|
177
191
|
- Undetectable mime types
|
192
|
+
- Path doesn't point at an existing file
|
178
193
|
|
179
194
|
#### ParsingError
|
180
195
|
|
@@ -218,8 +233,8 @@ except ParsingError as e:
|
|
218
233
|
|
219
234
|
All extraction functions return an ExtractionResult named tuple containing:
|
220
235
|
|
221
|
-
- content
|
222
|
-
- mime_type
|
236
|
+
- `content`: The extracted text as a string
|
237
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
223
238
|
|
224
239
|
```python
|
225
240
|
from kreuzberg import ExtractionResult
|
@@ -0,0 +1,13 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
+
kreuzberg/_extractors.py,sha256=r8L9Bm3x7s1u7-T5HKkr1j6M6W3bUuwMAmDtAwX-s9g,4717
|
3
|
+
kreuzberg/_mime_types.py,sha256=M5sKT4OkMf7pwtgs_jO2uhl6gC94wUurYzw_wbrIjU0,2739
|
4
|
+
kreuzberg/_string.py,sha256=5s6BfTLQdYlDEt2PP4AdmBLV-ajroATOVYQQRcBYFD4,934
|
5
|
+
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
6
|
+
kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
|
7
|
+
kreuzberg/extraction.py,sha256=-a_msLQm7h5pHDhBuvfRP81-FtBwv7FGW-6YVJlXpUg,4926
|
8
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
kreuzberg-1.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
10
|
+
kreuzberg-1.1.0.dist-info/METADATA,sha256=nkDjE2MEqAE_-1MZvlBxnNuM7SKCOD2LvB7Ucb_W7U4,7775
|
11
|
+
kreuzberg-1.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
12
|
+
kreuzberg-1.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
13
|
+
kreuzberg-1.1.0.dist-info/RECORD,,
|
kreuzberg-1.0.0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
-
kreuzberg/_extractors.py,sha256=tmOgzhKw8J21R-NKWSgu7yf5epGleoxC9nKQacUDdms,4461
|
3
|
-
kreuzberg/_mime_types.py,sha256=VI3bWm7NBF0Vs2PXpxnJxTlt0pRSE59raVO_KTDJCVQ,2719
|
4
|
-
kreuzberg/_string.py,sha256=8YezUPhTGEMk08yGrBxVu4CwhUdCQwOvyC6EGB7wxLk,975
|
5
|
-
kreuzberg/_sync.py,sha256=OQZTSKUOaSMkxAb4ynq-BDrx1JLAYP9uc_zFZaAN_fk,854
|
6
|
-
kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
|
7
|
-
kreuzberg/extraction.py,sha256=utxr9HM8K2aDU0LXHVKCNPXqTu7fGDeNCNpamGr6hAQ,4646
|
8
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
kreuzberg-1.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
10
|
-
kreuzberg-1.0.0.dist-info/METADATA,sha256=fQszunogmstxhdJMMD5ieXLRqjBojXpb0pXJAZZO8fQ,7238
|
11
|
-
kreuzberg-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
12
|
-
kreuzberg-1.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
13
|
-
kreuzberg-1.0.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|