kreuzberg 1.0.0__tar.gz → 1.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/PKG-INFO +20 -5
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/README.md +19 -4
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/_extractors.py +7 -3
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/_mime_types.py +1 -1
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/_string.py +7 -7
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/_sync.py +1 -1
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/extraction.py +8 -4
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg.egg-info/PKG-INFO +20 -5
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/pyproject.toml +2 -2
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/LICENSE +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg.egg-info/requires.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.1.0}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -46,7 +46,7 @@ Hence, this library.
|
|
46
46
|
|
47
47
|
## Features
|
48
48
|
|
49
|
-
- Extract text from PDFs, images, and
|
49
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
50
|
- Use modern Python with async (via `anyio`) and proper type hints
|
51
51
|
- Extensive error handling for easy debugging
|
52
52
|
|
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
164
164
|
return result.content
|
165
165
|
```
|
166
166
|
|
167
|
+
### Forcing OCR
|
168
|
+
|
169
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images containing text that should also be extracted.
|
170
|
+
You can do this by passing `force_ocr=True`:
|
171
|
+
|
172
|
+
```python
|
173
|
+
from kreuzberg import extract_bytes
|
174
|
+
|
175
|
+
|
176
|
+
# Extract text from PDF bytes and force OCR
|
177
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
178
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
179
|
+
return result.content
|
180
|
+
```
|
181
|
+
|
167
182
|
### Error Handling
|
168
183
|
|
169
184
|
Kreuzberg raises two exception types:
|
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
|
|
173
188
|
Raised when there are issues with input validation:
|
174
189
|
|
175
190
|
- Unsupported mime types
|
176
|
-
- Non-existent files
|
177
191
|
- Undetectable mime types
|
192
|
+
- Path doesn't point at an existing file
|
178
193
|
|
179
194
|
#### ParsingError
|
180
195
|
|
@@ -218,8 +233,8 @@ except ParsingError as e:
|
|
218
233
|
|
219
234
|
All extraction functions return an ExtractionResult named tuple containing:
|
220
235
|
|
221
|
-
- content
|
222
|
-
- mime_type
|
236
|
+
- `content`: The extracted text as a string
|
237
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
223
238
|
|
224
239
|
```python
|
225
240
|
from kreuzberg import ExtractionResult
|
@@ -14,7 +14,7 @@ Hence, this library.
|
|
14
14
|
|
15
15
|
## Features
|
16
16
|
|
17
|
-
- Extract text from PDFs, images, and
|
17
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
18
18
|
- Use modern Python with async (via `anyio`) and proper type hints
|
19
19
|
- Extensive error handling for easy debugging
|
20
20
|
|
@@ -132,6 +132,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
132
132
|
return result.content
|
133
133
|
```
|
134
134
|
|
135
|
+
### Forcing OCR
|
136
|
+
|
137
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images containing text that should also be extracted.
|
138
|
+
You can do this by passing `force_ocr=True`:
|
139
|
+
|
140
|
+
```python
|
141
|
+
from kreuzberg import extract_bytes
|
142
|
+
|
143
|
+
|
144
|
+
# Extract text from PDF bytes and force OCR
|
145
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
146
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
147
|
+
return result.content
|
148
|
+
```
|
149
|
+
|
135
150
|
### Error Handling
|
136
151
|
|
137
152
|
Kreuzberg raises two exception types:
|
@@ -141,8 +156,8 @@ Kreuzberg raises two exception types:
|
|
141
156
|
Raised when there are issues with input validation:
|
142
157
|
|
143
158
|
- Unsupported mime types
|
144
|
-
- Non-existent files
|
145
159
|
- Undetectable mime types
|
160
|
+
- Path doesn't point at an existing file
|
146
161
|
|
147
162
|
#### ParsingError
|
148
163
|
|
@@ -186,8 +201,8 @@ except ParsingError as e:
|
|
186
201
|
|
187
202
|
All extraction functions return an ExtractionResult named tuple containing:
|
188
203
|
|
189
|
-
- content
|
190
|
-
- mime_type
|
204
|
+
- `content`: The extracted text as a string
|
205
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
191
206
|
|
192
207
|
```python
|
193
208
|
from kreuzberg import ExtractionResult
|
@@ -11,7 +11,7 @@ from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
|
|
11
11
|
from kreuzberg._sync import run_sync
|
12
12
|
from kreuzberg.exceptions import ParsingError
|
13
13
|
|
14
|
-
if TYPE_CHECKING:
|
14
|
+
if TYPE_CHECKING: # pragma: no cover
|
15
15
|
from pathlib import Path
|
16
16
|
|
17
17
|
|
@@ -35,6 +35,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
|
|
35
35
|
text = "\n".join(image_to_string(img) for img in images)
|
36
36
|
return text.strip()
|
37
37
|
except (PdfiumError, TesseractError) as e:
|
38
|
+
# TODO: add test case
|
38
39
|
raise ParsingError(
|
39
40
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
40
41
|
) from e
|
@@ -57,21 +58,23 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
|
|
57
58
|
text = "\n".join(page.get_textpage().get_text_range() for page in document)
|
58
59
|
return text.strip()
|
59
60
|
except PdfiumError as e:
|
61
|
+
# TODO: add test case
|
60
62
|
raise ParsingError(
|
61
63
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
62
64
|
) from e
|
63
65
|
|
64
66
|
|
65
|
-
async def _extract_pdf_file(file_path: Path) -> str:
|
67
|
+
async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
66
68
|
"""Extract text from a PDF file.
|
67
69
|
|
68
70
|
Args:
|
69
71
|
file_path: The path to the PDF file.
|
72
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
70
73
|
|
71
74
|
Returns:
|
72
75
|
The extracted text.
|
73
76
|
"""
|
74
|
-
if content := await run_sync(_extract_pdf_with_pdfium2, file_path):
|
77
|
+
if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
|
75
78
|
return content
|
76
79
|
|
77
80
|
return await run_sync(_extract_pdf_with_tesseract, file_path)
|
@@ -96,6 +99,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
|
|
96
99
|
try:
|
97
100
|
return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
|
98
101
|
except RuntimeError as e:
|
102
|
+
# TODO: add test case
|
99
103
|
raise ParsingError(
|
100
104
|
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
101
105
|
context={"error": str(e)},
|
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from contextlib import suppress
|
4
|
+
|
3
5
|
from charset_normalizer import detect
|
4
6
|
|
5
7
|
|
@@ -16,20 +18,18 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
16
18
|
if not byte_data:
|
17
19
|
return ""
|
18
20
|
|
21
|
+
encodings = ["utf-8", "latin-1"]
|
22
|
+
|
19
23
|
if encoding:
|
20
|
-
|
24
|
+
with suppress(UnicodeDecodeError):
|
21
25
|
return byte_data.decode(encoding, errors="ignore")
|
22
|
-
except UnicodeDecodeError: # pragma: no cover
|
23
|
-
pass
|
24
26
|
|
25
|
-
encodings = ["utf-8", "latin-1"]
|
26
27
|
if encoding := detect(byte_data).get("encoding"):
|
27
28
|
encodings.append(encoding)
|
28
29
|
|
29
30
|
for encoding in encodings:
|
30
|
-
|
31
|
+
with suppress(UnicodeDecodeError):
|
31
32
|
return byte_data.decode(encoding, errors="ignore")
|
32
|
-
except UnicodeDecodeError: # pragma: no cover # noqa: PERF203
|
33
|
-
pass
|
34
33
|
|
34
|
+
# TODO: add test case
|
35
35
|
return byte_data.decode("latin-1", errors="replace")
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, TypeVar, cast
|
|
6
6
|
from anyio.to_thread import run_sync as any_io_run_sync
|
7
7
|
from typing_extensions import ParamSpec
|
8
8
|
|
9
|
-
if TYPE_CHECKING:
|
9
|
+
if TYPE_CHECKING: # pragma: no cover
|
10
10
|
from collections.abc import Callable
|
11
11
|
|
12
12
|
T = TypeVar("T")
|
@@ -35,12 +35,13 @@ class ExtractionResult(NamedTuple):
|
|
35
35
|
"""The mime type of the content."""
|
36
36
|
|
37
37
|
|
38
|
-
async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
38
|
+
async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
|
39
39
|
"""Extract the textual content from a given byte string representing a file's contents.
|
40
40
|
|
41
41
|
Args:
|
42
42
|
content: The content to extract.
|
43
43
|
mime_type: The mime type of the content.
|
44
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
44
45
|
|
45
46
|
Raises:
|
46
47
|
ValidationError: If the mime type is not supported.
|
@@ -58,7 +59,7 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
58
59
|
with NamedTemporaryFile(suffix=".pdf") as temp_file:
|
59
60
|
temp_file.write(content)
|
60
61
|
return ExtractionResult(
|
61
|
-
content=await _extract_pdf_file(Path(temp_file.name)), mime_type=PLAIN_TEXT_MIME_TYPE
|
62
|
+
content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
62
63
|
)
|
63
64
|
|
64
65
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
@@ -81,12 +82,15 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
81
82
|
)
|
82
83
|
|
83
84
|
|
84
|
-
async def extract_file(
|
85
|
+
async def extract_file(
|
86
|
+
file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
|
87
|
+
) -> ExtractionResult:
|
85
88
|
"""Extract the textual content from a given file.
|
86
89
|
|
87
90
|
Args:
|
88
91
|
file_path: The path to the file.
|
89
92
|
mime_type: The mime type of the file.
|
93
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
90
94
|
|
91
95
|
Raises:
|
92
96
|
ValidationError: If the mime type is not supported.
|
@@ -109,7 +113,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
|
|
109
113
|
raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
|
110
114
|
|
111
115
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
112
|
-
return ExtractionResult(content=await _extract_pdf_file(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
116
|
+
return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
113
117
|
|
114
118
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
115
119
|
return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.1.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -46,7 +46,7 @@ Hence, this library.
|
|
46
46
|
|
47
47
|
## Features
|
48
48
|
|
49
|
-
- Extract text from PDFs, images, and
|
49
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
50
|
- Use modern Python with async (via `anyio`) and proper type hints
|
51
51
|
- Extensive error handling for easy debugging
|
52
52
|
|
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
164
164
|
return result.content
|
165
165
|
```
|
166
166
|
|
167
|
+
### Forcing OCR
|
168
|
+
|
169
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images containing text that should also be extracted.
|
170
|
+
You can do this by passing `force_ocr=True`:
|
171
|
+
|
172
|
+
```python
|
173
|
+
from kreuzberg import extract_bytes
|
174
|
+
|
175
|
+
|
176
|
+
# Extract text from PDF bytes and force OCR
|
177
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
178
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
179
|
+
return result.content
|
180
|
+
```
|
181
|
+
|
167
182
|
### Error Handling
|
168
183
|
|
169
184
|
Kreuzberg raises two exception types:
|
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
|
|
173
188
|
Raised when there are issues with input validation:
|
174
189
|
|
175
190
|
- Unsupported mime types
|
176
|
-
- Non-existent files
|
177
191
|
- Undetectable mime types
|
192
|
+
- Path doesn't point at an existing file
|
178
193
|
|
179
194
|
#### ParsingError
|
180
195
|
|
@@ -218,8 +233,8 @@ except ParsingError as e:
|
|
218
233
|
|
219
234
|
All extraction functions return an ExtractionResult named tuple containing:
|
220
235
|
|
221
|
-
- content
|
222
|
-
- mime_type
|
236
|
+
- `content`: The extracted text as a string
|
237
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
223
238
|
|
224
239
|
```python
|
225
240
|
from kreuzberg import ExtractionResult
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "kreuzberg"
|
3
|
-
version = "1.0.0"
|
3
|
+
version = "1.1.0"
|
4
4
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
readme = "README.md"
|
6
6
|
keywords = [
|
@@ -128,7 +128,7 @@ source = [ "kreuzberg" ]
|
|
128
128
|
|
129
129
|
[tool.coverage.report]
|
130
130
|
exclude_lines = [ 'if TYPE_CHECKING:' ]
|
131
|
-
fail_under =
|
131
|
+
fail_under = 90
|
132
132
|
|
133
133
|
[tool.mypy]
|
134
134
|
packages = [ "kreuzberg", "tests" ]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|