kreuzberg 1.2.0__tar.gz → 1.4.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/PKG-INFO +25 -10
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/README.md +23 -7
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_extractors.py +61 -52
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_mime_types.py +10 -5
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_string.py +9 -12
- kreuzberg-1.4.0/kreuzberg/_tesseract.py +318 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/exceptions.py +9 -1
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/extraction.py +21 -13
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/PKG-INFO +25 -10
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/SOURCES.txt +1 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/requires.txt +1 -2
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/pyproject.toml +5 -7
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/LICENSE +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_sync.py +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-1.2.0 → kreuzberg-1.4.0}/setup.cfg +0 -0

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.2.0
+Version: 1.4.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -25,11 +25,10 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anyio>=4.8.0
 Requires-Dist: charset-normalizer>=3.4.1
+Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypandoc>=1.15
 Requires-Dist: pypdfium2>=4.30.1
-Requires-Dist: pytesseract>=0.3.13
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2
 
 # Kreuzberg
 
@@ -38,7 +37,7 @@ extraction.
 
 Why?
 
-I am building, like many do now, a RAG focused service. I have text extraction needs.
+I am building, like many do now, a RAG focused service (checkout https://grantflow.ai). I have text extraction needs.
 There are quite a lot of commercial options out there, and several open-source + paid options.
 But I wanted something simple, which does not require expansive round-trips to an external API.
 Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
@@ -76,17 +75,32 @@ polished and well maintained.
 
 - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
 - Images are processed using Tesseract OCR
-- Office documents and other formats are processed using Pandoc
+- Office documents and other formats are processed using Pandoc
+- PPTX files are converted using python-pptx
+- HTML files are converted using html-to-markdown
 - Plain text files are read directly with appropriate encoding detection
 
 ### Roadmap
 
-
-[] - html file text extraction
-[] - better PDF table extraction
-[] - metadata extraction
+V1:
 
-
+- [x] - html file text extraction
+- [ ] - better PDF table extraction
+- [ ] - TBD
+
+V2:
+
+- [ ] - extra install groups (to make dependencies optional)
+- [ ] - metadata extraction (possible breaking change)
+- [ ] - TBD
+
+### Feature Requests
+
+Feel free to open a discussion in GitHub or an issue if you have any feature requests
+
+### Contribution
+
+Is welcome! Read guidelines below.
 
 ## Supported File Types
 
@@ -116,6 +130,7 @@ Kreuzberg supports a wide range of file formats:
 
 #### Text and Markup Formats
 
+- HTML (`.html`, `.htm`)
 - Plain Text (`.txt`)
 - Markdown (`.md`)
 - reStructuredText (`.rst`)

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/README.md

@@ -5,7 +5,7 @@ extraction.
 
 Why?
 
-I am building, like many do now, a RAG focused service. I have text extraction needs.
+I am building, like many do now, a RAG focused service (checkout https://grantflow.ai). I have text extraction needs.
 There are quite a lot of commercial options out there, and several open-source + paid options.
 But I wanted something simple, which does not require expansive round-trips to an external API.
 Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
@@ -43,17 +43,32 @@ polished and well maintained.
 
 - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
 - Images are processed using Tesseract OCR
-- Office documents and other formats are processed using Pandoc
+- Office documents and other formats are processed using Pandoc
+- PPTX files are converted using python-pptx
+- HTML files are converted using html-to-markdown
 - Plain text files are read directly with appropriate encoding detection
 
 ### Roadmap
 
-
-[] - html file text extraction
-[] - better PDF table extraction
-[] - metadata extraction
+V1:
 
-
+- [x] - html file text extraction
+- [ ] - better PDF table extraction
+- [ ] - TBD
+
+V2:
+
+- [ ] - extra install groups (to make dependencies optional)
+- [ ] - metadata extraction (possible breaking change)
+- [ ] - TBD
+
+### Feature Requests
+
+Feel free to open a discussion in GitHub or an issue if you have any feature requests
+
+### Contribution
+
+Is welcome! Read guidelines below.
 
 ## Supported File Types
 
@@ -83,6 +98,7 @@ Kreuzberg supports a wide range of file formats:
 
 #### Text and Markup Formats
 
+- HTML (`.html`, `.htm`)
 - Plain Text (`.txt`)
 - Markdown (`.md`)
 - reStructuredText (`.rst`)

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_extractors.py

@@ -6,50 +6,61 @@ from html import escape
 from io import BytesIO
 from typing import TYPE_CHECKING, cast
 
+import html_to_markdown
+import pptx
+import pypandoc
+import pypdfium2
 from anyio import Path as AsyncPath
 from charset_normalizer import detect
-from pptx import Presentation
-from pptx.enum.shapes import MSO_SHAPE_TYPE
-from pypandoc import convert_file, convert_text
-from pypdfium2 import PdfDocument, PdfiumError
-from pytesseract import TesseractError, image_to_string
 
 from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
-from kreuzberg._string import normalize_spaces
+from kreuzberg._string import normalize_spaces, safe_decode
 from kreuzberg._sync import run_sync
+from kreuzberg._tesseract import batch_process_images
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path
 
+    from PIL.Image import Image
 
-
-
+
+async def convert_pdf_to_images(file_path: Path) -> list[Image]:
+    """Convert a PDF file to images.
 
     Args:
         file_path: The path to the PDF file.
 
     Raises:
-        ParsingError: If the
+        ParsingError: If the PDF file could not be converted to images.
 
     Returns:
-
+        A list of Pillow Images.
     """
     try:
-
-
-
-
-        text = "\n".join(image_to_string(img) for img in images)
-        return normalize_spaces(text)
-    except (PdfiumError, TesseractError) as e:
-        # TODO: add test case
+        pdf = await run_sync(pypdfium2.PdfDocument, str(file_path))
+        return [page.render(scale=2.0).to_pil() for page in pdf]
+    except pypdfium2.PdfiumError as e:
         raise ParsingError(
-            "Could not
+            "Could not convert PDF to images", context={"file_path": str(file_path), "error": str(e)}
         ) from e
 
 
-def
+async def extract_pdf_with_tesseract(file_path: Path) -> str:
+    """Extract text from a scanned PDF file using pytesseract.
+
+    Args:
+        file_path: The path to the PDF file.
+
+    Returns:
+        The extracted text.
+    """
+    images = await convert_pdf_to_images(file_path)
+    ocr_results = await batch_process_images(images)
+    return normalize_spaces("\n".join(ocr_results))
+
+
+async def extract_pdf_with_pdfium2(file_path: Path) -> str:
     """Extract text from a searchable PDF file using pypdfium2.
 
     Args:
@@ -62,17 +73,16 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
         The extracted text.
     """
     try:
-        document = PdfDocument
+        document = await run_sync(pypdfium2.PdfDocument, file_path)
         text = "\n".join(page.get_textpage().get_text_range() for page in document)
         return normalize_spaces(text)
-    except PdfiumError as e:
-        # TODO: add test case
+    except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
 
 
-async def
+async def extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
 
    Args:
@@ -82,13 +92,13 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
     Returns:
         The extracted text.
     """
-    if not force_ocr and (content := await
+    if not force_ocr and (content := await extract_pdf_with_pdfium2(file_path)):
         return normalize_spaces(content)
 
-    return
+    return await extract_pdf_with_tesseract(file_path)
 
 
-async def
+async def extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
     """Extract text using pandoc.
 
     Args:
@@ -106,7 +116,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
     encoding = encoding or detect(file_data)["encoding"] or "utf-8"
     try:
         return normalize_spaces(
-            cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
+            cast(str, await run_sync(pypandoc.convert_text, file_data, to="md", format=ext, encoding=encoding))
         )
     except RuntimeError as e:
         # TODO: add test case
@@ -116,7 +126,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
         ) from e
 
 
-async def
+async def extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> str:
     """Extract text using pandoc.
 
     Args:
@@ -131,7 +141,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
     """
     ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
     try:
-        return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
+        return normalize_spaces(cast(str, await run_sync(pypandoc.convert_file, file_path, to="md", format=ext)))
     except RuntimeError as e:
         raise ParsingError(
             f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
@@ -139,27 +149,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
         ) from e
 
 
-async def
-    """Extract text from an image file.
-
-    Args:
-        file_path: The path to the image file.
-
-    Raises:
-        ParsingError: If the text could not be extracted from the image file.
-
-    Returns:
-        The extracted content.
-    """
-    try:
-        return normalize_spaces(cast(str, image_to_string(str(file_path))))
-    except TesseractError as e:
-        raise ParsingError(
-            "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
-        ) from e
-
-
-async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
+async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     """Extract text from a PPTX file.
 
     Notes:
@@ -171,13 +161,15 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     Returns:
         The extracted text content
     """
+    from pptx.enum.shapes import MSO_SHAPE_TYPE
+
     md_content = ""
     file_contents = (
         file_path_or_contents
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_bytes()
     )
-    presentation = Presentation(BytesIO(file_contents))
+    presentation = pptx.Presentation(BytesIO(file_contents))
 
     for index, slide in enumerate(presentation.slides):
         md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
@@ -227,3 +219,20 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
     md_content = md_content.strip()
 
     return normalize_spaces(md_content)
+
+
+async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
+    """Extract text from an HTML string.
+
+    Args:
+        file_path_or_contents: The HTML content.
+
+    Returns:
+        The extracted text content.
+    """
+    content = (
+        safe_decode(file_path_or_contents)
+        if isinstance(file_path_or_contents, bytes)
+        else await AsyncPath(file_path_or_contents).read_text()
+    )
+    return normalize_spaces(await run_sync(html_to_markdown.convert_to_markdown, content))

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_mime_types.py

@@ -5,10 +5,12 @@ from typing import TYPE_CHECKING, Final
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
 
-
-
-PDF_MIME_TYPE: Final
-
+HTML_MIME_TYPE: Final = "text/html"
+MARKDOWN_MIME_TYPE: Final = "text/markdown"
+PDF_MIME_TYPE: Final = "application/pdf"
+PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
+POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
+
 PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
 
 IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -93,5 +95,8 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
 }
 
 SUPPORTED_MIME_TYPES: Final[set[str]] = (
-    PLAIN_TEXT_MIME_TYPES
+    PLAIN_TEXT_MIME_TYPES
+    | IMAGE_MIME_TYPES
+    | PANDOC_SUPPORTED_MIME_TYPES
+    | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
 )

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/_string.py

@@ -4,6 +4,8 @@ from contextlib import suppress
 
 from charset_normalizer import detect
 
+from kreuzberg.exceptions import ParsingError
+
 
 def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     """Decode a byte string safely, removing invalid sequences.
@@ -12,27 +14,22 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
         byte_data: The byte string to decode.
         encoding: The encoding to use when decoding the byte string.
 
+    Raises:
+        ParsingError: If the byte string could not be decoded.
+
     Returns:
         The decoded string.
     """
     if not byte_data:
         return ""
 
-    encodings = ["utf-8", "latin-1"]
-
-    if encoding:
-        with suppress(UnicodeDecodeError):
-            return byte_data.decode(encoding, errors="ignore")
-
-    if encoding := detect(byte_data).get("encoding"):
-        encodings.append(encoding)
+    encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8", "latin-1"]
 
-    for
+    for enc in [e for e in encodings if e]:
         with suppress(UnicodeDecodeError):
-            return byte_data.decode(
+            return byte_data.decode(enc)
 
-
-    return byte_data.decode("latin-1", errors="replace")
+    raise ParsingError("Could not decode byte string. Please provide an encoding.")
 
 
 def normalize_spaces(text: str) -> str:

kreuzberg-1.4.0/kreuzberg/_tesseract.py

@@ -0,0 +1,318 @@
+from __future__ import annotations
+
+import re
+import subprocess
+from asyncio import gather
+from enum import Enum
+from os import PathLike
+from tempfile import NamedTemporaryFile
+from typing import Any, Literal, TypeVar, Union
+
+from anyio import Path as AsyncPath
+from PIL.Image import Image
+
+from kreuzberg._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError, OCRError
+
+version_ref = {"checked": False}
+
+T = TypeVar("T", bound=Union[Image, PathLike[str], str])
+
+SupportedLanguages = Literal[
+    "afr",
+    "amh",
+    "ara",
+    "asm",
+    "aze",
+    "aze_cyrl",
+    "bel",
+    "ben",
+    "bod",
+    "bos",
+    "bre",
+    "bul",
+    "cat",
+    "ceb",
+    "ces",
+    "chi_sim",
+    "chi_tra",
+    "chr",
+    "cos",
+    "cym",
+    "dan",
+    "dan_frak",
+    "deu",
+    "deu_frak",
+    "deu_latf",
+    "dzo",
+    "ell",
+    "eng",
+    "enm",
+    "epo",
+    "equ",
+    "est",
+    "eus",
+    "fao",
+    "fas",
+    "fil",
+    "fin",
+    "fra",
+    "frk",
+    "frm",
+    "fry",
+    "gla",
+    "gle",
+    "glg",
+    "grc",
+    "guj",
+    "hat",
+    "heb",
+    "hin",
+    "hrv",
+    "hun",
+    "hye",
+    "iku",
+    "ind",
+    "isl",
+    "ita",
+    "ita_old",
+    "jav",
+    "jpn",
+    "kan",
+    "kat",
+    "kat_old",
+    "kaz",
+    "khm",
+    "kir",
+    "kmr",
+    "kor",
+    "kor_vert",
+    "kur",
+    "lao",
+    "lat",
+    "lav",
+    "lit",
+    "ltz",
+    "mal",
+    "mar",
+    "mkd",
+    "mlt",
+    "mon",
+    "mri",
+    "msa",
+    "mya",
+    "nep",
+    "nld",
+    "nor",
+    "oci",
+    "ori",
+    "osd",
+    "pan",
+    "pol",
+    "por",
+    "pus",
+    "que",
+    "ron",
+    "rus",
+    "san",
+    "sin",
+    "slk",
+    "slk_frak",
+    "slv",
+    "snd",
+    "spa",
+    "spa_old",
+    "sqi",
+    "srp",
+    "srp_latn",
+    "sun",
+    "swa",
+    "swe",
+    "syr",
+    "tam",
+    "tat",
+    "tel",
+    "tgk",
+    "tgl",
+    "tha",
+    "tir",
+    "ton",
+    "tur",
+    "uig",
+    "ukr",
+    "urd",
+    "uzb",
+    "uzb_cyrl",
+    "vie",
+    "yid",
+    "yor",
+]
+
+
+class PSMMode(Enum):
+    """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
+
+    OSD_ONLY = 0
+    """Orientation and script detection only."""
+    AUTO_OSD = 1
+    """Automatic page segmentation with orientation and script detection."""
+    AUTO_ONLY = 2
+    """Automatic page segmentation without OSD."""
+    AUTO = 3
+    """Fully automatic page segmentation (default)."""
+    SINGLE_COLUMN = 4
+    """Assume a single column of text."""
+    SINGLE_BLOCK_VERTICAL = 5
+    """Assume a single uniform block of vertically aligned text."""
+    SINGLE_BLOCK = 6
+    """Assume a single uniform block of text."""
+    SINGLE_LINE = 7
+    """Treat the image as a single text line."""
+    SINGLE_WORD = 8
+    """Treat the image as a single word."""
+    CIRCLE_WORD = 9
+    """Treat the image as a single word in a circle."""
+    SINGLE_CHAR = 10
+    """Treat the image as a single character."""
+
+
+async def validate_tesseract_version() -> None:
+    """Validate that Tesseract is installed and is version 5 or above.
+
+    Raises:
+        MissingDependencyError: If Tesseract is not installed or is below version 5.
+    """
+    try:
+        if version_ref["checked"]:
+            return
+
+        result = await run_sync(subprocess.run, ["tesseract", "--version"], capture_output=True)
+        version_match = re.search(r"tesseract\s+(\d+)", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < 5:
+            raise MissingDependencyError("Tesseract version 5 or above is required.")
+
+        version_ref["checked"] = True
+    except FileNotFoundError as e:
+        raise MissingDependencyError("Tesseract is not installed.") from e
+
+
+async def process_file(
+    input_file: str | PathLike[str], *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any
+) -> str:
+    """Process a single image file using Tesseract OCR.
+
+    Args:
+        input_file: The path to the image file to process.
+        language: The language code for OCR.
+        psm: Page segmentation mode.
+        **kwargs: Additional Tesseract configuration options as key-value pairs.
+
+    Raises:
+        OCRError: If OCR fails to extract text from the image.
+
+    Returns:
+        str: Extracted text from the image.
+    """
+    with NamedTemporaryFile(suffix=".txt") as output_file:
+        # this is needed because tesseract adds .txt to the output file
+        output_file_name = output_file.name.replace(".txt", "")
+        try:
+            command = [
+                "tesseract",
+                str(input_file),
+                output_file_name,
+                "-l",
+                language,
+                "--psm",
+                str(psm.value),
+            ]
+
+            for key, value in kwargs.items():
+                command.extend(["-c", f"{key}={value}"])
+
+            result = await run_sync(
+                subprocess.run,
+                command,
+                capture_output=True,
+            )
+
+            if not result.returncode == 0:
+                raise OCRError("OCR failed with a non-0 return code.")
+
+            output = await AsyncPath(output_file.name).read_text()
+            return output.strip()
+        except (RuntimeError, OSError) as e:
+            raise OCRError("Failed to OCR using tesseract") from e
+
+
+async def process_image(image: Image, *, language: SupportedLanguages, psm: PSMMode, **kwargs: Any) -> str:
+    """Process a single Pillow Image using Tesseract OCR.
+
+    Args:
+        image: The Pillow Image to process.
+        language: The language code for OCR.
+        psm: Page segmentation mode.
+        **kwargs: Additional Tesseract configuration options as key-value pairs.
+
+    Returns:
+        str: Extracted text from the image.
+    """
+    with NamedTemporaryFile(suffix=".png") as image_file:
+        await run_sync(image.save, image_file.name, format="PNG")
+        return await process_file(image_file.name, language=language, psm=psm, **kwargs)
+
+
+async def process_image_with_tesseract(
+    image: Image | PathLike[str] | str,
+    *,
+    language: SupportedLanguages = "eng",
+    psm: PSMMode = PSMMode.AUTO,
+    **kwargs: Any,
+) -> str:
+    """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
+
+    Args:
+        image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
+        language: The language code for OCR (default: "eng").
+        psm: Page segmentation mode (default: PSMMode.AUTO).
+        **kwargs: Additional Tesseract configuration options as key-value pairs.
+
+    Raises:
+        ValueError: If the input is not a Pillow Image or a list of Pillow Images.
+
+    Returns:
+        Extracted text as a string
+    """
+    await validate_tesseract_version()
+
+    if isinstance(image, Image):
+        return await process_image(image, language=language, psm=psm, **kwargs)
+
+    if isinstance(image, (PathLike, str)):
+        return await process_file(image, language=language, psm=psm, **kwargs)
+
+    raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")
+
+
+async def batch_process_images(
+    images: list[T],
+    *,
+    language: SupportedLanguages = "eng",
+    psm: PSMMode = PSMMode.AUTO,
+    **kwargs: Any,
+) -> list[str]:
+    """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.
+
+    Args:
+        images: A list of Pillow Images, paths or strings to process.
+        language: The language code for OCR (default: "eng").
+        psm: Page segmentation mode (default: PSMMode.AUTO).
+        **kwargs: Additional Tesseract configuration options as key-value pairs.
+
+    Returns:
+        Extracted text as a string (for single image) or a list of strings (for multiple images).
+    """
+    await validate_tesseract_version()
+    return await gather(
+        *[process_image_with_tesseract(image, language=language, psm=psm, **kwargs) for image in images]
+    )

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/exceptions.py

@@ -10,7 +10,7 @@ class KreuzbergError(Exception):
     context: Any
     """The context of the error."""
 
-    def __init__(self, message: str, context: Any = None) -> None:
+    def __init__(self, message: str, *, context: Any = None) -> None:
         self.context = context
         super().__init__(message)
 
@@ -27,3 +27,11 @@ class ParsingError(KreuzbergError):
 
 class ValidationError(KreuzbergError):
     """Raised when a validation error occurs."""
+
+
+class MissingDependencyError(KreuzbergError):
+    """Raised when a dependency is missing."""
+
+
+class OCRError(KreuzbergError):
+    """Raised when an OCR error occurs."""

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg/extraction.py

@@ -17,13 +17,14 @@ from typing import NamedTuple
 from anyio import Path as AsyncPath
 
 from kreuzberg._extractors import (
-
-
-
-
-
+    extract_content_with_pandoc,
+    extract_file_with_pandoc,
+    extract_html_string,
+    extract_pdf_file,
+    extract_pptx_file,
 )
 from kreuzberg._mime_types import (
+    HTML_MIME_TYPE,
     IMAGE_MIME_TYPE_EXT_MAP,
     IMAGE_MIME_TYPES,
     MARKDOWN_MIME_TYPE,
@@ -34,6 +35,7 @@ from kreuzberg._mime_types import (
     SUPPORTED_MIME_TYPES,
 )
 from kreuzberg._string import safe_decode
+from kreuzberg._tesseract import process_image_with_tesseract
 from kreuzberg.exceptions import ValidationError
 
 
@@ -70,25 +72,28 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
         with NamedTemporaryFile(suffix=".pdf") as temp_file:
             temp_file.write(content)
             return ExtractionResult(
-                content=await
+                content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
             )
 
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
         with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
             temp_file.write(content)
             return ExtractionResult(
-                content=await
+                content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
             )
 
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
         return ExtractionResult(
-            content=await
+            content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
         )
 
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
-        return ExtractionResult(content=await
+        return ExtractionResult(content=await extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
+
+    if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
+        return ExtractionResult(content=await extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
 
     return ExtractionResult(
         content=safe_decode(content),
@@ -127,19 +132,22 @@ async def extract_file(
         raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
 
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await
+        return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return ExtractionResult(content=await
+        return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
 
     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
         return ExtractionResult(
-            content=await
+            content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
         )
 
     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
-        return ExtractionResult(content=await
+        return ExtractionResult(content=await extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
+
+    if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
+        return ExtractionResult(content=await extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
 
     return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/kreuzberg.egg-info/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.2.0
+Version: 1.4.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -25,11 +25,10 @@ Description-Content-Type: text/markdown
 License-File: LICENSE
 Requires-Dist: anyio>=4.8.0
 Requires-Dist: charset-normalizer>=3.4.1
+Requires-Dist: html-to-markdown>=1.2.0
 Requires-Dist: pypandoc>=1.15
 Requires-Dist: pypdfium2>=4.30.1
-Requires-Dist: pytesseract>=0.3.13
 Requires-Dist: python-pptx>=1.0.2
-Requires-Dist: typing-extensions>=4.12.2
 
 # Kreuzberg
 
@@ -38,7 +37,7 @@ extraction.
 
 Why?
 
-I am building, like many do now, a RAG focused service. I have text extraction needs.
+I am building, like many do now, a RAG focused service (checkout https://grantflow.ai). I have text extraction needs.
 There are quite a lot of commercial options out there, and several open-source + paid options.
 But I wanted something simple, which does not require expansive round-trips to an external API.
 Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
@@ -76,17 +75,32 @@ polished and well maintained.
 
 - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
 - Images are processed using Tesseract OCR
-- Office documents and other formats are processed using Pandoc
+- Office documents and other formats are processed using Pandoc
+- PPTX files are converted using python-pptx
+- HTML files are converted using html-to-markdown
 - Plain text files are read directly with appropriate encoding detection
 
 ### Roadmap
 
-
-[] - html file text extraction
-[] - better PDF table extraction
-[] - metadata extraction
+V1:
 
-
+- [x] - html file text extraction
+- [ ] - better PDF table extraction
+- [ ] - TBD
+
+V2:
+
+- [ ] - extra install groups (to make dependencies optional)
+- [ ] - metadata extraction (possible breaking change)
+- [ ] - TBD
+
+### Feature Requests
+
+Feel free to open a discussion in GitHub or an issue if you have any feature requests
+
+### Contribution
+
+Is welcome! Read guidelines below.
 
 ## Supported File Types
 
@@ -116,6 +130,7 @@ Kreuzberg supports a wide range of file formats:
 
 #### Text and Markup Formats
 
+- HTML (`.html`, `.htm`)
 - Plain Text (`.txt`)
 - Markdown (`.md`)
 - reStructuredText (`.rst`)

{kreuzberg-1.2.0 → kreuzberg-1.4.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "kreuzberg"
-version = "1.2.0"
+version = "1.4.0"
 description = "A text extraction library supporting PDFs, images, office documents and more"
 readme = "README.md"
 keywords = [
@@ -42,27 +42,25 @@ classifiers = [
 dependencies = [
     "anyio>=4.8.0",
     "charset-normalizer>=3.4.1",
+    "html-to-markdown>=1.2.0",
     "pypandoc>=1.15",
     "pypdfium2>=4.30.1",
-    "pytesseract>=0.3.13",
     "python-pptx>=1.0.2",
-    "typing-extensions>=4.12.2",
 ]
-
 urls.homepage = "https://github.com/Goldziher/kreuzberg"
 
 [dependency-groups]
 dev = [
     "covdefaults>=2.3.0",
-    "mypy>=1.
+    "mypy>=1.15.0",
     "pre-commit>=4.1.0",
     "pytest>=8.3.4",
-    "pytest-asyncio>=0.25.
+    "pytest-asyncio>=0.25.3",
     "pytest-cov>=6.0.0",
     "pytest-mock>=3.14.0",
     "pytest-timeout>=2.3.1",
     "python-dotenv>=1.0.1",
-    "ruff>=0.9.
+    "ruff>=0.9.5",
 ]
 
 [tool.setuptools.packages.find]
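
With pytesseract gone from the dependency list, Tesseract becomes a system requirement rather than a Python one; kreuzberg itself validates version 5+ lazily in `_tesseract.py`. A defensive pre-flight check of my own (not part of kreuzberg) that an application could run before touching the OCR paths:

```python
# Pre-flight check sketch (my own helper, not part of kreuzberg): verify that a
# tesseract binary of at least major version 5 is available on the PATH.
import shutil
import subprocess


def tesseract_available(min_major: int = 5) -> bool:
    exe = shutil.which("tesseract")
    if exe is None:
        return False
    out = subprocess.run([exe, "--version"], capture_output=True, text=True)
    banner = (out.stdout or out.stderr).splitlines()
    # The first line normally looks like "tesseract 5.3.4".
    try:
        major = int(banner[0].split()[1].split(".")[0])
    except (IndexError, ValueError):
        return False
    return major >= min_major


print("tesseract ok:", tesseract_available())
```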