kreuzberg 1.0.0__tar.gz → 1.2.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/PKG-INFO +46 -16
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/README.md +43 -14
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/_extractors.py +94 -10
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/_mime_types.py +3 -3
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/_string.py +19 -7
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/_sync.py +1 -1
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/extraction.py +25 -4
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg.egg-info/PKG-INFO +46 -16
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg.egg-info/requires.txt +1 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/pyproject.toml +3 -4
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/LICENSE +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/__init__.py +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-1.0.0 → kreuzberg-1.2.0}/setup.cfg +0 -0
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.2.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords:
|
8
|
+
Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
10
10
|
Classifier: Intended Audience :: Developers
|
11
11
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -28,6 +28,7 @@ Requires-Dist: charset-normalizer>=3.4.1
|
|
28
28
|
Requires-Dist: pypandoc>=1.15
|
29
29
|
Requires-Dist: pypdfium2>=4.30.1
|
30
30
|
Requires-Dist: pytesseract>=0.3.13
|
31
|
+
Requires-Dist: python-pptx>=1.0.2
|
31
32
|
Requires-Dist: typing-extensions>=4.12.2
|
32
33
|
|
33
34
|
# Kreuzberg
|
@@ -46,7 +47,7 @@ Hence, this library.
|
|
46
47
|
|
47
48
|
## Features
|
48
49
|
|
49
|
-
- Extract text from PDFs, images, and
|
50
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
51
|
- Use modern Python with async (via `anyio`) and proper type hints
|
51
52
|
- Extensive error handling for easy debugging
|
52
53
|
|
@@ -65,6 +66,28 @@ Hence, this library.
|
|
65
66
|
- [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
|
66
67
|
- [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
|
67
68
|
|
69
|
+
## Dependencies and Philosophy
|
70
|
+
|
71
|
+
This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
|
72
|
+
high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
|
73
|
+
polished and well maintained.
|
74
|
+
|
75
|
+
### Dependencies
|
76
|
+
|
77
|
+
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
78
|
+
- Images are processed using Tesseract OCR
|
79
|
+
- Office documents and other formats are processed using Pandoc, or python-pptx for PPTX files
|
80
|
+
- Plain text files are read directly with appropriate encoding detection
|
81
|
+
|
82
|
+
### Roadmap
|
83
|
+
|
84
|
+
[] - extra install groups (to make dependencies optional and offer alternatives)
|
85
|
+
[] - html file text extraction
|
86
|
+
[] - better PDF table extraction
|
87
|
+
[] - metadata extraction
|
88
|
+
|
89
|
+
Feel free to open a discussion in GitHub or an issue if you have any feature requests, but keep the philosophy part in mind
|
90
|
+
|
68
91
|
## Supported File Types
|
69
92
|
|
70
93
|
Kreuzberg supports a wide range of file formats:
|
@@ -72,7 +95,8 @@ Kreuzberg supports a wide range of file formats:
|
|
72
95
|
### Document Formats
|
73
96
|
|
74
97
|
- PDF (`.pdf`) - both searchable and scanned documents
|
75
|
-
- Word Documents (`.docx`)
|
98
|
+
- Word Documents (`.docx`, `.doc`)
|
99
|
+
- PowerPoint Presentations (`.pptx`)
|
76
100
|
- OpenDocument Text (`.odt`)
|
77
101
|
- Rich Text Format (`.rtf`)
|
78
102
|
|
@@ -102,13 +126,6 @@ Kreuzberg supports a wide range of file formats:
|
|
102
126
|
- Comma-Separated Values (`.csv`)
|
103
127
|
- Tab-Separated Values (`.tsv`)
|
104
128
|
|
105
|
-
All formats support text extraction, with different processing methods:
|
106
|
-
|
107
|
-
- PDFs are processed using pdfium2 for searchable PDFs and Tesseract OCR for scanned documents
|
108
|
-
- Images are processed using Tesseract OCR
|
109
|
-
- Office documents and other formats are processed using Pandoc
|
110
|
-
- Plain text files are read directly with appropriate encoding detection
|
111
|
-
|
112
129
|
## Usage
|
113
130
|
|
114
131
|
Kreuzberg exports two async functions:
|
@@ -116,8 +133,6 @@ Kreuzberg exports two async functions:
|
|
116
133
|
- Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
|
117
134
|
- Extract text from a byte-string using `extract_bytes()`
|
118
135
|
|
119
|
-
Note - both of these functions are async and therefore should be used in an async context.
|
120
|
-
|
121
136
|
### Extract from File
|
122
137
|
|
123
138
|
```python
|
@@ -164,6 +179,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
164
179
|
return result.content
|
165
180
|
```
|
166
181
|
|
182
|
+
### Forcing OCR
|
183
|
+
|
184
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
|
185
|
+
You can do this by passing `force_ocr=True`:
|
186
|
+
|
187
|
+
```python
|
188
|
+
from kreuzberg import extract_bytes
|
189
|
+
|
190
|
+
|
191
|
+
# Extract text from PDF bytes and force OCR
|
192
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
193
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
194
|
+
return result.content
|
195
|
+
```
|
196
|
+
|
167
197
|
### Error Handling
|
168
198
|
|
169
199
|
Kreuzberg raises two exception types:
|
@@ -173,8 +203,8 @@ Kreuzberg raises two exception types:
|
|
173
203
|
Raised when there are issues with input validation:
|
174
204
|
|
175
205
|
- Unsupported mime types
|
176
|
-
- Non-existent files
|
177
206
|
- Undetectable mime types
|
207
|
+
- Path doesn't point at an existing file
|
178
208
|
|
179
209
|
#### ParsingError
|
180
210
|
|
@@ -218,8 +248,8 @@ except ParsingError as e:
|
|
218
248
|
|
219
249
|
All extraction functions return an ExtractionResult named tuple containing:
|
220
250
|
|
221
|
-
- content
|
222
|
-
- mime_type
|
251
|
+
- `content`: The extracted text as a string
|
252
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
223
253
|
|
224
254
|
```python
|
225
255
|
from kreuzberg import ExtractionResult
|
@@ -14,7 +14,7 @@ Hence, this library.
|
|
14
14
|
|
15
15
|
## Features
|
16
16
|
|
17
|
-
- Extract text from PDFs, images, and
|
17
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
18
18
|
- Use modern Python with async (via `anyio`) and proper type hints
|
19
19
|
- Extensive error handling for easy debugging
|
20
20
|
|
@@ -33,6 +33,28 @@ Hence, this library.
|
|
33
33
|
- [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
|
34
34
|
- [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
|
35
35
|
|
36
|
+
## Dependencies and Philosophy
|
37
|
+
|
38
|
+
This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
|
39
|
+
high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
|
40
|
+
polished and well maintained.
|
41
|
+
|
42
|
+
### Dependencies
|
43
|
+
|
44
|
+
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
45
|
+
- Images are processed using Tesseract OCR
|
46
|
+
- Office documents and other formats are processed using Pandoc, or python-pptx for PPTX files
|
47
|
+
- Plain text files are read directly with appropriate encoding detection
|
48
|
+
|
49
|
+
### Roadmap
|
50
|
+
|
51
|
+
[] - extra install groups (to make dependencies optional and offer alternatives)
|
52
|
+
[] - html file text extraction
|
53
|
+
[] - better PDF table extraction
|
54
|
+
[] - metadata extraction
|
55
|
+
|
56
|
+
Feel free to open a discussion in GitHub or an issue if you have any feature requests, but keep the philosophy part in mind
|
57
|
+
|
36
58
|
## Supported File Types
|
37
59
|
|
38
60
|
Kreuzberg supports a wide range of file formats:
|
@@ -40,7 +62,8 @@ Kreuzberg supports a wide range of file formats:
|
|
40
62
|
### Document Formats
|
41
63
|
|
42
64
|
- PDF (`.pdf`) - both searchable and scanned documents
|
43
|
-
- Word Documents (`.docx`)
|
65
|
+
- Word Documents (`.docx`, `.doc`)
|
66
|
+
- PowerPoint Presentations (`.pptx`)
|
44
67
|
- OpenDocument Text (`.odt`)
|
45
68
|
- Rich Text Format (`.rtf`)
|
46
69
|
|
@@ -70,13 +93,6 @@ Kreuzberg supports a wide range of file formats:
|
|
70
93
|
- Comma-Separated Values (`.csv`)
|
71
94
|
- Tab-Separated Values (`.tsv`)
|
72
95
|
|
73
|
-
All formats support text extraction, with different processing methods:
|
74
|
-
|
75
|
-
- PDFs are processed using pdfium2 for searchable PDFs and Tesseract OCR for scanned documents
|
76
|
-
- Images are processed using Tesseract OCR
|
77
|
-
- Office documents and other formats are processed using Pandoc
|
78
|
-
- Plain text files are read directly with appropriate encoding detection
|
79
|
-
|
80
96
|
## Usage
|
81
97
|
|
82
98
|
Kreuzberg exports two async functions:
|
@@ -84,8 +100,6 @@ Kreuzberg exports two async functions:
|
|
84
100
|
- Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
|
85
101
|
- Extract text from a byte-string using `extract_bytes()`
|
86
102
|
|
87
|
-
Note - both of these functions are async and therefore should be used in an async context.
|
88
|
-
|
89
103
|
### Extract from File
|
90
104
|
|
91
105
|
```python
|
@@ -132,6 +146,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
132
146
|
return result.content
|
133
147
|
```
|
134
148
|
|
149
|
+
### Forcing OCR
|
150
|
+
|
151
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
|
152
|
+
You can do this by passing `force_ocr=True`:
|
153
|
+
|
154
|
+
```python
|
155
|
+
from kreuzberg import extract_bytes
|
156
|
+
|
157
|
+
|
158
|
+
# Extract text from PDF bytes and force OCR
|
159
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
160
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
161
|
+
return result.content
|
162
|
+
```
|
163
|
+
|
135
164
|
### Error Handling
|
136
165
|
|
137
166
|
Kreuzberg raises two exception types:
|
@@ -141,8 +170,8 @@ Kreuzberg raises two exception types:
|
|
141
170
|
Raised when there are issues with input validation:
|
142
171
|
|
143
172
|
- Unsupported mime types
|
144
|
-
- Non-existent files
|
145
173
|
- Undetectable mime types
|
174
|
+
- Path doesn't point at an existing file
|
146
175
|
|
147
176
|
#### ParsingError
|
148
177
|
|
@@ -186,8 +215,8 @@ except ParsingError as e:
|
|
186
215
|
|
187
216
|
All extraction functions return an ExtractionResult named tuple containing:
|
188
217
|
|
189
|
-
- content
|
190
|
-
- mime_type
|
218
|
+
- `content`: The extracted text as a string
|
219
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
191
220
|
|
192
221
|
```python
|
193
222
|
from kreuzberg import ExtractionResult
|
@@ -1,17 +1,25 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import re
|
4
|
+
from contextlib import suppress
|
5
|
+
from html import escape
|
6
|
+
from io import BytesIO
|
3
7
|
from typing import TYPE_CHECKING, cast
|
4
8
|
|
9
|
+
from anyio import Path as AsyncPath
|
5
10
|
from charset_normalizer import detect
|
11
|
+
from pptx import Presentation
|
12
|
+
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
6
13
|
from pypandoc import convert_file, convert_text
|
7
14
|
from pypdfium2 import PdfDocument, PdfiumError
|
8
15
|
from pytesseract import TesseractError, image_to_string
|
9
16
|
|
10
17
|
from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
|
18
|
+
from kreuzberg._string import normalize_spaces
|
11
19
|
from kreuzberg._sync import run_sync
|
12
20
|
from kreuzberg.exceptions import ParsingError
|
13
21
|
|
14
|
-
if TYPE_CHECKING:
|
22
|
+
if TYPE_CHECKING: # pragma: no cover
|
15
23
|
from pathlib import Path
|
16
24
|
|
17
25
|
|
@@ -33,8 +41,9 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
|
|
33
41
|
images = [page.render(scale=2.0).to_pil() for page in pdf]
|
34
42
|
|
35
43
|
text = "\n".join(image_to_string(img) for img in images)
|
36
|
-
return text
|
44
|
+
return normalize_spaces(text)
|
37
45
|
except (PdfiumError, TesseractError) as e:
|
46
|
+
# TODO: add test case
|
38
47
|
raise ParsingError(
|
39
48
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
40
49
|
) from e
|
@@ -55,26 +64,28 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
|
|
55
64
|
try:
|
56
65
|
document = PdfDocument(file_path)
|
57
66
|
text = "\n".join(page.get_textpage().get_text_range() for page in document)
|
58
|
-
return text
|
67
|
+
return normalize_spaces(text)
|
59
68
|
except PdfiumError as e:
|
69
|
+
# TODO: add test case
|
60
70
|
raise ParsingError(
|
61
71
|
"Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
|
62
72
|
) from e
|
63
73
|
|
64
74
|
|
65
|
-
async def _extract_pdf_file(file_path: Path) -> str:
|
75
|
+
async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
|
66
76
|
"""Extract text from a PDF file.
|
67
77
|
|
68
78
|
Args:
|
69
79
|
file_path: The path to the PDF file.
|
80
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
70
81
|
|
71
82
|
Returns:
|
72
83
|
The extracted text.
|
73
84
|
"""
|
74
|
-
if content := await run_sync(_extract_pdf_with_pdfium2, file_path):
|
75
|
-
return content
|
85
|
+
if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
|
86
|
+
return normalize_spaces(content)
|
76
87
|
|
77
|
-
return await run_sync(_extract_pdf_with_tesseract, file_path)
|
88
|
+
return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
|
78
89
|
|
79
90
|
|
80
91
|
async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
|
@@ -94,8 +105,11 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
|
|
94
105
|
ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
|
95
106
|
encoding = encoding or detect(file_data)["encoding"] or "utf-8"
|
96
107
|
try:
|
97
|
-
return
|
108
|
+
return normalize_spaces(
|
109
|
+
cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
|
110
|
+
)
|
98
111
|
except RuntimeError as e:
|
112
|
+
# TODO: add test case
|
99
113
|
raise ParsingError(
|
100
114
|
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
|
101
115
|
context={"error": str(e)},
|
@@ -117,7 +131,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
|
|
117
131
|
"""
|
118
132
|
ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
|
119
133
|
try:
|
120
|
-
return cast(str, await run_sync(convert_file, file_path, to="md", format=ext))
|
134
|
+
return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
|
121
135
|
except RuntimeError as e:
|
122
136
|
raise ParsingError(
|
123
137
|
f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
|
@@ -138,8 +152,78 @@ async def _extract_image_with_tesseract(file_path: Path | str) -> str:
|
|
138
152
|
The extracted content.
|
139
153
|
"""
|
140
154
|
try:
|
141
|
-
return cast(str, image_to_string(str(file_path))
|
155
|
+
return normalize_spaces(cast(str, image_to_string(str(file_path))))
|
142
156
|
except TesseractError as e:
|
143
157
|
raise ParsingError(
|
144
158
|
"Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
|
145
159
|
) from e
|
160
|
+
|
161
|
+
|
162
|
+
async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
163
|
+
"""Extract text from a PPTX file.
|
164
|
+
|
165
|
+
Notes:
|
166
|
+
This function is based on code vendored from `markitdown`, which has an MIT license as well.
|
167
|
+
|
168
|
+
Args:
|
169
|
+
file_path_or_contents: The path to the PPTX file or its contents as bytes.
|
170
|
+
|
171
|
+
Returns:
|
172
|
+
The extracted text content
|
173
|
+
"""
|
174
|
+
md_content = ""
|
175
|
+
file_contents = (
|
176
|
+
file_path_or_contents
|
177
|
+
if isinstance(file_path_or_contents, bytes)
|
178
|
+
else await AsyncPath(file_path_or_contents).read_bytes()
|
179
|
+
)
|
180
|
+
presentation = Presentation(BytesIO(file_contents))
|
181
|
+
|
182
|
+
for index, slide in enumerate(presentation.slides):
|
183
|
+
md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
|
184
|
+
|
185
|
+
title = slide.shapes.title
|
186
|
+
|
187
|
+
for shape in slide.shapes:
|
188
|
+
if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
|
189
|
+
shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
|
190
|
+
):
|
191
|
+
alt_text = ""
|
192
|
+
with suppress(AttributeError):
|
193
|
+
# access non-visual properties
|
194
|
+
alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
|
195
|
+
|
196
|
+
filename = re.sub(r"\W", "", shape.name) + ".jpg"
|
197
|
+
md_content += f"\n\n"
|
198
|
+
|
199
|
+
elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
|
200
|
+
html_table = "<table>"
|
201
|
+
first_row = True
|
202
|
+
|
203
|
+
for row in shape.table.rows:
|
204
|
+
html_table += "<tr>"
|
205
|
+
|
206
|
+
for cell in row.cells:
|
207
|
+
tag = "th" if first_row else "td"
|
208
|
+
html_table += f"<{tag}>{escape(cell.text)}</{tag}>"
|
209
|
+
|
210
|
+
html_table += "</tr>"
|
211
|
+
first_row = False
|
212
|
+
|
213
|
+
html_table += "</table>"
|
214
|
+
md_content += "\n" + html_table + "\n"
|
215
|
+
|
216
|
+
elif shape.has_text_frame:
|
217
|
+
md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
|
218
|
+
|
219
|
+
md_content = md_content.strip()
|
220
|
+
if slide.has_notes_slide:
|
221
|
+
md_content += "\n\n### Notes:\n"
|
222
|
+
notes_frame = slide.notes_slide.notes_text_frame
|
223
|
+
|
224
|
+
if notes_frame is not None:
|
225
|
+
md_content += notes_frame.text
|
226
|
+
|
227
|
+
md_content = md_content.strip()
|
228
|
+
|
229
|
+
return normalize_spaces(md_content)
|
@@ -2,13 +2,13 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, Final
|
4
4
|
|
5
|
-
if TYPE_CHECKING:
|
5
|
+
if TYPE_CHECKING: # pragma: no cover
|
6
6
|
from collections.abc import Mapping
|
7
7
|
|
8
8
|
MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"
|
9
9
|
PLAIN_TEXT_MIME_TYPE: Final[str] = "text/plain"
|
10
10
|
PDF_MIME_TYPE: Final[str] = "application/pdf"
|
11
|
-
|
11
|
+
POWER_POINT_MIME_TYPE: Final[str] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
12
12
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
13
13
|
|
14
14
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -93,5 +93,5 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
|
93
93
|
}
|
94
94
|
|
95
95
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
96
|
-
PLAIN_TEXT_MIME_TYPES | IMAGE_MIME_TYPES | PANDOC_SUPPORTED_MIME_TYPES | {PDF_MIME_TYPE}
|
96
|
+
PLAIN_TEXT_MIME_TYPES | IMAGE_MIME_TYPES | PANDOC_SUPPORTED_MIME_TYPES | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE}
|
97
97
|
)
|
@@ -1,5 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
from contextlib import suppress
|
4
|
+
|
3
5
|
from charset_normalizer import detect
|
4
6
|
|
5
7
|
|
@@ -16,20 +18,30 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
|
|
16
18
|
if not byte_data:
|
17
19
|
return ""
|
18
20
|
|
21
|
+
encodings = ["utf-8", "latin-1"]
|
22
|
+
|
19
23
|
if encoding:
|
20
|
-
|
24
|
+
with suppress(UnicodeDecodeError):
|
21
25
|
return byte_data.decode(encoding, errors="ignore")
|
22
|
-
except UnicodeDecodeError: # pragma: no cover
|
23
|
-
pass
|
24
26
|
|
25
|
-
encodings = ["utf-8", "latin-1"]
|
26
27
|
if encoding := detect(byte_data).get("encoding"):
|
27
28
|
encodings.append(encoding)
|
28
29
|
|
29
30
|
for encoding in encodings:
|
30
|
-
|
31
|
+
with suppress(UnicodeDecodeError):
|
31
32
|
return byte_data.decode(encoding, errors="ignore")
|
32
|
-
except UnicodeDecodeError: # pragma: no cover # noqa: PERF203
|
33
|
-
pass
|
34
33
|
|
34
|
+
# TODO: add test case
|
35
35
|
return byte_data.decode("latin-1", errors="replace")
|
36
|
+
|
37
|
+
|
38
|
+
def normalize_spaces(text: str) -> str:
|
39
|
+
"""Normalize the spaces in a string.
|
40
|
+
|
41
|
+
Args:
|
42
|
+
text: The text to sanitize.
|
43
|
+
|
44
|
+
Returns:
|
45
|
+
The sanitized text.
|
46
|
+
"""
|
47
|
+
return " ".join(text.strip().split())
|
@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, TypeVar, cast
|
|
6
6
|
from anyio.to_thread import run_sync as any_io_run_sync
|
7
7
|
from typing_extensions import ParamSpec
|
8
8
|
|
9
|
-
if TYPE_CHECKING:
|
9
|
+
if TYPE_CHECKING: # pragma: no cover
|
10
10
|
from collections.abc import Callable
|
11
11
|
|
12
12
|
T = TypeVar("T")
|
@@ -1,3 +1,12 @@
|
|
1
|
+
"""This module provides functions to extract textual content from files.
|
2
|
+
|
3
|
+
It includes vendored code:
|
4
|
+
|
5
|
+
- The extract PPTX logic is based on code vendored from `markitdown` to extract text from PPTX files.
|
6
|
+
See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
|
7
|
+
Refer to the markitdown repository for its license (MIT).
|
8
|
+
"""
|
9
|
+
|
1
10
|
from __future__ import annotations
|
2
11
|
|
3
12
|
from mimetypes import guess_type
|
@@ -12,6 +21,7 @@ from kreuzberg._extractors import (
|
|
12
21
|
_extract_file_with_pandoc,
|
13
22
|
_extract_image_with_tesseract,
|
14
23
|
_extract_pdf_file,
|
24
|
+
_extract_pptx_file,
|
15
25
|
)
|
16
26
|
from kreuzberg._mime_types import (
|
17
27
|
IMAGE_MIME_TYPE_EXT_MAP,
|
@@ -20,6 +30,7 @@ from kreuzberg._mime_types import (
|
|
20
30
|
PANDOC_SUPPORTED_MIME_TYPES,
|
21
31
|
PDF_MIME_TYPE,
|
22
32
|
PLAIN_TEXT_MIME_TYPE,
|
33
|
+
POWER_POINT_MIME_TYPE,
|
23
34
|
SUPPORTED_MIME_TYPES,
|
24
35
|
)
|
25
36
|
from kreuzberg._string import safe_decode
|
@@ -35,12 +46,13 @@ class ExtractionResult(NamedTuple):
|
|
35
46
|
"""The mime type of the content."""
|
36
47
|
|
37
48
|
|
38
|
-
async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
49
|
+
async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
|
39
50
|
"""Extract the textual content from a given byte string representing a file's contents.
|
40
51
|
|
41
52
|
Args:
|
42
53
|
content: The content to extract.
|
43
54
|
mime_type: The mime type of the content.
|
55
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
44
56
|
|
45
57
|
Raises:
|
46
58
|
ValidationError: If the mime type is not supported.
|
@@ -58,7 +70,7 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
58
70
|
with NamedTemporaryFile(suffix=".pdf") as temp_file:
|
59
71
|
temp_file.write(content)
|
60
72
|
return ExtractionResult(
|
61
|
-
content=await _extract_pdf_file(Path(temp_file.name)), mime_type=PLAIN_TEXT_MIME_TYPE
|
73
|
+
content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
62
74
|
)
|
63
75
|
|
64
76
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
@@ -75,18 +87,24 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
|
|
75
87
|
content=await _extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
76
88
|
)
|
77
89
|
|
90
|
+
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
91
|
+
return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
92
|
+
|
78
93
|
return ExtractionResult(
|
79
94
|
content=safe_decode(content),
|
80
95
|
mime_type=mime_type,
|
81
96
|
)
|
82
97
|
|
83
98
|
|
84
|
-
async def extract_file(
|
99
|
+
async def extract_file(
|
100
|
+
file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
|
101
|
+
) -> ExtractionResult:
|
85
102
|
"""Extract the textual content from a given file.
|
86
103
|
|
87
104
|
Args:
|
88
105
|
file_path: The path to the file.
|
89
106
|
mime_type: The mime type of the file.
|
107
|
+
force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
|
90
108
|
|
91
109
|
Raises:
|
92
110
|
ValidationError: If the mime type is not supported.
|
@@ -109,7 +127,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
|
|
109
127
|
raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
|
110
128
|
|
111
129
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
112
|
-
return ExtractionResult(content=await _extract_pdf_file(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
130
|
+
return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
113
131
|
|
114
132
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
115
133
|
return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
@@ -121,4 +139,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
|
|
121
139
|
content=await _extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
122
140
|
)
|
123
141
|
|
142
|
+
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
143
|
+
return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
144
|
+
|
124
145
|
return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.0.0
|
3
|
+
Version: 1.2.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords:
|
8
|
+
Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
10
10
|
Classifier: Intended Audience :: Developers
|
11
11
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -28,6 +28,7 @@ Requires-Dist: charset-normalizer>=3.4.1
|
|
28
28
|
Requires-Dist: pypandoc>=1.15
|
29
29
|
Requires-Dist: pypdfium2>=4.30.1
|
30
30
|
Requires-Dist: pytesseract>=0.3.13
|
31
|
+
Requires-Dist: python-pptx>=1.0.2
|
31
32
|
Requires-Dist: typing-extensions>=4.12.2
|
32
33
|
|
33
34
|
# Kreuzberg
|
@@ -46,7 +47,7 @@ Hence, this library.
|
|
46
47
|
|
47
48
|
## Features
|
48
49
|
|
49
|
-
- Extract text from PDFs, images, and
|
50
|
+
- Extract text from PDFs, images, office documents and more (see supported formats below)
|
50
51
|
- Use modern Python with async (via `anyio`) and proper type hints
|
51
52
|
- Extensive error handling for easy debugging
|
52
53
|
|
@@ -65,6 +66,28 @@ Hence, this library.
|
|
65
66
|
- [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
|
66
67
|
- [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
|
67
68
|
|
69
|
+
## Dependencies and Philosophy
|
70
|
+
|
71
|
+
This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
|
72
|
+
high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
|
73
|
+
polished and well maintained.
|
74
|
+
|
75
|
+
### Dependencies
|
76
|
+
|
77
|
+
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
78
|
+
- Images are processed using Tesseract OCR
|
79
|
+
- Office documents and other formats are processed using Pandoc, or python-pptx for PPTX files
|
80
|
+
- Plain text files are read directly with appropriate encoding detection
|
81
|
+
|
82
|
+
### Roadmap
|
83
|
+
|
84
|
+
[] - extra install groups (to make dependencies optional and offer alternatives)
|
85
|
+
[] - html file text extraction
|
86
|
+
[] - better PDF table extraction
|
87
|
+
[] - metadata extraction
|
88
|
+
|
89
|
+
Feel free to open a discussion in GitHub or an issue if you have any feature requests, but keep the philosophy part in mind
|
90
|
+
|
68
91
|
## Supported File Types
|
69
92
|
|
70
93
|
Kreuzberg supports a wide range of file formats:
|
@@ -72,7 +95,8 @@ Kreuzberg supports a wide range of file formats:
|
|
72
95
|
### Document Formats
|
73
96
|
|
74
97
|
- PDF (`.pdf`) - both searchable and scanned documents
|
75
|
-
- Word Documents (`.docx`)
|
98
|
+
- Word Documents (`.docx`, `.doc`)
|
99
|
+
- PowerPoint Presentations (`.pptx`)
|
76
100
|
- OpenDocument Text (`.odt`)
|
77
101
|
- Rich Text Format (`.rtf`)
|
78
102
|
|
@@ -102,13 +126,6 @@ Kreuzberg supports a wide range of file formats:
|
|
102
126
|
- Comma-Separated Values (`.csv`)
|
103
127
|
- Tab-Separated Values (`.tsv`)
|
104
128
|
|
105
|
-
All formats support text extraction, with different processing methods:
|
106
|
-
|
107
|
-
- PDFs are processed using pdfium2 for searchable PDFs and Tesseract OCR for scanned documents
|
108
|
-
- Images are processed using Tesseract OCR
|
109
|
-
- Office documents and other formats are processed using Pandoc
|
110
|
-
- Plain text files are read directly with appropriate encoding detection
|
111
|
-
|
112
129
|
## Usage
|
113
130
|
|
114
131
|
Kreuzberg exports two async functions:
|
@@ -116,8 +133,6 @@ Kreuzberg exports two async functions:
|
|
116
133
|
- Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
|
117
134
|
- Extract text from a byte-string using `extract_bytes()`
|
118
135
|
|
119
|
-
Note - both of these functions are async and therefore should be used in an async context.
|
120
|
-
|
121
136
|
### Extract from File
|
122
137
|
|
123
138
|
```python
|
@@ -164,6 +179,21 @@ async def process_uploaded_image(image_content: bytes):
|
|
164
179
|
return result.content
|
165
180
|
```
|
166
181
|
|
182
|
+
### Forcing OCR
|
183
|
+
|
184
|
+
When extracting a PDF file or bytes, you might want to force OCR - for example, if the PDF includes images that have text that should be extracted etc.
|
185
|
+
You can do this by passing `force_ocr=True`:
|
186
|
+
|
187
|
+
```python
|
188
|
+
from kreuzberg import extract_bytes
|
189
|
+
|
190
|
+
|
191
|
+
# Extract text from PDF bytes and force OCR
|
192
|
+
async def process_uploaded_pdf(pdf_content: bytes):
|
193
|
+
result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
|
194
|
+
return result.content
|
195
|
+
```
|
196
|
+
|
167
197
|
### Error Handling
|
168
198
|
|
169
199
|
Kreuzberg raises two exception types:
|
@@ -173,8 +203,8 @@ Kreuzberg raises two exception types:
|
|
173
203
|
Raised when there are issues with input validation:
|
174
204
|
|
175
205
|
- Unsupported mime types
|
176
|
-
- Non-existent files
|
177
206
|
- Undetectable mime types
|
207
|
+
- Path doesn't point at an existing file
|
178
208
|
|
179
209
|
#### ParsingError
|
180
210
|
|
@@ -218,8 +248,8 @@ except ParsingError as e:
|
|
218
248
|
|
219
249
|
All extraction functions return an ExtractionResult named tuple containing:
|
220
250
|
|
221
|
-
- content
|
222
|
-
- mime_type
|
251
|
+
- `content`: The extracted text as a string
|
252
|
+
- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
|
223
253
|
|
224
254
|
```python
|
225
255
|
from kreuzberg import ExtractionResult
|
@@ -1,10 +1,9 @@
|
|
1
1
|
[project]
|
2
2
|
name = "kreuzberg"
|
3
|
-
version = "1.0.0"
|
3
|
+
version = "1.2.0"
|
4
4
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
readme = "README.md"
|
6
6
|
keywords = [
|
7
|
-
"async",
|
8
7
|
"document-processing",
|
9
8
|
"docx",
|
10
9
|
"image-to-text",
|
@@ -17,7 +16,6 @@ keywords = [
|
|
17
16
|
"pdf",
|
18
17
|
"pdf-extraction",
|
19
18
|
"rag",
|
20
|
-
"tesseract",
|
21
19
|
"text-extraction",
|
22
20
|
"text-processing",
|
23
21
|
]
|
@@ -47,6 +45,7 @@ dependencies = [
|
|
47
45
|
"pypandoc>=1.15",
|
48
46
|
"pypdfium2>=4.30.1",
|
49
47
|
"pytesseract>=0.3.13",
|
48
|
+
"python-pptx>=1.0.2",
|
50
49
|
"typing-extensions>=4.12.2",
|
51
50
|
]
|
52
51
|
|
@@ -128,7 +127,7 @@ source = [ "kreuzberg" ]
|
|
128
127
|
|
129
128
|
[tool.coverage.report]
|
130
129
|
exclude_lines = [ 'if TYPE_CHECKING:' ]
|
131
|
-
fail_under =
|
130
|
+
fail_under = 90
|
132
131
|
|
133
132
|
[tool.mypy]
|
134
133
|
packages = [ "kreuzberg", "tests" ]
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|