kreuzberg 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +19 -1
- kreuzberg/_mime_types.py +10 -5
- kreuzberg/extraction.py +8 -0
- {kreuzberg-1.2.0.dist-info → kreuzberg-1.3.0.dist-info}/METADATA +25 -8
- kreuzberg-1.3.0.dist-info/RECORD +13 -0
- kreuzberg-1.2.0.dist-info/RECORD +0 -13
- {kreuzberg-1.2.0.dist-info → kreuzberg-1.3.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.2.0.dist-info → kreuzberg-1.3.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.2.0.dist-info → kreuzberg-1.3.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, cast
|
|
8
8
|
|
9
9
|
from anyio import Path as AsyncPath
|
10
10
|
from charset_normalizer import detect
|
11
|
+
from html_to_markdown import convert_to_markdown
|
11
12
|
from pptx import Presentation
|
12
13
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
13
14
|
from pypandoc import convert_file, convert_text
|
@@ -15,7 +16,7 @@ from pypdfium2 import PdfDocument, PdfiumError
|
|
15
16
|
from pytesseract import TesseractError, image_to_string
|
16
17
|
|
17
18
|
from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
|
18
|
-
from kreuzberg._string import normalize_spaces
|
19
|
+
from kreuzberg._string import normalize_spaces, safe_decode
|
19
20
|
from kreuzberg._sync import run_sync
|
20
21
|
from kreuzberg.exceptions import ParsingError
|
21
22
|
|
@@ -227,3 +228,20 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
227
228
|
md_content = md_content.strip()
|
228
229
|
|
229
230
|
return normalize_spaces(md_content)
|
231
|
+
|
232
|
+
|
233
|
+
async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
234
|
+
"""Extract text from an HTML string.
|
235
|
+
|
236
|
+
Args:
|
237
|
+
file_path_or_contents: The HTML content.
|
238
|
+
|
239
|
+
Returns:
|
240
|
+
The extracted text content.
|
241
|
+
"""
|
242
|
+
content = (
|
243
|
+
safe_decode(file_path_or_contents)
|
244
|
+
if isinstance(file_path_or_contents, bytes)
|
245
|
+
else await AsyncPath(file_path_or_contents).read_text()
|
246
|
+
)
|
247
|
+
return normalize_spaces(await run_sync(convert_to_markdown, content))
|
kreuzberg/_mime_types.py
CHANGED
@@ -5,10 +5,12 @@ from typing import TYPE_CHECKING, Final
|
|
5
5
|
if TYPE_CHECKING: # pragma: no cover
|
6
6
|
from collections.abc import Mapping
|
7
7
|
|
8
|
-
|
9
|
-
|
10
|
-
PDF_MIME_TYPE: Final
|
11
|
-
|
8
|
+
HTML_MIME_TYPE: Final = "text/html"
|
9
|
+
MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
10
|
+
PDF_MIME_TYPE: Final = "application/pdf"
|
11
|
+
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
|
+
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
13
|
+
|
12
14
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
13
15
|
|
14
16
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -93,5 +95,8 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
|
|
93
95
|
}
|
94
96
|
|
95
97
|
SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
96
|
-
PLAIN_TEXT_MIME_TYPES
|
98
|
+
PLAIN_TEXT_MIME_TYPES
|
99
|
+
| IMAGE_MIME_TYPES
|
100
|
+
| PANDOC_SUPPORTED_MIME_TYPES
|
101
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
97
102
|
)
|
kreuzberg/extraction.py
CHANGED
@@ -19,11 +19,13 @@ from anyio import Path as AsyncPath
|
|
19
19
|
from kreuzberg._extractors import (
|
20
20
|
_extract_content_with_pandoc,
|
21
21
|
_extract_file_with_pandoc,
|
22
|
+
_extract_html_string,
|
22
23
|
_extract_image_with_tesseract,
|
23
24
|
_extract_pdf_file,
|
24
25
|
_extract_pptx_file,
|
25
26
|
)
|
26
27
|
from kreuzberg._mime_types import (
|
28
|
+
HTML_MIME_TYPE,
|
27
29
|
IMAGE_MIME_TYPE_EXT_MAP,
|
28
30
|
IMAGE_MIME_TYPES,
|
29
31
|
MARKDOWN_MIME_TYPE,
|
@@ -90,6 +92,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
90
92
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
91
93
|
return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
92
94
|
|
95
|
+
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
96
|
+
return ExtractionResult(content=await _extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
|
97
|
+
|
93
98
|
return ExtractionResult(
|
94
99
|
content=safe_decode(content),
|
95
100
|
mime_type=mime_type,
|
@@ -142,4 +147,7 @@ async def extract_file(
|
|
142
147
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
143
148
|
return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
144
149
|
|
150
|
+
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
151
|
+
return ExtractionResult(content=await _extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
152
|
+
|
145
153
|
return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.2.0
|
3
|
+
Version: 1.3.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -25,6 +25,7 @@ Description-Content-Type: text/markdown
|
|
25
25
|
License-File: LICENSE
|
26
26
|
Requires-Dist: anyio>=4.8.0
|
27
27
|
Requires-Dist: charset-normalizer>=3.4.1
|
28
|
+
Requires-Dist: html-to-markdown>=1.2.0
|
28
29
|
Requires-Dist: pypandoc>=1.15
|
29
30
|
Requires-Dist: pypdfium2>=4.30.1
|
30
31
|
Requires-Dist: pytesseract>=0.3.13
|
@@ -38,7 +39,7 @@ extraction.
|
|
38
39
|
|
39
40
|
Why?
|
40
41
|
|
41
|
-
I am building, like many do now, a RAG focused service. I have text extraction needs.
|
42
|
+
I am building, like many do now, a RAG-focused service (check out https://grantflow.ai). I have text extraction needs.
|
42
43
|
There are quite a lot of commercial options out there, and several open-source + paid options.
|
43
44
|
But I wanted something simple, which does not require expensive round-trips to an external API.
|
44
45
|
Furthermore, I wanted something that is easy to run locally, isn't very heavy, and doesn't require a GPU.
|
@@ -76,17 +77,32 @@ polished and well maintained.
|
|
76
77
|
|
77
78
|
- PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
|
78
79
|
- Images are processed using Tesseract OCR
|
79
|
-
- Office documents and other formats are processed using Pandoc
|
80
|
+
- Office documents and other formats are processed using Pandoc
|
81
|
+
- PPTX files are converted using python-pptx
|
82
|
+
- HTML files are converted using html-to-markdown
|
80
83
|
- Plain text files are read directly with appropriate encoding detection
|
81
84
|
|
82
85
|
### Roadmap
|
83
86
|
|
84
|
-
|
85
|
-
[] - html file text extraction
|
86
|
-
[] - better PDF table extraction
|
87
|
-
[] - metadata extraction
|
87
|
+
V1:
|
88
88
|
|
89
|
-
|
89
|
+
- [x] - html file text extraction
|
90
|
+
- [ ] - better PDF table extraction
|
91
|
+
- [ ] - TBD
|
92
|
+
|
93
|
+
V2:
|
94
|
+
|
95
|
+
- [ ] - extra install groups (to make dependencies optional)
|
96
|
+
- [ ] - metadata extraction (possible breaking change)
|
97
|
+
- [ ] - TBD
|
98
|
+
|
99
|
+
### Feature Requests
|
100
|
+
|
101
|
+
Feel free to open a discussion or an issue on GitHub if you have any feature requests.
|
102
|
+
|
103
|
+
### Contribution
|
104
|
+
|
105
|
+
Contributions are welcome! Read the guidelines below.
|
90
106
|
|
91
107
|
## Supported File Types
|
92
108
|
|
@@ -116,6 +132,7 @@ Kreuzberg supports a wide range of file formats:
|
|
116
132
|
|
117
133
|
#### Text and Markup Formats
|
118
134
|
|
135
|
+
- HTML (`.html`, `.htm`)
|
119
136
|
- Plain Text (`.txt`)
|
120
137
|
- Markdown (`.md`)
|
121
138
|
- reStructuredText (`.rst`)
|
@@ -0,0 +1,13 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
+
kreuzberg/_extractors.py,sha256=eiWPpjnZOZFDwlQL4XsgavJEWqxGtzLVvS8YU28RBAo,8095
|
3
|
+
kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
|
4
|
+
kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
|
5
|
+
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
6
|
+
kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
|
7
|
+
kreuzberg/extraction.py,sha256=cgX8uoCVXf-Va30g8T8DwrZUqsSPHIzmPfDgnWOqNNU,6148
|
8
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
+
kreuzberg-1.3.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
10
|
+
kreuzberg-1.3.0.dist-info/METADATA,sha256=3wiaAuaiA865lg5oCjwlAKaZqRQn1w8VqaQXeoEdip4,8579
|
11
|
+
kreuzberg-1.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
12
|
+
kreuzberg-1.3.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
13
|
+
kreuzberg-1.3.0.dist-info/RECORD,,
|
kreuzberg-1.2.0.dist-info/RECORD
DELETED
@@ -1,13 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
-
kreuzberg/_extractors.py,sha256=eZ12O7Ii2NRba-dDPIino_eKApCihfdxSPZP121D3xA,7541
|
3
|
-
kreuzberg/_mime_types.py,sha256=oJc4Qc2RkfZAYoCmxuuJ4S_Mo9-QQ0c4wwy0ZBqMRoA,2873
|
4
|
-
kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
|
5
|
-
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
6
|
-
kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
|
7
|
-
kreuzberg/extraction.py,sha256=j5p-JCnyGMouRp4qD-1qKSV_cw8DQ9QSo1H2ocwbbqA,5732
|
8
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
9
|
-
kreuzberg-1.2.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
10
|
-
kreuzberg-1.2.0.dist-info/METADATA,sha256=rTuoCAAk9mYh7f55bLSQ9CfEPhh3BnqNHwxdruth1P8,8330
|
11
|
-
kreuzberg-1.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
12
|
-
kreuzberg-1.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
13
|
-
kreuzberg-1.2.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|