kreuzberg 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/_extractors.py +37 -2
- kreuzberg/_mime_types.py +2 -2
- kreuzberg/extraction.py +8 -0
- {kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/METADATA +8 -9
- {kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/RECORD +8 -8
- {kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.5.0.dist-info → kreuzberg-1.6.0.dist-info}/top_level.txt +0 -0
kreuzberg/_extractors.py
CHANGED
@@ -4,6 +4,8 @@ import re
|
|
4
4
|
from contextlib import suppress
|
5
5
|
from html import escape
|
6
6
|
from io import BytesIO
|
7
|
+
from pathlib import Path
|
8
|
+
from tempfile import NamedTemporaryFile
|
7
9
|
from typing import TYPE_CHECKING
|
8
10
|
|
9
11
|
import html_to_markdown
|
@@ -11,6 +13,7 @@ import pptx
|
|
11
13
|
import pypdfium2
|
12
14
|
from anyio import Path as AsyncPath
|
13
15
|
from pptx.enum.shapes import MSO_SHAPE_TYPE
|
16
|
+
from xlsx2csv import Xlsx2csv
|
14
17
|
|
15
18
|
from kreuzberg._pandoc import process_content, process_file
|
16
19
|
from kreuzberg._string import normalize_spaces, safe_decode
|
@@ -19,8 +22,6 @@ from kreuzberg._tesseract import batch_process_images
|
|
19
22
|
from kreuzberg.exceptions import ParsingError
|
20
23
|
|
21
24
|
if TYPE_CHECKING: # pragma: no cover
|
22
|
-
from pathlib import Path
|
23
|
-
|
24
25
|
from PIL.Image import Image
|
25
26
|
|
26
27
|
|
@@ -195,6 +196,40 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
|
|
195
196
|
return normalize_spaces(md_content)
|
196
197
|
|
197
198
|
|
199
|
+
async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
    """Extract text from an XLSX file by converting it to CSV and then to markdown.

    The workbook is exported to a CSV file with ``xlsx2csv`` and that CSV is
    then handed to ``process_file`` with the ``text/csv`` mime type. The
    resulting content is whitespace-normalized before being returned.

    NOTE(review): ``Xlsx2csv.convert`` is called with its default arguments, so
    presumably only the library's default sheet is exported - confirm against
    the xlsx2csv documentation if multi-sheet workbooks must be supported.

    Args:
        file_path_or_contents: The path to the XLSX file or its contents as bytes.

    Returns:
        The extracted text content.

    Raises:
        ParsingError: If the XLSX file could not be parsed.
    """
    # Imported locally so the block does not depend on a module-level import
    # that the rest of the file does not otherwise need.
    from tempfile import TemporaryDirectory

    try:
        # Use closed files inside a scratch directory instead of open
        # NamedTemporaryFile handles: a named temporary file that is still open
        # cannot be reopened by name on Windows, and both xlsx2csv and the CSV
        # processing step open these files by path.
        with TemporaryDirectory() as temp_dir:
            scratch_dir = Path(temp_dir)

            if isinstance(file_path_or_contents, bytes):
                # Only raw bytes need to be materialized on disk; a caller
                # supplied path is read in place.
                xlsx_source = scratch_dir / "input.xlsx"
                xlsx_source.write_bytes(file_path_or_contents)
                xlsx_path = str(xlsx_source)
            else:
                xlsx_path = str(file_path_or_contents)

            csv_path = str(scratch_dir / "output.csv")

            await run_sync(Xlsx2csv(xlsx_path).convert, csv_path)
            result = await process_file(csv_path, mime_type="text/csv")
            return normalize_spaces(result.content)
    except Exception as e:
        # Any failure (unreadable workbook, conversion error, downstream
        # processing error) is surfaced as a single ParsingError with context.
        raise ParsingError(
            "Could not extract text from XLSX file",
            context={
                "error": str(e),
                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
            },
        ) from e
|
231
|
+
|
232
|
+
|
198
233
|
async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
|
199
234
|
"""Extract text from an HTML string.
|
200
235
|
|
kreuzberg/_mime_types.py
CHANGED
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
|
|
10
10
|
PDF_MIME_TYPE: Final = "application/pdf"
|
11
11
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
12
12
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
13
|
-
|
13
|
+
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
14
14
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
15
15
|
|
16
16
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
89
89
|
PLAIN_TEXT_MIME_TYPES
|
90
90
|
| IMAGE_MIME_TYPES
|
91
91
|
| PANDOC_SUPPORTED_MIME_TYPES
|
92
|
-
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
|
92
|
+
| {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
|
93
93
|
)
|
kreuzberg/extraction.py
CHANGED
@@ -22,8 +22,10 @@ from kreuzberg._extractors import (
|
|
22
22
|
extract_html_string,
|
23
23
|
extract_pdf_file,
|
24
24
|
extract_pptx_file,
|
25
|
+
extract_xlsx_file,
|
25
26
|
)
|
26
27
|
from kreuzberg._mime_types import (
|
28
|
+
EXCEL_MIME_TYPE,
|
27
29
|
HTML_MIME_TYPE,
|
28
30
|
IMAGE_MIME_TYPE_EXT_MAP,
|
29
31
|
IMAGE_MIME_TYPES,
|
@@ -75,6 +77,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
75
77
|
content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
|
76
78
|
)
|
77
79
|
|
80
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
81
|
+
return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
|
82
|
+
|
78
83
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
79
84
|
with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
|
80
85
|
temp_file.write(content)
|
@@ -134,6 +139,9 @@ async def extract_file(
|
|
134
139
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
135
140
|
return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
|
136
141
|
|
142
|
+
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
143
|
+
return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
|
144
|
+
|
137
145
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
138
146
|
return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
|
139
147
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 1.5.0
|
3
|
+
Version: 1.6.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -30,6 +30,7 @@ Requires-Dist: html-to-markdown>=1.2.0
|
|
30
30
|
Requires-Dist: pypdfium2>=4.30.1
|
31
31
|
Requires-Dist: python-pptx>=1.0.2
|
32
32
|
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
|
33
|
+
Requires-Dist: xlsx2csv>=0.8.4
|
33
34
|
|
34
35
|
# Kreuzberg
|
35
36
|
|
@@ -68,16 +69,12 @@ pip install kreuzberg
|
|
68
69
|
|
69
70
|
### 2. Install System Dependencies
|
70
71
|
|
71
|
-
Kreuzberg requires two
|
72
|
+
Kreuzberg requires two system level dependencies:
|
72
73
|
|
73
74
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
74
|
-
|
75
|
-
- GPL v2.0 licensed (used via CLI only)
|
76
|
-
- Handles office documents and markup formats
|
77
|
-
|
78
75
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
79
|
-
|
80
|
-
|
76
|
+
|
77
|
+
Please install these using their respective installation guides.
|
81
78
|
|
82
79
|
## Architecture
|
83
80
|
|
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
87
84
|
- `pdfium2` for searchable PDFs
|
88
85
|
- Tesseract OCR for scanned content
|
89
86
|
- **Document Conversion**:
|
90
|
-
- Pandoc for
|
87
|
+
- Pandoc for many document and markup formats
|
91
88
|
- `python-pptx` for PowerPoint files
|
92
89
|
- `html-to-markdown` for HTML content
|
90
|
+
- `xlsx2csv` for Excel spreadsheets
|
93
91
|
- **Text Processing**:
|
94
92
|
- Smart encoding detection
|
95
93
|
- Markdown and plain text handling
|
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
|
|
121
119
|
|
122
120
|
#### Data and Research Formats
|
123
121
|
|
122
|
+
- Excel spreadsheets (`.xlsx`)
|
124
123
|
- CSV (`.csv`) and TSV (`.tsv`) files
|
125
124
|
- Jupyter Notebooks (`.ipynb`)
|
126
125
|
- BibTeX (`.bib`) and BibLaTeX (`.bib`)
|
@@ -1,15 +1,15 @@
|
|
1
1
|
kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
|
2
|
-
kreuzberg/_extractors.py,sha256=
|
3
|
-
kreuzberg/_mime_types.py,sha256=
|
2
|
+
kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
|
3
|
+
kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
|
4
4
|
kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
|
5
5
|
kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
|
6
6
|
kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
|
7
7
|
kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
|
8
8
|
kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
|
9
|
-
kreuzberg/extraction.py,sha256=
|
9
|
+
kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
|
10
10
|
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
11
|
-
kreuzberg-1.
|
12
|
-
kreuzberg-1.
|
13
|
-
kreuzberg-1.
|
14
|
-
kreuzberg-1.
|
15
|
-
kreuzberg-1.
|
11
|
+
kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
12
|
+
kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
|
13
|
+
kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
|
14
|
+
kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
|
15
|
+
kreuzberg-1.6.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|