kreuzberg 1.5.0__py3-none-any.whl → 1.6.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -4,6 +4,8 @@ import re
4
4
  from contextlib import suppress
5
5
  from html import escape
6
6
  from io import BytesIO
7
+ from pathlib import Path
8
+ from tempfile import NamedTemporaryFile
7
9
  from typing import TYPE_CHECKING
8
10
 
9
11
  import html_to_markdown
@@ -11,6 +13,7 @@ import pptx
11
13
  import pypdfium2
12
14
  from anyio import Path as AsyncPath
13
15
  from pptx.enum.shapes import MSO_SHAPE_TYPE
16
+ from xlsx2csv import Xlsx2csv
14
17
 
15
18
  from kreuzberg._pandoc import process_content, process_file
16
19
  from kreuzberg._string import normalize_spaces, safe_decode
@@ -19,8 +22,6 @@ from kreuzberg._tesseract import batch_process_images
19
22
  from kreuzberg.exceptions import ParsingError
20
23
 
21
24
  if TYPE_CHECKING: # pragma: no cover
22
- from pathlib import Path
23
-
24
25
  from PIL.Image import Image
25
26
 
26
27
 
@@ -195,6 +196,40 @@ async def extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
195
196
  return normalize_spaces(md_content)
196
197
 
197
198
 
199
def _convert_xlsx_to_csv(xlsx_path: str, csv_path: str) -> None:
    """Convert the workbook at ``xlsx_path`` into a CSV file written to ``csv_path``.

    The converter is constructed and run inside this single helper so that a
    caller can push the whole conversion to a worker thread in one call.
    """
    Xlsx2csv(xlsx_path).convert(csv_path)


async def extract_xlsx_file(file_path_or_contents: Path | bytes) -> str:
    """Extract text from an XLSX file by converting it to CSV and then to markdown.

    Args:
        file_path_or_contents: The path to the XLSX file or its contents as bytes.

    Returns:
        The extracted text content, passed through ``normalize_spaces``.

    Raises:
        ParsingError: If the XLSX file could not be parsed. The original error
            message and, for path input, the file path are attached as context.
    """
    # NOTE(review): ``convert`` is called without a sheet selector; confirm which
    # sheets xlsx2csv exports by default if multi-sheet workbooks matter.
    # NOTE(review): the temporary files are re-opened by name while still open,
    # which the tempfile documentation describes as platform dependent.
    try:
        with NamedTemporaryFile(suffix=".csv") as csv_file:
            if isinstance(file_path_or_contents, bytes):
                # Only raw bytes need to be materialised on disk for the converter.
                with NamedTemporaryFile(suffix=".xlsx") as xlsx_file:
                    xlsx_file.write(file_path_or_contents)
                    # Flush so the converter sees the full payload when it opens the path.
                    xlsx_file.flush()
                    await run_sync(_convert_xlsx_to_csv, xlsx_file.name, csv_file.name)
            else:
                await run_sync(_convert_xlsx_to_csv, str(file_path_or_contents), csv_file.name)

            # The CSV must still exist here, so this stays inside the ``with`` block.
            result = await process_file(csv_file.name, mime_type="text/csv")
            return normalize_spaces(result.content)
    except Exception as e:
        raise ParsingError(
            "Could not extract text from XLSX file",
            context={
                "error": str(e),
                "file_path": str(file_path_or_contents) if isinstance(file_path_or_contents, Path) else None,
            },
        ) from e
231
+
232
+
198
233
  async def extract_html_string(file_path_or_contents: Path | bytes) -> str:
199
234
  """Extract text from an HTML string.
200
235
 
kreuzberg/_mime_types.py CHANGED
@@ -10,7 +10,7 @@ MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
10
  PDF_MIME_TYPE: Final = "application/pdf"
11
11
  PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
12
  POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
13
-
13
+ EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
14
14
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
15
15
 
16
16
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -89,5 +89,5 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
89
89
  PLAIN_TEXT_MIME_TYPES
90
90
  | IMAGE_MIME_TYPES
91
91
  | PANDOC_SUPPORTED_MIME_TYPES
92
- | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
92
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE, EXCEL_MIME_TYPE}
93
93
  )
kreuzberg/extraction.py CHANGED
@@ -22,8 +22,10 @@ from kreuzberg._extractors import (
22
22
  extract_html_string,
23
23
  extract_pdf_file,
24
24
  extract_pptx_file,
25
+ extract_xlsx_file,
25
26
  )
26
27
  from kreuzberg._mime_types import (
28
+ EXCEL_MIME_TYPE,
27
29
  HTML_MIME_TYPE,
28
30
  IMAGE_MIME_TYPE_EXT_MAP,
29
31
  IMAGE_MIME_TYPES,
@@ -75,6 +77,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
75
77
  content=await extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
76
78
  )
77
79
 
80
+ if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
81
+ return ExtractionResult(content=await extract_xlsx_file(content), mime_type=MARKDOWN_MIME_TYPE)
82
+
78
83
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
79
84
  with NamedTemporaryFile(suffix=IMAGE_MIME_TYPE_EXT_MAP[mime_type]) as temp_file:
80
85
  temp_file.write(content)
@@ -134,6 +139,9 @@ async def extract_file(
134
139
  if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
135
140
  return ExtractionResult(content=await extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
136
141
 
142
+ if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
143
+ return ExtractionResult(content=await extract_xlsx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
144
+
137
145
  if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
138
146
  return ExtractionResult(content=await process_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
139
147
 
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.5.0
3
+ Version: 1.6.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -30,6 +30,7 @@ Requires-Dist: html-to-markdown>=1.2.0
30
30
  Requires-Dist: pypdfium2>=4.30.1
31
31
  Requires-Dist: python-pptx>=1.0.2
32
32
  Requires-Dist: typing-extensions>=4.12.2; python_version < "3.10"
33
+ Requires-Dist: xlsx2csv>=0.8.4
33
34
 
34
35
  # Kreuzberg
35
36
 
@@ -68,16 +69,12 @@ pip install kreuzberg
68
69
 
69
70
  ### 2. Install System Dependencies
70
71
 
71
- Kreuzberg requires two open-source tools:
72
+ Kreuzberg requires two system-level dependencies:
72
73
 
73
74
  - [Pandoc](https://pandoc.org/installing.html) - For document format conversion
74
-
75
- - GPL v2.0 licensed (used via CLI only)
76
- - Handles office documents and markup formats
77
-
78
75
  - [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
79
- - Apache License
80
- - Required for scanned documents and images
76
+
77
+ Please install these using their respective installation guides.
81
78
 
82
79
  ## Architecture
83
80
 
@@ -87,9 +84,10 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
87
84
  - `pdfium2` for searchable PDFs
88
85
  - Tesseract OCR for scanned content
89
86
  - **Document Conversion**:
90
- - Pandoc for office documents and markup
87
+ - Pandoc for many document and markup formats
91
88
  - `python-pptx` for PowerPoint files
92
89
  - `html-to-markdown` for HTML content
90
+ - `xlsx2csv` for Excel spreadsheets
93
91
  - **Text Processing**:
94
92
  - Smart encoding detection
95
93
  - Markdown and plain text handling
@@ -121,6 +119,7 @@ Kreuzberg is designed as a high-level async abstraction over established open-so
121
119
 
122
120
  #### Data and Research Formats
123
121
 
122
+ - Excel spreadsheets (`.xlsx`)
124
123
  - CSV (`.csv`) and TSV (`.tsv`) files
125
124
  - Jupyter Notebooks (`.ipynb`)
126
125
  - BibTeX (`.bib`) and BibLaTeX (`.bib`)
@@ -1,15 +1,15 @@
1
1
  kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=k6xO_2ItaftPmlqzfXyxTn8rdaWdwrJHGziBbo7gCio,6599
3
- kreuzberg/_mime_types.py,sha256=0ZYtRrMAaKpCMDkhpTbWAXHCsVob5MFRMGlbni8iYSA,2573
2
+ kreuzberg/_extractors.py,sha256=cbDjitvqI35Gimh27iXvEE0Zczf9jZRJZS7Do8ugVNE,7934
3
+ kreuzberg/_mime_types.py,sha256=nvRSWDUhtntO9-E9gv2l5BVYow61zim4llJ6n33k_BE,2682
4
4
  kreuzberg/_pandoc.py,sha256=DC6y_NN_CG9dF6fhAj3WumXqKIJLjYmnql2H53_KHnE,13766
5
5
  kreuzberg/_string.py,sha256=4txRDnkdR12oO6G8V-jXEMlA9ivgmw8E8EbjyhfL-W4,1106
6
6
  kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
7
7
  kreuzberg/_tesseract.py,sha256=nnhkjRIS0BSoovjMIqOlBEXlzngE0QJeFDe7BIqUik8,7872
8
8
  kreuzberg/exceptions.py,sha256=pxoEPS0T9e5QSgxsfXn1VmxsY_EGXvTwY0gETPiNn8E,945
9
- kreuzberg/extraction.py,sha256=gux3fkPIs8IbIKtRGuPFWJBLB5jO6Y9JsBfhHRcpQ0k,6160
9
+ kreuzberg/extraction.py,sha256=G3_Uyzhe99qEib4WLE7_l1oC9JKlvoVdn3WEY56J_Wo,6572
10
10
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
11
- kreuzberg-1.5.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
- kreuzberg-1.5.0.dist-info/METADATA,sha256=O462ss7M6Cb8cO6fJXwqsOdzkzaZekqa1oGwb7Vrgx8,9641
13
- kreuzberg-1.5.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
14
- kreuzberg-1.5.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
- kreuzberg-1.5.0.dist-info/RECORD,,
11
+ kreuzberg-1.6.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
12
+ kreuzberg-1.6.0.dist-info/METADATA,sha256=GQNbGnxmym5vAcXDivDUccdVBUGnYh-4M38xYEkKTJk,9663
13
+ kreuzberg-1.6.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
14
+ kreuzberg-1.6.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
15
+ kreuzberg-1.6.0.dist-info/RECORD,,