kreuzberg 1.1.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -1,13 +1,22 @@
1
1
  from __future__ import annotations
2
2
 
3
+ import re
4
+ from contextlib import suppress
5
+ from html import escape
6
+ from io import BytesIO
3
7
  from typing import TYPE_CHECKING, cast
4
8
 
9
+ from anyio import Path as AsyncPath
5
10
  from charset_normalizer import detect
11
+ from html_to_markdown import convert_to_markdown
12
+ from pptx import Presentation
13
+ from pptx.enum.shapes import MSO_SHAPE_TYPE
6
14
  from pypandoc import convert_file, convert_text
7
15
  from pypdfium2 import PdfDocument, PdfiumError
8
16
  from pytesseract import TesseractError, image_to_string
9
17
 
10
18
  from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
19
+ from kreuzberg._string import normalize_spaces, safe_decode
11
20
  from kreuzberg._sync import run_sync
12
21
  from kreuzberg.exceptions import ParsingError
13
22
 
@@ -33,7 +42,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
33
42
  images = [page.render(scale=2.0).to_pil() for page in pdf]
34
43
 
35
44
  text = "\n".join(image_to_string(img) for img in images)
36
- return text.strip()
45
+ return normalize_spaces(text)
37
46
  except (PdfiumError, TesseractError) as e:
38
47
  # TODO: add test case
39
48
  raise ParsingError(
@@ -56,7 +65,7 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
56
65
  try:
57
66
  document = PdfDocument(file_path)
58
67
  text = "\n".join(page.get_textpage().get_text_range() for page in document)
59
- return text.strip()
68
+ return normalize_spaces(text)
60
69
  except PdfiumError as e:
61
70
  # TODO: add test case
62
71
  raise ParsingError(
@@ -75,9 +84,9 @@ async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
75
84
  The extracted text.
76
85
  """
77
86
  if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
78
- return content
87
+ return normalize_spaces(content)
79
88
 
80
- return await run_sync(_extract_pdf_with_tesseract, file_path)
89
+ return normalize_spaces(await run_sync(_extract_pdf_with_tesseract, file_path))
81
90
 
82
91
 
83
92
  async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encoding: str | None = None) -> str:
@@ -97,7 +106,9 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
97
106
  ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
98
107
  encoding = encoding or detect(file_data)["encoding"] or "utf-8"
99
108
  try:
100
- return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
109
+ return normalize_spaces(
110
+ cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
111
+ )
101
112
  except RuntimeError as e:
102
113
  # TODO: add test case
103
114
  raise ParsingError(
@@ -121,7 +132,7 @@ async def _extract_file_with_pandoc(file_path: Path | str, mime_type: str) -> st
121
132
  """
122
133
  ext = PANDOC_MIME_TYPE_EXT_MAP[mime_type]
123
134
  try:
124
- return cast(str, await run_sync(convert_file, file_path, to="md", format=ext))
135
+ return normalize_spaces(cast(str, await run_sync(convert_file, file_path, to="md", format=ext)))
125
136
  except RuntimeError as e:
126
137
  raise ParsingError(
127
138
  f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file",
@@ -142,8 +153,95 @@ async def _extract_image_with_tesseract(file_path: Path | str) -> str:
142
153
  The extracted content.
143
154
  """
144
155
  try:
145
- return cast(str, image_to_string(str(file_path)).strip())
156
+ return normalize_spaces(cast(str, image_to_string(str(file_path))))
146
157
  except TesseractError as e:
147
158
  raise ParsingError(
148
159
  "Could not extract text from image file", context={"file_path": str(file_path), "error": str(e)}
149
160
  ) from e
161
+
162
+
163
+ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
164
+ """Extract text from a PPTX file.
165
+
166
+ Notes:
167
+ This function is based on code vendored from `markitdown`, which has an MIT license as well.
168
+
169
+ Args:
170
+ file_path_or_contents: The path to the PPTX file or its contents as bytes.
171
+
172
+ Returns:
173
+ The extracted text content
174
+ """
175
+ md_content = ""
176
+ file_contents = (
177
+ file_path_or_contents
178
+ if isinstance(file_path_or_contents, bytes)
179
+ else await AsyncPath(file_path_or_contents).read_bytes()
180
+ )
181
+ presentation = Presentation(BytesIO(file_contents))
182
+
183
+ for index, slide in enumerate(presentation.slides):
184
+ md_content += f"\n\n<!-- Slide number: {index + 1} -->\n"
185
+
186
+ title = slide.shapes.title
187
+
188
+ for shape in slide.shapes:
189
+ if shape.shape_type == MSO_SHAPE_TYPE.PICTURE or (
190
+ shape.shape_type == MSO_SHAPE_TYPE.PLACEHOLDER and hasattr(shape, "image")
191
+ ):
192
+ alt_text = ""
193
+ with suppress(AttributeError):
194
+ # access non-visual properties
195
+ alt_text = shape._element._nvXxPr.cNvPr.attrib.get("descr", "") # noqa: SLF001
196
+
197
+ filename = re.sub(r"\W", "", shape.name) + ".jpg"
198
+ md_content += f"\n![{alt_text if alt_text else shape.name}]({filename})\n"
199
+
200
+ elif shape.shape_type == MSO_SHAPE_TYPE.TABLE:
201
+ html_table = "<table>"
202
+ first_row = True
203
+
204
+ for row in shape.table.rows:
205
+ html_table += "<tr>"
206
+
207
+ for cell in row.cells:
208
+ tag = "th" if first_row else "td"
209
+ html_table += f"<{tag}>{escape(cell.text)}</{tag}>"
210
+
211
+ html_table += "</tr>"
212
+ first_row = False
213
+
214
+ html_table += "</table>"
215
+ md_content += "\n" + html_table + "\n"
216
+
217
+ elif shape.has_text_frame:
218
+ md_content += "# " + shape.text.lstrip() + "\n" if shape == title else shape.text + "\n"
219
+
220
+ md_content = md_content.strip()
221
+ if slide.has_notes_slide:
222
+ md_content += "\n\n### Notes:\n"
223
+ notes_frame = slide.notes_slide.notes_text_frame
224
+
225
+ if notes_frame is not None:
226
+ md_content += notes_frame.text
227
+
228
+ md_content = md_content.strip()
229
+
230
+ return normalize_spaces(md_content)
231
+
232
+
233
+ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
234
+ """Extract text from an HTML string.
235
+
236
+ Args:
237
+ file_path_or_contents: The path to the HTML file or its contents as bytes.
238
+
239
+ Returns:
240
+ The extracted text content.
241
+ """
242
+ content = (
243
+ safe_decode(file_path_or_contents)
244
+ if isinstance(file_path_or_contents, bytes)
245
+ else await AsyncPath(file_path_or_contents).read_text()
246
+ )
247
+ return normalize_spaces(await run_sync(convert_to_markdown, content))
kreuzberg/_mime_types.py CHANGED
@@ -5,9 +5,11 @@ from typing import TYPE_CHECKING, Final
5
5
  if TYPE_CHECKING: # pragma: no cover
6
6
  from collections.abc import Mapping
7
7
 
8
- MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"
9
- PLAIN_TEXT_MIME_TYPE: Final[str] = "text/plain"
10
- PDF_MIME_TYPE: Final[str] = "application/pdf"
8
+ HTML_MIME_TYPE: Final = "text/html"
9
+ MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
+ PDF_MIME_TYPE: Final = "application/pdf"
11
+ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
+ POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
11
13
 
12
14
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
13
15
 
@@ -93,5 +95,8 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
93
95
  }
94
96
 
95
97
  SUPPORTED_MIME_TYPES: Final[set[str]] = (
96
- PLAIN_TEXT_MIME_TYPES | IMAGE_MIME_TYPES | PANDOC_SUPPORTED_MIME_TYPES | {PDF_MIME_TYPE}
98
+ PLAIN_TEXT_MIME_TYPES
99
+ | IMAGE_MIME_TYPES
100
+ | PANDOC_SUPPORTED_MIME_TYPES
101
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
97
102
  )
kreuzberg/_string.py CHANGED
@@ -33,3 +33,15 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
33
33
 
34
34
  # TODO: add test case
35
35
  return byte_data.decode("latin-1", errors="replace")
36
+
37
+
38
+ def normalize_spaces(text: str) -> str:
39
+ """Normalize the spaces in a string.
40
+
41
+ Args:
42
+ text: The text to sanitize.
43
+
44
+ Returns:
45
+ The sanitized text.
46
+ """
47
+ return " ".join(text.strip().split())
kreuzberg/extraction.py CHANGED
@@ -1,3 +1,12 @@
1
+ """This module provides functions to extract textual content from files.
2
+
3
+ It includes vendored code:
4
+
5
+ - The PPTX extraction logic is based on code vendored from `markitdown`.
6
+ See: https://github.com/microsoft/markitdown/blob/main/src/markitdown/_markitdown.py
7
+ Refer to the markitdown repository for its license (MIT).
8
+ """
9
+
1
10
  from __future__ import annotations
2
11
 
3
12
  from mimetypes import guess_type
@@ -10,16 +19,20 @@ from anyio import Path as AsyncPath
10
19
  from kreuzberg._extractors import (
11
20
  _extract_content_with_pandoc,
12
21
  _extract_file_with_pandoc,
22
+ _extract_html_string,
13
23
  _extract_image_with_tesseract,
14
24
  _extract_pdf_file,
25
+ _extract_pptx_file,
15
26
  )
16
27
  from kreuzberg._mime_types import (
28
+ HTML_MIME_TYPE,
17
29
  IMAGE_MIME_TYPE_EXT_MAP,
18
30
  IMAGE_MIME_TYPES,
19
31
  MARKDOWN_MIME_TYPE,
20
32
  PANDOC_SUPPORTED_MIME_TYPES,
21
33
  PDF_MIME_TYPE,
22
34
  PLAIN_TEXT_MIME_TYPE,
35
+ POWER_POINT_MIME_TYPE,
23
36
  SUPPORTED_MIME_TYPES,
24
37
  )
25
38
  from kreuzberg._string import safe_decode
@@ -76,6 +89,12 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
76
89
  content=await _extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
77
90
  )
78
91
 
92
+ if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
93
+ return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
94
+
95
+ if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
96
+ return ExtractionResult(content=await _extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
97
+
79
98
  return ExtractionResult(
80
99
  content=safe_decode(content),
81
100
  mime_type=mime_type,
@@ -125,4 +144,10 @@ async def extract_file(
125
144
  content=await _extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
126
145
  )
127
146
 
147
+ if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
148
+ return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
149
+
150
+ if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
151
+ return ExtractionResult(content=await _extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
152
+
128
153
  return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.1.0
3
+ Version: 1.3.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
- Keywords: async,document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,tesseract,text-extraction,text-processing
8
+ Keywords: document-processing,docx,image-to-text,latex,markdown,ocr,odt,office-documents,pandoc,pdf,pdf-extraction,rag,text-extraction,text-processing
9
9
  Classifier: Development Status :: 4 - Beta
10
10
  Classifier: Intended Audience :: Developers
11
11
  Classifier: License :: OSI Approved :: MIT License
@@ -25,9 +25,11 @@ Description-Content-Type: text/markdown
25
25
  License-File: LICENSE
26
26
  Requires-Dist: anyio>=4.8.0
27
27
  Requires-Dist: charset-normalizer>=3.4.1
28
+ Requires-Dist: html-to-markdown>=1.2.0
28
29
  Requires-Dist: pypandoc>=1.15
29
30
  Requires-Dist: pypdfium2>=4.30.1
30
31
  Requires-Dist: pytesseract>=0.3.13
32
+ Requires-Dist: python-pptx>=1.0.2
31
33
  Requires-Dist: typing-extensions>=4.12.2
32
34
 
33
35
  # Kreuzberg
@@ -37,7 +39,7 @@ extraction.
37
39
 
38
40
  Why?
39
41
 
40
- I am building, like many do now, a RAG focused service. I have text extraction needs.
42
+ I am building, like many do now, a RAG focused service (check out https://grantflow.ai). I have text extraction needs.
41
43
  There are quite a lot of commercial options out there, and several open-source + paid options.
42
44
  But I wanted something simple, which does not require expansive round-trips to an external API.
43
45
  Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
@@ -65,6 +67,43 @@ Hence, this library.
65
67
  - [pandoc](https://pandoc.org/installing.html) (non-pdf text extraction, GPL v2.0 licensed but used via CLI only)
66
68
  - [tesseract-ocr](https://tesseract-ocr.github.io/) (for image/PDF OCR, Apache License)
67
69
 
70
+ ## Dependencies and Philosophy
71
+
72
+ This library is built to be minimalist and simple. It also aims to utilize OSS tools for the job. It's fundamentally a
73
+ high order async abstraction on top of other tools, think of it like the library you would bake in your code base, but
74
+ polished and well maintained.
75
+
76
+ ### Dependencies
77
+
78
+ - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
79
+ - Images are processed using Tesseract OCR
80
+ - Office documents and other formats are processed using Pandoc
81
+ - PPTX files are converted using python-pptx
82
+ - HTML files are converted using html-to-markdown
83
+ - Plain text files are read directly with appropriate encoding detection
84
+
85
+ ### Roadmap
86
+
87
+ V1:
88
+
89
+ - [x] - html file text extraction
90
+ - [ ] - better PDF table extraction
91
+ - [ ] - TBD
92
+
93
+ V2:
94
+
95
+ - [ ] - extra install groups (to make dependencies optional)
96
+ - [ ] - metadata extraction (possible breaking change)
97
+ - [ ] - TBD
98
+
99
+ ### Feature Requests
100
+
101
+ Feel free to open a discussion or an issue on GitHub if you have any feature requests.
102
+
103
+ ### Contribution
104
+
105
+ Contributions are welcome! Read the guidelines below.
106
+
68
107
  ## Supported File Types
69
108
 
70
109
  Kreuzberg supports a wide range of file formats:
@@ -72,7 +111,8 @@ Kreuzberg supports a wide range of file formats:
72
111
  ### Document Formats
73
112
 
74
113
  - PDF (`.pdf`) - both searchable and scanned documents
75
- - Word Documents (`.docx`)
114
+ - Word Documents (`.docx`, `.doc`)
115
+ - Power Point Presentations (`.pptx`)
76
116
  - OpenDocument Text (`.odt`)
77
117
  - Rich Text Format (`.rtf`)
78
118
 
@@ -92,6 +132,7 @@ Kreuzberg supports a wide range of file formats:
92
132
 
93
133
  #### Text and Markup Formats
94
134
 
135
+ - HTML (`.html`, `.htm`)
95
136
  - Plain Text (`.txt`)
96
137
  - Markdown (`.md`)
97
138
  - reStructuredText (`.rst`)
@@ -102,13 +143,6 @@ Kreuzberg supports a wide range of file formats:
102
143
  - Comma-Separated Values (`.csv`)
103
144
  - Tab-Separated Values (`.tsv`)
104
145
 
105
- All formats support text extraction, with different processing methods:
106
-
107
- - PDFs are processed using pdfium2 for searchable PDFs and Tesseract OCR for scanned documents
108
- - Images are processed using Tesseract OCR
109
- - Office documents and other formats are processed using Pandoc
110
- - Plain text files are read directly with appropriate encoding detection
111
-
112
146
  ## Usage
113
147
 
114
148
  Kreuzberg exports two async functions:
@@ -116,8 +150,6 @@ Kreuzberg exports two async functions:
116
150
  - Extract text from a file (string path or `pathlib.Path`) using `extract_file()`
117
151
  - Extract text from a byte-string using `extract_bytes()`
118
152
 
119
- Note - both of these functions are async and therefore should be used in an async context.
120
-
121
153
  ### Extract from File
122
154
 
123
155
  ```python
@@ -0,0 +1,13 @@
1
+ kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
+ kreuzberg/_extractors.py,sha256=eiWPpjnZOZFDwlQL4XsgavJEWqxGtzLVvS8YU28RBAo,8095
3
+ kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
4
+ kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
5
+ kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
+ kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
7
+ kreuzberg/extraction.py,sha256=cgX8uoCVXf-Va30g8T8DwrZUqsSPHIzmPfDgnWOqNNU,6148
8
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ kreuzberg-1.3.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
10
+ kreuzberg-1.3.0.dist-info/METADATA,sha256=3wiaAuaiA865lg5oCjwlAKaZqRQn1w8VqaQXeoEdip4,8579
11
+ kreuzberg-1.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
+ kreuzberg-1.3.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
13
+ kreuzberg-1.3.0.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=r8L9Bm3x7s1u7-T5HKkr1j6M6W3bUuwMAmDtAwX-s9g,4717
3
- kreuzberg/_mime_types.py,sha256=M5sKT4OkMf7pwtgs_jO2uhl6gC94wUurYzw_wbrIjU0,2739
4
- kreuzberg/_string.py,sha256=5s6BfTLQdYlDEt2PP4AdmBLV-ajroATOVYQQRcBYFD4,934
5
- kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
- kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
7
- kreuzberg/extraction.py,sha256=-a_msLQm7h5pHDhBuvfRP81-FtBwv7FGW-6YVJlXpUg,4926
8
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- kreuzberg-1.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
10
- kreuzberg-1.1.0.dist-info/METADATA,sha256=nkDjE2MEqAE_-1MZvlBxnNuM7SKCOD2LvB7Ucb_W7U4,7775
11
- kreuzberg-1.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
- kreuzberg-1.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
13
- kreuzberg-1.1.0.dist-info/RECORD,,