kreuzberg 1.2.0__py3-none-any.whl → 1.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_extractors.py CHANGED
@@ -8,6 +8,7 @@ from typing import TYPE_CHECKING, cast
8
8
 
9
9
  from anyio import Path as AsyncPath
10
10
  from charset_normalizer import detect
11
+ from html_to_markdown import convert_to_markdown
11
12
  from pptx import Presentation
12
13
  from pptx.enum.shapes import MSO_SHAPE_TYPE
13
14
  from pypandoc import convert_file, convert_text
@@ -15,7 +16,7 @@ from pypdfium2 import PdfDocument, PdfiumError
15
16
  from pytesseract import TesseractError, image_to_string
16
17
 
17
18
  from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
18
- from kreuzberg._string import normalize_spaces
19
+ from kreuzberg._string import normalize_spaces, safe_decode
19
20
  from kreuzberg._sync import run_sync
20
21
  from kreuzberg.exceptions import ParsingError
21
22
 
@@ -227,3 +228,20 @@ async def _extract_pptx_file(file_path_or_contents: Path | bytes) -> str:
227
228
  md_content = md_content.strip()
228
229
 
229
230
  return normalize_spaces(md_content)
231
+
232
+
233
+ async def _extract_html_string(file_path_or_contents: Path | bytes) -> str:
234
+ """Extract text from an HTML string.
235
+
236
+ Args:
237
+ file_path_or_contents: The HTML content.
238
+
239
+ Returns:
240
+ The extracted text content.
241
+ """
242
+ content = (
243
+ safe_decode(file_path_or_contents)
244
+ if isinstance(file_path_or_contents, bytes)
245
+ else await AsyncPath(file_path_or_contents).read_text()
246
+ )
247
+ return normalize_spaces(await run_sync(convert_to_markdown, content))
kreuzberg/_mime_types.py CHANGED
@@ -5,10 +5,12 @@ from typing import TYPE_CHECKING, Final
5
5
  if TYPE_CHECKING: # pragma: no cover
6
6
  from collections.abc import Mapping
7
7
 
8
- MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"
9
- PLAIN_TEXT_MIME_TYPE: Final[str] = "text/plain"
10
- PDF_MIME_TYPE: Final[str] = "application/pdf"
11
- POWER_POINT_MIME_TYPE: Final[str] = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
8
+ HTML_MIME_TYPE: Final = "text/html"
9
+ MARKDOWN_MIME_TYPE: Final = "text/markdown"
10
+ PDF_MIME_TYPE: Final = "application/pdf"
11
+ PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
12
+ POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
13
+
12
14
  PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
13
15
 
14
16
  IMAGE_MIME_TYPES: Final[set[str]] = {
@@ -93,5 +95,8 @@ PANDOC_MIME_TYPE_EXT_MAP: Final[Mapping[str, str]] = {
93
95
  }
94
96
 
95
97
  SUPPORTED_MIME_TYPES: Final[set[str]] = (
96
- PLAIN_TEXT_MIME_TYPES | IMAGE_MIME_TYPES | PANDOC_SUPPORTED_MIME_TYPES | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE}
98
+ PLAIN_TEXT_MIME_TYPES
99
+ | IMAGE_MIME_TYPES
100
+ | PANDOC_SUPPORTED_MIME_TYPES
101
+ | {PDF_MIME_TYPE, POWER_POINT_MIME_TYPE, HTML_MIME_TYPE}
97
102
  )
kreuzberg/extraction.py CHANGED
@@ -19,11 +19,13 @@ from anyio import Path as AsyncPath
19
19
  from kreuzberg._extractors import (
20
20
  _extract_content_with_pandoc,
21
21
  _extract_file_with_pandoc,
22
+ _extract_html_string,
22
23
  _extract_image_with_tesseract,
23
24
  _extract_pdf_file,
24
25
  _extract_pptx_file,
25
26
  )
26
27
  from kreuzberg._mime_types import (
28
+ HTML_MIME_TYPE,
27
29
  IMAGE_MIME_TYPE_EXT_MAP,
28
30
  IMAGE_MIME_TYPES,
29
31
  MARKDOWN_MIME_TYPE,
@@ -90,6 +92,9 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
90
92
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
91
93
  return ExtractionResult(content=await _extract_pptx_file(content), mime_type=MARKDOWN_MIME_TYPE)
92
94
 
95
+ if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
96
+ return ExtractionResult(content=await _extract_html_string(content), mime_type=MARKDOWN_MIME_TYPE)
97
+
93
98
  return ExtractionResult(
94
99
  content=safe_decode(content),
95
100
  mime_type=mime_type,
@@ -142,4 +147,7 @@ async def extract_file(
142
147
  if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
143
148
  return ExtractionResult(content=await _extract_pptx_file(file_path), mime_type=MARKDOWN_MIME_TYPE)
144
149
 
150
+ if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
151
+ return ExtractionResult(content=await _extract_html_string(file_path), mime_type=MARKDOWN_MIME_TYPE)
152
+
145
153
  return ExtractionResult(content=await AsyncPath(file_path).read_text(), mime_type=mime_type)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.2
2
2
  Name: kreuzberg
3
- Version: 1.2.0
3
+ Version: 1.3.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
@@ -25,6 +25,7 @@ Description-Content-Type: text/markdown
25
25
  License-File: LICENSE
26
26
  Requires-Dist: anyio>=4.8.0
27
27
  Requires-Dist: charset-normalizer>=3.4.1
28
+ Requires-Dist: html-to-markdown>=1.2.0
28
29
  Requires-Dist: pypandoc>=1.15
29
30
  Requires-Dist: pypdfium2>=4.30.1
30
31
  Requires-Dist: pytesseract>=0.3.13
@@ -38,7 +39,7 @@ extraction.
38
39
 
39
40
  Why?
40
41
 
41
- I am building, like many do now, a RAG focused service. I have text extraction needs.
42
+ I am building, like many do now, a RAG focused service (check out https://grantflow.ai). I have text extraction needs.
42
43
  There are quite a lot of commercial options out there, and several open-source + paid options.
43
44
  But I wanted something simple, which does not require expensive round-trips to an external API.
44
45
  Furthermore, I wanted something that is easy to run locally and isn't very heavy / requires a GPU.
@@ -76,17 +77,32 @@ polished and well maintained.
76
77
 
77
78
  - PDFs are processed using pdfium2 for searchable PDFs + Tesseract OCR for scanned documents
78
79
  - Images are processed using Tesseract OCR
79
- - Office documents and other formats are processed using Pandoc, or python-pptx for PPTX files
80
+ - Office documents and other formats are processed using Pandoc
81
+ - PPTX files are converted using python-pptx
82
+ - HTML files are converted using html-to-markdown
80
83
  - Plain text files are read directly with appropriate encoding detection
81
84
 
82
85
  ### Roadmap
83
86
 
84
- [] - extra install groups (to make dependencies optional and offer alternatives)
85
- [] - html file text extraction
86
- [] - better PDF table extraction
87
- [] - metadata extraction
87
+ V1:
88
88
 
89
- Feel free to open a discussion in GitHub or an issue if you have any feature requests, but keep the philosophy part in mind
89
+ - [x] - html file text extraction
90
+ - [ ] - better PDF table extraction
91
+ - [ ] - TBD
92
+
93
+ V2:
94
+
95
+ - [ ] - extra install groups (to make dependencies optional)
96
+ - [ ] - metadata extraction (possible breaking change)
97
+ - [ ] - TBD
98
+
99
+ ### Feature Requests
100
+
101
+ Feel free to open a discussion or an issue on GitHub if you have any feature requests.
102
+
103
+ ### Contribution
104
+
105
+ Contributions are welcome! Read the guidelines below.
90
106
 
91
107
  ## Supported File Types
92
108
 
@@ -116,6 +132,7 @@ Kreuzberg supports a wide range of file formats:
116
132
 
117
133
  #### Text and Markup Formats
118
134
 
135
+ - HTML (`.html`, `.htm`)
119
136
  - Plain Text (`.txt`)
120
137
  - Markdown (`.md`)
121
138
  - reStructuredText (`.rst`)
@@ -0,0 +1,13 @@
1
+ kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
+ kreuzberg/_extractors.py,sha256=eiWPpjnZOZFDwlQL4XsgavJEWqxGtzLVvS8YU28RBAo,8095
3
+ kreuzberg/_mime_types.py,sha256=hR6LFXWn8dtCDB05PkADYk2l__HpmETNyf4YFixhecE,2918
4
+ kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
5
+ kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
+ kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
7
+ kreuzberg/extraction.py,sha256=cgX8uoCVXf-Va30g8T8DwrZUqsSPHIzmPfDgnWOqNNU,6148
8
+ kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
+ kreuzberg-1.3.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
10
+ kreuzberg-1.3.0.dist-info/METADATA,sha256=3wiaAuaiA865lg5oCjwlAKaZqRQn1w8VqaQXeoEdip4,8579
11
+ kreuzberg-1.3.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
+ kreuzberg-1.3.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
13
+ kreuzberg-1.3.0.dist-info/RECORD,,
@@ -1,13 +0,0 @@
1
- kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
2
- kreuzberg/_extractors.py,sha256=eZ12O7Ii2NRba-dDPIino_eKApCihfdxSPZP121D3xA,7541
3
- kreuzberg/_mime_types.py,sha256=oJc4Qc2RkfZAYoCmxuuJ4S_Mo9-QQ0c4wwy0ZBqMRoA,2873
4
- kreuzberg/_string.py,sha256=O023sxdYoC4DhFCU1z430UBdbxqwXKmyymUDDx3J_i8,1156
5
- kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
6
- kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
7
- kreuzberg/extraction.py,sha256=j5p-JCnyGMouRp4qD-1qKSV_cw8DQ9QSo1H2ocwbbqA,5732
8
- kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
9
- kreuzberg-1.2.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
10
- kreuzberg-1.2.0.dist-info/METADATA,sha256=rTuoCAAk9mYh7f55bLSQ9CfEPhh3BnqNHwxdruth1P8,8330
11
- kreuzberg-1.2.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
12
- kreuzberg-1.2.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
13
- kreuzberg-1.2.0.dist-info/RECORD,,