PyPI - kreuzberg - Versions diffs - 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl - Mend

kreuzberg 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (11) hide show

kreuzberg/_extractors.py +7 -3
kreuzberg/_mime_types.py +1 -1
kreuzberg/_string.py +7 -7
kreuzberg/_sync.py +1 -1
kreuzberg/extraction.py +8 -4
{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/METADATA +20 -5
kreuzberg-1.1.0.dist-info/RECORD +13 -0
kreuzberg-1.0.0.dist-info/RECORD +0 -13
{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/LICENSE +0 -0
{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/WHEEL +0 -0
{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/top_level.txt +0 -0

kreuzberg/_extractors.py CHANGED Viewed

@@ -11,7 +11,7 @@ from kreuzberg._mime_types import PANDOC_MIME_TYPE_EXT_MAP
 from kreuzberg._sync import run_sync
 from kreuzberg.exceptions import ParsingError
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path
@@ -35,6 +35,7 @@ def _extract_pdf_with_tesseract(file_path: Path) -> str:
         text = "\n".join(image_to_string(img) for img in images)
         return text.strip()
     except (PdfiumError, TesseractError) as e:
+        # TODO: add test case
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
@@ -57,21 +58,23 @@ def _extract_pdf_with_pdfium2(file_path: Path) -> str:
         text = "\n".join(page.get_textpage().get_text_range() for page in document)
         return text.strip()
     except PdfiumError as e:
+        # TODO: add test case
         raise ParsingError(
             "Could not extract text from PDF file", context={"file_path": str(file_path), "error": str(e)}
         ) from e
-async def _extract_pdf_file(file_path: Path) -> str:
+async def _extract_pdf_file(file_path: Path, force_ocr: bool = False) -> str:
     """Extract text from a PDF file.
     Args:
         file_path: The path to the PDF file.
+        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
     Returns:
         The extracted text.
     """
-    if content := await run_sync(_extract_pdf_with_pdfium2, file_path):
+    if not force_ocr and (content := await run_sync(_extract_pdf_with_pdfium2, file_path)):
         return content
     return await run_sync(_extract_pdf_with_tesseract, file_path)
@@ -96,6 +99,7 @@ async def _extract_content_with_pandoc(file_data: bytes, mime_type: str, encodin
     try:
         return cast(str, await run_sync(convert_text, file_data, to="md", format=ext, encoding=encoding))
     except RuntimeError as e:
+        # TODO: add test case
         raise ParsingError(
             f"Could not extract text from {PANDOC_MIME_TYPE_EXT_MAP[mime_type]} file contents",
             context={"error": str(e)},

kreuzberg/_mime_types.py CHANGED Viewed

@@ -2,7 +2,7 @@ from __future__ import annotations
 from typing import TYPE_CHECKING, Final
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
 MARKDOWN_MIME_TYPE: Final[str] = "text/markdown"

kreuzberg/_string.py CHANGED Viewed

@@ -1,5 +1,7 @@
 from __future__ import annotations
+from contextlib import suppress
 from charset_normalizer import detect
@@ -16,20 +18,18 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     if not byte_data:
         return ""
+    encodings = ["utf-8", "latin-1"]
     if encoding:
-        try:
+        with suppress(UnicodeDecodeError):
             return byte_data.decode(encoding, errors="ignore")
-        except UnicodeDecodeError:  # pragma: no cover
-            pass
-    encodings = ["utf-8", "latin-1"]
     if encoding := detect(byte_data).get("encoding"):
         encodings.append(encoding)
     for encoding in encodings:
-        try:
+        with suppress(UnicodeDecodeError):
             return byte_data.decode(encoding, errors="ignore")
-        except UnicodeDecodeError:  # pragma: no cover  # noqa: PERF203
-            pass
+    # TODO: add test case
     return byte_data.decode("latin-1", errors="replace")

kreuzberg/_sync.py CHANGED Viewed

@@ -6,7 +6,7 @@ from typing import TYPE_CHECKING, TypeVar, cast
 from anyio.to_thread import run_sync as any_io_run_sync
 from typing_extensions import ParamSpec
-if TYPE_CHECKING:
+if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Callable
 T = TypeVar("T")

kreuzberg/extraction.py CHANGED Viewed

@@ -35,12 +35,13 @@ class ExtractionResult(NamedTuple):
     """The mime type of the content."""
-async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
+async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False) -> ExtractionResult:
     """Extract the textual content from a given byte string representing a file's contents.
     Args:
         content: The content to extract.
         mime_type: The mime type of the content.
+        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
     Raises:
         ValidationError: If the mime type is not supported.
@@ -58,7 +59,7 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
         with NamedTemporaryFile(suffix=".pdf") as temp_file:
             temp_file.write(content)
             return ExtractionResult(
-                content=await _extract_pdf_file(Path(temp_file.name)), mime_type=PLAIN_TEXT_MIME_TYPE
+                content=await _extract_pdf_file(Path(temp_file.name), force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE
             )
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
@@ -81,12 +82,15 @@ async def extract_bytes(content: bytes, mime_type: str) -> ExtractionResult:
     )
-async def extract_file(file_path: Path | str, mime_type: str | None = None) -> ExtractionResult:
+async def extract_file(
+    file_path: Path | str, mime_type: str | None = None, force_ocr: bool = False
+) -> ExtractionResult:
     """Extract the textual content from a given file.
     Args:
         file_path: The path to the file.
         mime_type: The mime type of the file.
+        force_ocr: Whether or not to force OCR on PDF files that have a text layer. Default = false.
     Raises:
         ValidationError: If the mime type is not supported.
@@ -109,7 +113,7 @@ async def extract_file(file_path: Path | str, mime_type: str | None = None) -> E
         raise ValidationError("The file does not exist.", context={"file_path": str(file_path)})
     if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
-        return ExtractionResult(content=await _extract_pdf_file(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)
+        return ExtractionResult(content=await _extract_pdf_file(file_path, force_ocr), mime_type=PLAIN_TEXT_MIME_TYPE)
     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
         return ExtractionResult(content=await _extract_image_with_tesseract(file_path), mime_type=PLAIN_TEXT_MIME_TYPE)

{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 1.0.0
+Version: 1.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -46,7 +46,7 @@ Hence, this library.
 ## Features
-- Extract text from PDFs, images, and office documents
+- Extract text from PDFs, images, office documents and more (see supported formats below)
 - Use modern Python with async (via `anyio`) and proper type hints
 - Extensive error handling for easy debugging
@@ -164,6 +164,21 @@ async def process_uploaded_image(image_content: bytes):
     return result.content
 ```
+### Forcing OCR
+When extracting text from a PDF file or from PDF bytes, you might want to force OCR - for example, if the PDF contains images with text that should also be extracted.
+You can do this by passing `force_ocr=True`:
+```python
+from kreuzberg import extract_bytes
+# Extract text from PDF bytes and force OCR
+async def process_uploaded_pdf(pdf_content: bytes):
+    result = await extract_bytes(pdf_content, mime_type="application/pdf", force_ocr=True)
+    return result.content
+```
 ### Error Handling
 Kreuzberg raises two exception types:
@@ -173,8 +188,8 @@ Kreuzberg raises two exception types:
 Raised when there are issues with input validation:
 - Unsupported mime types
-- Non-existent files
 - Undetectable mime types
+- Path doesn't point at an existing file
 #### ParsingError
@@ -218,8 +233,8 @@ except ParsingError as e:
 All extraction functions return an ExtractionResult named tuple containing:
-- content: The extracted text as a string
-- mime_type: The mime type of the output (either "text/plain" or, if pandoc is used- "text/markdown")
+- `content`: The extracted text as a string
+- `mime_type`: The mime type of the output (either "text/plain" or, if pandoc is used, "text/markdown")
 ```python
 from kreuzberg import ExtractionResult

kreuzberg-1.1.0.dist-info/RECORD ADDED Viewed

@@ -0,0 +1,13 @@
+kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
+kreuzberg/_extractors.py,sha256=r8L9Bm3x7s1u7-T5HKkr1j6M6W3bUuwMAmDtAwX-s9g,4717
+kreuzberg/_mime_types.py,sha256=M5sKT4OkMf7pwtgs_jO2uhl6gC94wUurYzw_wbrIjU0,2739
+kreuzberg/_string.py,sha256=5s6BfTLQdYlDEt2PP4AdmBLV-ajroATOVYQQRcBYFD4,934
+kreuzberg/_sync.py,sha256=ovsFHFdkcczz7gNEUJsbZzY8KHG0_GAOOYipQNE4hIY,874
+kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
+kreuzberg/extraction.py,sha256=-a_msLQm7h5pHDhBuvfRP81-FtBwv7FGW-6YVJlXpUg,4926
+kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
+kreuzberg-1.1.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-1.1.0.dist-info/METADATA,sha256=nkDjE2MEqAE_-1MZvlBxnNuM7SKCOD2LvB7Ucb_W7U4,7775
+kreuzberg-1.1.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
+kreuzberg-1.1.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
+kreuzberg-1.1.0.dist-info/RECORD,,

kreuzberg-1.0.0.dist-info/RECORD DELETED Viewed

@@ -1,13 +0,0 @@
-kreuzberg/__init__.py,sha256=5IBPjPsZ7faK15gFB9ZEROHhkEX7KKQmrHPCZuGnhb0,285
-kreuzberg/_extractors.py,sha256=tmOgzhKw8J21R-NKWSgu7yf5epGleoxC9nKQacUDdms,4461
-kreuzberg/_mime_types.py,sha256=VI3bWm7NBF0Vs2PXpxnJxTlt0pRSE59raVO_KTDJCVQ,2719
-kreuzberg/_string.py,sha256=8YezUPhTGEMk08yGrBxVu4CwhUdCQwOvyC6EGB7wxLk,975
-kreuzberg/_sync.py,sha256=OQZTSKUOaSMkxAb4ynq-BDrx1JLAYP9uc_zFZaAN_fk,854
-kreuzberg/exceptions.py,sha256=jrXyvcuSU-694OEtXPZfHYcUbpoRZzNKw9Lo3wIZwL0,770
-kreuzberg/extraction.py,sha256=utxr9HM8K2aDU0LXHVKCNPXqTu7fGDeNCNpamGr6hAQ,4646
-kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
-kreuzberg-1.0.0.dist-info/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
-kreuzberg-1.0.0.dist-info/METADATA,sha256=fQszunogmstxhdJMMD5ieXLRqjBojXpb0pXJAZZO8fQ,7238
-kreuzberg-1.0.0.dist-info/WHEEL,sha256=In9FTNxeP60KnTkGw7wk6mJPYd_dQSjEZmXdBdMCI-8,91
-kreuzberg-1.0.0.dist-info/top_level.txt,sha256=rbGkygffkZiyKhL8UN41ZOjLfem0jJPA1Whtndne0rE,10
-kreuzberg-1.0.0.dist-info/RECORD,,

{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/LICENSE RENAMED Viewed

File without changes

{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/WHEEL RENAMED Viewed

File without changes

{kreuzberg-1.0.0.dist-info → kreuzberg-1.1.0.dist-info}/top_level.txt RENAMED Viewed

File without changes

kreuzberg 1.0.0__py3-none-any.whl → 1.1.0__py3-none-any.whl

kreuzberg 1.0.0py3-none-any.whl → 1.1.0py3-none-any.whl