kreuzberg 3.0.0__tar.gz → 3.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/PKG-INFO +14 -19
  2. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/README.md +4 -10
  3. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/__init__.py +4 -1
  4. kreuzberg-3.1.0/kreuzberg/_extractors/_base.py +92 -0
  5. kreuzberg-3.1.0/kreuzberg/_extractors/_html.py +34 -0
  6. kreuzberg-3.1.0/kreuzberg/_extractors/_image.py +74 -0
  7. kreuzberg-3.1.0/kreuzberg/_extractors/_pandoc.py +613 -0
  8. kreuzberg-3.1.0/kreuzberg/_extractors/_pdf.py +171 -0
  9. kreuzberg-3.1.0/kreuzberg/_extractors/_presentation.py +233 -0
  10. kreuzberg-3.1.0/kreuzberg/_extractors/_spread_sheet.py +125 -0
  11. kreuzberg-3.1.0/kreuzberg/_gmft.py +174 -0
  12. kreuzberg-3.1.0/kreuzberg/_ocr/__init__.py +17 -0
  13. kreuzberg-3.1.0/kreuzberg/_ocr/_base.py +54 -0
  14. kreuzberg-3.1.0/kreuzberg/_ocr/_easyocr.py +376 -0
  15. kreuzberg-3.1.0/kreuzberg/_ocr/_paddleocr.py +283 -0
  16. kreuzberg-3.1.0/kreuzberg/_ocr/_tesseract.py +342 -0
  17. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_types.py +31 -4
  18. kreuzberg-3.1.0/kreuzberg/_utils/__init__.py +0 -0
  19. kreuzberg-3.1.0/kreuzberg/_utils/_string.py +39 -0
  20. kreuzberg-3.1.0/kreuzberg/_utils/_sync.py +121 -0
  21. kreuzberg-3.1.0/kreuzberg/_utils/_tmp.py +37 -0
  22. kreuzberg-3.1.0/kreuzberg/py.typed +0 -0
  23. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg.egg-info/PKG-INFO +14 -19
  24. kreuzberg-3.1.0/kreuzberg.egg-info/SOURCES.txt +36 -0
  25. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg.egg-info/requires.txt +9 -11
  26. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/pyproject.toml +17 -11
  27. kreuzberg-3.0.0/kreuzberg.egg-info/SOURCES.txt +0 -18
  28. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/LICENSE +0 -0
  29. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_chunker.py +0 -0
  30. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_constants.py +0 -0
  31. /kreuzberg-3.0.0/kreuzberg/py.typed → /kreuzberg-3.1.0/kreuzberg/_extractors/__init__.py +0 -0
  32. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_mime_types.py +0 -0
  33. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_playa.py +0 -0
  34. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/_registry.py +0 -0
  35. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/exceptions.py +0 -0
  36. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg/extraction.py +0 -0
  37. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
  38. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
  39. {kreuzberg-3.0.0 → kreuzberg-3.1.0}/setup.cfg +0 -0
@@ -1,11 +1,11 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.0.0
3
+ Version: 3.1.0
4
4
  Summary: A text extraction library supporting PDFs, images, office documents and more
5
5
  Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
6
6
  License: MIT
7
7
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
8
- Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
8
+ Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
9
9
  Classifier: Development Status :: 4 - Beta
10
10
  Classifier: Intended Audience :: Developers
11
11
  Classifier: License :: OSI Approved :: MIT License
@@ -27,7 +27,7 @@ License-File: LICENSE
27
27
  Requires-Dist: anyio>=4.9.0
28
28
  Requires-Dist: charset-normalizer>=3.4.1
29
29
  Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
30
- Requires-Dist: html-to-markdown>=1.2.0
30
+ Requires-Dist: html-to-markdown>=1.2.1
31
31
  Requires-Dist: playa-pdf>=0.4.1
32
32
  Requires-Dist: pypdfium2==4.30.0
33
33
  Requires-Dist: python-calamine>=0.3.1
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
35
35
  Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
36
36
  Provides-Extra: all
37
37
  Requires-Dist: easyocr>=1.7.2; extra == "all"
38
- Requires-Dist: numpy>=2.0.2; extra == "all"
38
+ Requires-Dist: gmft>=0.4.1; extra == "all"
39
39
  Requires-Dist: paddleocr>=2.10.0; extra == "all"
40
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "all"
41
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "all"
40
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
41
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
42
42
  Requires-Dist: setuptools>=76.0.0; extra == "all"
43
43
  Provides-Extra: chunking
44
- Requires-Dist: semantic-text-splitter>=0.24.1; extra == "chunking"
44
+ Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
45
45
  Provides-Extra: easyocr
46
46
  Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
47
+ Provides-Extra: gmft
48
+ Requires-Dist: gmft>=0.4.1; extra == "gmft"
47
49
  Provides-Extra: paddleocr
48
- Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
49
50
  Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
50
- Requires-Dist: paddlepaddle>=2.6.2; python_version < "3.13" and extra == "paddleocr"
51
+ Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
51
52
  Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
52
53
  Dynamic: license-file
53
54
 
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
66
67
  - **Resource Efficient**: Lightweight processing without GPU requirements
67
68
  - **Format Support**: Comprehensive support for documents, images, and text formats
68
69
  - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
70
+ - **Metadata Extraction**: Get document metadata alongside text content
71
+ - **Table Extraction**: Extract tables from documents using the excellent GMFT library
69
72
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
70
73
  - **Permissive OSS**: MIT licensed with permissively licensed dependencies
71
74
 
@@ -151,7 +154,7 @@ Kreuzberg supports multiple OCR engines:
151
154
  - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
152
155
  - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
153
156
 
154
- For comparison and selection guidance, see the [OCR Backends](https://example.com/ocr-backends) documentation.
157
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
155
158
 
156
159
  ## Contribution
157
160
 
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
160
163
  ### Local Development
161
164
 
162
165
  1. Clone the repo
163
-
164
166
  1. Install the system dependencies
165
-
166
167
  1. Install the full dependencies with `uv sync`
167
-
168
- 1. Install the pre-commit hooks with:
169
-
170
- ```shell
171
- pre-commit install && pre-commit install --hook-type commit-msg
172
- ```
173
-
168
+ 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
174
169
  1. Make your changes and submit a PR
175
170
 
176
171
  ## License
@@ -13,6 +13,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
13
13
  - **Resource Efficient**: Lightweight processing without GPU requirements
14
14
  - **Format Support**: Comprehensive support for documents, images, and text formats
15
15
  - **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
16
+ - **Metadata Extraction**: Get document metadata alongside text content
17
+ - **Table Extraction**: Extract tables from documents using the excellent GMFT library
16
18
  - **Modern Python**: Built with async/await, type hints, and a functional-first approach
17
19
  - **Permissive OSS**: MIT licensed with permissively licensed dependencies
18
20
 
@@ -98,7 +100,7 @@ Kreuzberg supports multiple OCR engines:
98
100
  - **EasyOCR**: Good for many languages, pure Python, but downloads models on first use
99
101
  - **PaddleOCR**: Excellent for Asian languages, pure Python, but downloads models on first use
100
102
 
101
- For comparison and selection guidance, see the [OCR Backends](https://example.com/ocr-backends) documentation.
103
+ For comparison and selection guidance, see the [OCR Backends](https://goldziher.github.io/kreuzberg/user-guide/ocr-backends/) documentation.
102
104
 
103
105
  ## Contribution
104
106
 
@@ -107,17 +109,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
107
109
  ### Local Development
108
110
 
109
111
  1. Clone the repo
110
-
111
112
  1. Install the system dependencies
112
-
113
113
  1. Install the full dependencies with `uv sync`
114
-
115
- 1. Install the pre-commit hooks with:
116
-
117
- ```shell
118
- pre-commit install && pre-commit install --hook-type commit-msg
119
- ```
120
-
114
+ 1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
121
115
  1. Make your changes and submit a PR
122
116
 
123
117
  ## License
@@ -1,10 +1,11 @@
1
+ from kreuzberg._gmft import GMFTConfig
1
2
  from kreuzberg._ocr._easyocr import EasyOCRConfig
2
3
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
3
4
  from kreuzberg._ocr._tesseract import TesseractConfig
4
5
 
5
6
  from ._ocr._tesseract import PSMMode
6
7
  from ._registry import ExtractorRegistry
7
- from ._types import ExtractionConfig, ExtractionResult, Metadata
8
+ from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
8
9
  from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
9
10
  from .extraction import (
10
11
  batch_extract_bytes,
@@ -22,6 +23,7 @@ __all__ = [
22
23
  "ExtractionConfig",
23
24
  "ExtractionResult",
24
25
  "ExtractorRegistry",
26
+ "GMFTConfig",
25
27
  "KreuzbergError",
26
28
  "Metadata",
27
29
  "MissingDependencyError",
@@ -29,6 +31,7 @@ __all__ = [
29
31
  "PSMMode",
30
32
  "PaddleOCRConfig",
31
33
  "ParsingError",
34
+ "TableData",
32
35
  "TesseractConfig",
33
36
  "ValidationError",
34
37
  "batch_extract_bytes",
@@ -0,0 +1,92 @@
1
+ from __future__ import annotations
2
+
3
+ from abc import ABC, abstractmethod
4
+ from typing import TYPE_CHECKING, ClassVar
5
+
6
+ if TYPE_CHECKING:
7
+ from pathlib import Path
8
+
9
+ from kreuzberg import ExtractionResult
10
+ from kreuzberg._types import ExtractionConfig
11
+
12
+
13
+ class Extractor(ABC):
14
+ """Abstract base class for file content extraction.
15
+
16
+ This class provides the interface for different types of content extractors.
17
+ Subclasses are expected to implement the methods for extracting content
18
+ either asynchronously or synchronously and determining the supported MIME types.
19
+
20
+ Attributes:
21
+ SUPPORTED_MIME_TYPES: The set of supported mime types - all non-abstract extractors must implement this.
22
+
23
+ Args:
24
+ mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
25
+ config: Configuration options for the extraction process.
26
+ """
27
+
28
+ __slots__ = ("config", "mime_type")
29
+
30
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]]
31
+
32
+ def __init__(self, mime_type: str, config: ExtractionConfig) -> None:
33
+ self.mime_type = mime_type
34
+ self.config = config
35
+
36
+ @abstractmethod
37
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
38
+ """Asynchronously extract content from a byte stream.
39
+
40
+ Args:
41
+ content: The byte content to extract.
42
+
43
+ Returns:
44
+ ExtractionResult: The extracted content along with metadata about the extraction.
45
+ """
46
+
47
+ @abstractmethod
48
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
49
+ """Asynchronously extract content from a file located at the specified path.
50
+
51
+ Args:
52
+ path: The path to the file to process.
53
+
54
+ Returns:
55
+ ExtractionResult: The extracted content along with metadata about the extraction.
56
+ """
57
+
58
+ @abstractmethod
59
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
60
+ """Synchronously extract content from a byte stream.
61
+
62
+ Args:
63
+ content: The byte content to extract.
64
+
65
+ Returns:
66
+ ExtractionResult: The extracted content along with metadata about the extraction.
67
+ """
68
+
69
+ @abstractmethod
70
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
71
+ """Synchronously extract content from a file located at the specified path.
72
+
73
+ Args:
74
+ path: The path to the file to process.
75
+
76
+ Returns:
77
+ ExtractionResult: The extracted content along with metadata about the extraction.
78
+ """
79
+
80
+ @classmethod
81
+ def supports_mimetype(cls, mime_type: str) -> bool:
82
+ """Verify whether the extractor supports the given MIME type.
83
+
84
+ Args:
85
+ mime_type: The MIME type to check (e.g., "application/pdf").
86
+
87
+ Returns:
88
+ bool: True if the MIME type is supported, False otherwise.
89
+ """
90
+ return mime_type in cls.SUPPORTED_MIME_TYPES or any(
91
+ mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
92
+ )
@@ -0,0 +1,34 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, ClassVar
4
+
5
+ import html_to_markdown
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg._extractors._base import Extractor
9
+ from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
10
+ from kreuzberg._types import ExtractionResult
11
+ from kreuzberg._utils._string import normalize_spaces, safe_decode
12
+ from kreuzberg._utils._sync import run_sync
13
+
14
+ if TYPE_CHECKING:
15
+ from pathlib import Path
16
+
17
+
18
+ class HTMLExtractor(Extractor):
19
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {HTML_MIME_TYPE}
20
+
21
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
22
+ return await run_sync(self.extract_bytes_sync, content)
23
+
24
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
25
+ content = await AsyncPath(path).read_bytes()
26
+ return await run_sync(self.extract_bytes_sync, content)
27
+
28
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
29
+ result = html_to_markdown.convert_to_markdown(safe_decode(content))
30
+ return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
31
+
32
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
33
+ content = path.read_bytes()
34
+ return self.extract_bytes_sync(content)
@@ -0,0 +1,74 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import TYPE_CHECKING, ClassVar
4
+
5
+ import anyio
6
+ from anyio import Path as AsyncPath
7
+
8
+ from kreuzberg._extractors._base import Extractor
9
+ from kreuzberg._mime_types import IMAGE_MIME_TYPES
10
+ from kreuzberg._ocr import get_ocr_backend
11
+ from kreuzberg._utils._tmp import create_temp_file
12
+ from kreuzberg.exceptions import ValidationError
13
+
14
+ if TYPE_CHECKING: # pragma: no cover
15
+ from collections.abc import Mapping
16
+ from pathlib import Path
17
+
18
+ from kreuzberg._types import ExtractionResult
19
+
20
+
21
+ class ImageExtractor(Extractor):
22
+ SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
23
+
24
+ IMAGE_MIME_TYPE_EXT_MAP: ClassVar[Mapping[str, str]] = {
25
+ "image/bmp": "bmp",
26
+ "image/x-bmp": "bmp",
27
+ "image/x-ms-bmp": "bmp",
28
+ "image/gif": "gif",
29
+ "image/jpeg": "jpg",
30
+ "image/pjpeg": "jpg",
31
+ "image/png": "png",
32
+ "image/tiff": "tiff",
33
+ "image/x-tiff": "tiff",
34
+ "image/jp2": "jp2",
35
+ "image/jpx": "jpx",
36
+ "image/jpm": "jpm",
37
+ "image/mj2": "mj2",
38
+ "image/webp": "webp",
39
+ "image/x-portable-anymap": "pnm",
40
+ "image/x-portable-bitmap": "pbm",
41
+ "image/x-portable-graymap": "pgm",
42
+ "image/x-portable-pixmap": "ppm",
43
+ }
44
+
45
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
46
+ extension = self._get_extension_from_mime_type(self.mime_type)
47
+ file_path, unlink = await create_temp_file(f".{extension}")
48
+ await AsyncPath(file_path).write_bytes(content)
49
+ try:
50
+ return await self.extract_path_async(file_path)
51
+ finally:
52
+ await unlink()
53
+
54
+ async def extract_path_async(self, path: Path) -> ExtractionResult:
55
+ if self.config.ocr_backend is None:
56
+ raise ValidationError("ocr_backend is None, cannot perform OCR")
57
+
58
+ return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
59
+
60
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
61
+ return anyio.run(self.extract_bytes_async, content)
62
+
63
+ def extract_path_sync(self, path: Path) -> ExtractionResult:
64
+ return anyio.run(self.extract_path_async, path)
65
+
66
+ def _get_extension_from_mime_type(self, mime_type: str) -> str:
67
+ if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
68
+ return self.IMAGE_MIME_TYPE_EXT_MAP[mime_type]
69
+
70
+ for k, v in self.IMAGE_MIME_TYPE_EXT_MAP.items():
71
+ if k.startswith(mime_type):
72
+ return v
73
+
74
+ raise ValidationError("unsupported mimetype", context={"mime_type": mime_type})