kreuzberg 3.0.1__tar.gz → 3.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/PKG-INFO +13 -18
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/README.md +3 -9
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/__init__.py +4 -1
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_pdf.py +14 -6
- kreuzberg-3.1.0/kreuzberg/_gmft.py +174 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/_paddleocr.py +0 -8
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_types.py +31 -4
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/PKG-INFO +13 -18
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/SOURCES.txt +1 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/requires.txt +9 -11
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/pyproject.toml +17 -8
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/LICENSE +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_chunker.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_constants.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/__init__.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_base.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_html.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_image.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_pandoc.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_presentation.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_extractors/_spread_sheet.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/__init__.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/_base.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/_easyocr.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_ocr/_tesseract.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_playa.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_registry.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_utils/__init__.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_utils/_string.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_utils/_sync.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/_utils/_tmp.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/extraction.py +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-3.0.1 → kreuzberg-3.1.0}/setup.cfg +0 -0
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.0.1
|
3
|
+
Version: 3.1.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
10
10
|
Classifier: Intended Audience :: Developers
|
11
11
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -27,7 +27,7 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.2.
|
30
|
+
Requires-Dist: html-to-markdown>=1.2.1
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
33
|
Requires-Dist: python-calamine>=0.3.1
|
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
|
|
35
35
|
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
|
-
Requires-Dist:
|
38
|
+
Requires-Dist: gmft>=0.4.1; extra == "all"
|
39
39
|
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
|
-
Requires-Dist: paddlepaddle>=
|
41
|
-
Requires-Dist: semantic-text-splitter>=0.
|
40
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
|
41
|
+
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
|
42
42
|
Requires-Dist: setuptools>=76.0.0; extra == "all"
|
43
43
|
Provides-Extra: chunking
|
44
|
-
Requires-Dist: semantic-text-splitter>=0.
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
|
45
45
|
Provides-Extra: easyocr
|
46
46
|
Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
|
47
|
+
Provides-Extra: gmft
|
48
|
+
Requires-Dist: gmft>=0.4.1; extra == "gmft"
|
47
49
|
Provides-Extra: paddleocr
|
48
|
-
Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
|
49
50
|
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
50
|
-
Requires-Dist: paddlepaddle>=
|
51
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
|
51
52
|
Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
|
52
53
|
Dynamic: license-file
|
53
54
|
|
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
66
67
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
67
68
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
68
69
|
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
70
|
+
- **Metadata Extraction**: Get document metadata alongside text content
|
71
|
+
- **Table Extraction**: Extract tables from documents using the excellent GMFT library
|
69
72
|
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
70
73
|
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
71
74
|
|
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
160
163
|
### Local Development
|
161
164
|
|
162
165
|
1. Clone the repo
|
163
|
-
|
164
166
|
1. Install the system dependencies
|
165
|
-
|
166
167
|
1. Install the full dependencies with `uv sync`
|
167
|
-
|
168
|
-
1. Install the pre-commit hooks with:
|
169
|
-
|
170
|
-
```shell
|
171
|
-
pre-commit install && pre-commit install --hook-type commit-msg
|
172
|
-
```
|
173
|
-
|
168
|
+
1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
174
169
|
1. Make your changes and submit a PR
|
175
170
|
|
176
171
|
## License
|
@@ -13,6 +13,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
13
13
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
14
14
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
15
15
|
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
16
|
+
- **Metadata Extraction**: Get document metadata alongside text content
|
17
|
+
- **Table Extraction**: Extract tables from documents using the excellent GMFT library
|
16
18
|
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
17
19
|
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
18
20
|
|
@@ -107,17 +109,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
107
109
|
### Local Development
|
108
110
|
|
109
111
|
1. Clone the repo
|
110
|
-
|
111
112
|
1. Install the system dependencies
|
112
|
-
|
113
113
|
1. Install the full dependencies with `uv sync`
|
114
|
-
|
115
|
-
1. Install the pre-commit hooks with:
|
116
|
-
|
117
|
-
```shell
|
118
|
-
pre-commit install && pre-commit install --hook-type commit-msg
|
119
|
-
```
|
120
|
-
|
114
|
+
1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
121
115
|
1. Make your changes and submit a PR
|
122
116
|
|
123
117
|
## License
|
@@ -1,10 +1,11 @@
|
|
1
|
+
from kreuzberg._gmft import GMFTConfig
|
1
2
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
2
3
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
3
4
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
4
5
|
|
5
6
|
from ._ocr._tesseract import PSMMode
|
6
7
|
from ._registry import ExtractorRegistry
|
7
|
-
from ._types import ExtractionConfig, ExtractionResult, Metadata
|
8
|
+
from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
|
8
9
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
9
10
|
from .extraction import (
|
10
11
|
batch_extract_bytes,
|
@@ -22,6 +23,7 @@ __all__ = [
|
|
22
23
|
"ExtractionConfig",
|
23
24
|
"ExtractionResult",
|
24
25
|
"ExtractorRegistry",
|
26
|
+
"GMFTConfig",
|
25
27
|
"KreuzbergError",
|
26
28
|
"Metadata",
|
27
29
|
"MissingDependencyError",
|
@@ -29,6 +31,7 @@ __all__ = [
|
|
29
31
|
"PSMMode",
|
30
32
|
"PaddleOCRConfig",
|
31
33
|
"ParsingError",
|
34
|
+
"TableData",
|
32
35
|
"TesseractConfig",
|
33
36
|
"ValidationError",
|
34
37
|
"batch_extract_bytes",
|
@@ -45,20 +45,28 @@ class PDFExtractor(Extractor):
|
|
45
45
|
|
46
46
|
async def extract_path_async(self, path: Path) -> ExtractionResult:
|
47
47
|
content_bytes = await AsyncPath(path).read_bytes()
|
48
|
-
|
48
|
+
|
49
|
+
result: ExtractionResult | None = None
|
49
50
|
|
50
51
|
if not self.config.force_ocr:
|
51
52
|
content = await self._extract_pdf_searchable_text(path)
|
52
53
|
if self._validate_extracted_text(content):
|
53
|
-
|
54
|
+
result = ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
54
55
|
|
55
|
-
if self.config.ocr_backend is not None:
|
56
|
+
if not result and self.config.ocr_backend is not None:
|
56
57
|
result = await self._extract_pdf_text_with_ocr(path, self.config.ocr_backend)
|
57
58
|
|
58
|
-
|
59
|
-
|
59
|
+
if not result:
|
60
|
+
result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
|
61
|
+
|
62
|
+
result.metadata = await extract_pdf_metadata(content_bytes)
|
63
|
+
|
64
|
+
if self.config.extract_tables:
|
65
|
+
from kreuzberg._gmft import extract_tables
|
66
|
+
|
67
|
+
result.tables = await extract_tables(path, self.config.gmft_config)
|
60
68
|
|
61
|
-
return
|
69
|
+
return result
|
62
70
|
|
63
71
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
64
72
|
return anyio.run(self.extract_bytes_async, content)
|
@@ -0,0 +1,174 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from dataclasses import dataclass, field
|
4
|
+
from typing import TYPE_CHECKING, Literal
|
5
|
+
|
6
|
+
from kreuzberg._types import TableData
|
7
|
+
from kreuzberg._utils._sync import run_sync
|
8
|
+
from kreuzberg.exceptions import MissingDependencyError
|
9
|
+
|
10
|
+
if TYPE_CHECKING:
|
11
|
+
from os import PathLike
|
12
|
+
|
13
|
+
from gmft.detectors.base import CroppedTable
|
14
|
+
from pandas import DataFrame
|
15
|
+
|
16
|
+
|
17
|
+
@dataclass(unsafe_hash=True)
|
18
|
+
class GMFTConfig:
|
19
|
+
"""Configuration options for GMFT.
|
20
|
+
|
21
|
+
This class encapsulates the configuration options for GMFT, providing a way to customize its behavior.
|
22
|
+
"""
|
23
|
+
|
24
|
+
verbosity: int = 0
|
25
|
+
"""
|
26
|
+
Verbosity level for logging.
|
27
|
+
|
28
|
+
0: errors only
|
29
|
+
1: print warnings
|
30
|
+
2: print warnings and info
|
31
|
+
3: print warnings, info, and debug
|
32
|
+
"""
|
33
|
+
formatter_base_threshold: float = 0.3
|
34
|
+
"""
|
35
|
+
Base threshold for the confidence demanded of a table feature (row/column).
|
36
|
+
|
37
|
+
Note that a low threshold is actually better, because overzealous rows means that generally, numbers are still aligned and there are just many empty rows (having fewer rows than expected merges cells, which is bad).
|
38
|
+
"""
|
39
|
+
cell_required_confidence: dict[Literal[0, 1, 2, 3, 4, 5, 6], float] = field(
|
40
|
+
default_factory=lambda: {
|
41
|
+
0: 0.3,
|
42
|
+
1: 0.3,
|
43
|
+
2: 0.3,
|
44
|
+
3: 0.3,
|
45
|
+
4: 0.5,
|
46
|
+
5: 0.5,
|
47
|
+
6: 99,
|
48
|
+
},
|
49
|
+
hash=False,
|
50
|
+
)
|
51
|
+
"""
|
52
|
+
Confidences required (>=) for a row/column feature to be considered good. See TATRFormattedTable.id2label
|
53
|
+
|
54
|
+
But low confidences may be better than too high confidence (see formatter_base_threshold)
|
55
|
+
"""
|
56
|
+
detector_base_threshold: float = 0.9
|
57
|
+
"""Minimum confidence score required for a table"""
|
58
|
+
remove_null_rows: bool = True
|
59
|
+
"""
|
60
|
+
Flag to remove rows with no text.
|
61
|
+
"""
|
62
|
+
enable_multi_header: bool = False
|
63
|
+
"""
|
64
|
+
Enable multi-indices in the dataframe.
|
65
|
+
|
66
|
+
If false, then multiple headers will be merged column-wise.
|
67
|
+
"""
|
68
|
+
semantic_spanning_cells: bool = False
|
69
|
+
"""
|
70
|
+
[Experimental] Enable semantic spanning cells, which often encode hierarchical multi-level indices.
|
71
|
+
"""
|
72
|
+
semantic_hierarchical_left_fill: str | None = "algorithm"
|
73
|
+
"""
|
74
|
+
[Experimental] When semantic spanning cells is enabled, when a left header is detected which might represent a group of rows, that same value is reduplicated for each row.
|
75
|
+
|
76
|
+
Possible values: 'algorithm', 'deep', None.
|
77
|
+
|
78
|
+
'algorithm': assumes that the higher-level header is always the first row followed by several empty rows.
|
79
|
+
'deep': merges headers according to the spanning cells detected by the Table Transformer.
|
80
|
+
None: headers are not duplicated.
|
81
|
+
"""
|
82
|
+
large_table_if_n_rows_removed: int = 8
|
83
|
+
"""
|
84
|
+
If >= n rows are removed due to non-maxima suppression (NMS), then this table is classified as a large table.
|
85
|
+
"""
|
86
|
+
large_table_threshold: int = 10
|
87
|
+
"""
|
88
|
+
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
89
|
+
|
90
|
+
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold). Set 9999 to disable; set 0 to force large table assumption to run every time.
|
91
|
+
"""
|
92
|
+
large_table_row_overlap_threshold: float = 0.2
|
93
|
+
"""
|
94
|
+
With large tables, table transformer struggles with placing too many overlapping rows. Luckily, with more rows, we have more info on the usual size of text, which we can use to make a guess on the height such that no rows are merged or overlapping.
|
95
|
+
|
96
|
+
Large table assumption is only applied when (# of rows > large_table_threshold) AND (total overlap > large_table_row_overlap_threshold).
|
97
|
+
"""
|
98
|
+
large_table_maximum_rows: int = 1000
|
99
|
+
"""
|
100
|
+
Maximum number of rows allowed for a large table.
|
101
|
+
"""
|
102
|
+
force_large_table_assumption: bool | None = None
|
103
|
+
"""
|
104
|
+
Force the large table assumption to be applied, regardless of the number of rows and overlap.
|
105
|
+
"""
|
106
|
+
|
107
|
+
|
108
|
+
async def extract_tables(file_path: str | PathLike[str], config: GMFTConfig | None = None) -> list[TableData]:
|
109
|
+
"""Extracts tables from a PDF file.
|
110
|
+
|
111
|
+
This function takes a file path to a PDF file, and an optional configuration object.
|
112
|
+
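It returns a list of table data dictionaries, one per detected table, each holding the cropped table image, the page number, the markdown-formatted table text and the pandas DataFrame.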
|
113
|
+
|
114
|
+
Args:
|
115
|
+
file_path: The path to the PDF file.
|
116
|
+
config: An optional configuration object.
|
117
|
+
|
118
|
+
Raises:
|
119
|
+
MissingDependencyError: Raised when the required dependencies are not installed.
|
120
|
+
|
121
|
+
Returns:
|
122
|
+
A list of table data dictionaries.
|
123
|
+
"""
|
124
|
+
try:
|
125
|
+
from gmft.auto import AutoTableDetector, AutoTableFormatter
|
126
|
+
from gmft.detectors.tatr import TATRDetectorConfig
|
127
|
+
from gmft.formatters.tatr import TATRFormatConfig
|
128
|
+
from gmft.pdf_bindings.pdfium import PyPDFium2Document
|
129
|
+
|
130
|
+
config = config or GMFTConfig()
|
131
|
+
formatter = AutoTableFormatter(
|
132
|
+
config=TATRFormatConfig(
|
133
|
+
verbosity=config.verbosity,
|
134
|
+
formatter_base_threshold=config.formatter_base_threshold,
|
135
|
+
cell_required_confidence=config.cell_required_confidence,
|
136
|
+
remove_null_rows=config.remove_null_rows,
|
137
|
+
enable_multi_header=config.enable_multi_header,
|
138
|
+
semantic_spanning_cells=config.semantic_spanning_cells,
|
139
|
+
semantic_hierarchical_left_fill=config.semantic_hierarchical_left_fill,
|
140
|
+
large_table_if_n_rows_removed=config.large_table_if_n_rows_removed,
|
141
|
+
large_table_threshold=config.large_table_threshold,
|
142
|
+
large_table_row_overlap_threshold=config.large_table_row_overlap_threshold,
|
143
|
+
large_table_maximum_rows=config.large_table_maximum_rows,
|
144
|
+
force_large_table_assumption=config.force_large_table_assumption,
|
145
|
+
)
|
146
|
+
)
|
147
|
+
detector = AutoTableDetector(config=TATRDetectorConfig(detector_base_threshold=config.detector_base_threshold))
|
148
|
+
doc = await run_sync(PyPDFium2Document, str(file_path))
|
149
|
+
cropped_tables: list[CroppedTable] = []
|
150
|
+
dataframes: list[DataFrame] = []
|
151
|
+
try:
|
152
|
+
for page in doc:
|
153
|
+
cropped_tables.extend(await run_sync(detector.extract, page))
|
154
|
+
|
155
|
+
for cropped_table in cropped_tables:
|
156
|
+
formatted_table = await run_sync(formatter.extract, cropped_table)
|
157
|
+
dataframes.append(await run_sync(formatted_table.df))
|
158
|
+
|
159
|
+
return [
|
160
|
+
TableData(
|
161
|
+
cropped_image=cropped_table.image(),
|
162
|
+
page_number=cropped_table.page.page_number,
|
163
|
+
text=data_frame.to_markdown(),
|
164
|
+
df=data_frame,
|
165
|
+
)
|
166
|
+
for data_frame, cropped_table in zip(dataframes, cropped_tables)
|
167
|
+
]
|
168
|
+
finally:
|
169
|
+
await run_sync(doc.close)
|
170
|
+
|
171
|
+
except ImportError as e:
|
172
|
+
raise MissingDependencyError.create_for_package(
|
173
|
+
dependency_group="gmft", functionality="table extraction", package_name="gmft"
|
174
|
+
) from e
|
@@ -1,7 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import platform
|
4
|
-
import sys
|
5
4
|
from dataclasses import dataclass
|
6
5
|
from importlib.util import find_spec
|
7
6
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal
|
@@ -233,17 +232,10 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
|
|
233
232
|
Raises:
|
234
233
|
MissingDependencyError: If PaddleOCR is not installed.
|
235
234
|
OCRError: If initialization fails.
|
236
|
-
ValidationError: If the python version is too high.
|
237
235
|
"""
|
238
236
|
if cls._paddle_ocr is not None:
|
239
237
|
return
|
240
238
|
|
241
|
-
if sys.version_info >= (3, 13): # pragma: no cover
|
242
|
-
raise ValidationError(
|
243
|
-
"PaddleOCR is only available in python 3.12 and below. Please downgrade your Python or switch to a different OCR backend.",
|
244
|
-
context={"issue": "https://github.com/PaddlePaddle/Paddle/issues/71616"},
|
245
|
-
)
|
246
|
-
|
247
239
|
try:
|
248
240
|
from paddleocr import PaddleOCR
|
249
241
|
except ImportError as e:
|
@@ -2,7 +2,7 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import sys
|
4
4
|
from collections.abc import Awaitable
|
5
|
-
from dataclasses import asdict, dataclass
|
5
|
+
from dataclasses import asdict, dataclass, field
|
6
6
|
from typing import TYPE_CHECKING, Any, Callable, Literal, TypedDict, Union
|
7
7
|
|
8
8
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
@@ -14,6 +14,10 @@ else: # pragma: no cover
|
|
14
14
|
from typing import NotRequired
|
15
15
|
|
16
16
|
if TYPE_CHECKING:
|
17
|
+
from pandas import DataFrame
|
18
|
+
from PIL.Image import Image
|
19
|
+
|
20
|
+
from kreuzberg._gmft import GMFTConfig
|
17
21
|
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
18
22
|
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
19
23
|
from kreuzberg._ocr._tesseract import TesseractConfig
|
@@ -21,6 +25,19 @@ if TYPE_CHECKING:
|
|
21
25
|
OcrBackendType = Literal["tesseract", "easyocr", "paddleocr"]
|
22
26
|
|
23
27
|
|
28
|
+
class TableData(TypedDict):
|
29
|
+
"""Table data, returned from table extraction."""
|
30
|
+
|
31
|
+
cropped_image: Image
|
32
|
+
"""The cropped image of the table."""
|
33
|
+
df: DataFrame
|
34
|
+
"""The table data as a pandas DataFrame."""
|
35
|
+
page_number: int
|
36
|
+
"""The page number of the table."""
|
37
|
+
text: str
|
38
|
+
"""The table text as a markdown string."""
|
39
|
+
|
40
|
+
|
24
41
|
class Metadata(TypedDict, total=False):
|
25
42
|
"""Base metadata common to all document types.
|
26
43
|
|
@@ -88,12 +105,14 @@ class ExtractionResult:
|
|
88
105
|
|
89
106
|
content: str
|
90
107
|
"""The extracted content."""
|
91
|
-
chunks: list[str]
|
92
|
-
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
93
108
|
mime_type: str
|
94
109
|
"""The mime type of the extracted content. Is either text/plain or text/markdown."""
|
95
110
|
metadata: Metadata
|
96
111
|
"""The metadata of the content."""
|
112
|
+
tables: list[TableData] = field(default_factory=list)
|
113
|
+
"""Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
|
114
|
+
chunks: list[str] = field(default_factory=list)
|
115
|
+
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
97
116
|
|
98
117
|
|
99
118
|
PostProcessingHook = Callable[[ExtractionResult], Union[ExtractionResult, Awaitable[ExtractionResult]]]
|
@@ -114,14 +133,22 @@ class ExtractionConfig:
|
|
114
133
|
"""Whether to force OCR."""
|
115
134
|
chunk_content: bool = False
|
116
135
|
"""Whether to chunk the content into smaller chunks."""
|
136
|
+
extract_tables: bool = False
|
137
|
+
"""Whether to extract tables from the content. This requires the 'gmft' dependency."""
|
117
138
|
max_chars: int = DEFAULT_MAX_CHARACTERS
|
118
139
|
"""The size of each chunk in characters."""
|
119
140
|
max_overlap: int = DEFAULT_MAX_OVERLAP
|
120
141
|
"""The overlap between chunks in characters."""
|
121
142
|
ocr_backend: OcrBackendType | None = "tesseract"
|
122
|
-
"""The OCR backend to use.
|
143
|
+
"""The OCR backend to use.
|
144
|
+
|
145
|
+
Notes:
|
146
|
+
- If set to 'None', OCR will not be performed.
|
147
|
+
"""
|
123
148
|
ocr_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
|
124
149
|
"""Configuration to pass to the OCR backend."""
|
150
|
+
gmft_config: GMFTConfig | None = None
|
151
|
+
"""GMFT configuration."""
|
125
152
|
post_processing_hooks: list[PostProcessingHook] | None = None
|
126
153
|
"""Post processing hooks to call after processing is done and before the final result is returned."""
|
127
154
|
validators: list[ValidationHook] | None = None
|
@@ -1,11 +1,11 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.0.1
|
3
|
+
Version: 3.1.0
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
7
7
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
8
|
-
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,tesseract,text-extraction,text-processing
|
8
|
+
Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
|
9
9
|
Classifier: Development Status :: 4 - Beta
|
10
10
|
Classifier: Intended Audience :: Developers
|
11
11
|
Classifier: License :: OSI Approved :: MIT License
|
@@ -27,7 +27,7 @@ License-File: LICENSE
|
|
27
27
|
Requires-Dist: anyio>=4.9.0
|
28
28
|
Requires-Dist: charset-normalizer>=3.4.1
|
29
29
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < "3.11"
|
30
|
-
Requires-Dist: html-to-markdown>=1.2.
|
30
|
+
Requires-Dist: html-to-markdown>=1.2.1
|
31
31
|
Requires-Dist: playa-pdf>=0.4.1
|
32
32
|
Requires-Dist: pypdfium2==4.30.0
|
33
33
|
Requires-Dist: python-calamine>=0.3.1
|
@@ -35,19 +35,20 @@ Requires-Dist: python-pptx>=1.0.2
|
|
35
35
|
Requires-Dist: typing-extensions>=4.12.2; python_version < "3.12"
|
36
36
|
Provides-Extra: all
|
37
37
|
Requires-Dist: easyocr>=1.7.2; extra == "all"
|
38
|
-
Requires-Dist:
|
38
|
+
Requires-Dist: gmft>=0.4.1; extra == "all"
|
39
39
|
Requires-Dist: paddleocr>=2.10.0; extra == "all"
|
40
|
-
Requires-Dist: paddlepaddle>=
|
41
|
-
Requires-Dist: semantic-text-splitter>=0.
|
40
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "all"
|
41
|
+
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "all"
|
42
42
|
Requires-Dist: setuptools>=76.0.0; extra == "all"
|
43
43
|
Provides-Extra: chunking
|
44
|
-
Requires-Dist: semantic-text-splitter>=0.
|
44
|
+
Requires-Dist: semantic-text-splitter>=0.25.1; extra == "chunking"
|
45
45
|
Provides-Extra: easyocr
|
46
46
|
Requires-Dist: easyocr>=1.7.2; extra == "easyocr"
|
47
|
+
Provides-Extra: gmft
|
48
|
+
Requires-Dist: gmft>=0.4.1; extra == "gmft"
|
47
49
|
Provides-Extra: paddleocr
|
48
|
-
Requires-Dist: numpy>=2.0.2; extra == "paddleocr"
|
49
50
|
Requires-Dist: paddleocr>=2.10.0; extra == "paddleocr"
|
50
|
-
Requires-Dist: paddlepaddle>=
|
51
|
+
Requires-Dist: paddlepaddle>=3.0.0; extra == "paddleocr"
|
51
52
|
Requires-Dist: setuptools>=76.0.0; extra == "paddleocr"
|
52
53
|
Dynamic: license-file
|
53
54
|
|
@@ -66,6 +67,8 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
|
|
66
67
|
- **Resource Efficient**: Lightweight processing without GPU requirements
|
67
68
|
- **Format Support**: Comprehensive support for documents, images, and text formats
|
68
69
|
- **Multiple OCR Engines**: Support for Tesseract, EasyOCR, and PaddleOCR
|
70
|
+
- **Metadata Extraction**: Get document metadata alongside text content
|
71
|
+
- **Table Extraction**: Extract tables from documents using the excellent GMFT library
|
69
72
|
- **Modern Python**: Built with async/await, type hints, and a functional-first approach
|
70
73
|
- **Permissive OSS**: MIT licensed with permissively licensed dependencies
|
71
74
|
|
@@ -160,17 +163,9 @@ This library is open to contribution. Feel free to open issues or submit PRs. It
|
|
160
163
|
### Local Development
|
161
164
|
|
162
165
|
1. Clone the repo
|
163
|
-
|
164
166
|
1. Install the system dependencies
|
165
|
-
|
166
167
|
1. Install the full dependencies with `uv sync`
|
167
|
-
|
168
|
-
1. Install the pre-commit hooks with:
|
169
|
-
|
170
|
-
```shell
|
171
|
-
pre-commit install && pre-commit install --hook-type commit-msg
|
172
|
-
```
|
173
|
-
|
168
|
+
1. Install the pre-commit hooks with: `pre-commit install && pre-commit install --hook-type commit-msg`
|
174
169
|
1. Make your changes and submit a PR
|
175
170
|
|
176
171
|
## License
|
@@ -1,6 +1,6 @@
|
|
1
1
|
anyio>=4.9.0
|
2
2
|
charset-normalizer>=3.4.1
|
3
|
-
html-to-markdown>=1.2.
|
3
|
+
html-to-markdown>=1.2.1
|
4
4
|
playa-pdf>=0.4.1
|
5
5
|
pypdfium2==4.30.0
|
6
6
|
python-calamine>=0.3.1
|
@@ -14,24 +14,22 @@ typing-extensions>=4.12.2
|
|
14
14
|
|
15
15
|
[all]
|
16
16
|
easyocr>=1.7.2
|
17
|
-
|
17
|
+
gmft>=0.4.1
|
18
18
|
paddleocr>=2.10.0
|
19
|
-
|
19
|
+
paddlepaddle>=3.0.0
|
20
|
+
semantic-text-splitter>=0.25.1
|
20
21
|
setuptools>=76.0.0
|
21
22
|
|
22
|
-
[all:python_version < "3.13"]
|
23
|
-
paddlepaddle>=2.6.2
|
24
|
-
|
25
23
|
[chunking]
|
26
|
-
semantic-text-splitter>=0.
|
24
|
+
semantic-text-splitter>=0.25.1
|
27
25
|
|
28
26
|
[easyocr]
|
29
27
|
easyocr>=1.7.2
|
30
28
|
|
29
|
+
[gmft]
|
30
|
+
gmft>=0.4.1
|
31
|
+
|
31
32
|
[paddleocr]
|
32
|
-
numpy>=2.0.2
|
33
33
|
paddleocr>=2.10.0
|
34
|
+
paddlepaddle>=3.0.0
|
34
35
|
setuptools>=76.0.0
|
35
|
-
|
36
|
-
[paddleocr:python_version < "3.13"]
|
37
|
-
paddlepaddle>=2.6.2
|
@@ -1,6 +1,6 @@
|
|
1
1
|
[project]
|
2
2
|
name = "kreuzberg"
|
3
|
-
version = "3.0.1"
|
3
|
+
version = "3.1.0"
|
4
4
|
description = "A text extraction library supporting PDFs, images, office documents and more"
|
5
5
|
readme = "README.md"
|
6
6
|
keywords = [
|
@@ -10,6 +10,7 @@ keywords = [
|
|
10
10
|
"pandoc",
|
11
11
|
"pdf-extraction",
|
12
12
|
"rag",
|
13
|
+
"table-extraction",
|
13
14
|
"tesseract",
|
14
15
|
"text-extraction",
|
15
16
|
"text-processing",
|
@@ -39,7 +40,7 @@ dependencies = [
|
|
39
40
|
"anyio>=4.9.0",
|
40
41
|
"charset-normalizer>=3.4.1",
|
41
42
|
"exceptiongroup>=1.2.2; python_version<'3.11'",
|
42
|
-
"html-to-markdown>=1.2.
|
43
|
+
"html-to-markdown>=1.2.1",
|
43
44
|
"playa-pdf>=0.4.1",
|
44
45
|
"pypdfium2==4.30.0", # pinned due to bug in 4.30.1, until v5 is stable
|
45
46
|
"python-calamine>=0.3.1",
|
@@ -50,24 +51,27 @@ dependencies = [
|
|
50
51
|
optional-dependencies.all = [
|
51
52
|
# easyocr
|
52
53
|
"easyocr>=1.7.2",
|
54
|
+
# gmft
|
55
|
+
"gmft>=0.4.1",
|
53
56
|
# paddle
|
54
|
-
"numpy>=2.0.2",
|
55
57
|
"paddleocr>=2.10.0",
|
56
|
-
"paddlepaddle>=
|
58
|
+
"paddlepaddle>=3.0.0",
|
57
59
|
# chunking
|
58
|
-
"semantic-text-splitter>=0.
|
60
|
+
"semantic-text-splitter>=0.25.1",
|
59
61
|
"setuptools>=76.0.0",
|
60
62
|
]
|
61
63
|
optional-dependencies.chunking = [
|
62
|
-
"semantic-text-splitter>=0.
|
64
|
+
"semantic-text-splitter>=0.25.1",
|
63
65
|
]
|
64
66
|
optional-dependencies.easyocr = [
|
65
67
|
"easyocr>=1.7.2",
|
66
68
|
]
|
69
|
+
optional-dependencies.gmft = [
|
70
|
+
"gmft>=0.4.1",
|
71
|
+
]
|
67
72
|
optional-dependencies.paddleocr = [
|
68
|
-
"numpy>=2.0.2",
|
69
73
|
"paddleocr>=2.10.0",
|
70
|
-
"paddlepaddle>=
|
74
|
+
"paddlepaddle>=3.0.0",
|
71
75
|
"setuptools>=76.0.0",
|
72
76
|
]
|
73
77
|
urls.homepage = "https://github.com/Goldziher/kreuzberg"
|
@@ -83,6 +87,7 @@ dev = [
|
|
83
87
|
"pytest-timeout>=2.3.1",
|
84
88
|
"ruff>=0.11.2",
|
85
89
|
"trio>=0.29.0",
|
90
|
+
"uv-bump",
|
86
91
|
]
|
87
92
|
doc = [
|
88
93
|
"mkdocs>=1.6.1",
|
@@ -121,6 +126,7 @@ lint.per-file-ignores."tests/**/*.*" = [
|
|
121
126
|
"ARG001",
|
122
127
|
"D",
|
123
128
|
"N815",
|
129
|
+
"PD",
|
124
130
|
"PGH003",
|
125
131
|
"PLR0915",
|
126
132
|
"PLR2004",
|
@@ -167,3 +173,6 @@ disable_error_code = 'import-untyped'
|
|
167
173
|
implicit_reexport = false
|
168
174
|
show_error_codes = true
|
169
175
|
strict = true
|
176
|
+
|
177
|
+
[tool.uv.sources]
|
178
|
+
uv-bump = { git = "https://github.com/Goldziher/uv-bump" }
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|