kreuzberg 2.0.0__tar.gz → 2.0.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/PKG-INFO +45 -13
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/README.md +44 -12
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/__init__.py +12 -1
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_pdf.py +4 -4
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_tesseract.py +5 -135
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/extraction.py +9 -9
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/PKG-INFO +45 -13
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/pyproject.toml +1 -1
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/LICENSE +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_constants.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_html.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_pandoc.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_pptx.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_string.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_sync.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_tmp.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_types.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/_xlsx.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/exceptions.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg/py.typed +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/requires.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.0.1}/setup.cfg +0 -0
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 2.0.0
|
3
|
+
Version: 2.0.1
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -64,7 +64,31 @@ Kreuzberg requires two system level dependencies:
|
|
64
64
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
65
65
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
66
66
|
|
67
|
-
|
67
|
+
You can install these with:
|
68
|
+
|
69
|
+
#### Linux (Ubuntu)
|
70
|
+
|
71
|
+
```shell
|
72
|
+
sudo apt-get install pandoc tesseract-ocr
|
73
|
+
```
|
74
|
+
|
75
|
+
#### MacOS
|
76
|
+
|
77
|
+
```shell
|
78
|
+
# MacOS
|
79
|
+
brew install tesseract pandoc
|
80
|
+
```
|
81
|
+
|
82
|
+
#### Windows
|
83
|
+
|
84
|
+
```shell
|
85
|
+
choco install -y tesseract pandoc
|
86
|
+
```
|
87
|
+
|
88
|
+
Notes:
|
89
|
+
|
90
|
+
- In most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install any language models you need other than English.
|
91
|
+
- please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
|
68
92
|
|
69
93
|
## Architecture
|
70
94
|
|
@@ -152,18 +176,26 @@ All extraction functions accept the following optional parameters for configurin
|
|
152
176
|
|
153
177
|
#### OCR Configuration
|
154
178
|
|
155
|
-
- `
|
156
|
-
|
157
|
-
|
158
|
-
-
|
179
|
+
- `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
|
180
|
+
- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
|
181
|
+
|
182
|
+
- `eng` for English
|
183
|
+
- `deu` for German
|
184
|
+
- `eng+deu` for English and German
|
185
|
+
|
186
|
+
Notes: - the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, etc.
|
187
|
+
|
188
|
+
- `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
|
189
|
+
|
190
|
+
Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
|
159
191
|
|
160
|
-
|
192
|
+
#### Processing Configuration
|
161
193
|
|
162
|
-
- `
|
194
|
+
- `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
|
163
195
|
|
164
|
-
|
196
|
+
Notes:
|
165
197
|
|
166
|
-
-
|
198
|
+
- Higher values can lead to performance improvements when batch processing, especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
|
167
199
|
|
168
200
|
### Quick Start
|
169
201
|
|
@@ -171,7 +203,7 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
|
|
171
203
|
from pathlib import Path
|
172
204
|
from kreuzberg import extract_file
|
173
205
|
from kreuzberg.extraction import ExtractionResult
|
174
|
-
from kreuzberg._tesseract import PSMMode
|
206
|
+
from kreuzberg._tesseract import PSMMode
|
175
207
|
|
176
208
|
|
177
209
|
# Basic file extraction
|
@@ -193,7 +225,7 @@ async def extract_document():
|
|
193
225
|
docx_result = await extract_file(Path("document.docx"))
|
194
226
|
if docx_result.metadata:
|
195
227
|
print(f"Title: {docx_result.metadata.get('title')}")
|
196
|
-
print(f"Author: {docx_result.metadata.get('
|
228
|
+
print(f"Author: {docx_result.metadata.get('creator')}")
|
197
229
|
```
|
198
230
|
|
199
231
|
### Extracting Bytes
|
@@ -236,7 +268,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
|
|
236
268
|
|
237
269
|
```python
|
238
270
|
from pathlib import Path
|
239
|
-
from kreuzberg import batch_extract_file, batch_extract_bytes
|
271
|
+
from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
|
240
272
|
|
241
273
|
|
242
274
|
# Process multiple files concurrently
|
@@ -29,7 +29,31 @@ Kreuzberg requires two system level dependencies:
|
|
29
29
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
30
30
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
31
31
|
|
32
|
-
|
32
|
+
You can install these with:
|
33
|
+
|
34
|
+
#### Linux (Ubuntu)
|
35
|
+
|
36
|
+
```shell
|
37
|
+
sudo apt-get install pandoc tesseract-ocr
|
38
|
+
```
|
39
|
+
|
40
|
+
#### MacOS
|
41
|
+
|
42
|
+
```shell
|
43
|
+
# MacOS
|
44
|
+
brew install tesseract pandoc
|
45
|
+
```
|
46
|
+
|
47
|
+
#### Windows
|
48
|
+
|
49
|
+
```shell
|
50
|
+
choco install -y tesseract pandoc
|
51
|
+
```
|
52
|
+
|
53
|
+
Notes:
|
54
|
+
|
55
|
+
- In most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install any language models you need other than English.
|
56
|
+
- please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
|
33
57
|
|
34
58
|
## Architecture
|
35
59
|
|
@@ -117,18 +141,26 @@ All extraction functions accept the following optional parameters for configurin
|
|
117
141
|
|
118
142
|
#### OCR Configuration
|
119
143
|
|
120
|
-
- `
|
121
|
-
|
122
|
-
|
123
|
-
-
|
144
|
+
- `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
|
145
|
+
- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
|
146
|
+
|
147
|
+
- `eng` for English
|
148
|
+
- `deu` for German
|
149
|
+
- `eng+deu` for English and German
|
150
|
+
|
151
|
+
Notes: - the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, etc.
|
152
|
+
|
153
|
+
- `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
|
154
|
+
|
155
|
+
Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
|
124
156
|
|
125
|
-
|
157
|
+
#### Processing Configuration
|
126
158
|
|
127
|
-
- `
|
159
|
+
- `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
|
128
160
|
|
129
|
-
|
161
|
+
Notes:
|
130
162
|
|
131
|
-
-
|
163
|
+
- Higher values can lead to performance improvements when batch processing, especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
|
132
164
|
|
133
165
|
### Quick Start
|
134
166
|
|
@@ -136,7 +168,7 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
|
|
136
168
|
from pathlib import Path
|
137
169
|
from kreuzberg import extract_file
|
138
170
|
from kreuzberg.extraction import ExtractionResult
|
139
|
-
from kreuzberg._tesseract import PSMMode
|
171
|
+
from kreuzberg._tesseract import PSMMode
|
140
172
|
|
141
173
|
|
142
174
|
# Basic file extraction
|
@@ -158,7 +190,7 @@ async def extract_document():
|
|
158
190
|
docx_result = await extract_file(Path("document.docx"))
|
159
191
|
if docx_result.metadata:
|
160
192
|
print(f"Title: {docx_result.metadata.get('title')}")
|
161
|
-
print(f"Author: {docx_result.metadata.get('
|
193
|
+
print(f"Author: {docx_result.metadata.get('creator')}")
|
162
194
|
```
|
163
195
|
|
164
196
|
### Extracting Bytes
|
@@ -201,7 +233,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
|
|
201
233
|
|
202
234
|
```python
|
203
235
|
from pathlib import Path
|
204
|
-
from kreuzberg import batch_extract_file, batch_extract_bytes
|
236
|
+
from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
|
205
237
|
|
206
238
|
|
207
239
|
# Process multiple files concurrently
|
@@ -1,6 +1,13 @@
|
|
1
1
|
from ._types import ExtractionResult, Metadata
|
2
2
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
3
|
-
from .extraction import
|
3
|
+
from .extraction import (
|
4
|
+
batch_extract_bytes,
|
5
|
+
batch_extract_bytes_sync,
|
6
|
+
batch_extract_file,
|
7
|
+
batch_extract_file_sync,
|
8
|
+
extract_bytes,
|
9
|
+
extract_file,
|
10
|
+
)
|
4
11
|
|
5
12
|
__all__ = [
|
6
13
|
"ExtractionResult",
|
@@ -10,6 +17,10 @@ __all__ = [
|
|
10
17
|
"OCRError",
|
11
18
|
"ParsingError",
|
12
19
|
"ValidationError",
|
20
|
+
"batch_extract_bytes",
|
21
|
+
"batch_extract_bytes_sync",
|
22
|
+
"batch_extract_file",
|
23
|
+
"batch_extract_file_sync",
|
13
24
|
"extract_bytes",
|
14
25
|
"extract_file",
|
15
26
|
]
|
@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
|
|
11
11
|
from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
|
12
12
|
from kreuzberg._string import normalize_spaces
|
13
13
|
from kreuzberg._sync import run_sync
|
14
|
-
from kreuzberg._tesseract import PSMMode,
|
14
|
+
from kreuzberg._tesseract import PSMMode, batch_process_images
|
15
15
|
from kreuzberg.exceptions import ParsingError
|
16
16
|
|
17
17
|
if TYPE_CHECKING: # pragma: no cover
|
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
|
|
80
80
|
async def _extract_pdf_text_with_ocr(
|
81
81
|
input_file: Path,
|
82
82
|
*,
|
83
|
-
language:
|
83
|
+
language: str = "eng",
|
84
84
|
max_processes: int,
|
85
85
|
psm: PSMMode = PSMMode.AUTO,
|
86
86
|
) -> ExtractionResult:
|
@@ -132,7 +132,7 @@ async def extract_pdf_file(
|
|
132
132
|
input_file: Path,
|
133
133
|
*,
|
134
134
|
force_ocr: bool,
|
135
|
-
language:
|
135
|
+
language: str = "eng",
|
136
136
|
max_processes: int,
|
137
137
|
psm: PSMMode = PSMMode.AUTO,
|
138
138
|
) -> ExtractionResult:
|
@@ -162,7 +162,7 @@ async def extract_pdf_content(
|
|
162
162
|
content: bytes,
|
163
163
|
*,
|
164
164
|
force_ocr: bool,
|
165
|
-
language:
|
165
|
+
language: str = "eng",
|
166
166
|
max_processes: int,
|
167
167
|
psm: PSMMode = PSMMode.AUTO,
|
168
168
|
) -> ExtractionResult:
|
@@ -6,7 +6,7 @@ import sys
|
|
6
6
|
from enum import Enum
|
7
7
|
from functools import partial
|
8
8
|
from os import PathLike
|
9
|
-
from typing import Final,
|
9
|
+
from typing import Final, TypeVar, Union, cast
|
10
10
|
|
11
11
|
from anyio import CapacityLimiter, create_task_group, to_process
|
12
12
|
from anyio import Path as AsyncPath
|
@@ -29,136 +29,6 @@ version_ref = {"checked": False}
|
|
29
29
|
|
30
30
|
T = TypeVar("T", bound=Union[Image, PathLike[str], str])
|
31
31
|
|
32
|
-
SupportedLanguage = Literal[
|
33
|
-
"afr",
|
34
|
-
"amh",
|
35
|
-
"ara",
|
36
|
-
"asm",
|
37
|
-
"aze",
|
38
|
-
"aze_cyrl",
|
39
|
-
"bel",
|
40
|
-
"ben",
|
41
|
-
"bod",
|
42
|
-
"bos",
|
43
|
-
"bre",
|
44
|
-
"bul",
|
45
|
-
"cat",
|
46
|
-
"ceb",
|
47
|
-
"ces",
|
48
|
-
"chi_sim",
|
49
|
-
"chi_tra",
|
50
|
-
"chr",
|
51
|
-
"cos",
|
52
|
-
"cym",
|
53
|
-
"dan",
|
54
|
-
"dan_frak",
|
55
|
-
"deu",
|
56
|
-
"deu_frak",
|
57
|
-
"deu_latf",
|
58
|
-
"dzo",
|
59
|
-
"ell",
|
60
|
-
"eng",
|
61
|
-
"enm",
|
62
|
-
"epo",
|
63
|
-
"equ",
|
64
|
-
"est",
|
65
|
-
"eus",
|
66
|
-
"fao",
|
67
|
-
"fas",
|
68
|
-
"fil",
|
69
|
-
"fin",
|
70
|
-
"fra",
|
71
|
-
"frk",
|
72
|
-
"frm",
|
73
|
-
"fry",
|
74
|
-
"gla",
|
75
|
-
"gle",
|
76
|
-
"glg",
|
77
|
-
"grc",
|
78
|
-
"guj",
|
79
|
-
"hat",
|
80
|
-
"heb",
|
81
|
-
"hin",
|
82
|
-
"hrv",
|
83
|
-
"hun",
|
84
|
-
"hye",
|
85
|
-
"iku",
|
86
|
-
"ind",
|
87
|
-
"isl",
|
88
|
-
"ita",
|
89
|
-
"ita_old",
|
90
|
-
"jav",
|
91
|
-
"jpn",
|
92
|
-
"kan",
|
93
|
-
"kat",
|
94
|
-
"kat_old",
|
95
|
-
"kaz",
|
96
|
-
"khm",
|
97
|
-
"kir",
|
98
|
-
"kmr",
|
99
|
-
"kor",
|
100
|
-
"kor_vert",
|
101
|
-
"kur",
|
102
|
-
"lao",
|
103
|
-
"lat",
|
104
|
-
"lav",
|
105
|
-
"lit",
|
106
|
-
"ltz",
|
107
|
-
"mal",
|
108
|
-
"mar",
|
109
|
-
"mkd",
|
110
|
-
"mlt",
|
111
|
-
"mon",
|
112
|
-
"mri",
|
113
|
-
"msa",
|
114
|
-
"mya",
|
115
|
-
"nep",
|
116
|
-
"nld",
|
117
|
-
"nor",
|
118
|
-
"oci",
|
119
|
-
"ori",
|
120
|
-
"osd",
|
121
|
-
"pan",
|
122
|
-
"pol",
|
123
|
-
"por",
|
124
|
-
"pus",
|
125
|
-
"que",
|
126
|
-
"ron",
|
127
|
-
"rus",
|
128
|
-
"san",
|
129
|
-
"sin",
|
130
|
-
"slk",
|
131
|
-
"slk_frak",
|
132
|
-
"slv",
|
133
|
-
"snd",
|
134
|
-
"spa",
|
135
|
-
"spa_old",
|
136
|
-
"sqi",
|
137
|
-
"srp",
|
138
|
-
"srp_latn",
|
139
|
-
"sun",
|
140
|
-
"swa",
|
141
|
-
"swe",
|
142
|
-
"syr",
|
143
|
-
"tam",
|
144
|
-
"tat",
|
145
|
-
"tel",
|
146
|
-
"tgk",
|
147
|
-
"tgl",
|
148
|
-
"tha",
|
149
|
-
"tir",
|
150
|
-
"ton",
|
151
|
-
"tur",
|
152
|
-
"uig",
|
153
|
-
"ukr",
|
154
|
-
"urd",
|
155
|
-
"uzb",
|
156
|
-
"uzb_cyrl",
|
157
|
-
"vie",
|
158
|
-
"yid",
|
159
|
-
"yor",
|
160
|
-
]
|
161
|
-
|
162
32
|
|
163
33
|
class PSMMode(Enum):
|
164
34
|
"""Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
|
@@ -211,7 +81,7 @@ async def validate_tesseract_version() -> None:
|
|
211
81
|
async def process_file(
|
212
82
|
input_file: str | PathLike[str],
|
213
83
|
*,
|
214
|
-
language:
|
84
|
+
language: str,
|
215
85
|
psm: PSMMode,
|
216
86
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
217
87
|
) -> ExtractionResult:
|
@@ -263,7 +133,7 @@ async def process_file(
|
|
263
133
|
async def process_image(
|
264
134
|
image: Image,
|
265
135
|
*,
|
266
|
-
language:
|
136
|
+
language: str,
|
267
137
|
psm: PSMMode,
|
268
138
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
269
139
|
) -> ExtractionResult:
|
@@ -288,7 +158,7 @@ async def process_image(
|
|
288
158
|
async def process_image_with_tesseract(
|
289
159
|
image: Image | PathLike[str] | str,
|
290
160
|
*,
|
291
|
-
language:
|
161
|
+
language: str = "eng",
|
292
162
|
psm: PSMMode = PSMMode.AUTO,
|
293
163
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
294
164
|
) -> ExtractionResult:
|
@@ -320,7 +190,7 @@ async def process_image_with_tesseract(
|
|
320
190
|
async def batch_process_images(
|
321
191
|
images: list[T],
|
322
192
|
*,
|
323
|
-
language:
|
193
|
+
language: str = "eng",
|
324
194
|
psm: PSMMode = PSMMode.AUTO,
|
325
195
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
326
196
|
) -> list[ExtractionResult]:
|
@@ -38,7 +38,7 @@ from kreuzberg._pdf import (
|
|
38
38
|
)
|
39
39
|
from kreuzberg._pptx import extract_pptx_file_content
|
40
40
|
from kreuzberg._string import safe_decode
|
41
|
-
from kreuzberg._tesseract import PSMMode,
|
41
|
+
from kreuzberg._tesseract import PSMMode, process_image_with_tesseract
|
42
42
|
from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
|
43
43
|
from kreuzberg.exceptions import ValidationError
|
44
44
|
|
@@ -52,7 +52,7 @@ async def extract_bytes(
|
|
52
52
|
mime_type: str,
|
53
53
|
*,
|
54
54
|
force_ocr: bool = False,
|
55
|
-
language:
|
55
|
+
language: str = "eng",
|
56
56
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
57
57
|
psm: PSMMode = PSMMode.AUTO,
|
58
58
|
) -> ExtractionResult:
|
@@ -114,7 +114,7 @@ async def extract_file(
|
|
114
114
|
mime_type: str | None = None,
|
115
115
|
*,
|
116
116
|
force_ocr: bool = False,
|
117
|
-
language:
|
117
|
+
language: str = "eng",
|
118
118
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
119
119
|
psm: PSMMode = PSMMode.AUTO,
|
120
120
|
) -> ExtractionResult:
|
@@ -170,7 +170,7 @@ async def batch_extract_file(
|
|
170
170
|
file_paths: Sequence[PathLike[str] | str],
|
171
171
|
*,
|
172
172
|
force_ocr: bool = False,
|
173
|
-
language:
|
173
|
+
language: str = "eng",
|
174
174
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
175
175
|
psm: PSMMode = PSMMode.AUTO,
|
176
176
|
) -> list[ExtractionResult]:
|
@@ -209,7 +209,7 @@ async def batch_extract_bytes(
|
|
209
209
|
contents: Sequence[tuple[bytes, str]],
|
210
210
|
*,
|
211
211
|
force_ocr: bool = False,
|
212
|
-
language:
|
212
|
+
language: str = "eng",
|
213
213
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
214
214
|
psm: PSMMode = PSMMode.AUTO,
|
215
215
|
) -> list[ExtractionResult]:
|
@@ -253,7 +253,7 @@ def extract_bytes_sync(
|
|
253
253
|
mime_type: str,
|
254
254
|
*,
|
255
255
|
force_ocr: bool = False,
|
256
|
-
language:
|
256
|
+
language: str = "eng",
|
257
257
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
258
258
|
psm: PSMMode = PSMMode.AUTO,
|
259
259
|
) -> ExtractionResult:
|
@@ -281,7 +281,7 @@ def extract_file_sync(
|
|
281
281
|
mime_type: str | None = None,
|
282
282
|
*,
|
283
283
|
force_ocr: bool = False,
|
284
|
-
language:
|
284
|
+
language: str = "eng",
|
285
285
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
286
286
|
psm: PSMMode = PSMMode.AUTO,
|
287
287
|
) -> ExtractionResult:
|
@@ -308,7 +308,7 @@ def batch_extract_file_sync(
|
|
308
308
|
file_paths: Sequence[PathLike[str] | str],
|
309
309
|
*,
|
310
310
|
force_ocr: bool = False,
|
311
|
-
language:
|
311
|
+
language: str = "eng",
|
312
312
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
313
313
|
psm: PSMMode = PSMMode.AUTO,
|
314
314
|
) -> list[ExtractionResult]:
|
@@ -339,7 +339,7 @@ def batch_extract_bytes_sync(
|
|
339
339
|
contents: Sequence[tuple[bytes, str]],
|
340
340
|
*,
|
341
341
|
force_ocr: bool = False,
|
342
|
-
language:
|
342
|
+
language: str = "eng",
|
343
343
|
max_processes: int = DEFAULT_MAX_PROCESSES,
|
344
344
|
psm: PSMMode = PSMMode.AUTO,
|
345
345
|
) -> list[ExtractionResult]:
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.2
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 2.0.0
|
3
|
+
Version: 2.0.1
|
4
4
|
Summary: A text extraction library supporting PDFs, images, office documents and more
|
5
5
|
Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
|
6
6
|
License: MIT
|
@@ -64,7 +64,31 @@ Kreuzberg requires two system level dependencies:
|
|
64
64
|
- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
|
65
65
|
- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
|
66
66
|
|
67
|
-
|
67
|
+
You can install these with:
|
68
|
+
|
69
|
+
#### Linux (Ubuntu)
|
70
|
+
|
71
|
+
```shell
|
72
|
+
sudo apt-get install pandoc tesseract-ocr
|
73
|
+
```
|
74
|
+
|
75
|
+
#### MacOS
|
76
|
+
|
77
|
+
```shell
|
78
|
+
# MacOS
|
79
|
+
brew install tesseract pandoc
|
80
|
+
```
|
81
|
+
|
82
|
+
#### Windows
|
83
|
+
|
84
|
+
```shell
|
85
|
+
choco install -y tesseract pandoc
|
86
|
+
```
|
87
|
+
|
88
|
+
Notes:
|
89
|
+
|
90
|
+
- In most distributions the tesseract-ocr package is split into multiple packages; you may need to separately install any language models you need other than English.
|
91
|
+
- please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
|
68
92
|
|
69
93
|
## Architecture
|
70
94
|
|
@@ -152,18 +176,26 @@ All extraction functions accept the following optional parameters for configurin
|
|
152
176
|
|
153
177
|
#### OCR Configuration
|
154
178
|
|
155
|
-
- `
|
156
|
-
|
157
|
-
|
158
|
-
-
|
179
|
+
- `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
|
180
|
+
- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
|
181
|
+
|
182
|
+
- `eng` for English
|
183
|
+
- `deu` for German
|
184
|
+
- `eng+deu` for English and German
|
185
|
+
|
186
|
+
Notes: - the order of languages affects processing time; the first language is the primary language, the second language is the secondary language, etc.
|
187
|
+
|
188
|
+
- `psm` (Page Segmentation Mode, default: `PSMMode.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
|
189
|
+
|
190
|
+
Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
|
159
191
|
|
160
|
-
|
192
|
+
#### Processing Configuration
|
161
193
|
|
162
|
-
- `
|
194
|
+
- `max_processes` (default: CPU count / 2): Maximum number of concurrent processes for Tesseract and Pandoc.
|
163
195
|
|
164
|
-
|
196
|
+
Notes:
|
165
197
|
|
166
|
-
-
|
198
|
+
- Higher values can lead to performance improvements when batch processing, especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
|
167
199
|
|
168
200
|
### Quick Start
|
169
201
|
|
@@ -171,7 +203,7 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)
|
|
171
203
|
from pathlib import Path
|
172
204
|
from kreuzberg import extract_file
|
173
205
|
from kreuzberg.extraction import ExtractionResult
|
174
|
-
from kreuzberg._tesseract import PSMMode
|
206
|
+
from kreuzberg._tesseract import PSMMode
|
175
207
|
|
176
208
|
|
177
209
|
# Basic file extraction
|
@@ -193,7 +225,7 @@ async def extract_document():
|
|
193
225
|
docx_result = await extract_file(Path("document.docx"))
|
194
226
|
if docx_result.metadata:
|
195
227
|
print(f"Title: {docx_result.metadata.get('title')}")
|
196
|
-
print(f"Author: {docx_result.metadata.get('
|
228
|
+
print(f"Author: {docx_result.metadata.get('creator')}")
|
197
229
|
```
|
198
230
|
|
199
231
|
### Extracting Bytes
|
@@ -236,7 +268,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
|
|
236
268
|
|
237
269
|
```python
|
238
270
|
from pathlib import Path
|
239
|
-
from kreuzberg import batch_extract_file, batch_extract_bytes
|
271
|
+
from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
|
240
272
|
|
241
273
|
|
242
274
|
# Process multiple files concurrently
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|
File without changes
|