kreuzberg 2.0.0__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/PKG-INFO +48 -20
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/README.md +47 -19
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/__init__.py +14 -1
- kreuzberg-2.1.0/kreuzberg/_constants.py +8 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_html.py +1 -2
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pandoc.py +37 -73
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pdf.py +5 -6
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_string.py +1 -1
- kreuzberg-2.1.0/kreuzberg/_sync.py +74 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_tesseract.py +55 -176
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_xlsx.py +34 -36
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/exceptions.py +20 -1
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/extraction.py +13 -15
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/PKG-INFO +48 -20
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/pyproject.toml +4 -5
- kreuzberg-2.0.0/kreuzberg/_constants.py +0 -6
- kreuzberg-2.0.0/kreuzberg/_sync.py +0 -33
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/LICENSE +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pptx.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_tmp.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_types.py +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/requires.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-2.0.0 → kreuzberg-2.1.0}/setup.cfg +0 -0
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 2.0.0
+Version: 2.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,10 +61,34 @@ pip install kreuzberg
 
 Kreuzberg requires two system level dependencies:
 
-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.
 
-
+You can install these with:
+
+#### Linux (Ubuntu)
+
+```shell
+sudo apt-get install pandoc tesseract-ocr
+```
+
+#### MacOS
+
+```shell
+#
+brew install tesseract pandoc
+```
+
+#### Windows
+
+```shell
+choco install -y tesseract pandoc
+```
+
+Notes:
+
+- In most distributions the tesseract-ocr package is split into multiple packages; you may need to install any language models you need other than English separately.
+- Please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
 
 ## Architecture
 
@@ -152,26 +176,30 @@ All extraction functions accept the following optional parameters for configuring
 
 #### OCR Configuration
 
-- `
-
-
--
+- `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
+- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
+
+  - `eng` for English
+  - `deu` for German
+  - `eng+deu` for English and German
 
-
+  Note: the order of languages affects processing time; the first language is the primary language, the second is the secondary language, and so on.
 
-- `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
+- `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
 
-
+Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
 
-
+#### Processing Configuration
+
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
 
 ### Quick Start
 
 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg
-from kreuzberg
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode
 
 
 # Basic file extraction
@@ -193,14 +221,14 @@ async def extract_document():
     docx_result = await extract_file(Path("document.docx"))
     if docx_result.metadata:
         print(f"Title: {docx_result.metadata.get('title')}")
-        print(f"Author: {docx_result.metadata.get('
+        print(f"Author: {docx_result.metadata.get('creator')}")
 ```
 
 ### Extracting Bytes
 
 ```python
 from kreuzberg import extract_bytes
-from kreuzberg
+from kreuzberg import ExtractionResult
 
 
 async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -236,7 +264,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
 
 ```python
 from pathlib import Path
-from kreuzberg import batch_extract_file, batch_extract_bytes
+from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
 
 
 # Process multiple files concurrently
@@ -346,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 
 ```python
-from kreuzberg import
-
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/README.md

@@ -7,7 +7,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -26,10 +26,34 @@ pip install kreuzberg
 
 Kreuzberg requires two system level dependencies:
 
-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.
 
-
+You can install these with:
+
+#### Linux (Ubuntu)
+
+```shell
+sudo apt-get install pandoc tesseract-ocr
+```
+
+#### MacOS
+
+```shell
+#
+brew install tesseract pandoc
+```
+
+#### Windows
+
+```shell
+choco install -y tesseract pandoc
+```
+
+Notes:
+
+- In most distributions the tesseract-ocr package is split into multiple packages; you may need to install any language models you need other than English separately.
+- Please consult the official documentation for these libraries for the most up-to-date installation instructions for your platform.
 
 ## Architecture
 
@@ -117,26 +141,30 @@ All extraction functions accept the following optional parameters for configuring
 
 #### OCR Configuration
 
-- `
-
-
--
+- `force_ocr` (default: `False`): Forces OCR processing even for searchable PDFs.
+- `language` (default: `eng`): Specifies the language model for Tesseract OCR. This affects text recognition accuracy for documents in different languages. Examples:
+
+  - `eng` for English
+  - `deu` for German
+  - `eng+deu` for English and German
 
-
+  Note: the order of languages affects processing time; the first language is the primary language, the second is the secondary language, and so on.
 
-- `psm` (Page Segmentation Mode, default: PSM.AUTO): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
+- `psm` (Page Segmentation Mode, default: `PSM.AUTO`): Controls how Tesseract analyzes page layout. In most cases you do not need to change this to a different value.
 
-
+Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/) for more information on both options.
 
-
+#### Processing Configuration
+
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.
 
 ### Quick Start
 
 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg
-from kreuzberg
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode
 
 
 # Basic file extraction
@@ -158,14 +186,14 @@ async def extract_document():
     docx_result = await extract_file(Path("document.docx"))
     if docx_result.metadata:
         print(f"Title: {docx_result.metadata.get('title')}")
-        print(f"Author: {docx_result.metadata.get('
+        print(f"Author: {docx_result.metadata.get('creator')}")
 ```
 
 ### Extracting Bytes
 
 ```python
 from kreuzberg import extract_bytes
-from kreuzberg
+from kreuzberg import ExtractionResult
 
 
 async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -201,7 +229,7 @@ Kreuzberg supports efficient batch processing of multiple files or byte contents
 
 ```python
 from pathlib import Path
-from kreuzberg import batch_extract_file, batch_extract_bytes
+from kreuzberg import batch_extract_file, batch_extract_bytes, batch_extract_file_sync
 
 
 # Process multiple files concurrently
@@ -311,8 +339,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.
 
 ```python
-from kreuzberg import
-
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,
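The OCR and processing options documented in the README changes above (`force_ocr`, `language`, `psm`, `max_processes`) are described as optional keyword arguments to the extraction functions. A minimal sketch of combining them; the call shape follows the README, but the snippet itself is illustrative and not part of the package:

```python
import asyncio
from pathlib import Path

from kreuzberg import PSMMode, extract_file


async def main() -> None:
    # A scanned English/German document is assumed; language order sets priority.
    result = await extract_file(
        Path("scanned-report.pdf"),
        force_ocr=True,      # run OCR even if the PDF has a text layer
        language="eng+deu",  # primary + secondary Tesseract language models
        psm=PSMMode.AUTO,    # default page segmentation mode
        max_processes=4,     # cap concurrent Tesseract processes
    )
    print(result.mime_type, len(result.content))


asyncio.run(main())
```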
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/__init__.py

@@ -1,6 +1,14 @@
+from ._tesseract import PSMMode
 from ._types import ExtractionResult, Metadata
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
-from .extraction import
+from .extraction import (
+    batch_extract_bytes,
+    batch_extract_bytes_sync,
+    batch_extract_file,
+    batch_extract_file_sync,
+    extract_bytes,
+    extract_file,
+)
 
 __all__ = [
     "ExtractionResult",
@@ -8,8 +16,13 @@ __all__ = [
     "Metadata",
     "MissingDependencyError",
     "OCRError",
+    "PSMMode",
     "ParsingError",
     "ValidationError",
+    "batch_extract_bytes",
+    "batch_extract_bytes_sync",
+    "batch_extract_file",
+    "batch_extract_file_sync",
     "extract_bytes",
     "extract_file",
 ]
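Version 2.1.0 also begins exporting synchronous batch helpers (`batch_extract_file_sync`, `batch_extract_bytes_sync`) alongside `PSMMode`. Their exact signatures are not shown in this diff; the sketch below assumes they mirror the async counterparts (paths in, a list of `ExtractionResult` out):

```python
from pathlib import Path

from kreuzberg import batch_extract_file_sync

paths = [Path("report.pdf"), Path("notes.docx")]

# Assumption: same inputs/outputs as the async batch_extract_file, but blocking,
# so it can be called from code that has no running event loop.
results = batch_extract_file_sync(paths)
for path, result in zip(paths, results):
    print(path.name, result.mime_type, len(result.content))
```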
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_html.py

@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
 from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync
 
 if TYPE_CHECKING:
     from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> ExtractionResult:
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    result =
+    result = html_to_markdown.convert_to_markdown(content)
     return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})
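`_html.py` now calls `html_to_markdown.convert_to_markdown` directly instead of wrapping it in `run_sync`. A standalone sketch of that conversion step; the HTML string is made up, but the call is the one shown in the diff:

```python
import html_to_markdown

html = "<h1>Changelog</h1><p>Version <strong>2.1.0</strong> adds sync batch APIs.</p>"

# Synchronous conversion; in kreuzberg the result is then passed through
# normalize_spaces and wrapped in an ExtractionResult.
markdown = html_to_markdown.convert_to_markdown(html)
print(markdown)
```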
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pandoc.py

@@ -1,21 +1,22 @@
 from __future__ import annotations
 
-import
+import re
 import sys
 from functools import partial
 from json import JSONDecodeError, loads
 from typing import TYPE_CHECKING, Any, Final, Literal, cast
 
-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process
 
-from kreuzberg
+from kreuzberg import ValidationError
+from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import
+from kreuzberg._sync import run_taskgroup
 from kreuzberg._tmp import create_temp_file
 from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg.exceptions import MissingDependencyError, ParsingError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING:  # pragma: no cover
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
 
-
 version_ref: Final[dict[str, bool]] = {"checked": False}
 
-
 # Block-level node types in Pandoc AST
 BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
 BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:
 
 
 def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
-    if
-
-
-
-
-                "mime_type": mime_type,
-                "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
-            },
+    if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
+        return pandoc_type
+
+    if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
+        return next(
+            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
         )
 
-
-            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
-        )
+    raise ValidationError(f"Unsupported mime type: {mime_type}")
 
 
 async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
             return
 
         command = ["pandoc", "--version"]
-        result = await
-
-
-
+        result = await run_process(command)
+
+        version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
+            raise MissingDependencyError("Pandoc version 2 or above is required")
 
         version_ref["checked"] = True
 
     except FileNotFoundError as e:
-        raise MissingDependencyError("Pandoc is not installed
+        raise MissingDependencyError("Pandoc is not installed") from e
 
 
-async def _handle_extract_metadata(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> Metadata:
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     metadata_file, unlink = await create_temp_file(".json")
     try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
             "--standalone",
             "--quiet",
             "--output",
-            metadata_file,
+            str(metadata_file),
         ]
 
-        result = await
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@ async def _handle_extract_metadata(
         await unlink()
 
 
-async def _handle_extract_file(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> str:
+async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     output_path, unlink = await create_temp_file(".md")
     try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(
 
         command.extend(["--output", str(output_path)])
 
-        result = await
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
         await unlink()
 
 
-async def process_file_with_pandoc(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
     """Process a single file using Pandoc and convert to markdown.
 
     Args:
         input_file: The path to the file to process.
         mime_type: The mime type of the file.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
 
     Raises:
         ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(
 
     _get_pandoc_type_from_mime_type(mime_type)
 
-    metadata: Metadata = {}
-    content: str = ""
-
     try:
-
-
-
-
-            metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
-
-        async def _get_content() -> None:
-            nonlocal content
-            content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
+        metadata, content = await run_taskgroup(
+            partial(_handle_extract_metadata, input_file, mime_type=mime_type),
+            partial(_handle_extract_file, input_file, mime_type=mime_type),
+        )
 
-
-
+        return ExtractionResult(
+            content=normalize_spaces(cast(str, content)),
+            metadata=cast(Metadata, metadata),
+            mime_type=MARKDOWN_MIME_TYPE,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to
-
-    return ExtractionResult(
-        content=normalize_spaces(content),
-        metadata=metadata,
-        mime_type=MARKDOWN_MIME_TYPE,
-    )
+        raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg
 
 
-async def process_content_with_pandoc(
-    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
     """Process content using Pandoc and convert to markdown.
 
     Args:
         content: The content to process.
         mime_type: The mime type of the content.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
 
     Returns:
         ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
     input_file, unlink = await create_temp_file(f".{extension}")
 
     await AsyncPath(input_file).write_bytes(content)
-    result = await process_file_with_pandoc(input_file, mime_type=mime_type
+    result = await process_file_with_pandoc(input_file, mime_type=mime_type)
 
     await unlink()
     return result
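The rewritten `_validate_pandoc_version` shells out via `anyio.run_process` and compares only the major version against `MINIMAL_SUPPORTED_PANDOC_VERSION`. A worked example of that regex on a sample `pandoc --version` banner; the sample output string and the constant's value (assumed to be 2, per the "Minimum required version is Pandoc 2" note in the README) are illustrative:

```python
import re

MINIMAL_SUPPORTED_PANDOC_VERSION = 2  # assumed value of the new _constants.py entry

# The first line of `pandoc --version` output typically looks like "pandoc 3.1.11".
sample_output = "pandoc 3.1.11\nFeatures: +server +lua\n"

match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", sample_output)
assert match is not None
print(int(match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION)  # True -> Pandoc 2+ requirement met
```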
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_pdf.py

@@ -11,7 +11,7 @@ from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._string import normalize_spaces
 from kreuzberg._sync import run_sync
-from kreuzberg._tesseract import PSMMode,
+from kreuzberg._tesseract import PSMMode, batch_process_images
 from kreuzberg.exceptions import ParsingError
 
 if TYPE_CHECKING:  # pragma: no cover
@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
     document: pypdfium2.PdfDocument | None = None
     try:
         document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=
+        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -80,7 +80,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
 async def _extract_pdf_text_with_ocr(
     input_file: Path,
     *,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -132,7 +132,7 @@ async def extract_pdf_file(
     input_file: Path,
     *,
     force_ocr: bool,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
@@ -154,7 +154,6 @@ async def extract_pdf_file(
         and _validate_extracted_text(content)
     ):
         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
-
     return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
 
 
@@ -162,7 +161,7 @@ async def extract_pdf_content(
     content: bytes,
     *,
     force_ocr: bool,
-    language:
+    language: str = "eng",
     max_processes: int,
     psm: PSMMode = PSMMode.AUTO,
 ) -> ExtractionResult:
{kreuzberg-2.0.0 → kreuzberg-2.1.0}/kreuzberg/_string.py

@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]
 
     for enc in [e for e in encodings if e]:  # pragma: no cover
-        with suppress(UnicodeDecodeError):
+        with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)
 
     # If all encodings fail, fall back to latin-1 which can handle any byte
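`safe_decode` now also suppresses `LookupError`, presumably because charset detection can return an encoding label that Python's codec registry does not recognise; `bytes.decode` raises `LookupError` (not `UnicodeDecodeError`) in that case. A small illustration:

```python
data = "café".encode("utf-8")

try:
    # An unrecognised codec name raises LookupError rather than UnicodeDecodeError.
    data.decode("definitely-not-a-codec")
except LookupError as exc:
    print(exc)  # unknown encoding: definitely-not-a-codec

# With LookupError suppressed as well, safe_decode simply moves on to the next
# candidate encoding and ultimately falls back to latin-1.
```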
kreuzberg-2.1.0/kreuzberg/_sync.py

@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import sys
+from functools import partial
+from typing import TYPE_CHECKING, TypeVar, cast
+
+from anyio import create_task_group
+from anyio.to_thread import run_sync as any_io_run_sync
+
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Callable, Coroutine
+
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:  # pragma: no cover
+    from typing_extensions import ParamSpec
+
+T = TypeVar("T")
+P = ParamSpec("P")
+
+
+async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Run a synchronous function in an asynchronous context.
+
+    Args:
+        sync_fn: The synchronous function to run.
+        *args: The positional arguments to pass to the function.
+        **kwargs: The keyword arguments to pass to the function.
+
+    Returns:
+        The result of the synchronous function.
+    """
+    handler = partial(sync_fn, **kwargs)
+    return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+
+
+async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
+    """Run a list of coroutines concurrently.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results = cast(list[T], [None] * len(async_tasks))
+
+    async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
+        results[index] = await task()
+
+    async with create_task_group() as tg:
+        for i, t in enumerate(async_tasks):
+            tg.start_soon(run_task, i, t)
+
+    return results
+
+
+async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
+    """Run a list of coroutines concurrently in batches.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+        batch_size: The size of each batch.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results: list[T] = []
+
+    for i in range(0, len(async_tasks), batch_size):
+        batch = async_tasks[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+
+    return results