kreuzberg 2.0.1__tar.gz → 2.1.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/PKG-INFO +11 -15
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/README.md +10 -14
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/__init__.py +2 -0
- kreuzberg-2.1.0/kreuzberg/_constants.py +8 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_html.py +1 -2
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_pandoc.py +37 -73
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_pdf.py +1 -2
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_string.py +1 -1
- kreuzberg-2.1.0/kreuzberg/_sync.py +74 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_tesseract.py +51 -42
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_xlsx.py +34 -36
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/exceptions.py +20 -1
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/extraction.py +4 -6
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/PKG-INFO +11 -15
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/pyproject.toml +4 -5
- kreuzberg-2.0.1/kreuzberg/_constants.py +0 -6
- kreuzberg-2.0.1/kreuzberg/_sync.py +0 -33
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/LICENSE +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_mime_types.py +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_pptx.py +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_tmp.py +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_types.py +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/py.typed +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/SOURCES.txt +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/dependency_links.txt +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/requires.txt +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/top_level.txt +0 -0
- {kreuzberg-2.0.1 → kreuzberg-2.1.0}/setup.cfg +0 -0
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/PKG-INFO

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: kreuzberg
-Version: 2.0.1
+Version: 2.1.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
@@ -42,7 +42,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **…
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -61,8 +61,8 @@ pip install kreuzberg

 Kreuzberg requires two system level dependencies:

-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.

 You can install these with:

@@ -75,7 +75,7 @@ sudo apt-get install pandoc tesseract-ocr
 #### MacOS

 ```shell
-#…
+#…
 brew install tesseract pandoc
 ```

@@ -191,19 +191,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)

 #### Processing Configuration

-- `max_processes` (default: CPU count…
-
-Notes:
-
-- Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.

 ### Quick Start

 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg…
-from kreuzberg…
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode


 # Basic file extraction
@@ -232,7 +228,7 @@ async def extract_document():

 ```python
 from kreuzberg import extract_bytes
-from kreuzberg…
+from kreuzberg import ExtractionResult


 async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -378,8 +374,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.

 ```python
-from kreuzberg import…
-
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,

{kreuzberg-2.0.1 → kreuzberg-2.1.0}/README.md

@@ -7,7 +7,7 @@ Kreuzberg is a Python library for text extraction from documents. It provides a
 - **Simple and Hassle-Free**: Clean API that just works, without complex configuration
 - **Local Processing**: No external API calls or cloud dependencies required
 - **Resource Efficient**: Lightweight processing without GPU requirements
-- **…
+- **Small Package Size**: Has few curated dependencies and a minimal footprint
 - **Format Support**: Comprehensive support for documents, images, and text formats
 - **Modern Python**: Built with async/await, type hints, and functional first approach
 - **Permissive OSS**: Kreuzberg and its dependencies have a permissive OSS license
@@ -26,8 +26,8 @@ pip install kreuzberg

 Kreuzberg requires two system level dependencies:

-- [Pandoc](https://pandoc.org/installing.html) - For document format conversion
-- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR
+- [Pandoc](https://pandoc.org/installing.html) - For document format conversion. Minimum required version is Pandoc 2.
+- [Tesseract OCR](https://tesseract-ocr.github.io/) - For image and PDF OCR. Minimum required version is Tesseract 4.

 You can install these with:

@@ -40,7 +40,7 @@ sudo apt-get install pandoc tesseract-ocr
 #### MacOS

 ```shell
-#…
+#…
 brew install tesseract pandoc
 ```

@@ -156,19 +156,15 @@ Consult the [Tesseract documentation](https://tesseract-ocr.github.io/tessdoc/)

 #### Processing Configuration

-- `max_processes` (default: CPU count…
-
-Notes:
-
-- Higher values can lead to performance improvements when batch processing especially with OCR, but may cause resource exhaustion and deadlocks (especially for tesseract).
+- `max_processes` (default: CPU count): Maximum number of concurrent processes for Tesseract.

 ### Quick Start

 ```python
 from pathlib import Path
 from kreuzberg import extract_file
-from kreuzberg…
-from kreuzberg…
+from kreuzberg import ExtractionResult
+from kreuzberg import PSMMode


 # Basic file extraction
@@ -197,7 +193,7 @@ async def extract_document():

 ```python
 from kreuzberg import extract_bytes
-from kreuzberg…
+from kreuzberg import ExtractionResult


 async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
@@ -343,8 +339,8 @@ async def process_document(path: str) -> tuple[str, str, Metadata]:
 Kreuzberg provides comprehensive error handling through several exception types, all inheriting from `KreuzbergError`. Each exception includes helpful context information for debugging.

 ```python
-from kreuzberg import…
-
+from kreuzberg import (
+    extract_file,
     ValidationError,
     ParsingError,
     OCRError,

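The updated Quick Start imports `PSMMode` from the package root, which 2.1.0 exports via `kreuzberg/__init__.py` (next section). A minimal sketch of using that export; the file name is made up, and the `psm`/`language` keyword arguments on `extract_file` are assumed from the internal calls shown later in the `extraction.py` diff:

```python
import asyncio
from pathlib import Path

from kreuzberg import PSMMode, extract_file


async def main() -> None:
    # PSMMode is importable from the package root as of 2.1.0.
    result = await extract_file(Path("scanned.png"), psm=PSMMode.AUTO, language="eng")
    print(result.content)


asyncio.run(main())
```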
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/__init__.py

@@ -1,3 +1,4 @@
+from ._tesseract import PSMMode
 from ._types import ExtractionResult, Metadata
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
@@ -15,6 +16,7 @@ __all__ = [
     "Metadata",
     "MissingDependencyError",
     "OCRError",
+    "PSMMode",
     "ParsingError",
     "ValidationError",
     "batch_extract_bytes",

{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_html.py

@@ -8,7 +8,6 @@ from anyio import Path as AsyncPath
 from kreuzberg import ExtractionResult
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces, safe_decode
-from kreuzberg._sync import run_sync

 if TYPE_CHECKING:
     from pathlib import Path
@@ -28,5 +27,5 @@ async def extract_html_string(file_path_or_contents: Path | bytes) -> Extraction
         if isinstance(file_path_or_contents, bytes)
         else await AsyncPath(file_path_or_contents).read_text()
     )
-    result = …
+    result = html_to_markdown.convert_to_markdown(content)
     return ExtractionResult(content=normalize_spaces(result), mime_type=MARKDOWN_MIME_TYPE, metadata={})

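The HTML path now calls `html_to_markdown.convert_to_markdown` directly instead of dispatching it through `run_sync`. A minimal sketch of that conversion step in isolation; the call is the one shown in the diff, while the sample HTML string is made up:

```python
import html_to_markdown

html = "<h1>Title</h1><p>Some <strong>bold</strong> text.</p>"
markdown = html_to_markdown.convert_to_markdown(html)
print(markdown)
```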
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_pandoc.py

@@ -1,21 +1,22 @@
 from __future__ import annotations

-import …
+import re
 import sys
 from functools import partial
 from json import JSONDecodeError, loads
 from typing import TYPE_CHECKING, Any, Final, Literal, cast

-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process

-from kreuzberg…
+from kreuzberg import ValidationError
+from kreuzberg._constants import MINIMAL_SUPPORTED_PANDOC_VERSION
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import …
+from kreuzberg._sync import run_taskgroup
 from kreuzberg._tmp import create_temp_file
 from kreuzberg._types import ExtractionResult, Metadata
-from kreuzberg.exceptions import MissingDependencyError, ParsingError
+from kreuzberg.exceptions import MissingDependencyError, ParsingError

 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Mapping
@@ -24,10 +25,8 @@ if TYPE_CHECKING:  # pragma: no cover
 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

-
 version_ref: Final[dict[str, bool]] = {"checked": False}

-
 # Block-level node types in Pandoc AST
 BLOCK_HEADER: Final = "Header"  # Header with level, attributes and inline content
 BLOCK_PARA: Final = "Para"  # Paragraph containing inline content
@@ -229,20 +228,15 @@ def _extract_metadata(raw_meta: dict[str, Any]) -> Metadata:


 def _get_pandoc_type_from_mime_type(mime_type: str) -> str:
-    if …
-            "mime_type": mime_type,
-            "supported_mimetypes": ",".join(sorted(MIMETYPE_TO_PANDOC_TYPE_MAPPING)),
-        },
+    if pandoc_type := (MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
+        return pandoc_type
+
+    if any(k.startswith(mime_type) for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING):
+        return next(
+            MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
         )

-
-        MIMETYPE_TO_PANDOC_TYPE_MAPPING[k] for k in MIMETYPE_TO_PANDOC_TYPE_MAPPING if k.startswith(mime_type)
-        )
+    raise ValidationError(f"Unsupported mime type: {mime_type}")


 async def _validate_pandoc_version() -> None:
@@ -251,20 +245,19 @@ async def _validate_pandoc_version() -> None:
             return

         command = ["pandoc", "--version"]
-        result = await …
+        result = await run_process(command)
+
+        version_match = re.search(r"pandoc\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+        if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_PANDOC_VERSION:
+            raise MissingDependencyError("Pandoc version 2 or above is required")

         version_ref["checked"] = True

     except FileNotFoundError as e:
-        raise MissingDependencyError("Pandoc is not installed…
+        raise MissingDependencyError("Pandoc is not installed") from e


-async def _handle_extract_metadata(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> Metadata:
+async def _handle_extract_metadata(input_file: str | PathLike[str], *, mime_type: str) -> Metadata:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     metadata_file, unlink = await create_temp_file(".json")
     try:
@@ -276,15 +269,10 @@ async def _handle_extract_metadata(
             "--standalone",
             "--quiet",
             "--output",
-            metadata_file,
+            str(metadata_file),
         ]

-        result = await …
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)

         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -297,9 +285,7 @@
         await unlink()


-async def _handle_extract_file(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> str:
+async def _handle_extract_file(input_file: str | PathLike[str], *, mime_type: str) -> str:
     pandoc_type = _get_pandoc_type_from_mime_type(mime_type)
     output_path, unlink = await create_temp_file(".md")
     try:
@@ -315,12 +301,7 @@ async def _handle_extract_file(

         command.extend(["--output", str(output_path)])

-        result = await …
-            partial(subprocess.run, capture_output=True),
-            command,
-            cancellable=True,
-            limiter=CapacityLimiter(max_processes),
-        )
+        result = await run_process(command)

         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(input_file), "error": result.stderr})
@@ -334,15 +315,12 @@ async def _handle_extract_file(
         await unlink()


-async def process_file_with_pandoc(
-    input_file: str | PathLike[str], *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_file_with_pandoc(input_file: str | PathLike[str], *, mime_type: str) -> ExtractionResult:
     """Process a single file using Pandoc and convert to markdown.

     Args:
         input_file: The path to the file to process.
         mime_type: The mime type of the file.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         ParsingError: If the file data could not be extracted.
@@ -354,41 +332,27 @@ async def process_file_with_pandoc(

     _get_pandoc_type_from_mime_type(mime_type)

-    metadata: Metadata = {}
-    content: str = ""
-
     try:
-        metadata = await _handle_extract_metadata(input_file, mime_type=mime_type, max_processes=max_processes)
-
-        async def _get_content() -> None:
-            nonlocal content
-            content = await _handle_extract_file(input_file, mime_type=mime_type, max_processes=max_processes)
+        metadata, content = await run_taskgroup(
+            partial(_handle_extract_metadata, input_file, mime_type=mime_type),
+            partial(_handle_extract_file, input_file, mime_type=mime_type),
+        )

+        return ExtractionResult(
+            content=normalize_spaces(cast(str, content)),
+            metadata=cast(Metadata, metadata),
+            mime_type=MARKDOWN_MIME_TYPE,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to …
-
-    return ExtractionResult(
-        content=normalize_spaces(content),
-        metadata=metadata,
-        mime_type=MARKDOWN_MIME_TYPE,
-    )
+        raise ParsingError("Failed to process file", context={"file": str(input_file), "errors": eg.exceptions}) from eg


-async def process_content_with_pandoc(
-    content: bytes, *, mime_type: str, max_processes: int = DEFAULT_MAX_PROCESSES
-) -> ExtractionResult:
+async def process_content_with_pandoc(content: bytes, *, mime_type: str) -> ExtractionResult:
     """Process content using Pandoc and convert to markdown.

     Args:
         content: The content to process.
         mime_type: The mime type of the content.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Returns:
         ExtractionResult
@@ -397,7 +361,7 @@ async def process_content_with_pandoc(
     input_file, unlink = await create_temp_file(f".{extension}")

     await AsyncPath(input_file).write_bytes(content)
-    result = await process_file_with_pandoc(input_file, mime_type=mime_type…
+    result = await process_file_with_pandoc(input_file, mime_type=mime_type)

     await unlink()
     return result
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_pdf.py

@@ -67,7 +67,7 @@ async def _convert_pdf_to_images(input_file: Path) -> list[Image]:
     document: pypdfium2.PdfDocument | None = None
     try:
         document = await run_sync(pypdfium2.PdfDocument, str(input_file))
-        return [page.render(scale=…
+        return [page.render(scale=4.25).to_pil() for page in cast(pypdfium2.PdfDocument, document)]
     except pypdfium2.PdfiumError as e:
         raise ParsingError(
             "Could not convert PDF to images", context={"file_path": str(input_file), "error": str(e)}
@@ -154,7 +154,6 @@ async def extract_pdf_file(
         and _validate_extracted_text(content)
     ):
         return ExtractionResult(content=content, mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
-
     return await _extract_pdf_text_with_ocr(input_file, max_processes=max_processes, language=language, psm=psm)
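
Pages that need OCR are now rasterized at `scale=4.25` before being handed to Tesseract. Assuming pypdfium2's render scale is relative to the 72 DPI PDF user space (an assumption, not stated in the diff), that works out to roughly 300 DPI, a common target resolution for OCR:

```python
# Back-of-the-envelope check of the new render scale, assuming a 72 DPI base.
scale = 4.25
print(f"approx. {72 * scale:.0f} DPI")  # approx. 306 DPI
```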
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_string.py

@@ -22,7 +22,7 @@ def safe_decode(byte_data: bytes, encoding: str | None = None) -> str:
     encodings = [encoding, detect(byte_data).get("encoding", ""), "utf-8"]

     for enc in [e for e in encodings if e]:  # pragma: no cover
-        with suppress(UnicodeDecodeError):
+        with suppress(UnicodeDecodeError, LookupError):
             return byte_data.decode(enc)

     # If all encodings fail, fall back to latin-1 which can handle any byte
kreuzberg-2.1.0/kreuzberg/_sync.py (new file)

@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import sys
+from functools import partial
+from typing import TYPE_CHECKING, TypeVar, cast
+
+from anyio import create_task_group
+from anyio.to_thread import run_sync as any_io_run_sync
+
+if TYPE_CHECKING:  # pragma: no cover
+    from collections.abc import Callable, Coroutine
+
+if sys.version_info >= (3, 10):
+    from typing import ParamSpec
+else:  # pragma: no cover
+    from typing_extensions import ParamSpec
+
+T = TypeVar("T")
+P = ParamSpec("P")
+
+
+async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
+    """Run a synchronous function in an asynchronous context.
+
+    Args:
+        sync_fn: The synchronous function to run.
+        *args: The positional arguments to pass to the function.
+        **kwargs: The keyword arguments to pass to the function.
+
+    Returns:
+        The result of the synchronous function.
+    """
+    handler = partial(sync_fn, **kwargs)
+    return cast(T, await any_io_run_sync(handler, *args, abandon_on_cancel=True))  # pyright: ignore [reportCallIssue]
+
+
+async def run_taskgroup(*async_tasks: Callable[[], Coroutine[None, None, T]]) -> list[T]:
+    """Run a list of coroutines concurrently.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results = cast(list[T], [None] * len(async_tasks))
+
+    async def run_task(index: int, task: Callable[[], Coroutine[None, None, T]]) -> None:
+        results[index] = await task()
+
+    async with create_task_group() as tg:
+        for i, t in enumerate(async_tasks):
+            tg.start_soon(run_task, i, t)
+
+    return results
+
+
+async def run_taskgroup_batched(*async_tasks: Callable[[], Coroutine[None, None, T]], batch_size: int) -> list[T]:
+    """Run a list of coroutines concurrently in batches.
+
+    Args:
+        *async_tasks: The list of coroutines to run.
+        batch_size: The size of each batch.
+
+    Returns:
+        The results of the coroutines.
+    """
+    results: list[T] = []
+
+    for i in range(0, len(async_tasks), batch_size):
+        batch = async_tasks[i : i + batch_size]
+        results.extend(await run_taskgroup(*batch))
+
+    return results
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_tesseract.py

@@ -1,30 +1,27 @@
 from __future__ import annotations

 import re
-import subprocess
 import sys
 from enum import Enum
 from functools import partial
 from os import PathLike
-from typing import …
+from typing import Any, TypeVar, Union

-from anyio import CapacityLimiter, create_task_group, to_process
 from anyio import Path as AsyncPath
+from anyio import run_process
 from PIL.Image import Image

-from kreuzberg import …
-from kreuzberg._constants import DEFAULT_MAX_PROCESSES
+from kreuzberg._constants import DEFAULT_MAX_PROCESSES, MINIMAL_SUPPORTED_TESSERACT_VERSION
 from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup_batched
 from kreuzberg._tmp import create_temp_file
-from kreuzberg.…
+from kreuzberg._types import ExtractionResult
+from kreuzberg.exceptions import MissingDependencyError, OCRError, ParsingError

 if sys.version_info < (3, 11):  # pragma: no cover
     from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]

-MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
-
 version_ref = {"checked": False}

 T = TypeVar("T", bound=Union[Image, PathLike[str], str])
@@ -68,14 +65,16 @@ async def validate_tesseract_version() -> None:
             return

         command = ["tesseract", "--version"]
-        result = await …
-        version_match = re.search(r"tesseract\s+v?(\d+)", result.stdout.decode())
+        result = await run_process(command)
+        version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
         if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
             raise MissingDependencyError("Tesseract version 5 or above is required.")

         version_ref["checked"] = True
     except FileNotFoundError as e:
-        raise MissingDependencyError(…
+        raise MissingDependencyError(
+            "Tesseract is not installed or not in path. Please install tesseract 5 and above on your system."
+        ) from e
@@ -83,7 +82,6 @@ async def process_file(
     *,
     language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single image file using Tesseract OCR.

@@ -91,7 +89,6 @@ async def process_file(
         input_file: The path to the image file to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         OCRError: If OCR fails to extract text from the image.
@@ -102,6 +99,7 @@ async def process_file(
     output_path, unlink = await create_temp_file(".txt")
     try:
         output_base = str(output_path).replace(".txt", "")
+
         command = [
             "tesseract",
             str(input_file),
@@ -110,22 +108,44 @@ async def process_file(
             language,
             "--psm",
             str(psm.value),
+            "--oem",
+            "1",
+            "--loglevel",
+            "OFF",
+            "-c",
+            "thresholding_method=1",
+            "-c",
+            "tessedit_enable_dict_correction=1",
+            "-c",
+            "language_model_ngram_on=1",
+            "-c",
+            "textord_space_size_is_variable=1",
+            "-c",
+            "classify_use_pre_adapted_templates=1",
+            "-c",
+            "tessedit_dont_blkrej_good_wds=1",
+            "-c",
+            "tessedit_dont_rowrej_good_wds=1",
+            "-c",
+            "tessedit_use_primary_params_model=1",
         ]

-        …
-        )
+        env: dict[str, Any] | None = None
+        if sys.platform.startswith("linux"):
+            env = {"OMP_THREAD_LIMIT": "1"}
+
+        result = await run_process(command, env=env)

         if not result.returncode == 0:
-            raise OCRError(…
+            raise OCRError(
+                "OCR failed with a non-0 return code.",
+                context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+            )

         output = await AsyncPath(output_path).read_text("utf-8")
         return ExtractionResult(content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={})
     except (RuntimeError, OSError) as e:
-        raise OCRError("Failed to OCR using tesseract") from e
+        raise OCRError(f"Failed to OCR using tesseract: {e}") from e
     finally:
         await unlink()

@@ -135,7 +155,6 @@ async def process_image(
     *,
     language: str,
     psm: PSMMode,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Process a single Pillow Image using Tesseract OCR.

@@ -143,14 +162,13 @@ async def process_image(
         image: The Pillow Image to process.
         language: The language code for OCR.
         psm: Page segmentation mode.
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Returns:
         ExtractionResult: The extracted text from the image.
     """
     image_path, unlink = await create_temp_file(".png")
     await run_sync(image.save, str(image_path), format="PNG")
-    result = await process_file(image_path, language=language, psm=psm…
+    result = await process_file(image_path, language=language, psm=psm)
     await unlink()
     return result

@@ -160,7 +178,6 @@ async def process_image_with_tesseract(
     *,
     language: str = "eng",
     psm: PSMMode = PSMMode.AUTO,
-    max_processes: int = DEFAULT_MAX_PROCESSES,
 ) -> ExtractionResult:
     """Run Tesseract OCR asynchronously on a single Pillow Image or a list of Pillow Images.

@@ -168,7 +185,6 @@ async def process_image_with_tesseract(
         image: A single Pillow Image, a pathlike or a string or a list of Pillow Images to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).

     Raises:
         ValueError: If the input is not a Pillow Image or a list of Pillow Images.
@@ -179,10 +195,10 @@ async def process_image_with_tesseract(
     await validate_tesseract_version()

     if isinstance(image, Image):
-        return await process_image(image, language=language, psm=psm…
+        return await process_image(image, language=language, psm=psm)

     if isinstance(image, (PathLike, str)):
-        return await process_file(image, language=language, psm=psm…
+        return await process_file(image, language=language, psm=psm)

     raise ValueError("Input must be one of: str, Pathlike or Pillow Image.")

@@ -200,7 +216,7 @@ async def batch_process_images(
         images: A list of Pillow Images, paths or strings to process.
         language: The language code for OCR (default: "eng").
         psm: Page segmentation mode (default: PSMMode.AUTO).
-        max_processes: Maximum number of concurrent processes…
+        max_processes: Maximum number of concurrent processes (default: CPU count / 2).

     Raises:
         ParsingError: If OCR fails to extract text from any of the images.
@@ -209,17 +225,10 @@ async def batch_process_images(
         List of ExtractionResult objects, one per input image.
     """
     await validate_tesseract_version()
-    results = cast(list[ExtractionResult], list(range(len(images))))
-
-    async def _process_image(index: int, image: T) -> None:
-        results[index] = await process_image_with_tesseract(
-            image, language=language, psm=psm, max_processes=max_processes
-        )
-
     try:
-        …
+        return await run_taskgroup_batched(
+            *[partial(process_image_with_tesseract, image, language=language, psm=psm) for image in images],
+            batch_size=max_processes,
+        )
     except ExceptionGroup as eg:
-        raise ParsingError("Failed to process images with Tesseract") from eg
+        raise ParsingError("Failed to process images with Tesseract", context={"errors": eg.exceptions}) from eg
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/_xlsx.py

@@ -1,23 +1,47 @@
 from __future__ import annotations

 import csv
+import sys
+from functools import partial
 from io import StringIO
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING

 from anyio import Path as AsyncPath
-from anyio import create_task_group
 from python_calamine import CalamineWorkbook

 from kreuzberg import ExtractionResult, ParsingError
 from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
 from kreuzberg._pandoc import process_file_with_pandoc
 from kreuzberg._string import normalize_spaces
-from kreuzberg._sync import run_sync
+from kreuzberg._sync import run_sync, run_taskgroup
 from kreuzberg._tmp import create_temp_file

 if TYPE_CHECKING:  # pragma: no cover
     from pathlib import Path

+if sys.version_info < (3, 11):  # pragma: no cover
+    from exceptiongroup import ExceptionGroup  # type: ignore[import-not-found]
+
+
+async def convert_sheet_to_text(workbook: CalamineWorkbook, sheet_name: str) -> str:
+    values = workbook.get_sheet_by_name(sheet_name).to_python()
+
+    csv_buffer = StringIO()
+    writer = csv.writer(csv_buffer)
+
+    for row in values:
+        writer.writerow(row)
+
+    csv_data = csv_buffer.getvalue()
+    csv_buffer.close()
+
+    csv_path, unlink = await create_temp_file(".csv")
+    await AsyncPath(csv_path).write_text(csv_data)
+
+    result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
+    await unlink()
+    return f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
+

 async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """Extract text from an XLSX file by converting it to CSV and then to markdown.
@@ -33,46 +57,20 @@ async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
     """
     try:
         workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
-        …
-
-        async def convert_sheet_to_text(sheet_name: str) -> None:
-            nonlocal results
-            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)
-
-            csv_buffer = StringIO()
-            writer = csv.writer(csv_buffer)
-
-            for row in values:
-                writer.writerow(row)
-
-            csv_data = csv_buffer.getvalue()
-            csv_buffer.close()
-
-            from kreuzberg._tmp import create_temp_file
-
-            csv_path, unlink = await create_temp_file(".csv")
-            await AsyncPath(csv_path).write_text(csv_data)
-            result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
-            results[workbook.sheet_names.index(sheet_name)] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"
-            await unlink()
-
-        async with create_task_group() as tg:
-            for sheet_name in workbook.sheet_names:
-                tg.start_soon(convert_sheet_to_text, sheet_name)
+        results = await run_taskgroup(
+            *[partial(convert_sheet_to_text, workbook, sheet_name) for sheet_name in workbook.sheet_names]
+        )

         return ExtractionResult(
             content="\n\n".join(results),
             mime_type=MARKDOWN_MIME_TYPE,
             metadata={},
         )
-    except …
+    except ExceptionGroup as eg:
         raise ParsingError(
-            "…
-            context={…
-            },
-        ) from e
+            "Failed to extract file data",
+            context={"file": str(input_file), "errors": eg.exceptions},
+        ) from eg


 async def extract_xlsx_content(content: bytes) -> ExtractionResult:
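
Sheet conversion is now a module-level coroutine: each worksheet is dumped to CSV, converted by Pandoc, and returned as a `## <sheet name>` markdown section, with all sheets processed concurrently and joined by blank lines. A hedged sketch of calling the XLSX extractor directly; the workbook path is made up:

```python
import asyncio
from pathlib import Path

from kreuzberg._xlsx import extract_xlsx_file


async def main() -> None:
    result = await extract_xlsx_file(Path("report.xlsx"))
    # One "## <sheet name>" section per worksheet, separated by blank lines.
    print(result.content.split("\n\n")[0])  # e.g. "## Sheet1"


asyncio.run(main())
```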
|
@@ -14,9 +14,28 @@ class KreuzbergError(Exception):
|
|
14
14
|
self.context = context
|
15
15
|
super().__init__(message)
|
16
16
|
|
17
|
+
def _serialize_context(self, obj: Any) -> Any:
|
18
|
+
"""Recursively serialize context objects to ensure JSON compatibility."""
|
19
|
+
if isinstance(obj, bytes):
|
20
|
+
return obj.decode("utf-8", errors="replace")
|
21
|
+
if isinstance(obj, dict):
|
22
|
+
return {k: self._serialize_context(v) for k, v in obj.items()}
|
23
|
+
if isinstance(obj, (list, tuple)):
|
24
|
+
return [self._serialize_context(x) for x in obj]
|
25
|
+
if isinstance(obj, Exception):
|
26
|
+
return {
|
27
|
+
"type": obj.__class__.__name__,
|
28
|
+
"message": str(obj),
|
29
|
+
}
|
30
|
+
return obj
|
31
|
+
|
17
32
|
def __str__(self) -> str:
|
18
33
|
"""Return a string representation of the exception."""
|
19
|
-
|
34
|
+
if self.context:
|
35
|
+
serialized_context = self._serialize_context(self.context)
|
36
|
+
ctx = f"\n\nContext: {dumps(serialized_context)}"
|
37
|
+
else:
|
38
|
+
ctx = ""
|
20
39
|
|
21
40
|
return f"{self.__class__.__name__}: {super().__str__()}{ctx}"
|
22
41
|
|
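`KreuzbergError.__str__` now runs the context through `_serialize_context` before dumping it, so `bytes` values (for example captured stderr) are decoded and nested exceptions are reduced to their type and message. A sketch of what that enables, with a hypothetical context resembling the ones attached in the Pandoc and Tesseract paths:

```python
from kreuzberg.exceptions import ParsingError

error = ParsingError(
    "Failed to process file",
    context={"stderr": b"pandoc: unknown option", "errors": [ValueError("bad sheet")]},
)
# Prints something like:
# ParsingError: Failed to process file
#
# Context: {"stderr": "pandoc: unknown option", "errors": [{"type": "ValueError", "message": "bad sheet"}]}
print(str(error))
```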
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg/extraction.py

@@ -87,14 +87,12 @@ async def extract_bytes(
         return await extract_xlsx_content(content)

     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(
-            open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
-        )
+        return await process_image_with_tesseract(open_image(BytesIO(content)), psm=psm, language=language)

     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_content_with_pandoc(content=content, mime_type=mime_type…
+        return await process_content_with_pandoc(content=content, mime_type=mime_type)

     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(content)
@@ -150,12 +148,12 @@ async def extract_file(
         return await extract_xlsx_file(Path(input_file))

     if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
-        return await process_image_with_tesseract(input_file,…
+        return await process_image_with_tesseract(input_file, psm=psm, language=language)

     if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
         mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
     ):
-        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type…
+        return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type)

     if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
         return await extract_pptx_file_content(Path(input_file))
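`extract_bytes` and `extract_file` no longer forward `max_processes` to the image and Pandoc code paths; per-document concurrency is handled inside the helpers, so a basic call only needs the content and its mime type. A hedged sketch in the spirit of the README's upload example; the HTML payload is made up and the keyword form of `mime_type` is assumed:

```python
import asyncio

from kreuzberg import ExtractionResult, extract_bytes


async def process_upload(file_content: bytes, mime_type: str) -> ExtractionResult:
    # No max_processes needed for single-document extraction in 2.1.0.
    return await extract_bytes(file_content, mime_type=mime_type)


async def main() -> None:
    result = await process_upload(b"<h1>hello</h1>", "text/html")
    print(result.content)


asyncio.run(main())
```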
{kreuzberg-2.0.1 → kreuzberg-2.1.0}/kreuzberg.egg-info/PKG-INFO

Identical to the {kreuzberg-2.0.1 → kreuzberg-2.1.0}/PKG-INFO diff above: the version bump to 2.1.0 plus the same embedded README changes, with byte-for-byte identical hunks.

{kreuzberg-2.0.1 → kreuzberg-2.1.0}/pyproject.toml

@@ -1,6 +1,6 @@
 [project]
 name = "kreuzberg"
-version = "2.0.1"
+version = "2.1.0"
 description = "A text extraction library supporting PDFs, images, office documents and more"
 readme = "README.md"
 keywords = [
@@ -53,11 +53,11 @@ dev = [
   "mypy>=1.15.0",
   "pre-commit>=4.1.0",
   "pytest>=8.3.4",
-  "pytest-asyncio>=0.25.3",
   "pytest-cov>=6.0.0",
   "pytest-mock>=3.14.0",
-  "…
+  "pytest-timeout>=2.3.1",
   "ruff>=0.9.6",
+  "trio>=0.29.0",
 ]

 [tool.setuptools.packages.find]
@@ -100,6 +100,7 @@ lint.per-file-ignores."tests/**/*.*" = [
   "PT013",
   "RUF012",
   "S",
+  "SLF001",
 ]
 lint.isort.known-first-party = [ "kreuzberg", "tests" ]
 lint.mccabe.max-complexity = 15
@@ -114,8 +115,6 @@ keep_full_version = true
 max_supported_python = "3.13"

 [tool.pytest.ini_options]
-asyncio_mode = "auto"
-asyncio_default_fixture_loop_scope = "session"
 filterwarnings = [
   "ignore:Exception ignored in:pytest.PytestUnraisableExceptionWarning",
 ]

kreuzberg-2.0.1/kreuzberg/_sync.py (deleted; superseded by the new kreuzberg-2.1.0/kreuzberg/_sync.py above)

@@ -1,33 +0,0 @@
-from __future__ import annotations
-
-import sys
-from functools import partial
-from typing import TYPE_CHECKING, TypeVar, cast
-
-from anyio.to_thread import run_sync as any_io_run_sync
-
-if TYPE_CHECKING:  # pragma: no cover
-    from collections.abc import Callable
-
-if sys.version_info >= (3, 10):
-    from typing import ParamSpec
-else:  # pragma: no cover
-    from typing_extensions import ParamSpec
-
-T = TypeVar("T")
-P = ParamSpec("P")
-
-
-async def run_sync(sync_fn: Callable[P, T], *args: P.args, **kwargs: P.kwargs) -> T:
-    """Run a synchronous function in an asynchronous context.
-
-    Args:
-        sync_fn: The synchronous function to run.
-        *args: The positional arguments to pass to the function.
-        **kwargs: The keyword arguments to pass to the function.
-
-    Returns:
-        The result of the synchronous function.
-    """
-    handler = partial(sync_fn, **kwargs)
-    return cast(T, await any_io_run_sync(handler, *args))  # pyright: ignore [reportCallIssue]

The remaining files listed above with +0 −0 are unchanged between 2.0.1 and 2.1.0.