kreuzberg 1.7.0__py3-none-any.whl → 2.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -2
- kreuzberg/_constants.py +6 -0
- kreuzberg/_html.py +32 -0
- kreuzberg/_mime_types.py +109 -1
- kreuzberg/_pandoc.py +122 -169
- kreuzberg/_pdf.py +189 -0
- kreuzberg/_pptx.py +88 -0
- kreuzberg/_string.py +5 -8
- kreuzberg/_sync.py +6 -1
- kreuzberg/_tesseract.py +98 -71
- kreuzberg/_tmp.py +37 -0
- kreuzberg/_types.py +71 -0
- kreuzberg/_xlsx.py +92 -0
- kreuzberg/extraction.py +269 -64
- kreuzberg-2.0.0.dist-info/METADATA +419 -0
- kreuzberg-2.0.0.dist-info/RECORD +21 -0
- kreuzberg/_extractors.py +0 -280
- kreuzberg-1.7.0.dist-info/METADATA +0 -342
- kreuzberg-1.7.0.dist-info/RECORD +0 -15
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/LICENSE +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/WHEEL +0 -0
- {kreuzberg-1.7.0.dist-info → kreuzberg-2.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/_types.py
ADDED
@@ -0,0 +1,71 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import sys
|
4
|
+
from typing import NamedTuple, TypedDict
|
5
|
+
|
6
|
+
if sys.version_info < (3, 11): # pragma: no cover
|
7
|
+
from typing_extensions import NotRequired
|
8
|
+
else: # pragma: no cover
|
9
|
+
from typing import NotRequired
|
10
|
+
|
11
|
+
|
12
|
+
class Metadata(TypedDict, total=False):
|
13
|
+
"""Document metadata.
|
14
|
+
|
15
|
+
All fields are optional but will only be included if they contain non-empty values.
|
16
|
+
Any field that would be empty or None is omitted from the dictionary.
|
17
|
+
|
18
|
+
Different documents and extraction methods will yield different metadata.
|
19
|
+
"""
|
20
|
+
|
21
|
+
title: NotRequired[str]
|
22
|
+
"""Document title."""
|
23
|
+
subtitle: NotRequired[str]
|
24
|
+
"""Document subtitle."""
|
25
|
+
abstract: NotRequired[str | list[str]]
|
26
|
+
"""Document abstract, summary or description."""
|
27
|
+
authors: NotRequired[list[str]]
|
28
|
+
"""List of document authors."""
|
29
|
+
date: NotRequired[str]
|
30
|
+
"""Document date as string to preserve original format."""
|
31
|
+
subject: NotRequired[str]
|
32
|
+
"""Document subject or topic."""
|
33
|
+
description: NotRequired[str]
|
34
|
+
"""Extended description."""
|
35
|
+
keywords: NotRequired[list[str]]
|
36
|
+
"""Keywords or tags."""
|
37
|
+
categories: NotRequired[list[str]]
|
38
|
+
"""Categories or classifications."""
|
39
|
+
version: NotRequired[str]
|
40
|
+
"""Version identifier."""
|
41
|
+
language: NotRequired[str]
|
42
|
+
"""Document language code."""
|
43
|
+
references: NotRequired[list[str]]
|
44
|
+
"""Reference entries."""
|
45
|
+
citations: NotRequired[list[str]]
|
46
|
+
"""Citation identifiers."""
|
47
|
+
copyright: NotRequired[str]
|
48
|
+
"""Copyright information."""
|
49
|
+
license: NotRequired[str]
|
50
|
+
"""License information."""
|
51
|
+
identifier: NotRequired[str]
|
52
|
+
"""Document identifier."""
|
53
|
+
publisher: NotRequired[str]
|
54
|
+
"""Publisher name."""
|
55
|
+
contributors: NotRequired[list[str]]
|
56
|
+
"""Additional contributors."""
|
57
|
+
creator: NotRequired[str]
|
58
|
+
"""Document creator."""
|
59
|
+
institute: NotRequired[str | list[str]]
|
60
|
+
"""Institute or organization."""
|
61
|
+
|
62
|
+
|
63
|
+
class ExtractionResult(NamedTuple):
    """Result produced by a file extraction.

    A three-field tuple: the extracted text, the mime type of that text, and
    the metadata that belongs to it.
    """

    # The extracted content.
    content: str
    # The mime type of the content.
    mime_type: str
    # The metadata of the content.
    metadata: Metadata
|
kreuzberg/_xlsx.py
ADDED
@@ -0,0 +1,92 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
import csv
|
4
|
+
from io import StringIO
|
5
|
+
from typing import TYPE_CHECKING, cast
|
6
|
+
|
7
|
+
from anyio import Path as AsyncPath
|
8
|
+
from anyio import create_task_group
|
9
|
+
from python_calamine import CalamineWorkbook
|
10
|
+
|
11
|
+
from kreuzberg import ExtractionResult, ParsingError
|
12
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
13
|
+
from kreuzberg._pandoc import process_file_with_pandoc
|
14
|
+
from kreuzberg._string import normalize_spaces
|
15
|
+
from kreuzberg._sync import run_sync
|
16
|
+
from kreuzberg._tmp import create_temp_file
|
17
|
+
|
18
|
+
if TYPE_CHECKING: # pragma: no cover
|
19
|
+
from pathlib import Path
|
20
|
+
|
21
|
+
|
22
|
+
async def extract_xlsx_file(input_file: Path) -> ExtractionResult:
    """Extract text from an XLSX file by converting each sheet to CSV and then to markdown.

    Every sheet is dumped to a temporary CSV file, converted with pandoc, and
    emitted as a ``## <sheet name>`` section. Sheets are converted concurrently
    and the sections are joined in workbook order.

    Args:
        input_file: The path to the XLSX file.

    Returns:
        The extracted text content as markdown, with empty metadata.

    Raises:
        ParsingError: If the XLSX file could not be parsed.
    """
    try:
        workbook: CalamineWorkbook = await run_sync(CalamineWorkbook.from_path, str(input_file))
        sheet_names = list(workbook.sheet_names)

        # Pre-sized so each task can store its section at the sheet's position,
        # independent of the order in which the tasks finish.
        results = cast(list[str], [None] * len(sheet_names))

        async def convert_sheet_to_text(index: int, sheet_name: str) -> None:
            values = await run_sync(workbook.get_sheet_by_name(sheet_name).to_python)

            with StringIO() as csv_buffer:
                csv.writer(csv_buffer).writerows(values)
                csv_data = csv_buffer.getvalue()

            csv_path, unlink = await create_temp_file(".csv")
            try:
                await AsyncPath(csv_path).write_text(csv_data)
                result = await process_file_with_pandoc(csv_path, mime_type="text/csv")
            finally:
                # Remove the temporary CSV even when writing or pandoc fails.
                await unlink()

            results[index] = f"## {sheet_name}\n\n{normalize_spaces(result.content)}"

        async with create_task_group() as tg:
            # The index comes from enumerate rather than a name lookup, so the
            # slot is unambiguous and no per-sheet list scan is needed.
            for index, sheet_name in enumerate(sheet_names):
                tg.start_soon(convert_sheet_to_text, index, sheet_name)

        return ExtractionResult(
            content="\n\n".join(results),
            mime_type=MARKDOWN_MIME_TYPE,
            metadata={},
        )
    except Exception as e:
        raise ParsingError(
            "Could not extract text from XLSX",
            context={
                "error": str(e),
            },
        ) from e
|
76
|
+
|
77
|
+
|
78
|
+
async def extract_xlsx_content(content: bytes) -> ExtractionResult:
    """Extract text from the raw bytes of an XLSX file.

    The bytes are written to a temporary ``.xlsx`` file, which is then handed
    to ``extract_xlsx_file``.

    Args:
        content: The XLSX file content.

    Returns:
        The extracted text content.

    Raises:
        ParsingError: Propagated from ``extract_xlsx_file`` when the workbook
            could not be parsed.
    """
    xlsx_path, unlink = await create_temp_file(".xlsx")
    try:
        await AsyncPath(xlsx_path).write_bytes(content)
        return await extract_xlsx_file(xlsx_path)
    finally:
        # Remove the temporary file on failure as well as on success.
        await unlink()
|
kreuzberg/extraction.py
CHANGED
@@ -9,54 +9,62 @@ It includes vendored code:
|
|
9
9
|
|
10
10
|
from __future__ import annotations
|
11
11
|
|
12
|
-
from
|
12
|
+
from functools import partial
|
13
|
+
from io import BytesIO
|
13
14
|
from pathlib import Path
|
14
|
-
from
|
15
|
-
from typing import NamedTuple
|
15
|
+
from typing import TYPE_CHECKING, cast
|
16
16
|
|
17
|
+
import anyio
|
17
18
|
from anyio import Path as AsyncPath
|
19
|
+
from PIL.Image import open as open_image
|
18
20
|
|
19
|
-
from kreuzberg
|
20
|
-
|
21
|
-
|
22
|
-
extract_html_string,
|
23
|
-
extract_pdf,
|
24
|
-
extract_pptx_file,
|
25
|
-
extract_xlsx_file,
|
26
|
-
)
|
21
|
+
from kreuzberg import ExtractionResult
|
22
|
+
from kreuzberg._constants import DEFAULT_MAX_PROCESSES
|
23
|
+
from kreuzberg._html import extract_html_string
|
27
24
|
from kreuzberg._mime_types import (
|
28
25
|
EXCEL_MIME_TYPE,
|
29
26
|
HTML_MIME_TYPE,
|
30
|
-
IMAGE_MIME_TYPE_EXT_MAP,
|
31
27
|
IMAGE_MIME_TYPES,
|
32
|
-
MARKDOWN_MIME_TYPE,
|
33
28
|
PANDOC_SUPPORTED_MIME_TYPES,
|
34
29
|
PDF_MIME_TYPE,
|
35
|
-
PLAIN_TEXT_MIME_TYPE,
|
36
30
|
POWER_POINT_MIME_TYPE,
|
37
31
|
SUPPORTED_MIME_TYPES,
|
32
|
+
validate_mime_type,
|
33
|
+
)
|
34
|
+
from kreuzberg._pandoc import process_content_with_pandoc, process_file_with_pandoc
|
35
|
+
from kreuzberg._pdf import (
|
36
|
+
extract_pdf_content,
|
37
|
+
extract_pdf_file,
|
38
38
|
)
|
39
|
+
from kreuzberg._pptx import extract_pptx_file_content
|
39
40
|
from kreuzberg._string import safe_decode
|
40
|
-
from kreuzberg._tesseract import process_image_with_tesseract
|
41
|
+
from kreuzberg._tesseract import PSMMode, SupportedLanguage, process_image_with_tesseract
|
42
|
+
from kreuzberg._xlsx import extract_xlsx_content, extract_xlsx_file
|
41
43
|
from kreuzberg.exceptions import ValidationError
|
42
44
|
|
45
|
+
if TYPE_CHECKING:
|
46
|
+
from collections.abc import Sequence
|
47
|
+
from os import PathLike
|
43
48
|
|
44
|
-
class ExtractionResult(NamedTuple):
|
45
|
-
"""The result of a file extraction."""
|
46
|
-
|
47
|
-
content: str
|
48
|
-
"""The extracted content."""
|
49
|
-
mime_type: str
|
50
|
-
"""The mime type of the content."""
|
51
49
|
|
52
|
-
|
53
|
-
|
50
|
+
async def extract_bytes(
|
51
|
+
content: bytes,
|
52
|
+
mime_type: str,
|
53
|
+
*,
|
54
|
+
force_ocr: bool = False,
|
55
|
+
language: SupportedLanguage = "eng",
|
56
|
+
max_processes: int = DEFAULT_MAX_PROCESSES,
|
57
|
+
psm: PSMMode = PSMMode.AUTO,
|
58
|
+
) -> ExtractionResult:
|
54
59
|
"""Extract the textual content from a given byte string representing a file's contents.
|
55
60
|
|
56
61
|
Args:
|
57
62
|
content: The content to extract.
|
58
63
|
mime_type: The mime type of the content.
|
59
|
-
force_ocr: Whether
|
64
|
+
force_ocr: Whether to force OCR on PDF files that have a text layer.
|
65
|
+
language: The language code for OCR. Defaults to "eng".
|
66
|
+
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
67
|
+
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
60
68
|
|
61
69
|
Raises:
|
62
70
|
ValidationError: If the mime type is not supported.
|
@@ -71,50 +79,54 @@ async def extract_bytes(content: bytes, mime_type: str, force_ocr: bool = False)
|
|
71
79
|
)
|
72
80
|
|
73
81
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
74
|
-
return
|
82
|
+
return await extract_pdf_content(
|
83
|
+
content, force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
|
84
|
+
)
|
75
85
|
|
76
86
|
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
77
|
-
return
|
87
|
+
return await extract_xlsx_content(content)
|
78
88
|
|
79
89
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
80
|
-
|
81
|
-
|
82
|
-
|
83
|
-
return ExtractionResult(
|
84
|
-
content=await process_image_with_tesseract(temp_file.name), mime_type=PLAIN_TEXT_MIME_TYPE
|
85
|
-
)
|
86
|
-
finally:
|
87
|
-
temp_file.close()
|
88
|
-
await AsyncPath(temp_file.name).unlink()
|
90
|
+
return await process_image_with_tesseract(
|
91
|
+
open_image(BytesIO(content)), max_processes=max_processes, psm=psm, language=language
|
92
|
+
)
|
89
93
|
|
90
94
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
91
95
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
92
96
|
):
|
93
|
-
return
|
94
|
-
content=await extract_content_with_pandoc(content, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
95
|
-
)
|
97
|
+
return await process_content_with_pandoc(content=content, mime_type=mime_type, max_processes=max_processes)
|
96
98
|
|
97
99
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
98
|
-
return
|
100
|
+
return await extract_pptx_file_content(content)
|
99
101
|
|
100
102
|
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
101
|
-
return
|
103
|
+
return await extract_html_string(content)
|
102
104
|
|
103
105
|
return ExtractionResult(
|
104
106
|
content=safe_decode(content),
|
105
107
|
mime_type=mime_type,
|
108
|
+
metadata={},
|
106
109
|
)
|
107
110
|
|
108
111
|
|
109
112
|
async def extract_file(
|
110
|
-
file_path:
|
113
|
+
file_path: PathLike[str] | str,
|
114
|
+
mime_type: str | None = None,
|
115
|
+
*,
|
116
|
+
force_ocr: bool = False,
|
117
|
+
language: SupportedLanguage = "eng",
|
118
|
+
max_processes: int = DEFAULT_MAX_PROCESSES,
|
119
|
+
psm: PSMMode = PSMMode.AUTO,
|
111
120
|
) -> ExtractionResult:
|
112
121
|
"""Extract the textual content from a given file.
|
113
122
|
|
114
123
|
Args:
|
115
124
|
file_path: The path to the file.
|
116
|
-
mime_type: The mime type of the
|
117
|
-
force_ocr: Whether
|
125
|
+
mime_type: The mime type of the content.
|
126
|
+
force_ocr: Whether to force OCR on PDF files that have a text layer.
|
127
|
+
language: The language code for OCR. Defaults to "eng".
|
128
|
+
max_processes: Maximum number of concurrent processes. Defaults to CPU count / 2 (minimum 1).
|
129
|
+
psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.
|
118
130
|
|
119
131
|
Raises:
|
120
132
|
ValidationError: If the mime type is not supported.
|
@@ -122,40 +134,233 @@ async def extract_file(
|
|
122
134
|
Returns:
|
123
135
|
The extracted content and the mime type of the content.
|
124
136
|
"""
|
125
|
-
|
126
|
-
mime_type = mime_type or guess_type(file_path.name)[0]
|
127
|
-
if not mime_type: # pragma: no cover
|
128
|
-
raise ValidationError("Could not determine the mime type of the file.", context={"file_path": str(file_path)})
|
137
|
+
input_file = await AsyncPath(file_path).resolve()
|
129
138
|
|
130
|
-
|
131
|
-
raise ValidationError(
|
132
|
-
f"Unsupported mime type: {mime_type}",
|
133
|
-
context={"mime_type": mime_type, "supported_mimetypes": ",".join(sorted(SUPPORTED_MIME_TYPES))},
|
134
|
-
)
|
139
|
+
mime_type = validate_mime_type(input_file, mime_type)
|
135
140
|
|
136
|
-
if not await
|
137
|
-
raise ValidationError("The file does not exist.", context={"
|
141
|
+
if not await input_file.exists():
|
142
|
+
raise ValidationError("The file does not exist.", context={"input_file": str(input_file)})
|
138
143
|
|
139
144
|
if mime_type == PDF_MIME_TYPE or mime_type.startswith(PDF_MIME_TYPE):
|
140
|
-
return
|
145
|
+
return await extract_pdf_file(
|
146
|
+
Path(input_file), force_ocr=force_ocr, max_processes=max_processes, psm=psm, language=language
|
147
|
+
)
|
141
148
|
|
142
149
|
if mime_type == EXCEL_MIME_TYPE or mime_type.startswith(EXCEL_MIME_TYPE):
|
143
|
-
return
|
150
|
+
return await extract_xlsx_file(Path(input_file))
|
144
151
|
|
145
152
|
if mime_type in IMAGE_MIME_TYPES or any(mime_type.startswith(value) for value in IMAGE_MIME_TYPES):
|
146
|
-
return
|
153
|
+
return await process_image_with_tesseract(input_file, max_processes=max_processes, psm=psm, language=language)
|
147
154
|
|
148
155
|
if mime_type in PANDOC_SUPPORTED_MIME_TYPES or any(
|
149
156
|
mime_type.startswith(value) for value in PANDOC_SUPPORTED_MIME_TYPES
|
150
157
|
):
|
151
|
-
return
|
152
|
-
content=await extract_file_with_pandoc(file_path, mime_type), mime_type=MARKDOWN_MIME_TYPE
|
153
|
-
)
|
158
|
+
return await process_file_with_pandoc(input_file=input_file, mime_type=mime_type, max_processes=max_processes)
|
154
159
|
|
155
160
|
if mime_type == POWER_POINT_MIME_TYPE or mime_type.startswith(POWER_POINT_MIME_TYPE):
|
156
|
-
return
|
161
|
+
return await extract_pptx_file_content(Path(input_file))
|
157
162
|
|
158
163
|
if mime_type == HTML_MIME_TYPE or mime_type.startswith(HTML_MIME_TYPE):
|
159
|
-
return
|
164
|
+
return await extract_html_string(Path(input_file))
|
165
|
+
|
166
|
+
return ExtractionResult(content=safe_decode(await input_file.read_bytes()), mime_type=mime_type, metadata={})
|
167
|
+
|
168
|
+
|
169
|
+
async def batch_extract_file(
    file_paths: Sequence[PathLike[str] | str],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Run ``extract_file`` over several paths concurrently.

    One task is started per path inside a single task group; each task stores
    its result at the position of its path, so the output order matches the
    input order regardless of completion order.

    Args:
        file_paths: A sequence of paths to files to extract text from.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes, forwarded to
            every ``extract_file`` call. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input paths.
    """
    # The options are identical for every file, so bind them once.
    extract_one = partial(
        extract_file,
        force_ocr=force_ocr,
        max_processes=max_processes,
        psm=psm,
        language=language,
    )
    collected = cast(list[ExtractionResult], ([None] * len(file_paths)))

    async def _store(position: int, target: PathLike[str] | str) -> None:
        collected[position] = await extract_one(target)

    async with anyio.create_task_group() as group:
        for position, target in enumerate(file_paths):
            group.start_soon(_store, position, target)

    return collected
|
206
|
+
|
207
|
+
|
208
|
+
async def batch_extract_bytes(
    contents: Sequence[tuple[bytes, str]],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Run ``extract_bytes`` over several byte payloads concurrently.

    One task is started per ``(content, mime_type)`` pair inside a single task
    group; each task stores its result at the position of its pair, so the
    output order matches the input order regardless of completion order.

    Args:
        contents: A sequence of tuples containing (content, mime_type) pairs.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes, forwarded to
            every ``extract_bytes`` call. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input contents.
    """
    # The options are identical for every payload, so bind them once.
    extract_one = partial(
        extract_bytes,
        force_ocr=force_ocr,
        max_processes=max_processes,
        psm=psm,
        language=language,
    )
    collected = cast(list[ExtractionResult], [None] * len(contents))

    async def _store(payload: bytes, payload_mime_type: str, position: int) -> None:
        collected[position] = await extract_one(payload, payload_mime_type)

    async with anyio.create_task_group() as group:
        for position, (payload, payload_mime_type) in enumerate(contents):
            group.start_soon(_store, payload, payload_mime_type, position)

    return collected
|
246
|
+
|
247
|
+
|
248
|
+
### Sync proxies
|
160
249
|
|
161
|
-
|
250
|
+
|
251
|
+
def extract_bytes_sync(
    content: bytes,
    mime_type: str,
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Blocking wrapper around ``extract_bytes``.

    Drives the coroutine to completion with ``anyio.run`` and returns its result.

    Args:
        content: The content to extract.
        mime_type: The mime type of the content.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted content and the mime type of the content.
    """

    # anyio.run forwards positional arguments only, so the keyword options are
    # captured by a zero-argument coroutine function.
    async def _entry_point() -> ExtractionResult:
        return await extract_bytes(
            content,
            mime_type,
            max_processes=max_processes,
            force_ocr=force_ocr,
            language=language,
            psm=psm,
        )

    return anyio.run(_entry_point)
|
277
|
+
|
278
|
+
|
279
|
+
def extract_file_sync(
    file_path: PathLike[str] | str,
    mime_type: str | None = None,
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> ExtractionResult:
    """Synchronous version of extract_file.

    Drives ``extract_file`` to completion with ``anyio.run`` and returns its result.

    Args:
        file_path: The path to the file. Accepts the same path types as
            ``extract_file`` (any ``PathLike[str]`` or a plain string).
        mime_type: The mime type of the content. When ``None`` it is passed
            through to ``extract_file`` unchanged.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        The extracted content and the mime type of the content.
    """
    # anyio.run forwards positional arguments only, so the keyword options are
    # bound up front with partial.
    handler = partial(
        extract_file,
        file_path,
        mime_type,
        max_processes=max_processes,
        force_ocr=force_ocr,
        language=language,
        psm=psm,
    )
    return anyio.run(handler)
|
305
|
+
|
306
|
+
|
307
|
+
def batch_extract_file_sync(
    file_paths: Sequence[PathLike[str] | str],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Blocking wrapper around ``batch_extract_file``.

    Drives the coroutine to completion with ``anyio.run`` and returns its result.

    Args:
        file_paths: A sequence of paths to files to extract text from.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input paths.
    """

    # anyio.run forwards positional arguments only, so the keyword options are
    # captured by a zero-argument coroutine function.
    async def _entry_point() -> list[ExtractionResult]:
        return await batch_extract_file(
            file_paths,
            force_ocr=force_ocr,
            max_processes=max_processes,
            language=language,
            psm=psm,
        )

    return anyio.run(_entry_point)
|
336
|
+
|
337
|
+
|
338
|
+
def batch_extract_bytes_sync(
    contents: Sequence[tuple[bytes, str]],
    *,
    force_ocr: bool = False,
    language: SupportedLanguage = "eng",
    max_processes: int = DEFAULT_MAX_PROCESSES,
    psm: PSMMode = PSMMode.AUTO,
) -> list[ExtractionResult]:
    """Blocking wrapper around ``batch_extract_bytes``.

    Drives the coroutine to completion with ``anyio.run`` and returns its result.

    Args:
        contents: A sequence of tuples containing (content, mime_type) pairs.
        force_ocr: Whether to force OCR on PDF files that have a text layer.
        language: The language code for OCR. Defaults to "eng".
        max_processes: Maximum number of concurrent processes. Defaults to DEFAULT_MAX_PROCESSES.
        psm: Page segmentation mode for Tesseract OCR. Defaults to PSMMode.AUTO.

    Returns:
        A list of extraction results in the same order as the input contents.
    """

    # anyio.run forwards positional arguments only, so the keyword options are
    # captured by a zero-argument coroutine function.
    async def _entry_point() -> list[ExtractionResult]:
        return await batch_extract_bytes(
            contents,
            force_ocr=force_ocr,
            max_processes=max_processes,
            language=language,
            psm=psm,
        )

    return anyio.run(_entry_point)
|