kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +3 -0
- kreuzberg/__main__.py +8 -0
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_cli_config.py +175 -0
- kreuzberg/_extractors/_image.py +39 -4
- kreuzberg/_extractors/_pandoc.py +158 -18
- kreuzberg/_extractors/_pdf.py +199 -19
- kreuzberg/_extractors/_presentation.py +1 -1
- kreuzberg/_extractors/_spread_sheet.py +65 -7
- kreuzberg/_gmft.py +222 -16
- kreuzberg/_mime_types.py +62 -16
- kreuzberg/_multiprocessing/__init__.py +6 -0
- kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
- kreuzberg/_multiprocessing/process_manager.py +188 -0
- kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
- kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
- kreuzberg/_ocr/_easyocr.py +6 -12
- kreuzberg/_ocr/_paddleocr.py +15 -13
- kreuzberg/_ocr/_tesseract.py +136 -46
- kreuzberg/_playa.py +43 -0
- kreuzberg/_types.py +4 -0
- kreuzberg/_utils/_cache.py +372 -0
- kreuzberg/_utils/_device.py +10 -27
- kreuzberg/_utils/_document_cache.py +220 -0
- kreuzberg/_utils/_errors.py +232 -0
- kreuzberg/_utils/_pdf_lock.py +72 -0
- kreuzberg/_utils/_process_pool.py +100 -0
- kreuzberg/_utils/_serialization.py +82 -0
- kreuzberg/_utils/_string.py +1 -1
- kreuzberg/_utils/_sync.py +21 -0
- kreuzberg/cli.py +338 -0
- kreuzberg/extraction.py +247 -36
- kreuzberg-3.4.0.dist-info/METADATA +290 -0
- kreuzberg-3.4.0.dist-info/RECORD +50 -0
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
- kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
- kreuzberg-3.2.0.dist-info/METADATA +0 -166
- kreuzberg-3.2.0.dist-info/RECORD +0 -34
- kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
- {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -18,6 +18,8 @@ from .extraction import (
|
|
18
18
|
extract_file_sync,
|
19
19
|
)
|
20
20
|
|
21
|
+
__version__ = "3.2.0"
|
22
|
+
|
21
23
|
__all__ = [
|
22
24
|
"EasyOCRConfig",
|
23
25
|
"ExtractionConfig",
|
@@ -34,6 +36,7 @@ __all__ = [
|
|
34
36
|
"TableData",
|
35
37
|
"TesseractConfig",
|
36
38
|
"ValidationError",
|
39
|
+
"__version__",
|
37
40
|
"batch_extract_bytes",
|
38
41
|
"batch_extract_bytes_sync",
|
39
42
|
"batch_extract_file",
|
kreuzberg/__main__.py
ADDED
File without changes
|
kreuzberg/_api/main.py
ADDED
@@ -0,0 +1,87 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from json import dumps
|
4
|
+
from typing import TYPE_CHECKING, Annotated, Any
|
5
|
+
|
6
|
+
from kreuzberg import (
|
7
|
+
ExtractionResult,
|
8
|
+
KreuzbergError,
|
9
|
+
MissingDependencyError,
|
10
|
+
ParsingError,
|
11
|
+
ValidationError,
|
12
|
+
batch_extract_bytes,
|
13
|
+
)
|
14
|
+
|
15
|
+
if TYPE_CHECKING:
|
16
|
+
from litestar.datastructures import UploadFile
|
17
|
+
|
18
|
+
try:
|
19
|
+
from litestar import Litestar, Request, Response, get, post
|
20
|
+
from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
|
21
|
+
from litestar.enums import RequestEncodingType
|
22
|
+
from litestar.logging import StructLoggingConfig
|
23
|
+
from litestar.params import Body
|
24
|
+
from litestar.status_codes import (
|
25
|
+
HTTP_400_BAD_REQUEST,
|
26
|
+
HTTP_422_UNPROCESSABLE_ENTITY,
|
27
|
+
HTTP_500_INTERNAL_SERVER_ERROR,
|
28
|
+
)
|
29
|
+
except ImportError as e:
|
30
|
+
raise MissingDependencyError.create_for_package(
|
31
|
+
dependency_group="litestar",
|
32
|
+
functionality="Litestar API and docker container",
|
33
|
+
package_name="litestar",
|
34
|
+
) from e
|
35
|
+
|
36
|
+
|
37
|
+
def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
    """Map a KreuzbergError to an HTTP error response.

    Args:
        request: The request that triggered the error.
        exception: The raised Kreuzberg exception.

    Returns:
        A response carrying the error message and serialized context, with a
        status code reflecting the error category.
    """
    # Client input problems -> 400; payload parse failures -> 422;
    # every other Kreuzberg error is treated as a server fault -> 500.
    if isinstance(exception, ValidationError):
        status_code = HTTP_400_BAD_REQUEST
    elif isinstance(exception, ParsingError):
        status_code = HTTP_422_UNPROCESSABLE_ENTITY
    else:
        status_code = HTTP_500_INTERNAL_SERVER_ERROR

    message = str(exception)
    # NOTE(review): assumes exception.context is JSON-serializable — a
    # non-serializable context would make dumps() raise inside this handler.
    details = dumps(exception.context)

    # Structured log entry; skipped when the app has no logger configured.
    if request.app.logger:
        request.app.logger.error(
            "API error",
            method=request.method,
            url=str(request.url),
            status_code=status_code,
            message=message,
            context=exception.context,
        )

    return Response(
        content={"message": message, "details": details},
        status_code=status_code,
    )
|
62
|
+
|
63
|
+
|
64
|
+
@post("/extract", operation_id="ExtractFiles")
async def handle_files_upload(
    data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
) -> list[ExtractionResult]:
    """Extract text content from each uploaded file.

    Args:
        data: Multipart-encoded files to process.

    Returns:
        One ExtractionResult per uploaded file, in upload order.
    """
    # Each file is read fully into memory and paired with its declared
    # content type before being handed to the batch extractor.
    return await batch_extract_bytes(
        [(await file.read(), file.content_type) for file in data],
    )
|
72
|
+
|
73
|
+
|
74
|
+
@get("/health", operation_id="HealthCheck")
async def health_check() -> dict[str, str]:
    """Report service liveness for load balancers and orchestration probes."""
    return dict(status="ok")
|
78
|
+
|
79
|
+
|
80
|
+
# Application entry point: wires the upload and health-check routes, enables
# OpenTelemetry instrumentation and structured logging, and funnels every
# KreuzbergError subclass through the shared exception handler.
app = Litestar(
    route_handlers=[handle_files_upload, health_check],
    plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
    logging_config=StructLoggingConfig(),
    exception_handlers={
        KreuzbergError: exception_handler,
    },
)
|
kreuzberg/_cli_config.py
ADDED
@@ -0,0 +1,175 @@
|
|
1
|
+
"""Configuration parsing for the CLI."""
|
2
|
+
|
3
|
+
from __future__ import annotations
|
4
|
+
|
5
|
+
import sys
|
6
|
+
from pathlib import Path
|
7
|
+
from typing import TYPE_CHECKING, Any
|
8
|
+
|
9
|
+
if sys.version_info >= (3, 11):
|
10
|
+
import tomllib
|
11
|
+
else:
|
12
|
+
import tomli as tomllib # type: ignore[import-not-found]
|
13
|
+
|
14
|
+
from kreuzberg._gmft import GMFTConfig
|
15
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
16
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
17
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
18
|
+
from kreuzberg._types import ExtractionConfig, OcrBackendType
|
19
|
+
from kreuzberg.exceptions import ValidationError
|
20
|
+
|
21
|
+
if TYPE_CHECKING:
|
22
|
+
from collections.abc import MutableMapping
|
23
|
+
|
24
|
+
|
25
|
+
def load_config_from_file(config_path: Path) -> dict[str, Any]:
|
26
|
+
"""Load configuration from a TOML file.
|
27
|
+
|
28
|
+
Args:
|
29
|
+
config_path: Path to the configuration file.
|
30
|
+
|
31
|
+
Returns:
|
32
|
+
Dictionary containing the loaded configuration.
|
33
|
+
|
34
|
+
Raises:
|
35
|
+
ValidationError: If the file cannot be read or parsed.
|
36
|
+
"""
|
37
|
+
try:
|
38
|
+
with config_path.open("rb") as f:
|
39
|
+
data = tomllib.load(f)
|
40
|
+
except FileNotFoundError as e:
|
41
|
+
raise ValidationError(f"Configuration file not found: {config_path}") from e
|
42
|
+
except tomllib.TOMLDecodeError as e:
|
43
|
+
raise ValidationError(f"Invalid TOML in configuration file: {e}") from e
|
44
|
+
|
45
|
+
return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
|
46
|
+
|
47
|
+
|
48
|
+
def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
    """Recursively merge *override* into a copy of *base*.

    Nested dictionaries present in both inputs are merged key by key; any
    other overlapping value is replaced by the one from *override*.

    Args:
        base: Base configuration dictionary.
        override: Configuration dictionary to override base values.

    Returns:
        Merged configuration dictionary; neither input is mutated.
    """
    merged = dict(base)
    for key, incoming in override.items():
        current = merged.get(key)
        # Only dict-on-dict collisions recurse; everything else is a plain override.
        if isinstance(incoming, dict) and isinstance(current, dict):
            merged[key] = merge_configs(current, incoming)
        else:
            merged[key] = incoming
    return merged
|
65
|
+
|
66
|
+
|
67
|
+
def parse_ocr_backend_config(
    config_dict: dict[str, Any], backend: OcrBackendType
) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
    """Parse OCR backend-specific configuration.

    Args:
        config_dict: Configuration dictionary.
        backend: The OCR backend type.

    Returns:
        Backend-specific configuration object, or None when the section is
        absent, not a table, or the backend is unknown.
    """
    backend_config = config_dict.get(backend)
    if not isinstance(backend_config, dict):
        return None

    # Dispatch table replaces the if/elif chain; unknown backends yield None.
    config_classes = {
        "tesseract": TesseractConfig,
        "easyocr": EasyOCRConfig,
        "paddleocr": PaddleOCRConfig,
    }
    config_class = config_classes.get(backend)
    if config_class is None:
        return None
    return config_class(**backend_config)
|
93
|
+
|
94
|
+
|
95
|
+
def build_extraction_config(  # noqa: C901, PLR0912
    file_config: dict[str, Any],
    cli_args: MutableMapping[str, Any],
) -> ExtractionConfig:
    """Build ExtractionConfig from file config and CLI arguments.

    CLI arguments take precedence over values loaded from the file; the
    string "none" for ``ocr_backend`` is translated to ``None`` (OCR off).

    Args:
        file_config: Configuration loaded from file.
        cli_args: CLI arguments.

    Returns:
        ExtractionConfig instance.
    """
    config_dict: dict[str, Any] = {}

    # Seed scalar options from the file first...
    if file_config:
        for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
            if field in file_config:
                config_dict[field] = file_config[field]

    # ...then let any explicitly-passed CLI value override it (None means
    # the flag was not supplied on the command line).
    for field in ["force_ocr", "chunk_content", "extract_tables", "max_chars", "max_overlap", "ocr_backend"]:
        cli_key = field
        if cli_key in cli_args and cli_args[cli_key] is not None:
            config_dict[field] = cli_args[cli_key]

    ocr_backend = config_dict.get("ocr_backend")
    if ocr_backend and ocr_backend != "none":
        ocr_config = None

        # Backend-specific options: CLI-supplied "<backend>_config" wins
        # over the corresponding section in the config file.
        if cli_args.get(f"{ocr_backend}_config"):
            backend_args = cli_args[f"{ocr_backend}_config"]
            if ocr_backend == "tesseract":
                ocr_config = TesseractConfig(**backend_args)
            elif ocr_backend == "easyocr":
                ocr_config = EasyOCRConfig(**backend_args)  # type: ignore[assignment]
            elif ocr_backend == "paddleocr":
                ocr_config = PaddleOCRConfig(**backend_args)  # type: ignore[assignment]

        if not ocr_config and file_config:
            ocr_config = parse_ocr_backend_config(file_config, ocr_backend)  # type: ignore[assignment]

        if ocr_config:
            config_dict["ocr_config"] = ocr_config

    # GMFT table-extraction settings only matter when extract_tables is on.
    if config_dict.get("extract_tables"):
        gmft_config = None

        if cli_args.get("gmft_config"):
            gmft_config = GMFTConfig(**cli_args["gmft_config"])

        elif "gmft" in file_config and isinstance(file_config["gmft"], dict):
            gmft_config = GMFTConfig(**file_config["gmft"])

        if gmft_config:
            config_dict["gmft_config"] = gmft_config

    # "none" is the CLI sentinel for disabling OCR entirely.
    if config_dict.get("ocr_backend") == "none":
        config_dict["ocr_backend"] = None

    return ExtractionConfig(**config_dict)
|
155
|
+
|
156
|
+
|
157
|
+
def find_default_config() -> Path | None:
|
158
|
+
"""Find the default configuration file (pyproject.toml).
|
159
|
+
|
160
|
+
Returns:
|
161
|
+
Path to the configuration file or None if not found.
|
162
|
+
"""
|
163
|
+
current = Path.cwd()
|
164
|
+
while current != current.parent:
|
165
|
+
config_path = current / "pyproject.toml"
|
166
|
+
if config_path.exists():
|
167
|
+
try:
|
168
|
+
with config_path.open("rb") as f:
|
169
|
+
data = tomllib.load(f)
|
170
|
+
if "tool" in data and "kreuzberg" in data["tool"]:
|
171
|
+
return config_path
|
172
|
+
except Exception: # noqa: BLE001
|
173
|
+
pass
|
174
|
+
current = current.parent
|
175
|
+
return None
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -2,7 +2,6 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
from typing import TYPE_CHECKING, ClassVar
|
4
4
|
|
5
|
-
import anyio
|
6
5
|
from anyio import Path as AsyncPath
|
7
6
|
|
8
7
|
from kreuzberg._extractors._base import Extractor
|
@@ -13,10 +12,12 @@ from kreuzberg.exceptions import ValidationError
|
|
13
12
|
|
14
13
|
if TYPE_CHECKING: # pragma: no cover
|
15
14
|
from collections.abc import Mapping
|
16
|
-
from pathlib import Path
|
17
15
|
|
18
16
|
from kreuzberg._types import ExtractionResult
|
19
17
|
|
18
|
+
import contextlib
|
19
|
+
from pathlib import Path
|
20
|
+
|
20
21
|
|
21
22
|
class ImageExtractor(Extractor):
|
22
23
|
SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
|
@@ -58,10 +59,44 @@ class ImageExtractor(Extractor):
|
|
58
59
|
return await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
|
59
60
|
|
60
61
|
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        """Pure sync implementation of extract_bytes.

        Writes the bytes to a temporary file whose extension is derived from
        the extractor's mime type, then delegates to extract_path_sync.

        Args:
            content: Raw image bytes to OCR.

        Returns:
            ExtractionResult with the extracted text.
        """
        import os
        import tempfile

        extension = self._get_extension_from_mime_type(self.mime_type)
        fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")

        try:
            with os.fdopen(fd, "wb") as f:
                f.write(content)

            return self.extract_path_sync(Path(temp_path))
        finally:
            # Best-effort cleanup; a vanished temp file is not an error.
            # NOTE(review): contextlib and Path appear to be imported next to
            # the TYPE_CHECKING block above — confirm both are bound at runtime.
            with contextlib.suppress(OSError):
                Path(temp_path).unlink()
|
62
77
|
|
63
78
|
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Pure sync implementation of extract_path.

        Args:
            path: Path to the image file to OCR.

        Returns:
            ExtractionResult with the extracted text; an empty text/plain
            result when the batch OCR call yields nothing.

        Raises:
            ValidationError: If no OCR backend is configured.
            NotImplementedError: If the configured backend has no sync path.
        """
        if self.config.ocr_backend is None:
            raise ValidationError("ocr_backend is None, cannot perform OCR")

        from kreuzberg._ocr._tesseract import TesseractConfig
        from kreuzberg._types import ExtractionResult

        if self.config.ocr_backend == "tesseract":
            from kreuzberg._multiprocessing.sync_tesseract import process_batch_images_sync_pure

            # Fall back to default Tesseract settings when the stored config
            # is missing or belongs to a different backend.
            if isinstance(self.config.ocr_config, TesseractConfig):
                config = self.config.ocr_config
            else:
                config = TesseractConfig()

            results = process_batch_images_sync_pure([str(path)], config)
            if results:
                return results[0]
            return ExtractionResult(content="", mime_type="text/plain", metadata={}, chunks=[])

        # Only tesseract has a pure-sync implementation so far.
        raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
65
100
|
|
66
101
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
67
102
|
if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -1,11 +1,12 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import contextlib
|
3
4
|
import re
|
4
5
|
import sys
|
5
6
|
from json import JSONDecodeError, loads
|
7
|
+
from pathlib import Path
|
6
8
|
from typing import TYPE_CHECKING, Any, ClassVar, Final, Literal, cast
|
7
9
|
|
8
|
-
import anyio
|
9
10
|
from anyio import Path as AsyncPath
|
10
11
|
from anyio import run_process
|
11
12
|
|
@@ -21,7 +22,7 @@ from kreuzberg.exceptions import MissingDependencyError, ParsingError, Validatio
|
|
21
22
|
if TYPE_CHECKING: # pragma: no cover
|
22
23
|
from collections.abc import Mapping
|
23
24
|
from os import PathLike
|
24
|
-
|
25
|
+
|
25
26
|
|
26
27
|
if sys.version_info < (3, 11): # pragma: no cover
|
27
28
|
from exceptiongroup import ExceptionGroup # type: ignore[import-not-found]
|
@@ -194,7 +195,7 @@ class PandocExtractor(Extractor):
|
|
194
195
|
raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
|
195
196
|
|
196
197
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
197
|
-
"""
|
198
|
+
"""Pure sync implementation of extract_bytes.
|
198
199
|
|
199
200
|
Args:
|
200
201
|
content: The content bytes to process.
|
@@ -202,18 +203,46 @@ class PandocExtractor(Extractor):
|
|
202
203
|
Returns:
|
203
204
|
ExtractionResult with the extracted text and metadata.
|
204
205
|
"""
|
205
|
-
|
206
|
+
import os
|
207
|
+
import tempfile
|
208
|
+
from pathlib import Path
|
209
|
+
|
210
|
+
extension = self._get_pandoc_type_from_mime_type(self.mime_type)
|
211
|
+
fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
|
212
|
+
|
213
|
+
try:
|
214
|
+
with os.fdopen(fd, "wb") as f:
|
215
|
+
f.write(content)
|
216
|
+
|
217
|
+
return self.extract_path_sync(Path(temp_path))
|
218
|
+
finally:
|
219
|
+
with contextlib.suppress(OSError):
|
220
|
+
Path(temp_path).unlink()
|
206
221
|
|
207
222
|
    def extract_path_sync(self, path: Path) -> ExtractionResult:
        """Pure sync implementation of extract_path.

        Args:
            path: The path to the file to process.

        Returns:
            ExtractionResult with the extracted text and metadata.

        Raises:
            ParsingError: When file processing fails.
        """
        self._validate_pandoc_version_sync()
        # Called for its mime-type validation side effect; the result is unused.
        self._get_pandoc_type_from_mime_type(self.mime_type)

        try:
            metadata = self._extract_metadata_sync(path)
            content = self._extract_file_sync(path)

            return ExtractionResult(
                content=normalize_spaces(content), metadata=metadata, mime_type=MARKDOWN_MIME_TYPE, chunks=[]
            )
        except Exception as e:
            # Any failure (pandoc error, I/O, JSON) is surfaced uniformly.
            raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
|
217
246
|
|
218
247
|
async def _validate_pandoc_version(self) -> None:
|
219
248
|
"""Validate that the installed Pandoc version meets the minimum requirement.
|
@@ -229,36 +258,26 @@ class PandocExtractor(Extractor):
|
|
229
258
|
result = await run_process(command)
|
230
259
|
stdout = result.stdout.decode()
|
231
260
|
|
232
|
-
# Try more inclusive patterns to detect the pandoc version
|
233
|
-
# Try common formats first
|
234
261
|
version_match = re.search(
|
235
262
|
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
236
263
|
)
|
237
264
|
|
238
|
-
# Try version in parentheses format
|
239
265
|
if not version_match:
|
240
266
|
version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
|
241
267
|
|
242
|
-
# Try hyphenated format
|
243
268
|
if not version_match:
|
244
269
|
version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
|
245
270
|
|
246
|
-
# If still no match, check for version at the beginning of the output or any line
|
247
271
|
if not version_match:
|
248
|
-
# Match version at the start of a line (like in the test case "2.9.2.1\npandoc-types 1.20")
|
249
272
|
version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
|
250
273
|
|
251
|
-
# Try finding version-like patterns elsewhere in the text
|
252
274
|
if not version_match:
|
253
|
-
# Search for version-like patterns at the beginning of lines or after spaces
|
254
275
|
version_match = re.search(r"(?:^|\s)(\d+)\.(\d+)(?:\.(\d+))?(?:\s|$)", stdout)
|
255
276
|
|
256
|
-
# As a last resort, check any sequence of digits that might be a version
|
257
277
|
if not version_match:
|
258
278
|
out_lines = stdout.splitlines()
|
259
279
|
for line in out_lines:
|
260
280
|
for token in line.split():
|
261
|
-
# Match standalone version patterns like 2.11 or 2.11.4
|
262
281
|
version_pattern = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?$", token)
|
263
282
|
if version_pattern:
|
264
283
|
version_match = version_pattern
|
@@ -266,12 +285,10 @@ class PandocExtractor(Extractor):
|
|
266
285
|
if version_match:
|
267
286
|
break
|
268
287
|
|
269
|
-
# If we found a version, check that the major version is at least the minimum required
|
270
288
|
if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
|
271
289
|
self._checked_version = True
|
272
290
|
return
|
273
291
|
|
274
|
-
# If we get here, we either didn't find a version or it's too low
|
275
292
|
raise MissingDependencyError(
|
276
293
|
"Pandoc version 2 or above is a required system dependency. Please install it on your system and make sure its available in $PATH."
|
277
294
|
)
|
@@ -560,6 +577,129 @@ class PandocExtractor(Extractor):
|
|
560
577
|
|
561
578
|
return None
|
562
579
|
|
580
|
+
def _validate_pandoc_version_sync(self) -> None:
|
581
|
+
"""Synchronous version of _validate_pandoc_version."""
|
582
|
+
import subprocess
|
583
|
+
|
584
|
+
try:
|
585
|
+
if self._checked_version:
|
586
|
+
return
|
587
|
+
|
588
|
+
result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False) # noqa: S607
|
589
|
+
|
590
|
+
if result.returncode != 0:
|
591
|
+
raise MissingDependencyError(
|
592
|
+
"Pandoc version 2 or above is a required system dependency. "
|
593
|
+
"Please install it on your system and make sure its available in $PATH."
|
594
|
+
)
|
595
|
+
|
596
|
+
stdout = result.stdout
|
597
|
+
|
598
|
+
version_match = re.search(
|
599
|
+
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
600
|
+
)
|
601
|
+
|
602
|
+
if not version_match:
|
603
|
+
version_match = re.search(r"pandoc\s+\(version\s+(\d+)\.(\d+)(?:\.(\d+))?\)", stdout, re.IGNORECASE)
|
604
|
+
|
605
|
+
if not version_match:
|
606
|
+
version_match = re.search(r"pandoc-(\d+)\.(\d+)(?:\.(\d+))?", stdout)
|
607
|
+
|
608
|
+
if not version_match:
|
609
|
+
version_match = re.search(r"^(\d+)\.(\d+)(?:\.(\d+)(?:\.(\d+))?)?", stdout, re.MULTILINE)
|
610
|
+
|
611
|
+
if version_match and int(version_match.group(1)) >= MINIMAL_SUPPORTED_PANDOC_VERSION:
|
612
|
+
self._checked_version = True
|
613
|
+
return
|
614
|
+
|
615
|
+
raise MissingDependencyError(
|
616
|
+
"Pandoc version 2 or above is a required system dependency. "
|
617
|
+
"Please install it on your system and make sure its available in $PATH."
|
618
|
+
)
|
619
|
+
|
620
|
+
except (subprocess.SubprocessError, FileNotFoundError) as e:
|
621
|
+
raise MissingDependencyError(
|
622
|
+
"Pandoc version 2 or above is a required system dependency. "
|
623
|
+
"Please install it on your system and make sure its available in $PATH."
|
624
|
+
) from e
|
625
|
+
|
626
|
+
    def _extract_metadata_sync(self, path: Path) -> Metadata:
        """Synchronous version of _handle_extract_metadata.

        Converts the document to pandoc JSON in a temporary file, then parses
        metadata out of that JSON AST.

        Args:
            path: Source document to inspect.

        Returns:
            Metadata extracted from the pandoc AST.

        Raises:
            ParsingError: If pandoc fails or the JSON output cannot be read.
        """
        import os
        import subprocess
        import tempfile

        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
        fd, metadata_file = tempfile.mkstemp(suffix=".json")
        # Close our handle immediately; pandoc writes to the path itself.
        os.close(fd)

        try:
            command = [
                "pandoc",
                str(path),
                f"--from={pandoc_type}",
                "--to=json",
                "--standalone",
                "--quiet",
                "--output",
                str(metadata_file),
            ]

            result = subprocess.run(command, capture_output=True, text=True, check=False)

            if result.returncode != 0:
                raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})

            with Path(metadata_file).open(encoding="utf-8") as f:
                json_data = loads(f.read())

            return self._extract_metadata(json_data)

        except (OSError, JSONDecodeError) as e:
            raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
        finally:
            # Always remove the temp file; ignore races where it is already gone.
            with contextlib.suppress(OSError):
                Path(metadata_file).unlink()
|
663
|
+
|
664
|
+
    def _extract_file_sync(self, path: Path) -> str:
        """Synchronous version of _handle_extract_file.

        Converts the document to markdown via the pandoc CLI, writing the
        result to a temporary output file.

        Args:
            path: Source document to convert.

        Returns:
            The converted markdown with whitespace normalized.

        Raises:
            ParsingError: If pandoc fails or the output cannot be read.
        """
        import os
        import subprocess
        import tempfile

        pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
        fd, output_path = tempfile.mkstemp(suffix=".md")
        # Close our handle immediately; pandoc writes to the path itself.
        os.close(fd)

        try:
            command = [
                "pandoc",
                str(path),
                f"--from={pandoc_type}",
                "--to=markdown",
                "--standalone",
                "--wrap=preserve",
                "--quiet",
                "--output",
                str(output_path),
            ]

            result = subprocess.run(command, capture_output=True, text=True, check=False)

            if result.returncode != 0:
                raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})

            with Path(output_path).open(encoding="utf-8") as f:
                text = f.read()

            return normalize_spaces(text)

        except OSError as e:
            raise ParsingError("Failed to extract file data", context={"file": str(path)}) from e
        finally:
            # Always remove the temp file; ignore races where it is already gone.
            with contextlib.suppress(OSError):
                Path(output_path).unlink()
|
702
|
+
|
563
703
|
|
564
704
|
class MarkdownExtractor(PandocExtractor):
|
565
705
|
"""Extractor for Markdown-based document formats."""
|