kreuzberg 3.11.4__py3-none-any.whl → 3.13.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_config.py +248 -204
- kreuzberg/_document_classification.py +0 -8
- kreuzberg/_entity_extraction.py +1 -93
- kreuzberg/_extractors/_base.py +0 -5
- kreuzberg/_extractors/_email.py +1 -11
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -23
- kreuzberg/_extractors/_pandoc.py +10 -89
- kreuzberg/_extractors/_pdf.py +39 -92
- kreuzberg/_extractors/_presentation.py +0 -17
- kreuzberg/_extractors/_spread_sheet.py +13 -53
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -138
- kreuzberg/_language_detection.py +1 -22
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -2
- kreuzberg/_ocr/_easyocr.py +21 -108
- kreuzberg/_ocr/_paddleocr.py +16 -94
- kreuzberg/_ocr/_table_extractor.py +260 -0
- kreuzberg/_ocr/_tesseract.py +906 -264
- kreuzberg/_playa.py +5 -4
- kreuzberg/_types.py +638 -40
- kreuzberg/_utils/_cache.py +88 -90
- kreuzberg/_utils/_device.py +0 -18
- kreuzberg/_utils/_document_cache.py +0 -2
- kreuzberg/_utils/_errors.py +0 -3
- kreuzberg/_utils/_pdf_lock.py +0 -2
- kreuzberg/_utils/_process_pool.py +19 -19
- kreuzberg/_utils/_quality.py +0 -43
- kreuzberg/_utils/_ref.py +48 -0
- kreuzberg/_utils/_serialization.py +0 -5
- kreuzberg/_utils/_string.py +9 -39
- kreuzberg/_utils/_sync.py +0 -1
- kreuzberg/_utils/_table.py +50 -57
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
- kreuzberg-3.13.0.dist-info/RECORD +56 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,105 +2,14 @@ from __future__ import annotations
|
|
2
2
|
|
3
3
|
import os
|
4
4
|
import re
|
5
|
-
from dataclasses import dataclass
|
6
5
|
from functools import lru_cache
|
7
6
|
from typing import TYPE_CHECKING, Any
|
8
7
|
|
9
|
-
from kreuzberg._types import Entity
|
8
|
+
from kreuzberg._types import Entity, SpacyEntityExtractionConfig
|
10
9
|
from kreuzberg.exceptions import MissingDependencyError
|
11
10
|
|
12
11
|
if TYPE_CHECKING:
|
13
12
|
from collections.abc import Sequence
|
14
|
-
from pathlib import Path
|
15
|
-
|
16
|
-
|
17
|
-
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
18
|
-
class SpacyEntityExtractionConfig:
|
19
|
-
"""Configuration for spaCy-based entity extraction."""
|
20
|
-
|
21
|
-
model_cache_dir: str | Path | None = None
|
22
|
-
"""Directory to cache spaCy models. If None, uses spaCy's default."""
|
23
|
-
|
24
|
-
language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
|
25
|
-
"""Mapping of language codes to spaCy model names.
|
26
|
-
|
27
|
-
If None, uses default mappings:
|
28
|
-
- en: en_core_web_sm
|
29
|
-
- de: de_core_news_sm
|
30
|
-
- fr: fr_core_news_sm
|
31
|
-
- es: es_core_news_sm
|
32
|
-
- pt: pt_core_news_sm
|
33
|
-
- it: it_core_news_sm
|
34
|
-
- nl: nl_core_news_sm
|
35
|
-
- zh: zh_core_web_sm
|
36
|
-
- ja: ja_core_news_sm
|
37
|
-
"""
|
38
|
-
|
39
|
-
fallback_to_multilingual: bool = True
|
40
|
-
"""If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
|
41
|
-
|
42
|
-
max_doc_length: int = 1000000
|
43
|
-
"""Maximum document length for spaCy processing."""
|
44
|
-
|
45
|
-
batch_size: int = 1000
|
46
|
-
"""Batch size for processing multiple texts."""
|
47
|
-
|
48
|
-
def __post_init__(self) -> None:
|
49
|
-
if self.language_models is None:
|
50
|
-
object.__setattr__(self, "language_models", self._get_default_language_models())
|
51
|
-
|
52
|
-
if isinstance(self.language_models, dict):
|
53
|
-
object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
|
54
|
-
|
55
|
-
@staticmethod
|
56
|
-
def _get_default_language_models() -> dict[str, str]:
|
57
|
-
"""Get default language model mappings based on available spaCy models."""
|
58
|
-
return {
|
59
|
-
"en": "en_core_web_sm",
|
60
|
-
"de": "de_core_news_sm",
|
61
|
-
"fr": "fr_core_news_sm",
|
62
|
-
"es": "es_core_news_sm",
|
63
|
-
"pt": "pt_core_news_sm",
|
64
|
-
"it": "it_core_news_sm",
|
65
|
-
"nl": "nl_core_news_sm",
|
66
|
-
"zh": "zh_core_web_sm",
|
67
|
-
"ja": "ja_core_news_sm",
|
68
|
-
"ko": "ko_core_news_sm",
|
69
|
-
"ru": "ru_core_news_sm",
|
70
|
-
"pl": "pl_core_news_sm",
|
71
|
-
"ro": "ro_core_news_sm",
|
72
|
-
"el": "el_core_news_sm",
|
73
|
-
"da": "da_core_news_sm",
|
74
|
-
"fi": "fi_core_news_sm",
|
75
|
-
"nb": "nb_core_news_sm",
|
76
|
-
"sv": "sv_core_news_sm",
|
77
|
-
"ca": "ca_core_news_sm",
|
78
|
-
"hr": "hr_core_news_sm",
|
79
|
-
"lt": "lt_core_news_sm",
|
80
|
-
"mk": "mk_core_news_sm",
|
81
|
-
"sl": "sl_core_news_sm",
|
82
|
-
"uk": "uk_core_news_sm",
|
83
|
-
}
|
84
|
-
|
85
|
-
def get_model_for_language(self, language_code: str) -> str | None:
|
86
|
-
"""Get the appropriate spaCy model for a language code."""
|
87
|
-
if not self.language_models:
|
88
|
-
return None
|
89
|
-
|
90
|
-
models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
|
91
|
-
|
92
|
-
if language_code in models_dict:
|
93
|
-
return models_dict[language_code]
|
94
|
-
|
95
|
-
base_lang = language_code.split("-")[0].lower()
|
96
|
-
if base_lang in models_dict:
|
97
|
-
return models_dict[base_lang]
|
98
|
-
|
99
|
-
return None
|
100
|
-
|
101
|
-
def get_fallback_model(self) -> str | None:
|
102
|
-
"""Get fallback multilingual model if enabled."""
|
103
|
-
return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
|
104
13
|
|
105
14
|
|
106
15
|
def extract_entities(
|
@@ -127,7 +36,6 @@ def extract_entities(
|
|
127
36
|
"""
|
128
37
|
entities: list[Entity] = []
|
129
38
|
if custom_patterns:
|
130
|
-
# Direct iteration over frozenset - no need to convert to dict
|
131
39
|
for ent_type, pattern in custom_patterns:
|
132
40
|
entities.extend(
|
133
41
|
Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
|
kreuzberg/_extractors/_base.py
CHANGED
@@ -102,23 +102,18 @@ class Extractor(ABC):
|
|
102
102
|
Returns:
|
103
103
|
Enhanced extraction result with quality improvements (if enabled)
|
104
104
|
"""
|
105
|
-
# Only apply quality processing if enabled in config
|
106
105
|
if not self.config.enable_quality_processing:
|
107
106
|
return result
|
108
107
|
|
109
108
|
if not result.content:
|
110
109
|
return result
|
111
110
|
|
112
|
-
# Clean the content
|
113
111
|
cleaned_content = clean_extracted_text(result.content)
|
114
112
|
|
115
|
-
# Calculate quality score
|
116
113
|
quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
|
117
114
|
|
118
|
-
# Add quality metadata
|
119
115
|
enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
|
120
116
|
|
121
|
-
# Return enhanced result
|
122
117
|
return ExtractionResult(
|
123
118
|
content=cleaned_content,
|
124
119
|
mime_type=result.mime_type,
|
kreuzberg/_extractors/_email.py
CHANGED
@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
|
|
16
16
|
if TYPE_CHECKING:
|
17
17
|
from pathlib import Path
|
18
18
|
|
19
|
-
# Import optional dependencies at module level with proper error handling
|
20
19
|
try:
|
21
20
|
import mailparse
|
22
21
|
except ImportError: # pragma: no cover
|
@@ -27,7 +26,6 @@ try:
|
|
27
26
|
except ImportError: # pragma: no cover
|
28
27
|
html2text = None
|
29
28
|
|
30
|
-
# Compile regex pattern once at module level
|
31
29
|
_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
|
32
30
|
|
33
31
|
|
@@ -45,7 +43,6 @@ class EmailExtractor(Extractor):
|
|
45
43
|
self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
|
46
44
|
) -> None:
|
47
45
|
"""Extract and process email headers."""
|
48
|
-
# Use single dict access where possible to avoid repeated lookups
|
49
46
|
subject = parsed_email.get("subject")
|
50
47
|
if subject:
|
51
48
|
metadata["subject"] = subject
|
@@ -59,9 +56,7 @@ class EmailExtractor(Extractor):
|
|
59
56
|
|
60
57
|
to_info = parsed_email.get("to")
|
61
58
|
if to_info:
|
62
|
-
# Store the raw value in metadata (could be string, dict, or list)
|
63
59
|
if isinstance(to_info, list) and to_info:
|
64
|
-
# For metadata, use first recipient's email if it's a list
|
65
60
|
to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
|
66
61
|
metadata["email_to"] = to_email
|
67
62
|
elif isinstance(to_info, dict):
|
@@ -69,7 +64,6 @@ class EmailExtractor(Extractor):
|
|
69
64
|
else:
|
70
65
|
metadata["email_to"] = str(to_info)
|
71
66
|
|
72
|
-
# For display, format all recipients
|
73
67
|
to_formatted = self._format_email_field(to_info)
|
74
68
|
text_parts.append(f"To: {to_formatted}")
|
75
69
|
|
@@ -111,19 +105,17 @@ class EmailExtractor(Extractor):
|
|
111
105
|
text_content = parsed_email.get("text")
|
112
106
|
if text_content:
|
113
107
|
text_parts.append(f"\n{text_content}")
|
114
|
-
return
|
108
|
+
return
|
115
109
|
|
116
110
|
html_content = parsed_email.get("html")
|
117
111
|
if html_content:
|
118
112
|
if html2text is not None:
|
119
|
-
# Use html2text if available (faster path)
|
120
113
|
h = html2text.HTML2Text()
|
121
114
|
h.ignore_links = True
|
122
115
|
h.ignore_images = True
|
123
116
|
converted_text = h.handle(html_content)
|
124
117
|
text_parts.append(f"\n{converted_text}")
|
125
118
|
else:
|
126
|
-
# Fallback: strip HTML tags and unescape entities
|
127
119
|
clean_html = _HTML_TAG_PATTERN.sub("", html_content)
|
128
120
|
clean_html = unescape(clean_html)
|
129
121
|
text_parts.append(f"\n{clean_html}")
|
@@ -148,12 +140,10 @@ class EmailExtractor(Extractor):
|
|
148
140
|
text_parts: list[str] = []
|
149
141
|
metadata: dict[str, Any] = {}
|
150
142
|
|
151
|
-
# Extract headers, body, and attachments
|
152
143
|
self._extract_email_headers(parsed_email, text_parts, metadata)
|
153
144
|
self._extract_email_body(parsed_email, text_parts)
|
154
145
|
self._extract_email_attachments(parsed_email, text_parts, metadata)
|
155
146
|
|
156
|
-
# Join efficiently
|
157
147
|
combined_text = "\n".join(text_parts)
|
158
148
|
|
159
149
|
return ExtractionResult(
|
kreuzberg/_extractors/_html.py
CHANGED
@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
|
|
7
7
|
|
8
8
|
from kreuzberg._extractors._base import Extractor
|
9
9
|
from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
|
10
|
-
from kreuzberg._types import ExtractionResult
|
10
|
+
from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
|
11
11
|
from kreuzberg._utils._string import safe_decode
|
12
12
|
from kreuzberg._utils._sync import run_sync
|
13
13
|
|
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
|
|
26
26
|
return await run_sync(self.extract_bytes_sync, content)
|
27
27
|
|
28
28
|
def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
36
|
-
|
37
|
-
|
38
|
-
# Skip normalize_spaces since quality processing will handle whitespace
|
29
|
+
config = self.config.html_to_markdown_config if self.config else None
|
30
|
+
if config is None:
|
31
|
+
config = HTMLToMarkdownConfig()
|
32
|
+
|
33
|
+
config_dict = config.to_dict()
|
34
|
+
|
35
|
+
result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
|
36
|
+
|
39
37
|
extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
|
40
38
|
|
41
|
-
# Apply quality processing which includes normalization
|
42
39
|
return self._apply_quality_processing(extraction_result)
|
43
40
|
|
44
41
|
def extract_path_sync(self, path: Path) -> ExtractionResult:
|
kreuzberg/_extractors/_image.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
|
|
3
3
|
import contextlib
|
4
4
|
import os
|
5
5
|
import tempfile
|
6
|
-
from dataclasses import asdict
|
7
6
|
from pathlib import Path
|
8
7
|
from typing import TYPE_CHECKING, ClassVar
|
9
8
|
|
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
|
|
12
11
|
from kreuzberg._extractors._base import Extractor
|
13
12
|
from kreuzberg._mime_types import IMAGE_MIME_TYPES
|
14
13
|
from kreuzberg._ocr import get_ocr_backend
|
15
|
-
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
16
|
-
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
17
|
-
from kreuzberg._ocr._tesseract import TesseractConfig
|
18
14
|
from kreuzberg._utils._tmp import create_temp_file
|
19
15
|
from kreuzberg.exceptions import ValidationError
|
20
16
|
|
@@ -84,25 +80,7 @@ class ImageExtractor(Extractor):
|
|
84
80
|
raise ValidationError("ocr_backend is None, cannot perform OCR")
|
85
81
|
|
86
82
|
backend = get_ocr_backend(self.config.ocr_backend)
|
87
|
-
|
88
|
-
match self.config.ocr_backend:
|
89
|
-
case "tesseract":
|
90
|
-
config = (
|
91
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
|
92
|
-
)
|
93
|
-
result = backend.process_file_sync(path, **asdict(config))
|
94
|
-
case "paddleocr":
|
95
|
-
paddle_config = (
|
96
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
|
97
|
-
)
|
98
|
-
result = backend.process_file_sync(path, **asdict(paddle_config))
|
99
|
-
case "easyocr":
|
100
|
-
easy_config = (
|
101
|
-
self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
|
102
|
-
)
|
103
|
-
result = backend.process_file_sync(path, **asdict(easy_config))
|
104
|
-
case _:
|
105
|
-
raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
|
83
|
+
result = backend.process_file_sync(path, **self.config.get_config_dict())
|
106
84
|
return self._apply_quality_processing(result)
|
107
85
|
|
108
86
|
def _get_extension_from_mime_type(self, mime_type: str) -> str:
|
kreuzberg/_extractors/_pandoc.py
CHANGED
@@ -244,18 +244,13 @@ class PandocExtractor(Extractor):
|
|
244
244
|
raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
|
245
245
|
|
246
246
|
async def _validate_pandoc_version(self) -> None:
|
247
|
-
"""Validate that the installed Pandoc version meets the minimum requirement.
|
248
|
-
|
249
|
-
Raises:
|
250
|
-
MissingDependencyError: If Pandoc is not installed or version is too low
|
251
|
-
"""
|
252
247
|
try:
|
253
248
|
if self._checked_version:
|
254
249
|
return
|
255
250
|
|
256
251
|
command = ["pandoc", "--version"]
|
257
252
|
result = await run_process(command)
|
258
|
-
stdout = result.stdout.decode()
|
253
|
+
stdout = result.stdout.decode("utf-8")
|
259
254
|
|
260
255
|
version_match = re.search(
|
261
256
|
r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
|
@@ -299,14 +294,6 @@ class PandocExtractor(Extractor):
|
|
299
294
|
|
300
295
|
@staticmethod
|
301
296
|
def _get_pandoc_key(key: str) -> str | None:
|
302
|
-
"""Map Pandoc metadata keys to our standard metadata keys.
|
303
|
-
|
304
|
-
Args:
|
305
|
-
key: The key from Pandoc metadata
|
306
|
-
|
307
|
-
Returns:
|
308
|
-
The mapped key name for our system, or None if not mapped
|
309
|
-
"""
|
310
297
|
if key == "abstract":
|
311
298
|
return "summary"
|
312
299
|
|
@@ -325,17 +312,6 @@ class PandocExtractor(Extractor):
|
|
325
312
|
return key
|
326
313
|
|
327
314
|
def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
|
328
|
-
"""Get Pandoc format type from MIME type.
|
329
|
-
|
330
|
-
Args:
|
331
|
-
mime_type: The MIME type to look up
|
332
|
-
|
333
|
-
Returns:
|
334
|
-
The corresponding Pandoc type
|
335
|
-
|
336
|
-
Raises:
|
337
|
-
ValidationError: If mime_type is not supported
|
338
|
-
"""
|
339
315
|
if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
|
340
316
|
return pandoc_type
|
341
317
|
|
@@ -349,17 +325,6 @@ class PandocExtractor(Extractor):
|
|
349
325
|
raise ValidationError(f"Unsupported mime type: {mime_type}")
|
350
326
|
|
351
327
|
async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
|
352
|
-
"""Extract metadata from a file using Pandoc.
|
353
|
-
|
354
|
-
Args:
|
355
|
-
input_file: The file to extract metadata from
|
356
|
-
|
357
|
-
Returns:
|
358
|
-
The extracted metadata
|
359
|
-
|
360
|
-
Raises:
|
361
|
-
ParsingError: If metadata extraction fails
|
362
|
-
"""
|
363
328
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
364
329
|
metadata_file, unlink = await create_temp_file(".json")
|
365
330
|
try:
|
@@ -389,17 +354,6 @@ class PandocExtractor(Extractor):
|
|
389
354
|
await unlink()
|
390
355
|
|
391
356
|
async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
|
392
|
-
"""Extract text content from a file using Pandoc.
|
393
|
-
|
394
|
-
Args:
|
395
|
-
input_file: The file to extract content from
|
396
|
-
|
397
|
-
Returns:
|
398
|
-
The extracted text content
|
399
|
-
|
400
|
-
Raises:
|
401
|
-
ParsingError: If content extraction fails
|
402
|
-
"""
|
403
357
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
404
358
|
output_path, unlink = await create_temp_file(".md")
|
405
359
|
try:
|
@@ -431,14 +385,6 @@ class PandocExtractor(Extractor):
|
|
431
385
|
await unlink()
|
432
386
|
|
433
387
|
def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
|
434
|
-
"""Extract structured metadata from Pandoc JSON metadata.
|
435
|
-
|
436
|
-
Args:
|
437
|
-
raw_meta: The raw metadata from Pandoc
|
438
|
-
|
439
|
-
Returns:
|
440
|
-
Structured metadata
|
441
|
-
"""
|
442
388
|
meta: Metadata = {}
|
443
389
|
|
444
390
|
if (
|
@@ -485,16 +431,6 @@ class PandocExtractor(Extractor):
|
|
485
431
|
return meta
|
486
432
|
|
487
433
|
def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
|
488
|
-
"""Extract text from an inline node in a document structure.
|
489
|
-
|
490
|
-
Args:
|
491
|
-
node: The node to extract text from
|
492
|
-
type_field: The field name for the node type
|
493
|
-
content_field: The field name for the node content
|
494
|
-
|
495
|
-
Returns:
|
496
|
-
The extracted text or None if no text could be extracted
|
497
|
-
"""
|
498
434
|
if node_type := node.get(type_field):
|
499
435
|
if node_type == "Str":
|
500
436
|
return node.get(content_field)
|
@@ -505,29 +441,11 @@ class PandocExtractor(Extractor):
|
|
505
441
|
return None
|
506
442
|
|
507
443
|
def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
|
508
|
-
"""Extract text from a list of inline nodes.
|
509
|
-
|
510
|
-
Args:
|
511
|
-
nodes: The list of nodes to extract text from
|
512
|
-
|
513
|
-
Returns:
|
514
|
-
The extracted text or None if no text could be extracted
|
515
|
-
"""
|
516
444
|
texts = [text for node in nodes if (text := self._extract_inline_text(node))]
|
517
445
|
result = "".join(texts).strip()
|
518
446
|
return result if result else None
|
519
447
|
|
520
448
|
def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
|
521
|
-
"""Extract a metadata value from a node.
|
522
|
-
|
523
|
-
Args:
|
524
|
-
node: The node to extract metadata from
|
525
|
-
type_field: The field name for the node type
|
526
|
-
content_field: The field name for the node content
|
527
|
-
|
528
|
-
Returns:
|
529
|
-
The extracted metadata value or None if no metadata could be extracted
|
530
|
-
"""
|
531
449
|
if not isinstance(node, dict) or type_field not in node:
|
532
450
|
return None
|
533
451
|
|
@@ -577,12 +495,17 @@ class PandocExtractor(Extractor):
|
|
577
495
|
return None
|
578
496
|
|
579
497
|
def _validate_pandoc_version_sync(self) -> None:
|
580
|
-
"""Synchronous version of _validate_pandoc_version."""
|
581
498
|
try:
|
582
499
|
if self._checked_version:
|
583
500
|
return
|
584
501
|
|
585
|
-
result = subprocess.run(
|
502
|
+
result = subprocess.run(
|
503
|
+
["pandoc", "--version"], # noqa: S607
|
504
|
+
capture_output=True,
|
505
|
+
text=True,
|
506
|
+
check=False,
|
507
|
+
encoding="utf-8",
|
508
|
+
)
|
586
509
|
|
587
510
|
if result.returncode != 0:
|
588
511
|
raise MissingDependencyError(
|
@@ -621,7 +544,6 @@ class PandocExtractor(Extractor):
|
|
621
544
|
) from e
|
622
545
|
|
623
546
|
def _extract_metadata_sync(self, path: Path) -> Metadata:
|
624
|
-
"""Synchronous version of _handle_extract_metadata."""
|
625
547
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
626
548
|
fd, metadata_file = tempfile.mkstemp(suffix=".json")
|
627
549
|
os.close(fd)
|
@@ -638,7 +560,7 @@ class PandocExtractor(Extractor):
|
|
638
560
|
str(metadata_file),
|
639
561
|
]
|
640
562
|
|
641
|
-
result = subprocess.run(command, capture_output=True, text=True, check=False)
|
563
|
+
result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
|
642
564
|
|
643
565
|
if result.returncode != 0:
|
644
566
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|
@@ -655,7 +577,6 @@ class PandocExtractor(Extractor):
|
|
655
577
|
Path(metadata_file).unlink()
|
656
578
|
|
657
579
|
def _extract_file_sync(self, path: Path) -> str:
|
658
|
-
"""Synchronous version of _handle_extract_file."""
|
659
580
|
pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
|
660
581
|
fd, output_path = tempfile.mkstemp(suffix=".md")
|
661
582
|
os.close(fd)
|
@@ -673,7 +594,7 @@ class PandocExtractor(Extractor):
|
|
673
594
|
str(output_path),
|
674
595
|
]
|
675
596
|
|
676
|
-
result = subprocess.run(command, capture_output=True, text=True, check=False)
|
597
|
+
result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
|
677
598
|
|
678
599
|
if result.returncode != 0:
|
679
600
|
raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
|