kreuzberg-2.1.2-py3-none-any.whl → kreuzberg-3.0.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. kreuzberg/__init__.py +16 -2
  2. kreuzberg/_chunker.py +51 -0
  3. kreuzberg/_constants.py +2 -3
  4. kreuzberg/_extractors/__init__.py +0 -0
  5. kreuzberg/_extractors/_base.py +92 -0
  6. kreuzberg/_extractors/_html.py +34 -0
  7. kreuzberg/_extractors/_image.py +74 -0
  8. kreuzberg/_extractors/_pandoc.py +613 -0
  9. kreuzberg/_extractors/_pdf.py +163 -0
  10. kreuzberg/_extractors/_presentation.py +233 -0
  11. kreuzberg/_extractors/_spread_sheet.py +125 -0
  12. kreuzberg/_mime_types.py +19 -26
  13. kreuzberg/_ocr/__init__.py +17 -0
  14. kreuzberg/_ocr/_base.py +54 -0
  15. kreuzberg/_ocr/_easyocr.py +376 -0
  16. kreuzberg/_ocr/_paddleocr.py +291 -0
  17. kreuzberg/_ocr/_tesseract.py +342 -0
  18. kreuzberg/_playa.py +276 -0
  19. kreuzberg/_registry.py +108 -0
  20. kreuzberg/_types.py +133 -36
  21. kreuzberg/_utils/__init__.py +0 -0
  22. kreuzberg/{_string.py → _utils/_string.py} +0 -2
  23. kreuzberg/_utils/_sync.py +121 -0
  24. kreuzberg/{_tmp.py → _utils/_tmp.py} +1 -1
  25. kreuzberg/exceptions.py +25 -0
  26. kreuzberg/extraction.py +114 -227
  27. kreuzberg-3.0.1.dist-info/METADATA +178 -0
  28. kreuzberg-3.0.1.dist-info/RECORD +32 -0
  29. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/WHEEL +1 -1
  30. kreuzberg/_html.py +0 -31
  31. kreuzberg/_pandoc.py +0 -366
  32. kreuzberg/_pdf.py +0 -190
  33. kreuzberg/_pptx.py +0 -88
  34. kreuzberg/_sync.py +0 -74
  35. kreuzberg/_tesseract.py +0 -231
  36. kreuzberg/_xlsx.py +0 -88
  37. kreuzberg-2.1.2.dist-info/METADATA +0 -446
  38. kreuzberg-2.1.2.dist-info/RECORD +0 -21
  39. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info/licenses}/LICENSE +0 -0
  40. {kreuzberg-2.1.2.dist-info → kreuzberg-3.0.1.dist-info}/top_level.txt +0 -0
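
The list above shows the 3.x restructure: the per-format modules (_pdf.py, _html.py, _pptx.py, _xlsx.py, _tesseract.py) are replaced by an _extractors package plus pluggable _ocr backends (Tesseract, EasyOCR, PaddleOCR) wired through a new _registry.py. The high-level entry point survives the move; below is a minimal usage sketch, assuming extract_file keeps the async signature it had in 2.x (the input file name is hypothetical):

    import anyio

    from kreuzberg import extract_file


    async def main() -> None:
        # extract_file picks an extractor for the detected MIME type
        result = await extract_file("document.pdf")  # hypothetical input file
        print(result.mime_type)
        print(result.content[:200])


    anyio.run(main)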
kreuzberg/_ocr/_tesseract.py ADDED
@@ -0,0 +1,342 @@
+ from __future__ import annotations
+
+ import re
+ import sys
+ from dataclasses import dataclass
+ from enum import Enum
+ from typing import TYPE_CHECKING, Any, ClassVar, Final
+
+ from anyio import Path as AsyncPath
+ from anyio import run_process
+
+ from kreuzberg._mime_types import PLAIN_TEXT_MIME_TYPE
+ from kreuzberg._ocr._base import OCRBackend
+ from kreuzberg._types import ExtractionResult
+ from kreuzberg._utils._string import normalize_spaces
+ from kreuzberg._utils._sync import run_sync
+ from kreuzberg._utils._tmp import create_temp_file
+ from kreuzberg.exceptions import MissingDependencyError, OCRError, ValidationError
+
+ if TYPE_CHECKING:
+     from pathlib import Path
+
+     from PIL.Image import Image
+
+ try:  # pragma: no cover
+     from typing import Unpack  # type: ignore[attr-defined]
+ except ImportError:  # pragma: no cover
+     from typing_extensions import Unpack
+
+
+ TESSERACT_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
+     "afr",
+     "amh",
+     "ara",
+     "asm",
+     "aze",
+     "aze_cyrl",
+     "bel",
+     "ben",
+     "bod",
+     "bos",
+     "bre",
+     "bul",
+     "cat",
+     "ceb",
+     "ces",
+     "chi_sim",
+     "chi_tra",
+     "chr",
+     "cos",
+     "cym",
+     "dan",
+     "dan_frak",
+     "deu",
+     "deu_frak",
+     "deu_latf",
+     "dzo",
+     "ell",
+     "eng",
+     "enm",
+     "epo",
+     "equ",
+     "est",
+     "eus",
+     "fao",
+     "fas",
+     "fil",
+     "fin",
+     "fra",
+     "frk",
+     "frm",
+     "fry",
+     "gla",
+     "gle",
+     "glg",
+     "grc",
+     "guj",
+     "hat",
+     "heb",
+     "hin",
+     "hrv",
+     "hun",
+     "hye",
+     "iku",
+     "ind",
+     "isl",
+     "ita",
+     "ita_old",
+     "jav",
+     "jpn",
+     "kan",
+     "kat",
+     "kat_old",
+     "kaz",
+     "khm",
+     "kir",
+     "kmr",
+     "kor",
+     "kor_vert",
+     "kur",
+     "lao",
+     "lat",
+     "lav",
+     "lit",
+     "ltz",
+     "mal",
+     "mar",
+     "mkd",
+     "mlt",
+     "mon",
+     "mri",
+     "msa",
+     "mya",
+     "nep",
+     "nld",
+     "nor",
+     "oci",
+     "ori",
+     "osd",
+     "pan",
+     "pol",
+     "por",
+     "pus",
+     "que",
+     "ron",
+     "rus",
+     "san",
+     "sin",
+     "slk",
+     "slk_frak",
+     "slv",
+     "snd",
+     "spa",
+     "spa_old",
+     "sqi",
+     "srp",
+     "srp_latn",
+     "sun",
+     "swa",
+     "swe",
+     "syr",
+     "tam",
+     "tat",
+     "tel",
+     "tgk",
+     "tgl",
+     "tha",  # codespell:ignore
+     "tir",
+     "ton",
+     "tur",
+     "uig",
+     "ukr",
+     "urd",
+     "uzb",
+     "uzb_cyrl",
+     "vie",  # codespell:ignore
+     "yid",
+     "yor",
+ }
+
+ MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
+
+
+ class PSMMode(Enum):
+     """Enum for Tesseract Page Segmentation Modes (PSM) with human-readable values."""
+
+     OSD_ONLY = 0
+     """Orientation and script detection only."""
+     AUTO_OSD = 1
+     """Automatic page segmentation with orientation and script detection."""
+     AUTO_ONLY = 2
+     """Automatic page segmentation without OSD."""
+     AUTO = 3
+     """Fully automatic page segmentation (default)."""
+     SINGLE_COLUMN = 4
+     """Assume a single column of text."""
+     SINGLE_BLOCK_VERTICAL = 5
+     """Assume a single uniform block of vertically aligned text."""
+     SINGLE_BLOCK = 6
+     """Assume a single uniform block of text."""
+     SINGLE_LINE = 7
+     """Treat the image as a single text line."""
+     SINGLE_WORD = 8
+     """Treat the image as a single word."""
+     CIRCLE_WORD = 9
+     """Treat the image as a single word in a circle."""
+     SINGLE_CHAR = 10
+     """Treat the image as a single character."""
+
+
+ @dataclass(unsafe_hash=True, frozen=True)
+ class TesseractConfig:
+     """Configuration options for Tesseract OCR engine."""
+
+     classify_use_pre_adapted_templates: bool = True
+     """Whether to use pre-adapted templates during classification to improve recognition accuracy."""
+     language: str = "eng"
+     """Language code to use for OCR.
+     Examples:
+         - 'eng' for English
+         - 'deu' for German
+         - multiple languages combined with '+', e.g. 'eng+deu'
+     """
+     language_model_ngram_on: bool = True
+     """Enable or disable the use of n-gram-based language models for improved text recognition."""
+     psm: PSMMode = PSMMode.AUTO
+     """Page segmentation mode (PSM) to guide Tesseract on how to segment the image (e.g., single block, single line)."""
+     tessedit_dont_blkrej_good_wds: bool = True
+     """If True, prevents block rejection of words identified as good, improving text output quality."""
+     tessedit_dont_rowrej_good_wds: bool = True
+     """If True, prevents row rejection of words identified as good, avoiding unnecessary omissions."""
+     tessedit_enable_dict_correction: bool = True
+     """Enable or disable dictionary-based correction for recognized text to improve word accuracy."""
+     tessedit_use_primary_params_model: bool = True
+     """If True, forces the use of the primary parameters model for text recognition."""
+     textord_space_size_is_variable: bool = True
+     """Allow variable spacing between words, useful for text with irregular spacing."""
+     thresholding_method: bool = False
+     """Enable or disable specific thresholding methods during image preprocessing for better OCR accuracy."""
+
+
+ class TesseractBackend(OCRBackend[TesseractConfig]):
+     _version_checked: ClassVar[bool] = False
+
+     async def process_image(
+         self,
+         image: Image,
+         **kwargs: Unpack[TesseractConfig],
+     ) -> ExtractionResult:
+         await self._validate_tesseract_version()
+         image_path, unlink = await create_temp_file(".png")
+         await run_sync(image.save, str(image_path), format="PNG")
+         try:
+             return await self.process_file(image_path, **kwargs)
+         finally:
+             await unlink()
+
+     async def process_file(
+         self,
+         path: Path,
+         **kwargs: Unpack[TesseractConfig],
+     ) -> ExtractionResult:
+         await self._validate_tesseract_version()
+         output_path, unlink = await create_temp_file(".txt")
+         language = self._validate_language_code(kwargs.pop("language", "eng"))
+         psm = kwargs.pop("psm", PSMMode.AUTO)
+         try:
+             output_base = str(output_path).replace(".txt", "")
+             command = [
+                 "tesseract",
+                 str(path),
+                 output_base,
+                 "-l",
+                 language,
+                 "--psm",
+                 str(psm.value),
+                 "--oem",
+                 "1",
+                 "--loglevel",
+                 "OFF",
+             ]
+             for kwarg, value in kwargs.items():
+                 command.extend(["-c", f"{kwarg}={1 if value else 0}"])
+
+             env: dict[str, Any] | None = None
+             if sys.platform.startswith("linux"):
+                 # we have to prevent multithreading this way, otherwise we get deadlocks ~keep
+                 env = {"OMP_THREAD_LIMIT": "1"}
+
+             result = await run_process(command, env=env)
+
+             if result.returncode != 0:
+                 raise OCRError(
+                     "OCR failed with a non-0 return code.",
+                     context={"error": result.stderr.decode() if isinstance(result.stderr, bytes) else result.stderr},
+                 )
+
+             output = await AsyncPath(output_path).read_text("utf-8")
+             return ExtractionResult(
+                 content=normalize_spaces(output), mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[]
+             )
+         except (RuntimeError, OSError) as e:
+             raise OCRError(f"Failed to OCR using tesseract: {e}") from e
+         finally:
+             await unlink()
+
+     @classmethod
+     async def _validate_tesseract_version(cls) -> None:
+         """Validate that Tesseract is installed and is version 5 or above.
+
+         Raises:
+             MissingDependencyError: If Tesseract is not installed or is below version 5.
+         """
+         try:
+             if cls._version_checked:
+                 return
+
+             command = ["tesseract", "--version"]
+             result = await run_process(command)
+             version_match = re.search(r"tesseract\s+v?(\d+)\.\d+\.\d+", result.stdout.decode())
+             if not version_match or int(version_match.group(1)) < MINIMAL_SUPPORTED_TESSERACT_VERSION:
+                 raise MissingDependencyError(
+                     "Tesseract version 5 is a required system dependency. Please install it on your system and make sure it's available in $PATH."
+                 )
+
+             cls._version_checked = True
+         except FileNotFoundError as e:
+             raise MissingDependencyError(
+                 "Tesseract version 5 is a required system dependency. Please install it on your system and make sure it's available in $PATH."
+             ) from e
+
+     @staticmethod
+     def _validate_language_code(language_code: str) -> str:
+         """Convert a language code to Tesseract format.
+
+         Args:
+             language_code: Tesseract supported language code or multiple language codes connected with '+'
+
+         Raises:
+             ValidationError: If the language is not supported by Tesseract
+
+         Returns:
+             Language code compatible with Tesseract
+         """
+         normalized = language_code.lower()
+         if normalized in TESSERACT_SUPPORTED_LANGUAGE_CODES:
+             return normalized
+
+         if "+" in normalized and all(lang in TESSERACT_SUPPORTED_LANGUAGE_CODES for lang in normalized.split("+")):
+             return normalized
+
+         raise ValidationError(
+             "The provided language code is not supported by Tesseract",
+             context={
+                 "language_code": normalized
+                 if "+" not in normalized
+                 else ",".join(
+                     [lang for lang in normalized.split("+") if lang not in TESSERACT_SUPPORTED_LANGUAGE_CODES]
+                 ),
+                 "supported_languages": ",".join(sorted(TESSERACT_SUPPORTED_LANGUAGE_CODES)),
+             },
+         )
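
A minimal usage sketch of the backend defined above. The import path mirrors this diff but is an internal module rather than documented public API, and the sample image path is hypothetical:

    from pathlib import Path

    import anyio

    from kreuzberg._ocr._tesseract import PSMMode, TesseractBackend


    async def main() -> None:
        backend = TesseractBackend()
        # language and psm are popped in process_file; any remaining
        # TesseractConfig fields are forwarded as -c key=0/1 flags
        result = await backend.process_file(
            Path("scan.png"),  # hypothetical input image
            language="eng+deu",
            psm=PSMMode.SINGLE_BLOCK,
        )
        print(result.content)


    anyio.run(main)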
kreuzberg/_playa.py ADDED
@@ -0,0 +1,276 @@
+ from __future__ import annotations
+
+ from datetime import datetime
+ from typing import TYPE_CHECKING, Any, cast
+
+ from playa import asobj, parse
+ from playa.utils import decode_text
+
+ from kreuzberg.exceptions import ParsingError
+
+ if TYPE_CHECKING:
+     from playa.document import Document
+
+     from kreuzberg._types import Metadata
+
+
+ GRAY_COMPONENTS = 1
+ RGB_COMPONENTS = 3
+ CMYK_COMPONENTS = 4
+ UTF16BE_BOM = b"\xfe\xff"
+ UTF16BE_ENCODING = "utf-16be"
+ MIN_DATE_LENGTH = 8
+ FULL_DATE_LENGTH = 14
+ BOM_CHAR = "\ufeff"
+
+
+ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
+     """Extract metadata from a PDF document.
+
+     Args:
+         pdf_content: The bytes of the PDF document.
+
+     Raises:
+         ParsingError: If the PDF metadata could not be extracted.
+
+     Returns:
+         A dictionary of metadata extracted from the PDF.
+     """
+     try:
+         document = parse(pdf_content, max_workers=1)
+         metadata: Metadata = {}
+
+         for raw_info in document.info:
+             pdf_info = {k.lower(): v for k, v in asobj(raw_info).items()}
+             _extract_basic_metadata(pdf_info, metadata)
+             _extract_author_metadata(pdf_info, metadata)
+             _extract_keyword_metadata(pdf_info, metadata)
+             _extract_category_metadata(pdf_info, metadata)
+             _extract_date_metadata(pdf_info, metadata)
+             _extract_creator_metadata(pdf_info, metadata)
+
+         if document.pages:
+             _extract_document_dimensions(document, metadata)
+
+         if document.outline and "description" not in metadata:
+             metadata["description"] = _generate_outline_description(document)
+
+         if "summary" not in metadata:
+             metadata["summary"] = _generate_document_summary(document)
+
+         _extract_structure_information(document, metadata)
+
+         return metadata
+     except Exception as e:
+         raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
+
+
+ def _extract_basic_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if "title" not in result and (title := pdf_info.get("title")):
+         result["title"] = decode_text(title)
+
+     if "subject" not in result and (subject := pdf_info.get("subject")):
+         result["subject"] = decode_text(subject)
+
+     if "publisher" not in result and (publisher := pdf_info.get("Publisher", pdf_info.get("publisher"))):
+         result["publisher"] = decode_text(publisher)
+
+     if "copyright" not in result and (copyright_info := pdf_info.get("copyright") or pdf_info.get("rights")):
+         result["copyright"] = decode_text(copyright_info)
+
+     if "comments" not in result and (comments := pdf_info.get("comments")):
+         result["comments"] = decode_text(comments)
+
+     if "identifier" not in result and (identifier := pdf_info.get("identifier") or pdf_info.get("id")):
+         result["identifier"] = decode_text(identifier)
+
+     if "license" not in result and (license_info := pdf_info.get("license")):
+         result["license"] = decode_text(license_info)
+
+     if "modified_by" not in result and (modified_by := pdf_info.get("modifiedby") or pdf_info.get("last_modified_by")):
+         result["modified_by"] = decode_text(modified_by)
+
+     if "version" not in result and (version := pdf_info.get("version")):
+         result["version"] = decode_text(version)
+
+
+ def _extract_author_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if author := pdf_info.get("author"):
+         if isinstance(author, (str, bytes)):
+             author_str = decode_text(author)
+             author_str = author_str.replace(" and ", ", ")
+
+             authors = []
+             for author_segment in author_str.split(";"):
+                 authors.extend(
+                     [author_name.strip() for author_name in author_segment.split(",") if author_name.strip()]
+                 )
+             result["authors"] = authors
+         elif isinstance(author, list):
+             result["authors"] = [decode_text(a) for a in author]
+
+
+ def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if keywords := pdf_info.get("keywords"):
+         if isinstance(keywords, (str, bytes)):
+             kw_str = decode_text(keywords)
+             kw_list = [k.strip() for k in kw_str.split(",")]
+             kw_list = [k.strip() for k in " ".join(kw_list).split(";")]
+             result["keywords"] = [k for k in kw_list if k]
+         elif isinstance(keywords, list):
+             result["keywords"] = [decode_text(k) for k in keywords]
+
+
+ def _extract_category_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if categories := pdf_info.get("categories") or pdf_info.get("category"):
+         if isinstance(categories, (str, bytes)):
+             cat_str = decode_text(categories)
+             cat_list = [c.strip() for c in cat_str.split(",")]
+             result["categories"] = [c for c in cat_list if c]
+         elif isinstance(categories, list):
+             result["categories"] = [decode_text(c) for c in categories]
+
+
+ def _parse_date_string(date_str: str) -> str:
+     date_str = date_str.removeprefix("D:")
+     if len(date_str) >= MIN_DATE_LENGTH:
+         year = date_str[0:4]
+         month = date_str[4:6]
+         day = date_str[6:8]
+         time_part = ""
+         if len(date_str) >= FULL_DATE_LENGTH:
+             hour = date_str[8:10]
+             minute = date_str[10:12]
+             second = date_str[12:14]
+             time_part = f"T{hour}:{minute}:{second}"
+         return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S" if time_part else "%Y-%m-%d").isoformat()  # noqa: DTZ007
+     return date_str
+
+
+ def _extract_date_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if created := pdf_info.get("creationdate") or pdf_info.get("createdate"):
+         try:
+             date_str = decode_text(created)
+             result["created_at"] = _parse_date_string(date_str)
+         except (ValueError, IndexError):
+             result["created_at"] = decode_text(created)
+
+     if modified := pdf_info.get("moddate") or pdf_info.get("modificationdate"):
+         try:
+             date_str = decode_text(modified)
+             result["modified_at"] = _parse_date_string(date_str)
+         except (ValueError, IndexError):
+             result["modified_at"] = decode_text(modified)
+
+
+ def _extract_creator_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
+     if creator := pdf_info.get("creator"):
+         result["created_by"] = decode_text(creator)
+
+     if producer := pdf_info.get("producer"):
+         producer_str = decode_text(producer)
+         if "created_by" not in result:
+             result["created_by"] = producer_str
+         elif producer_str not in result["created_by"]:
+             result["created_by"] = f"{result['created_by']} (Producer: {producer_str})"
+
+
+ def _extract_document_dimensions(document: Document, result: Metadata) -> None:
+     first_page = document.pages[0]
+     if hasattr(first_page, "width") and hasattr(first_page, "height"):
+         result["width"] = int(first_page.width)
+         result["height"] = int(first_page.height)
+
+
+ def _format_outline(entries: list[Any], level: int = 0) -> list[str]:
+     outline_text: list[str] = []
+     for entry in entries:
+         if hasattr(entry, "title") and entry.title:
+             indent = " " * level
+             outline_text.append(f"{indent}- {entry.title}")
+         if hasattr(entry, "children") and entry.children:
+             outline_text.extend(_format_outline(entry.children, level + 1))
+
+     return outline_text
+
+
+ def _generate_outline_description(document: Document) -> str:
+     if outline_text := _format_outline(cast("list[Any]", document.outline)):
+         return "Table of Contents:\n" + "\n".join(outline_text)
+     return ""
+
+
+ def _generate_document_summary(document: Document) -> str:
+     summary_parts = []
+
+     page_count = len(document.pages)
+     summary_parts.append(f"PDF document with {page_count} page{'s' if page_count != 1 else ''}.")
+
+     if hasattr(document, "pdf_version"):
+         summary_parts.append(f"PDF version {document.pdf_version}.")
+
+     if hasattr(document, "is_encrypted") and document.is_encrypted:
+         summary_parts.append("Document is encrypted.")
+
+     if hasattr(document, "encryption_method") and document.encryption_method:
+         summary_parts.append(f"Encryption: {document.encryption_method}.")
+
+     permissions = _collect_document_permissions(document)
+     if permissions:
+         summary_parts.append(f"Document is {', '.join(permissions)}.")
+
+     if hasattr(document, "status") and document.status:
+         status = decode_text(document.status)
+         summary_parts.append(f"Status: {status}.")
+
+     if hasattr(document, "is_pdf_a") and document.is_pdf_a:
+         if hasattr(document, "pdf_a_level") and document.pdf_a_level:
+             summary_parts.append(f"PDF/A-{document.pdf_a_level} compliant.")
+         else:
+             summary_parts.append("PDF/A compliant.")
+
+     return " ".join(summary_parts)
+
+
+ def _collect_document_permissions(document: Document) -> list[str]:
+     permissions = []
+     if document.is_printable:
+         permissions.append("printable")
+     if document.is_modifiable:
+         permissions.append("modifiable")
+     if document.is_extractable:
+         permissions.append("extractable")
+     return permissions
+
+
+ def _extract_structure_information(document: Document, result: Metadata) -> None:
+     """Extract language and subtitle from document structure."""
+     if document.structure:
+         languages = set()
+         subtitle = None
+
+         def extract_languages(elements: list[Any]) -> None:
+             nonlocal subtitle
+             for element in elements:
+                 if hasattr(element, "language") and element.language:
+                     languages.add(element.language.lower())
+
+                 if (
+                     subtitle is None
+                     and hasattr(element, "role")
+                     and element.role == "H1"
+                     and hasattr(element, "text")
+                     and element.text
+                 ):
+                     subtitle = decode_text(element.text)
+
+                 if hasattr(element, "children") and element.children:
+                     extract_languages(element.children)
+
+         extract_languages(cast("list[Any]", document.structure))
+
+         if languages:
+             result["languages"] = list(languages)
+
+         if subtitle and "title" in result and subtitle != result["title"]:
+             result["subtitle"] = subtitle
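
Finally, a short sketch of driving the metadata helper above. kreuzberg._playa is an internal module, so treat the import path as an assumption, and the sample PDF path is hypothetical:

    import anyio

    from kreuzberg._playa import extract_pdf_metadata


    async def main() -> None:
        pdf_bytes = await anyio.Path("report.pdf").read_bytes()  # hypothetical input
        metadata = await extract_pdf_metadata(pdf_bytes)
        # Metadata is a plain dict at runtime, so .get works for optional keys
        print(metadata.get("title"), metadata.get("created_at"))


    anyio.run(main)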