kreuzberg 2.1.1__py3-none-any.whl → 3.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +16 -2
- kreuzberg/_chunker.py +51 -0
- kreuzberg/_constants.py +2 -3
- kreuzberg/_mime_types.py +19 -26
- kreuzberg/_playa.py +276 -0
- kreuzberg/_registry.py +108 -0
- kreuzberg/_types.py +133 -36
- kreuzberg/exceptions.py +25 -0
- kreuzberg/extraction.py +114 -227
- kreuzberg-3.0.0.dist-info/METADATA +178 -0
- kreuzberg-3.0.0.dist-info/RECORD +15 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/WHEEL +1 -1
- kreuzberg/_html.py +0 -31
- kreuzberg/_pandoc.py +0 -366
- kreuzberg/_pdf.py +0 -190
- kreuzberg/_pptx.py +0 -88
- kreuzberg/_string.py +0 -41
- kreuzberg/_sync.py +0 -74
- kreuzberg/_tesseract.py +0 -231
- kreuzberg/_tmp.py +0 -37
- kreuzberg/_xlsx.py +0 -88
- kreuzberg-2.1.1.dist-info/METADATA +0 -446
- kreuzberg-2.1.1.dist-info/RECORD +0 -21
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info/licenses}/LICENSE +0 -0
- {kreuzberg-2.1.1.dist-info → kreuzberg-3.0.0.dist-info}/top_level.txt +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,5 +1,10 @@
|
|
1
|
-
from .
|
2
|
-
from .
|
1
|
+
from kreuzberg._ocr._easyocr import EasyOCRConfig
|
2
|
+
from kreuzberg._ocr._paddleocr import PaddleOCRConfig
|
3
|
+
from kreuzberg._ocr._tesseract import TesseractConfig
|
4
|
+
|
5
|
+
from ._ocr._tesseract import PSMMode
|
6
|
+
from ._registry import ExtractorRegistry
|
7
|
+
from ._types import ExtractionConfig, ExtractionResult, Metadata
|
3
8
|
from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
|
4
9
|
from .extraction import (
|
5
10
|
batch_extract_bytes,
|
@@ -7,22 +12,31 @@ from .extraction import (
|
|
7
12
|
batch_extract_file,
|
8
13
|
batch_extract_file_sync,
|
9
14
|
extract_bytes,
|
15
|
+
extract_bytes_sync,
|
10
16
|
extract_file,
|
17
|
+
extract_file_sync,
|
11
18
|
)
|
12
19
|
|
13
20
|
__all__ = [
|
21
|
+
"EasyOCRConfig",
|
22
|
+
"ExtractionConfig",
|
14
23
|
"ExtractionResult",
|
24
|
+
"ExtractorRegistry",
|
15
25
|
"KreuzbergError",
|
16
26
|
"Metadata",
|
17
27
|
"MissingDependencyError",
|
18
28
|
"OCRError",
|
19
29
|
"PSMMode",
|
30
|
+
"PaddleOCRConfig",
|
20
31
|
"ParsingError",
|
32
|
+
"TesseractConfig",
|
21
33
|
"ValidationError",
|
22
34
|
"batch_extract_bytes",
|
23
35
|
"batch_extract_bytes_sync",
|
24
36
|
"batch_extract_file",
|
25
37
|
"batch_extract_file_sync",
|
26
38
|
"extract_bytes",
|
39
|
+
"extract_bytes_sync",
|
27
40
|
"extract_file",
|
41
|
+
"extract_file_sync",
|
28
42
|
]
|
kreuzberg/_chunker.py
ADDED
@@ -0,0 +1,51 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from typing import TYPE_CHECKING
|
4
|
+
|
5
|
+
from kreuzberg import MissingDependencyError
|
6
|
+
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
7
|
+
from kreuzberg._mime_types import MARKDOWN_MIME_TYPE
|
8
|
+
|
9
|
+
if TYPE_CHECKING:
|
10
|
+
from semantic_text_splitter import MarkdownSplitter, TextSplitter
|
11
|
+
|
12
|
+
_chunkers: dict[tuple[int, int, str], MarkdownSplitter | TextSplitter] = {}
|
13
|
+
|
14
|
+
|
15
|
+
def get_chunker(
    mime_type: str,
    max_characters: int = DEFAULT_MAX_CHARACTERS,
    overlap_characters: int = DEFAULT_MAX_OVERLAP,
) -> MarkdownSplitter | TextSplitter:
    """Return a cached text splitter for the given mime type and chunk limits.

    Markdown content gets a ``MarkdownSplitter``; anything else gets a plain
    ``TextSplitter``.  Instances are memoized per (size, overlap, mime type).

    Args:
        mime_type: The mime type of the content.
        max_characters: Maximum number of characters allowed in each chunk.
        overlap_characters: Number of characters overlapping between two consecutive chunks.

    Raises:
        MissingDependencyError: if semantic-text-splitter is not installed.

    Returns:
        The configured splitter instance.
    """
    cache_key = (max_characters, overlap_characters, mime_type)
    cached = _chunkers.get(cache_key)
    if cached is not None:
        return cached

    try:
        # Import lazily so the optional dependency is only required when
        # chunking is actually used.
        if mime_type == MARKDOWN_MIME_TYPE:
            from semantic_text_splitter import MarkdownSplitter

            splitter: MarkdownSplitter | TextSplitter = MarkdownSplitter(max_characters, overlap_characters)
        else:
            from semantic_text_splitter import TextSplitter

            splitter = TextSplitter(max_characters, overlap_characters)
    except ImportError as e:
        raise MissingDependencyError.create_for_package(
            dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
        ) from e

    _chunkers[cache_key] = splitter
    return splitter
kreuzberg/_constants.py
CHANGED
@@ -1,8 +1,7 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from multiprocessing import cpu_count
|
4
3
|
from typing import Final
|
5
4
|
|
6
|
-
DEFAULT_MAX_PROCESSES: Final[int] = cpu_count()
|
7
|
-
MINIMAL_SUPPORTED_TESSERACT_VERSION: Final[int] = 5
|
8
5
|
MINIMAL_SUPPORTED_PANDOC_VERSION: Final[int] = 2
|
6
|
+
DEFAULT_MAX_CHARACTERS: Final[int] = 2000
|
7
|
+
DEFAULT_MAX_OVERLAP: Final[int] = 100
|
kreuzberg/_mime_types.py
CHANGED
@@ -16,7 +16,7 @@ PDF_MIME_TYPE: Final = "application/pdf"
|
|
16
16
|
PLAIN_TEXT_MIME_TYPE: Final = "text/plain"
|
17
17
|
POWER_POINT_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.presentationml.presentation"
|
18
18
|
DOCX_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
19
|
-
|
19
|
+
|
20
20
|
EXCEL_MIME_TYPE: Final = "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"
|
21
21
|
EXCEL_BINARY_MIME_TYPE: Final = "application/vnd.ms-excel"
|
22
22
|
EXCEL_MACRO_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.macroEnabled.12"
|
@@ -24,8 +24,8 @@ EXCEL_BINARY_2007_MIME_TYPE: Final = "application/vnd.ms-excel.sheet.binary.macr
|
|
24
24
|
EXCEL_ADDON_MIME_TYPE: Final = "application/vnd.ms-excel.addin.macroEnabled.12"
|
25
25
|
EXCEL_TEMPLATE_MIME_TYPE: Final = "application/vnd.ms-excel.template.macroEnabled.12"
|
26
26
|
|
27
|
-
|
28
|
-
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
27
|
+
|
28
|
+
OPENDOC_SPREADSHEET_MIME_TYPE: Final = "application/vnd.oasis.opendocument.spreadsheet"
|
29
29
|
PLAIN_TEXT_MIME_TYPES: Final[set[str]] = {PLAIN_TEXT_MIME_TYPE, MARKDOWN_MIME_TYPE}
|
30
30
|
|
31
31
|
IMAGE_MIME_TYPES: Final[set[str]] = {
|
@@ -48,26 +48,7 @@ IMAGE_MIME_TYPES: Final[set[str]] = {
|
|
48
48
|
"image/x-portable-pixmap",
|
49
49
|
"image/x-tiff",
|
50
50
|
}
|
51
|
-
|
52
|
-
"image/bmp": "bmp",
|
53
|
-
"image/x-bmp": "bmp",
|
54
|
-
"image/x-ms-bmp": "bmp",
|
55
|
-
"image/gif": "gif",
|
56
|
-
"image/jpeg": "jpg",
|
57
|
-
"image/pjpeg": "jpg",
|
58
|
-
"image/png": "png",
|
59
|
-
"image/tiff": "tiff",
|
60
|
-
"image/x-tiff": "tiff",
|
61
|
-
"image/jp2": "jp2",
|
62
|
-
"image/jpx": "jpx",
|
63
|
-
"image/jpm": "jpm",
|
64
|
-
"image/mj2": "mj2",
|
65
|
-
"image/webp": "webp",
|
66
|
-
"image/x-portable-anymap": "pnm",
|
67
|
-
"image/x-portable-bitmap": "pbm",
|
68
|
-
"image/x-portable-graymap": "pgm",
|
69
|
-
"image/x-portable-pixmap": "ppm",
|
70
|
-
}
|
51
|
+
|
71
52
|
PANDOC_SUPPORTED_MIME_TYPES: Final[set[str]] = {
|
72
53
|
"application/csl+json",
|
73
54
|
"application/docbook+xml",
|
@@ -162,13 +143,17 @@ SUPPORTED_MIME_TYPES: Final[set[str]] = (
|
|
162
143
|
)
|
163
144
|
|
164
145
|
|
165
|
-
def validate_mime_type(
|
146
|
+
def validate_mime_type(
|
147
|
+
*, file_path: PathLike[str] | str | None = None, mime_type: str | None = None, check_file_exists: bool = True
|
148
|
+
) -> str:
|
166
149
|
"""Validate and detect the MIME type for a given file.
|
167
150
|
|
168
151
|
Args:
|
169
152
|
file_path: The path to the file.
|
170
153
|
mime_type: Optional explicit MIME type. If provided, this will be validated.
|
171
154
|
If not provided, the function will attempt to detect the MIME type.
|
155
|
+
check_file_exists: Whether to check if the file exists. Default is True.
|
156
|
+
Set to False in tests where you want to validate a mime type without an actual file.
|
172
157
|
|
173
158
|
Raises:
|
174
159
|
ValidationError: If the MIME type is not supported or cannot be determined.
|
@@ -176,10 +161,18 @@ def validate_mime_type(file_path: PathLike[str] | str, mime_type: str | None = N
|
|
176
161
|
Returns:
|
177
162
|
The validated MIME type.
|
178
163
|
"""
|
179
|
-
|
164
|
+
if file_path and check_file_exists:
|
165
|
+
path = Path(file_path)
|
166
|
+
if not path.exists():
|
167
|
+
raise ValidationError("The file does not exist", context={"file_path": str(path)})
|
180
168
|
|
181
169
|
if not mime_type:
|
182
|
-
|
170
|
+
if not file_path:
|
171
|
+
raise ValidationError(
|
172
|
+
"Could not determine mime type.",
|
173
|
+
)
|
174
|
+
path = Path(file_path)
|
175
|
+
|
183
176
|
ext = path.suffix.lower()
|
184
177
|
mime_type = EXT_TO_MIME_TYPE.get(ext) or guess_type(path.name)[0]
|
185
178
|
|
kreuzberg/_playa.py
ADDED
@@ -0,0 +1,276 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from datetime import datetime
|
4
|
+
from typing import TYPE_CHECKING, Any, cast
|
5
|
+
|
6
|
+
from playa import asobj, parse
|
7
|
+
from playa.utils import decode_text
|
8
|
+
|
9
|
+
from kreuzberg.exceptions import ParsingError
|
10
|
+
|
11
|
+
if TYPE_CHECKING:
|
12
|
+
from playa.document import Document
|
13
|
+
|
14
|
+
from kreuzberg._types import Metadata
|
15
|
+
|
16
|
+
|
17
|
+
GRAY_COMPONENTS = 1
|
18
|
+
RGB_COMPONENTS = 3
|
19
|
+
CMYK_COMPONENTS = 4
|
20
|
+
UTF16BE_BOM = b"\xfe\xff"
|
21
|
+
UTF16BE_ENCODING = "utf-16be"
|
22
|
+
MIN_DATE_LENGTH = 8
|
23
|
+
FULL_DATE_LENGTH = 14
|
24
|
+
BOM_CHAR = "\ufeff"
|
25
|
+
|
26
|
+
|
27
|
+
async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
    """Extract metadata from a PDF document.

    Args:
        pdf_content: The bytes of the PDF document.

    Raises:
        ParsingError: If the PDF metadata could not be extracted.

    Returns:
        A dictionary of metadata extracted from the PDF.
    """
    try:
        document = parse(pdf_content, max_workers=1)
        metadata: Metadata = {}

        # Each helper reads from the (lower-cased) info dict and writes into
        # the shared metadata mapping; earlier values are never overwritten.
        info_extractors = (
            _extract_basic_metadata,
            _extract_author_metadata,
            _extract_keyword_metadata,
            _extract_category_metadata,
            _extract_date_metadata,
            _extract_creator_metadata,
        )
        for raw_info in document.info:
            normalized_info = {key.lower(): value for key, value in asobj(raw_info).items()}
            for extractor in info_extractors:
                extractor(normalized_info, metadata)

        if document.pages:
            _extract_document_dimensions(document, metadata)

        if document.outline and "description" not in metadata:
            metadata["description"] = _generate_outline_description(document)

        if "summary" not in metadata:
            metadata["summary"] = _generate_document_summary(document)

        _extract_structure_information(document, metadata)
    except Exception as e:
        raise ParsingError(f"Failed to extract PDF metadata: {e!s}") from e
    return metadata
66
|
+
|
67
|
+
|
68
|
+
def _extract_basic_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Copy simple single-valued info-dictionary fields into *result*.

    Existing entries in *result* are never overwritten, and empty/falsy
    source values are ignored.
    """
    # Publisher keeps the original two-step lookup: a present "Publisher"
    # key takes precedence over the lowercase "publisher" key.
    if "publisher" not in result and (publisher := pdf_info.get("Publisher", pdf_info.get("publisher"))):
        result["publisher"] = decode_text(publisher)

    # target result key -> candidate info keys; the first truthy value wins.
    field_sources: dict[str, tuple[str, ...]] = {
        "title": ("title",),
        "subject": ("subject",),
        "copyright": ("copyright", "rights"),
        "comments": ("comments",),
        "identifier": ("identifier", "id"),
        "license": ("license",),
        "modified_by": ("modifiedby", "last_modified_by"),
        "version": ("version",),
    }
    for target, candidates in field_sources.items():
        if target in result:
            continue
        for key in candidates:
            if value := pdf_info.get(key):
                result[target] = decode_text(value)
                break
95
|
+
|
96
|
+
|
97
|
+
def _extract_author_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the PDF ``author`` field into a list of author names."""
    raw = pdf_info.get("author")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        # Normalize "A and B" into comma form, then split on ";" and ",".
        text = decode_text(raw).replace(" and ", ", ")
        names: list[str] = []
        for segment in text.split(";"):
            names += [name.strip() for name in segment.split(",") if name.strip()]
        result["authors"] = names
    elif isinstance(raw, list):
        result["authors"] = [decode_text(item) for item in raw]
111
|
+
|
112
|
+
|
113
|
+
def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the PDF ``keywords`` field into a list of keywords."""
    raw = pdf_info.get("keywords")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        text = decode_text(raw)
        # Split first on commas, then re-split the joined result on
        # semicolons, mirroring both common keyword separators.
        comma_parts = [part.strip() for part in text.split(",")]
        semi_parts = [part.strip() for part in " ".join(comma_parts).split(";")]
        result["keywords"] = [part for part in semi_parts if part]
    elif isinstance(raw, list):
        result["keywords"] = [decode_text(item) for item in raw]
122
|
+
|
123
|
+
|
124
|
+
def _extract_category_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Parse the ``categories``/``category`` info field into a list."""
    raw = pdf_info.get("categories") or pdf_info.get("category")
    if not raw:
        return

    if isinstance(raw, (str, bytes)):
        decoded = decode_text(raw)
        result["categories"] = [part.strip() for part in decoded.split(",") if part.strip()]
    elif isinstance(raw, list):
        result["categories"] = [decode_text(item) for item in raw]
132
|
+
|
133
|
+
|
134
|
+
def _parse_date_string(date_str: str) -> str:
    """Convert a PDF date string (``D:YYYYMMDDHHmmSS``) to an ISO-8601 string.

    Args:
        date_str: The raw PDF date value, optionally prefixed with ``D:``.

    Raises:
        ValueError: If the digits do not form a valid calendar date/time
            (callers catch this and fall back to the raw value).

    Returns:
        The ISO-8601 formatted timestamp, or the input unchanged when it is
        too short to contain a date.
    """
    date_str = date_str.removeprefix("D:")
    if len(date_str) >= MIN_DATE_LENGTH:
        year = date_str[0:4]
        month = date_str[4:6]
        day = date_str[6:8]
        time_part = ""
        if len(date_str) >= FULL_DATE_LENGTH:
            hour = date_str[8:10]
            minute = date_str[10:12]
            second = date_str[12:14]
            time_part = f"T{hour}:{minute}:{second}"
        # BUG FIX: the original parsed the dash/colon-formatted string with
        # "%Y%m%d%H%M%S", which can never match the string built above, so
        # every call raised ValueError and callers always fell back to the
        # raw PDF date.  Use a format matching what we actually construct.
        fmt = "%Y-%m-%dT%H:%M:%S" if time_part else "%Y-%m-%d"
        return datetime.strptime(f"{year}-{month}-{day}{time_part}", fmt).isoformat()  # noqa: DTZ007
    return date_str
148
|
+
|
149
|
+
|
150
|
+
def _extract_date_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Store creation and modification timestamps, ISO-formatted when possible.

    Falls back to the raw decoded value when the PDF date cannot be parsed.
    """
    for target, primary_key, fallback_key in (
        ("created_at", "creationdate", "createdate"),
        ("modified_at", "moddate", "modificationdate"),
    ):
        raw = pdf_info.get(primary_key) or pdf_info.get(fallback_key)
        if not raw:
            continue
        decoded = decode_text(raw)
        try:
            result[target] = _parse_date_string(decoded)
        except (ValueError, IndexError):
            result[target] = decoded
164
|
+
|
165
|
+
|
166
|
+
def _extract_creator_metadata(pdf_info: dict[str, Any], result: Metadata) -> None:
    """Record creator/producer info under the ``created_by`` key."""
    if creator := pdf_info.get("creator"):
        result["created_by"] = decode_text(creator)

    producer = pdf_info.get("producer")
    if not producer:
        return
    producer_text = decode_text(producer)
    if "created_by" not in result:
        result["created_by"] = producer_text
    elif producer_text not in result["created_by"]:
        # Append the producer only when it isn't already part of the value.
        result["created_by"] = f"{result['created_by']} (Producer: {producer_text})"
176
|
+
|
177
|
+
|
178
|
+
def _extract_document_dimensions(document: Document, result: Metadata) -> None:
|
179
|
+
first_page = document.pages[0]
|
180
|
+
if hasattr(first_page, "width") and hasattr(first_page, "height"):
|
181
|
+
result["width"] = int(first_page.width)
|
182
|
+
result["height"] = int(first_page.height)
|
183
|
+
|
184
|
+
|
185
|
+
def _format_outline(entries: list[Any], level: int = 0) -> list[str]:
|
186
|
+
outline_text: list[str] = []
|
187
|
+
for entry in entries:
|
188
|
+
if hasattr(entry, "title") and entry.title:
|
189
|
+
indent = " " * level
|
190
|
+
outline_text.append(f"{indent}- {entry.title}")
|
191
|
+
if hasattr(entry, "children") and entry.children:
|
192
|
+
_format_outline(entry.children, level + 1)
|
193
|
+
|
194
|
+
return outline_text
|
195
|
+
|
196
|
+
|
197
|
+
def _generate_outline_description(document: Document) -> str:
    """Render the document outline as a "Table of Contents" block, or ""."""
    lines = _format_outline(cast("list[Any]", document.outline))
    if not lines:
        return ""
    return "Table of Contents:\n" + "\n".join(lines)
201
|
+
|
202
|
+
|
203
|
+
def _generate_document_summary(document: Document) -> str:
    """Build a short prose summary of the document's basic properties."""
    parts: list[str] = []

    n_pages = len(document.pages)
    suffix = "" if n_pages == 1 else "s"
    parts.append(f"PDF document with {n_pages} page{suffix}.")

    if hasattr(document, "pdf_version"):
        parts.append(f"PDF version {document.pdf_version}.")

    if getattr(document, "is_encrypted", False):
        parts.append("Document is encrypted.")

    if getattr(document, "encryption_method", None):
        parts.append(f"Encryption: {document.encryption_method}.")

    if granted := _collect_document_permissions(document):
        parts.append(f"Document is {', '.join(granted)}.")

    if getattr(document, "status", None):
        parts.append(f"Status: {decode_text(document.status)}.")

    if getattr(document, "is_pdf_a", False):
        level = getattr(document, "pdf_a_level", None)
        parts.append(f"PDF/A-{level} compliant." if level else "PDF/A compliant.")

    return " ".join(parts)
233
|
+
|
234
|
+
|
235
|
+
def _collect_document_permissions(document: Document) -> list[str]:
|
236
|
+
permissions = []
|
237
|
+
if document.is_printable:
|
238
|
+
permissions.append("printable")
|
239
|
+
if document.is_modifiable:
|
240
|
+
permissions.append("modifiable")
|
241
|
+
if document.is_extractable:
|
242
|
+
permissions.append("extractable")
|
243
|
+
return permissions
|
244
|
+
|
245
|
+
|
246
|
+
def _extract_structure_information(document: Document, result: Metadata) -> None:
|
247
|
+
"""Extract language and subtitle from document structure."""
|
248
|
+
if document.structure:
|
249
|
+
languages = set()
|
250
|
+
subtitle = None
|
251
|
+
|
252
|
+
def extract_languages(elements: list[Any]) -> None:
|
253
|
+
nonlocal subtitle
|
254
|
+
for element in elements:
|
255
|
+
if hasattr(element, "language") and element.language:
|
256
|
+
languages.add(element.language.lower())
|
257
|
+
|
258
|
+
if (
|
259
|
+
subtitle is None
|
260
|
+
and hasattr(element, "role")
|
261
|
+
and element.role == "H1"
|
262
|
+
and hasattr(element, "text")
|
263
|
+
and element.text
|
264
|
+
):
|
265
|
+
subtitle = decode_text(element.text)
|
266
|
+
|
267
|
+
if hasattr(element, "children") and element.children:
|
268
|
+
extract_languages(element.children)
|
269
|
+
|
270
|
+
extract_languages(cast("list[Any]", document.structure))
|
271
|
+
|
272
|
+
if languages:
|
273
|
+
result["languages"] = list(languages)
|
274
|
+
|
275
|
+
if subtitle and "title" in result and subtitle != result["title"]:
|
276
|
+
result["subtitle"] = subtitle
|
kreuzberg/_registry.py
ADDED
@@ -0,0 +1,108 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
from functools import lru_cache
|
4
|
+
from typing import TYPE_CHECKING, ClassVar
|
5
|
+
|
6
|
+
from kreuzberg._extractors._html import HTMLExtractor
|
7
|
+
from kreuzberg._extractors._image import ImageExtractor
|
8
|
+
from kreuzberg._extractors._pandoc import (
|
9
|
+
BibliographyExtractor,
|
10
|
+
EbookExtractor,
|
11
|
+
LaTeXExtractor,
|
12
|
+
MarkdownExtractor,
|
13
|
+
MiscFormatExtractor,
|
14
|
+
OfficeDocumentExtractor,
|
15
|
+
StructuredTextExtractor,
|
16
|
+
TabularDataExtractor,
|
17
|
+
XMLBasedExtractor,
|
18
|
+
)
|
19
|
+
from kreuzberg._extractors._pdf import PDFExtractor
|
20
|
+
from kreuzberg._extractors._presentation import PresentationExtractor
|
21
|
+
from kreuzberg._extractors._spread_sheet import SpreadSheetExtractor
|
22
|
+
|
23
|
+
if TYPE_CHECKING:
|
24
|
+
from kreuzberg._extractors._base import Extractor
|
25
|
+
from kreuzberg._types import ExtractionConfig
|
26
|
+
|
27
|
+
|
28
|
+
class ExtractorRegistry:
    """Manages extractors for different MIME types and their configurations.

    This class provides functionality to register, unregister, and retrieve
    extractors based on MIME types. It supports both synchronous and asynchronous
    operations for managing extractors. A default set of extractors is also
    maintained alongside user-registered extractors.
    """

    # Built-in extractors, tried after any user-registered ones.  Order
    # matters: the first extractor whose supports_mimetype() returns True wins.
    _default_extractors: ClassVar[list[type[Extractor]]] = [
        PDFExtractor,
        OfficeDocumentExtractor,
        PresentationExtractor,
        SpreadSheetExtractor,
        HTMLExtractor,
        MarkdownExtractor,
        ImageExtractor,
        BibliographyExtractor,
        EbookExtractor,
        LaTeXExtractor,
        MiscFormatExtractor,
        StructuredTextExtractor,
        TabularDataExtractor,
        XMLBasedExtractor,
    ]
    # User-registered extractors; these take precedence over the defaults.
    _registered_extractors: ClassVar[list[type[Extractor]]] = []

    @classmethod
    # NOTE(review): stacking @classmethod on top of @lru_cache relies on
    # classmethod wrapping another descriptor, which is deprecated in
    # Python 3.11 and removed in 3.13 — confirm the supported Python range.
    # The cache is also unbounded and keyed on (cls, mime_type, config), so
    # `config` must be hashable and each cached config object stays alive
    # until the cache is cleared.
    @lru_cache
    def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
        """Gets the extractor for the mimetype.

        Args:
            mime_type: The mime type of the content.
            config: Extraction options object, defaults to the default object.

        Returns:
            The extractor, or None when no extractor supports the mime type
            (or when mime_type itself is None/empty).
        """
        # Registered extractors come first so user additions can override
        # the built-in defaults for the same mime type.
        extractors: list[type[Extractor]] = [
            *cls._registered_extractors,
            *cls._default_extractors,
        ]
        if mime_type:
            for extractor in extractors:
                if extractor.supports_mimetype(mime_type):
                    return extractor(mime_type=mime_type, config=config)

        return None

    @classmethod
    def add_extractor(cls, extractor: type[Extractor]) -> None:
        """Add an extractor to the registry.

        Note:
            Extractors are tried in the order they are added: first added, first tried.

        Args:
            extractor: The extractor to add.

        Returns:
            None
        """
        cls._registered_extractors.append(extractor)
        # The registry changed, so previously cached lookups are stale.
        cls.get_extractor.cache_clear()

    @classmethod
    def remove_extractor(cls, extractor: type[Extractor]) -> None:
        """Remove an extractor from the registry.

        Args:
            extractor: The extractor to remove.

        Returns:
            None
        """
        try:
            cls._registered_extractors.remove(extractor)
            # Invalidate cached lookups that may reference the removed extractor.
            cls.get_extractor.cache_clear()
        except ValueError:
            # Removing an extractor that was never registered is a no-op.
            pass