kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_playa.py
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
-
from datetime import datetime
|
3
|
+
from datetime import datetime, timezone
|
4
4
|
from typing import TYPE_CHECKING, Any, cast
|
5
5
|
|
6
6
|
from playa import asobj, parse
|
@@ -25,18 +25,6 @@ BOM_CHAR = "\ufeff"
|
|
25
25
|
|
26
26
|
|
27
27
|
async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
|
28
|
-
"""Extract metadata from a PDF document.
|
29
|
-
|
30
|
-
Args:
|
31
|
-
pdf_content: The bytes of the PDF document.
|
32
|
-
password: Password for encrypted PDF files.
|
33
|
-
|
34
|
-
Raises:
|
35
|
-
ParsingError: If the PDF metadata could not be extracted.
|
36
|
-
|
37
|
-
Returns:
|
38
|
-
A dictionary of metadata extracted from the PDF.
|
39
|
-
"""
|
40
28
|
try:
|
41
29
|
document = parse(pdf_content, max_workers=1, password=password)
|
42
30
|
metadata: Metadata = {}
|
@@ -115,7 +103,6 @@ def _extract_keyword_metadata(pdf_info: dict[str, Any], result: Metadata) -> Non
|
|
115
103
|
if keywords := pdf_info.get("keywords"):
|
116
104
|
if isinstance(keywords, (str, bytes)):
|
117
105
|
kw_str = decode_text(keywords)
|
118
|
-
# Combine multiple operations into a single comprehension
|
119
106
|
result["keywords"] = [k.strip() for part in kw_str.replace(";", ",").split(",") if (k := part.strip())]
|
120
107
|
elif isinstance(keywords, list):
|
121
108
|
result["keywords"] = [decode_text(k) for k in keywords]
|
@@ -144,8 +131,10 @@ def _parse_date_string(date_str: str) -> str:
|
|
144
131
|
second = date_str[12:14]
|
145
132
|
time_part = f"T{hour}:{minute}:{second}"
|
146
133
|
if time_part:
|
147
|
-
|
148
|
-
|
134
|
+
dt = datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").replace(tzinfo=timezone.utc)
|
135
|
+
return dt.isoformat()
|
136
|
+
dt = datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").replace(tzinfo=timezone.utc)
|
137
|
+
return dt.isoformat()
|
149
138
|
return date_str
|
150
139
|
|
151
140
|
|
@@ -246,7 +235,6 @@ def _collect_document_permissions(document: Document) -> list[str]:
|
|
246
235
|
|
247
236
|
|
248
237
|
def _extract_structure_information(document: Document, result: Metadata) -> None:
|
249
|
-
"""Extract language and subtitle from document structure."""
|
250
238
|
if document.structure:
|
251
239
|
languages = set()
|
252
240
|
subtitle = None
|
@@ -279,20 +267,6 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
|
|
279
267
|
|
280
268
|
|
281
269
|
def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
|
282
|
-
"""Synchronous version of extract_pdf_metadata.
|
283
|
-
|
284
|
-
Extract metadata from a PDF document without using async/await.
|
285
|
-
|
286
|
-
Args:
|
287
|
-
pdf_content: The bytes of the PDF document.
|
288
|
-
password: Password for encrypted PDF files.
|
289
|
-
|
290
|
-
Raises:
|
291
|
-
ParsingError: If the PDF metadata could not be extracted.
|
292
|
-
|
293
|
-
Returns:
|
294
|
-
A dictionary of metadata extracted from the PDF.
|
295
|
-
"""
|
296
270
|
try:
|
297
271
|
document = parse(pdf_content, max_workers=1, password=password)
|
298
272
|
metadata: Metadata = {}
|
kreuzberg/_registry.py
CHANGED
@@ -28,14 +28,6 @@ if TYPE_CHECKING:
|
|
28
28
|
|
29
29
|
|
30
30
|
class ExtractorRegistry:
|
31
|
-
"""Manages extractors for different MIME types and their configurations.
|
32
|
-
|
33
|
-
This class provides functionality to register, unregister, and retrieve
|
34
|
-
extractors based on MIME types. It supports both synchronous and asynchronous
|
35
|
-
operations for managing extractors. A default set of extractors is also
|
36
|
-
maintained alongside user-registered extractors.
|
37
|
-
"""
|
38
|
-
|
39
31
|
_default_extractors: ClassVar[list[type[Extractor]]] = [
|
40
32
|
PDFExtractor,
|
41
33
|
OfficeDocumentExtractor,
|
@@ -59,15 +51,6 @@ class ExtractorRegistry:
|
|
59
51
|
@classmethod
|
60
52
|
@lru_cache
|
61
53
|
def get_extractor(cls, mime_type: str | None, config: ExtractionConfig) -> Extractor | None:
|
62
|
-
"""Gets the extractor for the mimetype.
|
63
|
-
|
64
|
-
Args:
|
65
|
-
mime_type: The mime type of the content.
|
66
|
-
config: Extraction options object, defaults to the default object.
|
67
|
-
|
68
|
-
Returns:
|
69
|
-
The extractor
|
70
|
-
"""
|
71
54
|
extractors: list[type[Extractor]] = [
|
72
55
|
*cls._registered_extractors,
|
73
56
|
*cls._default_extractors,
|
@@ -81,30 +64,11 @@ class ExtractorRegistry:
|
|
81
64
|
|
82
65
|
@classmethod
|
83
66
|
def add_extractor(cls, extractor: type[Extractor]) -> None:
|
84
|
-
"""Add an extractor to the registry.
|
85
|
-
|
86
|
-
Note:
|
87
|
-
Extractors are tried in the order they are added: first added, first tried.
|
88
|
-
|
89
|
-
Args:
|
90
|
-
extractor: The extractor to add.
|
91
|
-
|
92
|
-
Returns:
|
93
|
-
None
|
94
|
-
"""
|
95
67
|
cls._registered_extractors.append(extractor)
|
96
68
|
cls.get_extractor.cache_clear()
|
97
69
|
|
98
70
|
@classmethod
|
99
71
|
def remove_extractor(cls, extractor: type[Extractor]) -> None:
|
100
|
-
"""Remove an extractor from the registry.
|
101
|
-
|
102
|
-
Args:
|
103
|
-
extractor: The extractor to remove.
|
104
|
-
|
105
|
-
Returns:
|
106
|
-
None
|
107
|
-
"""
|
108
72
|
try:
|
109
73
|
cls._registered_extractors.remove(extractor)
|
110
74
|
cls.get_extractor.cache_clear()
|