kreuzberg 3.15.0__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +4 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +11 -1
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pdf.py +2 -3
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +0 -21
- kreuzberg/_ocr/_easyocr.py +51 -19
- kreuzberg/_ocr/_tesseract.py +14 -3
- kreuzberg/_types.py +111 -40
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/extraction.py +2 -2
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +12 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/RECORD +24 -23
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_mcp/server.py
CHANGED
@@ -22,7 +22,6 @@ from kreuzberg.extraction import (
 
 mcp = FastMCP("Kreuzberg Text Extraction")
 
-# Security and performance limits
 MAX_BATCH_SIZE = 100
 
 
@@ -46,7 +45,6 @@ def _validate_file_path(file_path: str) -> Path:
             context={"file_path": file_path, "error": str(e)},
         ) from e
 
-    # Check for path traversal attempts
    if ".." in file_path and not file_path.startswith("/"):
        raise ValidationError(
            "Path traversal detected in file path",
@@ -73,7 +71,6 @@ def _validate_file_path_with_context(file_path: str, index: int, total: int) ->
     try:
         return _validate_file_path(file_path)
     except ValidationError as e:
-        # Add context about which file in the batch failed
         e.context = e.context or {}
         e.context["batch_index"] = index
         e.context["total_files"] = total
@@ -99,7 +96,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
             context={"context": context_info},
         )
 
-    # Check for whitespace-only content
     if not content_base64.strip():
         raise ValidationError(
             "Base64 content cannot be whitespace only",
@@ -126,7 +122,6 @@ def _validate_base64_content(content_base64: str, context_info: str | None = Non
 def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     base_config = discover_config()
 
-    # Extract Tesseract-specific parameters from kwargs first
     tesseract_lang = kwargs.pop("tesseract_lang", None)
     tesseract_psm = kwargs.pop("tesseract_psm", None)
     tesseract_output_format = kwargs.pop("tesseract_output_format", None)
@@ -151,7 +146,6 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
     }
     config_dict = config_dict | kwargs
 
-    # Handle Tesseract OCR configuration
     ocr_backend = config_dict.get("ocr_backend")
     if ocr_backend == "tesseract" and (
         tesseract_lang or tesseract_psm is not None or tesseract_output_format or enable_table_detection
@@ -174,10 +168,8 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
             tesseract_config_dict["enable_table_detection"] = True
 
         if tesseract_config_dict:
-            # Merge with existing tesseract config if present
             existing_ocr_config = config_dict.get("ocr_config")
             if existing_ocr_config and isinstance(existing_ocr_config, TesseractConfig):
-                # Convert existing config to dict, merge, and recreate
                 existing_dict = existing_ocr_config.to_dict()
                 merged_dict = existing_dict | tesseract_config_dict
                 config_dict["ocr_config"] = TesseractConfig(**merged_dict)
@@ -206,7 +198,6 @@ def extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> dict[str, Any]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         force_ocr=force_ocr,
@@ -289,7 +280,6 @@ def batch_extract_document(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate batch size
     if len(file_paths) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -302,7 +292,6 @@ def batch_extract_document(  # noqa: PLR0913
             context={"file_paths": file_paths},
         )
 
-    # Validate all file paths for security
     validated_paths = []
     for i, file_path in enumerate(file_paths):
         validated_path = _validate_file_path_with_context(file_path, i, len(file_paths))
@@ -346,7 +335,6 @@ def batch_extract_bytes(  # noqa: PLR0913
     tesseract_output_format: str | None = None,
     enable_table_detection: bool | None = None,
 ) -> list[dict[str, Any]]:
-    # Validate input
     if not content_items:
         raise ValidationError("content_items cannot be empty", context={"content_items": content_items})
 
@@ -355,7 +343,6 @@ def batch_extract_bytes(  # noqa: PLR0913
             "content_items must be a list", context={"content_items_type": type(content_items).__name__}
         )
 
-    # Validate batch size
     if len(content_items) > MAX_BATCH_SIZE:
         raise ValidationError(
             f"Batch size exceeds maximum limit of {MAX_BATCH_SIZE}",
@@ -379,17 +366,14 @@ def batch_extract_bytes(  # noqa: PLR0913
         enable_table_detection=enable_table_detection,
     )
 
-    # Convert list of dicts to list of tuples (bytes, mime_type)
     contents = []
     for i, item in enumerate(content_items):
-        # Validate item structure
         if not isinstance(item, dict):
             raise ValidationError(
                 f"Item at index {i} must be a dictionary",
                 context={"item_index": i, "item_type": type(item).__name__, "item": item},
             )
 
-        # Check for required keys
         if "content_base64" not in item:
             raise ValidationError(
                 f"Item at index {i} is missing required key 'content_base64'",
@@ -405,11 +389,9 @@ def batch_extract_bytes(  # noqa: PLR0913
         content_base64 = item["content_base64"]
         mime_type = item["mime_type"]
 
-        # Validate base64 content
         try:
             content_bytes = _validate_base64_content(content_base64, f"batch_extract_bytes item {i}")
         except ValidationError as e:
-            # Add batch-specific context
             e.context = e.context or {}
             e.context["item_index"] = i
             e.context["total_items"] = len(content_items)
@@ -426,7 +408,6 @@ def extract_simple(
     file_path: str,
     mime_type: str | None = None,
 ) -> str:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides()
     result = extract_file_sync(str(validated_path), mime_type, config)
@@ -467,7 +448,6 @@ def get_supported_formats() -> str:
 
 @mcp.prompt()
 def extract_and_summarize(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     result = extract_file_sync(str(validated_path), None, _create_config_with_overrides())
 
@@ -481,7 +461,6 @@ def extract_and_summarize(file_path: str) -> list[TextContent]:
 
 @mcp.prompt()
 def extract_structured(file_path: str) -> list[TextContent]:
-    # Validate file path for security
     validated_path = _validate_file_path(file_path)
     config = _create_config_with_overrides(
         extract_entities=True,
kreuzberg/_ocr/_easyocr.py
CHANGED
@@ -33,22 +33,39 @@ except ImportError:  # pragma: no cover
 
 if TYPE_CHECKING:
     import easyocr
-    import numpy as np
     import torch
+else:
+    easyocr: Any = None
+    torch: Any = None
+
+HAS_EASYOCR: bool = False
+
+
+def _import_easyocr() -> tuple[Any, Any]:
+    global HAS_EASYOCR, easyocr, torch
+
+    # If easyocr is already set (either real module or mock), return it
+    if easyocr is not None:
+        return easyocr, torch
+
+    # If explicitly disabled for testing
+    if not HAS_EASYOCR and easyocr is None:
+        return None, None
 
-HAS_EASYOCR: bool
-if not TYPE_CHECKING:
     try:
-        import easyocr
-        import numpy as np
-        import torch
+        import easyocr as _easyocr  # noqa: PLC0415
 
+        try:
+            import torch as _torch  # noqa: PLC0415
+        except ImportError:
+            _torch = None  # type: ignore[assignment]
+
+        easyocr = _easyocr
+        torch = _torch
         HAS_EASYOCR = True
+        return easyocr, torch
     except ImportError:
-
-        easyocr: Any = None
-        np: Any = None
-        torch: Any = None
+        return None, None
 
 
 EASYOCR_SUPPORTED_LANGUAGE_CODES: Final[set[str]] = {
@@ -142,6 +159,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
     _reader: ClassVar[Any] = None
 
     async def process_image(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -292,7 +314,8 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
     @classmethod
     def _is_gpu_available(cls) -> bool:
-        if torch is None:
+        # Use the module-level torch variable directly to respect patches
+        if torch is None:
             return False
         return bool(torch.cuda.is_available())
 
@@ -301,13 +324,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-        if not HAS_EASYOCR:
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -318,7 +343,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
 
         try:
             cls._reader = await run_sync(
-                easyocr.Reader,
+                easyocr_module.Reader,
                 languages,
                 gpu=use_gpu,
                 verbose=False,
@@ -382,6 +407,11 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         return languages
 
     def process_image_sync(self, image: Image.Image, **kwargs: Unpack[EasyOCRConfig]) -> ExtractionResult:
+        try:
+            import numpy as np  # noqa: PLC0415
+        except ImportError as e:
+            raise MissingDependencyError("EasyOCR requires numpy: pip install 'kreuzberg[easyocr]'") from e
+
         use_cache = kwargs.pop("use_cache", True)
 
         cache_kwargs = None
@@ -453,13 +483,15 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         if cls._reader is not None:
             return
 
-        if not HAS_EASYOCR:
+        # Validate language first before attempting import
+        languages = cls._validate_language_code(kwargs.pop("language", "en"))
+
+        easyocr_module, _ = _import_easyocr()
+        if easyocr_module is None:
             raise MissingDependencyError.create_for_package(
                 dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
             )
 
-        languages = cls._validate_language_code(kwargs.pop("language", "en"))
-
         device_info = cls._resolve_device_config(**kwargs)
         use_gpu = device_info.device_type in ("cuda", "mps")
 
@@ -469,7 +501,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
         kwargs.setdefault("recog_network", "standard")
 
         try:
-            cls._reader = easyocr.Reader(
+            cls._reader = easyocr_module.Reader(
                 languages,
                 gpu=use_gpu,
                 verbose=False,
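The net effect of these hunks: `easyocr`, `torch`, and `numpy` are no longer imported at module load. A cached `_import_easyocr()` helper resolves the modules on first use (so tests can patch the module-level names), `numpy` is imported inside the two `process_image*` methods, and reader initialization now validates the language code before attempting the import. A condensed, self-contained restatement of the lazy-import pattern follows; it deliberately omits the `not HAS_EASYOCR` early-exit the real helper uses for test disabling, and is a sketch rather than the shipped code:

```python
from typing import Any

easyocr: Any = None  # module-level cache; tests may patch these names directly
torch: Any = None
HAS_EASYOCR: bool = False


def _import_easyocr() -> tuple[Any, Any]:
    """Resolve easyocr/torch once and cache them at module level."""
    global HAS_EASYOCR, easyocr, torch
    if easyocr is not None:  # already imported, or patched by a test
        return easyocr, torch
    try:
        import easyocr as _easyocr

        try:
            import torch as _torch
        except ImportError:
            _torch = None  # GPU probing will simply report False
        easyocr, torch = _easyocr, _torch
        HAS_EASYOCR = True
        return easyocr, torch
    except ImportError:
        return None, None


module, _ = _import_easyocr()
print("easyocr available:", module is not None)
```

Callers then follow the pattern visible in the reader-initialization hunks: request the module, and raise `MissingDependencyError` only when OCR is actually used.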
kreuzberg/_ocr/_tesseract.py
CHANGED
@@ -29,6 +29,7 @@ from kreuzberg._ocr._base import OCRBackend
 from kreuzberg._ocr._table_extractor import extract_words, reconstruct_table, to_markdown
 from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig, PSMMode, TableData, TesseractConfig
 from kreuzberg._utils._cache import get_ocr_cache
+from kreuzberg._utils._html_streaming import should_use_streaming
 from kreuzberg._utils._process_pool import ProcessPoolManager, get_optimal_worker_count
 from kreuzberg._utils._string import normalize_spaces
 from kreuzberg._utils._sync import run_sync
@@ -510,7 +511,7 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             escape_asterisks=False,
             escape_underscores=False,
             extract_metadata=False,
-            strip="meta title",
+            strip=["meta", "title"],
         )
 
         tables: list[TableData] = []
@@ -532,6 +533,10 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
         config_dict = config.to_dict()
         config_dict["custom_converters"] = all_converters
 
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         try:
             markdown_content = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
             markdown_content = normalize_spaces(markdown_content)
@@ -676,12 +681,18 @@ class TesseractBackend(OCRBackend[TesseractConfig]):
             escape_asterisks=False,
             escape_underscores=False,
             extract_metadata=False,
-            strip="meta title",
+            strip=["meta", "title"],
         )
 
+        config_dict = html_config.to_dict()
+
+        use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
+        config_dict["stream_processing"] = use_streaming
+        config_dict["chunk_size"] = chunk_size
+
         markdown_content = html_to_markdown.convert_to_markdown(
             hocr_content,
-            **html_config.to_dict(),
+            **config_dict,
         )
 
         markdown_content = normalize_spaces(markdown_content)
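Two changes here: `strip` now receives a list of tag names instead of the old space-delimited string, and streaming is decided per call from the hOCR payload size via the new `should_use_streaming` helper (shown in full further down). A sketch of the resulting call path; the HTML input is a stand-in for Tesseract's hOCR output, and the option names are the ones visible in the hunks:

```python
import html_to_markdown

from kreuzberg._utils._html_streaming import should_use_streaming

hocr_content = "<html><body><p>hello world</p></body></html>"  # stand-in for real hOCR

config_dict = {
    "escape_asterisks": False,
    "escape_underscores": False,
    "extract_metadata": False,
    "strip": ["meta", "title"],  # a list of tags now, not "meta title"
}

# Streaming kicks in automatically for large payloads (see _html_streaming.py below)
use_streaming, chunk_size = should_use_streaming(len(hocr_content.encode()))
config_dict["stream_processing"] = use_streaming
config_dict["chunk_size"] = chunk_size

markdown = html_to_markdown.convert_to_markdown(hocr_content, **config_dict)
print(markdown)
```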
kreuzberg/_types.py
CHANGED
@@ -1,7 +1,7 @@
 from __future__ import annotations
 
 import sys
-from collections.abc import Awaitable, Callable,
+from collections.abc import Awaitable, Callable, Mapping
 from dataclasses import asdict, dataclass, field
 from enum import Enum
 from pathlib import Path
@@ -591,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
 
 
 class Metadata(TypedDict, total=False):
+    abstract: NotRequired[str]
+    """Document abstract or summary."""
     authors: NotRequired[list[str]]
     """List of document authors."""
     categories: NotRequired[list[str]]
@@ -677,9 +679,26 @@ class Metadata(TypedDict, total=False):
     """Error message if extraction failed."""
     error_context: NotRequired[dict[str, Any]]
     """Error context information for debugging."""
+    json_schema: NotRequired[dict[str, Any]]
+    """JSON schema information extracted from structured data."""
+    notes: NotRequired[list[str]]
+    """Notes or additional information extracted from documents."""
+    note: NotRequired[str]
+    """Single note or annotation."""
+    name: NotRequired[str]
+    """Name field from structured data."""
+    body: NotRequired[str]
+    """Body text content."""
+    text: NotRequired[str]
+    """Generic text content."""
+    message: NotRequired[str]
+    """Message or communication content."""
+    attributes: NotRequired[dict[str, Any]]
+    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
 
 
 _VALID_METADATA_KEYS = {
+    "abstract",
     "authors",
     "categories",
     "citations",
@@ -722,6 +741,14 @@ _VALID_METADATA_KEYS = {
     "source_format",
     "error",
     "error_context",
+    "json_schema",
+    "notes",
+    "note",
+    "name",
+    "body",
+    "text",
+    "message",
+    "attributes",
 }
 
 
@@ -730,9 +757,29 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
         return {}
 
     normalized: Metadata = {}
+    attributes: dict[str, Any] = {}
+
     for key, value in data.items():
-        if key in _VALID_METADATA_KEYS and value is not None:
-            normalized[key] = value  # type: ignore[literal-required]
+        if value is not None:
+            if key in _VALID_METADATA_KEYS:
+                normalized[key] = value  # type: ignore[literal-required]
+            elif "." in key and key.split(".")[-1] in {
+                "title",
+                "name",
+                "subject",
+                "description",
+                "content",
+                "body",
+                "text",
+                "message",
+                "note",
+                "abstract",
+                "summary",
+            }:
+                attributes[key] = value
+
+    if attributes:
+        normalized["attributes"] = attributes
 
     return normalized
 
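The rewritten `normalize_metadata` keeps its old behavior for known keys but now also collects dotted keys whose last segment looks like a text field into an `attributes` sub-dict. A sketch of the resulting behavior, with invented inputs:

```python
from kreuzberg._types import normalize_metadata

raw = {
    "authors": ["Ada Lovelace"],  # known key: kept at the top level
    "dc.title": "Annual Report",  # dotted key, recognized suffix "title": routed to "attributes"
    "custom.checksum": "abc123",  # dotted key, unrecognized suffix: dropped
    "subject": None,              # None values are always dropped
}

normalized = normalize_metadata(raw)
# {"authors": ["Ada Lovelace"], "attributes": {"dc.title": "Annual Report"}}
print(normalized)
```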
@@ -835,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
 
 
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class JSONExtractionConfig(ConfigDict):
+    extract_schema: bool = False
+    """Extract and include JSON schema information in metadata."""
+    custom_text_field_patterns: frozenset[str] | None = None
+    """Custom patterns to identify text fields beyond default keywords."""
+    max_depth: int = 10
+    """Maximum nesting depth to process in JSON structures."""
+    array_item_limit: int = 1000
+    """Maximum number of array items to process to prevent memory issues."""
+    include_type_info: bool = False
+    """Include data type information in extracted content."""
+    flatten_nested_objects: bool = True
+    """Flatten nested objects using dot notation for better text extraction."""
+
+    def __post_init__(self) -> None:
+        if self.max_depth <= 0:
+            raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
+        if self.array_item_limit <= 0:
+            raise ValidationError(
+                "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
+            )
+
+
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class ExtractionConfig(ConfigDict):
     force_ocr: bool = False
@@ -924,6 +995,8 @@ class ExtractionConfig(ConfigDict):
     """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
     html_to_markdown_config: HTMLToMarkdownConfig | None = None
     """Configuration for HTML to Markdown conversion. If None, uses default settings."""
+    json_config: JSONExtractionConfig | None = None
+    """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
     use_cache: bool = True
     """Whether to use caching for extraction results. Set to False to disable all caching."""
     target_dpi: int = 150
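`ExtractionConfig` gains a `json_config` slot for the new frozen `JSONExtractionConfig`, whose `__post_init__` rejects non-positive limits. A usage sketch, assuming both classes are importable from `kreuzberg._types` where this hunk defines them:

```python
from kreuzberg._types import ExtractionConfig, JSONExtractionConfig

config = ExtractionConfig(
    json_config=JSONExtractionConfig(
        extract_schema=True,          # also surface a "json_schema" metadata entry
        max_depth=5,                  # must be positive; validated in __post_init__
        flatten_nested_objects=True,  # nested objects become dotted keys
    )
)

try:
    JSONExtractionConfig(array_item_limit=0)
except Exception as e:  # kreuzberg raises its ValidationError here
    print(e)  # "array_item_limit must be positive"
```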
@@ -1060,70 +1133,68 @@ class ExtractionConfig(ConfigDict):
 
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class HTMLToMarkdownConfig:
-    stream_processing: bool = False
-    """Enable streaming mode for processing large HTML documents."""
-    chunk_size: int = 1024
-    """Size of chunks when stream_processing is enabled."""
-    chunk_callback: Callable[[str], None] | None = None
-    """Callback function invoked for each chunk during stream processing."""
-    progress_callback: Callable[[int, int], None] | None = None
-    """Callback function for progress updates (current, total)."""
-    parser: str | None = "lxml"
-    """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
     autolinks: bool = True
-    """
+    """Automatically convert valid URLs to Markdown links."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
     bullets: str = "*+-"
     """Characters to use for unordered list bullets."""
     code_language: str = ""
-    """Default language for code blocks."""
+    """Default language identifier for fenced code blocks."""
     code_language_callback: Callable[[Any], str] | None = None
-    """
-    convert:
-    """HTML tags to convert
+    """Function to dynamically determine code block language."""
+    convert: list[str] | None = None
+    """List of HTML tags to convert (None = all supported tags)."""
     convert_as_inline: bool = False
-    """
-    custom_converters: Mapping[
-    """
+    """Treat content as inline elements only."""
+    custom_converters: Mapping[str, Callable[..., str]] | None = None
+    """Mapping of HTML tag names to custom converter functions."""
     default_title: bool = False
-    """Use
+    """Use default titles for elements like links."""
     escape_asterisks: bool = True
-    """Escape
+    """Escape * characters to prevent unintended formatting."""
     escape_misc: bool = True
-    """Escape miscellaneous characters
+    """Escape miscellaneous characters to prevent Markdown conflicts."""
     escape_underscores: bool = True
-    """Escape
+    """Escape _ characters to prevent unintended formatting."""
     extract_metadata: bool = True
-    """Extract metadata
+    """Extract document metadata as comment header."""
     heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
     """Style for markdown headings."""
     highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
     """Style for highlighting text."""
-    keep_inline_images_in:
-    """
+    keep_inline_images_in: list[str] | None = None
+    """Tags where inline images should be preserved."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation to use for lists."""
+    list_indent_width: int = 4
+    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
     newline_style: Literal["spaces", "backslash"] = "spaces"
     """Style for line breaks in markdown."""
-
-    """HTML
+    preprocess_html: bool = False
+    """Enable HTML preprocessing to clean messy HTML."""
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
+    """Preprocessing level for cleaning HTML."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    remove_navigation: bool = True
+    """Remove navigation elements during preprocessing."""
+    strip: list[str] | None = None
+    """List of HTML tags to remove from output."""
     strip_newlines: bool = False
-    """
+    """Remove newlines from HTML input before processing."""
     strong_em_symbol: Literal["*", "_"] = "*"
     """Symbol to use for strong/emphasis formatting."""
     sub_symbol: str = ""
     """Symbol to use for subscript text."""
     sup_symbol: str = ""
     """Symbol to use for superscript text."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """Whitespace handling mode."""
     wrap: bool = False
     """Enable text wrapping."""
     wrap_width: int = 80
-    """Width for text wrapping
-    preprocess_html: bool = True
-    """Enable HTML preprocessing to clean up the input."""
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
-    """Preprocessing level for cleaning HTML."""
-    remove_navigation: bool = True
-    """Remove navigation elements from HTML."""
-    remove_forms: bool = True
-    """Remove form elements from HTML."""
+    """Width for text wrapping."""
 
     def to_dict(self) -> dict[str, Any]:
         result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
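Beyond the reorganized docstrings, note the behavioral bits in this hunk: `stream_processing`, `chunk_size`, the two callbacks, and `parser` are no longer dataclass fields (streaming is now decided per call, as in the Tesseract hunks above); `preprocess_html` flips its default from True to False; `preprocessing_preset` drops from "aggressive" to "standard"; and new knobs (`br_in_tables`, `list_indent_type`, `list_indent_width`, `whitespace_mode`, `strip`) appear. A configuration sketch, with field names taken from the hunk and values purely illustrative:

```python
from kreuzberg._types import ExtractionConfig, HTMLToMarkdownConfig

html_config = HTMLToMarkdownConfig(
    heading_style="atx",
    list_indent_width=2,      # e.g. for Discord/Slack-flavored output
    strip=["meta", "title"],  # list of tags to drop from the output
    preprocess_html=True,     # opt back in; the default is now False
)

config = ExtractionConfig(html_to_markdown_config=html_config)
```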
kreuzberg/_utils/_html_streaming.py
ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+_STREAMING_THRESHOLD_KB = 10
+_LARGE_FILE_THRESHOLD_MB = 1
+_DEFAULT_CHUNK_SIZE = 2048
+_LARGE_FILE_CHUNK_SIZE = 4096
+
+_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
+_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
+
+
+def should_use_streaming(content_size: int) -> tuple[bool, int]:
+    if content_size < 0:
+        return False, _DEFAULT_CHUNK_SIZE
+
+    if content_size > _STREAMING_THRESHOLD_BYTES:
+        if content_size > _LARGE_FILE_THRESHOLD_BYTES:
+            return True, _LARGE_FILE_CHUNK_SIZE
+        return True, _DEFAULT_CHUNK_SIZE
+    return False, _DEFAULT_CHUNK_SIZE
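The thresholds make the policy easy to read off: payloads over 10 KB stream with 2 KB chunks, and payloads over 1 MB stream with 4 KB chunks. A quick worked check against the code above:

```python
from kreuzberg._utils._html_streaming import should_use_streaming

print(should_use_streaming(4 * 1024))         # (False, 2048): under the 10 KB threshold
print(should_use_streaming(200 * 1024))       # (True, 2048): streaming, default chunk size
print(should_use_streaming(2 * 1024 * 1024))  # (True, 4096): large file, bigger chunks
print(should_use_streaming(-1))               # (False, 2048): negative sizes never stream
```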
kreuzberg/_utils/_serialization.py
CHANGED
@@ -1,11 +1,10 @@
 from __future__ import annotations
 
 from dataclasses import is_dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar
 
 import msgspec
 from msgspec import MsgspecError
-from msgspec.msgpack import decode, encode
 
 T = TypeVar("T")
 
@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
     raise TypeError(f"Unsupported type: {type(obj)!r}")
 
 
-def deserialize(value: str | bytes, target_type: type[T]) -> T:
+def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
+    decoder = msgspec.json.decode if json else msgspec.msgpack.decode
+
+    if json:
+        data = value.encode() if isinstance(value, str) else value
+    else:
+        data = value.encode() if isinstance(value, str) else value
+
     try:
-        return
+        return decoder(data, type=target_type, strict=False)
     except MsgspecError as e:
         raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
 
 
-def serialize(value: Any, **kwargs: Any) -> bytes:
+def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
 
+    encoder = msgspec.json.encode if json else msgspec.msgpack.encode
     try:
-        return
+        return encoder(value, enc_hook=encode_hook)
     except (MsgspecError, TypeError) as e:
         raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e