kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +10 -0
- kreuzberg/_api/_config_cache.py +247 -0
- kreuzberg/_api/main.py +74 -45
- kreuzberg/_chunker.py +7 -6
- kreuzberg/_config.py +11 -1
- kreuzberg/_constants.py +2 -0
- kreuzberg/_document_classification.py +5 -7
- kreuzberg/_entity_extraction.py +9 -4
- kreuzberg/_extractors/_base.py +269 -3
- kreuzberg/_extractors/_email.py +101 -27
- kreuzberg/_extractors/_html.py +112 -7
- kreuzberg/_extractors/_image.py +23 -22
- kreuzberg/_extractors/_pandoc.py +106 -75
- kreuzberg/_extractors/_pdf.py +208 -99
- kreuzberg/_extractors/_presentation.py +76 -8
- kreuzberg/_extractors/_spread_sheet.py +24 -30
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +5 -0
- kreuzberg/_mcp/server.py +324 -25
- kreuzberg/_mime_types.py +42 -0
- kreuzberg/_ocr/_easyocr.py +53 -21
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +88 -37
- kreuzberg/_types.py +291 -61
- kreuzberg/_utils/_cache.py +10 -4
- kreuzberg/_utils/_device.py +2 -4
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +12 -39
- kreuzberg/_utils/_process_pool.py +29 -8
- kreuzberg/_utils/_quality.py +7 -2
- kreuzberg/_utils/_resource_managers.py +65 -0
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +39 -10
- kreuzberg/_utils/_tmp.py +37 -1
- kreuzberg/cli.py +34 -20
- kreuzberg/extraction.py +44 -28
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
- kreuzberg-3.16.0.dist-info/RECORD +61 -0
- kreuzberg-3.14.1.dist-info/RECORD +0 -58
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
3
|
import sys
|
4
|
-
from collections.abc import Awaitable, Callable,
|
4
|
+
from collections.abc import Awaitable, Callable, Mapping
|
5
5
|
from dataclasses import asdict, dataclass, field
|
6
6
|
from enum import Enum
|
7
|
+
from pathlib import Path
|
7
8
|
from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
|
8
9
|
|
9
10
|
import msgspec
|
@@ -25,8 +26,6 @@ else: # pragma: no cover
|
|
25
26
|
from typing import NotRequired
|
26
27
|
|
27
28
|
if TYPE_CHECKING:
|
28
|
-
from pathlib import Path
|
29
|
-
|
30
29
|
from PIL.Image import Image
|
31
30
|
from polars import DataFrame
|
32
31
|
|
@@ -165,6 +164,12 @@ class EasyOCRConfig(ConfigDict):
|
|
165
164
|
ycenter_ths: float = 0.5
|
166
165
|
"""Maximum shift in y direction for merging."""
|
167
166
|
|
167
|
+
def __post_init__(self) -> None:
|
168
|
+
if isinstance(self.language, list):
|
169
|
+
object.__setattr__(self, "language", tuple(self.language))
|
170
|
+
if isinstance(self.rotation_info, list):
|
171
|
+
object.__setattr__(self, "rotation_info", tuple(self.rotation_info))
|
172
|
+
|
168
173
|
|
169
174
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
170
175
|
class PaddleOCRConfig(ConfigDict):
|
@@ -349,6 +354,51 @@ class GMFTConfig(ConfigDict):
|
|
349
354
|
"""
|
350
355
|
|
351
356
|
|
357
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
358
|
+
class ImageOCRConfig(ConfigDict):
|
359
|
+
"""Configuration for OCR processing of extracted images."""
|
360
|
+
|
361
|
+
enabled: bool = False
|
362
|
+
"""Whether to perform OCR on extracted images."""
|
363
|
+
backend: OcrBackendType | None = None
|
364
|
+
"""OCR backend for image OCR. Falls back to main ocr_backend when None."""
|
365
|
+
backend_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
|
366
|
+
"""Backend-specific configuration for image OCR."""
|
367
|
+
min_dimensions: tuple[int, int] = (50, 50)
|
368
|
+
"""Minimum (width, height) in pixels for image OCR eligibility."""
|
369
|
+
max_dimensions: tuple[int, int] = (10000, 10000)
|
370
|
+
"""Maximum (width, height) in pixels for image OCR eligibility."""
|
371
|
+
allowed_formats: frozenset[str] = frozenset(
|
372
|
+
{
|
373
|
+
"jpg",
|
374
|
+
"jpeg",
|
375
|
+
"png",
|
376
|
+
"gif",
|
377
|
+
"bmp",
|
378
|
+
"tiff",
|
379
|
+
"tif",
|
380
|
+
"webp",
|
381
|
+
"jp2",
|
382
|
+
"jpx",
|
383
|
+
"jpm",
|
384
|
+
"mj2",
|
385
|
+
"pnm",
|
386
|
+
"pbm",
|
387
|
+
"pgm",
|
388
|
+
"ppm",
|
389
|
+
}
|
390
|
+
)
|
391
|
+
"""Allowed image formats for OCR processing (lowercase, without dot)."""
|
392
|
+
batch_size: int = 4
|
393
|
+
"""Number of images to process in parallel for OCR."""
|
394
|
+
timeout_seconds: int = 30
|
395
|
+
"""Maximum time in seconds for OCR processing per image."""
|
396
|
+
|
397
|
+
def __post_init__(self) -> None:
|
398
|
+
if isinstance(self.allowed_formats, list):
|
399
|
+
object.__setattr__(self, "allowed_formats", frozenset(self.allowed_formats))
|
400
|
+
|
401
|
+
|
352
402
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
353
403
|
class LanguageDetectionConfig(ConfigDict):
|
354
404
|
low_memory: bool = True
|
@@ -391,6 +441,9 @@ class SpacyEntityExtractionConfig(ConfigDict):
|
|
391
441
|
"""Batch size for processing multiple texts."""
|
392
442
|
|
393
443
|
def __post_init__(self) -> None:
|
444
|
+
if isinstance(self.model_cache_dir, Path):
|
445
|
+
object.__setattr__(self, "model_cache_dir", str(self.model_cache_dir))
|
446
|
+
|
394
447
|
if self.language_models is None:
|
395
448
|
object.__setattr__(self, "language_models", self._get_default_language_models())
|
396
449
|
|
@@ -538,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
|
|
538
591
|
|
539
592
|
|
540
593
|
class Metadata(TypedDict, total=False):
|
594
|
+
abstract: NotRequired[str]
|
595
|
+
"""Document abstract or summary."""
|
541
596
|
authors: NotRequired[list[str]]
|
542
597
|
"""List of document authors."""
|
543
598
|
categories: NotRequired[list[str]]
|
@@ -622,9 +677,28 @@ class Metadata(TypedDict, total=False):
|
|
622
677
|
"""Source format of the extracted content."""
|
623
678
|
error: NotRequired[str]
|
624
679
|
"""Error message if extraction failed."""
|
680
|
+
error_context: NotRequired[dict[str, Any]]
|
681
|
+
"""Error context information for debugging."""
|
682
|
+
json_schema: NotRequired[dict[str, Any]]
|
683
|
+
"""JSON schema information extracted from structured data."""
|
684
|
+
notes: NotRequired[list[str]]
|
685
|
+
"""Notes or additional information extracted from documents."""
|
686
|
+
note: NotRequired[str]
|
687
|
+
"""Single note or annotation."""
|
688
|
+
name: NotRequired[str]
|
689
|
+
"""Name field from structured data."""
|
690
|
+
body: NotRequired[str]
|
691
|
+
"""Body text content."""
|
692
|
+
text: NotRequired[str]
|
693
|
+
"""Generic text content."""
|
694
|
+
message: NotRequired[str]
|
695
|
+
"""Message or communication content."""
|
696
|
+
attributes: NotRequired[dict[str, Any]]
|
697
|
+
"""Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
|
625
698
|
|
626
699
|
|
627
700
|
_VALID_METADATA_KEYS = {
|
701
|
+
"abstract",
|
628
702
|
"authors",
|
629
703
|
"categories",
|
630
704
|
"citations",
|
@@ -664,6 +738,17 @@ _VALID_METADATA_KEYS = {
|
|
664
738
|
"tables_summary",
|
665
739
|
"quality_score",
|
666
740
|
"image_preprocessing",
|
741
|
+
"source_format",
|
742
|
+
"error",
|
743
|
+
"error_context",
|
744
|
+
"json_schema",
|
745
|
+
"notes",
|
746
|
+
"note",
|
747
|
+
"name",
|
748
|
+
"body",
|
749
|
+
"text",
|
750
|
+
"message",
|
751
|
+
"attributes",
|
667
752
|
}
|
668
753
|
|
669
754
|
|
@@ -672,14 +757,34 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
|
|
672
757
|
return {}
|
673
758
|
|
674
759
|
normalized: Metadata = {}
|
760
|
+
attributes: dict[str, Any] = {}
|
761
|
+
|
675
762
|
for key, value in data.items():
|
676
|
-
if
|
677
|
-
|
763
|
+
if value is not None:
|
764
|
+
if key in _VALID_METADATA_KEYS:
|
765
|
+
normalized[key] = value # type: ignore[literal-required]
|
766
|
+
elif "." in key and key.split(".")[-1] in {
|
767
|
+
"title",
|
768
|
+
"name",
|
769
|
+
"subject",
|
770
|
+
"description",
|
771
|
+
"content",
|
772
|
+
"body",
|
773
|
+
"text",
|
774
|
+
"message",
|
775
|
+
"note",
|
776
|
+
"abstract",
|
777
|
+
"summary",
|
778
|
+
}:
|
779
|
+
attributes[key] = value
|
780
|
+
|
781
|
+
if attributes:
|
782
|
+
normalized["attributes"] = attributes
|
678
783
|
|
679
784
|
return normalized
|
680
785
|
|
681
786
|
|
682
|
-
@dataclass(frozen=True, slots=True)
|
787
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
683
788
|
class Entity:
|
684
789
|
type: str
|
685
790
|
"""e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
|
@@ -691,18 +796,44 @@ class Entity:
|
|
691
796
|
"""End character offset in the content"""
|
692
797
|
|
693
798
|
|
799
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
800
|
+
class ExtractedImage:
|
801
|
+
data: bytes
|
802
|
+
format: str
|
803
|
+
filename: str | None = None
|
804
|
+
page_number: int | None = None
|
805
|
+
dimensions: tuple[int, int] | None = None
|
806
|
+
colorspace: str | None = None
|
807
|
+
bits_per_component: int | None = None
|
808
|
+
is_mask: bool = False
|
809
|
+
description: str | None = None
|
810
|
+
|
811
|
+
|
812
|
+
@dataclass(slots=True)
|
813
|
+
class ImageOCRResult:
|
814
|
+
image: ExtractedImage
|
815
|
+
ocr_result: ExtractionResult
|
816
|
+
confidence_score: float | None = None
|
817
|
+
processing_time: float | None = None
|
818
|
+
skipped_reason: str | None = None
|
819
|
+
|
820
|
+
|
694
821
|
@dataclass(slots=True)
|
695
822
|
class ExtractionResult:
|
696
823
|
content: str
|
697
824
|
"""The extracted content."""
|
698
825
|
mime_type: str
|
699
826
|
"""The mime type of the extracted content. Is either text/plain or text/markdown."""
|
700
|
-
metadata: Metadata
|
827
|
+
metadata: Metadata = field(default_factory=lambda: Metadata())
|
701
828
|
"""The metadata of the content."""
|
702
829
|
tables: list[TableData] = field(default_factory=list)
|
703
830
|
"""Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
|
704
831
|
chunks: list[str] = field(default_factory=list)
|
705
832
|
"""The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
|
833
|
+
images: list[ExtractedImage] = field(default_factory=list)
|
834
|
+
"""Extracted images. Empty list if 'extract_images' is not enabled."""
|
835
|
+
image_ocr_results: list[ImageOCRResult] = field(default_factory=list)
|
836
|
+
"""OCR results from extracted images. Empty list if disabled or none processed."""
|
706
837
|
entities: list[Entity] | None = None
|
707
838
|
"""Extracted entities, if entity extraction is enabled."""
|
708
839
|
keywords: list[tuple[str, float]] | None = None
|
@@ -751,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
|
|
751
882
|
ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
|
752
883
|
|
753
884
|
|
885
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
886
|
+
class JSONExtractionConfig(ConfigDict):
|
887
|
+
extract_schema: bool = False
|
888
|
+
"""Extract and include JSON schema information in metadata."""
|
889
|
+
custom_text_field_patterns: frozenset[str] | None = None
|
890
|
+
"""Custom patterns to identify text fields beyond default keywords."""
|
891
|
+
max_depth: int = 10
|
892
|
+
"""Maximum nesting depth to process in JSON structures."""
|
893
|
+
array_item_limit: int = 1000
|
894
|
+
"""Maximum number of array items to process to prevent memory issues."""
|
895
|
+
include_type_info: bool = False
|
896
|
+
"""Include data type information in extracted content."""
|
897
|
+
flatten_nested_objects: bool = True
|
898
|
+
"""Flatten nested objects using dot notation for better text extraction."""
|
899
|
+
|
900
|
+
def __post_init__(self) -> None:
|
901
|
+
if self.max_depth <= 0:
|
902
|
+
raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
|
903
|
+
if self.array_item_limit <= 0:
|
904
|
+
raise ValidationError(
|
905
|
+
"array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
|
906
|
+
)
|
907
|
+
|
908
|
+
|
754
909
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
755
910
|
class ExtractionConfig(ConfigDict):
|
756
911
|
force_ocr: bool = False
|
@@ -761,6 +916,41 @@ class ExtractionConfig(ConfigDict):
|
|
761
916
|
"""Whether to extract tables from the content. This requires the 'gmft' dependency."""
|
762
917
|
extract_tables_from_ocr: bool = False
|
763
918
|
"""Extract tables from OCR output using TSV format (Tesseract only)."""
|
919
|
+
extract_images: bool = False
|
920
|
+
"""Whether to extract images from documents."""
|
921
|
+
deduplicate_images: bool = True
|
922
|
+
"""Whether to remove duplicate images using CRC32 checksums."""
|
923
|
+
image_ocr_config: ImageOCRConfig | None = None
|
924
|
+
"""Configuration for OCR processing of extracted images."""
|
925
|
+
ocr_extracted_images: bool = False
|
926
|
+
"""Deprecated: Use image_ocr_config.enabled instead."""
|
927
|
+
image_ocr_backend: OcrBackendType | None = None
|
928
|
+
"""Deprecated: Use image_ocr_config.backend instead."""
|
929
|
+
image_ocr_min_dimensions: tuple[int, int] = (50, 50)
|
930
|
+
"""Deprecated: Use image_ocr_config.min_dimensions instead."""
|
931
|
+
image_ocr_max_dimensions: tuple[int, int] = (10000, 10000)
|
932
|
+
"""Deprecated: Use image_ocr_config.max_dimensions instead."""
|
933
|
+
image_ocr_formats: frozenset[str] = frozenset(
|
934
|
+
{
|
935
|
+
"jpg",
|
936
|
+
"jpeg",
|
937
|
+
"png",
|
938
|
+
"gif",
|
939
|
+
"bmp",
|
940
|
+
"tiff",
|
941
|
+
"tif",
|
942
|
+
"webp",
|
943
|
+
"jp2",
|
944
|
+
"jpx",
|
945
|
+
"jpm",
|
946
|
+
"mj2",
|
947
|
+
"pnm",
|
948
|
+
"pbm",
|
949
|
+
"pgm",
|
950
|
+
"ppm",
|
951
|
+
}
|
952
|
+
)
|
953
|
+
"""Deprecated: Use image_ocr_config.allowed_formats instead."""
|
764
954
|
max_chars: int = DEFAULT_MAX_CHARACTERS
|
765
955
|
"""The size of each chunk in characters."""
|
766
956
|
max_overlap: int = DEFAULT_MAX_OVERLAP
|
@@ -805,6 +995,8 @@ class ExtractionConfig(ConfigDict):
|
|
805
995
|
"""Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
|
806
996
|
html_to_markdown_config: HTMLToMarkdownConfig | None = None
|
807
997
|
"""Configuration for HTML to Markdown conversion. If None, uses default settings."""
|
998
|
+
json_config: JSONExtractionConfig | None = None
|
999
|
+
"""Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
|
808
1000
|
use_cache: bool = True
|
809
1001
|
"""Whether to use caching for extraction results. Set to False to disable all caching."""
|
810
1002
|
target_dpi: int = 150
|
@@ -826,6 +1018,51 @@ class ExtractionConfig(ConfigDict):
|
|
826
1018
|
if self.validators is not None and isinstance(self.validators, list):
|
827
1019
|
object.__setattr__(self, "validators", tuple(self.validators))
|
828
1020
|
|
1021
|
+
if isinstance(self.pdf_password, list):
|
1022
|
+
object.__setattr__(self, "pdf_password", tuple(self.pdf_password))
|
1023
|
+
|
1024
|
+
if isinstance(self.image_ocr_formats, list):
|
1025
|
+
object.__setattr__(self, "image_ocr_formats", frozenset(self.image_ocr_formats))
|
1026
|
+
|
1027
|
+
if self.image_ocr_config is None and (
|
1028
|
+
self.ocr_extracted_images
|
1029
|
+
or self.image_ocr_backend is not None
|
1030
|
+
or self.image_ocr_min_dimensions != (50, 50)
|
1031
|
+
or self.image_ocr_max_dimensions != (10000, 10000)
|
1032
|
+
or self.image_ocr_formats
|
1033
|
+
!= frozenset(
|
1034
|
+
{
|
1035
|
+
"jpg",
|
1036
|
+
"jpeg",
|
1037
|
+
"png",
|
1038
|
+
"gif",
|
1039
|
+
"bmp",
|
1040
|
+
"tiff",
|
1041
|
+
"tif",
|
1042
|
+
"webp",
|
1043
|
+
"jp2",
|
1044
|
+
"jpx",
|
1045
|
+
"jpm",
|
1046
|
+
"mj2",
|
1047
|
+
"pnm",
|
1048
|
+
"pbm",
|
1049
|
+
"pgm",
|
1050
|
+
"ppm",
|
1051
|
+
}
|
1052
|
+
)
|
1053
|
+
):
|
1054
|
+
object.__setattr__(
|
1055
|
+
self,
|
1056
|
+
"image_ocr_config",
|
1057
|
+
ImageOCRConfig(
|
1058
|
+
enabled=self.ocr_extracted_images,
|
1059
|
+
backend=self.image_ocr_backend,
|
1060
|
+
min_dimensions=self.image_ocr_min_dimensions,
|
1061
|
+
max_dimensions=self.image_ocr_max_dimensions,
|
1062
|
+
allowed_formats=self.image_ocr_formats,
|
1063
|
+
),
|
1064
|
+
)
|
1065
|
+
|
829
1066
|
if self.ocr_backend is None and self.ocr_config is not None:
|
830
1067
|
raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
|
831
1068
|
|
@@ -839,7 +1076,6 @@ class ExtractionConfig(ConfigDict):
|
|
839
1076
|
context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
|
840
1077
|
)
|
841
1078
|
|
842
|
-
# Validate DPI configuration
|
843
1079
|
if self.target_dpi <= 0:
|
844
1080
|
raise ValidationError("target_dpi must be positive", context={"target_dpi": self.target_dpi})
|
845
1081
|
if self.min_dpi <= 0:
|
@@ -861,27 +1097,22 @@ class ExtractionConfig(ConfigDict):
|
|
861
1097
|
)
|
862
1098
|
|
863
1099
|
def get_config_dict(self) -> dict[str, Any]:
|
864
|
-
if self.ocr_backend is None:
|
865
|
-
return {"use_cache": self.use_cache}
|
866
|
-
|
867
|
-
if self.ocr_config is not None:
|
868
|
-
config_dict = asdict(self.ocr_config)
|
869
|
-
config_dict["use_cache"] = self.use_cache
|
870
|
-
return config_dict
|
871
|
-
|
872
1100
|
match self.ocr_backend:
|
873
|
-
case
|
874
|
-
|
1101
|
+
case None:
|
1102
|
+
return {"use_cache": self.use_cache}
|
1103
|
+
case _ if self.ocr_config is not None:
|
1104
|
+
config_dict = asdict(self.ocr_config)
|
875
1105
|
config_dict["use_cache"] = self.use_cache
|
876
1106
|
return config_dict
|
1107
|
+
case "tesseract":
|
1108
|
+
config_dict = asdict(TesseractConfig())
|
877
1109
|
case "easyocr":
|
878
1110
|
config_dict = asdict(EasyOCRConfig())
|
879
|
-
config_dict["use_cache"] = self.use_cache
|
880
|
-
return config_dict
|
881
1111
|
case _:
|
882
1112
|
config_dict = asdict(PaddleOCRConfig())
|
883
|
-
|
884
|
-
|
1113
|
+
|
1114
|
+
config_dict["use_cache"] = self.use_cache
|
1115
|
+
return config_dict
|
885
1116
|
|
886
1117
|
def to_dict(self, include_none: bool = False) -> dict[str, Any]:
|
887
1118
|
result = msgspec.to_builtins(
|
@@ -900,72 +1131,71 @@ class ExtractionConfig(ConfigDict):
|
|
900
1131
|
return {k: v for k, v in result.items() if v is not None}
|
901
1132
|
|
902
1133
|
|
903
|
-
@dataclass(frozen=True)
|
1134
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
904
1135
|
class HTMLToMarkdownConfig:
|
905
|
-
stream_processing: bool = False
|
906
|
-
"""Enable streaming mode for processing large HTML documents."""
|
907
|
-
chunk_size: int = 1024
|
908
|
-
"""Size of chunks when stream_processing is enabled."""
|
909
|
-
chunk_callback: Callable[[str], None] | None = None
|
910
|
-
"""Callback function invoked for each chunk during stream processing."""
|
911
|
-
progress_callback: Callable[[int, int], None] | None = None
|
912
|
-
"""Callback function for progress updates (current, total)."""
|
913
|
-
parser: str | None = "lxml"
|
914
|
-
"""BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
|
915
1136
|
autolinks: bool = True
|
916
|
-
"""
|
1137
|
+
"""Automatically convert valid URLs to Markdown links."""
|
1138
|
+
br_in_tables: bool = False
|
1139
|
+
"""Use <br> tags for line breaks in table cells instead of spaces."""
|
917
1140
|
bullets: str = "*+-"
|
918
1141
|
"""Characters to use for unordered list bullets."""
|
919
1142
|
code_language: str = ""
|
920
|
-
"""Default language for code blocks."""
|
1143
|
+
"""Default language identifier for fenced code blocks."""
|
921
1144
|
code_language_callback: Callable[[Any], str] | None = None
|
922
|
-
"""
|
923
|
-
convert:
|
924
|
-
"""HTML tags to convert
|
1145
|
+
"""Function to dynamically determine code block language."""
|
1146
|
+
convert: list[str] | None = None
|
1147
|
+
"""List of HTML tags to convert (None = all supported tags)."""
|
925
1148
|
convert_as_inline: bool = False
|
926
|
-
"""
|
927
|
-
custom_converters: Mapping[
|
928
|
-
"""
|
1149
|
+
"""Treat content as inline elements only."""
|
1150
|
+
custom_converters: Mapping[str, Callable[..., str]] | None = None
|
1151
|
+
"""Mapping of HTML tag names to custom converter functions."""
|
929
1152
|
default_title: bool = False
|
930
|
-
"""Use
|
1153
|
+
"""Use default titles for elements like links."""
|
931
1154
|
escape_asterisks: bool = True
|
932
|
-
"""Escape
|
1155
|
+
"""Escape * characters to prevent unintended formatting."""
|
933
1156
|
escape_misc: bool = True
|
934
|
-
"""Escape miscellaneous characters
|
1157
|
+
"""Escape miscellaneous characters to prevent Markdown conflicts."""
|
935
1158
|
escape_underscores: bool = True
|
936
|
-
"""Escape
|
1159
|
+
"""Escape _ characters to prevent unintended formatting."""
|
937
1160
|
extract_metadata: bool = True
|
938
|
-
"""Extract metadata
|
1161
|
+
"""Extract document metadata as comment header."""
|
939
1162
|
heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
|
940
1163
|
"""Style for markdown headings."""
|
941
1164
|
highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
|
942
1165
|
"""Style for highlighting text."""
|
943
|
-
keep_inline_images_in:
|
944
|
-
"""
|
1166
|
+
keep_inline_images_in: list[str] | None = None
|
1167
|
+
"""Tags where inline images should be preserved."""
|
1168
|
+
list_indent_type: Literal["spaces", "tabs"] = "spaces"
|
1169
|
+
"""Type of indentation to use for lists."""
|
1170
|
+
list_indent_width: int = 4
|
1171
|
+
"""Number of spaces per indentation level (use 2 for Discord/Slack)."""
|
945
1172
|
newline_style: Literal["spaces", "backslash"] = "spaces"
|
946
1173
|
"""Style for line breaks in markdown."""
|
947
|
-
|
948
|
-
"""HTML
|
1174
|
+
preprocess_html: bool = False
|
1175
|
+
"""Enable HTML preprocessing to clean messy HTML."""
|
1176
|
+
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
|
1177
|
+
"""Preprocessing level for cleaning HTML."""
|
1178
|
+
remove_forms: bool = True
|
1179
|
+
"""Remove form elements during preprocessing."""
|
1180
|
+
remove_navigation: bool = True
|
1181
|
+
"""Remove navigation elements during preprocessing."""
|
1182
|
+
strip: list[str] | None = None
|
1183
|
+
"""List of HTML tags to remove from output."""
|
949
1184
|
strip_newlines: bool = False
|
950
|
-
"""
|
1185
|
+
"""Remove newlines from HTML input before processing."""
|
951
1186
|
strong_em_symbol: Literal["*", "_"] = "*"
|
952
1187
|
"""Symbol to use for strong/emphasis formatting."""
|
953
1188
|
sub_symbol: str = ""
|
954
1189
|
"""Symbol to use for subscript text."""
|
955
1190
|
sup_symbol: str = ""
|
956
1191
|
"""Symbol to use for superscript text."""
|
1192
|
+
whitespace_mode: Literal["normalized", "strict"] = "normalized"
|
1193
|
+
"""Whitespace handling mode."""
|
957
1194
|
wrap: bool = False
|
958
1195
|
"""Enable text wrapping."""
|
959
1196
|
wrap_width: int = 80
|
960
|
-
"""Width for text wrapping
|
961
|
-
preprocess_html: bool = True
|
962
|
-
"""Enable HTML preprocessing to clean up the input."""
|
963
|
-
preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
|
964
|
-
"""Preprocessing level for cleaning HTML."""
|
965
|
-
remove_navigation: bool = True
|
966
|
-
"""Remove navigation elements from HTML."""
|
967
|
-
remove_forms: bool = True
|
968
|
-
"""Remove form elements from HTML."""
|
1197
|
+
"""Width for text wrapping."""
|
969
1198
|
|
970
1199
|
def to_dict(self) -> dict[str, Any]:
|
971
|
-
|
1200
|
+
result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
|
1201
|
+
return {k: v for k, v in result.items() if v is not None}
|
kreuzberg/_utils/_cache.py
CHANGED
@@ -20,6 +20,8 @@ from kreuzberg._utils._sync import run_sync
|
|
20
20
|
|
21
21
|
T = TypeVar("T")
|
22
22
|
|
23
|
+
CACHE_CLEANUP_FREQUENCY = 100
|
24
|
+
|
23
25
|
|
24
26
|
class KreuzbergCache(Generic[T]):
|
25
27
|
def __init__(
|
@@ -136,16 +138,20 @@ class KreuzbergCache(Generic[T]):
|
|
136
138
|
def _cleanup_cache(self) -> None:
|
137
139
|
try:
|
138
140
|
cache_files = list(self.cache_dir.glob("*.msgpack"))
|
139
|
-
|
140
141
|
cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
|
141
|
-
|
142
|
+
|
143
|
+
remaining_files = []
|
144
|
+
for cache_file in cache_files:
|
142
145
|
try:
|
143
146
|
if cache_file.stat().st_mtime < cutoff_time:
|
144
147
|
cache_file.unlink(missing_ok=True)
|
145
|
-
|
148
|
+
else:
|
149
|
+
remaining_files.append(cache_file)
|
146
150
|
except OSError: # noqa: PERF203
|
147
151
|
continue
|
148
152
|
|
153
|
+
cache_files = remaining_files
|
154
|
+
|
149
155
|
total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
|
150
156
|
1024 * 1024
|
151
157
|
)
|
@@ -191,7 +197,7 @@ class KreuzbergCache(Generic[T]):
|
|
191
197
|
content = serialize(serialized)
|
192
198
|
cache_path.write_bytes(content)
|
193
199
|
|
194
|
-
if hash(cache_key) %
|
200
|
+
if hash(cache_key) % CACHE_CLEANUP_FREQUENCY == 0:
|
195
201
|
self._cleanup_cache()
|
196
202
|
except (OSError, TypeError, ValueError):
|
197
203
|
pass
|
kreuzberg/_utils/_device.py
CHANGED
@@ -12,7 +12,7 @@ from kreuzberg.exceptions import ValidationError
|
|
12
12
|
DeviceType = Literal["cpu", "cuda", "mps", "auto"]
|
13
13
|
|
14
14
|
|
15
|
-
@dataclass(frozen=True, slots=True)
|
15
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
16
16
|
class DeviceInfo:
|
17
17
|
device_type: Literal["cpu", "cuda", "mps"]
|
18
18
|
"""The type of device."""
|
@@ -30,12 +30,10 @@ def detect_available_devices() -> list[DeviceInfo]:
|
|
30
30
|
cpu_device = DeviceInfo(device_type="cpu", name="CPU")
|
31
31
|
|
32
32
|
cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
|
33
|
-
|
34
33
|
mps_device = _get_mps_device() if _is_mps_available() else None
|
35
34
|
mps_devices = [mps_device] if mps_device else []
|
36
35
|
|
37
|
-
|
38
|
-
return [*gpu_devices, cpu_device]
|
36
|
+
return list(chain(cuda_devices, mps_devices, [cpu_device]))
|
39
37
|
|
40
38
|
|
41
39
|
def get_optimal_device() -> DeviceInfo:
|
@@ -0,0 +1,20 @@
|
|
1
|
+
from __future__ import annotations
|
2
|
+
|
3
|
+
_STREAMING_THRESHOLD_KB = 10
|
4
|
+
_LARGE_FILE_THRESHOLD_MB = 1
|
5
|
+
_DEFAULT_CHUNK_SIZE = 2048
|
6
|
+
_LARGE_FILE_CHUNK_SIZE = 4096
|
7
|
+
|
8
|
+
_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
|
9
|
+
_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
|
10
|
+
|
11
|
+
|
12
|
+
def should_use_streaming(content_size: int) -> tuple[bool, int]:
|
13
|
+
if content_size < 0:
|
14
|
+
return False, _DEFAULT_CHUNK_SIZE
|
15
|
+
|
16
|
+
if content_size > _STREAMING_THRESHOLD_BYTES:
|
17
|
+
if content_size > _LARGE_FILE_THRESHOLD_BYTES:
|
18
|
+
return True, _LARGE_FILE_CHUNK_SIZE
|
19
|
+
return True, _DEFAULT_CHUNK_SIZE
|
20
|
+
return False, _DEFAULT_CHUNK_SIZE
|