kreuzberg 3.14.1__py3-none-any.whl → 3.16.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/__init__.py +10 -0
  2. kreuzberg/_api/_config_cache.py +247 -0
  3. kreuzberg/_api/main.py +74 -45
  4. kreuzberg/_chunker.py +7 -6
  5. kreuzberg/_config.py +11 -1
  6. kreuzberg/_constants.py +2 -0
  7. kreuzberg/_document_classification.py +5 -7
  8. kreuzberg/_entity_extraction.py +9 -4
  9. kreuzberg/_extractors/_base.py +269 -3
  10. kreuzberg/_extractors/_email.py +101 -27
  11. kreuzberg/_extractors/_html.py +112 -7
  12. kreuzberg/_extractors/_image.py +23 -22
  13. kreuzberg/_extractors/_pandoc.py +106 -75
  14. kreuzberg/_extractors/_pdf.py +208 -99
  15. kreuzberg/_extractors/_presentation.py +76 -8
  16. kreuzberg/_extractors/_spread_sheet.py +24 -30
  17. kreuzberg/_extractors/_structured.py +83 -15
  18. kreuzberg/_gmft.py +5 -0
  19. kreuzberg/_mcp/server.py +324 -25
  20. kreuzberg/_mime_types.py +42 -0
  21. kreuzberg/_ocr/_easyocr.py +53 -21
  22. kreuzberg/_ocr/_paddleocr.py +1 -1
  23. kreuzberg/_ocr/_tesseract.py +88 -37
  24. kreuzberg/_types.py +291 -61
  25. kreuzberg/_utils/_cache.py +10 -4
  26. kreuzberg/_utils/_device.py +2 -4
  27. kreuzberg/_utils/_html_streaming.py +20 -0
  28. kreuzberg/_utils/_image_preprocessing.py +12 -39
  29. kreuzberg/_utils/_process_pool.py +29 -8
  30. kreuzberg/_utils/_quality.py +7 -2
  31. kreuzberg/_utils/_resource_managers.py +65 -0
  32. kreuzberg/_utils/_serialization.py +13 -6
  33. kreuzberg/_utils/_sync.py +39 -10
  34. kreuzberg/_utils/_tmp.py +37 -1
  35. kreuzberg/cli.py +34 -20
  36. kreuzberg/extraction.py +44 -28
  37. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/METADATA +13 -11
  38. kreuzberg-3.16.0.dist-info/RECORD +61 -0
  39. kreuzberg-3.14.1.dist-info/RECORD +0 -58
  40. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.14.1.dist-info → kreuzberg-3.16.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_types.py CHANGED
@@ -1,9 +1,10 @@
1
1
  from __future__ import annotations
2
2
 
3
3
  import sys
4
- from collections.abc import Awaitable, Callable, Iterable, Mapping
4
+ from collections.abc import Awaitable, Callable, Mapping
5
5
  from dataclasses import asdict, dataclass, field
6
6
  from enum import Enum
7
+ from pathlib import Path
7
8
  from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
8
9
 
9
10
  import msgspec
@@ -25,8 +26,6 @@ else: # pragma: no cover
25
26
  from typing import NotRequired
26
27
 
27
28
  if TYPE_CHECKING:
28
- from pathlib import Path
29
-
30
29
  from PIL.Image import Image
31
30
  from polars import DataFrame
32
31
 
@@ -165,6 +164,12 @@ class EasyOCRConfig(ConfigDict):
165
164
  ycenter_ths: float = 0.5
166
165
  """Maximum shift in y direction for merging."""
167
166
 
167
+ def __post_init__(self) -> None:
168
+ if isinstance(self.language, list):
169
+ object.__setattr__(self, "language", tuple(self.language))
170
+ if isinstance(self.rotation_info, list):
171
+ object.__setattr__(self, "rotation_info", tuple(self.rotation_info))
172
+
168
173
 
169
174
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
170
175
  class PaddleOCRConfig(ConfigDict):
@@ -349,6 +354,51 @@ class GMFTConfig(ConfigDict):
349
354
  """
350
355
 
351
356
 
357
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
358
+ class ImageOCRConfig(ConfigDict):
359
+ """Configuration for OCR processing of extracted images."""
360
+
361
+ enabled: bool = False
362
+ """Whether to perform OCR on extracted images."""
363
+ backend: OcrBackendType | None = None
364
+ """OCR backend for image OCR. Falls back to main ocr_backend when None."""
365
+ backend_config: TesseractConfig | PaddleOCRConfig | EasyOCRConfig | None = None
366
+ """Backend-specific configuration for image OCR."""
367
+ min_dimensions: tuple[int, int] = (50, 50)
368
+ """Minimum (width, height) in pixels for image OCR eligibility."""
369
+ max_dimensions: tuple[int, int] = (10000, 10000)
370
+ """Maximum (width, height) in pixels for image OCR eligibility."""
371
+ allowed_formats: frozenset[str] = frozenset(
372
+ {
373
+ "jpg",
374
+ "jpeg",
375
+ "png",
376
+ "gif",
377
+ "bmp",
378
+ "tiff",
379
+ "tif",
380
+ "webp",
381
+ "jp2",
382
+ "jpx",
383
+ "jpm",
384
+ "mj2",
385
+ "pnm",
386
+ "pbm",
387
+ "pgm",
388
+ "ppm",
389
+ }
390
+ )
391
+ """Allowed image formats for OCR processing (lowercase, without dot)."""
392
+ batch_size: int = 4
393
+ """Number of images to process in parallel for OCR."""
394
+ timeout_seconds: int = 30
395
+ """Maximum time in seconds for OCR processing per image."""
396
+
397
+ def __post_init__(self) -> None:
398
+ if isinstance(self.allowed_formats, list):
399
+ object.__setattr__(self, "allowed_formats", frozenset(self.allowed_formats))
400
+
401
+
352
402
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
353
403
  class LanguageDetectionConfig(ConfigDict):
354
404
  low_memory: bool = True
@@ -391,6 +441,9 @@ class SpacyEntityExtractionConfig(ConfigDict):
391
441
  """Batch size for processing multiple texts."""
392
442
 
393
443
  def __post_init__(self) -> None:
444
+ if isinstance(self.model_cache_dir, Path):
445
+ object.__setattr__(self, "model_cache_dir", str(self.model_cache_dir))
446
+
394
447
  if self.language_models is None:
395
448
  object.__setattr__(self, "language_models", self._get_default_language_models())
396
449
 
@@ -538,6 +591,8 @@ class ImagePreprocessingMetadata(NamedTuple):
538
591
 
539
592
 
540
593
  class Metadata(TypedDict, total=False):
594
+ abstract: NotRequired[str]
595
+ """Document abstract or summary."""
541
596
  authors: NotRequired[list[str]]
542
597
  """List of document authors."""
543
598
  categories: NotRequired[list[str]]
@@ -622,9 +677,28 @@ class Metadata(TypedDict, total=False):
622
677
  """Source format of the extracted content."""
623
678
  error: NotRequired[str]
624
679
  """Error message if extraction failed."""
680
+ error_context: NotRequired[dict[str, Any]]
681
+ """Error context information for debugging."""
682
+ json_schema: NotRequired[dict[str, Any]]
683
+ """JSON schema information extracted from structured data."""
684
+ notes: NotRequired[list[str]]
685
+ """Notes or additional information extracted from documents."""
686
+ note: NotRequired[str]
687
+ """Single note or annotation."""
688
+ name: NotRequired[str]
689
+ """Name field from structured data."""
690
+ body: NotRequired[str]
691
+ """Body text content."""
692
+ text: NotRequired[str]
693
+ """Generic text content."""
694
+ message: NotRequired[str]
695
+ """Message or communication content."""
696
+ attributes: NotRequired[dict[str, Any]]
697
+ """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
625
698
 
626
699
 
627
700
  _VALID_METADATA_KEYS = {
701
+ "abstract",
628
702
  "authors",
629
703
  "categories",
630
704
  "citations",
@@ -664,6 +738,17 @@ _VALID_METADATA_KEYS = {
664
738
  "tables_summary",
665
739
  "quality_score",
666
740
  "image_preprocessing",
741
+ "source_format",
742
+ "error",
743
+ "error_context",
744
+ "json_schema",
745
+ "notes",
746
+ "note",
747
+ "name",
748
+ "body",
749
+ "text",
750
+ "message",
751
+ "attributes",
667
752
  }
668
753
 
669
754
 
@@ -672,14 +757,34 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
672
757
  return {}
673
758
 
674
759
  normalized: Metadata = {}
760
+ attributes: dict[str, Any] = {}
761
+
675
762
  for key, value in data.items():
676
- if key in _VALID_METADATA_KEYS and value is not None:
677
- normalized[key] = value # type: ignore[literal-required]
763
+ if value is not None:
764
+ if key in _VALID_METADATA_KEYS:
765
+ normalized[key] = value # type: ignore[literal-required]
766
+ elif "." in key and key.split(".")[-1] in {
767
+ "title",
768
+ "name",
769
+ "subject",
770
+ "description",
771
+ "content",
772
+ "body",
773
+ "text",
774
+ "message",
775
+ "note",
776
+ "abstract",
777
+ "summary",
778
+ }:
779
+ attributes[key] = value
780
+
781
+ if attributes:
782
+ normalized["attributes"] = attributes
678
783
 
679
784
  return normalized
680
785
 
681
786
 
682
- @dataclass(frozen=True, slots=True)
787
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
683
788
  class Entity:
684
789
  type: str
685
790
  """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
@@ -691,18 +796,44 @@ class Entity:
691
796
  """End character offset in the content"""
692
797
 
693
798
 
799
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
800
+ class ExtractedImage:
801
+ data: bytes
802
+ format: str
803
+ filename: str | None = None
804
+ page_number: int | None = None
805
+ dimensions: tuple[int, int] | None = None
806
+ colorspace: str | None = None
807
+ bits_per_component: int | None = None
808
+ is_mask: bool = False
809
+ description: str | None = None
810
+
811
+
812
+ @dataclass(slots=True)
813
+ class ImageOCRResult:
814
+ image: ExtractedImage
815
+ ocr_result: ExtractionResult
816
+ confidence_score: float | None = None
817
+ processing_time: float | None = None
818
+ skipped_reason: str | None = None
819
+
820
+
694
821
  @dataclass(slots=True)
695
822
  class ExtractionResult:
696
823
  content: str
697
824
  """The extracted content."""
698
825
  mime_type: str
699
826
  """The mime type of the extracted content. Is either text/plain or text/markdown."""
700
- metadata: Metadata
827
+ metadata: Metadata = field(default_factory=lambda: Metadata())
701
828
  """The metadata of the content."""
702
829
  tables: list[TableData] = field(default_factory=list)
703
830
  """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
704
831
  chunks: list[str] = field(default_factory=list)
705
832
  """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
833
+ images: list[ExtractedImage] = field(default_factory=list)
834
+ """Extracted images. Empty list if 'extract_images' is not enabled."""
835
+ image_ocr_results: list[ImageOCRResult] = field(default_factory=list)
836
+ """OCR results from extracted images. Empty list if disabled or none processed."""
706
837
  entities: list[Entity] | None = None
707
838
  """Extracted entities, if entity extraction is enabled."""
708
839
  keywords: list[tuple[str, float]] | None = None
@@ -751,6 +882,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
751
882
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]
752
883
 
753
884
 
885
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
886
+ class JSONExtractionConfig(ConfigDict):
887
+ extract_schema: bool = False
888
+ """Extract and include JSON schema information in metadata."""
889
+ custom_text_field_patterns: frozenset[str] | None = None
890
+ """Custom patterns to identify text fields beyond default keywords."""
891
+ max_depth: int = 10
892
+ """Maximum nesting depth to process in JSON structures."""
893
+ array_item_limit: int = 1000
894
+ """Maximum number of array items to process to prevent memory issues."""
895
+ include_type_info: bool = False
896
+ """Include data type information in extracted content."""
897
+ flatten_nested_objects: bool = True
898
+ """Flatten nested objects using dot notation for better text extraction."""
899
+
900
+ def __post_init__(self) -> None:
901
+ if self.max_depth <= 0:
902
+ raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
903
+ if self.array_item_limit <= 0:
904
+ raise ValidationError(
905
+ "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
906
+ )
907
+
908
+
754
909
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
755
910
  class ExtractionConfig(ConfigDict):
756
911
  force_ocr: bool = False
@@ -761,6 +916,41 @@ class ExtractionConfig(ConfigDict):
761
916
  """Whether to extract tables from the content. This requires the 'gmft' dependency."""
762
917
  extract_tables_from_ocr: bool = False
763
918
  """Extract tables from OCR output using TSV format (Tesseract only)."""
919
+ extract_images: bool = False
920
+ """Whether to extract images from documents."""
921
+ deduplicate_images: bool = True
922
+ """Whether to remove duplicate images using CRC32 checksums."""
923
+ image_ocr_config: ImageOCRConfig | None = None
924
+ """Configuration for OCR processing of extracted images."""
925
+ ocr_extracted_images: bool = False
926
+ """Deprecated: Use image_ocr_config.enabled instead."""
927
+ image_ocr_backend: OcrBackendType | None = None
928
+ """Deprecated: Use image_ocr_config.backend instead."""
929
+ image_ocr_min_dimensions: tuple[int, int] = (50, 50)
930
+ """Deprecated: Use image_ocr_config.min_dimensions instead."""
931
+ image_ocr_max_dimensions: tuple[int, int] = (10000, 10000)
932
+ """Deprecated: Use image_ocr_config.max_dimensions instead."""
933
+ image_ocr_formats: frozenset[str] = frozenset(
934
+ {
935
+ "jpg",
936
+ "jpeg",
937
+ "png",
938
+ "gif",
939
+ "bmp",
940
+ "tiff",
941
+ "tif",
942
+ "webp",
943
+ "jp2",
944
+ "jpx",
945
+ "jpm",
946
+ "mj2",
947
+ "pnm",
948
+ "pbm",
949
+ "pgm",
950
+ "ppm",
951
+ }
952
+ )
953
+ """Deprecated: Use image_ocr_config.allowed_formats instead."""
764
954
  max_chars: int = DEFAULT_MAX_CHARACTERS
765
955
  """The size of each chunk in characters."""
766
956
  max_overlap: int = DEFAULT_MAX_OVERLAP
@@ -805,6 +995,8 @@ class ExtractionConfig(ConfigDict):
805
995
  """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
806
996
  html_to_markdown_config: HTMLToMarkdownConfig | None = None
807
997
  """Configuration for HTML to Markdown conversion. If None, uses default settings."""
998
+ json_config: JSONExtractionConfig | None = None
999
+ """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
808
1000
  use_cache: bool = True
809
1001
  """Whether to use caching for extraction results. Set to False to disable all caching."""
810
1002
  target_dpi: int = 150
@@ -826,6 +1018,51 @@ class ExtractionConfig(ConfigDict):
826
1018
  if self.validators is not None and isinstance(self.validators, list):
827
1019
  object.__setattr__(self, "validators", tuple(self.validators))
828
1020
 
1021
+ if isinstance(self.pdf_password, list):
1022
+ object.__setattr__(self, "pdf_password", tuple(self.pdf_password))
1023
+
1024
+ if isinstance(self.image_ocr_formats, list):
1025
+ object.__setattr__(self, "image_ocr_formats", frozenset(self.image_ocr_formats))
1026
+
1027
+ if self.image_ocr_config is None and (
1028
+ self.ocr_extracted_images
1029
+ or self.image_ocr_backend is not None
1030
+ or self.image_ocr_min_dimensions != (50, 50)
1031
+ or self.image_ocr_max_dimensions != (10000, 10000)
1032
+ or self.image_ocr_formats
1033
+ != frozenset(
1034
+ {
1035
+ "jpg",
1036
+ "jpeg",
1037
+ "png",
1038
+ "gif",
1039
+ "bmp",
1040
+ "tiff",
1041
+ "tif",
1042
+ "webp",
1043
+ "jp2",
1044
+ "jpx",
1045
+ "jpm",
1046
+ "mj2",
1047
+ "pnm",
1048
+ "pbm",
1049
+ "pgm",
1050
+ "ppm",
1051
+ }
1052
+ )
1053
+ ):
1054
+ object.__setattr__(
1055
+ self,
1056
+ "image_ocr_config",
1057
+ ImageOCRConfig(
1058
+ enabled=self.ocr_extracted_images,
1059
+ backend=self.image_ocr_backend,
1060
+ min_dimensions=self.image_ocr_min_dimensions,
1061
+ max_dimensions=self.image_ocr_max_dimensions,
1062
+ allowed_formats=self.image_ocr_formats,
1063
+ ),
1064
+ )
1065
+
829
1066
  if self.ocr_backend is None and self.ocr_config is not None:
830
1067
  raise ValidationError("'ocr_backend' is None but 'ocr_config' is provided")
831
1068
 
@@ -839,7 +1076,6 @@ class ExtractionConfig(ConfigDict):
839
1076
  context={"ocr_backend": self.ocr_backend, "ocr_config": type(self.ocr_config).__name__},
840
1077
  )
841
1078
 
842
- # Validate DPI configuration
843
1079
  if self.target_dpi <= 0:
844
1080
  raise ValidationError("target_dpi must be positive", context={"target_dpi": self.target_dpi})
845
1081
  if self.min_dpi <= 0:
@@ -861,27 +1097,22 @@ class ExtractionConfig(ConfigDict):
861
1097
  )
862
1098
 
863
1099
  def get_config_dict(self) -> dict[str, Any]:
864
- if self.ocr_backend is None:
865
- return {"use_cache": self.use_cache}
866
-
867
- if self.ocr_config is not None:
868
- config_dict = asdict(self.ocr_config)
869
- config_dict["use_cache"] = self.use_cache
870
- return config_dict
871
-
872
1100
  match self.ocr_backend:
873
- case "tesseract":
874
- config_dict = asdict(TesseractConfig())
1101
+ case None:
1102
+ return {"use_cache": self.use_cache}
1103
+ case _ if self.ocr_config is not None:
1104
+ config_dict = asdict(self.ocr_config)
875
1105
  config_dict["use_cache"] = self.use_cache
876
1106
  return config_dict
1107
+ case "tesseract":
1108
+ config_dict = asdict(TesseractConfig())
877
1109
  case "easyocr":
878
1110
  config_dict = asdict(EasyOCRConfig())
879
- config_dict["use_cache"] = self.use_cache
880
- return config_dict
881
1111
  case _:
882
1112
  config_dict = asdict(PaddleOCRConfig())
883
- config_dict["use_cache"] = self.use_cache
884
- return config_dict
1113
+
1114
+ config_dict["use_cache"] = self.use_cache
1115
+ return config_dict
885
1116
 
886
1117
  def to_dict(self, include_none: bool = False) -> dict[str, Any]:
887
1118
  result = msgspec.to_builtins(
@@ -900,72 +1131,71 @@ class ExtractionConfig(ConfigDict):
900
1131
  return {k: v for k, v in result.items() if v is not None}
901
1132
 
902
1133
 
903
- @dataclass(frozen=True)
1134
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
904
1135
  class HTMLToMarkdownConfig:
905
- stream_processing: bool = False
906
- """Enable streaming mode for processing large HTML documents."""
907
- chunk_size: int = 1024
908
- """Size of chunks when stream_processing is enabled."""
909
- chunk_callback: Callable[[str], None] | None = None
910
- """Callback function invoked for each chunk during stream processing."""
911
- progress_callback: Callable[[int, int], None] | None = None
912
- """Callback function for progress updates (current, total)."""
913
- parser: str | None = "lxml"
914
- """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
915
1136
  autolinks: bool = True
916
- """Convert URLs to clickable links automatically."""
1137
+ """Automatically convert valid URLs to Markdown links."""
1138
+ br_in_tables: bool = False
1139
+ """Use <br> tags for line breaks in table cells instead of spaces."""
917
1140
  bullets: str = "*+-"
918
1141
  """Characters to use for unordered list bullets."""
919
1142
  code_language: str = ""
920
- """Default language for code blocks."""
1143
+ """Default language identifier for fenced code blocks."""
921
1144
  code_language_callback: Callable[[Any], str] | None = None
922
- """Callback to determine code language dynamically."""
923
- convert: str | Iterable[str] | None = None
924
- """HTML tags to convert. If None, all supported tags are converted."""
1145
+ """Function to dynamically determine code block language."""
1146
+ convert: list[str] | None = None
1147
+ """List of HTML tags to convert (None = all supported tags)."""
925
1148
  convert_as_inline: bool = False
926
- """Convert block elements as inline elements."""
927
- custom_converters: Mapping[Any, Any] | None = None
928
- """Custom converters for specific HTML elements."""
1149
+ """Treat content as inline elements only."""
1150
+ custom_converters: Mapping[str, Callable[..., str]] | None = None
1151
+ """Mapping of HTML tag names to custom converter functions."""
929
1152
  default_title: bool = False
930
- """Use a default title if none is found."""
1153
+ """Use default titles for elements like links."""
931
1154
  escape_asterisks: bool = True
932
- """Escape asterisks in text to prevent unintended emphasis."""
1155
+ """Escape * characters to prevent unintended formatting."""
933
1156
  escape_misc: bool = True
934
- """Escape miscellaneous characters that have special meaning in Markdown."""
1157
+ """Escape miscellaneous characters to prevent Markdown conflicts."""
935
1158
  escape_underscores: bool = True
936
- """Escape underscores in text to prevent unintended emphasis."""
1159
+ """Escape _ characters to prevent unintended formatting."""
937
1160
  extract_metadata: bool = True
938
- """Extract metadata from HTML head section."""
1161
+ """Extract document metadata as comment header."""
939
1162
  heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
940
1163
  """Style for markdown headings."""
941
1164
  highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
942
1165
  """Style for highlighting text."""
943
- keep_inline_images_in: Iterable[str] | None = None
944
- """HTML tags where inline images should be preserved."""
1166
+ keep_inline_images_in: list[str] | None = None
1167
+ """Tags where inline images should be preserved."""
1168
+ list_indent_type: Literal["spaces", "tabs"] = "spaces"
1169
+ """Type of indentation to use for lists."""
1170
+ list_indent_width: int = 4
1171
+ """Number of spaces per indentation level (use 2 for Discord/Slack)."""
945
1172
  newline_style: Literal["spaces", "backslash"] = "spaces"
946
1173
  """Style for line breaks in markdown."""
947
- strip: str | Iterable[str] | None = None
948
- """HTML tags to strip completely from output."""
1174
+ preprocess_html: bool = False
1175
+ """Enable HTML preprocessing to clean messy HTML."""
1176
+ preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
1177
+ """Preprocessing level for cleaning HTML."""
1178
+ remove_forms: bool = True
1179
+ """Remove form elements during preprocessing."""
1180
+ remove_navigation: bool = True
1181
+ """Remove navigation elements during preprocessing."""
1182
+ strip: list[str] | None = None
1183
+ """List of HTML tags to remove from output."""
949
1184
  strip_newlines: bool = False
950
- """Strip newlines from the output."""
1185
+ """Remove newlines from HTML input before processing."""
951
1186
  strong_em_symbol: Literal["*", "_"] = "*"
952
1187
  """Symbol to use for strong/emphasis formatting."""
953
1188
  sub_symbol: str = ""
954
1189
  """Symbol to use for subscript text."""
955
1190
  sup_symbol: str = ""
956
1191
  """Symbol to use for superscript text."""
1192
+ whitespace_mode: Literal["normalized", "strict"] = "normalized"
1193
+ """Whitespace handling mode."""
957
1194
  wrap: bool = False
958
1195
  """Enable text wrapping."""
959
1196
  wrap_width: int = 80
960
- """Width for text wrapping when wrap is True."""
961
- preprocess_html: bool = True
962
- """Enable HTML preprocessing to clean up the input."""
963
- preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
964
- """Preprocessing level for cleaning HTML."""
965
- remove_navigation: bool = True
966
- """Remove navigation elements from HTML."""
967
- remove_forms: bool = True
968
- """Remove form elements from HTML."""
1197
+ """Width for text wrapping."""
969
1198
 
970
1199
  def to_dict(self) -> dict[str, Any]:
971
- return {key: value for key, value in self.__dict__.items() if value is not None}
1200
+ result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
1201
+ return {k: v for k, v in result.items() if v is not None}
kreuzberg/_utils/_cache.py CHANGED
@@ -20,6 +20,8 @@ from kreuzberg._utils._sync import run_sync
20
20
 
21
21
  T = TypeVar("T")
22
22
 
23
+ CACHE_CLEANUP_FREQUENCY = 100
24
+
23
25
 
24
26
  class KreuzbergCache(Generic[T]):
25
27
  def __init__(
@@ -136,16 +138,20 @@ class KreuzbergCache(Generic[T]):
136
138
  def _cleanup_cache(self) -> None:
137
139
  try:
138
140
  cache_files = list(self.cache_dir.glob("*.msgpack"))
139
-
140
141
  cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
141
- for cache_file in cache_files[:]:
142
+
143
+ remaining_files = []
144
+ for cache_file in cache_files:
142
145
  try:
143
146
  if cache_file.stat().st_mtime < cutoff_time:
144
147
  cache_file.unlink(missing_ok=True)
145
- cache_files.remove(cache_file)
148
+ else:
149
+ remaining_files.append(cache_file)
146
150
  except OSError: # noqa: PERF203
147
151
  continue
148
152
 
153
+ cache_files = remaining_files
154
+
149
155
  total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
150
156
  1024 * 1024
151
157
  )
@@ -191,7 +197,7 @@ class KreuzbergCache(Generic[T]):
191
197
  content = serialize(serialized)
192
198
  cache_path.write_bytes(content)
193
199
 
194
- if hash(cache_key) % 100 == 0:
200
+ if hash(cache_key) % CACHE_CLEANUP_FREQUENCY == 0:
195
201
  self._cleanup_cache()
196
202
  except (OSError, TypeError, ValueError):
197
203
  pass
kreuzberg/_utils/_device.py CHANGED
@@ -12,7 +12,7 @@ from kreuzberg.exceptions import ValidationError
12
12
  DeviceType = Literal["cpu", "cuda", "mps", "auto"]
13
13
 
14
14
 
15
- @dataclass(frozen=True, slots=True)
15
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
16
16
  class DeviceInfo:
17
17
  device_type: Literal["cpu", "cuda", "mps"]
18
18
  """The type of device."""
@@ -30,12 +30,10 @@ def detect_available_devices() -> list[DeviceInfo]:
30
30
  cpu_device = DeviceInfo(device_type="cpu", name="CPU")
31
31
 
32
32
  cuda_devices = _get_cuda_devices() if _is_cuda_available() else []
33
-
34
33
  mps_device = _get_mps_device() if _is_mps_available() else None
35
34
  mps_devices = [mps_device] if mps_device else []
36
35
 
37
- gpu_devices = list(chain(cuda_devices, mps_devices))
38
- return [*gpu_devices, cpu_device]
36
+ return list(chain(cuda_devices, mps_devices, [cpu_device]))
39
37
 
40
38
 
41
39
  def get_optimal_device() -> DeviceInfo:
kreuzberg/_utils/_html_streaming.py ADDED
@@ -0,0 +1,20 @@
1
+ from __future__ import annotations
2
+
3
+ _STREAMING_THRESHOLD_KB = 10
4
+ _LARGE_FILE_THRESHOLD_MB = 1
5
+ _DEFAULT_CHUNK_SIZE = 2048
6
+ _LARGE_FILE_CHUNK_SIZE = 4096
7
+
8
+ _STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
9
+ _LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
10
+
11
+
12
+ def should_use_streaming(content_size: int) -> tuple[bool, int]:
13
+ if content_size < 0:
14
+ return False, _DEFAULT_CHUNK_SIZE
15
+
16
+ if content_size > _STREAMING_THRESHOLD_BYTES:
17
+ if content_size > _LARGE_FILE_THRESHOLD_BYTES:
18
+ return True, _LARGE_FILE_CHUNK_SIZE
19
+ return True, _DEFAULT_CHUNK_SIZE
20
+ return False, _DEFAULT_CHUNK_SIZE