kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (99)
  1. kreuzberg/__init__.py +6 -0
  2. kreuzberg/_api/main.py +0 -53
  3. kreuzberg/_config.py +17 -8
  4. kreuzberg/_document_classification.py +1 -1
  5. kreuzberg/_extractors/_base.py +0 -46
  6. kreuzberg/_extractors/_email.py +16 -10
  7. kreuzberg/_extractors/_html.py +39 -12
  8. kreuzberg/_extractors/_pandoc.py +2 -2
  9. kreuzberg/_extractors/_pdf.py +6 -7
  10. kreuzberg/_extractors/_presentation.py +4 -0
  11. kreuzberg/_extractors/_spread_sheet.py +0 -1
  12. kreuzberg/_extractors/_structured.py +83 -15
  13. kreuzberg/_gmft.py +7 -2
  14. kreuzberg/_mcp/server.py +1 -22
  15. kreuzberg/_mime_types.py +1 -1
  16. kreuzberg/_ocr/_easyocr.py +47 -20
  17. kreuzberg/_ocr/_paddleocr.py +1 -1
  18. kreuzberg/_ocr/_tesseract.py +27 -26
  19. kreuzberg/_token_reduction/__init__.py +11 -0
  20. kreuzberg/_token_reduction/_reducer.py +439 -0
  21. kreuzberg/_token_reduction/_stopwords.py +116 -0
  22. kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
  23. kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
  24. kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
  25. kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
  26. kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
  27. kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
  28. kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
  29. kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
  30. kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
  31. kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
  32. kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
  33. kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
  34. kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
  35. kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
  36. kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
  37. kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
  38. kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
  39. kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
  40. kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
  41. kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
  42. kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
  43. kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
  44. kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
  45. kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
  46. kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
  47. kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
  48. kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
  49. kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
  50. kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
  51. kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
  52. kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
  53. kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
  54. kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
  55. kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
  56. kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
  57. kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
  58. kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
  59. kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
  60. kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
  61. kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
  62. kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
  63. kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
  64. kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
  65. kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
  66. kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
  67. kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
  68. kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
  69. kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
  70. kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
  71. kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
  72. kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
  73. kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
  74. kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
  75. kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
  76. kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
  77. kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
  78. kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
  79. kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
  80. kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
  81. kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
  82. kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
  83. kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
  84. kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
  85. kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
  86. kreuzberg/_types.py +146 -43
  87. kreuzberg/_utils/_html_streaming.py +20 -0
  88. kreuzberg/_utils/_image_preprocessing.py +1 -1
  89. kreuzberg/_utils/_ref.py +14 -6
  90. kreuzberg/_utils/_serialization.py +13 -6
  91. kreuzberg/_utils/_sync.py +15 -16
  92. kreuzberg/exceptions.py +0 -1
  93. kreuzberg/extraction.py +27 -11
  94. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
  95. kreuzberg-3.17.0.dist-info/RECORD +128 -0
  96. kreuzberg-3.15.0.dist-info/RECORD +0 -60
  97. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
  98. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
  99. {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_token_reduction/stopwords/zu_stopwords.json ADDED
@@ -0,0 +1,31 @@
+ [
+   "futhi",
+   "kahle",
+   "kakhulu",
+   "kanye",
+   "khona",
+   "kodwa",
+   "kungani",
+   "kusho",
+   "la",
+   "lakhe",
+   "lapho",
+   "mina",
+   "ngesikhathi",
+   "nje",
+   "phansi",
+   "phezulu",
+   "u",
+   "ukuba",
+   "ukuthi",
+   "ukuze",
+   "uma",
+   "wahamba",
+   "wakhe",
+   "wami",
+   "wase",
+   "wathi",
+   "yakhe",
+   "zakhe",
+   "zonke"
+ ]
kreuzberg/_types.py CHANGED
@@ -1,12 +1,13 @@
  from __future__ import annotations

  import sys
- from collections.abc import Awaitable, Callable, Iterable, Mapping
+ from collections.abc import Awaitable, Callable, Mapping
  from dataclasses import asdict, dataclass, field
  from enum import Enum
  from pathlib import Path
  from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict

+ import langcodes
  import msgspec

  from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -591,6 +592,8 @@ class ImagePreprocessingMetadata(NamedTuple):


  class Metadata(TypedDict, total=False):
+     abstract: NotRequired[str]
+     """Document abstract or summary."""
      authors: NotRequired[list[str]]
      """List of document authors."""
      categories: NotRequired[list[str]]
@@ -677,9 +680,28 @@ class Metadata(TypedDict, total=False):
      """Error message if extraction failed."""
      error_context: NotRequired[dict[str, Any]]
      """Error context information for debugging."""
+     json_schema: NotRequired[dict[str, Any]]
+     """JSON schema information extracted from structured data."""
+     notes: NotRequired[list[str]]
+     """Notes or additional information extracted from documents."""
+     note: NotRequired[str]
+     """Single note or annotation."""
+     name: NotRequired[str]
+     """Name field from structured data."""
+     body: NotRequired[str]
+     """Body text content."""
+     text: NotRequired[str]
+     """Generic text content."""
+     message: NotRequired[str]
+     """Message or communication content."""
+     attributes: NotRequired[dict[str, Any]]
+     """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
+     token_reduction: NotRequired[dict[str, float]]
+     """Token reduction statistics including reduction ratios and counts."""


  _VALID_METADATA_KEYS = {
+     "abstract",
      "authors",
      "categories",
      "citations",
@@ -722,6 +744,15 @@ _VALID_METADATA_KEYS = {
      "source_format",
      "error",
      "error_context",
+     "json_schema",
+     "notes",
+     "note",
+     "name",
+     "body",
+     "text",
+     "message",
+     "attributes",
+     "token_reduction",
  }


@@ -730,9 +761,29 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
          return {}

      normalized: Metadata = {}
+     attributes: dict[str, Any] = {}
+
      for key, value in data.items():
-         if key in _VALID_METADATA_KEYS and value is not None:
-             normalized[key] = value  # type: ignore[literal-required]
+         if value is not None:
+             if key in _VALID_METADATA_KEYS:
+                 normalized[key] = value  # type: ignore[literal-required]
+             elif "." in key and key.split(".")[-1] in {
+                 "title",
+                 "name",
+                 "subject",
+                 "description",
+                 "content",
+                 "body",
+                 "text",
+                 "message",
+                 "note",
+                 "abstract",
+                 "summary",
+             }:
+                 attributes[key] = value
+
+     if attributes:
+         normalized["attributes"] = attributes

      return normalized

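The reworked normalize_metadata above now routes dotted keys with a recognized text-field suffix into an "attributes" bucket instead of dropping them. A minimal sketch of the resulting behavior (assuming normalize_metadata is imported from kreuzberg._types, where the diff defines it):

```python
from kreuzberg._types import normalize_metadata

raw = {
    "authors": ["Jane Doe"],      # known key: copied through unchanged
    "custom.title": "Q3 Report",  # dotted key with recognized suffix: collected
    "internal_id": 42,            # unknown key: silently dropped
    "abstract": None,             # None values are always skipped
}

meta = normalize_metadata(raw)
# meta == {"authors": ["Jane Doe"], "attributes": {"custom.title": "Q3 Report"}}
```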
@@ -835,6 +886,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[E
  ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]


+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
+ class JSONExtractionConfig(ConfigDict):
+     extract_schema: bool = False
+     """Extract and include JSON schema information in metadata."""
+     custom_text_field_patterns: frozenset[str] | None = None
+     """Custom patterns to identify text fields beyond default keywords."""
+     max_depth: int = 10
+     """Maximum nesting depth to process in JSON structures."""
+     array_item_limit: int = 1000
+     """Maximum number of array items to process to prevent memory issues."""
+     include_type_info: bool = False
+     """Include data type information in extracted content."""
+     flatten_nested_objects: bool = True
+     """Flatten nested objects using dot notation for better text extraction."""
+
+     def __post_init__(self) -> None:
+         if self.max_depth <= 0:
+             raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
+         if self.array_item_limit <= 0:
+             raise ValidationError(
+                 "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
+             )
+
+
  @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class ExtractionConfig(ConfigDict):
      force_ocr: bool = False
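JSONExtractionConfig is new in this release and validates its limits eagerly in __post_init__. A hedged usage sketch (the import path assumes the class is re-exported from the package root, which the six-line addition to kreuzberg/__init__.py suggests; otherwise import it from kreuzberg._types):

```python
from kreuzberg import ExtractionConfig, JSONExtractionConfig

json_config = JSONExtractionConfig(
    extract_schema=True,   # surface a schema under metadata["json_schema"]
    max_depth=5,           # stop descending after five levels of nesting
    array_item_limit=500,  # bound array traversal to limit memory use
)
config = ExtractionConfig(json_config=json_config)

# Non-positive limits fail fast in __post_init__:
# JSONExtractionConfig(max_depth=0) raises ValidationError("max_depth must be positive")
```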
@@ -924,6 +999,8 @@ class ExtractionConfig(ConfigDict):
      """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
      html_to_markdown_config: HTMLToMarkdownConfig | None = None
      """Configuration for HTML to Markdown conversion. If None, uses default settings."""
+     json_config: JSONExtractionConfig | None = None
+     """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
      use_cache: bool = True
      """Whether to use caching for extraction results. Set to False to disable all caching."""
      target_dpi: int = 150
@@ -936,6 +1013,8 @@ class ExtractionConfig(ConfigDict):
      """Minimum DPI threshold when auto-adjusting DPI."""
      max_dpi: int = 600
      """Maximum DPI threshold when auto-adjusting DPI."""
+     token_reduction: TokenReductionConfig | None = None
+     """Configuration for token reduction to optimize output size while preserving meaning."""

      def __post_init__(self) -> None:
          if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -1060,71 +1139,95 @@

  @dataclass(unsafe_hash=True, frozen=True, slots=True)
  class HTMLToMarkdownConfig:
-     stream_processing: bool = False
-     """Enable streaming mode for processing large HTML documents."""
-     chunk_size: int = 1024
-     """Size of chunks when stream_processing is enabled."""
-     chunk_callback: Callable[[str], None] | None = None
-     """Callback function invoked for each chunk during stream processing."""
-     progress_callback: Callable[[int, int], None] | None = None
-     """Callback function for progress updates (current, total)."""
-     parser: str | None = "lxml"
-     """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
      autolinks: bool = True
-     """Convert URLs to clickable links automatically."""
+     """Automatically convert valid URLs to Markdown links."""
+     br_in_tables: bool = False
+     """Use <br> tags for line breaks in table cells instead of spaces."""
      bullets: str = "*+-"
      """Characters to use for unordered list bullets."""
      code_language: str = ""
-     """Default language for code blocks."""
+     """Default language identifier for fenced code blocks."""
      code_language_callback: Callable[[Any], str] | None = None
-     """Callback to determine code language dynamically."""
-     convert: str | Iterable[str] | None = None
-     """HTML tags to convert. If None, all supported tags are converted."""
+     """Function to dynamically determine code block language."""
+     convert: list[str] | None = None
+     """List of HTML tags to convert (None = all supported tags)."""
      convert_as_inline: bool = False
-     """Convert block elements as inline elements."""
-     custom_converters: Mapping[Any, Any] | None = None
-     """Custom converters for specific HTML elements."""
+     """Treat content as inline elements only."""
+     custom_converters: Mapping[str, Callable[..., str]] | None = None
+     """Mapping of HTML tag names to custom converter functions."""
      default_title: bool = False
-     """Use a default title if none is found."""
-     escape_asterisks: bool = True
-     """Escape asterisks in text to prevent unintended emphasis."""
-     escape_misc: bool = True
-     """Escape miscellaneous characters that have special meaning in Markdown."""
-     escape_underscores: bool = True
-     """Escape underscores in text to prevent unintended emphasis."""
+     """Use default titles for elements like links."""
+     escape_asterisks: bool = False
+     """Escape * characters to prevent unintended formatting."""
+     escape_misc: bool = False
+     """Escape miscellaneous characters to prevent Markdown conflicts."""
+     escape_underscores: bool = False
+     """Escape _ characters to prevent unintended formatting."""
      extract_metadata: bool = True
-     """Extract metadata from HTML head section."""
+     """Extract document metadata as comment header."""
      heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
      """Style for markdown headings."""
      highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
      """Style for highlighting text."""
-     keep_inline_images_in: Iterable[str] | None = None
-     """HTML tags where inline images should be preserved."""
+     keep_inline_images_in: list[str] | None = None
+     """Tags where inline images should be preserved."""
+     list_indent_type: Literal["spaces", "tabs"] = "spaces"
+     """Type of indentation to use for lists."""
+     list_indent_width: int = 4
+     """Number of spaces per indentation level (use 2 for Discord/Slack)."""
      newline_style: Literal["spaces", "backslash"] = "spaces"
      """Style for line breaks in markdown."""
-     strip: str | Iterable[str] | None = None
-     """HTML tags to strip completely from output."""
+     preprocess_html: bool = False
+     """Enable HTML preprocessing to clean messy HTML."""
+     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
+     """Preprocessing level for cleaning HTML."""
+     remove_forms: bool = True
+     """Remove form elements during preprocessing."""
+     remove_navigation: bool = True
+     """Remove navigation elements during preprocessing."""
+     strip: list[str] | None = None
+     """List of HTML tags to remove from output."""
      strip_newlines: bool = False
-     """Strip newlines from the output."""
+     """Remove newlines from HTML input before processing."""
      strong_em_symbol: Literal["*", "_"] = "*"
      """Symbol to use for strong/emphasis formatting."""
      sub_symbol: str = ""
      """Symbol to use for subscript text."""
      sup_symbol: str = ""
      """Symbol to use for superscript text."""
+     whitespace_mode: Literal["normalized", "strict"] = "normalized"
+     """Whitespace handling mode."""
      wrap: bool = False
      """Enable text wrapping."""
      wrap_width: int = 80
-     """Width for text wrapping when wrap is True."""
-     preprocess_html: bool = True
-     """Enable HTML preprocessing to clean up the input."""
-     preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
-     """Preprocessing level for cleaning HTML."""
-     remove_navigation: bool = True
-     """Remove navigation elements from HTML."""
-     remove_forms: bool = True
-     """Remove form elements from HTML."""
+     """Width for text wrapping."""

      def to_dict(self) -> dict[str, Any]:
          result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
          return {k: v for k, v in result.items() if v is not None}
+
+
+ @dataclass(unsafe_hash=True, frozen=True, slots=True)
+ class TokenReductionConfig:
+     mode: Literal["off", "light", "moderate"] = "off"
+     preserve_markdown: bool = True
+     custom_stopwords: dict[str, list[str]] | None = field(default=None, compare=False, hash=False)
+     language_hint: str | None = None
+
+     def __post_init__(self) -> None:
+         if self.language_hint:
+             hint = self.language_hint.strip()
+
+             if not hint or len(hint) > 50 or any(c in hint for c in "\x00\r\n\t"):
+                 object.__setattr__(self, "language_hint", None)
+                 return
+
+             try:
+                 normalized = langcodes.standardize_tag(hint)
+
+                 lang = langcodes.Language.get(normalized).language
+
+                 if lang and lang != hint:
+                     object.__setattr__(self, "language_hint", lang)
+             except (ValueError, AttributeError, TypeError):
+                 object.__setattr__(self, "language_hint", None)
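TokenReductionConfig normalizes language_hint in __post_init__ rather than raising: valid BCP-47 tags are reduced to their bare language subtag via langcodes, while malformed or oversized hints are silently cleared. A small sketch of that behavior (same root re-export assumption as above):

```python
from kreuzberg import ExtractionConfig, TokenReductionConfig

# A regional tag is normalized down to its language subtag.
cfg = TokenReductionConfig(mode="moderate", language_hint="en-US")
assert cfg.language_hint == "en"

# Hints containing control characters are discarded, not rejected with an error.
bad = TokenReductionConfig(mode="light", language_hint="not\ta\ttag")
assert bad.language_hint is None

config = ExtractionConfig(token_reduction=cfg)
```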
kreuzberg/_utils/_html_streaming.py ADDED
@@ -0,0 +1,20 @@
+ from __future__ import annotations
+
+ _STREAMING_THRESHOLD_KB = 10
+ _LARGE_FILE_THRESHOLD_MB = 1
+ _DEFAULT_CHUNK_SIZE = 2048
+ _LARGE_FILE_CHUNK_SIZE = 4096
+
+ _STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
+ _LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
+
+
+ def should_use_streaming(content_size: int) -> tuple[bool, int]:
+     if content_size < 0:
+         return False, _DEFAULT_CHUNK_SIZE
+
+     if content_size > _STREAMING_THRESHOLD_BYTES:
+         if content_size > _LARGE_FILE_THRESHOLD_BYTES:
+             return True, _LARGE_FILE_CHUNK_SIZE
+         return True, _DEFAULT_CHUNK_SIZE
+     return False, _DEFAULT_CHUNK_SIZE
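The new helper returns a (use_streaming, chunk_size) pair from fixed thresholds: streaming starts above 10 KB, and inputs above 1 MB get the larger 4096-byte chunks. Illustrative calls (a sketch against the private module shown above):

```python
from kreuzberg._utils._html_streaming import should_use_streaming

assert should_use_streaming(4 * 1024) == (False, 2048)        # below 10 KB: no streaming
assert should_use_streaming(200 * 1024) == (True, 2048)       # 10 KB to 1 MB: default chunks
assert should_use_streaming(5 * 1024 * 1024) == (True, 4096)  # above 1 MB: larger chunks
assert should_use_streaming(-1) == (False, 2048)              # negative sizes rejected
```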
kreuzberg/_utils/_image_preprocessing.py CHANGED
@@ -198,7 +198,7 @@ def normalize_image_dpi(
              calculated_dpi=calculated_dpi,
          )

-     except OSError as e:
+     except OSError as e:  # pragma: no cover
          return image, ImagePreprocessingMetadata(
              original_dimensions=(original_width, original_height),
              original_dpi=original_dpi,
kreuzberg/_utils/_ref.py CHANGED
@@ -1,5 +1,6 @@
  from __future__ import annotations

+ import threading
  from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast

  if TYPE_CHECKING:
@@ -10,23 +11,30 @@ T = TypeVar("T")

  class Ref(Generic[T]):
      _instances: ClassVar[dict[str, Any]] = {}
+     _lock: ClassVar[threading.Lock] = threading.Lock()

      def __init__(self, name: str, factory: Callable[[], T]) -> None:
          self.name = name
          self.factory = factory

      def get(self) -> T:
-         if self.name not in self._instances:
-             self._instances[self.name] = self.factory()
-         return cast("T", self._instances[self.name])
+         if self.name in self._instances:
+             return cast("T", self._instances[self.name])
+
+         with self._lock:
+             if self.name not in self._instances:
+                 self._instances[self.name] = self.factory()
+             return cast("T", self._instances[self.name])

      def clear(self) -> None:
-         if self.name in self._instances:
-             del self._instances[self.name]
+         with self._lock:
+             if self.name in self._instances:
+                 del self._instances[self.name]

      def is_initialized(self) -> bool:
          return self.name in self._instances

      @classmethod
      def clear_all(cls) -> None:
-         cls._instances.clear()
+         with cls._lock:
+             cls._instances.clear()
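Ref.get now uses double-checked locking: a lock-free fast path when the instance already exists, then a re-check under the class-wide lock before calling the factory. A minimal sketch showing that concurrent first access runs the factory exactly once (Ref lives in a private module, so this only mirrors internal usage):

```python
import threading

from kreuzberg._utils._ref import Ref

calls = {"count": 0}

def make_client() -> dict[str, bool]:
    calls["count"] += 1  # executed under Ref._lock on first access
    return {"connected": True}

client_ref = Ref("client", make_client)

threads = [threading.Thread(target=client_ref.get) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

assert calls["count"] == 1  # the double-checked lock ran the factory once
```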
kreuzberg/_utils/_serialization.py CHANGED
@@ -1,11 +1,10 @@
  from __future__ import annotations

  from dataclasses import is_dataclass
- from typing import Any, TypeVar, cast
+ from typing import Any, TypeVar

  import msgspec
  from msgspec import MsgspecError
- from msgspec.msgpack import decode, encode

  T = TypeVar("T")

@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
      raise TypeError(f"Unsupported type: {type(obj)!r}")


- def deserialize(value: str | bytes, target_type: type[T]) -> T:
+ def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
+     decoder = msgspec.json.decode if json else msgspec.msgpack.decode
+
+     if json:
+         data = value.encode() if isinstance(value, str) else value
+     else:
+         data = value.encode() if isinstance(value, str) else value
+
      try:
-         return decode(cast("bytes", value), type=target_type, strict=False)
+         return decoder(data, type=target_type, strict=False)
      except MsgspecError as e:
          raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e


- def serialize(value: Any, **kwargs: Any) -> bytes:
+ def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
      if isinstance(value, dict) and kwargs:
          value = value | kwargs

+     encoder = msgspec.json.encode if json else msgspec.msgpack.encode
      try:
-         return encode(value, enc_hook=encode_hook)
+         return encoder(value, enc_hook=encode_hook)
      except (MsgspecError, TypeError) as e:
          raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e
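serialize and deserialize keep msgpack as the default wire format but now switch to msgspec's JSON codec when json=True is passed. A sketch of both paths (these are private helpers, shown only to illustrate the new flag):

```python
from kreuzberg._utils._serialization import deserialize, serialize

payload = {"title": "report", "pages": 12}

packed = serialize(payload)  # msgpack bytes, as before
assert deserialize(packed, dict) == payload

as_json = serialize(payload, json=True)  # compact JSON bytes via msgspec.json
assert as_json == b'{"title":"report","pages":12}'
assert deserialize(as_json, dict, json=True) == payload
```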
kreuzberg/_utils/_sync.py CHANGED
@@ -1,19 +1,16 @@
  from __future__ import annotations

- import asyncio
  from functools import partial
  from inspect import isawaitable, iscoroutinefunction
- from typing import TYPE_CHECKING, Any, TypeVar, cast
+ from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar, cast

  import anyio
- from anyio import create_task_group
+ from anyio import CapacityLimiter, create_task_group
  from anyio.to_thread import run_sync as any_io_run_sync

  if TYPE_CHECKING:  # pragma: no cover
      from collections.abc import Awaitable, Callable

-     from typing import ParamSpec
-
  T = TypeVar("T")
  P = ParamSpec("P")

@@ -57,24 +54,26 @@ async def run_taskgroup_batched(
          return []

      if len(async_tasks) <= batch_size or not use_semaphore:
-         results: list[Any] = []
+         batch_results: list[Any] = []
          for i in range(0, len(async_tasks), batch_size):
              batch = async_tasks[i : i + batch_size]
-             results.extend(await run_taskgroup(*batch))
-         return results
+             batch_results.extend(await run_taskgroup(*batch))
+         return batch_results

-     semaphore = asyncio.Semaphore(batch_size)
+     limiter = CapacityLimiter(batch_size)
+     results: list[tuple[int, Any]] = []

-     async def run_with_semaphore(task: Awaitable[Any], index: int) -> tuple[int, Any]:
-         async with semaphore:
+     async def run_with_semaphore(task: Awaitable[Any], index: int) -> None:
+         async with limiter:
              result = await task
-             return (index, result)
+             results.append((index, result))

-     indexed_tasks = [run_with_semaphore(task, i) for i, task in enumerate(async_tasks)]
-     indexed_results = await asyncio.gather(*indexed_tasks)
+     async with create_task_group() as tg:
+         for i, task in enumerate(async_tasks):
+             tg.start_soon(run_with_semaphore, task, i)

-     indexed_results.sort(key=lambda x: x[0])
-     return [result for _, result in indexed_results]
+     results.sort(key=lambda x: x[0])
+     return [result for _, result in results]


  async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
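The batching helper drops asyncio.gather in favor of an anyio task group plus a CapacityLimiter, so concurrency is capped at batch_size while results still come back in submission order. A hedged sketch (the signature is inferred from the hunk above: awaitables passed variadically with batch_size and use_semaphore keywords):

```python
import anyio

from kreuzberg._utils._sync import run_taskgroup_batched

async def double(x: int) -> int:
    await anyio.sleep(0)  # yield to the scheduler
    return x * 2

async def main() -> None:
    # More than batch_size tasks with use_semaphore=True exercises the
    # CapacityLimiter path; ordering is restored by the index sort.
    results = await run_taskgroup_batched(
        *(double(i) for i in range(10)),
        batch_size=3,
        use_semaphore=True,
    )
    assert results == [i * 2 for i in range(10)]

anyio.run(main)
```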
kreuzberg/exceptions.py CHANGED
@@ -17,7 +17,6 @@ class KreuzbergError(Exception):
          super().__init__(message)

      def _serialize_context(self, obj: Any) -> Any:
-         """Recursively serialize context objects to ensure JSON compatibility."""
          if isinstance(obj, bytes):
              return obj.decode("utf-8", errors="replace")
          if isinstance(obj, dict):
kreuzberg/extraction.py CHANGED
@@ -15,6 +15,7 @@ from kreuzberg._mime_types import (
      validate_mime_type,
  )
  from kreuzberg._registry import ExtractorRegistry
+ from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
  from kreuzberg._types import ExtractionConfig, ExtractionResult
  from kreuzberg._utils._document_cache import get_document_cache
  from kreuzberg._utils._errors import create_error_context
@@ -31,15 +32,6 @@ DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()


  async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult | None:
-     """Handle cache lookup and coordination with other processes.
-
-     Args:
-         path: Path to the file being processed
-         config: Extraction configuration
-
-     Returns:
-         Cached result if available, None otherwise
-     """
      cache = get_document_cache()

      cached_result = cache.get(path, config)
@@ -47,7 +39,7 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> Extractio
          return cached_result

      if cache.is_processing(path, config):
-         event = cache.mark_processing(path, config)
+         event = cache.mark_processing(path, config)  # pragma: no cover
          await anyio.to_thread.run_sync(event.wait)  # pragma: no cover

      return cache.get(path, config)  # pragma: no cover
@@ -92,6 +84,30 @@ def _validate_and_post_process_helper(
      if config.auto_detect_document_type:
          result = auto_detect_document_type(result, config, file_path=file_path)

+     if config.token_reduction is not None and config.token_reduction.mode != "off":
+         original_content = result.content
+
+         language_hint = None
+         if result.detected_languages and len(result.detected_languages) > 0:
+             language_hint = result.detected_languages[0]
+
+         reduced_content = reduce_tokens(
+             original_content,
+             config=config.token_reduction,
+             language=language_hint,
+         )
+         reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+         result.content = reduced_content
+         result.metadata["token_reduction"] = {
+             "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+             "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+             "original_characters": reduction_stats["original_characters"],
+             "reduced_characters": reduction_stats["reduced_characters"],
+             "original_tokens": reduction_stats["original_tokens"],
+             "reduced_tokens": reduction_stats["reduced_tokens"],
+         }
+
      return result

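End to end, enabling token reduction rewrites result.content and records the statistics dictionary built above under metadata["token_reduction"]. A sketch with a hypothetical input file:

```python
from kreuzberg import ExtractionConfig, TokenReductionConfig, extract_file_sync

config = ExtractionConfig(token_reduction=TokenReductionConfig(mode="moderate"))
result = extract_file_sync("report.pdf", config=config)  # "report.pdf" is illustrative

stats = result.metadata["token_reduction"]
print(f"tokens: {stats['original_tokens']} -> {stats['reduced_tokens']}")
print(f"token reduction ratio: {stats['token_reduction_ratio']:.2%}")
```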
@@ -362,7 +378,7 @@ def extract_file_sync(
          return cached_result

      if cache.is_processing(path, config):
-         event = cache.mark_processing(path, config)
+         event = cache.mark_processing(path, config)  # pragma: no cover
          event.wait()  # pragma: no cover

          # Try cache again after waiting for other process to complete  # ~keep
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA CHANGED
@@ -1,6 +1,6 @@
  Metadata-Version: 2.4
  Name: kreuzberg
- Version: 3.15.0
+ Version: 3.17.0
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
  Project-URL: documentation, https://kreuzberg.dev
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,8 @@ Requires-Python: >=3.10
  Requires-Dist: anyio>=4.10.0
  Requires-Dist: chardetng-py>=0.3.5
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
- Requires-Dist: html-to-markdown[lxml]>=1.11.0
+ Requires-Dist: html-to-markdown[lxml]>=1.13.0
+ Requires-Dist: langcodes>=3.5.0
  Requires-Dist: mcp>=1.14.0
  Requires-Dist: msgspec>=0.18.0
  Requires-Dist: numpy>=2.0.0
@@ -49,7 +50,7 @@ Provides-Extra: all
  Requires-Dist: click>=8.2.1; extra == 'all'
  Requires-Dist: deep-translator>=1.11.4; extra == 'all'
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
- Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
+ Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
  Requires-Dist: gmft>=0.4.2; extra == 'all'
  Requires-Dist: keybert>=0.9.0; extra == 'all'
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
@@ -82,7 +83,7 @@ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
  Provides-Extra: gmft
  Requires-Dist: gmft>=0.4.2; extra == 'gmft'
  Provides-Extra: langdetect
- Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
+ Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
  Provides-Extra: paddleocr
  Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
  Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
@@ -109,7 +110,7 @@ Description-Content-Type: text/markdown
  - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
  - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
  - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
- - **Format Support**: 18 document types including PDF, Microsoft Office, images, HTML, and structured data formats
+ - **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
  - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
  - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)

@@ -227,14 +228,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp

  ## Supported Formats

- | Category          | Formats                        |
- | ----------------- | ------------------------------ |
- | **Documents**     | PDF, DOCX, DOC, RTF, TXT, EPUB |
- | **Images**        | JPG, PNG, TIFF, BMP, GIF, WEBP |
- | **Spreadsheets**  | XLSX, XLS, CSV, ODS            |
- | **Presentations** | PPTX, PPT, ODP                 |
- | **Web**           | HTML, XML, MHTML               |
- | **Archives**      | Support via extraction         |
+ | Category            | Formats                        |
+ | ------------------- | ------------------------------ |
+ | **Documents**       | PDF, DOCX, DOC, RTF, TXT, EPUB |
+ | **Images**          | JPG, PNG, TIFF, BMP, GIF, WEBP |
+ | **Spreadsheets**    | XLSX, XLS, CSV, ODS            |
+ | **Presentations**   | PPTX, PPT, ODP                 |
+ | **Web**             | HTML, XML, MHTML               |
+ | **Structured Data** | JSON, YAML, TOML               |
+ | **Archives**        | Support via extraction         |

  ## 📊 Performance Characteristics