kreuzberg 3.15.0__py3-none-any.whl → 3.17.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +6 -0
- kreuzberg/_api/main.py +0 -53
- kreuzberg/_config.py +17 -8
- kreuzberg/_document_classification.py +1 -1
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_email.py +16 -10
- kreuzberg/_extractors/_html.py +39 -12
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +6 -7
- kreuzberg/_extractors/_presentation.py +4 -0
- kreuzberg/_extractors/_spread_sheet.py +0 -1
- kreuzberg/_extractors/_structured.py +83 -15
- kreuzberg/_gmft.py +7 -2
- kreuzberg/_mcp/server.py +1 -22
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +47 -20
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +27 -26
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +146 -43
- kreuzberg/_utils/_html_streaming.py +20 -0
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/_utils/_serialization.py +13 -6
- kreuzberg/_utils/_sync.py +15 -16
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +27 -11
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA +15 -13
- kreuzberg-3.17.0.dist-info/RECORD +128 -0
- kreuzberg-3.15.0.dist-info/RECORD +0 -60
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_token_reduction/stopwords/zu_stopwords.json
ADDED
@@ -0,0 +1,31 @@
+[
+  "futhi",
+  "kahle",
+  "kakhulu",
+  "kanye",
+  "khona",
+  "kodwa",
+  "kungani",
+  "kusho",
+  "la",
+  "lakhe",
+  "lapho",
+  "mina",
+  "ngesikhathi",
+  "nje",
+  "phansi",
+  "phezulu",
+  "u",
+  "ukuba",
+  "ukuthi",
+  "ukuze",
+  "uma",
+  "wahamba",
+  "wakhe",
+  "wami",
+  "wase",
+  "wathi",
+  "yakhe",
+  "zakhe",
+  "zonke"
+]
kreuzberg/_types.py
CHANGED
@@ -1,12 +1,13 @@
 from __future__ import annotations

 import sys
-from collections.abc import Awaitable, Callable,
+from collections.abc import Awaitable, Callable, Mapping
 from dataclasses import asdict, dataclass, field
 from enum import Enum
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict

+import langcodes
 import msgspec

 from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
@@ -591,6 +592,8 @@ class ImagePreprocessingMetadata(NamedTuple):


 class Metadata(TypedDict, total=False):
+    abstract: NotRequired[str]
+    """Document abstract or summary."""
     authors: NotRequired[list[str]]
     """List of document authors."""
     categories: NotRequired[list[str]]
@@ -677,9 +680,28 @@ class Metadata(TypedDict, total=False):
     """Error message if extraction failed."""
     error_context: NotRequired[dict[str, Any]]
     """Error context information for debugging."""
+    json_schema: NotRequired[dict[str, Any]]
+    """JSON schema information extracted from structured data."""
+    notes: NotRequired[list[str]]
+    """Notes or additional information extracted from documents."""
+    note: NotRequired[str]
+    """Single note or annotation."""
+    name: NotRequired[str]
+    """Name field from structured data."""
+    body: NotRequired[str]
+    """Body text content."""
+    text: NotRequired[str]
+    """Generic text content."""
+    message: NotRequired[str]
+    """Message or communication content."""
+    attributes: NotRequired[dict[str, Any]]
+    """Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
+    token_reduction: NotRequired[dict[str, float]]
+    """Token reduction statistics including reduction ratios and counts."""


 _VALID_METADATA_KEYS = {
+    "abstract",
     "authors",
     "categories",
     "citations",
@@ -722,6 +744,15 @@ _VALID_METADATA_KEYS = {
     "source_format",
     "error",
     "error_context",
+    "json_schema",
+    "notes",
+    "note",
+    "name",
+    "body",
+    "text",
+    "message",
+    "attributes",
+    "token_reduction",
 }

@@ -730,9 +761,29 @@ def normalize_metadata(data: dict[str, Any] | None) -> Metadata:
         return {}

     normalized: Metadata = {}
+    attributes: dict[str, Any] = {}
+
     for key, value in data.items():
-        if
-
+        if value is not None:
+            if key in _VALID_METADATA_KEYS:
+                normalized[key] = value  # type: ignore[literal-required]
+            elif "." in key and key.split(".")[-1] in {
+                "title",
+                "name",
+                "subject",
+                "description",
+                "content",
+                "body",
+                "text",
+                "message",
+                "note",
+                "abstract",
+                "summary",
+            }:
+                attributes[key] = value
+
+    if attributes:
+        normalized["attributes"] = attributes

     return normalized
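
For orientation, a small sketch of what the reworked `normalize_metadata` now does with a raw metadata dict. The input keys and values below are invented for illustration, and the import is from the private `kreuzberg._types` module:

```python
from kreuzberg._types import normalize_metadata

# Hypothetical raw metadata as an extractor might emit it.
raw = {
    "authors": ["Jane Doe"],      # whitelisted key: copied through
    "custom.title": "Q3 Report",  # dotted key ending in a text-like field: collected
    "internal_id": "abc123",      # dropped (assuming it is not a whitelisted key)
    "subject": None,              # None values are always skipped
}

print(normalize_metadata(raw))
# {'authors': ['Jane Doe'], 'attributes': {'custom.title': 'Q3 Report'}}
```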
@@ -835,6 +886,30 @@ PostProcessingHook = Callable[[ExtractionResult], ExtractionResult | Awaitable[ExtractionResult]]
 ValidationHook = Callable[[ExtractionResult], None | Awaitable[None]]


+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class JSONExtractionConfig(ConfigDict):
+    extract_schema: bool = False
+    """Extract and include JSON schema information in metadata."""
+    custom_text_field_patterns: frozenset[str] | None = None
+    """Custom patterns to identify text fields beyond default keywords."""
+    max_depth: int = 10
+    """Maximum nesting depth to process in JSON structures."""
+    array_item_limit: int = 1000
+    """Maximum number of array items to process to prevent memory issues."""
+    include_type_info: bool = False
+    """Include data type information in extracted content."""
+    flatten_nested_objects: bool = True
+    """Flatten nested objects using dot notation for better text extraction."""
+
+    def __post_init__(self) -> None:
+        if self.max_depth <= 0:
+            raise ValidationError("max_depth must be positive", context={"max_depth": self.max_depth})
+        if self.array_item_limit <= 0:
+            raise ValidationError(
+                "array_item_limit must be positive", context={"array_item_limit": self.array_item_limit}
+            )
+
+
 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class ExtractionConfig(ConfigDict):
     force_ocr: bool = False
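
A minimal usage sketch for the new `JSONExtractionConfig`, assuming imports from the private `kreuzberg._types` module (the class may also be re-exported at the package root, given the `kreuzberg/__init__.py` additions); `ValidationError` is kreuzberg's own exception type:

```python
from kreuzberg._types import ExtractionConfig, JSONExtractionConfig
from kreuzberg.exceptions import ValidationError

json_config = JSONExtractionConfig(
    extract_schema=True,          # adds a "json_schema" entry to result metadata
    flatten_nested_objects=True,  # nested objects become dotted keys
    max_depth=5,
)
config = ExtractionConfig(json_config=json_config)

# __post_init__ validates bounds eagerly, at construction time:
try:
    JSONExtractionConfig(max_depth=0)
except ValidationError as exc:
    print(exc)  # max_depth must be positive
```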
@@ -924,6 +999,8 @@
     """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
     html_to_markdown_config: HTMLToMarkdownConfig | None = None
     """Configuration for HTML to Markdown conversion. If None, uses default settings."""
+    json_config: JSONExtractionConfig | None = None
+    """Configuration for enhanced JSON extraction features. If None, uses standard JSON processing."""
     use_cache: bool = True
     """Whether to use caching for extraction results. Set to False to disable all caching."""
     target_dpi: int = 150
@@ -936,6 +1013,8 @@
     """Minimum DPI threshold when auto-adjusting DPI."""
     max_dpi: int = 600
     """Maximum DPI threshold when auto-adjusting DPI."""
+    token_reduction: TokenReductionConfig | None = None
+    """Configuration for token reduction to optimize output size while preserving meaning."""

     def __post_init__(self) -> None:
         if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -1060,71 +1139,95 @@

 @dataclass(unsafe_hash=True, frozen=True, slots=True)
 class HTMLToMarkdownConfig:
-    stream_processing: bool = False
-    """Enable streaming mode for processing large HTML documents."""
-    chunk_size: int = 1024
-    """Size of chunks when stream_processing is enabled."""
-    chunk_callback: Callable[[str], None] | None = None
-    """Callback function invoked for each chunk during stream processing."""
-    progress_callback: Callable[[int, int], None] | None = None
-    """Callback function for progress updates (current, total)."""
-    parser: str | None = "lxml"
-    """BeautifulSoup parser to use. Defaults to 'lxml' for ~30% better performance. Falls back to 'html.parser' if lxml not available."""
     autolinks: bool = True
-    """
+    """Automatically convert valid URLs to Markdown links."""
+    br_in_tables: bool = False
+    """Use <br> tags for line breaks in table cells instead of spaces."""
     bullets: str = "*+-"
     """Characters to use for unordered list bullets."""
     code_language: str = ""
-    """Default language for code blocks."""
+    """Default language identifier for fenced code blocks."""
     code_language_callback: Callable[[Any], str] | None = None
-    """
-    convert:
-    """HTML tags to convert
+    """Function to dynamically determine code block language."""
+    convert: list[str] | None = None
+    """List of HTML tags to convert (None = all supported tags)."""
     convert_as_inline: bool = False
-    """
-    custom_converters: Mapping[
-    """
+    """Treat content as inline elements only."""
+    custom_converters: Mapping[str, Callable[..., str]] | None = None
+    """Mapping of HTML tag names to custom converter functions."""
     default_title: bool = False
-    """Use
-    escape_asterisks: bool =
-    """Escape
-    escape_misc: bool =
-    """Escape miscellaneous characters
-    escape_underscores: bool =
-    """Escape
+    """Use default titles for elements like links."""
+    escape_asterisks: bool = False
+    """Escape * characters to prevent unintended formatting."""
+    escape_misc: bool = False
+    """Escape miscellaneous characters to prevent Markdown conflicts."""
+    escape_underscores: bool = False
+    """Escape _ characters to prevent unintended formatting."""
     extract_metadata: bool = True
-    """Extract metadata
+    """Extract document metadata as comment header."""
     heading_style: Literal["underlined", "atx", "atx_closed"] = "underlined"
     """Style for markdown headings."""
     highlight_style: Literal["double-equal", "html", "bold"] = "double-equal"
     """Style for highlighting text."""
-    keep_inline_images_in:
-    """
+    keep_inline_images_in: list[str] | None = None
+    """Tags where inline images should be preserved."""
+    list_indent_type: Literal["spaces", "tabs"] = "spaces"
+    """Type of indentation to use for lists."""
+    list_indent_width: int = 4
+    """Number of spaces per indentation level (use 2 for Discord/Slack)."""
     newline_style: Literal["spaces", "backslash"] = "spaces"
     """Style for line breaks in markdown."""
-
-    """HTML
+    preprocess_html: bool = False
+    """Enable HTML preprocessing to clean messy HTML."""
+    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "standard"
+    """Preprocessing level for cleaning HTML."""
+    remove_forms: bool = True
+    """Remove form elements during preprocessing."""
+    remove_navigation: bool = True
+    """Remove navigation elements during preprocessing."""
+    strip: list[str] | None = None
+    """List of HTML tags to remove from output."""
     strip_newlines: bool = False
-    """
+    """Remove newlines from HTML input before processing."""
     strong_em_symbol: Literal["*", "_"] = "*"
     """Symbol to use for strong/emphasis formatting."""
     sub_symbol: str = ""
     """Symbol to use for subscript text."""
     sup_symbol: str = ""
     """Symbol to use for superscript text."""
+    whitespace_mode: Literal["normalized", "strict"] = "normalized"
+    """Whitespace handling mode."""
     wrap: bool = False
     """Enable text wrapping."""
     wrap_width: int = 80
-    """Width for text wrapping
-    preprocess_html: bool = True
-    """Enable HTML preprocessing to clean up the input."""
-    preprocessing_preset: Literal["minimal", "standard", "aggressive"] = "aggressive"
-    """Preprocessing level for cleaning HTML."""
-    remove_navigation: bool = True
-    """Remove navigation elements from HTML."""
-    remove_forms: bool = True
-    """Remove form elements from HTML."""
+    """Width for text wrapping."""

     def to_dict(self) -> dict[str, Any]:
         result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
         return {k: v for k, v in result.items() if v is not None}
+
+
+@dataclass(unsafe_hash=True, frozen=True, slots=True)
+class TokenReductionConfig:
+    mode: Literal["off", "light", "moderate"] = "off"
+    preserve_markdown: bool = True
+    custom_stopwords: dict[str, list[str]] | None = field(default=None, compare=False, hash=False)
+    language_hint: str | None = None
+
+    def __post_init__(self) -> None:
+        if self.language_hint:
+            hint = self.language_hint.strip()
+
+            if not hint or len(hint) > 50 or any(c in hint for c in "\x00\r\n\t"):
+                object.__setattr__(self, "language_hint", None)
+                return
+
+            try:
+                normalized = langcodes.standardize_tag(hint)

+                lang = langcodes.Language.get(normalized).language
+
+                if lang and lang != hint:
+                    object.__setattr__(self, "language_hint", lang)
+            except (ValueError, AttributeError, TypeError):
+                object.__setattr__(self, "language_hint", None)
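
One detail worth calling out from the hunk above: `TokenReductionConfig.__post_init__` sanitizes `language_hint` instead of raising. A short sketch of the observable behavior (the values are illustrative):

```python
from kreuzberg._types import TokenReductionConfig

# A BCP-47 tag is normalized to its bare language subtag via langcodes.
cfg = TokenReductionConfig(mode="moderate", language_hint="en-US")
print(cfg.language_hint)  # "en"

# Hints containing control characters (or longer than 50 chars)
# are silently reset to None rather than raising.
cfg = TokenReductionConfig(language_hint="bad\x00tag")
print(cfg.language_hint)  # None
```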
kreuzberg/_utils/_html_streaming.py
ADDED
@@ -0,0 +1,20 @@
+from __future__ import annotations
+
+_STREAMING_THRESHOLD_KB = 10
+_LARGE_FILE_THRESHOLD_MB = 1
+_DEFAULT_CHUNK_SIZE = 2048
+_LARGE_FILE_CHUNK_SIZE = 4096
+
+_STREAMING_THRESHOLD_BYTES = _STREAMING_THRESHOLD_KB * 1024
+_LARGE_FILE_THRESHOLD_BYTES = _LARGE_FILE_THRESHOLD_MB * 1024 * 1024
+
+
+def should_use_streaming(content_size: int) -> tuple[bool, int]:
+    if content_size < 0:
+        return False, _DEFAULT_CHUNK_SIZE
+
+    if content_size > _STREAMING_THRESHOLD_BYTES:
+        if content_size > _LARGE_FILE_THRESHOLD_BYTES:
+            return True, _LARGE_FILE_CHUNK_SIZE
+        return True, _DEFAULT_CHUNK_SIZE
+    return False, _DEFAULT_CHUNK_SIZE
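
The thresholds above give a simple decision rule; a quick sketch of the resulting behavior (the module is private, so this import path is internal):

```python
from kreuzberg._utils._html_streaming import should_use_streaming

print(should_use_streaming(4 * 1024))         # (False, 2048): under the 10 KB threshold
print(should_use_streaming(200 * 1024))       # (True, 2048):  streams with default chunks
print(should_use_streaming(5 * 1024 * 1024))  # (True, 4096):  above 1 MB, larger chunks
```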
kreuzberg/_utils/_image_preprocessing.py
CHANGED
@@ -198,7 +198,7 @@ def normalize_image_dpi(
            calculated_dpi=calculated_dpi,
        )

-    except OSError as e:
+    except OSError as e:  # pragma: no cover
        return image, ImagePreprocessingMetadata(
            original_dimensions=(original_width, original_height),
            original_dpi=original_dpi,
kreuzberg/_utils/_ref.py
CHANGED
@@ -1,5 +1,6 @@
 from __future__ import annotations

+import threading
 from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast

 if TYPE_CHECKING:
@@ -10,23 +11,30 @@ T = TypeVar("T")

 class Ref(Generic[T]):
     _instances: ClassVar[dict[str, Any]] = {}
+    _lock: ClassVar[threading.Lock] = threading.Lock()

     def __init__(self, name: str, factory: Callable[[], T]) -> None:
         self.name = name
         self.factory = factory

     def get(self) -> T:
-        if self.name
-        self._instances[self.name]
-
+        if self.name in self._instances:
+            return cast("T", self._instances[self.name])
+
+        with self._lock:
+            if self.name not in self._instances:
+                self._instances[self.name] = self.factory()
+            return cast("T", self._instances[self.name])

     def clear(self) -> None:
-
+        with self._lock:
+            if self.name in self._instances:
+                del self._instances[self.name]

     def is_initialized(self) -> bool:
         return self.name in self._instances

     @classmethod
     def clear_all(cls) -> None:
-        cls.
+        with cls._lock:
+            cls._instances.clear()
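
The change above turns `Ref` into a thread-safe lazy singleton: `get()` takes a lock-free fast path once the instance exists, and otherwise re-checks under the class-level lock (double-checked locking). A minimal sketch with a hypothetical factory:

```python
import threading

from kreuzberg._utils._ref import Ref

def build_engine() -> dict[str, bool]:
    # Stand-in for an expensive, share-once resource (hypothetical).
    return {"ready": True}

engine_ref: Ref[dict[str, bool]] = Ref("engine", build_engine)

# Concurrent first access runs the factory at most once.
threads = [threading.Thread(target=engine_ref.get) for _ in range(8)]
for t in threads:
    t.start()
for t in threads:
    t.join()

assert engine_ref.is_initialized()
engine_ref.clear()  # also lock-protected now
```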
kreuzberg/_utils/_serialization.py
CHANGED
@@ -1,11 +1,10 @@
 from __future__ import annotations

 from dataclasses import is_dataclass
-from typing import Any, TypeVar
+from typing import Any, TypeVar

 import msgspec
 from msgspec import MsgspecError
-from msgspec.msgpack import decode, encode

 T = TypeVar("T")

@@ -42,18 +41,26 @@ def encode_hook(obj: Any) -> Any:
     raise TypeError(f"Unsupported type: {type(obj)!r}")


-def deserialize(value: str | bytes, target_type: type[T]) -> T:
+def deserialize(value: str | bytes, target_type: type[T], json: bool = False) -> T:
+    decoder = msgspec.json.decode if json else msgspec.msgpack.decode
+
+    if json:
+        data = value.encode() if isinstance(value, str) else value
+    else:
+        data = value.encode() if isinstance(value, str) else value
+
     try:
-        return
+        return decoder(data, type=target_type, strict=False)
     except MsgspecError as e:
         raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e


-def serialize(value: Any, **kwargs: Any) -> bytes:
+def serialize(value: Any, json: bool = False, **kwargs: Any) -> bytes:
     if isinstance(value, dict) and kwargs:
         value = value | kwargs

+    encoder = msgspec.json.encode if json else msgspec.msgpack.encode
     try:
-        return
+        return encoder(value, enc_hook=encode_hook)
     except (MsgspecError, TypeError) as e:
         raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e
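
Both helpers now accept a `json` flag that swaps msgpack for `msgspec.json` while keeping the same hooks and error wrapping. A round-trip sketch:

```python
from kreuzberg._utils._serialization import deserialize, serialize

payload = {"title": "Report", "pages": 3}

blob = serialize(payload, json=True)           # msgspec.json.encode under the hood
print(blob)                                    # b'{"title":"Report","pages":3}'

restored = deserialize(blob, dict, json=True)  # msgspec.json.decode, strict=False
assert restored == payload

packed = serialize(payload)                    # default path is still msgpack
assert deserialize(packed, dict) == payload
```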
kreuzberg/_utils/_sync.py
CHANGED
@@ -1,19 +1,16 @@
 from __future__ import annotations

-import asyncio
 from functools import partial
 from inspect import isawaitable, iscoroutinefunction
-from typing import TYPE_CHECKING, Any, TypeVar, cast
+from typing import TYPE_CHECKING, Any, ParamSpec, TypeVar, cast

 import anyio
-from anyio import create_task_group
+from anyio import CapacityLimiter, create_task_group
 from anyio.to_thread import run_sync as any_io_run_sync

 if TYPE_CHECKING:  # pragma: no cover
     from collections.abc import Awaitable, Callable

-    from typing import ParamSpec
-
 T = TypeVar("T")
 P = ParamSpec("P")

@@ -57,24 +54,26 @@ async def run_taskgroup_batched(
         return []

     if len(async_tasks) <= batch_size or not use_semaphore:
-
+        batch_results: list[Any] = []
         for i in range(0, len(async_tasks), batch_size):
             batch = async_tasks[i : i + batch_size]
-
-        return
+            batch_results.extend(await run_taskgroup(*batch))
+        return batch_results

-
+    limiter = CapacityLimiter(batch_size)
+    results: list[tuple[int, Any]] = []

-    async def run_with_semaphore(task: Awaitable[Any], index: int) ->
-        async with
+    async def run_with_semaphore(task: Awaitable[Any], index: int) -> None:
+        async with limiter:
             result = await task
-
+            results.append((index, result))

-
-
+    async with create_task_group() as tg:
+        for i, task in enumerate(async_tasks):
+            tg.start_soon(run_with_semaphore, task, i)

-
-    return [result for _, result in
+    results.sort(key=lambda x: x[0])
+    return [result for _, result in results]


 async def run_maybe_sync(fn: Callable[P, T | Awaitable[T]], *args: P.args, **kwargs: P.kwargs) -> T:
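
With `use_semaphore=True`, the rewrite bounds concurrency with an `anyio.CapacityLimiter` and restores submission order by sorting on the captured index. A usage sketch; the exact signature of `run_taskgroup_batched` is inferred from this hunk:

```python
import anyio

from kreuzberg._utils._sync import run_taskgroup_batched

async def square(n: int) -> int:
    await anyio.sleep(0)  # simulate async work
    return n * n

async def main() -> None:
    tasks = [square(n) for n in range(10)]
    # At most batch_size tasks run concurrently; results come back in order.
    results = await run_taskgroup_batched(*tasks, batch_size=3, use_semaphore=True)
    print(results)  # [0, 1, 4, 9, 16, 25, 36, 49, 64, 81]

anyio.run(main)
```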
kreuzberg/exceptions.py
CHANGED
@@ -17,7 +17,6 @@ class KreuzbergError(Exception):
         super().__init__(message)

     def _serialize_context(self, obj: Any) -> Any:
-        """Recursively serialize context objects to ensure JSON compatibility."""
         if isinstance(obj, bytes):
             return obj.decode("utf-8", errors="replace")
         if isinstance(obj, dict):
kreuzberg/extraction.py
CHANGED
@@ -15,6 +15,7 @@ from kreuzberg._mime_types import (
     validate_mime_type,
 )
 from kreuzberg._registry import ExtractorRegistry
+from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
 from kreuzberg._types import ExtractionConfig, ExtractionResult
 from kreuzberg._utils._document_cache import get_document_cache
 from kreuzberg._utils._errors import create_error_context
@@ -31,15 +32,6 @@ DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()


 async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult | None:
-    """Handle cache lookup and coordination with other processes.
-
-    Args:
-        path: Path to the file being processed
-        config: Extraction configuration
-
-    Returns:
-        Cached result if available, None otherwise
-    """
     cache = get_document_cache()

     cached_result = cache.get(path, config)
@@ -47,7 +39,7 @@ async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult | None:
         return cached_result

     if cache.is_processing(path, config):
-        event = cache.mark_processing(path, config)
+        event = cache.mark_processing(path, config)  # pragma: no cover
         await anyio.to_thread.run_sync(event.wait)  # pragma: no cover

     return cache.get(path, config)  # pragma: no cover
@@ -92,6 +84,30 @@ def _validate_and_post_process_helper(
     if config.auto_detect_document_type:
         result = auto_detect_document_type(result, config, file_path=file_path)

+    if config.token_reduction is not None and config.token_reduction.mode != "off":
+        original_content = result.content
+
+        language_hint = None
+        if result.detected_languages and len(result.detected_languages) > 0:
+            language_hint = result.detected_languages[0]
+
+        reduced_content = reduce_tokens(
+            original_content,
+            config=config.token_reduction,
+            language=language_hint,
+        )
+        reduction_stats = get_reduction_stats(original_content, reduced_content)
+
+        result.content = reduced_content
+        result.metadata["token_reduction"] = {
+            "character_reduction_ratio": reduction_stats["character_reduction_ratio"],
+            "token_reduction_ratio": reduction_stats["token_reduction_ratio"],
+            "original_characters": reduction_stats["original_characters"],
+            "reduced_characters": reduction_stats["reduced_characters"],
+            "original_tokens": reduction_stats["original_tokens"],
+            "reduced_tokens": reduction_stats["reduced_tokens"],
+        }
+
     return result


@@ -362,7 +378,7 @@ def extract_file_sync(
         return cached_result

     if cache.is_processing(path, config):
-        event = cache.mark_processing(path, config)
+        event = cache.mark_processing(path, config)  # pragma: no cover
         event.wait()  # pragma: no cover

     # Try cache again after waiting for other process to complete  # ~keep
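
End to end, the new post-processing step means token reduction can be enabled per extraction, with statistics landing under `metadata["token_reduction"]`. A sketch (the file path is illustrative):

```python
from kreuzberg import ExtractionConfig, extract_file_sync
from kreuzberg._types import TokenReductionConfig

config = ExtractionConfig(
    token_reduction=TokenReductionConfig(mode="moderate", language_hint="en"),
)

result = extract_file_sync("report.pdf", config=config)  # illustrative path

stats = result.metadata["token_reduction"]
print(f"tokens: {stats['original_tokens']} -> {stats['reduced_tokens']}")
print(f"reduction ratio: {stats['token_reduction_ratio']:.2%}")
```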
{kreuzberg-3.15.0.dist-info → kreuzberg-3.17.0.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.15.0
+Version: 3.17.0
 Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
 Project-URL: documentation, https://kreuzberg.dev
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -31,7 +31,8 @@ Requires-Python: >=3.10
 Requires-Dist: anyio>=4.10.0
 Requires-Dist: chardetng-py>=0.3.5
 Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
-Requires-Dist: html-to-markdown[lxml]>=1.
+Requires-Dist: html-to-markdown[lxml]>=1.13.0
+Requires-Dist: langcodes>=3.5.0
 Requires-Dist: mcp>=1.14.0
 Requires-Dist: msgspec>=0.18.0
 Requires-Dist: numpy>=2.0.0
@@ -49,7 +50,7 @@ Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
 Requires-Dist: deep-translator>=1.11.4; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
-Requires-Dist: fast-langdetect>=0.
+Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
 Requires-Dist: keybert>=0.9.0; extra == 'all'
 Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
@@ -82,7 +83,7 @@ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
 Provides-Extra: langdetect
-Requires-Dist: fast-langdetect>=0.
+Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
@@ -109,7 +110,7 @@ Description-Content-Type: text/markdown
 - **Text Extraction**: High-fidelity text extraction preserving document structure and formatting
 - **Image Extraction**: Extract embedded images from PDFs, presentations, HTML, and Office documents with optional OCR
 - **Metadata Extraction**: Comprehensive metadata including author, creation date, language, and document properties
-- **Format Support**:
+- **Format Support**: 21 document types including PDF, Microsoft Office, images, HTML, and structured data formats
 - **OCR Integration**: Tesseract OCR with markdown output (default) and table extraction from scanned documents
 - **Document Classification**: Automatic document type detection (contracts, forms, invoices, receipts, reports)

@@ -227,14 +228,15 @@ claude mcp add kreuzberg uvx kreuzberg-mcp

 ## Supported Formats

-| Category
-|
-| **Documents**
-| **Images**
-| **Spreadsheets**
-| **Presentations**
-| **Web**
-| **
+| Category            | Formats                        |
+| ------------------- | ------------------------------ |
+| **Documents**       | PDF, DOCX, DOC, RTF, TXT, EPUB |
+| **Images**          | JPG, PNG, TIFF, BMP, GIF, WEBP |
+| **Spreadsheets**    | XLSX, XLS, CSV, ODS            |
+| **Presentations**   | PPTX, PPT, ODP                 |
+| **Web**             | HTML, XML, MHTML               |
+| **Structured Data** | JSON, YAML, TOML               |
+| **Archives**        | Support via extraction         |

 ## 📊 Performance Characteristics
