kreuzberg 3.16.0__py3-none-any.whl → 3.17.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +2 -0
- kreuzberg/_config.py +8 -9
- kreuzberg/_extractors/_base.py +0 -46
- kreuzberg/_extractors/_html.py +1 -1
- kreuzberg/_extractors/_pandoc.py +2 -2
- kreuzberg/_extractors/_pdf.py +4 -4
- kreuzberg/_gmft.py +2 -2
- kreuzberg/_language_detection.py +16 -39
- kreuzberg/_mcp/server.py +1 -1
- kreuzberg/_mime_types.py +1 -1
- kreuzberg/_ocr/_easyocr.py +4 -9
- kreuzberg/_ocr/_paddleocr.py +1 -1
- kreuzberg/_ocr/_tesseract.py +15 -25
- kreuzberg/_token_reduction/__init__.py +11 -0
- kreuzberg/_token_reduction/_reducer.py +439 -0
- kreuzberg/_token_reduction/_stopwords.py +116 -0
- kreuzberg/_token_reduction/stopwords/af_stopwords.json +53 -0
- kreuzberg/_token_reduction/stopwords/ar_stopwords.json +482 -0
- kreuzberg/_token_reduction/stopwords/bg_stopwords.json +261 -0
- kreuzberg/_token_reduction/stopwords/bn_stopwords.json +400 -0
- kreuzberg/_token_reduction/stopwords/br_stopwords.json +1205 -0
- kreuzberg/_token_reduction/stopwords/ca_stopwords.json +280 -0
- kreuzberg/_token_reduction/stopwords/cs_stopwords.json +425 -0
- kreuzberg/_token_reduction/stopwords/da_stopwords.json +172 -0
- kreuzberg/_token_reduction/stopwords/de_stopwords.json +622 -0
- kreuzberg/_token_reduction/stopwords/el_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/en_stopwords.json +1300 -0
- kreuzberg/_token_reduction/stopwords/eo_stopwords.json +175 -0
- kreuzberg/_token_reduction/stopwords/es_stopwords.json +734 -0
- kreuzberg/_token_reduction/stopwords/et_stopwords.json +37 -0
- kreuzberg/_token_reduction/stopwords/eu_stopwords.json +100 -0
- kreuzberg/_token_reduction/stopwords/fa_stopwords.json +801 -0
- kreuzberg/_token_reduction/stopwords/fi_stopwords.json +849 -0
- kreuzberg/_token_reduction/stopwords/fr_stopwords.json +693 -0
- kreuzberg/_token_reduction/stopwords/ga_stopwords.json +111 -0
- kreuzberg/_token_reduction/stopwords/gl_stopwords.json +162 -0
- kreuzberg/_token_reduction/stopwords/gu_stopwords.json +226 -0
- kreuzberg/_token_reduction/stopwords/ha_stopwords.json +41 -0
- kreuzberg/_token_reduction/stopwords/he_stopwords.json +196 -0
- kreuzberg/_token_reduction/stopwords/hi_stopwords.json +227 -0
- kreuzberg/_token_reduction/stopwords/hr_stopwords.json +181 -0
- kreuzberg/_token_reduction/stopwords/hu_stopwords.json +791 -0
- kreuzberg/_token_reduction/stopwords/hy_stopwords.json +47 -0
- kreuzberg/_token_reduction/stopwords/id_stopwords.json +760 -0
- kreuzberg/_token_reduction/stopwords/it_stopwords.json +634 -0
- kreuzberg/_token_reduction/stopwords/ja_stopwords.json +136 -0
- kreuzberg/_token_reduction/stopwords/kn_stopwords.json +84 -0
- kreuzberg/_token_reduction/stopwords/ko_stopwords.json +681 -0
- kreuzberg/_token_reduction/stopwords/ku_stopwords.json +64 -0
- kreuzberg/_token_reduction/stopwords/la_stopwords.json +51 -0
- kreuzberg/_token_reduction/stopwords/lt_stopwords.json +476 -0
- kreuzberg/_token_reduction/stopwords/lv_stopwords.json +163 -0
- kreuzberg/_token_reduction/stopwords/ml_stopwords.json +11 -0
- kreuzberg/_token_reduction/stopwords/mr_stopwords.json +101 -0
- kreuzberg/_token_reduction/stopwords/ms_stopwords.json +477 -0
- kreuzberg/_token_reduction/stopwords/ne_stopwords.json +490 -0
- kreuzberg/_token_reduction/stopwords/nl_stopwords.json +415 -0
- kreuzberg/_token_reduction/stopwords/no_stopwords.json +223 -0
- kreuzberg/_token_reduction/stopwords/pl_stopwords.json +331 -0
- kreuzberg/_token_reduction/stopwords/pt_stopwords.json +562 -0
- kreuzberg/_token_reduction/stopwords/ro_stopwords.json +436 -0
- kreuzberg/_token_reduction/stopwords/ru_stopwords.json +561 -0
- kreuzberg/_token_reduction/stopwords/si_stopwords.json +193 -0
- kreuzberg/_token_reduction/stopwords/sk_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sl_stopwords.json +448 -0
- kreuzberg/_token_reduction/stopwords/so_stopwords.json +32 -0
- kreuzberg/_token_reduction/stopwords/st_stopwords.json +33 -0
- kreuzberg/_token_reduction/stopwords/sv_stopwords.json +420 -0
- kreuzberg/_token_reduction/stopwords/sw_stopwords.json +76 -0
- kreuzberg/_token_reduction/stopwords/ta_stopwords.json +129 -0
- kreuzberg/_token_reduction/stopwords/te_stopwords.json +54 -0
- kreuzberg/_token_reduction/stopwords/th_stopwords.json +118 -0
- kreuzberg/_token_reduction/stopwords/tl_stopwords.json +149 -0
- kreuzberg/_token_reduction/stopwords/tr_stopwords.json +506 -0
- kreuzberg/_token_reduction/stopwords/uk_stopwords.json +75 -0
- kreuzberg/_token_reduction/stopwords/ur_stopwords.json +519 -0
- kreuzberg/_token_reduction/stopwords/vi_stopwords.json +647 -0
- kreuzberg/_token_reduction/stopwords/yo_stopwords.json +62 -0
- kreuzberg/_token_reduction/stopwords/zh_stopwords.json +796 -0
- kreuzberg/_token_reduction/stopwords/zu_stopwords.json +31 -0
- kreuzberg/_types.py +50 -9
- kreuzberg/_utils/_image_preprocessing.py +1 -1
- kreuzberg/_utils/_ref.py +14 -6
- kreuzberg/exceptions.py +0 -1
- kreuzberg/extraction.py +33 -10
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/METADATA +6 -5
- kreuzberg-3.17.1.dist-info/RECORD +128 -0
- kreuzberg-3.16.0.dist-info/RECORD +0 -61
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.16.0.dist-info → kreuzberg-3.17.1.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,31 @@
|
|
1
|
+
[
|
2
|
+
"futhi",
|
3
|
+
"kahle",
|
4
|
+
"kakhulu",
|
5
|
+
"kanye",
|
6
|
+
"khona",
|
7
|
+
"kodwa",
|
8
|
+
"kungani",
|
9
|
+
"kusho",
|
10
|
+
"la",
|
11
|
+
"lakhe",
|
12
|
+
"lapho",
|
13
|
+
"mina",
|
14
|
+
"ngesikhathi",
|
15
|
+
"nje",
|
16
|
+
"phansi",
|
17
|
+
"phezulu",
|
18
|
+
"u",
|
19
|
+
"ukuba",
|
20
|
+
"ukuthi",
|
21
|
+
"ukuze",
|
22
|
+
"uma",
|
23
|
+
"wahamba",
|
24
|
+
"wakhe",
|
25
|
+
"wami",
|
26
|
+
"wase",
|
27
|
+
"wathi",
|
28
|
+
"yakhe",
|
29
|
+
"zakhe",
|
30
|
+
"zonke"
|
31
|
+
]
|
kreuzberg/_types.py
CHANGED
@@ -7,6 +7,7 @@ from enum import Enum
|
|
7
7
|
from pathlib import Path
|
8
8
|
from typing import TYPE_CHECKING, Any, Literal, NamedTuple, TypedDict
|
9
9
|
|
10
|
+
import langcodes
|
10
11
|
import msgspec
|
11
12
|
|
12
13
|
from kreuzberg._constants import DEFAULT_MAX_CHARACTERS, DEFAULT_MAX_OVERLAP
|
@@ -401,9 +402,12 @@ class ImageOCRConfig(ConfigDict):
|
|
401
402
|
|
402
403
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
403
404
|
class LanguageDetectionConfig(ConfigDict):
|
404
|
-
|
405
|
-
"""
|
406
|
-
|
405
|
+
model: Literal["lite", "full", "auto"] = "auto"
|
406
|
+
"""Language detection model to use:
|
407
|
+
- 'lite': Smaller, faster model with good accuracy
|
408
|
+
- 'full': Larger model with highest accuracy
|
409
|
+
- 'auto': Automatically choose based on memory availability (default)
|
410
|
+
"""
|
407
411
|
top_k: int = 3
|
408
412
|
"""Maximum number of languages to return for multilingual detection."""
|
409
413
|
multilingual: bool = False
|
@@ -411,8 +415,8 @@ class LanguageDetectionConfig(ConfigDict):
|
|
411
415
|
If False, uses single language detection."""
|
412
416
|
cache_dir: str | None = None
|
413
417
|
"""Custom directory for model cache. If None, uses system default."""
|
414
|
-
|
415
|
-
"""
|
418
|
+
low_memory: bool = True
|
419
|
+
"""Deprecated. Use 'model' parameter instead. If True, uses 'lite' model."""
|
416
420
|
|
417
421
|
|
418
422
|
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
@@ -695,6 +699,8 @@ class Metadata(TypedDict, total=False):
|
|
695
699
|
"""Message or communication content."""
|
696
700
|
attributes: NotRequired[dict[str, Any]]
|
697
701
|
"""Additional attributes extracted from structured data (e.g., custom text fields with dotted keys)."""
|
702
|
+
token_reduction: NotRequired[dict[str, float]]
|
703
|
+
"""Token reduction statistics including reduction ratios and counts."""
|
698
704
|
|
699
705
|
|
700
706
|
_VALID_METADATA_KEYS = {
|
@@ -749,6 +755,7 @@ _VALID_METADATA_KEYS = {
|
|
749
755
|
"text",
|
750
756
|
"message",
|
751
757
|
"attributes",
|
758
|
+
"token_reduction",
|
752
759
|
}
|
753
760
|
|
754
761
|
|
@@ -979,8 +986,14 @@ class ExtractionConfig(ConfigDict):
|
|
979
986
|
"""Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
|
980
987
|
auto_detect_language: bool = False
|
981
988
|
"""Whether to automatically detect language and configure OCR accordingly."""
|
989
|
+
language_detection_model: Literal["lite", "full", "auto"] = "auto"
|
990
|
+
"""Language detection model to use when auto_detect_language is True.
|
991
|
+
- 'lite': Smaller, faster model with good accuracy
|
992
|
+
- 'full': Larger model with highest accuracy
|
993
|
+
- 'auto': Automatically choose based on memory availability (default)
|
994
|
+
"""
|
982
995
|
language_detection_config: LanguageDetectionConfig | None = None
|
983
|
-
"""Configuration for language detection. If None, uses default settings."""
|
996
|
+
"""Configuration for language detection. If None, uses default settings with language_detection_model."""
|
984
997
|
spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
|
985
998
|
"""Configuration for spaCy entity extraction. If None, uses default settings."""
|
986
999
|
auto_detect_document_type: bool = False
|
@@ -1009,6 +1022,8 @@ class ExtractionConfig(ConfigDict):
|
|
1009
1022
|
"""Minimum DPI threshold when auto-adjusting DPI."""
|
1010
1023
|
max_dpi: int = 600
|
1011
1024
|
"""Maximum DPI threshold when auto-adjusting DPI."""
|
1025
|
+
token_reduction: TokenReductionConfig | None = None
|
1026
|
+
"""Configuration for token reduction to optimize output size while preserving meaning."""
|
1012
1027
|
|
1013
1028
|
def __post_init__(self) -> None:
|
1014
1029
|
if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
|
@@ -1151,11 +1166,11 @@ class HTMLToMarkdownConfig:
|
|
1151
1166
|
"""Mapping of HTML tag names to custom converter functions."""
|
1152
1167
|
default_title: bool = False
|
1153
1168
|
"""Use default titles for elements like links."""
|
1154
|
-
escape_asterisks: bool =
|
1169
|
+
escape_asterisks: bool = False
|
1155
1170
|
"""Escape * characters to prevent unintended formatting."""
|
1156
|
-
escape_misc: bool =
|
1171
|
+
escape_misc: bool = False
|
1157
1172
|
"""Escape miscellaneous characters to prevent Markdown conflicts."""
|
1158
|
-
escape_underscores: bool =
|
1173
|
+
escape_underscores: bool = False
|
1159
1174
|
"""Escape _ characters to prevent unintended formatting."""
|
1160
1175
|
extract_metadata: bool = True
|
1161
1176
|
"""Extract document metadata as comment header."""
|
@@ -1199,3 +1214,29 @@ class HTMLToMarkdownConfig:
|
|
1199
1214
|
def to_dict(self) -> dict[str, Any]:
|
1200
1215
|
result = msgspec.to_builtins(self, builtin_types=(type(None),), order="deterministic")
|
1201
1216
|
return {k: v for k, v in result.items() if v is not None}
|
1217
|
+
|
1218
|
+
|
1219
|
+
@dataclass(unsafe_hash=True, frozen=True, slots=True)
|
1220
|
+
class TokenReductionConfig:
|
1221
|
+
mode: Literal["off", "light", "moderate"] = "off"
|
1222
|
+
preserve_markdown: bool = True
|
1223
|
+
custom_stopwords: dict[str, list[str]] | None = field(default=None, compare=False, hash=False)
|
1224
|
+
language_hint: str | None = None
|
1225
|
+
|
1226
|
+
def __post_init__(self) -> None:
|
1227
|
+
if self.language_hint:
|
1228
|
+
hint = self.language_hint.strip()
|
1229
|
+
|
1230
|
+
if not hint or len(hint) > 50 or any(c in hint for c in "\x00\r\n\t"):
|
1231
|
+
object.__setattr__(self, "language_hint", None)
|
1232
|
+
return
|
1233
|
+
|
1234
|
+
try:
|
1235
|
+
normalized = langcodes.standardize_tag(hint)
|
1236
|
+
|
1237
|
+
lang = langcodes.Language.get(normalized).language
|
1238
|
+
|
1239
|
+
if lang and lang != hint:
|
1240
|
+
object.__setattr__(self, "language_hint", lang)
|
1241
|
+
except (ValueError, AttributeError, TypeError):
|
1242
|
+
object.__setattr__(self, "language_hint", None)
|
@@ -198,7 +198,7 @@ def normalize_image_dpi(
|
|
198
198
|
calculated_dpi=calculated_dpi,
|
199
199
|
)
|
200
200
|
|
201
|
-
except OSError as e:
|
201
|
+
except OSError as e: # pragma: no cover
|
202
202
|
return image, ImagePreprocessingMetadata(
|
203
203
|
original_dimensions=(original_width, original_height),
|
204
204
|
original_dpi=original_dpi,
|
kreuzberg/_utils/_ref.py
CHANGED
@@ -1,5 +1,6 @@
|
|
1
1
|
from __future__ import annotations
|
2
2
|
|
3
|
+
import threading
|
3
4
|
from typing import TYPE_CHECKING, Any, ClassVar, Generic, TypeVar, cast
|
4
5
|
|
5
6
|
if TYPE_CHECKING:
|
@@ -10,23 +11,30 @@ T = TypeVar("T")
|
|
10
11
|
|
11
12
|
class Ref(Generic[T]):
|
12
13
|
_instances: ClassVar[dict[str, Any]] = {}
|
14
|
+
_lock: ClassVar[threading.Lock] = threading.Lock()
|
13
15
|
|
14
16
|
def __init__(self, name: str, factory: Callable[[], T]) -> None:
|
15
17
|
self.name = name
|
16
18
|
self.factory = factory
|
17
19
|
|
18
20
|
def get(self) -> T:
|
19
|
-
if self.name
|
20
|
-
self._instances[self.name]
|
21
|
-
|
21
|
+
if self.name in self._instances:
|
22
|
+
return cast("T", self._instances[self.name])
|
23
|
+
|
24
|
+
with self._lock:
|
25
|
+
if self.name not in self._instances:
|
26
|
+
self._instances[self.name] = self.factory()
|
27
|
+
return cast("T", self._instances[self.name])
|
22
28
|
|
23
29
|
def clear(self) -> None:
|
24
|
-
|
25
|
-
|
30
|
+
with self._lock:
|
31
|
+
if self.name in self._instances:
|
32
|
+
del self._instances[self.name]
|
26
33
|
|
27
34
|
def is_initialized(self) -> bool:
|
28
35
|
return self.name in self._instances
|
29
36
|
|
30
37
|
@classmethod
|
31
38
|
def clear_all(cls) -> None:
|
32
|
-
cls.
|
39
|
+
with cls._lock:
|
40
|
+
cls._instances.clear()
|
kreuzberg/exceptions.py
CHANGED
@@ -17,7 +17,6 @@ class KreuzbergError(Exception):
|
|
17
17
|
super().__init__(message)
|
18
18
|
|
19
19
|
def _serialize_context(self, obj: Any) -> Any:
|
20
|
-
"""Recursively serialize context objects to ensure JSON compatibility."""
|
21
20
|
if isinstance(obj, bytes):
|
22
21
|
return obj.decode("utf-8", errors="replace")
|
23
22
|
if isinstance(obj, dict):
|
kreuzberg/extraction.py
CHANGED
@@ -15,6 +15,7 @@ from kreuzberg._mime_types import (
|
|
15
15
|
validate_mime_type,
|
16
16
|
)
|
17
17
|
from kreuzberg._registry import ExtractorRegistry
|
18
|
+
from kreuzberg._token_reduction import get_reduction_stats, reduce_tokens
|
18
19
|
from kreuzberg._types import ExtractionConfig, ExtractionResult
|
19
20
|
from kreuzberg._utils._document_cache import get_document_cache
|
20
21
|
from kreuzberg._utils._errors import create_error_context
|
@@ -31,15 +32,6 @@ DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()
|
|
31
32
|
|
32
33
|
|
33
34
|
async def _handle_cache_async(path: Path, config: ExtractionConfig) -> ExtractionResult | None:
|
34
|
-
"""Handle cache lookup and coordination with other processes.
|
35
|
-
|
36
|
-
Args:
|
37
|
-
path: Path to the file being processed
|
38
|
-
config: Extraction configuration
|
39
|
-
|
40
|
-
Returns:
|
41
|
-
Cached result if available, None otherwise
|
42
|
-
"""
|
43
35
|
cache = get_document_cache()
|
44
36
|
|
45
37
|
cached_result = cache.get(path, config)
|
@@ -84,14 +76,45 @@ def _validate_and_post_process_helper(
|
|
84
76
|
result.keywords = None
|
85
77
|
|
86
78
|
if config.auto_detect_language:
|
79
|
+
# Use provided config or create one with the model from ExtractionConfig
|
80
|
+
lang_config = config.language_detection_config
|
81
|
+
if lang_config is None:
|
82
|
+
from kreuzberg._types import LanguageDetectionConfig # noqa: PLC0415
|
83
|
+
|
84
|
+
lang_config = LanguageDetectionConfig(model=config.language_detection_model)
|
85
|
+
|
87
86
|
result.detected_languages = detect_languages(
|
88
87
|
result.content,
|
89
|
-
config=
|
88
|
+
config=lang_config,
|
90
89
|
)
|
91
90
|
|
92
91
|
if config.auto_detect_document_type:
|
93
92
|
result = auto_detect_document_type(result, config, file_path=file_path)
|
94
93
|
|
94
|
+
if config.token_reduction is not None and config.token_reduction.mode != "off":
|
95
|
+
original_content = result.content
|
96
|
+
|
97
|
+
language_hint = None
|
98
|
+
if result.detected_languages and len(result.detected_languages) > 0:
|
99
|
+
language_hint = result.detected_languages[0]
|
100
|
+
|
101
|
+
reduced_content = reduce_tokens(
|
102
|
+
original_content,
|
103
|
+
config=config.token_reduction,
|
104
|
+
language=language_hint,
|
105
|
+
)
|
106
|
+
reduction_stats = get_reduction_stats(original_content, reduced_content)
|
107
|
+
|
108
|
+
result.content = reduced_content
|
109
|
+
result.metadata["token_reduction"] = {
|
110
|
+
"character_reduction_ratio": reduction_stats["character_reduction_ratio"],
|
111
|
+
"token_reduction_ratio": reduction_stats["token_reduction_ratio"],
|
112
|
+
"original_characters": reduction_stats["original_characters"],
|
113
|
+
"reduced_characters": reduction_stats["reduced_characters"],
|
114
|
+
"original_tokens": reduction_stats["original_tokens"],
|
115
|
+
"reduced_tokens": reduction_stats["reduced_tokens"],
|
116
|
+
}
|
117
|
+
|
95
118
|
return result
|
96
119
|
|
97
120
|
|
@@ -1,6 +1,6 @@
|
|
1
1
|
Metadata-Version: 2.4
|
2
2
|
Name: kreuzberg
|
3
|
-
Version: 3.
|
3
|
+
Version: 3.17.1
|
4
4
|
Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
|
5
5
|
Project-URL: documentation, https://kreuzberg.dev
|
6
6
|
Project-URL: homepage, https://github.com/Goldziher/kreuzberg
|
@@ -32,12 +32,13 @@ Requires-Dist: anyio>=4.10.0
|
|
32
32
|
Requires-Dist: chardetng-py>=0.3.5
|
33
33
|
Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
|
34
34
|
Requires-Dist: html-to-markdown[lxml]>=1.13.0
|
35
|
-
Requires-Dist:
|
35
|
+
Requires-Dist: langcodes>=3.5.0
|
36
|
+
Requires-Dist: mcp>=1.14.1
|
36
37
|
Requires-Dist: msgspec>=0.18.0
|
37
38
|
Requires-Dist: numpy>=2.0.0
|
38
39
|
Requires-Dist: playa-pdf>=0.7.0
|
39
40
|
Requires-Dist: polars>=1.33.1
|
40
|
-
Requires-Dist: psutil>=7.
|
41
|
+
Requires-Dist: psutil>=7.1.0
|
41
42
|
Requires-Dist: pypdfium2==4.30.0
|
42
43
|
Requires-Dist: python-calamine>=0.5.3
|
43
44
|
Requires-Dist: python-pptx>=1.0.2
|
@@ -49,7 +50,7 @@ Provides-Extra: all
|
|
49
50
|
Requires-Dist: click>=8.2.1; extra == 'all'
|
50
51
|
Requires-Dist: deep-translator>=1.11.4; extra == 'all'
|
51
52
|
Requires-Dist: easyocr>=1.7.2; extra == 'all'
|
52
|
-
Requires-Dist: fast-langdetect>=0.
|
53
|
+
Requires-Dist: fast-langdetect>=1.0.0; extra == 'all'
|
53
54
|
Requires-Dist: gmft>=0.4.2; extra == 'all'
|
54
55
|
Requires-Dist: keybert>=0.9.0; extra == 'all'
|
55
56
|
Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.17.0; extra == 'all'
|
@@ -82,7 +83,7 @@ Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
|
|
82
83
|
Provides-Extra: gmft
|
83
84
|
Requires-Dist: gmft>=0.4.2; extra == 'gmft'
|
84
85
|
Provides-Extra: langdetect
|
85
|
-
Requires-Dist: fast-langdetect>=0.
|
86
|
+
Requires-Dist: fast-langdetect>=1.0.0; extra == 'langdetect'
|
86
87
|
Provides-Extra: paddleocr
|
87
88
|
Requires-Dist: paddleocr>=3.2.0; extra == 'paddleocr'
|
88
89
|
Requires-Dist: paddlepaddle>=3.2.0; extra == 'paddleocr'
|
@@ -0,0 +1,128 @@
|
|
1
|
+
kreuzberg/__init__.py,sha256=niF_YZ7YADL_oXZ8zB5EMov4xnyFzuxTABVlHoRnBJA,1629
|
2
|
+
kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
|
3
|
+
kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
4
|
+
kreuzberg/_config.py,sha256=ZYIcnJAjDnbWW_2WBy7NlOk1Ol6WpoMG5FMNMmHpqSY,13086
|
5
|
+
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
|
+
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
|
+
kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
|
8
|
+
kreuzberg/_gmft.py,sha256=gfRXOsv-K9R7Y0zZ2SUa5wid3FpP2eFIlg5nepWcz1Q,20827
|
9
|
+
kreuzberg/_language_detection.py,sha256=OwIWIddERPEz8krU_Aq0_KjRF6MHP-LpugH6Y6miwOc,1204
|
10
|
+
kreuzberg/_mime_types.py,sha256=duEMDBg_qIf9A02tXAC_2znD-wgE-2BBMW9ofyYTJjE,8622
|
11
|
+
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
|
+
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
+
kreuzberg/_types.py,sha256=ttY61QI8mruCI70Af3owlU-O5LdvQ6gOqIZTGQ9PaVs,49129
|
14
|
+
kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
|
15
|
+
kreuzberg/exceptions.py,sha256=KiGAfIX3_TkGYG1h9eTZ_E_pALsAqhZ_A3XfhwxwaS0,2909
|
16
|
+
kreuzberg/extraction.py,sha256=jwzWdomwrl-2z1UznLoURLyqD5r0U-rFABXSBV2B2wA,19063
|
17
|
+
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
+
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
+
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
20
|
+
kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
|
21
|
+
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
+
kreuzberg/_extractors/_base.py,sha256=4MRBXdLsgdtdrTuupWb2IT9YpRSnNPpWWviS2mfeOXg,9961
|
23
|
+
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
24
|
+
kreuzberg/_extractors/_html.py,sha256=TXXgwQZuEvnrny5HdBpn8oikGktyxgY9jvgZmnFtnqY,6371
|
25
|
+
kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
|
26
|
+
kreuzberg/_extractors/_pandoc.py,sha256=cwthr--IFwbu8r0rCZ_Cx5zRlan94yuqt5e3mjYxesE,24182
|
27
|
+
kreuzberg/_extractors/_pdf.py,sha256=GFy7xHUH09i48E5Xixy6nReF_uBu9646UTjywKoH-Rs,23304
|
28
|
+
kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
|
29
|
+
kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
|
30
|
+
kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
|
31
|
+
kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
|
32
|
+
kreuzberg/_mcp/server.py,sha256=71MhjiFDwgFROdGejf0djgO1eG370qudWmZsN59CUeA,16743
|
33
|
+
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
34
|
+
kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
35
|
+
kreuzberg/_ocr/_easyocr.py,sha256=bHz2S_8nNHaPHPemcJK-U0al9_qP-vUmWE4ECVlf7AA,15485
|
36
|
+
kreuzberg/_ocr/_paddleocr.py,sha256=CV9cCjkRe-3cNJ5tRu_sBXd_HNghEwfPIgWwxAZTeRY,15026
|
37
|
+
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
38
|
+
kreuzberg/_ocr/_tesseract.py,sha256=1SEfrX_JvU6KIeWt31GsRWnNmjaAh3xgQaRMPvoZLJA,51349
|
39
|
+
kreuzberg/_token_reduction/__init__.py,sha256=y_2WgPxJes8_PD-VMfx7vQT0hGjFIixzS8PjaIseAGg,311
|
40
|
+
kreuzberg/_token_reduction/_reducer.py,sha256=shAfMPznP69sTSzwX_bE1LpcBmoia9cpd7r6bSc4R5Q,13609
|
41
|
+
kreuzberg/_token_reduction/_stopwords.py,sha256=mu-5CapG0RCP7LYzjhdTM6WWLtmt3cjZ08OOsyQkJVg,3608
|
42
|
+
kreuzberg/_token_reduction/stopwords/af_stopwords.json,sha256=RlgUHyzPIQBbTA52kLSQpmTfteRbbV_qb_Spa51RI8Q,452
|
43
|
+
kreuzberg/_token_reduction/stopwords/ar_stopwords.json,sha256=GKcR9MyDM5zvIQhLqWfq87Jmj3gbAM81ZZi-eBKBjz8,6738
|
44
|
+
kreuzberg/_token_reduction/stopwords/bg_stopwords.json,sha256=7KuYBTg7jc8ZLFYI6QwaVatlh_gP8i80EkQHD1suhXs,3707
|
45
|
+
kreuzberg/_token_reduction/stopwords/bn_stopwords.json,sha256=qAtZN89LGy0se9i_FrB02vsNLgE2gG1clwMHTi4Qncg,7437
|
46
|
+
kreuzberg/_token_reduction/stopwords/br_stopwords.json,sha256=RuErMr4twvsocqw9fvwtgrnbzVKB2WABVi5AfPy2lqo,13601
|
47
|
+
kreuzberg/_token_reduction/stopwords/ca_stopwords.json,sha256=HRrh4QKFXDsAfmk5yjXHD28KjdO2vMjqJFIltNwh_F8,2952
|
48
|
+
kreuzberg/_token_reduction/stopwords/cs_stopwords.json,sha256=Dlsq3UFIHD9USIuHiwrFur0DvIaRpjftnBb3Qnjio4M,4523
|
49
|
+
kreuzberg/_token_reduction/stopwords/da_stopwords.json,sha256=zLk-90hrY5tH4gS4uOcMlviky4mmg7b8WaXnn_NzKfQ,1664
|
50
|
+
kreuzberg/_token_reduction/stopwords/de_stopwords.json,sha256=4lB0tUyT9PlB9ubnUbwOObO_RT0irBSdPOuvQIgrr7g,7052
|
51
|
+
kreuzberg/_token_reduction/stopwords/el_stopwords.json,sha256=VqO3y_q_ZeSBZAMxD1KjMMkCylYN2uuN620szHmFx-M,13604
|
52
|
+
kreuzberg/_token_reduction/stopwords/en_stopwords.json,sha256=VvLb0zoUKjhqQH-RGkSTpPtdRjvgv_G8l4i9ub2fJmU,14171
|
53
|
+
kreuzberg/_token_reduction/stopwords/eo_stopwords.json,sha256=xnojHtnik734Mzw4i4bIxPZEgBRXvgK2TRkHnxBCjWw,1722
|
54
|
+
kreuzberg/_token_reduction/stopwords/es_stopwords.json,sha256=PcSwxKskYQXc-21vNkpb6IntQYVP50CwuXfx4Gyhhx8,8598
|
55
|
+
kreuzberg/_token_reduction/stopwords/et_stopwords.json,sha256=_t6iPfNa1LhqRq4sLNbIB6_B5-472UCNi9IARJTPhzQ,327
|
56
|
+
kreuzberg/_token_reduction/stopwords/eu_stopwords.json,sha256=SNa84Zkx5Rcf8JZBdm4rCMxxZ7Z_94fW9cebZC4qgqI,1069
|
57
|
+
kreuzberg/_token_reduction/stopwords/fa_stopwords.json,sha256=8R1724IQHkXc1g_jXJjRMVLgq2Zz6YgPeE4DI0iSj9Y,11708
|
58
|
+
kreuzberg/_token_reduction/stopwords/fi_stopwords.json,sha256=yOlZLoh3aMJ-YXz3r7kGLAIsDyvxNrhFyvWr7Vu_z5o,10699
|
59
|
+
kreuzberg/_token_reduction/stopwords/fr_stopwords.json,sha256=KkiZ8dQYFQzjVJ-YwUoP13zwLwz7zu9Fpw-X-wmxya0,8025
|
60
|
+
kreuzberg/_token_reduction/stopwords/ga_stopwords.json,sha256=K8LOrUkqSi82KTLlZ1NnadEU-HMyCd2Ofm13GfxW3J0,1100
|
61
|
+
kreuzberg/_token_reduction/stopwords/gl_stopwords.json,sha256=Y0GfhhcOv1GNPJP3zoFYIYkg369GT1yHK5xCPiH6Pn0,1602
|
62
|
+
kreuzberg/_token_reduction/stopwords/gu_stopwords.json,sha256=YSldatfgVz_gNWopQ5TMFTHWEbGVYPcJMwO-bThtYAI,3818
|
63
|
+
kreuzberg/_token_reduction/stopwords/ha_stopwords.json,sha256=EohjrRkbSuLOn_aiDcsMOUEYPJjVha9wHhCsoxiwNsU,354
|
64
|
+
kreuzberg/_token_reduction/stopwords/he_stopwords.json,sha256=STlmHNDJqDEZI7ZCtBcZlEU1ndoEeJIexuOnTaOXJac,2629
|
65
|
+
kreuzberg/_token_reduction/stopwords/hi_stopwords.json,sha256=aYojvEA-UlivR_JCJTwZRoK2BJjVUW_m9q8eDRRczpE,3792
|
66
|
+
kreuzberg/_token_reduction/stopwords/hr_stopwords.json,sha256=2s5uhGAitVRDLgKdbA0F9sFZWtRWcmyiDZY9adwLGzk,1769
|
67
|
+
kreuzberg/_token_reduction/stopwords/hu_stopwords.json,sha256=9o0snSijbEEt9Hpbs4kTW6czhcdiXLTa5sbC68nitDY,9830
|
68
|
+
kreuzberg/_token_reduction/stopwords/hy_stopwords.json,sha256=QLsYw_y9ESyou1bHbPwjSrWy_nJq8wjiNihrvikYSKY,525
|
69
|
+
kreuzberg/_token_reduction/stopwords/id_stopwords.json,sha256=TZB_e1Txu3oGpQfHCzodoOTcKoKplTC5ZDr1iAbdzVI,10238
|
70
|
+
kreuzberg/_token_reduction/stopwords/it_stopwords.json,sha256=BSOpBGf_StyW6tdycNRMSvXGTksvrOowrE--D5914J4,7277
|
71
|
+
kreuzberg/_token_reduction/stopwords/ja_stopwords.json,sha256=E7MSvBOnRvTeChRk0Nm5X7xxwP50BHaP5FGOfDbnmRI,1680
|
72
|
+
kreuzberg/_token_reduction/stopwords/kn_stopwords.json,sha256=km3Qk1vy3OVdsAoE_YbZ-oXRYapFBi5k59o1mlWnk70,1626
|
73
|
+
kreuzberg/_token_reduction/stopwords/ko_stopwords.json,sha256=sHR2SLh_zXVs6SKZlWCS29MGRv6xlKcp3Ckvf0-aXt8,9932
|
74
|
+
kreuzberg/_token_reduction/stopwords/ku_stopwords.json,sha256=1Vj0g-fwacVcwaJ66BSPe4GkI7WybXK-EspIE6uvAmY,893
|
75
|
+
kreuzberg/_token_reduction/stopwords/la_stopwords.json,sha256=1d6iV2sTgZF6G7EF5yb3G0Sic85awtjN617cWXb-ltw,456
|
76
|
+
kreuzberg/_token_reduction/stopwords/lt_stopwords.json,sha256=7WE-NiX-y2IQnnO61-2pDExaR0ZeOq6A7YMn29effAM,5675
|
77
|
+
kreuzberg/_token_reduction/stopwords/lv_stopwords.json,sha256=WTp3jWxsX054E53DdpoI0BqujDefICljN4d7KiIIsls,1796
|
78
|
+
kreuzberg/_token_reduction/stopwords/ml_stopwords.json,sha256=lDoq0gGSI4zbuKhdNyF8MMTPkSI9wYb1om6pRPY5zkw,192
|
79
|
+
kreuzberg/_token_reduction/stopwords/mr_stopwords.json,sha256=6XjzSLaHwwOGWot1QszaUMl12mAVFh840GH9MJoYoes,1764
|
80
|
+
kreuzberg/_token_reduction/stopwords/ms_stopwords.json,sha256=eJsXJ0bVOnWUSVG3XwkIClxlR3qd_2k75ZAQlmHpsKQ,5950
|
81
|
+
kreuzberg/_token_reduction/stopwords/ne_stopwords.json,sha256=MoAXH6Tncag9Qgr6TR7yp1FguDCGQBXpGdSQ2DIOikE,9447
|
82
|
+
kreuzberg/_token_reduction/stopwords/nl_stopwords.json,sha256=W08hz9JP3EdWpXtLPUjWFOSr3AwGnZPkwcjEUBiFWnA,4724
|
83
|
+
kreuzberg/_token_reduction/stopwords/no_stopwords.json,sha256=bOjDCti_Loe0ZYSF2mR-LQzMqViZRkur1GEOLh4Mr-A,2210
|
84
|
+
kreuzberg/_token_reduction/stopwords/pl_stopwords.json,sha256=TcnvzF5uMVDKxQUt1YBu7Lw1qIpeHftuIDSguz8ZAdA,3487
|
85
|
+
kreuzberg/_token_reduction/stopwords/pt_stopwords.json,sha256=h4jmBxUu10PuzQzTjeFm1B5NBl0Owt7uGhwx66mTTYQ,6413
|
86
|
+
kreuzberg/_token_reduction/stopwords/ro_stopwords.json,sha256=iuHvFs-iS118RH07v0hO7Oxfdx5rDqJwl3lRPMWINbM,4569
|
87
|
+
kreuzberg/_token_reduction/stopwords/ru_stopwords.json,sha256=MZckTBKlL1i4Kv16RSSozUfCM6dcKI5H9PYZD7pS0Ac,9028
|
88
|
+
kreuzberg/_token_reduction/stopwords/si_stopwords.json,sha256=jvtaQfO4fc-XPHgaO1hPsbpJQQg40rSeEbCGWm2AO60,3324
|
89
|
+
kreuzberg/_token_reduction/stopwords/sk_stopwords.json,sha256=FDaLmQ61_fFg0k3cGthv8flKFs67M1hmSE-6PrfMCAU,4638
|
90
|
+
kreuzberg/_token_reduction/stopwords/sl_stopwords.json,sha256=UoQRoLRT9qzmS8ALY_cuDE1uukK0hS6Q6QuUhr7oLHc,4669
|
91
|
+
kreuzberg/_token_reduction/stopwords/so_stopwords.json,sha256=Z7ayeNV98MOx_xkGxtcSX3dh8GAhgCRFa0EC1VDG29Q,299
|
92
|
+
kreuzberg/_token_reduction/stopwords/st_stopwords.json,sha256=ajvBq5XQCse62nptN_m8Jll5-Ps9j3bK4RODMIzCkD4,268
|
93
|
+
kreuzberg/_token_reduction/stopwords/sv_stopwords.json,sha256=kLz5vgx0VfQI0jtOj3Rlp6wuj3tKhqp2oF-f9f2-neQ,4737
|
94
|
+
kreuzberg/_token_reduction/stopwords/sw_stopwords.json,sha256=x4eOC7-nRlSS7qv_pwW6yECDrfhm_3zoTWenIPL1aWY,780
|
95
|
+
kreuzberg/_token_reduction/stopwords/ta_stopwords.json,sha256=qBbEu6m_HEx2C27ep6UJOyxQ6st74Et1fN8TvRHoTxw,2634
|
96
|
+
kreuzberg/_token_reduction/stopwords/te_stopwords.json,sha256=GT0Rj3MsgCJSj9GdzKjpgsQJE3-wCaS5Aa3_ynIZKx0,1263
|
97
|
+
kreuzberg/_token_reduction/stopwords/th_stopwords.json,sha256=5DEb-W41TFL4BGS-_CJzgPTkpmuLN20WBfeO1hG0HLc,2010
|
98
|
+
kreuzberg/_token_reduction/stopwords/tl_stopwords.json,sha256=pu3wAWQyT0vzGwSO8N2x2mRlaCHzEgEIvECTCrJOLE8,1663
|
99
|
+
kreuzberg/_token_reduction/stopwords/tr_stopwords.json,sha256=hSmUsApI7lxVfwJwAInkCLoa3YoGjI85Mwg9DpiHTDo,6159
|
100
|
+
kreuzberg/_token_reduction/stopwords/uk_stopwords.json,sha256=_j_lYv_bE5RAEMcW7-u0rYWf39fMrlpIgFEMFQDjqW0,965
|
101
|
+
kreuzberg/_token_reduction/stopwords/ur_stopwords.json,sha256=IcrM74VdmSbgM7wlBtFVtkrWsCI0SDFbRCSSAkyvlqo,7370
|
102
|
+
kreuzberg/_token_reduction/stopwords/vi_stopwords.json,sha256=UOyAEKBwMcQV65QGpQU-ynmyignNoqFzUSQ8p_1XuoY,9152
|
103
|
+
kreuzberg/_token_reduction/stopwords/yo_stopwords.json,sha256=60liY89h7KReEvHEPxe-hCWLPuqr4U89aQDCi7iRCfo,651
|
104
|
+
kreuzberg/_token_reduction/stopwords/zh_stopwords.json,sha256=rouSTCkXun90Q1aCvLjHyt4I7pGrtlcruDpNVybpAMI,8934
|
105
|
+
kreuzberg/_token_reduction/stopwords/zu_stopwords.json,sha256=hfm4E2EDI_VWyR0GUOVjcMQA7ZDH7FsV4FUMcns1H28,324
|
106
|
+
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
107
|
+
kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
|
108
|
+
kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
|
109
|
+
kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
|
110
|
+
kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
|
111
|
+
kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
|
112
|
+
kreuzberg/_utils/_image_preprocessing.py,sha256=f7ioWQyARnhzj0am0Y1_eteJwWomdPy7AnbXqw2xWBs,10954
|
113
|
+
kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
|
114
|
+
kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
|
115
|
+
kreuzberg/_utils/_process_pool.py,sha256=fqlxNsxDoqS28BLrZeDBH743HdaUBuGPYFH5hjSajIg,7493
|
116
|
+
kreuzberg/_utils/_quality.py,sha256=FCVh9KieWUYgT1klLxudbslzKuqbOTBbTsHbvIuru7M,5510
|
117
|
+
kreuzberg/_utils/_ref.py,sha256=BDuk9hHYq1KPRgenjC3-6iFEjGsrGfHZKr9tPNhfquU,1109
|
118
|
+
kreuzberg/_utils/_resource_managers.py,sha256=N3-VeHDj6sKBeg3UL-PqRtKGExUBoVcEB5UuQ8FncY8,2079
|
119
|
+
kreuzberg/_utils/_serialization.py,sha256=G-kxtCPDPGFqBMyHfzvAPo-bNUmPdaXYdeg1dnBLfN4,1789
|
120
|
+
kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
|
121
|
+
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
122
|
+
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
123
|
+
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
124
|
+
kreuzberg-3.17.1.dist-info/METADATA,sha256=ttfOl3XA6b-M2BMY7v1cfASGm_Qe91HPzfRcAf_-zU8,12351
|
125
|
+
kreuzberg-3.17.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
126
|
+
kreuzberg-3.17.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
127
|
+
kreuzberg-3.17.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
128
|
+
kreuzberg-3.17.1.dist-info/RECORD,,
|
@@ -1,61 +0,0 @@
|
|
1
|
-
kreuzberg/__init__.py,sha256=EE6ENEjyKlt0o6QN1cG3Z_1isCtminVOjQT7ii5eBHA,1575
|
2
|
-
kreuzberg/__main__.py,sha256=3cIDdzTggj2kj8uKx4WShWHmCWqdZazdM3BxUGbAuSI,104
|
3
|
-
kreuzberg/_chunker.py,sha256=lRXvVN60vmWaTxa1b3QzvE-jBmOqYzh5dY-3Kl6pSqI,1427
|
4
|
-
kreuzberg/_config.py,sha256=H4jUAL0fNY-YE61GbGq5UtAUtXHbZA4-9W3YwcT_hu8,12988
|
5
|
-
kreuzberg/_constants.py,sha256=gY6SpCi9za59ghRuLX_z7xfSok6qqvPbvEnv4BLczqI,265
|
6
|
-
kreuzberg/_document_classification.py,sha256=55aDxDIJ65qK6yEXt-fRYTn8LgALvYsWssjWSheVpR0,5697
|
7
|
-
kreuzberg/_entity_extraction.py,sha256=YvcELIo3kV8A_WbzwNjhKn7rPhkZXjbpNMgm2UK0oJw,3621
|
8
|
-
kreuzberg/_gmft.py,sha256=XI8vdBG0tdEVwFiabVieCuvxM5esqTSiFtsEwJ0YT5g,20787
|
9
|
-
kreuzberg/_language_detection.py,sha256=T9p6aimB7QFXAQiEntIMZeH_Z62E52E8fBQ43hWuyhs,1960
|
10
|
-
kreuzberg/_mime_types.py,sha256=-05mBS5AoF4LUmfB_WyLoce0y4peiOyOf2JucF714WQ,8602
|
11
|
-
kreuzberg/_playa.py,sha256=p4G5ymSSCbQoDeXJjH-yuVzdd4y-wKcolqDthjPtqok,11413
|
12
|
-
kreuzberg/_registry.py,sha256=8XYT-vPhNYMAbB5RBIUKz-1Zdg48OCnBcdVZzBq6YwY,3307
|
13
|
-
kreuzberg/_types.py,sha256=Xht1_TcvsbIpdmLYMy6Pa_HpbQuF9MBOo-BrKkZ7cLA,47358
|
14
|
-
kreuzberg/cli.py,sha256=OoHA5MiIcRBATFJpb-FZYlZfpohxL2AbVgamyhnEMFo,14342
|
15
|
-
kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
|
16
|
-
kreuzberg/extraction.py,sha256=5TuuRqLRmboLaTS0x9eZ2lrYOHKJBSHuTT_U-5nn6ek,17829
|
17
|
-
kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
18
|
-
kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
19
|
-
kreuzberg/_api/_config_cache.py,sha256=gX_ezGNq6SCpTn02yFkn24zMVrQwfIk8-u5XkKJiHFg,8774
|
20
|
-
kreuzberg/_api/main.py,sha256=_tBZaRiq7qq7x4nXkVRgU5FBivLFJ_dmadAc7aT0H_k,13901
|
21
|
-
kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
22
|
-
kreuzberg/_extractors/_base.py,sha256=39E7R7hV6C2uMJdQKLBVSWK3tN-mtK0LaayU10-8Fqo,11191
|
23
|
-
kreuzberg/_extractors/_email.py,sha256=DzNthVbmbdlajDUfs0nNwbHNvG0CAQVqJsRfsatHtf0,8799
|
24
|
-
kreuzberg/_extractors/_html.py,sha256=zZ9WZmmoIG9B5dGF25ulm_GmW9RsYFI1HddDUUp3hOE,6351
|
25
|
-
kreuzberg/_extractors/_image.py,sha256=7rKEGhUAmdzO0YcBKQVhVme4PqyKIi2UCn4esmmFXOY,4300
|
26
|
-
kreuzberg/_extractors/_pandoc.py,sha256=CPEJxKTZdfyb7jPacZkiAsR2NEGL6KyiHzOr88tprJY,24142
|
27
|
-
kreuzberg/_extractors/_pdf.py,sha256=78gPO7m8nPFIOskqqRpUfyOhKUk6f5rjJ0cZDnL9Vdk,23224
|
28
|
-
kreuzberg/_extractors/_presentation.py,sha256=2g6PJnpgUpUfMjQJh-7_gHywDulE8QE8ypH__BrEUTQ,10692
|
29
|
-
kreuzberg/_extractors/_spread_sheet.py,sha256=TJOM70DLN0HzcOkAowZJogAx7QFrouohvU5V0OIliag,12738
|
30
|
-
kreuzberg/_extractors/_structured.py,sha256=YkTOfSQJOe127ZURrAYAomNrIkKoAYC4gt0P9ypY3RY,8919
|
31
|
-
kreuzberg/_mcp/__init__.py,sha256=h6DgLFO4TMUk7_wCJ2jn2Y6IkFmfzb-Z7jX-G5UCYVc,43
|
32
|
-
kreuzberg/_mcp/server.py,sha256=vJWCXbBiv0ktIPZeLedSWZEwKF46p6642H6lxhTnjek,16723
|
33
|
-
kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
|
34
|
-
kreuzberg/_ocr/_base.py,sha256=ZvOJvW8DtylQJZdCPk9vlVNZiBFK-dC4Oj7Kb6-mWkY,1419
|
35
|
-
kreuzberg/_ocr/_easyocr.py,sha256=7bkMM_zN0h7ZiX0-VHxxnwNOhQloI-dlOOibpRc-vNs,15710
|
36
|
-
kreuzberg/_ocr/_paddleocr.py,sha256=XyYc3gtmnvOGfQ0qBQYFphJa1kSv5hZ_LJ0weD2hQ08,15006
|
37
|
-
kreuzberg/_ocr/_table_extractor.py,sha256=LhBiCX8R_xR-uK1FH3ONA_vqOmqUWANZJ2HMCBLsmNY,5513
|
38
|
-
kreuzberg/_ocr/_tesseract.py,sha256=BjTKE6ilUpSEKarHdgP3PbsE6I89JeqgDtpQ-XHniBA,51452
|
39
|
-
kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
40
|
-
kreuzberg/_utils/_cache.py,sha256=AtANbs1MWR4WLB2MhatVGhlh7kM-yjSfFuDnSVSNp50,14110
|
41
|
-
kreuzberg/_utils/_device.py,sha256=o03rLiHiRX6TKhJ55LO1Vj2Map1Po5YdjuMdA63tGOE,8249
|
42
|
-
kreuzberg/_utils/_document_cache.py,sha256=tfk9_Yc1cQkT5_uM5R1uaI4w-2SjNn7QyAd6AmWkSz8,4851
|
43
|
-
kreuzberg/_utils/_errors.py,sha256=aQYEnp8oJ-WJVmCNo7YY-25y1KZZFEwjAmxVRfw4a_M,4920
|
44
|
-
kreuzberg/_utils/_html_streaming.py,sha256=ywQgEQfEGm6MSotS1g_HXgl0e7V59yLmf2wytALuZko,648
|
45
|
-
kreuzberg/_utils/_image_preprocessing.py,sha256=arl4UDDiD_Z6SKM-jTXENaOaaHZBVFTsueb6DcpFXOo,10934
|
46
|
-
kreuzberg/_utils/_ocr_cache.py,sha256=uCCZfdY7EiqMhCnhNwqirFOr-Wfaobd2Ntc-F07TKec,3425
|
47
|
-
kreuzberg/_utils/_pdf_lock.py,sha256=Ytvds30aZf3yXeZFo27ZenrhUoU-GZlR2rKEkhJ_wlk,1349
|
48
|
-
kreuzberg/_utils/_process_pool.py,sha256=fqlxNsxDoqS28BLrZeDBH743HdaUBuGPYFH5hjSajIg,7493
|
49
|
-
kreuzberg/_utils/_quality.py,sha256=FCVh9KieWUYgT1klLxudbslzKuqbOTBbTsHbvIuru7M,5510
|
50
|
-
kreuzberg/_utils/_ref.py,sha256=iOflvjTUc_F0XaL28Bd6fpvL6qkeoURGA4B77Nqky7I,840
|
51
|
-
kreuzberg/_utils/_resource_managers.py,sha256=N3-VeHDj6sKBeg3UL-PqRtKGExUBoVcEB5UuQ8FncY8,2079
|
52
|
-
kreuzberg/_utils/_serialization.py,sha256=G-kxtCPDPGFqBMyHfzvAPo-bNUmPdaXYdeg1dnBLfN4,1789
|
53
|
-
kreuzberg/_utils/_string.py,sha256=wVyvEHByHBeu_6evmqJGv9Ml-NAwkyz60n8l-7L5Cw0,4366
|
54
|
-
kreuzberg/_utils/_sync.py,sha256=gb828WYfVtkB4wKslJrPMmrdeI1h3htWceq-gywHtO4,3184
|
55
|
-
kreuzberg/_utils/_table.py,sha256=OVg6T2QnerMhVNb1juLTBSIjyjFiE5-OrUWr5NSCgnQ,6493
|
56
|
-
kreuzberg/_utils/_tmp.py,sha256=mwZ0BFzhGPfYa2tt8qSjUjfcHnSYvbQT4VlPRCRc_q8,2038
|
57
|
-
kreuzberg-3.16.0.dist-info/METADATA,sha256=d1sUA7WBl0VcXHX0jPGzTHeXmj7yyJzTWjzHUmT-Dp4,12319
|
58
|
-
kreuzberg-3.16.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
|
59
|
-
kreuzberg-3.16.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
|
60
|
-
kreuzberg-3.16.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
|
61
|
-
kreuzberg-3.16.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|
File without changes
|