kreuzberg 3.5.0__py3-none-any.whl → 3.6.0__py3-none-any.whl
This diff shows the changes between two publicly released versions of the package, as they appear in their respective public registries. It is provided for informational purposes only.
- kreuzberg/__init__.py +4 -1
- kreuzberg/_entity_extraction.py +239 -0
- kreuzberg/_types.py +35 -0
- kreuzberg/extraction.py +39 -22
- {kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/METADATA +11 -5
- {kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/RECORD +9 -8
- {kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL +0 -0
- {kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,5 +1,6 @@
 from importlib.metadata import version

+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
 from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
@@ -8,7 +9,7 @@ from kreuzberg._ocr._tesseract import TesseractConfig

 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -25,6 +26,7 @@ __version__ = version("kreuzberg")

 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
@@ -37,6 +39,7 @@ __all__ = [
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",
kreuzberg/_entity_extraction.py
ADDED
@@ -0,0 +1,239 @@
+from __future__ import annotations
+
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg._types import Entity
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class SpacyEntityExtractionConfig:
+    """Configuration for spaCy-based entity extraction."""
+
+    model_cache_dir: str | Path | None = None
+    """Directory to cache spaCy models. If None, uses spaCy's default."""
+
+    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
+    """Mapping of language codes to spaCy model names.
+
+    If None, uses default mappings:
+    - en: en_core_web_sm
+    - de: de_core_news_sm
+    - fr: fr_core_news_sm
+    - es: es_core_news_sm
+    - pt: pt_core_news_sm
+    - it: it_core_news_sm
+    - nl: nl_core_news_sm
+    - zh: zh_core_web_sm
+    - ja: ja_core_news_sm
+    """
+
+    fallback_to_multilingual: bool = True
+    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
+
+    max_doc_length: int = 1000000
+    """Maximum document length for spaCy processing."""
+
+    batch_size: int = 1000
+    """Batch size for processing multiple texts."""
+
+    def __post_init__(self) -> None:
+        if self.language_models is None:
+            object.__setattr__(self, "language_models", self._get_default_language_models())
+
+        if isinstance(self.language_models, dict):
+            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
+
+    @staticmethod
+    def _get_default_language_models() -> dict[str, str]:
+        """Get default language model mappings based on available spaCy models."""
+        return {
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+            "es": "es_core_news_sm",
+            "pt": "pt_core_news_sm",
+            "it": "it_core_news_sm",
+            "nl": "nl_core_news_sm",
+            "zh": "zh_core_web_sm",
+            "ja": "ja_core_news_sm",
+            "ko": "ko_core_news_sm",
+            "ru": "ru_core_news_sm",
+            "pl": "pl_core_news_sm",
+            "ro": "ro_core_news_sm",
+            "el": "el_core_news_sm",
+            "da": "da_core_news_sm",
+            "fi": "fi_core_news_sm",
+            "nb": "nb_core_news_sm",
+            "sv": "sv_core_news_sm",
+            "ca": "ca_core_news_sm",
+            "hr": "hr_core_news_sm",
+            "lt": "lt_core_news_sm",
+            "mk": "mk_core_news_sm",
+            "sl": "sl_core_news_sm",
+            "uk": "uk_core_news_sm",
+        }
+
+    def get_model_for_language(self, language_code: str) -> str | None:
+        """Get the appropriate spaCy model for a language code."""
+        if not self.language_models:
+            return None
+
+        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
+
+        if language_code in models_dict:
+            return models_dict[language_code]
+
+        base_lang = language_code.split("-")[0].lower()
+        if base_lang in models_dict:
+            return models_dict[base_lang]
+
+        return None
+
+    def get_fallback_model(self) -> str | None:
+        """Get fallback multilingual model if enabled."""
+        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
+
+
+def extract_entities(
+    text: str,
+    entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
+    custom_patterns: frozenset[tuple[str, str]] | None = None,
+    languages: list[str] | None = None,
+    spacy_config: SpacyEntityExtractionConfig | None = None,
+) -> list[Entity]:
+    """Extract entities from text using custom regex patterns and/or a NER model.
+
+    Args:
+        text: The input text to extract entities from.
+        entity_types: List of entity types to extract using the NER model.
+        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
+        languages: List of detected languages to choose appropriate spaCy models.
+        spacy_config: Configuration for spaCy entity extraction.
+
+    Returns:
+        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
+
+    Raises:
+        MissingDependencyError: If `spacy` is not installed.
+    """
+    entities: list[Entity] = []
+    if custom_patterns:
+        custom_patterns_dict = dict(custom_patterns)
+        for ent_type, pattern in custom_patterns_dict.items():
+            entities.extend(
+                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                for match in re.finditer(pattern, text)
+            )
+
+    if spacy_config is None:
+        spacy_config = SpacyEntityExtractionConfig()
+
+    try:
+        import spacy  # noqa: F401
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="spacy",
+            dependency_group="entity-extraction",
+            functionality="Entity Extraction",
+        ) from e
+
+    model_name = _select_spacy_model(languages, spacy_config)
+    if not model_name:
+        return entities
+
+    nlp = _load_spacy_model(model_name, spacy_config)
+    if not nlp:
+        return entities
+
+    if len(text) > spacy_config.max_doc_length:
+        text = text[: spacy_config.max_doc_length]
+
+    doc = nlp(text)
+
+    entity_type_mapping = {etype.upper() for etype in entity_types}
+
+    entities.extend(
+        Entity(
+            type=ent.label_,
+            text=ent.text,
+            start=ent.start_char,
+            end=ent.end_char,
+        )
+        for ent in doc.ents
+        if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
+    )
+
+    return entities
+
+
+@lru_cache(maxsize=32)
+def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
+    """Load a spaCy model with caching."""
+    try:
+        import spacy
+
+        if spacy_config.model_cache_dir:
+            import os
+
+            os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
+
+        nlp = spacy.load(model_name)
+
+        nlp.max_length = spacy_config.max_doc_length
+
+        return nlp
+    except (OSError, ImportError):
+        return None
+
+
+def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
+    """Select the best spaCy model based on detected languages."""
+    if not languages:
+        return spacy_config.get_model_for_language("en")
+
+    for lang in languages:
+        model_name = spacy_config.get_model_for_language(lang)
+        if model_name:
+            return model_name
+
+    return spacy_config.get_fallback_model()
+
+
+def extract_keywords(
+    text: str,
+    keyword_count: int = 10,
+) -> list[tuple[str, float]]:
+    """Extract keywords from text using the KeyBERT model.
+
+    Args:
+        text: The input text to extract keywords from.
+        keyword_count: Number of top keywords to return. Defaults to 10.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
+
+    Raises:
+        MissingDependencyError: If `keybert` is not installed.
+    """
+    try:
+        from keybert import KeyBERT
+
+        kw_model = KeyBERT()
+        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
+        return [(kw, float(score)) for kw, score in keywords]
+    except (RuntimeError, OSError, ValueError):
+        return []
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="keybert",
+            dependency_group="entity-extraction",
+            functionality="Keyword Extraction",
+        ) from e
kreuzberg/_types.py
CHANGED
@@ -17,6 +17,7 @@ if TYPE_CHECKING:
     from pandas import DataFrame
     from PIL.Image import Image

+    from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
     from kreuzberg._gmft import GMFTConfig
     from kreuzberg._language_detection import LanguageDetectionConfig
     from kreuzberg._ocr._easyocr import EasyOCRConfig
@@ -100,6 +101,20 @@ class Metadata(TypedDict, total=False):
     """Width of the document page/slide/image, if applicable."""


+@dataclass(frozen=True)
+class Entity:
+    """Represents an extracted entity with type, text, and position."""
+
+    type: str
+    """e.g., PERSON, ORGANIZATION, LOCATION, DATE, EMAIL, PHONE, or custom"""
+    text: str
+    """Extracted text"""
+    start: int
+    """Start character offset in the content"""
+    end: int
+    """End character offset in the content"""
+
+
 @dataclass
 class ExtractionResult:
     """The result of a file extraction."""
@@ -114,6 +129,10 @@ class ExtractionResult:
     """Extracted tables. Is an empty list if 'extract_tables' is not set to True in the ExtractionConfig."""
     chunks: list[str] = field(default_factory=list)
     """The extracted content chunks. This is an empty list if 'chunk_content' is not set to True in the ExtractionConfig."""
+    entities: list[Entity] | None = None
+    """Extracted entities, if entity extraction is enabled."""
+    keywords: list[tuple[str, float]] | None = None
+    """Extracted keywords and their scores, if keyword extraction is enabled."""
     detected_languages: list[str] | None = None
     """Languages detected in the extracted content, if language detection is enabled."""

@@ -160,12 +179,28 @@ class ExtractionConfig:
     """Post processing hooks to call after processing is done and before the final result is returned."""
     validators: list[ValidationHook] | None = None
     """Validation hooks to call after processing is done and before post-processing and result return."""
+    extract_entities: bool = False
+    """Whether to extract named entities from the content."""
+    extract_keywords: bool = False
+    """Whether to extract keywords from the content."""
+    keyword_count: int = 10
+    """Number of keywords to extract if extract_keywords is True."""
+    custom_entity_patterns: frozenset[tuple[str, str]] | None = None
+    """Custom entity patterns as a frozenset of (entity_type, regex_pattern) tuples."""
     auto_detect_language: bool = False
     """Whether to automatically detect language and configure OCR accordingly."""
     language_detection_config: LanguageDetectionConfig | None = None
     """Configuration for language detection. If None, uses default settings."""
+    spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
+    """Configuration for spaCy entity extraction. If None, uses default settings."""

     def __post_init__(self) -> None:
+        if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
+            object.__setattr__(self, "custom_entity_patterns", frozenset(self.custom_entity_patterns.items()))
+        if self.post_processing_hooks is not None and isinstance(self.post_processing_hooks, list):
+            object.__setattr__(self, "post_processing_hooks", tuple(self.post_processing_hooks))
+        if self.validators is not None and isinstance(self.validators, list):
+            object.__setattr__(self, "validators", tuple(self.validators))
         from kreuzberg._ocr._easyocr import EasyOCRConfig
         from kreuzberg._ocr._paddleocr import PaddleOCRConfig
         from kreuzberg._ocr._tesseract import TesseractConfig
kreuzberg/extraction.py
CHANGED
@@ -1,12 +1,14 @@
 from __future__ import annotations

 from pathlib import Path
-from typing import TYPE_CHECKING, Final, cast
+from typing import TYPE_CHECKING, Any, Final, cast

 import anyio

 from kreuzberg import ExtractionResult
 from kreuzberg._chunker import get_chunker
+from kreuzberg._entity_extraction import extract_entities, extract_keywords
+from kreuzberg._language_detection import detect_languages
 from kreuzberg._mime_types import (
     validate_mime_type,
 )
@@ -24,15 +26,7 @@ if TYPE_CHECKING:
 DEFAULT_CONFIG: Final[ExtractionConfig] = ExtractionConfig()


-async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
-    for validator in config.validators or []:
-        await run_maybe_sync(validator, result)
-
-    if config.auto_detect_language and result.content:
-        from kreuzberg._language_detection import detect_languages
-
-        result.detected_languages = detect_languages(result.content, config.language_detection_config)
-
+def _validate_and_post_process_helper(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
     if config.chunk_content:
         result.chunks = _handle_chunk_content(
             mime_type=result.mime_type,
@@ -40,6 +34,39 @@ async def _validate_and_post_process_async(result: ExtractionResult, config: Ext
             content=result.content,
         )

+    if config.extract_entities:
+        try:
+            result.entities = extract_entities(
+                result.content,
+                custom_patterns=config.custom_entity_patterns,
+            )
+        except RuntimeError:
+            result.entities = None
+
+    if config.extract_keywords:
+        try:
+            result.keywords = extract_keywords(
+                result.content,
+                keyword_count=config.keyword_count,
+            )
+        except RuntimeError:
+            result.keywords = None
+
+    if config.auto_detect_language:
+        result.detected_languages = detect_languages(
+            result.content,
+            config=config.language_detection_config,
+        )
+
+    return result
+
+
+async def _validate_and_post_process_async(result: ExtractionResult, config: ExtractionConfig) -> ExtractionResult:
+    for validator in config.validators or []:
+        await run_maybe_sync(validator, result)
+
+    result = _validate_and_post_process_helper(result, config)
+
     for post_processor in config.post_processing_hooks or []:
         result = await run_maybe_sync(post_processor, result)

@@ -50,17 +77,7 @@ def _validate_and_post_process_sync(result: ExtractionResult, config: Extraction
     for validator in config.validators or []:
         run_sync_only(validator, result)

-    if config.auto_detect_language and result.content:
-        from kreuzberg._language_detection import detect_languages
-
-        result.detected_languages = detect_languages(result.content, config.language_detection_config)
-
-    if config.chunk_content:
-        result.chunks = _handle_chunk_content(
-            mime_type=result.mime_type,
-            config=config,
-            content=result.content,
-        )
+    result = _validate_and_post_process_helper(result, config)

     for post_processor in config.post_processing_hooks or []:
         result = run_sync_only(post_processor, result)
@@ -72,7 +89,7 @@ def _handle_chunk_content(
     mime_type: str,
     config: ExtractionConfig,
     content: str,
-) ->
+) -> Any:
     chunker = get_chunker(mime_type=mime_type, max_characters=config.max_chars, overlap_characters=config.max_overlap)
     return chunker.chunks(content)

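End to end, the new flags plug into the existing extraction entry points. A hedged sketch, assuming extract_file_sync keeps the signature it had in 3.5.0 and using an illustrative file path:

from kreuzberg import ExtractionConfig
from kreuzberg.extraction import extract_file_sync

config = ExtractionConfig(extract_entities=True, extract_keywords=True, keyword_count=10)
result = extract_file_sync("report.pdf", config=config)

# A RuntimeError inside the extractors is swallowed by the helper above, leaving the
# fields as None; a missing spacy/keybert install still raises MissingDependencyError.
print(result.entities)
print(result.keywords)
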
{kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/METADATA
CHANGED
@@ -1,12 +1,12 @@
 Metadata-Version: 2.4
 Name: kreuzberg
-Version: 3.5.0
+Version: 3.6.0
 Summary: A text extraction library supporting PDFs, images, office documents and more
 Project-URL: homepage, https://github.com/Goldziher/kreuzberg
 Author-email: Na'aman Hirschfeld <nhirschfed@gmail.com>
 License: MIT
 License-File: LICENSE
-Keywords: document-processing,image-to-text,ocr,pandoc,pdf-extraction,rag,table-extraction,tesseract,text-extraction,text-processing
+Keywords: document-processing,entity-extraction,image-to-text,keyword-extraction,named-entity-recognition,ner,ocr,pandoc,pdf-extraction,rag,spacy,table-extraction,tesseract,text-extraction,text-processing
 Classifier: Development Status :: 5 - Production/Stable
 Classifier: Intended Audience :: Developers
 Classifier: License :: OSI Approved :: MIT License
@@ -36,16 +36,19 @@ Requires-Dist: typing-extensions>=4.14.0; python_version < '3.12'
 Provides-Extra: all
 Requires-Dist: click>=8.2.1; extra == 'all'
 Requires-Dist: easyocr>=1.7.2; extra == 'all'
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
 Requires-Dist: gmft>=0.4.2; extra == 'all'
-Requires-Dist:
+Requires-Dist: keybert>=0.9.0; extra == 'all'
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all'
 Requires-Dist: paddleocr>=3.1.0; extra == 'all'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
 Requires-Dist: rich>=14.0.0; extra == 'all'
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
 Requires-Dist: setuptools>=80.9.0; extra == 'all'
+Requires-Dist: spacy>=3.8.7; extra == 'all'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
 Provides-Extra: api
-Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.
+Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
 Provides-Extra: chunking
 Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
 Provides-Extra: cli
@@ -54,10 +57,13 @@ Requires-Dist: rich>=14.0.0; extra == 'cli'
 Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
 Provides-Extra: easyocr
 Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
+Provides-Extra: entity-extraction
+Requires-Dist: keybert>=0.9.0; extra == 'entity-extraction'
+Requires-Dist: spacy>=3.8.7; extra == 'entity-extraction'
 Provides-Extra: gmft
 Requires-Dist: gmft>=0.4.2; extra == 'gmft'
 Provides-Extra: langdetect
-Requires-Dist: fast-langdetect>=0.2
+Requires-Dist: fast-langdetect>=0.3.2; extra == 'langdetect'
 Provides-Extra: paddleocr
 Requires-Dist: paddleocr>=3.1.0; extra == 'paddleocr'
 Requires-Dist: paddlepaddle>=3.1.0; extra == 'paddleocr'
{kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/RECORD
CHANGED
@@ -1,17 +1,18 @@
-kreuzberg/__init__.py,sha256=
+kreuzberg/__init__.py,sha256=wVxbug-w1cO2xHcP04Bf6QeIKmT2Ep6aeenb8EOYLA0,1534
 kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
 kreuzberg/_chunker.py,sha256=2eHSRHcZdJ2ZjR3in49y3o9tPl5HMO3vkbnMqaVCbHI,1887
 kreuzberg/_cli_config.py,sha256=WD_seFjbuay_NJv77vGLBW6BVV9WZNujdzf3zQkhzPc,5691
 kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
+kreuzberg/_entity_extraction.py,sha256=EIasBGpkZ-3FwivjEpisz23LilTwx8os-IbfrDtzNl4,7815
 kreuzberg/_gmft.py,sha256=e-UpYwizRX_V-dn0a7ja0Z9nShAmDKA1Q7HThJy8cyA,14856
 kreuzberg/_language_detection.py,sha256=22-uXoOu_ws0K8Hz2M7U_SF9QX3npRYLhntAE1dNLFU,3283
 kreuzberg/_mime_types.py,sha256=QgX-k8aI4lTKArObDM0TFPt7DUjUVwWrdIaIZDh_XQY,7815
 kreuzberg/_playa.py,sha256=rU6ii2Qnrj8tkDYlSiab5h-BCYLJnUg4QwSLVDEXF5g,11883
 kreuzberg/_registry.py,sha256=c2B_PJbaL0q3ab2eNmj_0jldeyMaqgvRwkZqUU4MM5Q,3290
-kreuzberg/_types.py,sha256=
+kreuzberg/_types.py,sha256=U72a4SXS1e-zV8cXG0tiozMy9mX9wFM1ma6sVz7HpJo,9936
 kreuzberg/cli.py,sha256=S0w2nGXBWPFn1NhxppW7dpUwB9f_3ymFuWSAB2aRu9g,12465
 kreuzberg/exceptions.py,sha256=xRaiJh11i8E6Nc-gAQPgNW5xvhiiFBhRS-CBbCEbHQM,2881
-kreuzberg/extraction.py,sha256=
+kreuzberg/extraction.py,sha256=mdH45bMAAUUNXYT7UrNyWJ2oD_gXuLUU-NyuYxQM884,17459
 kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
 kreuzberg/_api/main.py,sha256=kZCMPPzP4BGzEege9pdhQTJPKKVjCaC6kZdMMeaqP2M,2599
@@ -46,8 +47,8 @@ kreuzberg/_utils/_serialization.py,sha256=AhZvyAu4KsjAqyZDh--Kn2kSWGgCuH7udio8lT
 kreuzberg/_utils/_string.py,sha256=owIVkUtP0__GiJD9RIJzPdvyIigT5sQho3mOXPbsnW0,958
 kreuzberg/_utils/_sync.py,sha256=oT4Y_cDBKtE_BFEoLTae3rSisqlYXzW-jlUG_x-dmLM,4725
 kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
-kreuzberg-3.
+kreuzberg-3.6.0.dist-info/METADATA,sha256=zlqw5yTQit-jYeZVnM27kPsn2mCfulpL8wssptrQR8Q,9160
+kreuzberg-3.6.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+kreuzberg-3.6.0.dist-info/entry_points.txt,sha256=VdoFaTl3QSvVWOZcIlPpDd47o6kn7EvmXSs8FI0ExLc,48
+kreuzberg-3.6.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
+kreuzberg-3.6.0.dist-info/RECORD,,
{kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/WHEEL
File without changes

{kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/entry_points.txt
File without changes

{kreuzberg-3.5.0.dist-info → kreuzberg-3.6.0.dist-info}/licenses/LICENSE
File without changes