kreuzberg 3.11.4__py3-none-any.whl → 3.13.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- kreuzberg/__init__.py +14 -13
- kreuzberg/__main__.py +0 -2
- kreuzberg/_api/main.py +119 -9
- kreuzberg/_chunker.py +0 -15
- kreuzberg/_config.py +212 -292
- kreuzberg/_document_classification.py +20 -47
- kreuzberg/_entity_extraction.py +1 -122
- kreuzberg/_extractors/_base.py +4 -71
- kreuzberg/_extractors/_email.py +1 -15
- kreuzberg/_extractors/_html.py +9 -12
- kreuzberg/_extractors/_image.py +1 -25
- kreuzberg/_extractors/_pandoc.py +10 -147
- kreuzberg/_extractors/_pdf.py +38 -94
- kreuzberg/_extractors/_presentation.py +0 -99
- kreuzberg/_extractors/_spread_sheet.py +13 -55
- kreuzberg/_extractors/_structured.py +1 -4
- kreuzberg/_gmft.py +14 -199
- kreuzberg/_language_detection.py +1 -36
- kreuzberg/_mcp/__init__.py +0 -2
- kreuzberg/_mcp/server.py +3 -10
- kreuzberg/_mime_types.py +1 -19
- kreuzberg/_ocr/_base.py +4 -76
- kreuzberg/_ocr/_easyocr.py +124 -186
- kreuzberg/_ocr/_paddleocr.py +154 -224
- kreuzberg/_ocr/_table_extractor.py +184 -0
- kreuzberg/_ocr/_tesseract.py +797 -361
- kreuzberg/_playa.py +5 -31
- kreuzberg/_registry.py +0 -36
- kreuzberg/_types.py +588 -93
- kreuzberg/_utils/_cache.py +84 -138
- kreuzberg/_utils/_device.py +0 -74
- kreuzberg/_utils/_document_cache.py +0 -75
- kreuzberg/_utils/_errors.py +0 -50
- kreuzberg/_utils/_ocr_cache.py +136 -0
- kreuzberg/_utils/_pdf_lock.py +0 -16
- kreuzberg/_utils/_process_pool.py +17 -64
- kreuzberg/_utils/_quality.py +0 -60
- kreuzberg/_utils/_ref.py +32 -0
- kreuzberg/_utils/_serialization.py +0 -30
- kreuzberg/_utils/_string.py +9 -59
- kreuzberg/_utils/_sync.py +0 -77
- kreuzberg/_utils/_table.py +49 -101
- kreuzberg/_utils/_tmp.py +0 -9
- kreuzberg/cli.py +54 -74
- kreuzberg/extraction.py +39 -32
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
- kreuzberg-3.13.1.dist-info/RECORD +57 -0
- kreuzberg-3.11.4.dist-info/RECORD +0 -54
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
- {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_document_classification.py
CHANGED
@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
 
+import polars as pl
+
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
@@ -40,21 +42,8 @@ DOCUMENT_CLASSIFIERS = {
 
 
 def _get_translated_text(result: ExtractionResult) -> str:
-    """Translate extracted text to English using Google Translate API.
-
-    Args:
-        result: ExtractionResult containing the text to be translated
-
-    Returns:
-        str: The translated text in lowercase English
-
-    Raises:
-        MissingDependencyError: If the deep-translator package is not installed
-    """
-    # Combine content with metadata for classification
     text_to_classify = result.content
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -68,21 +57,10 @@ def _get_translated_text(result: ExtractionResult) -> str:
     try:
         return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         return text_to_classify.lower()
 
 
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
-    """Classifies the document type based on keywords and patterns.
-
-    Args:
-        result: The extraction result containing the content.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
@@ -111,33 +89,20 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
 def classify_document_from_layout(
     result: ExtractionResult, config: ExtractionConfig
 ) -> tuple[str | None, float | None]:
-    """Classifies the document type based on layout information from OCR.
-
-    Args:
-        result: The extraction result containing the layout data.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
-    if result.layout is None or result.layout.
+    if result.layout is None or result.layout.is_empty():
         return None, None
 
     layout_df = result.layout
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
 
-
-    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    layout_text = " ".join(layout_df["text"].cast(str).to_list())
 
-    # Translate layout text directly for classification
     text_to_classify = layout_text
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -146,20 +111,29 @@ def classify_document_from_layout(
 
         translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         translated_text = text_to_classify.lower()
 
-    layout_df
+    layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
 
-
+    try:
+        layout_df = layout_df.with_columns(
+            [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
+        )
+
+        page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
+        if page_height_val is None:
+            page_height_val = 0.0
+        page_height = float(page_height_val)
+    except Exception:  # noqa: BLE001
+        page_height = 1000.0
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
 
     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
         for pattern in patterns:
-            found_words = layout_df
-            if not found_words.
+            found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
+            if not found_words.is_empty():
                 scores[doc_type] += 1.0
-                word_top = found_words
+                word_top = found_words[0, "top"]
                 if word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
 
@@ -183,8 +157,7 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
-    elif result.layout is not None and not result.layout.
-        # Use layout-based classification if layout data is available
+    elif result.layout is not None and not result.layout.is_empty():
         result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
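Note: the hunks above migrate kreuzberg/_document_classification.py from pandas to polars for the OCR layout table: .empty becomes .is_empty(), .astype(str).tolist() becomes .cast(str).to_list(), boolean-mask indexing becomes .filter(...), and positional access becomes df[0, "top"]. Below is a minimal, self-contained sketch of those polars calls on made-up layout data; it is not kreuzberg code.

import polars as pl

# Made-up OCR layout rows: word text plus bounding-box coordinates.
layout_df = pl.DataFrame(
    {
        "text": ["INVOICE", "Total", "42.00"],
        "top": [10.0, 500.0, 520.0],
        "height": [12.0, 10.0, 10.0],
    }
)

# The translated page text is attached as a constant column, as in the new code.
layout_df = layout_df.with_columns(pl.lit("invoice total 42.00").alias("translated_text"))

# Page height estimate: max top plus max height.
page_height = float(layout_df.select(pl.col("top").max() + pl.col("height").max()).item())

# Keyword check: keep rows whose translated text contains the pattern.
found_words = layout_df.filter(layout_df["translated_text"].str.contains("invoice"))
if not found_words.is_empty():
    word_top = found_words[0, "top"]  # positional indexing replaces pandas .iloc
    print(word_top < page_height * 0.3)  # True: the matching row sits in the top 30% of the page

Because translated_text is a constant column, a pattern either matches every row or none; the first matching row's top coordinate is then compared against the estimated page height.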
kreuzberg/_entity_extraction.py
CHANGED
@@ -2,105 +2,14 @@ from __future__ import annotations
 
 import os
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
-from kreuzberg._types import Entity
+from kreuzberg._types import Entity, SpacyEntityExtractionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from pathlib import Path
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class SpacyEntityExtractionConfig:
-    """Configuration for spaCy-based entity extraction."""
-
-    model_cache_dir: str | Path | None = None
-    """Directory to cache spaCy models. If None, uses spaCy's default."""
-
-    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
-    """Mapping of language codes to spaCy model names.
-
-    If None, uses default mappings:
-    - en: en_core_web_sm
-    - de: de_core_news_sm
-    - fr: fr_core_news_sm
-    - es: es_core_news_sm
-    - pt: pt_core_news_sm
-    - it: it_core_news_sm
-    - nl: nl_core_news_sm
-    - zh: zh_core_web_sm
-    - ja: ja_core_news_sm
-    """
-
-    fallback_to_multilingual: bool = True
-    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
-
-    max_doc_length: int = 1000000
-    """Maximum document length for spaCy processing."""
-
-    batch_size: int = 1000
-    """Batch size for processing multiple texts."""
-
-    def __post_init__(self) -> None:
-        if self.language_models is None:
-            object.__setattr__(self, "language_models", self._get_default_language_models())
-
-        if isinstance(self.language_models, dict):
-            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
-
-    @staticmethod
-    def _get_default_language_models() -> dict[str, str]:
-        """Get default language model mappings based on available spaCy models."""
-        return {
-            "en": "en_core_web_sm",
-            "de": "de_core_news_sm",
-            "fr": "fr_core_news_sm",
-            "es": "es_core_news_sm",
-            "pt": "pt_core_news_sm",
-            "it": "it_core_news_sm",
-            "nl": "nl_core_news_sm",
-            "zh": "zh_core_web_sm",
-            "ja": "ja_core_news_sm",
-            "ko": "ko_core_news_sm",
-            "ru": "ru_core_news_sm",
-            "pl": "pl_core_news_sm",
-            "ro": "ro_core_news_sm",
-            "el": "el_core_news_sm",
-            "da": "da_core_news_sm",
-            "fi": "fi_core_news_sm",
-            "nb": "nb_core_news_sm",
-            "sv": "sv_core_news_sm",
-            "ca": "ca_core_news_sm",
-            "hr": "hr_core_news_sm",
-            "lt": "lt_core_news_sm",
-            "mk": "mk_core_news_sm",
-            "sl": "sl_core_news_sm",
-            "uk": "uk_core_news_sm",
-        }
-
-    def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
-        if not self.language_models:
-            return None
-
-        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
-
-        if language_code in models_dict:
-            return models_dict[language_code]
-
-        base_lang = language_code.split("-")[0].lower()
-        if base_lang in models_dict:
-            return models_dict[base_lang]
-
-        return None
-
-    def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
-        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
 def extract_entities(
@@ -110,24 +19,8 @@ def extract_entities(
     languages: list[str] | None = None,
     spacy_config: SpacyEntityExtractionConfig | None = None,
 ) -> list[Entity]:
-    """Extract entities from text using custom regex patterns and/or a NER model.
-
-    Args:
-        text: The input text to extract entities from.
-        entity_types: List of entity types to extract using the NER model.
-        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
-        languages: List of detected languages to choose appropriate spaCy models.
-        spacy_config: Configuration for spaCy entity extraction.
-
-    Returns:
-        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
-
-    Raises:
-        MissingDependencyError: If `spacy` is not installed.
-    """
     entities: list[Entity] = []
     if custom_patterns:
-        # Direct iteration over frozenset - no need to convert to dict
         for ent_type, pattern in custom_patterns:
             entities.extend(
                 Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
@@ -177,7 +70,6 @@ def extract_entities(
 
 @lru_cache(maxsize=32)
 def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
-    """Load a spaCy model with caching."""
     try:
         import spacy  # noqa: PLC0415
 
@@ -194,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
 
 
 def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
-    """Select the best spaCy model based on detected languages."""
     if not languages:
         return spacy_config.get_model_for_language("en")
 
@@ -210,18 +101,6 @@ def extract_keywords(
     text: str,
     keyword_count: int = 10,
 ) -> list[tuple[str, float]]:
-    """Extract keywords from text using the KeyBERT model.
-
-    Args:
-        text: The input text to extract keywords from.
-        keyword_count: Number of top keywords to return. Defaults to 10.
-
-    Returns:
-        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
-
-    Raises:
-        MissingDependencyError: If `keybert` is not installed.
-    """
     try:
         from keybert import KeyBERT  # noqa: PLC0415
 
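Note: SpacyEntityExtractionConfig now lives in kreuzberg._types and is only imported by this module; the regex branch of extract_entities is otherwise unchanged. The following self-contained sketch shows that custom-pattern idea; the Entity dataclass and the e-mail pattern are illustrative stand-ins, not kreuzberg's actual definitions.

from __future__ import annotations

import re
from dataclasses import dataclass


@dataclass(frozen=True)
class Entity:  # stand-in for kreuzberg._types.Entity
    type: str
    text: str
    start: int
    end: int


def extract_with_patterns(text: str, custom_patterns: tuple[tuple[str, str], ...]) -> list[Entity]:
    # Each (entity type, regex) pair contributes one Entity per non-overlapping match.
    entities: list[Entity] = []
    for ent_type, pattern in custom_patterns:
        entities.extend(
            Entity(type=ent_type, text=m.group(), start=m.start(), end=m.end())
            for m in re.finditer(pattern, text)
        )
    return entities


print(extract_with_patterns("Contact: jane@example.com", (("EMAIL", r"[\w.+-]+@[\w-]+\.[\w.]+"),)))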
kreuzberg/_extractors/_base.py
CHANGED
@@ -13,20 +13,6 @@ if TYPE_CHECKING:
 
 
 class Extractor(ABC):
-    """Abstract base class for file content extraction.
-
-    This class provides the interface for different types of content extractors.
-    Subclasses are expected to implement the methods for extracting content
-    either asynchronously or synchronously and determining the supported MIME types.
-
-    Attributes:
-        SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
-
-    Args:
-        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
-        config: Configuration options for the extraction process.
-    """
-
     __slots__ = ("config", "mime_type")
 
     SUPPORTED_MIME_TYPES: ClassVar[set[str]]
@@ -36,89 +22,36 @@ class Extractor(ABC):
         self.config = config
 
     @abstractmethod
-    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_path_async(self, path: Path) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_path_sync(self, path: Path) -> ExtractionResult: ...
 
     @classmethod
     def supports_mimetype(cls, mime_type: str) -> bool:
-        """Verify whether the extractor supports the given MIME type.
-
-        Args:
-            mime_type: The MIME type to check (e.g., "application/pdf").
-
-        Returns:
-            bool: True if the MIME type is supported, False otherwise.
-        """
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
 
     def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
-        """Apply quality post-processing to extraction result if enabled.
-
-        Args:
-            result: The raw extraction result
-
-        Returns:
-            Enhanced extraction result with quality improvements (if enabled)
-        """
-        # Only apply quality processing if enabled in config
         if not self.config.enable_quality_processing:
             return result
 
         if not result.content:
             return result
 
-        # Clean the content
         cleaned_content = clean_extracted_text(result.content)
 
-        # Calculate quality score
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
-        # Add quality metadata
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
-        # Return enhanced result
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
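Note: the Extractor ABC keeps its behavior but loses its docstrings, leaving bare ... stubs. The sketch below mirrors the slimmed-down interface with a hypothetical PlainTextExtractor subclass; it covers only the two sync methods and a simplified ExtractionResult, whereas the real class also declares the async variants and a config-aware constructor.

from __future__ import annotations

from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from pathlib import Path
from typing import ClassVar


@dataclass
class ExtractionResult:  # simplified stand-in for kreuzberg._types.ExtractionResult
    content: str
    mime_type: str
    metadata: dict = field(default_factory=dict)
    chunks: list = field(default_factory=list)


class Extractor(ABC):
    SUPPORTED_MIME_TYPES: ClassVar[set[str]]

    @abstractmethod
    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...

    @abstractmethod
    def extract_path_sync(self, path: Path) -> ExtractionResult: ...

    @classmethod
    def supports_mimetype(cls, mime_type: str) -> bool:
        # Exact match or prefix match against the declared MIME types.
        return mime_type in cls.SUPPORTED_MIME_TYPES or any(
            mime_type.startswith(supported) for supported in cls.SUPPORTED_MIME_TYPES
        )


class PlainTextExtractor(Extractor):  # hypothetical example subclass
    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"text/plain"}

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        return ExtractionResult(content=content.decode("utf-8", errors="replace"), mime_type="text/plain")

    def extract_path_sync(self, path: Path) -> ExtractionResult:
        return self.extract_bytes_sync(path.read_bytes())


print(PlainTextExtractor.supports_mimetype("text/plain"))  # True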
kreuzberg/_extractors/_email.py
CHANGED
@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
 
-# Import optional dependencies at module level with proper error handling
 try:
     import mailparse
 except ImportError:  # pragma: no cover
@@ -27,7 +26,6 @@ try:
 except ImportError:  # pragma: no cover
     html2text = None
 
-# Compile regex pattern once at module level
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
 
 
@@ -44,8 +42,6 @@ class EmailExtractor(Extractor):
     def _extract_email_headers(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email headers."""
-        # Use single dict access where possible to avoid repeated lookups
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -59,9 +55,7 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
-            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
-                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
                 metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
@@ -69,7 +63,6 @@ class EmailExtractor(Extractor):
             else:
                 metadata["email_to"] = str(to_info)
 
-            # For display, format all recipients
             to_formatted = self._format_email_field(to_info)
             text_parts.append(f"To: {to_formatted}")
 
@@ -91,7 +84,6 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-        """Format email field (to, cc, bcc) for display."""
         if isinstance(field, list):
             emails = []
             for item in field:
@@ -107,23 +99,20 @@ class EmailExtractor(Extractor):
         return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
-        """Extract and process email body content."""
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
-            return
+            return
 
         html_content = parsed_email.get("html")
         if html_content:
             if html2text is not None:
-                # Use html2text if available (faster path)
                 h = html2text.HTML2Text()
                 h.ignore_links = True
                 h.ignore_images = True
                 converted_text = h.handle(html_content)
                 text_parts.append(f"\n{converted_text}")
             else:
-                # Fallback: strip HTML tags and unescape entities
                 clean_html = _HTML_TAG_PATTERN.sub("", html_content)
                 clean_html = unescape(clean_html)
                 text_parts.append(f"\n{clean_html}")
@@ -131,7 +120,6 @@ class EmailExtractor(Extractor):
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email attachments info."""
         if parsed_email.get("attachments"):
             attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
             metadata["attachments"] = attachment_names
@@ -148,12 +136,10 @@ class EmailExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}
 
-        # Extract headers, body, and attachments
         self._extract_email_headers(parsed_email, text_parts, metadata)
         self._extract_email_body(parsed_email, text_parts)
         self._extract_email_attachments(parsed_email, text_parts, metadata)
 
-        # Join efficiently
         combined_text = "\n".join(text_parts)
 
         return ExtractionResult(
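Note: only comments and docstrings are removed from the e-mail extractor; its logic is unchanged. The HTML fallback it keeps (used when html2text is not installed) amounts to a tag-stripping regex followed by entity unescaping, sketched here on made-up input.

import re
from html import unescape

_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")

html_content = "<p>Hello <b>world</b> &amp; friends</p>"
clean_html = _HTML_TAG_PATTERN.sub("", html_content)  # drop the tags
clean_html = unescape(clean_html)  # turn &amp; back into &
print(clean_html)  # Hello world & friends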
kreuzberg/_extractors/_html.py
CHANGED
@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
-
-
-
-
-
-
-
-
-        # Skip normalize_spaces since quality processing will handle whitespace
+        config = self.config.html_to_markdown_config if self.config else None
+        if config is None:
+            config = HTMLToMarkdownConfig()
+
+        config_dict = config.to_dict()
+
+        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
 
-        # Apply quality processing which includes normalization
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
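Note: extract_bytes_sync now resolves an HTMLToMarkdownConfig (falling back to a default instance), flattens it with to_dict(), and spreads the result into html_to_markdown.convert_to_markdown together with the decoded bytes. A minimal sketch of that call, assuming only the convert_to_markdown entry point visible in this diff and omitting the optional config keyword arguments:

import html_to_markdown

raw = b"<h1>Title</h1><p>Some <em>body</em> text.</p>"
# Decode the raw bytes, then convert; kreuzberg additionally passes **config.to_dict() here.
markdown = html_to_markdown.convert_to_markdown(raw.decode("utf-8"))
print(markdown)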
kreuzberg/_extractors/_image.py
CHANGED
@@ -3,7 +3,6 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
-from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -65,7 +61,6 @@ class ImageExtractor(Extractor):
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -79,30 +74,11 @@ class ImageExtractor(Extractor):
             Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
         backend = get_ocr_backend(self.config.ocr_backend)
-
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        result = backend.process_file_sync(path, **self.config.get_config_dict())
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
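Note: the per-backend match/case (which rebuilt a TesseractConfig, PaddleOCRConfig, or EasyOCRConfig and spread it with asdict) collapses into a single call that spreads self.config.get_config_dict(). A stand-in sketch of that pattern; the dataclasses and field names below are illustrative, not kreuzberg's actual configuration classes.

from dataclasses import asdict, dataclass


@dataclass(frozen=True)
class DemoTesseractConfig:  # hypothetical OCR backend config
    language: str = "eng"
    psm: int = 3


@dataclass
class DemoExtractionConfig:  # hypothetical extraction config
    ocr_backend: str = "tesseract"
    ocr_config: object | None = None

    def get_config_dict(self) -> dict:
        # Use the explicit OCR config when given, otherwise the backend's defaults.
        if self.ocr_config is not None:
            return asdict(self.ocr_config)
        if self.ocr_backend == "tesseract":
            return asdict(DemoTesseractConfig())
        return {}


config = DemoExtractionConfig()
print(config.get_config_dict())  # {'language': 'eng', 'psm': 3}

The extractor can then call backend.process_file_sync(path, **config.get_config_dict()) without knowing which OCR backend is selected.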