kreuzberg-3.11.4-py3-none-any.whl → kreuzberg-3.13.1-py3-none-any.whl

This diff compares the contents of two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
Files changed (51)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_chunker.py +0 -15
  5. kreuzberg/_config.py +212 -292
  6. kreuzberg/_document_classification.py +20 -47
  7. kreuzberg/_entity_extraction.py +1 -122
  8. kreuzberg/_extractors/_base.py +4 -71
  9. kreuzberg/_extractors/_email.py +1 -15
  10. kreuzberg/_extractors/_html.py +9 -12
  11. kreuzberg/_extractors/_image.py +1 -25
  12. kreuzberg/_extractors/_pandoc.py +10 -147
  13. kreuzberg/_extractors/_pdf.py +38 -94
  14. kreuzberg/_extractors/_presentation.py +0 -99
  15. kreuzberg/_extractors/_spread_sheet.py +13 -55
  16. kreuzberg/_extractors/_structured.py +1 -4
  17. kreuzberg/_gmft.py +14 -199
  18. kreuzberg/_language_detection.py +1 -36
  19. kreuzberg/_mcp/__init__.py +0 -2
  20. kreuzberg/_mcp/server.py +3 -10
  21. kreuzberg/_mime_types.py +1 -19
  22. kreuzberg/_ocr/_base.py +4 -76
  23. kreuzberg/_ocr/_easyocr.py +124 -186
  24. kreuzberg/_ocr/_paddleocr.py +154 -224
  25. kreuzberg/_ocr/_table_extractor.py +184 -0
  26. kreuzberg/_ocr/_tesseract.py +797 -361
  27. kreuzberg/_playa.py +5 -31
  28. kreuzberg/_registry.py +0 -36
  29. kreuzberg/_types.py +588 -93
  30. kreuzberg/_utils/_cache.py +84 -138
  31. kreuzberg/_utils/_device.py +0 -74
  32. kreuzberg/_utils/_document_cache.py +0 -75
  33. kreuzberg/_utils/_errors.py +0 -50
  34. kreuzberg/_utils/_ocr_cache.py +136 -0
  35. kreuzberg/_utils/_pdf_lock.py +0 -16
  36. kreuzberg/_utils/_process_pool.py +17 -64
  37. kreuzberg/_utils/_quality.py +0 -60
  38. kreuzberg/_utils/_ref.py +32 -0
  39. kreuzberg/_utils/_serialization.py +0 -30
  40. kreuzberg/_utils/_string.py +9 -59
  41. kreuzberg/_utils/_sync.py +0 -77
  42. kreuzberg/_utils/_table.py +49 -101
  43. kreuzberg/_utils/_tmp.py +0 -9
  44. kreuzberg/cli.py +54 -74
  45. kreuzberg/extraction.py +39 -32
  46. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +19 -15
  47. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  48. kreuzberg-3.11.4.dist-info/RECORD +0 -54
  49. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  50. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  51. {kreuzberg-3.11.4.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_document_classification.py

@@ -3,6 +3,8 @@ from __future__ import annotations
 import re
 from typing import TYPE_CHECKING
 
+import polars as pl
+
 from kreuzberg._ocr import get_ocr_backend
 from kreuzberg._types import ExtractionConfig, ExtractionResult  # noqa: TC001
 from kreuzberg.exceptions import MissingDependencyError
@@ -40,21 +42,8 @@ DOCUMENT_CLASSIFIERS = {
 
 
 def _get_translated_text(result: ExtractionResult) -> str:
-    """Translate extracted text to English using Google Translate API.
-
-    Args:
-        result: ExtractionResult containing the text to be translated
-
-    Returns:
-        str: The translated text in lowercase English
-
-    Raises:
-        MissingDependencyError: If the deep-translator package is not installed
-    """
-    # Combine content with metadata for classification
     text_to_classify = result.content
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -68,21 +57,10 @@ def _get_translated_text(result: ExtractionResult) -> str:
     try:
         return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         return text_to_classify.lower()
 
 
 def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
-    """Classifies the document type based on keywords and patterns.
-
-    Args:
-        result: The extraction result containing the content.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
@@ -111,33 +89,20 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
 def classify_document_from_layout(
     result: ExtractionResult, config: ExtractionConfig
 ) -> tuple[str | None, float | None]:
-    """Classifies the document type based on layout information from OCR.
-
-    Args:
-        result: The extraction result containing the layout data.
-        config: The extraction configuration.
-
-    Returns:
-        A tuple containing the detected document type and the confidence score,
-        or (None, None) if no type is detected with sufficient confidence.
-    """
     if not config.auto_detect_document_type:
         return None, None
 
-    if result.layout is None or result.layout.empty:
+    if result.layout is None or result.layout.is_empty():
         return None, None
 
     layout_df = result.layout
     if not all(col in layout_df.columns for col in ["text", "top", "height"]):
         return None, None
 
-    # Use layout text for classification, not the content
-    layout_text = " ".join(layout_df["text"].astype(str).tolist())
+    layout_text = " ".join(layout_df["text"].cast(str).to_list())
 
-    # Translate layout text directly for classification
     text_to_classify = layout_text
     if result.metadata:
-        # Add metadata values to the text for classification
         metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
         text_to_classify = f"{text_to_classify} {metadata_text}"
 
@@ -146,20 +111,29 @@ def classify_document_from_layout(
     try:
         translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
     except Exception:  # noqa: BLE001
-        # Fall back to original content in lowercase if translation fails
         translated_text = text_to_classify.lower()
 
-    layout_df["translated_text"] = translated_text
+    layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
 
-    page_height = layout_df["top"].max() + layout_df["height"].max()
+    try:
+        layout_df = layout_df.with_columns(
+            [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
+        )
+
+        page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
+        if page_height_val is None:
+            page_height_val = 0.0
+        page_height = float(page_height_val)
+    except Exception:  # noqa: BLE001
+        page_height = 1000.0
     scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
 
     for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
         for pattern in patterns:
-            found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
-            if not found_words.empty:
+            found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
+            if not found_words.is_empty():
                 scores[doc_type] += 1.0
-                word_top = found_words.iloc[0]["top"]
+                word_top = found_words[0, "top"]
                 if word_top < page_height * 0.3:
                     scores[doc_type] += 0.5
 
@@ -183,8 +157,7 @@ def auto_detect_document_type(
     if config.document_classification_mode == "vision" and file_path:
         layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
         result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
-    elif result.layout is not None and not result.layout.empty:
-        # Use layout-based classification if layout data is available
+    elif result.layout is not None and not result.layout.is_empty():
         result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
     else:
         result.document_type, result.document_type_confidence = classify_document(result, config)
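Note on the pandas → polars migration above: the hunks swap `.empty`, `.astype(...).tolist()`, and boolean-mask indexing for their polars equivalents. A minimal sketch of the corresponding idioms on a hypothetical layout frame (not part of the package); one caveat worth knowing is that polars' `str.contains` is regex-based and has no `case=` argument, so case-insensitive matching needs an inline `(?i)` flag:

    import polars as pl

    # Hypothetical frame mirroring the OCR layout columns used above.
    df = pl.DataFrame({"text": ["INVOICE", "Total due"], "top": [12.0, 700.0], "height": [10.0, 10.0]})

    assert not df.is_empty()                                   # pandas: df.empty
    words = " ".join(df["text"].cast(pl.Utf8).to_list())       # pandas: .astype(str).tolist()

    found = df.filter(pl.col("text").str.contains("(?i)invoice"))  # pandas: df[mask] with case=False
    if not found.is_empty():
        word_top = found[0, "top"]                             # pandas: found.iloc[0]["top"]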
kreuzberg/_entity_extraction.py

@@ -2,105 +2,14 @@ from __future__ import annotations
 
 import os
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
-from kreuzberg._types import Entity
+from kreuzberg._types import Entity, SpacyEntityExtractionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from pathlib import Path
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class SpacyEntityExtractionConfig:
-    """Configuration for spaCy-based entity extraction."""
-
-    model_cache_dir: str | Path | None = None
-    """Directory to cache spaCy models. If None, uses spaCy's default."""
-
-    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
-    """Mapping of language codes to spaCy model names.
-
-    If None, uses default mappings:
-    - en: en_core_web_sm
-    - de: de_core_news_sm
-    - fr: fr_core_news_sm
-    - es: es_core_news_sm
-    - pt: pt_core_news_sm
-    - it: it_core_news_sm
-    - nl: nl_core_news_sm
-    - zh: zh_core_web_sm
-    - ja: ja_core_news_sm
-    """
-
-    fallback_to_multilingual: bool = True
-    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
-
-    max_doc_length: int = 1000000
-    """Maximum document length for spaCy processing."""
-
-    batch_size: int = 1000
-    """Batch size for processing multiple texts."""
-
-    def __post_init__(self) -> None:
-        if self.language_models is None:
-            object.__setattr__(self, "language_models", self._get_default_language_models())
-
-        if isinstance(self.language_models, dict):
-            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
-
-    @staticmethod
-    def _get_default_language_models() -> dict[str, str]:
-        """Get default language model mappings based on available spaCy models."""
-        return {
-            "en": "en_core_web_sm",
-            "de": "de_core_news_sm",
-            "fr": "fr_core_news_sm",
-            "es": "es_core_news_sm",
-            "pt": "pt_core_news_sm",
-            "it": "it_core_news_sm",
-            "nl": "nl_core_news_sm",
-            "zh": "zh_core_web_sm",
-            "ja": "ja_core_news_sm",
-            "ko": "ko_core_news_sm",
-            "ru": "ru_core_news_sm",
-            "pl": "pl_core_news_sm",
-            "ro": "ro_core_news_sm",
-            "el": "el_core_news_sm",
-            "da": "da_core_news_sm",
-            "fi": "fi_core_news_sm",
-            "nb": "nb_core_news_sm",
-            "sv": "sv_core_news_sm",
-            "ca": "ca_core_news_sm",
-            "hr": "hr_core_news_sm",
-            "lt": "lt_core_news_sm",
-            "mk": "mk_core_news_sm",
-            "sl": "sl_core_news_sm",
-            "uk": "uk_core_news_sm",
-        }
-
-    def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
-        if not self.language_models:
-            return None
-
-        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
-
-        if language_code in models_dict:
-            return models_dict[language_code]
-
-        base_lang = language_code.split("-")[0].lower()
-        if base_lang in models_dict:
-            return models_dict[base_lang]
-
-        return None
-
-    def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
-        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
 def extract_entities(
@@ -110,24 +19,8 @@ def extract_entities(
     languages: list[str] | None = None,
     spacy_config: SpacyEntityExtractionConfig | None = None,
 ) -> list[Entity]:
-    """Extract entities from text using custom regex patterns and/or a NER model.
-
-    Args:
-        text: The input text to extract entities from.
-        entity_types: List of entity types to extract using the NER model.
-        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
-        languages: List of detected languages to choose appropriate spaCy models.
-        spacy_config: Configuration for spaCy entity extraction.
-
-    Returns:
-        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
-
-    Raises:
-        MissingDependencyError: If `spacy` is not installed.
-    """
     entities: list[Entity] = []
     if custom_patterns:
-        # Direct iteration over frozenset - no need to convert to dict
         for ent_type, pattern in custom_patterns:
             entities.extend(
                 Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
@@ -177,7 +70,6 @@ def extract_entities(
 
 @lru_cache(maxsize=32)
 def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
-    """Load a spaCy model with caching."""
     try:
         import spacy  # noqa: PLC0415
 
@@ -194,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
 
 
 def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
-    """Select the best spaCy model based on detected languages."""
     if not languages:
         return spacy_config.get_model_for_language("en")
 
@@ -210,18 +101,6 @@ def extract_keywords(
     text: str,
     keyword_count: int = 10,
 ) -> list[tuple[str, float]]:
-    """Extract keywords from text using the KeyBERT model.
-
-    Args:
-        text: The input text to extract keywords from.
-        keyword_count: Number of top keywords to return. Defaults to 10.
-
-    Returns:
-        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
-
-    Raises:
-        MissingDependencyError: If `keybert` is not installed.
-    """
     try:
         from keybert import KeyBERT  # noqa: PLC0415
 
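SpacyEntityExtractionConfig now lives in kreuzberg._types (see the changed import above) instead of being defined in this module. A short usage sketch based on the removed definition, assuming the relocated class keeps the same behavior:

    from kreuzberg._types import SpacyEntityExtractionConfig

    # A dict passed as language_models is normalized to a sorted tuple in
    # __post_init__, which keeps the frozen dataclass hashable for the
    # lru_cache-decorated _load_spacy_model above.
    cfg = SpacyEntityExtractionConfig(language_models={"en": "en_core_web_sm", "de": "de_core_news_sm"})

    assert cfg.get_model_for_language("de") == "de_core_news_sm"
    assert cfg.get_model_for_language("de-AT") == "de_core_news_sm"  # falls back to the base language code
    assert cfg.get_fallback_model() == "xx_ent_wiki_sm"              # fallback_to_multilingual defaults to True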
kreuzberg/_extractors/_base.py

@@ -13,20 +13,6 @@ if TYPE_CHECKING:
 
 
 class Extractor(ABC):
-    """Abstract base class for file content extraction.
-
-    This class provides the interface for different types of content extractors.
-    Subclasses are expected to implement the methods for extracting content
-    either asynchronously or synchronously and determining the supported MIME types.
-
-    Attributes:
-        SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
-
-    Args:
-        mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
-        config: Configuration options for the extraction process.
-    """
-
     __slots__ = ("config", "mime_type")
 
     SUPPORTED_MIME_TYPES: ClassVar[set[str]]
@@ -36,89 +22,36 @@ class Extractor(ABC):
         self.config = config
 
     @abstractmethod
-    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
-        """Asynchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    async def extract_path_async(self, path: Path) -> ExtractionResult:
-        """Asynchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    async def extract_path_async(self, path: Path) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Synchronously extract content from a byte stream.
-
-        Args:
-            content: The byte content to extract.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
 
     @abstractmethod
-    def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Synchronously extract content from a file located at the specified path.
-
-        Args:
-            path: The path to the file to process.
-
-        Returns:
-            ExtractionResult: The extracted content along with metadata about the extraction.
-        """
+    def extract_path_sync(self, path: Path) -> ExtractionResult: ...
 
     @classmethod
     def supports_mimetype(cls, mime_type: str) -> bool:
-        """Verify whether the extractor supports the given MIME type.
-
-        Args:
-            mime_type: The MIME type to check (e.g., "application/pdf").
-
-        Returns:
-            bool: True if the MIME type is supported, False otherwise.
-        """
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
 
     def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
-        """Apply quality post-processing to extraction result if enabled.
-
-        Args:
-            result: The raw extraction result
-
-        Returns:
-            Enhanced extraction result with quality improvements (if enabled)
-        """
-        # Only apply quality processing if enabled in config
         if not self.config.enable_quality_processing:
             return result
 
         if not result.content:
            return result
 
-        # Clean the content
        cleaned_content = clean_extracted_text(result.content)
 
-        # Calculate quality score
        quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
-        # Add quality metadata
        enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
-        # Return enhanced result
        return ExtractionResult(
            content=cleaned_content,
            mime_type=result.mime_type,
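With the docstrings collapsed to `...` stubs, the Extractor contract is easiest to see from a subclass. A minimal conforming sketch (hypothetical class, not shipped in the package):

    from pathlib import Path
    from typing import ClassVar

    class PlainTextExtractor(Extractor):
        # Required by the ABC: concrete extractors declare their MIME types.
        SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"text/plain"}

        def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
            return ExtractionResult(
                content=content.decode("utf-8", errors="replace"),
                mime_type="text/plain", metadata={}, chunks=[],
            )

        def extract_path_sync(self, path: Path) -> ExtractionResult:
            return self.extract_bytes_sync(path.read_bytes())

        async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
            return self.extract_bytes_sync(content)

        async def extract_path_async(self, path: Path) -> ExtractionResult:
            return self.extract_path_sync(path)

    # supports_mimetype also prefix-matches, so type parameters pass the check:
    # PlainTextExtractor.supports_mimetype("text/plain; charset=utf-8") -> True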
kreuzberg/_extractors/_email.py

@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
 
-# Import optional dependencies at module level with proper error handling
 try:
     import mailparse
 except ImportError:  # pragma: no cover
@@ -27,7 +26,6 @@ try:
 except ImportError:  # pragma: no cover
     html2text = None
 
-# Compile regex pattern once at module level
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
 
 
@@ -44,8 +42,6 @@ class EmailExtractor(Extractor):
     def _extract_email_headers(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email headers."""
-        # Use single dict access where possible to avoid repeated lookups
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -59,9 +55,7 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
-            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
-                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
                 metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
@@ -69,7 +63,6 @@ class EmailExtractor(Extractor):
             else:
                 metadata["email_to"] = str(to_info)
 
-            # For display, format all recipients
             to_formatted = self._format_email_field(to_info)
             text_parts.append(f"To: {to_formatted}")
 
@@ -91,7 +84,6 @@ class EmailExtractor(Extractor):
             text_parts.append(f"BCC: {bcc_formatted}")
 
     def _format_email_field(self, field: Any) -> str:
-        """Format email field (to, cc, bcc) for display."""
         if isinstance(field, list):
             emails = []
             for item in field:
@@ -107,23 +99,20 @@ class EmailExtractor(Extractor):
         return str(field)
 
     def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
-        """Extract and process email body content."""
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
-            return  # If we have text, prefer it over HTML
+            return
 
         html_content = parsed_email.get("html")
         if html_content:
             if html2text is not None:
-                # Use html2text if available (faster path)
                 h = html2text.HTML2Text()
                 h.ignore_links = True
                 h.ignore_images = True
                 converted_text = h.handle(html_content)
                 text_parts.append(f"\n{converted_text}")
             else:
-                # Fallback: strip HTML tags and unescape entities
                 clean_html = _HTML_TAG_PATTERN.sub("", html_content)
                 clean_html = unescape(clean_html)
                 text_parts.append(f"\n{clean_html}")
@@ -131,7 +120,6 @@ class EmailExtractor(Extractor):
     def _extract_email_attachments(
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
-        """Extract and process email attachments info."""
         if parsed_email.get("attachments"):
             attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
             metadata["attachments"] = attachment_names
@@ -148,12 +136,10 @@ class EmailExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}
 
-        # Extract headers, body, and attachments
         self._extract_email_headers(parsed_email, text_parts, metadata)
         self._extract_email_body(parsed_email, text_parts)
         self._extract_email_attachments(parsed_email, text_parts, metadata)
 
-        # Join efficiently
         combined_text = "\n".join(text_parts)
 
         return ExtractionResult(
kreuzberg/_extractors/_html.py

@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        # Use html-to-markdown with script/nav removal for better quality
-        result = html_to_markdown.convert_to_markdown(
-            safe_decode(content),
-            preprocess_html=True,
-            preprocessing_preset="aggressive",
-            remove_navigation=True,
-            remove_forms=True,
-        )
-
-        # Skip normalize_spaces since quality processing will handle whitespace
+        config = self.config.html_to_markdown_config if self.config else None
+        if config is None:
+            config = HTMLToMarkdownConfig()
+
+        config_dict = config.to_dict()
+
+        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
 
-        # Apply quality processing which includes normalization
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
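HTMLExtractor now reads its conversion options from the extraction config rather than hard-coding them. A hedged sketch of the new wiring, assuming (per the hunk above) that HTMLToMarkdownConfig from kreuzberg._types exposes the to_dict() that is splatted into html_to_markdown.convert_to_markdown, and that ExtractionConfig accepts it as a field:

    from kreuzberg._types import ExtractionConfig, HTMLToMarkdownConfig

    # Defaults apply when html_to_markdown_config is left as None: the
    # extractor falls back to HTMLToMarkdownConfig(), as shown above.
    config = ExtractionConfig(html_to_markdown_config=HTMLToMarkdownConfig())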
kreuzberg/_extractors/_image.py

@@ -3,7 +3,6 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
-from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -65,7 +61,6 @@ class ImageExtractor(Extractor):
         return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        """Pure sync implementation of extract_bytes."""
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -79,30 +74,11 @@ class ImageExtractor(Extractor):
             Path(temp_path).unlink()
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
-        """Pure sync implementation of extract_path."""
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
         backend = get_ocr_backend(self.config.ocr_backend)
-
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        result = backend.process_file_sync(path, **self.config.get_config_dict())
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
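The removed match statement dispatched on the backend name and re-validated ocr_config for each backend; ExtractionConfig.get_config_dict() (also used by auto_detect_document_type in _document_classification.py above) now centralizes that. A sketch of the resulting call path, with names taken from the hunks; the default backend value is an assumption here:

    from pathlib import Path

    from kreuzberg._ocr import get_ocr_backend
    from kreuzberg._types import ExtractionConfig

    config = ExtractionConfig()  # assumes the package default ocr_backend, e.g. "tesseract"
    backend = get_ocr_backend(config.ocr_backend)

    # One code path for every backend: the config object, not the extractor,
    # decides which keyword arguments reach process_file_sync.
    result = backend.process_file_sync(Path("scan.png"), **config.get_config_dict())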