kreuzberg 3.13.0__py3-none-any.whl → 3.13.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. kreuzberg/cli.py +2 -2
  38. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/METADATA +3 -2
  39. kreuzberg-3.13.2.dist-info/RECORD +57 -0
  40. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/WHEEL +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/entry_points.txt +0 -0
  43. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.2.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_chunker.py CHANGED
@@ -17,21 +17,6 @@ def get_chunker(
17
17
  max_characters: int = DEFAULT_MAX_CHARACTERS,
18
18
  overlap_characters: int = DEFAULT_MAX_OVERLAP,
19
19
  ) -> MarkdownSplitter | TextSplitter:
20
- """Creates and returns a Chunker object configured with the given maximum
21
- characters per chunk and overlap between chunks.
22
-
23
- Args:
24
- mime_type: The mime type of the content.
25
- max_characters: Maximum number of characters allowed in each chunk.
26
- overlap_characters: Number of characters overlapping between two consecutive chunks.
27
-
28
- Raises:
29
- MissingDependencyError: if semantic-text-splitter is not installed.
30
-
31
- Returns:
32
- Chunker: A Chunker object configured with the specified maximum
33
- characters and overlap.
34
- """
35
20
  key = (max_characters, overlap_characters, mime_type)
36
21
  if key not in _chunkers:
37
22
  try:
kreuzberg/_config.py CHANGED
@@ -148,17 +148,6 @@ def _create_ocr_config(
148
148
 
149
149
 
150
150
  def load_config_from_file(config_path: Path) -> dict[str, Any]:
151
- """Load configuration from a TOML file.
152
-
153
- Args:
154
- config_path: Path to the configuration file.
155
-
156
- Returns:
157
- Dictionary containing the loaded configuration.
158
-
159
- Raises:
160
- ValidationError: If the file cannot be read or parsed.
161
- """
162
151
  try:
163
152
  with config_path.open("rb") as f:
164
153
  data = tomllib.load(f)
@@ -177,15 +166,6 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
177
166
 
178
167
 
179
168
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
180
- """Merge two configuration dictionaries recursively.
181
-
182
- Args:
183
- base: Base configuration dictionary.
184
- override: Configuration dictionary to override base values.
185
-
186
- Returns:
187
- Merged configuration dictionary.
188
- """
189
169
  result = base.copy()
190
170
  for key, value in override.items():
191
171
  if isinstance(value, dict) and key in result and isinstance(result[key], dict):
@@ -198,18 +178,6 @@ def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, A
198
178
  def parse_ocr_backend_config(
199
179
  config_dict: dict[str, Any], backend: OcrBackendType
200
180
  ) -> TesseractConfig | EasyOCRConfig | PaddleOCRConfig | None:
201
- """Parse OCR backend-specific configuration.
202
-
203
- Args:
204
- config_dict: Configuration dictionary.
205
- backend: The OCR backend type.
206
-
207
- Returns:
208
- Backend-specific configuration object or None.
209
-
210
- Raises:
211
- ValidationError: If the backend configuration is invalid.
212
- """
213
181
  if backend not in config_dict:
214
182
  return None
215
183
 
@@ -230,17 +198,6 @@ def parse_ocr_backend_config(
230
198
 
231
199
 
232
200
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
233
- """Build ExtractionConfig from a configuration dictionary.
234
-
235
- Args:
236
- config_dict: Configuration dictionary from TOML file.
237
-
238
- Returns:
239
- ExtractionConfig instance.
240
-
241
- Raises:
242
- ValidationError: If the configuration is invalid.
243
- """
244
201
  extraction_config: dict[str, Any] = {field: config_dict[field] for field in _CONFIG_FIELDS if field in config_dict}
245
202
 
246
203
  ocr_backend = extraction_config.get("ocr_backend")
@@ -288,18 +245,6 @@ def build_extraction_config(
288
245
  file_config: dict[str, Any],
289
246
  cli_args: MutableMapping[str, Any],
290
247
  ) -> ExtractionConfig:
291
- """Build ExtractionConfig from file config and CLI arguments.
292
-
293
- Args:
294
- file_config: Configuration loaded from file.
295
- cli_args: CLI arguments.
296
-
297
- Returns:
298
- ExtractionConfig instance.
299
-
300
- Raises:
301
- ValidationError: If the combined configuration is invalid.
302
- """
303
248
  config_dict: dict[str, Any] = {}
304
249
 
305
250
  _merge_file_config(config_dict, file_config)
@@ -321,21 +266,6 @@ def build_extraction_config(
321
266
 
322
267
 
323
268
  def find_config_file(start_path: Path | None = None) -> Path | None:
324
- """Find configuration file by searching up the directory tree.
325
-
326
- Searches for configuration files in the following order:
327
- 1. kreuzberg.toml
328
- 2. pyproject.toml (with [tool.kreuzberg] section)
329
-
330
- Args:
331
- start_path: Directory to start searching from. Defaults to current working directory.
332
-
333
- Returns:
334
- Path to the configuration file or None if not found.
335
-
336
- Raises:
337
- ValidationError: If a config file exists but cannot be read or has invalid TOML.
338
- """
339
269
  current = start_path or Path.cwd()
340
270
 
341
271
  while current != current.parent:
@@ -366,17 +296,6 @@ def find_config_file(start_path: Path | None = None) -> Path | None:
366
296
 
367
297
 
368
298
  def load_default_config(start_path: Path | None = None) -> ExtractionConfig | None:
369
- """Load the default configuration from discovered config file.
370
-
371
- Args:
372
- start_path: Directory to start searching from. Defaults to current working directory.
373
-
374
- Returns:
375
- ExtractionConfig instance or None if no configuration found.
376
-
377
- Raises:
378
- ValidationError: If configuration file exists but contains invalid configuration.
379
- """
380
299
  config_path = find_config_file(start_path)
381
300
  if not config_path:
382
301
  return None
@@ -388,34 +307,12 @@ def load_default_config(start_path: Path | None = None) -> ExtractionConfig | No
388
307
 
389
308
 
390
309
  def load_config_from_path(config_path: Path | str) -> ExtractionConfig:
391
- """Load configuration from a specific file path.
392
-
393
- Args:
394
- config_path: Path to the configuration file.
395
-
396
- Returns:
397
- ExtractionConfig instance.
398
-
399
- Raises:
400
- ValidationError: If the file cannot be read, parsed, or is invalid.
401
- """
402
310
  path = Path(config_path)
403
311
  config_dict = load_config_from_file(path)
404
312
  return build_extraction_config_from_dict(config_dict)
405
313
 
406
314
 
407
315
  def discover_and_load_config(start_path: Path | str | None = None) -> ExtractionConfig:
408
- """Load configuration by discovering config files in the directory tree.
409
-
410
- Args:
411
- start_path: Directory to start searching from. Defaults to current working directory.
412
-
413
- Returns:
414
- ExtractionConfig instance.
415
-
416
- Raises:
417
- ValidationError: If no configuration file is found or if the file is invalid.
418
- """
419
316
  search_path = Path(start_path) if start_path else None
420
317
  config_path = find_config_file(search_path)
421
318
 
@@ -436,19 +333,6 @@ def discover_and_load_config(start_path: Path | str | None = None) -> Extraction
436
333
 
437
334
 
438
335
  def discover_config(start_path: Path | str | None = None) -> ExtractionConfig | None:
439
- """Discover and load configuration, returning None if no config file found.
440
-
441
- If a config file is found, attempts to load it. Any errors during loading will bubble up.
442
-
443
- Args:
444
- start_path: Directory to start searching from. Defaults to current working directory.
445
-
446
- Returns:
447
- ExtractionConfig instance or None if no configuration file found.
448
-
449
- Raises:
450
- ValidationError: If a configuration file exists but is invalid.
451
- """
452
336
  search_path = Path(start_path) if start_path else None
453
337
  config_path = find_config_file(search_path)
454
338
 
@@ -462,12 +346,4 @@ def discover_config(start_path: Path | str | None = None) -> ExtractionConfig |
462
346
 
463
347
 
464
348
  def find_default_config() -> Path | None:
465
- """Find the default configuration file (pyproject.toml).
466
-
467
- Returns:
468
- Path to the configuration file or None if not found.
469
-
470
- Note:
471
- This function is deprecated. Use find_config_file() instead.
472
- """
473
349
  return find_config_file()
@@ -3,6 +3,8 @@ from __future__ import annotations
3
3
  import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
+ import polars as pl
7
+
6
8
  from kreuzberg._ocr import get_ocr_backend
7
9
  from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
8
10
  from kreuzberg.exceptions import MissingDependencyError
@@ -40,17 +42,6 @@ DOCUMENT_CLASSIFIERS = {
40
42
 
41
43
 
42
44
  def _get_translated_text(result: ExtractionResult) -> str:
43
- """Translate extracted text to English using Google Translate API.
44
-
45
- Args:
46
- result: ExtractionResult containing the text to be translated
47
-
48
- Returns:
49
- str: The translated text in lowercase English
50
-
51
- Raises:
52
- MissingDependencyError: If the deep-translator package is not installed
53
- """
54
45
  text_to_classify = result.content
55
46
  if result.metadata:
56
47
  metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
@@ -70,16 +61,6 @@ def _get_translated_text(result: ExtractionResult) -> str:
70
61
 
71
62
 
72
63
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
73
- """Classifies the document type based on keywords and patterns.
74
-
75
- Args:
76
- result: The extraction result containing the content.
77
- config: The extraction configuration.
78
-
79
- Returns:
80
- A tuple containing the detected document type and the confidence score,
81
- or (None, None) if no type is detected with sufficient confidence.
82
- """
83
64
  if not config.auto_detect_document_type:
84
65
  return None, None
85
66
 
@@ -108,27 +89,17 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
108
89
  def classify_document_from_layout(
109
90
  result: ExtractionResult, config: ExtractionConfig
110
91
  ) -> tuple[str | None, float | None]:
111
- """Classifies the document type based on layout information from OCR.
112
-
113
- Args:
114
- result: The extraction result containing the layout data.
115
- config: The extraction configuration.
116
-
117
- Returns:
118
- A tuple containing the detected document type and the confidence score,
119
- or (None, None) if no type is detected with sufficient confidence.
120
- """
121
92
  if not config.auto_detect_document_type:
122
93
  return None, None
123
94
 
124
- if result.layout is None or result.layout.empty:
95
+ if result.layout is None or result.layout.is_empty():
125
96
  return None, None
126
97
 
127
98
  layout_df = result.layout
128
99
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
129
100
  return None, None
130
101
 
131
- layout_text = " ".join(layout_df["text"].astype(str).tolist())
102
+ layout_text = " ".join(layout_df["text"].cast(str).to_list())
132
103
 
133
104
  text_to_classify = layout_text
134
105
  if result.metadata:
@@ -142,17 +113,27 @@ def classify_document_from_layout(
142
113
  except Exception: # noqa: BLE001
143
114
  translated_text = text_to_classify.lower()
144
115
 
145
- layout_df["translated_text"] = translated_text
116
+ layout_df = layout_df.with_columns(pl.lit(translated_text).alias("translated_text"))
146
117
 
147
- page_height = layout_df["top"].max() + layout_df["height"].max()
118
+ try:
119
+ layout_df = layout_df.with_columns(
120
+ [pl.col("top").cast(pl.Float64, strict=False), pl.col("height").cast(pl.Float64, strict=False)]
121
+ )
122
+
123
+ page_height_val = layout_df.select(pl.col("top").max() + pl.col("height").max()).item()
124
+ if page_height_val is None:
125
+ page_height_val = 0.0
126
+ page_height = float(page_height_val)
127
+ except Exception: # noqa: BLE001
128
+ page_height = 1000.0
148
129
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0.0)
149
130
 
150
131
  for doc_type, patterns in DOCUMENT_CLASSIFIERS.items():
151
132
  for pattern in patterns:
152
- found_words = layout_df[layout_df["translated_text"].str.contains(pattern, case=False, na=False)]
153
- if not found_words.empty:
133
+ found_words = layout_df.filter(layout_df["translated_text"].str.contains(pattern))
134
+ if not found_words.is_empty():
154
135
  scores[doc_type] += 1.0
155
- word_top = found_words.iloc[0]["top"]
136
+ word_top = found_words[0, "top"]
156
137
  if word_top < page_height * 0.3:
157
138
  scores[doc_type] += 0.5
158
139
 
@@ -176,7 +157,7 @@ def auto_detect_document_type(
176
157
  if config.document_classification_mode == "vision" and file_path:
177
158
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
178
159
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
179
- elif result.layout is not None and not result.layout.empty:
160
+ elif result.layout is not None and not result.layout.is_empty():
180
161
  result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
181
162
  else:
182
163
  result.document_type, result.document_type_confidence = classify_document(result, config)
@@ -19,21 +19,6 @@ def extract_entities(
19
19
  languages: list[str] | None = None,
20
20
  spacy_config: SpacyEntityExtractionConfig | None = None,
21
21
  ) -> list[Entity]:
22
- """Extract entities from text using custom regex patterns and/or a NER model.
23
-
24
- Args:
25
- text: The input text to extract entities from.
26
- entity_types: List of entity types to extract using the NER model.
27
- custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
28
- languages: List of detected languages to choose appropriate spaCy models.
29
- spacy_config: Configuration for spaCy entity extraction.
30
-
31
- Returns:
32
- list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
33
-
34
- Raises:
35
- MissingDependencyError: If `spacy` is not installed.
36
- """
37
22
  entities: list[Entity] = []
38
23
  if custom_patterns:
39
24
  for ent_type, pattern in custom_patterns:
@@ -85,7 +70,6 @@ def extract_entities(
85
70
 
86
71
  @lru_cache(maxsize=32)
87
72
  def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
88
- """Load a spaCy model with caching."""
89
73
  try:
90
74
  import spacy # noqa: PLC0415
91
75
 
@@ -102,7 +86,6 @@ def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig
102
86
 
103
87
 
104
88
  def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
105
- """Select the best spaCy model based on detected languages."""
106
89
  if not languages:
107
90
  return spacy_config.get_model_for_language("en")
108
91
 
@@ -118,18 +101,6 @@ def extract_keywords(
118
101
  text: str,
119
102
  keyword_count: int = 10,
120
103
  ) -> list[tuple[str, float]]:
121
- """Extract keywords from text using the KeyBERT model.
122
-
123
- Args:
124
- text: The input text to extract keywords from.
125
- keyword_count: Number of top keywords to return. Defaults to 10.
126
-
127
- Returns:
128
- list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
129
-
130
- Raises:
131
- MissingDependencyError: If `keybert` is not installed.
132
- """
133
104
  try:
134
105
  from keybert import KeyBERT # noqa: PLC0415
135
106
 
@@ -13,20 +13,6 @@ if TYPE_CHECKING:
13
13
 
14
14
 
15
15
  class Extractor(ABC):
16
- """Abstract base class for file content extraction.
17
-
18
- This class provides the interface for different types of content extractors.
19
- Subclasses are expected to implement the methods for extracting content
20
- either asynchronously or synchronously and determining the supported MIME types.
21
-
22
- Attributes:
23
- SUPPORTED_MIME_TYPES: The set of supported mime types - all none abstract extractors must implement this.
24
-
25
- Args:
26
- mime_type: The MIME type that this extractor handles (e.g., "application/pdf").
27
- config: Configuration options for the extraction process.
28
- """
29
-
30
16
  __slots__ = ("config", "mime_type")
31
17
 
32
18
  SUPPORTED_MIME_TYPES: ClassVar[set[str]]
@@ -36,72 +22,24 @@ class Extractor(ABC):
36
22
  self.config = config
37
23
 
38
24
  @abstractmethod
39
- async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
40
- """Asynchronously extract content from a byte stream.
41
-
42
- Args:
43
- content: The byte content to extract.
44
-
45
- Returns:
46
- ExtractionResult: The extracted content along with metadata about the extraction.
47
- """
25
+ async def extract_bytes_async(self, content: bytes) -> ExtractionResult: ...
48
26
 
49
27
  @abstractmethod
50
- async def extract_path_async(self, path: Path) -> ExtractionResult:
51
- """Asynchronously extract content from a file located at the specified path.
52
-
53
- Args:
54
- path: The path to the file to process.
55
-
56
- Returns:
57
- ExtractionResult: The extracted content along with metadata about the extraction.
58
- """
28
+ async def extract_path_async(self, path: Path) -> ExtractionResult: ...
59
29
 
60
30
  @abstractmethod
61
- def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
62
- """Synchronously extract content from a byte stream.
63
-
64
- Args:
65
- content: The byte content to extract.
66
-
67
- Returns:
68
- ExtractionResult: The extracted content along with metadata about the extraction.
69
- """
31
+ def extract_bytes_sync(self, content: bytes) -> ExtractionResult: ...
70
32
 
71
33
  @abstractmethod
72
- def extract_path_sync(self, path: Path) -> ExtractionResult:
73
- """Synchronously extract content from a file located at the specified path.
74
-
75
- Args:
76
- path: The path to the file to process.
77
-
78
- Returns:
79
- ExtractionResult: The extracted content along with metadata about the extraction.
80
- """
34
+ def extract_path_sync(self, path: Path) -> ExtractionResult: ...
81
35
 
82
36
  @classmethod
83
37
  def supports_mimetype(cls, mime_type: str) -> bool:
84
- """Verify whether the extractor supports the given MIME type.
85
-
86
- Args:
87
- mime_type: The MIME type to check (e.g., "application/pdf").
88
-
89
- Returns:
90
- bool: True if the MIME type is supported, False otherwise.
91
- """
92
38
  return mime_type in cls.SUPPORTED_MIME_TYPES or any(
93
39
  mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
94
40
  )
95
41
 
96
42
  def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
97
- """Apply quality post-processing to extraction result if enabled.
98
-
99
- Args:
100
- result: The raw extraction result
101
-
102
- Returns:
103
- Enhanced extraction result with quality improvements (if enabled)
104
- """
105
43
  if not self.config.enable_quality_processing:
106
44
  return result
107
45
 
@@ -42,7 +42,6 @@ class EmailExtractor(Extractor):
42
42
  def _extract_email_headers(
43
43
  self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
44
44
  ) -> None:
45
- """Extract and process email headers."""
46
45
  subject = parsed_email.get("subject")
47
46
  if subject:
48
47
  metadata["subject"] = subject
@@ -85,7 +84,6 @@ class EmailExtractor(Extractor):
85
84
  text_parts.append(f"BCC: {bcc_formatted}")
86
85
 
87
86
  def _format_email_field(self, field: Any) -> str:
88
- """Format email field (to, cc, bcc) for display."""
89
87
  if isinstance(field, list):
90
88
  emails = []
91
89
  for item in field:
@@ -101,7 +99,6 @@ class EmailExtractor(Extractor):
101
99
  return str(field)
102
100
 
103
101
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
104
- """Extract and process email body content."""
105
102
  text_content = parsed_email.get("text")
106
103
  if text_content:
107
104
  text_parts.append(f"\n{text_content}")
@@ -123,7 +120,6 @@ class EmailExtractor(Extractor):
123
120
  def _extract_email_attachments(
124
121
  self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
125
122
  ) -> None:
126
- """Extract and process email attachments info."""
127
123
  if parsed_email.get("attachments"):
128
124
  attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
129
125
  metadata["attachments"] = attachment_names
@@ -61,7 +61,6 @@ class ImageExtractor(Extractor):
61
61
  return self._apply_quality_processing(result)
62
62
 
63
63
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
64
- """Pure sync implementation of extract_bytes."""
65
64
  extension = self._get_extension_from_mime_type(self.mime_type)
66
65
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
67
66
 
@@ -75,7 +74,6 @@ class ImageExtractor(Extractor):
75
74
  Path(temp_path).unlink()
76
75
 
77
76
  def extract_path_sync(self, path: Path) -> ExtractionResult:
78
- """Pure sync implementation of extract_path."""
79
77
  if self.config.ocr_backend is None:
80
78
  raise ValidationError("ocr_backend is None, cannot perform OCR")
81
79
 
@@ -84,8 +84,6 @@ NodeType = Literal[
84
84
 
85
85
 
86
86
  class PandocExtractor(Extractor):
87
- """Extractor for documents supported by Pandoc."""
88
-
89
87
  _checked_version: bool = False
90
88
 
91
89
  MIMETYPE_TO_PANDOC_TYPE_MAPPING: ClassVar[Mapping[str, str]] = {
@@ -153,14 +151,6 @@ class PandocExtractor(Extractor):
153
151
  }
154
152
 
155
153
  async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
156
- """Extract text and metadata from bytes content using Pandoc.
157
-
158
- Args:
159
- content: The content bytes to process.
160
-
161
- Returns:
162
- ExtractionResult with the extracted text and metadata.
163
- """
164
154
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
165
155
  input_file, unlink = await create_temp_file(f".{extension}")
166
156
 
@@ -171,17 +161,6 @@ class PandocExtractor(Extractor):
171
161
  await unlink()
172
162
 
173
163
  async def extract_path_async(self, path: Path) -> ExtractionResult:
174
- """Extract text and metadata from a file using Pandoc.
175
-
176
- Args:
177
- path: The path to the file to process.
178
-
179
- Raises:
180
- ParsingError: If the file data could not be extracted.
181
-
182
- Returns:
183
- ExtractionResult with the extracted text and metadata.
184
- """
185
164
  await self._validate_pandoc_version()
186
165
  self._get_pandoc_type_from_mime_type(self.mime_type)
187
166
 
@@ -198,14 +177,6 @@ class PandocExtractor(Extractor):
198
177
  raise ParsingError("Failed to process file", context={"file": str(path), "errors": eg.exceptions}) from eg
199
178
 
200
179
  def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
201
- """Pure sync implementation of extract_bytes.
202
-
203
- Args:
204
- content: The content bytes to process.
205
-
206
- Returns:
207
- ExtractionResult with the extracted text and metadata.
208
- """
209
180
  extension = self._get_pandoc_type_from_mime_type(self.mime_type)
210
181
  fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
211
182
 
@@ -219,17 +190,6 @@ class PandocExtractor(Extractor):
219
190
  Path(temp_path).unlink()
220
191
 
221
192
  def extract_path_sync(self, path: Path) -> ExtractionResult:
222
- """Pure sync implementation of extract_path.
223
-
224
- Args:
225
- path: The path to the file to process.
226
-
227
- Returns:
228
- ExtractionResult with the extracted text and metadata.
229
-
230
- Raises:
231
- ParsingError: When file processing fails.
232
- """
233
193
  self._validate_pandoc_version_sync()
234
194
  self._get_pandoc_type_from_mime_type(self.mime_type)
235
195
 
@@ -612,8 +572,6 @@ class PandocExtractor(Extractor):
612
572
 
613
573
 
614
574
  class MarkdownExtractor(PandocExtractor):
615
- """Extractor for Markdown-based document formats."""
616
-
617
575
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
618
576
  "text/x-markdown",
619
577
  "text/x-commonmark",
@@ -625,8 +583,6 @@ class MarkdownExtractor(PandocExtractor):
625
583
 
626
584
 
627
585
  class OfficeDocumentExtractor(PandocExtractor):
628
- """Extractor for Office document formats (Word, ODT)."""
629
-
630
586
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
631
587
  "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
632
588
  "application/vnd.oasis.opendocument.text",
@@ -634,8 +590,6 @@ class OfficeDocumentExtractor(PandocExtractor):
634
590
 
635
591
 
636
592
  class EbookExtractor(PandocExtractor):
637
- """Extractor for e-book formats (EPUB, FB2)."""
638
-
639
593
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
640
594
  "application/epub+zip",
641
595
  "application/x-fictionbook+xml",
@@ -643,8 +597,6 @@ class EbookExtractor(PandocExtractor):
643
597
 
644
598
 
645
599
  class StructuredTextExtractor(PandocExtractor):
646
- """Extractor for structured text formats (RST, Org, etc.)."""
647
-
648
600
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
649
601
  "text/x-rst",
650
602
  "text/x-org",
@@ -654,8 +606,6 @@ class StructuredTextExtractor(PandocExtractor):
654
606
 
655
607
 
656
608
  class LaTeXExtractor(PandocExtractor):
657
- """Extractor for LaTeX and Typst documents."""
658
-
659
609
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
660
610
  "application/x-latex",
661
611
  "application/x-typst",
@@ -663,8 +613,6 @@ class LaTeXExtractor(PandocExtractor):
663
613
 
664
614
 
665
615
  class BibliographyExtractor(PandocExtractor):
666
- """Extractor for bibliography formats (BibTeX, CSL JSON, etc.)."""
667
-
668
616
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
669
617
  "application/x-bibtex",
670
618
  "application/x-biblatex",
@@ -675,8 +623,6 @@ class BibliographyExtractor(PandocExtractor):
675
623
 
676
624
 
677
625
  class XMLBasedExtractor(PandocExtractor):
678
- """Extractor for XML-based document formats (DocBook, JATS, OPML)."""
679
-
680
626
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
681
627
  "application/docbook+xml",
682
628
  "application/x-jats+xml",
@@ -685,8 +631,6 @@ class XMLBasedExtractor(PandocExtractor):
685
631
 
686
632
 
687
633
  class TabularDataExtractor(PandocExtractor):
688
- """Extractor for tabular data formats (CSV, TSV)."""
689
-
690
634
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
691
635
  "text/csv",
692
636
  "text/tab-separated-values",
@@ -694,8 +638,6 @@ class TabularDataExtractor(PandocExtractor):
694
638
 
695
639
 
696
640
  class MiscFormatExtractor(PandocExtractor):
697
- """Extractor for miscellaneous formats (RTF, man, Jupyter notebooks)."""
698
-
699
641
  SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {
700
642
  "application/rtf",
701
643
  "text/troff",