kreuzberg 3.11.3__py3-none-any.whl → 3.13.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (46)
  1. kreuzberg/__init__.py +14 -13
  2. kreuzberg/__main__.py +0 -2
  3. kreuzberg/_api/main.py +119 -9
  4. kreuzberg/_config.py +248 -204
  5. kreuzberg/_document_classification.py +0 -8
  6. kreuzberg/_entity_extraction.py +1 -93
  7. kreuzberg/_extractors/_base.py +0 -5
  8. kreuzberg/_extractors/_email.py +1 -11
  9. kreuzberg/_extractors/_html.py +9 -12
  10. kreuzberg/_extractors/_image.py +1 -23
  11. kreuzberg/_extractors/_pandoc.py +10 -89
  12. kreuzberg/_extractors/_pdf.py +39 -92
  13. kreuzberg/_extractors/_presentation.py +0 -17
  14. kreuzberg/_extractors/_spread_sheet.py +13 -53
  15. kreuzberg/_extractors/_structured.py +1 -4
  16. kreuzberg/_gmft.py +14 -138
  17. kreuzberg/_language_detection.py +1 -22
  18. kreuzberg/_mcp/__init__.py +0 -2
  19. kreuzberg/_mcp/server.py +3 -10
  20. kreuzberg/_mime_types.py +1 -2
  21. kreuzberg/_ocr/_easyocr.py +21 -108
  22. kreuzberg/_ocr/_paddleocr.py +16 -94
  23. kreuzberg/_ocr/_table_extractor.py +260 -0
  24. kreuzberg/_ocr/_tesseract.py +906 -264
  25. kreuzberg/_playa.py +5 -4
  26. kreuzberg/_types.py +638 -40
  27. kreuzberg/_utils/_cache.py +88 -90
  28. kreuzberg/_utils/_device.py +0 -18
  29. kreuzberg/_utils/_document_cache.py +0 -2
  30. kreuzberg/_utils/_errors.py +0 -3
  31. kreuzberg/_utils/_pdf_lock.py +0 -2
  32. kreuzberg/_utils/_process_pool.py +19 -19
  33. kreuzberg/_utils/_quality.py +0 -43
  34. kreuzberg/_utils/_ref.py +48 -0
  35. kreuzberg/_utils/_serialization.py +0 -5
  36. kreuzberg/_utils/_string.py +9 -39
  37. kreuzberg/_utils/_sync.py +0 -1
  38. kreuzberg/_utils/_table.py +50 -57
  39. kreuzberg/cli.py +55 -77
  40. kreuzberg/extraction.py +39 -32
  41. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/METADATA +17 -14
  42. kreuzberg-3.13.0.dist-info/RECORD +56 -0
  43. kreuzberg-3.11.3.dist-info/RECORD +0 -54
  44. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/WHEEL +0 -0
  45. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/entry_points.txt +0 -0
  46. {kreuzberg-3.11.3.dist-info → kreuzberg-3.13.0.dist-info}/licenses/LICENSE +0 -0

kreuzberg/_entity_extraction.py
@@ -2,105 +2,14 @@ from __future__ import annotations
 
 import os
 import re
-from dataclasses import dataclass
 from functools import lru_cache
 from typing import TYPE_CHECKING, Any
 
-from kreuzberg._types import Entity
+from kreuzberg._types import Entity, SpacyEntityExtractionConfig
 from kreuzberg.exceptions import MissingDependencyError
 
 if TYPE_CHECKING:
     from collections.abc import Sequence
-    from pathlib import Path
-
-
-@dataclass(unsafe_hash=True, frozen=True, slots=True)
-class SpacyEntityExtractionConfig:
-    """Configuration for spaCy-based entity extraction."""
-
-    model_cache_dir: str | Path | None = None
-    """Directory to cache spaCy models. If None, uses spaCy's default."""
-
-    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
-    """Mapping of language codes to spaCy model names.
-
-    If None, uses default mappings:
-    - en: en_core_web_sm
-    - de: de_core_news_sm
-    - fr: fr_core_news_sm
-    - es: es_core_news_sm
-    - pt: pt_core_news_sm
-    - it: it_core_news_sm
-    - nl: nl_core_news_sm
-    - zh: zh_core_web_sm
-    - ja: ja_core_news_sm
-    """
-
-    fallback_to_multilingual: bool = True
-    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
-
-    max_doc_length: int = 1000000
-    """Maximum document length for spaCy processing."""
-
-    batch_size: int = 1000
-    """Batch size for processing multiple texts."""
-
-    def __post_init__(self) -> None:
-        if self.language_models is None:
-            object.__setattr__(self, "language_models", self._get_default_language_models())
-
-        if isinstance(self.language_models, dict):
-            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
-
-    @staticmethod
-    def _get_default_language_models() -> dict[str, str]:
-        """Get default language model mappings based on available spaCy models."""
-        return {
-            "en": "en_core_web_sm",
-            "de": "de_core_news_sm",
-            "fr": "fr_core_news_sm",
-            "es": "es_core_news_sm",
-            "pt": "pt_core_news_sm",
-            "it": "it_core_news_sm",
-            "nl": "nl_core_news_sm",
-            "zh": "zh_core_web_sm",
-            "ja": "ja_core_news_sm",
-            "ko": "ko_core_news_sm",
-            "ru": "ru_core_news_sm",
-            "pl": "pl_core_news_sm",
-            "ro": "ro_core_news_sm",
-            "el": "el_core_news_sm",
-            "da": "da_core_news_sm",
-            "fi": "fi_core_news_sm",
-            "nb": "nb_core_news_sm",
-            "sv": "sv_core_news_sm",
-            "ca": "ca_core_news_sm",
-            "hr": "hr_core_news_sm",
-            "lt": "lt_core_news_sm",
-            "mk": "mk_core_news_sm",
-            "sl": "sl_core_news_sm",
-            "uk": "uk_core_news_sm",
-        }
-
-    def get_model_for_language(self, language_code: str) -> str | None:
-        """Get the appropriate spaCy model for a language code."""
-        if not self.language_models:
-            return None
-
-        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
-
-        if language_code in models_dict:
-            return models_dict[language_code]
-
-        base_lang = language_code.split("-")[0].lower()
-        if base_lang in models_dict:
-            return models_dict[base_lang]
-
-        return None
-
-    def get_fallback_model(self) -> str | None:
-        """Get fallback multilingual model if enabled."""
-        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
 
 
 def extract_entities(
@@ -127,7 +36,6 @@ def extract_entities(
     """
     entities: list[Entity] = []
    if custom_patterns:
-        # Direct iteration over frozenset - no need to convert to dict
        for ent_type, pattern in custom_patterns:
            entities.extend(
                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
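
Note: SpacyEntityExtractionConfig is not deleted in this release; it moves out of this module into kreuzberg._types and is re-imported above. Assuming the relocated class keeps the API shown in the removed lines, its lookup behavior works like this:

    from kreuzberg._types import SpacyEntityExtractionConfig

    cfg = SpacyEntityExtractionConfig()   # defaults filled in by __post_init__
    cfg.get_model_for_language("en")      # "en_core_web_sm" (exact match)
    cfg.get_model_for_language("pt-BR")   # "pt_core_news_sm" (base-language fallback)
    cfg.get_model_for_language("tlh")     # None (no mapping)
    cfg.get_fallback_model()              # "xx_ent_wiki_sm" (fallback_to_multilingual=True)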

kreuzberg/_extractors/_base.py
@@ -102,23 +102,18 @@ class Extractor(ABC):
         Returns:
             Enhanced extraction result with quality improvements (if enabled)
         """
-        # Only apply quality processing if enabled in config
         if not self.config.enable_quality_processing:
             return result
 
         if not result.content:
             return result
 
-        # Clean the content
         cleaned_content = clean_extracted_text(result.content)
 
-        # Calculate quality score
         quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
 
-        # Add quality metadata
         enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
 
-        # Return enhanced result
         return ExtractionResult(
             content=cleaned_content,
             mime_type=result.mime_type,
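
Note: the metadata merge above relies on dict union being right-biased, so a freshly computed quality_score always replaces any stale value carried in the incoming metadata. For example:

    metadata = {"title": "Report", "quality_score": 0.2}
    enhanced = dict(metadata) | {"quality_score": 0.87}
    # {'title': 'Report', 'quality_score': 0.87}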

kreuzberg/_extractors/_email.py
@@ -16,7 +16,6 @@ from kreuzberg.exceptions import MissingDependencyError
 if TYPE_CHECKING:
     from pathlib import Path
 
-# Import optional dependencies at module level with proper error handling
 try:
     import mailparse
 except ImportError:  # pragma: no cover
@@ -27,7 +26,6 @@ try:
 except ImportError:  # pragma: no cover
     html2text = None
 
-# Compile regex pattern once at module level
 _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
 
 
@@ -45,7 +43,6 @@ class EmailExtractor(Extractor):
         self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
     ) -> None:
         """Extract and process email headers."""
-        # Use single dict access where possible to avoid repeated lookups
         subject = parsed_email.get("subject")
         if subject:
             metadata["subject"] = subject
@@ -59,9 +56,7 @@ class EmailExtractor(Extractor):
 
         to_info = parsed_email.get("to")
         if to_info:
-            # Store the raw value in metadata (could be string, dict, or list)
             if isinstance(to_info, list) and to_info:
-                # For metadata, use first recipient's email if it's a list
                 to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
                 metadata["email_to"] = to_email
             elif isinstance(to_info, dict):
@@ -69,7 +64,6 @@ class EmailExtractor(Extractor):
             else:
                 metadata["email_to"] = str(to_info)
 
-            # For display, format all recipients
             to_formatted = self._format_email_field(to_info)
             text_parts.append(f"To: {to_formatted}")
 
@@ -111,19 +105,17 @@ class EmailExtractor(Extractor):
         text_content = parsed_email.get("text")
         if text_content:
             text_parts.append(f"\n{text_content}")
-            return  # If we have text, prefer it over HTML
+            return
 
         html_content = parsed_email.get("html")
         if html_content:
             if html2text is not None:
-                # Use html2text if available (faster path)
                 h = html2text.HTML2Text()
                 h.ignore_links = True
                 h.ignore_images = True
                 converted_text = h.handle(html_content)
                 text_parts.append(f"\n{converted_text}")
             else:
-                # Fallback: strip HTML tags and unescape entities
                 clean_html = _HTML_TAG_PATTERN.sub("", html_content)
                 clean_html = unescape(clean_html)
                 text_parts.append(f"\n{clean_html}")
@@ -148,12 +140,10 @@ class EmailExtractor(Extractor):
         text_parts: list[str] = []
         metadata: dict[str, Any] = {}
 
-        # Extract headers, body, and attachments
         self._extract_email_headers(parsed_email, text_parts, metadata)
         self._extract_email_body(parsed_email, text_parts)
         self._extract_email_attachments(parsed_email, text_parts, metadata)
 
-        # Join efficiently
         combined_text = "\n".join(text_parts)
 
         return ExtractionResult(
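
Note: the else branch in _extract_email_body is the degraded path for installs without html2text: strip tags with the module-level regex, then unescape entities. In isolation (the sample string is illustrative):

    import re
    from html import unescape

    _HTML_TAG_PATTERN = re.compile(r"<[^>]+>")

    html_content = "<p>Status: 5 &gt; 3 &amp; 2 &lt; 4</p>"
    clean_html = unescape(_HTML_TAG_PATTERN.sub("", html_content))
    # "Status: 5 > 3 & 2 < 4"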

kreuzberg/_extractors/_html.py
@@ -7,7 +7,7 @@ from anyio import Path as AsyncPath
 
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
-from kreuzberg._types import ExtractionResult
+from kreuzberg._types import ExtractionResult, HTMLToMarkdownConfig
 from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
@@ -26,19 +26,16 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-        # Use html-to-markdown with script/nav removal for better quality
-        result = html_to_markdown.convert_to_markdown(
-            safe_decode(content),
-            preprocess_html=True,
-            preprocessing_preset="aggressive",
-            remove_navigation=True,
-            remove_forms=True,
-        )
-
-        # Skip normalize_spaces since quality processing will handle whitespace
+        config = self.config.html_to_markdown_config if self.config else None
+        if config is None:
+            config = HTMLToMarkdownConfig()
+
+        config_dict = config.to_dict()
+
+        result = html_to_markdown.convert_to_markdown(safe_decode(content), **config_dict)
+
         extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
 
-        # Apply quality processing which includes normalization
         return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
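
Note: HTMLToMarkdownConfig is defined in kreuzberg._types and its fields are not shown in this diff. Assuming they mirror the keyword arguments the old code hardcoded (the field names below are an assumption, not confirmed here), the new flow is equivalent to:

    # Hypothetical field names mirroring the removed kwargs;
    # to_dict() expands the config back into keyword arguments.
    config = HTMLToMarkdownConfig(
        preprocess_html=True,
        preprocessing_preset="aggressive",
        remove_navigation=True,
        remove_forms=True,
    )
    result = html_to_markdown.convert_to_markdown(safe_decode(content), **config.to_dict())

The practical change is that callers can now override these options per extraction instead of always getting the hardcoded "aggressive" preset.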

kreuzberg/_extractors/_image.py
@@ -3,7 +3,6 @@ from __future__ import annotations
 import contextlib
 import os
 import tempfile
-from dataclasses import asdict
 from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
@@ -12,9 +11,6 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
-from kreuzberg._ocr._easyocr import EasyOCRConfig
-from kreuzberg._ocr._paddleocr import PaddleOCRConfig
-from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -84,25 +80,7 @@ class ImageExtractor(Extractor):
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
         backend = get_ocr_backend(self.config.ocr_backend)
-
-        match self.config.ocr_backend:
-            case "tesseract":
-                config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(config))
-            case "paddleocr":
-                paddle_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(paddle_config))
-            case "easyocr":
-                easy_config = (
-                    self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
-                )
-                result = backend.process_file_sync(path, **asdict(easy_config))
-            case _:
-                raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        result = backend.process_file_sync(path, **self.config.get_config_dict())
         return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
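
Note: get_config_dict() is defined on the extraction config in kreuzberg._types; its body is not part of this hunk. A plausible sketch of what it centralizes, reconstructed from the removed match/case (an assumption, not the actual implementation):

    from dataclasses import asdict
    from typing import Any

    from kreuzberg._ocr._easyocr import EasyOCRConfig
    from kreuzberg._ocr._paddleocr import PaddleOCRConfig
    from kreuzberg._ocr._tesseract import TesseractConfig

    def get_config_dict(self) -> dict[str, Any]:
        # Use the caller-supplied config when its type matches the backend,
        # otherwise fall back to that backend's defaults.
        default_configs = {
            "tesseract": TesseractConfig,
            "paddleocr": PaddleOCRConfig,
            "easyocr": EasyOCRConfig,
        }
        config_cls = default_configs[self.ocr_backend]
        config = self.ocr_config if isinstance(self.ocr_config, config_cls) else config_cls()
        return asdict(config)

Either way, the extractor itself no longer needs per-backend imports or branching.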

kreuzberg/_extractors/_pandoc.py
@@ -244,18 +244,13 @@ class PandocExtractor(Extractor):
             raise ParsingError("Failed to process file", context={"file": str(path), "error": str(e)}) from e
 
     async def _validate_pandoc_version(self) -> None:
-        """Validate that the installed Pandoc version meets the minimum requirement.
-
-        Raises:
-            MissingDependencyError: If Pandoc is not installed or version is too low
-        """
         try:
             if self._checked_version:
                 return
 
             command = ["pandoc", "--version"]
             result = await run_process(command)
-            stdout = result.stdout.decode()
+            stdout = result.stdout.decode("utf-8")
 
             version_match = re.search(
                 r"pandoc(?:\.exe)?(?:\s+|\s+v|\s+version\s+)(\d+)\.(\d+)(?:\.(\d+))?", stdout, re.IGNORECASE
@@ -299,14 +294,6 @@ class PandocExtractor(Extractor):
 
     @staticmethod
     def _get_pandoc_key(key: str) -> str | None:
-        """Map Pandoc metadata keys to our standard metadata keys.
-
-        Args:
-            key: The key from Pandoc metadata
-
-        Returns:
-            The mapped key name for our system, or None if not mapped
-        """
         if key == "abstract":
             return "summary"
 
@@ -325,17 +312,6 @@ class PandocExtractor(Extractor):
         return key
 
     def _get_pandoc_type_from_mime_type(self, mime_type: str) -> str:
-        """Get Pandoc format type from MIME type.
-
-        Args:
-            mime_type: The MIME type to look up
-
-        Returns:
-            The corresponding Pandoc type
-
-        Raises:
-            ValidationError: If mime_type is not supported
-        """
         if pandoc_type := (self.MIMETYPE_TO_PANDOC_TYPE_MAPPING.get(mime_type, "")):
             return pandoc_type
 
@@ -349,17 +325,6 @@ class PandocExtractor(Extractor):
         raise ValidationError(f"Unsupported mime type: {mime_type}")
 
     async def _handle_extract_metadata(self, input_file: str | PathLike[str]) -> Metadata:
-        """Extract metadata from a file using Pandoc.
-
-        Args:
-            input_file: The file to extract metadata from
-
-        Returns:
-            The extracted metadata
-
-        Raises:
-            ParsingError: If metadata extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         metadata_file, unlink = await create_temp_file(".json")
         try:
@@ -389,17 +354,6 @@ class PandocExtractor(Extractor):
             await unlink()
 
     async def _handle_extract_file(self, input_file: str | PathLike[str]) -> str:
-        """Extract text content from a file using Pandoc.
-
-        Args:
-            input_file: The file to extract content from
-
-        Returns:
-            The extracted text content
-
-        Raises:
-            ParsingError: If content extraction fails
-        """
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         output_path, unlink = await create_temp_file(".md")
         try:
@@ -431,14 +385,6 @@ class PandocExtractor(Extractor):
             await unlink()
 
     def _extract_metadata(self, raw_meta: dict[str, Any]) -> Metadata:
-        """Extract structured metadata from Pandoc JSON metadata.
-
-        Args:
-            raw_meta: The raw metadata from Pandoc
-
-        Returns:
-            Structured metadata
-        """
         meta: Metadata = {}
 
         if (
@@ -485,16 +431,6 @@ class PandocExtractor(Extractor):
         return meta
 
     def _extract_inline_text(self, node: dict[str, Any], type_field: str = "t", content_field: str = "c") -> str | None:
-        """Extract text from an inline node in a document structure.
-
-        Args:
-            node: The node to extract text from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         if node_type := node.get(type_field):
             if node_type == "Str":
                 return node.get(content_field)
@@ -505,29 +441,11 @@ class PandocExtractor(Extractor):
         return None
 
     def _extract_inlines(self, nodes: list[dict[str, Any]]) -> str | None:
-        """Extract text from a list of inline nodes.
-
-        Args:
-            nodes: The list of nodes to extract text from
-
-        Returns:
-            The extracted text or None if no text could be extracted
-        """
         texts = [text for node in nodes if (text := self._extract_inline_text(node))]
         result = "".join(texts).strip()
         return result if result else None
 
     def _extract_meta_value(self, node: Any, type_field: str = "t", content_field: str = "c") -> str | list[str] | None:
-        """Extract a metadata value from a node.
-
-        Args:
-            node: The node to extract metadata from
-            type_field: The field name for the node type
-            content_field: The field name for the node content
-
-        Returns:
-            The extracted metadata value or None if no metadata could be extracted
-        """
         if not isinstance(node, dict) or type_field not in node:
             return None
 
@@ -577,12 +495,17 @@ class PandocExtractor(Extractor):
         return None
 
     def _validate_pandoc_version_sync(self) -> None:
-        """Synchronous version of _validate_pandoc_version."""
         try:
             if self._checked_version:
                 return
 
-            result = subprocess.run(["pandoc", "--version"], capture_output=True, text=True, check=False)  # noqa: S607
+            result = subprocess.run(
+                ["pandoc", "--version"],  # noqa: S607
+                capture_output=True,
+                text=True,
+                check=False,
+                encoding="utf-8",
+            )
 
             if result.returncode != 0:
                 raise MissingDependencyError(
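
Note: the explicit encoding="utf-8" added to these subprocess.run calls (and the .decode("utf-8") in the async variant) pins how Pandoc's output is decoded. With text=True and no encoding, Python falls back to the locale's preferred encoding, which on Windows is often cp1252 and can mangle UTF-8 output:

    import locale

    # What text=True uses when no encoding is given:
    locale.getpreferredencoding(False)  # e.g. "cp1252" on many Windows setups, "UTF-8" on most Unix
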
@@ -621,7 +544,6 @@ class PandocExtractor(Extractor):
             ) from e
 
     def _extract_metadata_sync(self, path: Path) -> Metadata:
-        """Synchronous version of _handle_extract_metadata."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, metadata_file = tempfile.mkstemp(suffix=".json")
         os.close(fd)
@@ -638,7 +560,7 @@ class PandocExtractor(Extractor):
             str(metadata_file),
         ]
 
-        result = subprocess.run(command, capture_output=True, text=True, check=False)
+        result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})
@@ -655,7 +577,6 @@ class PandocExtractor(Extractor):
             Path(metadata_file).unlink()
 
     def _extract_file_sync(self, path: Path) -> str:
-        """Synchronous version of _handle_extract_file."""
         pandoc_type = self._get_pandoc_type_from_mime_type(self.mime_type)
         fd, output_path = tempfile.mkstemp(suffix=".md")
         os.close(fd)
@@ -673,7 +594,7 @@ class PandocExtractor(Extractor):
             str(output_path),
         ]
 
-        result = subprocess.run(command, capture_output=True, text=True, check=False)
+        result = subprocess.run(command, capture_output=True, text=True, check=False, encoding="utf-8")
 
         if result.returncode != 0:
             raise ParsingError("Failed to extract file data", context={"file": str(path), "error": result.stderr})