kreuzberg 3.9.1__py3-none-any.whl → 3.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -30,7 +30,7 @@ try:
30
30
  HTTP_422_UNPROCESSABLE_ENTITY,
31
31
  HTTP_500_INTERNAL_SERVER_ERROR,
32
32
  )
33
- except ImportError as e:
33
+ except ImportError as e: # pragma: no cover
34
34
  raise MissingDependencyError.create_for_package(
35
35
  dependency_group="litestar",
36
36
  functionality="Litestar API and docker container",
kreuzberg/_chunker.py CHANGED
@@ -43,7 +43,7 @@ def get_chunker(
43
43
  from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
- except ImportError as e:
46
+ except ImportError as e: # pragma: no cover
47
47
  raise MissingDependencyError.create_for_package(
48
48
  dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
49
49
  ) from e
kreuzberg/_config.py CHANGED
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
13
13
 
14
14
  if sys.version_info >= (3, 11):
15
15
  import tomllib
16
- else:
16
+ else: # pragma: no cover
17
17
  import tomli as tomllib # type: ignore[import-not-found]
18
18
 
19
19
  from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
50
50
  # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
51
  if config_path.name == "kreuzberg.toml":
52
52
  return data # type: ignore[no-any-return]
53
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
53
+
54
+ # For other files, check if they have [tool.kreuzberg] section
55
+ if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
56
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
+
58
+ # Otherwise assume root-level configuration
59
+ return data # type: ignore[no-any-return]
54
60
 
55
61
 
56
62
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -129,12 +135,23 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
129
135
  "extract_keywords",
130
136
  "auto_detect_language",
131
137
  "enable_quality_processing",
138
+ "auto_detect_document_type",
139
+ "document_type_confidence_threshold",
140
+ "document_classification_mode",
141
+ "keyword_count",
132
142
  }
133
143
  extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
134
144
 
135
145
  # Handle OCR backend configuration
136
146
  ocr_backend = extraction_config.get("ocr_backend")
137
147
  if ocr_backend and ocr_backend != "none":
148
+ # Validate OCR backend
149
+ valid_backends = {"tesseract", "easyocr", "paddleocr"}
150
+ if ocr_backend not in valid_backends:
151
+ raise ValidationError(
152
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
153
+ context={"provided": ocr_backend, "valid": sorted(valid_backends)},
154
+ )
138
155
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
139
156
  if ocr_config:
140
157
  extraction_config["ocr_config"] = ocr_config
@@ -286,6 +303,10 @@ _CONFIG_FIELDS = [
286
303
  "extract_keywords",
287
304
  "auto_detect_language",
288
305
  "enable_quality_processing",
306
+ "auto_detect_document_type",
307
+ "document_type_confidence_threshold",
308
+ "document_classification_mode",
309
+ "keyword_count",
289
310
  ]
290
311
 
291
312
 
@@ -4,13 +4,12 @@ import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
6
  from kreuzberg._ocr import get_ocr_backend
7
+ from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
7
8
  from kreuzberg.exceptions import MissingDependencyError
8
9
 
9
10
  if TYPE_CHECKING:
10
11
  from pathlib import Path
11
12
 
12
- from kreuzberg._types import ExtractionConfig, ExtractionResult
13
-
14
13
 
15
14
  DOCUMENT_CLASSIFIERS = {
16
15
  "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
52
51
  Raises:
53
52
  MissingDependencyError: If the deep-translator package is not installed
54
53
  """
54
+ # Combine content with metadata for classification
55
+ text_to_classify = result.content
56
+ if result.metadata:
57
+ # Add metadata values to the text for classification
58
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
+ text_to_classify = f"{text_to_classify} {metadata_text}"
60
+
55
61
  try:
56
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
57
- except ImportError as e:
63
+ except ImportError as e: # pragma: no cover
58
64
  raise MissingDependencyError(
59
65
  "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
60
66
  ) from e
61
67
 
62
- return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
68
+ try:
69
+ return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
+ except Exception: # noqa: BLE001
71
+ # Fall back to original content in lowercase if translation fails
72
+ return text_to_classify.lower()
63
73
 
64
74
 
65
75
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
73
83
  A tuple containing the detected document type and the confidence score,
74
84
  or (None, None) if no type is detected with sufficient confidence.
75
85
  """
86
+ if not config.auto_detect_document_type:
87
+ return None, None
88
+
76
89
  translated_text = _get_translated_text(result)
77
90
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
78
91
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
108
121
  A tuple containing the detected document type and the confidence score,
109
122
  or (None, None) if no type is detected with sufficient confidence.
110
123
  """
111
- translated_text = _get_translated_text(result)
124
+ if not config.auto_detect_document_type:
125
+ return None, None
112
126
 
113
127
  if result.layout is None or result.layout.empty:
114
128
  return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
117
131
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
118
132
  return None, None
119
133
 
134
+ # Use layout text for classification, not the content
135
+ layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
+
137
+ # Translate layout text directly for classification
138
+ text_to_classify = layout_text
139
+ if result.metadata:
140
+ # Add metadata values to the text for classification
141
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
+ text_to_classify = f"{text_to_classify} {metadata_text}"
143
+
144
+ try:
145
+ from deep_translator import GoogleTranslator # noqa: PLC0415
146
+
147
+ translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
+ except Exception: # noqa: BLE001
149
+ # Fall back to original content in lowercase if translation fails
150
+ translated_text = text_to_classify.lower()
151
+
120
152
  layout_df["translated_text"] = translated_text
121
153
 
122
154
  page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
151
183
  if config.document_classification_mode == "vision" and file_path:
152
184
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
153
185
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
+ elif result.layout is not None and not result.layout.empty:
187
+ # Use layout-based classification if layout data is available
188
+ result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
154
189
  else:
155
190
  result.document_type, result.document_type_confidence = classify_document(result, config)
156
191
  return result
@@ -139,7 +139,7 @@ def extract_entities(
139
139
 
140
140
  try:
141
141
  import spacy # noqa: F401, PLC0415
142
- except ImportError as e:
142
+ except ImportError as e: # pragma: no cover
143
143
  raise MissingDependencyError.create_for_package(
144
144
  package_name="spacy",
145
145
  dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
230
230
  return [(kw, float(score)) for kw, score in keywords]
231
231
  except (RuntimeError, OSError, ValueError):
232
232
  return []
233
- except ImportError as e:
233
+ except ImportError as e: # pragma: no cover
234
234
  raise MissingDependencyError.create_for_package(
235
235
  package_name="keybert",
236
236
  dependency_group="entity-extraction",
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
  # Import optional dependencies at module level with proper error handling
20
20
  try:
21
21
  import mailparse
22
- except ImportError:
22
+ except ImportError: # pragma: no cover
23
23
  mailparse = None
24
24
 
25
25
  try:
26
26
  import html2text # type: ignore[import-not-found]
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  html2text = None
29
29
 
30
30
  # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
59
59
 
60
60
  to_info = parsed_email.get("to")
61
61
  if to_info:
62
+ # Store the raw value in metadata (could be string, dict, or list)
62
63
  if isinstance(to_info, list) and to_info:
64
+ # For metadata, use first recipient's email if it's a list
63
65
  to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
66
+ metadata["email_to"] = to_email
64
67
  elif isinstance(to_info, dict):
65
- to_email = to_info.get("email", "")
68
+ metadata["email_to"] = to_info.get("email", "")
66
69
  else:
67
- to_email = str(to_info)
68
- metadata["email_to"] = to_email
69
- text_parts.append(f"To: {to_email}")
70
+ metadata["email_to"] = str(to_info)
71
+
72
+ # For display, format all recipients
73
+ to_formatted = self._format_email_field(to_info)
74
+ text_parts.append(f"To: {to_formatted}")
70
75
 
71
76
  date = parsed_email.get("date")
72
77
  if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
76
81
  cc = parsed_email.get("cc")
77
82
  if cc:
78
83
  metadata["email_cc"] = cc
79
- text_parts.append(f"CC: {cc}")
84
+ cc_formatted = self._format_email_field(cc)
85
+ text_parts.append(f"CC: {cc_formatted}")
80
86
 
81
87
  bcc = parsed_email.get("bcc")
82
88
  if bcc:
83
89
  metadata["email_bcc"] = bcc
84
- text_parts.append(f"BCC: {bcc}")
90
+ bcc_formatted = self._format_email_field(bcc)
91
+ text_parts.append(f"BCC: {bcc_formatted}")
92
+
93
+ def _format_email_field(self, field: Any) -> str:
94
+ """Format email field (to, cc, bcc) for display."""
95
+ if isinstance(field, list):
96
+ emails = []
97
+ for item in field:
98
+ if isinstance(item, dict):
99
+ email = item.get("email", "")
100
+ if email:
101
+ emails.append(email)
102
+ else:
103
+ emails.append(str(item))
104
+ return ", ".join(emails)
105
+ if isinstance(field, dict):
106
+ return str(field.get("email", ""))
107
+ return str(field)
85
108
 
86
109
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
110
  """Extract and process email body content."""
@@ -22,7 +22,7 @@ from kreuzberg._ocr._easyocr import EasyOCRConfig
22
22
  from kreuzberg._ocr._paddleocr import PaddleOCRConfig
23
23
  from kreuzberg._ocr._tesseract import TesseractConfig
24
24
  from kreuzberg._playa import extract_pdf_metadata, extract_pdf_metadata_sync
25
- from kreuzberg._types import ExtractionResult, OcrBackendType
25
+ from kreuzberg._types import ExtractionResult, Metadata, OcrBackendType
26
26
  from kreuzberg._utils._errors import create_error_context, should_retry
27
27
  from kreuzberg._utils._pdf_lock import pypdfium_file_lock
28
28
  from kreuzberg._utils._string import normalize_spaces
@@ -33,6 +33,7 @@ from kreuzberg.exceptions import ParsingError
33
33
 
34
34
  if TYPE_CHECKING: # pragma: no cover
35
35
  from PIL.Image import Image
36
+ from playa.document import Document
36
37
 
37
38
 
38
39
  class PDFExtractor(Extractor):
@@ -45,7 +46,7 @@ class PDFExtractor(Extractor):
45
46
  file_path, unlink = await create_temp_file(".pdf")
46
47
  await AsyncPath(file_path).write_bytes(content)
47
48
  try:
48
- metadata = await extract_pdf_metadata(content)
49
+ metadata = await self._extract_metadata_with_password_attempts(content)
49
50
  result = await self.extract_path_async(file_path)
50
51
 
51
52
  result.metadata = metadata
@@ -73,7 +74,7 @@ class PDFExtractor(Extractor):
73
74
  if not result:
74
75
  result = ExtractionResult(content="", mime_type=PLAIN_TEXT_MIME_TYPE, metadata={}, chunks=[])
75
76
 
76
- result.metadata = await extract_pdf_metadata(content_bytes)
77
+ result.metadata = await self._extract_metadata_with_password_attempts(content_bytes)
77
78
 
78
79
  if self.config.extract_tables:
79
80
  # GMFT is optional dependency
@@ -81,7 +82,7 @@ class PDFExtractor(Extractor):
81
82
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
82
83
 
83
84
  result.tables = await extract_tables(path, self.config.gmft_config)
84
- except ImportError:
85
+ except ImportError: # pragma: no cover
85
86
  result.tables = []
86
87
 
87
88
  # Enhance metadata with table information
@@ -107,7 +108,7 @@ class PDFExtractor(Extractor):
107
108
 
108
109
  result = self.extract_path_sync(Path(temp_path))
109
110
 
110
- metadata = extract_pdf_metadata_sync(content)
111
+ metadata = self._extract_metadata_with_password_attempts_sync(content)
111
112
  result.metadata = metadata
112
113
 
113
114
  return result
@@ -406,11 +407,81 @@ class PDFExtractor(Extractor):
406
407
  # Use list comprehension and join for efficient string building
407
408
  return "\n\n".join(result.content for result in results)
408
409
 
410
+ def _parse_with_password_attempts(self, content: bytes) -> Document:
411
+ """Parse PDF with password attempts."""
412
+ # Normalize password to list
413
+ if isinstance(self.config.pdf_password, str):
414
+ passwords = [self.config.pdf_password] if self.config.pdf_password else [""]
415
+ else:
416
+ passwords = list(self.config.pdf_password)
417
+
418
+ # Try each password in sequence
419
+ last_exception = None
420
+ for password in passwords:
421
+ try:
422
+ return parse(content, max_workers=1, password=password)
423
+ except Exception as e: # noqa: PERF203, BLE001
424
+ last_exception = e
425
+ continue
426
+
427
+ # If all passwords failed, raise the last exception
428
+ if last_exception:
429
+ raise last_exception from None
430
+
431
+ # Fallback to no password
432
+ return parse(content, max_workers=1, password="")
433
+
434
+ def _get_passwords_to_try(self) -> list[str]:
435
+ """Get list of passwords to try in sequence."""
436
+ if isinstance(self.config.pdf_password, str):
437
+ return [self.config.pdf_password] if self.config.pdf_password else [""]
438
+ return list(self.config.pdf_password) if self.config.pdf_password else [""]
439
+
440
+ async def _extract_metadata_with_password_attempts(self, content: bytes) -> Metadata:
441
+ """Extract PDF metadata with password attempts."""
442
+ passwords = self._get_passwords_to_try()
443
+
444
+ last_exception = None
445
+ for password in passwords:
446
+ try:
447
+ return await extract_pdf_metadata(content, password=password)
448
+ except Exception as e: # noqa: PERF203, BLE001
449
+ last_exception = e
450
+ continue
451
+
452
+ # If all passwords failed, try with empty password as fallback
453
+ try:
454
+ return await extract_pdf_metadata(content, password="")
455
+ except Exception:
456
+ if last_exception:
457
+ raise last_exception from None
458
+ raise
459
+
460
+ def _extract_metadata_with_password_attempts_sync(self, content: bytes) -> Metadata:
461
+ """Extract PDF metadata with password attempts (sync version)."""
462
+ passwords = self._get_passwords_to_try()
463
+
464
+ last_exception = None
465
+ for password in passwords:
466
+ try:
467
+ return extract_pdf_metadata_sync(content, password=password)
468
+ except Exception as e: # noqa: PERF203, BLE001
469
+ last_exception = e
470
+ continue
471
+
472
+ # If all passwords failed, try with empty password as fallback
473
+ try:
474
+ return extract_pdf_metadata_sync(content, password="")
475
+ except Exception:
476
+ if last_exception:
477
+ raise last_exception from None
478
+ raise
479
+
409
480
  def _extract_with_playa_sync(self, path: Path, fallback_text: str) -> str:
410
481
  """Extract text using playa for better structure preservation."""
411
482
  with contextlib.suppress(Exception):
412
483
  content = path.read_bytes()
413
- document = parse(content, max_workers=1)
484
+ document = self._parse_with_password_attempts(content)
414
485
 
415
486
  # Extract text while preserving structure
416
487
  pages_text = []
@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
6
6
 
7
7
  if sys.version_info >= (3, 11):
8
8
  import tomllib
9
- else:
9
+ else: # pragma: no cover
10
10
  try:
11
11
  import tomli as tomllib # type: ignore[import-not-found]
12
- except ImportError:
12
+ except ImportError: # pragma: no cover
13
13
  tomllib = None
14
14
 
15
15
  try:
16
16
  import yaml
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  yaml = None
19
19
 
20
20
  from anyio import Path as AsyncPath
kreuzberg/_gmft.py CHANGED
@@ -265,7 +265,7 @@ async def extract_tables(
265
265
  finally:
266
266
  await run_sync(doc.close)
267
267
 
268
- except ImportError as e:
268
+ except ImportError as e: # pragma: no cover
269
269
  raise MissingDependencyError.create_for_package(
270
270
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
271
271
  ) from e
@@ -379,7 +379,7 @@ def extract_tables_sync(
379
379
  finally:
380
380
  doc.close() # type: ignore[no-untyped-call]
381
381
 
382
- except ImportError as e:
382
+ except ImportError as e: # pragma: no cover
383
383
  raise MissingDependencyError.create_for_package(
384
384
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
385
385
  ) from e
@@ -14,7 +14,7 @@ try:
14
14
  from fast_langdetect import detect, detect_multilingual
15
15
 
16
16
  HAS_FAST_LANGDETECT = True
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  HAS_FAST_LANGDETECT = False
19
19
  detect = None
20
20
  detect_multilingual = None
kreuzberg/_mcp/server.py CHANGED
@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
268
268
  return [TextContent(type="text", text=content)]
269
269
 
270
270
 
271
- def main() -> None:
271
+ def main() -> None: # pragma: no cover
272
272
  """Main entry point for the MCP server."""
273
273
  mcp.run()
274
274
 
kreuzberg/_ocr/_base.py CHANGED
@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
88
88
  Returns:
89
89
  List of extraction result objects in the same order as input paths
90
90
  """
91
- return [self.process_file_sync(path, **kwargs) for path in paths]
91
+ return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
92
92
 
93
93
  async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
94
94
  """Asynchronously process a batch of files and extract their text and metadata.
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
106
106
  from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
107
107
 
108
108
  tasks = [self.process_file(path, **kwargs) for path in paths]
109
- return await run_taskgroup(*tasks)
109
+ return await run_taskgroup(*tasks) # pragma: no cover
110
110
 
111
111
  def __hash__(self) -> int:
112
112
  """Hash function for allowing caching."""
113
- return hash(type(self).__name__)
113
+ return hash(type(self).__name__) # pragma: no cover
@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
321
321
  import torch # noqa: PLC0415
322
322
 
323
323
  return bool(torch.cuda.is_available())
324
- except ImportError:
324
+ except ImportError: # pragma: no cover
325
325
  return False
326
326
 
327
327
  @classmethod
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
340
340
 
341
341
  try:
342
342
  import easyocr # noqa: PLC0415
343
- except ImportError as e:
343
+ except ImportError as e: # pragma: no cover
344
344
  raise MissingDependencyError.create_for_package(
345
345
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
346
346
  ) from e
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
508
508
 
509
509
  try:
510
510
  import easyocr # noqa: PLC0415
511
- except ImportError as e:
511
+ except ImportError as e: # pragma: no cover
512
512
  raise MissingDependencyError.create_for_package(
513
513
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
514
514
  ) from e
@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
261
261
 
262
262
  try:
263
263
  from paddleocr import PaddleOCR # noqa: PLC0415
264
- except ImportError as e:
264
+ except ImportError as e: # pragma: no cover
265
265
  raise MissingDependencyError.create_for_package(
266
266
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
267
267
  ) from e
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
428
428
 
429
429
  try:
430
430
  from paddleocr import PaddleOCR # noqa: PLC0415
431
- except ImportError as e:
431
+ except ImportError as e: # pragma: no cover
432
432
  raise MissingDependencyError.create_for_package(
433
433
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
434
434
  ) from e
kreuzberg/_playa.py CHANGED
@@ -24,11 +24,12 @@ FULL_DATE_LENGTH = 14
24
24
  BOM_CHAR = "\ufeff"
25
25
 
26
26
 
27
- async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
27
+ async def extract_pdf_metadata(pdf_content: bytes, password: str = "") -> Metadata:
28
28
  """Extract metadata from a PDF document.
29
29
 
30
30
  Args:
31
31
  pdf_content: The bytes of the PDF document.
32
+ password: Password for encrypted PDF files.
32
33
 
33
34
  Raises:
34
35
  ParsingError: If the PDF metadata could not be extracted.
@@ -37,7 +38,7 @@ async def extract_pdf_metadata(pdf_content: bytes) -> Metadata:
37
38
  A dictionary of metadata extracted from the PDF.
38
39
  """
39
40
  try:
40
- document = parse(pdf_content, max_workers=1)
41
+ document = parse(pdf_content, max_workers=1, password=password)
41
42
  metadata: Metadata = {}
42
43
 
43
44
  for raw_info in document.info:
@@ -142,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
142
143
  minute = date_str[10:12]
143
144
  second = date_str[12:14]
144
145
  time_part = f"T{hour}:{minute}:{second}"
145
- return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y%m%d%H%M%S").isoformat() # noqa: DTZ007
146
+ if time_part:
147
+ return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat() # noqa: DTZ007
148
+ return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat() # noqa: DTZ007
146
149
  return date_str
147
150
 
148
151
 
@@ -275,13 +278,14 @@ def _extract_structure_information(document: Document, result: Metadata) -> None
275
278
  result["subtitle"] = subtitle
276
279
 
277
280
 
278
- def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
281
+ def extract_pdf_metadata_sync(pdf_content: bytes, password: str = "") -> Metadata:
279
282
  """Synchronous version of extract_pdf_metadata.
280
283
 
281
284
  Extract metadata from a PDF document without using async/await.
282
285
 
283
286
  Args:
284
287
  pdf_content: The bytes of the PDF document.
288
+ password: Password for encrypted PDF files.
285
289
 
286
290
  Raises:
287
291
  ParsingError: If the PDF metadata could not be extracted.
@@ -290,7 +294,7 @@ def extract_pdf_metadata_sync(pdf_content: bytes) -> Metadata:
290
294
  A dictionary of metadata extracted from the PDF.
291
295
  """
292
296
  try:
293
- document = parse(pdf_content, max_workers=1)
297
+ document = parse(pdf_content, max_workers=1, password=password)
294
298
  metadata: Metadata = {}
295
299
 
296
300
  for raw_info in document.info:
kreuzberg/_types.py CHANGED
@@ -269,7 +269,7 @@ class ExtractionResult:
269
269
  Returns:
270
270
  List of CSV strings, one per table
271
271
  """
272
- if not self.tables:
272
+ if not self.tables: # pragma: no cover
273
273
  return []
274
274
 
275
275
  return [export_table_to_csv(table) for table in self.tables]
@@ -280,7 +280,7 @@ class ExtractionResult:
280
280
  Returns:
281
281
  List of TSV strings, one per table
282
282
  """
283
- if not self.tables:
283
+ if not self.tables: # pragma: no cover
284
284
  return []
285
285
 
286
286
  return [export_table_to_tsv(table) for table in self.tables]
@@ -291,7 +291,7 @@ class ExtractionResult:
291
291
  Returns:
292
292
  List of table structure dictionaries
293
293
  """
294
- if not self.tables:
294
+ if not self.tables: # pragma: no cover
295
295
  return []
296
296
 
297
297
  return [extract_table_structure_info(table) for table in self.tables]
@@ -349,14 +349,16 @@ class ExtractionConfig:
349
349
  """Configuration for language detection. If None, uses default settings."""
350
350
  spacy_entity_extraction_config: SpacyEntityExtractionConfig | None = None
351
351
  """Configuration for spaCy entity extraction. If None, uses default settings."""
352
- auto_detect_document_type: bool = False
352
+ auto_detect_document_type: bool = True
353
353
  """Whether to automatically detect the document type."""
354
- document_type_confidence_threshold: float = 0.7
354
+ document_type_confidence_threshold: float = 0.5
355
355
  """Confidence threshold for document type detection."""
356
356
  document_classification_mode: Literal["text", "vision"] = "text"
357
357
  """The mode to use for document classification."""
358
358
  enable_quality_processing: bool = True
359
359
  """Whether to apply quality post-processing to improve extraction results."""
360
+ pdf_password: str | list[str] = ""
361
+ """Password(s) for encrypted PDF files. Can be a single password or list of passwords to try in sequence. Only used when crypto extra is installed."""
360
362
 
361
363
  def __post_init__(self) -> None:
362
364
  if self.custom_entity_patterns is not None and isinstance(self.custom_entity_patterns, dict):
@@ -144,7 +144,7 @@ def _is_cuda_available() -> bool:
144
144
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
145
145
 
146
146
  return bool(torch.cuda.is_available())
147
- except ImportError:
147
+ except ImportError: # pragma: no cover
148
148
  return False
149
149
 
150
150
 
@@ -154,7 +154,7 @@ def _is_mps_available() -> bool:
154
154
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
155
155
 
156
156
  return bool(torch.backends.mps.is_available())
157
- except ImportError:
157
+ except ImportError: # pragma: no cover
158
158
  return False
159
159
 
160
160
 
@@ -190,7 +190,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
190
190
  )
191
191
  )
192
192
 
193
- except ImportError:
193
+ except ImportError: # pragma: no cover
194
194
  pass
195
195
 
196
196
  return devices
@@ -209,7 +209,7 @@ def _get_mps_device() -> DeviceInfo | None:
209
209
  name="Apple Silicon GPU (MPS)",
210
210
  )
211
211
 
212
- except ImportError:
212
+ except ImportError: # pragma: no cover
213
213
  return None
214
214
 
215
215
 
@@ -232,7 +232,7 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
232
232
 
233
233
  return total_memory, available_memory
234
234
 
235
- except ImportError:
235
+ except ImportError: # pragma: no cover
236
236
  return None, None
237
237
 
238
238
 
@@ -333,7 +333,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
333
333
 
334
334
  if torch.cuda.is_available():
335
335
  torch.cuda.empty_cache()
336
- except ImportError:
336
+ except ImportError: # pragma: no cover # pragma: no cover
337
337
  pass
338
338
 
339
339
  elif device.device_type == "mps":
@@ -58,6 +58,7 @@ class DocumentCache:
58
58
  "chunk_content": config.chunk_content,
59
59
  "max_chars": config.max_chars,
60
60
  "max_overlap": config.max_overlap,
61
+ "auto_detect_document_type": config.auto_detect_document_type,
61
62
  }
62
63
 
63
64
  cache_data = {**file_info, **config_info}
kreuzberg/cli.py CHANGED
@@ -12,7 +12,7 @@ try:
12
12
  import click
13
13
  from rich.console import Console
14
14
  from rich.progress import Progress, SpinnerColumn, TextColumn
15
- except ImportError as e:
15
+ except ImportError as e: # pragma: no cover
16
16
  raise ImportError(
17
17
  "CLI dependencies are not installed. Please install kreuzberg with the 'cli' extra: pip install kreuzberg[cli]"
18
18
  ) from e
@@ -163,7 +163,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
163
163
  import magic # type: ignore[import-not-found] # noqa: PLC0415
164
164
 
165
165
  mime_type = magic.from_buffer(input_bytes, mime=True)
166
- except ImportError:
166
+ except ImportError: # pragma: no cover
167
167
  content_str = input_bytes.decode("utf-8", errors="ignore").lower()
168
168
  mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
169
169
 
@@ -193,7 +193,7 @@ def _write_output(
193
193
  click.echo(formatted_output)
194
194
 
195
195
 
196
- def handle_error(error: Exception, verbose: bool) -> None:
196
+ def handle_error(error: Exception, verbose: bool) -> None: # pragma: no cover
197
197
  """Handle and display errors.
198
198
 
199
199
  Args:
@@ -202,19 +202,19 @@ def handle_error(error: Exception, verbose: bool) -> None:
202
202
  """
203
203
  if isinstance(error, MissingDependencyError):
204
204
  console.print(f"[red]Missing dependency:[/red] {error}", style="bold")
205
- sys.exit(2)
205
+ sys.exit(2) # pragma: no cover
206
206
  elif isinstance(error, KreuzbergError):
207
207
  console.print(f"[red]Error:[/red] {error}", style="bold")
208
208
  if verbose and error.context:
209
209
  console.print("\n[dim]Context:[/dim]")
210
210
  console.print(json.dumps(error.context, indent=2))
211
- sys.exit(1)
211
+ sys.exit(1) # pragma: no cover
212
212
  else:
213
213
  console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
214
214
  if verbose:
215
215
  console.print("\n[dim]Traceback:[/dim]")
216
216
  traceback.print_exc()
217
- sys.exit(1)
217
+ sys.exit(1) # pragma: no cover
218
218
 
219
219
 
220
220
  @click.group(invoke_without_command=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.9.1
3
+ Version: 3.10.1
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -53,6 +53,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
53
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
54
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
55
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
56
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
56
57
  Requires-Dist: rich>=14.1.0; extra == 'all'
57
58
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
58
59
  Requires-Dist: setuptools>=80.9.0; extra == 'all'
@@ -69,6 +70,8 @@ Provides-Extra: cli
69
70
  Requires-Dist: click>=8.2.1; extra == 'cli'
70
71
  Requires-Dist: rich>=14.1.0; extra == 'cli'
71
72
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
+ Provides-Extra: crypto
74
+ Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
72
75
  Provides-Extra: easyocr
73
76
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
74
77
  Provides-Extra: entity-extraction
@@ -1,43 +1,43 @@
1
1
  kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
2
2
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
- kreuzberg/_chunker.py,sha256=QmYbPHPE36ztMT70xPwg_Y4NIftCDl0wyufg5X9lmTo,1932
4
- kreuzberg/_config.py,sha256=EvrBFAawjfKgXu49tACi4CuMmmoIRt_EzbHayZqM_jU,12983
3
+ kreuzberg/_chunker.py,sha256=y4-dX6ILjjBkkC1gkCzXb7v7vbi8844m7vz1gIzbmv4,1952
4
+ kreuzberg/_config.py,sha256=Yptj-wFXsQHxIdG7xd_HYA7bqjaRLq-1Os4nPQwJtl8,13947
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_document_classification.py,sha256=8XVTKh8ohsb4mbKw2gPFr5OB6v4dWuzXhFE_63vHLrw,5189
7
- kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR0c,7862
8
- kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
9
- kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
6
+ kreuzberg/_document_classification.py,sha256=SErB5Ji6AyEDhgHYPUQr1xcj8FYo0cWOHqb-utRjgJE,6874
7
+ kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
8
+ kreuzberg/_gmft.py,sha256=HdQ7Xpuixxl2Y0jY8C3KfyQEU0mN4yQdqErWCv4TnFY,25573
9
+ kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
10
10
  kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
11
- kreuzberg/_playa.py,sha256=9z4If0WHxbYQxfb8xT7T96L9Du2Fj3Ar5-rF0OHHiMM,11877
11
+ kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
12
12
  kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
13
- kreuzberg/_types.py,sha256=Si-Kb58HgE4ckGyZnJFqbWRbCNbdyC_Y0-p75aQP838,15065
14
- kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
13
+ kreuzberg/_types.py,sha256=OiB-W_V3zg5rHCfVOr4x39TZJIsd5-sE_SgPZzokW9k,15318
14
+ kreuzberg/cli.py,sha256=rJMdHg7FhUxefCrx-sf4c2qVGRXr8Xrpjgfx_DQSKMg,12558
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
16
  kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
19
+ kreuzberg/_api/main.py,sha256=8VwxRlIXwnPs7ZYm0saUZsNOjevEAWJQpNreG-X7ZpE,3273
20
20
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
21
  kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
22
- kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
22
+ kreuzberg/_extractors/_email.py,sha256=Jpr4NFef640uVgNFkR1or-omy8RVt-NOHUYgWRDjyBo,6753
23
23
  kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
24
24
  kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
25
25
  kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
26
- kreuzberg/_extractors/_pdf.py,sha256=UlliWggWHuVwwJE-bRa7H9-_cieSa8kdrQP3x_GOxxY,17018
26
+ kreuzberg/_extractors/_pdf.py,sha256=kvbB9Brz7brUpqMZN1G-CRAFgXn5hlWa6eGKXSAe4Xk,19938
27
27
  kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
28
28
  kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
29
- kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
29
+ kreuzberg/_extractors/_structured.py,sha256=PbNaXd-_PUPsE0yZkISod_vLBokbWdVTKEPpEmqaEMM,5787
30
30
  kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
31
- kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
31
+ kreuzberg/_mcp/server.py,sha256=aAnkRDvr-tDIw-G9CAThVmvR3qOQOowcnWE6BLJg2jg,8723
32
32
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
- kreuzberg/_ocr/_base.py,sha256=urvsLRgOmVYHjxil_IsSL69FmMnboklC4CHAjdBQLKQ,3893
34
- kreuzberg/_ocr/_easyocr.py,sha256=pw2uDmULuMQ9T1Gl4axP_ev7-qwjLt1mJHHyZ34P_FI,17178
35
- kreuzberg/_ocr/_paddleocr.py,sha256=s75aQJILXm1ZbacyZiLPXh6jEAg9tk2NYnwPnfSDrRU,17543
33
+ kreuzberg/_ocr/_base.py,sha256=IkONqwG6zxZoVMni1JlYugBoyONahlRny7J2_7Dy69c,3953
34
+ kreuzberg/_ocr/_easyocr.py,sha256=dWfoj5fPIGqJPGTVeZ0W59TrW3DpNwF0bcfgt6FwQUw,17238
35
+ kreuzberg/_ocr/_paddleocr.py,sha256=Is_iJQaSUeCMfCvg5RnuG_pmBRjBt0b3dCBPY1IAc3A,17583
36
36
  kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
37
37
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
39
- kreuzberg/_utils/_device.py,sha256=arVrJOSp_2LbbN6lu_rMEUOezzRogdWdkF8d5q5Bg8U,10345
40
- kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
39
+ kreuzberg/_utils/_device.py,sha256=JI9p9TGSfQHEi2SL-ovOXMr9RUnVq-RrEly89OvmQ5w,10485
40
+ kreuzberg/_utils/_document_cache.py,sha256=ka90JIT-FXUMOv8z2u3fztQgZZb2XQDHTMnBi32mySA,7005
41
41
  kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
42
42
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
43
43
  kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
47
47
  kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
48
48
  kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
49
49
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
- kreuzberg-3.9.1.dist-info/METADATA,sha256=rBzP4yLvNuodmSrOUNXeYnUZCEPocULKhSjykSlPBeU,11908
51
- kreuzberg-3.9.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
- kreuzberg-3.9.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
- kreuzberg-3.9.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
- kreuzberg-3.9.1.dist-info/RECORD,,
50
+ kreuzberg-3.10.1.dist-info/METADATA,sha256=2idgsCPgaT174AZ7HA8yXZEjPLdiLkEH9-e-IMmvFck,12047
51
+ kreuzberg-3.10.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.10.1.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
+ kreuzberg-3.10.1.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.10.1.dist-info/RECORD,,