kreuzberg 3.10.0__py3-none-any.whl → 3.11.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
kreuzberg/_api/main.py CHANGED
@@ -30,7 +30,7 @@ try:
30
30
  HTTP_422_UNPROCESSABLE_ENTITY,
31
31
  HTTP_500_INTERNAL_SERVER_ERROR,
32
32
  )
33
- except ImportError as e:
33
+ except ImportError as e: # pragma: no cover
34
34
  raise MissingDependencyError.create_for_package(
35
35
  dependency_group="litestar",
36
36
  functionality="Litestar API and docker container",
kreuzberg/_chunker.py CHANGED
@@ -43,7 +43,7 @@ def get_chunker(
43
43
  from semantic_text_splitter import TextSplitter # noqa: PLC0415
44
44
 
45
45
  _chunkers[key] = TextSplitter(max_characters, overlap_characters)
46
- except ImportError as e:
46
+ except ImportError as e: # pragma: no cover
47
47
  raise MissingDependencyError.create_for_package(
48
48
  dependency_group="chunking", functionality="chunking", package_name="semantic-text-splitter"
49
49
  ) from e
kreuzberg/_config.py CHANGED
@@ -13,7 +13,7 @@ from typing import TYPE_CHECKING, Any
13
13
 
14
14
  if sys.version_info >= (3, 11):
15
15
  import tomllib
16
- else:
16
+ else: # pragma: no cover
17
17
  import tomli as tomllib # type: ignore[import-not-found]
18
18
 
19
19
  from kreuzberg._gmft import GMFTConfig
@@ -50,7 +50,13 @@ def load_config_from_file(config_path: Path) -> dict[str, Any]:
50
50
  # Handle both kreuzberg.toml (root level) and pyproject.toml ([tool.kreuzberg])
51
51
  if config_path.name == "kreuzberg.toml":
52
52
  return data # type: ignore[no-any-return]
53
- return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
53
+
54
+ # For other files, check if they have [tool.kreuzberg] section
55
+ if config_path.name == "pyproject.toml" or ("tool" in data and "kreuzberg" in data.get("tool", {})):
56
+ return data.get("tool", {}).get("kreuzberg", {}) # type: ignore[no-any-return]
57
+
58
+ # Otherwise assume root-level configuration
59
+ return data # type: ignore[no-any-return]
54
60
 
55
61
 
56
62
  def merge_configs(base: dict[str, Any], override: dict[str, Any]) -> dict[str, Any]:
@@ -91,19 +97,21 @@ def parse_ocr_backend_config(
91
97
  if not isinstance(backend_config, dict):
92
98
  return None
93
99
 
94
- if backend == "tesseract":
95
- # Convert psm integer to PSMMode enum if needed
96
- processed_config = backend_config.copy()
97
- if "psm" in processed_config and isinstance(processed_config["psm"], int):
98
- from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
99
-
100
- processed_config["psm"] = PSMMode(processed_config["psm"])
101
- return TesseractConfig(**processed_config)
102
- if backend == "easyocr":
103
- return EasyOCRConfig(**backend_config)
104
- if backend == "paddleocr":
105
- return PaddleOCRConfig(**backend_config)
106
- return None
100
+ match backend:
101
+ case "tesseract":
102
+ # Convert psm integer to PSMMode enum if needed
103
+ processed_config = backend_config.copy()
104
+ if "psm" in processed_config and isinstance(processed_config["psm"], int):
105
+ from kreuzberg._ocr._tesseract import PSMMode # noqa: PLC0415
106
+
107
+ processed_config["psm"] = PSMMode(processed_config["psm"])
108
+ return TesseractConfig(**processed_config)
109
+ case "easyocr":
110
+ return EasyOCRConfig(**backend_config)
111
+ case "paddleocr":
112
+ return PaddleOCRConfig(**backend_config)
113
+ case _:
114
+ return None
107
115
 
108
116
 
109
117
  def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> ExtractionConfig:
@@ -129,12 +137,25 @@ def build_extraction_config_from_dict(config_dict: dict[str, Any]) -> Extraction
129
137
  "extract_keywords",
130
138
  "auto_detect_language",
131
139
  "enable_quality_processing",
140
+ "auto_detect_document_type",
141
+ "document_type_confidence_threshold",
142
+ "document_classification_mode",
143
+ "keyword_count",
144
+ }
145
+ extraction_config = extraction_config | {
146
+ field: config_dict[field] for field in basic_fields if field in config_dict
132
147
  }
133
- extraction_config.update({field: config_dict[field] for field in basic_fields if field in config_dict})
134
148
 
135
149
  # Handle OCR backend configuration
136
150
  ocr_backend = extraction_config.get("ocr_backend")
137
151
  if ocr_backend and ocr_backend != "none":
152
+ # Validate OCR backend
153
+ valid_backends = {"tesseract", "easyocr", "paddleocr"}
154
+ if ocr_backend not in valid_backends:
155
+ raise ValidationError(
156
+ f"Invalid OCR backend: {ocr_backend}. Must be one of: {', '.join(sorted(valid_backends))} or 'none'",
157
+ context={"provided": ocr_backend, "valid": sorted(valid_backends)},
158
+ )
138
159
  ocr_config = parse_ocr_backend_config(config_dict, ocr_backend)
139
160
  if ocr_config:
140
161
  extraction_config["ocr_config"] = ocr_config
@@ -286,6 +307,10 @@ _CONFIG_FIELDS = [
286
307
  "extract_keywords",
287
308
  "auto_detect_language",
288
309
  "enable_quality_processing",
310
+ "auto_detect_document_type",
311
+ "document_type_confidence_threshold",
312
+ "document_classification_mode",
313
+ "keyword_count",
289
314
  ]
290
315
 
291
316
 
@@ -4,13 +4,12 @@ import re
4
4
  from typing import TYPE_CHECKING
5
5
 
6
6
  from kreuzberg._ocr import get_ocr_backend
7
+ from kreuzberg._types import ExtractionConfig, ExtractionResult # noqa: TC001
7
8
  from kreuzberg.exceptions import MissingDependencyError
8
9
 
9
10
  if TYPE_CHECKING:
10
11
  from pathlib import Path
11
12
 
12
- from kreuzberg._types import ExtractionConfig, ExtractionResult
13
-
14
13
 
15
14
  DOCUMENT_CLASSIFIERS = {
16
15
  "invoice": [
@@ -52,14 +51,25 @@ def _get_translated_text(result: ExtractionResult) -> str:
52
51
  Raises:
53
52
  MissingDependencyError: If the deep-translator package is not installed
54
53
  """
54
+ # Combine content with metadata for classification
55
+ text_to_classify = result.content
56
+ if result.metadata:
57
+ # Add metadata values to the text for classification
58
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
59
+ text_to_classify = f"{text_to_classify} {metadata_text}"
60
+
55
61
  try:
56
62
  from deep_translator import GoogleTranslator # noqa: PLC0415
57
- except ImportError as e:
63
+ except ImportError as e: # pragma: no cover
58
64
  raise MissingDependencyError(
59
- "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[auto-classify-document-type]'"
65
+ "The 'deep-translator' library is not installed. Please install it with: pip install 'kreuzberg[document-classification]'"
60
66
  ) from e
61
67
 
62
- return str(GoogleTranslator(source="auto", target="en").translate(result.content).lower())
68
+ try:
69
+ return str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
70
+ except Exception: # noqa: BLE001
71
+ # Fall back to the untranslated text (content plus metadata) in lowercase if translation fails
72
+ return text_to_classify.lower()
63
73
 
64
74
 
65
75
  def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tuple[str | None, float | None]:
@@ -73,6 +83,9 @@ def classify_document(result: ExtractionResult, config: ExtractionConfig) -> tup
73
83
  A tuple containing the detected document type and the confidence score,
74
84
  or (None, None) if no type is detected with sufficient confidence.
75
85
  """
86
+ if not config.auto_detect_document_type:
87
+ return None, None
88
+
76
89
  translated_text = _get_translated_text(result)
77
90
  scores = dict.fromkeys(DOCUMENT_CLASSIFIERS, 0)
78
91
 
@@ -108,7 +121,8 @@ def classify_document_from_layout(
108
121
  A tuple containing the detected document type and the confidence score,
109
122
  or (None, None) if no type is detected with sufficient confidence.
110
123
  """
111
- translated_text = _get_translated_text(result)
124
+ if not config.auto_detect_document_type:
125
+ return None, None
112
126
 
113
127
  if result.layout is None or result.layout.empty:
114
128
  return None, None
@@ -117,6 +131,24 @@ def classify_document_from_layout(
117
131
  if not all(col in layout_df.columns for col in ["text", "top", "height"]):
118
132
  return None, None
119
133
 
134
+ # Use layout text for classification, not the content
135
+ layout_text = " ".join(layout_df["text"].astype(str).tolist())
136
+
137
+ # Translate layout text directly for classification
138
+ text_to_classify = layout_text
139
+ if result.metadata:
140
+ # Add metadata values to the text for classification
141
+ metadata_text = " ".join(str(value) for value in result.metadata.values() if value)
142
+ text_to_classify = f"{text_to_classify} {metadata_text}"
143
+
144
+ try:
145
+ from deep_translator import GoogleTranslator # noqa: PLC0415
146
+
147
+ translated_text = str(GoogleTranslator(source="auto", target="en").translate(text_to_classify).lower())
148
+ except Exception: # noqa: BLE001
149
+ # Fall back to original content in lowercase if translation fails
150
+ translated_text = text_to_classify.lower()
151
+
120
152
  layout_df["translated_text"] = translated_text
121
153
 
122
154
  page_height = layout_df["top"].max() + layout_df["height"].max()
@@ -151,6 +183,9 @@ def auto_detect_document_type(
151
183
  if config.document_classification_mode == "vision" and file_path:
152
184
  layout_result = get_ocr_backend("tesseract").process_file_sync(file_path, **config.get_config_dict())
153
185
  result.document_type, result.document_type_confidence = classify_document_from_layout(layout_result, config)
186
+ elif result.layout is not None and not result.layout.empty:
187
+ # Use layout-based classification if layout data is available
188
+ result.document_type, result.document_type_confidence = classify_document_from_layout(result, config)
154
189
  else:
155
190
  result.document_type, result.document_type_confidence = classify_document(result, config)
156
191
  return result
@@ -139,7 +139,7 @@ def extract_entities(
139
139
 
140
140
  try:
141
141
  import spacy # noqa: F401, PLC0415
142
- except ImportError as e:
142
+ except ImportError as e: # pragma: no cover
143
143
  raise MissingDependencyError.create_for_package(
144
144
  package_name="spacy",
145
145
  dependency_group="entity-extraction",
@@ -230,7 +230,7 @@ def extract_keywords(
230
230
  return [(kw, float(score)) for kw, score in keywords]
231
231
  except (RuntimeError, OSError, ValueError):
232
232
  return []
233
- except ImportError as e:
233
+ except ImportError as e: # pragma: no cover
234
234
  raise MissingDependencyError.create_for_package(
235
235
  package_name="keybert",
236
236
  dependency_group="entity-extraction",
@@ -116,8 +116,7 @@ class Extractor(ABC):
116
116
  quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
117
117
 
118
118
  # Add quality metadata
119
- enhanced_metadata = dict(result.metadata) if result.metadata else {}
120
- enhanced_metadata["quality_score"] = quality_score
119
+ enhanced_metadata = (dict(result.metadata) if result.metadata else {}) | {"quality_score": quality_score}
121
120
 
122
121
  # Return enhanced result
123
122
  return ExtractionResult(
@@ -19,12 +19,12 @@ if TYPE_CHECKING:
19
19
  # Import optional dependencies at module level with proper error handling
20
20
  try:
21
21
  import mailparse
22
- except ImportError:
22
+ except ImportError: # pragma: no cover
23
23
  mailparse = None
24
24
 
25
25
  try:
26
26
  import html2text # type: ignore[import-not-found]
27
- except ImportError:
27
+ except ImportError: # pragma: no cover
28
28
  html2text = None
29
29
 
30
30
  # Compile regex pattern once at module level
@@ -59,14 +59,19 @@ class EmailExtractor(Extractor):
59
59
 
60
60
  to_info = parsed_email.get("to")
61
61
  if to_info:
62
+ # Store the raw value in metadata (could be string, dict, or list)
62
63
  if isinstance(to_info, list) and to_info:
64
+ # For metadata, use first recipient's email if it's a list
63
65
  to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
66
+ metadata["email_to"] = to_email
64
67
  elif isinstance(to_info, dict):
65
- to_email = to_info.get("email", "")
68
+ metadata["email_to"] = to_info.get("email", "")
66
69
  else:
67
- to_email = str(to_info)
68
- metadata["email_to"] = to_email
69
- text_parts.append(f"To: {to_email}")
70
+ metadata["email_to"] = str(to_info)
71
+
72
+ # For display, format all recipients
73
+ to_formatted = self._format_email_field(to_info)
74
+ text_parts.append(f"To: {to_formatted}")
70
75
 
71
76
  date = parsed_email.get("date")
72
77
  if date:
@@ -76,12 +81,30 @@ class EmailExtractor(Extractor):
76
81
  cc = parsed_email.get("cc")
77
82
  if cc:
78
83
  metadata["email_cc"] = cc
79
- text_parts.append(f"CC: {cc}")
84
+ cc_formatted = self._format_email_field(cc)
85
+ text_parts.append(f"CC: {cc_formatted}")
80
86
 
81
87
  bcc = parsed_email.get("bcc")
82
88
  if bcc:
83
89
  metadata["email_bcc"] = bcc
84
- text_parts.append(f"BCC: {bcc}")
90
+ bcc_formatted = self._format_email_field(bcc)
91
+ text_parts.append(f"BCC: {bcc_formatted}")
92
+
93
+ def _format_email_field(self, field: Any) -> str:
94
+ """Format email field (to, cc, bcc) for display."""
95
+ if isinstance(field, list):
96
+ emails = []
97
+ for item in field:
98
+ if isinstance(item, dict):
99
+ email = item.get("email", "")
100
+ if email:
101
+ emails.append(email)
102
+ else:
103
+ emails.append(str(item))
104
+ return ", ".join(emails)
105
+ if isinstance(field, dict):
106
+ return str(field.get("email", ""))
107
+ return str(field)
85
108
 
86
109
  def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
87
110
  """Extract and process email body content."""
@@ -85,23 +85,24 @@ class ImageExtractor(Extractor):
85
85
 
86
86
  backend = get_ocr_backend(self.config.ocr_backend)
87
87
 
88
- if self.config.ocr_backend == "tesseract":
89
- config = (
90
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
91
- )
92
- result = backend.process_file_sync(path, **asdict(config))
93
- elif self.config.ocr_backend == "paddleocr":
94
- paddle_config = (
95
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
96
- )
97
- result = backend.process_file_sync(path, **asdict(paddle_config))
98
- elif self.config.ocr_backend == "easyocr":
99
- easy_config = (
100
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
101
- )
102
- result = backend.process_file_sync(path, **asdict(easy_config))
103
- else:
104
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
88
+ match self.config.ocr_backend:
89
+ case "tesseract":
90
+ config = (
91
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
92
+ )
93
+ result = backend.process_file_sync(path, **asdict(config))
94
+ case "paddleocr":
95
+ paddle_config = (
96
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
97
+ )
98
+ result = backend.process_file_sync(path, **asdict(paddle_config))
99
+ case "easyocr":
100
+ easy_config = (
101
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
102
+ )
103
+ result = backend.process_file_sync(path, **asdict(easy_config))
104
+ case _:
105
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
105
106
  return self._apply_quality_processing(result)
106
107
 
107
108
  def _get_extension_from_mime_type(self, mime_type: str) -> str:
@@ -82,20 +82,18 @@ class PDFExtractor(Extractor):
82
82
  from kreuzberg._gmft import extract_tables # noqa: PLC0415
83
83
 
84
84
  result.tables = await extract_tables(path, self.config.gmft_config)
85
- except ImportError:
85
+ except ImportError: # pragma: no cover
86
86
  result.tables = []
87
87
 
88
88
  # Enhance metadata with table information
89
89
  if result.tables:
90
90
  table_summary = generate_table_summary(result.tables)
91
- result.metadata.update(
92
- {
93
- "table_count": table_summary["table_count"],
94
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
95
- f"across {table_summary['pages_with_tables']} pages with "
96
- f"{table_summary['total_rows']} total rows",
97
- }
98
- )
91
+ result.metadata = result.metadata | {
92
+ "table_count": table_summary["table_count"],
93
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
94
+ f"across {table_summary['pages_with_tables']} pages with "
95
+ f"{table_summary['total_rows']} total rows",
96
+ }
99
97
 
100
98
  return self._apply_quality_processing(result)
101
99
 
@@ -153,14 +151,12 @@ class PDFExtractor(Extractor):
153
151
  # Enhance metadata with table information
154
152
  if tables:
155
153
  table_summary = generate_table_summary(tables)
156
- result.metadata.update(
157
- {
158
- "table_count": table_summary["table_count"],
159
- "tables_summary": f"Document contains {table_summary['table_count']} tables "
160
- f"across {table_summary['pages_with_tables']} pages with "
161
- f"{table_summary['total_rows']} total rows",
162
- }
163
- )
154
+ result.metadata = result.metadata | {
155
+ "table_count": table_summary["table_count"],
156
+ "tables_summary": f"Document contains {table_summary['table_count']} tables "
157
+ f"across {table_summary['pages_with_tables']} pages with "
158
+ f"{table_summary['total_rows']} total rows",
159
+ }
164
160
 
165
161
  # Apply quality processing
166
162
  return self._apply_quality_processing(result)
@@ -386,23 +382,24 @@ class PDFExtractor(Extractor):
386
382
  backend = get_ocr_backend(self.config.ocr_backend)
387
383
  paths = [Path(p) for p in image_paths]
388
384
 
389
- if self.config.ocr_backend == "tesseract":
390
- config = (
391
- self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
392
- )
393
- results = backend.process_batch_sync(paths, **asdict(config))
394
- elif self.config.ocr_backend == "paddleocr":
395
- paddle_config = (
396
- self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
397
- )
398
- results = backend.process_batch_sync(paths, **asdict(paddle_config))
399
- elif self.config.ocr_backend == "easyocr":
400
- easy_config = (
401
- self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
402
- )
403
- results = backend.process_batch_sync(paths, **asdict(easy_config))
404
- else:
405
- raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
385
+ match self.config.ocr_backend:
386
+ case "tesseract":
387
+ config = (
388
+ self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
389
+ )
390
+ results = backend.process_batch_sync(paths, **asdict(config))
391
+ case "paddleocr":
392
+ paddle_config = (
393
+ self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
394
+ )
395
+ results = backend.process_batch_sync(paths, **asdict(paddle_config))
396
+ case "easyocr":
397
+ easy_config = (
398
+ self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
399
+ )
400
+ results = backend.process_batch_sync(paths, **asdict(easy_config))
401
+ case _:
402
+ raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
406
403
 
407
404
  # Use list comprehension and join for efficient string building
408
405
  return "\n\n".join(result.content for result in results)
@@ -6,15 +6,15 @@ from typing import TYPE_CHECKING, Any, ClassVar
6
6
 
7
7
  if sys.version_info >= (3, 11):
8
8
  import tomllib
9
- else:
9
+ else: # pragma: no cover
10
10
  try:
11
11
  import tomli as tomllib # type: ignore[import-not-found]
12
- except ImportError:
12
+ except ImportError: # pragma: no cover
13
13
  tomllib = None
14
14
 
15
15
  try:
16
16
  import yaml
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  yaml = None
19
19
 
20
20
  from anyio import Path as AsyncPath
kreuzberg/_gmft.py CHANGED
@@ -265,7 +265,7 @@ async def extract_tables(
265
265
  finally:
266
266
  await run_sync(doc.close)
267
267
 
268
- except ImportError as e:
268
+ except ImportError as e: # pragma: no cover
269
269
  raise MissingDependencyError.create_for_package(
270
270
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
271
271
  ) from e
@@ -379,7 +379,7 @@ def extract_tables_sync(
379
379
  finally:
380
380
  doc.close() # type: ignore[no-untyped-call]
381
381
 
382
- except ImportError as e:
382
+ except ImportError as e: # pragma: no cover
383
383
  raise MissingDependencyError.create_for_package(
384
384
  dependency_group="gmft", functionality="table extraction", package_name="gmft"
385
385
  ) from e
@@ -14,7 +14,7 @@ try:
14
14
  from fast_langdetect import detect, detect_multilingual
15
15
 
16
16
  HAS_FAST_LANGDETECT = True
17
- except ImportError:
17
+ except ImportError: # pragma: no cover
18
18
  HAS_FAST_LANGDETECT = False
19
19
  detect = None
20
20
  detect_multilingual = None
kreuzberg/_mcp/server.py CHANGED
@@ -51,7 +51,7 @@ def _create_config_with_overrides(**kwargs: Any) -> ExtractionConfig:
51
51
  }
52
52
 
53
53
  # Override with provided parameters
54
- config_dict.update(kwargs)
54
+ config_dict = config_dict | kwargs
55
55
 
56
56
  return ExtractionConfig(**config_dict)
57
57
 
@@ -268,7 +268,7 @@ def extract_structured(file_path: str) -> list[TextContent]:
268
268
  return [TextContent(type="text", text=content)]
269
269
 
270
270
 
271
- def main() -> None:
271
+ def main() -> None: # pragma: no cover
272
272
  """Main entry point for the MCP server."""
273
273
  mcp.run()
274
274
 
kreuzberg/_ocr/_base.py CHANGED
@@ -88,7 +88,7 @@ class OCRBackend(ABC, Generic[T]):
88
88
  Returns:
89
89
  List of extraction result objects in the same order as input paths
90
90
  """
91
- return [self.process_file_sync(path, **kwargs) for path in paths]
91
+ return [self.process_file_sync(path, **kwargs) for path in paths] # pragma: no cover
92
92
 
93
93
  async def process_batch(self, paths: list[Path], **kwargs: Unpack[T]) -> list[ExtractionResult]:
94
94
  """Asynchronously process a batch of files and extract their text and metadata.
@@ -106,8 +106,8 @@ class OCRBackend(ABC, Generic[T]):
106
106
  from kreuzberg._utils._sync import run_taskgroup # noqa: PLC0415
107
107
 
108
108
  tasks = [self.process_file(path, **kwargs) for path in paths]
109
- return await run_taskgroup(*tasks)
109
+ return await run_taskgroup(*tasks) # pragma: no cover
110
110
 
111
111
  def __hash__(self) -> int:
112
112
  """Hash function for allowing caching."""
113
- return hash(type(self).__name__)
113
+ return hash(type(self).__name__) # pragma: no cover
@@ -321,7 +321,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
321
321
  import torch # noqa: PLC0415
322
322
 
323
323
  return bool(torch.cuda.is_available())
324
- except ImportError:
324
+ except ImportError: # pragma: no cover
325
325
  return False
326
326
 
327
327
  @classmethod
@@ -340,7 +340,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
340
340
 
341
341
  try:
342
342
  import easyocr # noqa: PLC0415
343
- except ImportError as e:
343
+ except ImportError as e: # pragma: no cover
344
344
  raise MissingDependencyError.create_for_package(
345
345
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
346
346
  ) from e
@@ -508,7 +508,7 @@ class EasyOCRBackend(OCRBackend[EasyOCRConfig]):
508
508
 
509
509
  try:
510
510
  import easyocr # noqa: PLC0415
511
- except ImportError as e:
511
+ except ImportError as e: # pragma: no cover
512
512
  raise MissingDependencyError.create_for_package(
513
513
  dependency_group="easyocr", functionality="EasyOCR as an OCR backend", package_name="easyocr"
514
514
  ) from e
@@ -261,7 +261,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
261
261
 
262
262
  try:
263
263
  from paddleocr import PaddleOCR # noqa: PLC0415
264
- except ImportError as e:
264
+ except ImportError as e: # pragma: no cover
265
265
  raise MissingDependencyError.create_for_package(
266
266
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
267
267
  ) from e
@@ -428,7 +428,7 @@ class PaddleBackend(OCRBackend[PaddleOCRConfig]):
428
428
 
429
429
  try:
430
430
  from paddleocr import PaddleOCR # noqa: PLC0415
431
- except ImportError as e:
431
+ except ImportError as e: # pragma: no cover
432
432
  raise MissingDependencyError.create_for_package(
433
433
  dependency_group="paddleocr", functionality="PaddleOCR as an OCR backend", package_name="paddleocr"
434
434
  ) from e
kreuzberg/_playa.py CHANGED
@@ -143,7 +143,9 @@ def _parse_date_string(date_str: str) -> str:
143
143
  minute = date_str[10:12]
144
144
  second = date_str[12:14]
145
145
  time_part = f"T{hour}:{minute}:{second}"
146
- return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y%m%d%H%M%S").isoformat() # noqa: DTZ007
146
+ if time_part:
147
+ return datetime.strptime(f"{year}-{month}-{day}{time_part}", "%Y-%m-%dT%H:%M:%S").isoformat() # noqa: DTZ007
148
+ return datetime.strptime(f"{year}-{month}-{day}", "%Y-%m-%d").isoformat() # noqa: DTZ007
147
149
  return date_str
148
150
 
149
151
 
kreuzberg/_types.py CHANGED
@@ -269,7 +269,7 @@ class ExtractionResult:
269
269
  Returns:
270
270
  List of CSV strings, one per table
271
271
  """
272
- if not self.tables:
272
+ if not self.tables: # pragma: no cover
273
273
  return []
274
274
 
275
275
  return [export_table_to_csv(table) for table in self.tables]
@@ -280,7 +280,7 @@ class ExtractionResult:
280
280
  Returns:
281
281
  List of TSV strings, one per table
282
282
  """
283
- if not self.tables:
283
+ if not self.tables: # pragma: no cover
284
284
  return []
285
285
 
286
286
  return [export_table_to_tsv(table) for table in self.tables]
@@ -291,7 +291,7 @@ class ExtractionResult:
291
291
  Returns:
292
292
  List of table structure dictionaries
293
293
  """
294
- if not self.tables:
294
+ if not self.tables: # pragma: no cover
295
295
  return []
296
296
 
297
297
  return [extract_table_structure_info(table) for table in self.tables]
@@ -351,7 +351,7 @@ class ExtractionConfig:
351
351
  """Configuration for spaCy entity extraction. If None, uses default settings."""
352
352
  auto_detect_document_type: bool = False
353
353
  """Whether to automatically detect the document type."""
354
- document_type_confidence_threshold: float = 0.7
354
+ document_type_confidence_threshold: float = 0.5
355
355
  """Confidence threshold for document type detection."""
356
356
  document_classification_mode: Literal["text", "vision"] = "text"
357
357
  """The mode to use for document classification."""
@@ -398,15 +398,16 @@ class ExtractionConfig:
398
398
  return asdict(self.ocr_config)
399
399
 
400
400
  # Lazy load and cache default configs instead of creating new instances
401
- if self.ocr_backend == "tesseract":
402
- from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
401
+ match self.ocr_backend:
402
+ case "tesseract":
403
+ from kreuzberg._ocr._tesseract import TesseractConfig # noqa: PLC0415
403
404
 
404
- return asdict(TesseractConfig())
405
- if self.ocr_backend == "easyocr":
406
- from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
405
+ return asdict(TesseractConfig())
406
+ case "easyocr":
407
+ from kreuzberg._ocr._easyocr import EasyOCRConfig # noqa: PLC0415
407
408
 
408
- return asdict(EasyOCRConfig())
409
- # paddleocr
410
- from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
409
+ return asdict(EasyOCRConfig())
410
+ case _: # paddleocr or any other backend
411
+ from kreuzberg._ocr._paddleocr import PaddleOCRConfig # noqa: PLC0415
411
412
 
412
- return asdict(PaddleOCRConfig())
413
+ return asdict(PaddleOCRConfig())
@@ -144,7 +144,7 @@ def _is_cuda_available() -> bool:
144
144
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
145
145
 
146
146
  return bool(torch.cuda.is_available())
147
- except ImportError:
147
+ except ImportError: # pragma: no cover
148
148
  return False
149
149
 
150
150
 
@@ -154,7 +154,7 @@ def _is_mps_available() -> bool:
154
154
  import torch # type: ignore[import-not-found,unused-ignore] # noqa: PLC0415
155
155
 
156
156
  return bool(torch.backends.mps.is_available())
157
- except ImportError:
157
+ except ImportError: # pragma: no cover
158
158
  return False
159
159
 
160
160
 
@@ -190,7 +190,7 @@ def _get_cuda_devices() -> list[DeviceInfo]:
190
190
  )
191
191
  )
192
192
 
193
- except ImportError:
193
+ except ImportError: # pragma: no cover
194
194
  pass
195
195
 
196
196
  return devices
@@ -209,7 +209,7 @@ def _get_mps_device() -> DeviceInfo | None:
209
209
  name="Apple Silicon GPU (MPS)",
210
210
  )
211
211
 
212
- except ImportError:
212
+ except ImportError: # pragma: no cover
213
213
  return None
214
214
 
215
215
 
@@ -232,7 +232,7 @@ def _get_cuda_memory_info(device_id: int) -> tuple[float | None, float | None]:
232
232
 
233
233
  return total_memory, available_memory
234
234
 
235
- except ImportError:
235
+ except ImportError: # pragma: no cover
236
236
  return None, None
237
237
 
238
238
 
@@ -333,7 +333,7 @@ def cleanup_device_memory(device: DeviceInfo) -> None:
333
333
 
334
334
  if torch.cuda.is_available():
335
335
  torch.cuda.empty_cache()
336
- except ImportError:
336
+ except ImportError: # pragma: no cover
337
337
  pass
338
338
 
339
339
  elif device.device_type == "mps":
@@ -58,6 +58,7 @@ class DocumentCache:
58
58
  "chunk_content": config.chunk_content,
59
59
  "max_chars": config.max_chars,
60
60
  "max_overlap": config.max_overlap,
61
+ "auto_detect_document_type": config.auto_detect_document_type,
61
62
  }
62
63
 
63
64
  cache_data = {**file_info, **config_info}
kreuzberg/cli.py CHANGED
@@ -12,7 +12,7 @@ try:
12
12
  import click
13
13
  from rich.console import Console
14
14
  from rich.progress import Progress, SpinnerColumn, TextColumn
15
- except ImportError as e:
15
+ except ImportError as e: # pragma: no cover
16
16
  raise ImportError(
17
17
  "CLI dependencies are not installed. Please install kreuzberg with the 'cli' extra: pip install kreuzberg[cli]"
18
18
  ) from e
@@ -163,7 +163,7 @@ def _perform_extraction(file: Path | None, extraction_config: ExtractionConfig,
163
163
  import magic # type: ignore[import-not-found] # noqa: PLC0415
164
164
 
165
165
  mime_type = magic.from_buffer(input_bytes, mime=True)
166
- except ImportError:
166
+ except ImportError: # pragma: no cover
167
167
  content_str = input_bytes.decode("utf-8", errors="ignore").lower()
168
168
  mime_type = "text/html" if "<html" in content_str or "<body" in content_str else "text/plain"
169
169
 
@@ -193,7 +193,7 @@ def _write_output(
193
193
  click.echo(formatted_output)
194
194
 
195
195
 
196
- def handle_error(error: Exception, verbose: bool) -> None:
196
+ def handle_error(error: Exception, verbose: bool) -> None: # pragma: no cover
197
197
  """Handle and display errors.
198
198
 
199
199
  Args:
@@ -202,19 +202,19 @@ def handle_error(error: Exception, verbose: bool) -> None:
202
202
  """
203
203
  if isinstance(error, MissingDependencyError):
204
204
  console.print(f"[red]Missing dependency:[/red] {error}", style="bold")
205
- sys.exit(2)
205
+ sys.exit(2) # pragma: no cover
206
206
  elif isinstance(error, KreuzbergError):
207
207
  console.print(f"[red]Error:[/red] {error}", style="bold")
208
208
  if verbose and error.context:
209
209
  console.print("\n[dim]Context:[/dim]")
210
210
  console.print(json.dumps(error.context, indent=2))
211
- sys.exit(1)
211
+ sys.exit(1) # pragma: no cover
212
212
  else:
213
213
  console.print(f"[red]Unexpected error:[/red] {type(error).__name__}: {error}", style="bold")
214
214
  if verbose:
215
215
  console.print("\n[dim]Traceback:[/dim]")
216
216
  traceback.print_exc()
217
- sys.exit(1)
217
+ sys.exit(1) # pragma: no cover
218
218
 
219
219
 
220
220
  @click.group(invoke_without_command=True)
@@ -1,6 +1,6 @@
1
1
  Metadata-Version: 2.4
2
2
  Name: kreuzberg
3
- Version: 3.10.0
3
+ Version: 3.11.0
4
4
  Summary: Document intelligence framework for Python - Extract text, metadata, and structured data from diverse file formats
5
5
  Project-URL: documentation, https://kreuzberg.dev
6
6
  Project-URL: homepage, https://github.com/Goldziher/kreuzberg
@@ -32,7 +32,7 @@ Requires-Dist: anyio>=4.9.0
32
32
  Requires-Dist: chardetng-py>=0.3.5
33
33
  Requires-Dist: exceptiongroup>=1.2.2; python_version < '3.11'
34
34
  Requires-Dist: html-to-markdown[lxml]>=1.9.0
35
- Requires-Dist: mcp>=1.12.2
35
+ Requires-Dist: mcp>=1.12.3
36
36
  Requires-Dist: msgspec>=0.18.0
37
37
  Requires-Dist: playa-pdf>=0.6.4
38
38
  Requires-Dist: psutil>=7.0.0
@@ -45,6 +45,7 @@ Requires-Dist: mailparse>=1.0.15; extra == 'additional-extensions'
45
45
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'additional-extensions'
46
46
  Provides-Extra: all
47
47
  Requires-Dist: click>=8.2.1; extra == 'all'
48
+ Requires-Dist: deep-translator>=1.11.4; extra == 'all'
48
49
  Requires-Dist: easyocr>=1.7.2; extra == 'all'
49
50
  Requires-Dist: fast-langdetect>=0.3.2; extra == 'all'
50
51
  Requires-Dist: gmft>=0.4.2; extra == 'all'
@@ -53,6 +54,7 @@ Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'all
53
54
  Requires-Dist: mailparse>=1.0.15; extra == 'all'
54
55
  Requires-Dist: paddleocr>=3.1.0; extra == 'all'
55
56
  Requires-Dist: paddlepaddle>=3.1.0; extra == 'all'
57
+ Requires-Dist: pandas>=2.3.1; extra == 'all'
56
58
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'all'
57
59
  Requires-Dist: rich>=14.1.0; extra == 'all'
58
60
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'all'
@@ -61,9 +63,6 @@ Requires-Dist: spacy>=3.8.7; extra == 'all'
61
63
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'all'
62
64
  Provides-Extra: api
63
65
  Requires-Dist: litestar[opentelemetry,standard,structlog]>=2.16.0; extra == 'api'
64
- Provides-Extra: auto-classify-document-type
65
- Requires-Dist: deep-translator>=1.11.4; extra == 'auto-classify-document-type'
66
- Requires-Dist: pandas>=2.3.1; extra == 'auto-classify-document-type'
67
66
  Provides-Extra: chunking
68
67
  Requires-Dist: semantic-text-splitter>=0.27.0; extra == 'chunking'
69
68
  Provides-Extra: cli
@@ -72,6 +71,9 @@ Requires-Dist: rich>=14.1.0; extra == 'cli'
72
71
  Requires-Dist: tomli>=2.0.0; (python_version < '3.11') and extra == 'cli'
73
72
  Provides-Extra: crypto
74
73
  Requires-Dist: playa-pdf[crypto]>=0.6.4; extra == 'crypto'
74
+ Provides-Extra: document-classification
75
+ Requires-Dist: deep-translator>=1.11.4; extra == 'document-classification'
76
+ Requires-Dist: pandas>=2.3.1; extra == 'document-classification'
75
77
  Provides-Extra: easyocr
76
78
  Requires-Dist: easyocr>=1.7.2; extra == 'easyocr'
77
79
  Provides-Extra: entity-extraction
@@ -1,43 +1,43 @@
1
1
  kreuzberg/__init__.py,sha256=0OJ_jNKbS6GxzWC5-EfRCiE80as_ya0-wwyNsTYbxzY,1721
2
2
  kreuzberg/__main__.py,sha256=s2qM1nPEkRHAQP-G3P7sf5l6qA_KJeIEHS5LpPz04lg,183
3
- kreuzberg/_chunker.py,sha256=QmYbPHPE36ztMT70xPwg_Y4NIftCDl0wyufg5X9lmTo,1932
4
- kreuzberg/_config.py,sha256=EvrBFAawjfKgXu49tACi4CuMmmoIRt_EzbHayZqM_jU,12983
3
+ kreuzberg/_chunker.py,sha256=y4-dX6ILjjBkkC1gkCzXb7v7vbi8844m7vz1gIzbmv4,1952
4
+ kreuzberg/_config.py,sha256=Au521UiR7vcQs_8_hhoWIfmDDMJIrDM3XZUB_qHfCmo,14035
5
5
  kreuzberg/_constants.py,sha256=Bxc8oiN-wHwnWXT9bEiJhTUcu1ygPpra5qHirAif3b4,191
6
- kreuzberg/_document_classification.py,sha256=8XVTKh8ohsb4mbKw2gPFr5OB6v4dWuzXhFE_63vHLrw,5189
7
- kreuzberg/_entity_extraction.py,sha256=43VxtCPbuSrfi-XGSop-VRoZadJcNe02qRGwjGSaR0c,7862
8
- kreuzberg/_gmft.py,sha256=JWDeBYWSDkh3tLUhwTmW5CbIBEig2A3o712BLshk7hE,25533
9
- kreuzberg/_language_detection.py,sha256=eEfj4tsh91SfB2_zQIdY-qD7TlPcppaFm0SqQmETS6Y,3295
6
+ kreuzberg/_document_classification.py,sha256=qFGmwvUMhnNAvNNJO7E-huPx-Ps-_DWxdNxsozIzgaw,6870
7
+ kreuzberg/_entity_extraction.py,sha256=Oa1T-9mptimpOHtcda-GtrVYH9PFy7DSJj3thJZUD7k,7902
8
+ kreuzberg/_gmft.py,sha256=HdQ7Xpuixxl2Y0jY8C3KfyQEU0mN4yQdqErWCv4TnFY,25573
9
+ kreuzberg/_language_detection.py,sha256=_Ng2aHgPxOHFgd507gVNiIGVmnxxbpgYwsO0bD0yTzg,3315
10
10
  kreuzberg/_mime_types.py,sha256=2warRVqfBUNIg8JBg8yP4pRqaMPvwINosHMkJwtH_Fc,8488
11
- kreuzberg/_playa.py,sha256=cJ000ZPHRhbpbP7odRuzMKn38teR6RbodoHgksbfjGE,12059
11
+ kreuzberg/_playa.py,sha256=_IPrUSWwSfDQlWXOpKlauV0D9MhGrujGP5kmQ0U3L0g,12188
12
12
  kreuzberg/_registry.py,sha256=wGSlkS0U1zqruWQCLE95vj4a2mw1yyvf0j6rgz80sJg,3473
13
- kreuzberg/_types.py,sha256=ecT2dRg7dr06p7Dxv23YJ7Ur2m4FUCt6xGtuoS7MQaI,15259
14
- kreuzberg/cli.py,sha256=lmthZa0x8pz7SQfCoPPdTaUF9aQZ8W4w5FlnPcsGr9k,12438
13
+ kreuzberg/_types.py,sha256=bMaU6VuoqwOpW6ufshA-DWpNw6t9EokjEDEfFsznvdo,15389
14
+ kreuzberg/cli.py,sha256=rJMdHg7FhUxefCrx-sf4c2qVGRXr8Xrpjgfx_DQSKMg,12558
15
15
  kreuzberg/exceptions.py,sha256=PTiAZgQwcG9hXbgYg2W7sfxksFhq5_wzOFgZGnTJAoc,2991
16
16
  kreuzberg/extraction.py,sha256=Kt1mOxdlOb35yVOdpdhiRPuTgA9BW_TTG9qwCkSxSkc,17332
17
17
  kreuzberg/py.typed,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
18
18
  kreuzberg/_api/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
19
- kreuzberg/_api/main.py,sha256=g3kqXUfSie2pcw3-EWOM4TAoJUqM7yj2e-cBQJ_bmYc,3253
19
+ kreuzberg/_api/main.py,sha256=8VwxRlIXwnPs7ZYm0saUZsNOjevEAWJQpNreG-X7ZpE,3273
20
20
  kreuzberg/_extractors/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
21
- kreuzberg/_extractors/_base.py,sha256=yNVQSECFad-8_MjqpQZ4q0jQoNdzP6-tqw6l3TfgsMc,4418
22
- kreuzberg/_extractors/_email.py,sha256=6-Mk1TRXPyy9ylWKCpgdrogyzhiFnJOTuTRld1ghO8I,5695
21
+ kreuzberg/_extractors/_base.py,sha256=H_nwynBX3fozncVjV13c329x5eCLl5r7nyVTLQyDAzI,4396
22
+ kreuzberg/_extractors/_email.py,sha256=Jpr4NFef640uVgNFkR1or-omy8RVt-NOHUYgWRDjyBo,6753
23
23
  kreuzberg/_extractors/_html.py,sha256=lOM1Tgrrvd7vpEeFAxC1dp0Tibr6N2FEHCjgFx0FK64,1745
24
- kreuzberg/_extractors/_image.py,sha256=OmkqR5Cd6bTM_qHEdNHeXa5eK-3KvtLgHX-JE5oZWec,4483
24
+ kreuzberg/_extractors/_image.py,sha256=Iz1JpvGqcYyh9g4zO_bMZG3E9S39KNHFu8PrXDRXeOk,4513
25
25
  kreuzberg/_extractors/_pandoc.py,sha256=51k7XISfKaPorhapG7aIeQb94KGsfozxKyT2rwhk9Bk,26553
26
- kreuzberg/_extractors/_pdf.py,sha256=pn45qKYkMcmG-PzeeF5jRjrw1NwaKU3589dhpn7HvE8,19918
26
+ kreuzberg/_extractors/_pdf.py,sha256=OflyvwEkuFLmw8E3si35MCGH31fvd5o50VdMmu5QRVs,19884
27
27
  kreuzberg/_extractors/_presentation.py,sha256=CUlqZl_QCdJdumsZh0BpROkFbvi9uq7yMoIt3bRTUeE,10859
28
28
  kreuzberg/_extractors/_spread_sheet.py,sha256=iagiyJsnl-89OP1eqmEv8jWl7gZBJm2x0YOyqBgLasA,13733
29
- kreuzberg/_extractors/_structured.py,sha256=J7op5ZZ663a_CkE2SB5eKjPQPXKHdqPst-GFCJMNGqw,5727
29
+ kreuzberg/_extractors/_structured.py,sha256=PbNaXd-_PUPsE0yZkISod_vLBokbWdVTKEPpEmqaEMM,5787
30
30
  kreuzberg/_mcp/__init__.py,sha256=8PYV-omC8Rln7Cove8C3rHu3d7sR1FuiwSBG1O7vkAE,92
31
- kreuzberg/_mcp/server.py,sha256=Ab0w7kR3m7_L1cfhYHiC8HqDL282vt4uBYwYc9w9E08,8703
31
+ kreuzberg/_mcp/server.py,sha256=Dxed80MqZsYCFyYo0QdArpKE4H8DhpKY34fijdzV5uw,8731
32
32
  kreuzberg/_ocr/__init__.py,sha256=grshVFwVQl2rMvH1hg1JNlYXjy5-Tdb_rusLD1Cselk,706
33
- kreuzberg/_ocr/_base.py,sha256=urvsLRgOmVYHjxil_IsSL69FmMnboklC4CHAjdBQLKQ,3893
34
- kreuzberg/_ocr/_easyocr.py,sha256=pw2uDmULuMQ9T1Gl4axP_ev7-qwjLt1mJHHyZ34P_FI,17178
35
- kreuzberg/_ocr/_paddleocr.py,sha256=s75aQJILXm1ZbacyZiLPXh6jEAg9tk2NYnwPnfSDrRU,17543
33
+ kreuzberg/_ocr/_base.py,sha256=IkONqwG6zxZoVMni1JlYugBoyONahlRny7J2_7Dy69c,3953
34
+ kreuzberg/_ocr/_easyocr.py,sha256=dWfoj5fPIGqJPGTVeZ0W59TrW3DpNwF0bcfgt6FwQUw,17238
35
+ kreuzberg/_ocr/_paddleocr.py,sha256=Is_iJQaSUeCMfCvg5RnuG_pmBRjBt0b3dCBPY1IAc3A,17583
36
36
  kreuzberg/_ocr/_tesseract.py,sha256=teLMH1pBhpcmEXDcyZlv56hYINLGMuaKZ0CQtcu_czQ,31510
37
37
  kreuzberg/_utils/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
38
38
  kreuzberg/_utils/_cache.py,sha256=hYd_a5Ni5VJBE1XU_eN9gvQ5gg0FRsdbRgmJe-OIJHM,15253
39
- kreuzberg/_utils/_device.py,sha256=arVrJOSp_2LbbN6lu_rMEUOezzRogdWdkF8d5q5Bg8U,10345
40
- kreuzberg/_utils/_document_cache.py,sha256=z8irioKsOu8xve1YgHatm__wIFvs9I1gDK3tLNsNyqM,6926
39
+ kreuzberg/_utils/_device.py,sha256=JI9p9TGSfQHEi2SL-ovOXMr9RUnVq-RrEly89OvmQ5w,10485
40
+ kreuzberg/_utils/_document_cache.py,sha256=ka90JIT-FXUMOv8z2u3fztQgZZb2XQDHTMnBi32mySA,7005
41
41
  kreuzberg/_utils/_errors.py,sha256=UsktQ_p7eOj9crPsFDg8HgRSE5-IpuFC7y1e6dDI_fY,6503
42
42
  kreuzberg/_utils/_pdf_lock.py,sha256=nqxAYCNlfWDrJtP4ZNu57st1YnkDl-gYXdr0q8nv0kA,1961
43
43
  kreuzberg/_utils/_process_pool.py,sha256=4BqhmRspwMyPT2EBfTu_rrn7v722wlMLD8qlYvYsc00,8621
@@ -47,8 +47,8 @@ kreuzberg/_utils/_string.py,sha256=bCzO3UO6nXupxvtMWvHqfp1Vd9CTzEH9jmpJXQ7upAU,6
47
47
  kreuzberg/_utils/_sync.py,sha256=7LSavBmxVKQUzdjfx9fYRAI9IbJtRw8iGf_Q8B7RX9g,4923
48
48
  kreuzberg/_utils/_table.py,sha256=IomrfQBP85DZI8RmQjOVs2Siq7VP9FUTYPaZR4t3yRw,8199
49
49
  kreuzberg/_utils/_tmp.py,sha256=hVn-VVijIg2FM7EZJ899gc7wZg-TGoJZoeAcxMX-Cxg,1044
50
- kreuzberg-3.10.0.dist-info/METADATA,sha256=4U1mSEAbT3zRir--SPZmYy09LfEfu5vUz6CUhQL8uzA,12047
51
- kreuzberg-3.10.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
- kreuzberg-3.10.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
- kreuzberg-3.10.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
- kreuzberg-3.10.0.dist-info/RECORD,,
50
+ kreuzberg-3.11.0.dist-info/METADATA,sha256=pvyRM3TAmXE3TnYaNOZ1chD_IQTgWn254wxnqDsy6EM,12135
51
+ kreuzberg-3.11.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
52
+ kreuzberg-3.11.0.dist-info/entry_points.txt,sha256=GplGhFryCP7kyAG_k-Mdahznvo2fwi73qLFg5yQfH_A,91
53
+ kreuzberg-3.11.0.dist-info/licenses/LICENSE,sha256=-8caMvpCK8SgZ5LlRKhGCMtYDEXqTKH9X8pFEhl91_4,1066
54
+ kreuzberg-3.11.0.dist-info/RECORD,,