kreuzberg 3.2.0__py3-none-any.whl → 3.4.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (41)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_api/__init__.py +0 -0
  4. kreuzberg/_api/main.py +87 -0
  5. kreuzberg/_cli_config.py +175 -0
  6. kreuzberg/_extractors/_image.py +39 -4
  7. kreuzberg/_extractors/_pandoc.py +158 -18
  8. kreuzberg/_extractors/_pdf.py +199 -19
  9. kreuzberg/_extractors/_presentation.py +1 -1
  10. kreuzberg/_extractors/_spread_sheet.py +65 -7
  11. kreuzberg/_gmft.py +222 -16
  12. kreuzberg/_mime_types.py +62 -16
  13. kreuzberg/_multiprocessing/__init__.py +6 -0
  14. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  15. kreuzberg/_multiprocessing/process_manager.py +188 -0
  16. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  17. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  18. kreuzberg/_ocr/_easyocr.py +6 -12
  19. kreuzberg/_ocr/_paddleocr.py +15 -13
  20. kreuzberg/_ocr/_tesseract.py +136 -46
  21. kreuzberg/_playa.py +43 -0
  22. kreuzberg/_types.py +4 -0
  23. kreuzberg/_utils/_cache.py +372 -0
  24. kreuzberg/_utils/_device.py +10 -27
  25. kreuzberg/_utils/_document_cache.py +220 -0
  26. kreuzberg/_utils/_errors.py +232 -0
  27. kreuzberg/_utils/_pdf_lock.py +72 -0
  28. kreuzberg/_utils/_process_pool.py +100 -0
  29. kreuzberg/_utils/_serialization.py +82 -0
  30. kreuzberg/_utils/_string.py +1 -1
  31. kreuzberg/_utils/_sync.py +21 -0
  32. kreuzberg/cli.py +338 -0
  33. kreuzberg/extraction.py +247 -36
  34. kreuzberg-3.4.0.dist-info/METADATA +290 -0
  35. kreuzberg-3.4.0.dist-info/RECORD +50 -0
  36. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/WHEEL +1 -2
  37. kreuzberg-3.4.0.dist-info/entry_points.txt +2 -0
  38. kreuzberg-3.2.0.dist-info/METADATA +0 -166
  39. kreuzberg-3.2.0.dist-info/RECORD +0 -34
  40. kreuzberg-3.2.0.dist-info/top_level.txt +0 -1
  41. {kreuzberg-3.2.0.dist-info → kreuzberg-3.4.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,220 @@
+"""Document-level caching to prevent pypdfium2 issues with duplicate processing."""
+
+from __future__ import annotations
+
+import hashlib
+import threading
+import time
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+
+if TYPE_CHECKING:
+    from kreuzberg._types import ExtractionConfig, ExtractionResult
+
+
+class DocumentCache:
+    """Session-scoped cache for document extraction results.
+
+    Ensures each unique document is processed only once per session,
+    preventing pypdfium2 state corruption issues with repeated processing.
+    """
+
+    def __init__(self) -> None:
+        """Initialize document cache."""
+        self._cache: dict[str, ExtractionResult] = {}
+        self._processing: dict[str, threading.Event] = {}
+        self._lock = threading.Lock()
+
+        self._file_metadata: dict[str, dict[str, Any]] = {}
+
+    def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
+        """Generate cache key for a file and config combination.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Unique cache key string
+        """
+        path = Path(file_path).resolve()
+
+        try:
+            stat = path.stat()
+            file_info = {
+                "path": str(path),
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+            }
+        except OSError:
+            file_info = {"path": str(path), "size": 0, "mtime": 0}
+
+        config_info = {}
+        if config:
+            config_info = {
+                "force_ocr": config.force_ocr,
+                "ocr_backend": config.ocr_backend,
+                "extract_tables": config.extract_tables,
+                "chunk_content": config.chunk_content,
+                "max_chars": config.max_chars,
+                "max_overlap": config.max_overlap,
+            }
+
+        cache_data = {**file_info, **config_info}
+        cache_str = str(sorted(cache_data.items()))
+
+        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
+
+    def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
+        """Check if cached result is still valid.
+
+        Args:
+            cache_key: The cache key to validate
+            file_path: Path to the file
+
+        Returns:
+            True if cache is valid, False if invalidated
+        """
+        if cache_key not in self._file_metadata:
+            return False
+
+        path = Path(file_path)
+        try:
+            current_stat = path.stat()
+            cached_metadata = self._file_metadata[cache_key]
+
+            return bool(
+                cached_metadata["size"] == current_stat.st_size and cached_metadata["mtime"] == current_stat.st_mtime
+            )
+        except OSError:
+            return False
+
+    def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
+        """Get cached extraction result if available and valid.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Cached result if available, None otherwise
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key in self._cache:
+                if self._is_cache_valid(cache_key, file_path):
+                    return self._cache[cache_key]
+
+                self._cache.pop(cache_key, None)
+                self._file_metadata.pop(cache_key, None)
+
+            return None
+
+    def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
+        """Cache extraction result.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+            result: Extraction result to cache
+        """
+        cache_key = self._get_cache_key(file_path, config)
+        path = Path(file_path)
+
+        try:
+            stat = path.stat()
+            file_metadata = {
+                "size": stat.st_size,
+                "mtime": stat.st_mtime,
+                "cached_at": time.time(),
+            }
+        except OSError:
+            file_metadata = {
+                "size": 0,
+                "mtime": 0,
+                "cached_at": time.time(),
+            }
+
+        with self._lock:
+            self._cache[cache_key] = result
+            self._file_metadata[cache_key] = file_metadata
+
+    def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
+        """Check if file is currently being processed.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            True if file is currently being processed
+        """
+        cache_key = self._get_cache_key(file_path, config)
+        with self._lock:
+            return cache_key in self._processing
+
+    def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
+        """Mark file as being processed and return event to wait on.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+
+        Returns:
+            Event that will be set when processing completes
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key not in self._processing:
+                self._processing[cache_key] = threading.Event()
+            return self._processing[cache_key]
+
+    def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
+        """Mark file processing as complete.
+
+        Args:
+            file_path: Path to the file
+            config: Extraction configuration
+        """
+        cache_key = self._get_cache_key(file_path, config)
+
+        with self._lock:
+            if cache_key in self._processing:
+                event = self._processing.pop(cache_key)
+                event.set()
+
+    def clear(self) -> None:
+        """Clear all cached results."""
+        with self._lock:
+            self._cache.clear()
+            self._file_metadata.clear()
+
+    def get_stats(self) -> dict[str, Any]:
+        """Get cache statistics.
+
+        Returns:
+            Dictionary with cache statistics
+        """
+        with self._lock:
+            return {
+                "cached_documents": len(self._cache),
+                "processing_documents": len(self._processing),
+                "total_cache_size_mb": sum(len(result.content.encode("utf-8")) for result in self._cache.values())
+                / 1024
+                / 1024,
+            }
+
+
+_document_cache = DocumentCache()
+
+
+def get_document_cache() -> DocumentCache:
+    """Get the global document cache instance."""
+    return _document_cache
+
+
+def clear_document_cache() -> None:
+    """Clear the global document cache."""
+    _document_cache.clear()
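
The +220 hunk above corresponds to the new kreuzberg/_utils/_document_cache.py in the files-changed table. A minimal usage sketch of its get / mark_processing / set / mark_complete protocol, assuming that import path; the extract_once() wrapper and its run_extraction callable are illustrative assumptions, not code from the package:

from pathlib import Path

from kreuzberg._utils._document_cache import get_document_cache


def extract_once(path: Path, run_extraction):
    """Hypothetical caller showing the cache protocol; run_extraction is a placeholder."""
    cache = get_document_cache()
    cached = cache.get(path)  # hit only while size and mtime still match
    if cached is not None:
        return cached
    if cache.is_processing(path):
        cache.mark_processing(path).wait()  # another thread owns the file; wait for it
        return cache.get(path)
    cache.mark_processing(path)
    try:
        result = run_extraction(path)  # must return an ExtractionResult
        cache.set(path, None, result)
        return result
    finally:
        cache.mark_complete(path)  # wakes any waiting threads
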
@@ -0,0 +1,232 @@
+"""Enhanced error handling utilities."""
+
+from __future__ import annotations
+
+import platform
+import traceback
+from datetime import datetime, timezone
+from typing import TYPE_CHECKING, Any
+
+import psutil
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+
+def create_error_context(
+    *,
+    operation: str,
+    file_path: Path | str | None = None,
+    error: Exception | None = None,
+    **extra: Any,
+) -> dict[str, Any]:
+    """Create comprehensive error context.
+
+    Args:
+        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
+        file_path: The file being processed, if applicable
+        error: The original exception, if any
+        **extra: Additional context fields
+
+    Returns:
+        Dictionary with error context including system info
+    """
+    context: dict[str, Any] = {
+        "timestamp": datetime.now(timezone.utc).isoformat(),
+        "operation": operation,
+    }
+
+    if file_path:
+        from pathlib import Path
+
+        path = Path(file_path) if isinstance(file_path, str) else file_path
+        context["file"] = {
+            "path": str(path),
+            "name": path.name,
+            "exists": path.exists(),
+            "size": path.stat().st_size if path.exists() else None,
+        }
+
+    if error:
+        context["error"] = {
+            "type": type(error).__name__,
+            "message": str(error),
+            "traceback": traceback.format_exception_only(type(error), error),
+        }
+
+    if (
+        any(keyword in str(error).lower() for keyword in ["memory", "resource", "process", "thread"])
+        if error
+        else False
+    ):
+        try:
+            mem = psutil.virtual_memory()
+            context["system"] = {
+                "memory_available_mb": mem.available / 1024 / 1024,
+                "memory_percent": mem.percent,
+                "cpu_percent": psutil.cpu_percent(interval=0.1),
+                "process_count": len(psutil.pids()),
+                "platform": platform.platform(),
+            }
+        except Exception:  # noqa: BLE001
+            pass
+
+    context.update(extra)
+
+    return context
+
+
+def is_transient_error(error: Exception) -> bool:
+    """Check if an error is likely transient and worth retrying.
+
+    Args:
+        error: The exception to check
+
+    Returns:
+        True if the error is likely transient
+    """
+    transient_types = (
+        OSError,
+        PermissionError,
+        TimeoutError,
+        ConnectionError,
+        BrokenPipeError,
+    )
+
+    if isinstance(error, transient_types):
+        return True
+
+    transient_patterns = [
+        "temporary",
+        "locked",
+        "in use",
+        "access denied",
+        "permission",
+        "timeout",
+        "connection",
+        "network",
+        "too many open files",
+        "cannot allocate memory",
+        "resource temporarily unavailable",
+        "broken pipe",
+        "subprocess",
+        "signal",
+    ]
+
+    error_str = str(error).lower()
+    return any(pattern in error_str for pattern in transient_patterns)
+
+
+def is_resource_error(error: Exception) -> bool:
+    """Check if an error is related to system resources.
+
+    Args:
+        error: The exception to check
+
+    Returns:
+        True if the error is resource-related
+    """
+    resource_patterns = [
+        "memory",
+        "out of memory",
+        "cannot allocate",
+        "too many open files",
+        "file descriptor",
+        "resource",
+        "exhausted",
+        "limit",
+        "cpu",
+        "thread",
+        "process",
+    ]
+
+    error_str = str(error).lower()
+    return any(pattern in error_str for pattern in resource_patterns)
+
+
+def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
+    """Determine if an operation should be retried.
+
+    Args:
+        error: The exception that occurred
+        attempt: Current attempt number (1-based)
+        max_attempts: Maximum number of attempts
+
+    Returns:
+        True if the operation should be retried
+    """
+    if attempt >= max_attempts:
+        return False
+
+    from kreuzberg.exceptions import ValidationError
+
+    if isinstance(error, ValidationError):
+        return False
+
+    return is_transient_error(error)
+
+
+class BatchExtractionResult:
+    """Result container for batch operations with partial success support."""
+
+    def __init__(self) -> None:
+        """Initialize batch result container."""
+        self.successful: list[tuple[int, Any]] = []
+        self.failed: list[tuple[int, dict[str, Any]]] = []
+        self.total_count: int = 0
+
+    def add_success(self, index: int, result: Any) -> None:
+        """Add a successful result."""
+        self.successful.append((index, result))
+
+    def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
+        """Add a failed result with context."""
+        error_info = {
+            "error": {
+                "type": type(error).__name__,
+                "message": str(error),
+            },
+            "context": context,
+        }
+        self.failed.append((index, error_info))
+
+    @property
+    def success_count(self) -> int:
+        """Number of successful operations."""
+        return len(self.successful)
+
+    @property
+    def failure_count(self) -> int:
+        """Number of failed operations."""
+        return len(self.failed)
+
+    @property
+    def success_rate(self) -> float:
+        """Success rate as a percentage."""
+        if self.total_count == 0:
+            return 0.0
+        return (self.success_count / self.total_count) * 100
+
+    def get_ordered_results(self) -> list[Any | None]:
+        """Get results in original order with None for failures."""
+        results = [None] * self.total_count
+        for index, result in self.successful:
+            results[index] = result
+        return results
+
+    def get_summary(self) -> dict[str, Any]:
+        """Get summary of batch operation."""
+        return {
+            "total": self.total_count,
+            "successful": self.success_count,
+            "failed": self.failure_count,
+            "success_rate": f"{self.success_rate:.1f}%",
+            "failures": [
+                {
+                    "index": idx,
+                    "error": info["error"]["type"],
+                    "message": info["error"]["message"],
+                }
+                for idx, info in self.failed
+            ],
+        }
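
The +232 hunk above corresponds to kreuzberg/_utils/_errors.py in the files-changed table. A small retry-loop sketch built on should_retry() and create_error_context(); the read_with_retry() wrapper and its backoff policy are illustrative assumptions, not code from the package:

import time

from kreuzberg._utils._errors import create_error_context, should_retry


def read_with_retry(path: str, max_attempts: int = 3) -> bytes:
    """Hypothetical wrapper showing how the retry helpers compose."""
    for attempt in range(1, max_attempts + 1):
        try:
            with open(path, "rb") as f:
                return f.read()
        except OSError as exc:
            if not should_retry(exc, attempt, max_attempts):
                context = create_error_context(operation="read_file", file_path=path, error=exc)
                raise RuntimeError(f"giving up after {attempt} attempts: {context}") from exc
            time.sleep(0.1 * attempt)  # simple linear backoff between transient failures
    raise AssertionError("unreachable")
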
@@ -0,0 +1,72 @@
+"""PDF processing lock utilities for thread-safe pypdfium2 operations."""
+
+from __future__ import annotations
+
+import hashlib
+import threading
+from contextlib import contextmanager
+from pathlib import Path
+from typing import TYPE_CHECKING, Any
+from weakref import WeakValueDictionary
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+
+_PYPDFIUM_LOCK = threading.RLock()
+
+
+_FILE_LOCKS_CACHE = WeakValueDictionary[str, threading.RLock]()
+_FILE_LOCKS_LOCK = threading.Lock()
+
+
+def _get_file_key(file_path: Path | str) -> str:
+    """Get a consistent key for a file path."""
+    path_str = str(Path(file_path).resolve())
+    return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
+
+
+def _get_file_lock(file_path: Path | str) -> threading.RLock:
+    """Get or create a lock for a specific file."""
+    file_key = _get_file_key(file_path)
+
+    with _FILE_LOCKS_LOCK:
+        if file_key in _FILE_LOCKS_CACHE:
+            return _FILE_LOCKS_CACHE[file_key]
+
+        lock = threading.RLock()
+        _FILE_LOCKS_CACHE[file_key] = lock
+        return lock
+
+
+@contextmanager
+def pypdfium_lock() -> Generator[None, None, None]:
+    """Context manager for thread-safe pypdfium2 operations.
+
+    This prevents segmentation faults on macOS where pypdfium2
+    is not fork-safe when used concurrently.
+    """
+    with _PYPDFIUM_LOCK:
+        yield
+
+
+@contextmanager
+def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
+    """Context manager for per-file pypdfium2 operations.
+
+    This allows concurrent processing of different files while
+    preventing segfaults. Document caching handles same-file issues.
+    """
+    lock = _get_file_lock(file_path)
+    with lock:
+        yield
+
+
+def with_pypdfium_lock(func: Any) -> Any:
+    """Decorator to wrap functions with pypdfium2 lock."""
+
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        with pypdfium_lock():
+            return func(*args, **kwargs)
+
+    return wrapper
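
The +72 hunk above matches kreuzberg/_utils/_pdf_lock.py in the files-changed table. A sketch of wrapping a pypdfium2 call in the per-file lock, so access to one file is serialized while different files can be processed concurrently; count_pages() is a hypothetical helper, not part of the diff:

import pypdfium2

from kreuzberg._utils._pdf_lock import pypdfium_file_lock


def count_pages(path: str) -> int:
    """Hypothetical helper guarded by the per-file pypdfium2 lock."""
    with pypdfium_file_lock(path):
        pdf = pypdfium2.PdfDocument(path)
        try:
            return len(pdf)
        finally:
            pdf.close()
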
@@ -0,0 +1,100 @@
+"""Process pool utilities for CPU-intensive operations."""
+
+from __future__ import annotations
+
+import multiprocessing as mp
+from concurrent.futures import ProcessPoolExecutor
+from contextlib import contextmanager
+from typing import TYPE_CHECKING, Any, Callable, TypeVar
+
+if TYPE_CHECKING:
+    from collections.abc import Generator
+
+T = TypeVar("T")
+
+
+_PROCESS_POOL: ProcessPoolExecutor | None = None
+_POOL_SIZE = max(1, mp.cpu_count() - 1)
+
+
+def _init_process_pool() -> ProcessPoolExecutor:
+    """Initialize the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is None:
+        _PROCESS_POOL = ProcessPoolExecutor(max_workers=_POOL_SIZE)
+    return _PROCESS_POOL
+
+
+@contextmanager
+def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
+    """Get the global process pool."""
+    pool = _init_process_pool()
+    try:
+        yield pool
+    except Exception:  # noqa: BLE001
+        shutdown_process_pool()
+        pool = _init_process_pool()
+        yield pool
+
+
+def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
+    """Submit a function to the process pool and wait for result."""
+    with process_pool() as pool:
+        future = pool.submit(func, *args, **kwargs)
+        return future.result()
+
+
+def shutdown_process_pool() -> None:
+    """Shutdown the global process pool."""
+    global _PROCESS_POOL
+    if _PROCESS_POOL is not None:
+        _PROCESS_POOL.shutdown(wait=True)
+        _PROCESS_POOL = None
+
+
+def _extract_pdf_text_worker(pdf_path: str) -> tuple[str, str]:
+    """Worker function for extracting PDF text in a separate process."""
+    import pypdfium2
+
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        text_parts = []
+        for page in pdf:
+            text_page = page.get_textpage()
+            text = text_page.get_text_range()
+            text_parts.append(text)
+            text_page.close()
+            page.close()
+        return (pdf_path, "".join(text_parts))
+    except Exception as e:  # noqa: BLE001
+        return (pdf_path, f"ERROR: {e}")
+    finally:
+        if pdf:
+            pdf.close()
+
+
+def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str, list[bytes]]:
+    """Worker function for converting PDF to images in a separate process."""
+    import io
+
+    import pypdfium2
+
+    pdf = None
+    try:
+        pdf = pypdfium2.PdfDocument(pdf_path)
+        image_bytes = []
+        for page in pdf:
+            bitmap = page.render(scale=scale)
+            pil_image = bitmap.to_pil()
+            img_bytes = io.BytesIO()
+            pil_image.save(img_bytes, format="PNG")
+            image_bytes.append(img_bytes.getvalue())
+            bitmap.close()
+            page.close()
+        return (pdf_path, image_bytes)
+    except Exception:  # noqa: BLE001
+        return (pdf_path, [])
+    finally:
+        if pdf:
+            pdf.close()
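
The +100 hunk above matches kreuzberg/_utils/_process_pool.py in the files-changed table. A sketch of driving the shared pool with the bundled text worker; the sample.pdf path and the standalone-script framing are assumptions, not code from the package:

from kreuzberg._utils._process_pool import (
    _extract_pdf_text_worker,
    shutdown_process_pool,
    submit_to_process_pool,
)

if __name__ == "__main__":  # worker processes re-import __main__, so guard the entry point
    path, text = submit_to_process_pool(_extract_pdf_text_worker, "sample.pdf")
    print(f"{path}: {len(text)} characters extracted")
    shutdown_process_pool()  # release the pool's worker processes when done
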
@@ -0,0 +1,82 @@
+"""Fast serialization utilities using msgspec."""
+
+from __future__ import annotations
+
+from dataclasses import asdict, is_dataclass
+from enum import Enum
+from typing import Any, TypeVar, cast
+
+from msgspec import MsgspecError
+from msgspec.msgpack import decode, encode
+
+T = TypeVar("T")
+
+
+def encode_hook(obj: Any) -> Any:
+    """Custom encoder for complex objects."""
+    if callable(obj):
+        return None
+
+    if isinstance(obj, Exception):
+        return {"message": str(obj), "type": type(obj).__name__}
+
+    for key in (
+        "to_dict",
+        "as_dict",
+        "dict",
+        "model_dump",
+        "json",
+        "to_list",
+        "tolist",
+    ):
+        if hasattr(obj, key) and callable(getattr(obj, key)):
+            return getattr(obj, key)()
+
+    if is_dataclass(obj) and not isinstance(obj, type):
+        return {k: v if not isinstance(v, Enum) else v.value for (k, v) in asdict(obj).items()}
+
+    if hasattr(obj, "save") and hasattr(obj, "format"):
+        return None
+
+    raise TypeError(f"Unsupported type: {type(obj)!r}")
+
+
+def deserialize(value: str | bytes, target_type: type[T]) -> T:
+    """Deserialize bytes/string to target type.
+
+    Args:
+        value: Serialized data
+        target_type: Type to deserialize to
+
+    Returns:
+        Deserialized object
+
+    Raises:
+        ValueError: If deserialization fails
+    """
+    try:
+        return decode(cast("bytes", value), type=target_type, strict=False)
+    except MsgspecError as e:
+        raise ValueError(f"Failed to deserialize to {target_type.__name__}: {e}") from e
+
+
+def serialize(value: Any, **kwargs: Any) -> bytes:
+    """Serialize value to bytes.
+
+    Args:
+        value: Object to serialize
+        **kwargs: Additional data to merge with value if it's a dict
+
+    Returns:
+        Serialized bytes
+
+    Raises:
+        ValueError: If serialization fails
+    """
+    if isinstance(value, dict) and kwargs:
+        value = value | kwargs
+
+    try:
+        return encode(value, enc_hook=encode_hook)
+    except (MsgspecError, TypeError) as e:
+        raise ValueError(f"Failed to serialize {type(value).__name__}: {e}") from e
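
The +82 hunk above matches kreuzberg/_utils/_serialization.py in the files-changed table. A msgpack round-trip sketch assuming that import path; FileInfo is a hypothetical payload type, not part of the diff:

from dataclasses import dataclass

from kreuzberg._utils._serialization import deserialize, serialize


@dataclass
class FileInfo:  # hypothetical payload type, not part of the diff
    path: str
    size: int


# dataclasses are flattened to plain maps on encode
blob = serialize(FileInfo(path="report.pdf", size=1024))
print(deserialize(blob, dict))  # {'path': 'report.pdf', 'size': 1024}

# for dict values, keyword arguments are merged in before encoding
blob = serialize({"path": "report.pdf"}, size=1024)
assert deserialize(blob, dict) == {"path": "report.pdf", "size": 1024}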