kreuzberg 3.13.0__py3-none-any.whl → 3.13.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. kreuzberg/_chunker.py +0 -15
  2. kreuzberg/_config.py +0 -124
  3. kreuzberg/_document_classification.py +20 -39
  4. kreuzberg/_entity_extraction.py +0 -29
  5. kreuzberg/_extractors/_base.py +4 -66
  6. kreuzberg/_extractors/_email.py +0 -4
  7. kreuzberg/_extractors/_image.py +0 -2
  8. kreuzberg/_extractors/_pandoc.py +0 -58
  9. kreuzberg/_extractors/_pdf.py +0 -3
  10. kreuzberg/_extractors/_presentation.py +0 -82
  11. kreuzberg/_extractors/_spread_sheet.py +0 -2
  12. kreuzberg/_gmft.py +0 -61
  13. kreuzberg/_language_detection.py +0 -14
  14. kreuzberg/_mime_types.py +0 -17
  15. kreuzberg/_ocr/_base.py +4 -76
  16. kreuzberg/_ocr/_easyocr.py +110 -85
  17. kreuzberg/_ocr/_paddleocr.py +146 -138
  18. kreuzberg/_ocr/_table_extractor.py +0 -76
  19. kreuzberg/_ocr/_tesseract.py +0 -206
  20. kreuzberg/_playa.py +0 -27
  21. kreuzberg/_registry.py +0 -36
  22. kreuzberg/_types.py +16 -119
  23. kreuzberg/_utils/_cache.py +0 -52
  24. kreuzberg/_utils/_device.py +0 -56
  25. kreuzberg/_utils/_document_cache.py +0 -73
  26. kreuzberg/_utils/_errors.py +0 -47
  27. kreuzberg/_utils/_ocr_cache.py +136 -0
  28. kreuzberg/_utils/_pdf_lock.py +0 -14
  29. kreuzberg/_utils/_process_pool.py +0 -47
  30. kreuzberg/_utils/_quality.py +0 -17
  31. kreuzberg/_utils/_ref.py +0 -16
  32. kreuzberg/_utils/_serialization.py +0 -25
  33. kreuzberg/_utils/_string.py +0 -20
  34. kreuzberg/_utils/_sync.py +0 -76
  35. kreuzberg/_utils/_table.py +0 -45
  36. kreuzberg/_utils/_tmp.py +0 -9
  37. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/METADATA +3 -2
  38. kreuzberg-3.13.1.dist-info/RECORD +57 -0
  39. kreuzberg-3.13.0.dist-info/RECORD +0 -56
  40. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/WHEEL +0 -0
  41. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/entry_points.txt +0 -0
  42. {kreuzberg-3.13.0.dist-info → kreuzberg-3.13.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/_utils/_document_cache.py CHANGED
@@ -11,14 +11,7 @@ if TYPE_CHECKING:
 
 
 class DocumentCache:
-    """Session-scoped cache for document extraction results.
-
-    Ensures each unique document is processed only once per session,
-    preventing pypdfium2 state corruption issues with repeated processing.
-    """
-
     def __init__(self) -> None:
-        """Initialize document cache."""
         self._cache: dict[str, ExtractionResult] = {}
         self._processing: dict[str, threading.Event] = {}
         self._lock = threading.Lock()
@@ -26,15 +19,6 @@ class DocumentCache:
         self._file_metadata: dict[str, dict[str, Any]] = {}
 
     def _get_cache_key(self, file_path: Path | str, config: ExtractionConfig | None = None) -> str:
-        """Generate cache key for a file and config combination.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Unique cache key string
-        """
         path = Path(file_path).resolve()
 
         try:
@@ -65,15 +49,6 @@ class DocumentCache:
         return hashlib.sha256(cache_str.encode()).hexdigest()[:16]
 
     def _is_cache_valid(self, cache_key: str, file_path: Path | str) -> bool:
-        """Check if cached result is still valid.
-
-        Args:
-            cache_key: The cache key to validate
-            file_path: Path to the file
-
-        Returns:
-            True if cache is valid, False if invalidated
-        """
         if cache_key not in self._file_metadata:
             return False
 
@@ -89,15 +64,6 @@ class DocumentCache:
         return False
 
     def get(self, file_path: Path | str, config: ExtractionConfig | None = None) -> ExtractionResult | None:
-        """Get cached extraction result if available and valid.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Cached result if available, None otherwise
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -111,13 +77,6 @@ class DocumentCache:
         return None
 
     def set(self, file_path: Path | str, config: ExtractionConfig | None, result: ExtractionResult) -> None:
-        """Cache extraction result.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-            result: Extraction result to cache
-        """
         cache_key = self._get_cache_key(file_path, config)
         path = Path(file_path)
 
@@ -140,29 +99,11 @@ class DocumentCache:
         self._file_metadata[cache_key] = file_metadata
 
     def is_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> bool:
-        """Check if file is currently being processed.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            True if file is currently being processed
-        """
         cache_key = self._get_cache_key(file_path, config)
         with self._lock:
             return cache_key in self._processing
 
     def mark_processing(self, file_path: Path | str, config: ExtractionConfig | None = None) -> threading.Event:
-        """Mark file as being processed and return event to wait on.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-
-        Returns:
-            Event that will be set when processing completes
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -171,12 +112,6 @@ class DocumentCache:
             return self._processing[cache_key]
 
     def mark_complete(self, file_path: Path | str, config: ExtractionConfig | None = None) -> None:
-        """Mark file processing as complete.
-
-        Args:
-            file_path: Path to the file
-            config: Extraction configuration
-        """
         cache_key = self._get_cache_key(file_path, config)
 
         with self._lock:
@@ -185,17 +120,11 @@ class DocumentCache:
             event.set()
 
     def clear(self) -> None:
-        """Clear all cached results."""
         with self._lock:
             self._cache.clear()
             self._file_metadata.clear()
 
     def get_stats(self) -> dict[str, Any]:
-        """Get cache statistics.
-
-        Returns:
-            Dictionary with cache statistics
-        """
         with self._lock:
             return {
                 "cached_documents": len(self._cache),
@@ -210,10 +139,8 @@ _document_cache = DocumentCache()
 
 
 def get_document_cache() -> DocumentCache:
-    """Get the global document cache instance."""
     return _document_cache
 
 
 def clear_document_cache() -> None:
-    """Clear the global document cache."""
     _document_cache.clear()
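Even with the docstrings removed, DocumentCache keeps its contract: each unique (file, config) pair is extracted once per session, and concurrent callers coordinate through the processing events. A minimal sketch of the intended call pattern, assuming a hypothetical run_extraction worker that is not part of this module:

from kreuzberg._utils._document_cache import get_document_cache

def cached_extract(path, config=None):
    cache = get_document_cache()

    # Fast path: return a still-valid cached result.
    result = cache.get(path, config)
    if result is not None:
        return result

    # Mark the file as in-flight; duplicate callers can wait on the returned event.
    cache.mark_processing(path, config)
    try:
        result = run_extraction(path, config)  # hypothetical extraction worker
        cache.set(path, config, result)
        return result
    finally:
        # Always wake threads blocked on the processing event.
        cache.mark_complete(path, config)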
kreuzberg/_utils/_errors.py CHANGED
@@ -53,17 +53,6 @@ def create_error_context(
     error: Exception | None = None,
     **extra: Any,
 ) -> dict[str, Any]:
-    """Create comprehensive error context.
-
-    Args:
-        operation: The operation being performed (e.g., "extract_file", "convert_pdf_to_images")
-        file_path: The file being processed, if applicable
-        error: The original exception, if any
-        **extra: Additional context fields
-
-    Returns:
-        Dictionary with error context including system info
-    """
     context: dict[str, Any] = {
         "timestamp": datetime.now(timezone.utc).isoformat(),
         "operation": operation,
@@ -104,14 +93,6 @@ def create_error_context(
 
 
 def is_transient_error(error: Exception) -> bool:
-    """Check if an error is likely transient and worth retrying.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is likely transient
-    """
     transient_types = (
         OSError,
         PermissionError,
@@ -128,29 +109,11 @@ def is_transient_error(error: Exception) -> bool:
 
 
 def is_resource_error(error: Exception) -> bool:
-    """Check if an error is related to system resources.
-
-    Args:
-        error: The exception to check
-
-    Returns:
-        True if the error is resource-related
-    """
     error_str = str(error).lower()
     return any(pattern in error_str for pattern in _RESOURCE_ERROR_PATTERNS)
 
 
 def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
-    """Determine if an operation should be retried.
-
-    Args:
-        error: The exception that occurred
-        attempt: Current attempt number (1-based)
-        max_attempts: Maximum number of attempts
-
-    Returns:
-        True if the operation should be retried
-    """
     if attempt >= max_attempts:
         return False
 
@@ -161,22 +124,17 @@ def should_retry(error: Exception, attempt: int, max_attempts: int = 3) -> bool:
 
 
 class BatchExtractionResult:
-    """Result container for batch operations with partial success support."""
-
     __slots__ = ("failed", "successful", "total_count")
 
     def __init__(self) -> None:
-        """Initialize batch result container."""
         self.successful: list[tuple[int, Any]] = []
         self.failed: list[tuple[int, dict[str, Any]]] = []
         self.total_count: int = 0
 
     def add_success(self, index: int, result: Any) -> None:
-        """Add a successful result."""
         self.successful.append((index, result))
 
     def add_failure(self, index: int, error: Exception, context: dict[str, Any]) -> None:
-        """Add a failed result with context."""
         error_info = {
             "error": {
                 "type": type(error).__name__,
@@ -188,30 +146,25 @@ class BatchExtractionResult:
 
     @property
     def success_count(self) -> int:
-        """Number of successful operations."""
         return len(self.successful)
 
     @property
    def failure_count(self) -> int:
-        """Number of failed operations."""
         return len(self.failed)
 
     @property
     def success_rate(self) -> float:
-        """Success rate as a percentage."""
         if self.total_count == 0:
             return 0.0
         return (self.success_count / self.total_count) * 100
 
     def get_ordered_results(self) -> list[Any | None]:
-        """Get results in original order with None for failures."""
         results = [None] * self.total_count
         for index, result in self.successful:
            results[index] = result
         return results
 
     def get_summary(self) -> dict[str, Any]:
-        """Get summary of batch operation."""
         return {
             "total": self.total_count,
             "successful": self.success_count,
kreuzberg/_utils/_ocr_cache.py ADDED
@@ -0,0 +1,136 @@
+from __future__ import annotations
+
+import hashlib
+import io
+from typing import TYPE_CHECKING, Any
+
+import anyio
+
+from kreuzberg._utils._cache import get_ocr_cache
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+    from PIL.Image import Image as PILImage
+
+    from kreuzberg._types import ExtractionResult
+
+
+def get_file_info(path: Path) -> dict[str, Any]:
+    from pathlib import Path as PathType  # noqa: PLC0415
+
+    path_obj = PathType(path) if not isinstance(path, PathType) else path
+
+    try:
+        stat = path_obj.stat()
+        return {
+            "path": str(path_obj.resolve()),
+            "size": stat.st_size,
+            "mtime": stat.st_mtime,
+        }
+    except OSError:
+        return {
+            "path": str(path_obj),
+            "size": 0,
+            "mtime": 0,
+        }
+
+
+def generate_image_hash(image: PILImage) -> str:
+    save_image = image
+    if image.mode not in ("RGB", "RGBA", "L", "LA", "P", "1"):
+        save_image = image.convert("RGB")
+
+    image_buffer = io.BytesIO()
+    save_image.save(image_buffer, format="PNG")
+    image_content = image_buffer.getvalue()
+
+    return hashlib.sha256(image_content).hexdigest()[:16]
+
+
+def build_cache_kwargs(
+    backend_name: str,
+    config_dict: dict[str, Any],
+    image_hash: str | None = None,
+    file_info: dict[str, Any] | None = None,
+) -> dict[str, Any]:
+    cache_kwargs = {
+        "ocr_backend": backend_name,
+        "ocr_config": str(sorted(config_dict.items())),
+    }
+
+    if image_hash:
+        cache_kwargs["image_hash"] = image_hash
+    if file_info:
+        cache_kwargs["file_info"] = str(sorted(file_info.items()))
+
+    return cache_kwargs
+
+
+async def handle_cache_lookup_async(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = await ocr_cache.aget(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        await anyio.to_thread.run_sync(event.wait)
+
+        cached_result = await ocr_cache.aget(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+def handle_cache_lookup_sync(cache_kwargs: dict[str, Any]) -> ExtractionResult | None:
+    ocr_cache = get_ocr_cache()
+
+    cached_result = ocr_cache.get(**cache_kwargs)
+    if cached_result is not None:
+        return cached_result
+
+    if ocr_cache.is_processing(**cache_kwargs):
+        event = ocr_cache.mark_processing(**cache_kwargs)
+        event.wait()
+
+        cached_result = ocr_cache.get(**cache_kwargs)
+        if cached_result is not None:
+            return cached_result
+
+    ocr_cache.mark_processing(**cache_kwargs)
+    return None
+
+
+async def cache_and_complete_async(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        await ocr_cache.aset(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def cache_and_complete_sync(
+    result: ExtractionResult,
+    cache_kwargs: dict[str, Any],
+    use_cache: bool,
+) -> None:
+    ocr_cache = get_ocr_cache()
+
+    if use_cache:
+        ocr_cache.set(result, **cache_kwargs)
+
+    ocr_cache.mark_complete(**cache_kwargs)
+
+
+def mark_processing_complete(cache_kwargs: dict[str, Any]) -> None:
+    ocr_cache = get_ocr_cache()
+    ocr_cache.mark_complete(**cache_kwargs)
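This new module pulls the cache choreography out of the individual OCR backends (the EasyOCR and PaddleOCR rewrites above are its first consumers). A sketch of the intended synchronous call sequence; the backend name, config, and _run_ocr call are illustrative, not taken from a real backend:

from kreuzberg._utils._ocr_cache import (
    build_cache_kwargs,
    cache_and_complete_sync,
    generate_image_hash,
    handle_cache_lookup_sync,
    mark_processing_complete,
)

def process_image_cached(image, config_dict, use_cache=True):
    cache_kwargs = build_cache_kwargs(
        backend_name="easyocr",  # illustrative backend name
        config_dict=config_dict,
        image_hash=generate_image_hash(image),
    )

    if use_cache:
        # Either returns a cached result or marks this image as in-flight.
        cached = handle_cache_lookup_sync(cache_kwargs)
        if cached is not None:
            return cached

    try:
        result = _run_ocr(image, config_dict)  # placeholder for the backend's OCR call
    except Exception:
        # Release any waiters even when OCR fails.
        mark_processing_complete(cache_kwargs)
        raise

    cache_and_complete_sync(result, cache_kwargs, use_cache)
    return result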
kreuzberg/_utils/_pdf_lock.py CHANGED
@@ -19,13 +19,11 @@ _FILE_LOCKS_LOCK = threading.Lock()
 
 
 def _get_file_key(file_path: Path | str) -> str:
-    """Get a consistent key for a file path."""
     path_str = str(Path(file_path).resolve())
     return hashlib.md5(path_str.encode()).hexdigest()  # noqa: S324
 
 
 def _get_file_lock(file_path: Path | str) -> threading.RLock:
-    """Get or create a lock for a specific file."""
     file_key = _get_file_key(file_path)
 
     with _FILE_LOCKS_LOCK:
@@ -39,30 +37,18 @@ def _get_file_lock(file_path: Path | str) -> threading.RLock:
 
 @contextmanager
 def pypdfium_lock() -> Generator[None, None, None]:
-    """Context manager for thread-safe pypdfium2 operations.
-
-    This prevents segmentation faults on macOS where pypdfium2
-    is not fork-safe when used concurrently.
-    """
     with _PYPDFIUM_LOCK:
         yield
 
 
 @contextmanager
 def pypdfium_file_lock(file_path: Path | str) -> Generator[None, None, None]:
-    """Context manager for per-file pypdfium2 operations.
-
-    This allows concurrent processing of different files while
-    preventing segfaults. Document caching handles same-file issues.
-    """
     lock = _get_file_lock(file_path)
     with lock:
         yield
 
 
 def with_pypdfium_lock(func: Any) -> Any:
-    """Decorator to wrap functions with pypdfium2 lock."""
-
    def wrapper(*args: Any, **kwargs: Any) -> Any:
        with pypdfium_lock():
            return func(*args, **kwargs)
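The locking semantics the removed docstrings described still hold: pypdfium_lock serializes all pypdfium2 use (the library is not fork-safe on macOS and can segfault under concurrent use), while pypdfium_file_lock serializes access per file so different documents can still be processed concurrently. A usage sketch, assuming pypdfium2 is installed (it is a kreuzberg dependency):

import pypdfium2 as pdfium

from kreuzberg._utils._pdf_lock import pypdfium_file_lock

def page_count(pdf_path: str) -> int:
    # Per-file lock: callers working on *different* files do not block each
    # other, but two threads opening the same file are serialized.
    with pypdfium_file_lock(pdf_path):
        pdf = pdfium.PdfDocument(pdf_path)
        try:
            return len(pdf)
        finally:
            pdf.close()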
kreuzberg/_utils/_process_pool.py CHANGED
@@ -36,7 +36,6 @@ def _get_process_pool() -> ProcessPoolExecutor:
 
 @contextmanager
 def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
-    """Get the process pool."""
     pool = _get_process_pool()
     try:
         yield pool
@@ -47,14 +46,12 @@ def process_pool() -> Generator[ProcessPoolExecutor, None, None]:
 
 
 def submit_to_process_pool(func: Callable[..., T], *args: Any, **kwargs: Any) -> T:
-    """Submit a function to the process pool and wait for result."""
     with process_pool() as pool:
         future = pool.submit(func, *args, **kwargs)
         return future.result()
 
 
 def shutdown_process_pool() -> None:
-    """Shutdown the process pool."""
     if _process_pool_ref.is_initialized():
         pool = _process_pool_ref.get()
         pool.shutdown(wait=True)
@@ -102,19 +99,11 @@ def _extract_pdf_images_worker(pdf_path: str, scale: float = 4.25) -> tuple[str,
 
 
 class ProcessPoolManager:
-    """Resource-aware process pool manager for CPU-intensive tasks."""
-
     def __init__(
         self,
         max_processes: int | None = None,
         memory_limit_gb: float | None = None,
     ) -> None:
-        """Initialize the process pool manager.
-
-        Args:
-            max_processes: Maximum number of processes. Defaults to CPU count.
-            memory_limit_gb: Memory limit in GB. Defaults to 75% of available memory.
-        """
         self.max_processes = max_processes or mp.cpu_count()
 
         if memory_limit_gb is None:
@@ -127,21 +116,12 @@ class ProcessPoolManager:
         self._active_tasks = 0
 
     def get_optimal_workers(self, task_memory_mb: float = 100) -> int:
-        """Calculate optimal number of workers based on memory constraints.
-
-        Args:
-            task_memory_mb: Estimated memory usage per task in MB.
-
-        Returns:
-            Optimal number of workers.
-        """
         task_memory_bytes = task_memory_mb * 1024**2
         memory_based_limit = max(1, int(self.memory_limit_bytes / task_memory_bytes))
 
         return min(self.max_processes, memory_based_limit)
 
     def _ensure_executor(self, max_workers: int | None = None) -> ProcessPoolExecutor:
-        """Ensure process pool executor is initialized."""
         if self._executor is None or getattr(self._executor, "_max_workers", None) != max_workers:
             if self._executor is not None:
                 self._executor.shutdown(wait=False)
@@ -157,16 +137,6 @@ class ProcessPoolManager:
         *args: Any,
         task_memory_mb: float = 100,
     ) -> T:
-        """Submit a task to the process pool.
-
-        Args:
-            func: Function to execute.
-            *args: Positional arguments for the function.
-            task_memory_mb: Estimated memory usage in MB.
-
-        Returns:
-            Result of the function execution.
-        """
         workers = self.get_optimal_workers(task_memory_mb)
         self._ensure_executor(workers)
 
@@ -184,17 +154,6 @@ class ProcessPoolManager:
         task_memory_mb: float = 100,
         max_concurrent: int | None = None,
     ) -> list[T]:
-        """Submit a batch of tasks to the process pool.
-
-        Args:
-            func: Function to execute.
-            arg_batches: List of argument tuples for each task.
-            task_memory_mb: Estimated memory usage per task in MB.
-            max_concurrent: Maximum concurrent tasks. Defaults to optimal workers.
-
-        Returns:
-            List of results in the same order as input.
-        """
         if not arg_batches:
             return []
 
@@ -225,7 +184,6 @@ class ProcessPoolManager:
         return results
 
     def get_system_info(self) -> dict[str, Any]:
-        """Get current system resource information."""
         memory = psutil.virtual_memory()
         cpu_percent = psutil.cpu_percent(interval=1)
 
@@ -241,13 +199,11 @@ class ProcessPoolManager:
         }
 
     def shutdown(self, wait: bool = True) -> None:
-        """Shutdown the process pool."""
         if self._executor is not None:
             self._executor.shutdown(wait=wait)
             self._executor = None
 
     def __enter__(self) -> Self:
-        """Context manager entry."""
         return self
 
     def __exit__(
@@ -256,11 +212,9 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
     ) -> None:
-        """Context manager exit."""
         self.shutdown()
 
     async def __aenter__(self) -> Self:
-        """Async context manager entry."""
         return self
 
     async def __aexit__(
@@ -269,5 +223,4 @@ class ProcessPoolManager:
         exc_val: BaseException | None,
         exc_tb: types.TracebackType | None,
    ) -> None:
-        """Async context manager exit."""
         self.shutdown()
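get_optimal_workers caps the configured process count by how many tasks of the estimated size fit in the memory budget: min(max_processes, memory_limit_bytes // task_memory_bytes), floored at one worker. A small sketch of that sizing logic with illustrative numbers:

from kreuzberg._utils._process_pool import ProcessPoolManager

# Cap at 4 processes with a 1 GB memory budget.
with ProcessPoolManager(max_processes=4, memory_limit_gb=1.0) as manager:
    # ~100 MB per task: memory would allow ~10 workers, so the CPU cap (4) wins.
    print(manager.get_optimal_workers(task_memory_mb=100))  # -> 4
    # ~600 MB per task: memory allows only one worker.
    print(manager.get_optimal_workers(task_memory_mb=600))  # -> 1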
kreuzberg/_utils/_quality.py CHANGED
@@ -44,15 +44,6 @@ _NAVIGATION_PATTERNS = {
 
 
 def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
-    """Calculate overall quality score for extracted text.
-
-    Args:
-        text: The extracted text content
-        metadata: Optional metadata for additional scoring
-
-    Returns:
-        Quality score between 0.0 and 1.0
-    """
     if not text or not text.strip():
         return 0.0
 
@@ -79,14 +70,6 @@ def calculate_quality_score(text: str, metadata: dict[str, Any] | None = None) -> float:
 
 
 def clean_extracted_text(text: str) -> str:
-    """Clean extracted text by removing artifacts and improving quality.
-
-    Args:
-        text: The raw extracted text
-
-    Returns:
-        Cleaned text with artifacts removed
-    """
     if not text:
         return text
 
kreuzberg/_utils/_ref.py CHANGED
@@ -9,40 +9,24 @@ T = TypeVar("T")
 
 
 class Ref(Generic[T]):
-    """A reference container that manages singleton instances without global variables.
-
-    This provides a clean alternative to global variables by using a registry pattern
-    with type safety.
-    """
-
     _instances: ClassVar[dict[str, Any]] = {}
 
     def __init__(self, name: str, factory: Callable[[], T]) -> None:
-        """Initialize a reference container.
-
-        Args:
-            name: Unique name for this reference
-            factory: Factory function to create the instance when needed
-        """
         self.name = name
         self.factory = factory
 
     def get(self) -> T:
-        """Get the singleton instance, creating it if it doesn't exist."""
         if self.name not in self._instances:
             self._instances[self.name] = self.factory()
         return cast("T", self._instances[self.name])
 
     def clear(self) -> None:
-        """Clear the singleton instance."""
         if self.name in self._instances:
             del self._instances[self.name]
 
     def is_initialized(self) -> bool:
-        """Check if the singleton instance exists."""
         return self.name in self._instances
 
     @classmethod
     def clear_all(cls) -> None:
-        """Clear all singleton instances."""
         cls._instances.clear()
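Ref remains a typed, lazily-created singleton registry; _process_pool_ref in _process_pool.py above is one consumer. A minimal sketch of its behavior:

from kreuzberg._utils._ref import Ref

counter_ref: Ref[list[int]] = Ref("counters", list)

values = counter_ref.get()              # factory runs once, on first access
values.append(1)
assert counter_ref.get() is values      # same instance on later calls
assert counter_ref.is_initialized()

counter_ref.clear()                     # drop it; the next get() rebuilds
assert not counter_ref.is_initialized()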
kreuzberg/_utils/_serialization.py CHANGED
@@ -22,7 +22,6 @@ _DICT_METHOD_NAMES = (
 
 
 def encode_hook(obj: Any) -> Any:
-    """Custom encoder for complex objects."""
     if callable(obj):
         return None
 
@@ -44,18 +43,6 @@ def encode_hook(obj: Any) -> Any:
 
 
 def deserialize(value: str | bytes, target_type: type[T]) -> T:
-    """Deserialize bytes/string to target type.
-
-    Args:
-        value: Serialized data
-        target_type: Type to deserialize to
-
-    Returns:
-        Deserialized object
-
-    Raises:
-        ValueError: If deserialization fails
-    """
     try:
         return decode(cast("bytes", value), type=target_type, strict=False)
     except MsgspecError as e:
@@ -63,18 +50,6 @@ def deserialize(value: str | bytes, target_type: type[T]) -> T:
 
 
 def serialize(value: Any, **kwargs: Any) -> bytes:
-    """Serialize value to bytes.
-
-    Args:
-        value: Object to serialize
-        **kwargs: Additional data to merge with value if it's a dict
-
-    Returns:
-        Serialized bytes
-
-    Raises:
-        ValueError: If serialization fails
-    """
     if isinstance(value, dict) and kwargs:
         value = value | kwargs
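The msgspec-backed contract is unchanged: serialize merges keyword arguments into dict values and encodes to bytes, deserialize decodes into the requested type, and both raise ValueError on codec errors. A round-trip sketch, assuming the two functions share the same msgspec codec (which the module's round-trip design implies):

from kreuzberg._utils._serialization import deserialize, serialize

payload = serialize({"text": "hello"}, page=1)  # kwargs merge into the dict
restored = deserialize(payload, dict)
assert restored == {"text": "hello", "page": 1}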