kreuzberg 3.1.7__py3-none-any.whl → 3.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (37)
  1. kreuzberg/__init__.py +3 -0
  2. kreuzberg/__main__.py +8 -0
  3. kreuzberg/_cli_config.py +175 -0
  4. kreuzberg/_extractors/_image.py +39 -4
  5. kreuzberg/_extractors/_pandoc.py +158 -18
  6. kreuzberg/_extractors/_pdf.py +199 -19
  7. kreuzberg/_extractors/_presentation.py +1 -1
  8. kreuzberg/_extractors/_spread_sheet.py +65 -7
  9. kreuzberg/_gmft.py +222 -16
  10. kreuzberg/_mime_types.py +62 -16
  11. kreuzberg/_multiprocessing/__init__.py +6 -0
  12. kreuzberg/_multiprocessing/gmft_isolated.py +332 -0
  13. kreuzberg/_multiprocessing/process_manager.py +188 -0
  14. kreuzberg/_multiprocessing/sync_tesseract.py +261 -0
  15. kreuzberg/_multiprocessing/tesseract_pool.py +359 -0
  16. kreuzberg/_ocr/_easyocr.py +66 -10
  17. kreuzberg/_ocr/_paddleocr.py +86 -7
  18. kreuzberg/_ocr/_tesseract.py +136 -46
  19. kreuzberg/_playa.py +43 -0
  20. kreuzberg/_utils/_cache.py +372 -0
  21. kreuzberg/_utils/_device.py +356 -0
  22. kreuzberg/_utils/_document_cache.py +220 -0
  23. kreuzberg/_utils/_errors.py +232 -0
  24. kreuzberg/_utils/_pdf_lock.py +72 -0
  25. kreuzberg/_utils/_process_pool.py +100 -0
  26. kreuzberg/_utils/_serialization.py +82 -0
  27. kreuzberg/_utils/_string.py +1 -1
  28. kreuzberg/_utils/_sync.py +21 -0
  29. kreuzberg/cli.py +338 -0
  30. kreuzberg/extraction.py +247 -36
  31. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/METADATA +95 -34
  32. kreuzberg-3.3.0.dist-info/RECORD +48 -0
  33. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/WHEEL +1 -2
  34. kreuzberg-3.3.0.dist-info/entry_points.txt +2 -0
  35. kreuzberg-3.1.7.dist-info/RECORD +0 -33
  36. kreuzberg-3.1.7.dist-info/top_level.txt +0 -1
  37. {kreuzberg-3.1.7.dist-info → kreuzberg-3.3.0.dist-info}/licenses/LICENSE +0 -0
@@ -0,0 +1,372 @@
1
+ """General-purpose file-based caching layer for Kreuzberg."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import hashlib
6
+ import os
7
+ import threading
8
+ import time
9
+ from contextlib import suppress
10
+ from pathlib import Path
11
+ from typing import Any, Generic, TypeVar
12
+
13
+ from anyio import Path as AsyncPath
14
+
15
+ from kreuzberg._types import ExtractionResult
16
+ from kreuzberg._utils._serialization import deserialize, serialize
17
+ from kreuzberg._utils._sync import run_sync
18
+
19
+ T = TypeVar("T")
20
+
21
+
22
class KreuzbergCache(Generic[T]):
    """File-based cache for Kreuzberg operations.

    Provides both sync and async interfaces for caching extraction results,
    OCR results, table data, and other expensive operations to disk. Entries
    are msgpack files named by a 16-hex-char key derived from the caller's
    keyword arguments; cleanup (age + total-size eviction) runs
    opportunistically on roughly 1% of writes.
    """

    def __init__(
        self,
        cache_type: str,
        cache_dir: Path | str | None = None,
        max_cache_size_mb: float = 500.0,
        max_age_days: int = 30,
    ) -> None:
        """Initialize cache.

        Args:
            cache_type: Type of cache (e.g., 'ocr', 'tables', 'documents', 'mime')
            cache_dir: Cache directory (defaults to .kreuzberg/{cache_type} in cwd)
            max_cache_size_mb: Maximum cache size in MB (default: 500MB)
            max_age_days: Maximum age of cached results in days (default: 30 days)
        """
        if cache_dir is None:
            cache_dir = Path.cwd() / ".kreuzberg" / cache_type

        self.cache_dir = Path(cache_dir)
        self.cache_type = cache_type
        self.max_cache_size_mb = max_cache_size_mb
        self.max_age_days = max_age_days

        self.cache_dir.mkdir(parents=True, exist_ok=True)

        # In-memory tracking of processing state (session-scoped) # ~keep
        self._processing: dict[str, threading.Event] = {}
        self._lock = threading.Lock()

    def _get_cache_key(self, **kwargs: Any) -> str:
        """Generate cache key from kwargs.

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Unique cache key string (16 hex characters of a SHA-256 digest)
        """
        # Sort for consistent hashing # ~keep
        cache_str = str(sorted(kwargs.items()))
        return hashlib.sha256(cache_str.encode()).hexdigest()[:16]

    def _get_cache_path(self, cache_key: str) -> Path:
        """Get cache file path for key."""
        return self.cache_dir / f"{cache_key}.msgpack"

    def _should_cleanup(self, cache_key: str) -> bool:
        """Return True when a write to this key should trigger cleanup (~1% of keys).

        FIX: the original used ``hash(cache_key) % 100 == 0``; str hashing is
        randomized per process (PYTHONHASHSEED), so which keys triggered
        cleanup varied between runs. The key is hex, so its integer value
        gives the same ~1% selection deterministically.
        """
        return int(cache_key, 16) % 100 == 0

    def _is_cache_valid(self, cache_path: Path) -> bool:
        """Check if cached result exists and is younger than max_age_days."""
        try:
            if not cache_path.exists():
                return False

            mtime = cache_path.stat().st_mtime
            age_days = (time.time() - mtime) / (24 * 3600)

            return age_days <= self.max_age_days
        except OSError:
            return False

    def _serialize_result(self, result: T) -> dict[str, Any]:
        """Wrap result with its type name and a timestamp for caching."""
        return {"type": type(result).__name__, "data": result, "cached_at": time.time()}

    def _deserialize_result(self, cached_data: dict[str, Any]) -> T:
        """Deserialize cached result.

        msgpack round-trips ExtractionResult as a plain dict, so rebuild the
        dataclass when the stored type name says that is what was cached.
        """
        data = cached_data["data"]

        if cached_data.get("type") == "ExtractionResult" and isinstance(data, dict):
            return ExtractionResult(**data)  # type: ignore[return-value]

        return data  # type: ignore[no-any-return]

    def _cleanup_cache(self) -> None:
        """Clean up old and oversized cache entries.

        Two passes: (1) delete everything older than max_age_days; (2) if the
        remaining total exceeds max_cache_size_mb, delete oldest-first until
        usage drops below 80% of the limit. Best-effort: individual stat/unlink
        races with other processes are ignored.
        """
        try:
            cache_files = list(self.cache_dir.glob("*.msgpack"))

            cutoff_time = time.time() - (self.max_age_days * 24 * 3600)
            for cache_file in cache_files[:]:
                try:
                    if cache_file.stat().st_mtime < cutoff_time:
                        cache_file.unlink(missing_ok=True)
                        cache_files.remove(cache_file)
                except OSError:  # noqa: PERF203
                    continue

            total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists()) / (
                1024 * 1024
            )

            if total_size > self.max_cache_size_mb:
                # Oldest first; vanished files sort to the front and are no-ops.
                cache_files.sort(key=lambda f: f.stat().st_mtime if f.exists() else 0)

                for cache_file in cache_files:
                    try:
                        size_mb = cache_file.stat().st_size / (1024 * 1024)
                        cache_file.unlink(missing_ok=True)
                        total_size -= size_mb

                        # Stop at 80% of the limit to avoid cleaning on every write.
                        if total_size <= self.max_cache_size_mb * 0.8:
                            break
                    except OSError:
                        continue
        except (OSError, ValueError, TypeError):
            pass

    def get(self, **kwargs: Any) -> T | None:
        """Get cached result (sync).

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Cached result if available, None otherwise
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = self._get_cache_path(cache_key)

        if not self._is_cache_valid(cache_path):
            return None

        try:
            content = cache_path.read_bytes()
            cached_data = deserialize(content, dict)
            return self._deserialize_result(cached_data)
        except (OSError, ValueError, KeyError):
            # Corrupt or unreadable entry: drop it and treat as a miss.
            with suppress(OSError):
                cache_path.unlink(missing_ok=True)
            return None

    def set(self, result: T, **kwargs: Any) -> None:
        """Cache result (sync). Best-effort: write failures are swallowed.

        Args:
            result: Result to cache
            **kwargs: Key-value pairs to generate cache key from
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = self._get_cache_path(cache_key)

        try:
            serialized = self._serialize_result(result)
            content = serialize(serialized)
            cache_path.write_bytes(content)

            if self._should_cleanup(cache_key):
                self._cleanup_cache()
        except (OSError, TypeError, ValueError):
            pass

    async def aget(self, **kwargs: Any) -> T | None:
        """Get cached result (async).

        Args:
            **kwargs: Key-value pairs to generate cache key from

        Returns:
            Cached result if available, None otherwise
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = AsyncPath(self._get_cache_path(cache_key))

        # Validity check does blocking stat() calls; run it off the event loop.
        if not await run_sync(self._is_cache_valid, Path(cache_path)):
            return None

        try:
            content = await cache_path.read_bytes()
            cached_data = deserialize(content, dict)
            return self._deserialize_result(cached_data)
        except (OSError, ValueError, KeyError):
            # Match the sync path: only filesystem errors are expected here,
            # so suppress OSError rather than a blanket Exception.
            with suppress(OSError):
                await cache_path.unlink(missing_ok=True)
            return None

    async def aset(self, result: T, **kwargs: Any) -> None:
        """Cache result (async). Best-effort: write failures are swallowed.

        Args:
            result: Result to cache
            **kwargs: Key-value pairs to generate cache key from
        """
        cache_key = self._get_cache_key(**kwargs)
        cache_path = AsyncPath(self._get_cache_path(cache_key))

        try:
            serialized = self._serialize_result(result)
            content = serialize(serialized)
            await cache_path.write_bytes(content)

            if self._should_cleanup(cache_key):
                await run_sync(self._cleanup_cache)
        except (OSError, TypeError, ValueError):
            pass

    def is_processing(self, **kwargs: Any) -> bool:
        """Check if operation is currently being processed."""
        cache_key = self._get_cache_key(**kwargs)
        with self._lock:
            return cache_key in self._processing

    def mark_processing(self, **kwargs: Any) -> threading.Event:
        """Mark operation as being processed and return event to wait on."""
        cache_key = self._get_cache_key(**kwargs)

        with self._lock:
            if cache_key not in self._processing:
                self._processing[cache_key] = threading.Event()
            return self._processing[cache_key]

    def mark_complete(self, **kwargs: Any) -> None:
        """Mark operation processing as complete."""
        cache_key = self._get_cache_key(**kwargs)

        with self._lock:
            if cache_key in self._processing:
                event = self._processing.pop(cache_key)
                event.set()

    def clear(self) -> None:
        """Clear all cached results and reset in-memory processing state."""
        try:
            for cache_file in self.cache_dir.glob("*.msgpack"):
                cache_file.unlink(missing_ok=True)
        except OSError:
            pass

        # FIX: the original acquired the lock with an empty body, leaving stale
        # Events behind; signal waiters and drop the markers so nothing hangs.
        with self._lock:
            for event in self._processing.values():
                event.set()
            self._processing.clear()

    def get_stats(self) -> dict[str, Any]:
        """Get cache statistics.

        Returns:
            Mapping with entry counts, sizes (MB/KB), directory and limits.
            On filesystem errors a zeroed version of the same mapping is
            returned rather than raising.
        """
        try:
            cache_files = list(self.cache_dir.glob("*.msgpack"))
            total_size = sum(cache_file.stat().st_size for cache_file in cache_files if cache_file.exists())

            return {
                "cache_type": self.cache_type,
                "cached_results": len(cache_files),
                "processing_results": len(self._processing),
                "total_cache_size_mb": total_size / 1024 / 1024,
                "avg_result_size_kb": (total_size / len(cache_files) / 1024) if cache_files else 0,
                "cache_dir": str(self.cache_dir),
                "max_cache_size_mb": self.max_cache_size_mb,
                "max_age_days": self.max_age_days,
            }
        except OSError:
            return {
                "cache_type": self.cache_type,
                "cached_results": 0,
                "processing_results": len(self._processing),
                "total_cache_size_mb": 0.0,
                "avg_result_size_kb": 0.0,
                "cache_dir": str(self.cache_dir),
                "max_cache_size_mb": self.max_cache_size_mb,
                "max_age_days": self.max_age_days,
            }
287
+
288
+
289
# Lazily created module-level singletons, one per cache type.
_ocr_cache: KreuzbergCache[ExtractionResult] | None = None
_document_cache: KreuzbergCache[ExtractionResult] | None = None
_table_cache: KreuzbergCache[Any] | None = None
_mime_cache: KreuzbergCache[str] | None = None


def _resolve_cache_dir(subdir: str) -> Path | None:
    """Return KREUZBERG_CACHE_DIR/<subdir> when the env var is set, else None.

    Returning None lets KreuzbergCache fall back to its default location
    (.kreuzberg/<cache_type> under the current working directory). Extracted
    here because all four getters previously duplicated this logic verbatim.
    """
    cache_dir_str = os.environ.get("KREUZBERG_CACHE_DIR")
    if cache_dir_str:
        return Path(cache_dir_str) / subdir
    return None


def get_ocr_cache() -> KreuzbergCache[ExtractionResult]:
    """Get the global OCR cache instance (created on first use).

    Size/age limits are read from KREUZBERG_OCR_CACHE_SIZE_MB (default 500)
    and KREUZBERG_OCR_CACHE_AGE_DAYS (default 30).
    """
    global _ocr_cache
    if _ocr_cache is None:
        _ocr_cache = KreuzbergCache[ExtractionResult](
            cache_type="ocr",
            cache_dir=_resolve_cache_dir("ocr"),
            max_cache_size_mb=float(os.environ.get("KREUZBERG_OCR_CACHE_SIZE_MB", "500")),
            max_age_days=int(os.environ.get("KREUZBERG_OCR_CACHE_AGE_DAYS", "30")),
        )
    return _ocr_cache


def get_document_cache() -> KreuzbergCache[ExtractionResult]:
    """Get the global document cache instance (created on first use).

    Size/age limits are read from KREUZBERG_DOCUMENT_CACHE_SIZE_MB
    (default 1000) and KREUZBERG_DOCUMENT_CACHE_AGE_DAYS (default 7).
    """
    global _document_cache
    if _document_cache is None:
        _document_cache = KreuzbergCache[ExtractionResult](
            cache_type="documents",
            cache_dir=_resolve_cache_dir("documents"),
            max_cache_size_mb=float(os.environ.get("KREUZBERG_DOCUMENT_CACHE_SIZE_MB", "1000")),
            max_age_days=int(os.environ.get("KREUZBERG_DOCUMENT_CACHE_AGE_DAYS", "7")),
        )
    return _document_cache


def get_table_cache() -> KreuzbergCache[Any]:
    """Get the global table cache instance (created on first use).

    Size/age limits are read from KREUZBERG_TABLE_CACHE_SIZE_MB (default 200)
    and KREUZBERG_TABLE_CACHE_AGE_DAYS (default 30).
    """
    global _table_cache
    if _table_cache is None:
        _table_cache = KreuzbergCache[Any](
            cache_type="tables",
            cache_dir=_resolve_cache_dir("tables"),
            max_cache_size_mb=float(os.environ.get("KREUZBERG_TABLE_CACHE_SIZE_MB", "200")),
            max_age_days=int(os.environ.get("KREUZBERG_TABLE_CACHE_AGE_DAYS", "30")),
        )
    return _table_cache


def get_mime_cache() -> KreuzbergCache[str]:
    """Get the global MIME type cache instance (created on first use).

    Size/age limits are read from KREUZBERG_MIME_CACHE_SIZE_MB (default 50)
    and KREUZBERG_MIME_CACHE_AGE_DAYS (default 60).
    """
    global _mime_cache
    if _mime_cache is None:
        _mime_cache = KreuzbergCache[str](
            cache_type="mime",
            cache_dir=_resolve_cache_dir("mime"),
            max_cache_size_mb=float(os.environ.get("KREUZBERG_MIME_CACHE_SIZE_MB", "50")),
            max_age_days=int(os.environ.get("KREUZBERG_MIME_CACHE_AGE_DAYS", "60")),
        )
    return _mime_cache


def clear_all_caches() -> None:
    """Clear all caches (OCR, documents, tables, MIME)."""
    get_ocr_cache().clear()
    get_document_cache().clear()
    get_table_cache().clear()
    get_mime_cache().clear()