lorax_arg-0.1-py3-none-any.whl

This diff shows the content of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (66)
  1. lorax/buffer.py +43 -0
  2. lorax/cache/__init__.py +43 -0
  3. lorax/cache/csv_tree_graph.py +59 -0
  4. lorax/cache/disk.py +467 -0
  5. lorax/cache/file_cache.py +142 -0
  6. lorax/cache/file_context.py +72 -0
  7. lorax/cache/lru.py +90 -0
  8. lorax/cache/tree_graph.py +293 -0
  9. lorax/cli.py +312 -0
  10. lorax/cloud/__init__.py +0 -0
  11. lorax/cloud/gcs_utils.py +205 -0
  12. lorax/constants.py +66 -0
  13. lorax/context.py +80 -0
  14. lorax/csv/__init__.py +7 -0
  15. lorax/csv/config.py +250 -0
  16. lorax/csv/layout.py +182 -0
  17. lorax/csv/newick_tree.py +234 -0
  18. lorax/handlers.py +998 -0
  19. lorax/lineage.py +456 -0
  20. lorax/loaders/__init__.py +0 -0
  21. lorax/loaders/csv_loader.py +10 -0
  22. lorax/loaders/loader.py +31 -0
  23. lorax/loaders/tskit_loader.py +119 -0
  24. lorax/lorax_app.py +75 -0
  25. lorax/manager.py +58 -0
  26. lorax/metadata/__init__.py +0 -0
  27. lorax/metadata/loader.py +426 -0
  28. lorax/metadata/mutations.py +146 -0
  29. lorax/modes.py +190 -0
  30. lorax/pg.py +183 -0
  31. lorax/redis_utils.py +30 -0
  32. lorax/routes.py +137 -0
  33. lorax/session_manager.py +206 -0
  34. lorax/sockets/__init__.py +55 -0
  35. lorax/sockets/connection.py +99 -0
  36. lorax/sockets/debug.py +47 -0
  37. lorax/sockets/decorators.py +112 -0
  38. lorax/sockets/file_ops.py +200 -0
  39. lorax/sockets/lineage.py +307 -0
  40. lorax/sockets/metadata.py +232 -0
  41. lorax/sockets/mutations.py +154 -0
  42. lorax/sockets/node_search.py +535 -0
  43. lorax/sockets/tree_layout.py +117 -0
  44. lorax/sockets/utils.py +10 -0
  45. lorax/tree_graph/__init__.py +12 -0
  46. lorax/tree_graph/tree_graph.py +689 -0
  47. lorax/utils.py +124 -0
  48. lorax_app/__init__.py +4 -0
  49. lorax_app/app.py +159 -0
  50. lorax_app/cli.py +114 -0
  51. lorax_app/static/X.png +0 -0
  52. lorax_app/static/assets/index-BCEGlUFi.js +2361 -0
  53. lorax_app/static/assets/index-iKjzUpA9.css +1 -0
  54. lorax_app/static/assets/localBackendWorker-BaWwjSV_.js +2 -0
  55. lorax_app/static/assets/renderDataWorker-BKLdiU7J.js +2 -0
  56. lorax_app/static/gestures/gesture-flick.ogv +0 -0
  57. lorax_app/static/gestures/gesture-two-finger-scroll.ogv +0 -0
  58. lorax_app/static/index.html +14 -0
  59. lorax_app/static/logo.png +0 -0
  60. lorax_app/static/lorax-logo.png +0 -0
  61. lorax_app/static/vite.svg +1 -0
  62. lorax_arg-0.1.dist-info/METADATA +131 -0
  63. lorax_arg-0.1.dist-info/RECORD +66 -0
  64. lorax_arg-0.1.dist-info/WHEEL +5 -0
  65. lorax_arg-0.1.dist-info/entry_points.txt +4 -0
  66. lorax_arg-0.1.dist-info/top_level.txt +2 -0
lorax/buffer.py ADDED
@@ -0,0 +1,43 @@
+
+import pyarrow as pa
+
+def mutations_to_arrow_buffer(mutations_data):
+    """
+    Convert mutations list to PyArrow IPC buffer for efficient transfer.
+
+    Args:
+        mutations_data: dict with 'mutations' list from get_mutations_in_window or search_mutations_by_position
+
+    Returns:
+        bytes: PyArrow IPC serialized buffer
+    """
+    mutations = mutations_data.get('mutations', [])
+
+    if not mutations:
+        # Return empty table with correct schema
+        table = pa.table({
+            'position': pa.array([], type=pa.int64()),
+            'mutation': pa.array([], type=pa.string()),
+            'node_id': pa.array([], type=pa.int32()),
+            'site_id': pa.array([], type=pa.int32()),
+            'ancestral_state': pa.array([], type=pa.string()),
+            'derived_state': pa.array([], type=pa.string()),
+            'distance': pa.array([], type=pa.int64()),
+        })
+    else:
+        table = pa.table({
+            'position': pa.array([m['position'] for m in mutations], type=pa.int64()),
+            'mutation': pa.array([m['mutation'] for m in mutations], type=pa.string()),
+            'node_id': pa.array([m['node_id'] for m in mutations], type=pa.int32()),
+            'site_id': pa.array([m['site_id'] for m in mutations], type=pa.int32()),
+            'ancestral_state': pa.array([m['ancestral_state'] for m in mutations], type=pa.string()),
+            'derived_state': pa.array([m['derived_state'] for m in mutations], type=pa.string()),
+            'distance': pa.array([m.get('distance', 0) for m in mutations], type=pa.int64()),
+        })
+
+    sink = pa.BufferOutputStream()
+    writer = pa.ipc.new_stream(sink, table.schema)
+    writer.write_table(table)
+    writer.close()
+
+    return sink.getvalue().to_pybytes()
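For context, the IPC stream this function produces can be read back with PyArrow's stream reader. A minimal round-trip sketch follows; the sample mutation values are invented for illustration, but the keys match the schema above:

import pyarrow as pa
from lorax.buffer import mutations_to_arrow_buffer

# Hypothetical input in the shape the function expects: a dict with a
# 'mutations' list whose entries carry the schema fields above.
payload = {
    "mutations": [
        {
            "position": 12345,
            "mutation": "A>G",
            "node_id": 7,
            "site_id": 3,
            "ancestral_state": "A",
            "derived_state": "G",
            # 'distance' is optional; the function defaults it to 0
        }
    ]
}

buf = mutations_to_arrow_buffer(payload)

# Round-trip: open the IPC stream and materialize it as a Table.
table = pa.ipc.open_stream(buf).read_all()
assert table.num_rows == 1
assert table.column("distance")[0].as_py() == 0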
lorax/cache/__init__.py ADDED
@@ -0,0 +1,43 @@
+"""
+Lorax Caching System.
+
+This package provides consolidated caching infrastructure:
+- LRUCache, LRUCacheWithMeta: In-memory LRU caches with eviction
+- DiskCacheManager: LRU disk cache with distributed locking for GCS downloads
+- TreeGraphCache: Per-session caching of TreeGraph objects
+- FileContext: Unified cache entry combining tree sequence, config, and metadata
+- get_file_context: Cached file loading with mtime validation
+
+The FileContext-based caching provides atomic invalidation: when a file is
+evicted from cache, its tree sequence, config, and all metadata are evicted
+together, preventing orphan metadata.
+"""
+
+from lorax.cache.lru import LRUCache, LRUCacheWithMeta
+from lorax.cache.disk import DiskCacheManager
+from lorax.cache.tree_graph import TreeGraphCache
+from lorax.cache.csv_tree_graph import CsvTreeGraphCache
+from lorax.cache.file_context import FileContext
+from lorax.cache.file_cache import (
+    get_file_context,
+    get_file_cache_size,
+    # Backwards compatibility
+    get_or_load_ts,
+    get_ts_cache_size,
+)
+
+__all__ = [
+    # Core cache classes
+    "LRUCache",
+    "LRUCacheWithMeta",
+    "DiskCacheManager",
+    "TreeGraphCache",
+    "CsvTreeGraphCache",
+    # Unified file caching (preferred API)
+    "FileContext",
+    "get_file_context",
+    "get_file_cache_size",
+    # Backwards compatibility
+    "get_or_load_ts",
+    "get_ts_cache_size",
+]
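The "atomic invalidation" idea in the docstring above is simply that a file's tree sequence, config, and metadata share one cache entry, so they can only be evicted together. A self-contained toy sketch of that design (this is illustrative only, not the lorax FileContext implementation):

# Illustrative toy, not the lorax implementation: grouping a file's tree
# sequence, config, and metadata in one cache entry makes invalidation atomic.
from dataclasses import dataclass, field
from typing import Any, Dict

@dataclass
class ToyFileContext:
    ts: Any
    config: Dict[str, Any] = field(default_factory=dict)
    metadata: Dict[str, Any] = field(default_factory=dict)

cache: Dict[str, ToyFileContext] = {}
cache["trees/example.trees"] = ToyFileContext(ts=object())

# Evicting the path drops the tree sequence, config, and metadata together,
# so no orphan metadata can outlive its file.
cache.pop("trees/example.trees", None)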
lorax/cache/csv_tree_graph.py ADDED
@@ -0,0 +1,59 @@
+"""
+CSV Newick Tree cache for per-session caching of parsed Newick trees.
+
+This is a lightweight in-memory cache (no Redis) that mirrors the small subset
+of the TreeGraphCache interface used by the layout pipeline:
+- get / set per (session_id, tree_index)
+- clear_session on file load
+- evict_not_visible for viewport-based eviction
+"""
+
+import asyncio
+from collections import OrderedDict
+from typing import Dict, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from lorax.csv.newick_tree import NewickTreeGraph
+
+
+class CsvTreeGraphCache:
+    """
+    Per-session cache for parsed NewickTreeGraph objects (CSV mode).
+
+    NOTE: This intentionally stays in-memory only to keep it simple.
+    """
+
+    def __init__(self):
+        # session_id -> OrderedDict{tree_index -> NewickTreeGraph}
+        self._local_cache: Dict[str, OrderedDict] = {}
+        self._lock = asyncio.Lock()
+        print("CsvTreeGraphCache initialized (in-memory)")
+
+    async def get(self, session_id: str, tree_index: int) -> Optional["NewickTreeGraph"]:
+        session_cache = self._local_cache.get(session_id)
+        if session_cache and tree_index in session_cache:
+            session_cache.move_to_end(tree_index)
+            return session_cache[tree_index]
+        return None
+
+    async def set(self, session_id: str, tree_index: int, graph: "NewickTreeGraph") -> None:
+        async with self._lock:
+            if session_id not in self._local_cache:
+                self._local_cache[session_id] = OrderedDict()
+            self._local_cache[session_id][tree_index] = graph
+
+    async def clear_session(self, session_id: str) -> None:
+        async with self._lock:
+            if session_id in self._local_cache:
+                del self._local_cache[session_id]
+
+    async def evict_not_visible(self, session_id: str, visible_indices: set) -> int:
+        async with self._lock:
+            session_cache = self._local_cache.get(session_id)
+            if not session_cache:
+                return 0
+            to_delete = [idx for idx in session_cache.keys() if idx not in visible_indices]
+            for idx in to_delete:
+                session_cache.pop(idx, None)
+            return len(to_delete)
+
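A minimal sketch of the cache lifecycle the docstring describes, using only the methods defined above; plain objects stand in for parsed NewickTreeGraph instances:

import asyncio
from lorax.cache.csv_tree_graph import CsvTreeGraphCache

async def demo():
    cache = CsvTreeGraphCache()

    # Populate a session with parsed trees keyed by tree index.
    await cache.set("session-1", 0, object())
    await cache.set("session-1", 1, object())

    graph = await cache.get("session-1", 0)  # hit; refreshes LRU order
    assert graph is not None

    # Viewport moved: keep only tree 1, evict the rest.
    evicted = await cache.evict_not_visible("session-1", {1})
    assert evicted == 1

    # New file loaded: drop everything for the session.
    await cache.clear_session("session-1")

asyncio.run(demo())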
lorax/cache/disk.py ADDED
@@ -0,0 +1,467 @@
+"""
+Disk Cache Manager for Lorax
+
+LRU disk cache with distributed locking for GCS file downloads.
+Supports three modes: local, development, production.
+"""
+
+import os
+import json
+import hashlib
+import asyncio
+import fcntl
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional, Dict, Any
+from dataclasses import dataclass, asdict
+
+import aiofiles
+
+
+@dataclass
+class CachedFile:
+    """Metadata for a cached file."""
+    gcs_path: str
+    local_path: str
+    size_bytes: int
+    last_access: str
+    download_complete: bool
+    etag: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> "CachedFile":
+        return CachedFile(**data)
+
+
+class DiskCacheManifest:
+    """Thread-safe manifest for tracking cached files."""
+
+    def __init__(self, manifest_path: Path):
+        self.manifest_path = manifest_path
+        self._lock = asyncio.Lock()
+
+    async def load(self) -> Dict[str, Any]:
+        """Load manifest from disk."""
+        if not self.manifest_path.exists():
+            return {"version": 1, "files": {}, "total_size_bytes": 0}
+
+        try:
+            async with aiofiles.open(self.manifest_path, "r") as f:
+                content = await f.read()
+                return json.loads(content) if content else {"version": 1, "files": {}, "total_size_bytes": 0}
+        except (json.JSONDecodeError, IOError) as e:
+            print(f"Warning: Failed to load manifest: {e}")
+            return {"version": 1, "files": {}, "total_size_bytes": 0}
+
+    async def save(self, data: Dict[str, Any]):
+        """Save manifest to disk atomically."""
+        self.manifest_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp_path = self.manifest_path.with_suffix(".tmp")
+
+        async with aiofiles.open(tmp_path, "w") as f:
+            await f.write(json.dumps(data, indent=2))
+
+        # Atomic rename
+        tmp_path.rename(self.manifest_path)
+
+    async def get_file(self, cache_key: str) -> Optional[CachedFile]:
+        """Get cached file metadata."""
+        async with self._lock:
+            data = await self.load()
+            file_data = data["files"].get(cache_key)
+            if file_data:
+                return CachedFile.from_dict(file_data)
+            return None
+
+    async def set_file(self, cache_key: str, cached_file: CachedFile):
+        """Set cached file metadata."""
+        async with self._lock:
+            data = await self.load()
+
+            # Update total size
+            old_file = data["files"].get(cache_key)
+            if old_file:
+                data["total_size_bytes"] -= old_file.get("size_bytes", 0)
+
+            data["files"][cache_key] = cached_file.to_dict()
+            data["total_size_bytes"] += cached_file.size_bytes
+
+            await self.save(data)
+
+    async def update_access_time(self, cache_key: str):
+        """Update last access time for a cached file."""
+        async with self._lock:
+            data = await self.load()
+            if cache_key in data["files"]:
+                data["files"][cache_key]["last_access"] = datetime.now(timezone.utc).isoformat()
+                await self.save(data)
+
+    async def remove_file(self, cache_key: str) -> Optional[CachedFile]:
+        """Remove a file from the manifest."""
+        async with self._lock:
+            data = await self.load()
+            file_data = data["files"].pop(cache_key, None)
+            if file_data:
+                data["total_size_bytes"] -= file_data.get("size_bytes", 0)
+                await self.save(data)
+                return CachedFile.from_dict(file_data)
+            return None
+
+    async def get_total_size(self) -> int:
+        """Get total size of cached files in bytes."""
+        data = await self.load()
+        return data.get("total_size_bytes", 0)
+
+    async def get_files_by_access_time(self) -> list:
+        """Get all files sorted by last access time (oldest first)."""
+        data = await self.load()
+        files = []
+        for cache_key, file_data in data["files"].items():
+            files.append((cache_key, CachedFile.from_dict(file_data)))
+
+        # Sort by last_access (oldest first)
+        files.sort(key=lambda x: x[1].last_access)
+        return files
+
+
+class FileLock:
+    """File-based lock for single-process or fallback mode."""
+
+    def __init__(self, lock_path: Path):
+        self.lock_path = lock_path
+        self._fd = None
+
+    async def acquire(self, timeout: float = 300.0) -> bool:
+        """Acquire the lock with timeout."""
+        self.lock_path.parent.mkdir(parents=True, exist_ok=True)
+
+        start_time = asyncio.get_event_loop().time()
+        while True:
+            try:
+                self._fd = open(self.lock_path, "w")
+                fcntl.flock(self._fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+                return True
+            except (IOError, OSError):
+                if self._fd:
+                    self._fd.close()
+                    self._fd = None
+
+                elapsed = asyncio.get_event_loop().time() - start_time
+                if elapsed >= timeout:
+                    return False
+
+                await asyncio.sleep(0.1)
+
+    async def release(self):
+        """Release the lock."""
+        if self._fd:
+            try:
+                fcntl.flock(self._fd.fileno(), fcntl.LOCK_UN)
+                self._fd.close()
+            except (IOError, OSError):
+                pass
+            finally:
+                self._fd = None
+
+
+class RedisLock:
+    """Redis-based distributed lock for multi-worker deployments."""
+
+    def __init__(self, redis_client, lock_key: str, timeout_ms: int = 300000):
+        self.redis = redis_client
+        self.lock_key = lock_key
+        self.timeout_ms = timeout_ms
+        self._lock_value = None
+
+    async def acquire(self, timeout: float = 300.0) -> bool:
+        """Acquire the lock with timeout."""
+        import uuid
+        self._lock_value = str(uuid.uuid4())
+
+        start_time = asyncio.get_event_loop().time()
+        while True:
+            # SET key value NX PX timeout_ms
+            acquired = await self.redis.set(
+                self.lock_key,
+                self._lock_value,
+                nx=True,
+                px=self.timeout_ms
+            )
+            if acquired:
+                return True
+
+            elapsed = asyncio.get_event_loop().time() - start_time
+            if elapsed >= timeout:
+                return False
+
+            await asyncio.sleep(0.1)
+
+    async def release(self):
+        """Release the lock if we own it."""
+        if self._lock_value:
+            # Only release if we own it (compare-and-delete)
+            lua_script = """
+            if redis.call("get", KEYS[1]) == ARGV[1] then
+                return redis.call("del", KEYS[1])
+            else
+                return 0
+            end
+            """
+            try:
+                await self.redis.eval(lua_script, 1, self.lock_key, self._lock_value)
+            except Exception as e:
+                print(f"Warning: Failed to release Redis lock: {e}")
+            finally:
+                self._lock_value = None
+
+
+class DiskCacheManager:
+    """
+    LRU disk cache manager for GCS file downloads.
+
+    Features:
+    - 50GB (configurable) LRU eviction
+    - Atomic downloads (temp file + rename)
+    - Distributed locking (Redis or file-based)
+    - Access time tracking
+    """
+
+    def __init__(
+        self,
+        cache_dir: Path,
+        max_size_bytes: int,
+        redis_client=None,
+        enabled: bool = True
+    ):
+        self.cache_dir = Path(cache_dir)
+        self.max_size_bytes = max_size_bytes
+        self.redis = redis_client
+        self.enabled = enabled
+
+        self.files_dir = self.cache_dir / "files"
+        self.locks_dir = self.cache_dir / "locks"
+        self.manifest = DiskCacheManifest(self.cache_dir / "manifest.json")
+
+        # Create directories
+        if self.enabled:
+            self.files_dir.mkdir(parents=True, exist_ok=True)
+            self.locks_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_cache_key(self, gcs_bucket: str, gcs_path: str) -> str:
+        """Generate cache key from GCS path."""
+        full_path = f"{gcs_bucket}/{gcs_path}"
+        return hashlib.sha256(full_path.encode()).hexdigest()[:16]
+
+    def _get_local_path(self, cache_key: str, gcs_path: str) -> Path:
+        """Get local file path for a cache key."""
+        # Preserve extension for tskit compatibility
+        ext = Path(gcs_path).suffix or ".dat"
+        return self.files_dir / f"{cache_key}{ext}"
+
+    async def _acquire_lock(self, cache_key: str) -> Any:
+        """Acquire download lock (Redis or file-based)."""
+        if self.redis:
+            lock = RedisLock(self.redis, f"lorax:download:{cache_key}")
+        else:
+            lock = FileLock(self.locks_dir / f"{cache_key}.lock")
+
+        acquired = await lock.acquire()
+        if not acquired:
+            raise TimeoutError(f"Failed to acquire lock for {cache_key}")
+
+        return lock
+
+    async def _release_lock(self, lock: Any):
+        """Release download lock."""
+        await lock.release()
+
+    async def get_cached_path(self, gcs_bucket: str, gcs_path: str) -> Optional[Path]:
+        """
+        Get path to cached file if it exists and is valid.
+        Updates access time on hit.
+        """
+        if not self.enabled:
+            return None
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+        cached_file = await self.manifest.get_file(cache_key)
+
+        if cached_file and cached_file.download_complete:
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                # Update access time
+                await self.manifest.update_access_time(cache_key)
+                print(f"Cache hit: {gcs_path}")
+                return local_path
+            else:
+                # File was deleted externally, remove from manifest
+                await self.manifest.remove_file(cache_key)
+
+        return None
+
+    async def evict_if_needed(self, required_bytes: int = 0):
+        """Evict oldest files until we have space for required_bytes."""
+        if not self.enabled:
+            return
+
+        target_size = self.max_size_bytes - required_bytes
+        current_size = await self.manifest.get_total_size()
+
+        if current_size <= target_size:
+            return
+
+        # Get files sorted by access time (oldest first)
+        files = await self.manifest.get_files_by_access_time()
+
+        for cache_key, cached_file in files:
+            if current_size <= target_size:
+                break
+
+            # Delete file
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                try:
+                    local_path.unlink()
+                    print(f"Evicted: {cached_file.gcs_path} ({cached_file.size_bytes / 1024 / 1024:.1f} MB)")
+                except OSError as e:
+                    print(f"Warning: Failed to delete {local_path}: {e}")
+
+            # Remove from manifest
+            await self.manifest.remove_file(cache_key)
+            current_size -= cached_file.size_bytes
+
+    async def cache_file(
+        self,
+        gcs_bucket: str,
+        gcs_path: str,
+        local_path: Path,
+        size_bytes: int,
+        etag: Optional[str] = None
+    ):
+        """Register a downloaded file in the cache."""
+        if not self.enabled:
+            return
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+
+        cached_file = CachedFile(
+            gcs_path=f"{gcs_bucket}/{gcs_path}",
+            local_path=str(local_path),
+            size_bytes=size_bytes,
+            last_access=datetime.now(timezone.utc).isoformat(),
+            download_complete=True,
+            etag=etag
+        )
+
+        await self.manifest.set_file(cache_key, cached_file)
+        print(f"Cached: {gcs_path} ({size_bytes / 1024 / 1024:.1f} MB)")
+
+    async def get_or_download(
+        self,
+        gcs_bucket: str,
+        gcs_path: str,
+        download_func
+    ) -> Path:
+        """
+        Get file from cache or download using provided function.
+
+        Args:
+            gcs_bucket: GCS bucket name
+            gcs_path: Path within bucket
+            download_func: async function(local_path) that downloads file
+
+        Returns:
+            Path to local file
+        """
+        # Fast path: check cache without lock
+        cached_path = await self.get_cached_path(gcs_bucket, gcs_path)
+        if cached_path:
+            return cached_path
+
+        if not self.enabled:
+            # Cache disabled, download directly
+            cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+            local_path = self._get_local_path(cache_key, gcs_path)
+            await download_func(str(local_path))
+            return local_path
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+        local_path = self._get_local_path(cache_key, gcs_path)
+
+        # Acquire distributed lock
+        lock = await self._acquire_lock(cache_key)
+        try:
+            # Double-check after acquiring lock
+            cached_path = await self.get_cached_path(gcs_bucket, gcs_path)
+            if cached_path:
+                return cached_path
+
+            # Estimate size and evict if needed (conservative estimate)
+            await self.evict_if_needed(required_bytes=1024 * 1024 * 1024)  # 1GB buffer
+
+            # Download to temp file
+            tmp_path = local_path.with_suffix(local_path.suffix + ".tmp")
+            try:
+                await download_func(str(tmp_path))
+
+                # Get actual size
+                size_bytes = tmp_path.stat().st_size
+
+                # Evict with actual size if needed
+                await self.evict_if_needed(required_bytes=size_bytes)
+
+                # Atomic rename
+                tmp_path.rename(local_path)
+
+                # Register in cache
+                await self.cache_file(gcs_bucket, gcs_path, local_path, size_bytes)
+
+                return local_path
+
+            except Exception:
+                # Cleanup temp file on failure
+                if tmp_path.exists():
+                    tmp_path.unlink()
+                raise
+
+        finally:
+            await self._release_lock(lock)
+
+    async def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        if not self.enabled:
+            return {"enabled": False}
+
+        total_size = await self.manifest.get_total_size()
+        data = await self.manifest.load()
+
+        return {
+            "enabled": True,
+            "total_size_bytes": total_size,
+            "total_size_mb": round(total_size / 1024 / 1024, 2),
+            "max_size_mb": round(self.max_size_bytes / 1024 / 1024, 2),
+            "usage_percent": round(total_size / self.max_size_bytes * 100, 1) if self.max_size_bytes > 0 else 0,
+            "file_count": len(data["files"]),
+            "cache_dir": str(self.cache_dir),
+        }
+
+    async def clear(self):
+        """Clear all cached files."""
+        if not self.enabled:
+            return
+
+        files = await self.manifest.get_files_by_access_time()
+
+        for cache_key, cached_file in files:
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                try:
+                    local_path.unlink()
+                except OSError:
+                    pass
+            await self.manifest.remove_file(cache_key)
+
+        print("Disk cache cleared")