lorax_arg-0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lorax/buffer.py +43 -0
- lorax/cache/__init__.py +43 -0
- lorax/cache/csv_tree_graph.py +59 -0
- lorax/cache/disk.py +467 -0
- lorax/cache/file_cache.py +142 -0
- lorax/cache/file_context.py +72 -0
- lorax/cache/lru.py +90 -0
- lorax/cache/tree_graph.py +293 -0
- lorax/cli.py +312 -0
- lorax/cloud/__init__.py +0 -0
- lorax/cloud/gcs_utils.py +205 -0
- lorax/constants.py +66 -0
- lorax/context.py +80 -0
- lorax/csv/__init__.py +7 -0
- lorax/csv/config.py +250 -0
- lorax/csv/layout.py +182 -0
- lorax/csv/newick_tree.py +234 -0
- lorax/handlers.py +998 -0
- lorax/lineage.py +456 -0
- lorax/loaders/__init__.py +0 -0
- lorax/loaders/csv_loader.py +10 -0
- lorax/loaders/loader.py +31 -0
- lorax/loaders/tskit_loader.py +119 -0
- lorax/lorax_app.py +75 -0
- lorax/manager.py +58 -0
- lorax/metadata/__init__.py +0 -0
- lorax/metadata/loader.py +426 -0
- lorax/metadata/mutations.py +146 -0
- lorax/modes.py +190 -0
- lorax/pg.py +183 -0
- lorax/redis_utils.py +30 -0
- lorax/routes.py +137 -0
- lorax/session_manager.py +206 -0
- lorax/sockets/__init__.py +55 -0
- lorax/sockets/connection.py +99 -0
- lorax/sockets/debug.py +47 -0
- lorax/sockets/decorators.py +112 -0
- lorax/sockets/file_ops.py +200 -0
- lorax/sockets/lineage.py +307 -0
- lorax/sockets/metadata.py +232 -0
- lorax/sockets/mutations.py +154 -0
- lorax/sockets/node_search.py +535 -0
- lorax/sockets/tree_layout.py +117 -0
- lorax/sockets/utils.py +10 -0
- lorax/tree_graph/__init__.py +12 -0
- lorax/tree_graph/tree_graph.py +689 -0
- lorax/utils.py +124 -0
- lorax_app/__init__.py +4 -0
- lorax_app/app.py +159 -0
- lorax_app/cli.py +114 -0
- lorax_app/static/X.png +0 -0
- lorax_app/static/assets/index-BCEGlUFi.js +2361 -0
- lorax_app/static/assets/index-iKjzUpA9.css +1 -0
- lorax_app/static/assets/localBackendWorker-BaWwjSV_.js +2 -0
- lorax_app/static/assets/renderDataWorker-BKLdiU7J.js +2 -0
- lorax_app/static/gestures/gesture-flick.ogv +0 -0
- lorax_app/static/gestures/gesture-two-finger-scroll.ogv +0 -0
- lorax_app/static/index.html +14 -0
- lorax_app/static/logo.png +0 -0
- lorax_app/static/lorax-logo.png +0 -0
- lorax_app/static/vite.svg +1 -0
- lorax_arg-0.1.dist-info/METADATA +131 -0
- lorax_arg-0.1.dist-info/RECORD +66 -0
- lorax_arg-0.1.dist-info/WHEEL +5 -0
- lorax_arg-0.1.dist-info/entry_points.txt +4 -0
- lorax_arg-0.1.dist-info/top_level.txt +2 -0
lorax/buffer.py
ADDED
@@ -0,0 +1,43 @@
+
+import pyarrow as pa
+
+def mutations_to_arrow_buffer(mutations_data):
+    """
+    Convert mutations list to PyArrow IPC buffer for efficient transfer.
+
+    Args:
+        mutations_data: dict with 'mutations' list from get_mutations_in_window or search_mutations_by_position
+
+    Returns:
+        bytes: PyArrow IPC serialized buffer
+    """
+    mutations = mutations_data.get('mutations', [])
+
+    if not mutations:
+        # Return empty table with correct schema
+        table = pa.table({
+            'position': pa.array([], type=pa.int64()),
+            'mutation': pa.array([], type=pa.string()),
+            'node_id': pa.array([], type=pa.int32()),
+            'site_id': pa.array([], type=pa.int32()),
+            'ancestral_state': pa.array([], type=pa.string()),
+            'derived_state': pa.array([], type=pa.string()),
+            'distance': pa.array([], type=pa.int64()),
+        })
+    else:
+        table = pa.table({
+            'position': pa.array([m['position'] for m in mutations], type=pa.int64()),
+            'mutation': pa.array([m['mutation'] for m in mutations], type=pa.string()),
+            'node_id': pa.array([m['node_id'] for m in mutations], type=pa.int32()),
+            'site_id': pa.array([m['site_id'] for m in mutations], type=pa.int32()),
+            'ancestral_state': pa.array([m['ancestral_state'] for m in mutations], type=pa.string()),
+            'derived_state': pa.array([m['derived_state'] for m in mutations], type=pa.string()),
+            'distance': pa.array([m.get('distance', 0) for m in mutations], type=pa.int64()),
+        })
+
+    sink = pa.BufferOutputStream()
+    writer = pa.ipc.new_stream(sink, table.schema)
+    writer.write_table(table)
+    writer.close()
+
+    return sink.getvalue().to_pybytes()
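A usage sketch (not part of the package) showing how a consumer of mutations_to_arrow_buffer could rebuild the table from the returned IPC bytes with PyArrow's stream reader. The sample record mirrors the keys the function accesses above; the payload values are illustrative.

    import pyarrow as pa

    from lorax.buffer import mutations_to_arrow_buffer

    # Illustrative payload shaped like the 'mutations' results consumed above.
    payload = {"mutations": [{
        "position": 1234, "mutation": "A>G", "node_id": 7, "site_id": 3,
        "ancestral_state": "A", "derived_state": "G",
    }]}

    buf = mutations_to_arrow_buffer(payload)

    # The receiver reconstructs the table from the IPC stream bytes.
    table = pa.ipc.open_stream(buf).read_all()
    print(table.num_rows, table.column_names)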
lorax/cache/__init__.py
ADDED
@@ -0,0 +1,43 @@
+"""
+Lorax Caching System.
+
+This package provides consolidated caching infrastructure:
+- LRUCache, LRUCacheWithMeta: In-memory LRU caches with eviction
+- DiskCacheManager: LRU disk cache with distributed locking for GCS downloads
+- TreeGraphCache: Per-session caching of TreeGraph objects
+- FileContext: Unified cache entry combining tree sequence, config, and metadata
+- get_file_context: Cached file loading with mtime validation
+
+The FileContext-based caching provides atomic invalidation: when a file is
+evicted from cache, its tree sequence, config, and all metadata are evicted
+together, preventing orphan metadata.
+"""
+
+from lorax.cache.lru import LRUCache, LRUCacheWithMeta
+from lorax.cache.disk import DiskCacheManager
+from lorax.cache.tree_graph import TreeGraphCache
+from lorax.cache.csv_tree_graph import CsvTreeGraphCache
+from lorax.cache.file_context import FileContext
+from lorax.cache.file_cache import (
+    get_file_context,
+    get_file_cache_size,
+    # Backwards compatibility
+    get_or_load_ts,
+    get_ts_cache_size,
+)
+
+__all__ = [
+    # Core cache classes
+    "LRUCache",
+    "LRUCacheWithMeta",
+    "DiskCacheManager",
+    "TreeGraphCache",
+    "CsvTreeGraphCache",
+    # Unified file caching (preferred API)
+    "FileContext",
+    "get_file_context",
+    "get_file_cache_size",
+    # Backwards compatibility
+    "get_or_load_ts",
+    "get_ts_cache_size",
+]
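A minimal, illustrative sketch (not part of the package) of how these re-exports are meant to be consumed: new code imports the FileContext-based names from lorax.cache, while older call sites keep working through the backwards-compatible aliases. Call signatures are not visible in this diff, so only the imports are shown.

    # Preferred, FileContext-based API re-exported by lorax.cache
    from lorax.cache import FileContext, get_file_context, get_file_cache_size

    # Backwards-compatible aliases kept for existing call sites
    from lorax.cache import get_or_load_ts, get_ts_cache_size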
lorax/cache/csv_tree_graph.py
ADDED
@@ -0,0 +1,59 @@
+"""
+CSV Newick Tree cache for per-session caching of parsed Newick trees.
+
+This is a lightweight in-memory cache (no Redis) that mirrors the small subset
+of the TreeGraphCache interface used by the layout pipeline:
+- get / set per (session_id, tree_index)
+- clear_session on file load
+- evict_not_visible for viewport-based eviction
+"""
+
+import asyncio
+from collections import OrderedDict
+from typing import Dict, Optional, TYPE_CHECKING
+
+if TYPE_CHECKING:
+    from lorax.csv.newick_tree import NewickTreeGraph
+
+
+class CsvTreeGraphCache:
+    """
+    Per-session cache for parsed NewickTreeGraph objects (CSV mode).
+
+    NOTE: This intentionally stays in-memory only to keep it simple.
+    """
+
+    def __init__(self):
+        # session_id -> OrderedDict{tree_index -> NewickTreeGraph}
+        self._local_cache: Dict[str, OrderedDict] = {}
+        self._lock = asyncio.Lock()
+        print("CsvTreeGraphCache initialized (in-memory)")
+
+    async def get(self, session_id: str, tree_index: int) -> Optional["NewickTreeGraph"]:
+        session_cache = self._local_cache.get(session_id)
+        if session_cache and tree_index in session_cache:
+            session_cache.move_to_end(tree_index)
+            return session_cache[tree_index]
+        return None
+
+    async def set(self, session_id: str, tree_index: int, graph: "NewickTreeGraph") -> None:
+        async with self._lock:
+            if session_id not in self._local_cache:
+                self._local_cache[session_id] = OrderedDict()
+            self._local_cache[session_id][tree_index] = graph
+
+    async def clear_session(self, session_id: str) -> None:
+        async with self._lock:
+            if session_id in self._local_cache:
+                del self._local_cache[session_id]
+
+    async def evict_not_visible(self, session_id: str, visible_indices: set) -> int:
+        async with self._lock:
+            session_cache = self._local_cache.get(session_id)
+            if not session_cache:
+                return 0
+            to_delete = [idx for idx in session_cache.keys() if idx not in visible_indices]
+            for idx in to_delete:
+                session_cache.pop(idx, None)
+            return len(to_delete)
+
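An illustrative usage sketch (not part of the package) of the CsvTreeGraphCache shown above; the session id is a placeholder and parsed_tree stands in for a NewickTreeGraph produced by lorax.csv.newick_tree.

    import asyncio

    from lorax.cache.csv_tree_graph import CsvTreeGraphCache

    async def demo(parsed_tree):
        cache = CsvTreeGraphCache()

        # Store a parsed tree under (session_id, tree_index) and read it back.
        await cache.set("session-abc", 0, parsed_tree)
        assert await cache.get("session-abc", 0) is parsed_tree

        # Viewport changed: drop every cached tree except indices 0 and 1.
        evicted = await cache.evict_not_visible("session-abc", visible_indices={0, 1})
        print(f"evicted {evicted} cached trees")

        # A new file was loaded for this session: clear its cache entirely.
        await cache.clear_session("session-abc")

    # asyncio.run(demo(parsed_tree)) once a parsed tree is available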
lorax/cache/disk.py
ADDED
@@ -0,0 +1,467 @@
+"""
+Disk Cache Manager for Lorax
+
+LRU disk cache with distributed locking for GCS file downloads.
+Supports three modes: local, development, production.
+"""
+
+import os
+import json
+import hashlib
+import asyncio
+import fcntl
+from pathlib import Path
+from datetime import datetime, timezone
+from typing import Optional, Dict, Any
+from dataclasses import dataclass, asdict
+
+import aiofiles
+
+
+@dataclass
+class CachedFile:
+    """Metadata for a cached file."""
+    gcs_path: str
+    local_path: str
+    size_bytes: int
+    last_access: str
+    download_complete: bool
+    etag: Optional[str] = None
+
+    def to_dict(self) -> Dict[str, Any]:
+        return asdict(self)
+
+    @staticmethod
+    def from_dict(data: Dict[str, Any]) -> "CachedFile":
+        return CachedFile(**data)
+
+
+class DiskCacheManifest:
+    """Thread-safe manifest for tracking cached files."""
+
+    def __init__(self, manifest_path: Path):
+        self.manifest_path = manifest_path
+        self._lock = asyncio.Lock()
+
+    async def load(self) -> Dict[str, Any]:
+        """Load manifest from disk."""
+        if not self.manifest_path.exists():
+            return {"version": 1, "files": {}, "total_size_bytes": 0}
+
+        try:
+            async with aiofiles.open(self.manifest_path, "r") as f:
+                content = await f.read()
+                return json.loads(content) if content else {"version": 1, "files": {}, "total_size_bytes": 0}
+        except (json.JSONDecodeError, IOError) as e:
+            print(f"Warning: Failed to load manifest: {e}")
+            return {"version": 1, "files": {}, "total_size_bytes": 0}
+
+    async def save(self, data: Dict[str, Any]):
+        """Save manifest to disk atomically."""
+        self.manifest_path.parent.mkdir(parents=True, exist_ok=True)
+        tmp_path = self.manifest_path.with_suffix(".tmp")
+
+        async with aiofiles.open(tmp_path, "w") as f:
+            await f.write(json.dumps(data, indent=2))
+
+        # Atomic rename
+        tmp_path.rename(self.manifest_path)
+
+    async def get_file(self, cache_key: str) -> Optional[CachedFile]:
+        """Get cached file metadata."""
+        async with self._lock:
+            data = await self.load()
+            file_data = data["files"].get(cache_key)
+            if file_data:
+                return CachedFile.from_dict(file_data)
+            return None
+
+    async def set_file(self, cache_key: str, cached_file: CachedFile):
+        """Set cached file metadata."""
+        async with self._lock:
+            data = await self.load()
+
+            # Update total size
+            old_file = data["files"].get(cache_key)
+            if old_file:
+                data["total_size_bytes"] -= old_file.get("size_bytes", 0)
+
+            data["files"][cache_key] = cached_file.to_dict()
+            data["total_size_bytes"] += cached_file.size_bytes
+
+            await self.save(data)
+
+    async def update_access_time(self, cache_key: str):
+        """Update last access time for a cached file."""
+        async with self._lock:
+            data = await self.load()
+            if cache_key in data["files"]:
+                data["files"][cache_key]["last_access"] = datetime.now(timezone.utc).isoformat()
+                await self.save(data)
+
+    async def remove_file(self, cache_key: str) -> Optional[CachedFile]:
+        """Remove a file from the manifest."""
+        async with self._lock:
+            data = await self.load()
+            file_data = data["files"].pop(cache_key, None)
+            if file_data:
+                data["total_size_bytes"] -= file_data.get("size_bytes", 0)
+                await self.save(data)
+                return CachedFile.from_dict(file_data)
+            return None
+
+    async def get_total_size(self) -> int:
+        """Get total size of cached files in bytes."""
+        data = await self.load()
+        return data.get("total_size_bytes", 0)
+
+    async def get_files_by_access_time(self) -> list:
+        """Get all files sorted by last access time (oldest first)."""
+        data = await self.load()
+        files = []
+        for cache_key, file_data in data["files"].items():
+            files.append((cache_key, CachedFile.from_dict(file_data)))
+
+        # Sort by last_access (oldest first)
+        files.sort(key=lambda x: x[1].last_access)
+        return files
+
+
+class FileLock:
+    """File-based lock for single-process or fallback mode."""
+
+    def __init__(self, lock_path: Path):
+        self.lock_path = lock_path
+        self._fd = None
+
+    async def acquire(self, timeout: float = 300.0) -> bool:
+        """Acquire the lock with timeout."""
+        self.lock_path.parent.mkdir(parents=True, exist_ok=True)
+
+        start_time = asyncio.get_event_loop().time()
+        while True:
+            try:
+                self._fd = open(self.lock_path, "w")
+                fcntl.flock(self._fd.fileno(), fcntl.LOCK_EX | fcntl.LOCK_NB)
+                return True
+            except (IOError, OSError):
+                if self._fd:
+                    self._fd.close()
+                    self._fd = None
+
+                elapsed = asyncio.get_event_loop().time() - start_time
+                if elapsed >= timeout:
+                    return False
+
+                await asyncio.sleep(0.1)
+
+    async def release(self):
+        """Release the lock."""
+        if self._fd:
+            try:
+                fcntl.flock(self._fd.fileno(), fcntl.LOCK_UN)
+                self._fd.close()
+            except (IOError, OSError):
+                pass
+            finally:
+                self._fd = None
+
+
+class RedisLock:
+    """Redis-based distributed lock for multi-worker deployments."""
+
+    def __init__(self, redis_client, lock_key: str, timeout_ms: int = 300000):
+        self.redis = redis_client
+        self.lock_key = lock_key
+        self.timeout_ms = timeout_ms
+        self._lock_value = None
+
+    async def acquire(self, timeout: float = 300.0) -> bool:
+        """Acquire the lock with timeout."""
+        import uuid
+        self._lock_value = str(uuid.uuid4())
+
+        start_time = asyncio.get_event_loop().time()
+        while True:
+            # SET key value NX PX timeout_ms
+            acquired = await self.redis.set(
+                self.lock_key,
+                self._lock_value,
+                nx=True,
+                px=self.timeout_ms
+            )
+            if acquired:
+                return True
+
+            elapsed = asyncio.get_event_loop().time() - start_time
+            if elapsed >= timeout:
+                return False
+
+            await asyncio.sleep(0.1)
+
+    async def release(self):
+        """Release the lock if we own it."""
+        if self._lock_value:
+            # Only release if we own it (compare-and-delete)
+            lua_script = """
+            if redis.call("get", KEYS[1]) == ARGV[1] then
+                return redis.call("del", KEYS[1])
+            else
+                return 0
+            end
+            """
+            try:
+                await self.redis.eval(lua_script, 1, self.lock_key, self._lock_value)
+            except Exception as e:
+                print(f"Warning: Failed to release Redis lock: {e}")
+            finally:
+                self._lock_value = None
+
+
+class DiskCacheManager:
+    """
+    LRU disk cache manager for GCS file downloads.
+
+    Features:
+    - 50GB (configurable) LRU eviction
+    - Atomic downloads (temp file + rename)
+    - Distributed locking (Redis or file-based)
+    - Access time tracking
+    """
+
+    def __init__(
+        self,
+        cache_dir: Path,
+        max_size_bytes: int,
+        redis_client=None,
+        enabled: bool = True
+    ):
+        self.cache_dir = Path(cache_dir)
+        self.max_size_bytes = max_size_bytes
+        self.redis = redis_client
+        self.enabled = enabled
+
+        self.files_dir = self.cache_dir / "files"
+        self.locks_dir = self.cache_dir / "locks"
+        self.manifest = DiskCacheManifest(self.cache_dir / "manifest.json")
+
+        # Create directories
+        if self.enabled:
+            self.files_dir.mkdir(parents=True, exist_ok=True)
+            self.locks_dir.mkdir(parents=True, exist_ok=True)
+
+    def _get_cache_key(self, gcs_bucket: str, gcs_path: str) -> str:
+        """Generate cache key from GCS path."""
+        full_path = f"{gcs_bucket}/{gcs_path}"
+        return hashlib.sha256(full_path.encode()).hexdigest()[:16]
+
+    def _get_local_path(self, cache_key: str, gcs_path: str) -> Path:
+        """Get local file path for a cache key."""
+        # Preserve extension for tskit compatibility
+        ext = Path(gcs_path).suffix or ".dat"
+        return self.files_dir / f"{cache_key}{ext}"
+
+    async def _acquire_lock(self, cache_key: str) -> Any:
+        """Acquire download lock (Redis or file-based)."""
+        if self.redis:
+            lock = RedisLock(self.redis, f"lorax:download:{cache_key}")
+        else:
+            lock = FileLock(self.locks_dir / f"{cache_key}.lock")
+
+        acquired = await lock.acquire()
+        if not acquired:
+            raise TimeoutError(f"Failed to acquire lock for {cache_key}")
+
+        return lock
+
+    async def _release_lock(self, lock: Any):
+        """Release download lock."""
+        await lock.release()
+
+    async def get_cached_path(self, gcs_bucket: str, gcs_path: str) -> Optional[Path]:
+        """
+        Get path to cached file if it exists and is valid.
+        Updates access time on hit.
+        """
+        if not self.enabled:
+            return None
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+        cached_file = await self.manifest.get_file(cache_key)
+
+        if cached_file and cached_file.download_complete:
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                # Update access time
+                await self.manifest.update_access_time(cache_key)
+                print(f"Cache hit: {gcs_path}")
+                return local_path
+            else:
+                # File was deleted externally, remove from manifest
+                await self.manifest.remove_file(cache_key)
+
+        return None
+
+    async def evict_if_needed(self, required_bytes: int = 0):
+        """Evict oldest files until we have space for required_bytes."""
+        if not self.enabled:
+            return
+
+        target_size = self.max_size_bytes - required_bytes
+        current_size = await self.manifest.get_total_size()
+
+        if current_size <= target_size:
+            return
+
+        # Get files sorted by access time (oldest first)
+        files = await self.manifest.get_files_by_access_time()
+
+        for cache_key, cached_file in files:
+            if current_size <= target_size:
+                break
+
+            # Delete file
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                try:
+                    local_path.unlink()
+                    print(f"Evicted: {cached_file.gcs_path} ({cached_file.size_bytes / 1024 / 1024:.1f} MB)")
+                except OSError as e:
+                    print(f"Warning: Failed to delete {local_path}: {e}")
+
+            # Remove from manifest
+            await self.manifest.remove_file(cache_key)
+            current_size -= cached_file.size_bytes
+
+    async def cache_file(
+        self,
+        gcs_bucket: str,
+        gcs_path: str,
+        local_path: Path,
+        size_bytes: int,
+        etag: Optional[str] = None
+    ):
+        """Register a downloaded file in the cache."""
+        if not self.enabled:
+            return
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+
+        cached_file = CachedFile(
+            gcs_path=f"{gcs_bucket}/{gcs_path}",
+            local_path=str(local_path),
+            size_bytes=size_bytes,
+            last_access=datetime.now(timezone.utc).isoformat(),
+            download_complete=True,
+            etag=etag
+        )
+
+        await self.manifest.set_file(cache_key, cached_file)
+        print(f"Cached: {gcs_path} ({size_bytes / 1024 / 1024:.1f} MB)")
+
+    async def get_or_download(
+        self,
+        gcs_bucket: str,
+        gcs_path: str,
+        download_func
+    ) -> Path:
+        """
+        Get file from cache or download using provided function.
+
+        Args:
+            gcs_bucket: GCS bucket name
+            gcs_path: Path within bucket
+            download_func: async function(local_path) that downloads file
+
+        Returns:
+            Path to local file
+        """
+        # Fast path: check cache without lock
+        cached_path = await self.get_cached_path(gcs_bucket, gcs_path)
+        if cached_path:
+            return cached_path
+
+        if not self.enabled:
+            # Cache disabled, download directly
+            cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+            local_path = self._get_local_path(cache_key, gcs_path)
+            await download_func(str(local_path))
+            return local_path
+
+        cache_key = self._get_cache_key(gcs_bucket, gcs_path)
+        local_path = self._get_local_path(cache_key, gcs_path)
+
+        # Acquire distributed lock
+        lock = await self._acquire_lock(cache_key)
+        try:
+            # Double-check after acquiring lock
+            cached_path = await self.get_cached_path(gcs_bucket, gcs_path)
+            if cached_path:
+                return cached_path
+
+            # Estimate size and evict if needed (conservative estimate)
+            await self.evict_if_needed(required_bytes=1024 * 1024 * 1024) # 1GB buffer
+
+            # Download to temp file
+            tmp_path = local_path.with_suffix(local_path.suffix + ".tmp")
+            try:
+                await download_func(str(tmp_path))
+
+                # Get actual size
+                size_bytes = tmp_path.stat().st_size
+
+                # Evict with actual size if needed
+                await self.evict_if_needed(required_bytes=size_bytes)
+
+                # Atomic rename
+                tmp_path.rename(local_path)
+
+                # Register in cache
+                await self.cache_file(gcs_bucket, gcs_path, local_path, size_bytes)
+
+                return local_path
+
+            except Exception as e:
+                # Cleanup temp file on failure
+                if tmp_path.exists():
+                    tmp_path.unlink()
+                raise
+
+        finally:
+            await self._release_lock(lock)
+
+    async def get_stats(self) -> Dict[str, Any]:
+        """Get cache statistics."""
+        if not self.enabled:
+            return {"enabled": False}
+
+        total_size = await self.manifest.get_total_size()
+        data = await self.manifest.load()
+
+        return {
+            "enabled": True,
+            "total_size_bytes": total_size,
+            "total_size_mb": round(total_size / 1024 / 1024, 2),
+            "max_size_mb": round(self.max_size_bytes / 1024 / 1024, 2),
+            "usage_percent": round(total_size / self.max_size_bytes * 100, 1) if self.max_size_bytes > 0 else 0,
+            "file_count": len(data["files"]),
+            "cache_dir": str(self.cache_dir),
+        }
+
+    async def clear(self):
+        """Clear all cached files."""
+        if not self.enabled:
+            return
+
+        files = await self.manifest.get_files_by_access_time()
+
+        for cache_key, cached_file in files:
+            local_path = Path(cached_file.local_path)
+            if local_path.exists():
+                try:
+                    local_path.unlink()
+                except OSError:
+                    pass
+            await self.manifest.remove_file(cache_key)
+
+        print("Disk cache cleared")
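An illustrative sketch (not part of the package) of driving DiskCacheManager.get_or_download with a stand-in download callable. The cache directory, size limit, bucket, and object path are assumptions for the example; a real deployment would pass the GCS download helper (see lorax/cloud/gcs_utils.py) and, in multi-worker setups, a Redis client so RedisLock is used instead of FileLock.

    import asyncio
    from pathlib import Path

    from lorax.cache.disk import DiskCacheManager

    async def main():
        # ~10 GB LRU disk cache; no Redis client, so file-based locking is used.
        manager = DiskCacheManager(cache_dir=Path("/tmp/lorax-cache"),
                                   max_size_bytes=10 * 1024**3)

        async def fake_download(local_path: str):
            # Stand-in for the real GCS download: write bytes to the temp
            # path chosen by the cache manager (it renames it atomically).
            Path(local_path).write_bytes(b"example contents")

        local = await manager.get_or_download("my-bucket", "data/example.trees", fake_download)
        print("cached at:", local)
        print(await manager.get_stats())

    asyncio.run(main())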