lorax_arg-0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lorax/buffer.py +43 -0
- lorax/cache/__init__.py +43 -0
- lorax/cache/csv_tree_graph.py +59 -0
- lorax/cache/disk.py +467 -0
- lorax/cache/file_cache.py +142 -0
- lorax/cache/file_context.py +72 -0
- lorax/cache/lru.py +90 -0
- lorax/cache/tree_graph.py +293 -0
- lorax/cli.py +312 -0
- lorax/cloud/__init__.py +0 -0
- lorax/cloud/gcs_utils.py +205 -0
- lorax/constants.py +66 -0
- lorax/context.py +80 -0
- lorax/csv/__init__.py +7 -0
- lorax/csv/config.py +250 -0
- lorax/csv/layout.py +182 -0
- lorax/csv/newick_tree.py +234 -0
- lorax/handlers.py +998 -0
- lorax/lineage.py +456 -0
- lorax/loaders/__init__.py +0 -0
- lorax/loaders/csv_loader.py +10 -0
- lorax/loaders/loader.py +31 -0
- lorax/loaders/tskit_loader.py +119 -0
- lorax/lorax_app.py +75 -0
- lorax/manager.py +58 -0
- lorax/metadata/__init__.py +0 -0
- lorax/metadata/loader.py +426 -0
- lorax/metadata/mutations.py +146 -0
- lorax/modes.py +190 -0
- lorax/pg.py +183 -0
- lorax/redis_utils.py +30 -0
- lorax/routes.py +137 -0
- lorax/session_manager.py +206 -0
- lorax/sockets/__init__.py +55 -0
- lorax/sockets/connection.py +99 -0
- lorax/sockets/debug.py +47 -0
- lorax/sockets/decorators.py +112 -0
- lorax/sockets/file_ops.py +200 -0
- lorax/sockets/lineage.py +307 -0
- lorax/sockets/metadata.py +232 -0
- lorax/sockets/mutations.py +154 -0
- lorax/sockets/node_search.py +535 -0
- lorax/sockets/tree_layout.py +117 -0
- lorax/sockets/utils.py +10 -0
- lorax/tree_graph/__init__.py +12 -0
- lorax/tree_graph/tree_graph.py +689 -0
- lorax/utils.py +124 -0
- lorax_app/__init__.py +4 -0
- lorax_app/app.py +159 -0
- lorax_app/cli.py +114 -0
- lorax_app/static/X.png +0 -0
- lorax_app/static/assets/index-BCEGlUFi.js +2361 -0
- lorax_app/static/assets/index-iKjzUpA9.css +1 -0
- lorax_app/static/assets/localBackendWorker-BaWwjSV_.js +2 -0
- lorax_app/static/assets/renderDataWorker-BKLdiU7J.js +2 -0
- lorax_app/static/gestures/gesture-flick.ogv +0 -0
- lorax_app/static/gestures/gesture-two-finger-scroll.ogv +0 -0
- lorax_app/static/index.html +14 -0
- lorax_app/static/logo.png +0 -0
- lorax_app/static/lorax-logo.png +0 -0
- lorax_app/static/vite.svg +1 -0
- lorax_arg-0.1.dist-info/METADATA +131 -0
- lorax_arg-0.1.dist-info/RECORD +66 -0
- lorax_arg-0.1.dist-info/WHEEL +5 -0
- lorax_arg-0.1.dist-info/entry_points.txt +4 -0
- lorax_arg-0.1.dist-info/top_level.txt +2 -0
lorax/manager.py
ADDED
@@ -0,0 +1,58 @@
from fastapi.websockets import WebSocket, WebSocketState, WebSocketDisconnect


class WebSocketManager:
    def __init__(self):
        self.connected_clients = set()
        self.client_component = {}  # websocket -> set(component names)

    async def connect(self, websocket: WebSocket):
        client_ip = f"{websocket.client.host}:{websocket.client.port}"
        await websocket.accept()
        self.connected_clients.add(websocket)

    async def cleanup_disconnected_clients(self):
        dead = [ws for ws in self.connected_clients if not self.is_connected(ws)]
        for ws in dead:
            await self.disconnect(ws)

    def get_connected_clients(self):
        """Get list of currently connected clients"""
        return [client for client in self.connected_clients if self.is_connected(client)]

    def is_connected(self, ws: WebSocket) -> bool:
        try:
            return ws.client_state == WebSocketState.CONNECTED
        except Exception:
            return False

    async def send_message(self, ws: WebSocket, message: dict):
        if self.is_connected(ws):
            try:
                await ws.send_json(message)
            except Exception as e:
                print(f"send_message error: {e}")
                await self.disconnect(ws)
        else:
            await self.disconnect(ws)

    async def disconnect(self, websocket: WebSocket):
        if websocket in self.connected_clients:
            self.connected_clients.remove(websocket)
        if websocket in self.client_component:
            del self.client_component[websocket]

    async def register_component(self, ws: WebSocket, component: str):
        self.client_component.setdefault(ws, set()).add(component)

    async def send_to_component(self, component: str, message: dict):
        """Send only to sockets in *this* session that registered that component."""
        targets = [
            ws for ws, comps in self.client_component.items()
            if component in comps and self.is_connected(ws)
        ]
        if not targets:
            print(f"⚠️ No sockets registered for component: {component}")
            return
        for ws in targets:
            await self.send_message(ws, message)
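
Usage note (illustrative, not part of the package): a minimal sketch of how this manager would typically be driven from a FastAPI websocket route. The route path "/ws" and the component name "tree_view" are assumptions for illustration only.

    from fastapi import FastAPI, WebSocket

    app = FastAPI()
    manager = WebSocketManager()

    @app.websocket("/ws")  # hypothetical route path
    async def ws_endpoint(websocket: WebSocket):
        await manager.connect(websocket)  # accepts and tracks the socket
        await manager.register_component(websocket, "tree_view")  # hypothetical component name
        try:
            while True:
                await websocket.receive_json()  # keep the connection open
        finally:
            await manager.disconnect(websocket)

    # Elsewhere in the app, fan a message out to every socket that
    # registered the "tree_view" component:
    #     await manager.send_to_component("tree_view", {"type": "refresh"})
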
lorax/metadata/loader.py
ADDED
@@ -0,0 +1,426 @@
"""
Metadata extraction and caching for tree sequences.

Functions accept FileContext and use its nested metadata cache.
When a FileContext is evicted, its metadata cache is evicted together.
"""

import json
import numpy as np
import tskit
import pyarrow as pa
from collections import defaultdict
from lorax.utils import ensure_json_dict, make_json_safe, make_json_serializable


def get_metadata_for_key(
    ctx,
    key,
    sources=("individual", "node", "population"),
    sample_name_key="name"
):
    """
    Get sample-to-value mapping for a specific metadata key.
    Results are cached in the FileContext's nested metadata cache.

    Parameters
    ----------
    ctx : FileContext
        The file context containing tree_sequence and metadata cache
    key : str
        The metadata key to extract
    sources : tuple
        Any of ("individual", "node", "population")
    sample_name_key : str
        Key in node metadata used as sample name

    Returns
    -------
    dict
        {sample_name: value} for the specified key
    """
    # Check nested cache in FileContext
    cached = ctx.get_metadata(key)
    if cached is not None:
        print(f"✅ Using cached metadata for key: {key}")
        return cached

    ts = ctx.tree_sequence

    # Special handling for "sample" key - each sample's value is its own name
    if key == "sample":
        result = {}
        for node_id in ts.samples():
            node = ts.node(node_id)
            node_meta = node.metadata or {}
            try:
                node_meta = ensure_json_dict(node_meta)
            except (TypeError, json.JSONDecodeError):
                node_meta = {}
            sample_name = str(node_meta.get(sample_name_key, f"{node_id}"))
            result[sample_name] = sample_name
        ctx.set_metadata(key, result)
        return result

    result = {}

    for node_id in ts.samples():
        node = ts.node(node_id)
        node_meta = node.metadata or {}
        node_meta = ensure_json_dict(node_meta)
        sample_name = node_meta.get(sample_name_key, f"{node_id}")

        for source in sources:
            if source == "individual":
                if node.individual == tskit.NULL:
                    continue
                meta = ts.individual(node.individual).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            elif source == "node":
                meta = node_meta

            elif source == "population":
                if node.population == tskit.NULL:
                    continue
                meta = ts.population(node.population).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            else:
                continue

            if not meta:
                continue

            if key in meta:
                value = meta[key]
                if value is None:
                    break  # Skip None values
                if isinstance(value, (list, dict)):
                    value = repr(value)
                result[sample_name] = str(value)
                break  # Found the key, move to next sample

    ctx.set_metadata(key, result)
    return result
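
Usage note (illustrative, not part of the package): a sketch of the caching behavior, assuming ctx is an already-loaded FileContext (see lorax/cache/file_context.py). The first call walks the tree sequence; the repeat call is served from the context's nested metadata cache.

    mapping = get_metadata_for_key(ctx, "population", sources=("population",))  # computed
    mapping = get_metadata_for_key(ctx, "population", sources=("population",))  # cache hit
    # mapping is {sample_name: value}, with values coerced to str
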
def search_samples_by_metadata(
    ctx,
    key,
    value,
    sources=("individual", "node", "population"),
    sample_name_key="name"
):
    """
    Search for samples that have a specific value for a metadata key.

    Parameters
    ----------
    ctx : FileContext
        The file context containing tree_sequence and metadata cache
    key : str
        The metadata key to search
    value : str
        The value to match
    sources : tuple
        Any of ("individual", "node", "population")
    sample_name_key : str
        Key in node metadata used as sample name

    Returns
    -------
    list
        List of sample names matching the criteria
    """
    ts = ctx.tree_sequence

    # Try to use cached metadata if available
    cached = ctx.get_metadata(key)

    if cached is not None:
        # Use cached data for fast lookup
        return [sample for sample, val in cached.items() if str(val) == str(value)]

    # If not cached, compute on the fly
    matching_samples = []

    for node_id in ts.samples():
        node = ts.node(node_id)
        node_meta = node.metadata or {}
        node_meta = ensure_json_dict(node_meta)
        sample_name = node_meta.get(sample_name_key, f"{node_id}")

        for source in sources:
            if source == "individual":
                if node.individual == tskit.NULL:
                    continue
                meta = ts.individual(node.individual).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            elif source == "node":
                meta = node_meta

            elif source == "population":
                if node.population == tskit.NULL:
                    continue
                meta = ts.population(node.population).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            else:
                continue

            if not meta:
                continue

            if key in meta:
                meta_value = meta[key]
                if meta_value is None:
                    break  # Skip None values
                if isinstance(meta_value, (list, dict)):
                    meta_value = repr(meta_value)
                if str(meta_value) == str(value):
                    matching_samples.append(sample_name)
                break

    return matching_samples
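
Usage note (illustrative, not part of the package): the search piggybacks on the same per-key cache, so calling get_metadata_for_key first makes repeated searches on that key cheap. The key "region" and value "EU" are hypothetical.

    hits = search_samples_by_metadata(ctx, "region", "EU")  # list of sample names
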
def _get_sample_metadata_value(ts, node_id, key, sources, sample_name_key="name"):
    """
    Helper to get a specific metadata value for a sample node.
    Returns (sample_name, value) tuple.
    """
    node = ts.node(node_id)
    node_meta = node.metadata or {}
    node_meta = ensure_json_dict(node_meta)
    sample_name = node_meta.get(sample_name_key, f"{node_id}")

    # Special handling for "sample" key: it is not a real metadata field in tskit.
    # Treat it as identity so "sample" searches/highlights match by sample name.
    if key == "sample":
        sample_name = str(sample_name)
        return (sample_name, sample_name)

    for source in sources:
        if source == "individual":
            if node.individual == tskit.NULL:
                continue
            meta = ts.individual(node.individual).metadata
            meta = meta or {}
            meta = ensure_json_dict(meta)

        elif source == "node":
            meta = node_meta

        elif source == "population":
            if node.population == tskit.NULL:
                continue
            meta = ts.population(node.population).metadata
            meta = meta or {}
            meta = ensure_json_dict(meta)

        else:
            continue

        if not meta:
            continue

        if key in meta:
            value = meta[key]
            if isinstance(value, (list, dict)):
                value = repr(value)
            return (sample_name, value)

    return (sample_name, None)

def get_metadata_array_for_key(
    ctx,
    key,
    sources=("individual", "node", "population"),
    sample_name_key="name"
):
    """
    Build efficient array-based metadata for a key using PyArrow.

    Returns indices array where indices[i] is the index into unique_values
    for the i-th sample (ordered by node_id from ts.samples()).

    Parameters
    ----------
    ctx : FileContext
        The file context containing tree_sequence and metadata cache
    key : str
        The metadata key to extract
    sources : tuple
        Any of ("individual", "node", "population")
    sample_name_key : str
        Key in node metadata used as sample name

    Returns
    -------
    dict
        {
            'unique_values': [val0, val1, ...],            # Index i -> value string
            'sample_node_ids': [node_id0, node_id1, ...],  # Sample order
            'arrow_buffer': bytes                          # PyArrow IPC serialized indices
        }
    """
    cache_key = f"{key}:array"
    cached = ctx.get_metadata(cache_key)
    if cached is not None:
        print(f"✅ Using cached metadata array for key: {key}")
        return cached

    ts = ctx.tree_sequence
    sample_ids = list(ts.samples())
    n_samples = len(sample_ids)

    # Special handling for "sample" key - each sample's name is its own unique value
    if key == "sample":
        unique_values = []
        value_to_idx = {}
        indices = np.zeros(n_samples, dtype=np.uint32)

        for i, node_id in enumerate(sample_ids):
            node = ts.node(node_id)
            node_meta = node.metadata or {}
            try:
                node_meta = ensure_json_dict(node_meta)
            except (TypeError, json.JSONDecodeError):
                node_meta = {}
            sample_name = str(node_meta.get(sample_name_key, f"{node_id}"))

            if sample_name not in value_to_idx:
                value_to_idx[sample_name] = len(unique_values)
                unique_values.append(sample_name)

            indices[i] = value_to_idx[sample_name]

        # Serialize to Arrow IPC format
        table = pa.table({'idx': pa.array(indices, type=pa.uint32())})
        sink = pa.BufferOutputStream()
        writer = pa.ipc.new_stream(sink, table.schema)
        writer.write_table(table)
        writer.close()

        result = {
            'unique_values': unique_values,
            'sample_node_ids': [int(x) for x in sample_ids],
            'arrow_buffer': sink.getvalue().to_pybytes()
        }
        ctx.set_metadata(cache_key, result)
        print(f"✅ Built sample metadata array ({n_samples} samples, {len(unique_values)} unique values)")
        return result

    unique_values = []
    value_to_idx = {}
    indices = np.zeros(n_samples, dtype=np.uint32)

    for i, node_id in enumerate(sample_ids):
        sample_name, value = _get_sample_metadata_value(ts, node_id, key, sources, sample_name_key)

        if value is None:
            value = ""  # Handle missing values

        value_str = str(value)

        if value_str not in value_to_idx:
            value_to_idx[value_str] = len(unique_values)
            unique_values.append(value_str)

        indices[i] = value_to_idx[value_str]

    # Serialize to Arrow IPC format
    table = pa.table({'idx': pa.array(indices, type=pa.uint32())})
    sink = pa.BufferOutputStream()
    writer = pa.ipc.new_stream(sink, table.schema)
    writer.write_table(table)
    writer.close()

    result = {
        'unique_values': unique_values,
        'sample_node_ids': [int(x) for x in sample_ids],  # Convert to Python int for JSON
        'arrow_buffer': sink.getvalue().to_pybytes()
    }

    ctx.set_metadata(cache_key, result)
    print(f"✅ Built metadata array for key: {key} ({n_samples} samples, {len(unique_values)} unique values)")
    return result
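
Usage note (illustrative, not part of the package): a decoding sketch for the returned payload. The Arrow IPC stream carries a single uint32 column 'idx'; indices[i] selects the unique value for the i-th sample, aligned with sample_node_ids. The key "population" and the ctx object are assumptions, as above.

    import pyarrow as pa

    payload = get_metadata_array_for_key(ctx, "population")
    reader = pa.ipc.open_stream(pa.BufferReader(payload["arrow_buffer"]))
    indices = reader.read_all().column("idx").to_numpy()
    values = [payload["unique_values"][i] for i in indices]  # per-sample value strings
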
def get_metadata_schema(
    ts,
    sources=("individual", "node", "population"),
    sample_name_key="name"
):
    """
    Extract metadata keys only (values are fetched on-demand via get_metadata_array_for_key).

    Also includes "sample" as the first key, where each sample's name/ID is its
    own unique value (for coloring samples individually).

    Parameters
    ----------
    ts : tskit.TreeSequence
        The tree sequence (not FileContext - this doesn't need caching)
    sources : tuple
        Any of ("individual", "node", "population")
    sample_name_key : str
        Key in node metadata used as sample name

    Returns
    -------
    dict
        {
            "metadata_keys": [key1, key2, ...]
        }
    """
    keys = set()

    for node_id in ts.samples():
        node = ts.node(node_id)

        # Parse node metadata
        node_meta = node.metadata or {}
        try:
            node_meta = ensure_json_dict(node_meta)
        except (TypeError, json.JSONDecodeError):
            node_meta = {}

        for source in sources:
            if source == "individual":
                if node.individual == tskit.NULL:
                    continue
                meta = ts.individual(node.individual).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            elif source == "node":
                meta = node_meta  # Reuse already parsed node metadata

            elif source == "population":
                if node.population == tskit.NULL:
                    continue
                meta = ts.population(node.population).metadata
                meta = meta or {}
                meta = ensure_json_dict(meta)

            else:
                raise ValueError(f"Unknown source: {source}")

            if not meta:
                continue

            for key in meta.keys():
                keys.add(key)

    # Prepend "sample" to keys - this makes it the default colorBy option
    return {
        "metadata_keys": ["sample"] + sorted(list(keys))
    }
lorax/metadata/mutations.py
ADDED
@@ -0,0 +1,146 @@

import numpy as np
import tskit

def get_mutations_in_window(ts, start, end, offset=0, limit=1000):
    """
    Get mutations within a genomic interval [start, end) with pagination.

    Args:
        ts: tskit.TreeSequence
        start: Start genomic position (bp)
        end: End genomic position (bp)
        offset: Number of mutations to skip (for pagination)
        limit: Maximum number of mutations to return

    Returns:
        dict with:
        - 'mutations': list of mutation dicts
        - 'total_count': total mutations in window (for pagination)
        - 'has_more': whether there are more mutations
    """
    t = ts.tables
    sites = t.sites
    mutations = t.mutations

    # Get positions for all mutations via their sites
    positions = sites.position[mutations.site]

    # Create mask for mutations in the window
    mask = (positions >= start) & (positions < end)
    indices = np.where(mask)[0]

    total_count = len(indices)

    # Apply pagination
    paginated_indices = indices[offset:offset + limit]

    # Extract mutation data
    result_mutations = []
    for idx in paginated_indices:
        mut = mutations[idx]
        site = sites[mut.site]
        position = int(site.position)
        site_id = int(mut.site)
        node_id = int(mut.node)

        # Get ancestral and derived states
        ancestral_state = site.ancestral_state
        derived_state = mut.derived_state

        result_mutations.append({
            'position': position,
            'mutation': f"{ancestral_state}->{derived_state}",
            'node_id': node_id,
            'site_id': site_id,
            'ancestral_state': ancestral_state,
            'derived_state': derived_state,
        })

    return {
        'mutations': result_mutations,
        'total_count': total_count,
        'has_more': offset + limit < total_count
    }
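
Usage note (illustrative, not part of the package): a pagination sketch, assuming ts is a tskit.TreeSequence (e.g. loaded via tskit.load("example.trees"), a hypothetical file) and a 50 kb window.

    offset = 0
    while True:
        page = get_mutations_in_window(ts, start=0, end=50_000, offset=offset, limit=1000)
        for mut in page["mutations"]:
            print(mut["position"], mut["mutation"])
        if not page["has_more"]:
            break
        offset += 1000
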
def search_mutations_by_position(ts, position, range_bp=5000, offset=0, limit=1000):
    """
    Search for mutations around a specific position.

    Args:
        ts: tskit.TreeSequence
        position: Center position to search around (bp)
        range_bp: Total range to search (searches +/- range_bp/2 around position)
        offset: Number of mutations to skip (for pagination)
        limit: Maximum number of mutations to return

    Returns:
        dict with:
        - 'mutations': list of mutation dicts sorted by distance from position
        - 'total_count': total mutations in search range
        - 'has_more': whether there are more mutations
        - 'search_start': actual start of search range
        - 'search_end': actual end of search range
    """
    half_range = range_bp // 2
    search_start = max(0, position - half_range)
    search_end = min(ts.sequence_length, position + half_range)

    t = ts.tables
    sites = t.sites
    mutations = t.mutations

    # Get positions for all mutations via their sites
    positions = sites.position[mutations.site]

    # Create mask for mutations in the search range
    mask = (positions >= search_start) & (positions < search_end)
    indices = np.where(mask)[0]

    # Calculate distances and sort by distance
    if len(indices) > 0:
        mutation_positions = positions[indices]
        distances = np.abs(mutation_positions - position)
        sorted_order = np.argsort(distances)
        indices = indices[sorted_order]
        sorted_distances = distances[sorted_order]
    else:
        sorted_distances = np.array([])

    total_count = len(indices)

    # Apply pagination
    paginated_indices = indices[offset:offset + limit]
    paginated_distances = sorted_distances[offset:offset + limit] if len(sorted_distances) > 0 else []

    # Extract mutation data
    result_mutations = []
    for i, idx in enumerate(paginated_indices):
        mut = mutations[idx]
        site = sites[mut.site]
        mut_position = int(site.position)
        site_id = int(mut.site)
        node_id = int(mut.node)

        # Get ancestral and derived states
        ancestral_state = site.ancestral_state
        derived_state = mut.derived_state

        result_mutations.append({
            'position': mut_position,
            'mutation': f"{ancestral_state}->{derived_state}",
            'node_id': node_id,
            'site_id': site_id,
            'ancestral_state': ancestral_state,
            'derived_state': derived_state,
            'distance': int(paginated_distances[i]) if i < len(paginated_distances) else 0,
        })

    return {
        'mutations': result_mutations,
        'total_count': total_count,
        'has_more': offset + limit < total_count,
        'search_start': int(search_start),
        'search_end': int(search_end),
    }
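
Usage note (illustrative, not part of the package): a sketch of the distance-sorted search, again assuming a hypothetical tskit.TreeSequence ts. Results come back nearest-first within +/- range_bp/2 of the query position.

    res = search_mutations_by_position(ts, position=12_345, range_bp=5000, limit=10)
    for mut in res["mutations"]:
        print(mut["distance"], mut["position"], mut["mutation"])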