sawnergy 1.0.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sawnergy might be problematic. Click here for more details.
- sawnergy/__init__.py +13 -0
- sawnergy/embedding/SGNS_pml.py +135 -0
- sawnergy/embedding/SGNS_torch.py +177 -0
- sawnergy/embedding/__init__.py +34 -0
- sawnergy/embedding/embedder.py +578 -0
- sawnergy/logging_util.py +54 -0
- sawnergy/rin/__init__.py +9 -0
- sawnergy/rin/rin_builder.py +936 -0
- sawnergy/rin/rin_util.py +391 -0
- sawnergy/sawnergy_util.py +1182 -0
- sawnergy/visual/__init__.py +42 -0
- sawnergy/visual/visualizer.py +690 -0
- sawnergy/visual/visualizer_util.py +387 -0
- sawnergy/walks/__init__.py +16 -0
- sawnergy/walks/walker.py +795 -0
- sawnergy/walks/walker_util.py +384 -0
- sawnergy-1.0.0.dist-info/METADATA +290 -0
- sawnergy-1.0.0.dist-info/RECORD +22 -0
- sawnergy-1.0.0.dist-info/WHEEL +5 -0
- sawnergy-1.0.0.dist-info/licenses/LICENSE +201 -0
- sawnergy-1.0.0.dist-info/licenses/NOTICE +4 -0
- sawnergy-1.0.0.dist-info/top_level.txt +1 -0
|
@@ -0,0 +1,384 @@
|
|
|
1
|
+
# third-party
|
|
2
|
+
import numpy as np
|
|
3
|
+
# built-in
|
|
4
|
+
from multiprocessing import shared_memory
|
|
5
|
+
import logging
|
|
6
|
+
|
|
7
|
+
# *----------------------------------------------------*
|
|
8
|
+
# GLOBALS
|
|
9
|
+
# *----------------------------------------------------*
|
|
10
|
+
|
|
11
|
+
_logger = logging.getLogger(__name__)
|
|
12
|
+
|
|
13
|
+
# *----------------------------------------------------*
|
|
14
|
+
# CLASSES
|
|
15
|
+
# *----------------------------------------------------*
|
|
16
|
+
|
|
17
|
+
class SharedNDArray:
|
|
18
|
+
"""NumPy-facing wrapper over a raw :class:`multiprocessing.shared_memory.SharedMemory`.
|
|
19
|
+
|
|
20
|
+
This class does **not** own any data itself; it wraps an OS-level shared
|
|
21
|
+
memory segment and exposes it as a NumPy array via zero-copy views
|
|
22
|
+
(shape/dtype provided by the caller). The underlying buffer is just a
|
|
23
|
+
flat byte block; dimensionality and strides come from the views you
|
|
24
|
+
construct.
|
|
25
|
+
|
|
26
|
+
Usage model:
|
|
27
|
+
- Create a segment in the parent with :meth:`create`, optionally seeding
|
|
28
|
+
from an existing array (copied once, C-contiguous).
|
|
29
|
+
- Pass ``(name, shape, dtype)`` to workers and attach with :meth:`attach`.
|
|
30
|
+
- Obtain a view with :py:meth:`view` or the :py:attr:`array` property.
|
|
31
|
+
Views are read-only by default unless ``default_readonly=False`` or
|
|
32
|
+
``view(readonly=False)`` is requested.
|
|
33
|
+
- Every process that opened the segment must call :meth:`close`.
|
|
34
|
+
Exactly one process should call :meth:`unlink` after all others have
|
|
35
|
+
closed to destroy the OS resource.
|
|
36
|
+
|
|
37
|
+
Indexing:
|
|
38
|
+
- ``__getitem__`` strictly supports **axis-0** basic indexing
|
|
39
|
+
(``None``, ``slice``, or ``int``). This guarantees **no-copy** views.
|
|
40
|
+
Fancy indexing (index arrays/boolean masks) is intentionally disallowed.
|
|
41
|
+
- For 1D arrays, ``int`` indexing would yield a NumPy scalar (not a view),
|
|
42
|
+
so it is rejected; use ``slice(i, i+1)`` for a one-row view instead.
|
|
43
|
+
|
|
44
|
+
Concurrency:
|
|
45
|
+
- Multiple readers are safe by design.
|
|
46
|
+
- If multiple writers may overlap, synchronize externally (e.g., a
|
|
47
|
+
:class:`multiprocessing.Lock`). The class does not implement locking.
|
|
48
|
+
|
|
49
|
+
Notes:
|
|
50
|
+
- The writeability flag is **per-view**. Marking one view read-only does
|
|
51
|
+
not prevent other processes (or other views) from writing.
|
|
52
|
+
- Shape/dtype are trusted by :meth:`attach`—they must match what was used
|
|
53
|
+
at creation time; no runtime validation is performed here.
|
|
54
|
+
"""
|
|
55
|
+
|
|
56
|
+
def __init__(self,
|
|
57
|
+
shm: shared_memory.SharedMemory,
|
|
58
|
+
shape: tuple[int, ...],
|
|
59
|
+
dtype: np.dtype,
|
|
60
|
+
*,
|
|
61
|
+
default_readonly: bool = True):
|
|
62
|
+
"""Construct a wrapper over an existing shared memory handle.
|
|
63
|
+
|
|
64
|
+
Args:
|
|
65
|
+
shm: An open :class:`SharedMemory` handle (already created/attached).
|
|
66
|
+
shape: Target array shape used for all views into this buffer.
|
|
67
|
+
dtype: Target NumPy dtype used for all views into this buffer.
|
|
68
|
+
default_readonly: If ``True``, views returned by :py:attr:`array`
|
|
69
|
+
are marked read-only; override per-call via :py:meth:`view`.
|
|
70
|
+
|
|
71
|
+
Remarks:
|
|
72
|
+
This constructor does not allocate memory; it only stores metadata.
|
|
73
|
+
Use :meth:`create` to allocate a new segment, or :meth:`attach`
|
|
74
|
+
to connect to an existing one by name.
|
|
75
|
+
"""
|
|
76
|
+
self.shm = shm
|
|
77
|
+
self.shape = tuple(shape)
|
|
78
|
+
self.dtype = np.dtype(dtype)
|
|
79
|
+
self._default_readonly = default_readonly
|
|
80
|
+
_logger.debug(
|
|
81
|
+
"SharedNDArray.__init__(name=%r, shape=%s, dtype=%s, default_readonly=%s)",
|
|
82
|
+
getattr(self.shm, "name", None), self.shape, self.dtype, self._default_readonly
|
|
83
|
+
)
|
|
84
|
+
|
|
85
|
+
def __len__(self) -> int:
|
|
86
|
+
"""Return the size of axis 0 (NumPy semantics).
|
|
87
|
+
|
|
88
|
+
Returns:
|
|
89
|
+
The number of elements along the first dimension.
|
|
90
|
+
|
|
91
|
+
Raises:
|
|
92
|
+
TypeError: If the wrapped array is 0-D (unsized).
|
|
93
|
+
"""
|
|
94
|
+
if len(self.shape) == 0:
|
|
95
|
+
_logger.error("len() called on 0-D array (shape=%s)", self.shape)
|
|
96
|
+
raise TypeError("len() of unsized object")
|
|
97
|
+
length = self.shape[0]
|
|
98
|
+
_logger.debug("__len__ -> %d", length)
|
|
99
|
+
return length
|
|
100
|
+
|
|
101
|
+
def __repr__(self):
|
|
102
|
+
"""Debug-friendly representation showing name/shape/dtype."""
|
|
103
|
+
return f"SharedNDArray(name={self.name!r}, shape={self.shape}, dtype={self.dtype})"
|
|
104
|
+
|
|
105
|
+
def __getitem__(self, ids: int | slice | None = None):
|
|
106
|
+
"""Axis-0 only, no-copy guaranteed.
|
|
107
|
+
- None -> full view
|
|
108
|
+
- slice -> view
|
|
109
|
+
- int -> view (requires ndim >= 2); for 1D, use slice(i, i+1)
|
|
110
|
+
"""
|
|
111
|
+
_logger.debug("__getitem__(ids=%r)", ids)
|
|
112
|
+
arr = self.array
|
|
113
|
+
if ids is None:
|
|
114
|
+
_logger.debug("__getitem__: returning full view")
|
|
115
|
+
return arr
|
|
116
|
+
if isinstance(ids, slice):
|
|
117
|
+
_logger.debug("__getitem__: slice=%s", ids)
|
|
118
|
+
return arr[ids, ...]
|
|
119
|
+
if isinstance(ids, int):
|
|
120
|
+
if arr.ndim == 1:
|
|
121
|
+
_logger.error(
|
|
122
|
+
"__getitem__: 1D int indexing requested (idx=%r) -> would copy; raising",
|
|
123
|
+
ids
|
|
124
|
+
)
|
|
125
|
+
raise TypeError(
|
|
126
|
+
"No-copy view for 1D int indexing is impossible. "
|
|
127
|
+
"Use slice(i, i+1) to get a 1-row view."
|
|
128
|
+
)
|
|
129
|
+
_logger.debug("__getitem__: int=%d", ids)
|
|
130
|
+
return arr[ids, ...]
|
|
131
|
+
_logger.error("__getitem__: unsupported key type %s", type(ids).__name__)
|
|
132
|
+
raise TypeError("Only axis-0 int/slice/None are allowed for no-copy access.")
|
|
133
|
+
|
|
134
|
+
@classmethod
|
|
135
|
+
def attach(cls, name: str, shape, dtype):
|
|
136
|
+
"""Attach to an existing shared memory segment by name.
|
|
137
|
+
|
|
138
|
+
Args:
|
|
139
|
+
name: System-wide shared memory name (as returned by :py:attr:`name`).
|
|
140
|
+
shape: Shape to interpret the buffer with (must match creator).
|
|
141
|
+
dtype: Dtype to interpret the buffer with (must match creator).
|
|
142
|
+
|
|
143
|
+
Returns:
|
|
144
|
+
A :class:`SharedNDArray` bound to the named segment.
|
|
145
|
+
|
|
146
|
+
Raises:
|
|
147
|
+
FileNotFoundError: If no segment with ``name`` exists.
|
|
148
|
+
PermissionError: If the segment exists but cannot be opened.
|
|
149
|
+
|
|
150
|
+
Notes:
|
|
151
|
+
This method trusts ``shape`` and ``dtype``; it does not verify that
|
|
152
|
+
they match the original settings. Passing inconsistent metadata
|
|
153
|
+
results in undefined views.
|
|
154
|
+
"""
|
|
155
|
+
_logger.debug("SharedNDArray.attach(name=%r, shape=%s, dtype=%s)", name, shape, np.dtype(dtype))
|
|
156
|
+
shm = shared_memory.SharedMemory(name=name, create=False)
|
|
157
|
+
obj = cls(shm, shape, dtype)
|
|
158
|
+
_logger.debug("Attached to shared memory: name=%r", obj.name)
|
|
159
|
+
return obj
|
|
160
|
+
|
|
161
|
+
@classmethod
|
|
162
|
+
def create(cls, shape, dtype, *, from_array=None, name: str | None = None):
|
|
163
|
+
"""Create a new shared memory segment and wrap it.
|
|
164
|
+
|
|
165
|
+
The allocated buffer is sized exactly as ``prod(shape) * dtype.itemsize``.
|
|
166
|
+
If ``from_array`` is provided, its contents are copied into the buffer
|
|
167
|
+
after being coerced to a C-contiguous array of ``dtype``. Otherwise the
|
|
168
|
+
buffer is zero-initialized.
|
|
169
|
+
|
|
170
|
+
Args:
|
|
171
|
+
shape: Desired array shape.
|
|
172
|
+
dtype: Desired NumPy dtype.
|
|
173
|
+
from_array: Optional source array to seed the buffer. Must match
|
|
174
|
+
``shape`` after coercion to ``dtype``; copied as C-contiguous.
|
|
175
|
+
name: Optional OS-visible name for the segment. If omitted, a unique
|
|
176
|
+
name is generated.
|
|
177
|
+
|
|
178
|
+
Returns:
|
|
179
|
+
A :class:`SharedNDArray` bound to the newly created segment.
|
|
180
|
+
|
|
181
|
+
Raises:
|
|
182
|
+
ValueError: If ``from_array`` shape does not match ``shape`` after
|
|
183
|
+
dtype coercion.
|
|
184
|
+
"""
|
|
185
|
+
dtype = np.dtype(dtype)
|
|
186
|
+
nbytes = int(np.prod(shape)) * dtype.itemsize
|
|
187
|
+
_logger.debug("SharedNDArray.create(shape=%s, dtype=%s, name=%r, nbytes=%d)", shape, dtype, name, nbytes)
|
|
188
|
+
shm = shared_memory.SharedMemory(create=True, size=nbytes, name=name)
|
|
189
|
+
obj = cls(shm, shape, dtype)
|
|
190
|
+
|
|
191
|
+
view = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
|
|
192
|
+
if from_array is not None:
|
|
193
|
+
src = np.ascontiguousarray(from_array, dtype=dtype)
|
|
194
|
+
if src.shape != tuple(shape):
|
|
195
|
+
_logger.error("create: source shape %s does not match %s", src.shape, shape)
|
|
196
|
+
raise ValueError(f"shape mismatch: {src.shape} vs {shape}")
|
|
197
|
+
view[...] = src
|
|
198
|
+
_logger.debug("create: seeded from array (shape=%s, dtype=%s)", src.shape, src.dtype)
|
|
199
|
+
else:
|
|
200
|
+
view.fill(0)
|
|
201
|
+
_logger.debug("create: zero-initialized buffer")
|
|
202
|
+
_logger.debug("create: created shared segment name=%r", obj.name)
|
|
203
|
+
return obj
|
|
204
|
+
|
|
205
|
+
def close(self) -> None:
|
|
206
|
+
"""Detach this process from the shared memory segment.
|
|
207
|
+
|
|
208
|
+
Call this in **every** process that opened/attached the segment.
|
|
209
|
+
After closing, any existing views into the buffer must **not** be used
|
|
210
|
+
unless you first copy them (e.g., ``np.array(view, copy=True)``).
|
|
211
|
+
"""
|
|
212
|
+
_logger.debug("close(): name=%r", self.name)
|
|
213
|
+
self.shm.close()
|
|
214
|
+
|
|
215
|
+
def unlink(self) -> None:
|
|
216
|
+
"""Destroy the shared memory segment (OS resource).
|
|
217
|
+
|
|
218
|
+
Call exactly **once** globally after all participating processes have
|
|
219
|
+
called :meth:`close`. After unlinking, the ``name`` may be reused by
|
|
220
|
+
the OS for new segments.
|
|
221
|
+
"""
|
|
222
|
+
_logger.debug("unlink(): name=%r", self.name)
|
|
223
|
+
self.shm.unlink()
|
|
224
|
+
|
|
225
|
+
def view(self, *, readonly: bool | None = None) -> np.ndarray: # if readonly is False, arr is mutable
|
|
226
|
+
"""Return a zero-copy NumPy view over the shared buffer.
|
|
227
|
+
|
|
228
|
+
Args:
|
|
229
|
+
readonly: If ``True``, the returned view is marked read-only.
|
|
230
|
+
If ``False``, the view is writable. If ``None`` (default),
|
|
231
|
+
the behavior follows ``self._default_readonly``.
|
|
232
|
+
|
|
233
|
+
Returns:
|
|
234
|
+
A NumPy ndarray that directly references the shared bytes using
|
|
235
|
+
the stored ``shape`` and ``dtype``.
|
|
236
|
+
|
|
237
|
+
Notes:
|
|
238
|
+
- The writeability flag is **per-view**; it does not affect other
|
|
239
|
+
views or other processes.
|
|
240
|
+
- Basic slicing of the returned array yields further views that
|
|
241
|
+
inherit the writeability flag; fancy indexing creates copies.
|
|
242
|
+
"""
|
|
243
|
+
arr = np.ndarray(self.shape, dtype=self.dtype, buffer=self.shm.buf)
|
|
244
|
+
ro = self._default_readonly if readonly is None else readonly
|
|
245
|
+
_logger.debug("view(readonly=%r) -> resolved_readonly=%r", readonly, ro)
|
|
246
|
+
if ro:
|
|
247
|
+
arr.flags.writeable = False
|
|
248
|
+
return arr
|
|
249
|
+
|
|
250
|
+
@property
|
|
251
|
+
def name(self) -> str:
|
|
252
|
+
"""System-wide name of the underlying shared memory segment."""
|
|
253
|
+
return self.shm.name
|
|
254
|
+
|
|
255
|
+
@property
|
|
256
|
+
def array(self) -> np.ndarray:
|
|
257
|
+
"""Default zero-copy view honoring ``default_readonly``."""
|
|
258
|
+
_logger.debug("array property accessed (default_readonly=%r)", self._default_readonly)
|
|
259
|
+
return self.view(readonly=self._default_readonly)
|
|
260
|
+
|
|
261
|
+
# *----------------------------------------------------*
|
|
262
|
+
# FUNCTIONS
|
|
263
|
+
# *----------------------------------------------------*
|
|
264
|
+
|
|
265
|
+
def l1_norm(X: np.ndarray) -> np.ndarray:
    """Scale ``X`` so that all of its entries together sum to 1.

    Args:
        X (np.ndarray): Weights/probabilities of any shape. The input is
            converted with ``np.asarray(X, dtype=float)``.

    Returns:
        np.ndarray: A float array shaped like ``X``. When the total of all
        entries is finite and strictly positive, every entry is divided by
        that total. Otherwise (total is NaN, infinite, zero or negative) an
        all-zeros array of the same shape is returned.

    Notes:
        - The total is taken over every element, regardless of dimensionality.
        - A NaN or Inf anywhere in ``X`` makes the total non-finite, which
          selects the all-zeros result.
    """
    weights = np.asarray(X, dtype=float)
    total = float(weights.sum())
    if np.isfinite(total) and total > 0.0:
        return weights / total
    # Degenerate mass: nothing sensible to normalise by.
    return np.zeros_like(weights)
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
def apply_on_axis0(X: np.ndarray, func):
    """Map ``func`` over the leading axis of ``X`` and stack the results.

    ``func`` is invoked exactly once for each index ``i`` along axis 0, in
    order, receiving ``X[i]``. The result of the very first call fixes both
    the per-item shape and the dtype of the returned array.

    Args:
        X (np.ndarray): Input of shape ``(N, ...)`` with ``N >= 1``; coerced
            with ``np.asarray``.
        func (Callable): Receives ``X[i]`` (shape ``X.shape[1:]``) and returns
            an array-like. Every return value must be assignable into a slot
            shaped like the first one.

    Returns:
        np.ndarray: Array of shape ``(N,) + np.shape(func(X[0]))`` whose dtype
        is ``np.asarray(func(X[0])).dtype``.

    Raises:
        IndexError: If ``X`` has no element at index 0 along axis 0.
    """
    data = np.asarray(X)
    first = func(data[0])
    # Leading dimension mirrors the input; trailing dimensions mirror what
    # ``func`` produced for the first slice.
    result = np.empty((data.shape[0], *np.shape(first)), dtype=np.asarray(first).dtype)
    result[0] = first
    for idx, item in enumerate(data[1:], start=1):
        result[idx] = func(item)
    return result
|
|
320
|
+
|
|
321
|
+
|
|
322
|
+
def cosine_similarity(A: np.ndarray, eps: float = 1e-12):
    """Create a callable that computes cosine similarity to a fixed array ``A``.

    The returned function takes an array ``B`` (same shape as ``A``), computes
    the cosine similarity between ``A`` and ``B`` (using flattened views),
    and maps it from ``[-1, 1]`` to ``[0, 1]`` via ``(cos + 1) / 2``.
    If the product of the two norms is below ``eps``, it returns ``0.0``.

    Args:
        A (np.ndarray): Reference array. Coerced once with ``np.asarray``
            when the callable is created.
        eps (float, optional): Threshold on the product of the norms that
            guards against division by a near-zero value. Defaults to ``1e-12``.

    Returns:
        Callable[[np.ndarray], float]: Function ``inner(B)`` that returns a
        similarity score in ``[0, 1]``.

    Raises:
        ValueError: If the input ``B`` provided to the returned function does
            not match the shape of ``A``.
    """
    # Coerce the reference once; an ndarray input is not copied by asarray.
    ref = np.asarray(A)

    def inner(B: np.ndarray):
        """Compute cosine similarity between the captured ``A`` and input ``B``.

        Args:
            B (np.ndarray): Array with the same shape as ``A``.

        Returns:
            float: Cosine similarity mapped to ``[0, 1]``. Returns ``0.0`` if
            the product of the norms is below ``eps``.

        Raises:
            ValueError: If ``A.shape != B.shape``.
        """
        other = np.asarray(B)
        if ref.shape != other.shape:
            raise ValueError(f"shapes must match, got {ref.shape} vs {other.shape}")

        a = ref.ravel()
        b = other.ravel()

        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom < eps:
            return 0.0
        # Rounding can push the raw ratio marginally outside [-1, 1]; clip so
        # the mapped score honours the documented [0, 1] range. np.clip keeps
        # NaN as NaN, so non-finite inputs still surface as NaN.
        cos = float(np.clip(a @ b / denom, -1.0, 1.0))
        return (cos + 1) / 2  # translate from [-1, 1] to [0, 2] to [0, 1]

    return inner
|
|
374
|
+
|
|
375
|
+
|
|
376
|
+
# Names exported by ``from <this module> import *``.
__all__ = [
    "SharedNDArray",
    "l1_norm",
    "apply_on_axis0",
    "cosine_similarity"
]

# Running this module as a script does nothing.
if __name__ == "__main__":
    pass
|
|
@@ -0,0 +1,290 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: sawnergy
|
|
3
|
+
Version: 1.0.0
|
|
4
|
+
Summary: Toolkit for transforming molecular dynamics (MD) trajectories into rich graph representations
|
|
5
|
+
Home-page: https://github.com/Yehor-Mishchyriak/SAWNERGY
|
|
6
|
+
Author: Yehor Mishchyriak
|
|
7
|
+
License: Apache-2.0
|
|
8
|
+
Classifier: Programming Language :: Python :: 3
|
|
9
|
+
Classifier: License :: OSI Approved :: Apache Software License
|
|
10
|
+
Classifier: Operating System :: OS Independent
|
|
11
|
+
Requires-Python: >=3.11
|
|
12
|
+
Description-Content-Type: text/markdown
|
|
13
|
+
License-File: LICENSE
|
|
14
|
+
License-File: NOTICE
|
|
15
|
+
Requires-Dist: numpy>=2.0
|
|
16
|
+
Requires-Dist: zarr>=3.0
|
|
17
|
+
Requires-Dist: threadpoolctl>=3.0
|
|
18
|
+
Requires-Dist: matplotlib>=3.7
|
|
19
|
+
Requires-Dist: psutil>=5.9
|
|
20
|
+
Requires-Dist: ym-pure-ml>=1.2.0
|
|
21
|
+
Dynamic: author
|
|
22
|
+
Dynamic: classifier
|
|
23
|
+
Dynamic: description
|
|
24
|
+
Dynamic: description-content-type
|
|
25
|
+
Dynamic: home-page
|
|
26
|
+
Dynamic: license
|
|
27
|
+
Dynamic: license-file
|
|
28
|
+
Dynamic: requires-dist
|
|
29
|
+
Dynamic: requires-python
|
|
30
|
+
Dynamic: summary
|
|
31
|
+
|
|
32
|
+
# SAWNERGY
|
|
33
|
+
|
|
34
|
+
A toolkit for transforming molecular dynamics (MD) trajectories into rich graph representations, sampling
|
|
35
|
+
random and self-avoiding walks, learning node embeddings, and visualising residue interaction networks (RINs). SAWNERGY
|
|
36
|
+
keeps the full workflow — from `cpptraj` output to skip-gram embeddings (node2vec approach) — inside Python, backed by efficient Zarr-based archives and optional GPU acceleration.
|
|
37
|
+
|
|
38
|
+
---
|
|
39
|
+
|
|
40
|
+
## Why SAWNERGY?
|
|
41
|
+
|
|
42
|
+
- **Bridge simulations and graph ML**: Convert raw MD trajectories into residue interaction networks ready for graph
|
|
43
|
+
algorithms and downstream machine learning tasks.
|
|
44
|
+
- **Deterministic, shareable artefacts**: Every stage produces compressed Zarr archives that contain both data and metadata so runs can be reproduced, shared, or inspected later.
|
|
45
|
+
- **High-performance data handling**: Heavy arrays live in shared memory during walk sampling to allow parallel processing without serialization overhead; archives are written in chunked, compressed form for fast read/write.
|
|
46
|
+
- **Flexible embedding backends**: Train skip-gram with negative sampling (SGNS) models using either PureML or PyTorch.
|
|
47
|
+
- **Visualization out of the box**: Plot and animate residue networks without leaving Python, using the data produced by RINBuilder.
|
|
48
|
+
|
|
49
|
+
---
|
|
50
|
+
|
|
51
|
+
## Pipeline at a Glance
|
|
52
|
+
|
|
53
|
+
```
|
|
54
|
+
MD Trajectory + Topology
|
|
55
|
+
│
|
|
56
|
+
▼
|
|
57
|
+
RINBuilder
|
|
58
|
+
│ → RIN archive (.zip/.zarr) → Visualizer (display/animate RINs)
|
|
59
|
+
▼
|
|
60
|
+
Walker
|
|
61
|
+
│ → Walks archive (RW/SAW per frame)
|
|
62
|
+
▼
|
|
63
|
+
Embedder
|
|
64
|
+
│ → Embedding archive (frame × vocab × dim)
|
|
65
|
+
▼
|
|
66
|
+
Downstream ML
|
|
67
|
+
```
|
|
68
|
+
|
|
69
|
+
Each stage consumes the archive produced by the previous one. Metadata embedded in the archives ensures frame order,
|
|
70
|
+
node indexing, and RNG seeds stay consistent across the toolchain.
|
|
71
|
+
|
|
72
|
+
---
|
|
73
|
+
|
|
74
|
+
## Core Components
|
|
75
|
+
|
|
76
|
+
### `sawnergy.rin.RINBuilder`
|
|
77
|
+
|
|
78
|
+
* Wraps the AmberTools `cpptraj` executable to:
|
|
79
|
+
- compute per-frame electrostatic (EMAP) and van der Waals (VMAP) energy matrices at the atomic level,
|
|
80
|
+
- project atom–atom interactions to residue–residue interactions using compositional masks,
|
|
81
|
+
- prune, symmetrise, remove self-interactions, and L1-normalise the matrices,
|
|
82
|
+
- compute per-residue centres of mass (COM) over the same frames.
|
|
83
|
+
* Outputs a compressed Zarr archive with transition matrices, optional prenormalised energies, COM snapshots, and rich
|
|
84
|
+
metadata (frame range, pruning quantile, molecule ID, etc.).
|
|
85
|
+
* Supports parallel `cpptraj` execution, batch processing, and keeps temporary stores tidy via
|
|
86
|
+
`ArrayStorage.compress_and_cleanup`.
|
|
87
|
+
|
|
88
|
+
### `sawnergy.visual.Visualizer`
|
|
89
|
+
|
|
90
|
+
* Opens RIN archives, resolves dataset names from attributes, and renders nodes plus attractive/repulsive edge bundles
|
|
91
|
+
in 3D using Matplotlib.
|
|
92
|
+
* Allows both static frame visualization and trajectory animation.
|
|
93
|
+
* Handles backend selection (`Agg` fallback in headless environments) and offers convenient colour palettes via
|
|
94
|
+
`visualizer_util`.
|
|
95
|
+
|
|
96
|
+
### `sawnergy.walks.Walker`
|
|
97
|
+
|
|
98
|
+
* Attaches to the RIN archive and loads attractive/repulsive transition matrices into shared memory using
|
|
99
|
+
`walker_util.SharedNDArray` so multiple processes can sample without copying.
|
|
100
|
+
* Samples random walks (RW) and self-avoiding walks (SAW), optionally time-aware, that is, walks move through transition matrices with transition probabilities proportional to cosine similarity between the current and next frame. Randomness is controlled by the seed passed to the class constructor.
|
|
101
|
+
* Persists walks as `(time, walk_id, length+1)` tensors (1-based node indices) alongside metadata such as
|
|
102
|
+
`walk_length`, `walks_per_node`, and RNG scheme.
|
|
103
|
+
|
|
104
|
+
### `sawnergy.embedding.Embedder`
|
|
105
|
+
|
|
106
|
+
* Consumes walk archives, generates skip-gram pairs, and normalises them to 0-based indices.
|
|
107
|
+
* Provides a unified interface to SGNS implementations:
|
|
108
|
+
- **PureML backend** (`SGNS_PureML`): works with the `pureml` ecosystem, optimised for CPU training.
|
|
109
|
+
- **PyTorch backend** (`SGNS_Torch`): uses `torch.nn.Embedding` and plays nicely with GPUs.
|
|
110
|
+
* Both `SGNS_PureML` and `SGNS_Torch` accept training hyperparameters such as batch_size, LR, optimizer and LR_scheduler, etc.
|
|
111
|
+
* Exposes `embed_frame` (single frame) and `embed_all` (all frames, deterministic seeding per frame) which return the
|
|
112
|
+
learned input embedding matrices and write them to disk when requested.
|
|
113
|
+
|
|
114
|
+
### Supporting Utilities
|
|
115
|
+
|
|
116
|
+
* `sawnergy.sawnergy_util`
|
|
117
|
+
- `ArrayStorage`: thin wrapper over Zarr v3 with helpers for chunk management, attribute coercion to JSON, and transparent compression to `.zip` archives.
|
|
118
|
+
- Parallel helpers (`elementwise_processor`, `compose_steps`, etc.), temporary file management, logging, and runtime
|
|
119
|
+
inspection utilities.
|
|
120
|
+
* `sawnergy.logging_util.configure_logging`: configure rotating file/console logging consistently across scripts.
|
|
121
|
+
|
|
122
|
+
---
|
|
123
|
+
|
|
124
|
+
## Archive Layouts
|
|
125
|
+
|
|
126
|
+
| Archive | Key datasets (name → shape, dtype) | Important attributes (root `attrs`) |
|
|
127
|
+
|---|---|---|
|
|
128
|
+
| **RIN** | `ATTRACTIVE_transitions` → **(T, N, N)**, float32 • `REPULSIVE_transitions` → **(T, N, N)**, float32 (optional) • `ATTRACTIVE_energies` → **(T, N, N)**, float32 (optional) • `REPULSIVE_energies` → **(T, N, N)**, float32 (optional) • `COM` → **(T, N, 3)**, float32 | `time_created` (ISO) • `com_name` = `"COM"` • `molecule_of_interest` (int) • `frame_range` = `(start, end)` inclusive • `frame_batch_size` (int) • `prune_low_energies_frac` (float in [0,1]) • `attractive_transitions_name` / `repulsive_transitions_name` (dataset names or `None`) • `attractive_energies_name` / `repulsive_energies_name` (dataset names or `None`) |
|
|
129
|
+
| **Walks** | `ATTRACTIVE_RWs` → **(T, N·num_RWs, L+1)**, int32 (optional) • `REPULSIVE_RWs` → **(T, N·num_RWs, L+1)**, int32 (optional) • `ATTRACTIVE_SAWs` → **(T, N·num_SAWs, L+1)**, int32 (optional) • `REPULSIVE_SAWs` → **(T, N·num_SAWs, L+1)**, int32 (optional) <br/>_Note:_ node IDs are **1-based**.| `time_created` (ISO) • `seed` (int) • `rng_scheme` = `"SeedSequence.spawn_per_batch_v1"` • `num_workers` (int) • `in_parallel` (bool) • `batch_size_nodes` (int) • `num_RWs` / `num_SAWs` (ints) • `node_count` (N) • `time_stamp_count` (T) • `walk_length` (L) • `walks_per_node` (int) • `attractive_RWs_name` / `repulsive_RWs_name` / `attractive_SAWs_name` / `repulsive_SAWs_name` (dataset names or `None`) • `walks_layout` = `"time_leading_3d"` |
|
|
130
|
+
| **Embeddings** | `FRAME_EMBEDDINGS` → **(frames_written, vocab_size, D)**, typically float32 | `time_created` (ISO) • `seed` (int) • `rng_scheme` = `"SeedSequence.spawn_per_frame_v1"` • `source_walks_path` (str) • `model_base` = `"torch"` or `"pureml"` • `rin_type` = `"attr"` or `"repuls"` • `using_mode` = `"RW"|"SAW"|"merged"` • `window_size` (int) • `alpha` (float; noise exponent) • `dimensionality` = D • `num_negative_samples` (int) • `num_epochs` (int) • `batch_size` (int) • `shuffle_data` (bool) • `frames_written` (int) • `vocab_size` (int) • `frame_count` (int) • `embedding_dtype` (str) • `frame_embeddings_name` = `"FRAME_EMBEDDINGS"` • `arrays_per_chunk` (int) • `compression_level` (int) |
|
|
131
|
+
|
|
132
|
+
**Notes**
|
|
133
|
+
|
|
134
|
+
- In **RIN**, `T` equals the number of frame **batches** written (i.e., `frame_range` swept in steps of `frame_batch_size`). `ATTRACTIVE/REPULSIVE_energies` are **pre-normalised** absolute energies (written only when `keep_prenormalized_energies=True`), whereas `ATTRACTIVE/REPULSIVE_transitions` are the **row-wise L1-normalised** versions used for sampling.
|
|
135
|
+
- All archives are Zarr v3 groups. ArrayStorage also maintains per-block metadata in root attrs: `array_chunk_size_in_block`, `array_shape_in_block`, and `array_dtype_in_block` (dicts keyed by dataset name). You’ll see these in every archive.
|
|
136
|
+
|
|
137
|
+
---
|
|
138
|
+
|
|
139
|
+
## Installation
|
|
140
|
+
|
|
141
|
+
```bash
|
|
142
|
+
pip install sawnergy
|
|
143
|
+
```
|
|
144
|
+
|
|
145
|
+
> **Note:** RIN building requires `cpptraj` (AmberTools). Ensure it is discoverable via `$PATH` or the `CPPTRAJ`
|
|
146
|
+
> environment variable.
|
|
147
|
+
|
|
148
|
+
---
|
|
149
|
+
|
|
150
|
+
## Quick Start
|
|
151
|
+
|
|
152
|
+
```python
|
|
153
|
+
from pathlib import Path
|
|
154
|
+
from sawnergy.logging_util import configure_logging
|
|
155
|
+
from sawnergy.rin import RINBuilder
|
|
156
|
+
from sawnergy.walks import Walker
|
|
157
|
+
from sawnergy.embedding import Embedder
|
|
158
|
+
|
|
159
|
+
import logging
|
|
160
|
+
configure_logging("./logs", file_level=logging.WARNING, console_level=logging.INFO)
|
|
161
|
+
|
|
162
|
+
# 1. Build a Residue Interaction Network archive
|
|
163
|
+
rin_path = Path("./RIN_demo.zip")
|
|
164
|
+
rin_builder = RINBuilder()
|
|
165
|
+
rin_builder.build_rin(
|
|
166
|
+
topology_file="system.prmtop",
|
|
167
|
+
trajectory_file="trajectory.nc",
|
|
168
|
+
molecule_of_interest=1,
|
|
169
|
+
frame_range=(1, 100),
|
|
170
|
+
frame_batch_size=10,
|
|
171
|
+
prune_low_energies_frac=0.3,
|
|
172
|
+
output_path=rin_path,
|
|
173
|
+
include_attractive=True,
|
|
174
|
+
include_repulsive=False,
|
|
175
|
+
)
|
|
176
|
+
|
|
177
|
+
# 2. Sample walks from the RIN
|
|
178
|
+
walker = Walker(rin_path, seed=123)
|
|
179
|
+
walks_path = Path("./WALKS_demo.zip")
|
|
180
|
+
walker.sample_walks(
|
|
181
|
+
walk_length=16,
|
|
182
|
+
walks_per_node=32,
|
|
183
|
+
saw_frac=0.25,
|
|
184
|
+
include_attractive=True,
|
|
185
|
+
include_repulsive=False,
|
|
186
|
+
time_aware=False,
|
|
187
|
+
output_path=walks_path,
|
|
188
|
+
in_parallel=False,
|
|
189
|
+
)
|
|
190
|
+
walker.close()
|
|
191
|
+
|
|
192
|
+
# 3. Train embeddings per frame (PyTorch backend)
|
|
193
|
+
import torch
|
|
194
|
+
|
|
195
|
+
embedder = Embedder(walks_path, base="torch", seed=999)
|
|
196
|
+
embeddings_path = embedder.embed_all(
|
|
197
|
+
RIN_type="attr",
|
|
198
|
+
using="merged",
|
|
199
|
+
window_size=4,
|
|
200
|
+
num_negative_samples=5,
|
|
201
|
+
num_epochs=5,
|
|
202
|
+
batch_size=1024,
|
|
203
|
+
dimensionality=128,
|
|
204
|
+
shuffle_data=True,
|
|
205
|
+
output_path="./EMBEDDINGS_demo.zip",
|
|
206
|
+
sgns_kwargs={
|
|
207
|
+
"optim": torch.optim.Adam,
|
|
208
|
+
"optim_kwargs": {"lr": 1e-3},
|
|
209
|
+
"lr_sched": torch.optim.lr_scheduler.LambdaLR,
|
|
210
|
+
"lr_sched_kwargs": {"lr_lambda": lambda _: 1.0},
|
|
211
|
+
"device": "cuda" if torch.cuda.is_available() else "cpu",
|
|
212
|
+
},
|
|
213
|
+
)
|
|
214
|
+
print("Embeddings written to", embeddings_path)
|
|
215
|
+
```
|
|
216
|
+
|
|
217
|
+
> For the PureML backend, supply the relevant optimiser and scheduler via `sgns_kwargs`
|
|
218
|
+
> (for example `optim=pureml.optimizers.Adam`, `lr_sched=pureml.optimizers.CosineAnnealingLR`).
|
|
219
|
+
|
|
220
|
+
---
|
|
221
|
+
|
|
222
|
+
## Visualisation
|
|
223
|
+
|
|
224
|
+
```python
|
|
225
|
+
from sawnergy.visual import Visualizer
|
|
226
|
+
|
|
227
|
+
v = Visualizer("./RIN_demo.zip")
|
|
228
|
+
v.build_frame(1,
|
|
229
|
+
node_colors="rainbow",
|
|
230
|
+
displayed_nodes="ALL",
|
|
231
|
+
displayed_pairwise_attraction_for_nodes="DISPLAYED_NODES",
|
|
232
|
+
displayed_pairwise_repulsion_for_nodes="DISPLAYED_NODES",
|
|
233
|
+
show_node_labels=True,
|
|
234
|
+
show=True
|
|
235
|
+
)
|
|
236
|
+
```
|
|
237
|
+
|
|
238
|
+
`Visualizer` lazily loads datasets and works even in headless environments (falls back to the `Agg` backend).
|
|
239
|
+
|
|
240
|
+
---
|
|
241
|
+
|
|
242
|
+
## Advanced Notes
|
|
243
|
+
|
|
244
|
+
- **Time-aware walks**: Set `time_aware=True`, provide `stickiness` and `on_no_options` when calling `Walker.sample_walks`.
|
|
245
|
+
- **Shared memory lifecycle**: Call `Walker.close()` (or use a context manager) to release shared-memory segments.
|
|
246
|
+
- **PureML vs PyTorch**: Choose the backend via `Embedder(..., base="pureml"|"torch")` and provide backend-specific
|
|
247
|
+
constructor kwargs through `sgns_kwargs` (optimizer, scheduler, device).
|
|
248
|
+
- **ArrayStorage utilities**: Use `ArrayStorage` directly to peek into archives, append arrays, or manage metadata.
|
|
249
|
+
|
|
250
|
+
---
|
|
251
|
+
|
|
252
|
+
## Testing & Quality Assurance
|
|
253
|
+
|
|
254
|
+
The automated test suite (`pytest`) synthesises deterministic cpptraj outputs and exercises the entire workflow:
|
|
255
|
+
|
|
256
|
+
- RIN parsing, residue aggregation, and metadata verification.
|
|
257
|
+
- Random/self-avoiding walk sampling and probability consistency with the RIN.
|
|
258
|
+
- Embedding orchestration, frame ordering, SGNS pair generation property tests.
|
|
259
|
+
- PureML and PyTorch SGNS smoke tests verifying finite weights and decreasing loss.
|
|
260
|
+
- Visualiser smoke tests that cover data loading and artist updates.
|
|
261
|
+
|
|
262
|
+
Run the suite (inside the project virtualenv):
|
|
263
|
+
|
|
264
|
+
```bash
|
|
265
|
+
python -m pytest
|
|
266
|
+
```
|
|
267
|
+
|
|
268
|
+
---
|
|
269
|
+
|
|
270
|
+
## Project Structure
|
|
271
|
+
|
|
272
|
+
```
|
|
273
|
+
├── sawnergy/
|
|
274
|
+
│ ├── rin/ # RINBuilder and cpptraj integration helpers
|
|
275
|
+
│ ├── walks/ # Walker class and shared-memory utilities
|
|
276
|
+
│ ├── embedding/ # Embedder + SGNS backends (PureML / PyTorch)
|
|
277
|
+
│ ├── visual/ # Visualizer and palette utilities
|
|
278
|
+
│ ├── logging_util.py
|
|
279
|
+
│ └── sawnergy_util.py
|
|
280
|
+
├── tests/ # Synthetic end-to-end tests (pytest)
|
|
281
|
+
│
|
|
282
|
+
└── README.md
|
|
283
|
+
```
|
|
284
|
+
|
|
285
|
+
---
|
|
286
|
+
|
|
287
|
+
## Acknowledgements
|
|
288
|
+
|
|
289
|
+
SAWNERGY builds on the AmberTools `cpptraj` ecosystem, NumPy, Matplotlib, Zarr, and PyTorch (for GPU acceleration if necessary; PureML is available by default).
|
|
290
|
+
Big thanks to the upstream communities whose work makes this toolkit possible.
|