sawnergy-1.0.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.


@@ -0,0 +1,384 @@
```python
# third-party
import numpy as np
# built-in
from multiprocessing import shared_memory
import logging

# *----------------------------------------------------*
# GLOBALS
# *----------------------------------------------------*

_logger = logging.getLogger(__name__)

# *----------------------------------------------------*
# CLASSES
# *----------------------------------------------------*

class SharedNDArray:
    """NumPy-facing wrapper over a raw :class:`multiprocessing.shared_memory.SharedMemory`.

    This class does **not** own any data itself; it wraps an OS-level shared
    memory segment and exposes it as a NumPy array via zero-copy views
    (shape/dtype provided by the caller). The underlying buffer is just a
    flat byte block; dimensionality and strides come from the views you
    construct.

    Usage model:
        - Create a segment in the parent with :meth:`create`, optionally seeding
          from an existing array (copied once, C-contiguous).
        - Pass ``(name, shape, dtype)`` to workers and attach with :meth:`attach`.
        - Obtain a view with :py:meth:`view` or the :py:attr:`array` property.
          Views are read-only by default unless ``default_readonly=False`` or
          ``view(readonly=False)`` is requested.
        - Every process that opened the segment must call :meth:`close`.
          Exactly one process should call :meth:`unlink` after all others have
          closed to destroy the OS resource.

    Indexing:
        - ``__getitem__`` strictly supports **axis-0** basic indexing
          (``None``, ``slice``, or ``int``). This guarantees **no-copy** views.
          Fancy indexing (index arrays/boolean masks) is intentionally disallowed.
        - For 1D arrays, ``int`` indexing would yield a NumPy scalar (not a view),
          so it is rejected; use ``slice(i, i+1)`` for a one-row view instead.

    Concurrency:
        - Multiple readers are safe by design.
        - If multiple writers may overlap, synchronize externally (e.g., a
          :class:`multiprocessing.Lock`). The class does not implement locking.

    Notes:
        - The writeability flag is **per-view**. Marking one view read-only does
          not prevent other processes (or other views) from writing.
        - Shape/dtype are trusted by :meth:`attach`: they must match what was used
          at creation time; no runtime validation is performed here.
    """

    def __init__(self,
                 shm: shared_memory.SharedMemory,
                 shape: tuple[int, ...],
                 dtype: np.dtype,
                 *,
                 default_readonly: bool = True):
        """Construct a wrapper over an existing shared memory handle.

        Args:
            shm: An open :class:`SharedMemory` handle (already created/attached).
            shape: Target array shape used for all views into this buffer.
            dtype: Target NumPy dtype used for all views into this buffer.
            default_readonly: If ``True``, views returned by :py:attr:`array`
                are marked read-only; override per-call via :py:meth:`view`.

        Remarks:
            This constructor does not allocate memory; it only stores metadata.
            Use :meth:`create` to allocate a new segment, or :meth:`attach`
            to connect to an existing one by name.
        """
        self.shm = shm
        self.shape = tuple(shape)
        self.dtype = np.dtype(dtype)
        self._default_readonly = default_readonly
        _logger.debug(
            "SharedNDArray.__init__(name=%r, shape=%s, dtype=%s, default_readonly=%s)",
            getattr(self.shm, "name", None), self.shape, self.dtype, self._default_readonly
        )

    def __len__(self) -> int:
        """Return the size of axis 0 (NumPy semantics).

        Returns:
            The number of elements along the first dimension.

        Raises:
            TypeError: If the wrapped array is 0-D (unsized).
        """
        if len(self.shape) == 0:
            _logger.error("len() called on 0-D array (shape=%s)", self.shape)
            raise TypeError("len() of unsized object")
        length = self.shape[0]
        _logger.debug("__len__ -> %d", length)
        return length

    def __repr__(self):
        """Debug-friendly representation showing name/shape/dtype."""
        return f"SharedNDArray(name={self.name!r}, shape={self.shape}, dtype={self.dtype})"

    def __getitem__(self, ids: int | slice | None = None):
        """Axis-0 only, no-copy guaranteed.

        - None  -> full view
        - slice -> view
        - int   -> view (requires ndim >= 2); for 1D, use slice(i, i+1)
        """
        _logger.debug("__getitem__(ids=%r)", ids)
        arr = self.array
        if ids is None:
            _logger.debug("__getitem__: returning full view")
            return arr
        if isinstance(ids, slice):
            _logger.debug("__getitem__: slice=%s", ids)
            return arr[ids, ...]
        if isinstance(ids, int):
            if arr.ndim == 1:
                _logger.error(
                    "__getitem__: 1D int indexing requested (idx=%r) -> would copy; raising",
                    ids
                )
                raise TypeError(
                    "No-copy view for 1D int indexing is impossible. "
                    "Use slice(i, i+1) to get a 1-row view."
                )
            _logger.debug("__getitem__: int=%d", ids)
            return arr[ids, ...]
        _logger.error("__getitem__: unsupported key type %s", type(ids).__name__)
        raise TypeError("Only axis-0 int/slice/None are allowed for no-copy access.")

    @classmethod
    def attach(cls, name: str, shape, dtype):
        """Attach to an existing shared memory segment by name.

        Args:
            name: System-wide shared memory name (as returned by :py:attr:`name`).
            shape: Shape to interpret the buffer with (must match creator).
            dtype: Dtype to interpret the buffer with (must match creator).

        Returns:
            A :class:`SharedNDArray` bound to the named segment.

        Raises:
            FileNotFoundError: If no segment with ``name`` exists.
            PermissionError: If the segment exists but cannot be opened.

        Notes:
            This method trusts ``shape`` and ``dtype``; it does not verify that
            they match the original settings. Passing inconsistent metadata
            results in undefined views.
        """
        _logger.debug("SharedNDArray.attach(name=%r, shape=%s, dtype=%s)", name, shape, np.dtype(dtype))
        shm = shared_memory.SharedMemory(name=name, create=False)
        obj = cls(shm, shape, dtype)
        _logger.debug("Attached to shared memory: name=%r", obj.name)
        return obj

    @classmethod
    def create(cls, shape, dtype, *, from_array=None, name: str | None = None):
        """Create a new shared memory segment and wrap it.

        The allocated buffer is sized exactly as ``prod(shape) * dtype.itemsize``.
        If ``from_array`` is provided, its contents are copied into the buffer
        after being coerced to a C-contiguous array of ``dtype``. Otherwise the
        buffer is zero-initialized.

        Args:
            shape: Desired array shape.
            dtype: Desired NumPy dtype.
            from_array: Optional source array to seed the buffer. Must match
                ``shape`` after coercion to ``dtype``; copied as C-contiguous.
            name: Optional OS-visible name for the segment. If omitted, a unique
                name is generated.

        Returns:
            A :class:`SharedNDArray` bound to the newly created segment.

        Raises:
            ValueError: If ``from_array`` shape does not match ``shape`` after
                dtype coercion.
        """
        dtype = np.dtype(dtype)
        nbytes = int(np.prod(shape)) * dtype.itemsize
        _logger.debug("SharedNDArray.create(shape=%s, dtype=%s, name=%r, nbytes=%d)", shape, dtype, name, nbytes)
        shm = shared_memory.SharedMemory(create=True, size=nbytes, name=name)
        obj = cls(shm, shape, dtype)

        view = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
        if from_array is not None:
            src = np.ascontiguousarray(from_array, dtype=dtype)
            if src.shape != tuple(shape):
                _logger.error("create: source shape %s does not match %s", src.shape, shape)
                raise ValueError(f"shape mismatch: {src.shape} vs {shape}")
            view[...] = src
            _logger.debug("create: seeded from array (shape=%s, dtype=%s)", src.shape, src.dtype)
        else:
            view.fill(0)
            _logger.debug("create: zero-initialized buffer")
        _logger.debug("create: created shared segment name=%r", obj.name)
        return obj

    def close(self) -> None:
        """Detach this process from the shared memory segment.

        Call this in **every** process that opened/attached the segment.
        After closing, any existing views into the buffer must **not** be used
        unless you first copy them (e.g., ``np.array(view, copy=True)``).
        """
        _logger.debug("close(): name=%r", self.name)
        self.shm.close()

    def unlink(self) -> None:
        """Destroy the shared memory segment (OS resource).

        Call exactly **once** globally after all participating processes have
        called :meth:`close`. After unlinking, the ``name`` may be reused by
        the OS for new segments.
        """
        _logger.debug("unlink(): name=%r", self.name)
        self.shm.unlink()

    def view(self, *, readonly: bool | None = None) -> np.ndarray:  # if readonly is False, arr is mutable
        """Return a zero-copy NumPy view over the shared buffer.

        Args:
            readonly: If ``True``, the returned view is marked read-only.
                If ``False``, the view is writable. If ``None`` (default),
                the behavior follows ``self._default_readonly``.

        Returns:
            A NumPy ndarray that directly references the shared bytes using
            the stored ``shape`` and ``dtype``.

        Notes:
            - The writeability flag is **per-view**; it does not affect other
              views or other processes.
            - Basic slicing of the returned array yields further views that
              inherit the writeability flag; fancy indexing creates copies.
        """
        arr = np.ndarray(self.shape, dtype=self.dtype, buffer=self.shm.buf)
        ro = self._default_readonly if readonly is None else readonly
        _logger.debug("view(readonly=%r) -> resolved_readonly=%r", readonly, ro)
        if ro:
            arr.flags.writeable = False
        return arr

    @property
    def name(self) -> str:
        """System-wide name of the underlying shared memory segment."""
        return self.shm.name

    @property
    def array(self) -> np.ndarray:
        """Default zero-copy view honoring ``default_readonly``."""
        _logger.debug("array property accessed (default_readonly=%r)", self._default_readonly)
        return self.view(readonly=self._default_readonly)

# *----------------------------------------------------*
# FUNCTIONS
# *----------------------------------------------------*

def l1_norm(X: np.ndarray) -> np.ndarray:
    """Return an L1-normalized copy of ``X`` (sum to 1), or zeros if invalid.

    Args:
        X (np.ndarray): Array of nonnegative weights/probabilities (any shape).
            It is coerced with ``np.asarray(X, dtype=float)``.

    Returns:
        np.ndarray: Array with the same shape as ``X`` whose entries sum to 1
        (within FP error). If the total mass is non-finite or <= 0, returns
        an array of zeros with the same shape/dtype.

    Notes:
        - If ``X`` contains NaNs or Infs, the sum becomes non-finite and a
          zeros array is returned.
        - Works for any shape; normalization is over all elements.
    """
    X = np.asarray(X, dtype=float)
    s = float(np.sum(X))
    if not np.isfinite(s) or s <= 0.0:
        return np.zeros_like(X)
    return X / s


def apply_on_axis0(X: np.ndarray, func):
    """Apply a function independently to each slice ``X[i]`` along axis 0.

    ``func`` is called once per ``i`` with a view/copy of ``X[i]`` and its
    first result is used to allocate the output array.

    Args:
        X (np.ndarray): Input array of shape ``(N, ...)`` where ``N >= 1``.
        func (Callable): Function taking ``X[i]`` (shape ``X.shape[1:]``) and
            returning an array-like object. All returns must be broadcast-
            compatible and have identical shape.

    Returns:
        np.ndarray: Stacked results with shape ``(N,) + out0.shape``, where
        ``out0`` is ``func(X[0])``. The dtype matches ``np.asarray(out0).dtype``.

    Raises:
        IndexError: If ``X`` is empty along axis 0 (i.e., ``X.shape[0] == 0``).

    Notes:
        The first call to ``func`` determines the output dtype and shape.
    """
    X = np.asarray(X)
    out0 = func(X[0])
    # the 0th axis has to have as many dims as the X array has along the 0th axis;
    # as for the other axes, they coincide in dimensionality with the output of func
    out = np.empty((X.shape[0],) + np.shape(out0), dtype=np.asarray(out0).dtype)
    out[0] = out0
    for i in range(1, X.shape[0]):
        out[i] = func(X[i])
    return out


def cosine_similarity(A: np.ndarray, eps: float = 1e-12):
    """Create a callable that computes cosine similarity to a fixed array ``A``.

    The returned function takes an array ``B`` (same shape as ``A``), computes
    the cosine similarity between ``A`` and ``B`` (using flattened views),
    and maps it from ``[-1, 1]`` to ``[0, 1]`` via ``(cos + 1) / 2``.
    If either vector has norm below ``eps``, it returns ``0.0``.

    Args:
        A (np.ndarray): Reference array. Coerced with ``np.asarray``.
        eps (float, optional): Small threshold to guard against division by
            near-zero norms. Defaults to ``1e-12``.

    Returns:
        Callable[[np.ndarray], float]: Function ``inner(B)`` that returns a
        similarity score in ``[0, 1]``.

    Raises:
        ValueError: If the input ``B`` provided to the returned function does
            not match the shape of ``A``.
    """

    def inner(B: np.ndarray):
        """Compute cosine similarity between the captured ``A`` and input ``B``.

        Args:
            B (np.ndarray): Array with the same shape as ``A``.

        Returns:
            float: Cosine similarity mapped to ``[0, 1]``. Returns ``0.0`` if
            the product of the norms is below ``eps``.

        Raises:
            ValueError: If ``A.shape != B.shape``.
        """
        nonlocal A
        nonlocal eps

        A = np.asarray(A)
        B = np.asarray(B)
        if A.shape != B.shape:
            raise ValueError(f"shapes must match, got {A.shape} vs {B.shape}")

        a = A.ravel()
        b = B.ravel()

        denom = np.linalg.norm(a) * np.linalg.norm(b)
        if denom < eps:
            return 0.0
        return (float(a @ b / denom) + 1) / 2  # translate from [-1, 1] to [0, 2] to [0, 1]

    return inner


__all__ = [
    "SharedNDArray",
    "l1_norm",
    "apply_on_axis0",
    "cosine_similarity"
]

if __name__ == "__main__":
    pass
```
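To make the lifecycle described in the docstrings concrete, here is a minimal usage sketch. It is illustrative only: the import path `sawnergy.walks.walker_util` is an assumption based on the README's mention of `walker_util.SharedNDArray` below and may differ from the actual module location.

```python
import numpy as np
from sawnergy.walks.walker_util import SharedNDArray  # import path assumed, see above

probs = np.random.default_rng(0).random((4, 8, 8)).astype(np.float32)

# Parent process: allocate the segment and seed it from an existing array.
shared = SharedNDArray.create(probs.shape, probs.dtype, from_array=probs)

# A worker process would attach by name instead of holding `shared` directly:
# remote = SharedNDArray.attach(shared.name, probs.shape, probs.dtype)

frame0 = shared[0]                       # zero-copy, read-only axis-0 view
writable = shared.view(readonly=False)
writable[0, 0, 0] = 1.0                  # writes land directly in the shared buffer

del frame0, writable                     # drop all views before detaching the buffer
shared.close()                           # every process closes its own handle
shared.unlink()                          # exactly one process unlinks the segment
```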
@@ -0,0 +1,290 @@
Metadata-Version: 2.4
Name: sawnergy
Version: 1.0.0
Summary: Toolkit for transforming molecular dynamics (MD) trajectories into rich graph representations
Home-page: https://github.com/Yehor-Mishchyriak/SAWNERGY
Author: Yehor Mishchyriak
License: Apache-2.0
Classifier: Programming Language :: Python :: 3
Classifier: License :: OSI Approved :: Apache Software License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.11
Description-Content-Type: text/markdown
License-File: LICENSE
License-File: NOTICE
Requires-Dist: numpy>=2.0
Requires-Dist: zarr>=3.0
Requires-Dist: threadpoolctl>=3.0
Requires-Dist: matplotlib>=3.7
Requires-Dist: psutil>=5.9
Requires-Dist: ym-pure-ml>=1.2.0
Dynamic: author
Dynamic: classifier
Dynamic: description
Dynamic: description-content-type
Dynamic: home-page
Dynamic: license
Dynamic: license-file
Dynamic: requires-dist
Dynamic: requires-python
Dynamic: summary

# SAWNERGY

A toolkit for transforming molecular dynamics (MD) trajectories into rich graph representations, sampling
random and self-avoiding walks, learning node embeddings, and visualising residue interaction networks (RINs). SAWNERGY
keeps the full workflow, from `cpptraj` output to skip-gram embeddings (node2vec approach), inside Python, backed by efficient Zarr-based archives and optional GPU acceleration.

---

## Why SAWNERGY?

- **Bridge simulations and graph ML**: Convert raw MD trajectories into residue interaction networks ready for graph
  algorithms and downstream machine learning tasks.
- **Deterministic, shareable artefacts**: Every stage produces compressed Zarr archives that contain both data and metadata so runs can be reproduced, shared, or inspected later.
- **High-performance data handling**: Heavy arrays live in shared memory during walk sampling, enabling parallel processing without serialization overhead; archives are written in chunked, compressed form for fast read/write.
- **Flexible embedding backends**: Train skip-gram with negative sampling (SGNS) models using either PureML or PyTorch.
- **Visualization out of the box**: Plot and animate residue networks without leaving Python, using the data produced by `RINBuilder`.

---

## Pipeline at a Glance

```
MD Trajectory + Topology
           │
           ▼
      RINBuilder
           │ → RIN archive (.zip/.zarr) → Visualizer (display/animate RINs)
           ▼
        Walker
           │ → Walks archive (RW/SAW per frame)
           ▼
      Embedder
           │ → Embedding archive (frame × vocab × dim)
           ▼
   Downstream ML
```

Each stage consumes the archive produced by the previous one. Metadata embedded in the archives ensures frame order,
node indexing, and RNG seeds stay consistent across the toolchain.

---

## Core Components

### `sawnergy.rin.RINBuilder`

* Wraps the AmberTools `cpptraj` executable to:
  - compute per-frame electrostatic (EMAP) and van der Waals (VMAP) energy matrices at the atomic level,
  - project atom–atom interactions to residue–residue interactions using compositional masks,
  - prune, symmetrise, remove self-interactions, and L1-normalise the matrices (a toy sketch of these steps follows this list),
  - compute per-residue centres of mass (COM) over the same frames.
* Outputs a compressed Zarr archive with transition matrices, optional prenormalised energies, COM snapshots, and rich
  metadata (frame range, pruning quantile, molecule ID, etc.).
* Supports parallel `cpptraj` execution, batch processing, and keeps temporary stores tidy via
  `ArrayStorage.compress_and_cleanup`.
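The pruning and normalisation steps listed above can be pictured with a toy NumPy sketch. This is only a conceptual illustration, not `RINBuilder`'s actual implementation; the 30% pruning fraction simply mirrors the `prune_low_energies_frac=0.3` used in the Quick Start below.

```python
import numpy as np

E = np.random.default_rng(0).random((4, 4))    # toy residue-residue energy matrix

E = np.where(E < np.quantile(E, 0.3), 0.0, E)  # prune the lowest-energy 30% of entries
E = 0.5 * (E + E.T)                            # symmetrise
np.fill_diagonal(E, 0.0)                       # remove self-interactions
row_mass = E.sum(axis=1, keepdims=True)
P = np.divide(E, row_mass, out=np.zeros_like(E), where=row_mass > 0)  # row-wise L1-normalise
```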

### `sawnergy.visual.Visualizer`

* Opens RIN archives, resolves dataset names from attributes, and renders nodes plus attractive/repulsive edge bundles
  in 3D using Matplotlib.
* Allows both static frame visualization and trajectory animation.
* Handles backend selection (`Agg` fallback in headless environments) and offers convenient colour palettes via
  `visualizer_util`.

### `sawnergy.walks.Walker`

* Attaches to the RIN archive and loads attractive/repulsive transition matrices into shared memory using
  `walker_util.SharedNDArray` so multiple processes can sample without copying.
* Samples random walks (RW) and self-avoiding walks (SAW), optionally time-aware: in that mode, walks move through the
  per-frame transition matrices with transition probabilities proportional to the cosine similarity between the current
  and the next frame (see the sketch after this list). Randomness is controlled by the seed passed to the class constructor.
* Persists walks as `(time, walk_id, length+1)` tensors (1-based node indices) alongside metadata such as
  `walk_length`, `walks_per_node`, and RNG scheme.
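The frame-to-frame weighting mentioned in the time-aware bullet is exactly what the module-level `cosine_similarity` helper shown earlier computes. A minimal sketch follows; it is illustrative only, not the `Walker` internals, and the import path is assumed.

```python
import numpy as np
from sawnergy.walks.walker_util import cosine_similarity  # import path assumed

rng = np.random.default_rng(123)
frame_t = rng.random((5, 5))       # stand-in for the current frame's transition matrix
frame_t_next = rng.random((5, 5))  # stand-in for the next frame's matrix

sim_to_t = cosine_similarity(frame_t)  # callable bound to frame_t
weight = sim_to_t(frame_t_next)        # similarity mapped into [0, 1]
print(f"frame-to-frame similarity: {weight:.3f}")
```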

### `sawnergy.embedding.Embedder`

* Consumes walk archives, generates skip-gram pairs, and normalises them to 0-based indices.
* Provides a unified interface to SGNS implementations:
  - **PureML backend** (`SGNS_PureML`): works with the `pureml` ecosystem; geared toward CPU training.
  - **PyTorch backend** (`SGNS_Torch`): uses `torch.nn.Embedding` and plays nicely with GPUs.
* Both `SGNS_PureML` and `SGNS_Torch` accept training hyperparameters such as batch size, learning rate, optimizer, and LR scheduler.
* Exposes `embed_frame` (single frame) and `embed_all` (all frames, deterministic seeding per frame), which return the
  learned input embedding matrices and write them to disk when requested.

### Supporting Utilities

* `sawnergy.sawnergy_util`
  - `ArrayStorage`: thin wrapper over Zarr v3 with helpers for chunk management, attribute coercion to JSON, and transparent compression to `.zip` archives.
  - Parallel helpers (`elementwise_processor`, `compose_steps`, etc.), temporary file management, logging, and runtime
    inspection utilities.
* `sawnergy.logging_util.configure_logging`: configure rotating file/console logging consistently across scripts.

---

## Archive Layouts

| Archive | Key datasets (name → shape, dtype) | Important attributes (root `attrs`) |
|---|---|---|
| **RIN** | `ATTRACTIVE_transitions` → **(T, N, N)**, float32 • `REPULSIVE_transitions` → **(T, N, N)**, float32 (optional) • `ATTRACTIVE_energies` → **(T, N, N)**, float32 (optional) • `REPULSIVE_energies` → **(T, N, N)**, float32 (optional) • `COM` → **(T, N, 3)**, float32 | `time_created` (ISO) • `com_name` = `"COM"` • `molecule_of_interest` (int) • `frame_range` = `(start, end)` inclusive • `frame_batch_size` (int) • `prune_low_energies_frac` (float in [0,1]) • `attractive_transitions_name` / `repulsive_transitions_name` (dataset names or `None`) • `attractive_energies_name` / `repulsive_energies_name` (dataset names or `None`) |
| **Walks** | `ATTRACTIVE_RWs` → **(T, N·num_RWs, L+1)**, int32 (optional) • `REPULSIVE_RWs` → **(T, N·num_RWs, L+1)**, int32 (optional) • `ATTRACTIVE_SAWs` → **(T, N·num_SAWs, L+1)**, int32 (optional) • `REPULSIVE_SAWs` → **(T, N·num_SAWs, L+1)**, int32 (optional) <br/>_Note:_ node IDs are **1-based**. | `time_created` (ISO) • `seed` (int) • `rng_scheme` = `"SeedSequence.spawn_per_batch_v1"` • `num_workers` (int) • `in_parallel` (bool) • `batch_size_nodes` (int) • `num_RWs` / `num_SAWs` (ints) • `node_count` (N) • `time_stamp_count` (T) • `walk_length` (L) • `walks_per_node` (int) • `attractive_RWs_name` / `repulsive_RWs_name` / `attractive_SAWs_name` / `repulsive_SAWs_name` (dataset names or `None`) • `walks_layout` = `"time_leading_3d"` |
| **Embeddings** | `FRAME_EMBEDDINGS` → **(frames_written, vocab_size, D)**, typically float32 | `time_created` (ISO) • `seed` (int) • `rng_scheme` = `"SeedSequence.spawn_per_frame_v1"` • `source_walks_path` (str) • `model_base` = `"torch"` or `"pureml"` • `rin_type` = `"attr"` or `"repuls"` • `using_mode` = `"RW"\|"SAW"\|"merged"` • `window_size` (int) • `alpha` (float; noise exponent) • `dimensionality` = D • `num_negative_samples` (int) • `num_epochs` (int) • `batch_size` (int) • `shuffle_data` (bool) • `frames_written` (int) • `vocab_size` (int) • `frame_count` (int) • `embedding_dtype` (str) • `frame_embeddings_name` = `"FRAME_EMBEDDINGS"` • `arrays_per_chunk` (int) • `compression_level` (int) |

**Notes**

- In **RIN**, `T` equals the number of frame **batches** written (i.e., `frame_range` swept in steps of `frame_batch_size`). `ATTRACTIVE/REPULSIVE_energies` are **pre-normalised** absolute energies (written only when `keep_prenormalized_energies=True`), whereas `ATTRACTIVE/REPULSIVE_transitions` are the **row-wise L1-normalised** versions used for sampling.
- All archives are Zarr v3 groups. `ArrayStorage` also maintains per-block metadata in root attrs: `array_chunk_size_in_block`, `array_shape_in_block`, and `array_dtype_in_block` (dicts keyed by dataset name). You’ll see these in every archive (see the snippet below).
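Because each archive is a Zarr v3 group compressed into a `.zip`, its datasets and root attributes can also be inspected with the `zarr` library directly. A minimal sketch, assuming zarr-python ≥ 3 and that the `.zip` written by `ArrayStorage` is a standard zipped Zarr store (here the `RIN_demo.zip` produced in the Quick Start below):

```python
import zarr

# Open the zipped archive read-only and load its root group.
store = zarr.storage.ZipStore("./RIN_demo.zip", mode="r")
root = zarr.open_group(store=store, mode="r")

print(dict(root.attrs))                      # frame_range, prune_low_energies_frac, ...
print(root["ATTRACTIVE_transitions"].shape)  # (T, N, N)

store.close()
```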

---

## Installation

```bash
pip install sawnergy
```

> **Note:** RIN building requires `cpptraj` (AmberTools). Ensure it is discoverable via `$PATH` or the `CPPTRAJ`
> environment variable.

---

## Quick Start

```python
from pathlib import Path
from sawnergy.logging_util import configure_logging
from sawnergy.rin import RINBuilder
from sawnergy.walks import Walker
from sawnergy.embedding import Embedder

import logging
configure_logging("./logs", file_level=logging.WARNING, console_level=logging.INFO)

# 1. Build a Residue Interaction Network archive
rin_path = Path("./RIN_demo.zip")
rin_builder = RINBuilder()
rin_builder.build_rin(
    topology_file="system.prmtop",
    trajectory_file="trajectory.nc",
    molecule_of_interest=1,
    frame_range=(1, 100),
    frame_batch_size=10,
    prune_low_energies_frac=0.3,
    output_path=rin_path,
    include_attractive=True,
    include_repulsive=False,
)

# 2. Sample walks from the RIN
walker = Walker(rin_path, seed=123)
walks_path = Path("./WALKS_demo.zip")
walker.sample_walks(
    walk_length=16,
    walks_per_node=32,
    saw_frac=0.25,
    include_attractive=True,
    include_repulsive=False,
    time_aware=False,
    output_path=walks_path,
    in_parallel=False,
)
walker.close()

# 3. Train embeddings per frame (PyTorch backend)
import torch

embedder = Embedder(walks_path, base="torch", seed=999)
embeddings_path = embedder.embed_all(
    RIN_type="attr",
    using="merged",
    window_size=4,
    num_negative_samples=5,
    num_epochs=5,
    batch_size=1024,
    dimensionality=128,
    shuffle_data=True,
    output_path="./EMBEDDINGS_demo.zip",
    sgns_kwargs={
        "optim": torch.optim.Adam,
        "optim_kwargs": {"lr": 1e-3},
        "lr_sched": torch.optim.lr_scheduler.LambdaLR,
        "lr_sched_kwargs": {"lr_lambda": lambda _: 1.0},
        "device": "cuda" if torch.cuda.is_available() else "cpu",
    },
)
print("Embeddings written to", embeddings_path)
```

> For the PureML backend, supply the relevant optimiser and scheduler via `sgns_kwargs`
> (for example `optim=pureml.optimizers.Adam`, `lr_sched=pureml.optimizers.CosineAnnealingLR`).
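For comparison, the final step above might look roughly like this with the PureML backend. This is a hedged sketch: the `pureml.optimizers` paths come from the note above, and the exact keyword arguments accepted by `SGNS_PureML` may differ.

```python
import pureml.optimizers
from pathlib import Path
from sawnergy.embedding import Embedder

walks_path = Path("./WALKS_demo.zip")  # walks archive produced in the Quick Start

embedder = Embedder(walks_path, base="pureml", seed=999)
embeddings_path = embedder.embed_all(
    RIN_type="attr",
    using="merged",
    window_size=4,
    num_negative_samples=5,
    num_epochs=5,
    batch_size=1024,
    dimensionality=128,
    shuffle_data=True,
    output_path="./EMBEDDINGS_pureml_demo.zip",
    sgns_kwargs={
        "optim": pureml.optimizers.Adam,
        "lr_sched": pureml.optimizers.CosineAnnealingLR,
        # add optim_kwargs / lr_sched_kwargs as required by the chosen classes
    },
)
```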

---

## Visualisation

```python
from sawnergy.visual import Visualizer

v = Visualizer("./RIN_demo.zip")
v.build_frame(
    1,
    node_colors="rainbow",
    displayed_nodes="ALL",
    displayed_pairwise_attraction_for_nodes="DISPLAYED_NODES",
    displayed_pairwise_repulsion_for_nodes="DISPLAYED_NODES",
    show_node_labels=True,
    show=True,
)
```

`Visualizer` lazily loads datasets and works even in headless environments (falls back to the `Agg` backend).

---

## Advanced Notes

- **Time-aware walks**: Set `time_aware=True` and provide `stickiness` and `on_no_options` when calling
  `Walker.sample_walks` (see the sketch after this list).
- **Shared memory lifecycle**: Call `Walker.close()` (or use a context manager) to release shared-memory segments.
- **PureML vs PyTorch**: Choose the backend via `Embedder(..., base="pureml"|"torch")` and provide backend-specific
  constructor kwargs through `sgns_kwargs` (optimizer, scheduler, device).
- **ArrayStorage utilities**: Use `ArrayStorage` directly to peek into archives, append arrays, or manage metadata.
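A hedged sketch tying the first two notes together: time-aware sampling inside a context manager. The parameter names come from this README; the `stickiness` and `on_no_options` values below are placeholders only, so consult the `Walker.sample_walks` docstring for the accepted options.

```python
from sawnergy.walks import Walker

# The context manager releases the shared-memory segments on exit.
with Walker("./RIN_demo.zip", seed=123) as walker:
    walker.sample_walks(
        walk_length=16,
        walks_per_node=32,
        saw_frac=0.25,
        include_attractive=True,
        include_repulsive=False,
        time_aware=True,
        stickiness=0.5,            # placeholder value
        on_no_options="restart",   # placeholder value; not documented in this README
        output_path="./WALKS_time_aware_demo.zip",
        in_parallel=False,
    )
```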

---

## Testing & Quality Assurance

The automated test suite (`pytest`) synthesises deterministic cpptraj outputs and exercises the entire workflow:

- RIN parsing, residue aggregation, and metadata verification.
- Random/self-avoiding walk sampling and probability consistency with the RIN.
- Embedding orchestration, frame ordering, and SGNS pair-generation property tests.
- PureML and PyTorch SGNS smoke tests verifying finite weights and decreasing loss.
- Visualiser smoke tests that cover data loading and artist updates.

Run the suite (inside the project virtualenv):

```bash
python -m pytest
```

---

## Project Structure

```
├── sawnergy/
│   ├── rin/              # RINBuilder and cpptraj integration helpers
│   ├── walks/            # Walker class and shared-memory utilities
│   ├── embedding/        # Embedder + SGNS backends (PureML / PyTorch)
│   ├── visual/           # Visualizer and palette utilities
│   ├── logging_util.py
│   └── sawnergy_util.py
├── tests/                # Synthetic end-to-end tests (pytest)
└── README.md
```

---

## Acknowledgements

SAWNERGY builds on the AmberTools `cpptraj` ecosystem, NumPy, Matplotlib, Zarr, and PyTorch (optional, for GPU acceleration; the PureML backend is available by default).
Big thanks to the upstream communities whose work makes this toolkit possible.