PyPI - sawnergy - Versions diffs - 1.0.3__py3-none-any.whl → 1.0.9__py3-none-any.whl - Mend

sawnergy 1.0.3 (py3-none-any.whl) → 1.0.9 (py3-none-any.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (18) hide show

sawnergy/__init__.py +3 -1
sawnergy/embedding/SGNS_pml.py +324 -51
sawnergy/embedding/SGNS_torch.py +282 -39
sawnergy/embedding/__init__.py +26 -1
sawnergy/embedding/embedder.py +426 -203
sawnergy/embedding/visualizer.py +251 -0
sawnergy/logging_util.py +1 -1
sawnergy/rin/rin_builder.py +4 -4
sawnergy/visual/visualizer.py +6 -6
sawnergy/visual/visualizer_util.py +3 -0
sawnergy/walks/walker.py +43 -22
{sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/METADATA +91 -57
sawnergy-1.0.9.dist-info/RECORD +23 -0
sawnergy-1.0.3.dist-info/RECORD +0 -22
{sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/WHEEL +0 -0
{sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/licenses/LICENSE +0 -0
{sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/licenses/NOTICE +0 -0
{sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/top_level.txt +0 -0

sawnergy/embedding/embedder.py CHANGED Viewed

@@ -1,15 +1,5 @@
 from __future__ import annotations
-"""
-Embedding orchestration for Skip-Gram with Negative Sampling (SGNS).
-This module consumes attractive/repulsive walk corpora produced by the walker
-pipeline and trains per-frame embeddings using either the PyTorch or PureML
-implementations of SGNS. The resulting embeddings can be persisted back into
-an ``ArrayStorage`` archive along with rich metadata describing the training
-configuration.
-"""
 # third-pary
 import numpy as np
@@ -36,9 +26,8 @@ class Embedder:
     def __init__(self,
                  WALKS_path: str | Path,
-                 base: Literal["torch", "pureml"],
                  *,
-                 seed: int | None = None
+                 seed: int | None = None,
                 ) -> None:
         """Initialize the embedder and load walk tensors.
@@ -50,22 +39,19 @@ class Embedder:
                 ``None`` if that collection is absent), and the metadata
                 ``num_RWs``, ``num_SAWs``, ``node_count``, ``time_stamp_count``,
                 ``walk_length``.
-            base: Which SGNS backend to use, either ``"torch"`` or ``"pureml"``.
             seed: Optional seed for the embedder's RNG. If ``None``, a random
                 32-bit seed is chosen.
         Raises:
             ValueError: If required metadata is missing or any loaded walk array
                 has an unexpected shape.
-            ImportError: If the requested backend is not installed.
-            NameError: If ``base`` is not one of ``{"torch","pureml"}``.
         Notes:
             - Walks in storage are 1-based (residue indexing). Internally, this
               class normalizes to 0-based indices for training utilities.
         """
         self._walks_path = Path(WALKS_path)
-        _logger.info("Initializing Embedder from %s (base=%s)", self._walks_path, base)
+        _logger.info("Initializing Embedder from %s", self._walks_path)
         # placeholders for optional walk collections
         self.attractive_RWs : np.ndarray | None = None
@@ -124,53 +110,76 @@ class Embedder:
         RWs_expected  = (time_stamp_count, node_count * num_RWs,  walk_length+1) if (num_RWs  > 0) else None
         SAWs_expected = (time_stamp_count, node_count * num_SAWs, walk_length+1) if (num_SAWs > 0) else None
-        self.vocab_size = int(node_count)
-        self.frame_count = int(time_stamp_count)
-        self.walk_length = int(walk_length)
+        self.vocab_size   = int(node_count)
+        self.frame_count  = int(time_stamp_count)
+        self.walk_length  = int(walk_length)
+        self.num_RWs      = int(num_RWs)
+        self.num_SAWs     = int(num_SAWs)
+        # Keep dataset names for metadata passthrough
+        self._attractive_RWs_name  = attractive_RWs_name
+        self._repulsive_RWs_name   = repulsive_RWs_name
+        self._attractive_SAWs_name = attractive_SAWs_name
+        self._repulsive_SAWs_name  = repulsive_SAWs_name
         # store walks if present
         if attractive_RWs is not None:
             if RWs_expected and attractive_RWs.shape != RWs_expected:
                 raise ValueError(f"ATTR RWs: expected {RWs_expected}, got {attractive_RWs.shape}")
             self.attractive_RWs = attractive_RWs
+            _logger.debug("ATTR RWs loaded: %s", self.attractive_RWs.shape)
         if repulsive_RWs is not None:
             if RWs_expected and repulsive_RWs.shape != RWs_expected:
                 raise ValueError(f"REP RWs: expected {RWs_expected}, got {repulsive_RWs.shape}")
             self.repulsive_RWs = repulsive_RWs
+            _logger.debug("REP  RWs loaded: %s", self.repulsive_RWs.shape)
         if attractive_SAWs is not None:
             if SAWs_expected and attractive_SAWs.shape != SAWs_expected:
                 raise ValueError(f"ATTR SAWs: expected {SAWs_expected}, got {attractive_SAWs.shape}")
             self.attractive_SAWs = attractive_SAWs
+            _logger.debug("ATTR SAWs loaded: %s", self.attractive_SAWs.shape)
         if repulsive_SAWs is not None:
             if SAWs_expected and repulsive_SAWs.shape != SAWs_expected:
                 raise ValueError(f"REP SAWs: expected {SAWs_expected}, got {repulsive_SAWs.shape}")
             self.repulsive_SAWs = repulsive_SAWs
+            _logger.debug("REP  SAWs loaded: %s", self.repulsive_SAWs.shape)
         # INTERNAL RNG
         self._seed = np.random.randint(0, 2**32 - 1) if seed is None else int(seed)
         self.rng = np.random.default_rng(self._seed)
         _logger.info("RNG initialized from seed=%d", self._seed)
-        # MODEL HANDLE
-        self.model_base: Literal["torch", "pureml"] = base
-        self.model_constructor = self._get_SGNS_constructor_from(base)
-        _logger.info("SGNS backend resolved: %s", getattr(self.model_constructor, "__name__", repr(self.model_constructor)))
     # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=- PRIVATE -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
     # HELPERS:
     @staticmethod
-    def _get_SGNS_constructor_from(base: Literal["torch", "pureml"]):
-        """Resolve the SGNS implementation class for the selected backend."""
+    def _get_NN_constructor_from(base: Literal["torch", "pureml"],
+                                 objective: Literal["sgns", "sg"]):
+        """Resolve the SG/SGNS implementation class for the selected backend.
+        Args:
+            base: Backend family to use, ``"torch"`` or ``"pureml"``.
+            objective: Training objective, ``"sgns"`` or ``"sg"``.
+        Returns:
+            A callable class (constructor) implementing the requested model.
+        Raises:
+            ImportError: If the requested backend cannot be imported.
+            NameError: If ``base`` is not one of the supported values.
+        """
+        _logger.debug("Resolving model constructor: base=%s objective=%s", base, objective)
         if base == "torch":
             try:
-                from .SGNS_torch import SGNS_Torch
-                return SGNS_Torch
+                from .SGNS_torch import SGNS_Torch, SG_Torch
+                ctor = SG_Torch if objective == "sg" else SGNS_Torch
+                _logger.debug("Resolved PyTorch class: %s", getattr(ctor, "__name__", str(ctor)))
+                return ctor
             except Exception:
+                _logger.exception("Failed to import PyTorch backend.")
                 raise ImportError(
                     "PyTorch is not installed, but base='torch' was requested. "
                     "Install PyTorch first, e.g.: `pip install torch` "
@@ -178,9 +187,12 @@ class Embedder:
                 )
         elif base == "pureml":
             try:
-                from .SGNS_pml import SGNS_PureML
-                return SGNS_PureML
+                from .SGNS_pml import SGNS_PureML, SG_PureML
+                ctor = SG_PureML if objective == "sg" else SGNS_PureML
+                _logger.debug("Resolved PureML class: %s", getattr(ctor, "__name__", str(ctor)))
+                return ctor
             except Exception:
+                _logger.exception("Failed to import PureML backend.")
                 raise ImportError(
                     "PureML is not installed, but base='pureml' was requested. "
                     "Install PureML first via `pip install ym-pure-ml` "
@@ -190,7 +202,18 @@ class Embedder:
     @staticmethod
     def _as_zerobase_intp(walks: np.ndarray, *, V: int) -> np.ndarray:
-        """Validate 1-based uint/int walks → 0-based intp; check bounds."""
+        """Validate and convert 1-based walks to 0-based ``intp``.
+        Args:
+            walks: 2D array of node ids with 1-based indexing.
+            V: Vocabulary size for bounds checking.
+        Returns:
+            2D array of dtype ``intp`` with 0-based indices.
+        Raises:
+            ValueError: If shape is not 2D or indices are out of bounds.
+        """
         arr = np.asarray(walks)
         if arr.ndim != 2:
             raise ValueError("walks must be 2D: (num_walks, walk_len)")
@@ -198,7 +221,9 @@ class Embedder:
             arr = arr.astype(np.int64, copy=False)
         # 1-based → 0-based
         arr = arr - 1
-        if arr.min() < 0 or arr.max() >= V:
+        mn, mx = int(arr.min()), int(arr.max())
+        _logger.debug("Zero-basing walks: min=%d max=%d V=%d", mn, mx, V)
+        if mn < 0 or mx >= V:
             raise ValueError("walk ids out of range after 1→0-based normalization")
         return arr.astype(np.intp, copy=False)
@@ -206,19 +231,29 @@ class Embedder:
     def _pairs_from_walks(walks0: np.ndarray, window_size: int) -> np.ndarray:
         """
         Skip-gram pairs including edge centers (one-sided when needed).
-        walks0: (W, L) int array (0-based ids).
-        Returns: (N_pairs, 2) int32 [center, context].
+        Args:
+            walks0: (W, L) int array (0-based ids).
+            window_size: Symmetric context window radius.
+        Returns:
+            Array of shape (N_pairs, 2) int32 with columns [center, context].
+        Raises:
+            ValueError: If shape is invalid or ``window_size`` <= 0.
         """
         if walks0.ndim != 2:
             raise ValueError("walks must be 2D: (num_walks, walk_len)")
         _, L = walks0.shape
         k = int(window_size)
+        _logger.debug("Building SG pairs: L=%d, window=%d", L, k)
         if k <= 0:
             raise ValueError("window_size must be positive")
         if L == 0:
+            _logger.debug("Empty walks length; returning 0 pairs.")
             return np.empty((0, 2), dtype=np.int32)
         out_chunks = []
@@ -236,18 +271,42 @@ class Embedder:
             out_chunks.append(np.stack((centers_l, ctx_l), axis=2).reshape(-1, 2))
         if not out_chunks:
+            _logger.debug("No offsets produced pairs; returning empty.")
             return np.empty((0, 2), dtype=np.int32)
-        return np.concatenate(out_chunks, axis=0).astype(np.int32, copy=False)
+        pairs = np.concatenate(out_chunks, axis=0).astype(np.int32, copy=False)
+        _logger.debug("Built %d training pairs", pairs.shape[0])
+        return pairs
     @staticmethod
     def _freq_from_walks(walks0: np.ndarray, *, V: int) -> np.ndarray:
-        """Node frequencies from walks (0-based)."""
-        return np.bincount(walks0.ravel(), minlength=V).astype(np.int64, copy=False)
+        """Node frequencies from walks (0-based).
+        Args:
+            walks0: 2D array of 0-based node ids.
+            V: Vocabulary size (minlength for bincount).
+        Returns:
+            1D array of int64 frequencies with length ``V``.
+        """
+        freq = np.bincount(walks0.ravel(), minlength=V).astype(np.int64, copy=False)
+        _logger.debug("Frequency mass: total=%d nonzero=%d", int(freq.sum()), int(np.count_nonzero(freq)))
+        return freq
     @staticmethod
     def _soft_unigram(freq: np.ndarray, *, power: float = 0.75) -> np.ndarray:
-        """Return normalized Pn(w) ∝ f(w)^power as float64 probs."""
+        """Return normalized Pn(w) ∝ f(w)^power as float64 probs.
+        Args:
+            freq: 1D array of token frequencies.
+            power: Exponent used for smoothing (default 0.75 à la word2vec).
+        Returns:
+            1D array of probabilities summing to 1.0.
+        Raises:
+            ValueError: If mass is invalid (all zeros or non-finite).
+        """
         p = np.asarray(freq, dtype=np.float64)
         if p.sum() == 0:
             raise ValueError("all frequencies are zero")
@@ -255,13 +314,31 @@ class Embedder:
         s = p.sum()
         if not np.isfinite(s) or s <= 0:
             raise ValueError("invalid unigram mass")
-        return p / s
+        probs = p / s
+        _logger.debug("Noise distribution ready (power=%.3f)", power)
+        return probs
     def _materialize_walks(self, frame_id: int, rin: Literal["attr", "repuls"],
                            using: Literal["RW", "SAW", "merged"]) -> np.ndarray:
+        """Materialize the requested set of walks for a frame.
+        Args:
+            frame_id: 1-based frame index.
+            rin: Which RIN to pull from: ``"attr"`` or ``"repuls"``.
+            using: Which walk sets to include: ``"RW"``, ``"SAW"``, or ``"merged"``.
+                If ``"merged"``, concatenate available RW and SAW along axis 0.
+        Returns:
+            A 2D array of walks with shape (num_walks, walk_length+1).
+        Raises:
+            IndexError: If ``frame_id`` is out of range.
+            ValueError: If no matching walks are available.
+        """
         if not 1 <= frame_id <= int(self.frame_count):
             raise IndexError(f"frame_id must be in [1, {self.frame_count}]; got {frame_id}")
+        _logger.debug("Materializing %s walks at frame=%d using=%s", rin, frame_id, using)
         frame_id -= 1
         if rin == "attr":
@@ -288,8 +365,12 @@ class Embedder:
         if not parts:
             raise ValueError(f"No walks available for {rin=} with {using=}")
         if len(parts) == 1:
-            return parts[0]
-        return np.concatenate(parts, axis=0)
+            out = parts[0]
+        else:
+            out = np.concatenate(parts, axis=0)
+        _logger.debug("Materialized walks shape: %s", getattr(out, "shape", None))
+        return out
     # INTERFACES: (private)
@@ -298,6 +379,17 @@ class Embedder:
                                     using: Literal["RW", "SAW", "merged"],
                                     window_size: int,
                                     alpha: float = 0.75) -> tuple[np.ndarray, np.ndarray]:
+        """Construct (center, context) pairs and noise distribution for ATTR.
+        Args:
+            frame_id: 1-based frame index.
+            using: Walk subset to include.
+            window_size: Skip-gram window radius.
+            alpha: Unigram smoothing exponent.
+        Returns:
+            Tuple of (pairs, noise_probs).
+        """
         walks = self._materialize_walks(frame_id, "attr", using)
         walks0 = self._as_zerobase_intp(walks, V=self.vocab_size)
         attractive_corpus = self._pairs_from_walks(walks0, window_size)
@@ -311,6 +403,17 @@ class Embedder:
                                    using: Literal["RW", "SAW", "merged"],
                                    window_size: int,
                                    alpha: float = 0.75) -> tuple[np.ndarray, np.ndarray]:
+        """Construct (center, context) pairs and noise distribution for REP.
+        Args:
+            frame_id: 1-based frame index.
+            using: Walk subset to include.
+            window_size: Skip-gram window radius.
+            alpha: Unigram smoothing exponent.
+        Returns:
+            Tuple of (pairs, noise_probs).
+        """
         walks = self._materialize_walks(frame_id, "repuls", using)
         walks0 = self._as_zerobase_intp(walks, V=self.vocab_size)
         repulsive_corpus = self._pairs_from_walks(walks0, window_size)
@@ -322,56 +425,63 @@ class Embedder:
     # -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-= PUBLIC -=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
     def embed_frame(self,
-              frame_id: int,
-              RIN_type: Literal["attr", "repuls"],
-              using: Literal["RW", "SAW", "merged"],
-              window_size: int,
-              num_negative_samples: int,
-              num_epochs: int,
-              batch_size: int,
-              *,
-              shuffle_data: bool = True,
-              dimensionality: int = 128,
-              alpha: float = 0.75,
-              device: str | None = None,
-              sgns_kwargs: dict[str, object] | None = None,
-              _seed: int | None = None
-              ) -> np.ndarray:
-        """Train embeddings for a single frame and return the input embedding matrix.
+            frame_id: int,
+            RIN_type: Literal["attr", "repuls"],
+            using: Literal["RW", "SAW", "merged"],
+            num_epochs: int,
+            negative_sampling: bool = False,
+            window_size: int = 5,
+            num_negative_samples: int = 10,
+            batch_size: int = 1024,
+            *,
+            in_weights:  np.ndarray | None = None,
+            out_weights: np.ndarray | None = None,
+            lr_step_per_batch: bool = False,
+            shuffle_data: bool = True,
+            dimensionality: int = 128,
+            alpha: float = 0.75,
+            device: str | None = None,
+            model_base: Literal["torch", "pureml"] = "pureml",
+            model_kwargs: dict[str, object] | None = None,
+            kind: tuple[Literal["in", "out", "avg"]] = ("in",),
+            _seed: int | None = None
+            ) -> list[tuple[np.ndarray, str]]:
+        """Train embeddings for a single frame and return the matrix containing embeddings of the specified `kind`.
         Args:
-            frame_id: 1-based frame index to train on.
-            RIN_type: Interaction channel to use: ``"attr"`` (attractive) or
-                ``"repuls"`` (repulsive).
-            using: Which walk collections to include: ``"RW"``, ``"SAW"``, or
-                ``"merged"`` (concatenates both if available).
-            window_size: Symmetric skip-gram window size ``k``.
-            num_negative_samples: Number of negative samples per positive pair.
-            num_epochs: Number of passes over the pair dataset.
-            batch_size: Mini-batch size for training.
-            shuffle_data: Whether to shuffle pairs each epoch.
-            dimensionality: Embedding dimensionality ``D``.
-            alpha: Noise distribution exponent (``Pn ∝ f^alpha``).
-            device: Optional device string for the Torch backend (e.g., ``"cuda"``).
-            sgns_kwargs: Extra keyword arguments forwarded to the backend SGNS
-                constructor. For PureML, required keys are:
-                ``{"optim", "optim_kwargs", "lr_sched", "lr_sched_kwargs"}``.
-            _seed: Optional child seed for this frame's model initialization.
+            frame_id: 1-based frame index to embed.
+            RIN_type: ``"attr"`` or ``"repuls"`` - which corpus to use.
+            using: Which walks to use (``"RW"``, ``"SAW"``, or ``"merged"``).
+            num_epochs: Number of passes over the pairs.
+            negative_sampling: If ``True``, use SGNS objective; else plain SG.
+            window_size: Skip-gram symmetric window radius.
+            num_negative_samples: Negatives per positive pair (SGNS only).
+            batch_size: Minibatch size for training.
+            in_weights: Optional starting input-embedding matrix of shape (V, D).
+            out_weights: Optional starting output-embedding matrix of shape (V, D).
+                        SGNS: shape (V, D)
+                        SG:   shape (D, V)
+            lr_step_per_batch: If ``True``, step LR every batch (else per epoch).
+            shuffle_data: Shuffle pairs each epoch.
+            dimensionality: Embedding dimension ``D``.
+            alpha: Unigram smoothing power for noise distribution.
+            device: Optional backend device hint (e.g., ``"cuda"``).
+            model_base: Backend family (``"torch"`` or ``"pureml"``).
+            model_kwargs: Passed through to backend model constructor.
+            kind: Which embedding to return: ``"in"``, ``"out"``, or ``"avg"``.
+            _seed: Optional override seed for this frame.
         Returns:
-            np.ndarray: Learned **input** embedding matrix of shape ``(V, D)``.
-        Raises:
-            ValueError: If requested walks are missing, if no training pairs are
-                generated, or if required ``sgns_kwargs`` for PureML are absent.
-            AttributeError: If the SGNS model does not expose embeddings via
-                ``.embeddings`` or ``.parameters[0]``.
+            list[tuple[np.ndarray, Literal["avg","in","out"]]]:
+                (embedding, kind) pairs sorted as 'avg', 'in', 'out'.
         """
         _logger.info(
-            "Preparing frame %d (rin=%s using=%s window=%d neg=%d epochs=%d batch=%d)",
-            frame_id, RIN_type, using, window_size, num_negative_samples, num_epochs, batch_size
+            "embed_frame: frame=%d RIN=%s using=%s base=%s D=%d epochs=%d batch=%d sgns=%s window_size=%d alpha=%.3f",
+            frame_id, RIN_type, using, model_base, dimensionality, num_epochs, batch_size,
+            str(negative_sampling), window_size, alpha
         )
+        # ------------------ resolve training data -----------------
         if RIN_type == "attr":
             if self.attractive_RWs is None and self.attractive_SAWs is None:
                 raise ValueError("Attractive random walks are missing")
@@ -381,125 +491,125 @@ class Embedder:
                 raise ValueError("Repulsive random walks are missing")
             pairs, noise_probs = self._repulsive_corpus_and_prob(frame_id=frame_id, using=using, window_size=window_size, alpha=alpha)
         else:
-            raise ValueError(f"Unknown RIN_type: {RIN_type!r}")
+            raise NameError(f"Unknown RIN_type: {RIN_type!r}")
         if pairs.size == 0:
             raise ValueError("No training pairs generated for the requested configuration")
+        # ----------------------------------------------------------
+        # ---------------- construct training corpus ---------------
         centers  = pairs[:, 0].astype(np.int64, copy=False)
         contexts = pairs[:, 1].astype(np.int64, copy=False)
+        _logger.debug("Pairs split: centers=%s contexts=%s", centers.shape, contexts.shape)
+        # ----------------------------------------------------------
-        model_kwargs: dict[str, object] = dict(sgns_kwargs or {})
-        if self.model_base == "pureml":
-            required = {"optim", "optim_kwargs", "lr_sched", "lr_sched_kwargs"}
-            missing = required.difference(model_kwargs)
-            if missing:
-                raise ValueError(f"PureML backend requires {sorted(missing)} in sgns_kwargs.")
+        # ------------ resolve model_constructor kwargs ------------
+        if model_kwargs is not None:
+            if (("lr_sched" in model_kwargs and model_kwargs.get("lr_sched", None) is not None)
+                and ("lr_sched_kwargs" in model_kwargs and model_kwargs.get("lr_sched_kwargs", None) is None)):
+                raise ValueError("When `lr_sched`, you must also provide `lr_sched_kwargs`.")
-        child_seed = int(self._seed if _seed is None else _seed)
-        model_kwargs.update({
+        constructor_kwargs: dict[str, object] = dict(model_kwargs or {})
+        constructor_kwargs.update({
             "V": self.vocab_size,
             "D": dimensionality,
-            "seed": child_seed
+            "in_weights": in_weights,
+            "out_weights": out_weights,
+            "seed": int(self._seed if _seed is None else _seed),
+            "device": device
         })
+        _logger.debug("Model constructor kwargs: %s", {k: constructor_kwargs[k] for k in ("V","D","seed","device")})
+        # ----------------------------------------------------------
+        # --------------- resolve model constructor ----------------
+        model_constructor = self._get_NN_constructor_from(
+            model_base, objective=("sgns" if negative_sampling else "sg"))
+        # ----------------------------------------------------------
+        # ------------------ initialize the model ------------------
+        model = model_constructor(**constructor_kwargs)
+        _logger.debug("Model initialized: %s", model_constructor.__name__ if hasattr(model_constructor,"__name__") else str(model_constructor))
+        # ----------------------------------------------------------
+        # -------------------- fitting the data --------------------
+        _logger.info("Fitting model on %d pairs ...", pairs.shape[0])
+        model.fit(centers=centers,
+                  contexts=contexts,
+                  num_epochs=num_epochs,
+                  batch_size=batch_size,
+                  # -- optional; for SGNS; safely ignored by SG via **_ignore --
+                  num_negative_samples=num_negative_samples,
+                  noise_dist=noise_probs,
+                  # -----------------------------------------
+                  shuffle_data=shuffle_data,
+                  lr_step_per_batch=lr_step_per_batch
+            )
+        _logger.info("Training complete for frame %d", frame_id)
+        # ----------------------------------------------------------
-        if self.model_base == "torch" and device is not None:
-            model_kwargs["device"] = device
-        self.model = self.model_constructor(**model_kwargs)
-        _logger.info(
-            "Training SGNS base=%s constructor=%s frame=%d pairs=%d dim=%d epochs=%d batch=%d neg=%d shuffle=%s",
-            self.model_base,
-            getattr(self.model_constructor, "__name__", repr(self.model_constructor)),
-            frame_id,
-            pairs.shape[0],
-            dimensionality,
-            num_epochs,
-            batch_size,
-            num_negative_samples,
-            shuffle_data
-        )
-        self.model.fit(
-            centers,
-            contexts,
-            num_epochs,
-            batch_size,
-            num_negative_samples,
-            noise_probs,
-            shuffle_data,
-            lr_step_per_batch=False
-        )
+        if any([k not in ("in", "out", "avg") for k in kind]):
+            raise NameError(f"Unknown embeddings kind in {kind}. Expected: one of ['in', 'out', 'avg']")
-        embeddings = getattr(self.model, "embeddings", None)
-        if embeddings is None:
-            params = getattr(self.model, "parameters", None)
-            if isinstance(params, tuple) and params:
-                embeddings = params[0]
-        if embeddings is None:
-            raise AttributeError("SGNS model does not expose embeddings via '.embeddings' or '.parameters[0]'")
+        # OUTPUT:
+        embeddings = [(np.asarray(model.in_embeddings, dtype=np.float32),  k)  if k == "in" else
+                      (np.asarray(model.out_embeddings, dtype=np.float32), k)  if k == "out" else
+                      (np.asarray(model.avg_embeddings, dtype=np.float32), k)  if k == "avg" else
+                      (None, k)
+                      for k in kind
+                    ]
+        embeddings.sort(key=lambda pair: pair[1]) # ensures 'avg', 'in', 'out' ordering
-        embeddings = np.asarray(embeddings)
-        _logger.info("Frame %d embeddings ready: shape=%s dtype=%s", frame_id, embeddings.shape, embeddings.dtype)
         return embeddings
     def embed_all(
         self,
         RIN_type: Literal["attr", "repuls"],
         using: Literal["RW", "SAW", "merged"],
-        window_size: int,
-        num_negative_samples: int,
         num_epochs: int,
-        batch_size: int,
+        negative_sampling: bool = False,
+        window_size: int = 2,
+        num_negative_samples: int = 10,
+        batch_size: int = 1024,
         *,
+        lr_step_per_batch: bool = False,
         shuffle_data: bool = True,
         dimensionality: int = 128,
         alpha: float = 0.75,
         device: str | None = None,
-        sgns_kwargs: dict[str, object] | None = None,
+        model_base: Literal["torch", "pureml"] = "pureml",
+        model_kwargs: dict[str, object] | None = None,
+        kind: Literal["in", "out", "avg"] = "in",
         output_path: str | Path | None = None,
         num_matrices_in_compressed_blocks: int = 20,
-        compression_level: int = 3):
-        """Train embeddings for all frames and persist them to compressed storage.
+        compression_level: int = 3,
+        ) -> str:
+        """Embed all frames and persist a self-contained archive.
-        Iterates through all frames (``1..frame_count``), trains an SGNS model
-        per frame using the configured backend, collects the resulting input
-        embeddings, and writes them into a new compressed ``ArrayStorage`` archive.
+        The resulting file stores a block named ``FRAME_EMBEDDINGS`` with a
+        compressed sequence of per-frame matrices (each ``(V, D)``), alongside
+        rich metadata mirroring the style of other SAWNERGY modules.
         Args:
-            RIN_type: Interaction channel to use: ``"attr"`` or ``"repuls"``.
-            using: Walk collections: ``"RW"``, ``"SAW"``, or ``"merged"``.
-            window_size: Symmetric skip-gram window size ``k``.
-            num_negative_samples: Number of negative samples per positive pair.
-            num_epochs: Number of epochs for each frame.
-            batch_size: Mini-batch size used during training.
-            shuffle_data: Whether to shuffle pairs each epoch.
-            dimensionality: Embedding dimensionality ``D``.
-            alpha: Noise distribution exponent (``Pn ∝ f^alpha``).
-            device: Optional device string for Torch backend.
-            sgns_kwargs: Extra constructor kwargs for the SGNS backend (see
-                :meth:`embed_frame` for PureML requirements).
-            output_path: Destination path. If ``None``, a new file named
-                ``EMBEDDINGS_<timestamp>.zip`` is created next to the source
-                WALKS archive. If the provided path lacks a suffix, ``.zip`` is
-                appended.
-            num_matrices_in_compressed_blocks: Number of per-frame matrices to
-                store per compressed chunk in the output archive.
-            compression_level: Blosc Zstd compression level (0-9).
+            RIN_type: ``"attr"`` or ``"repuls"`` - which corpus to use.
+            using: Which walks to use (``"RW"``, ``"SAW"``, or ``"merged"``).
+            num_epochs: Number of epochs to train per frame.
+            negative_sampling: If ``True``, use SGNS; otherwise plain SG.
+            window_size: Skip-gram window radius.
+            num_negative_samples: Negatives per positive pair (SGNS).
+            batch_size: Minibatch size for training.
+            lr_step_per_batch: If ``True``, step LR per batch (else per epoch).
+            shuffle_data: Shuffle pairs each epoch.
+            dimensionality: Embedding dimension.
+            alpha: Unigram smoothing power for noise distribution.
+            device: Backend device hint (e.g., ``"cuda"``).
+            model_base: Backend family (``"torch"`` or ``"pureml"``).
+            model_kwargs: Passed through to backend model constructor.
+            kind: Which embedding to store: ``"in"``, ``"out"``, or ``"avg"``.
+            output_path: Optional path for the output archive (``.zip`` inferred).
+            num_matrices_in_compressed_blocks: How many frames per compressed chunk.
+            compression_level: Integer compression level for the archive.
         Returns:
-            str: Filesystem path to the written embeddings archive (``.zip``).
-        Raises:
-            ValueError: If configuration produces no pairs for a frame or if
-                PureML kwargs are incomplete.
-            RuntimeError: Propagated from storage operations on failure.
-        Notes:
-            - A deterministic child seed is spawned per frame from the master
-              seed using ``np.random.SeedSequence`` to ensure reproducibility
-              across runs.
+            Path to the created embeddings archive, as ``str``.
         """
         current_time = sawnergy_util.current_time()
         if output_path is None:
@@ -510,69 +620,182 @@ class Embedder:
                 output_path = output_path.with_suffix(".zip")
         _logger.info(
-            "Embedding all frames -> %s | frames=%d dim=%d base=%s",
-            output_path, self.frame_count, dimensionality, self.model_base
+            "embed_all: frames=%d D=%d base=%s RIN=%s using=%s out=%s",
+            self.frame_count, dimensionality, model_base, RIN_type, using, output_path
         )
+        # Per-frame deterministic seeds
         master_ss = np.random.SeedSequence(self._seed)
         child_seeds = master_ss.spawn(self.frame_count)
-        embeddings = []
-        for frame_idx, seed_seq in enumerate(child_seeds, start=1):
+        embeddings: list[np.ndarray] = []
+        last_frame_in_embs:  np.ndarray = None
+        last_frame_out_embs: np.ndarray = None
+        used_child_seeds: list[int] = []
+        for frame_id, seed_seq in enumerate(child_seeds, start=1):
             child_seed = int(seed_seq.generate_state(1, dtype=np.uint32)[0])
-            _logger.info("Processing frame %d/%d (child_seed=%d entropy=%d)", frame_idx, self.frame_count, child_seed, seed_seq.entropy)
-            embeddings.append(
+            used_child_seeds.append(child_seed)
+            _logger.info("Embedding frame %d/%d with seed=%d", frame_id, self.frame_count, child_seed)
+            embs_and_kinds: list[tuple[np.ndarray, str]] = \
                 self.embed_frame(
-                    frame_idx,
-                    RIN_type,
-                    using,
-                    window_size,
-                    num_negative_samples,
-                    num_epochs,
-                    batch_size,
+                    frame_id=frame_id,
+                    RIN_type=RIN_type,
+                    using=using,
+                    num_epochs=num_epochs,
+                    negative_sampling=negative_sampling,
+                    window_size=window_size,
+                    num_negative_samples=num_negative_samples,
+                    batch_size=batch_size,
+                    in_weights=last_frame_in_embs,
+                    out_weights=last_frame_out_embs,
+                    lr_step_per_batch=lr_step_per_batch,
                     shuffle_data=shuffle_data,
                     dimensionality=dimensionality,
                     alpha=alpha,
                     device=device,
-                    sgns_kwargs=sgns_kwargs,
+                    model_base=model_base,
+                    model_kwargs=model_kwargs,
+                    kind=("in", "out", "avg"),
                     _seed=child_seed
                 )
-            )
+            embs = {K: E for (E, K) in embs_and_kinds}
+            last_frame_in_embs  = embs["in"]                          # (V, D)
+            last_frame_out_embs = embs["out"] if negative_sampling else embs["out"].T  # SG needs (D, V), SGNS keeps (V, D)
+            resolved_embedding = embs[kind]
+            embeddings.append(np.asarray(resolved_embedding, dtype=np.float32, copy=False))
+            _logger.debug("Frame %d embedded: E.shape=%s", frame_id, resolved_embedding.shape)
-        embeddings = [np.asarray(e) for e in embeddings]
         block_name = "FRAME_EMBEDDINGS"
         with sawnergy_util.ArrayStorage.compress_and_cleanup(output_path, compression_level=compression_level) as storage:
+            _logger.info("Writing %d frame matrices to block '%s' ...", len(embeddings), block_name)
             storage.write(
                 these_arrays=embeddings,
                 to_block_named=block_name,
                 arrays_per_chunk=num_matrices_in_compressed_blocks
             )
-            storage.add_attr("time_created", current_time)
-            storage.add_attr("seed", int(self._seed))
-            storage.add_attr("rng_scheme", "SeedSequence.spawn_per_frame_v1")
-            storage.add_attr("source_walks_path", str(self._walks_path))
-            storage.add_attr("model_base", self.model_base)
-            storage.add_attr("rin_type", RIN_type)
-            storage.add_attr("using_mode", using)
+            # Core dataset discovery (for consumers like the Embeddings Visualizer)
+            storage.add_attr("frame_embeddings_name", block_name)
+            storage.add_attr("time_stamp_count", int(self.frame_count))
+            storage.add_attr("node_count", int(self.vocab_size))
+            storage.add_attr("embedding_dim", int(dimensionality))
+            # Provenance of input WALKS
+            storage.add_attr("source_WALKS_path", str(self._walks_path))
+            storage.add_attr("walk_length", int(self.walk_length))
+            storage.add_attr("num_RWs", int(self.num_RWs))
+            storage.add_attr("num_SAWs", int(self.num_SAWs))
+            storage.add_attr("attractive_RWs_name", self._attractive_RWs_name)
+            storage.add_attr("repulsive_RWs_name",  self._repulsive_RWs_name)
+            storage.add_attr("attractive_SAWs_name", self._attractive_SAWs_name)
+            storage.add_attr("repulsive_SAWs_name",  self._repulsive_SAWs_name)
+            # Training configuration (sufficient to reproduce)
+            storage.add_attr("objective", "sgns" if negative_sampling else "sg")
+            storage.add_attr("model_base", model_base)
+            storage.add_attr("embedding_kind", kind)  # 'in' | 'out' | 'avg'
+            storage.add_attr("num_epochs", int(num_epochs))
+            storage.add_attr("batch_size", int(batch_size))
             storage.add_attr("window_size", int(window_size))
             storage.add_attr("alpha", float(alpha))
-            storage.add_attr("dimensionality", int(dimensionality))
+            storage.add_attr("negative_sampling", bool(negative_sampling))
             storage.add_attr("num_negative_samples", int(num_negative_samples))
-            storage.add_attr("num_epochs", int(num_epochs))
-            storage.add_attr("batch_size", int(batch_size))
+            storage.add_attr("lr_step_per_batch", bool(lr_step_per_batch))
             storage.add_attr("shuffle_data", bool(shuffle_data))
-            storage.add_attr("frames_written", int(len(embeddings)))
-            storage.add_attr("vocab_size", int(self.vocab_size))
-            storage.add_attr("frame_count", int(self.frame_count))
-            storage.add_attr("embedding_dtype", str(embeddings[0].dtype))
-            storage.add_attr("frame_embeddings_name", block_name)
+            storage.add_attr("device_hint", device if device is not None else "")
+            storage.add_attr("model_kwargs_repr", repr(model_kwargs) if model_kwargs is not None else "{}")
+            # Which walks were used to train
+            storage.add_attr("RIN_type", RIN_type)   # 'attr' or 'repuls'
+            storage.add_attr("using", using)         # 'RW' | 'SAW' | 'merged'
+            # Reproducibility
+            storage.add_attr("master_seed", int(self._seed))
+            storage.add_attr("per_frame_seeds", [int(s) for s in used_child_seeds])
+            # Archive/IO details
             storage.add_attr("arrays_per_chunk", int(num_matrices_in_compressed_blocks))
             storage.add_attr("compression_level", int(compression_level))
+            storage.add_attr("created_at", current_time)
+            _logger.info(
+                "Stored embeddings archive: %s | shape=(T,N,D)=(%d,%d,%d)",
+                output_path, self.frame_count, self.vocab_size, dimensionality
+            )
-        _logger.info("Embedding archive written to %s", output_path)
         return str(output_path)
-__all__ = ["Embedder"]
+# *----------------------------------------------------*
+#                       FUNCTIONS
+# *----------------------------------------------------*
def align_frames(this: np.ndarray,
                 to_this: np.ndarray,
                 *,
                 center: bool = True,
                 add_back_mean: bool = True,
                 allow_reflection: bool = False) -> np.ndarray:
    """
    Align `this` to `to_this` via Orthogonal Procrustes.

    Solves:  min_{R in O(D)} || X R - Y ||_F
    with X = this, Y = to_this (both shape (N, D)) and returns X R.

    Args:
        this: (N, D) matrix (or array-like) to be aligned.
        to_this: (N, D) target matrix (or array-like); row i must correspond
            to row i of `this`.
        center: if truthy, subtract per-dimension means from both matrices
            before solving.
        add_back_mean: if truthy (and `center` is truthy), add Y's mean back
            after alignment so the result lives in Y's coordinate frame.
            Has no effect when `center` is falsy.
        allow_reflection: if falsy, enforce det(R) = +1 (proper rotation).

    Returns:
        Aligned copy of `this` with shape (N, D). The result keeps the input's
        dtype when that dtype is floating point; any other input (integer
        arrays, lists, ...) yields float64, because casting rotated
        coordinates back to an integer type would truncate them.

    Raises:
        ValueError: if either input is not 2D or the two shapes differ.
    """
    source = np.asarray(this)
    X = source.astype(np.float64, copy=False)
    Y = np.asarray(to_this, dtype=np.float64)

    if X.ndim != 2 or Y.ndim != 2:
        raise ValueError(f"Expected 2D arrays; got {X.ndim=} and {Y.ndim=}")
    if X.shape[1] != Y.shape[1]:
        raise ValueError(f"Dimensionalities must match: X.shape={X.shape}, Y.shape={Y.shape}")
    if X.shape[0] != Y.shape[0]:
        raise ValueError(f"Row counts must match (one-to-one correspondence): {X.shape[0]} vs {Y.shape[0]}")

    # Center both point clouds so only the rotation (not the offset) is fitted.
    if center:
        Y_mean = Y.mean(axis=0, keepdims=True)
        Xc = X - X.mean(axis=0, keepdims=True)
        Yc = Y - Y_mean
    else:
        Xc, Yc = X, Y

    # Cross-covariance and SVD:
    # M = X^T Y (D x D); the optimal orthogonal map is R = U V^T for M = U S V^T.
    M = Xc.T @ Yc
    U, _, Vt = np.linalg.svd(M, full_matrices=False)
    R = U @ Vt

    # Enforce a proper rotation unless reflections are allowed. Singular values
    # come back in descending order, so negating the last row of V^T flips the
    # direction that contributes least to the fit.
    if not allow_reflection and np.linalg.det(R) < 0:
        Vt[-1, :] *= -1
        R = U @ Vt

    X_aligned = Xc @ R
    if center and add_back_mean:
        X_aligned = X_aligned + Y_mean

    # Match the input dtype only when it can represent the result faithfully.
    if np.issubdtype(source.dtype, np.floating):
        out_dtype = source.dtype
    else:
        out_dtype = np.float64
    return X_aligned.astype(out_dtype, copy=False)
+__all__ = ["Embedder", "align_frames"]
 if __name__ == "__main__":
     pass

sawnergy 1.0.3__py3-none-any.whl → 1.0.9__py3-none-any.whl

sawnergy 1.0.3py3-none-any.whl → 1.0.9py3-none-any.whl