sawnergy-1.0.0-py3-none-any.whl
This diff shows the content of publicly available package versions as they appear in their public registries; it is provided for informational purposes only.
Potentially problematic release: this version of sawnergy might be problematic.
- sawnergy/__init__.py +13 -0
- sawnergy/embedding/SGNS_pml.py +135 -0
- sawnergy/embedding/SGNS_torch.py +177 -0
- sawnergy/embedding/__init__.py +34 -0
- sawnergy/embedding/embedder.py +578 -0
- sawnergy/logging_util.py +54 -0
- sawnergy/rin/__init__.py +9 -0
- sawnergy/rin/rin_builder.py +936 -0
- sawnergy/rin/rin_util.py +391 -0
- sawnergy/sawnergy_util.py +1182 -0
- sawnergy/visual/__init__.py +42 -0
- sawnergy/visual/visualizer.py +690 -0
- sawnergy/visual/visualizer_util.py +387 -0
- sawnergy/walks/__init__.py +16 -0
- sawnergy/walks/walker.py +795 -0
- sawnergy/walks/walker_util.py +384 -0
- sawnergy-1.0.0.dist-info/METADATA +290 -0
- sawnergy-1.0.0.dist-info/RECORD +22 -0
- sawnergy-1.0.0.dist-info/WHEEL +5 -0
- sawnergy-1.0.0.dist-info/licenses/LICENSE +201 -0
- sawnergy-1.0.0.dist-info/licenses/NOTICE +4 -0
- sawnergy-1.0.0.dist-info/top_level.txt +1 -0
sawnergy/embedding/SGNS_pml.py
ADDED
@@ -0,0 +1,135 @@
+from __future__ import annotations
+
+# third party
+import numpy as np
+from pureml.machinery import Tensor
+from pureml.layers import Embedding
+from pureml.losses import BCE
+from pureml.general_math import sum as t_sum
+from pureml.optimizers import Optim, LRScheduler
+from pureml.training_utils import TensorDataset, DataLoader
+from pureml.base import NN
+
+# built-in
+import logging
+from typing import Type
+
+# *----------------------------------------------------*
+# GLOBALS
+# *----------------------------------------------------*
+
+_logger = logging.getLogger(__name__)
+
+# *----------------------------------------------------*
+# CLASSES
+# *----------------------------------------------------*
+
+class SGNS_PureML(NN):
+    """PureML implementation of Skip-Gram with Negative Sampling."""
+
+    def __init__(self,
+                 V: int,
+                 D: int,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optim],
+                 optim_kwargs: dict,
+                 lr_sched: Type[LRScheduler],
+                 lr_sched_kwargs: dict):
+        """
+        Args:
+            V: Vocabulary size (number of nodes).
+            D: Embedding dimensionality.
+            seed: Optional RNG seed for negative sampling.
+            optim: PureML optimizer class.
+            optim_kwargs: Keyword arguments forwarded to the optimizer.
+            lr_sched: PureML learning-rate scheduler class.
+            lr_sched_kwargs: Keyword arguments forwarded to the scheduler.
+        """
+        self.V, self.D = int(V), int(D)
+        self.in_emb = Embedding(V, D)
+        self.out_emb = Embedding(V, D)
+
+        self.seed = None if seed is None else int(seed)
+        self._rng = np.random.default_rng(self.seed)
+
+        self.optim: Optim = optim(self.parameters, **optim_kwargs)
+        self.lr_sched: LRScheduler = lr_sched(**lr_sched_kwargs)
+        _logger.info("SGNS_PureML init: V=%d D=%d seed=%s", self.V, self.D, self.seed)
+
+    def _sample_neg(self, B: int, K: int, dist: np.ndarray):
+        """Draw negative samples according to the provided unigram distribution."""
+        if dist.ndim != 1 or dist.size != self.V:
+            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {dist.shape}")
+        return self._rng.choice(self.V, size=(B, K), replace=True, p=dist)
+
+    def predict(self, center: Tensor, pos: Tensor, neg: Tensor) -> Tensor:
+        """Compute positive/negative logits for SGNS."""
+        c = self.in_emb(center)
+        pos_e = self.out_emb(pos)
+        neg_e = self.out_emb(neg)
+        pos_logits = t_sum(c * pos_e, axis=-1)
+        neg_logits = t_sum(c[:, None, :] * neg_e, axis=-1)
+        # ^^^
+        # (B,1,D) * (B,K,D) → (B,K,D) → sum D → (B,K)
+
+        return pos_logits, neg_logits
+
+    def fit(self,
+            centers: np.ndarray,
+            contexts: np.ndarray,
+            num_epochs: int,
+            batch_size: int,
+            num_negative_samples: int,
+            noise_dist: np.ndarray,
+            shuffle_data: bool,
+            lr_step_per_batch: bool):
+        """Train SGNS on the provided center/context pairs."""
+        _logger.info(
+            "SGNS_PureML fit: epochs=%d batch=%d negatives=%d shuffle=%s",
+            num_epochs, batch_size, num_negative_samples, shuffle_data
+        )
+        data = TensorDataset(centers, contexts)
+
+        for epoch in range(1, num_epochs + 1):
+            epoch_loss = 0.0
+            batches = 0
+            for cen, pos in DataLoader(data, batch_size=batch_size, shuffle=shuffle_data):
+                neg = self._sample_neg(batch_size, num_negative_samples, noise_dist)
+
+                x_pos_logits, x_neg_logits = self(cen, pos, neg)
+
+                y_pos = Tensor(np.ones_like(x_pos_logits.data))
+                y_neg = Tensor(np.zeros_like(x_neg_logits.data))
+
+                loss = BCE(y_pos, x_pos_logits, from_logits=True) + BCE(y_neg, x_neg_logits, from_logits=True)
+
+                self.optim.zero_grad()
+                loss.backward()
+                self.optim.step()
+
+                if lr_step_per_batch:
+                    self.lr_sched.step()
+
+                loss_value = float(np.asarray(loss.data).mean())
+                epoch_loss += loss_value
+                batches += 1
+                _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss_value)
+
+            if not lr_step_per_batch:
+                self.lr_sched.step()
+
+            mean_loss = epoch_loss / max(batches, 1)
+            _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def embeddings(self) -> np.ndarray:
+        """Return the input embedding matrix as a NumPy array."""
+        W: Tensor = self.in_emb.parameters[0]
+        return np.asarray(W.data)
+
+
+__all__ = ["SGNS_PureML"]
+
+if __name__ == "__main__":
+    pass
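Note that neither backend constructs `noise_dist` itself; both only validate that it is a 1-D array of length `V`. A minimal NumPy sketch of the smoothed unigram distribution conventionally used for SGNS negative sampling; the 0.75 exponent is the word2vec convention, not something this package mandates, and `unigram_noise_dist` is an illustrative helper, not part of the package:

import numpy as np

def unigram_noise_dist(tokens: np.ndarray, V: int, power: float = 0.75) -> np.ndarray:
    # Count occurrences of each id 0..V-1, raise counts to `power`,
    # then renormalize into a probability vector of length V.
    counts = np.bincount(tokens, minlength=V).astype(np.float64)
    smoothed = counts ** power
    return smoothed / smoothed.sum()

# e.g. noise = unigram_noise_dist(contexts, V) before calling fit(...)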
sawnergy/embedding/SGNS_torch.py
ADDED
@@ -0,0 +1,177 @@
+from __future__ import annotations
+
+# third party
+import numpy as np
+import torch
+import torch.nn as nn
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import LRScheduler
+
+# built-in
+import logging
+from typing import Type
+
+# *----------------------------------------------------*
+# GLOBALS
+# *----------------------------------------------------*
+
+_logger = logging.getLogger(__name__)
+
+# *----------------------------------------------------*
+# CLASSES
+# *----------------------------------------------------*
+
+class SGNS_Torch:
+    """PyTorch implementation of Skip-Gram with Negative Sampling."""
+
+    def __init__(self,
+                 V: int,
+                 D: int,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optimizer],
+                 optim_kwargs: dict,
+                 lr_sched: Type[LRScheduler] | None = None,
+                 lr_sched_kwargs: dict | None = None,
+                 device: str | None = None):
+        """
+        Args:
+            V: Vocabulary size (number of nodes).
+            D: Embedding dimensionality.
+            seed: Optional RNG seed for PyTorch.
+            optim: Optimizer class to instantiate.
+            optim_kwargs: Keyword arguments for the optimizer.
+            lr_sched: Optional learning-rate scheduler class.
+            lr_sched_kwargs: Keyword arguments for the scheduler.
+            device: Target device string (e.g. ``"cuda"``). Defaults to CUDA if available, else CPU.
+        """
+        if optim_kwargs is None:
+            raise ValueError("optim_kwargs must be provided")
+        if lr_sched is not None and lr_sched_kwargs is None:
+            raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+        self.V, self.D = int(V), int(D)
+        resolved_device = device if device is not None else ("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = torch.device(resolved_device)
+        _logger.info("SGNS_Torch init: V=%d D=%d device=%s seed=%s", self.V, self.D, self.device, seed)
+
+        if seed is not None:
+            torch.manual_seed(int(seed))
+            np.random.seed(int(seed))
+            if self.device.type == "cuda":
+                torch.cuda.manual_seed_all(int(seed))
+
+        # two embeddings as in/out matrices
+        self.in_emb = nn.Embedding(self.V, self.D)
+        self.out_emb = nn.Embedding(self.V, self.D)
+
+        self.to(self.device)
+
+        params = list(self.in_emb.parameters()) + list(self.out_emb.parameters())
+        self.opt = optim(params=params, **optim_kwargs)
+        self.lr_sched = lr_sched(self.opt, **lr_sched_kwargs) if lr_sched is not None else None
+
+    def predict(self,
+                center: torch.Tensor,
+                pos: torch.Tensor,
+                neg: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+
+        center = center.to(self.device, dtype=torch.long)
+        pos = pos.to(self.device, dtype=torch.long)
+        neg = neg.to(self.device, dtype=torch.long)
+
+        c = self.in_emb(center)   # (B, D)
+        pe = self.out_emb(pos)    # (B, D)
+        ne = self.out_emb(neg)    # (B, K, D)
+
+        pos_logits = (c * pe).sum(dim=-1)               # (B,)
+        neg_logits = (c.unsqueeze(1) * ne).sum(dim=-1)  # (B, K)
+
+        return pos_logits, neg_logits
+
+    __call__ = predict
+
+    def fit(self,
+            centers: np.ndarray,
+            contexts: np.ndarray,
+            num_epochs: int,
+            batch_size: int,
+            num_negative_samples: int,
+            noise_dist: np.ndarray,
+            shuffle_data: bool,
+            lr_step_per_batch: bool):
+        """Train SGNS on the provided center/context pairs."""
+        if noise_dist.ndim != 1 or noise_dist.size != self.V:
+            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {noise_dist.shape}")
+        _logger.info(
+            "SGNS_Torch fit: epochs=%d batch=%d negatives=%d shuffle=%s",
+            num_epochs, batch_size, num_negative_samples, shuffle_data
+        )
+        bce = nn.BCEWithLogitsLoss(reduction="mean")
+
+        N = centers.shape[0]
+        idx = np.arange(N)
+
+        noise_probs = torch.as_tensor(noise_dist, dtype=torch.float32, device=self.device)
+
+        for epoch in range(1, int(num_epochs) + 1):
+            epoch_loss = 0.0
+            batches = 0
+            if shuffle_data:
+                np.random.shuffle(idx)
+
+            for s in range(0, N, int(batch_size)):
+                take = idx[s:s+int(batch_size)]
+                if take.size == 0:
+                    continue
+                K = int(num_negative_samples)
+                B = len(take)
+
+                cen = torch.as_tensor(centers[take], dtype=torch.long, device=self.device)   # (B,)
+                pos = torch.as_tensor(contexts[take], dtype=torch.long, device=self.device)  # (B,)
+                neg = torch.multinomial(noise_probs, num_samples=B * K, replacement=True).view(B, K)  # (B,K) on device
+
+                pos_logits, neg_logits = self(cen, pos, neg)
+
+                # BCE(+)
+                y_pos = torch.ones_like(pos_logits)
+                loss_pos = bce(pos_logits, y_pos)
+
+                # BCE(-)
+                y_neg = torch.zeros_like(neg_logits)
+                loss_neg = bce(neg_logits, y_neg)
+
+                loss = loss_pos + loss_neg
+
+                self.opt.zero_grad(set_to_none=True)
+                loss.backward()
+                self.opt.step()
+
+                if lr_step_per_batch and self.lr_sched is not None:
+                    self.lr_sched.step()
+
+                epoch_loss += float(loss.detach().cpu().item())
+                batches += 1
+                _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss.item())
+
+            if not lr_step_per_batch and self.lr_sched is not None:
+                self.lr_sched.step()
+
+            mean_loss = epoch_loss / max(batches, 1)
+            _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def embeddings(self) -> np.ndarray:
+        """Return the input embedding matrix as a NumPy array."""
+        return self.in_emb.weight.detach().cpu().numpy()
+
+    # tiny helper for device move
+    def to(self, device):
+        self.in_emb.to(device)
+        self.out_emb.to(device)
+        return self
+
+
+__all__ = ["SGNS_Torch"]
+
+if __name__ == "__main__":
+    pass
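For orientation, a minimal way to drive this class, assuming `torch` is installed; the toy arrays and hyperparameters below are illustrative, only the constructor and `fit` signatures come from the diff above:

import numpy as np
import torch
from sawnergy.embedding import SGNS_Torch

# toy center/context id pairs over a 10-node vocabulary
centers = np.array([0, 1, 2, 3], dtype=np.int64)
contexts = np.array([1, 0, 3, 2], dtype=np.int64)
noise = np.full(10, 1.0 / 10)  # uniform noise distribution of length V

model = SGNS_Torch(V=10, D=16, seed=42,
                   optim=torch.optim.SGD, optim_kwargs={"lr": 0.025})
model.fit(centers, contexts,
          num_epochs=2, batch_size=2, num_negative_samples=5,
          noise_dist=noise, shuffle_data=True, lr_step_per_batch=False)
vectors = model.embeddings  # (10, 16) NumPy array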
sawnergy/embedding/__init__.py
ADDED
@@ -0,0 +1,34 @@
+from __future__ import annotations
+
+from .embedder import Embedder
+
+def __getattr__(name: str):
+    """Lazily expose optional backends."""
+    if name == "SGNS_Torch":
+        try:
+            from .SGNS_torch import SGNS_Torch
+        except Exception as exc:
+            raise ImportError(
+                "PyTorch backend requested but torch is not installed. "
+                "Install PyTorch via `pip install torch` (see https://pytorch.org/get-started)."
+            ) from exc
+        return SGNS_Torch
+
+    if name == "SGNS_PureML":
+        try:
+            from .SGNS_pml import SGNS_PureML
+            return SGNS_PureML
+        except Exception as exc:
+            raise ImportError(
+                "PureML is not installed. "
+                "Install PureML first via `pip install ym-pure-ml`"
+            ) from exc
+
+    raise AttributeError(name)
+
+
+__all__ = [
+    "Embedder",
+    "SGNS_PureML",
+    "SGNS_Torch",
+]
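This module-level `__getattr__` is the PEP 562 lazy-import pattern: importing `sawnergy.embedding` pulls in neither `torch` nor `pureml`; the first access to a backend class does, and a missing backend surfaces as an `ImportError` carrying install instructions. A small sketch of the resulting behavior:

import sawnergy.embedding as emb

model_cls = emb.SGNS_Torch      # triggers `from .SGNS_torch import SGNS_Torch`
try:
    emb.SGNS_PureML             # raises ImportError if ym-pure-ml is absent
except ImportError as exc:
    print(exc)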