sawnergy 1.0.6__py3-none-any.whl → 1.0.8__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of sawnergy might be problematic.
- sawnergy/embedding/SGNS_pml.py +219 -23
- sawnergy/embedding/SGNS_torch.py +213 -26
- sawnergy/embedding/__init__.py +24 -0
- sawnergy/embedding/embedder.py +341 -211
- sawnergy/embedding/visualizer.py +251 -0
- sawnergy/logging_util.py +1 -1
- sawnergy/rin/rin_builder.py +1 -1
- sawnergy/visual/visualizer.py +6 -6
- sawnergy/visual/visualizer_util.py +3 -0
- {sawnergy-1.0.6.dist-info → sawnergy-1.0.8.dist-info}/METADATA +79 -56
- sawnergy-1.0.8.dist-info/RECORD +23 -0
- sawnergy-1.0.6.dist-info/RECORD +0 -22
- {sawnergy-1.0.6.dist-info → sawnergy-1.0.8.dist-info}/WHEEL +0 -0
- {sawnergy-1.0.6.dist-info → sawnergy-1.0.8.dist-info}/licenses/LICENSE +0 -0
- {sawnergy-1.0.6.dist-info → sawnergy-1.0.8.dist-info}/licenses/NOTICE +0 -0
- {sawnergy-1.0.6.dist-info → sawnergy-1.0.8.dist-info}/top_level.txt +0 -0
sawnergy/embedding/SGNS_pml.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
 # third party
 import numpy as np
 from pureml.machinery import Tensor
-from pureml.layers import Embedding
-from pureml.losses import BCE
+from pureml.layers import Embedding, Affine
+from pureml.losses import BCE, CCE
 from pureml.general_math import sum as t_sum
-from pureml.optimizers import Optim, LRScheduler
-from pureml.training_utils import TensorDataset, DataLoader
+from pureml.optimizers import Optim, LRScheduler, SGD
+from pureml.training_utils import TensorDataset, DataLoader, one_hot
 from pureml.base import NN

 # built-in
@@ -32,8 +32,8 @@ class SGNS_PureML(NN):
                 D: int,
                 *,
                 seed: int | None = None,
-                optim: Type[Optim],
-                optim_kwargs: dict,
+                optim: Type[Optim] = SGD,
+                optim_kwargs: dict | None = None,
                 lr_sched: Type[LRScheduler] | None = None,
                 lr_sched_kwargs: dict | None = None,
                 device: str | None = None):
@@ -42,15 +42,15 @@
             V: Vocabulary size (number of nodes).
             D: Embedding dimensionality.
             seed: Optional RNG seed for negative sampling.
-            optim: Optimizer class to instantiate.
-            optim_kwargs: Keyword arguments for the optimizer
+            optim: Optimizer class to instantiate. Defaults to plain SGD.
+            optim_kwargs: Keyword arguments for the optimizer. Defaults to {"lr": 0.1}.
             lr_sched: Optional learning-rate scheduler class.
             lr_sched_kwargs: Keyword arguments for the scheduler (required if lr_sched is provided).
             device: Target device string (e.g. "cuda"); accepted for API parity, ignored by PureML.
         """

-
-
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
+
         if lr_sched is not None and lr_sched_kwargs is None:
             raise ValueError("lr_sched_kwargs required when lr_sched is provided")
@@ -82,9 +82,6 @@ class SGNS_PureML(NN):
         )

     def _sample_neg(self, B: int, K: int, dist: np.ndarray) -> np.ndarray:
-        """Draw negative samples according to the provided unigram distribution."""
-        if dist.ndim != 1 or dist.size != self.V:
-            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {dist.shape}")
         return self._rng.choice(self.V, size=(B, K), replace=True, p=dist)

     def predict(self, center: Tensor, pos: Tensor, neg: Tensor) -> tuple[Tensor, Tensor]:
@@ -120,8 +117,19 @@
             "SGNS_PureML fit: epochs=%d batch=%d negatives=%d shuffle=%s",
             num_epochs, batch_size, num_negative_samples, shuffle_data
         )
-        data = TensorDataset(centers, contexts)

+        if noise_dist.ndim != 1 or noise_dist.size != self.V:
+            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {noise_dist.shape}")
+        dist = np.asarray(noise_dist, dtype=np.float64)
+        if np.any(dist < 0):
+            raise ValueError("noise_dist has negative entries")
+        s = dist.sum()
+        if not np.isfinite(s) or s <= 0:
+            raise ValueError("noise_dist must have positive finite sum")
+        if abs(s - 1.0) > 1e-6:
+            dist = dist / s
+
+        data = TensorDataset(centers, contexts)
         for epoch in range(1, num_epochs + 1):
             epoch_loss = 0.0
             batches = 0
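For context on what fit() now accepts: a common way to build noise_dist is the word2vec-style smoothed unigram distribution. A minimal sketch, assuming that convention (the 0.75 exponent and the name node_counts are illustrative, not prescribed by this diff):

    import numpy as np

    def unigram_noise_dist(node_counts: np.ndarray, power: float = 0.75) -> np.ndarray:
        # Smoothed unigram distribution; SGNS_PureML.fit() renormalizes if the
        # sum drifts from 1.0, but rejects negative or non-finite entries.
        p = node_counts.astype(np.float64) ** power
        return p / p.sum()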
@@ -129,16 +137,17 @@
         for cen, pos in DataLoader(data, batch_size=batch_size, shuffle=shuffle_data):
             B = cen.data.shape[0] if isinstance(cen, Tensor) else len(cen)

-            neg_idx_np = self._sample_neg(B, num_negative_samples, …
+            neg_idx_np = self._sample_neg(B, num_negative_samples, dist)
             neg = Tensor(neg_idx_np, requires_grad=False)
             x_pos_logits, x_neg_logits = self(cen, pos, neg)

-            y_pos = Tensor(np.ones_like(x_pos_logits.…
-            y_neg = Tensor(np.zeros_like(x_neg_logits.…
+            y_pos = Tensor(np.ones_like(x_pos_logits.numpy(copy=False)), requires_grad=False)
+            y_neg = Tensor(np.zeros_like(x_neg_logits.numpy(copy=False)), requires_grad=False)

+            K = int(neg.data.shape[1])
             loss = (
                 BCE(y_pos, x_pos_logits, from_logits=True)
-                + BCE(y_neg, x_neg_logits, from_logits=True)
+                + Tensor(K)*BCE(y_neg, x_neg_logits, from_logits=True)
             )

             self.optim.zero_grad()
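The new Tensor(K) factor is worth unpacking. Assuming PureML's BCE averages over all elements (the usual mean reduction), the negative term would otherwise shrink by 1/K relative to the positive term; scaling by K restores the per-center sum over K negatives from the SGNS objective. A NumPy sketch of the arithmetic under that mean-reduction assumption:

    import numpy as np

    def sgns_loss(pos_logits, neg_logits):
        # pos_logits: (B,), neg_logits: (B, K); targets are all ones / all zeros
        bce_pos = np.mean(np.log1p(np.exp(-pos_logits)))  # mean over B
        bce_neg = np.mean(np.log1p(np.exp(neg_logits)))   # mean over B*K
        K = neg_logits.shape[1]
        return bce_pos + K * bce_neg  # negative term now sums over K per center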
@@ -160,13 +169,200 @@
         _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)

     @property
-    def …
-    …
-    W…
-    …
+    def in_embeddings(self) -> np.ndarray:
+        W: Tensor = self.in_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.in_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("In emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        W: Tensor = self.out_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.out_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("Out emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def avg_embeddings(self) -> np.ndarray:
+        return 0.5 * (self.in_embeddings + self.out_embeddings)
+
+class SG_PureML(NN):
+    """Plain Skip-Gram (full softmax) in PureML.
+
+    Trains two affine layers to emulate the classic Skip-Gram objective with a
+    **full** softmax over the vocabulary (no negative sampling):
+
+        x = one_hot(center, V)          # (B, V)
+        y = x @ W_in + b_in             # (B, D)
+        logits = y @ W_out + b_out      # (B, V)
+        loss = CCE(one_hot(context, V), logits, from_logits=True)
+
+    The learnable “input” embeddings are the rows of `W_in` (shape `(V, D)`), and
+    the “output” embeddings are the rows of `W_outᵀ` (also `(V, D)`).
+    """
+
+    def __init__(self,
+                 V: int,
+                 D: int,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optim] = SGD,
+                 optim_kwargs: dict | None = None,
+                 lr_sched: Type[LRScheduler] | None = None,
+                 lr_sched_kwargs: dict | None = None,
+                 device: str | None = None):
+        """Initialize the plain Skip-Gram model (full softmax).
+
+        Args:
+            V: Vocabulary size (number of nodes/tokens).
+            D: Embedding dimensionality.
+            seed: Optional RNG seed (kept for API parity; not used in layer init).
+            optim: Optimizer class to instantiate. Defaults to plain SGD.
+            optim_kwargs: Keyword arguments for the optimizer. Defaults to {"lr": 0.1}.
+            lr_sched: Optional learning-rate scheduler class.
+            lr_sched_kwargs: Keyword arguments for the scheduler (required if lr_sched is provided).
+            device: Device string (e.g., "cuda"). Accepted for parity, ignored by PureML (CPU-only).
+        """
+
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
+        if lr_sched is not None and lr_sched_kwargs is None:
+            raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+
+        self.V, self.D = int(V), int(D)
+
+        # input/output “embedding” projections
+        self.in_emb = Affine(self.V, self.D)
+        self.out_emb = Affine(self.D, self.V)
+
+        self.seed = None if seed is None else int(seed)
+        self.device = "cpu"  # API parity
+
+        # optimizer / scheduler
+        self.optim: Optim = optim(self.parameters, **optim_kwargs)
+        self.lr_sched: LRScheduler | None = (
+            lr_sched(optim=self.optim, **lr_sched_kwargs) if lr_sched is not None else None
+        )
+
+        _logger.info(
+            "SG_PureML init: V=%d D=%d device=%s seed=%s",
+            self.V, self.D, self.device, self.seed
+        )
+
+    def predict(self, center: Tensor) -> Tensor:
+        """Return vocabulary logits for each center index.
+
+        Args:
+            center: Tensor of center indices with shape `(B,)` and integer dtype.
+
+        Returns:
+            Tensor: Logits over the vocabulary with shape `(B, V)`.
+        """
+        c = one_hot(dims=self.V, label=center)  # (B, V)
+        y = self.in_emb(c)                      # (B, D)
+        z = self.out_emb(y)                     # (B, V)
+        return z
+
+    def fit(self,
+            centers: np.ndarray,
+            contexts: np.ndarray,
+            num_epochs: int,
+            batch_size: int,
+            shuffle_data: bool,
+            lr_step_per_batch: bool,
+            **_ignore):
+        """Train Skip-Gram with full softmax on center/context pairs.
+
+        Args:
+            centers: Array of center indices, shape `(N,)`, dtype integer in `[0, V)`.
+            contexts: Array of context (target) indices, shape `(N,)`, dtype integer.
+            num_epochs: Number of passes over the dataset.
+            batch_size: Mini-batch size.
+            shuffle_data: Whether to shuffle pairs each epoch.
+            lr_step_per_batch: If True, call `lr_sched.step()` after every batch
+                (when a scheduler is provided). If False, step once per epoch.
+            **_ignore: Ignored kwargs for API compatibility with SGNS.
+
+        Optimization:
+            Uses `CCE(one_hot(context), logits, from_logits=True)` where
+            `logits = predict(center)`. Scheduler stepping obeys `lr_step_per_batch`.
+        """
+        _logger.info(
+            "SG_PureML fit: epochs=%d batch=%d shuffle=%s",
+            num_epochs, batch_size, shuffle_data
+        )
+        data = TensorDataset(centers, contexts)
+
+        for epoch in range(1, num_epochs + 1):
+            epoch_loss = 0.0
+            batches = 0
+
+            for cen, ctx in DataLoader(data, batch_size=batch_size, shuffle=shuffle_data):
+                logits = self(cen)                       # (B, V)
+                y = one_hot(self.V, label=ctx)           # (B, V)
+                loss = CCE(y, logits, from_logits=True)  # scalar
+
+                self.optim.zero_grad()
+                loss.backward()
+                self.optim.step()
+
+                if lr_step_per_batch and self.lr_sched is not None:
+                    self.lr_sched.step()
+
+                loss_value = float(np.asarray(loss.data))
+                epoch_loss += loss_value
+                batches += 1
+                _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss_value)
+
+            if (not lr_step_per_batch) and (self.lr_sched is not None):
+                self.lr_sched.step()
+
+            mean_loss = epoch_loss / max(batches, 1)
+            _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def in_embeddings(self) -> np.ndarray:
+        """Input embeddings matrix `W_in` as `(V, D)` (copy, read-only)."""
+        W = self.in_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.in_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("In emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        """Output embeddings matrix `W_outᵀ` as `(V, D)` (copy, read-only).
+        (`out_emb.parameters[0]` is `(D, V)`, so we transpose.)"""
+        W = self.out_emb.parameters[0]  # (D, V)
+        if W.shape != (self.D, self.V):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.out_emb.parameters[0].shape != (D, V)"
+            )
+        arr = W.numpy(copy=True, readonly=True).T  # (V, D)
+        _logger.debug("Out emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def avg_embeddings(self) -> np.ndarray:
+        """Elementwise average of input/output embeddings, shape `(V, D)`."""
+        return 0.5 * (self.in_embeddings + self.out_embeddings)  # (V, D)


-__all__ = ["SGNS_PureML"]
+__all__ = ["SGNS_PureML", "SG_PureML"]

 if __name__ == "__main__":
     pass
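A minimal end-to-end sketch of the new SG_PureML class, assuming ym-pure-ml is installed and using synthetic index pairs (shapes per the docstrings above):

    import numpy as np
    from sawnergy.embedding import SG_PureML

    V, D = 100, 16
    rng = np.random.default_rng(0)
    centers = rng.integers(0, V, size=10_000)   # (N,) ints in [0, V)
    contexts = rng.integers(0, V, size=10_000)  # (N,)

    model = SG_PureML(V, D, seed=0)             # defaults: SGD with lr=0.1
    model.fit(centers, contexts, num_epochs=2, batch_size=256,
              shuffle_data=True, lr_step_per_batch=False)
    emb = model.avg_embeddings                  # (V, D) node embeddings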
sawnergy/embedding/SGNS_torch.py
CHANGED
@@ -10,6 +10,7 @@ from torch.optim.lr_scheduler import LRScheduler
 # built-in
 import logging
 from typing import Type
+import warnings

 # *----------------------------------------------------*
 # GLOBALS
@@ -22,51 +23,84 @@ _logger = logging.getLogger(__name__)
 # *----------------------------------------------------*

 class SGNS_Torch:
-    """PyTorch implementation of Skip-Gram with Negative Sampling.…
+    """PyTorch implementation of Skip-Gram with Negative Sampling.
+
+    DEPRECATED (temporary): This class currently produces noisy embeddings in
+    practice and is deprecated until further notice. The issue likely stems from
+    weight initialization, although the root cause has not yet been determined.
+
+    Prefer one of the following alternatives:
+      • Plain PyTorch Skip-Gram (full softmax): `SG_Torch`
+      • PureML-based implementations: `SGNS_PureML` or `SG_PureML` (if available)
+
+    This API may change or be removed once the root cause is resolved.
+    """

     def __init__(self,
-                 …
-                 …
-                 …
+                 V: int,
+                 D: int,
+                 *,
                  seed: int | None = None,
-                 optim: Type[Optimizer],
-                 optim_kwargs: dict,
+                 optim: Type[Optimizer] = torch.optim.SGD,
+                 optim_kwargs: dict | None = None,
                  lr_sched: Type[LRScheduler] | None = None,
                  lr_sched_kwargs: dict | None = None,
                  device: str | None = None):
-        """
+        """Initialize SGNS (negative sampling) in PyTorch.
+
+        DEPRECATION WARNING:
+            This implementation is temporarily deprecated for producing noisy
+            embeddings. The issue likely stems from weight initialization, though
+            the exact root cause has not been conclusively determined. Please use
+            `SG_Torch` (plain Skip-Gram with full softmax) or the PureML-based
+            `SGNS_PureML` / `SG_PureML` models instead.
+
         Args:
             V: Vocabulary size (number of nodes).
             D: Embedding dimensionality.
             seed: Optional RNG seed for PyTorch.
-            optim: Optimizer class to instantiate.
-            optim_kwargs: Keyword arguments for the optimizer.
+            optim: Optimizer class to instantiate. Defaults to plain SGD.
+            optim_kwargs: Keyword arguments for the optimizer. Defaults to {"lr": 0.1}.
             lr_sched: Optional learning-rate scheduler class.
-            lr_sched_kwargs: Keyword arguments for the scheduler.
-            device: Target device string (e.g.…
+            lr_sched_kwargs: Keyword arguments for the scheduler (required if lr_sched is provided).
+            device: Target device string (e.g. "cuda"). Defaults to CUDA if available, else CPU.
         """
-
-
+
+        # --- runtime deprecation notice ---
+        warnings.warn(
+            "SGNS_Torch is temporarily deprecated: it currently produces noisy "
+            "embeddings (likely due to weight initialization). Use SG_Torch "
+            "(plain Skip-Gram, full softmax) or the PureML-based SG/SGNS classes.",
+            DeprecationWarning,
+            stacklevel=2,
+        )
+        _logger.warning(
+            "DEPRECATED: SGNS_Torch currently produces noisy embeddings "
+            "(likely weight initialization). Prefer SG_Torch or PureML SG/SGNS."
+        )
+        # ----------------------------------
+
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
         if lr_sched is not None and lr_sched_kwargs is None:
             raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+
         self.V, self.D = int(V), int(D)
-
-        self.…
-
+        # two embeddings as in/out matrices
+        self.in_emb = nn.Embedding(self.V, self.D)
+        self.out_emb = nn.Embedding(self.V, self.D)

+        resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = torch.device(resolved_device)
         if seed is not None:
             torch.manual_seed(int(seed))
             np.random.seed(int(seed))
             if self.device.type == "cuda":
                 torch.cuda.manual_seed_all(int(seed))

-        # two embeddings as in/out matrices
-        self.in_emb = nn.Embedding(self.V, self.D)
-        self.out_emb = nn.Embedding(self.V, self.D)
-
         self.to(self.device)
-
+        _logger.info("SGNS_Torch init: V=%d D=%d device=%s seed=%s", self.V, self.D, self.device, seed)
         params = list(self.in_emb.parameters()) + list(self.out_emb.parameters())
+        # optimizer / scheduler
         self.opt = optim(params=params, **optim_kwargs)
         self.lr_sched = lr_sched(self.opt, **lr_sched_kwargs) if lr_sched is not None else None
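Since Python hides DeprecationWarning by default outside of __main__ code, callers who want to see the new notice can opt in. A small sketch (the V/D values are hypothetical):

    import warnings
    from sawnergy.embedding import SGNS_Torch

    warnings.simplefilter("always", DeprecationWarning)  # surface the notice
    model = SGNS_Torch(V=100, D=16)  # __init__ emits warnings.warn(...) plus a log line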
@@ -112,6 +146,15 @@ class SGNS_Torch:
         idx = np.arange(N)

         noise_probs = torch.as_tensor(noise_dist, dtype=torch.float32, device=self.device)
+        # require normalized, non-negative distribution
+        if (not torch.isfinite(noise_probs).all()
+                or (noise_probs < 0).any()
+                or abs(float(noise_probs.sum().item()) - 1.0) > 1e-6):
+            raise ValueError(
+                "noise_dist must be non-negative, finite, and sum to 1.0 "
+                f"(got sum={float(noise_probs.sum().item()):.6f}, "
+                f"min={float(noise_probs.min().item()):.6f})"
+            )

         for epoch in range(1, int(num_epochs) + 1):
             epoch_loss = 0.0
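Note the asymmetry with the PureML path: SGNS_PureML.fit() renormalizes a drifting noise_dist, while SGNS_Torch.fit() now rejects anything that does not already sum to 1.0 within 1e-6. Callers should normalize up front; a sketch (normalize_noise_dist and raw_weights are hypothetical names):

    import numpy as np

    def normalize_noise_dist(raw_weights: np.ndarray) -> np.ndarray:
        # raw_weights: hypothetical non-negative per-node scores
        p = np.asarray(raw_weights, dtype=np.float64)
        return p / p.sum()  # sums to 1.0 exactly, as SGNS_Torch.fit() requires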
@@ -140,7 +183,140 @@ class SGNS_Torch:
                 y_neg = torch.zeros_like(neg_logits)
                 loss_neg = bce(neg_logits, y_neg)

-                loss = loss_pos + loss_neg
+                loss = loss_pos + K*loss_neg
+
+                self.opt.zero_grad(set_to_none=True)
+                loss.backward()
+                self.opt.step()
+
+                if lr_step_per_batch and self.lr_sched is not None:
+                    self.lr_sched.step()
+
+                epoch_loss += float(loss.detach().cpu().item())
+                batches += 1
+                _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss.item())
+
+            if not lr_step_per_batch and self.lr_sched is not None:
+                self.lr_sched.step()
+
+            mean_loss = epoch_loss / max(batches, 1)
+            _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def in_embeddings(self) -> np.ndarray:
+        W = self.in_emb.weight.detach().cpu().numpy()
+        _logger.debug("In emb shape: %s", W.shape)
+        return W
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        W = self.out_emb.weight.detach().cpu().numpy()
+        _logger.debug("Out emb shape: %s", W.shape)
+        return W
+
+    @property
+    def avg_embeddings(self) -> np.ndarray:
+        return 0.5 * (self.in_embeddings + self.out_embeddings)
+
+    # tiny helper for device move
+    def to(self, device):
+        self.in_emb.to(device)
+        self.out_emb.to(device)
+        return self
+
+class SG_Torch:
+    """PyTorch implementation of Skip-Gram."""
+
+    def __init__(self,
+                 V: int,
+                 D: int,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optimizer] = torch.optim.SGD,
+                 optim_kwargs: dict | None = None,
+                 lr_sched: Type[LRScheduler] | None = None,
+                 lr_sched_kwargs: dict | None = None,
+                 device: str | None = None):
+        """Initialize the plain Skip-Gram (full softmax) model in PyTorch.
+
+        Args:
+            V: Vocabulary size (number of nodes/tokens).
+            D: Embedding dimensionality.
+            seed: Optional RNG seed for reproducibility.
+            optim: Optimizer class to instantiate. Defaults to :class:`torch.optim.SGD`.
+            optim_kwargs: Keyword args for the optimizer. Defaults to ``{"lr": 0.1}``.
+            lr_sched: Optional learning-rate scheduler class.
+            lr_sched_kwargs: Keyword args for the scheduler (required if ``lr_sched`` is provided).
+            device: Target device string (e.g., ``"cuda"``). Defaults to CUDA if available, else CPU.
+
+        Notes:
+            The encoder/decoder are linear layers acting on one-hot centers:
+              • ``in_emb = nn.Linear(V, D)``
+              • ``out_emb = nn.Linear(D, V)``
+            Forward pass produces vocabulary-sized logits and is trained with CrossEntropyLoss.
+        """
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
+        if lr_sched is not None and lr_sched_kwargs is None:
+            raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+
+        self.V, self.D = int(V), int(D)
+
+        self.in_emb = nn.Linear(self.V, self.D)
+        self.out_emb = nn.Linear(self.D, self.V)
+
+        resolved_device = device or ("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = torch.device(resolved_device)
+        if seed is not None:
+            torch.manual_seed(int(seed))
+            np.random.seed(int(seed))
+            if self.device.type == "cuda":
+                torch.cuda.manual_seed_all(int(seed))
+        self.to(self.device)
+        _logger.info("SG_Torch init: V=%d D=%d device=%s seed=%s", self.V, self.D, self.device, seed)
+
+        params = list(self.in_emb.parameters()) + list(self.out_emb.parameters())
+        # optimizer / scheduler
+        self.opt = optim(params=params, **optim_kwargs)
+        self.lr_sched = lr_sched(self.opt, **lr_sched_kwargs) if lr_sched is not None else None
+
+    def predict(self, center: torch.Tensor) -> torch.Tensor:
+        center = center.to(self.device, dtype=torch.long)
+        c = nn.functional.one_hot(center, num_classes=self.V).to(dtype=torch.float32, device=self.device)
+        y = self.in_emb(c)
+        z = self.out_emb(y)
+        return z
+
+    __call__ = predict
+
+    def fit(self,
+            centers: np.ndarray,
+            contexts: np.ndarray,
+            num_epochs: int,
+            batch_size: int,
+            shuffle_data: bool,
+            lr_step_per_batch: bool,
+            **_ignore):
+        cce = nn.CrossEntropyLoss(reduction="mean")
+
+        N = centers.shape[0]
+        idx = np.arange(N)
+
+        for epoch in range(1, int(num_epochs) + 1):
+            epoch_loss = 0.0
+            batches = 0
+            if shuffle_data:
+                np.random.shuffle(idx)
+
+            for s in range(0, N, int(batch_size)):
+                take = idx[s:s+int(batch_size)]
+                if take.size == 0:
+                    continue
+
+                cen = torch.as_tensor(centers[take], dtype=torch.long, device=self.device)
+                ctx = torch.as_tensor(contexts[take], dtype=torch.long, device=self.device)
+
+                logits = self(cen)
+                loss = cce(logits, ctx)

                 self.opt.zero_grad(set_to_none=True)
                 loss.backward()
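SG_Torch feeds one-hot vectors through nn.Linear rather than using nn.Embedding; the two are equivalent up to the bias term, which is why in_embeddings (in the next hunk) transposes the Linear weight. A quick self-contained check of that equivalence:

    import torch
    import torch.nn as nn

    V, D, B = 10, 4, 3
    lin = nn.Linear(V, D)                       # weight: (D, V), bias: (D,)
    idx = torch.randint(0, V, (B,))
    one_hot = nn.functional.one_hot(idx, num_classes=V).float()
    # A one-hot row through Linear selects a column of the weight, plus bias:
    assert torch.allclose(lin(one_hot), lin.weight.T[idx] + lin.bias, atol=1e-6)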
@@ -158,11 +334,22 @@ class SGNS_Torch:

             mean_loss = epoch_loss / max(batches, 1)
             _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def in_embeddings(self) -> np.ndarray:
+        W = self.in_emb.weight.detach().T.cpu().numpy()
+        _logger.debug("In emb shape: %s", W.shape)
+        return W
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        W = self.out_emb.weight.detach().cpu().numpy()
+        _logger.debug("Out emb shape: %s", W.shape)
+        return W

     @property
-    def …
-    …
-        return self.in_emb.weight.detach().cpu().numpy()
+    def avg_embeddings(self) -> np.ndarray:
+        return 0.5 * (self.in_embeddings + self.out_embeddings)

     # tiny helper for device move
     def to(self, device):
@@ -171,7 +358,7 @@ class SGNS_Torch:
         return self


-__all__ = ["SGNS_Torch"]
+__all__ = ["SGNS_Torch", "SG_Torch"]

 if __name__ == "__main__":
     pass
sawnergy/embedding/__init__.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations

 from .embedder import Embedder
+from .visualizer import Visualizer

 def __getattr__(name: str):
     """Lazily expose optional backends."""
@@ -14,6 +15,16 @@
             ) from exc
         return SGNS_Torch

+    if name == "SG_Torch":
+        try:
+            from .SGNS_torch import SG_Torch
+        except Exception as exc:
+            raise ImportError(
+                "PyTorch backend requested but torch is not installed. "
+                "Install PyTorch via `pip install torch` (see https://pytorch.org/get-started)."
+            ) from exc
+        return SG_Torch
+
     if name == "SGNS_PureML":
         try:
             from .SGNS_pml import SGNS_PureML
@@ -24,11 +35,24 @@
                 "Install PureML first via `pip install ym-pure-ml` "
             ) from exc

+    if name == "SG_PureML":
+        try:
+            from .SGNS_pml import SG_PureML
+            return SG_PureML
+        except Exception as exc:
+            raise ImportError(
+                "PureML is not installed. "
+                "Install PureML first via `pip install ym-pure-ml` "
+            ) from exc
+
     raise AttributeError(name)


 __all__ = [
     "Embedder",
+    "Visualizer",
     "SGNS_PureML",
     "SGNS_Torch",
+    "SG_PureML",
+    "SG_Torch"
 ]
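The lazy __getattr__ hook means the torch- and pureml-backed classes only import their heavy dependency on first attribute access. A usage sketch (whether each import succeeds depends on the environment):

    # Succeeds only if the matching backend package is installed:
    from sawnergy.embedding import SG_PureML   # needs ym-pure-ml
    from sawnergy.embedding import SG_Torch    # needs torch

    # Otherwise the hook raises ImportError with install instructions, e.g.:
    #   ImportError: PureML is not installed. Install PureML first via `pip install ym-pure-ml`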