sawnergy 1.0.3__py3-none-any.whl → 1.0.9__py3-none-any.whl
This diff compares publicly available package versions as released to a supported registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the public registry.
- sawnergy/__init__.py +3 -1
- sawnergy/embedding/SGNS_pml.py +324 -51
- sawnergy/embedding/SGNS_torch.py +282 -39
- sawnergy/embedding/__init__.py +26 -1
- sawnergy/embedding/embedder.py +426 -203
- sawnergy/embedding/visualizer.py +251 -0
- sawnergy/logging_util.py +1 -1
- sawnergy/rin/rin_builder.py +4 -4
- sawnergy/visual/visualizer.py +6 -6
- sawnergy/visual/visualizer_util.py +3 -0
- sawnergy/walks/walker.py +43 -22
- {sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/METADATA +91 -57
- sawnergy-1.0.9.dist-info/RECORD +23 -0
- sawnergy-1.0.3.dist-info/RECORD +0 -22
- {sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/WHEEL +0 -0
- {sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/licenses/LICENSE +0 -0
- {sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/licenses/NOTICE +0 -0
- {sawnergy-1.0.3.dist-info → sawnergy-1.0.9.dist-info}/top_level.txt +0 -0
sawnergy/__init__.py
CHANGED
sawnergy/embedding/SGNS_pml.py
CHANGED
@@ -3,11 +3,11 @@ from __future__ import annotations
 # third party
 import numpy as np
 from pureml.machinery import Tensor
-from pureml.layers import Embedding
-from pureml.losses import BCE
+from pureml.layers import Embedding, Affine
+from pureml.losses import BCE, CCE
 from pureml.general_math import sum as t_sum
-from pureml.optimizers import Optim, LRScheduler
-from pureml.training_utils import TensorDataset, DataLoader
+from pureml.optimizers import Optim, LRScheduler, SGD
+from pureml.training_utils import TensorDataset, DataLoader, one_hot
 from pureml.base import NN

 # built-in
@@ -28,51 +28,98 @@ class SGNS_PureML(NN):
     """PureML implementation of Skip-Gram with Negative Sampling."""

     def __init__(self,
-
-
-
-
-
-
-
-
+                 V: int,
+                 D: int,
+                 in_weights: Tensor | np.ndarray | None = None,
+                 out_weights: Tensor | np.ndarray | None = None,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optim] = SGD,
+                 optim_kwargs: dict | None = None,
+                 lr_sched: Type[LRScheduler] | None = None,
+                 lr_sched_kwargs: dict | None = None,
+                 device: str | None = None):
         """
+        Initialize SGNS.
+
+        Shapes:
+        - Embedding tables:
+            in_weights: (V, D) or None — row i is the “input” vector for token i.
+            out_weights: (V, D) or None — row i is the “output” vector for token i.
+
         Args:
-            V: Vocabulary size (number of nodes).
+            V: Vocabulary size (number of nodes/tokens).
             D: Embedding dimensionality.
-
-
-
-
-
+            in_weights: Optional starting input-embedding matrix of shape (V, D) as
+                :class:`Tensor` or :class:`numpy.ndarray`. If None, the Embedding
+                layer initializes it (seeded if `seed` is set).
+            out_weights: Optional starting output-embedding matrix of shape (V, D) as
+                :class:`Tensor` or :class:`numpy.ndarray`. If None, the Embedding
+                layer initializes it (seeded if `seed` is set).
+            seed: Optional RNG seed used for **embedding initialization** and for
+                **negative sampling** during training.
+            optim: Optimizer class to instantiate. Defaults to plain SGD.
+            optim_kwargs: Keyword arguments for the optimizer. Defaults to {"lr": 0.1}.
+            lr_sched: Optional learning-rate scheduler class.
+            lr_sched_kwargs: Keyword arguments for the scheduler (required if lr_sched is provided).
+            device: Target device string (e.g., "cuda"); accepted for API parity, ignored by PureML.
         """
+
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
+
+        if lr_sched is not None and lr_sched_kwargs is None:
+            raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+
         self.V, self.D = int(V), int(D)
-        self.in_emb = Embedding(V, D)
-        self.out_emb = Embedding(V, D)

+        # Convert warm-starts from np.ndarray → Tensor if needed
+        if isinstance(in_weights, np.ndarray):
+            in_weights = Tensor(in_weights, requires_grad=True)
+        if isinstance(out_weights, np.ndarray):
+            out_weights = Tensor(out_weights, requires_grad=True)
+
+        # embeddings
+        self.in_emb = Embedding(self.V, self.D, W=in_weights, seed=seed)
+        self.out_emb = Embedding(self.V, self.D, W=out_weights, seed=seed)
+
+        # seed + RNG for negative sampling
         self.seed = None if seed is None else int(seed)
         self._rng = np.random.default_rng(self.seed)

-
-        self.
-
+        # API compatibility: PureML is CPU-only
+        self.device = "cpu"
+
+        # optimizer / scheduler
+        self.optim: Optim = optim(self.parameters, **optim_kwargs)
+        self.lr_sched: LRScheduler | None = (
+            lr_sched(optim=self.optim, **lr_sched_kwargs) if lr_sched is not None else None
+        )
+
+        _logger.info(
+            "SGNS_PureML init: V=%d D=%d device=%s seed=%s",
+            self.V, self.D, self.device, self.seed
+        )

-    def _sample_neg(self, B: int, K: int, dist: np.ndarray):
-        """Draw negative samples according to the provided unigram distribution."""
-        if dist.ndim != 1 or dist.size != self.V:
-            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {dist.shape}")
+    def _sample_neg(self, B: int, K: int, dist: np.ndarray) -> np.ndarray:
         return self._rng.choice(self.V, size=(B, K), replace=True, p=dist)

-    def predict(self, center: Tensor, pos: Tensor, neg: Tensor) -> Tensor:
-        """Compute positive/negative logits for SGNS.
-        c = self.in_emb(center)
-        pos_e = self.out_emb(pos)
-        neg_e = self.out_emb(neg)
-        pos_logits = t_sum(c * pos_e, axis=-1)
-        neg_logits = t_sum(c[:, None, :] * neg_e, axis=-1)
-        # ^^^
-        # (B,1,D) * (B,K,D) → (B,K,D) → sum D → (B,K)
+    def predict(self, center: Tensor, pos: Tensor, neg: Tensor) -> tuple[Tensor, Tensor]:
+        """Compute positive/negative logits for SGNS.

+        Shapes:
+            center: (B,)
+            pos: (B,)
+            neg: (B, K)
+        Returns:
+            pos_logits: (B,)
+            neg_logits: (B, K)
+        """
+        c = self.in_emb(center)      # (B, D)
+        pos_e = self.out_emb(pos)    # (B, D)
+        neg_e = self.out_emb(neg)    # (B, K, D)
+
+        pos_logits = t_sum(c * pos_e, axis=-1)               # (B,)
+        neg_logits = t_sum(c[:, None, :] * neg_e, axis=-1)   # (B, K)
         return pos_logits, neg_logits

     def fit(self,
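As a sanity check on the shapes documented in the rewritten `predict` docstring above, here is a minimal NumPy sketch (illustration only, not part of the package) reproducing the same broadcast-and-sum logit computation:

import numpy as np

B, K, D = 4, 5, 8                        # batch size, negatives per pair, embedding dim
rng = np.random.default_rng(0)
c = rng.standard_normal((B, D))          # center embeddings, (B, D)
pos_e = rng.standard_normal((B, D))      # positive context embeddings, (B, D)
neg_e = rng.standard_normal((B, K, D))   # negative sample embeddings, (B, K, D)

pos_logits = np.sum(c * pos_e, axis=-1)              # (B,)
neg_logits = np.sum(c[:, None, :] * neg_e, axis=-1)  # (B, 1, D) * (B, K, D), summed over D -> (B, K)

assert pos_logits.shape == (B,)
assert neg_logits.shape == (B, K)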
@@ -89,47 +136,273 @@ class SGNS_PureML(NN):
             "SGNS_PureML fit: epochs=%d batch=%d negatives=%d shuffle=%s",
             num_epochs, batch_size, num_negative_samples, shuffle_data
         )
-        data = TensorDataset(centers, contexts)

+        if noise_dist.ndim != 1 or noise_dist.size != self.V:
+            raise ValueError(f"noise_dist must be 1-D with length {self.V}; got {noise_dist.shape}")
+        dist = np.asarray(noise_dist, dtype=np.float64)
+        if np.any(dist < 0):
+            raise ValueError("noise_dist has negative entries")
+        s = dist.sum()
+        if not np.isfinite(s) or s <= 0:
+            raise ValueError("noise_dist must have positive finite sum")
+        if abs(s - 1.0) > 1e-6:
+            dist = dist / s
+
+        data = TensorDataset(centers, contexts)
         for epoch in range(1, num_epochs + 1):
             epoch_loss = 0.0
             batches = 0
-
-
+            dl_seed = None if self.seed is None else (self.seed + epoch)
+            for cen, pos in DataLoader(data, batch_size=batch_size, shuffle=shuffle_data, seed=dl_seed):
+                B = cen.data.shape[0] if isinstance(cen, Tensor) else len(cen)

+                neg_idx_np = self._sample_neg(B, num_negative_samples, dist)
+                neg = Tensor(neg_idx_np, requires_grad=False)
                 x_pos_logits, x_neg_logits = self(cen, pos, neg)

-                y_pos = Tensor(np.ones_like(x_pos_logits.
-                y_neg = Tensor(np.zeros_like(x_neg_logits.
+                y_pos = Tensor(np.ones_like(x_pos_logits.numpy(copy=False)), requires_grad=False)
+                y_neg = Tensor(np.zeros_like(x_neg_logits.numpy(copy=False)), requires_grad=False)

-
+                K = int(neg.data.shape[1])
+                loss = (
+                    BCE(y_pos, x_pos_logits, from_logits=True)
+                    + Tensor(K)*BCE(y_neg, x_neg_logits, from_logits=True)
+                )

                 self.optim.zero_grad()
                 loss.backward()
                 self.optim.step()
-
-                if lr_step_per_batch:
+
+                if lr_step_per_batch and self.lr_sched is not None:
                     self.lr_sched.step()

-                loss_value = float(np.asarray(loss.data)
+                loss_value = float(np.asarray(loss.data))
                 epoch_loss += loss_value
                 batches += 1
                 _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss_value)

-            if not lr_step_per_batch:
+            if (not lr_step_per_batch) and (self.lr_sched is not None):
                 self.lr_sched.step()

             mean_loss = epoch_loss / max(batches, 1)
             _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)

     @property
-    def
-
-        W
-
+    def in_embeddings(self) -> np.ndarray:
+        W: Tensor = self.in_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.in_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("In emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        W: Tensor = self.out_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.out_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("Out emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def avg_embeddings(self) -> np.ndarray:
+        return 0.5 * (self.in_embeddings + self.out_embeddings)
+
+class SG_PureML(NN):
+    """Plain Skip-Gram (full softmax) in PureML.
+
+    This variant uses **no bias terms**: both projections are pure linear maps.
+
+    Computation:
+        x = one_hot(center, V)    # (B, V)
+        y = x @ W_in              # (B, D), with W_in ∈ R^{VxD}
+        logits = y @ W_out        # (B, V), with W_out ∈ R^{DxV}
+        loss = CCE(one_hot(context, V), logits, from_logits=True)
+
+    Embeddings:
+        - Input embeddings = rows of W_in → shape (V, D)
+        - Output embeddings = rows of W_outᵀ → shape (V, D)
+    """
+
+    def __init__(self,
+                 V: int,
+                 D: int,
+                 in_weights: Tensor | np.ndarray | None = None,
+                 out_weights: Tensor | np.ndarray | None = None,
+                 *,
+                 seed: int | None = None,
+                 optim: Type[Optim] = SGD,
+                 optim_kwargs: dict | None = None,
+                 lr_sched: Type[LRScheduler] | None = None,
+                 lr_sched_kwargs: dict | None = None,
+                 device: str | None = None):
+        """Initialize the plain Skip-Gram model (full softmax, **no biases**).
+
+        Shapes:
+        - Linear maps (no bias):
+            W_in: (V, D) — rows are input embeddings for tokens.
+            W_out: (D, V) — maps D→V; rows of W_outᵀ are output embeddings.
+
+        - Warm-starts:
+            in_weights: (V, D) or None — copied into W_in if provided (Tensor or np.ndarray).
+            out_weights: (D, V) or None — copied into W_out if provided (Tensor or np.ndarray).
+
+        Args:
+            V: Vocabulary size (number of nodes/tokens).
+            D: Embedding dimensionality.
+            in_weights: Optional starting matrix for W_in with shape (V, D) as Tensor or np.ndarray.
+            out_weights: Optional starting matrix for W_out with shape (D, V) as Tensor or np.ndarray.
+                (Note the asymmetry with SGNS; use `.T` if converting from (V, D).)
+            seed: Optional RNG seed (used for layer initialization).
+            optim: Optimizer class to instantiate. Defaults to plain SGD.
+            optim_kwargs: Keyword arguments for the optimizer. Defaults to {"lr": 0.1}.
+            lr_sched: Optional learning-rate scheduler class.
+            lr_sched_kwargs: Keyword arguments for the scheduler (required if lr_sched is provided).
+            device: Device string (e.g., "cuda"). Accepted for parity, ignored by PureML (CPU-only).
+        """
+
+        optim_kwargs = optim_kwargs or {"lr": 0.1}
+        if lr_sched is not None and lr_sched_kwargs is None:
+            raise ValueError("lr_sched_kwargs required when lr_sched is provided")
+
+        self.V, self.D = int(V), int(D)
+
+        # Convert warm-starts from np.ndarray → Tensor if needed
+        if isinstance(in_weights, np.ndarray):
+            in_weights = Tensor(in_weights, requires_grad=True)
+        if isinstance(out_weights, np.ndarray):
+            out_weights = Tensor(out_weights, requires_grad=True)
+
+        # input/output “embedding” projections
+        self.in_emb = Affine(self.V, self.D, W=in_weights, bias=False, seed=seed)
+        self.out_emb = Affine(self.D, self.V, W=out_weights, bias=False, seed=seed)
+
+        self.seed = None if seed is None else int(seed)
+        self.device = "cpu"  # API parity
+
+        # optimizer / scheduler
+        self.optim: Optim = optim(self.parameters, **optim_kwargs)
+        self.lr_sched: LRScheduler | None = (
+            lr_sched(optim=self.optim, **lr_sched_kwargs) if lr_sched is not None else None
+        )
+
+        _logger.info(
+            "SG_PureML init: V=%d D=%d device=%s seed=%s",
+            self.V, self.D, self.device, self.seed
+        )
+
+    def predict(self, center: Tensor) -> Tensor:
+        """Return vocabulary logits for each center index.
+
+        Args:
+            center: Tensor of center indices with shape `(B,)` and integer dtype.
+
+        Returns:
+            Tensor: Logits over the vocabulary with shape `(B, V)`.
+        """
+        c = one_hot(dims=self.V, label=center)   # (B, V)
+        y = self.in_emb(c)                       # (B, D)
+        z = self.out_emb(y)                      # (B, V)
+        return z
+
+    def fit(self,
+            centers: np.ndarray,
+            contexts: np.ndarray,
+            num_epochs: int,
+            batch_size: int,
+            shuffle_data: bool,
+            lr_step_per_batch: bool,
+            **_ignore):
+        """Train Skip-Gram with full softmax on center/context pairs.
+
+        Args:
+            centers: Array of center indices, shape `(N,)`, dtype integer in `[0, V)`.
+            contexts: Array of context (target) indices, shape `(N,)`, dtype integer.
+            num_epochs: Number of passes over the dataset.
+            batch_size: Mini-batch size.
+            shuffle_data: Whether to shuffle pairs each epoch.
+            lr_step_per_batch: If True, call `lr_sched.step()` after every batch
+                (when a scheduler is provided). If False, step once per epoch.
+            **_ignore: Ignored kwargs for API compatibility with SGNS.
+
+        Optimization:
+            Uses `CCE(one_hot(context), logits, from_logits=True)` where
+            `logits = predict(center)`. Scheduler stepping obeys `lr_step_per_batch`.
+        """
+        _logger.info(
+            "SG_PureML fit: epochs=%d batch=%d shuffle=%s",
+            num_epochs, batch_size, shuffle_data
+        )
+        data = TensorDataset(centers, contexts)
+
+        for epoch in range(1, num_epochs + 1):
+            epoch_loss = 0.0
+            batches = 0
+            dl_seed = None if self.seed is None else (self.seed + epoch)
+            for cen, ctx in DataLoader(data, batch_size=batch_size, shuffle=shuffle_data, seed=dl_seed):
+                logits = self(cen)                        # (B, V)
+                y = one_hot(dims=self.V, label=ctx)       # (B, V)
+                loss = CCE(y, logits, from_logits=True)   # scalar
+
+                self.optim.zero_grad()
+                loss.backward()
+                self.optim.step()
+
+                if lr_step_per_batch and self.lr_sched is not None:
+                    self.lr_sched.step()
+
+                loss_value = float(np.asarray(loss.data))
+                epoch_loss += loss_value
+                batches += 1
+                _logger.debug("Epoch %d batch %d loss=%.6f", epoch, batches, loss_value)
+
+            if (not lr_step_per_batch) and (self.lr_sched is not None):
+                self.lr_sched.step()
+
+            mean_loss = epoch_loss / max(batches, 1)
+            _logger.info("Epoch %d/%d mean_loss=%.6f", epoch, num_epochs, mean_loss)
+
+    @property
+    def in_embeddings(self) -> np.ndarray:
+        """Input embeddings matrix `W_in` as `(V, D)` (copy, read-only)."""
+        W = self.in_emb.parameters[0]  # (V, D)
+        if W.shape != (self.V, self.D):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.in_emb.parameters[0].shape != (V, D)"
+            )
+        arr = W.numpy(copy=True, readonly=True)  # (V, D)
+        _logger.debug("In emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def out_embeddings(self) -> np.ndarray:
+        """Output embeddings matrix `W_outᵀ` as `(V, D)` (copy, read-only).
+        (`out_emb.parameters[0]` is `(D, V)`, so we transpose.)"""
+        W = self.out_emb.parameters[0]  # (D, V)
+        if W.shape != (self.D, self.V):
+            raise RuntimeError(
+                "Wrong embedding matrix shape: "
+                "self.out_emb.parameters[0].shape != (D, V)"
+            )
+        arr = W.numpy(copy=True, readonly=True).T  # (V, D)
+        _logger.debug("Out emb shape: %s", arr.shape)
+        return arr
+
+    @property
+    def avg_embeddings(self) -> np.ndarray:
+        """Elementwise average of input/output embeddings, shape `(V, D)`."""
+        return 0.5 * (self.in_embeddings + self.out_embeddings)  # (V, D)


-__all__ = ["SGNS_PureML"]
+__all__ = ["SGNS_PureML", "SG_PureML"]

 if __name__ == "__main__":
     pass