lucid-dl 2.12.0__py3-none-any.whl → 2.12.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- lucid/__init__.py +2 -2
- lucid/_tensor/__init__.py +11 -1
- lucid/_tensor/base.py +2 -0
- lucid/_tensor/tensor.py +192 -3
- lucid/_util/__init__.py +14 -5
- lucid/_util/func.py +73 -0
- lucid/models/__init__.py +1 -0
- lucid/models/seqclf/__init__.py +1 -0
- lucid/models/seqclf/bert.py +31 -0
- lucid/nn/_kernel/embedding.py +19 -16
- lucid/nn/functional/_util.py +40 -8
- lucid/nn/modules/attention.py +58 -6
- lucid/nn/modules/rnn.py +133 -21
- lucid/nn/modules/sparse.py +16 -1
- {lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/METADATA +5 -1
- {lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/RECORD +19 -17
- {lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/WHEEL +0 -0
- {lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/licenses/LICENSE +0 -0
- {lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/top_level.txt +0 -0
lucid/__init__.py
CHANGED
@@ -25,7 +25,7 @@ import json
 import math
 import numpy as np

-from lucid._tensor import
+from lucid._tensor import *
 from lucid._func import *
 from lucid._util import *

@@ -308,7 +308,7 @@ def register_model(func: _ModuleReturnFunc) -> _ModuleReturnFunc:


 def _conv_view_limit_mb() -> int:
-    from lucid._kernel import conv as _conv_kernel
+    from lucid.nn._kernel import conv as _conv_kernel

     return _conv_kernel.get_conv_view_limit_mb()

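The switch to a wildcard import is what can surface the new fixed-dtype tensor classes (defined in `lucid/_tensor/tensor.py` below) at the package root, assuming `lucid/_tensor/__init__.py` re-exports them (its +11/-1 hunk is not expanded here). A minimal sketch of the intended effect, with that re-export treated as an assumption:

```python
import lucid

# Assumption: the star import re-exports the new fixed-dtype classes at the root.
x = lucid.FloatTensor([1.0, 2.0, 3.0])   # always stored as Float32
i = lucid.LongTensor([0, 1, 2])          # always stored as Int64
```
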
lucid/_tensor/__init__.py
CHANGED
lucid/_tensor/base.py
CHANGED
@@ -108,6 +108,8 @@ class _TensorBase:

     def broadcast_to(self, shape: _ShapeLike) -> Self: ...

+    def expand(self, *sizes: int | _ShapeLike) -> Self: ...
+
     def chunk(self, chunks: int, axis: int = 0) -> tuple[Self, ...]: ...

     def swapaxes(self, axis1: int, axis2: int) -> Self: ...

lucid/_tensor/tensor.py
CHANGED
@@ -1,4 +1,15 @@
-from typing import
+from typing import (
+    Callable,
+    Iterator,
+    Optional,
+    Self,
+    SupportsIndex,
+    Any,
+    overload,
+    Generic,
+    TypeVar,
+    ClassVar,
+)
 from types import NoneType
 from collections import deque

@@ -22,15 +33,32 @@ from lucid._backend.core import BackwardOperation, Operation, noop
 from lucid._backend.metal import mx, parse_mlx_indexing, check_metal_availability


+__all__ = [
+    "Tensor",
+    "FloatTensor",
+    "DoubleTensor",
+    "HalfTensor",
+    "CharTensor",
+    "ShortTensor",
+    "IntTensor",
+    "LongTensor",
+    "BoolTensor",
+]
+
+
+DType = TypeVar("DType", bound=Numeric | bool)
+
 _HookType = Callable[["Tensor", _NumPyArray | _MLXArray], None]

 _dtype_map = {int: types.Int64, float: types.Float64, complex: types.Complex64}


-class Tensor(_TensorBase, _TensorInplace):
+class Tensor(Generic[DType], _TensorBase, _TensorInplace):
+    _fixed_dtype: ClassVar[Numeric | None] = None
+
     def __init__(
         self,
-        data: _ArrayOrScalar
+        data: _ArrayOrScalar,
         requires_grad: bool = False,
         keep_grad: bool = False,
         dtype: _BuiltinNumeric | Numeric | None = None,
@@ -39,6 +67,9 @@ class Tensor(_TensorBase, _TensorInplace):
         self._is_free = False
         self._is_bool_tensor = False

+        if self._fixed_dtype is not None:
+            dtype = self._fixed_dtype
+
         if dtype is bool:
             self._is_bool_tensor = True
             dtype = None
@@ -285,6 +316,12 @@ class Tensor(_TensorBase, _TensorInplace):
             dtype = device_or_dtype
         return self.astype(dtype)

+    def cpu(self) -> Self:
+        return self.to(device="cpu")
+
+    def gpu(self) -> Self:
+        return self.to(device="gpu")
+
     def is_cpu(self) -> bool:
         return self.device == "cpu"

@@ -480,3 +517,155 @@ class Tensor(_TensorBase, _TensorInplace):

     def bool(self) -> Self:
         return self.astype(bool)
+
+
+class LongTensor(Tensor[types.Int64]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Int64
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Int64,
+            device=device,
+        )
+
+
+class IntTensor(Tensor[types.Int32]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Int32
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Int32,
+            device=device,
+        )
+
+
+class ShortTensor(Tensor[types.Int16]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Int16
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Int16,
+            device=device,
+        )
+
+
+class CharTensor(Tensor[types.Int8]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Int8
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Int8,
+            device=device,
+        )
+
+
+class HalfTensor(Tensor[types.Float16]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Float16
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Float16,
+            device=device,
+        )
+
+
+class FloatTensor(Tensor[types.Float32]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Float32
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Float32,
+            device=device,
+        )
+
+
+class DoubleTensor(Tensor[types.Float64]):
+    _fixed_dtype: ClassVar[Numeric | None] = types.Float64
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=types.Float64,
+            device=device,
+        )
+
+
+class BoolTensor(Tensor[bool]):
+    _fixed_dtype: ClassVar[Numeric | None] = None
+
+    def __init__(
+        self,
+        data: _ArrayOrScalar,
+        requires_grad: bool = False,
+        keep_grad: bool = False,
+        device: _DeviceType = "cpu",
+    ) -> None:
+        super().__init__(
+            data=data,
+            requires_grad=requires_grad,
+            keep_grad=keep_grad,
+            dtype=bool,
+            device=device,
+        )

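The `_fixed_dtype` class attribute is what makes each new subclass override whatever dtype the caller passes, and `cpu()`/`gpu()` are thin wrappers over the existing `to()` method. A hedged usage sketch (the GPU path assumes an available Metal/MLX device):

```python
from lucid._tensor.tensor import FloatTensor, LongTensor

x = FloatTensor([[1, 2], [3, 4]], requires_grad=True)  # stored as Float32 regardless of input
i = LongTensor([0, 1])                                  # stored as Int64

y = x.gpu()   # shorthand for x.to(device="gpu"); assumes Metal is available
z = y.cpu()   # back to the CPU (NumPy-backed) representation
```
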
lucid/_util/__init__.py
CHANGED
@@ -9,11 +9,11 @@ from lucid._util import func
 # fmt: off
 __all__ = [
     "reshape", "squeeze", "unsqueeze", "expand_dims", "ravel", "stack", "hstack",
-    "vstack", "concatenate", "pad", "repeat", "tile", "flatten", "meshgrid",
-    "split", "tril", "triu", "broadcast_to", "
-    "unbind", "sort", "nonzero", "unique", "topk", "argsort",
-    "histogram", "histogram2d", "where", "nonzero", "argmin",
-    "diagonal",
+    "vstack", "concatenate", "pad", "repeat", "tile", "flatten", "meshgrid",
+    "split", "tril", "triu", "broadcast_to", "expand", "chunk", "masked_fill",
+    "roll", "unbind", "sort", "nonzero", "unique", "topk", "argsort",
+    "histogramdd", "histogram", "histogram2d", "where", "nonzero", "argmin",
+    "argmax", "diagonal",
 ]
 # fmt: on

@@ -106,6 +106,14 @@ def broadcast_to(a: Tensor, /, shape: _ShapeLike) -> Tensor:
     return func.broadcast_to(shape)(a)


+def expand(a: Tensor, /, *sizes: int | _ShapeLike) -> Tensor:
+    if len(sizes) == 1 and isinstance(sizes[0], (tuple, list)):
+        shape = sizes[0]
+    else:
+        shape = sizes
+    return func.expand(shape)(a)
+
+
 def chunk(a: Tensor, /, chunks: int, axis: int = 0) -> tuple[Tensor, ...]:
     return func.chunk(chunks, axis)(a)

@@ -257,6 +265,7 @@ Tensor.split = split
 Tensor.tril = tril
 Tensor.triu = triu
 Tensor.broadcast_to = broadcast_to
+Tensor.expand = expand
 Tensor.chunk = chunk
 Tensor.masked_fill = masked_fill
 Tensor.roll = roll

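The new wrapper normalizes both calling conventions (varargs or a single shape sequence) before dispatching to `func.expand`, and the trailing `Tensor.expand = expand` assignment exposes it as a method. A hedged sketch of the intended call patterns:

```python
import lucid

a = lucid.Tensor([[1.0], [2.0], [3.0]])   # shape (3, 1)

b = a.expand(3, 4)        # varargs form
c = a.expand((3, 4))      # single shape sequence, routed to the same operation
d = a.expand(2, -1, 4)    # -1 keeps the existing size of that axis

print(b.shape, c.shape, d.shape)   # (3, 4) (3, 4) (2, 3, 4)
```
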
lucid/_util/func.py
CHANGED
@@ -605,6 +605,79 @@ class broadcast_to(Operation):
         return self.result.grad.reshape(self.original_shape)


+class expand(Operation):
+    def __init__(self, shape: _ShapeLike) -> None:
+        super().__init__()
+        self.shape = shape
+
+    def _resolve_shape(self, input_shape: tuple[int, ...]) -> tuple[int, ...]:
+        shape = tuple(int(dim) for dim in self.shape)
+        if len(shape) == 0:
+            raise ValueError("expand() expects at least one dimension.")
+
+        if len(shape) < len(input_shape):
+            raise ValueError(
+                "expand() cannot shrink the number of dimensions from "
+                f"{len(input_shape)} to {len(shape)}."
+            )
+
+        ndim_diff = len(shape) - len(input_shape)
+        padded_input = (1,) * ndim_diff + input_shape
+
+        resolved: list[int] = []
+        for axis, (target_dim, input_dim) in enumerate(zip(shape, padded_input)):
+            if target_dim == -1:
+                if axis < ndim_diff:
+                    raise ValueError(
+                        "expand() cannot use -1 in a leading, "
+                        "non-existing dimension."
+                    )
+                target_dim = input_dim
+
+            elif target_dim < -1:
+                raise ValueError("expand() size must be >= -1.")
+
+            if input_dim == target_dim:
+                resolved.append(target_dim)
+            elif input_dim == 1 and target_dim >= 0:
+                resolved.append(target_dim)
+            else:
+                raise ValueError(
+                    "expand() cannot expand dimension "
+                    f"{axis} from {input_dim} to {target_dim}."
+                )
+
+        return tuple(resolved)
+
+    @unary_func_op()
+    def cpu(self, a: Tensor) -> _FuncOpReturnType:
+        self.original_shape = a.shape
+        self.expanded_shape = self._resolve_shape(a.shape)
+
+        self.result = Tensor(np.broadcast_to(a.data, self.expanded_shape))
+        return self.result, self.__grad__
+
+    @unary_func_op(device="gpu")
+    def gpu(self, a: Tensor) -> _FuncOpReturnType:
+        self.original_shape = a.shape
+        self.expanded_shape = self._resolve_shape(a.shape)
+
+        self.result = Tensor(mx.broadcast_to(a.data, self.expanded_shape))
+        return self.result, self.__grad__
+
+    def __grad__(self) -> _GradType:
+        input_shape = self.original_shape
+        ndim_diff = len(self.expanded_shape) - len(input_shape)
+        if ndim_diff > 0:
+            input_shape = (1,) * ndim_diff + input_shape
+
+        for axis, (in_dim, out_dim) in enumerate(zip(input_shape, self.expanded_shape)):
+            if in_dim == 1 and out_dim > 1:
+                self.result.grad = self.result.grad.sum(axis=axis, keepdims=True)
+
+        return self.result.grad.reshape(self.original_shape)
+
+
 class chunk(Operation):
     def __init__(self, chunks: int, axis: int) -> None:
         super().__init__()

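`__grad__` mirrors the usual broadcasting rule: the upstream gradient is summed over every axis that was stretched from size 1, then reshaped back to the original input shape. A worked NumPy illustration of that reduction (illustrative only, not the library's code path):

```python
import numpy as np

original_shape = (3, 1)
expanded_shape = (2, 3, 4)

grad_out = np.ones(expanded_shape)   # incoming gradient, one entry per output element

# Pad the input shape with leading 1s, then sum over every stretched axis.
padded = (1,) * (len(expanded_shape) - len(original_shape)) + original_shape
for axis, (in_dim, out_dim) in enumerate(zip(padded, expanded_shape)):
    if in_dim == 1 and out_dim > 1:
        grad_out = grad_out.sum(axis=axis, keepdims=True)

grad_in = grad_out.reshape(original_shape)
print(grad_in)   # every entry equals 2 * 4 = 8
```
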
lucid/models/__init__.py
CHANGED

lucid/models/seqclf/__init__.py
ADDED
@@ -0,0 +1 @@
+from .bert import *

lucid/models/seqclf/bert.py
ADDED
@@ -0,0 +1,31 @@
+import lucid
+import lucid.nn as nn
+import lucid.nn.functional as F
+
+from lucid._tensor import Tensor
+
+
+class _BertEmbeddings(nn.Module):
+    def __init__(
+        self,
+        vocab_size: int,
+        hidden_size: int,
+        pad_token_id: int,
+        max_position_embeddings: int,
+        type_vocab_size: int,
+        layer_norm_eps: float,
+        hidden_dropout_prob: float,
+    ) -> None:
+        super().__init__()
+        self.word_embeddings = nn.Embedding(vocab_size, hidden_size, pad_token_id)
+        self.position_embeddings = nn.Embedding(max_position_embeddings, hidden_size)
+        self.token_type_embeddings = nn.Embedding(type_vocab_size)
+
+        self.layernorm = nn.LayerNorm(hidden_size, eps=layer_norm_eps)
+        self.dropout = nn.Dropout(hidden_dropout_prob)
+
+        self.position_ids: nn.Buffer
+        self.register_buffer(
+            "position_ids", nn.Buffer(lucid.arange(max_position_embeddings))
+        )
+        # TODO: Implement `lucid.Tensor.expand`

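The trailing TODO ties this stub to the new `Tensor.expand`: a BERT embedding block normally slices the registered `position_ids` buffer to the sequence length and expands it across the batch before summing the three embeddings. A hedged sketch of what the missing forward pass could look like (not part of the package; `input_ids` and `token_type_ids` are hypothetical names):

```python
# Hypothetical continuation of _BertEmbeddings, assuming the usual BERT layout.
def forward(self, input_ids, token_type_ids):
    batch_size, seq_len = input_ids.shape
    position_ids = self.position_ids[:seq_len].expand(batch_size, -1)

    embeddings = (
        self.word_embeddings(input_ids)
        + self.position_embeddings(position_ids)
        + self.token_type_embeddings(token_type_ids)
    )
    return self.dropout(self.layernorm(embeddings))
```
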
lucid/nn/_kernel/embedding.py
CHANGED
@@ -1,4 +1,4 @@
-import
+from functools import partial
 from types import ModuleType

 import numpy as np
@@ -7,49 +7,44 @@ from lucid._backend.core import Operation, func_op, _FuncOpReturnType, _GradType
 from lucid._backend.metal import mx
 from lucid._tensor import Tensor

-from lucid.types import _DeviceType, _TensorData
-
-
-def _as_int_array(arr, lib_: ModuleType) -> _TensorData:
-    if lib_ is np:
-        return arr.astype(np.int64)
-    return arr.astype(mx.int32)
-

 class embedding_kernel(Operation):
-    def __init__(self) -> None:
+    def __init__(self, padding_idx: int = -1) -> None:
         super().__init__()
+        self.padding_idx = int(padding_idx)
         self._indices = None
         self._num_embeddings = None

     def clear(self) -> None:
         super().clear()
+        self.padding_idx = -1
         self._indices = None
         self._num_embeddings = None

     @func_op(n_in=2, n_ret=1)
     def cpu(self, indices: Tensor, weight: Tensor) -> _FuncOpReturnType:
-        return self._forward(indices, weight, lib_=np
+        return self._forward(indices, weight, lib_=np)

     @func_op(n_in=2, n_ret=1, device="gpu")
     def gpu(self, indices: Tensor, weight: Tensor) -> _FuncOpReturnType:
-        return self._forward(indices, weight, lib_=mx
+        return self._forward(indices, weight, lib_=mx)

     def _forward(
-        self, indices: Tensor, weight: Tensor, lib_: ModuleType
+        self, indices: Tensor, weight: Tensor, lib_: ModuleType
     ) -> _FuncOpReturnType:
-        idx =
+        idx = indices.data
         out = weight.data[idx]

         self._indices = idx
         self._num_embeddings = int(weight.shape[0])

-        self.result = Tensor(out
-        return self.result,
+        self.result = Tensor(out)
+        return self.result, partial(self.__grad__, lib_=lib_)

     def __grad__(self, lib_: ModuleType) -> _GradType:
         if self.result is None or self.result.grad is None:
             raise RuntimeError("embedding backward called before forward.")
+
         if self._indices is None or self._num_embeddings is None:
             raise RuntimeError("embedding cached data missing.")

@@ -58,15 +53,23 @@ class embedding_kernel(Operation):
         grad_flat = grad_out.reshape(idx.shape[0], -1)

         if lib_ is np:
+            if self.padding_idx >= 0:
+                keep = idx != self.padding_idx
+                idx = idx[keep]
+                grad_flat = grad_flat[keep]
+
             grad_w = np.zeros(
                 (self._num_embeddings, grad_flat.shape[1]), dtype=grad_out.dtype
             )
             np.add.at(grad_w, idx, grad_flat)
+
         else:
             grad_w = mx.zeros(
                 (self._num_embeddings, grad_flat.shape[1]), dtype=grad_out.dtype
             )
             for i in range(idx.shape[0]):
+                if self.padding_idx >= 0 and int(idx[i]) == self.padding_idx:
+                    continue
                 grad_w = grad_w.at[idx[i]].add(grad_flat[i])

         return None, grad_w

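On the CPU path the backward pass is a scatter-add over the index array, with rows equal to `padding_idx` dropped so the padding embedding never accumulates gradient. A standalone NumPy illustration of that accumulation:

```python
import numpy as np

num_embeddings, dim, padding_idx = 5, 3, 0
idx = np.array([0, 2, 2, 4])            # token indices, 0 is the padding row
grad_out = np.ones((4, dim))            # upstream gradient, one row per lookup

keep = idx != padding_idx               # mask out padding positions
idx_kept, grad_kept = idx[keep], grad_out[keep]

grad_w = np.zeros((num_embeddings, dim))
np.add.at(grad_w, idx_kept, grad_kept)  # duplicate indices accumulate

print(grad_w[2])  # [2. 2. 2.] -- index 2 appeared twice
print(grad_w[0])  # [0. 0. 0.] -- padding row untouched
```
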
lucid/nn/functional/_util.py
CHANGED
@@ -1,3 +1,5 @@
+import numpy as np
+
 import lucid
 import lucid.nn.functional

@@ -5,6 +7,7 @@ from lucid._tensor import Tensor
 from lucid.types import _Scalar, Numeric

 from lucid.nn._kernel.embedding import embedding_kernel
+from lucid._backend.metal import mx


 def _interpolate_bilinear(
@@ -131,17 +134,46 @@ def embedding(
     max_norm: float | None = None,
     norm_type: float = 2.0,
 ) -> Tensor:
+    num_embeddings = int(weight.shape[0])
+    if padding_idx is None:
+        pad = -1
+    else:
+        pad = int(padding_idx)
+        if pad < 0:
+            pad += num_embeddings
+        if pad < 0 or pad >= num_embeddings:
+            raise IndexError("padding_idx out of range.")
+
     indices = input_.astype(lucid.Int)
-
-
-    if
-
-    output *= 1 - mask[..., None]
+    idx_data = indices.data
+
+    if (idx_data < 0).any() or (idx_data >= num_embeddings).any():
+        raise IndexError("embedding indices out of range.")

     if max_norm is not None:
-
-
-
+        lib_ = np if weight.is_cpu() else mx
+        flat = idx_data.reshape(-1)
+
+        w = weight.data[flat]
+        if norm_type <= 0:
+            raise ValueError("norm_type must be positive.")
+
+        norms = (lib_.abs(w) ** norm_type).sum(axis=1) ** (1.0 / norm_type)
+        scale = lib_.minimum(1.0, max_norm / (norms + (norms == 0)))
+
+        if pad >= 0:
+            mask = flat == pad
+            mask_f = mask.astype(scale.dtype)
+            scale = scale * (1 - mask_f) + mask_f
+
+        weight.data[flat] = w * scale[:, None]
+
+    op = embedding_kernel(padding_idx=pad)
+    output = op(indices, weight)
+
+    if pad >= 0:
+        mask = input_.data == pad
+        output *= 1 - mask[..., None]

     return output

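Taken together, the rewritten `embedding` validates the index range, renormalizes the looked-up rows in place when `max_norm` is set (leaving the padding row alone), and zeroes the output at padding positions. A hedged usage sketch of the functional form; the positional argument order follows the call in `nn.Embedding.forward`, and the `lucid.random.uniform` signature mirrors the rnn.py hunks below:

```python
import lucid
import lucid.nn.functional as F

weight = lucid.random.uniform(-1.0, 1.0, (10, 4))   # 10 embeddings of dim 4
ids = lucid.Tensor([[1, 3, 0], [2, 0, 0]])          # 0 is treated as the padding token

out = F.embedding(ids, weight, 0, 1.0, 2.0)         # padding_idx=0, max_norm=1.0, norm_type=2.0
print(out.shape)   # (2, 3, 4); rows at padding positions come out as zeros
```
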
lucid/nn/modules/attention.py
CHANGED
@@ -1,3 +1,5 @@
+import math
+
 import lucid
 import lucid.nn as nn
 import lucid.nn.functional as F
@@ -41,6 +43,7 @@ class ScaledDotProductAttention(nn.Module):
         "num_heads",
         "dropout",
         "bias",
+        "use_separate_proj_weight",
         "add_bias_kv",
         "add_zero_attn",
     )
@@ -51,6 +54,7 @@ class MultiHeadAttention(nn.Module):
         num_heads: int,
         dropout: float = 0.0,
         bias: bool = True,
+        use_separate_proj_weight: bool = True,
         add_bias_kv: bool = False,
         add_zero_attn: bool = False,
         kdim: int | None = None,
@@ -60,6 +64,7 @@ class MultiHeadAttention(nn.Module):
         self.embed_dim = embed_dim
         self.num_heads = num_heads
         self.dropout = dropout
+        self.use_separate_proj_weight = use_separate_proj_weight
         self.add_bias_kv = add_bias_kv
         self.add_zero_attn = add_zero_attn

@@ -70,9 +75,30 @@ class MultiHeadAttention(nn.Module):
         kdim = kdim if kdim is not None else embed_dim
         vdim = vdim if vdim is not None else embed_dim

-
-
-
+        if use_separate_proj_weight:
+            self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+            self.k_proj = nn.Linear(kdim, embed_dim, bias=bias)
+            self.v_proj = nn.Linear(vdim, embed_dim, bias=bias)
+
+            self.register_parameter("in_proj_weight", None)
+            self.register_parameter("in_proj_bias", None)
+        else:
+            if kdim != embed_dim or vdim != embed_dim:
+                raise ValueError(
+                    "in_proj_weight requires kdim and vdim to equal embed_dim."
+                )
+
+            weight_ = lucid.empty(3 * embed_dim, embed_dim)
+            self.in_proj_weight = nn.Parameter(weight_)
+            if bias:
+                bias_ = lucid.empty(3 * embed_dim)
+                self.in_proj_bias = nn.Parameter(bias_)
+            else:
+                self.register_parameter("in_proj_bias", None)
+
+            self.q_proj = None
+            self.k_proj = None
+            self.v_proj = None

         self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)

@@ -84,6 +110,17 @@ class MultiHeadAttention(nn.Module):
         self.bias_v = None

         self.scale: _Scalar = self.head_dim**-0.5
+        self._reset_parameters()
+
+    def _reset_parameters(self) -> None:
+        if self.in_proj_weight is None:
+            return
+
+        nn.init.kaiming_uniform(self.in_proj_weight)
+        if self.in_proj_bias is not None:
+            fan_in, _ = nn.init._dist._calculate_fan_in_and_fan_out(self.in_proj_weight)
+            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+            nn.init.uniform(self.in_proj_bias, -bound, bound)

     def forward(
         self,
@@ -97,9 +134,24 @@ class MultiHeadAttention(nn.Module):
         N, q_len = query.shape[:2]
         k_len, v_len = key.shape[1], value.shape[1]

-
-
-
+        if self.in_proj_weight is None:
+            q = self.q_proj(query)
+            k = self.k_proj(key)
+            v = self.v_proj(value)
+        else:
+            if query is key and key is value:
+                qkv = F.linear(query, self.in_proj_weight, self.in_proj_bias)
+                q, k, v = lucid.chunk(qkv, 3, axis=-1)
+            else:
+                w_q, w_k, w_v = lucid.chunk(self.in_proj_weight, 3, axis=0)
+                if self.in_proj_bias is not None:
+                    b_q, b_k, b_v = lucid.chunk(self.in_proj_bias, 3, axis=0)
+                else:
+                    b_q = b_k = b_v = None
+
+                q = F.linear(query, w_q, b_q)
+                k = F.linear(key, w_k, b_k)
+                v = F.linear(value, w_v, b_v)

         q = q.reshape(N, self.num_heads, q_len, self.head_dim)
         k = k.reshape(N, self.num_heads, k_len, self.head_dim)

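With `use_separate_proj_weight=False` the module keeps one packed `in_proj_weight` of shape `(3 * embed_dim, embed_dim)` (initialized by `_reset_parameters`) and splits it with `lucid.chunk`, taking the fused single-projection path when query, key, and value are literally the same tensor. A hedged self-attention sketch, assuming the class is exported as `nn.MultiHeadAttention`:

```python
import lucid
import lucid.nn as nn

mha = nn.MultiHeadAttention(embed_dim=16, num_heads=4, use_separate_proj_weight=False)

x = lucid.random.uniform(-1.0, 1.0, (2, 5, 16))   # (batch, seq_len, embed_dim)
out = mha(x, x, x)   # query is key is value, so the packed in_proj path is used
```
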
lucid/nn/modules/rnn.py
CHANGED
@@ -284,21 +284,16 @@ class RNNBase(nn.Module):
     ) -> None:
         super().__init__()
         self.is_lstm = False
-        cell_kwargs = {}
         nonlinearity = "tanh"

         if mode == "RNN_TANH":
-
-            cell_kwargs: dict[str, object] = {"nonlinearity": nonlinearity}
+            pass
         elif mode == "RNN_RELU":
             nonlinearity = "relu"
-            cell_cls = RNNCell
-            cell_kwargs = {"nonlinearity": nonlinearity}
         elif mode == "LSTM":
-            cell_cls = LSTMCell
             self.is_lstm = True
         elif mode == "GRU":
-
+            pass
         else:
             raise ValueError(
                 f"Invalid mode '{mode}'. Supported modes are 'RNN_TANH', "
@@ -315,18 +310,120 @@ class RNNBase(nn.Module):
         self.batch_first = batch_first
         self.dropout = float(dropout)

-        layers: list[nn.Module] = []
         for layer in range(num_layers):
             layer_input_size = input_size if layer == 0 else hidden_size
-
-
-
-
-
-
+            sqrt_k = 1.0 / (hidden_size**0.5)
+
+            if mode in ("RNN_TANH", "RNN_RELU"):
+                w_ih = nn.Parameter(
+                    lucid.random.uniform(
+                        -sqrt_k, sqrt_k, (hidden_size, layer_input_size)
+                    )
                 )
-
-
+                w_hh = nn.Parameter(
+                    lucid.random.uniform(-sqrt_k, sqrt_k, (hidden_size, hidden_size))
+                )
+            elif mode == "LSTM":
+                w_ih = nn.Parameter(
+                    lucid.random.uniform(
+                        -sqrt_k, sqrt_k, (4 * hidden_size, layer_input_size)
+                    )
+                )
+                w_hh = nn.Parameter(
+                    lucid.random.uniform(
+                        -sqrt_k, sqrt_k, (4 * hidden_size, hidden_size)
+                    )
+                )
+            else:
+                w_ih = nn.Parameter(
+                    lucid.random.uniform(
+                        -sqrt_k, sqrt_k, (3 * hidden_size, layer_input_size)
+                    )
+                )
+                w_hh = nn.Parameter(
+                    lucid.random.uniform(
+                        -sqrt_k, sqrt_k, (3 * hidden_size, hidden_size)
+                    )
+                )
+
+            self.register_parameter(f"weight_ih_l{layer}", w_ih)
+            self.register_parameter(f"weight_hh_l{layer}", w_hh)
+
+            if bias:
+                b_ih = nn.Parameter(
+                    lucid.random.uniform(-sqrt_k, sqrt_k, w_ih.shape[0])
+                )
+                b_hh = nn.Parameter(
+                    lucid.random.uniform(-sqrt_k, sqrt_k, w_hh.shape[0])
+                )
+                self.register_parameter(f"bias_ih_l{layer}", b_ih)
+                self.register_parameter(f"bias_hh_l{layer}", b_hh)
+            else:
+                self.register_parameter(f"bias_ih_l{layer}", None)
+                self.register_parameter(f"bias_hh_l{layer}", None)
+
+    def _rnn_cell(
+        self,
+        input_: Tensor,
+        hx: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor | None,
+        b_hh: Tensor | None,
+    ) -> Tensor:
+        hy = F.linear(input_, w_ih, b_ih)
+        hy += F.linear(hx, w_hh, b_hh)
+
+        if self.mode == "RNN_TANH":
+            return F.tanh(hy)
+
+        return F.relu(hy)
+
+    def _lstm_cell(
+        self,
+        input_: Tensor,
+        hx: Tensor,
+        cx: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor | None,
+        b_hh: Tensor | None,
+    ) -> tuple[Tensor, Tensor]:
+        gates = F.linear(input_, w_ih, b_ih)
+        gates += F.linear(hx, w_hh, b_hh)
+
+        i_t, f_t, g_t, o_t = lucid.split(gates, 4, axis=1)
+        i_t = F.sigmoid(i_t)
+        f_t = F.sigmoid(f_t)
+        g_t = F.tanh(g_t)
+        o_t = F.sigmoid(o_t)
+
+        c_t = f_t * cx + i_t * g_t
+        h_t = o_t * F.tanh(c_t)
+
+        return h_t, c_t
+
+    def _gru_cell(
+        self,
+        input_: Tensor,
+        hx: Tensor,
+        w_ih: Tensor,
+        w_hh: Tensor,
+        b_ih: Tensor | None,
+        b_hh: Tensor | None,
+    ) -> Tensor:
+        input_gates = F.linear(input_, w_ih, b_ih)
+        hidden_gates = F.linear(hx, w_hh, b_hh)
+
+        i_r, i_z, i_n = lucid.split(input_gates, 3, axis=1)
+        h_r, h_z, h_n = lucid.split(hidden_gates, 3, axis=1)
+
+        r_t = F.sigmoid(i_r + h_r)
+        z_t = F.sigmoid(i_z + h_z)
+        n_t = F.tanh(i_n + r_t * h_n)
+        h_t = (1 - z_t) * n_t + z_t * hx
+
+        return h_t

     def _init_hidden(
         self, batch_size: int, dtype: Numeric, device: _DeviceType
@@ -441,7 +538,12 @@ class RNNBase(nn.Module):
         h_n_list: list[Tensor] = []
         c_n_list: list[Tensor] | None = [] if self.is_lstm else None

-        for layer_idx
+        for layer_idx in range(self.num_layers):
+            w_ih = getattr(self, f"weight_ih_l{layer_idx}")
+            w_hh = getattr(self, f"weight_hh_l{layer_idx}")
+            b_ih = getattr(self, f"bias_ih_l{layer_idx}")
+            b_hh = getattr(self, f"bias_hh_l{layer_idx}")
+
             if self.is_lstm:
                 h_t = hx_h[layer_idx]
                 c_t = hx_c[layer_idx]
@@ -481,9 +583,13 @@ class RNNBase(nn.Module):
                     offset += bs

                 if self.is_lstm:
-                    h_t, c_t =
+                    h_t, c_t = self._lstm_cell(
+                        step_input, h_t, c_t, w_ih, w_hh, b_ih, b_hh
+                    )
+                elif self.mode == "GRU":
+                    h_t = self._gru_cell(step_input, h_t, w_ih, w_hh, b_ih, b_hh)
                 else:
-                    h_t =
+                    h_t = self._rnn_cell(step_input, h_t, w_ih, w_hh, b_ih, b_hh)

                 outputs.append(h_t)
                 prev_bs = bs
@@ -512,11 +618,17 @@ class RNNBase(nn.Module):

         else:
             for t in range(seq_len):
+                step_input = layer_input[t]
                 if self.is_lstm:
-                    h_t, c_t =
+                    h_t, c_t = self._lstm_cell(
+                        step_input, h_t, c_t, w_ih, w_hh, b_ih, b_hh
+                    )
+                    outputs.append(h_t.unsqueeze(axis=0))
+                elif self.mode == "GRU":
+                    h_t = self._gru_cell(step_input, h_t, w_ih, w_hh, b_ih, b_hh)
                     outputs.append(h_t.unsqueeze(axis=0))
                 else:
-                    h_t =
+                    h_t = self._rnn_cell(step_input, h_t, w_ih, w_hh, b_ih, b_hh)
                     outputs.append(h_t.unsqueeze(axis=0))

             layer_output = lucid.concatenate(tuple(outputs), axis=0)

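`_gru_cell` implements the standard GRU update from the split gate projections; writing the same arithmetic out makes the reconstruction easier to check. A NumPy rendering of one step (illustrative only, not the library code):

```python
import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

hidden = 4
x_t = np.random.randn(1, 3)              # one time step, input size 3
h_prev = np.zeros((1, hidden))

w_ih = np.random.randn(3 * hidden, 3)    # packed reset/update/new gate weights
w_hh = np.random.randn(3 * hidden, hidden)

i_r, i_z, i_n = np.split(x_t @ w_ih.T, 3, axis=1)
h_r, h_z, h_n = np.split(h_prev @ w_hh.T, 3, axis=1)

r_t = sigmoid(i_r + h_r)                 # reset gate
z_t = sigmoid(i_z + h_z)                 # update gate
n_t = np.tanh(i_n + r_t * h_n)           # candidate state
h_t = (1 - z_t) * n_t + z_t * h_prev     # same update rule as _gru_cell
```
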
lucid/nn/modules/sparse.py
CHANGED
@@ -21,7 +21,17 @@ class Embedding(nn.Module):
         super().__init__()
         self.num_embeddings = num_embeddings
         self.embedding_dim = embedding_dim
-
+
+        if padding_idx is None:
+            self.padding_idx = None
+        else:
+            pad = int(padding_idx)
+            if pad < 0:
+                pad += num_embeddings
+            if pad < 0 or pad >= num_embeddings:
+                raise IndexError("padding_idx out of range.")
+            self.padding_idx = pad
+
         self.max_norm = max_norm
         self.norm_type = norm_type

@@ -32,6 +42,9 @@ class Embedding(nn.Module):
         else:
             self.weight = nn.Parameter(_weight)

+        if self.padding_idx is not None:
+            self.weight.data[self.padding_idx] = 0
+
     def forward(self, input_: Tensor) -> Tensor:
         return F.embedding(
             input_, self.weight, self.padding_idx, self.max_norm, self.norm_type
@@ -39,3 +52,5 @@ class Embedding(nn.Module):

     def reset_parameters(self) -> None:
         self.weight.data = lucid.random.uniform(-0.1, 0.1, self.weight.shape)
+        if self.padding_idx is not None:
+            self.weight.data[self.padding_idx] = 0
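`nn.Embedding` now resolves a negative `padding_idx` the same way the functional path does and zeroes that row both at construction and in `reset_parameters`. A hedged usage sketch:

```python
import lucid
import lucid.nn as nn

emb = nn.Embedding(num_embeddings=6, embedding_dim=3, padding_idx=-1)  # resolves to row 5
print(emb.padding_idx)       # 5
print(emb.weight.data[5])    # zero row, kept at zero by reset_parameters()

tokens = lucid.Tensor([[1, 5, 2]])
out = emb(tokens)            # output at the padding position is all zeros
```
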

{lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: lucid-dl
-Version: 2.12.0
+Version: 2.12.1
 Summary: Lumerico's Comprehensive Interface for Deep Learning
 Home-page: https://github.com/ChanLumerico/lucid
 Author: ChanLumerico
@@ -48,6 +48,10 @@ Whether you're a student, educator, or an advanced researcher seeking to demysti

 ### 🔥 What's New

+- New Tensor utility function added: `lucid.Tensor.expand`
+
+- Added Type-Generic Tensors: `lucid.LongTensor`, `lucid.DoubleTensor`, etc.
+
 - Added new visual tool: `lucid.visual.build_tensor_mermaid_chart` which builds a Mermaid chart of given tensor's computatoinal graph

 - Added additional `nn.Module` hooks for richer introspection during training:

{lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/RECORD
CHANGED
@@ -1,4 +1,4 @@
-lucid/__init__.py,sha256=
+lucid/__init__.py,sha256=p5SmXNcVcuTeiCmgpKRU6ttNp-Nut58VblUy6B2a5HU,9159
 lucid/error.py,sha256=qnTiVuZm3c5-DIt-OOyobZ7RUm7E1K4NR0j998LG1ug,709
 lucid/port.py,sha256=Kt1YaSWef_eKF4KRj-UFhirvFC5urEESfYQ_BSlBZGE,3811
 lucid/types.py,sha256=Zdz2r4ledouEG-6Gi6yEza5vSLyyTzZJn7AcRKbxy8o,6906
@@ -12,11 +12,11 @@ lucid/_func/ufunc.py,sha256=AnCSykuYC0fNNoZso-TM60Rlq_4c54uMYOG3BHdvy20,30261
 lucid/_fusion/__init__.py,sha256=SVzLiFzs4m1mMOpefKDLFkYqV0zV5FGwFd9hEbUZtSo,68
 lucid/_fusion/base.py,sha256=d6nWuPjYxkie9Xrtbj3JVusnIN61PIoSFFSthJNm9os,3821
 lucid/_fusion/func.py,sha256=9tXzB-QNrx_AvNJiPto807faXKlzjuMG4o9gRgI5usc,1659
-lucid/_tensor/__init__.py,sha256=
-lucid/_tensor/base.py,sha256=
-lucid/_tensor/tensor.py,sha256=
-lucid/_util/__init__.py,sha256=
-lucid/_util/func.py,sha256=
+lucid/_tensor/__init__.py,sha256=prEYQ-GevlGoO7JblW_1dpwarRpCoWzrOYMjWAjSg2Q,180
+lucid/_tensor/base.py,sha256=KWYTFIuc8_ZOeVPUZh-ogT3C9Ey6wA6WLkwepE_CGhk,5323
+lucid/_tensor/tensor.py,sha256=gb9h30Kh6mdKg3eH0XFNLGlhUgyhKF-1lNr_wNcKw1k,19174
+lucid/_util/__init__.py,sha256=_p_qhtPKWzV-kwBpCqSsgXrWSbWWw3I7PMMhgFqiLYg,7085
+lucid/_util/func.py,sha256=DDA6vUYc9b8FeX1pvfI1jW8BpGOm7Wr8E_9btGrYYs0,46921
 lucid/autograd/__init__.py,sha256=hDoK_B2chRFVhoxsT4vxRKangzBEMWqF8gj2hdoTenk,6775
 lucid/data/__init__.py,sha256=qrDIQsnix5ZUEa0yrtomaaWbNJyJ3xEr2gdhRvg70_8,118
 lucid/data/_base.py,sha256=RM8xpBl8qFhm19n7eER_jOsRaxkL3rbOkwUvn6VetSE,5921
@@ -29,7 +29,7 @@ lucid/einops/__init__.py,sha256=9Dlmfw6PsIU9b_a89Zre4yV2rztRHPCL4QpsUnXJwjM,802
 lucid/einops/_func.py,sha256=XXsX9lse_0turKoFnOTtLdY6hBUi0gq_8K81G7nr80I,21026
 lucid/linalg/__init__.py,sha256=N-LrlC3qSsOMt6Ad1-PP3Qc3QH6EWNf5P50GBvwb9aQ,1118
 lucid/linalg/_func.py,sha256=Iyeut5nHwQmO8N326kQUaTjgoKVoBaxt_gy_3NXXD60,16378
-lucid/models/__init__.py,sha256=
+lucid/models/__init__.py,sha256=0BoSSrffJnK3Vqz3yVtvUReuB5uFWTYG8NosK2dz97U,111
 lucid/models/utils.py,sha256=2g8FLcMLRgVxgGEaYuwJyFxeXu-A_a4_MVr0K-TNh74,5195
 lucid/models/imgclf/__init__.py,sha256=kQH-nNu8_TPJ7Av151WSpcY4GJ06gGAd6Ozs3m3KMcE,590
 lucid/models/imgclf/alex.py,sha256=fZsPdCjWUseCrxBwKj-i5fPSDYLgBpfm0SJe07YKRuE,1472
@@ -76,6 +76,8 @@ lucid/models/objdet/yolo/yolo_v3.py,sha256=B5U42Npwfg8nSgU9E261zf0cbQS9RVYrX1ADD
 lucid/models/objdet/yolo/yolo_v4.py,sha256=RFbBumreXmy6s8IYZvUuhW0893ss8sx_8Vgi6KbBKWo,21467
 lucid/models/seq2seq/__init__.py,sha256=wjsrhj4H_AcqwwbebAN8b68QBA8L6p1_12dkG2995-w,27
 lucid/models/seq2seq/transformer.py,sha256=y5rerCs1s6jXTsVvbgscWScKpQKuSu1fezsBe7PNTRA,3513
+lucid/models/seqclf/__init__.py,sha256=qpzGjlHlqe7oQO4KBiz2XtchpoI9u1PUlaPAIh6EY0w,20
+lucid/models/seqclf/bert.py,sha256=wlnZsNci9dMd3yYCY3QoJqgA0s7gnSO2XcfR99l1JaA,990
 lucid/nn/__init__.py,sha256=nyy6px1CxfchWUh68xCiQSxD7Gk65vamhWK8ztRvH68,184
 lucid/nn/fused.py,sha256=75fcXuo6fHSO-JtjuKhowhHSDr4qc5871WR63sUzH0g,5492
 lucid/nn/module.py,sha256=_EWtGkAuWWCPZ5f3t5pJOOzpi14gQBpP7JW2S8o4_GE,26855
@@ -84,7 +86,7 @@ lucid/nn/_kernel/__init__.py,sha256=n1bnYdeb_bNDBKASWGywTRa0Ne9hMAkal3AuVZJgovI,
 lucid/nn/_kernel/activation.py,sha256=mfe48Aw3_Hv0hZEVC7DxDw19XK9XSLfdCOvo2JcZz_o,5662
 lucid/nn/_kernel/attention.py,sha256=1k0gboLObMNVow2v3TwliXC_2v8uKf2o8jHYFuyQqcg,3699
 lucid/nn/_kernel/conv.py,sha256=TiY3EkUAmwFCI1aA8YVMoZJHIRrqmJAXZEPh1C7lons,16412
-lucid/nn/_kernel/embedding.py,sha256=
+lucid/nn/_kernel/embedding.py,sha256=w90-SSr_DYzcI-zLkvye8P2o9C103imPPe4HBRPKUSg,2480
 lucid/nn/_kernel/loss.py,sha256=UD0B5DZ3R98OPZUigHsctL0eAJch2rKQpn1uaI3fzGg,13935
 lucid/nn/_kernel/norm.py,sha256=261WtixerLxFISIroQw8l_zZ3X0b4c_eDy8QHHA-i4M,11992
 lucid/nn/_kernel/pool.py,sha256=IQh5hfKU4PUvnGS1ayorUmytB_vCSxcbAwBYlFKw0iI,10697
@@ -98,12 +100,12 @@ lucid/nn/functional/_loss.py,sha256=b6KT8SrKe5lgAqlAmQnT00Hk7tvd-UcBPNryGYtTPWQ,
 lucid/nn/functional/_norm.py,sha256=yunKJttd3WTxXvzKuugL2LgHLmp-9dMxhHgQ9myLUzA,5041
 lucid/nn/functional/_pool.py,sha256=u6ykwqTZ38b9QPwUqFXpnPhOx2cc_9x9AfH0k26Y9pQ,4085
 lucid/nn/functional/_spatial.py,sha256=lazoSvVMFcauBWRbMOqmkgixA5bDes6scGHVWCgVmHE,3911
-lucid/nn/functional/_util.py,sha256=
+lucid/nn/functional/_util.py,sha256=gfsoGo7JgCHtPkcQqqisO8MfyyK57Pzy7Oeny8k2KKo,5936
 lucid/nn/init/__init__.py,sha256=YFi-HD2TEglweJ-gyX3n4UVZYzd70gcUi1dBu6hnOAY,1533
 lucid/nn/init/_dist.py,sha256=Tj9SKl43ZrJdv99X5qXUowdcts4f4D3tUk7RBmX5uCg,2462
 lucid/nn/modules/__init__.py,sha256=mol5Gfy-3ab5hBYZRxX0vjiI0w5VyKtBxVwj_vrOAZs,285
 lucid/nn/modules/activation.py,sha256=CpiKpzgZHoCp8UO5taCJ9BuwFz5mYUs0o1_TQcEwQbQ,2823
-lucid/nn/modules/attention.py,sha256=
+lucid/nn/modules/attention.py,sha256=XdOrGsS0zTPM8isP7MXoelGuuosNXO9HgD53wGPMBdM,6465
 lucid/nn/modules/conv.py,sha256=KbtInQgKSw3U_qXiqy7x53DZM9YAMUq7sFas1nV7NxY,13932
 lucid/nn/modules/drop.py,sha256=8127XhAbwk0nHWKVcGYqnnzsfmYn-WZ8iR6DXW_al5g,2127
 lucid/nn/modules/einops.py,sha256=3NGbfcBq9PZ9Vlbai53eBGGY4ckeWGXTCdPD73zuuNE,512
@@ -111,8 +113,8 @@ lucid/nn/modules/linear.py,sha256=87cuFWYct9JlmtVC3jGR-8eouxxzANaVA6cd7p9r2Ho,28
 lucid/nn/modules/loss.py,sha256=pjEMIruhtpTHhHFsNThS9LFz-aI_DAXLqMV8KRXydEg,3431
 lucid/nn/modules/norm.py,sha256=bYsKOg58kxzhMhbyvHrDDgVzN_p3D9HBTdYWpDtDeHQ,6842
 lucid/nn/modules/pool.py,sha256=ymVnS2NZjh08Tw0VeOfkB6AVrMeLmCKvgxkmEO3KUuw,5044
-lucid/nn/modules/rnn.py,sha256=
-lucid/nn/modules/sparse.py,sha256=
+lucid/nn/modules/rnn.py,sha256=y_dfvs-2PabKzug9jxMn8o4ir0KfMg64nOS2UKScBZY,24855
+lucid/nn/modules/sparse.py,sha256=C6Kz6Vhe9ko0Ym6JJHOH3HKdNFZJ_xTt2KSRqITXGl8,1620
 lucid/nn/modules/transformer.py,sha256=z56emF_eX18pxRELjfmmsY-7Bn9h2yjIdxCaxs6YDwA,11246
 lucid/nn/modules/vision.py,sha256=8xYasT7TNj4NXwMwwJIw1nbV1paeWEFg_ZohXn9kZBg,1579
 lucid/nn/utils/__init__.py,sha256=ynHrPi9SPdRRXhGjghG42FRBcEiVN8Hb_04XHBZqy_o,46
@@ -136,8 +138,8 @@ lucid/visual/__init__.py,sha256=tRgyNHzKWA8cp-a_GV586Bs0yJUN5ZTmKgnUhscutHQ,23
 lucid/visual/mermaid.py,sha256=m0X0kkdLuCxEzKmXSy3zplUaa3Gov8RRonKyHiEvfHE,32738
 lucid/weights/__init__.py,sha256=z1AikA3rOEeckWGkYWlcZkxNlJo9Xwa39PL6ly3hWnc,8801
 lucid/weights/__init__.pyi,sha256=lFonYC3cUx2Idolf3AEPnjFcyqcn3UDU84oJlZafqLY,3013
-lucid_dl-2.12.
-lucid_dl-2.12.
-lucid_dl-2.12.
-lucid_dl-2.12.
-lucid_dl-2.12.
+lucid_dl-2.12.1.dist-info/licenses/LICENSE,sha256=vxRFYnVD1IeYtsvw-KmoElfqrjxKHv1h9YTvsG54loQ,1065
+lucid_dl-2.12.1.dist-info/METADATA,sha256=QsDRFeh22Zlxi7RNDDb9aSM75Ly0lZVBtpX82v34kl8,11817
+lucid_dl-2.12.1.dist-info/WHEEL,sha256=wUyA8OaulRlbfwMtmQsvNngGrxQHAvkKcvRmdizlJi0,92
+lucid_dl-2.12.1.dist-info/top_level.txt,sha256=uzP_qBx9iNWIHKJRlElYcBLYVqMpdm9Q1Ma63QPYbFc,6
+lucid_dl-2.12.1.dist-info/RECORD,,

{lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/WHEEL
File without changes

{lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/licenses/LICENSE
File without changes

{lucid_dl-2.12.0.dist-info → lucid_dl-2.12.1.dist-info}/top_level.txt
File without changes