py-sadl 1.0.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- py_sadl-1.0.2.dist-info/METADATA +338 -0
- py_sadl-1.0.2.dist-info/RECORD +13 -0
- py_sadl-1.0.2.dist-info/WHEEL +4 -0
- py_sadl-1.0.2.dist-info/licenses/LICENSE +21 -0
- sadl/__init__.py +74 -0
- sadl/backend.py +45 -0
- sadl/disk.py +147 -0
- sadl/function.py +415 -0
- sadl/grad_ops.py +1158 -0
- sadl/ops.py +67 -0
- sadl/optimizer.py +352 -0
- sadl/tensor.py +531 -0
- sadl/utils.py +33 -0
sadl/tensor.py
ADDED
@@ -0,0 +1,531 @@
"""Custom tensor implementations that support autograd."""

from __future__ import annotations

import logging
from typing import TYPE_CHECKING, Any, ParamSpec, Self, TypeVar

from .backend import BACKEND, TensorDevice, xp
from .grad_ops import GradOp, get_grad_op, normalize_grad_op_name
from .utils import copy_array

if TYPE_CHECKING:
    from collections.abc import Callable, Iterable, Mapping
    from types import TracebackType

P = ParamSpec("P")
T = TypeVar("T")


logger = logging.getLogger(__name__)


def _to_array(x: Any) -> Any:
    """Recursively convert Tensors to plain ndarrays (handles nested lists/tuples)."""
    if isinstance(x, Tensor):
        return xp.asarray(x)
    if isinstance(x, list | tuple):
        converted = [_to_array(i) for i in x]
        return type(x)(converted)
    return x


def _to_tensor(x: Any) -> Tensor:
    """Convert input to Tensor. Non-Tensors become Tensors with requires_grad=False."""
    if isinstance(x, Tensor):
        return x
    return Tensor(x, requires_grad=False)


_GRAD_MODE_ENABLED: bool = True


class no_grad:  # noqa: N801
    """Context manager to disable gradient tracking in the context."""

    def __enter__(self) -> Self:
        global _GRAD_MODE_ENABLED
        self.prev = _GRAD_MODE_ENABLED
        _GRAD_MODE_ENABLED = False
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        global _GRAD_MODE_ENABLED
        _GRAD_MODE_ENABLED = self.prev


def set_global_grad_mode(enabled: bool) -> None:
    """Sets the global grad mode to `enabled`.

    Args:
        enabled (bool): Whether to enable or disable gradient tracking.
    """
    global _GRAD_MODE_ENABLED
    _GRAD_MODE_ENABLED = enabled
    logger.debug(f"Gradient tracking {'enabled' if enabled else 'disabled'}")


def get_current_global_grad_mode() -> bool:
    """Gets the current global grad mode.

    Returns:
        bool: Whether gradient tracking is enabled or disabled.
    """
    return _GRAD_MODE_ENABLED


def no_grad_fn(fn: Callable[P, T]) -> Callable[P, T]:
    """Disables gradient tracking for all ops in the decorated function.

    This decorator preserves the original function's type signature.

    Args:
        fn (Callable[P, T]): The function in which to disable gradient tracking.

    Returns:
        The wrapped function with the same signature as the input.

    Example:
        >>> @no_grad_fn
        ... def inference(x: Tensor) -> Tensor:
        ...     return x * 2
    """

    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        with no_grad():
            return fn(*args, **kwargs)

    return wrapper


class Tensor(xp.ndarray):  # type: ignore[misc]
    """A tensor wrapper around arrays with autograd support."""

    def __init__(  # noqa: PLR0913
        self,
        data: Any = None,  # noqa: ARG002 -> Ignored, handled by __new__, needed for signature
        *,
        src: tuple[Tensor, ...] | None = None,
        creator_op: str | None = None,
        op_ctx: dict[str, Any] | None = None,
        requires_grad: bool = False,
        keep_grad: bool = False,
    ) -> None:
        self.src: tuple[Tensor, ...] = src or ()

        backward_fn = get_grad_op(creator_op) if creator_op else None

        if not self.is_leaf() and backward_fn is None:
            raise ValueError(f'Gradient propagation not supported for op "{creator_op}"')

        self.backward_fn: GradOp | None = backward_fn
        self.op_ctx: dict[str, Any] = op_ctx or {}

        self.requires_grad = _GRAD_MODE_ENABLED and requires_grad

        self.grad: xp.array | None = None

        self.keep_grad = keep_grad

    def __array_finalize__(self, obj: Any) -> None:
        """Called when a new Tensor is created via .view(), slicing, or ufuncs.

        Sets default values for all Tensor attributes. These can be overridden
        after creation if needed.
        """
        if obj is None:
            # Called from __new__ via explicit constructor - __init__ will handle it
            return
        # Copy attributes from source object if available, otherwise use defaults
        # These assignments are intentionally duplicated from __init__ because
        # __array_finalize__ is called for views/slices instead of __init__
        self.src: tuple[Tensor, ...] = getattr(obj, "src", ())  # type: ignore[no-redef]
        self.backward_fn: GradOp | None = getattr(obj, "backward_fn", None)  # type: ignore[no-redef]
        self.op_ctx: dict[str, Any] = getattr(obj, "op_ctx", {})  # type: ignore[no-redef]
        self.requires_grad: bool = getattr(obj, "requires_grad", False)  # type: ignore[no-redef]
        self.grad: xp.array | None = getattr(obj, "grad", None)  # type: ignore[no-redef]
        self.keep_grad: bool = getattr(obj, "keep_grad", False)  # type: ignore[no-redef]

    def is_leaf(self) -> bool:
        """Whether this Tensor is a leaf in a computation graph.

        Checks whether it has no src/parents from which it was created.

        Returns:
            bool: If it is a leaf (`True`), or not (`False`).
        """
        return len(self.src) == 0

    def copy_to_device(self, device: TensorDevice) -> Tensor:
        """Copy tensor data to `device`.

        For intermediate tensors in a computation graph (non-leaf with sources
        that require grad), this is a tracked operation so gradients flow back.
        For leaf tensors, this is a utility operation.

        Note: If the Tensor already is on `device`, no copy is created. Instead,
        the Tensor is returned as is.

        Args:
            device (TensorDevice): The device to copy to. Should either
                be `cpu` or an integer specifying the GPU id.

        Returns:
            Tensor: A tensor with the same data, now on `device`.
        """
        return _copy_to_device(tensor=self, device=device)

    def detach(
        self,
        *,
        in_place: bool = False,
    ) -> Tensor:
        """Detach the Tensor from the computation graph.

        Args:
            in_place (bool): Whether to detach the current Tensor
                in-place (`True`), which cuts the current computation
                graph at that node, or to detach a copy (including the
                memory buffer, unlike in PyTorch) of the
                current Tensor (`False`), which does **not**
                break the current computation graph. Defaults to False.

        Returns:
            Tensor: The resulting Tensor. If `in_place` is `True`, it will
                be the same one identity-wise.
        """
        if in_place:
            self.src = ()
            self.backward_fn = None
            self.op_ctx = {}
            if not self.keep_grad:
                self.grad = None
            return self
        detached_tensor = Tensor(
            self.copy(),
            requires_grad=self.requires_grad,
            keep_grad=self.keep_grad,
        )
        detached_tensor.grad = self.grad if self.keep_grad else None
        return detached_tensor

    def cpu(self) -> Tensor:
        """Move the Tensor to the cpu.

        Note: If the Tensor already is on the cpu,
        no copy is created. Instead, the Tensor is returned as is.

        Returns:
            Tensor: A **copy** of the Tensor on the cpu,
                if it wasn't on the cpu before.
        """
        return self.copy_to_device(device="cpu")

    def gpu(self, device_id: int = 0) -> Tensor:
        """Move the Tensor to the gpu with id `device_id`.

        Note: If the Tensor already is on the gpu `device_id`,
        no copy is created. Instead, the Tensor is returned as is.

        Args:
            device_id (int): The id of the gpu to which the Tensor
                should be copied. Defaults to 0.

        Returns:
            Tensor: A **copy** of the Tensor on the specified gpu,
                if it wasn't on the gpu `device_id` before.
        """
        return self.copy_to_device(device=device_id)

    def __hash__(self) -> int:
        """Identity-based hash for use in sets/dicts (computation graph tracking)."""
        return id(self)

    def __new__(cls, data: Iterable[Any], **kwargs: Any) -> Self:
        """Initializes the data."""
        # **kwargs accepts src, creator_op, etc. but we don't use them here
        # They'll be handled by __init__
        result: Self = xp.asarray(data, dtype=kwargs.get("dtype")).view(cls)
        return result

    def __array_ufunc__(
        self,
        ufunc: xp.ufunc,
        method: str,
        *inputs: Any,
        **kwargs: Any,
    ) -> Any:
        logger.debug(
            '__array_ufunc__: ufunc="%s" method="%s" inputs="%s" kwargs="%s"',
            ufunc.__name__,
            method,
            inputs,
            kwargs,
        )

        xp_input_arrays = tuple(_to_array(x) for x in inputs)
        kwargs = {k: _to_array(v) for k, v in kwargs.items()}

        track_kwargs = kwargs.copy()

        func = getattr(ufunc, method)

        if func.__name__ in ["maximum", "minimum"]:
            result = func(*xp_input_arrays, **kwargs)
            # create a mask for each axis, where True means the value in x
            # at this position is an extremum (maximum or minimum, depending on "method"):
            x_mask = result == xp_input_arrays[0]
            # just for completeness we do '* kwargs.get("where", 1)' in the following line
            # this is because if "where" is "False" at a location, the extremum operation
            # should not be used, and therefore x_mask does not apply
            # (this is not strictly necessary, because we account for this in the backward
            # function by setting the grad to "0" for both x and y at these locations anyway)
            track_kwargs["x_mask"] = x_mask * kwargs.get("where", 1)

        else:
            result = func(*xp_input_arrays, **kwargs)

        # Skip graph building when grad mode is disabled (e.g., during backward pass)
        if not _GRAD_MODE_ENABLED:
            return Tensor(result, requires_grad=False)

        src = tuple(_to_tensor(i) for i in inputs)
        creator_op = normalize_grad_op_name(name=ufunc.__name__, is_reduce=method == "reduce")

        return Tensor(
            result,
            src=src,
            creator_op=creator_op,
            op_ctx=track_kwargs,
            requires_grad=any(elem.requires_grad for elem in src),
        )

    def __array_function__(
        self,
        func: Any,
        types: Iterable[type],
        args: Iterable[Any],
        kwargs: Mapping[str, Any],
    ) -> Any:
        logger.debug(
            '__array_function__: func="%s" types="%s" args="%s" kwargs="%s"',
            func.__name__,
            types,
            args,
            kwargs,
        )

        xp_input_arrays = tuple(_to_array(x) for x in args)
        kwargs = {k: _to_array(v) for k, v in kwargs.items()}

        track_kwargs = kwargs.copy()

        if func.__name__ in ["max", "min"]:
            # execute the function, but retain the dimensions:
            keepdims_kwargs = kwargs.copy()
            keepdims_kwargs["keepdims"] = True
            result = func(*xp_input_arrays, **keepdims_kwargs)
            # create a mask for each axis, where True means the value in x
            # at this position is an extremum (maximum or minimum, depending on the function):
            x_mask = result == xp_input_arrays[0]
            track_kwargs["x_mask"] = x_mask

            result = (
                result
                if kwargs.get("keepdims", False)
                else xp.squeeze(result, axis=kwargs.get("axis"))
            )
        else:
            result = func(*xp_input_arrays, **kwargs)

        # Skip graph building when grad mode is disabled (e.g., during backward pass)
        if not _GRAD_MODE_ENABLED:
            return Tensor(result, requires_grad=False)

        src = tuple(_to_tensor(a) for a in args)

        return Tensor(
            result,
            src=src,
            creator_op=func.__name__,
            op_ctx=track_kwargs,
            requires_grad=any(elem.requires_grad for elem in src),
        )


class Parameter(Tensor):
    """A special Tensor that should be part of a model to optimize.

    Parameters have an additional `is_training` attribute for controlling
    behavior of layers like Dropout and BatchNorm.
    """

    def __init__(
        self,
        data: Any = None,  # Ignored - handled by __new__, but needed for signature compatibility
        *,
        is_training: bool = True,
    ) -> None:
        super().__init__(
            data=data,
            src=None,
            creator_op=None,
            op_ctx=None,
            requires_grad=True,
            keep_grad=True,
        )
        self.is_training = is_training

    def __array_finalize__(self, obj: Any) -> None:
        """Called when a new Parameter is created via .view(), slicing, or ufuncs.

        Extends Tensor's __array_finalize__ to also handle is_training.
        """
        # First call parent's __array_finalize__ to set Tensor attributes
        super().__array_finalize__(obj)
        # Then set Parameter-specific attributes
        self.is_training: bool = getattr(obj, "is_training", True)  # type: ignore[no-redef]

    def __new__(cls, data: Iterable[Any], **kwargs: Any) -> Self:
        """Initializes the data."""
        # Parameters must always be float and can never be int:
        # -> their optimization is unlikely to yield pure integer parameters
        # -> integer parameters are not differentiable, because there are no
        #    infinitesimal steps; there is always a jump, e.g. 1 -> 2
        dtype = kwargs.get("dtype")
        if xp.issubdtype(dtype, xp.integer):
            raise ValueError("Parameter must have float type, found int.")
        return Tensor.__new__(
            cls=cls,
            data=data,
            **kwargs,
        )

    def copy_to_device(self, device: TensorDevice) -> Parameter:
        """Copy parameter data to `device`.

        This is a utility operation (not tracked in the
        computation graph). Consequently, the
        **copy semantics differ from normal Tensors**.

        Note: If the Parameter already is on `device`, no copy is created.
        Instead, the Parameter is returned as is.

        Args:
            device (TensorDevice): The device to copy to. Should either
                be `cpu` or an integer specifying the GPU id.

        Returns:
            Parameter: A Parameter with the same data, now on `device`.
        """
        new_device_array = copy_array(array=self, device=device)

        # __array_finalize__ sets defaults, so we manually copy from self
        result: Parameter = new_device_array.view(Parameter)
        result.requires_grad = self.requires_grad
        result.keep_grad = self.keep_grad
        result.is_training = self.is_training
        return result


def tensor(
    data: Any,
    *,
    dtype: Any = None,
    device: TensorDevice = "cpu",
    requires_grad: bool = False,
    keep_grad: bool = False,
) -> Tensor:
    """Factory function to create a Tensor on the specified device.

    Args:
        data (Any): The array data (can be scalar, list, array, etc).
        dtype (Any): The data type of the array data.
            Defaults to None, meaning dtype is inferred from data.
        device (TensorDevice): The device on which the Tensor
            should be created. Defaults to "cpu".
        requires_grad (bool): Whether to track gradients. Defaults to False.
        keep_grad (bool): Whether to retain gradients after backward. Defaults to False.

    Returns:
        Tensor: The created Tensor.
    """
    if BACKEND == "numpy" or device == "cpu":
        arr = xp.array(data, dtype=dtype)
    else:
        with xp.cuda.Device(device):
            arr = xp.array(data, dtype=dtype)

    result: Tensor = arr.view(Tensor)
    # __array_finalize__ sets defaults; override with user values
    result.requires_grad = _GRAD_MODE_ENABLED and requires_grad
    result.keep_grad = keep_grad
    return result


def _copy_to_device(tensor: Tensor, device: TensorDevice) -> Tensor:
    """Copy tensor data to `device`.

    For intermediate tensors in a computation graph (non-leaf with sources
    that require grad), this is a tracked operation so gradients flow back.
    For leaf tensors, this is a utility operation.

    Note: If the Tensor already is on `device`, no copy is created. Instead,
    the Tensor is returned as is.

    Args:
        tensor (Tensor): The tensor to copy to `device`.
        device (TensorDevice): The device to copy to. Should either
            be `cpu` or an integer specifying the GPU id.

    Returns:
        Tensor: A tensor with the same data, now on `device`.
    """
    new_device_array = copy_array(array=tensor, device=device)

    # Check if this is a non-leaf tensor in an active computation graph
    # (like intermediate activations in multi-GPU scenarios)
    src_requires_grad = [s.requires_grad for s in tensor.src]
    is_in_graph = (
        _GRAD_MODE_ENABLED
        and tensor.requires_grad
        and len(tensor.src) > 0
        and any(src_requires_grad)
    )

    # __array_finalize__ sets defaults from new_device_array (plain xp.ndarray),
    # so we manually set attributes
    result: Tensor = new_device_array.view(Tensor)

    if is_in_graph:
        # Tracked operation: gradients flow back through device transfer
        result.src = (tensor,)
        result.backward_fn = get_grad_op("copy_to_device")
        result.requires_grad = True
    else:
        # Utility operation for leaf tensors
        result.requires_grad = tensor.requires_grad

    result.keep_grad = tensor.keep_grad
    return result


__all__ = [
    "Parameter",
    "Tensor",
    "_copy_to_device",
    "get_current_global_grad_mode",
    "no_grad",
    "no_grad_fn",
    "set_global_grad_mode",
    "tensor",
]
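Taken together, the module above sketches a small autograd flow: NumPy-style ufuncs are intercepted by Tensor.__array_ufunc__, which records the operands as `src` and resolves a GradOp by name, while the grad-mode helpers gate whether that graph is built at all. Below is a minimal usage sketch (illustrative only, not part of the wheel); it assumes the package is importable as `sadl` with the numpy backend, and that a GradOp is registered for "multiply" in sadl/grad_ops.py, which is not shown in this diff.

from sadl.tensor import Tensor, no_grad, no_grad_fn, tensor

x = tensor([1.0, 2.0, 3.0], requires_grad=True)
y = tensor([4.0, 5.0, 6.0])

# Multiplication dispatches through Tensor.__array_ufunc__, which stores
# the operands in `src` and resolves the matching GradOp by name.
z = x * y
print(z.requires_grad)  # True: propagated from x
print(z.is_leaf())      # False: z has src parents

# Inside no_grad, the same op skips graph building entirely.
with no_grad():
    w = x * y
print(w.requires_grad)  # False
print(w.is_leaf())      # True: no src recorded

# detach() without in_place copies the data (memory buffer included)
# and drops the graph links on the copy only; z's graph stays intact.
z_detached = z.detach()
print(z_detached.is_leaf())  # True

@no_grad_fn
def inference(t: Tensor) -> Tensor:
    return t * 2

print(inference(x).requires_grad)  # False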
sadl/utils.py
ADDED
@@ -0,0 +1,33 @@
from .backend import BACKEND, TensorDevice, xp


def copy_array(array: "xp.ndarray", device: TensorDevice) -> "xp.ndarray":
    """Copy an array to the specified device.

    Args:
        array (xp.ndarray): The array to copy.
        device (TensorDevice): Target device, "cpu" or GPU id (int).

    Raises:
        ValueError: If device string is not "cpu".
        ValueError: If using numpy backend and requesting a GPU device.

    Returns:
        xp.ndarray: The array on the target device, or the original if already there.
    """
    if array.device == device:
        return array
    if isinstance(device, str) and device != "cpu":
        raise ValueError('Only "cpu" allowed as string device.')
    if BACKEND == "numpy":
        raise ValueError(
            "Copying to another device is only possible when using cupy "
            "as the backend. Currently, numpy is the backend. Please "
            "check cupy and gpu availability."
        )
    # cupy:
    if isinstance(device, int):
        with xp.cuda.Device(device):
            return xp.asarray(array)
    else:
        return xp.asnumpy(array)