py-sadl 1.0.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sadl/tensor.py ADDED
@@ -0,0 +1,531 @@
1
+ """Custom tensor implementations that support autograd."""
2
+
3
+ from __future__ import annotations
4
+
5
+ import logging
6
+ from typing import TYPE_CHECKING, Any, ParamSpec, Self, TypeVar
7
+
8
+ from .backend import BACKEND, TensorDevice, xp
9
+ from .grad_ops import GradOp, get_grad_op, normalize_grad_op_name
10
+ from .utils import copy_array
11
+
12
+ if TYPE_CHECKING:
13
+ from collections.abc import Callable, Iterable, Mapping
14
+ from types import TracebackType
15
+
16
P = ParamSpec("P")  # captures the wrapped callable's parameters (used by no_grad_fn)
T = TypeVar("T")  # captures the wrapped callable's return type (used by no_grad_fn)


# Module-level logger per stdlib convention (named after this module).
logger = logging.getLogger(__name__)
21
+
22
+
23
def _to_array(x: Any) -> Any:
    """Recursively unwrap Tensors (also inside nested lists/tuples) to plain ndarrays."""
    if isinstance(x, Tensor):
        # asarray drops the Tensor subclass and yields a base backend array.
        return xp.asarray(x)
    if isinstance(x, (list, tuple)):
        # Rebuild the same container type around the converted elements.
        return type(x)(_to_array(item) for item in x)
    return x
31
+
32
+
33
def _to_tensor(x: Any) -> Tensor:
    """Wrap `x` in a gradient-free Tensor unless it already is a Tensor."""
    return x if isinstance(x, Tensor) else Tensor(x, requires_grad=False)
38
+
39
+
40
# Module-wide switch consulted by every tracked op; toggled via no_grad /
# set_global_grad_mode. When False, no computation graph is built.
_GRAD_MODE_ENABLED: bool = True
41
+
42
+
43
class no_grad:  # noqa: N801
    """Context manager that suspends gradient tracking for its duration.

    The previous global grad mode is remembered on entry and restored on
    exit, so nesting works correctly.
    """

    def __enter__(self) -> Self:
        global _GRAD_MODE_ENABLED
        # Remember the current mode and switch tracking off in one step
        # (the right-hand side is evaluated before either assignment).
        self.prev, _GRAD_MODE_ENABLED = _GRAD_MODE_ENABLED, False
        return self

    def __exit__(
        self,
        exc_type: type[BaseException] | None,
        exc_val: BaseException | None,
        exc_tb: TracebackType | None,
    ) -> None:
        # Restore whatever mode was active when the context was entered.
        global _GRAD_MODE_ENABLED
        _GRAD_MODE_ENABLED = self.prev
60
+
61
+
62
def set_global_grad_mode(enabled: bool) -> None:
    """Sets the global grad mode to `enabled`.

    Args:
        enabled (bool): Whether to enable or disable
            gradient tracking.
    """
    global _GRAD_MODE_ENABLED
    _GRAD_MODE_ENABLED = enabled
    # Lazy %-style args: the message is only formatted if DEBUG logging is on.
    logger.debug("Gradient tracking %s", "enabled" if enabled else "disabled")
72
+
73
+
74
def get_current_global_grad_mode() -> bool:
    """Report whether gradient tracking is currently enabled globally.

    Returns:
        bool: The current value of the module-wide grad-mode flag.
    """
    return _GRAD_MODE_ENABLED
82
+
83
+
84
def no_grad_fn(fn: Callable[P, T]) -> Callable[P, T]:
    """Disables gradient tracking for all ops in the annotated function.

    This decorator preserves the original function's type signature and,
    via `functools.wraps`, its runtime metadata (`__name__`, `__doc__`,
    `__module__`, `__wrapped__`), so introspection and debugging still
    see the decorated function rather than an anonymous `wrapper`.

    Args:
        fn (Callable[P, T]): The function in which to disable gradient tracking.

    Returns:
        The wrapped function with the same signature as the input.

    Example:
        >>> @no_grad_fn
        ... def inference(x: Tensor) -> Tensor:
        ...     return x * 2
    """
    from functools import wraps  # local import: keeps module-level deps unchanged

    @wraps(fn)  # copy __name__/__doc__/__module__ from fn onto the wrapper
    def wrapper(*args: P.args, **kwargs: P.kwargs) -> T:
        with no_grad():
            return fn(*args, **kwargs)

    return wrapper
106
+
107
+
108
class Tensor(xp.ndarray):  # type: ignore[misc]
    """A tensor wrapper around arrays with autograd support."""

    def __init__(  # noqa: PLR0913
        self,
        data: Any = None,  # noqa: ARG002 -> Ignored, handled by __new__, needed for signature
        *,
        src: tuple[Tensor, ...] | None = None,
        creator_op: str | None = None,
        op_ctx: dict[str, Any] | None = None,
        requires_grad: bool = False,
        keep_grad: bool = False,
    ) -> None:
        """Initializes autograd bookkeeping (the array data itself is set in __new__).

        Args:
            data (Any): Ignored here; consumed by __new__.
            src (tuple[Tensor, ...] | None): Parent tensors this one was computed from.
            creator_op (str | None): Name of the op that produced this tensor, used
                to look up the matching backward function.
            op_ctx (dict[str, Any] | None): Extra context the backward function needs.
            requires_grad (bool): Whether gradients should be tracked for this tensor.
            keep_grad (bool): Whether to retain `.grad` after backward.

        Raises:
            ValueError: If the tensor has parents (`src`) but no backward function
                is registered for `creator_op`.
        """
        self.src: tuple[Tensor, ...] = src or ()

        backward_fn = get_grad_op(creator_op) if creator_op else None

        # A non-leaf tensor must know how to propagate gradients to its parents.
        if not self.is_leaf() and backward_fn is None:
            raise ValueError(f'Gradient propagation not supported for op "{creator_op}"')

        self.backward_fn: GradOp | None = backward_fn
        self.op_ctx: dict[str, Any] = op_ctx or {}

        # Tracking is force-disabled while the global grad mode is off.
        self.requires_grad = _GRAD_MODE_ENABLED and requires_grad

        # Accumulated gradient; populated during backward.
        self.grad: xp.array | None = None

        self.keep_grad = keep_grad

    def __array_finalize__(self, obj: Any) -> None:
        """Called when a new Tensor is created via .view(), slicing, or ufuncs.

        Sets default values for all Tensor attributes. These can be overridden
        after creation if needed.
        """
        if obj is None:
            # Called from __new__ via explicit constructor - __init__ will handle it
            return
        # Copy attributes from source object if available, otherwise use defaults
        # These assignments are intentionally duplicated from __init__ because
        # __array_finalize__ is called for views/slices instead of __init__
        # NOTE(review): op_ctx (a dict) is shared by reference with the source
        # object here, not copied — confirm views mutating op_ctx is intended.
        self.src: tuple[Tensor, ...] = getattr(obj, "src", ())  # type: ignore[no-redef]
        self.backward_fn: GradOp | None = getattr(obj, "backward_fn", None)  # type: ignore[no-redef]
        self.op_ctx: dict[str, Any] = getattr(obj, "op_ctx", {})  # type: ignore[no-redef]
        self.requires_grad: bool = getattr(obj, "requires_grad", False)  # type: ignore[no-redef]
        self.grad: xp.array | None = getattr(obj, "grad", None)  # type: ignore[no-redef]
        self.keep_grad: bool = getattr(obj, "keep_grad", False)  # type: ignore[no-redef]

    def is_leaf(self) -> bool:
        """Whether this Tensor is a leaf in a computation graph.

        Checks whether it has no src/parents from which it was created.

        Returns:
            bool: If it is a leaf (`True`), or not (`False`).
        """
        return len(self.src) == 0

    def copy_to_device(self, device: TensorDevice) -> Tensor:
        """Copy tensor data to `device`.

        For intermediate tensors in a computation graph (non-leaf with sources
        that require grad), this is a tracked operation so gradients flow back.
        For leaf tensors, this is a utility operation.

        Note: If the Tensor already is on `device`, no copy is created. Instead,
        the Tensor is returned as is.

        Args:
            device (TensorDevice): The device to copy to. Should either
                be `cpu` or an integer specifying the GPU id.

        Returns:
            Tensor: A tensor with the same data, now on `device`.
        """
        return _copy_to_device(tensor=self, device=device)

    def detach(
        self,
        *,
        in_place: bool = False,
    ) -> Tensor:
        """Detach the Tensor from the computation graph.

        Args:
            in_place (bool): Whether to detach the current Tensor
                in-place (`True`), which would cut the current computation
                graph on that node, or to detach a copy (including the
                memory buffer, unlike in Pytorch) of the
                current Tensor (`False`), which does **not**
                break the current computation graph. Defaults to False.

        Returns:
            Tensor: The resulting Tensor. If `in_place` is `True`, it will
                be the same one identity-wise.
        """
        if in_place:
            # Cut all graph links on this node; the data buffer is untouched.
            self.src = ()
            self.backward_fn = None
            self.op_ctx = {}
            if not self.keep_grad:
                self.grad = None
            return self
        # Detached copy: fresh leaf Tensor backed by a copied buffer.
        detached_tensor = Tensor(
            self.copy(),
            requires_grad=self.requires_grad,
            keep_grad=self.keep_grad,
        )
        detached_tensor.grad = self.grad if self.keep_grad else None
        return detached_tensor

    def cpu(self) -> Tensor:
        """Move the Tensor to the cpu.

        Note: If the Tensor already is on the cpu,
        no copy is created. Instead, the Tensor is returned as is.

        Returns:
            Tensor: A **copy** of the Tensor on the cpu,
            if it wasn't on the cpu before.
        """
        return self.copy_to_device(device="cpu")

    def gpu(self, device_id: int = 0) -> Tensor:
        """Move the Tensor to a gpu with `id`.

        Note: If the Tensor already is on the gpu `device_id`,
        no copy is created. Instead, the Tensor is returned as is.

        Args:
            device_id (int): The id of the gpu to which the Tensor
                should be copied. Defaults to 0.

        Returns:
            Tensor: A **copy** of the Tensor on the specified gpu,
            if it wasn't on the gpu `device_id` before.
        """
        return self.copy_to_device(device=device_id)

    def __hash__(self) -> int:
        """Identity-based hash for use in sets/dicts (computation graph tracking)."""
        return id(self)

    def __new__(cls, data: Iterable[Any], **kwargs: Any) -> Self:
        """Initializes the data by viewing a backend array as this class."""
        # **kwargs accepts src, creator_op, etc. but we don't use them here
        # They'll be handled by __init__
        result: Self = xp.asarray(data, dtype=kwargs.get("dtype")).view(cls)
        return result

    def __array_ufunc__(
        self,
        ufunc: xp.ufunc,
        method: str,
        *inputs: Any,
        **kwargs: Any,
    ) -> Any:
        """Intercepts ufunc calls (+, *, exp, ...) to record them in the autograd graph.

        Inputs are unwrapped to plain arrays, the ufunc is executed, and — when
        the global grad mode is on — the result is wrapped in a Tensor that
        remembers its sources, creator op, and op context for the backward pass.
        """
        logger.debug(
            '__array_ufunc__: ufunc="%s" method="%s" inputs="%s" kwargs="%s"',
            ufunc.__name__,
            method,
            inputs,
            kwargs,
        )

        xp_input_arrays = tuple(_to_array(x) for x in inputs)
        kwargs = {k: _to_array(v) for k, v in kwargs.items()}

        track_kwargs = kwargs.copy()

        func = getattr(ufunc, method)

        # NOTE(review): for built-in ufuncs, getattr(ufunc, "__call__").__name__
        # is "__call__", not the ufunc's name, so this branch may never fire for
        # elementwise maximum/minimum — was `ufunc.__name__` intended? Verify
        # against the grad op for maximum/minimum, which seems to expect x_mask.
        if func.__name__ in ["maximum", "minimum"]:
            result = func(*xp_input_arrays, **kwargs)
            # create a mask for each axis, where True means the value in x
            # at this position is an extremum (maximum or minimum, depending on "method"):
            x_mask = result == xp_input_arrays[0]
            # just for completness we do "* kwargs.get("where", 1)" in the following line
            # this is because if "where" is "False" at a location, the extremum operation
            # should not be used, and therefore x_mask does not apply
            # (this is not strictly neccessary, because we account for this in the backward
            # function by setting the grad to "0" for both x and y at these locations anyway)
            track_kwargs["x_mask"] = x_mask * kwargs.get("where", 1)

        else:
            result = func(*xp_input_arrays, **kwargs)

        # Skip graph building when grad mode is disabled (e.g., during backward pass)
        if not _GRAD_MODE_ENABLED:
            return Tensor(result, requires_grad=False)

        src = tuple(_to_tensor(i) for i in inputs)
        creator_op = normalize_grad_op_name(name=ufunc.__name__, is_reduce=method == "reduce")

        return Tensor(
            result,
            src=src,
            creator_op=creator_op,
            op_ctx=track_kwargs,
            requires_grad=any(elem.requires_grad for elem in src),
        )

    def __array_function__(
        self,
        func: Any,
        types: Iterable[type],
        args: Iterable[Any],
        kwargs: Mapping[str, Any],
    ) -> Any:
        """Intercepts non-ufunc array functions (max, min, reshape, ...) for autograd.

        Mirrors __array_ufunc__: unwrap inputs, run the function, then wrap the
        result with graph metadata when gradient tracking is enabled. max/min
        additionally record an extremum mask for their backward function.
        """
        logger.debug(
            '__array_function__: func="%s" types="%s" args="%s" kwargs="%s"',
            func.__name__,
            types,
            args,
            kwargs,
        )

        xp_input_arrays = tuple(_to_array(x) for x in args)
        kwargs = {k: _to_array(v) for k, v in kwargs.items()}

        track_kwargs = kwargs.copy()

        if func.__name__ in ["max", "min"]:
            # execute the function, but retain the dimensions:
            keepdims_kwargs = kwargs.copy()
            keepdims_kwargs["keepdims"] = True
            result = func(*xp_input_arrays, **keepdims_kwargs)
            # create a mask for each axis, where True means the value in x
            # at this position is an extremum (maximum or minimum, depending on "method"):
            x_mask = result == xp_input_arrays[0]
            track_kwargs["x_mask"] = x_mask

            # Only squeeze back when the caller did not ask for keepdims.
            result = (
                result
                if kwargs.get("keepdims", False)
                else xp.squeeze(result, axis=kwargs.get("axis"))
            )
        else:
            result = func(*xp_input_arrays, **kwargs)

        # Skip graph building when grad mode is disabled (e.g., during backward pass)
        if not _GRAD_MODE_ENABLED:
            return Tensor(result, requires_grad=False)

        src = tuple(_to_tensor(a) for a in args)

        return Tensor(
            result,
            src=src,
            creator_op=func.__name__,
            op_ctx=track_kwargs,
            requires_grad=any(elem.requires_grad for elem in src),
        )
361
+
362
+
363
class Parameter(Tensor):
    """A special Tensor that should be part of a model to optimize.

    Parameters have an additional `is_training` attribute for controlling
    behavior of layers like Dropout and BatchNorm. Parameters are always
    float-typed: integer dtypes (explicit or inferred from the data) are
    rejected at construction time.
    """

    def __init__(
        self,
        data: Any = None,  # Ignored - handled by __new__, but needed for signature compatibility
        *,
        is_training: bool = True,
    ) -> None:
        """Initializes Parameter state: always a trainable leaf that keeps its grad.

        Args:
            data (Any): The parameter data (consumed by __new__).
            is_training (bool): Whether the parameter is in training mode.
                Defaults to True.
        """
        super().__init__(
            data=data,
            src=None,
            creator_op=None,
            op_ctx=None,
            requires_grad=True,
            keep_grad=True,
        )
        self.is_training = is_training

    def __array_finalize__(self, obj: Any) -> None:
        """Called when a new Parameter is created via .view(), slicing, or ufuncs.

        Extends Tensor's __array_finalize__ to also handle is_training.
        """
        # First call parent's __array_finalize__ to set Tensor attributes
        super().__array_finalize__(obj)
        # Then set Parameter-specific attributes
        self.is_training: bool = getattr(obj, "is_training", True)  # type: ignore[no-redef]

    def __new__(cls, data: Iterable[Any], **kwargs: Any) -> Self:
        """Initializes the data, rejecting integer dtypes.

        Raises:
            ValueError: If the resulting dtype (explicit via kwargs["dtype"]
                or inferred from `data`) is an integer type.
        """
        # parameters must always be float and can never be int
        # -> their optimization will unlikely yield pure integer
        # parameters
        # -> integer parameters are not differentiable!
        # this is because there exist no infinitesimal steps,
        # there is always a jump, e.g. 1 -> 2
        result = Tensor.__new__(
            cls=cls,
            data=data,
            **kwargs,
        )
        # Check the *resulting* dtype rather than only kwargs["dtype"]: this also
        # catches integer dtypes inferred from the data (e.g. Parameter([1, 2])),
        # which a check on the explicit dtype alone would silently let through.
        if xp.issubdtype(result.dtype, xp.integer):
            raise ValueError("Parameter must have float type, found int.")
        return result

    def copy_to_device(self, device: TensorDevice) -> Parameter:
        """Copy parameter data to `device`.

        This is a utility operation (not tracked in the
        computation graph). Consequently, the
        **copy-semantic is different to normal Tensors**.

        Note: If the Parameter already is on `device`, no copy is created.
        Instead, the Parameter is returned as is.

        Args:
            device (TensorDevice): The device to copy to. Should either
                be `cpu` or an integer specifying the GPU id.

        Returns:
            Parameter: A Parameter with the same data, now on `device`.
        """
        new_device_array = copy_array(array=self, device=device)

        # __array_finalize__ sets defaults, so we manually copy from self
        result: Parameter = new_device_array.view(Parameter)
        result.requires_grad = self.requires_grad
        result.keep_grad = self.keep_grad
        result.is_training = self.is_training
        return result
438
+
439
+
440
def tensor(
    data: Any,
    *,
    dtype: Any = None,
    device: TensorDevice = "cpu",
    requires_grad: bool = False,
    keep_grad: bool = False,
) -> Tensor:
    """Factory function to create a Tensor on the specified device.

    Args:
        data (Any): The array data (can be scalar, list, array, etc).
        dtype (Any): The data type of the array data.
            Defaults to None, meaning dtype is inferred from data.
        device (TensorDevice): The device on which the Tensor
            should be created. Defaults to "cpu".
        requires_grad (bool): Whether to track gradients. Defaults to False.
        keep_grad (bool): Whether to retain gradients after backward. Defaults to False.

    Returns:
        Tensor: The created Tensor.
    """
    if BACKEND != "numpy" and device != "cpu":
        # cupy backend with a GPU target: allocate inside that device's context.
        with xp.cuda.Device(device):
            arr = xp.array(data, dtype=dtype)
    else:
        arr = xp.array(data, dtype=dtype)

    out: Tensor = arr.view(Tensor)
    # .view() went through __array_finalize__, which filled in defaults;
    # override them with the caller's choices (gated by the global grad mode).
    out.requires_grad = _GRAD_MODE_ENABLED and requires_grad
    out.keep_grad = keep_grad
    return out
473
+
474
+
475
def _copy_to_device(tensor: Tensor, device: TensorDevice) -> Tensor:
    """Copy tensor data to `device`.

    For intermediate tensors in a computation graph (non-leaf with sources
    that require grad), this is a tracked operation so gradients flow back.
    For leaf tensors, this is a utility operation.

    Note: If the Tensor already is on `device`, no copy is created. Instead,
    the Tensor is returned as is.

    Args:
        tensor (Tensor): The tensor to copy to `device`.
        device (TensorDevice): The device to copy to. Should either
            be `cpu` or an integer specifying the GPU id.

    Returns:
        Tensor: A tensor with the same data, now on `device`.
    """
    moved = copy_array(array=tensor, device=device)

    # .view() ran __array_finalize__ with defaults (moved is a plain xp.ndarray),
    # so every autograd attribute is set explicitly below.
    result: Tensor = moved.view(Tensor)

    # A tensor sits inside an active graph when grad mode is on, it requires
    # grad itself, and at least one of its parents does too (e.g. intermediate
    # activations being shuttled between GPUs).
    tracked = (
        _GRAD_MODE_ENABLED
        and tensor.requires_grad
        and len(tensor.src) > 0
        and any(parent.requires_grad for parent in tensor.src)
    )

    if tracked:
        # Tracked operation: gradients flow back through the device transfer.
        result.src = (tensor,)
        result.backward_fn = get_grad_op("copy_to_device")
        result.requires_grad = True
    else:
        # Utility operation for leaf tensors.
        result.requires_grad = tensor.requires_grad

    result.keep_grad = tensor.keep_grad
    return result
520
+
521
+
522
# Public API of this module.
# NOTE(review): "_copy_to_device" is underscore-private by name yet exported
# here — confirm this is intentional.
__all__ = [
    "Parameter",
    "Tensor",
    "_copy_to_device",
    "get_current_global_grad_mode",
    "no_grad",
    "no_grad_fn",
    "set_global_grad_mode",
    "tensor",
]
sadl/utils.py ADDED
@@ -0,0 +1,33 @@
1
+ from .backend import BACKEND, TensorDevice, xp
2
+
3
+
4
def copy_array(array: "xp.ndarray", device: TensorDevice) -> "xp.ndarray":
    """Copy an array to the specified device.

    Args:
        array (xp.ndarray): The array to copy.
        device (TensorDevice): Target device, "cpu" or GPU id (int).

    Raises:
        ValueError: If device string is not "cpu".
        ValueError: If using numpy backend and requesting a GPU device.

    Returns:
        xp.ndarray: The array on the target device, or the original if already there.
    """
    # NOTE(review): this early-out assumes `array.device` compares equal to the
    # target: numpy only has `ndarray.device` (== "cpu") from 2.0 on, and cupy's
    # Device object may not compare equal to a plain int GPU id — verify with
    # both backends, otherwise same-device GPU copies are silently re-made.
    if array.device == device:
        return array
    if isinstance(device, str) and device != "cpu":
        raise ValueError('Only "cpu" allowed as string device.')
    if BACKEND == "numpy":
        raise ValueError(
            "Copying to another device is only possible when using cupy "
            "as the backend. Currently, numpy is the backend. Please "
            "check cupy and gpu availability."
        )
    # cupy:
    if isinstance(device, int):
        # Integer target: copy onto that GPU inside its device context.
        with xp.cuda.Device(device):
            return xp.asarray(array)
    else:
        # String target (must be "cpu" at this point): bring the data to host.
        return xp.asnumpy(array)