quantmllibrary-0.1.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79)
  1. quantml/__init__.py +74 -0
  2. quantml/autograd.py +154 -0
  3. quantml/cli/__init__.py +10 -0
  4. quantml/cli/run_experiment.py +385 -0
  5. quantml/config/__init__.py +28 -0
  6. quantml/config/config.py +259 -0
  7. quantml/data/__init__.py +33 -0
  8. quantml/data/cache.py +149 -0
  9. quantml/data/feature_store.py +234 -0
  10. quantml/data/futures.py +254 -0
  11. quantml/data/loaders.py +236 -0
  12. quantml/data/memory_optimizer.py +234 -0
  13. quantml/data/validators.py +390 -0
  14. quantml/experiments/__init__.py +23 -0
  15. quantml/experiments/logger.py +208 -0
  16. quantml/experiments/results.py +158 -0
  17. quantml/experiments/tracker.py +223 -0
  18. quantml/features/__init__.py +25 -0
  19. quantml/features/base.py +104 -0
  20. quantml/features/gap_features.py +124 -0
  21. quantml/features/registry.py +138 -0
  22. quantml/features/volatility_features.py +140 -0
  23. quantml/features/volume_features.py +142 -0
  24. quantml/functional.py +37 -0
  25. quantml/models/__init__.py +27 -0
  26. quantml/models/attention.py +258 -0
  27. quantml/models/dropout.py +130 -0
  28. quantml/models/gru.py +319 -0
  29. quantml/models/linear.py +112 -0
  30. quantml/models/lstm.py +353 -0
  31. quantml/models/mlp.py +286 -0
  32. quantml/models/normalization.py +289 -0
  33. quantml/models/rnn.py +154 -0
  34. quantml/models/tcn.py +238 -0
  35. quantml/online.py +209 -0
  36. quantml/ops.py +1707 -0
  37. quantml/optim/__init__.py +42 -0
  38. quantml/optim/adafactor.py +206 -0
  39. quantml/optim/adagrad.py +157 -0
  40. quantml/optim/adam.py +267 -0
  41. quantml/optim/lookahead.py +97 -0
  42. quantml/optim/quant_optimizer.py +228 -0
  43. quantml/optim/radam.py +192 -0
  44. quantml/optim/rmsprop.py +203 -0
  45. quantml/optim/schedulers.py +286 -0
  46. quantml/optim/sgd.py +181 -0
  47. quantml/py.typed +0 -0
  48. quantml/streaming.py +175 -0
  49. quantml/tensor.py +462 -0
  50. quantml/time_series.py +447 -0
  51. quantml/training/__init__.py +135 -0
  52. quantml/training/alpha_eval.py +203 -0
  53. quantml/training/backtest.py +280 -0
  54. quantml/training/backtest_analysis.py +168 -0
  55. quantml/training/cv.py +106 -0
  56. quantml/training/data_loader.py +177 -0
  57. quantml/training/ensemble.py +84 -0
  58. quantml/training/feature_importance.py +135 -0
  59. quantml/training/features.py +364 -0
  60. quantml/training/futures_backtest.py +266 -0
  61. quantml/training/gradient_clipping.py +206 -0
  62. quantml/training/losses.py +248 -0
  63. quantml/training/lr_finder.py +127 -0
  64. quantml/training/metrics.py +376 -0
  65. quantml/training/regularization.py +89 -0
  66. quantml/training/trainer.py +239 -0
  67. quantml/training/walk_forward.py +190 -0
  68. quantml/utils/__init__.py +51 -0
  69. quantml/utils/gradient_check.py +274 -0
  70. quantml/utils/logging.py +181 -0
  71. quantml/utils/ops_cpu.py +231 -0
  72. quantml/utils/profiling.py +364 -0
  73. quantml/utils/reproducibility.py +220 -0
  74. quantml/utils/serialization.py +335 -0
  75. quantmllibrary-0.1.0.dist-info/METADATA +536 -0
  76. quantmllibrary-0.1.0.dist-info/RECORD +79 -0
  77. quantmllibrary-0.1.0.dist-info/WHEEL +5 -0
  78. quantmllibrary-0.1.0.dist-info/licenses/LICENSE +22 -0
  79. quantmllibrary-0.1.0.dist-info/top_level.txt +1 -0
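Before the file-level diff that follows, here is a minimal, hedged sketch of how the modules listed above fit together. It relies only on names visible in this diff (the quantml import package from top_level.txt, Tensor from quantml/tensor.py, and the functions defined in quantml/ops.py); the installation target is inferred from the wheel filename and is an assumption, not something stated in the metadata shown here.

# Hedged sketch; the install name is assumed from the wheel filename above.
#   pip install quantmllibrary
from quantml.tensor import Tensor
import quantml.ops as ops

x = Tensor([[1.0, 2.0], [3.0, 4.0]])             # plain data, no gradient tracking
w = Tensor([[0.5], [0.25]], requires_grad=True)  # parameters that should receive gradients
h = ops.relu(ops.matmul(x, w))                   # forward pass through ops defined below
loss = ops.mean(ops.mul(h, h))                   # scalar loss via element-wise ops and a reduction

How gradients are then pulled back (for example through quantml/autograd.py) is not shown in ops.py itself, so that step is left out of the sketch.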
quantml/ops.py ADDED
@@ -0,0 +1,1707 @@
1
+ """
2
+ Core operations for Tensor objects.
3
+
4
+ This module provides all the fundamental operations that can be performed
5
+ on tensors, including arithmetic, linear algebra, reductions, and activations.
6
+ All operations support automatic differentiation.
7
+ """
8
+
9
+ from typing import Union, Optional, List, Any, Callable
10
+ from quantml.tensor import Tensor
11
+ from quantml.autograd import backward
12
+
13
+ # Try to import NumPy for optimized operations
14
+ try:
15
+ import numpy as np
16
+ HAS_NUMPY = True
17
+ except ImportError:
18
+ HAS_NUMPY = False
19
+ np = None
20
+
21
+
22
+ def _to_tensor(x: Union[Tensor, float, int, List]) -> Tensor:
23
+ """Convert input to Tensor if needed."""
24
+ if isinstance(x, Tensor):
25
+ return x
26
+ return Tensor(x)
27
+
28
+
29
+ def _create_tensor_from_numpy(
30
+ arr: Any,
31
+ requires_grad: bool = False,
32
+ _prev: Optional[set] = None,
33
+ _op: Optional[str] = None,
34
+ _backward_fn: Optional[Callable] = None
35
+ ) -> Tensor:
36
+ """
37
+ Create a Tensor directly from a NumPy array, avoiding list conversion.
38
+
39
+ This is an internal optimization to eliminate NumPy -> list -> NumPy conversions.
40
+ """
41
+ if HAS_NUMPY and isinstance(arr, np.ndarray):
42
+ return Tensor(
43
+ arr, # Will be converted to NumPy in __init__
44
+ requires_grad=requires_grad,
45
+ _prev=_prev,
46
+ _op=_op,
47
+ _backward_fn=_backward_fn,
48
+ _np_array=arr # Direct pass to skip conversion
49
+ )
50
+ # Fallback to regular Tensor creation
51
+ return Tensor(
52
+ arr,
53
+ requires_grad=requires_grad,
54
+ _prev=_prev,
55
+ _op=_op,
56
+ _backward_fn=_backward_fn
57
+ )
58
+
59
+
60
+ def _to_numpy(data: Any):
61
+ """Convert data to NumPy array if NumPy is available."""
62
+ if not HAS_NUMPY:
63
+ return None
64
+ if isinstance(data, np.ndarray):
65
+ return data
66
+ return np.array(data, dtype=np.float64)
67
+
68
+
69
+ def _from_numpy(arr) -> List:
70
+ """Convert NumPy array to nested list."""
71
+ if arr is None:
72
+ return None
73
+ if HAS_NUMPY and isinstance(arr, np.ndarray):
74
+ return arr.tolist()
75
+ return arr
76
+
77
+
78
+ def _broadcast_shape(shape1: tuple, shape2: tuple) -> tuple:
79
+ """Compute broadcasted shape of two tensors."""
80
+ # Simple broadcasting: if one is scalar, use the other's shape
81
+ if shape1 == (1, 1) or shape1 == (1,):
82
+ return shape2
83
+ if shape2 == (1, 1) or shape2 == (1,):
84
+ return shape1
85
+ # For now, shapes must match or one must be scalar
86
+ return shape1 if len(shape1) >= len(shape2) else shape2
87
+
88
+
89
+ def _broadcast_op(a: List, b: List, op: callable, op_type: Optional[str] = None) -> List:
90
+ """Apply operation with broadcasting. Uses NumPy if available."""
91
+ if HAS_NUMPY:
92
+ try:
93
+ a_arr = _to_numpy(a)
94
+ b_arr = _to_numpy(b)
95
+ if a_arr is not None and b_arr is not None:
96
+ # Use NumPy broadcasting based on op_type
97
+ if op_type == 'add':
98
+ result = (a_arr + b_arr).tolist()
99
+ elif op_type == 'mul':
100
+ result = (a_arr * b_arr).tolist()
101
+ elif op_type == 'sub':
102
+ result = (a_arr - b_arr).tolist()
103
+ elif op_type == 'div':
104
+ result = (a_arr / b_arr).tolist()
105
+ else:
106
+ # Fallback for custom ops
107
+ result = np.vectorize(op)(a_arr, b_arr).tolist()
108
+ return result
109
+ except (ValueError, TypeError):
110
+ # Broadcasting failed, fall through to pure Python
111
+ pass
112
+
113
+ # Pure Python fallback
114
+ if isinstance(a[0], list) and isinstance(b[0], list):
115
+ # Both 2D
116
+ if len(a) == len(b) and len(a[0]) == len(b[0]):
117
+ return [[op(a[i][j], b[i][j]) for j in range(len(a[i]))]
118
+ for i in range(len(a))]
119
+ # Broadcast scalar
120
+ if len(b) == 1 and len(b[0]) == 1:
121
+ val = b[0][0]
122
+ return [[op(a[i][j], val) for j in range(len(a[i]))]
123
+ for i in range(len(a))]
124
+ if len(a) == 1 and len(a[0]) == 1:
125
+ val = a[0][0]
126
+ return [[op(val, b[i][j]) for j in range(len(b[i]))]
127
+ for i in range(len(b))]
128
+ elif isinstance(a[0], list):
129
+ # a is 2D, b is 1D or scalar
130
+ if isinstance(b, list) and not isinstance(b[0], list):
131
+ # b is 1D
132
+ return [[op(a[i][j], b[j] if j < len(b) else b[0])
133
+ for j in range(len(a[i]))] for i in range(len(a))]
134
+ else:
135
+ # b is scalar
136
+ val = float(b[0]) if isinstance(b, list) else float(b)
137
+ return [[op(a[i][j], val) for j in range(len(a[i]))]
138
+ for i in range(len(a))]
139
+ elif isinstance(b[0], list):
140
+ # b is 2D, a is 1D or scalar
141
+ val = float(a[0]) if isinstance(a, list) else float(a)
142
+ return [[op(val, b[i][j]) for j in range(len(b[i]))]
143
+ for i in range(len(b))]
144
+ else:
145
+ # Both 1D
146
+ max_len = max(len(a), len(b))
147
+ return [op(a[i] if i < len(a) else a[0],
148
+ b[i] if i < len(b) else b[0])
149
+ for i in range(max_len)]
150
+ return a
151
+
152
+
153
+ def add(a: Union[Tensor, float, int], b: Union[Tensor, float, int]) -> Tensor:
154
+ """
155
+ Add two tensors element-wise.
156
+
157
+ Args:
158
+ a: First tensor or scalar
159
+ b: Second tensor or scalar
160
+
161
+ Returns:
162
+ New tensor with result
163
+
164
+ Examples:
165
+ >>> x = Tensor([1.0, 2.0])
166
+ >>> y = Tensor([3.0, 4.0])
167
+ >>> z = add(x, y) # [4.0, 6.0]
168
+ """
169
+ a = _to_tensor(a)
170
+ b = _to_tensor(b)
171
+
172
+ # Use NumPy arrays if available for better performance
173
+ a_data = a.numpy if (HAS_NUMPY and a.numpy is not None) else a.data
174
+ b_data = b.numpy if (HAS_NUMPY and b.numpy is not None) else b.data
175
+
176
+ if HAS_NUMPY and isinstance(a_data, np.ndarray) and isinstance(b_data, np.ndarray):
177
+ try:
178
+ out_arr = a_data + b_data
179
+ # Create tensor directly from NumPy array to avoid conversion
180
+ out = _create_tensor_from_numpy(
181
+ out_arr,
182
+ requires_grad=a.requires_grad or b.requires_grad,
183
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
184
+ _op='add'
185
+ )
186
+ except (ValueError, TypeError):
187
+ out_data = _broadcast_op(a.data, b.data, lambda x, y: float(x) + float(y), 'add')
188
+ out = Tensor(
189
+ out_data,
190
+ requires_grad=a.requires_grad or b.requires_grad,
191
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
192
+ _op='add'
193
+ )
194
+ else:
195
+ out_data = _broadcast_op(a_data, b_data, lambda x, y: float(x) + float(y), 'add')
196
+ out = Tensor(
197
+ out_data,
198
+ requires_grad=a.requires_grad or b.requires_grad,
199
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
200
+ _op='add'
201
+ )
202
+
203
+ def _backward(grad):
204
+ if a.requires_grad:
205
+ a.backward(grad)
206
+ if b.requires_grad:
207
+ b.backward(grad)
208
+
209
+ if out.requires_grad:
210
+ out._backward_fn = _backward
211
+
212
+ return out
213
+
214
+
215
+ def sub(a: Union[Tensor, float, int], b: Union[Tensor, float, int]) -> Tensor:
216
+ """Subtract two tensors element-wise."""
217
+ return add(a, mul(b, -1.0))
218
+
219
+
220
+ def mul(a: Union[Tensor, float, int], b: Union[Tensor, float, int]) -> Tensor:
221
+ """
222
+ Multiply two tensors element-wise.
223
+
224
+ Args:
225
+ a: First tensor or scalar
226
+ b: Second tensor or scalar
227
+
228
+ Returns:
229
+ New tensor with result
230
+ """
231
+ a = _to_tensor(a)
232
+ b = _to_tensor(b)
233
+
234
+ # Use NumPy arrays if available
235
+ a_data = a.numpy if (HAS_NUMPY and a.numpy is not None) else a.data
236
+ b_data = b.numpy if (HAS_NUMPY and b.numpy is not None) else b.data
237
+
238
+ if HAS_NUMPY and isinstance(a_data, np.ndarray) and isinstance(b_data, np.ndarray):
239
+ try:
240
+ out_arr = a_data * b_data
241
+ # Create tensor directly from NumPy array
242
+ out = _create_tensor_from_numpy(
243
+ out_arr,
244
+ requires_grad=a.requires_grad or b.requires_grad,
245
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
246
+ _op='mul'
247
+ )
248
+ except (ValueError, TypeError):
249
+ out_data = _broadcast_op(a.data, b.data, lambda x, y: float(x) * float(y), 'mul')
250
+ out = Tensor(
251
+ out_data,
252
+ requires_grad=a.requires_grad or b.requires_grad,
253
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
254
+ _op='mul'
255
+ )
256
+ else:
257
+ out_data = _broadcast_op(a_data, b_data, lambda x, y: float(x) * float(y), 'mul')
258
+ out = Tensor(
259
+ out_data,
260
+ requires_grad=a.requires_grad or b.requires_grad,
261
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
262
+ _op='mul'
263
+ )
264
+
265
+ def _backward(grad):
266
+ if a.requires_grad:
267
+ a_grad = _broadcast_op(grad, b.data, lambda g, b_val: float(g) * float(b_val), 'mul')
268
+ a.backward(a_grad)
269
+ if b.requires_grad:
270
+ b_grad = _broadcast_op(grad, a.data, lambda g, a_val: float(g) * float(a_val), 'mul')
271
+ b.backward(b_grad)
272
+
273
+ if out.requires_grad:
274
+ out._backward_fn = _backward
275
+
276
+ return out
277
+
278
+
279
+ def div(a: Union[Tensor, float, int], b: Union[Tensor, float, int]) -> Tensor:
280
+ """Divide two tensors element-wise."""
281
+ return mul(a, pow(b, -1.0))
282
+
283
+
284
+ def pow(a: Union[Tensor, float, int], power: Union[float, int]) -> Tensor:
285
+ """
286
+ Raise tensor to a power.
287
+
288
+ Args:
289
+ a: Base tensor
290
+ power: Exponent
291
+
292
+ Returns:
293
+ New tensor with result
294
+ """
295
+ a = _to_tensor(a)
296
+ power = float(power)
297
+
298
+ # Use NumPy if available
299
+ if HAS_NUMPY:
300
+ try:
301
+ a_arr = _to_numpy(a.data)
302
+ if a_arr is not None:
303
+ out_arr = np.power(a_arr, power)
304
+ # Create tensor directly from NumPy array
305
+ out = _create_tensor_from_numpy(
306
+ out_arr,
307
+ requires_grad=a.requires_grad,
308
+ _prev={a} if a.requires_grad else set(),
309
+ _op='pow'
310
+ )
311
+ # Set backward function
312
+ def _backward(grad):
313
+ if a.requires_grad:
314
+ # d/dx (x^n) = n * x^(n-1)
315
+ if HAS_NUMPY:
316
+ try:
317
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
318
+ a_grad_arr = grad_arr * power * np.power(a_arr, power - 1)
319
+ a.backward(a_grad_arr)
320
+ except (ValueError, TypeError):
321
+ # Fallback
322
+ if isinstance(grad, list) and isinstance(a.data[0], list):
323
+ a_grad = [[float(grad[i][j]) * power * (float(a.data[i][j]) ** (power - 1))
324
+ for j in range(len(grad[i]))] for i in range(len(grad))]
325
+ elif isinstance(grad, list):
326
+ a_grad = [float(grad[i]) * power * (float(a.data[i]) ** (power - 1))
327
+ for i in range(len(grad))]
328
+ else:
329
+ a_grad = float(grad) * power * (float(a.data[0][0]) ** (power - 1))
330
+ a.backward(a_grad)
331
+ else:
332
+ if isinstance(grad, list) and isinstance(a.data[0], list):
333
+ a_grad = [[float(grad[i][j]) * power * (float(a.data[i][j]) ** (power - 1))
334
+ for j in range(len(grad[i]))] for i in range(len(grad))]
335
+ elif isinstance(grad, list):
336
+ a_grad = [float(grad[i]) * power * (float(a.data[i]) ** (power - 1))
337
+ for i in range(len(grad))]
338
+ else:
339
+ a_grad = float(grad) * power * (float(a.data[0][0]) ** (power - 1))
340
+ a.backward(a_grad)
341
+
342
+ if out.requires_grad:
343
+ out._backward_fn = _backward
344
+
345
+ return out
346
+ else:
347
+ # Fallback
348
+ def _pow_op(x):
349
+ return float(x) ** power
350
+ if isinstance(a.data[0], list):
351
+ out_data = [[_pow_op(a.data[i][j]) for j in range(len(a.data[i]))]
352
+ for i in range(len(a.data))]
353
+ else:
354
+ out_data = [_pow_op(x) for x in a.data]
355
+ except (ValueError, TypeError):
356
+ # Fallback
357
+ def _pow_op(x):
358
+ return float(x) ** power
359
+ if isinstance(a.data[0], list):
360
+ out_data = [[_pow_op(a.data[i][j]) for j in range(len(a.data[i]))]
361
+ for i in range(len(a.data))]
362
+ else:
363
+ out_data = [_pow_op(x) for x in a.data]
364
+ else:
365
+ # Pure Python
366
+ def _pow_op(x):
367
+ return float(x) ** power
368
+ if isinstance(a.data[0], list):
369
+ out_data = [[_pow_op(a.data[i][j]) for j in range(len(a.data[i]))]
370
+ for i in range(len(a.data))]
371
+ else:
372
+ out_data = [_pow_op(x) for x in a.data]
373
+
374
+ out = Tensor(
375
+ out_data,
376
+ requires_grad=a.requires_grad,
377
+ _prev={a} if a.requires_grad else set(),
378
+ _op='pow'
379
+ )
380
+
381
+ def _backward(grad):
382
+ if a.requires_grad:
383
+ # d/dx (x^n) = n * x^(n-1)
384
+ if isinstance(grad, list) and isinstance(a.data[0], list):
385
+ a_grad = [[float(grad[i][j]) * power * (float(a.data[i][j]) ** (power - 1))
386
+ for j in range(len(grad[i]))] for i in range(len(grad))]
387
+ elif isinstance(grad, list):
388
+ a_grad = [float(grad[i]) * power * (float(a.data[i]) ** (power - 1))
389
+ for i in range(len(grad))]
390
+ else:
391
+ a_grad = float(grad) * power * (float(a.data[0][0]) ** (power - 1))
392
+ a.backward(a_grad)
393
+
394
+ if out.requires_grad:
395
+ out._backward_fn = _backward
396
+
397
+ return out
398
+
399
+
400
+ def matmul(a: Tensor, b: Tensor) -> Tensor:
401
+ """
402
+ Matrix multiplication of two tensors.
403
+
404
+ Args:
405
+ a: First matrix (m x n)
406
+ b: Second matrix (n x p)
407
+
408
+ Returns:
409
+ Result matrix (m x p)
410
+
411
+ Examples:
412
+ >>> a = Tensor([[1.0, 2.0], [3.0, 4.0]])
413
+ >>> b = Tensor([[5.0, 6.0], [7.0, 8.0]])
414
+ >>> c = matmul(a, b) # [[19.0, 22.0], [43.0, 50.0]]
415
+ """
416
+ a = _to_tensor(a)
417
+ b = _to_tensor(b)
418
+
419
+ # Ensure 2D
420
+ a_data = a.data if isinstance(a.data[0], list) else [a.data]
421
+ b_data = b.data if isinstance(b.data[0], list) else [b.data]
422
+
423
+ # Matrix multiplication - use NumPy if available
424
+ if HAS_NUMPY:
425
+ try:
426
+ # Use numpy arrays directly from tensors if available
427
+ a_arr = a.numpy if (a.numpy is not None) else np.array(a_data, dtype=np.float64)
428
+ b_arr = b.numpy if (b.numpy is not None) else np.array(b_data, dtype=np.float64)
429
+ out_arr = np.dot(a_arr, b_arr)
430
+ # Create tensor directly from NumPy array
431
+ out = _create_tensor_from_numpy(
432
+ out_arr,
433
+ requires_grad=a.requires_grad or b.requires_grad,
434
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
435
+ _op='matmul'
436
+ )
437
+ except (ValueError, TypeError):
438
+ # Fallback to pure Python
439
+ m, n = len(a_data), len(a_data[0])
440
+ n2, p = len(b_data), len(b_data[0])
441
+ if n != n2:
442
+ raise ValueError(f"Matrix dimensions incompatible: {a.shape} x {b.shape}")
443
+ out_data = [[sum(float(a_data[i][k]) * float(b_data[k][j])
444
+ for k in range(n))
445
+ for j in range(p)]
446
+ for i in range(m)]
447
+ out = Tensor(
448
+ out_data,
449
+ requires_grad=a.requires_grad or b.requires_grad,
450
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
451
+ _op='matmul'
452
+ )
453
+ else:
454
+ # Pure Python implementation
455
+ m, n = len(a_data), len(a_data[0])
456
+ n2, p = len(b_data), len(b_data[0])
457
+ if n != n2:
458
+ raise ValueError(f"Matrix dimensions incompatible: {a.shape} x {b.shape}")
459
+ out_data = [[sum(float(a_data[i][k]) * float(b_data[k][j])
460
+ for k in range(n))
461
+ for j in range(p)]
462
+ for i in range(m)]
463
+ out = Tensor(
464
+ out_data,
465
+ requires_grad=a.requires_grad or b.requires_grad,
466
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
467
+ _op='matmul'
468
+ )
469
+
470
+ def _backward(grad):
471
+ if a.requires_grad:
472
+ # dL/da = grad @ b.T
473
+ if HAS_NUMPY:
474
+ try:
475
+ grad_arr = np.array(grad, dtype=np.float64)
476
+ b_arr = np.array(b_data, dtype=np.float64)
477
+ a_grad_arr = np.dot(grad_arr, b_arr.T)
478
+ a_grad = a_grad_arr.tolist()
479
+ except (ValueError, TypeError):
480
+ # Fallback
481
+ b_T = [[b_data[j][i] for j in range(len(b_data))]
482
+ for i in range(len(b_data[0]))]
483
+ a_grad = [[sum(float(grad[i][k]) * float(b_T[k][j])
484
+ for k in range(len(grad[0])))
485
+ for j in range(len(b_T[0]))]
486
+ for i in range(len(grad))]
487
+ else:
488
+ b_T = [[b_data[j][i] for j in range(len(b_data))]
489
+ for i in range(len(b_data[0]))]
490
+ a_grad = [[sum(float(grad[i][k]) * float(b_T[k][j])
491
+ for k in range(len(grad[0])))
492
+ for j in range(len(b_T[0]))]
493
+ for i in range(len(grad))]
494
+ a.backward(a_grad)
495
+ if b.requires_grad:
496
+ # dL/db = a.T @ grad
497
+ if HAS_NUMPY:
498
+ try:
499
+ a_arr = np.array(a_data, dtype=np.float64)
500
+ grad_arr = np.array(grad, dtype=np.float64)
501
+ b_grad_arr = np.dot(a_arr.T, grad_arr)
502
+ b_grad = b_grad_arr.tolist()
503
+ except (ValueError, TypeError):
504
+ # Fallback
505
+ a_T = [[a_data[j][i] for j in range(len(a_data))]
506
+ for i in range(len(a_data[0]))]
507
+ b_grad = [[sum(float(a_T[i][k]) * float(grad[k][j])
508
+ for k in range(len(a_T[0])))
509
+ for j in range(len(grad[0]))]
510
+ for i in range(len(a_T))]
511
+ else:
512
+ a_T = [[a_data[j][i] for j in range(len(a_data))]
513
+ for i in range(len(a_data[0]))]
514
+ b_grad = [[sum(float(a_T[i][k]) * float(grad[k][j])
515
+ for k in range(len(a_T[0])))
516
+ for j in range(len(grad[0]))]
517
+ for i in range(len(a_T))]
518
+ b.backward(b_grad)
519
+
520
+ if out.requires_grad:
521
+ out._backward_fn = _backward
522
+
523
+ return out
524
+
525
+
526
+ def dot(a: Tensor, b: Tensor) -> Tensor:
527
+ """
528
+ Dot product of two 1D tensors.
529
+
530
+ Args:
531
+ a: First vector
532
+ b: Second vector
533
+
534
+ Returns:
535
+ Scalar result
536
+ """
537
+ a = _to_tensor(a)
538
+ b = _to_tensor(b)
539
+
540
+ # Flatten to 1D
541
+ a_flat = a.data if not isinstance(a.data[0], list) else a.data[0]
542
+ b_flat = b.data if not isinstance(b.data[0], list) else b.data[0]
543
+
544
+ if len(a_flat) != len(b_flat):
545
+ raise ValueError(f"Vectors must have same length: {len(a_flat)} vs {len(b_flat)}")
546
+
547
+ # Use NumPy if available
548
+ if HAS_NUMPY:
549
+ try:
550
+ a_arr = np.array(a_flat, dtype=np.float64)
551
+ b_arr = np.array(b_flat, dtype=np.float64)
552
+ result = float(np.dot(a_arr, b_arr))
553
+ except (ValueError, TypeError):
554
+ result = sum(float(a_flat[i]) * float(b_flat[i]) for i in range(len(a_flat)))
555
+ else:
556
+ result = sum(float(a_flat[i]) * float(b_flat[i]) for i in range(len(a_flat)))
557
+
558
+ out = Tensor(
559
+ [[result]],
560
+ requires_grad=a.requires_grad or b.requires_grad,
561
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
562
+ _op='dot'
563
+ )
564
+
565
+ def _backward(grad):
566
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
567
+ if a.requires_grad:
568
+ a.backward([grad_val * float(b_flat[i]) for i in range(len(b_flat))])
569
+ if b.requires_grad:
570
+ b.backward([grad_val * float(a_flat[i]) for i in range(len(a_flat))])
571
+
572
+ if out.requires_grad:
573
+ out._backward_fn = _backward
574
+
575
+ return out
576
+
577
+
578
+
579
+ def transpose(t: Tensor) -> Tensor:
580
+ """
581
+ Transpose a 2D tensor.
582
+
583
+ Args:
584
+ t: Input tensor
585
+
586
+ Returns:
587
+ Transposed tensor
588
+ """
589
+ t = _to_tensor(t)
590
+
591
+ # Use NumPy if available
592
+ if HAS_NUMPY:
593
+ try:
594
+ t_arr = t.numpy if (t.numpy is not None) else _to_numpy(t.data)
595
+ if t_arr is not None:
596
+ if t_arr.ndim == 1:
597
+ out_arr = t_arr.reshape(1, -1).T
598
+ elif t_arr.ndim == 2:
599
+ out_arr = t_arr.T
600
+ else:
601
+ # Higher-dimensional input: fall back to a full-axis transpose
602
+ out_arr = np.transpose(t_arr) # Reverses dims by default
603
+
604
+ out = _create_tensor_from_numpy(
605
+ out_arr,
606
+ requires_grad=t.requires_grad,
607
+ _prev={t} if t.requires_grad else set(),
608
+ _op='transpose'
609
+ )
610
+ else:
611
+ out_data = _transpose_pure_python(t.data)
612
+ out = Tensor(
613
+ out_data,
614
+ requires_grad=t.requires_grad,
615
+ _prev={t} if t.requires_grad else set(),
616
+ _op='transpose'
617
+ )
618
+ except (ValueError, TypeError):
619
+ out_data = _transpose_pure_python(t.data)
620
+ out = Tensor(
621
+ out_data,
622
+ requires_grad=t.requires_grad,
623
+ _prev={t} if t.requires_grad else set(),
624
+ _op='transpose'
625
+ )
626
+ else:
627
+ out_data = _transpose_pure_python(t.data)
628
+ out = Tensor(
629
+ out_data,
630
+ requires_grad=t.requires_grad,
631
+ _prev={t} if t.requires_grad else set(),
632
+ _op='transpose'
633
+ )
634
+
635
+ def _backward(grad):
636
+ if t.requires_grad:
637
+ # Gradient of transpose is transpose of gradient
638
+ from quantml.ops import transpose
639
+ t.backward(transpose(Tensor(grad) if not isinstance(grad, Tensor) else grad).data)
640
+
641
+ if out.requires_grad:
642
+ out._backward_fn = _backward
643
+
644
+ return out
645
+
646
+
647
+ def _transpose_pure_python(data):
648
+ if not isinstance(data, list):
649
+ return [[data]]
650
+ if not isinstance(data[0], list):
651
+ return [[x] for x in data]
652
+ return [[data[j][i] for j in range(len(data))] for i in range(len(data[0]))]
653
+
654
+
655
+ def sum(t: Tensor, axis: Optional[int] = None) -> Tensor:
656
+ """
657
+ Sum elements of tensor, optionally along an axis.
658
+
659
+ Args:
660
+ t: Input tensor
661
+ axis: Axis to sum along (None for all elements)
662
+
663
+ Returns:
664
+ Sum result
665
+ """
666
+ t = _to_tensor(t)
667
+
668
+ # Use NumPy if available
669
+ if HAS_NUMPY:
670
+ try:
671
+ t_arr = t.numpy if (t.numpy is not None) else _to_numpy(t.data)
672
+ if t_arr is not None:
673
+ result = np.sum(t_arr, axis=axis, keepdims=True)
674
+ # Create tensor directly from NumPy array
675
+ out = _create_tensor_from_numpy(
676
+ result,
677
+ requires_grad=t.requires_grad,
678
+ _prev={t} if t.requires_grad else set(),
679
+ _op='sum'
680
+ )
681
+ # Set backward function
682
+ def _backward(grad):
683
+ if t.requires_grad:
684
+ # Broadcast gradient back
685
+ if HAS_NUMPY:
686
+ try:
687
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
688
+ if axis is None:
689
+ # Broadcast to all elements
690
+ t_grad = np.broadcast_to(grad_arr, t_arr.shape)
691
+ elif axis == 0:
692
+ # Broadcast along axis 0
693
+ t_grad = np.broadcast_to(grad_arr, t_arr.shape)
694
+ else: # axis == 1
695
+ t_grad = np.broadcast_to(grad_arr, t_arr.shape)
696
+ t.backward(t_grad)
697
+ except (ValueError, TypeError):
698
+ # Fallback to list operations
699
+ if axis is None:
700
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
701
+ if isinstance(t.data[0], list):
702
+ t_grad = [[grad_val for _ in row] for row in t.data]
703
+ else:
704
+ t_grad = [grad_val for _ in t.data]
705
+ elif axis == 0:
706
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
707
+ if isinstance(t.data[0], list):
708
+ t_grad = [[grad_val for _ in range(len(t.data))]
709
+ for _ in range(len(t.data[0]))]
710
+ t_grad = [[t_grad[j][i] for j in range(len(t_grad))]
711
+ for i in range(len(t_grad[0]))]
712
+ else:
713
+ t_grad = [grad_val for _ in t.data]
714
+ else: # axis == 1
715
+ if isinstance(grad[0], list):
716
+ t_grad = [[float(grad[i][0]) for _ in range(len(t.data[i]))]
717
+ for i in range(len(t.data))]
718
+ else:
719
+ t_grad = [[float(grad[i]) for _ in range(len(t.data[i]))]
720
+ for i in range(len(t.data))]
721
+ t.backward(t_grad)
722
+ else:
723
+ # Fallback (same as before)
724
+ if axis is None:
725
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
726
+ if isinstance(t.data[0], list):
727
+ t_grad = [[grad_val for _ in row] for row in t.data]
728
+ else:
729
+ t_grad = [grad_val for _ in t.data]
730
+ elif axis == 0:
731
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
732
+ if isinstance(t.data[0], list):
733
+ t_grad = [[grad_val for _ in range(len(t.data))]
734
+ for _ in range(len(t.data[0]))]
735
+ t_grad = [[t_grad[j][i] for j in range(len(t_grad))]
736
+ for i in range(len(t_grad[0]))]
737
+ else:
738
+ t_grad = [grad_val for _ in t.data]
739
+ else: # axis == 1
740
+ if isinstance(grad[0], list):
741
+ t_grad = [[float(grad[i][0]) for _ in range(len(t.data[i]))]
742
+ for i in range(len(t.data))]
743
+ else:
744
+ t_grad = [[float(grad[i]) for _ in range(len(t.data[i]))]
745
+ for i in range(len(t.data))]
746
+ t.backward(t_grad)
747
+
748
+ if out.requires_grad:
749
+ out._backward_fn = _backward
750
+
751
+ return out
752
+ except (ValueError, TypeError):
753
+ # Fallback to pure Python
754
+ if axis is None:
755
+ total = 0.0
756
+ if isinstance(t.data[0], list):
757
+ for row in t.data:
758
+ total += sum(float(x) for x in row)
759
+ else:
760
+ total = sum(float(x) for x in t.data)
761
+ out_data = [[total]]
762
+ elif axis == 0:
763
+ if isinstance(t.data[0], list):
764
+ out_data = [[sum(float(t.data[i][j]) for i in range(len(t.data)))
765
+ for j in range(len(t.data[0]))]]
766
+ else:
767
+ out_data = [[sum(float(x) for x in t.data)]]
768
+ elif axis == 1:
769
+ if isinstance(t.data[0], list):
770
+ out_data = [[sum(float(row[j]) for j in range(len(row)))]
771
+ for row in t.data]
772
+ else:
773
+ out_data = t.data
774
+ else:
775
+ raise ValueError(f"Invalid axis: {axis}")
776
+ else:
777
+ # Pure Python implementation
778
+ if axis is None:
779
+ total = 0.0
780
+ if isinstance(t.data[0], list):
781
+ for row in t.data:
782
+ total += sum(float(x) for x in row)
783
+ else:
784
+ total = sum(float(x) for x in t.data)
785
+ out_data = [[total]]
786
+ elif axis == 0:
787
+ if isinstance(t.data[0], list):
788
+ out_data = [[sum(float(t.data[i][j]) for i in range(len(t.data)))
789
+ for j in range(len(t.data[0]))]]
790
+ else:
791
+ out_data = [[sum(float(x) for x in t.data)]]
792
+ elif axis == 1:
793
+ if isinstance(t.data[0], list):
794
+ out_data = [[sum(float(row[j]) for j in range(len(row)))]
795
+ for row in t.data]
796
+ else:
797
+ out_data = t.data
798
+ else:
799
+ raise ValueError(f"Invalid axis: {axis}")
800
+
801
+ out = Tensor(
802
+ out_data,
803
+ requires_grad=t.requires_grad,
804
+ _prev={t} if t.requires_grad else set(),
805
+ _op='sum'
806
+ )
807
+
808
+ def _backward(grad):
809
+ if t.requires_grad:
810
+ # Broadcast gradient back
811
+ if axis is None:
812
+ # Broadcast to all elements
813
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
814
+ if isinstance(t.data[0], list):
815
+ t_grad = [[grad_val for _ in row] for row in t.data]
816
+ else:
817
+ t_grad = [grad_val for _ in t.data]
818
+ elif axis == 0:
819
+ grad_val = float(grad[0][0]) if isinstance(grad[0], list) else float(grad[0])
820
+ if isinstance(t.data[0], list):
821
+ t_grad = [[grad_val for _ in range(len(t.data))]
822
+ for _ in range(len(t.data[0]))]
823
+ t_grad = [[t_grad[j][i] for j in range(len(t_grad))]
824
+ for i in range(len(t_grad[0]))]
825
+ else:
826
+ t_grad = [grad_val for _ in t.data]
827
+ else: # axis == 1
828
+ if isinstance(grad[0], list):
829
+ t_grad = [[float(grad[i][0]) for _ in range(len(t.data[i]))]
830
+ for i in range(len(t.data))]
831
+ else:
832
+ t_grad = [[float(grad[i]) for _ in range(len(t.data[i]))]
833
+ for i in range(len(t.data))]
834
+ t.backward(t_grad)
835
+
836
+ if out.requires_grad:
837
+ out._backward_fn = _backward
838
+
839
+ return out
840
+
841
+
842
+ def mean(t: Tensor, axis: Optional[int] = None) -> Tensor:
843
+ """Compute mean of tensor elements."""
844
+ t = _to_tensor(t)
845
+ s = sum(t, axis=axis)
846
+ if axis is None:
847
+ count = 1.0
848
+ if isinstance(t.data[0], list):
849
+ count = len(t.data) * len(t.data[0])
850
+ else:
851
+ count = len(t.data)
852
+ elif axis == 0:
853
+ count = float(len(t.data))  # rows if 2D, all elements if 1D
854
+ else:
855
+ count = len(t.data[0]) if isinstance(t.data[0], list) else len(t.data)
856
+
857
+ return div(s, count)
858
+
859
+
860
+ def var(t: Tensor, axis: Optional[int] = None, unbiased: bool = True) -> Tensor:
861
+ """
862
+ Compute variance of tensor elements.
863
+
864
+ Args:
865
+ t: Input tensor
866
+ axis: Axis to compute variance over
867
+ unbiased: If True, use Bessel's correction (N-1)
868
+
869
+ Returns:
870
+ Tensor with variance
871
+ """
872
+ t = _to_tensor(t)
873
+ m = mean(t, axis=axis)
874
+
875
+ # Expand mean to match shape for subtraction
876
+ # If axis is 0 (cols), mean is 1xM, needs to be NxM
877
+ # If axis is 1 (rows), mean is Nx1, needs to be NxM
878
+ # Note: sub() may not broadcast correctly when the shapes do not align exactly;
879
+ # for now we rely on sub()'s broadcasting, reshaping manually where needed.
880
+
881
+ diff = sub(t, m)
882
+ diff_sq = mul(diff, diff)
883
+
884
+ # Sum of squared differences
885
+ s = sum(diff_sq, axis=axis)
886
+
887
+ # Divide by N or N-1
888
+ if axis is None:
889
+ count = len(t.data) * len(t.data[0]) if isinstance(t.data[0], list) else len(t.data)
890
+ elif axis == 0:
891
+ count = len(t.data)  # rows if 2D, all elements if 1D
892
+ else:
893
+ count = len(t.data[0]) if isinstance(t.data[0], list) else len(t.data)
894
+
895
+ denom = count - 1 if unbiased and count > 1 else count
896
+
897
+ return div(s, float(denom))
898
+
899
+
900
+ def std(t: Tensor, axis: Optional[int] = None, unbiased: bool = True) -> Tensor:
901
+ """Compute standard deviation of tensor elements."""
902
+ v = var(t, axis=axis, unbiased=unbiased)
903
+ return pow(v, 0.5)
904
+
905
+
906
+ def relu(t: Tensor) -> Tensor:
907
+ """
908
+ ReLU activation function: max(0, x)
909
+
910
+ Args:
911
+ t: Input tensor
912
+
913
+ Returns:
914
+ Tensor with ReLU applied
915
+ """
916
+ t = _to_tensor(t)
917
+
918
+ # Use NumPy if available
919
+ if HAS_NUMPY:
920
+ try:
921
+ t_arr = _to_numpy(t.data)
922
+ if t_arr is not None:
923
+ out_arr = np.maximum(0, t_arr)
924
+ out_data = out_arr.tolist()
925
+ else:
926
+ # Fallback
927
+ def _relu_op(x):
928
+ return max(0.0, float(x))
929
+ if isinstance(t.data[0], list):
930
+ out_data = [[_relu_op(t.data[i][j]) for j in range(len(t.data[i]))]
931
+ for i in range(len(t.data))]
932
+ else:
933
+ out_data = [_relu_op(x) for x in t.data]
934
+ except (ValueError, TypeError):
935
+ # Fallback
936
+ def _relu_op(x):
937
+ return max(0.0, float(x))
938
+ if isinstance(t.data[0], list):
939
+ out_data = [[_relu_op(t.data[i][j]) for j in range(len(t.data[i]))]
940
+ for i in range(len(t.data))]
941
+ else:
942
+ out_data = [_relu_op(x) for x in t.data]
943
+ else:
944
+ # Pure Python
945
+ def _relu_op(x):
946
+ return max(0.0, float(x))
947
+ if isinstance(t.data[0], list):
948
+ out_data = [[_relu_op(t.data[i][j]) for j in range(len(t.data[i]))]
949
+ for i in range(len(t.data))]
950
+ else:
951
+ out_data = [_relu_op(x) for x in t.data]
952
+
953
+ out = Tensor(
954
+ out_data,
955
+ requires_grad=t.requires_grad,
956
+ _prev={t} if t.requires_grad else set(),
957
+ _op='relu'
958
+ )
959
+
960
+ def _backward(grad):
961
+ if t.requires_grad:
962
+ if HAS_NUMPY:
963
+ try:
964
+ # Convert to numpy if not already
965
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
966
+ t_arr = _to_numpy(t.data)
967
+ # Gradient is passed where t > 0
968
+ t_grad_arr = np.where(t_arr > 0, grad_arr, 0.0)
969
+ t.backward(t_grad_arr)
970
+ return
971
+ except (ValueError, TypeError):
972
+ pass
973
+
974
+ # Pure Python fallback
975
+ if isinstance(grad, list) and isinstance(t.data[0], list):
976
+ t_grad = [[float(grad[i][j]) if float(t.data[i][j]) > 0 else 0.0
977
+ for j in range(len(grad[i]))] for i in range(len(grad))]
978
+ elif isinstance(grad, list):
979
+ t_grad = [float(grad[i]) if float(t.data[i]) > 0 else 0.0
980
+ for i in range(len(grad))]
981
+ else:
982
+ t_grad = float(grad) if float(t.data[0][0] if isinstance(t.data, list) else t.data) > 0 else 0.0
983
+ t.backward(t_grad)
984
+
985
+ if out.requires_grad:
986
+ out._backward_fn = _backward
987
+
988
+ return out
989
+
990
+
991
+ def sigmoid(t: Tensor) -> Tensor:
992
+ """
993
+ Sigmoid activation function: 1 / (1 + exp(-x))
994
+
995
+ Args:
996
+ t: Input tensor
997
+
998
+ Returns:
999
+ Tensor with sigmoid applied
1000
+ """
1001
+ t = _to_tensor(t)
1002
+
1003
+ # Use NumPy if available
1004
+ if HAS_NUMPY:
1005
+ try:
1006
+ t_arr = _to_numpy(t.data)
1007
+ if t_arr is not None:
1008
+ out_arr = 1.0 / (1.0 + np.exp(-t_arr))
1009
+ out_data = out_arr.tolist()
1010
+ else:
1011
+ # Fallback
1012
+ import math
1013
+ def _sigmoid_op(x):
1014
+ return 1.0 / (1.0 + math.exp(-float(x)))
1015
+ if isinstance(t.data[0], list):
1016
+ out_data = [[_sigmoid_op(t.data[i][j]) for j in range(len(t.data[i]))]
1017
+ for i in range(len(t.data))]
1018
+ else:
1019
+ out_data = [_sigmoid_op(x) for x in t.data]
1020
+ except (ValueError, TypeError):
1021
+ # Fallback
1022
+ import math
1023
+ def _sigmoid_op(x):
1024
+ return 1.0 / (1.0 + math.exp(-float(x)))
1025
+ if isinstance(t.data[0], list):
1026
+ out_data = [[_sigmoid_op(t.data[i][j]) for j in range(len(t.data[i]))]
1027
+ for i in range(len(t.data))]
1028
+ else:
1029
+ out_data = [_sigmoid_op(x) for x in t.data]
1030
+ else:
1031
+ # Pure Python
1032
+ import math
1033
+ def _sigmoid_op(x):
1034
+ return 1.0 / (1.0 + math.exp(-float(x)))
1035
+ if isinstance(t.data[0], list):
1036
+ out_data = [[_sigmoid_op(t.data[i][j]) for j in range(len(t.data[i]))]
1037
+ for i in range(len(t.data))]
1038
+ else:
1039
+ out_data = [_sigmoid_op(x) for x in t.data]
1040
+
1041
+ out = Tensor(
1042
+ out_data,
1043
+ requires_grad=t.requires_grad,
1044
+ _prev={t} if t.requires_grad else set(),
1045
+ _op='sigmoid'
1046
+ )
1047
+
1048
+ def _backward(grad):
1049
+ if t.requires_grad:
1050
+ # d/dx sigmoid(x) = sigmoid(x) * (1 - sigmoid(x))
1051
+ if HAS_NUMPY:
1052
+ try:
1053
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
1054
+ out_arr = _to_numpy(out_data)
1055
+ t_grad_arr = grad_arr * out_arr * (1.0 - out_arr)
1056
+ t.backward(t_grad_arr)
1057
+ return
1058
+ except (ValueError, TypeError):
1059
+ pass
1060
+
1061
+ # Pure Python fallback
1062
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1063
+ t_grad = [[float(grad[i][j]) * float(out_data[i][j]) * (1.0 - float(out_data[i][j]))
1064
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1065
+ elif isinstance(grad, list):
1066
+ t_grad = [float(grad[i]) * float(out_data[i]) * (1.0 - float(out_data[i]))
1067
+ for i in range(len(grad))]
1068
+ else:
1069
+ s = float(out_data[0][0]) if isinstance(out_data, list) and isinstance(out_data[0], list) else float(out_data[0]) if isinstance(out_data, list) else float(out_data)
1070
+ t_grad = float(grad) * s * (1.0 - s)
1071
+ t.backward(t_grad)
1072
+
1073
+ if out.requires_grad:
1074
+ out._backward_fn = _backward
1075
+
1076
+ return out
1077
+
1078
+
1079
+ def abs(t: Tensor) -> Tensor:
1080
+ """
1081
+ Absolute value of tensor elements.
1082
+
1083
+ Args:
1084
+ t: Input tensor
1085
+
1086
+ Returns:
1087
+ Tensor with absolute values
1088
+ """
1089
+ t = _to_tensor(t)
1090
+
1091
+ # Use NumPy if available
1092
+ if HAS_NUMPY:
1093
+ try:
1094
+ t_arr = _to_numpy(t.data)
1095
+ if t_arr is not None:
1096
+ out_arr = np.abs(t_arr)
1097
+ out_data = out_arr.tolist()
1098
+ else:
1099
+ # Fallback
1100
+ if isinstance(t.data[0], list):
1101
+ out_data = [[abs(float(t.data[i][j])) for j in range(len(t.data[i]))]
1102
+ for i in range(len(t.data))]
1103
+ else:
1104
+ out_data = [abs(float(x)) for x in t.data]
1105
+ except (ValueError, TypeError):
1106
+ # Fallback
1107
+ if isinstance(t.data[0], list):
1108
+ out_data = [[abs(float(t.data[i][j])) for j in range(len(t.data[i]))]
1109
+ for i in range(len(t.data))]
1110
+ else:
1111
+ out_data = [abs(float(x)) for x in t.data]
1112
+ else:
1113
+ # Pure Python
1114
+ if isinstance(t.data[0], list):
1115
+ out_data = [[abs(float(t.data[i][j])) for j in range(len(t.data[i]))]
1116
+ for i in range(len(t.data))]
1117
+ else:
1118
+ out_data = [abs(float(x)) for x in t.data]
1119
+
1120
+ out = Tensor(
1121
+ out_data,
1122
+ requires_grad=t.requires_grad,
1123
+ _prev={t} if t.requires_grad else set(),
1124
+ _op='abs'
1125
+ )
1126
+
1127
+ def _backward(grad):
1128
+ if t.requires_grad:
1129
+ # d/dx |x| = sign(x), with sign(0) taken as +1 here
1130
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1131
+ t_grad = [[float(grad[i][j]) * (1.0 if float(t.data[i][j]) >= 0 else -1.0)
1132
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1133
+ elif isinstance(grad, list):
1134
+ t_grad = [float(grad[i]) * (1.0 if float(t.data[i]) >= 0 else -1.0)
1135
+ for i in range(len(grad))]
1136
+ else:
1137
+ t_grad = float(grad) * (1.0 if float(t.data[0][0]) >= 0 else -1.0)
1138
+ t.backward(t_grad)
1139
+
1140
+ if out.requires_grad:
1141
+ out._backward_fn = _backward
1142
+
1143
+ return out
1144
+
1145
+
1146
+ def maximum(a: Tensor, b: Union[Tensor, float, int]) -> Tensor:
1147
+ """
1148
+ Element-wise maximum of two tensors.
1149
+
1150
+ Args:
1151
+ a: First tensor
1152
+ b: Second tensor or scalar
1153
+
1154
+ Returns:
1155
+ Tensor with element-wise maximum
1156
+ """
1157
+ a = _to_tensor(a)
1158
+ b = _to_tensor(b)
1159
+
1160
+ # Use NumPy if available
1161
+ a_data = a.numpy if (HAS_NUMPY and a.numpy is not None) else a.data
1162
+ b_data = b.numpy if (HAS_NUMPY and b.numpy is not None) else b.data
1163
+
1164
+ if HAS_NUMPY and isinstance(a_data, np.ndarray) and isinstance(b_data, np.ndarray):
1165
+ try:
1166
+ out_arr = np.maximum(a_data, b_data)
1167
+ out_data = out_arr.tolist()
1168
+ except (ValueError, TypeError):
1169
+ out_data = _broadcast_op(a.data, b.data, lambda x, y: max(float(x), float(y)))
1170
+ else:
1171
+ out_data = _broadcast_op(a_data, b_data, lambda x, y: max(float(x), float(y)))
1172
+
1173
+ out = Tensor(
1174
+ out_data,
1175
+ requires_grad=a.requires_grad or b.requires_grad,
1176
+ _prev={a, b} if (a.requires_grad or b.requires_grad) else set(),
1177
+ _op='maximum'
1178
+ )
1179
+
1180
+ def _backward(grad):
1181
+ if a.requires_grad:
1182
+ # 2D case: gradient flows where a >= b, else 0; other shapes pass grad through unchanged
1183
+ if isinstance(grad, list) and isinstance(a.data[0], list):
1184
+ a_grad = [[float(grad[i][j]) if float(a.data[i][j]) >= float(b.data[i][j] if isinstance(b.data[0], list) else b.data[j] if j < len(b.data) else b.data[0]) else 0.0
1185
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1186
+ else:
1187
+ a_grad = grad
1188
+ a.backward(a_grad)
1189
+ if b.requires_grad:
1190
+ # 2D case: gradient flows where b > a, else 0; other shapes pass grad through unchanged
1191
+ if isinstance(grad, list) and isinstance(b.data[0], list):
1192
+ b_grad = [[float(grad[i][j]) if float(b.data[i][j]) > float(a.data[i][j] if isinstance(a.data[0], list) else a.data[j] if j < len(a.data) else a.data[0]) else 0.0
1193
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1194
+ else:
1195
+ b_grad = grad
1196
+ b.backward(b_grad)
1197
+
1198
+ if out.requires_grad:
1199
+ out._backward_fn = _backward
1200
+
1201
+ return out
1202
+
1203
+
1204
+ def tanh(t: Tensor) -> Tensor:
1205
+ """
1206
+ Hyperbolic tangent activation function.
1207
+
1208
+ Args:
1209
+ t: Input tensor
1210
+
1211
+ Returns:
1212
+ Tensor with tanh applied
1213
+ """
1214
+ t = _to_tensor(t)
1215
+
1216
+ # Use NumPy if available
1217
+ if HAS_NUMPY:
1218
+ try:
1219
+ t_arr = _to_numpy(t.data)
1220
+ if t_arr is not None:
1221
+ out_arr = np.tanh(t_arr)
1222
+ out_data = out_arr.tolist()
1223
+ else:
1224
+ # Fallback
1225
+ import math
1226
+ def _tanh_op(x):
1227
+ return math.tanh(float(x))
1228
+ if isinstance(t.data[0], list):
1229
+ out_data = [[_tanh_op(t.data[i][j]) for j in range(len(t.data[i]))]
1230
+ for i in range(len(t.data))]
1231
+ else:
1232
+ out_data = [_tanh_op(x) for x in t.data]
1233
+ except (ValueError, TypeError):
1234
+ # Fallback
1235
+ import math
1236
+ def _tanh_op(x):
1237
+ return math.tanh(float(x))
1238
+ if isinstance(t.data[0], list):
1239
+ out_data = [[_tanh_op(t.data[i][j]) for j in range(len(t.data[i]))]
1240
+ for i in range(len(t.data))]
1241
+ else:
1242
+ out_data = [_tanh_op(x) for x in t.data]
1243
+ else:
1244
+ # Pure Python
1245
+ import math
1246
+ def _tanh_op(x):
1247
+ return math.tanh(float(x))
1248
+ if isinstance(t.data[0], list):
1249
+ out_data = [[_tanh_op(t.data[i][j]) for j in range(len(t.data[i]))]
1250
+ for i in range(len(t.data))]
1251
+ else:
1252
+ out_data = [_tanh_op(x) for x in t.data]
1253
+
1254
+ out = Tensor(
1255
+ out_data,
1256
+ requires_grad=t.requires_grad,
1257
+ _prev={t} if t.requires_grad else set(),
1258
+ _op='tanh'
1259
+ )
1260
+
1261
+ def _backward(grad):
1262
+ if t.requires_grad:
1263
+ # d/dx tanh(x) = 1 - tanh^2(x)
1264
+ if HAS_NUMPY:
1265
+ try:
1266
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
1267
+ out_arr = _to_numpy(out_data)
1268
+ t_grad_arr = grad_arr * (1.0 - out_arr ** 2)
1269
+ t.backward(t_grad_arr)
1270
+ return
1271
+ except (ValueError, TypeError):
1272
+ pass
1273
+
1274
+ # Pure Python fallback
1275
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1276
+ t_grad = [[float(grad[i][j]) * (1.0 - float(out_data[i][j]) ** 2)
1277
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1278
+ elif isinstance(grad, list):
1279
+ t_grad = [float(grad[i]) * (1.0 - float(out_data[i]) ** 2)
1280
+ for i in range(len(grad))]
1281
+ else:
1282
+ s = float(out_data[0][0]) if isinstance(out_data, list) and isinstance(out_data[0], list) else float(out_data[0]) if isinstance(out_data, list) else float(out_data)
1283
+ t_grad = float(grad) * (1.0 - s ** 2)
1284
+ t.backward(t_grad)
1285
+
1286
+ if out.requires_grad:
1287
+ out._backward_fn = _backward
1288
+
1289
+ return out
1290
+
1291
+
1292
+ def softmax(t: Tensor, axis: int = -1) -> Tensor:
1293
+ """
1294
+ Softmax activation function: exp(x) / sum(exp(x))
1295
+
1296
+ Normalizes values to probabilities that sum to 1.
1297
+
1298
+ Args:
1299
+ t: Input tensor
1300
+ axis: Axis along which to compute softmax (default: -1, last axis)
1301
+
1302
+ Returns:
1303
+ Tensor with softmax applied
1304
+
1305
+ Examples:
1306
+ >>> x = Tensor([[1.0, 2.0, 3.0]])
1307
+ >>> y = softmax(x) # Probabilities summing to 1
1308
+ """
1309
+ t = _to_tensor(t)
1310
+
1311
+ # Use NumPy if available
1312
+ if HAS_NUMPY:
1313
+ try:
1314
+ t_arr = _to_numpy(t.data)
1315
+ if t_arr is not None:
1316
+ # Numerically stable softmax
1317
+ t_max = np.max(t_arr, axis=axis, keepdims=True)
1318
+ exp_arr = np.exp(t_arr - t_max)
1319
+ out_arr = exp_arr / np.sum(exp_arr, axis=axis, keepdims=True)
1320
+ out_data = out_arr.tolist()
1321
+ else:
1322
+ # Fallback to pure Python
1323
+ out_data = _softmax_pure_python(t.data, axis)
1324
+ except (ValueError, TypeError):
1325
+ out_data = _softmax_pure_python(t.data, axis)
1326
+ else:
1327
+ out_data = _softmax_pure_python(t.data, axis)
1328
+
1329
+ out = Tensor(
1330
+ out_data,
1331
+ requires_grad=t.requires_grad,
1332
+ _prev={t} if t.requires_grad else set(),
1333
+ _op='softmax'
1334
+ )
1335
+
1336
+ def _backward(grad):
1337
+ if t.requires_grad:
1338
+ # Jacobian-vector product for softmax
1339
+ if HAS_NUMPY:
1340
+ try:
1341
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
1342
+ s = _to_numpy(out_data)
1343
+
1344
+ # optimized implementation: s * (g - sum(s*g))
1345
+ # sum(s*g) along axis
1346
+ sg_dot = np.sum(s * grad_arr, axis=axis, keepdims=True)
1347
+ t_grad_arr = s * (grad_arr - sg_dot)
1348
+ t.backward(t_grad_arr)
1349
+ return
1350
+ except (ValueError, TypeError):
1351
+ pass
1352
+
1353
+ # Pure Python fallback
1354
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1355
+ t_grad = []
1356
+ for i in range(len(grad)):
1357
+ row_grad = []
1358
+ s = out_data[i] # softmax output
1359
+ g = grad[i] # upstream gradient
1360
+ # Sum over all k: s[j] * (delta_jk - s[k]) * g[k]
1361
+ dot_sg = sum(float(s[k]) * float(g[k]) for k in range(len(s)))
1362
+ for j in range(len(g)):
1363
+ row_grad.append(float(s[j]) * (float(g[j]) - dot_sg))
1364
+ t_grad.append(row_grad)
1365
+ elif isinstance(grad, list):
1366
+ s = out_data
1367
+ g = grad
1368
+ dot_sg = sum(float(s[k]) * float(g[k]) for k in range(len(s)))
1369
+ t_grad = [float(s[j]) * (float(g[j]) - dot_sg) for j in range(len(g))]
1370
+ else:
1371
+ t_grad = grad
1372
+ t.backward(t_grad)
1373
+
1374
+ if out.requires_grad:
1375
+ out._backward_fn = _backward
1376
+
1377
+ return out
1378
+
1379
+
1380
+ def _softmax_pure_python(data, axis=-1):
1381
+ """Pure Python softmax implementation."""
1382
+ import math
1383
+
1384
+ if isinstance(data[0], list):
1385
+ # 2D case
1386
+ if axis == -1 or axis == 1:
1387
+ result = []
1388
+ for row in data:
1389
+ max_val = max(float(x) for x in row)
1390
+ exp_vals = [math.exp(float(x) - max_val) for x in row]
1391
+ sum_exp = sum(exp_vals)
1392
+ result.append([e / sum_exp for e in exp_vals])
1393
+ return result
1394
+ else: # axis == 0
1395
+ # Transpose, apply, transpose back
1396
+ cols = len(data[0])
1397
+ rows = len(data)
1398
+ result = [[0.0] * cols for _ in range(rows)]
1399
+ for j in range(cols):
1400
+ col = [float(data[i][j]) for i in range(rows)]
1401
+ max_val = max(col)
1402
+ exp_vals = [math.exp(x - max_val) for x in col]
1403
+ sum_exp = sum(exp_vals)
1404
+ for i in range(rows):
1405
+ result[i][j] = exp_vals[i] / sum_exp
1406
+ return result
1407
+ else:
1408
+ # 1D case
1409
+ max_val = max(float(x) for x in data)
1410
+ exp_vals = [math.exp(float(x) - max_val) for x in data]
1411
+ sum_exp = sum(exp_vals)
1412
+ return [e / sum_exp for e in exp_vals]
1413
+
1414
+
1415
+ def leaky_relu(t: Tensor, negative_slope: float = 0.01) -> Tensor:
1416
+ """
1417
+ Leaky ReLU activation function: max(0, x) + negative_slope * min(0, x)
1418
+
1419
+ Args:
1420
+ t: Input tensor
1421
+ negative_slope: Slope for negative values (default: 0.01)
1422
+
1423
+ Returns:
1424
+ Tensor with Leaky ReLU applied
1425
+
1426
+ Examples:
1427
+ >>> x = Tensor([[-1.0, 0.0, 1.0]])
1428
+ >>> y = leaky_relu(x, negative_slope=0.1) # [[-0.1, 0.0, 1.0]]
1429
+ """
1430
+ t = _to_tensor(t)
1431
+
1432
+ if HAS_NUMPY:
1433
+ try:
1434
+ t_arr = _to_numpy(t.data)
1435
+ if t_arr is not None:
1436
+ out_arr = np.where(t_arr > 0, t_arr, negative_slope * t_arr)
1437
+ out_data = out_arr.tolist()
1438
+ else:
1439
+ out_data = _leaky_relu_pure_python(t.data, negative_slope)
1440
+ except (ValueError, TypeError):
1441
+ out_data = _leaky_relu_pure_python(t.data, negative_slope)
1442
+ else:
1443
+ out_data = _leaky_relu_pure_python(t.data, negative_slope)
1444
+
1445
+ out = Tensor(
1446
+ out_data,
1447
+ requires_grad=t.requires_grad,
1448
+ _prev={t} if t.requires_grad else set(),
1449
+ _op='leaky_relu'
1450
+ )
1451
+
1452
+ def _backward(grad):
1453
+ if t.requires_grad:
1454
+ if HAS_NUMPY:
1455
+ try:
1456
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
1457
+ t_arr = _to_numpy(t.data)
1458
+ t_grad_arr = np.where(t_arr > 0, grad_arr, grad_arr * negative_slope)
1459
+ t.backward(t_grad_arr)
1460
+ return
1461
+ except (ValueError, TypeError):
1462
+ pass
1463
+
1464
+ # Pure Python fallback
1465
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1466
+ t_grad = [[float(grad[i][j]) if float(t.data[i][j]) > 0
1467
+ else float(grad[i][j]) * negative_slope
1468
+ for j in range(len(grad[i]))] for i in range(len(grad))]
1469
+ elif isinstance(grad, list):
1470
+ t_grad = [float(grad[i]) if float(t.data[i]) > 0
1471
+ else float(grad[i]) * negative_slope
1472
+ for i in range(len(grad))]
1473
+ else:
1474
+ val = float(t.data[0][0]) if isinstance(t.data, list) and isinstance(t.data[0], list) else float(t.data[0]) if isinstance(t.data, list) else float(t.data)
1475
+ g_val = float(grad)
1476
+ t_grad = g_val if val > 0 else g_val * negative_slope
1477
+ t.backward(t_grad)
1478
+
1479
+ if out.requires_grad:
1480
+ out._backward_fn = _backward
1481
+
1482
+ return out
1483
+
1484
+
1485
+ def _leaky_relu_pure_python(data, negative_slope):
1486
+ """Pure Python Leaky ReLU implementation."""
1487
+ if isinstance(data[0], list):
1488
+ return [[float(x) if float(x) > 0 else negative_slope * float(x)
1489
+ for x in row] for row in data]
1490
+ else:
1491
+ return [float(x) if float(x) > 0 else negative_slope * float(x)
1492
+ for x in data]
1493
+
1494
+
1495
+ def gelu(t: Tensor) -> Tensor:
1496
+ """
1497
+ Gaussian Error Linear Unit (GELU) activation function.
1498
+
1499
+ GELU(x) = x * Φ(x) where Φ is the CDF of the standard normal distribution.
1500
+ Approximation: 0.5 * x * (1 + tanh(sqrt(2/π) * (x + 0.044715 * x^3)))
1501
+
1502
+ Popular in Transformer models.
1503
+
1504
+ Args:
1505
+ t: Input tensor
1506
+
1507
+ Returns:
1508
+ Tensor with GELU applied
1509
+
1510
+ Examples:
1511
+ >>> x = Tensor([[0.0, 1.0, 2.0]])
1512
+ >>> y = gelu(x)
1513
+ """
1514
+ t = _to_tensor(t)
1515
+ import math
1516
+ sqrt_2_over_pi = math.sqrt(2.0 / math.pi)
1517
+
1518
+ if HAS_NUMPY:
1519
+ try:
1520
+ t_arr = _to_numpy(t.data)
1521
+ if t_arr is not None:
1522
+ inner = sqrt_2_over_pi * (t_arr + 0.044715 * t_arr ** 3)
1523
+ out_arr = 0.5 * t_arr * (1.0 + np.tanh(inner))
1524
+ out_data = out_arr.tolist()
1525
+ else:
1526
+ out_data = _gelu_pure_python(t.data, sqrt_2_over_pi)
1527
+ except (ValueError, TypeError):
1528
+ out_data = _gelu_pure_python(t.data, sqrt_2_over_pi)
1529
+ else:
1530
+ out_data = _gelu_pure_python(t.data, sqrt_2_over_pi)
1531
+
1532
+ out = Tensor(
1533
+ out_data,
1534
+ requires_grad=t.requires_grad,
1535
+ _prev={t} if t.requires_grad else set(),
1536
+ _op='gelu'
1537
+ )
1538
+
1539
+ def _backward(grad):
1540
+ if t.requires_grad:
1541
+ if HAS_NUMPY:
1542
+ try:
1543
+ grad_arr = np.array(grad, dtype=np.float64) if not isinstance(grad, np.ndarray) else grad
1544
+ x = _to_numpy(t.data)
1545
+ inner = sqrt_2_over_pi * (x + 0.044715 * x ** 3)
1546
+ tanh_inner = np.tanh(inner)
1547
+ sech2_inner = 1.0 - tanh_inner ** 2
1548
+ d_inner = sqrt_2_over_pi * (1.0 + 0.134145 * x ** 2)
1549
+ d_gelu = 0.5 * (1.0 + tanh_inner) + 0.5 * x * sech2_inner * d_inner
1550
+ t_grad_arr = grad_arr * d_gelu
1551
+ t.backward(t_grad_arr)
1552
+ return
1553
+ except (ValueError, TypeError):
1554
+ pass
1555
+
1556
+ # Pure Python fallback
1557
+ # GELU derivative (using approximation)
1558
+ # d/dx GELU(x) = 0.5 * (1 + tanh(inner)) + 0.5 * x * sech^2(inner) * d_inner/dx
1559
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1560
+ t_grad = []
1561
+ for i in range(len(grad)):
1562
+ row_grad = []
1563
+ for j in range(len(grad[i])):
1564
+ x = float(t.data[i][j])
1565
+ inner = sqrt_2_over_pi * (x + 0.044715 * x ** 3)
1566
+ tanh_inner = math.tanh(inner)
1567
+ sech2_inner = 1.0 - tanh_inner ** 2
1568
+ d_inner = sqrt_2_over_pi * (1.0 + 0.134145 * x ** 2)
1569
+ d_gelu = 0.5 * (1.0 + tanh_inner) + 0.5 * x * sech2_inner * d_inner
1570
+ row_grad.append(float(grad[i][j]) * d_gelu)
1571
+ t_grad.append(row_grad)
1572
+ elif isinstance(grad, list):
1573
+ t_grad = []
1574
+ for i in range(len(grad)):
1575
+ x = float(t.data[i])
1576
+ inner = sqrt_2_over_pi * (x + 0.044715 * x ** 3)
1577
+ tanh_inner = math.tanh(inner)
1578
+ sech2_inner = 1.0 - tanh_inner ** 2
1579
+ d_inner = sqrt_2_over_pi * (1.0 + 0.134145 * x ** 2)
1580
+ d_gelu = 0.5 * (1.0 + tanh_inner) + 0.5 * x * sech2_inner * d_inner
1581
+ t_grad.append(float(grad[i]) * d_gelu)
1582
+ else:
1583
+ x = float(t.data[0][0] if isinstance(t.data, list) and isinstance(t.data[0], list) else t.data[0] if isinstance(t.data, list) else t.data)
1584
+ g_val = float(grad)
1585
+ inner = sqrt_2_over_pi * (x + 0.044715 * x ** 3)
1586
+ tanh_inner = math.tanh(inner)
1587
+ sech2_inner = 1.0 - tanh_inner ** 2
1588
+ d_inner = sqrt_2_over_pi * (1.0 + 0.134145 * x ** 2)
1589
+ d_gelu = 0.5 * (1.0 + tanh_inner) + 0.5 * x * sech2_inner * d_inner
1590
+ t_grad = g_val * d_gelu
1591
+ t.backward(t_grad)
1592
+
1593
+ if out.requires_grad:
1594
+ out._backward_fn = _backward
1595
+
1596
+ return out
1597
+
1598
+
1599
+ def _gelu_pure_python(data, sqrt_2_over_pi):
1600
+ """Pure Python GELU implementation."""
1601
+ import math
1602
+
1603
+ def _gelu_element(x):
1604
+ x = float(x)
1605
+ inner = sqrt_2_over_pi * (x + 0.044715 * x ** 3)
1606
+ return 0.5 * x * (1.0 + math.tanh(inner))
1607
+
1608
+ if isinstance(data[0], list):
1609
+ return [[_gelu_element(x) for x in row] for row in data]
1610
+ else:
1611
+ return [_gelu_element(x) for x in data]
1612
+
1613
+
1614
+ def swish(t: Tensor, beta: float = 1.0) -> Tensor:
1615
+ """
1616
+ Swish activation function: x * sigmoid(beta * x)
1617
+
1618
+ Also known as SiLU (Sigmoid Linear Unit) when beta=1.
1619
+ Self-gated activation that sometimes outperforms ReLU.
1620
+
1621
+ Args:
1622
+ t: Input tensor
1623
+ beta: Scaling parameter (default: 1.0)
1624
+
1625
+ Returns:
1626
+ Tensor with Swish applied
1627
+
1628
+ Examples:
1629
+ >>> x = Tensor([[0.0, 1.0, 2.0]])
1630
+ >>> y = swish(x)
1631
+ """
1632
+ t = _to_tensor(t)
1633
+ import math
1634
+
1635
+ if HAS_NUMPY:
1636
+ try:
1637
+ t_arr = _to_numpy(t.data)
1638
+ if t_arr is not None:
1639
+ sig = 1.0 / (1.0 + np.exp(-beta * t_arr))
1640
+ out_arr = t_arr * sig
1641
+ out_data = out_arr.tolist()
1642
+ else:
1643
+ out_data = _swish_pure_python(t.data, beta)
1644
+ except (ValueError, TypeError):
1645
+ out_data = _swish_pure_python(t.data, beta)
1646
+ else:
1647
+ out_data = _swish_pure_python(t.data, beta)
1648
+
1649
+ out = Tensor(
1650
+ out_data,
1651
+ requires_grad=t.requires_grad,
1652
+ _prev={t} if t.requires_grad else set(),
1653
+ _op='swish'
1654
+ )
1655
+
1656
+ def _backward(grad):
1657
+ if t.requires_grad:
1658
+ # d/dx swish(x) = beta*swish(x) + sigmoid(beta*x) * (1 - beta*swish(x))
1659
+ # = sigmoid(beta*x) * (1 + beta*x*(1 - sigmoid(beta*x)))
1660
+ if isinstance(grad, list) and isinstance(t.data[0], list):
1661
+ t_grad = []
1662
+ for i in range(len(grad)):
1663
+ row_grad = []
1664
+ for j in range(len(grad[i])):
1665
+ x = float(t.data[i][j])
1666
+ sig = 1.0 / (1.0 + math.exp(-beta * x))
1667
+ sw = x * sig
1668
+ d_swish = beta * sw + sig * (1.0 - beta * sw)
1669
+ row_grad.append(float(grad[i][j]) * d_swish)
1670
+ t_grad.append(row_grad)
1671
+ elif isinstance(grad, list):
1672
+ t_grad = []
1673
+ for i in range(len(grad)):
1674
+ x = float(t.data[i])
1675
+ sig = 1.0 / (1.0 + math.exp(-beta * x))
1676
+ sw = x * sig
1677
+ d_swish = beta * sw + sig * (1.0 - beta * sw)
1678
+ t_grad.append(float(grad[i]) * d_swish)
1679
+ else:
1680
+ x = float(t.data[0][0])
1681
+ sig = 1.0 / (1.0 + math.exp(-beta * x))
1682
+ sw = x * sig
1683
+ d_swish = beta * sw + sig * (1.0 - beta * sw)
1684
+ t_grad = float(grad) * d_swish
1685
+ t.backward(t_grad)
1686
+
1687
+ if out.requires_grad:
1688
+ out._backward_fn = _backward
1689
+
1690
+ return out
1691
+
1692
+
1693
+ def _swish_pure_python(data, beta):
1694
+ """Pure Python Swish implementation."""
1695
+ import math
1696
+
1697
+ def _swish_element(x):
1698
+ x = float(x)
1699
+ sig = 1.0 / (1.0 + math.exp(-beta * x))
1700
+ return x * sig
1701
+
1702
+ if isinstance(data[0], list):
1703
+ return [[_swish_element(x) for x in row] for row in data]
1704
+ else:
1705
+ return [_swish_element(x) for x in data]
1706
+
1707
+
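To round off the ops.py listing, here is a small, hedged sanity check of the activation and reduction functions defined above. It uses only names that appear in this file (Tensor, relu, softmax, sum, mean, and the .data attribute read throughout); the commented values are standard mathematical results, while the exact nesting and rounding of .data depend on Tensor internals not shown in this diff.

import quantml.ops as ops
from quantml.tensor import Tensor

r = ops.relu(Tensor([[-1.0, 0.0, 2.0]]))
print(r.data)               # element-wise max(0, x): 0.0, 0.0, 2.0

p = ops.softmax(Tensor([[1.0, 2.0, 3.0]]))
print(ops.sum(p).data)      # softmax rows sum to 1 (up to floating-point rounding)

m = ops.mean(Tensor([[1.0, 2.0], [3.0, 4.0]]))
print(m.data)               # mean over all elements: 2.5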