netcl 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (79) hide show
  1. netcl/__init__.py +37 -0
  2. netcl/amp.py +147 -0
  3. netcl/autograd/__init__.py +86 -0
  4. netcl/autograd/debug.py +12 -0
  5. netcl/autograd/engine.py +121 -0
  6. netcl/autograd/ops.py +558 -0
  7. netcl/core/__init__.py +17 -0
  8. netcl/core/capabilities.py +102 -0
  9. netcl/core/device.py +66 -0
  10. netcl/core/kernels/__init__.py +12 -0
  11. netcl/core/kernels/primitives.py +159 -0
  12. netcl/core/memory.py +59 -0
  13. netcl/core/parameter.py +60 -0
  14. netcl/core/tensor.py +130 -0
  15. netcl/data/augment.py +43 -0
  16. netcl/data/augment_gpu.py +135 -0
  17. netcl/data/dataloader.py +87 -0
  18. netcl/data/filters.py +65 -0
  19. netcl/distributed/__init__.py +22 -0
  20. netcl/distributed/collectives.py +125 -0
  21. netcl/distributed/data_parallel.py +55 -0
  22. netcl/distributed/device_manager.py +33 -0
  23. netcl/distributed/trainer.py +63 -0
  24. netcl/io/__init__.py +3 -0
  25. netcl/io/checkpoint.py +58 -0
  26. netcl/io/serialization.py +117 -0
  27. netcl/nn/__init__.py +48 -0
  28. netcl/nn/batchnorm.py +258 -0
  29. netcl/nn/decorators.py +48 -0
  30. netcl/nn/factory.py +123 -0
  31. netcl/nn/functional.py +45 -0
  32. netcl/nn/groupnorm.py +45 -0
  33. netcl/nn/init.py +34 -0
  34. netcl/nn/layernorm.py +41 -0
  35. netcl/nn/layers.py +369 -0
  36. netcl/nn/loss.py +27 -0
  37. netcl/nn/modules.py +100 -0
  38. netcl/nn/padding.py +57 -0
  39. netcl/nn/pooling.py +267 -0
  40. netcl/nn/residual.py +22 -0
  41. netcl/nn/resnet.py +155 -0
  42. netcl/nn/simple.py +41 -0
  43. netcl/ops/__init__.py +45 -0
  44. netcl/ops/broadcast.py +103 -0
  45. netcl/ops/conv2d.py +745 -0
  46. netcl/ops/conv_transpose2d.py +200 -0
  47. netcl/ops/depthwise_conv2d.py +235 -0
  48. netcl/ops/elementwise.py +477 -0
  49. netcl/ops/im2col.py +122 -0
  50. netcl/ops/matmul.py +182 -0
  51. netcl/ops/reduction.py +102 -0
  52. netcl/ops/softmax.py +96 -0
  53. netcl/ops/softmax_fp16.py +33 -0
  54. netcl/ops/transpose.py +60 -0
  55. netcl/optim/__init__.py +11 -0
  56. netcl/optim/adam.py +55 -0
  57. netcl/optim/adamw.py +58 -0
  58. netcl/optim/amp.py +42 -0
  59. netcl/optim/clip.py +28 -0
  60. netcl/optim/lr_plateau.py +26 -0
  61. netcl/optim/lr_scheduler.py +14 -0
  62. netcl/optim/momentum.py +38 -0
  63. netcl/optim/rmsprop.py +51 -0
  64. netcl/optim/sgd.py +43 -0
  65. netcl/profiling/__init__.py +7 -0
  66. netcl/profiling/timing.py +33 -0
  67. netcl/runtime/__init__.py +8 -0
  68. netcl/runtime/graph.py +131 -0
  69. netcl/runtime/scheduler.py +36 -0
  70. netcl/trainer/__init__.py +3 -0
  71. netcl/trainer/trainer.py +148 -0
  72. netcl/utils/__init__.py +4 -0
  73. netcl/utils/data.py +33 -0
  74. netcl/utils/progress.py +58 -0
  75. netcl-0.1.0.dist-info/METADATA +84 -0
  76. netcl-0.1.0.dist-info/RECORD +79 -0
  77. netcl-0.1.0.dist-info/WHEEL +5 -0
  78. netcl-0.1.0.dist-info/licenses/LICENSE +21 -0
  79. netcl-0.1.0.dist-info/top_level.txt +1 -0
netcl/__init__.py ADDED
@@ -0,0 +1,37 @@
1
+ """
2
+ netcl: PyOpenCL-based experimentation framework.
3
+ This package currently focuses on low-level kernel primitives and helpers.
4
+ """
5
+
6
+ from . import core, ops, autograd, distributed, runtime, profiling
7
+ from .ops import (
8
+ matmul,
9
+ build_matmul_kernel,
10
+ elementwise_binary,
11
+ relu,
12
+ bias_add,
13
+ reduce_sum,
14
+ softmax,
15
+ conv2d,
16
+ )
17
+ from . import nn, optim, io
18
+
19
+ __all__ = [
20
+ "core",
21
+ "ops",
22
+ "autograd",
23
+ "distributed",
24
+ "runtime",
25
+ "profiling",
26
+ "nn",
27
+ "optim",
28
+ "io",
29
+ "matmul",
30
+ "build_matmul_kernel",
31
+ "elementwise_binary",
32
+ "relu",
33
+ "bias_add",
34
+ "reduce_sum",
35
+ "softmax",
36
+ "conv2d",
37
+ ]
netcl/amp.py ADDED
@@ -0,0 +1,147 @@
1
+ from __future__ import annotations
2
+
3
+ from dataclasses import dataclass
4
+ from typing import Optional, Sequence
5
+
6
+ try:
7
+ import numpy as np # type: ignore
8
+ except ImportError: # pragma: no cover
9
+ np = None
10
+
11
+ from netcl.core.tensor import Tensor
12
+ from netcl.core.tensor import _np_dtype # type: ignore
13
+
14
+ _AUTOCAST_ENABLED = False
15
+
16
+
17
@dataclass
class GradScaler:
    """
    Dynamic loss scaler for mixed-precision training.

    The scale starts at ``init_scale``. ``step`` multiplies it by
    ``backoff_factor`` when a non-finite gradient is found, and by
    ``growth_factor`` each time ``growth_interval`` consecutive steps succeed.
    With ``enabled=False`` every method is a pass-through.
    """

    init_scale: float = 2.0**16
    growth_factor: float = 2.0
    backoff_factor: float = 0.5
    growth_interval: int = 2000
    enabled: bool = True

    def __post_init__(self):
        # Runtime state is kept outside the dataclass fields.
        self._growth_tracker = 0
        self._scale = self.init_scale

    @property
    def scale(self) -> float:
        """
        Current loss-scale factor.
        """
        return self._scale

    def scale_loss(self, loss: Tensor) -> Tensor:
        """
        Return ``loss`` multiplied by the current scale, or ``loss`` itself when disabled.
        """
        if self.enabled:
            from netcl.ops.elementwise import elementwise_binary

            factor = float(self._scale)
            return elementwise_binary(loss, loss, expression=f"MUL(v0, {factor})")
        return loss

    def unscale_grads(self, params: Sequence[Tensor]):
        """
        Divide every present ``p.grad`` by the current scale.

        Gradients are first copied to the host and checked for non-finite
        values. If any is found, nothing is unscaled and ``True`` is returned;
        otherwise each gradient is replaced by its unscaled version and
        ``False`` is returned. Always returns ``False`` when disabled.
        """
        if not self.enabled:
            return False
        # any() stops at the first offending gradient, like the explicit break did.
        overflow = any(
            np.any(~np.isfinite(p.grad.to_host()))
            for p in params
            if p.grad is not None
        )
        if overflow:
            return True
        inv_scale = 1.0 / self._scale
        from netcl.ops.elementwise import elementwise_binary

        for p in params:
            if p.grad is not None:
                p.grad = elementwise_binary(p.grad, p.grad, expression=f"MUL(v0, {inv_scale})")
        return False

    def step(self, optimizer, params: Sequence[Tensor]):
        """
        Unscale gradients, run ``optimizer.step()`` if they are finite, and adapt the scale.

        On overflow the optimizer step is skipped, the scale is multiplied by
        ``backoff_factor`` and the success counter is reset.
        """
        if not self.enabled:
            optimizer.step()
            return
        if self.unscale_grads(params):
            self._scale *= self.backoff_factor
            self._growth_tracker = 0
            return
        optimizer.step()
        self._growth_tracker += 1
        if self._growth_tracker % self.growth_interval == 0:
            self._scale *= self.growth_factor

    def update(self):
        """
        No-op kept for API compatibility.
        """
        return None
78
+
79
+
80
def supports_fp16(queue) -> bool:
    """
    Report whether ``queue.device.extensions`` contains ``cl_khr_fp16``.

    Any exception raised while reading or testing the extension list is
    treated as "not supported".
    """
    try:
        extensions = queue.device.extensions
        has_half = "cl_khr_fp16" in extensions
    except Exception:
        return False
    return has_half
88
+
89
+
90
def autocast_enabled(profile_supports_fp16: bool) -> bool:
    """
    Return the given fp16-support flag unchanged.
    """
    return profile_supports_fp16
92
+
93
+
94
class autocast:
    """
    Context manager that sets the module-level autocast flag.

    On entry the previous flag is remembered. The flag is turned on only when
    ``enabled`` is true and, if a ``device_queue`` was given, that queue's
    device supports fp16. On exit the remembered flag is restored.
    """

    def __init__(self, enabled: bool = True, device_queue=None):
        self.enabled = enabled
        self.device_queue = device_queue
        self.prev = False
        # Assumed capable until a device queue says otherwise in __enter__.
        self._capable = True

    def __enter__(self):
        global _AUTOCAST_ENABLED
        self.prev = _AUTOCAST_ENABLED
        if self.enabled:
            if self.device_queue is not None:
                self._capable = supports_fp16(self.device_queue)
            _AUTOCAST_ENABLED = self.enabled and self._capable
        else:
            _AUTOCAST_ENABLED = False
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        global _AUTOCAST_ENABLED
        _AUTOCAST_ENABLED = self.prev
        # Never suppress exceptions raised inside the block.
        return False
120
+
121
+
122
def is_autocast_enabled() -> bool:
    """
    Return the current value of the module-level autocast flag.
    """
    return _AUTOCAST_ENABLED
124
+
125
+
126
def maybe_cast_tensor(t: Tensor) -> Tensor:
    """
    Return an fp16 copy of ``t`` when autocast applies, otherwise ``t`` itself.

    A cast happens only if the autocast flag is set, ``t.dtype`` is ``"float"``
    or ``"float32"``, and the tensor's queue reports fp16 support. The cast
    goes through a host copy converted with ``astype(np.float16)``.
    """
    if not _AUTOCAST_ENABLED:
        return t
    castable = t.dtype in ("float", "float32") and supports_fp16(t.queue)
    if not castable:
        return t
    half_host = t.to_host().astype(np.float16)
    return Tensor.from_host(t.queue, half_host, dtype="float16")
136
+
137
+
138
def master_param(param: Tensor) -> Tensor:
    """
    Keep master weights in FP32 for optimizers.

    For a ``"float16"``/``"half"`` parameter, a new float32 tensor is built
    from a host copy and returned. Any other parameter is returned as-is. In
    both cases the returned tensor's ``_model_param`` attribute points at the
    original ``param``.
    """
    if param.dtype not in ("float16", "half"):
        param._model_param = param
        return param
    host_fp32 = param.to_host().astype(np.float32)
    master = Tensor.from_host(param.queue, host_fp32, dtype="float32")
    master._model_param = param
    return master
netcl/autograd/__init__.py ADDED
@@ -0,0 +1,86 @@
1
+ """
2
+ Autograd: Node/Tape plus op wrappers.
3
+ """
4
+
5
+ from .engine import Node, Tape, apply_op, no_grad, set_current_tape, get_current_tape
6
+ from .debug import debug_tape
7
+ from .ops import (
8
+ tensor,
9
+ add,
10
+ relu,
11
+ bias_add,
12
+ matmul_op,
13
+ sub,
14
+ mse_loss,
15
+ sigmoid,
16
+ tanh,
17
+ leaky_relu,
18
+ gelu,
19
+ swish,
20
+ elu,
21
+ softplus,
22
+ hard_sigmoid,
23
+ hard_swish,
24
+ clamp,
25
+ hard_tanh,
26
+ prelu,
27
+ hinge_loss,
28
+ l1_loss,
29
+ l2_loss,
30
+ depthwise_conv2d,
31
+ batch_norm2d,
32
+ layer_norm,
33
+ pad2d,
34
+ group_norm,
35
+ global_avg_pool2d,
36
+ cross_entropy,
37
+ conv2d,
38
+ flatten,
39
+ max_pool2d,
40
+ dropout,
41
+ avg_pool2d,
42
+ )
43
+
44
+ __all__ = [
45
+ "Node",
46
+ "Tape",
47
+ "apply_op",
48
+ "no_grad",
49
+ "tensor",
50
+ "add",
51
+ "relu",
52
+ "bias_add",
53
+ "matmul_op",
54
+ "sub",
55
+ "mse_loss",
56
+ "sigmoid",
57
+ "tanh",
58
+ "leaky_relu",
59
+ "gelu",
60
+ "swish",
61
+ "elu",
62
+ "softplus",
63
+ "hard_sigmoid",
64
+ "hard_swish",
65
+ "clamp",
66
+ "hard_tanh",
67
+ "prelu",
68
+ "hinge_loss",
69
+ "l1_loss",
70
+ "l2_loss",
71
+ "depthwise_conv2d",
72
+ "batch_norm2d",
73
+ "layer_norm",
74
+ "pad2d",
75
+ "group_norm",
76
+ "global_avg_pool2d",
77
+ "cross_entropy",
78
+ "conv2d",
79
+ "flatten",
80
+ "max_pool2d",
81
+ "dropout",
82
+ "avg_pool2d",
83
+ "debug_tape",
84
+ "set_current_tape",
85
+ "get_current_tape",
86
+ ]
netcl/autograd/debug.py ADDED
@@ -0,0 +1,12 @@
1
+ from __future__ import annotations
2
+
3
+ from contextlib import contextmanager
4
+ from typing import Iterator
5
+
6
+
7
@contextmanager
def debug_tape(tape):
    """
    Context manager that yields the given tape unchanged for inspection.

    Nothing is set up on entry and nothing is torn down on exit.
    """
    exposed = tape
    yield exposed
netcl/autograd/engine.py ADDED
@@ -0,0 +1,121 @@
1
+ """
2
+ Minimal autograd backbone (placeholders).
3
+
4
+ To be extended with full gradient tracking and backward kernels per op.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ from dataclasses import dataclass, field
10
+ from typing import Any, Callable, List, Optional
11
+ import threading
12
+
13
+ from netcl.core.tensor import Tensor
14
+
15
+
16
+ GradFn = Callable[[Any], List[Optional[Tensor]]]
17
+
18
+
19
@dataclass
class Node:
    """
    One entry in the autograd graph: a forward value plus what backward needs.
    """

    # Forward result held by this node.
    value: Tensor
    # Maps this node's gradient to one gradient (or None) per parent.
    grad_fn: Optional[GradFn] = None
    # Input nodes, paired positionally with grad_fn's results during backward.
    parents: List["Node"] = field(default_factory=list)
    # Gradient accumulated for this node during backward.
    grad: Optional[Tensor] = None
    # True when any input of the producing op required a gradient.
    requires_grad: bool = False
26
+
27
+
28
class Tape:
    """
    Records operations for backward.

    Nodes are appended in the order they are recorded. ``backward`` walks that
    list in reverse and treats it as a reverse topological order.
    """

    def __init__(self) -> None:
        self.nodes: List[Node] = []
        # When False, record() returns the node without storing it.
        self.enabled: bool = True

    def record(self, node: Node) -> Node:
        """
        Append ``node`` to the tape if recording is enabled and return it.
        """
        if self.enabled:
            self.nodes.append(node)
        return node

    def backward(self, loss: Node, grad: Optional[Tensor] = None) -> None:
        """
        Propagate gradients from ``loss`` to every recorded ancestor.

        ``loss.grad`` is seeded with ``grad``, or with ones shaped like
        ``loss.value`` when ``grad`` is None. For each recorded node that has
        both a gradient and a ``grad_fn``, the gradients returned by
        ``grad_fn`` are accumulated into the matching parents, both on the
        parent ``Node`` and on its underlying tensor's ``grad`` attribute.
        """
        loss.grad = ones_like(loss.value) if grad is None else grad
        # Reverse topological order (here: recorded order)
        for node in reversed(self.nodes):
            if node.grad is None or node.grad_fn is None:
                continue
            grads = node.grad_fn(node.grad)
            for parent, g in zip(node.parents, grads):
                if g is None:
                    continue
                self._accumulate(parent, g)

    @staticmethod
    def _accumulate(parent: Node, g: Tensor) -> None:
        """
        Add ``g`` to ``parent.grad`` and mirror the result on ``parent.value.grad``.
        """
        node_grad = parent.grad
        if node_grad is None:
            parent.grad = g
        else:
            parent.grad = add_inplace(node_grad, g)

        # propagate to underlying Tensor for optimizers
        tensor_grad = parent.value.grad
        if tensor_grad is None or tensor_grad is node_grad:
            # The tensor either has no gradient yet or shares its buffer with
            # the node gradient that was just updated. Adding ``g`` a second
            # time would count the contribution twice.
            parent.value.grad = parent.grad
        else:
            # A distinct tensor gradient (e.g. left from an earlier backward
            # call) accumulates independently of the node gradient.
            parent.value.grad = add_inplace(tensor_grad, g)
65
+
66
+
67
+ # thread-local current tape for tape-free APIs
68
+ _tls = threading.local()
69
+
70
+
71
def set_current_tape(tape: Optional[Tape]):
    """
    Store ``tape`` (or None) as the current tape in thread-local storage.
    """
    setattr(_tls, "current_tape", tape)
73
+
74
+
75
def get_current_tape() -> Optional[Tape]:
    """
    Return the tape stored in thread-local storage, or None if none was set.
    """
    try:
        return _tls.current_tape
    except AttributeError:
        return None
77
+
78
+
79
def ones_like(t: Tensor) -> Tensor:
    """
    Build a tensor of ones with the shape, dtype name and queue of ``t``.

    Supported dtype names are ``float``/``float32``, ``double``/``float64``
    and ``half``/``float16``. Half precision is accepted so that backward can
    be seeded from an fp16 loss produced under autocast.

    Raises:
        ValueError: if ``t.dtype`` is not one of the supported float names.
    """
    import numpy as np

    host_dtypes = {
        "float": np.float32,
        "float32": np.float32,
        "double": np.float64,
        "float64": np.float64,
        "half": np.float16,
        "float16": np.float16,
    }
    host_dtype = host_dtypes.get(t.dtype)
    if host_dtype is None:
        raise ValueError("ones_like supports float tensors")

    from netcl.core.tensor import Tensor as T

    data = np.ones(t.shape, dtype=host_dtype)
    # Pass the original dtype name through so the result matches ``t`` exactly.
    return T.from_host(t.queue, data, dtype=t.dtype)
87
+
88
+
89
def add_inplace(dst: Tensor, src: Tensor) -> Tensor:
    """
    Add ``src`` to ``dst`` with the elementwise kernel, passing ``dst`` as the output.

    Returns whatever ``elementwise_binary`` returns.
    """
    from netcl.ops.elementwise import elementwise_binary

    add_expr = "ADD(v0, v1)"
    return elementwise_binary(dst, src, out=dst, expression=add_expr)
93
+
94
+
95
def no_grad():
    """
    Context manager that pauses recording on the current thread's tape.

    On entry the tape returned by ``get_current_tape()`` (if any) has its
    ``enabled`` flag cleared, so ``Tape.record`` stops storing nodes. On exit
    the flag is restored to its previous value, including when the block
    raises. Tapes passed explicitly to ops are not affected. Without a
    current tape the context manager does nothing.
    """

    class _NoGrad:
        # Tape captured on entry, and its ``enabled`` value at that moment.
        _tape = None
        _was_enabled = True

        def __enter__(self):
            self._tape = get_current_tape()
            if self._tape is not None:
                self._was_enabled = self._tape.enabled
                self._tape.enabled = False
            return self

        def __exit__(self, exc_type, exc, tb):
            if self._tape is not None:
                # Restore rather than force True so nested blocks compose.
                self._tape.enabled = self._was_enabled
                self._tape = None
            return False

    return _NoGrad()
108
+
109
+
110
def apply_op(fn: Callable[..., Tensor], grad_fn: Optional[GradFn], *args: Node, tape: Optional[Tape] = None) -> Node:
    """
    Run ``fn`` on the given inputs and wrap the result in a new ``Node``.

    ``Node`` arguments are unwrapped to their ``value`` before the call and
    become the new node's parents; other arguments are passed through
    untouched. The node is recorded on ``tape``, or on the current tape when
    ``tape`` is not given, if there is one.
    """
    active_tape = tape or get_current_tape()

    raw_inputs = []
    node_inputs = []
    for arg in args:
        if isinstance(arg, Node):
            node_inputs.append(arg)
            raw_inputs.append(arg.value)
        else:
            raw_inputs.append(arg)

    result = fn(*raw_inputs)
    tracks_grad = any(getattr(arg, "requires_grad", False) for arg in args)
    out = Node(
        value=result,
        grad_fn=grad_fn,
        parents=node_inputs,
        requires_grad=tracks_grad,
    )
    if active_tape is not None:
        active_tape.record(out)
    return out
+ return node