netcl 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- netcl/__init__.py +37 -0
- netcl/amp.py +147 -0
- netcl/autograd/__init__.py +86 -0
- netcl/autograd/debug.py +12 -0
- netcl/autograd/engine.py +121 -0
- netcl/autograd/ops.py +558 -0
- netcl/core/__init__.py +17 -0
- netcl/core/capabilities.py +102 -0
- netcl/core/device.py +66 -0
- netcl/core/kernels/__init__.py +12 -0
- netcl/core/kernels/primitives.py +159 -0
- netcl/core/memory.py +59 -0
- netcl/core/parameter.py +60 -0
- netcl/core/tensor.py +130 -0
- netcl/data/augment.py +43 -0
- netcl/data/augment_gpu.py +135 -0
- netcl/data/dataloader.py +87 -0
- netcl/data/filters.py +65 -0
- netcl/distributed/__init__.py +22 -0
- netcl/distributed/collectives.py +125 -0
- netcl/distributed/data_parallel.py +55 -0
- netcl/distributed/device_manager.py +33 -0
- netcl/distributed/trainer.py +63 -0
- netcl/io/__init__.py +3 -0
- netcl/io/checkpoint.py +58 -0
- netcl/io/serialization.py +117 -0
- netcl/nn/__init__.py +48 -0
- netcl/nn/batchnorm.py +258 -0
- netcl/nn/decorators.py +48 -0
- netcl/nn/factory.py +123 -0
- netcl/nn/functional.py +45 -0
- netcl/nn/groupnorm.py +45 -0
- netcl/nn/init.py +34 -0
- netcl/nn/layernorm.py +41 -0
- netcl/nn/layers.py +369 -0
- netcl/nn/loss.py +27 -0
- netcl/nn/modules.py +100 -0
- netcl/nn/padding.py +57 -0
- netcl/nn/pooling.py +267 -0
- netcl/nn/residual.py +22 -0
- netcl/nn/resnet.py +155 -0
- netcl/nn/simple.py +41 -0
- netcl/ops/__init__.py +45 -0
- netcl/ops/broadcast.py +103 -0
- netcl/ops/conv2d.py +745 -0
- netcl/ops/conv_transpose2d.py +200 -0
- netcl/ops/depthwise_conv2d.py +235 -0
- netcl/ops/elementwise.py +477 -0
- netcl/ops/im2col.py +122 -0
- netcl/ops/matmul.py +182 -0
- netcl/ops/reduction.py +102 -0
- netcl/ops/softmax.py +96 -0
- netcl/ops/softmax_fp16.py +33 -0
- netcl/ops/transpose.py +60 -0
- netcl/optim/__init__.py +11 -0
- netcl/optim/adam.py +55 -0
- netcl/optim/adamw.py +58 -0
- netcl/optim/amp.py +42 -0
- netcl/optim/clip.py +28 -0
- netcl/optim/lr_plateau.py +26 -0
- netcl/optim/lr_scheduler.py +14 -0
- netcl/optim/momentum.py +38 -0
- netcl/optim/rmsprop.py +51 -0
- netcl/optim/sgd.py +43 -0
- netcl/profiling/__init__.py +7 -0
- netcl/profiling/timing.py +33 -0
- netcl/runtime/__init__.py +8 -0
- netcl/runtime/graph.py +131 -0
- netcl/runtime/scheduler.py +36 -0
- netcl/trainer/__init__.py +3 -0
- netcl/trainer/trainer.py +148 -0
- netcl/utils/__init__.py +4 -0
- netcl/utils/data.py +33 -0
- netcl/utils/progress.py +58 -0
- netcl-0.1.0.dist-info/METADATA +84 -0
- netcl-0.1.0.dist-info/RECORD +79 -0
- netcl-0.1.0.dist-info/WHEEL +5 -0
- netcl-0.1.0.dist-info/licenses/LICENSE +21 -0
- netcl-0.1.0.dist-info/top_level.txt +1 -0
netcl/__init__.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"""
|
|
2
|
+
netcl: PyOpenCL-based experimentation framework.
|
|
3
|
+
This package currently focuses on low-level kernel primitives and helpers.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from . import core, ops, autograd, distributed, runtime, profiling
|
|
7
|
+
from .ops import (
|
|
8
|
+
matmul,
|
|
9
|
+
build_matmul_kernel,
|
|
10
|
+
elementwise_binary,
|
|
11
|
+
relu,
|
|
12
|
+
bias_add,
|
|
13
|
+
reduce_sum,
|
|
14
|
+
softmax,
|
|
15
|
+
conv2d,
|
|
16
|
+
)
|
|
17
|
+
from . import nn, optim, io
|
|
18
|
+
|
|
19
|
+
# Public names of the top-level package: the submodules imported above,
# followed by the op helpers re-exported from ``netcl.ops``.
__all__ = [
    "core",
    "ops",
    "autograd",
    "distributed",
    "runtime",
    "profiling",
    "nn",
    "optim",
    "io",
    "matmul",
    "build_matmul_kernel",
    "elementwise_binary",
    "relu",
    "bias_add",
    "reduce_sum",
    "softmax",
    "conv2d",
]
|
netcl/amp.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
1
|
+
from __future__ import annotations
|
|
2
|
+
|
|
3
|
+
from dataclasses import dataclass
|
|
4
|
+
from typing import Optional, Sequence
|
|
5
|
+
|
|
6
|
+
try:
|
|
7
|
+
import numpy as np # type: ignore
|
|
8
|
+
except ImportError: # pragma: no cover
|
|
9
|
+
np = None
|
|
10
|
+
|
|
11
|
+
from netcl.core.tensor import Tensor
|
|
12
|
+
from netcl.core.tensor import _np_dtype # type: ignore
|
|
13
|
+
|
|
14
|
+
# Module-wide autocast flag. Written by ``autocast.__enter__`` / ``__exit__``;
# read by ``is_autocast_enabled`` and ``maybe_cast_tensor``.
_AUTOCAST_ENABLED = False
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
@dataclass
class GradScaler:
    """
    Loss-scaling helper with a dynamically adjusted scale factor.

    The loss is multiplied by the current scale before backward; ``step``
    divides the gradients back, skips the optimizer step when any gradient
    contains a non-finite value, and then grows or shrinks the scale.
    """

    init_scale: float = 2.0**16
    growth_factor: float = 2.0
    backoff_factor: float = 0.5
    growth_interval: int = 2000
    enabled: bool = True

    def __post_init__(self):
        # Mutable state lives outside the dataclass fields.
        self._growth_tracker = 0
        self._scale = self.init_scale

    @property
    def scale(self) -> float:
        """Current loss-scale factor."""
        return self._scale

    def scale_loss(self, loss: Tensor) -> Tensor:
        """Return ``loss`` multiplied by the current scale (``loss`` itself when disabled)."""
        if self.enabled:
            from netcl.ops.elementwise import elementwise_binary

            return elementwise_binary(loss, loss, expression=f"MUL(v0, {float(self._scale)})")
        return loss

    def unscale_grads(self, params: Sequence[Tensor]):
        """
        Divide every available ``p.grad`` by the current scale.

        Gradients are first copied to the host and checked for non-finite
        values; if one is found nothing is unscaled. Returns ``True`` when a
        non-finite gradient was found, ``False`` otherwise (always ``False``
        when the scaler is disabled).
        """
        if not self.enabled:
            return False
        # Stops at the first gradient that contains inf/nan.
        found_inf = any(
            np.any(~np.isfinite(p.grad.to_host()))
            for p in params
            if p.grad is not None
        )
        if found_inf:
            return True
        inv_scale = 1.0 / self._scale
        from netcl.ops.elementwise import elementwise_binary

        for p in params:
            if p.grad is not None:
                p.grad = elementwise_binary(p.grad, p.grad, expression=f"MUL(v0, {inv_scale})")
        return False

    def step(self, optimizer, params: Sequence[Tensor]):
        """
        Unscale gradients and run ``optimizer.step()`` unless they overflowed.

        On overflow the scale is multiplied by ``backoff_factor`` and the
        growth counter is reset. Otherwise the counter is incremented and the
        scale is multiplied by ``growth_factor`` each time the counter is a
        multiple of ``growth_interval``. When disabled, this only calls
        ``optimizer.step()``.
        """
        if not self.enabled:
            optimizer.step()
            return
        if self.unscale_grads(params):
            # Overflow: skip the update and back the scale off.
            self._scale *= self.backoff_factor
            self._growth_tracker = 0
            return
        optimizer.step()
        self._growth_tracker += 1
        if self._growth_tracker % self.growth_interval == 0:
            self._scale *= self.growth_factor

    def update(self):
        """Do nothing; kept for API compatibility (scale updates happen in ``step``)."""
        pass
|
|
78
|
+
|
|
79
|
+
|
|
80
|
+
def supports_fp16(queue) -> bool:
    """
    Report whether ``queue.device.extensions`` contains ``cl_khr_fp16``.

    Any error while reading the extensions (for example a missing attribute)
    is treated as "not supported" and yields ``False``.
    """
    try:
        extensions = queue.device.extensions
        has_half = "cl_khr_fp16" in extensions
    except Exception:
        # Best effort: an unusable queue simply means no fp16.
        has_half = False
    return has_half
|
|
88
|
+
|
|
89
|
+
|
|
90
|
+
def autocast_enabled(profile_supports_fp16: bool) -> bool:
    """Return ``profile_supports_fp16`` unchanged: autocast is allowed exactly when fp16 is supported."""
    decision = profile_supports_fp16
    return decision
|
|
92
|
+
|
|
93
|
+
|
|
94
|
+
class autocast:
    """
    Context manager that toggles the module-wide autocast flag.

    On entry the previous flag value is saved. The flag is then set to
    ``False`` when ``enabled`` is falsy; otherwise it is set to ``enabled and
    capable``, where ``capable`` comes from ``supports_fp16(device_queue)``
    when a queue was given and stays ``True`` when none was. On exit the saved
    value is restored.
    """

    def __init__(self, enabled: bool = True, device_queue=None):
        self.enabled = enabled
        self.device_queue = device_queue
        self.prev = False  # flag value to restore on exit
        self._capable = True  # assumed until a queue says otherwise

    def __enter__(self):
        global _AUTOCAST_ENABLED
        self.prev = _AUTOCAST_ENABLED
        if self.enabled:
            if self.device_queue is not None:
                self._capable = supports_fp16(self.device_queue)
            _AUTOCAST_ENABLED = self.enabled and self._capable
        else:
            _AUTOCAST_ENABLED = False
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        global _AUTOCAST_ENABLED
        _AUTOCAST_ENABLED = self.prev
        # Never suppress exceptions raised inside the block.
        return False
|
|
120
|
+
|
|
121
|
+
|
|
122
|
+
def is_autocast_enabled() -> bool:
    """Return the current value of the module-wide autocast flag."""
    current = _AUTOCAST_ENABLED
    return current
|
|
124
|
+
|
|
125
|
+
|
|
126
|
+
def maybe_cast_tensor(t: Tensor) -> Tensor:
    """
    Return a float16 copy of ``t`` when autocast applies, otherwise ``t`` itself.

    A cast happens only when the autocast flag is set, ``t.dtype`` is
    ``"float"`` or ``"float32"``, and ``supports_fp16(t.queue)`` is true. The
    copy is made by converting the tensor's host data and building a new
    tensor from it.
    """
    should_cast = (
        _AUTOCAST_ENABLED
        and t.dtype in ("float", "float32")
        and supports_fp16(t.queue)  # only cast if device can handle fp16
    )
    if not should_cast:
        return t
    half_data = t.to_host().astype(np.float16)
    return Tensor.from_host(t.queue, half_data, dtype="float16")
|
|
136
|
+
|
|
137
|
+
|
|
138
|
+
def master_param(param: Tensor) -> Tensor:
    """
    Keep master weights in FP32 for optimizers.

    For a ``"float16"``/``"half"`` parameter a new float32 tensor is built
    from its host data and returned. Any other parameter is returned as is.
    In both cases the returned tensor gets a ``_model_param`` attribute
    pointing at the original ``param``.
    """
    if param.dtype not in ("float16", "half"):
        # Already full precision: the parameter is its own master copy.
        param._model_param = param
        return param
    queue = param.queue
    fp32_data = param.to_host().astype(np.float32)
    master = Tensor.from_host(queue, fp32_data, dtype="float32")
    master._model_param = param
    return master
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Autograd: Node/Tape plus op wrappers.
|
|
3
|
+
"""
|
|
4
|
+
|
|
5
|
+
from .engine import Node, Tape, apply_op, no_grad, set_current_tape, get_current_tape
|
|
6
|
+
from .debug import debug_tape
|
|
7
|
+
from .ops import (
|
|
8
|
+
tensor,
|
|
9
|
+
add,
|
|
10
|
+
relu,
|
|
11
|
+
bias_add,
|
|
12
|
+
matmul_op,
|
|
13
|
+
sub,
|
|
14
|
+
mse_loss,
|
|
15
|
+
sigmoid,
|
|
16
|
+
tanh,
|
|
17
|
+
leaky_relu,
|
|
18
|
+
gelu,
|
|
19
|
+
swish,
|
|
20
|
+
elu,
|
|
21
|
+
softplus,
|
|
22
|
+
hard_sigmoid,
|
|
23
|
+
hard_swish,
|
|
24
|
+
clamp,
|
|
25
|
+
hard_tanh,
|
|
26
|
+
prelu,
|
|
27
|
+
hinge_loss,
|
|
28
|
+
l1_loss,
|
|
29
|
+
l2_loss,
|
|
30
|
+
depthwise_conv2d,
|
|
31
|
+
batch_norm2d,
|
|
32
|
+
layer_norm,
|
|
33
|
+
pad2d,
|
|
34
|
+
group_norm,
|
|
35
|
+
global_avg_pool2d,
|
|
36
|
+
cross_entropy,
|
|
37
|
+
conv2d,
|
|
38
|
+
flatten,
|
|
39
|
+
max_pool2d,
|
|
40
|
+
dropout,
|
|
41
|
+
avg_pool2d,
|
|
42
|
+
)
|
|
43
|
+
|
|
44
|
+
# Public names of the autograd package: the engine primitives, the op
# wrappers imported from ``.ops``, and the debug/tape helpers imported above.
__all__ = [
    "Node",
    "Tape",
    "apply_op",
    "no_grad",
    "tensor",
    "add",
    "relu",
    "bias_add",
    "matmul_op",
    "sub",
    "mse_loss",
    "sigmoid",
    "tanh",
    "leaky_relu",
    "gelu",
    "swish",
    "elu",
    "softplus",
    "hard_sigmoid",
    "hard_swish",
    "clamp",
    "hard_tanh",
    "prelu",
    "hinge_loss",
    "l1_loss",
    "l2_loss",
    "depthwise_conv2d",
    "batch_norm2d",
    "layer_norm",
    "pad2d",
    "group_norm",
    "global_avg_pool2d",
    "cross_entropy",
    "conv2d",
    "flatten",
    "max_pool2d",
    "dropout",
    "avg_pool2d",
    "debug_tape",
    "set_current_tape",
    "get_current_tape",
]
|
netcl/autograd/debug.py
ADDED
netcl/autograd/engine.py
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Minimal autograd backbone (placeholders).
|
|
3
|
+
|
|
4
|
+
To be extended with full gradient tracking and backward kernels per op.
|
|
5
|
+
"""
|
|
6
|
+
|
|
7
|
+
from __future__ import annotations
|
|
8
|
+
|
|
9
|
+
from dataclasses import dataclass, field
|
|
10
|
+
from typing import Any, Callable, List, Optional
|
|
11
|
+
import threading
|
|
12
|
+
|
|
13
|
+
from netcl.core.tensor import Tensor
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
# Backward callback stored on a Node. Tape.backward calls it with the node's
# gradient and pairs the returned list with the node's parents; ``None``
# entries are skipped.
GradFn = Callable[[Any], List[Optional[Tensor]]]
|
|
17
|
+
|
|
18
|
+
|
|
19
|
+
@dataclass
class Node:
    """One recorded value in the autograd graph plus what backward needs for it."""

    # Forward result wrapped by this node.
    value: Tensor
    # Backward callback; Tape.backward skips nodes where this is None.
    grad_fn: Optional[GradFn] = None
    # Input nodes; Tape.backward zips them with the list returned by grad_fn.
    parents: List["Node"] = field(default_factory=list)
    # Gradient assigned/accumulated for this node by Tape.backward.
    grad: Optional[Tensor] = None
    # Set by apply_op to True when any argument has a truthy ``requires_grad``.
    requires_grad: bool = False
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
class Tape:
    """
    Records operations for backward.

    Nodes are kept in the order they were recorded; ``backward`` walks that
    list in reverse, using recording order as the topological order.
    """

    def __init__(self) -> None:
        self.nodes: List[Node] = []
        self.enabled: bool = True

    def record(self, node: Node) -> Node:
        """Append ``node`` to the tape if recording is enabled; return ``node`` either way."""
        if self.enabled:
            self.nodes.append(node)
        return node

    def backward(self, loss: Node, grad: Optional[Tensor] = None) -> None:
        """
        Propagate gradients from ``loss`` back through the recorded nodes.

        Args:
            loss: Node to differentiate. Its ``grad`` is overwritten with the seed.
            grad: Seed gradient. When ``None``, ``ones_like(loss.value)`` is used.

        For every recorded node that has both a gradient and a ``grad_fn``,
        the ``grad_fn`` output is paired with the node's parents. Each
        non-``None`` gradient is accumulated into ``parent.grad`` and mirrored
        onto ``parent.value.grad`` so optimizers can read it from the tensor.
        """
        loss.grad = ones_like(loss.value) if grad is None else grad
        # Reverse topological order (here: recorded order)
        for node in reversed(self.nodes):
            if node.grad is None or node.grad_fn is None:
                continue
            grads = node.grad_fn(node.grad)
            for parent, g in zip(node.parents, grads):
                if g is None:
                    continue
                if parent.grad is None:
                    parent.grad = g
                else:
                    parent.grad = add_inplace(parent.grad, g)
                # propagate to underlying Tensor for optimizers
                tensor_grad = parent.value.grad
                if tensor_grad is None:
                    parent.value.grad = parent.grad
                elif tensor_grad is not parent.grad:
                    parent.value.grad = add_inplace(tensor_grad, g)
                # else: the tensor's grad is the very buffer that was just
                # accumulated in place through parent.grad; adding ``g`` again
                # would count this contribution twice.
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
# thread-local current tape for tape-free APIs: set_current_tape stores the
# tape here, and apply_op (via get_current_tape) falls back to it when no tape
# is passed explicitly
_tls = threading.local()
|
|
69
|
+
|
|
70
|
+
|
|
71
|
+
def set_current_tape(tape: Optional[Tape]):
    """Store ``tape`` (or ``None``) as the current tape in thread-local storage."""
    setattr(_tls, "current_tape", tape)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def get_current_tape() -> Optional[Tape]:
    """Return the tape stored in thread-local storage, or ``None`` if none was set."""
    try:
        return _tls.current_tape
    except AttributeError:
        # Nothing has been stored in the thread-local slot yet.
        return None
|
|
77
|
+
|
|
78
|
+
|
|
79
|
+
def ones_like(t: Tensor) -> Tensor:
    """
    Build a tensor of ones with the shape, dtype and queue of ``t``.

    Only the dtypes ``"float"``, ``"float32"``, ``"double"`` and ``"float64"``
    are accepted. The ones are created on the host as float32 for
    ``"float"``/``"float32"`` and as float64 otherwise, then handed to
    ``Tensor.from_host``.

    Raises:
        ValueError: if ``t.dtype`` is not one of the accepted names.
    """
    from netcl.core.tensor import Tensor as T

    dtype_name = t.dtype
    if dtype_name not in ("float", "float32", "double", "float64"):
        raise ValueError("ones_like supports float tensors")
    import numpy as np

    if dtype_name == "float" or "32" in dtype_name:
        host_dtype = np.float32
    else:
        host_dtype = np.float64
    host_ones = np.ones(t.shape, dtype=host_dtype)
    return T.from_host(t.queue, host_ones, dtype=dtype_name)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def add_inplace(dst: Tensor, src: Tensor) -> Tensor:
    """
    Add ``src`` to ``dst`` with ``dst`` passed as the output buffer.

    Returns whatever ``elementwise_binary`` returns for that call.
    """
    from netcl.ops.elementwise import elementwise_binary as _binary

    accumulated = _binary(dst, src, expression="ADD(v0, v1)", out=dst)
    return accumulated
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def no_grad():
    """
    Return a placeholder context manager for disabling gradient tracking.

    The returned object currently does nothing on entry or exit: it yields
    itself from ``__enter__`` and never suppresses exceptions.
    """

    class _NoGrad:
        def __enter__(self):
            return self

        def __exit__(self, exc_type, exc, tb):
            # Let any exception from the body propagate.
            return False

    guard = _NoGrad()
    return guard
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
def apply_op(fn: Callable[..., Tensor], grad_fn: Optional[GradFn], *args: Node, tape: Optional[Tape] = None) -> Node:
    """
    Run ``fn`` on the given inputs and wrap the result in a new ``Node``.

    Args:
        fn: Forward function; ``Node`` arguments are unwrapped to their
            ``value`` before the call, other arguments are passed through.
        grad_fn: Backward callback stored on the new node (may be ``None``).
        *args: Inputs to the op. Only the ``Node`` instances become parents.
        tape: Tape to record on; falls back to the current thread-local tape.

    Returns:
        The new node. ``requires_grad`` is true when any argument has a truthy
        ``requires_grad`` attribute. The node is recorded only if a tape is
        available.
    """
    if not tape:
        tape = get_current_tape()

    raw_inputs = []
    node_parents = []
    for arg in args:
        if isinstance(arg, Node):
            node_parents.append(arg)
            raw_inputs.append(arg.value)
        else:
            raw_inputs.append(arg)

    out_value = fn(*raw_inputs)
    wants_grad = any(getattr(arg, "requires_grad", False) for arg in args)
    result = Node(
        value=out_value,
        grad_fn=grad_fn,
        parents=node_parents,
        requires_grad=wants_grad,
    )
    if tape is not None:
        tape.record(result)
    return result
|