PyPI - netcl - Versions diffs - 0.1.0__tar.gz - Mend

netcl 0.1.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (86) hide show

netcl-0.1.0/LICENSE +21 -0
netcl-0.1.0/MANIFEST.in +6 -0
netcl-0.1.0/PKG-INFO +84 -0
netcl-0.1.0/README.md +178 -0
netcl-0.1.0/README_PACKAGE.md +66 -0
netcl-0.1.0/netcl/__init__.py +37 -0
netcl-0.1.0/netcl/amp.py +147 -0
netcl-0.1.0/netcl/autograd/__init__.py +86 -0
netcl-0.1.0/netcl/autograd/debug.py +12 -0
netcl-0.1.0/netcl/autograd/engine.py +121 -0
netcl-0.1.0/netcl/autograd/ops.py +558 -0
netcl-0.1.0/netcl/core/__init__.py +17 -0
netcl-0.1.0/netcl/core/capabilities.py +102 -0
netcl-0.1.0/netcl/core/device.py +66 -0
netcl-0.1.0/netcl/core/kernels/__init__.py +12 -0
netcl-0.1.0/netcl/core/kernels/primitives.py +159 -0
netcl-0.1.0/netcl/core/memory.py +59 -0
netcl-0.1.0/netcl/core/parameter.py +60 -0
netcl-0.1.0/netcl/core/tensor.py +130 -0
netcl-0.1.0/netcl/data/augment.py +43 -0
netcl-0.1.0/netcl/data/augment_gpu.py +135 -0
netcl-0.1.0/netcl/data/dataloader.py +87 -0
netcl-0.1.0/netcl/data/filters.py +65 -0
netcl-0.1.0/netcl/distributed/__init__.py +22 -0
netcl-0.1.0/netcl/distributed/collectives.py +125 -0
netcl-0.1.0/netcl/distributed/data_parallel.py +55 -0
netcl-0.1.0/netcl/distributed/device_manager.py +33 -0
netcl-0.1.0/netcl/distributed/trainer.py +63 -0
netcl-0.1.0/netcl/io/__init__.py +3 -0
netcl-0.1.0/netcl/io/checkpoint.py +58 -0
netcl-0.1.0/netcl/io/serialization.py +117 -0
netcl-0.1.0/netcl/nn/__init__.py +48 -0
netcl-0.1.0/netcl/nn/batchnorm.py +258 -0
netcl-0.1.0/netcl/nn/decorators.py +48 -0
netcl-0.1.0/netcl/nn/factory.py +123 -0
netcl-0.1.0/netcl/nn/functional.py +45 -0
netcl-0.1.0/netcl/nn/groupnorm.py +45 -0
netcl-0.1.0/netcl/nn/init.py +34 -0
netcl-0.1.0/netcl/nn/layernorm.py +41 -0
netcl-0.1.0/netcl/nn/layers.py +369 -0
netcl-0.1.0/netcl/nn/loss.py +27 -0
netcl-0.1.0/netcl/nn/modules.py +100 -0
netcl-0.1.0/netcl/nn/padding.py +57 -0
netcl-0.1.0/netcl/nn/pooling.py +267 -0
netcl-0.1.0/netcl/nn/residual.py +22 -0
netcl-0.1.0/netcl/nn/resnet.py +155 -0
netcl-0.1.0/netcl/nn/simple.py +41 -0
netcl-0.1.0/netcl/ops/__init__.py +45 -0
netcl-0.1.0/netcl/ops/broadcast.py +103 -0
netcl-0.1.0/netcl/ops/conv2d.py +745 -0
netcl-0.1.0/netcl/ops/conv_transpose2d.py +200 -0
netcl-0.1.0/netcl/ops/depthwise_conv2d.py +235 -0
netcl-0.1.0/netcl/ops/elementwise.py +477 -0
netcl-0.1.0/netcl/ops/im2col.py +122 -0
netcl-0.1.0/netcl/ops/matmul.py +182 -0
netcl-0.1.0/netcl/ops/reduction.py +102 -0
netcl-0.1.0/netcl/ops/softmax.py +96 -0
netcl-0.1.0/netcl/ops/softmax_fp16.py +33 -0
netcl-0.1.0/netcl/ops/transpose.py +60 -0
netcl-0.1.0/netcl/optim/__init__.py +11 -0
netcl-0.1.0/netcl/optim/adam.py +55 -0
netcl-0.1.0/netcl/optim/adamw.py +58 -0
netcl-0.1.0/netcl/optim/amp.py +42 -0
netcl-0.1.0/netcl/optim/clip.py +28 -0
netcl-0.1.0/netcl/optim/lr_plateau.py +26 -0
netcl-0.1.0/netcl/optim/lr_scheduler.py +14 -0
netcl-0.1.0/netcl/optim/momentum.py +38 -0
netcl-0.1.0/netcl/optim/rmsprop.py +51 -0
netcl-0.1.0/netcl/optim/sgd.py +43 -0
netcl-0.1.0/netcl/profiling/__init__.py +7 -0
netcl-0.1.0/netcl/profiling/timing.py +33 -0
netcl-0.1.0/netcl/runtime/__init__.py +8 -0
netcl-0.1.0/netcl/runtime/graph.py +131 -0
netcl-0.1.0/netcl/runtime/scheduler.py +36 -0
netcl-0.1.0/netcl/trainer/__init__.py +3 -0
netcl-0.1.0/netcl/trainer/trainer.py +148 -0
netcl-0.1.0/netcl/utils/__init__.py +4 -0
netcl-0.1.0/netcl/utils/data.py +33 -0
netcl-0.1.0/netcl/utils/progress.py +58 -0
netcl-0.1.0/netcl.egg-info/PKG-INFO +84 -0
netcl-0.1.0/netcl.egg-info/SOURCES.txt +84 -0
netcl-0.1.0/netcl.egg-info/dependency_links.txt +1 -0
netcl-0.1.0/netcl.egg-info/requires.txt +2 -0
netcl-0.1.0/netcl.egg-info/top_level.txt +1 -0
netcl-0.1.0/pyproject.toml +36 -0
netcl-0.1.0/setup.cfg +4 -0

netcl-0.1.0/LICENSE ADDED Viewed

@@ -0,0 +1,21 @@
+MIT License
+Copyright (c) 2024 netcl contributors
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

netcl-0.1.0/MANIFEST.in ADDED Viewed

@@ -0,0 +1,6 @@
+include LICENSE
+include README_PACKAGE.md
+recursive-exclude tests *
+recursive-exclude docs *
+recursive-exclude wiki *
+recursive-exclude scripts *

netcl-0.1.0/PKG-INFO ADDED Viewed

@@ -0,0 +1,84 @@
+Metadata-Version: 2.4
+Name: netcl
+Version: 0.1.0
+Summary: PyOpenCL-based deep learning playground with autograd, kernels, and high-level APIs.
+Author: netcl contributors
+License: MIT
+Project-URL: Repository, https://bbwebservice.online/lukas/netcl
+Keywords: opencl,deep-learning,autograd,gpu
+Classifier: Programming Language :: Python :: 3
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Requires-Python: >=3.10
+Description-Content-Type: text/markdown
+License-File: LICENSE
+Requires-Dist: numpy>=1.22
+Requires-Dist: pyopencl>=2022.3
+Dynamic: license-file
+# netcl – PyOpenCL Deep Learning Playground
+`netcl` ist ein experimentelles Deep-Learning-Framework auf Basis von PyOpenCL. Es kombiniert low-level Kernel (Conv/Matmul/Elementwise) mit einer einfachen Autograd-Engine und einer High-Level API (Module, Trainer, Serializer), ohne Abhängigkeiten zu anderen DL-Frameworks.
+## Installation
+```bash
+pip install .
+```
+Voraussetzungen: Python ≥ 3.10, NumPy, PyOpenCL und ein verfügbares OpenCL-Gerät.
+## Schnelles Beispiel (MNIST-Mini-MLP)
+```python
+import numpy as np
+from netcl.core.device import manager
+from netcl.nn.layers import Sequential, Flatten, Linear, ReLU
+from netcl import autograd as ag
+from netcl.optim import Adam
+from netcl.core.tensor import Tensor
+dev = manager.default()
+q = dev.queue
+model = Sequential(
+    Flatten(),
+    Linear(q, in_features=28*28, out_features=128),
+    ReLU(),
+    Linear(q, in_features=128, out_features=10),
+)
+opt = Adam(model.parameters(), lr=5e-3)
+def one_hot(y, n=10):
+    oh = np.zeros((y.shape[0], n), dtype=np.float32)
+    oh[np.arange(y.shape[0]), y] = 1
+    return oh
+xb = np.random.randn(32, 1, 28, 28).astype(np.float32)
+yb = one_hot(np.random.randint(0, 10, size=(32,)))
+tape = ag.Tape()
+ag.set_current_tape(tape)
+x = ag.tensor(Tensor.from_host(q, xb))
+y = ag.tensor(Tensor.from_host(q, yb))
+logits = model(x)
+loss = ag.cross_entropy(logits, y)
+tape.backward(loss)
+opt.step(); opt.zero_grad()
+ag.set_current_tape(None)
+```
+## Kernfeatures
+- **Autograd**: Tape-basiert, elementare Ops (Matmul, Conv2d, Pooling, Elementwise) mit Backward.
+- **Module/High-Level API**: `Linear`, `Conv2d`, `BatchNorm2d`, `Sequential`, `@model`-Dekorator, Trainer.
+- **Optimierungen**: Conv-Algo-Heuristik/Autotuning, optional Mixed Precision, Buffer-Pool.
+- **Serialization**: Speichert `Sequential`-Modelle als JSON (Architektur) + NPZ (Gewichte) via `netcl.io.serialization`.
+## Speichern & Laden eines Modells
+```python
+from netcl.io.serialization import save_model, load_model
+save_model(model, "checkpoints/mnist_mlp")
+model2 = load_model("checkpoints/mnist_mlp")  # queue auto="default"
+```
+## Hinweise
+- Tests werden nicht mitinstalliert. Für lokale Entwicklung: `python -m pytest`.
+- Für Performance: Batch-Größe erhöhen, Augment minimieren, optional Mixed Precision (`Trainer(..., mixed_precision=True)`).
+- Conv-Algorithmen wählen automatisch optimierte Pfade; env-Flags wie `NETCL_CONV_AUTOTUNE=1` aktivieren Tuning.

netcl-0.1.0/README.md ADDED Viewed

@@ -0,0 +1,178 @@
+# PyOpenCL KI-Framework Plan
+## Ueberblick & Ziele
+- Hochperformantes KI-Framework auf Basis von PyOpenCL mit klarer Trennung zwischen Low-Level-Kernel-Implementierungen und High-Level-Ausfuehrungsmodellen.
+- Minimierung des Host-Device-Overheads durch effiziente Speicherverwaltung, asynchrone Transfers und Kernel-Fusion.
+- Multi-GPU-Skalierung (Daten- und perspektivisch Modellparallelismus) mit reproduzierbarem Verhalten und robustem Autograd.
+## Leitprinzipien
+- Trennung der Schichten: Devices/Memory/Kernels (Low-Level) vs. Ops/Autograd/Runtime/Distributed (High-Level).
+- Alle Ops setzen auf rudimentaere OpenCL-Bausteine (Loads/Stores, simple Arithmetik, lokale Reduktionen); komplexe Ops werden als Komposition/Fusion daraus erzeugt.
+- Datenbewegungen minimieren: Reuse von Buffern, Pinned Host Memory, Overlap von Compute & Copy, Zero-Copy wo moeglich.
+- Determinismus optional: reproduzierbare Seeds, konsistente Reduktionen.
+- Erweiterbarkeit: neue Ops, neue Devices, austauschbare Kernel-Build-Parameter via Cache, Autotuning und dynamische Kernel-Generierung.
+- Graph/Pipeline-fokussiert: Aehnlich wie bei TensorFlow baut der User ein ANN bzw. einen Graph; die Runtime generiert daraus eine Pipeline aus elementaren Ops, alloziert Buffer bzw. verwendet sie wieder und minimiert die Python-Interaktion zur Laufzeit.
+## Anforderungen
+- Funktional:
+  - Geraete-Discovery, Kontext- und Queue-Management mit Fallback-Strategie.
+  - Speicher-Allocator/Pool (Buckets, Reuse, Sub-Buffers), Pinned Host Buffers, asynchrone Copies.
+  - Kernel-Bibliothek: Matmul/GEMM, Convolution (im2col + Spezialfaelle), Elementwise (unary/binary), Reduktionen, Softmax, LayerNorm/BatchNorm, Scatter/Gather, RNG.
+  - Autograd-Engine mit Grad-Registrierung pro Op; Unterstuetzung fuer Gradient-Accumulation.
+  - Multi-GPU-Kollektive: AllReduce, Broadcast, Scatter/Gather; Datenparallelismus als erster Modus.
+  - API: Tensor mit Dtype/Device/Strides, Eager-Ausfuehrung; optional Graph/Lazy spaeter.
+  - Profiling/Tracing: Event-Timing, Bandbreitenmessung, Kernel-Stats.
+- Nicht-funktional:
+  - Performance-Portabilitaet ueber OpenCL; Feature-Detection (Subgroups, FP16/BF16, Images).
+  - Stabilitaet unter Stress (viele kleine Kernels, grosse Tensors); sauberes Fehler-Handling (Build-Fail, OOM).
+  - Klare Fehlermeldungen und Debug-Optionen (Assertions fuer Shapes/Strides).
+## Architektur (Schichten)
+- `core.devices`: Device/Kontext/Queue-Discovery, Capability-Cache, Seed-Management.
+- `core.memory`: Buffer-Objekte, Pool/Allocator, Pinned Host Buffers, Sub-Buffers, Async Copy.
+- `core.kernels`: Kernel-Quellen, Build/Cache, Feature-Gates, Tuning-Parameter (Workgroup, Vectorization).
+- `ops` (Low-Level): Matmul, Conv2D/Depthwise, Elementwise, Reduce, Softmax, LayerNorm/BatchNorm, RNG.
+- `autograd`: Tape oder Graph, Grad-Registrierung pro Op, Backward-Kernels, Grad-Accumulation.
+- `distributed`: AllReduce/Broadcast/Scatter-Gather, Parameter-Sharding, Host-Relay als Fallback.
+- `runtime`: Scheduler fuer Queues/Events, Stream-Ordering, Overlap von Compute/Copy (Double-Buffering); Graph Executor der ANN/Op-Graph in Pipelines uebersetzt und Buffer-Lebenszeiten verwaltet.
+- `api`: Tensor-Klasse, Module/Layer-Primitives, Optimizer (SGD/Adam), Checkpointing.
+- `profiling`: Event-Timing, Bandwidth/Flops-Schaetzung, Kernel-Stats Dump.
+## Speicher- & Transfer-Strategie
+- Memory Pool mit Buckets; Reuse ueber Lebenszeit-Tracking; Minimierung von Fragmentation.
+- Pinned Host Memory fuer H2D/D2H; `enqueue_copy` asynchron mit Events; Batch-Transfers.
+- Zero-Copy nutzen, wenn Plattform Host-Device-Sharing erlaubt; sonst Staging-Puffer.
+- Layout-Planer (z. B. NHWC vs. NCHW) abhaengig vom Kernel; Stride-Awareness in Ops.
+- Double-/Triple-Buffering fuer Datenpipelines, um IO und Compute zu ueberlappen.
+- Graph-basierte Buffer-Reuse: Lebenszeit-Analyse des ANN/Op-Graphs fuer Platzierung/Reuse; Inplace- und Alias-Strategien wo sicher.
+## Kernel-Bibliothek & Optimierung
+- Primitives: Load/Store, einfache Arithmetik, Shuffle/Subgroup-Reduce (falls verfuegbar), lokale Speicher-Kacheln, atomare Operationen. Alle komplexen Ops bauen darauf auf.
+- Dynamischer Kernel-Generator: erzeugt Quellcode basierend auf Hyperstrategie (Layout, Fusion, Tiling) und Device-Capabilities; wendet Template-Parameter und Konstantenfaltung an.
+- Matmul/GEMM: Tiling + lokale Speicher-Kacheln; Vectorization; Autotuning von M/N/K-Tiles und Workgroup-Groessen.
+- Convolution: Im2col + GEMM als Baseline; Spezialkernel fuer 1x1 und 3x3; Depthwise-Kernels.
+- Elementwise: Fused Kernel Generator (AST -> OpenCL C) fuer Ketten (bias + activation).
+- Reduktionen: Hierarchisch (Workgroup-Reduktion -> global), optionale Subgroup-Optimierung; deterministische Pfade.
+- RNG: Counter-based (z. B. Philox/Threefry); pro Work-Item Streams; reproduzierbare Seeds.
+- Mixed Precision: FP16/BF16 wenn verfuegbar; Akkumulation in FP32; Fallback auf FP32.
+## Multi-GPU & Distributed
+- Datenparallel: Parameter repliziert; Batch-Sharding; Gradient AllReduce (Ring/Tree). Start mit Host-Relay (Copy -> Reduce -> Scatter), spaetere Device-zu-Device-Pfade pruefen.
+- Modellparallel (spaeter): Parameter-Shards, Pipeline-Parallelismus, Punkt-zu-Punkt Copies.
+- Konsistenz: Events/Barrieren fuer Ordering; deterministische Reduktionen optional; FP32 Master Weights moeglich.
+- Hyperstrategie fuer Ressourcenaufteilung: Tiling/Partitionierung pro Device (Batch-Splits, Tensor-Shards) wird dynamisch nach Device-Memory/Compute/Link-Bandbreite bestimmt; Scheduler verteilt Work-Items und Buffers entsprechend.
+## Build/Cache & Autotuning
+- Kernel-Build-Cache pro Device (Hash aus Source, Build-Options, Treiber-Version).
+- Autotuning-DB pro Device (Tile- und Workgroup-Sizes); Persistenz auf Disk; konservative Defaults ohne Tuning.
+- Capability-Checks (Subgroups, FP16/BF16, Images vs. Buffers) steuern Kernel-Pfade.
+## API-Skizze (Python)
+- `Tensor(device, shape, dtype, data=None, requires_grad=False)`
+- `ops.matmul(a, b)`, `ops.conv2d(x, w, stride, pad)`, `ops.relu(x)`, `ops.softmax(x, axis)`, `ops.layer_norm(x, gamma, beta)`
+- `tensor.backward(grad=None)`; `optimizer.step()`, `optimizer.zero_grad()`
+- `with device.stream(): ...` fuer explizite Queue/Stream-Wahl.
+- `distributed.init(devices=[...])`, `distributed.all_reduce(tensor)`
+- Graph/Pipeline: `with graph(): y = model(x)` erzeugt statischen Graph; `executor.run(graph, inputs)` uebersetzt in Pipeline mit minimaler Python-Beteiligung.
+## High-Level Interface (Plan)
+- `netcl.Tensor`: Zentrale Datenstruktur; Flags fuer `requires_grad`; Methoden `to_host()`, `from_host()`, `from_shape(pool=...)`.
+- `netcl.autograd`:
+  - `tensor(x, requires_grad=False)` -> Node
+  - Ops: `add`, `relu`, `bias_add`, `matmul_op` (weitere Ops folgen: conv2d, softmax, reduce_sum)
+  - `Tape.backward(loss, grad=None)` mit Grad-Akkumulation und ones_like-Seed.
+- Optimizer (geplant):
+  - Basis: `Optimizer(params, lr)`, API: `step()`, `zero_grad()`, `clip_grad(norm)` optional.
+  - Implementierungen: `SGD(lr, momentum=0, weight_decay=0)`, `Adam(lr, betas=(0.9,0.999), eps=1e-8, weight_decay=0)`.
+  - Params sind `Tensor` mit `requires_grad=True`; Grad-Lesen via `tensor.grad`.
+- `netcl.ops` (High-Level Wrappers auf Kernels):
+  - Elementwise: `elementwise_binary(expr)`, `relu`, `bias_add`
+  - Reduktionen: `reduce_sum(x, axis=None/0/1)`
+  - Softmax: `softmax(x, axis=1)`
+  - Matmul: `matmul(a, b, pool=None)`
+  - Conv: `conv2d(x, w, pool=None)` (NCHW, stride=1, pad=0 als Start)
+- Layers (geplant):
+  - `Linear(in_features, out_features, bias=True)` -> nutzt `matmul + bias_add`; init Xavier/kaiming; Parameter sind Tensoren.
+  - `Conv2d(in_channels, out_channels, kernel_size, stride=1, padding=0)` -> nutzt `conv2d`; Parameter: weights (+ optional bias).
+  - Aktivierungen: `ReLU`, `Sigmoid`, `Tanh`, `LeakyReLU`, `Dropout(p)`.
+  - Pooling: `max_pool2d`.
+  - `Sequential(*layers)` -> Vorwaertsverkettung, sammelt Parameter fuer Optimizer.
+- Convenience-Funktionen:
+  - Initializer: `xavier_uniform(tensor)`, `kaiming_uniform(tensor)`.
+  - Losses: `mse_loss(pred, target)`, `cross_entropy(logits, targets)` (Softmax + NLL).
+  - Data movement: `to_host()`, `from_host()`, `to(device)` (spaeter).
+  - Graph shortcuts: `Graph.from_layers(layers, input_tensors)` baut Ops-Graph aus Layer-Liste.
+- Serialisierung/Checkpointing (ohne andere AI-Framework-Imports):
+  - Format-Vorschlag: Metadaten als JSON (Layer-Typ, Shapes, Hyperparams), Gewichte als `.npz` (NumPy-kompatibel) oder Binär-Blob pro Tensor.
+  - API: `save(model, path)` schreibt JSON+Gewichte; `load(path, device=None)` rekonstruiert Layer/Tensoren; kompatibel mit Lazy-Loading fuer grosse Gewichte.
+  - Graph-Snapshots: `save_graph(graph, path)` speichert Op-Listen und Topologie; `load_graph(path)` rekonstruiert fuer Executor.
+  - Versionierung: Schema-Version im JSON, Hash pro Gewicht fuer Integritaet.
+- Training Loop (geplant Muster):
+  ```
+  tape = Tape()
+  model = Sequential(Linear(784, 256), ReLU(), Linear(256, 10))
+  opt = Adam(model.parameters(), lr=1e-3)
+  out = model(tensor(x, True), tape=tape)
+  loss = cross_entropy(out, tensor(targets))
+  tape.backward(loss)
+  opt.step(); opt.zero_grad()
+  ```
+- Progress/Monitoring:
+  - `ProgressBar(total, epoch=None)` aus `netcl.utils.progress`; `update(step, info={"loss":..., "acc":..., "it/s":...})` zeigt animierten Balken (it/s, ETA, Epochen).
+- Graph/Pipeline:
+  - `Graph.add_op(name, fn, inputs, shape, dtype)` -> TensorRef
+  - `GraphExecutor.run(graph)` mit Toposort, optionaler `fusion_hook`, BufferPool-Reuse, (spaeter) async overlap.
+- Memory/Pooling:
+  - `BufferPool` bucket-basiert; Tensor speichert `pool_handle`; Executor gibt Buffers nach Refcount frei.
+- Distributed (Platzhalter):
+  - API-Signaturen: `all_reduce`, `broadcast`, `scatter`, `gather` (Implementierung folgt).
+- Profiling:
+  - `EventTimer.time_event(event)` -> Dauer in ms; spaeter: Hook in Executor.
+## High-Level Usage-Fluss (geplant)
+1) Daten auf Device laden: `tx = Tensor.from_host(queue, x, dtype="float32")`.
+2) Modelldefinition mit Autograd-Nodes: `tape = Tape(); y = relu(matmul_op(tensor(tx, True), tensor(w, True), tape=tape))`.
+3) Loss berechnen (zu ergaenzen: MSE/CrossEntropy Ops) und `tape.backward(loss)`.
+4) Parameter-Update (zu ergaenzen: Optimizer-API).
+5) Fuer statische Ausfuehrung: Graph aufbauen (`Graph`), Pipeline aus Ops, Executor run -> minimale Python-Interaktion.
+## Profiling/Tracing & Debug
+- Event-basierte Timings pro Kernel/Copy; Bandbreitenmessung; Roofline-Hinweise.
+- Kernel-Stats (Auslastung, Workgroup-Groesse, Registerdruck falls verfuegbar).
+- Debug-Modus: zusaetzliche Checks (NaN/Inf, Bounds, Shape/Stride-Assertions), deterministische Reduktionen.
+## Testplan
+- Unit-Tests:
+  - Kernels gegen NumPy/Referenz: Matmul, Conv, Reduce, Softmax, LayerNorm.
+  - Gradient Checks (finite differences) pro Op.
+  - Memory Pool: Reuse, Fragmentation, Leak-Detection.
+  - RNG: deterministische Sequenzen pro Seed/Device.
+- Kernel-Generator: erzeugte Kernels aus Primitives muessen korrekt und deterministisch sein; AST/Template-Expansion Tests.
+- Integrationstests:
+  - Mehr-Op-Graph mit aktivierter Fusion; Ergebnisvergleich zu NumPy/PyTorch.
+  - Autograd ueber kleine Netze (MLP, CNN) mit Loss; Gradients gegen PyTorch.
+  - Multi-GPU: AllReduce-Korrektheit (sum/mean), Sharded Batch; deterministische Runs.
+  - Overlap-Test: Compute+Copy parallel (Timing via Events).
+- Performance-/Regressions-Tests:
+  - Benchmarks pro Device: GEMM-GFLOPs, Conv-Throughput, Bandbreite.
+  - Autotuning/Hyperstrategie: waehlt beste Config (Tiling/Partitionierung) und bleibt korrekt.
+- Pipeline/Executor: Test, dass ein Graph-zu-Pipeline-Plan mit Buffer-Reuse erzeugt wird und zur Laufzeit nur minimalen Python-Overhead verursacht (Timing-basierte Smoke-Tests).
+- Stress-/Robustheitstests:
+  - Grosse Tensors, viele kleine Kernels (Scheduler/Pool-Stabilitaet).
+  - Fehlerpfade: Build-Failure, OOM -> sauberes Recovery ohne Leaks.
+## Roadmap (iterativ)
+1. Core: Device/Queue-Management, Memory Pool, Tensor-Basis; Definition der rudimentaeren OpenCL-Primitives.
+2. Kernels: Elementwise, Reduce, Matmul (mit leichtem Autotuning) auf Basis der Primitives; dynamischer Kernel-Generator MVP.
+3. Autograd: Tape + Grad-Registrierung; Grund-Ops mit Backward; Grad-Check-Tests.
+4. Ops: Conv2D (im2col), Softmax, Normen; Fusion-Generator erweitert Hyperstrategie (Layout/Tiling-Auswahl).
+5. Graph/Executor: Graph-Capture (ANN-Style), Buffer-Lebenszeit-Planung, Pipeline-Executor mit minimaler Python-Interaktion, Memory-Pooling-Integration.
+6. Distributed: AllReduce/Broadcast (Host-Relay zuerst, spaeter direkte Pfade); dynamische Partitionierung nach Device-Profilen.
+7. Profiling/Tracing + Benchmark-Suite; Ressourcenselektions-Heuristiken evaluieren.
+8. Stabilisierung: Tests/Performance-Regressions, deterministische Modi, BF16/FP16-Support; Persistenz von Tuning/Hyperstrategie-Entscheidungen.
+## Offene Fragen / Entscheidungen
+- Mixed Precision Prioritaet: BF16 vs. FP16 Verfuegbarkeit auf Zielgeraeten?
+- Autograd-Modell: nur Eager oder zusaetzlich Graph-basiert mit JIT-Fusion?
+- Kommunikationspfad Multi-GPU: reicht Host-Relay oder braucht es P2P/OpenCL-Extensions?
+- Zielgeraete: nur GPUs oder auch CPU/FPGA (beeinflusst Kernel-Optimierung und Workgroup-Strategie)?

netcl-0.1.0/README_PACKAGE.md ADDED Viewed

@@ -0,0 +1,66 @@
+# netcl – PyOpenCL Deep Learning Playground
+`netcl` ist ein experimentelles Deep-Learning-Framework auf Basis von PyOpenCL. Es kombiniert low-level Kernel (Conv/Matmul/Elementwise) mit einer einfachen Autograd-Engine und einer High-Level API (Module, Trainer, Serializer), ohne Abhängigkeiten zu anderen DL-Frameworks.
+## Installation
+```bash
+pip install .
+```
+Voraussetzungen: Python ≥ 3.10, NumPy, PyOpenCL und ein verfügbares OpenCL-Gerät.
+## Schnelles Beispiel (MNIST-Mini-MLP)
+```python
+import numpy as np
+from netcl.core.device import manager
+from netcl.nn.layers import Sequential, Flatten, Linear, ReLU
+from netcl import autograd as ag
+from netcl.optim import Adam
+from netcl.core.tensor import Tensor
+dev = manager.default()
+q = dev.queue
+model = Sequential(
+    Flatten(),
+    Linear(q, in_features=28*28, out_features=128),
+    ReLU(),
+    Linear(q, in_features=128, out_features=10),
+)
+opt = Adam(model.parameters(), lr=5e-3)
+def one_hot(y, n=10):
+    oh = np.zeros((y.shape[0], n), dtype=np.float32)
+    oh[np.arange(y.shape[0]), y] = 1
+    return oh
+xb = np.random.randn(32, 1, 28, 28).astype(np.float32)
+yb = one_hot(np.random.randint(0, 10, size=(32,)))
+tape = ag.Tape()
+ag.set_current_tape(tape)
+x = ag.tensor(Tensor.from_host(q, xb))
+y = ag.tensor(Tensor.from_host(q, yb))
+logits = model(x)
+loss = ag.cross_entropy(logits, y)
+tape.backward(loss)
+opt.step(); opt.zero_grad()
+ag.set_current_tape(None)
+```
+## Kernfeatures
+- **Autograd**: Tape-basiert, elementare Ops (Matmul, Conv2d, Pooling, Elementwise) mit Backward.
+- **Module/High-Level API**: `Linear`, `Conv2d`, `BatchNorm2d`, `Sequential`, `@model`-Dekorator, Trainer.
+- **Optimierungen**: Conv-Algo-Heuristik/Autotuning, optional Mixed Precision, Buffer-Pool.
+- **Serialization**: Speichert `Sequential`-Modelle als JSON (Architektur) + NPZ (Gewichte) via `netcl.io.serialization`.
+## Speichern & Laden eines Modells
+```python
+from netcl.io.serialization import save_model, load_model
+save_model(model, "checkpoints/mnist_mlp")
+model2 = load_model("checkpoints/mnist_mlp")  # queue auto="default"
+```
+## Hinweise
+- Tests werden nicht mitinstalliert. Für lokale Entwicklung: `python -m pytest`.
+- Für Performance: Batch-Größe erhöhen, Augment minimieren, optional Mixed Precision (`Trainer(..., mixed_precision=True)`).
+- Conv-Algorithmen wählen automatisch optimierte Pfade; env-Flags wie `NETCL_CONV_AUTOTUNE=1` aktivieren Tuning.

netcl-0.1.0/netcl/__init__.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""
+netcl: PyOpenCL-based experimentation framework.
+Importing the package loads the ``core``, ``ops``, ``autograd``,
+``distributed``, ``runtime``, ``profiling``, ``nn``, ``optim`` and ``io``
+subpackages and re-exports a few ops from ``netcl.ops`` at the top level.
+"""
+from . import core, ops, autograd, distributed, runtime, profiling
+from .ops import (
+    matmul,
+    build_matmul_kernel,
+    elementwise_binary,
+    relu,
+    bias_add,
+    reduce_sum,
+    softmax,
+    conv2d,
+)
+from . import nn, optim, io
+# Public names: the subpackages first, then the ops re-exported from ``.ops``.
+__all__ = [
+    "core",
+    "ops",
+    "autograd",
+    "distributed",
+    "runtime",
+    "profiling",
+    "nn",
+    "optim",
+    "io",
+    "matmul",
+    "build_matmul_kernel",
+    "elementwise_binary",
+    "relu",
+    "bias_add",
+    "reduce_sum",
+    "softmax",
+    "conv2d",
+]

netcl-0.1.0/netcl/amp.py ADDED Viewed

@@ -0,0 +1,147 @@
+from __future__ import annotations
+from dataclasses import dataclass
+from typing import Optional, Sequence
+try:
+    import numpy as np  # type: ignore
+except ImportError:  # pragma: no cover
+    np = None
+from netcl.core.tensor import Tensor
+from netcl.core.tensor import _np_dtype  # type: ignore
+_AUTOCAST_ENABLED = False
+@dataclass
+class GradScaler:
+    """
+    Dynamic loss scaler for mixed-precision training.
+    The loss is multiplied by ``scale`` before backward; ``step`` divides the
+    gradients by the same factor again and only runs the optimizer when every
+    gradient is finite. The scale shrinks by ``backoff_factor`` after a
+    non-finite step and grows by ``growth_factor`` after every
+    ``growth_interval`` successful steps. A ``growth_interval`` of 0 disables
+    growth. With ``enabled=False`` all methods pass through.
+    """
+    init_scale: float = 2.0**16
+    growth_factor: float = 2.0
+    backoff_factor: float = 0.5
+    growth_interval: int = 2000
+    enabled: bool = True
+    def __post_init__(self):
+        self._scale = self.init_scale
+        self._growth_tracker = 0
+    @property
+    def scale(self) -> float:
+        """Current loss-scale factor."""
+        return self._scale
+    def scale_loss(self, loss: Tensor) -> Tensor:
+        """Return ``loss`` multiplied by the current scale (``loss`` itself when disabled)."""
+        if not self.enabled:
+            return loss
+        from netcl.ops.elementwise import elementwise_binary
+        return elementwise_binary(loss, loss, expression=f"MUL(v0, {float(self._scale)})")
+    def unscale_grads(self, params: Sequence[Tensor]) -> bool:
+        """
+        Divide the gradients of ``params`` by the current scale.
+        Returns True when a non-finite gradient was found; in that case no
+        gradient is modified. Returns False otherwise (also when disabled).
+        Raises ImportError when gradients must be inspected but numpy is missing.
+        """
+        if not self.enabled:
+            return False
+        # Materialize once: the parameters are visited twice (check, then
+        # rescale), which would silently skip the rescale for a one-shot iterable.
+        with_grad = [p for p in params if p.grad is not None]
+        if not with_grad:
+            return False
+        if np is None:
+            raise ImportError("numpy is required to check gradients for inf/nan")
+        # any() stops at the first offending gradient, so later gradients are
+        # not copied to the host once an overflow is known.
+        if any(np.any(~np.isfinite(p.grad.to_host())) for p in with_grad):
+            return True
+        inv_scale = 1.0 / self._scale
+        from netcl.ops.elementwise import elementwise_binary
+        for p in with_grad:
+            p.grad = elementwise_binary(p.grad, p.grad, expression=f"MUL(v0, {inv_scale})")
+        return False
+    def step(self, optimizer, params: Sequence[Tensor]):
+        """
+        Unscale the gradients and run ``optimizer.step()`` if they are finite.
+        On a non-finite gradient the optimizer step is skipped, the scale is
+        multiplied by ``backoff_factor`` and the growth counter is reset.
+        """
+        if not self.enabled:
+            optimizer.step()
+            return
+        found_inf = self.unscale_grads(params)
+        if not found_inf:
+            optimizer.step()
+            self._growth_tracker += 1
+            # A zero interval means "never grow"; it must not reach the modulo.
+            if self.growth_interval and self._growth_tracker % self.growth_interval == 0:
+                self._scale *= self.growth_factor
+        else:
+            self._scale *= self.backoff_factor
+            self._growth_tracker = 0
+    def update(self):
+        """No-op kept for API compatibility; the scale is adjusted inside ``step``."""
+        pass
+def supports_fp16(queue) -> bool:
+    """
+    Report whether the queue's device lists the ``cl_khr_fp16`` extension.
+    Any error while reading ``queue.device.extensions`` counts as "not supported".
+    """
+    try:
+        extensions = queue.device.extensions
+        has_half = "cl_khr_fp16" in extensions
+    except Exception:
+        return False
+    return has_half
+def autocast_enabled(profile_supports_fp16: bool) -> bool:
+    """
+    Decide whether autocast may be used for a device profile.
+    The profile's fp16 capability flag is returned unchanged.
+    """
+    decision = profile_supports_fp16
+    return decision
+class autocast:
+    """
+    Context manager that switches the module-level autocast flag.
+    On entry the previous flag value is remembered; on exit it is restored.
+    When a ``device_queue`` is given, autocast is only switched on if that
+    device reports fp16 support; without a queue the device is assumed capable.
+    """
+    def __init__(self, enabled: bool = True, device_queue=None):
+        self.enabled, self.device_queue = enabled, device_queue
+        self.prev = False
+        self._capable = True
+    def __enter__(self):
+        global _AUTOCAST_ENABLED
+        self.prev = _AUTOCAST_ENABLED
+        if self.enabled:
+            # Probe the device only when a queue was supplied.
+            if self.device_queue is not None:
+                self._capable = supports_fp16(self.device_queue)
+            _AUTOCAST_ENABLED = self.enabled and self._capable
+        else:
+            _AUTOCAST_ENABLED = False
+        return self
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        global _AUTOCAST_ENABLED
+        # Restore whatever was active before entering; exceptions propagate.
+        _AUTOCAST_ENABLED = self.prev
+        return False
+def is_autocast_enabled() -> bool:
+    """Return the current value of the module-level autocast flag."""
+    current = _AUTOCAST_ENABLED
+    return current
+def maybe_cast_tensor(t: Tensor) -> Tensor:
+    """
+    Return a float16 copy of ``t`` when autocast applies, otherwise ``t`` itself.
+    A cast happens only if autocast is enabled, ``t`` has dtype "float" or
+    "float32", and the tensor's queue reports fp16 support. The conversion
+    goes through a host copy (``to_host`` followed by ``Tensor.from_host``).
+    """
+    wants_cast = _AUTOCAST_ENABLED and t.dtype in ("float", "float32")
+    # only cast if device can handle fp16
+    if not (wants_cast and supports_fp16(t.queue)):
+        return t
+    half_values = t.to_host().astype(np.float16)
+    return Tensor.from_host(t.queue, half_values, dtype="float16")
+def master_param(param: Tensor) -> Tensor:
+    """
+    Return the tensor an optimizer should treat as the master copy of ``param``.
+    For dtype "float16"/"half" a new float32 tensor is created from the host
+    copy of ``param``; any other parameter is returned as is. In both cases
+    the returned tensor gets a ``_model_param`` attribute pointing at ``param``.
+    """
+    if param.dtype not in ("float16", "half"):
+        # Not half precision: the parameter serves as its own master copy.
+        param._model_param = param
+        return param
+    fp32_values = param.to_host().astype(np.float32)
+    master = Tensor.from_host(param.queue, fp32_values, dtype="float32")
+    master._model_param = param
+    return master

netcl-0.1.0/netcl/autograd/__init__.py ADDED Viewed

@@ -0,0 +1,86 @@
+"""
+Autograd: Node/Tape plus op wrappers.
+Re-exports the engine primitives from ``.engine``, the ``debug_tape`` helper
+from ``.debug`` and the op wrappers from ``.ops``.
+"""
+from .engine import Node, Tape, apply_op, no_grad, set_current_tape, get_current_tape
+from .debug import debug_tape
+from .ops import (
+    tensor,
+    add,
+    relu,
+    bias_add,
+    matmul_op,
+    sub,
+    mse_loss,
+    sigmoid,
+    tanh,
+    leaky_relu,
+    gelu,
+    swish,
+    elu,
+    softplus,
+    hard_sigmoid,
+    hard_swish,
+    clamp,
+    hard_tanh,
+    prelu,
+    hinge_loss,
+    l1_loss,
+    l2_loss,
+    depthwise_conv2d,
+    batch_norm2d,
+    layer_norm,
+    pad2d,
+    group_norm,
+    global_avg_pool2d,
+    cross_entropy,
+    conv2d,
+    flatten,
+    max_pool2d,
+    dropout,
+    avg_pool2d,
+)
+# Public names: engine primitives, the op wrappers, then the debug and tape helpers.
+__all__ = [
+    "Node",
+    "Tape",
+    "apply_op",
+    "no_grad",
+    "tensor",
+    "add",
+    "relu",
+    "bias_add",
+    "matmul_op",
+    "sub",
+    "mse_loss",
+    "sigmoid",
+    "tanh",
+    "leaky_relu",
+    "gelu",
+    "swish",
+    "elu",
+    "softplus",
+    "hard_sigmoid",
+    "hard_swish",
+    "clamp",
+    "hard_tanh",
+    "prelu",
+    "hinge_loss",
+    "l1_loss",
+    "l2_loss",
+    "depthwise_conv2d",
+    "batch_norm2d",
+    "layer_norm",
+    "pad2d",
+    "group_norm",
+    "global_avg_pool2d",
+    "cross_entropy",
+    "conv2d",
+    "flatten",
+    "max_pool2d",
+    "dropout",
+    "avg_pool2d",
+    "debug_tape",
+    "set_current_tape",
+    "get_current_tape",
+]

netcl-0.1.0/netcl/autograd/debug.py ADDED Viewed

@@ -0,0 +1,12 @@
+from __future__ import annotations
+from contextlib import contextmanager
+from typing import Iterator
+@contextmanager
+def debug_tape(tape) -> Iterator:
+    """
+    Hand the given tape to a ``with`` block for debugging/inspection.
+    The tape is yielded unchanged; nothing is set up or torn down around the block.
+    """
+    yield tape