pycograd 0.0.1__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57) hide show
  1. pycograd-0.0.1/MANIFEST.in +2 -0
  2. pycograd-0.0.1/PKG-INFO +324 -0
  3. pycograd-0.0.1/README.md +263 -0
  4. pycograd-0.0.1/docs/HISTORY.rst +117 -0
  5. pycograd-0.0.1/docs/LICENSE.txt +11 -0
  6. pycograd-0.0.1/pycograd/__init__.py +390 -0
  7. pycograd-0.0.1/pycograd/_constraints.py +148 -0
  8. pycograd-0.0.1/pycograd/_dims.py +453 -0
  9. pycograd-0.0.1/pycograd/_typing.py +97 -0
  10. pycograd-0.0.1/pycograd/_version.py +669 -0
  11. pycograd-0.0.1/pycograd/ad_graph.py +372 -0
  12. pycograd-0.0.1/pycograd/backends/__init__.py +302 -0
  13. pycograd-0.0.1/pycograd/backends/abstract_backend.py +73 -0
  14. pycograd-0.0.1/pycograd/backends/cupy_backend.py +47 -0
  15. pycograd-0.0.1/pycograd/backends/jax_backend.py +207 -0
  16. pycograd-0.0.1/pycograd/backends/mps_backend.py +47 -0
  17. pycograd-0.0.1/pycograd/backends/numpy_backend.py +66 -0
  18. pycograd-0.0.1/pycograd/backends/tf_backend.py +407 -0
  19. pycograd-0.0.1/pycograd/backends/torch_backend.py +482 -0
  20. pycograd-0.0.1/pycograd/batching.py +638 -0
  21. pycograd-0.0.1/pycograd/capture.py +527 -0
  22. pycograd-0.0.1/pycograd/checkpoint.py +420 -0
  23. pycograd-0.0.1/pycograd/compile.py +199 -0
  24. pycograd-0.0.1/pycograd/cost.py +548 -0
  25. pycograd-0.0.1/pycograd/data.py +115 -0
  26. pycograd-0.0.1/pycograd/dtypes.py +152 -0
  27. pycograd-0.0.1/pycograd/examples/__init__.py +12 -0
  28. pycograd-0.0.1/pycograd/examples/__main__.py +242 -0
  29. pycograd-0.0.1/pycograd/examples/models.py +953 -0
  30. pycograd-0.0.1/pycograd/export.py +121 -0
  31. pycograd-0.0.1/pycograd/extension.py +137 -0
  32. pycograd-0.0.1/pycograd/forward.py +683 -0
  33. pycograd-0.0.1/pycograd/functional.py +808 -0
  34. pycograd-0.0.1/pycograd/ops.py +1575 -0
  35. pycograd-0.0.1/pycograd/optimizers.py +284 -0
  36. pycograd-0.0.1/pycograd/params.py +882 -0
  37. pycograd-0.0.1/pycograd/passes.py +580 -0
  38. pycograd-0.0.1/pycograd/random.py +92 -0
  39. pycograd-0.0.1/pycograd/remat.py +779 -0
  40. pycograd-0.0.1/pycograd/shapes.py +1174 -0
  41. pycograd-0.0.1/pycograd/tensor.py +650 -0
  42. pycograd-0.0.1/pycograd/trace.py +420 -0
  43. pycograd-0.0.1/pycograd/tracer.py +531 -0
  44. pycograd-0.0.1/pycograd/training.py +136 -0
  45. pycograd-0.0.1/pycograd/transforms.py +1078 -0
  46. pycograd-0.0.1/pycograd/transpose.py +167 -0
  47. pycograd-0.0.1/pycograd/tree.py +109 -0
  48. pycograd-0.0.1/pycograd/version.py +18 -0
  49. pycograd-0.0.1/pycograd.egg-info/PKG-INFO +324 -0
  50. pycograd-0.0.1/pycograd.egg-info/SOURCES.txt +56 -0
  51. pycograd-0.0.1/pycograd.egg-info/dependency_links.txt +1 -0
  52. pycograd-0.0.1/pycograd.egg-info/not-zip-safe +1 -0
  53. pycograd-0.0.1/pycograd.egg-info/requires.txt +44 -0
  54. pycograd-0.0.1/pycograd.egg-info/top_level.txt +1 -0
  55. pycograd-0.0.1/pyproject.toml +42 -0
  56. pycograd-0.0.1/setup.cfg +95 -0
  57. pycograd-0.0.1/setup.py +6 -0
@@ -0,0 +1,2 @@
1
+ include README.md docs/HISTORY.rst docs/LICENSE.txt
2
+ recursive-exclude test *
@@ -0,0 +1,324 @@
1
+ Metadata-Version: 2.4
2
+ Name: pycograd
3
+ Version: 0.0.1
4
+ Summary: A small, readable reverse-mode autograd library built on numpy and pyccolo
5
+ Home-page: https://github.com/smacke/pycograd
6
+ Author: Stephen Macke
7
+ Author-email: stephen.macke@gmail.com
8
+ License: BSD-3-Clause
9
+ Classifier: Development Status :: 3 - Alpha
10
+ Classifier: Intended Audience :: Developers
11
+ Classifier: Intended Audience :: Science/Research
12
+ Classifier: License :: OSI Approved :: BSD License
13
+ Classifier: Natural Language :: English
14
+ Classifier: Topic :: Scientific/Engineering :: Artificial Intelligence
15
+ Classifier: Programming Language :: Python :: 3.10
16
+ Classifier: Programming Language :: Python :: 3.11
17
+ Classifier: Programming Language :: Python :: 3.12
18
+ Classifier: Programming Language :: Python :: 3.13
19
+ Classifier: Programming Language :: Python :: 3.14
20
+ Requires-Python: >=3.10
21
+ Description-Content-Type: text/markdown; charset=UTF-8
22
+ License-File: docs/LICENSE.txt
23
+ Requires-Dist: ml_dtypes
24
+ Requires-Dist: numpy
25
+ Requires-Dist: pipescript>=0.0.22
26
+ Requires-Dist: pyccolo>=0.0.87
27
+ Provides-Extra: test
28
+ Requires-Dist: black<24; extra == "test"
29
+ Requires-Dist: ipython; extra == "test"
30
+ Requires-Dist: isort; extra == "test"
31
+ Requires-Dist: mypy; extra == "test"
32
+ Requires-Dist: pytest; extra == "test"
33
+ Requires-Dist: pytest-cov; extra == "test"
34
+ Requires-Dist: ruff; extra == "test"
35
+ Provides-Extra: jax
36
+ Requires-Dist: jax; extra == "jax"
37
+ Provides-Extra: torch
38
+ Requires-Dist: torch; extra == "torch"
39
+ Provides-Extra: tf
40
+ Requires-Dist: tensorflow; extra == "tf"
41
+ Provides-Extra: cupy
42
+ Requires-Dist: cupy; extra == "cupy"
43
+ Provides-Extra: onnx
44
+ Requires-Dist: torch; extra == "onnx"
45
+ Requires-Dist: onnx; extra == "onnx"
46
+ Requires-Dist: onnxruntime; extra == "onnx"
47
+ Provides-Extra: dev
48
+ Requires-Dist: build; extra == "dev"
49
+ Requires-Dist: pycln; extra == "dev"
50
+ Requires-Dist: twine; extra == "dev"
51
+ Requires-Dist: setuptools-git-versioning; extra == "dev"
52
+ Requires-Dist: versioneer; extra == "dev"
53
+ Requires-Dist: black<24; extra == "dev"
54
+ Requires-Dist: ipython; extra == "dev"
55
+ Requires-Dist: isort; extra == "dev"
56
+ Requires-Dist: mypy; extra == "dev"
57
+ Requires-Dist: pytest; extra == "dev"
58
+ Requires-Dist: pytest-cov; extra == "dev"
59
+ Requires-Dist: ruff; extra == "dev"
60
+ Dynamic: license-file
61
+
62
+ # pycograd
63
+
64
+ [![pycograd](https://github.com/smacke/pycograd/actions/workflows/ci.yml/badge.svg)](https://github.com/smacke/pycograd/actions/workflows/ci.yml)
65
+ [![checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
66
+ [![License: BSD3](https://img.shields.io/badge/License-BSD3-maroon.svg)](https://opensource.org/licenses/BSD-3-Clause)
67
+ [![Python versions](https://img.shields.io/pypi/pyversions/pycograd.svg)](https://pypi.org/project/pycograd)
68
+ [![PyPI version](https://img.shields.io/pypi/v/pycograd.svg)](https://pypi.org/project/pycograd)
69
+
70
+ A small, readable reverse-mode automatic-differentiation library, built on numpy
71
+ and [pyccolo](https://github.com/smacke/pyccolo). Write *ordinary* numeric Python
72
+ — including `numpy` calls like `np.exp`, `np.dot`, `np.sum` and operators like
73
+ `@` — and get correct gradients, with **no special "autodiff namespace."**
74
+
75
+ It's small enough to read end to end — the kind of autograd you can use to
76
+ explain backprop on one slide — but the machinery around the core scales *up*:
77
+ auto-batching (`vmap`), forward-mode (`jvp`) and Hessians, graph capture and
78
+ optimization, gradient checkpointing, and a one-call **compile to PyTorch / JAX /
79
+ TensorFlow** — enough to write a Transformer or an RWKV recurrent net (see
80
+ [`notebooks/`](notebooks/)) and have one forward pass serve all of them.
81
+
82
+ There are **two co-equal ways to write a model**, and the same transforms work on
83
+ both:
84
+
85
+ - a **functional** surface — plain numpy functions you hand to `grad` /
86
+ `value_and_grad` / `vmap`;
87
+ - an **ambient-weights DSL** — a `params{ ... } as weights:` block plus `|>`
88
+ pipelines, so you write the forward *once*, by bare name, and `weights.grad`
89
+ differentiates it.
90
+
91
+ ## Install
92
+
93
+ ```bash
94
+ pip install pycograd
95
+ ```
96
+
97
+ ## Quickstart
98
+
99
+ ### Functional — `grad` over ordinary numpy
100
+
101
+ Hand any numpy function to `grad` / `value_and_grad`; the array argument is lifted
102
+ onto the tape for you.
103
+
104
+ ```python
105
+ import numpy as np
106
+ from pycograd import value_and_grad
107
+
108
+ def f(x):
109
+     return np.sum(np.sin(x * x)) # ordinary numpy -- and it differentiates
110
+
111
+ x = np.array([0.5, 1.0, 1.5])
112
+ value, (g,) = value_and_grad(f)(x)
113
+ # g == 2 * x * cos(x * x)
114
+ ```
115
+
116
+ ### The ambient-weights DSL — write the forward once
117
+
118
+ In a notebook or IPython session, `%load_ext pycograd` turns on the DSL: a
119
+ `params{ ... } as weights:` block builds a parameter pytree and injects the
120
+ weights as ambient names, and `|>` pipelines differentiate when a model runs
121
+ through them. Here is a 2-layer MLP classifier trained by SGD:
122
+
123
+ ```python
124
+ %load_ext pycograd
125
+ import numpy as np
126
+ from pycograd import relu, softmax, cross_entropy
127
+
128
+ rng = np.random.default_rng(42)
129
+ X, Y = ... # features and one-hot labels (3 classes)
130
+
131
+ with params{
132
+ w1 = 0.3 * rng.standard_normal((2, 16)); b1 = np.zeros(16)
133
+ w2 = 0.3 * rng.standard_normal((16, 3)); b2 = np.zeros(3)
134
+ } as weights:
135
+ logits = $ |> $ @ w1 + b1 |> relu |> $ @ w2 + b2 # the model, written once
136
+ forward = $ |> logits |> softmax
137
+ obj = |> X |> logits |> cross_entropy($, Y) # mean softmax cross-entropy
138
+ for _ in range(10):
139
+ value, grads = weights.grad(obj) # bind weights -> Vars, backprop
140
+ weights.step(grads, 0.5) # in-place SGD
141
+ ```
142
+
143
+ Weights are referenced by bare name; `relu` / `softmax` / `cross_entropy` are
144
+ first-class, finite-difference-checked fused ops imported straight from
145
+ `pycograd` (there is no op library to import for the *model* — a linear layer is
146
+ just `$ @ w + b`). `frozen[...]` holds a weight fixed (its gradient comes back
147
+ `None`), `tied(...)` shares one. `weights.grad` only *computes* gradients, so any
148
+ optimizer can consume them — swap the loop for `train(weights, obj, 200,
149
+ Adam(lr=cosine_decay(0.05, 200)))`. The very same `forward` is what `vmap` and
150
+ `compile_to` consume below.
151
+
152
+ ## One forward, many uses
153
+
154
+ The payoff of writing the forward once is that the autodiff transforms compose
155
+ over it with no rewrites.
156
+
157
+ ### Per-sample gradients with `vmap`
158
+
159
+ `vmap` turns a function written for **one** example into one that runs over a
160
+ whole batch in a single vectorized pass. Composed with `grad`, it gives something
161
+ broadcasting *cannot*: the gradient of each example separately, stacked over the
162
+ batch.
163
+
164
+ ```python
165
+ from pycograd import grad, vmap
166
+
167
+ def per_example_loss(w, b, x, y): # ONE (2,) point + ONE label -> scalar
168
+     return x |> $ @ w + b |> cross_entropy($, y)
169
+
170
+ w = np.zeros((2, 3)); b = np.zeros(3)
171
+ gw, gb, _, _ = vmap(grad(per_example_loss), in_axes=(None, None, 0, 0))(w, b, X, Y)
172
+ # gw: (N, 2, 3) gb: (N, 3) -- one gradient per example
173
+ # their batch-mean is exactly the ordinary full-batch gradient
174
+ ```
175
+
176
+ Per-sample gradients are exactly what gradient clipping and DP-SGD need: bound
177
+ each example's gradient norm *before* averaging. `vmap` is one trace level in the
178
+ interpreter stack, so it composes every which way — `vmap(grad(f))` gives the
179
+ per-sample gradients above, `grad(vmap(f))` runs straight through a batched
180
+ forward, and `vmap(vmap(f))` nests.
181
+
182
+ ### Compile to PyTorch / JAX / TensorFlow
183
+
184
+ The same forward can be handed to *another framework's* autodiff. Pass
185
+ `backend=` to `weights.grad` (or `train`) and gradients come back from torch /
186
+ jax / tf instead of pycograd's numpy tape — matching to floating-point tolerance:
187
+
188
+ ```python
189
+ v_np, g_np = weights.grad(obj) # pycograd's numpy tape
190
+ for backend in ("torch", "jax", "tf"):
191
+     v_be, g_be = weights.grad(obj, backend=backend, jit=True)
192
+     worst = max(np.max(np.abs(np.asarray(g_be[k]) - np.asarray(g_np[k]))) for k in weights)
193
+     print("%-5s max|grad - grad_numpy| = %.1e" % (backend, worst)) # ~1e-6
194
+ ```
195
+
196
+ `compile_to(forward, "torch")` instead returns a function over the framework's
197
+ own tensors, and `weights.to_torch_module(forward)` / `export_torchscript` /
198
+ `export_onnx` package a trained net for shipping with no pycograd dependency. A
199
+ GRU, an attention block, or an RWKV cell written once thus trains on numpy,
200
+ batches under `vmap`, and compiles to three frameworks with zero rewrites
201
+ (see the notebooks below).
202
+
203
+ ## Shape inference
204
+
205
+ Because a net is just a numpy function, you can ask what shapes it produces
206
+ *without* training it. `eval_shape` runs the function over abstract `(shape,
207
+ dtype)` values — no data, no allocation, so a `100000×100000` matmul is sized
208
+ instantly — and `summary` tabulates the parameters:
209
+
210
+ ```python
211
+ from pycograd import eval_shape, summary, ShapeDtypeStruct as S
212
+
213
+ eval_shape(mlp_forward, S((5, 2)), S((2, 16)), S((16,)), S((16, 3)), S((3,))) # -> f64[5,3]
214
+ summary(mlp_batch_loss, params, (5, 2), (5, 3)) # per-weight table + total params
215
+ ```
216
+
217
+ Shape mismatches raise a `ShapeError` that names the op and operand shapes
218
+ (`matmul: incompatible shapes (3, 4) and (5, 6)`) instead of an opaque numpy
219
+ message; a shape that genuinely depends on data values is reported as such rather
220
+ than silently mis-sized.
221
+
222
+ ## Gradient checkpointing
223
+
224
+ The tape keeps every intermediate alive until `backward`, so a deep net can run
225
+ out of memory. `checkpoint(f)` wraps a segment so its activations are **dropped on
226
+ the forward and recomputed in backward** — trading ~one extra forward pass for a
227
+ large peak-memory drop. It's a drop-in: gradients are unchanged.
228
+
229
+ ```python
230
+ from pycograd import checkpoint, value_and_grad
231
+
232
+ def loss(x):
233
+     y = checkpoint(block)(x) # block's activations are rematerialized in backward
234
+     return np.sum(y * y)
235
+
236
+ value, (g,) = value_and_grad(loss)(x) # same gradient, less memory
237
+ ```
238
+
239
+ It composes with positional `grad` / `value_and_grad`, the ambient
240
+ `weights.grad` loop, and `vmap` (`vmap(checkpoint(f)) == checkpoint(vmap(f))`);
241
+ `f` must be deterministic in its inputs/weights, since it is re-run to recover the
242
+ activations.
243
+
244
+ ## Devices / backends
245
+
246
+ The tape runs on a pluggable **array backend** (NumPy by default). A `device(...)`
247
+ block swaps the array library the primitives, the tape, and the optimizers compute
248
+ with — so the same net trains on a GPU with no code changes, gradients and
249
+ optimizer state living on-device:
250
+
251
+ ```python
252
+ from pycograd import device, value_and_grad, Adam
253
+
254
+ with device("cupy"): # requires a CUDA GPU + cupy (`pip install pycograd[cupy]`)
255
+     value, (g,) = value_and_grad(loss)(w) # tape + grads on the GPU
256
+     w = Adam(lr=1e-3).step(w, g) # Adam moments on the GPU too
257
+ ```
258
+
259
+ CuPy mirrors NumPy's API, so the `np.exp` / `@` / `np.sum` code you already wrote
260
+ "just works." For finer control, `on_cpu[...]` / `on_device(...)` pin individual
261
+ leaves — e.g. a large embedding table on the CPU while the classifier trains on
262
+ the GPU, one autograd graph straddling both (see the device-placement notebook).
263
+ This is distinct from `compile_to`, which hands the net to *another framework's*
264
+ autodiff.
265
+
266
+ ## Graph capture & rematerialization
267
+
268
+ `capture(forward, x)` records a `|>` pipeline into a flat SSA graph you can print,
269
+ `grad_graph` differentiates it, and `optimize` runs passes over it (CSE across the
270
+ forward/backward boundary, dead-code elimination). On top of that, `plan_remat`
271
+ fits a value+gradient pass under a memory budget by deciding per activation
272
+ whether to keep, spill, or recompute it — `eval_scheduled` then runs the plan to
273
+ the identical answer. See the [graph-viz](notebooks/pycograd_graph_viz_demo.ipynb)
274
+ and [remat](notebooks/pycograd_remat_demo.ipynb) notebooks.
275
+
276
+ ## Examples & notebooks
277
+
278
+ The bundled demos (logistic regression, MLP, LayerNorm/Dropout, single-head
279
+ Transformer block, GRU/LSTM) train from scratch and are gradient-checked against
280
+ finite differences. Run them with:
281
+
282
+ ```bash
283
+ python -m pycograd.examples
284
+ ```
285
+
286
+ The [`notebooks/`](notebooks/) directory walks through the library end to end:
287
+
288
+ - [`pycograd_demo`](notebooks/pycograd_demo.ipynb) — the DSL tour: linear
289
+ classifier → MLP → highway net → self-attention → a Transformer encoder block.
290
+ - [`pycograd_vmap_demo`](notebooks/pycograd_vmap_demo.ipynb) — where `vmap`
291
+ earns its keep: per-sample gradients, gradient clipping, batched attention.
292
+ - [`pycograd_rnn_demo`](notebooks/pycograd_rnn_demo.ipynb) /
293
+ [`pycograd_rwkv_demo`](notebooks/pycograd_rwkv_demo.ipynb) — GRU/LSTM and
294
+ RWKV (trained in parallel, sampled in O(1)-per-token recurrent form).
295
+ - [`pycograd_compile_*`](notebooks/) — compile/parity against PyTorch, JAX,
296
+ TensorFlow, and Apple MPS, plus TorchScript/ONNX export.
297
+ - [`pycograd_device_placement_demo`](notebooks/pycograd_device_placement_demo.ipynb) —
298
+ a single pass split across CPU and GPU.
299
+ - [`pycograd_graph_viz_demo`](notebooks/pycograd_graph_viz_demo.ipynb) /
300
+ [`pycograd_remat_demo`](notebooks/pycograd_remat_demo.ipynb) — the graph IR,
301
+ optimization passes, and the cost-model-driven spill/remat planner.
302
+
303
+ `value_and_grad` / `grad` work the same in a notebook as anywhere else; the DSL is
304
+ the only part that needs `%load_ext pycograd`.
305
+
306
+ ## How it works
307
+
308
+ * `Var` is a reverse-mode tape node wrapping a numpy array. Arithmetic operators
309
+ are overloaded so that running a program builds a computation graph;
310
+ `Var.backward()` then walks it in reverse to accumulate gradients.
311
+
312
+ * Operator overloading alone is *not enough*. The moment user code calls a numpy
313
+ function — `np.exp(x)` — numpy's ufunc machinery takes over and the gradient
314
+ link is lost. (`Var` sets `__array_ufunc__ = None` so this fails loudly instead
315
+ of silently producing a wrong gradient.) pyccolo supplies the missing piece: its
316
+ `before_call` event lets a handler *replace the function being called*, swapping
317
+ `np.exp` for a differentiable `d_exp` transparently — so idiomatic numpy code
318
+ "just differentiates." The same trick routes scalar `math.*` through the
319
+ numpy-backed primitives, differentiates through your own helper functions by
320
+ instrumenting them on demand, and powers the `|>` DSL.
321
+
322
+ ## License
323
+
324
+ [BSD-3-Clause](docs/LICENSE.txt).
@@ -0,0 +1,263 @@
1
+ # pycograd
2
+
3
+ [![pycograd](https://github.com/smacke/pycograd/actions/workflows/ci.yml/badge.svg)](https://github.com/smacke/pycograd/actions/workflows/ci.yml)
4
+ [![checked with mypy](https://www.mypy-lang.org/static/mypy_badge.svg)](https://mypy-lang.org/)
5
+ [![License: BSD3](https://img.shields.io/badge/License-BSD3-maroon.svg)](https://opensource.org/licenses/BSD-3-Clause)
6
+ [![Python versions](https://img.shields.io/pypi/pyversions/pycograd.svg)](https://pypi.org/project/pycograd)
7
+ [![PyPI version](https://img.shields.io/pypi/v/pycograd.svg)](https://pypi.org/project/pycograd)
8
+
9
+ A small, readable reverse-mode automatic-differentiation library, built on numpy
10
+ and [pyccolo](https://github.com/smacke/pyccolo). Write *ordinary* numeric Python
11
+ — including `numpy` calls like `np.exp`, `np.dot`, `np.sum` and operators like
12
+ `@` — and get correct gradients, with **no special "autodiff namespace."**
13
+
14
+ It's small enough to read end to end — the kind of autograd you can use to
15
+ explain backprop on one slide — but the machinery around the core scales *up*:
16
+ auto-batching (`vmap`), forward-mode (`jvp`) and Hessians, graph capture and
17
+ optimization, gradient checkpointing, and a one-call **compile to PyTorch / JAX /
18
+ TensorFlow** — enough to write a Transformer or an RWKV recurrent net (see
19
+ [`notebooks/`](notebooks/)) and have one forward pass serve all of them.
20
+
21
+ There are **two co-equal ways to write a model**, and the same transforms work on
22
+ both:
23
+
24
+ - a **functional** surface — plain numpy functions you hand to `grad` /
25
+ `value_and_grad` / `vmap`;
26
+ - an **ambient-weights DSL** — a `params{ ... } as weights:` block plus `|>`
27
+ pipelines, so you write the forward *once*, by bare name, and `weights.grad`
28
+ differentiates it.
29
+
30
+ ## Install
31
+
32
+ ```bash
33
+ pip install pycograd
34
+ ```
35
+
36
+ ## Quickstart
37
+
38
+ ### Functional — `grad` over ordinary numpy
39
+
40
+ Hand any numpy function to `grad` / `value_and_grad`; the array argument is lifted
41
+ onto the tape for you.
42
+
43
+ ```python
44
+ import numpy as np
45
+ from pycograd import value_and_grad
46
+
47
+ def f(x):
48
+     return np.sum(np.sin(x * x)) # ordinary numpy -- and it differentiates
49
+
50
+ x = np.array([0.5, 1.0, 1.5])
51
+ value, (g,) = value_and_grad(f)(x)
52
+ # g == 2 * x * cos(x * x)
53
+ ```
54
+
55
+ ### The ambient-weights DSL — write the forward once
56
+
57
+ In a notebook or IPython session, `%load_ext pycograd` turns on the DSL: a
58
+ `params{ ... } as weights:` block builds a parameter pytree and injects the
59
+ weights as ambient names, and `|>` pipelines differentiate when a model runs
60
+ through them. Here is a 2-layer MLP classifier trained by SGD:
61
+
62
+ ```python
63
+ %load_ext pycograd
64
+ import numpy as np
65
+ from pycograd import relu, softmax, cross_entropy
66
+
67
+ rng = np.random.default_rng(42)
68
+ X, Y = ... # features and one-hot labels (3 classes)
69
+
70
+ with params{
71
+ w1 = 0.3 * rng.standard_normal((2, 16)); b1 = np.zeros(16)
72
+ w2 = 0.3 * rng.standard_normal((16, 3)); b2 = np.zeros(3)
73
+ } as weights:
74
+ logits = $ |> $ @ w1 + b1 |> relu |> $ @ w2 + b2 # the model, written once
75
+ forward = $ |> logits |> softmax
76
+ obj = |> X |> logits |> cross_entropy($, Y) # mean softmax cross-entropy
77
+ for _ in range(10):
78
+ value, grads = weights.grad(obj) # bind weights -> Vars, backprop
79
+ weights.step(grads, 0.5) # in-place SGD
80
+ ```
81
+
82
+ Weights are referenced by bare name; `relu` / `softmax` / `cross_entropy` are
83
+ first-class, finite-difference-checked fused ops imported straight from
84
+ `pycograd` (there is no op library to import for the *model* — a linear layer is
85
+ just `$ @ w + b`). `frozen[...]` holds a weight fixed (its gradient comes back
86
+ `None`), `tied(...)` shares one. `weights.grad` only *computes* gradients, so any
87
+ optimizer can consume them — swap the loop for `train(weights, obj, 200,
88
+ Adam(lr=cosine_decay(0.05, 200)))`. The very same `forward` is what `vmap` and
89
+ `compile_to` consume below.
90
+
91
+ ## One forward, many uses
92
+
93
+ The payoff of writing the forward once is that the autodiff transforms compose
94
+ over it with no rewrites.
95
+
96
+ ### Per-sample gradients with `vmap`
97
+
98
+ `vmap` turns a function written for **one** example into one that runs over a
99
+ whole batch in a single vectorized pass. Composed with `grad`, it gives something
100
+ broadcasting *cannot*: the gradient of each example separately, stacked over the
101
+ batch.
102
+
103
+ ```python
104
+ from pycograd import grad, vmap
105
+
106
+ def per_example_loss(w, b, x, y): # ONE (2,) point + ONE label -> scalar
107
+     return x |> $ @ w + b |> cross_entropy($, y)
108
+
109
+ w = np.zeros((2, 3)); b = np.zeros(3)
110
+ gw, gb, _, _ = vmap(grad(per_example_loss), in_axes=(None, None, 0, 0))(w, b, X, Y)
111
+ # gw: (N, 2, 3) gb: (N, 3) -- one gradient per example
112
+ # their batch-mean is exactly the ordinary full-batch gradient
113
+ ```
114
+
115
+ Per-sample gradients are exactly what gradient clipping and DP-SGD need: bound
116
+ each example's gradient norm *before* averaging. `vmap` is one trace level in the
117
+ interpreter stack, so it composes every which way — `vmap(grad(f))` gives the
118
+ per-sample gradients above, `grad(vmap(f))` runs straight through a batched
119
+ forward, and `vmap(vmap(f))` nests.
120
+
121
+ ### Compile to PyTorch / JAX / TensorFlow
122
+
123
+ The same forward can be handed to *another framework's* autodiff. Pass
124
+ `backend=` to `weights.grad` (or `train`) and gradients come back from torch /
125
+ jax / tf instead of pycograd's numpy tape — matching to floating-point tolerance:
126
+
127
+ ```python
128
+ v_np, g_np = weights.grad(obj) # pycograd's numpy tape
129
+ for backend in ("torch", "jax", "tf"):
130
+     v_be, g_be = weights.grad(obj, backend=backend, jit=True)
131
+     worst = max(np.max(np.abs(np.asarray(g_be[k]) - np.asarray(g_np[k]))) for k in weights)
132
+     print("%-5s max|grad - grad_numpy| = %.1e" % (backend, worst)) # ~1e-6
133
+ ```
134
+
135
+ `compile_to(forward, "torch")` instead returns a function over the framework's
136
+ own tensors, and `weights.to_torch_module(forward)` / `export_torchscript` /
137
+ `export_onnx` package a trained net for shipping with no pycograd dependency. A
138
+ GRU, an attention block, or an RWKV cell written once thus trains on numpy,
139
+ batches under `vmap`, and compiles to three frameworks with zero rewrites
140
+ (see the notebooks below).
141
+
142
+ ## Shape inference
143
+
144
+ Because a net is just a numpy function, you can ask what shapes it produces
145
+ *without* training it. `eval_shape` runs the function over abstract `(shape,
146
+ dtype)` values — no data, no allocation, so a `100000×100000` matmul is sized
147
+ instantly — and `summary` tabulates the parameters:
148
+
149
+ ```python
150
+ from pycograd import eval_shape, summary, ShapeDtypeStruct as S
151
+
152
+ eval_shape(mlp_forward, S((5, 2)), S((2, 16)), S((16,)), S((16, 3)), S((3,))) # -> f64[5,3]
153
+ summary(mlp_batch_loss, params, (5, 2), (5, 3)) # per-weight table + total params
154
+ ```
155
+
156
+ Shape mismatches raise a `ShapeError` that names the op and operand shapes
157
+ (`matmul: incompatible shapes (3, 4) and (5, 6)`) instead of an opaque numpy
158
+ message; a shape that genuinely depends on data values is reported as such rather
159
+ than silently mis-sized.
160
+
161
+ ## Gradient checkpointing
162
+
163
+ The tape keeps every intermediate alive until `backward`, so a deep net can run
164
+ out of memory. `checkpoint(f)` wraps a segment so its activations are **dropped on
165
+ the forward and recomputed in backward** — trading ~one extra forward pass for a
166
+ large peak-memory drop. It's a drop-in: gradients are unchanged.
167
+
168
+ ```python
169
+ from pycograd import checkpoint, value_and_grad
170
+
171
+ def loss(x):
172
+     y = checkpoint(block)(x) # block's activations are rematerialized in backward
173
+     return np.sum(y * y)
174
+
175
+ value, (g,) = value_and_grad(loss)(x) # same gradient, less memory
176
+ ```
177
+
178
+ It composes with positional `grad` / `value_and_grad`, the ambient
179
+ `weights.grad` loop, and `vmap` (`vmap(checkpoint(f)) == checkpoint(vmap(f))`);
180
+ `f` must be deterministic in its inputs/weights, since it is re-run to recover the
181
+ activations.
182
+
183
+ ## Devices / backends
184
+
185
+ The tape runs on a pluggable **array backend** (NumPy by default). A `device(...)`
186
+ block swaps the array library the primitives, the tape, and the optimizers compute
187
+ with — so the same net trains on a GPU with no code changes, gradients and
188
+ optimizer state living on-device:
189
+
190
+ ```python
191
+ from pycograd import device, value_and_grad, Adam
192
+
193
+ with device("cupy"): # requires a CUDA GPU + cupy (`pip install pycograd[cupy]`)
194
+     value, (g,) = value_and_grad(loss)(w) # tape + grads on the GPU
195
+     w = Adam(lr=1e-3).step(w, g) # Adam moments on the GPU too
196
+ ```
197
+
198
+ CuPy mirrors NumPy's API, so the `np.exp` / `@` / `np.sum` code you already wrote
199
+ "just works." For finer control, `on_cpu[...]` / `on_device(...)` pin individual
200
+ leaves — e.g. a large embedding table on the CPU while the classifier trains on
201
+ the GPU, one autograd graph straddling both (see the device-placement notebook).
202
+ This is distinct from `compile_to`, which hands the net to *another framework's*
203
+ autodiff.
204
+
205
+ ## Graph capture & rematerialization
206
+
207
+ `capture(forward, x)` records a `|>` pipeline into a flat SSA graph you can print,
208
+ `grad_graph` differentiates it, and `optimize` runs passes over it (CSE across the
209
+ forward/backward boundary, dead-code elimination). On top of that, `plan_remat`
210
+ fits a value+gradient pass under a memory budget by deciding per activation
211
+ whether to keep, spill, or recompute it — `eval_scheduled` then runs the plan to
212
+ the identical answer. See the [graph-viz](notebooks/pycograd_graph_viz_demo.ipynb)
213
+ and [remat](notebooks/pycograd_remat_demo.ipynb) notebooks.
214
+
215
+ ## Examples & notebooks
216
+
217
+ The bundled demos (logistic regression, MLP, LayerNorm/Dropout, single-head
218
+ Transformer block, GRU/LSTM) train from scratch and are gradient-checked against
219
+ finite differences. Run them with:
220
+
221
+ ```bash
222
+ python -m pycograd.examples
223
+ ```
224
+
225
+ The [`notebooks/`](notebooks/) directory walks through the library end to end:
226
+
227
+ - [`pycograd_demo`](notebooks/pycograd_demo.ipynb) — the DSL tour: linear
228
+ classifier → MLP → highway net → self-attention → a Transformer encoder block.
229
+ - [`pycograd_vmap_demo`](notebooks/pycograd_vmap_demo.ipynb) — where `vmap`
230
+ earns its keep: per-sample gradients, gradient clipping, batched attention.
231
+ - [`pycograd_rnn_demo`](notebooks/pycograd_rnn_demo.ipynb) /
232
+ [`pycograd_rwkv_demo`](notebooks/pycograd_rwkv_demo.ipynb) — GRU/LSTM and
233
+ RWKV (trained in parallel, sampled in O(1)-per-token recurrent form).
234
+ - [`pycograd_compile_*`](notebooks/) — compile/parity against PyTorch, JAX,
235
+ TensorFlow, and Apple MPS, plus TorchScript/ONNX export.
236
+ - [`pycograd_device_placement_demo`](notebooks/pycograd_device_placement_demo.ipynb) —
237
+ a single pass split across CPU and GPU.
238
+ - [`pycograd_graph_viz_demo`](notebooks/pycograd_graph_viz_demo.ipynb) /
239
+ [`pycograd_remat_demo`](notebooks/pycograd_remat_demo.ipynb) — the graph IR,
240
+ optimization passes, and the cost-model-driven spill/remat planner.
241
+
242
+ `value_and_grad` / `grad` work the same in a notebook as anywhere else; the DSL is
243
+ the only part that needs `%load_ext pycograd`.
244
+
245
+ ## How it works
246
+
247
+ * `Var` is a reverse-mode tape node wrapping a numpy array. Arithmetic operators
248
+ are overloaded so that running a program builds a computation graph;
249
+ `Var.backward()` then walks it in reverse to accumulate gradients.
250
+
251
+ * Operator overloading alone is *not enough*. The moment user code calls a numpy
252
+ function — `np.exp(x)` — numpy's ufunc machinery takes over and the gradient
253
+ link is lost. (`Var` sets `__array_ufunc__ = None` so this fails loudly instead
254
+ of silently producing a wrong gradient.) pyccolo supplies the missing piece: its
255
+ `before_call` event lets a handler *replace the function being called*, swapping
256
+ `np.exp` for a differentiable `d_exp` transparently — so idiomatic numpy code
257
+ "just differentiates." The same trick routes scalar `math.*` through the
258
+ numpy-backed primitives, differentiates through your own helper functions by
259
+ instrumenting them on demand, and powers the `|>` DSL.
260
+
261
+ ## License
262
+
263
+ [BSD-3-Clause](docs/LICENSE.txt).