PyPI - tinygrad - Versions diffs - 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl - Mend

tinygrad 0.10.2-py3-none-any.whl → 0.11.0-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (131) hide show

tinygrad/__init__.py +1 -1
tinygrad/apps/llm.py +206 -0
tinygrad/codegen/__init__.py +116 -0
tinygrad/codegen/devectorizer.py +315 -172
tinygrad/codegen/expander.py +8 -16
tinygrad/codegen/gpudims.py +89 -0
tinygrad/codegen/linearize.py +205 -203
tinygrad/codegen/lowerer.py +92 -139
tinygrad/codegen/opt/__init__.py +38 -0
tinygrad/codegen/opt/heuristic.py +125 -0
tinygrad/codegen/opt/kernel.py +510 -0
tinygrad/{engine → codegen/opt}/search.py +51 -35
tinygrad/codegen/opt/swizzler.py +134 -0
tinygrad/codegen/opt/tc.py +127 -0
tinygrad/codegen/quantize.py +67 -0
tinygrad/device.py +122 -132
tinygrad/dtype.py +152 -35
tinygrad/engine/jit.py +81 -54
tinygrad/engine/memory.py +46 -27
tinygrad/engine/realize.py +82 -41
tinygrad/engine/schedule.py +70 -445
tinygrad/frontend/__init__.py +0 -0
tinygrad/frontend/onnx.py +1253 -0
tinygrad/frontend/torch.py +5 -0
tinygrad/gradient.py +19 -27
tinygrad/helpers.py +95 -47
tinygrad/nn/__init__.py +7 -8
tinygrad/nn/optim.py +72 -41
tinygrad/nn/state.py +37 -23
tinygrad/renderer/__init__.py +40 -60
tinygrad/renderer/cstyle.py +143 -128
tinygrad/renderer/llvmir.py +113 -62
tinygrad/renderer/ptx.py +50 -32
tinygrad/renderer/wgsl.py +27 -23
tinygrad/runtime/autogen/am/am.py +5861 -0
tinygrad/runtime/autogen/am/pm4_nv.py +962 -0
tinygrad/runtime/autogen/am/pm4_soc15.py +931 -0
tinygrad/runtime/autogen/am/sdma_4_0_0.py +5209 -0
tinygrad/runtime/autogen/am/sdma_4_4_2.py +5209 -0
tinygrad/runtime/autogen/am/sdma_5_0_0.py +7103 -0
tinygrad/runtime/autogen/am/sdma_6_0_0.py +8085 -0
tinygrad/runtime/autogen/am/smu_v13_0_0.py +3068 -0
tinygrad/runtime/autogen/am/smu_v14_0_2.py +3605 -0
tinygrad/runtime/autogen/amd_gpu.py +1433 -67197
tinygrad/runtime/autogen/comgr.py +35 -9
tinygrad/runtime/autogen/comgr_3.py +906 -0
tinygrad/runtime/autogen/cuda.py +2419 -494
tinygrad/runtime/autogen/hsa.py +57 -16
tinygrad/runtime/autogen/ib.py +7171 -0
tinygrad/runtime/autogen/io_uring.py +917 -118
tinygrad/runtime/autogen/kfd.py +748 -26
tinygrad/runtime/autogen/libc.py +613 -218
tinygrad/runtime/autogen/libusb.py +1643 -0
tinygrad/runtime/autogen/nv/nv.py +8602 -0
tinygrad/runtime/autogen/nv_gpu.py +7218 -2072
tinygrad/runtime/autogen/opencl.py +2 -4
tinygrad/runtime/autogen/sqtt.py +1789 -0
tinygrad/runtime/autogen/vfio.py +3 -3
tinygrad/runtime/autogen/webgpu.py +273 -264
tinygrad/runtime/graph/cuda.py +3 -3
tinygrad/runtime/graph/hcq.py +68 -29
tinygrad/runtime/graph/metal.py +29 -13
tinygrad/runtime/graph/remote.py +114 -0
tinygrad/runtime/ops_amd.py +537 -320
tinygrad/runtime/ops_cpu.py +108 -7
tinygrad/runtime/ops_cuda.py +12 -14
tinygrad/runtime/ops_disk.py +13 -10
tinygrad/runtime/ops_dsp.py +47 -40
tinygrad/runtime/ops_gpu.py +13 -11
tinygrad/runtime/ops_hip.py +6 -9
tinygrad/runtime/ops_llvm.py +35 -15
tinygrad/runtime/ops_metal.py +29 -19
tinygrad/runtime/ops_npy.py +5 -3
tinygrad/runtime/ops_null.py +28 -0
tinygrad/runtime/ops_nv.py +306 -234
tinygrad/runtime/ops_python.py +62 -52
tinygrad/runtime/ops_qcom.py +28 -39
tinygrad/runtime/ops_remote.py +482 -0
tinygrad/runtime/ops_webgpu.py +28 -28
tinygrad/runtime/support/am/amdev.py +114 -249
tinygrad/runtime/support/am/ip.py +211 -172
tinygrad/runtime/support/amd.py +138 -0
tinygrad/runtime/support/{compiler_hip.py → compiler_amd.py} +40 -8
tinygrad/runtime/support/compiler_cuda.py +8 -11
tinygrad/runtime/support/elf.py +2 -1
tinygrad/runtime/support/hcq.py +184 -97
tinygrad/runtime/support/ib.py +172 -0
tinygrad/runtime/support/llvm.py +3 -4
tinygrad/runtime/support/memory.py +251 -0
tinygrad/runtime/support/nv/__init__.py +0 -0
tinygrad/runtime/support/nv/ip.py +581 -0
tinygrad/runtime/support/nv/nvdev.py +183 -0
tinygrad/runtime/support/system.py +170 -0
tinygrad/runtime/support/usb.py +268 -0
tinygrad/runtime/support/webgpu.py +18 -0
tinygrad/schedule/__init__.py +0 -0
tinygrad/schedule/grouper.py +119 -0
tinygrad/schedule/kernelize.py +368 -0
tinygrad/schedule/multi.py +231 -0
tinygrad/shape/shapetracker.py +40 -46
tinygrad/shape/view.py +88 -52
tinygrad/tensor.py +968 -542
tinygrad/uop/__init__.py +117 -0
tinygrad/{codegen/transcendental.py → uop/decompositions.py} +125 -38
tinygrad/uop/mathtraits.py +169 -0
tinygrad/uop/ops.py +1021 -0
tinygrad/uop/spec.py +228 -0
tinygrad/{codegen → uop}/symbolic.py +239 -216
tinygrad/uop/upat.py +163 -0
tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/x86asm.min.js +19 -0
tinygrad/viz/assets/d3js.org/d3.v7.min.js +2 -0
tinygrad/viz/assets/dagrejs.github.io/project/dagre/latest/dagre.min.js +801 -0
tinygrad/viz/index.html +203 -403
tinygrad/viz/js/index.js +718 -0
tinygrad/viz/js/worker.js +29 -0
tinygrad/viz/serve.py +224 -102
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA +24 -16
tinygrad-0.11.0.dist-info/RECORD +141 -0
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/WHEEL +1 -1
tinygrad/codegen/kernel.py +0 -693
tinygrad/engine/multi.py +0 -161
tinygrad/ops.py +0 -1003
tinygrad/runtime/ops_cloud.py +0 -220
tinygrad/runtime/support/allocator.py +0 -94
tinygrad/spec.py +0 -155
tinygrad/viz/assets/d3js.org/d3.v5.min.js +0 -2
tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +0 -4816
tinygrad/viz/perfetto.html +0 -178
tinygrad-0.10.2.dist-info/RECORD +0 -99
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info/licenses}/LICENSE +0 -0
{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/top_level.txt +0 -0

tinygrad/viz/js/worker.js ADDED Viewed

@@ -0,0 +1,29 @@
+const NODE_PADDING = 10;
+const LINE_HEIGHT = 14;
+const canvas = new OffscreenCanvas(0, 0);
+const ctx = canvas.getContext("2d");
+ctx.font = `${LINE_HEIGHT}px sans-serif`;
+onmessage = (e) => {
+  const { graph, additions, ctxs } = e.data;
+  const g = new dagre.graphlib.Graph({ compound: true });
+  g.setGraph({ rankdir: "LR" }).setDefaultEdgeLabel(function() { return {}; });
+  if (additions.length !== 0) g.setNode("addition", {label:"", style:"fill: rgba(26, 27, 38, 0.5);", padding:0});
+  for (let [k, {label, src, ref, ...rest }] of Object.entries(graph)) {
+    // adjust node dims by label size (excluding escape codes) + add padding
+    let [width, height] = [0, 0];
+    for (line of label.replace(/\u001B\[(?:K|.*?m)/g, "").split("\n")) {
+      width = Math.max(width, ctx.measureText(line).width);
+      height += LINE_HEIGHT;
+    }
+    g.setNode(k, {width:width+NODE_PADDING*2, height:height+NODE_PADDING*2, padding:NODE_PADDING, label, ref, ...rest});
+    // add edges
+    const edgeCounts = {}
+    for (const s of src) edgeCounts[s] = (edgeCounts[s] || 0)+1;
+    for (const s of src) g.setEdge(s, k, { label: edgeCounts[s] > 1 ? edgeCounts[s] : null });
+    if (additions.includes(parseInt(k))) g.setParent(k, "addition");
+  }
+  dagre.layout(g);
+  postMessage(dagre.graphlib.json.write(g));
+  self.close();
+}

tinygrad/viz/serve.py CHANGED Viewed

@@ -1,118 +1,240 @@
 #!/usr/bin/env python3
-import multiprocessing, pickle, functools, difflib, os, threading, json, time, sys, webbrowser, socket, argparse, decimal
-from http.server import HTTPServer, BaseHTTPRequestHandler
+import multiprocessing, pickle, difflib, os, threading, json, time, sys, webbrowser, socket, argparse, socketserver, functools, codecs, io
+import subprocess, ctypes
+from contextlib import redirect_stdout
+from decimal import Decimal
+from http.server import BaseHTTPRequestHandler
 from urllib.parse import parse_qs, urlparse
-from typing import Any, Callable, TypedDict, Generator
-from tinygrad.helpers import colored, getenv, tqdm, unwrap, word_wrap
-from tinygrad.ops import TrackedGraphRewrite, UOp, Ops, lines, GroupOp
-from tinygrad.codegen.kernel import Kernel
-from tinygrad.device import ProfileEvent, ProfileDeviceEvent, ProfileRangeEvent, ProfileGraphEvent
+from typing import Any, TypedDict, Generator
+from tinygrad.helpers import colored, getenv, tqdm, unwrap, word_wrap, TRACEMETA, ProfileEvent, ProfileRangeEvent, TracingKey, ProfilePointEvent
+from tinygrad.uop.ops import TrackedGraphRewrite, UOp, Ops, printable, GroupOp, srender, sint
+from tinygrad.device import ProfileDeviceEvent, ProfileGraphEvent, ProfileGraphEntry, Device
+from tinygrad.renderer import ProgramSpec
 from tinygrad.dtype import dtypes
-uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.VCONST: "#e0e0e0",
-               Ops.DEFINE_GLOBAL: "#ffe0b0", Ops.DEFINE_LOCAL: "#ffe0d0", Ops.DEFINE_ACC: "#f0ffe0", Ops.REDUCE_AXIS: "#FF6B6B",
-               Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#e0ffc0", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff",
+uops_colors = {Ops.LOAD: "#ffc0c0", Ops.STORE: "#87CEEB", Ops.CONST: "#e0e0e0", Ops.VCONST: "#e0e0e0", Ops.REDUCE: "#FF5B5B",
+               Ops.DEFINE_GLOBAL: "#ffe0b0", Ops.DEFINE_LOCAL: "#ffe0d0", Ops.DEFINE_REG: "#f0ffe0", Ops.REDUCE_AXIS: "#FF6B6B",
+               Ops.RANGE: "#c8a0e0", Ops.ASSIGN: "#909090", Ops.BARRIER: "#ff8080", Ops.IF: "#c8b0c0", Ops.SPECIAL: "#c0c0ff",
                Ops.INDEX: "#e8ffa0", Ops.WMMA: "#efefc0", Ops.VIEW: "#C8F9D4", Ops.MULTI: "#f6ccff", Ops.KERNEL: "#3e7f55",
-               **{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80",
-               Ops.BLOCK: "#C4A484", Ops.BLOCKEND: "#C4A4A4", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0", Ops.NAME:"#808080"}
+               **{x:"#D8F9E4" for x in GroupOp.Movement}, **{x:"#ffffc0" for x in GroupOp.ALU}, Ops.THREEFRY:"#ffff80", Ops.BUFFER_VIEW: "#E5EAFF",
+               Ops.BLOCK: "#C4A484", Ops.BLOCKEND: "#C4A4A4", Ops.BUFFER: "#B0BDFF", Ops.COPY: "#a040a0", Ops.FUSE: "#FFa500",
+               Ops.ALLREDUCE: "#ff40a0", Ops.MSELECT: "#d040a0", Ops.MSTACK: "#d040a0", Ops.CONTIGUOUS: "#FFC14D",
+               Ops.CHILDREN: "#80ffc0", Ops.CHILD: "#80fff0", Ops.BUFFERIZE: "#FF991C", Ops.REWRITE_ERROR: "#ff2e2e"}
 # VIZ API
-# NOTE: if any extra rendering in VIZ fails, we don't crash
-def pcall(fxn:Callable[..., str], *args, **kwargs) -> str:
-  try: return fxn(*args, **kwargs)
-  except Exception as e: return f"ERROR: {e}"
 # ** Metadata for a track_rewrites scope
-class GraphRewriteMetadata(TypedDict):
-  loc: tuple[str, int]                   # [path, lineno] calling graph_rewrite
-  match_count: int                       # total match count in this context
-  code_line: str                         # source code calling graph_rewrite
-  kernel_code: str|None                  # optionally render the final kernel code
-  name: str|None                         # optional name of the rewrite
-@functools.lru_cache(None)
-def _prg(k:Kernel): return k.to_program().src
-def to_metadata(k:Any, v:TrackedGraphRewrite) -> GraphRewriteMetadata:
-  return {"loc":v.loc, "match_count":len(v.matches), "code_line":lines(v.loc[0])[v.loc[1]-1].strip(),
-          "kernel_code":pcall(_prg, k) if isinstance(k, Kernel) else None, "name":v.name}
-def get_metadata(keys:list[Any], contexts:list[list[TrackedGraphRewrite]]) -> list[tuple[str, list[GraphRewriteMetadata]]]:
-  return [(k.name if isinstance(k, Kernel) else str(k), [to_metadata(k, v) for v in vals]) for k,vals in zip(keys, contexts)]
+ref_map:dict[Any, int] = {}
+def get_metadata(keys:list[TracingKey], contexts:list[list[TrackedGraphRewrite]]) -> list[dict]:
+  ret = []
+  for i,(k,v) in enumerate(zip(keys, contexts)):
+    steps = [{"name":s.name, "loc":s.loc, "depth":s.depth, "match_count":len(s.matches), "code_line":printable(s.loc),
+              "query":f"/ctxs?ctx={i}&idx={j}"} for j,s in enumerate(v)]
+    ret.append(r:={"name":k.display_name, "steps":steps})
+    # use the first key to get runtime profiling data about this context
+    if getenv("PROFILE_VALUE") >= 2 and k.keys: r["runtime_stats"] = get_runtime_stats(k.keys[0])
+    # program spec metadata
+    if isinstance(k.ret, ProgramSpec):
+      steps.append({"name":"View Disassembly", "query":f"/disasm?ctx={i}"})
+      r["fmt"] = k.ret.src
+    for key in k.keys: ref_map[key] = i
+  return ret
 # ** Complete rewrite details for a graph_rewrite call
 class GraphRewriteDetails(TypedDict):
   graph: dict                            # JSON serialized UOp for this rewrite step
   uop: str                               # strigified UOp for this rewrite step
-  diff: list[str]|None                   # string diff of the single UOp that changed
+  diff: list[str]|None                   # diff of the single UOp that changed
   changed_nodes: list[int]|None          # the changed UOp id + all its parents ids
   upat: tuple[tuple[str, int], str]|None # [loc, source_code] of the matched UPat
-def uop_to_json(x:UOp) -> dict[int, tuple[str, list[int], str]]:
+def shape_to_str(s:tuple[sint, ...]): return "(" + ','.join(srender(x) for x in s) + ")"
+def mask_to_str(s:tuple[tuple[sint, sint], ...]): return "(" + ','.join(shape_to_str(x) for x in s) + ")"
+def uop_to_json(x:UOp) -> dict[int, dict]:
   assert isinstance(x, UOp)
-  # NOTE: this is [id, [label, src_ids, color]]
-  graph: dict[int, tuple[str, list[int], str]] = {}
+  graph: dict[int, dict] = {}
   excluded: set[UOp] = set()
-  for u in (toposort:=x.toposort):
+  for u in (toposort:=x.toposort()):
     # always exclude DEVICE/CONST/UNIQUE
-    if u.op in {Ops.DEVICE, Ops.CONST, Ops.UNIQUE}: excluded.add(u)
+    if u.op in {Ops.DEVICE, Ops.CONST, Ops.UNIQUE} and u is not x: excluded.add(u)
     # only exclude CONST VIEW source if it has no other children in the graph
     if u.op is Ops.CONST and len(u.src) != 0 and all(cr.op is Ops.CONST for c in u.src[0].children if (cr:=c()) is not None and cr in toposort):
       excluded.update(u.src)
   for u in toposort:
     if u in excluded: continue
-    argst = str(u.arg)
+    argst = codecs.decode(str(u.arg), "unicode_escape")
     if u.op is Ops.VIEW:
-      argst = ("\n".join([f"{v.shape} / {v.strides}"+(f"\nMASK {v.mask}" if v.mask is not None else "")+
-                          ("" if v.offset == 0 else f" / {v.offset}") for v in unwrap(u.st).views]))
+      argst = ("\n".join([f"{shape_to_str(v.shape)} / {shape_to_str(v.strides)}"+("" if v.offset == 0 else f" / {srender(v.offset)}")+
+                          (f"\nMASK {mask_to_str(v.mask)}" if v.mask is not None else "") for v in unwrap(u.st).views]))
     label = f"{str(u.op).split('.')[1]}{(chr(10)+word_wrap(argst.replace(':', ''))) if u.arg is not None else ''}"
     if u.dtype != dtypes.void: label += f"\n{u.dtype}"
     for idx,x in enumerate(u.src):
       if x in excluded:
-        if x.op is Ops.CONST and dtypes.is_float(u.dtype): label += f"\nCONST{idx} {x.arg:g}"
-        else: label += f"\n{x.op.name}{idx} {x.arg}"
-    graph[id(u)] = (label, [id(x) for x in u.src if x not in excluded], uops_colors.get(u.op, "#ffffff"))
+        arg = f"{x.arg:g}" if x.op is Ops.CONST and dtypes.is_float(u.dtype) else f"{x.arg}"
+        label += f"\n{x.op.name}{idx} {arg}" + (f" {x.src[0].op}" if len(x.src) else "")
+    try:
+      if u.op not in {Ops.VIEW, Ops.BUFFER, Ops.KERNEL, Ops.ASSIGN, Ops.COPY, Ops.SINK, *GroupOp.Buffer} and u.st is not None:
+        label += f"\n{shape_to_str(u.shape)}"
+      elif len(rngs:=u.ranges):
+        label += f"\n{str(sorted([x.arg for x in rngs]))}"
+    except Exception:
+      label += "\n<ISSUE GETTING LABEL>"
+    if (ref:=ref_map.get(u.arg.ast) if u.op is Ops.KERNEL else None) is not None: label += f"\ncodegen@{ctxs[ref]['name']}"
+    # NOTE: kernel already has metadata in arg
+    if TRACEMETA >= 2 and u.metadata is not None and u.op is not Ops.KERNEL: label += "\n"+repr(u.metadata)
+    graph[id(u)] = {"label":label, "src":[id(x) for x in u.src if x not in excluded], "color":uops_colors.get(u.op, "#ffffff"),
+                    "ref":ref, "tag":u.tag}
   return graph
-def get_details(k:Any, ctx:TrackedGraphRewrite) -> Generator[GraphRewriteDetails, None, None]:
-  yield {"graph":uop_to_json(next_sink:=ctx.sink), "uop":str(ctx.sink), "changed_nodes":None, "diff":None, "upat":None}
+@functools.cache
+def _reconstruct(a:int):
+  op, dtype, src, arg, tag = contexts[2][a]
+  arg = type(arg)(_reconstruct(arg.ast), arg.metadata) if op is Ops.KERNEL else arg
+  return UOp(op, dtype, tuple(_reconstruct(s) for s in src), arg, tag)
+def get_details(ctx:TrackedGraphRewrite) -> Generator[GraphRewriteDetails, None, None]:
+  yield {"graph":uop_to_json(next_sink:=_reconstruct(ctx.sink)), "uop":str(next_sink), "changed_nodes":None, "diff":None, "upat":None}
   replaces: dict[UOp, UOp] = {}
-  for u0,u1,upat in tqdm(ctx.matches):
-    replaces[u0] = u1
-    new_sink = next_sink.substitute(replaces)
-    yield {"graph": (sink_json:=uop_to_json(new_sink)), "uop":str(new_sink), "changed_nodes":[id(x) for x in u1.toposort if id(x) in sink_json],
-           "diff":list(difflib.unified_diff(pcall(str, u0).splitlines(), pcall(str, u1).splitlines())), "upat":(upat.location, upat.printable())}
+  for u0_num,u1_num,upat_loc in tqdm(ctx.matches):
+    replaces[u0:=_reconstruct(u0_num)] = u1 = _reconstruct(u1_num)
+    try: new_sink = next_sink.substitute(replaces)
+    except RuntimeError as e: new_sink = UOp(Ops.NOOP, arg=str(e))
+    yield {"graph":(sink_json:=uop_to_json(new_sink)), "uop":str(new_sink), "changed_nodes":[id(x) for x in u1.toposort() if id(x) in sink_json],
+           "diff":list(difflib.unified_diff(str(u0).splitlines(), str(u1).splitlines())), "upat":(upat_loc, printable(upat_loc))}
     if not ctx.bottom_up: next_sink = new_sink
 # Profiler API
-devices:dict[str, tuple[decimal.Decimal, decimal.Decimal, int]] = {}
-def prep_ts(device:str, ts:decimal.Decimal, is_copy): return int(decimal.Decimal(ts) + devices[device][is_copy])
-def dev_to_pid(device:str, is_copy=False): return {"pid": devices[device][2], "tid": int(is_copy)}
-def dev_ev_to_perfetto_json(ev:ProfileDeviceEvent):
-  devices[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff, len(devices))
-  return [{"name": "process_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "args": {"name": ev.device}},
-          {"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 0, "args": {"name": "COMPUTE"}},
-          {"name": "thread_name", "ph": "M", "pid": dev_to_pid(ev.device)['pid'], "tid": 1, "args": {"name": "COPY"}}]
-def range_ev_to_perfetto_json(ev:ProfileRangeEvent):
-  return [{"name": ev.name, "ph": "X", "ts": prep_ts(ev.device, ev.st, ev.is_copy), "dur": float(ev.en-ev.st), **dev_to_pid(ev.device, ev.is_copy)}]
-def graph_ev_to_perfetto_json(ev:ProfileGraphEvent, reccnt):
-  ret = []
-  for i,e in enumerate(ev.ents):
-    st, en = ev.sigs[e.st_id], ev.sigs[e.en_id]
-    ret += [{"name": e.name, "ph": "X", "ts": prep_ts(e.device, st, e.is_copy), "dur": float(en-st), **dev_to_pid(e.device, e.is_copy)}]
-    for dep in ev.deps[i]:
-      d = ev.ents[dep]
-      ret += [{"ph": "s", **dev_to_pid(d.device, d.is_copy), "id": reccnt+len(ret), "ts": prep_ts(d.device, ev.sigs[d.en_id], d.is_copy), "bp": "e"}]
-      ret += [{"ph": "f", **dev_to_pid(e.device, e.is_copy), "id": reccnt+len(ret)-1, "ts": prep_ts(e.device, st, e.is_copy), "bp": "e"}]
+device_ts_diffs:dict[str, tuple[Decimal, Decimal]] = {}
+def cpu_ts_diff(device:str, thread=0) -> Decimal: return device_ts_diffs.get(device, (Decimal(0),))[thread]
+DevEvent = ProfileRangeEvent|ProfileGraphEntry|ProfilePointEvent
+def flatten_events(profile:list[ProfileEvent]) -> Generator[tuple[Decimal, Decimal, DevEvent], None, None]:
+  for e in profile:
+    if isinstance(e, ProfileRangeEvent): yield (e.st+(diff:=cpu_ts_diff(e.device, e.is_copy)), (e.en if e.en is not None else e.st)+diff, e)
+    elif isinstance(e, ProfilePointEvent): yield (e.ts, e.ts, e)
+    elif isinstance(e, ProfileGraphEvent):
+      cpu_ts = []
+      for ent in e.ents: cpu_ts += [e.sigs[ent.st_id]+(diff:=cpu_ts_diff(ent.device, ent.is_copy)), e.sigs[ent.en_id]+diff]
+      yield (st:=min(cpu_ts)), (et:=max(cpu_ts)), ProfileRangeEvent(f"{e.ents[0].device.split(':')[0]} Graph", f"batched {len(e.ents)}", st, et)
+      for i,ent in enumerate(e.ents): yield (cpu_ts[i*2], cpu_ts[i*2+1], ent)
+# timeline layout stacks events in a contiguous block. When a late starter finishes late, there is whitespace in the higher levels.
+def timeline_layout(events:list[tuple[int, int, float, DevEvent]]) -> dict:
+  shapes:list[dict] = []
+  levels:list[int] = []
+  for st,et,dur,e in events:
+    if dur == 0: continue
+    # find a free level to put the event
+    depth = next((i for i,level_et in enumerate(levels) if st>=level_et), len(levels))
+    if depth < len(levels): levels[depth] = et
+    else: levels.append(et)
+    name, cat, info = e.name, None, None
+    if (ref:=ref_map.get(name)) is not None:
+      name = ctxs[ref]["name"]
+      # TODO: support symbolic by capturing var_vals in profile events
+      if isinstance(p:=contexts[0][ref].ret, ProgramSpec) and all(isinstance(es,int) for es in [p.estimates.ops, p.estimates.mem, p.estimates.lds]):
+        info = f"{p.estimates.ops/(t:=dur*1e3):.2f} GFLOPS {p.estimates.mem/t:4.1f}|{p.estimates.lds/t:.1f} GB/s"
+    elif isinstance(e.name, TracingKey):
+      name, cat = e.name.display_name, e.name.cat
+      ref = next((v for k in e.name.keys if (v:=ref_map.get(k)) is not None), None)
+    shapes.append({"name":name, "ref":ref, "st":st, "dur":dur, "depth":depth, "cat":cat, "info":info})
+  return {"shapes":shapes, "maxDepth":len(levels)}
+def mem_layout(events:list[tuple[int, int, float, DevEvent]], max_ts:int) -> dict:
+  step, peak, mem = 0, 0, 0
+  shps:dict[int, dict] = {}
+  temp:dict[int, dict] = {}
+  timestamps:list[int] = []
+  for st,_,_,e in events:
+    if not isinstance(e, ProfilePointEvent): continue
+    if e.name == "alloc":
+      shps[e.key] = temp[e.key] = {"x":[step], "y":[mem], "arg":e.arg}
+      timestamps.append(int(e.ts))
+      step += 1
+      mem += e.arg["nbytes"]
+      if mem > peak: peak = mem
+    if e.name == "free":
+      timestamps.append(int(e.ts))
+      step += 1
+      mem -= (removed:=temp.pop(e.key))["arg"]["nbytes"]
+      removed["x"].append(step)
+      removed["y"].append(removed["y"][-1])
+      for k,v in temp.items():
+        if k > e.key:
+          v["x"] += [step, step]
+          v["y"] += [v["y"][-1], v["y"][-1]-removed["arg"]["nbytes"]]
+  for v in temp.values():
+    v["x"].append(step)
+    v["y"].append(v["y"][-1])
+  timestamps.append(max_ts)
+  return {"shapes":list(shps.values()), "peak":peak, "timestamps":timestamps}
+def get_profile(profile:list[ProfileEvent]) -> bytes|None:
+  # start by getting the time diffs
+  for ev in profile:
+    if isinstance(ev,ProfileDeviceEvent): device_ts_diffs[ev.device] = (ev.comp_tdiff, ev.copy_tdiff if ev.copy_tdiff is not None else ev.comp_tdiff)
+  # map events per device
+  dev_events:dict[str, list[tuple[int, int, float, DevEvent]]] = {}
+  min_ts:int|None = None
+  max_ts:int|None = None
+  for ts,en,e in flatten_events(profile):
+    dev_events.setdefault(e.device,[]).append((st:=int(ts), et:=int(en), float(en-ts), e))
+    if min_ts is None or st < min_ts: min_ts = st
+    if max_ts is None or et > max_ts: max_ts = et
+  if min_ts is None: return None
+  # return layout of per device events
+  layout:dict[str, dict] = {}
+  for k,v in dev_events.items():
+    v.sort(key=lambda e:e[0])
+    layout[k] = timeline_layout(v)
+    layout[f"{k} Memory"] = mem_layout(v, unwrap(max_ts))
+  return json.dumps({"layout":layout, "st":min_ts, "et":max_ts}).encode("utf-8")
+def get_runtime_stats(key) -> list[dict]:
+  ret:list[dict] = []
+  for e in profile:
+    if isinstance(e, ProfileRangeEvent) and e.en is not None and e.name == key:
+      ret.append({"device":e.device, "data":[{"name":"Duration", "value":float(e.en-e.st), "unit":"us"}]})
   return ret
-def to_perfetto(profile:list[ProfileEvent]):
-  # Start json with devices.
-  prof_json = [x for ev in profile if isinstance(ev, ProfileDeviceEvent) for x in dev_ev_to_perfetto_json(ev)]
-  for ev in tqdm(profile, desc="preparing profile"):
-    if isinstance(ev, ProfileRangeEvent): prof_json += range_ev_to_perfetto_json(ev)
-    elif isinstance(ev, ProfileGraphEvent): prof_json += graph_ev_to_perfetto_json(ev, reccnt=len(prof_json))
-  return json.dumps({"traceEvents": prof_json}).encode() if len(prof_json) > 0 else None
+# ** Assembly analyzers
+def get_llvm_mca(asm:str, mtriple:str, mcpu:str) -> dict:
+  target_args = [f"-mtriple={mtriple}", f"-mcpu={mcpu}"]
+  # disassembly output can include headers / metadata, skip if llvm-mca can't parse those lines
+  data = json.loads(subprocess.check_output(["llvm-mca","-skip-unsupported-instructions=parse-failure","--json","-"]+target_args, input=asm.encode()))
+  cr = data["CodeRegions"][0]
+  resource_labels = data["TargetInfo"]["Resources"]
+  rows:list = [[instr] for instr in cr["Instructions"]]
+  # add scheduler estimates
+  for info in cr["InstructionInfoView"]["InstructionList"]: rows[info["Instruction"]].append(info["Latency"])
+  # map per instruction resource usage
+  instr_usage:dict[int, dict[int, int]] = {}
+  for d in cr["ResourcePressureView"]["ResourcePressureInfo"]:
+    instr_usage.setdefault(i:=d["InstructionIndex"], {}).setdefault(r:=d["ResourceIndex"], 0)
+    instr_usage[i][r] += d["ResourceUsage"]
+  # last row is the usage summary
+  summary = [{"idx":k, "label":resource_labels[k], "value":v} for k,v in instr_usage.pop(len(rows), {}).items()]
+  max_usage = max([sum(v.values()) for i,v in instr_usage.items() if i<len(rows)], default=0)
+  for i,usage in instr_usage.items(): rows[i].append([[k, v, (v/max_usage)*100] for k,v in usage.items()])
+  return {"rows":rows, "cols":["Opcode", "Latency", {"title":"HW Resources", "labels":resource_labels}], "summary":summary}
+def get_disassembly(ctx:list[str]):
+  if not isinstance(prg:=contexts[0][int(ctx[0])].ret, ProgramSpec): return
+  lib = (compiler:=Device[prg.device].compiler).compile(prg.src)
+  with redirect_stdout(buf:=io.StringIO()): compiler.disassemble(lib)
+  disasm_str = buf.getvalue()
+  from tinygrad.runtime.ops_llvm import llvm, LLVMCompiler
+  if isinstance(compiler, LLVMCompiler):
+    mtriple = ctypes.string_at(llvm.LLVMGetTargetMachineTriple(tm:=compiler.target_machine)).decode()
+    mcpu = ctypes.string_at(llvm.LLVMGetTargetMachineCPU(tm)).decode()
+    ret = get_llvm_mca(disasm_str, mtriple, mcpu)
+  else: ret = {"src":disasm_str}
+  return json.dumps(ret).encode()
 # ** HTTP server
@@ -122,33 +244,17 @@ class Handler(BaseHTTPRequestHandler):
     if (url:=urlparse(self.path)).path == "/":
       with open(os.path.join(os.path.dirname(__file__), "index.html"), "rb") as f: ret = f.read()
-    elif (url:=urlparse(self.path)).path == "/profiler":
-      with open(os.path.join(os.path.dirname(__file__), "perfetto.html"), "rb") as f: ret = f.read()
-    elif self.path.startswith("/assets/") and '/..' not in self.path:
+    elif self.path.startswith(("/assets/", "/js/")) and '/..' not in self.path:
       try:
         with open(os.path.join(os.path.dirname(__file__), self.path.strip('/')), "rb") as f: ret = f.read()
         if url.path.endswith(".js"): content_type = "application/javascript"
         if url.path.endswith(".css"): content_type = "text/css"
       except FileNotFoundError: status_code = 404
-    elif url.path == "/kernels":
-      if "kernel" in (query:=parse_qs(url.query)):
-        def getarg(k:str,default=0): return int(query[k][0]) if k in query else default
-        kidx, ridx = getarg("kernel"), getarg("idx")
-        try:
-          # stream details
-          self.send_response(200)
-          self.send_header("Content-Type", "text/event-stream")
-          self.send_header("Cache-Control", "no-cache")
-          self.end_headers()
-          for r in get_details(contexts[0][kidx], contexts[1][kidx][ridx]):
-            self.wfile.write(f"data: {json.dumps(r)}\n\n".encode("utf-8"))
-            self.wfile.flush()
-          self.wfile.write("data: END\n\n".encode("utf-8"))
-          return self.wfile.flush()
-        # pass if client closed connection
-        except (BrokenPipeError, ConnectionResetError): return
-      ret, content_type = json.dumps(kernels).encode(), "application/json"
-    elif url.path == "/get_profile" and perfetto_profile is not None: ret, content_type = perfetto_profile, "application/json"
+    elif (query:=parse_qs(url.query)):
+      if url.path == "/disasm": ret, content_type = get_disassembly(**query), "application/json"
+      else: return self.stream_json(get_details(contexts[1][int(query["ctx"][0])][int(query["idx"][0])]))
+    elif url.path == "/ctxs": ret, content_type = json.dumps(ctxs).encode(), "application/json"
+    elif url.path == "/get_profile" and profile_ret is not None: ret, content_type = profile_ret, "application/json"
     else: status_code = 404
     # send response
@@ -158,6 +264,19 @@ class Handler(BaseHTTPRequestHandler):
     self.end_headers()
     return self.wfile.write(ret)
+  def stream_json(self, source:Generator):
+    try:
+      self.send_response(200)
+      self.send_header("Content-Type", "text/event-stream")
+      self.send_header("Cache-Control", "no-cache")
+      self.end_headers()
+      for r in source:
+        self.wfile.write(f"data: {json.dumps(r)}\n\n".encode("utf-8"))
+        self.wfile.flush()
+      self.wfile.write("data: END\n\n".encode("utf-8"))
+    # pass if client closed connection
+    except (BrokenPipeError, ConnectionResetError): return
 # ** main loop
 def reloader():
@@ -172,6 +291,9 @@ def load_pickle(path:str):
   if path is None or not os.path.exists(path): return None
   with open(path, "rb") as f: return pickle.load(f)
+# NOTE: using HTTPServer forces a potentially slow socket.getfqdn
+class TCPServerWithReuse(socketserver.TCPServer): allow_reuse_address = True
 if __name__ == "__main__":
   parser = argparse.ArgumentParser()
   parser.add_argument('--kernels', type=str, help='Path to kernels', default=None)
@@ -189,15 +311,15 @@ if __name__ == "__main__":
   contexts, profile = load_pickle(args.kernels), load_pickle(args.profile)
   # NOTE: this context is a tuple of list[keys] and list[values]
-  kernels = get_metadata(*contexts) if contexts is not None else []
+  ctxs = get_metadata(*contexts[:2]) if contexts is not None else []
-  perfetto_profile = to_perfetto(profile) if profile is not None else None
+  profile_ret = get_profile(profile) if profile is not None else None
-  server = HTTPServer(('', PORT), Handler)
+  server = TCPServerWithReuse(('', PORT), Handler)
   reloader_thread = threading.Thread(target=reloader)
   reloader_thread.start()
   print(f"*** started viz on {HOST}:{PORT}")
-  print(colored(f"*** ready in {(time.perf_counter()-st)*1e3:4.2f}ms", "green"))
+  print(colored(f"*** ready in {(time.perf_counter()-st)*1e3:4.2f}ms", "green"), flush=True)
   if len(getenv("BROWSER", "")) > 0: webbrowser.open(f"{HOST}:{PORT}{'/profiler' if contexts is None else ''}")
   try: server.serve_forever()
   except KeyboardInterrupt:

{tinygrad-0.10.2.dist-info → tinygrad-0.11.0.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
-Metadata-Version: 2.2
+Metadata-Version: 2.4
 Name: tinygrad
-Version: 0.10.2
+Version: 0.11.0
 Summary: You like pytorch? You like micrograd? You love tinygrad! <3
 Author: George Hotz
 License: MIT
@@ -19,31 +19,38 @@ Requires-Dist: mypy==1.13.0; extra == "linting"
 Requires-Dist: typing-extensions; extra == "linting"
 Requires-Dist: pre-commit; extra == "linting"
 Requires-Dist: ruff; extra == "linting"
-Requires-Dist: types-tqdm; extra == "linting"
+Requires-Dist: numpy; extra == "linting"
 Provides-Extra: testing-minimal
 Requires-Dist: numpy; extra == "testing-minimal"
-Requires-Dist: torch; extra == "testing-minimal"
+Requires-Dist: torch==2.7.1; extra == "testing-minimal"
 Requires-Dist: pytest; extra == "testing-minimal"
 Requires-Dist: pytest-xdist; extra == "testing-minimal"
 Requires-Dist: hypothesis; extra == "testing-minimal"
+Requires-Dist: z3-solver; extra == "testing-minimal"
+Requires-Dist: ml_dtypes; extra == "testing-minimal"
 Provides-Extra: testing-unit
 Requires-Dist: numpy; extra == "testing-unit"
-Requires-Dist: torch; extra == "testing-unit"
+Requires-Dist: torch==2.7.1; extra == "testing-unit"
 Requires-Dist: pytest; extra == "testing-unit"
 Requires-Dist: pytest-xdist; extra == "testing-unit"
 Requires-Dist: hypothesis; extra == "testing-unit"
+Requires-Dist: z3-solver; extra == "testing-unit"
+Requires-Dist: ml_dtypes; extra == "testing-unit"
 Requires-Dist: tqdm; extra == "testing-unit"
 Requires-Dist: safetensors; extra == "testing-unit"
 Requires-Dist: tabulate; extra == "testing-unit"
 Provides-Extra: testing
 Requires-Dist: numpy; extra == "testing"
-Requires-Dist: torch; extra == "testing"
+Requires-Dist: torch==2.7.1; extra == "testing"
 Requires-Dist: pytest; extra == "testing"
 Requires-Dist: pytest-xdist; extra == "testing"
 Requires-Dist: hypothesis; extra == "testing"
+Requires-Dist: z3-solver; extra == "testing"
+Requires-Dist: ml_dtypes; extra == "testing"
 Requires-Dist: pillow; extra == "testing"
-Requires-Dist: onnx==1.16.0; extra == "testing"
+Requires-Dist: onnx==1.18.0; extra == "testing"
 Requires-Dist: onnx2torch; extra == "testing"
+Requires-Dist: onnxruntime; extra == "testing"
 Requires-Dist: opencv-python; extra == "testing"
 Requires-Dist: tabulate; extra == "testing"
 Requires-Dist: tqdm; extra == "testing"
@@ -58,6 +65,10 @@ Requires-Dist: nibabel; extra == "testing"
 Requires-Dist: bottle; extra == "testing"
 Requires-Dist: ggml-python; extra == "testing"
 Requires-Dist: capstone; extra == "testing"
+Requires-Dist: pycocotools; extra == "testing"
+Requires-Dist: boto3; extra == "testing"
+Requires-Dist: pandas; extra == "testing"
+Requires-Dist: influxdb3-python; extra == "testing"
 Provides-Extra: docs
 Requires-Dist: mkdocs; extra == "docs"
 Requires-Dist: mkdocs-material; extra == "docs"
@@ -66,14 +77,12 @@ Requires-Dist: markdown-callouts; extra == "docs"
 Requires-Dist: markdown-exec[ansi]; extra == "docs"
 Requires-Dist: black; extra == "docs"
 Requires-Dist: numpy; extra == "docs"
-Provides-Extra: testing-tf
-Requires-Dist: tensorflow==2.15.1; extra == "testing-tf"
-Requires-Dist: tensorflow_addons; extra == "testing-tf"
 Dynamic: author
 Dynamic: classifier
 Dynamic: description
 Dynamic: description-content-type
 Dynamic: license
+Dynamic: license-file
 Dynamic: provides-extra
 Dynamic: requires-python
 Dynamic: summary
@@ -101,11 +110,11 @@ tinygrad: For something between [PyTorch](https://github.com/pytorch/pytorch) an
 ---
-This may not be the best deep learning framework, but it is a deep learning framework.
+Despite tinygrad's size, it is a fully featured deep learning framework.
-Due to its extreme simplicity, it aims to be the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
+Due to its extreme simplicity, it is the easiest framework to add new accelerators to, with support for both inference and training. If XLA is CISC, tinygrad is RISC.
-tinygrad is still alpha software, but we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
+tinygrad is now beta software, we [raised some money](https://geohot.github.io/blog/jekyll/update/2023/05/24/the-tiny-corp-raised-5M.html) to make it good. Someday, we will tape out chips.
 ## Features
@@ -119,9 +128,8 @@ Try a matmul. See how, despite the style, it is fused into one kernel with the p
 ```sh
 DEBUG=3 python3 -c "from tinygrad import Tensor;
-N = 1024; a, b = Tensor.rand(N, N), Tensor.rand(N, N);
-c = (a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2);
-print((c.numpy() - (a.numpy() @ b.numpy())).mean())"
+N = 1024; a, b = Tensor.empty(N, N), Tensor.empty(N, N);
+(a.reshape(N, 1, N) * b.T.reshape(1, N, N)).sum(axis=2).realize()"
 ```
 And we can change `DEBUG` to `4` to see the generated code.

tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl

tinygrad 0.10.2__py3-none-any.whl → 0.11.0__py3-none-any.whl