tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88) hide show
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,64 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Any
3
- import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys
2
+ import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
4
3
  assert sys.platform != 'win32'
5
- from tinygrad.device import BufferOptions, Compiled, Allocator
6
- from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv
7
- from tinygrad.runtime.ops_clang import ClangCompiler
8
- from tinygrad.renderer.cstyle import DSPRenderer
4
+ from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
5
+ from tinygrad.dtype import dtypes, DType, PtrDType
6
+ from tinygrad.ops import Ops, UOp
7
+ from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
8
+ from tinygrad.renderer.cstyle import ClangRenderer
9
9
  from tinygrad.runtime.autogen import libc, qcom_dsp
10
10
  if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
11
11
 
12
+ from tinygrad.ops import PatternMatcher, UPat
13
+
14
+ dsp_pm = PatternMatcher([
15
+ (((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
16
+ lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
17
+ arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
18
+ (UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
19
+ "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
20
+ ])
21
+
22
+ dsp_pm_late = PatternMatcher([
23
+ (UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
24
+ (UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
25
+ (UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
26
+ (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
27
+ lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
28
+ ])
29
+
30
+ class DSPRenderer(ClangRenderer):
31
+ device = "DSP"
32
+ supports_float4 = True
33
+ buffer_suffix = " restrict __attribute__((align_value(128)))"
34
+ kernel_prefix = "__attribute__((noinline)) "
35
+ pre_matcher = dsp_pm
36
+ extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
37
+ type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
38
+ code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
39
+ Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
40
+ Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
41
+
42
+ def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
43
+ ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
44
+ msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
45
+ _Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
46
+ 'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
47
+ 'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
48
+ 'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
49
+ 'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
50
+ 'HAP_power_set((void*)handle, (void*)&req);']
51
+ msrc += ['if ((sc>>24) != 2) return 0;']
52
+ msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
53
+ msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
54
+ msrc += [f'void *buf_{i} = HAP_mmap(0,sz_or_val_{i},3,0,pra[{i+3}].dma.fd,0)+off{i};' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
55
+ msrc += ["unsigned long long start = HAP_perf_get_time_us();"]
56
+ msrc += [f"{function_name}({', '.join([(f'buf_{i}' if isinstance(b[1][0], PtrDType) else f'sz_or_val_{i}') for i,b in enumerate(bufs)])});"]
57
+ msrc += ["*(unsigned long long *)(pra[2].buf.pv) = HAP_perf_get_time_us() - start;"]
58
+ msrc += [f'HAP_munmap(buf_{i}, sz_or_val_{i});' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
59
+ msrc += ["return 0; }"]
60
+ return ret + '\n' + '\n'.join(msrc)
61
+
12
62
  def rpc_sc(method=0, ins=0, outs=0, fds=0): return (method << 24) | (ins << 16) | (outs << 8) | fds
13
63
  def rpc_prep_args(ins=None, outs=None, in_fds=None):
14
64
  ins, outs, in_fds = ins or list(), outs or list(), in_fds or list()
@@ -21,65 +71,81 @@ def rpc_prep_args(ins=None, outs=None, in_fds=None):
21
71
  return pra, fds, attrs, (ins, outs)
22
72
 
23
73
  class DSPProgram:
24
- def __init__(self, device:DSPDevice, name:str, lib:bytes):
25
- self.device, self.lib = device, lib
74
+ def __init__(self, dev:DSPDevice, name:str, lib:bytes):
75
+ self.dev, self.lib = dev, lib
26
76
 
27
- def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
77
+ def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
28
78
  if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")
29
79
 
30
80
  pra, fds, attrs, _ = rpc_prep_args(ins=[var_vals_mv:=memoryview(bytearray((len(bufs)+len(vals))*4)), off_mv:=memoryview(bytearray(len(bufs)*4))],
31
81
  outs=[timer:=memoryview(bytearray(8)).cast('Q')], in_fds=[b.share_info.fd for b in bufs])
32
82
  var_vals_mv.cast('i')[:] = array.array('i', tuple(b.size for b in bufs) + vals)
33
83
  off_mv.cast('I')[:] = array.array('I', tuple(b.offset for b in bufs))
34
- self.device.exec_lib(self.lib, rpc_sc(method=2, ins=2, outs=1, fds=len(bufs)), pra, fds, attrs)
84
+ self.dev.exec_lib(self.lib, rpc_sc(method=2, ins=2, outs=1, fds=len(bufs)), pra, fds, attrs)
35
85
  return timer[0] / 1e6
36
86
 
37
87
  class DSPBuffer:
38
- def __init__(self, va_addr:int, size:int, share_info:Any, offset:int=0):
88
+ def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
39
89
  self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
40
90
 
41
91
  class DSPAllocator(Allocator):
42
- def __init__(self, device:DSPDevice):
43
- self.device = device
92
+ def __init__(self, dev:DSPDevice):
93
+ self.dev = dev
44
94
  super().__init__()
45
95
 
46
- def _alloc(self, size:int, options:BufferOptions):
47
- b = qcom_dsp.ION_IOC_ALLOC(self.device.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
48
- share_info = qcom_dsp.ION_IOC_SHARE(self.device.ion_fd, handle=b.handle)
96
+ def _alloc(self, size:int, options:BufferSpec):
97
+ b = qcom_dsp.ION_IOC_ALLOC(self.dev.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
98
+ share_info = qcom_dsp.ION_IOC_SHARE(self.dev.ion_fd, handle=b.handle)
49
99
  va_addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, share_info.fd, 0)
50
100
  return DSPBuffer(va_addr, size, share_info, offset=0)
51
101
 
52
- def _free(self, opaque:DSPBuffer, options:BufferOptions):
53
- libc.munmap(opaque.va_addr, opaque.size)
54
- os.close(opaque.share_info.fd)
55
- qcom_dsp.ION_IOC_FREE(self.device.ion_fd, handle=opaque.share_info.handle)
102
+ def _free(self, opaque:DSPBuffer, options:BufferSpec):
103
+ if libc is not None and qcom_dsp is not None:
104
+ libc.munmap(opaque.va_addr, opaque.size)
105
+ os.close(opaque.share_info.fd)
106
+ qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
56
107
 
57
- def as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
58
- def copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
59
- def copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
60
- def offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)
108
+ def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
109
+ def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
110
+ def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
111
+ def _offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)
61
112
 
62
- class DSPDevice(Compiled):
63
- def __init__(self, device:str=""):
64
- self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
113
+ class ClangCompiler(Compiler):
114
+ def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
115
+ self.args = ['-shared', '-march=native'] if args is None else args
116
+ self.objdump_tool = objdump_tool
117
+ super().__init__(cachekey)
65
118
 
66
- # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
67
- sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
68
- sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
69
- with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
70
- self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
71
- self.link_ld.flush()
119
+ def compile(self, src:str) -> bytes:
120
+ # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
121
+ with tempfile.NamedTemporaryFile(delete=True) as output_file:
122
+ subprocess.check_output([getenv("CC", 'clang'), *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
123
+ '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
124
+ return pathlib.Path(output_file.name).read_bytes()
72
125
 
73
- compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b", f"-T{self.link_ld.name}"]
74
- super().__init__(device, DSPAllocator(self), DSPRenderer(),
75
- ClangCompiler("compile_dsp", args=compiler_args, objdump_tool='llvm-objdump'), functools.partial(DSPProgram, self))
126
+ def disassemble(self, lib:bytes): return cpu_objdump(lib, self.objdump_tool)
76
127
 
77
- fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
78
- self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferOptions(nolru=True))
79
- ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
128
+ class DSPDevice(Compiled):
129
+ def __init__(self, device:str=""):
130
+ compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b"]
131
+ try:
132
+ self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
133
+ # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
134
+ sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
135
+ sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
136
+ with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
137
+ self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
138
+ self.link_ld.flush()
139
+ super().__init__(device, DSPAllocator(self), DSPRenderer(),
140
+ ClangCompiler("compile_dsp", ["-shared"] + compiler_args + [f"-T{self.link_ld.name}"], 'llvm-objdump'), functools.partial(DSPProgram, self))
141
+ fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
142
+ self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
143
+ ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
80
144
 
81
- self.init_dsp()
82
- RPCListner(self).start()
145
+ self.init_dsp()
146
+ RPCListener(self).start()
147
+ except FileNotFoundError:
148
+ super().__init__(device, MallocAllocator, MockDSPRenderer(), ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)
83
149
 
84
150
  def open_lib(self, lib):
85
151
  self.binded_lib, self.binded_lib_off = lib, 0
@@ -117,7 +183,7 @@ class DSPDevice(Compiled):
117
183
  qcom_dsp.FASTRPC_IOCTL_INIT(self.rpc_fd, flags=0x1, file=self.shell_buf.va_addr, filelen=self.shell_buf.size, filefd=self.shell_buf.share_info.fd)
118
184
  qcom_dsp.FASTRPC_IOCTL_INVOKE(self.rpc_fd, handle=3, sc=rpc_sc(method=3, ins=0, outs=0))
119
185
 
120
- class RPCListner(threading.Thread):
186
+ class RPCListener(threading.Thread):
121
187
  def __init__(self, device:DSPDevice):
122
188
  super().__init__()
123
189
  self.device, self.daemon = device, True
@@ -179,3 +245,54 @@ class RPCListner(threading.Thread):
179
245
  st = qcom_dsp.FASTRPC_IOCTL_MMAP(self.device.rpc_fd, fd=-1, flags=in_args[0].cast('I')[2], vaddrin=0, size=in_args[0].cast('Q')[3])
180
246
  out_args[0].cast('Q')[0:2] = array.array('Q', [0, st.vaddrout])
181
247
  else: raise RuntimeError(f"Unknown op: {sc=:X}")
248
+
249
+ # ***** mock DSP *****
250
+
251
+ mockdsp_boilerplate = '''/* DSP boilerplate */ static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
252
+ long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = %7; trap0(#1); %0 = r0" : "=r" (retval)
253
+ : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "r" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
254
+ static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
255
+ static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
256
+ static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
257
+ static unsigned int inscount(void) {{ unsigned int ret; __asm__ volatile(".word 0x6a15c000; %0 = R0" : "=r" (ret) : : "r0"); return ret; }}
258
+ static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
259
+ return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
260
+
261
+ class MockDSPRenderer(DSPRenderer):
262
+ def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
263
+ ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
264
+ # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
265
+ # control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
266
+ msrc = [mockdsp_boilerplate, 'void _start(void) {']
267
+ for i,b in enumerate(bufs):
268
+ if isinstance(b[1][0], PtrDType):
269
+ sz = b[1][0].size*b[1][0].itemsize
270
+ # for loop for big reads
271
+ msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); for(int rd = 0; rd < {sz}; rd += read(0, buf{i}+rd, {sz}-rd));")
272
+ else:
273
+ msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
274
+ msrc.append("unsigned int st = inscount();")
275
+ msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
276
+ msrc.append("unsigned int et = inscount() - st; write(1, &et, sizeof(et));")
277
+ for i,b in enumerate(bufs):
278
+ if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
279
+ msrc.append('exit(0); }')
280
+ return ret + '\n' + '\n'.join(msrc)
281
+
282
+ class MockDSPProgram:
283
+ def __init__(self, name:str, lib:bytes): self.lib = lib
284
+ def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
285
+ with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
286
+ dsp_lib.write(self.lib)
287
+ dsp_lib.flush()
288
+ os.chmod(dsp_lib.name, 0o0777)
289
+ # NOTE: this timing includes a docker launch
290
+ proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
291
+ "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
292
+ input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
293
+ offset = 4
294
+ for x in bufs:
295
+ x[:] = proc.stdout[offset:offset+len(x)]
296
+ offset += len(x)
297
+ assert offset == len(proc.stdout)
298
+ return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Optional, List, cast
3
- import ctypes, functools, hashlib
2
+ from typing import Optional, cast
3
+ import ctypes, functools, hashlib, contextlib
4
4
  from tinygrad.runtime.autogen import opencl as cl
5
5
  from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
6
6
  from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
7
- from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
7
+ from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError
8
8
 
9
9
  # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
10
10
  OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,15 +15,15 @@ def check(status):
15
15
  def checked(ret, status): return (check(status.value), ret)[1]
16
16
 
17
17
  class CLCompiler(Compiler):
18
- def __init__(self, device:CLDevice, compile_key:str):
19
- self.device = device
18
+ def __init__(self, dev:CLDevice, compile_key:str):
19
+ self.dev = dev
20
20
  super().__init__(f"compile_cl_{compile_key}")
21
21
  def compile(self, src:str) -> bytes:
22
- program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
23
- build_status: int = cl.clBuildProgram(program, 1, self.device.device_id, None, cl.clBuildProgram.argtypes[4](), None)
22
+ program = checked(cl.clCreateProgramWithSource(self.dev.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
23
+ build_status: int = cl.clBuildProgram(program, 1, self.dev.device_id, None, cl.clBuildProgram.argtypes[4](), None)
24
24
  if build_status != 0:
25
- cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
26
- cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
25
+ cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
26
+ cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
27
27
  raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
28
28
  check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
29
29
  check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
@@ -32,7 +32,7 @@ class CLCompiler(Compiler):
32
32
 
33
33
  class CLProgram:
34
34
  def __init__(self, device:CLDevice, name:str, lib:bytes):
35
- self.device, self.name, self.lib = device, name, lib
35
+ self.dev, self.name, self.lib = device, name, lib
36
36
  self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
37
37
  to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
38
38
  errcode_ret := ctypes.c_int32()), errcode_ret)
@@ -41,15 +41,15 @@ class CLProgram:
41
41
  self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
42
42
 
43
43
  def __del__(self):
44
- if hasattr(self, 'kernel'): check(cl.clReleaseKernel(self.kernel))
45
- if hasattr(self, 'program'): check(cl.clReleaseProgram(self.program))
44
+ with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
45
+ with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))
46
46
 
47
- def __call__(self, *bufs:Tuple[ctypes._CData, BufferOptions], global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
47
+ def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
48
48
  for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
49
49
  for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
50
- if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
50
+ if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
51
51
  event = cl.cl_event() if wait else None
52
- check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
52
+ check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
53
53
  if wait:
54
54
  assert event is not None
55
55
  check(cl.clWaitForEvents(1, event))
@@ -59,31 +59,31 @@ class CLProgram:
59
59
  return None
60
60
 
61
61
  class CLAllocator(LRUAllocator):
62
- def __init__(self, device:CLDevice):
63
- self.device = device
62
+ def __init__(self, dev:CLDevice):
63
+ self.dev = dev
64
64
  super().__init__()
65
- def _alloc(self, size:int, options:BufferOptions) -> Tuple[ctypes._CData, BufferOptions]:
65
+ def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
66
66
  if options.image is not None:
67
- return (checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
67
+ return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
68
68
  cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
69
69
  options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
70
- return (checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
71
- def _free(self, opaque:Tuple[ctypes._CData, BufferOptions], options:BufferOptions): check(cl.clReleaseMemObject(opaque[0]))
72
- def copyin(self, dest:Tuple[ctypes._CData, BufferOptions], src:memoryview):
70
+ return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
71
+ def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
72
+ def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
73
73
  if dest[1].image is not None:
74
- check(cl.clEnqueueWriteImage(self.device.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
74
+ check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
75
75
  (ctypes.c_size_t * 3)(dest[1].image.shape[1],dest[1].image.shape[0],1), 0, 0, from_mv(src), 0, None, None))
76
76
  else:
77
77
  if mv_address(src) % 16: src = memoryview(bytearray(src))
78
- check(cl.clEnqueueWriteBuffer(self.device.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
79
- self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
80
- def copyout(self, dest:memoryview, src:Tuple[ctypes._CData, BufferOptions]):
78
+ check(cl.clEnqueueWriteBuffer(self.dev.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
79
+ self.dev.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
80
+ def _copyout(self, dest:memoryview, src:tuple[ctypes._CData, BufferSpec]):
81
81
  if src[1].image is not None:
82
- check(cl.clEnqueueReadImage(self.device.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
82
+ check(cl.clEnqueueReadImage(self.dev.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
83
83
  (ctypes.c_size_t * 3)(src[1].image.shape[1],src[1].image.shape[0],1), 0, 0, from_mv(dest), 0, None, None))
84
84
  else:
85
- check(cl.clEnqueueReadBuffer(self.device.queue, src[0], False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
86
- self.device.synchronize()
85
+ check(cl.clEnqueueReadBuffer(self.dev.queue, src[0], False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
86
+ self.dev.synchronize()
87
87
 
88
88
  class CLDevice(Compiled):
89
89
  device_ids = None # this is global and only initted once
@@ -103,7 +103,7 @@ class CLDevice(Compiled):
103
103
  if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
104
104
  self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
105
105
  self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
106
- self.pending_copyin: List[memoryview] = []
106
+ self.pending_copyin: list[memoryview] = []
107
107
  self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
108
108
 
109
109
  compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
@@ -1,8 +1,6 @@
1
- from __future__ import annotations
2
1
  import ctypes, functools
3
- from typing import Tuple
4
2
  from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
5
- from tinygrad.device import Compiled, LRUAllocator, BufferOptions
3
+ from tinygrad.device import Compiled, LRUAllocator, BufferSpec
6
4
  from tinygrad.runtime.autogen import hip
7
5
  from tinygrad.runtime.support.compiler_hip import AMDCompiler
8
6
  from tinygrad.renderer.cstyle import HIPRenderer
@@ -11,18 +9,28 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint
11
9
  def check(status):
12
10
  if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")
13
11
 
12
+ class HIPDevice(Compiled):
13
+ def __init__(self, device:str=""):
14
+ self.device_id = int(device.split(":")[1]) if ":" in device else 0
15
+ self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
16
+ self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
17
+ super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
18
+ def synchronize(self):
19
+ check(hip.hipSetDevice(self.device_id))
20
+ check(hip.hipDeviceSynchronize())
21
+
14
22
  class HIPProgram:
15
- def __init__(self, device:HIPDevice, name:str, lib:bytes):
16
- self.device, self.name, self.lib = device, name, lib
17
- check(hip.hipSetDevice(self.device.device_id))
23
+ def __init__(self, dev:HIPDevice, name:str, lib:bytes):
24
+ self.dev, self.name, self.lib = dev, name, lib
25
+ check(hip.hipSetDevice(self.dev.device_id))
18
26
  self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
19
27
  self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
20
28
 
21
29
  def __del__(self):
22
30
  if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
23
31
 
24
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
25
- check(hip.hipSetDevice(self.device.device_id))
32
+ def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
33
+ check(hip.hipSetDevice(self.dev.device_id))
26
34
  if not hasattr(self, "vargs"):
27
35
  self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
28
36
  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
@@ -32,37 +40,27 @@ class HIPProgram:
32
40
  for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
33
41
  for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
34
42
 
35
- if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
43
+ if wait: check(hip.hipEventRecord(self.dev.time_event_st, None))
36
44
 
37
45
  check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
38
46
 
39
47
  if wait:
40
- check(hip.hipEventRecord(self.device.time_event_en, None))
41
- check(hip.hipEventSynchronize(self.device.time_event_en))
42
- check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.device.time_event_st, self.device.time_event_en))
48
+ check(hip.hipEventRecord(self.dev.time_event_en, None))
49
+ check(hip.hipEventSynchronize(self.dev.time_event_en))
50
+ check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.dev.time_event_st, self.dev.time_event_en))
43
51
  return ret.value * 1e-3
44
52
 
45
53
  class HIPAllocator(LRUAllocator):
46
- def __init__(self, device:HIPDevice):
47
- self.device = device
54
+ def __init__(self, dev:HIPDevice):
55
+ self.dev = dev
48
56
  super().__init__()
49
- def _alloc(self, size:int, options:BufferOptions):
50
- check(hip.hipSetDevice(self.device.device_id))
57
+ def _alloc(self, size:int, options:BufferSpec):
58
+ check(hip.hipSetDevice(self.dev.device_id))
51
59
  return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
52
- def _free(self, opaque, options:BufferOptions): check(hip.hipFree(opaque))
53
- def copyin(self, dest, src: memoryview):
54
- check(hip.hipSetDevice(self.device.device_id))
60
+ def _free(self, opaque, options:BufferSpec): check(hip.hipFree(opaque))
61
+ def _copyin(self, dest, src: memoryview):
62
+ check(hip.hipSetDevice(self.dev.device_id))
55
63
  check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
56
- def copyout(self, dest:memoryview, src):
57
- self.device.synchronize()
64
+ def _copyout(self, dest:memoryview, src):
65
+ self.dev.synchronize()
58
66
  check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
59
-
60
- class HIPDevice(Compiled):
61
- def __init__(self, device:str=""):
62
- self.device_id = int(device.split(":")[1]) if ":" in device else 0
63
- self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
64
- self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
65
- super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
66
- def synchronize(self):
67
- check(hip.hipSetDevice(self.device_id))
68
- check(hip.hipDeviceSynchronize())
@@ -1,51 +1,58 @@
1
- from __future__ import annotations
2
- import ctypes, functools
3
- from typing import Tuple
4
- from tinygrad.device import Compiled, Compiler, MallocAllocator
5
- from tinygrad.helpers import cpu_time_execution, getenv, cpu_objdump
1
+ import ctypes, platform
2
+ from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
3
+ from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
6
4
  from tinygrad.renderer.llvmir import LLVMRenderer
7
- import llvmlite.binding as llvm
5
+ import tinygrad.runtime.autogen.llvm as llvm
6
+ from tinygrad.runtime.support.elf import jit_loader
7
+
8
+ def cerr(): return ctypes.pointer(ctypes.pointer(ctypes.c_char()))
9
+
10
+ def expect(x, err, ret=None):
11
+ if x: raise RuntimeError(llvm.string_cast(err.contents) if not isinstance(err, str) else err)
12
+ return ret
8
13
 
9
14
  class LLVMCompiler(Compiler):
10
- def __init__(self, device:LLVMDevice, opt:bool=False):
11
- self.device = device
12
- self.optimizer: llvm.passmanagers.ModulePassManager = llvm.create_module_pass_manager()
13
- self.device.target_machine.add_analysis_passes(self.optimizer)
14
- if opt:
15
- with llvm.create_pass_manager_builder() as builder:
16
- builder.opt_level = 3; builder.size_level = 0; builder.loop_vectorize = True; builder.slp_vectorize = True # noqa: E702
17
- builder.populate(self.optimizer)
18
- super().__init__("compile_llvm_opt" if opt else "compile_llvm")
15
+ def __init__(self, host_arch:str):
16
+ for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
19
17
 
20
- def compile(self, src:str) -> bytes:
21
- mod = llvm.parse_assembly(src)
22
- mod.verify()
23
- self.optimizer.run(mod)
24
- return self.device.target_machine.emit_object(mod)
18
+ triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
19
+ target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
20
+ # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
21
+ cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
22
+ if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
23
+ self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
24
+ llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)
25
25
 
26
- def disassemble(self, lib:bytes): cpu_objdump(lib)
26
+ self.pbo = llvm.LLVMCreatePassBuilderOptions()
27
+ if (opt:=bool(getenv("LLVMOPT", "1"))):
28
+ self.passes = b'default<O2>'
29
+ llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
30
+ llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
31
+ llvm.LLVMPassBuilderOptionsSetSLPVectorization(self.pbo, True)
32
+ llvm.LLVMPassBuilderOptionsSetVerifyEach(self.pbo, True)
33
+ else:
34
+ self.passes = b'default<O0>'
27
35
 
28
- class LLVMProgram:
29
- def __init__(self, device:LLVMDevice, name:str, lib:bytes):
30
- self.name, self.lib = name, lib
31
- device.engine.add_object_file(llvm.object_file.ObjectFileRef.from_data(lib))
32
- self.fxn = device.engine.get_function_address(name)
33
- assert self.fxn != 0, "LLVM failed to get function address"
36
+ super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
37
+
38
+ def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
39
+
40
+ def compile(self, src:str) -> bytes:
41
+ src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
42
+ mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
43
+ expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
44
+ expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
45
+ if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
46
+ obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
47
+ ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
48
+ llvm.LLVMDisposeModule(mod)
49
+ obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
50
+ llvm.LLVMDisposeMemoryBuffer(obj_buf)
51
+ return jit_loader(obj)
34
52
 
35
- def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
36
- if not hasattr(self, 'cfunc'):
37
- self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
38
- return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
53
+ def disassemble(self, lib:bytes): capstone_flatdump(lib)
39
54
 
40
55
  class LLVMDevice(Compiled):
41
56
  def __init__(self, device:str):
42
- llvm.initialize()
43
- llvm.initialize_native_target()
44
- llvm.initialize_native_asmprinter()
45
- llvm.initialize_native_asmparser()
46
- # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
47
- self.target_machine: llvm.targets.TargetMachine = llvm.Target.from_triple(llvm.get_process_triple()).create_target_machine(opt=2)
48
- backing_mod = llvm.parse_assembly(str())
49
- backing_mod.triple = llvm.get_process_triple()
50
- self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
51
- super().__init__(device, MallocAllocator, LLVMRenderer(), LLVMCompiler(self, getenv("LLVMOPT")), functools.partial(LLVMProgram, self))
57
+ compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
58
+ super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)