tinygrad 0.10.0__py3-none-any.whl → 0.10.1__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72)
  1. tinygrad/codegen/kernel.py +114 -172
  2. tinygrad/codegen/linearize.py +211 -81
  3. tinygrad/codegen/lowerer.py +30 -35
  4. tinygrad/codegen/{uopgraph.py → rewriter.py} +69 -59
  5. tinygrad/codegen/transcendental.py +12 -13
  6. tinygrad/device.py +170 -47
  7. tinygrad/dtype.py +28 -26
  8. tinygrad/engine/jit.py +80 -63
  9. tinygrad/engine/memory.py +4 -5
  10. tinygrad/engine/multi.py +162 -0
  11. tinygrad/engine/realize.py +58 -107
  12. tinygrad/engine/schedule.py +381 -314
  13. tinygrad/engine/search.py +40 -44
  14. tinygrad/gradient.py +70 -0
  15. tinygrad/helpers.py +77 -58
  16. tinygrad/nn/__init__.py +30 -32
  17. tinygrad/nn/datasets.py +1 -2
  18. tinygrad/nn/optim.py +22 -26
  19. tinygrad/nn/state.py +89 -64
  20. tinygrad/ops.py +562 -446
  21. tinygrad/renderer/__init__.py +79 -36
  22. tinygrad/renderer/cstyle.py +70 -84
  23. tinygrad/renderer/llvmir.py +32 -20
  24. tinygrad/renderer/ptx.py +79 -99
  25. tinygrad/renderer/wgsl.py +87 -0
  26. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  27. tinygrad/runtime/autogen/comgr.py +2 -0
  28. tinygrad/runtime/autogen/kfd.py +4 -3
  29. tinygrad/runtime/autogen/kgsl.py +1 -1
  30. tinygrad/runtime/autogen/libpciaccess.py +2023 -0
  31. tinygrad/runtime/autogen/llvm.py +11379 -0
  32. tinygrad/runtime/autogen/vfio.py +891 -0
  33. tinygrad/runtime/graph/cuda.py +8 -9
  34. tinygrad/runtime/graph/hcq.py +84 -79
  35. tinygrad/runtime/graph/metal.py +19 -21
  36. tinygrad/runtime/ops_amd.py +488 -327
  37. tinygrad/runtime/ops_clang.py +15 -28
  38. tinygrad/runtime/ops_cloud.py +34 -34
  39. tinygrad/runtime/ops_cuda.py +30 -27
  40. tinygrad/runtime/ops_disk.py +62 -63
  41. tinygrad/runtime/ops_dsp.py +129 -38
  42. tinygrad/runtime/ops_gpu.py +30 -30
  43. tinygrad/runtime/ops_hip.py +29 -31
  44. tinygrad/runtime/ops_llvm.py +45 -40
  45. tinygrad/runtime/ops_metal.py +93 -73
  46. tinygrad/runtime/ops_npy.py +2 -2
  47. tinygrad/runtime/ops_nv.py +232 -270
  48. tinygrad/runtime/ops_python.py +51 -46
  49. tinygrad/runtime/ops_qcom.py +129 -157
  50. tinygrad/runtime/ops_webgpu.py +63 -0
  51. tinygrad/runtime/support/allocator.py +94 -0
  52. tinygrad/runtime/support/am/__init__.py +0 -0
  53. tinygrad/runtime/support/am/amdev.py +384 -0
  54. tinygrad/runtime/support/am/ip.py +463 -0
  55. tinygrad/runtime/support/compiler_cuda.py +4 -2
  56. tinygrad/runtime/support/elf.py +26 -4
  57. tinygrad/runtime/support/hcq.py +254 -324
  58. tinygrad/runtime/support/llvm.py +32 -0
  59. tinygrad/shape/shapetracker.py +84 -53
  60. tinygrad/shape/view.py +103 -138
  61. tinygrad/spec.py +154 -0
  62. tinygrad/tensor.py +744 -496
  63. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/METADATA +32 -21
  64. tinygrad-0.10.1.dist-info/RECORD +86 -0
  65. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/WHEEL +1 -1
  66. tinygrad/engine/lazy.py +0 -228
  67. tinygrad/function.py +0 -212
  68. tinygrad/multi.py +0 -177
  69. tinygrad/runtime/graph/clang.py +0 -39
  70. tinygrad-0.10.0.dist-info/RECORD +0 -77
  71. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/LICENSE +0 -0
  72. {tinygrad-0.10.0.dist-info → tinygrad-0.10.1.dist-info}/top_level.txt +0 -0
@@ -1,14 +1,45 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Any
3
- import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys
2
+ from typing import Tuple, Any, List
3
+ import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, time, struct
4
4
  assert sys.platform != 'win32'
5
- from tinygrad.device import BufferOptions, Compiled, Allocator
6
- from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv
7
- from tinygrad.runtime.ops_clang import ClangCompiler
8
- from tinygrad.renderer.cstyle import DSPRenderer
5
+ from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
6
+ from tinygrad.dtype import dtypes, DType, PtrDType
7
+ from tinygrad.ops import Ops, UOp
8
+ from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
9
+ from tinygrad.renderer.cstyle import ClangRenderer
9
10
  from tinygrad.runtime.autogen import libc, qcom_dsp
10
11
  if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
11
12
 
13
+ class DSPRenderer(ClangRenderer):
14
+ device = "DSP"
15
+ supports_float4 = False
16
+ buffer_suffix = " restrict __attribute__((align_value(128)))"
17
+ kernel_prefix = "__attribute__((noinline)) "
18
+ type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
19
+ code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
20
+ Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
21
+ Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
22
+
23
+ def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,Tuple[DType,bool]]], uops:List[UOp], prefix=None) -> str:
24
+ ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
25
+ msrc = ['''struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency; _Bool set_dcvs_params;
26
+ short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3]; };''', 'int HAP_power_set(void*, void*);',
27
+ 'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
28
+ 'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
29
+ 'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
30
+ 'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
31
+ 'HAP_power_set((void*)handle, (void*)&req);']
32
+ msrc += ['if ((sc>>24) != 2) return 0;']
33
+ msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
34
+ msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
35
+ msrc += [f'void *buf_{i} = HAP_mmap(0,sz_or_val_{i},3,0,pra[{i+3}].dma.fd,0)+off{i};' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
36
+ msrc += ["unsigned long long start = HAP_perf_get_time_us();"]
37
+ msrc += [f"{function_name}({', '.join([(f'buf_{i}' if isinstance(b[1][0], PtrDType) else f'sz_or_val_{i}') for i,b in enumerate(bufs)])});"]
38
+ msrc += ["*(unsigned long long *)(pra[2].buf.pv) = HAP_perf_get_time_us() - start;"]
39
+ msrc += [f'HAP_munmap(buf_{i}, sz_or_val_{i});' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
40
+ msrc += ["return 0; }"]
41
+ return ret + '\n' + '\n'.join(msrc)
42
+
12
43
  def rpc_sc(method=0, ins=0, outs=0, fds=0): return (method << 24) | (ins << 16) | (outs << 8) | fds
13
44
  def rpc_prep_args(ins=None, outs=None, in_fds=None):
14
45
  ins, outs, in_fds = ins or list(), outs or list(), in_fds or list()
@@ -21,8 +52,8 @@ def rpc_prep_args(ins=None, outs=None, in_fds=None):
21
52
  return pra, fds, attrs, (ins, outs)
22
53
 
23
54
  class DSPProgram:
24
- def __init__(self, device:DSPDevice, name:str, lib:bytes):
25
- self.device, self.lib = device, lib
55
+ def __init__(self, dev:DSPDevice, name:str, lib:bytes):
56
+ self.dev, self.lib = dev, lib
26
57
 
27
58
  def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
28
59
  if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")
@@ -31,7 +62,7 @@ class DSPProgram:
31
62
  outs=[timer:=memoryview(bytearray(8)).cast('Q')], in_fds=[b.share_info.fd for b in bufs])
32
63
  var_vals_mv.cast('i')[:] = array.array('i', tuple(b.size for b in bufs) + vals)
33
64
  off_mv.cast('I')[:] = array.array('I', tuple(b.offset for b in bufs))
34
- self.device.exec_lib(self.lib, rpc_sc(method=2, ins=2, outs=1, fds=len(bufs)), pra, fds, attrs)
65
+ self.dev.exec_lib(self.lib, rpc_sc(method=2, ins=2, outs=1, fds=len(bufs)), pra, fds, attrs)
35
66
  return timer[0] / 1e6
36
67
 
37
68
  class DSPBuffer:
@@ -39,47 +70,62 @@ class DSPBuffer:
39
70
  self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
40
71
 
41
72
  class DSPAllocator(Allocator):
42
- def __init__(self, device:DSPDevice):
43
- self.device = device
73
+ def __init__(self, dev:DSPDevice):
74
+ self.dev = dev
44
75
  super().__init__()
45
76
 
46
- def _alloc(self, size:int, options:BufferOptions):
47
- b = qcom_dsp.ION_IOC_ALLOC(self.device.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
48
- share_info = qcom_dsp.ION_IOC_SHARE(self.device.ion_fd, handle=b.handle)
77
+ def _alloc(self, size:int, options:BufferSpec):
78
+ b = qcom_dsp.ION_IOC_ALLOC(self.dev.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
79
+ share_info = qcom_dsp.ION_IOC_SHARE(self.dev.ion_fd, handle=b.handle)
49
80
  va_addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, share_info.fd, 0)
50
81
  return DSPBuffer(va_addr, size, share_info, offset=0)
51
82
 
52
- def _free(self, opaque:DSPBuffer, options:BufferOptions):
83
+ def _free(self, opaque:DSPBuffer, options:BufferSpec):
53
84
  libc.munmap(opaque.va_addr, opaque.size)
54
85
  os.close(opaque.share_info.fd)
55
- qcom_dsp.ION_IOC_FREE(self.device.ion_fd, handle=opaque.share_info.handle)
86
+ qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
56
87
 
57
- def as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
58
- def copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
59
- def copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
60
- def offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)
88
+ def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
89
+ def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
90
+ def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
91
+ def _offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)
61
92
 
62
- class DSPDevice(Compiled):
63
- def __init__(self, device:str=""):
64
- self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
93
+ class ClangCompiler(Compiler):
94
+ def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
95
+ self.args = ['-shared', '-march=native'] if args is None else args
96
+ self.objdump_tool = objdump_tool
97
+ super().__init__(cachekey)
65
98
 
66
- # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
67
- sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
68
- sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
69
- with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
70
- self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
71
- self.link_ld.flush()
99
+ def compile(self, src:str) -> bytes:
100
+ # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
101
+ with tempfile.NamedTemporaryFile(delete=True) as output_file:
102
+ subprocess.check_output(['clang', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
103
+ '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
104
+ return pathlib.Path(output_file.name).read_bytes()
72
105
 
73
- compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b", f"-T{self.link_ld.name}"]
74
- super().__init__(device, DSPAllocator(self), DSPRenderer(),
75
- ClangCompiler("compile_dsp", args=compiler_args, objdump_tool='llvm-objdump'), functools.partial(DSPProgram, self))
106
+ def disassemble(self, lib:bytes): return cpu_objdump(lib, self.objdump_tool)
76
107
 
77
- fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
78
- self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferOptions(nolru=True))
79
- ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
108
+ class DSPDevice(Compiled):
109
+ def __init__(self, device:str=""):
110
+ compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b"]
111
+ try:
112
+ self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
113
+ # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
114
+ sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
115
+ sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
116
+ with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
117
+ self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
118
+ self.link_ld.flush()
119
+ super().__init__(device, DSPAllocator(self), DSPRenderer(),
120
+ ClangCompiler("compile_dsp", ["-shared"] + compiler_args + [f"-T{self.link_ld.name}"], 'llvm-objdump'), functools.partial(DSPProgram, self))
121
+ fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
122
+ self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
123
+ ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)
80
124
 
81
- self.init_dsp()
82
- RPCListner(self).start()
125
+ self.init_dsp()
126
+ RPCListener(self).start()
127
+ except FileNotFoundError:
128
+ super().__init__(device, MallocAllocator, MockDSPRenderer(), ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)
83
129
 
84
130
  def open_lib(self, lib):
85
131
  self.binded_lib, self.binded_lib_off = lib, 0
@@ -117,7 +163,7 @@ class DSPDevice(Compiled):
117
163
  qcom_dsp.FASTRPC_IOCTL_INIT(self.rpc_fd, flags=0x1, file=self.shell_buf.va_addr, filelen=self.shell_buf.size, filefd=self.shell_buf.share_info.fd)
118
164
  qcom_dsp.FASTRPC_IOCTL_INVOKE(self.rpc_fd, handle=3, sc=rpc_sc(method=3, ins=0, outs=0))
119
165
 
120
- class RPCListner(threading.Thread):
166
+ class RPCListener(threading.Thread):
121
167
  def __init__(self, device:DSPDevice):
122
168
  super().__init__()
123
169
  self.device, self.daemon = device, True
@@ -179,3 +225,48 @@ class RPCListner(threading.Thread):
179
225
  st = qcom_dsp.FASTRPC_IOCTL_MMAP(self.device.rpc_fd, fd=-1, flags=in_args[0].cast('I')[2], vaddrin=0, size=in_args[0].cast('Q')[3])
180
226
  out_args[0].cast('Q')[0:2] = array.array('Q', [0, st.vaddrout])
181
227
  else: raise RuntimeError(f"Unknown op: {sc=:X}")
228
+
229
+ # ***** mock DSP *****
230
+
231
+ class MockDSPRenderer(DSPRenderer):
232
+ def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,Tuple[DType,bool]]], uops:List[UOp], prefix=None) -> str:
233
+ ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
234
+ # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
235
+ msrc = ['''static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
236
+ long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = #%7; trap0(#1); %0 = r0" : "=r" (retval)
237
+ : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "i" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
238
+ static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
239
+ static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
240
+ static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
241
+ static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
242
+ return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}''', 'void _start(void) {']
243
+ for i,b in enumerate(bufs):
244
+ if isinstance(b[1][0], PtrDType):
245
+ sz = b[1][0].size*b[1][0].itemsize
246
+ msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); read(0, buf{i}, {sz});")
247
+ else:
248
+ msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
249
+ msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
250
+ for i,b in enumerate(bufs):
251
+ if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
252
+ msrc.append('exit(0); }')
253
+ return ret + '\n' + '\n'.join(msrc)
254
+
255
+ class MockDSPProgram:
256
+ def __init__(self, name:str, lib:bytes): self.lib = lib
257
+ def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
258
+ with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
259
+ dsp_lib.write(self.lib)
260
+ dsp_lib.flush()
261
+ os.chmod(dsp_lib.name, 0o0777)
262
+ # NOTE: this timing includes a docker launch
263
+ start = time.perf_counter()
264
+ proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
265
+ "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 3 else ''} /work/"+os.path.basename(dsp_lib.name)],
266
+ input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
267
+ elapsed = time.perf_counter() - start
268
+ offset = 0
269
+ for x in bufs:
270
+ x[:] = proc.stdout[offset:offset+len(x)]
271
+ offset += len(x)
272
+ return elapsed
@@ -1,10 +1,10 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Optional, List, cast
3
- import ctypes, functools, hashlib
2
+ from typing import Optional, cast
3
+ import ctypes, functools, hashlib, contextlib
4
4
  from tinygrad.runtime.autogen import opencl as cl
5
5
  from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
6
6
  from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
7
- from tinygrad.device import BufferOptions, LRUAllocator, Compiled, Compiler, CompileError
7
+ from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError
8
8
 
9
9
  # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
10
10
  OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,15 +15,15 @@ def check(status):
15
15
  def checked(ret, status): return (check(status.value), ret)[1]
16
16
 
17
17
  class CLCompiler(Compiler):
18
- def __init__(self, device:CLDevice, compile_key:str):
19
- self.device = device
18
+ def __init__(self, dev:CLDevice, compile_key:str):
19
+ self.dev = dev
20
20
  super().__init__(f"compile_cl_{compile_key}")
21
21
  def compile(self, src:str) -> bytes:
22
- program = checked(cl.clCreateProgramWithSource(self.device.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
23
- build_status: int = cl.clBuildProgram(program, 1, self.device.device_id, None, cl.clBuildProgram.argtypes[4](), None)
22
+ program = checked(cl.clCreateProgramWithSource(self.dev.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
23
+ build_status: int = cl.clBuildProgram(program, 1, self.dev.device_id, None, cl.clBuildProgram.argtypes[4](), None)
24
24
  if build_status != 0:
25
- cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
26
- cl.clGetProgramBuildInfo(program, self.device.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
25
+ cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
26
+ cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
27
27
  raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
28
28
  check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
29
29
  check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
@@ -32,7 +32,7 @@ class CLCompiler(Compiler):
32
32
 
33
33
  class CLProgram:
34
34
  def __init__(self, device:CLDevice, name:str, lib:bytes):
35
- self.device, self.name, self.lib = device, name, lib
35
+ self.dev, self.name, self.lib = device, name, lib
36
36
  self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
37
37
  to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
38
38
  errcode_ret := ctypes.c_int32()), errcode_ret)
@@ -41,15 +41,15 @@ class CLProgram:
41
41
  self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)
42
42
 
43
43
  def __del__(self):
44
- if hasattr(self, 'kernel'): check(cl.clReleaseKernel(self.kernel))
45
- if hasattr(self, 'program'): check(cl.clReleaseProgram(self.program))
44
+ with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
45
+ with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))
46
46
 
47
- def __call__(self, *bufs:Tuple[ctypes._CData, BufferOptions], global_size:Tuple[int,int,int]=(1,1,1), local_size:Optional[Tuple[int,int,int]]=None, vals:Tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
47
+ def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
48
48
  for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
49
49
  for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
50
- if local_size is not None: global_size = cast(Tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
50
+ if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
51
51
  event = cl.cl_event() if wait else None
52
- check(cl.clEnqueueNDRangeKernel(self.device.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
52
+ check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
53
53
  if wait:
54
54
  assert event is not None
55
55
  check(cl.clWaitForEvents(1, event))
@@ -59,31 +59,31 @@ class CLProgram:
59
59
  return None
60
60
 
61
61
  class CLAllocator(LRUAllocator):
62
- def __init__(self, device:CLDevice):
63
- self.device = device
62
+ def __init__(self, dev:CLDevice):
63
+ self.dev = dev
64
64
  super().__init__()
65
- def _alloc(self, size:int, options:BufferOptions) -> Tuple[ctypes._CData, BufferOptions]:
65
+ def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
66
66
  if options.image is not None:
67
- return (checked(cl.clCreateImage2D(self.device.context, cl.CL_MEM_READ_WRITE,
67
+ return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
68
68
  cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
69
69
  options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
70
- return (checked(cl.clCreateBuffer(self.device.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
71
- def _free(self, opaque:Tuple[ctypes._CData, BufferOptions], options:BufferOptions): check(cl.clReleaseMemObject(opaque[0]))
72
- def copyin(self, dest:Tuple[ctypes._CData, BufferOptions], src:memoryview):
70
+ return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
71
+ def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
72
+ def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
73
73
  if dest[1].image is not None:
74
- check(cl.clEnqueueWriteImage(self.device.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
74
+ check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
75
75
  (ctypes.c_size_t * 3)(dest[1].image.shape[1],dest[1].image.shape[0],1), 0, 0, from_mv(src), 0, None, None))
76
76
  else:
77
77
  if mv_address(src) % 16: src = memoryview(bytearray(src))
78
- check(cl.clEnqueueWriteBuffer(self.device.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
79
- self.device.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
80
- def copyout(self, dest:memoryview, src:Tuple[ctypes._CData, BufferOptions]):
78
+ check(cl.clEnqueueWriteBuffer(self.dev.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
79
+ self.dev.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
80
+ def _copyout(self, dest:memoryview, src:tuple[ctypes._CData, BufferSpec]):
81
81
  if src[1].image is not None:
82
- check(cl.clEnqueueReadImage(self.device.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
82
+ check(cl.clEnqueueReadImage(self.dev.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
83
83
  (ctypes.c_size_t * 3)(src[1].image.shape[1],src[1].image.shape[0],1), 0, 0, from_mv(dest), 0, None, None))
84
84
  else:
85
- check(cl.clEnqueueReadBuffer(self.device.queue, src[0], False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
86
- self.device.synchronize()
85
+ check(cl.clEnqueueReadBuffer(self.dev.queue, src[0], False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
86
+ self.dev.synchronize()
87
87
 
88
88
  class CLDevice(Compiled):
89
89
  device_ids = None # this is global and only initted once
@@ -103,7 +103,7 @@ class CLDevice(Compiled):
103
103
  if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
104
104
  self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
105
105
  self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
106
- self.pending_copyin: List[memoryview] = []
106
+ self.pending_copyin: list[memoryview] = []
107
107
  self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501
108
108
 
109
109
  compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
@@ -1,8 +1,6 @@
1
- from __future__ import annotations
2
1
  import ctypes, functools
3
- from typing import Tuple
4
2
  from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
5
- from tinygrad.device import Compiled, LRUAllocator, BufferOptions
3
+ from tinygrad.device import Compiled, LRUAllocator, BufferSpec
6
4
  from tinygrad.runtime.autogen import hip
7
5
  from tinygrad.runtime.support.compiler_hip import AMDCompiler
8
6
  from tinygrad.renderer.cstyle import HIPRenderer
@@ -11,18 +9,28 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint
11
9
  def check(status):
12
10
  if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")
13
11
 
12
+ class HIPDevice(Compiled):
13
+ def __init__(self, device:str=""):
14
+ self.device_id = int(device.split(":")[1]) if ":" in device else 0
15
+ self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
16
+ self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
17
+ super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
18
+ def synchronize(self):
19
+ check(hip.hipSetDevice(self.device_id))
20
+ check(hip.hipDeviceSynchronize())
21
+
14
22
  class HIPProgram:
15
- def __init__(self, device:HIPDevice, name:str, lib:bytes):
16
- self.device, self.name, self.lib = device, name, lib
17
- check(hip.hipSetDevice(self.device.device_id))
23
+ def __init__(self, dev:HIPDevice, name:str, lib:bytes):
24
+ self.dev, self.name, self.lib = dev, name, lib
25
+ check(hip.hipSetDevice(self.dev.device_id))
18
26
  self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
19
27
  self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))
20
28
 
21
29
  def __del__(self):
22
30
  if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))
23
31
 
24
- def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
25
- check(hip.hipSetDevice(self.device.device_id))
32
+ def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
33
+ check(hip.hipSetDevice(self.dev.device_id))
26
34
  if not hasattr(self, "vargs"):
27
35
  self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
28
36
  [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
@@ -32,37 +40,27 @@ class HIPProgram:
32
40
  for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
33
41
  for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])
34
42
 
35
- if wait: check(hip.hipEventRecord(self.device.time_event_st, None))
43
+ if wait: check(hip.hipEventRecord(self.dev.time_event_st, None))
36
44
 
37
45
  check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))
38
46
 
39
47
  if wait:
40
- check(hip.hipEventRecord(self.device.time_event_en, None))
41
- check(hip.hipEventSynchronize(self.device.time_event_en))
42
- check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.device.time_event_st, self.device.time_event_en))
48
+ check(hip.hipEventRecord(self.dev.time_event_en, None))
49
+ check(hip.hipEventSynchronize(self.dev.time_event_en))
50
+ check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.dev.time_event_st, self.dev.time_event_en))
43
51
  return ret.value * 1e-3
44
52
 
class HIPAllocator(LRUAllocator):
  """LRUAllocator subclass whose allocations and copies go through the HIP runtime for one HIPDevice."""
  def __init__(self, dev:HIPDevice):
    self.dev = dev
    super().__init__()

  def _alloc(self, size:int, options:BufferSpec):
    """Make the device current, hipMalloc `size` bytes and return the init_c_var result for the new hipDeviceptr_t."""
    check(hip.hipSetDevice(self.dev.device_id))
    def malloc(ptr): return check(hip.hipMalloc(ctypes.byref(ptr), size))
    return init_c_var(hip.hipDeviceptr_t(), malloc)

  def _free(self, opaque, options:BufferSpec):
    """hipFree the pointer `opaque`; `options` is not used."""
    check(hip.hipFree(opaque))

  def _copyin(self, dest, src: memoryview):
    """Copy all bytes of the host memoryview `src` to `dest` with a host-to-device hipMemcpy."""
    check(hip.hipSetDevice(self.dev.device_id))
    host_ptr = from_mv(src)
    check(hip.hipMemcpy(dest, host_ptr, len(src), hip.hipMemcpyHostToDevice))

  def _copyout(self, dest:memoryview, src):
    """Synchronize the device, then fill the host memoryview `dest` from `src` with a device-to-host hipMemcpy."""
    self.dev.synchronize()
    host_ptr = from_mv(dest)
    check(hip.hipMemcpy(host_ptr, src, len(dest), hip.hipMemcpyDeviceToHost))
59
-
60
- class HIPDevice(Compiled):
61
- def __init__(self, device:str=""):
62
- self.device_id = int(device.split(":")[1]) if ":" in device else 0
63
- self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
64
- self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
65
- super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
66
- def synchronize(self):
67
- check(hip.hipSetDevice(self.device_id))
68
- check(hip.hipDeviceSynchronize())
@@ -1,51 +1,56 @@
1
- from __future__ import annotations
2
- import ctypes, functools
3
- from typing import Tuple
4
- from tinygrad.device import Compiled, Compiler, MallocAllocator
5
- from tinygrad.helpers import cpu_time_execution, getenv, cpu_objdump
1
+ import ctypes, platform, sys
2
+ from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
3
+ from tinygrad.helpers import OSX, getenv, capstone_flatdump
6
4
  from tinygrad.renderer.llvmir import LLVMRenderer
7
- import llvmlite.binding as llvm
5
+ import tinygrad.runtime.autogen.llvm as llvm
6
+ from tinygrad.runtime.support.elf import jit_loader
7
+
def cerr():
  """Return a fresh pointer to a pointer to a zero-initialised c_char, used as the error out-parameter of the llvm calls in this file."""
  message = ctypes.pointer(ctypes.c_char())
  return ctypes.pointer(message)
9
+
def expect(x, err, ret=None):
  """Return `ret` when the status `x` is falsy, otherwise raise RuntimeError.

  `err` is either a ready-made message (str) or an error out-parameter whose `.contents` is turned into
  the message with llvm.string_cast.
  """
  if not x: return ret
  message = err if isinstance(err, str) else llvm.string_cast(err.contents)
  raise RuntimeError(message)
8
13
 
class LLVMCompiler(Compiler):
  """Compiler that turns LLVM IR text into host code through the bindings in tinygrad.runtime.autogen.llvm."""
  def __init__(self, host_arch:str, opt:bool):
    """Initialise the LLVM backend named `host_arch` ('AArch64' or 'X86') and pick the pass pipeline.

    `opt` selects b'default<O2>' with loop unrolling, loop/SLP vectorization and verify-each enabled;
    otherwise b'default<O0>' is used. Raises RuntimeError (via expect) if the target lookup fails.
    """
    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()

    triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
    target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
    # +reserve-x18 here does the same thing as -ffixed-x18 in ops_clang.py, see comments there for why it's needed on arm osx
    self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, b'', b'+reserve-x18' if OSX and host_arch == 'AArch64' else b'',
                                                       llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)

    self.pbo = llvm.LLVMCreatePassBuilderOptions()
    if opt:
      self.passes = b'default<O2>'
      llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
      llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
      llvm.LLVMPassBuilderOptionsSetSLPVectorization(self.pbo, True)
      llvm.LLVMPassBuilderOptionsSetVerifyEach(self.pbo, True)
    else:
      self.passes = b'default<O0>'

    super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")

  def __del__(self):
    # __init__ can raise before pbo is created (e.g. the triple lookup or expect() above), and __del__ still runs
    # on the half-built object; only dispose what actually exists
    if hasattr(self, 'pbo'): llvm.LLVMDisposePassBuilderOptions(self.pbo)

  def compile(self, src:str) -> bytes:
    """Parse, verify, optimise and emit `src` as an object file, returning the result of jit_loader on that object.

    Raises RuntimeError (via expect) when parsing, verification, the pass pipeline or emission reports an error.
    """
    # src_buf is not disposed here, same as before; NOTE(review): LLVMParseIRInContext is documented to consume it - confirm
    src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
    mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
    try:
      expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
      expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
      obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
                                                                ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
      try:
        # copy the bytes out before the buffer that owns them is released
        obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
      finally:
        llvm.LLVMDisposeMemoryBuffer(obj_buf)
    finally:
      # the module is released on failure too, so a kernel that fails to verify/optimise/emit does not leak it
      llvm.LLVMDisposeModule(mod)
    return jit_loader(obj)

  def disassemble(self, lib:bytes): capstone_flatdump(lib)
39
52
 
class LLVMDevice(Compiled):
  """Compiled device built from MallocAllocator, LLVMRenderer, LLVMCompiler and CPUProgram."""
  def __init__(self, device:str):
    # map the platform.machine() spellings onto the backend names LLVMCompiler takes; any other machine is a KeyError
    backend_names = {'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}
    host_arch = backend_names[platform.machine()]
    # LLVMOPT (read through getenv) switches the compiler to its optimising pipeline
    compiler = LLVMCompiler(host_arch, bool(getenv("LLVMOPT")))
    # the renderer is given 'win64cc' on win32 and None everywhere else
    renderer = LLVMRenderer('win64cc' if sys.platform == 'win32' else None)
    super().__init__(device, MallocAllocator, renderer, compiler, CPUProgram)