tinygrad-0.10.0-py3-none-any.whl → tinygrad-0.10.2-py3-none-any.whl
This diff covers publicly released versions of the package as published to a supported public registry. It is provided for informational purposes only and reflects the changes between those versions as they appear in the registry.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +141 -201
- tinygrad/codegen/linearize.py +223 -84
- tinygrad/codegen/lowerer.py +60 -42
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +22 -13
- tinygrad/device.py +187 -47
- tinygrad/dtype.py +39 -28
- tinygrad/engine/jit.py +83 -65
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +161 -0
- tinygrad/engine/realize.py +62 -108
- tinygrad/engine/schedule.py +396 -357
- tinygrad/engine/search.py +55 -66
- tinygrad/gradient.py +73 -0
- tinygrad/helpers.py +81 -59
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +91 -66
- tinygrad/ops.py +492 -641
- tinygrad/renderer/__init__.py +95 -36
- tinygrad/renderer/cstyle.py +99 -92
- tinygrad/renderer/llvmir.py +83 -34
- tinygrad/renderer/ptx.py +83 -99
- tinygrad/renderer/wgsl.py +95 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +40 -43
- tinygrad/runtime/ops_amd.py +498 -334
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cpu.py +24 -0
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +159 -42
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +48 -41
- tinygrad/runtime/ops_metal.py +149 -113
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +238 -273
- tinygrad/runtime/ops_python.py +55 -50
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +28 -4
- tinygrad/runtime/support/hcq.py +256 -324
- tinygrad/runtime/support/llvm.py +26 -0
- tinygrad/shape/shapetracker.py +85 -53
- tinygrad/shape/view.py +104 -140
- tinygrad/spec.py +155 -0
- tinygrad/tensor.py +835 -527
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/uopgraph.py +0 -506
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
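
A file-level summary like the one above can be reproduced locally from the two wheels using only the Python standard library. The helper below is a hedged sketch: the file names, the extension filter, and the line-counting heuristic are assumptions for illustration, not part of the release.

import zipfile, difflib

def wheel_diff_summary(old_whl: str, new_whl: str):
  # compare text files present in either wheel and print "+added -removed" line counts
  old, new = zipfile.ZipFile(old_whl), zipfile.ZipFile(new_whl)
  old_names, new_names = set(old.namelist()), set(new.namelist())
  for name in sorted(old_names | new_names):
    if not name.endswith((".py", ".txt", ".html")): continue
    a = old.read(name).decode(errors="replace").splitlines() if name in old_names else []
    b = new.read(name).decode(errors="replace").splitlines() if name in new_names else []
    diff = list(difflib.unified_diff(a, b, lineterm=""))
    adds = sum(l.startswith("+") and not l.startswith("+++") for l in diff)
    dels = sum(l.startswith("-") and not l.startswith("---") for l in diff)
    if adds or dels: print(f"{name} +{adds} -{dels}")

# wheel_diff_summary("tinygrad-0.10.0-py3-none-any.whl", "tinygrad-0.10.2-py3-none-any.whl")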
tinygrad/runtime/ops_dsp.py
CHANGED
@@ -1,14 +1,64 @@
 from __future__ import annotations
-
-import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys
+import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
 assert sys.platform != 'win32'
-from tinygrad.device import
-from tinygrad.
-from tinygrad.
-from tinygrad.
+from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
+from tinygrad.dtype import dtypes, DType, PtrDType
+from tinygrad.ops import Ops, UOp
+from tinygrad.helpers import from_mv, getenv, round_up, mv_address, to_mv, cpu_objdump, DEBUG
+from tinygrad.renderer.cstyle import ClangRenderer
 from tinygrad.runtime.autogen import libc, qcom_dsp
 if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import

+from tinygrad.ops import PatternMatcher, UPat
+
+dsp_pm = PatternMatcher([
+  (((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
+   lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
+     arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
+  (UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
+    "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
+])
+
+dsp_pm_late = PatternMatcher([
+  (UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+  (UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+  (UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+  (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
+   lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
+])
+
+class DSPRenderer(ClangRenderer):
+  device = "DSP"
+  supports_float4 = True
+  buffer_suffix = " restrict __attribute__((align_value(128)))"
+  kernel_prefix = "__attribute__((noinline)) "
+  pre_matcher = dsp_pm
+  extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
+  type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
+  code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
+                 Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
+                 Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}

+  def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
+    ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
+    msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
+            _Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
+            'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
+            'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
+            'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
+            'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
+            'HAP_power_set((void*)handle, (void*)&req);']
+    msrc += ['if ((sc>>24) != 2) return 0;']
+    msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
+    msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
+    msrc += [f'void *buf_{i} = HAP_mmap(0,sz_or_val_{i},3,0,pra[{i+3}].dma.fd,0)+off{i};' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
+    msrc += ["unsigned long long start = HAP_perf_get_time_us();"]
+    msrc += [f"{function_name}({', '.join([(f'buf_{i}' if isinstance(b[1][0], PtrDType) else f'sz_or_val_{i}') for i,b in enumerate(bufs)])});"]
+    msrc += ["*(unsigned long long *)(pra[2].buf.pv) = HAP_perf_get_time_us() - start;"]
+    msrc += [f'HAP_munmap(buf_{i}, sz_or_val_{i});' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
+    msrc += ["return 0; }"]
+    return ret + '\n' + '\n'.join(msrc)
+
 def rpc_sc(method=0, ins=0, outs=0, fds=0): return (method << 24) | (ins << 16) | (outs << 8) | fds
 def rpc_prep_args(ins=None, outs=None, in_fds=None):
   ins, outs, in_fds = ins or list(), outs or list(), in_fds or list()
@@ -21,65 +71,81 @@ def rpc_prep_args(ins=None, outs=None, in_fds=None):
   return pra, fds, attrs, (ins, outs)

 class DSPProgram:
-  def __init__(self,
-    self.
+  def __init__(self, dev:DSPDevice, name:str, lib:bytes):
+    self.dev, self.lib = dev, lib

-  def __call__(self, *bufs, vals:
+  def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
     if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")

     pra, fds, attrs, _ = rpc_prep_args(ins=[var_vals_mv:=memoryview(bytearray((len(bufs)+len(vals))*4)), off_mv:=memoryview(bytearray(len(bufs)*4))],
                                        outs=[timer:=memoryview(bytearray(8)).cast('Q')], in_fds=[b.share_info.fd for b in bufs])
     var_vals_mv.cast('i')[:] = array.array('i', tuple(b.size for b in bufs) + vals)
     off_mv.cast('I')[:] = array.array('I', tuple(b.offset for b in bufs))
-    self.
+    self.dev.exec_lib(self.lib, rpc_sc(method=2, ins=2, outs=1, fds=len(bufs)), pra, fds, attrs)
     return timer[0] / 1e6

 class DSPBuffer:
-  def __init__(self, va_addr:int, size:int, share_info
+  def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
     self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset

 class DSPAllocator(Allocator):
-  def __init__(self,
-    self.
+  def __init__(self, dev:DSPDevice):
+    self.dev = dev
     super().__init__()

-  def _alloc(self, size:int, options:
-    b = qcom_dsp.ION_IOC_ALLOC(self.
-    share_info = qcom_dsp.ION_IOC_SHARE(self.
+  def _alloc(self, size:int, options:BufferSpec):
+    b = qcom_dsp.ION_IOC_ALLOC(self.dev.ion_fd, len=size, align=0x200, heap_id_mask=1<<qcom_dsp.ION_SYSTEM_HEAP_ID, flags=qcom_dsp.ION_FLAG_CACHED)
+    share_info = qcom_dsp.ION_IOC_SHARE(self.dev.ion_fd, handle=b.handle)
     va_addr = libc.mmap(0, size, mmap.PROT_READ|mmap.PROT_WRITE, mmap.MAP_SHARED, share_info.fd, 0)
     return DSPBuffer(va_addr, size, share_info, offset=0)

-  def _free(self, opaque:DSPBuffer, options:
-    libc
-
-
+  def _free(self, opaque:DSPBuffer, options:BufferSpec):
+    if libc is not None and qcom_dsp is not None:
+      libc.munmap(opaque.va_addr, opaque.size)
+      os.close(opaque.share_info.fd)
+      qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)

-  def
-  def
-  def
-  def
+  def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
+  def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
+  def _copyout(self, dest:memoryview, src:DSPBuffer): ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+  def _offset(self, buf, size:int, offset:int): return DSPBuffer(buf.va_addr+offset, size, buf.share_info, buf.offset+offset)

-class
-  def __init__(self,
-    self.
+class ClangCompiler(Compiler):
+  def __init__(self, cachekey="compile_clang", args:list[str]|None=None, objdump_tool='objdump'):
+    self.args = ['-shared', '-march=native'] if args is None else args
+    self.objdump_tool = objdump_tool
+    super().__init__(cachekey)

-
-
-
-
-
-
+  def compile(self, src:str) -> bytes:
+    # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
+    with tempfile.NamedTemporaryFile(delete=True) as output_file:
+      subprocess.check_output([getenv("CC", 'clang'), *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+                               '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
+      return pathlib.Path(output_file.name).read_bytes()

-
-    super().__init__(device, DSPAllocator(self), DSPRenderer(),
-                     ClangCompiler("compile_dsp", args=compiler_args, objdump_tool='llvm-objdump'), functools.partial(DSPProgram, self))
+  def disassemble(self, lib:bytes): return cpu_objdump(lib, self.objdump_tool)

-
-
-
+class DSPDevice(Compiled):
+  def __init__(self, device:str=""):
+    compiler_args = ["--target=hexagon", "-mcpu=hexagonv65", "-fuse-ld=lld", "-nostdlib", "-mhvx=v65", "-mhvx-length=128b"]
+    try:
+      self.ion_fd = os.open('/dev/ion', os.O_RDONLY)
+      # Generate link script to pass into clang. Aligning all used sections to 4k fixes invoke problem.
+      sections = ['hash', 'text', 'rela.plt', 'got', 'got.plt', 'dynamic', 'dynsym', 'dynstr', 'plt', 'data', 'bss']
+      sections_link = '\n'.join([f'.{n} : ALIGN(4096) {{ *(.{n}) }}' for n in sections])
+      with tempfile.NamedTemporaryFile(delete=False) as self.link_ld:
+        self.link_ld.write(f"SECTIONS {{ . = 0x0; {sections_link}\n /DISCARD/ : {{ *(.note .note.* .gnu.hash .comment) }} }}".encode())
+        self.link_ld.flush()
+      super().__init__(device, DSPAllocator(self), DSPRenderer(),
+                       ClangCompiler("compile_dsp", ["-shared"] + compiler_args + [f"-T{self.link_ld.name}"], 'llvm-objdump'), functools.partial(DSPProgram, self))
+      fastrpc_shell = memoryview(bytearray(pathlib.Path('/dsp/cdsp/fastrpc_shell_3').read_bytes()))
+      self.shell_buf = self.allocator.alloc(round_up(fastrpc_shell.nbytes, 0x1000), BufferSpec(nolru=True))
+      ctypes.memmove(self.shell_buf.va_addr, mv_address(fastrpc_shell), fastrpc_shell.nbytes)

-
-
+      self.init_dsp()
+      RPCListener(self).start()
+    except FileNotFoundError:
+      super().__init__(device, MallocAllocator, MockDSPRenderer(), ClangCompiler(None, ["-static"] + compiler_args, 'llvm-objdump'), MockDSPProgram)

   def open_lib(self, lib):
     self.binded_lib, self.binded_lib_off = lib, 0
@@ -117,7 +183,7 @@ class DSPDevice(Compiled):
     qcom_dsp.FASTRPC_IOCTL_INIT(self.rpc_fd, flags=0x1, file=self.shell_buf.va_addr, filelen=self.shell_buf.size, filefd=self.shell_buf.share_info.fd)
     qcom_dsp.FASTRPC_IOCTL_INVOKE(self.rpc_fd, handle=3, sc=rpc_sc(method=3, ins=0, outs=0))

-class
+class RPCListener(threading.Thread):
   def __init__(self, device:DSPDevice):
     super().__init__()
     self.device, self.daemon = device, True
@@ -179,3 +245,54 @@ class RPCListner(threading.Thread):
       st = qcom_dsp.FASTRPC_IOCTL_MMAP(self.device.rpc_fd, fd=-1, flags=in_args[0].cast('I')[2], vaddrin=0, size=in_args[0].cast('Q')[3])
       out_args[0].cast('Q')[0:2] = array.array('Q', [0, st.vaddrout])
     else: raise RuntimeError(f"Unknown op: {sc=:X}")
+
+# ***** mock DSP *****
+
+mockdsp_boilerplate = '''/* DSP boilerplate */ static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
+  long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = %7; trap0(#1); %0 = r0" : "=r" (retval)
+  : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "r" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
+static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
+static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
+static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
+static unsigned int inscount(void) {{ unsigned int ret; __asm__ volatile(".word 0x6a15c000; %0 = R0" : "=r" (ret) : : "r0"); return ret; }}
+static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
+  return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
+
+class MockDSPRenderer(DSPRenderer):
+  def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
+    ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
+    # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
+    # control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
+    msrc = [mockdsp_boilerplate, 'void _start(void) {']
+    for i,b in enumerate(bufs):
+      if isinstance(b[1][0], PtrDType):
+        sz = b[1][0].size*b[1][0].itemsize
+        # for loop for big reads
+        msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); for(int rd = 0; rd < {sz}; rd += read(0, buf{i}+rd, {sz}-rd));")
+      else:
+        msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
+    msrc.append("unsigned int st = inscount();")
+    msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
+    msrc.append("unsigned int et = inscount() - st; write(1, &et, sizeof(et));")
+    for i,b in enumerate(bufs):
+      if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
+    msrc.append('exit(0); }')
+    return ret + '\n' + '\n'.join(msrc)
+
+class MockDSPProgram:
+  def __init__(self, name:str, lib:bytes): self.lib = lib
+  def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
+    with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
+      dsp_lib.write(self.lib)
+      dsp_lib.flush()
+      os.chmod(dsp_lib.name, 0o0777)
+      # NOTE: this timing includes a docker launch
+      proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
+                             "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
+                            input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
+    offset = 4
+    for x in bufs:
+      x[:] = proc.stdout[offset:offset+len(x)]
+      offset += len(x)
+    assert offset == len(proc.stdout)
+    return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time
tinygrad/runtime/ops_gpu.py
CHANGED
@@ -1,10 +1,10 @@
 from __future__ import annotations
-from typing import
-import ctypes, functools, hashlib
+from typing import Optional, cast
+import ctypes, functools, hashlib, contextlib
 from tinygrad.runtime.autogen import opencl as cl
 from tinygrad.helpers import init_c_var, to_char_p_p, from_mv, OSX, DEBUG, getenv, mv_address
 from tinygrad.renderer.cstyle import OpenCLRenderer, IntelRenderer
-from tinygrad.device import
+from tinygrad.device import BufferSpec, LRUAllocator, Compiled, Compiler, CompileError

 # see test/external/external_osx_profiling.py to determine this ratio. it's in like GPU clocks or something
 OSX_TIMING_RATIO = (125/3) if OSX else 1.0
@@ -15,15 +15,15 @@ def check(status):
 def checked(ret, status): return (check(status.value), ret)[1]

 class CLCompiler(Compiler):
-  def __init__(self,
-    self.
+  def __init__(self, dev:CLDevice, compile_key:str):
+    self.dev = dev
     super().__init__(f"compile_cl_{compile_key}")
   def compile(self, src:str) -> bytes:
-    program = checked(cl.clCreateProgramWithSource(self.
-    build_status: int = cl.clBuildProgram(program, 1, self.
+    program = checked(cl.clCreateProgramWithSource(self.dev.context, 1, to_char_p_p([src.encode()]), None, status := ctypes.c_int32()), status)
+    build_status: int = cl.clBuildProgram(program, 1, self.dev.device_id, None, cl.clBuildProgram.argtypes[4](), None)
     if build_status != 0:
-      cl.clGetProgramBuildInfo(program, self.
-      cl.clGetProgramBuildInfo(program, self.
+      cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, 0, None, log_size := ctypes.c_size_t())
+      cl.clGetProgramBuildInfo(program, self.dev.device_id, cl.CL_PROGRAM_BUILD_LOG, log_size.value, mstr := ctypes.create_string_buffer(log_size.value), None) # noqa: E501
       raise CompileError(f"OpenCL Compile Error\n\n{mstr.value.decode()}")
     check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARY_SIZES, ctypes.sizeof(ctypes.c_size_t), binary_sizes := (ctypes.c_size_t * 1)(), None))
     check(cl.clGetProgramInfo(program, cl.CL_PROGRAM_BINARIES, ctypes.sizeof(ctypes.c_void_p), (ctypes.c_void_p * 1)(ctypes.addressof(binary := ctypes.create_string_buffer(binary_sizes[0]))), None)) # noqa: E501
@@ -32,7 +32,7 @@ class CLCompiler(Compiler):

 class CLProgram:
   def __init__(self, device:CLDevice, name:str, lib:bytes):
-    self.
+    self.dev, self.name, self.lib = device, name, lib
     self.program = checked(cl.clCreateProgramWithBinary(device.context, 1, device.device_id, (ctypes.c_size_t * 1)(len(lib)),
                                                         to_char_p_p([lib], ctypes.c_ubyte), binary_status := ctypes.c_int32(),
                                                         errcode_ret := ctypes.c_int32()), errcode_ret)
@@ -41,15 +41,15 @@ class CLProgram:
     self.kernel = checked(cl.clCreateKernel(self.program, name.encode(), status := ctypes.c_int32()), status)

   def __del__(self):
-
-
+    with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseKernel(self.kernel))
+    with contextlib.suppress(TypeError, AttributeError): check(cl.clReleaseProgram(self.program))

-  def __call__(self, *bufs:
+  def __call__(self, *bufs:tuple[ctypes._CData, BufferSpec], global_size:tuple[int,int,int]=(1,1,1), local_size:Optional[tuple[int,int,int]]=None, vals:tuple[int, ...]=(), wait=False) -> Optional[float]: # noqa: E501
     for i,(b,_) in enumerate(bufs): cl.clSetKernelArg(self.kernel, i, ctypes.sizeof(b), ctypes.byref(b))
     for i,v in enumerate(vals,start=len(bufs)): cl.clSetKernelArg(self.kernel, i, 4, ctypes.byref(ctypes.c_int32(v)))
-    if local_size is not None: global_size = cast(
+    if local_size is not None: global_size = cast(tuple[int,int,int], tuple(int(g*l) for g,l in zip(global_size, local_size)))
     event = cl.cl_event() if wait else None
-    check(cl.clEnqueueNDRangeKernel(self.
+    check(cl.clEnqueueNDRangeKernel(self.dev.queue, self.kernel, len(global_size), None, (ctypes.c_size_t * len(global_size))(*global_size), (ctypes.c_size_t * len(local_size))(*local_size) if local_size else None, 0, None, event)) # noqa: E501
     if wait:
       assert event is not None
       check(cl.clWaitForEvents(1, event))
@@ -59,31 +59,31 @@ class CLProgram:
     return None

 class CLAllocator(LRUAllocator):
-  def __init__(self,
-    self.
+  def __init__(self, dev:CLDevice):
+    self.dev = dev
     super().__init__()
-  def _alloc(self, size:int, options:
+  def _alloc(self, size:int, options:BufferSpec) -> tuple[ctypes._CData, BufferSpec]:
     if options.image is not None:
-      return (checked(cl.clCreateImage2D(self.
+      return (checked(cl.clCreateImage2D(self.dev.context, cl.CL_MEM_READ_WRITE,
                                          cl.cl_image_format(cl.CL_RGBA, {2: cl.CL_HALF_FLOAT, 4: cl.CL_FLOAT}[options.image.itemsize]),
                                          options.image.shape[1], options.image.shape[0], 0, None, status := ctypes.c_int32()), status), options)
-    return (checked(cl.clCreateBuffer(self.
-  def _free(self, opaque:
-  def
+    return (checked(cl.clCreateBuffer(self.dev.context, cl.CL_MEM_READ_WRITE, size, None, status := ctypes.c_int32()), status), options)
+  def _free(self, opaque:tuple[ctypes._CData, BufferSpec], options:BufferSpec): check(cl.clReleaseMemObject(opaque[0]))
+  def _copyin(self, dest:tuple[ctypes._CData, BufferSpec], src:memoryview):
     if dest[1].image is not None:
-      check(cl.clEnqueueWriteImage(self.
+      check(cl.clEnqueueWriteImage(self.dev.queue, dest[0], False, (ctypes.c_size_t * 3)(0,0,0),
                                    (ctypes.c_size_t * 3)(dest[1].image.shape[1],dest[1].image.shape[0],1), 0, 0, from_mv(src), 0, None, None))
     else:
       if mv_address(src) % 16: src = memoryview(bytearray(src))
-      check(cl.clEnqueueWriteBuffer(self.
-    self.
-  def
+      check(cl.clEnqueueWriteBuffer(self.dev.queue, dest[0], False, 0, len(src)*src.itemsize, from_mv(src), 0, None, None))
+    self.dev.pending_copyin.append(src) # NOTE: these can't be freed until the GPU actually executes this command
+  def _copyout(self, dest:memoryview, src:tuple[ctypes._CData, BufferSpec]):
     if src[1].image is not None:
-      check(cl.clEnqueueReadImage(self.
+      check(cl.clEnqueueReadImage(self.dev.queue, src[0], False, (ctypes.c_size_t * 3)(0,0,0),
                                   (ctypes.c_size_t * 3)(src[1].image.shape[1],src[1].image.shape[0],1), 0, 0, from_mv(dest), 0, None, None))
     else:
-      check(cl.clEnqueueReadBuffer(self.
-    self.
+      check(cl.clEnqueueReadBuffer(self.dev.queue, src[0], False, 0, len(dest)*dest.itemsize, from_mv(dest), 0, None, None))
+    self.dev.synchronize()

 class CLDevice(Compiled):
   device_ids = None # this is global and only initted once
@@ -103,7 +103,7 @@ class CLDevice(Compiled):
     if DEBUG >= 1: print(f"CLDevice: opening {self.device_name} with version {self.driver_version}")
     self.context = checked(cl.clCreateContext(None, 1, self.device_id, cl.clCreateContext.argtypes[3](), None, status := ctypes.c_int32()), status)
     self.queue = checked(cl.clCreateCommandQueue(self.context, self.device_id, cl.CL_QUEUE_PROFILING_ENABLE, status), status)
-    self.pending_copyin:
+    self.pending_copyin: list[memoryview] = []
     self.device_exts = (cl.clGetDeviceInfo(self.device_id, cl.CL_DEVICE_EXTENSIONS, 4096, ctypes.byref(buf := ctypes.create_string_buffer(4096)), ctypes.byref(total := ctypes.c_size_t())), ctypes.string_at(buf, size=total.value).decode())[1] # noqa: E501

     compile_key = hashlib.md5(self.device_name.encode() + self.driver_version.encode()).hexdigest()
tinygrad/runtime/ops_hip.py
CHANGED
@@ -1,8 +1,6 @@
-from __future__ import annotations
 import ctypes, functools
-from typing import Tuple
 from tinygrad.helpers import init_c_var, from_mv, init_c_struct_t, getenv
-from tinygrad.device import Compiled, LRUAllocator,
+from tinygrad.device import Compiled, LRUAllocator, BufferSpec
 from tinygrad.runtime.autogen import hip
 from tinygrad.runtime.support.compiler_hip import AMDCompiler
 from tinygrad.renderer.cstyle import HIPRenderer
@@ -11,18 +9,28 @@ if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401 # pylint
 def check(status):
   if status != 0: raise RuntimeError(f"HIP Error {status}, {ctypes.string_at(hip.hipGetErrorString(status)).decode()}")

+class HIPDevice(Compiled):
+  def __init__(self, device:str=""):
+    self.device_id = int(device.split(":")[1]) if ":" in device else 0
+    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
+    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
+    super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
+  def synchronize(self):
+    check(hip.hipSetDevice(self.device_id))
+    check(hip.hipDeviceSynchronize())
+
 class HIPProgram:
-  def __init__(self,
-    self.
-    check(hip.hipSetDevice(self.
+  def __init__(self, dev:HIPDevice, name:str, lib:bytes):
+    self.dev, self.name, self.lib = dev, name, lib
+    check(hip.hipSetDevice(self.dev.device_id))
     self.module = init_c_var(hip.hipModule_t(), lambda x: check(hip.hipModuleLoadData(ctypes.byref(x), lib)))
     self.prg = init_c_var(hip.hipFunction_t(), lambda x: check(hip.hipModuleGetFunction(ctypes.byref(x), self.module, name.encode("utf-8"))))

   def __del__(self):
     if hasattr(self, 'module'): check(hip.hipModuleUnload(self.module))

-  def __call__(self, *args, global_size:
-    check(hip.hipSetDevice(self.
+  def __call__(self, *args, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
+    check(hip.hipSetDevice(self.dev.device_id))
     if not hasattr(self, "vargs"):
       self.c_args = init_c_struct_t(tuple([(f'f{i}', hip.hipDeviceptr_t) for i in range(len(args))] +
                                           [(f'v{i}', ctypes.c_int) for i in range(len(vals))]))(*args, *vals)
@@ -32,37 +40,27 @@ class HIPProgram:
     for i in range(len(args)): self.c_args.__setattr__(f'f{i}', args[i])
     for i in range(len(vals)): self.c_args.__setattr__(f'v{i}', vals[i])

-    if wait: check(hip.hipEventRecord(self.
+    if wait: check(hip.hipEventRecord(self.dev.time_event_st, None))

     check(hip.hipModuleLaunchKernel(self.prg, *global_size, *local_size, 0, None, None, self.vargs))

     if wait:
-      check(hip.hipEventRecord(self.
-      check(hip.hipEventSynchronize(self.
-      check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.
+      check(hip.hipEventRecord(self.dev.time_event_en, None))
+      check(hip.hipEventSynchronize(self.dev.time_event_en))
+      check(hip.hipEventElapsedTime(ctypes.byref(ret := ctypes.c_float()), self.dev.time_event_st, self.dev.time_event_en))
       return ret.value * 1e-3

 class HIPAllocator(LRUAllocator):
-  def __init__(self,
-    self.
+  def __init__(self, dev:HIPDevice):
+    self.dev = dev
     super().__init__()
-  def _alloc(self, size:int, options:
-    check(hip.hipSetDevice(self.
+  def _alloc(self, size:int, options:BufferSpec):
+    check(hip.hipSetDevice(self.dev.device_id))
     return init_c_var(hip.hipDeviceptr_t(), lambda x: check(hip.hipMalloc(ctypes.byref(x), size)))
-  def _free(self, opaque, options:
-  def
-    check(hip.hipSetDevice(self.
+  def _free(self, opaque, options:BufferSpec): check(hip.hipFree(opaque))
+  def _copyin(self, dest, src: memoryview):
+    check(hip.hipSetDevice(self.dev.device_id))
     check(hip.hipMemcpy(dest, from_mv(src), len(src), hip.hipMemcpyHostToDevice))
-  def
-    self.
+  def _copyout(self, dest:memoryview, src):
+    self.dev.synchronize()
     check(hip.hipMemcpy(from_mv(dest), src, len(dest), hip.hipMemcpyDeviceToHost))
-
-class HIPDevice(Compiled):
-  def __init__(self, device:str=""):
-    self.device_id = int(device.split(":")[1]) if ":" in device else 0
-    self.arch = init_c_var(hip.hipDeviceProp_t(), lambda x: check(hip.hipGetDeviceProperties(x, self.device_id))).gcnArchName.decode()
-    self.time_event_st, self.time_event_en = [init_c_var(hip.hipEvent_t(), lambda x: hip.hipEventCreate(ctypes.byref(x), 0)) for _ in range(2)]
-    super().__init__(device, HIPAllocator(self), HIPRenderer(), AMDCompiler(self.arch), functools.partial(HIPProgram, self))
-  def synchronize(self):
-    check(hip.hipSetDevice(self.device_id))
-    check(hip.hipDeviceSynchronize())
tinygrad/runtime/ops_llvm.py
CHANGED
@@ -1,51 +1,58 @@
-
-import
-from
-from tinygrad.device import Compiled, Compiler, MallocAllocator
-from tinygrad.helpers import cpu_time_execution, getenv, cpu_objdump
+import ctypes, platform
+from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
+from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
 from tinygrad.renderer.llvmir import LLVMRenderer
-import
+import tinygrad.runtime.autogen.llvm as llvm
+from tinygrad.runtime.support.elf import jit_loader
+
+def cerr(): return ctypes.pointer(ctypes.pointer(ctypes.c_char()))
+
+def expect(x, err, ret=None):
+  if x: raise RuntimeError(llvm.string_cast(err.contents) if not isinstance(err, str) else err)
+  return ret

 class LLVMCompiler(Compiler):
-  def __init__(self,
-
-    self.optimizer: llvm.passmanagers.ModulePassManager = llvm.create_module_pass_manager()
-    self.device.target_machine.add_analysis_passes(self.optimizer)
-    if opt:
-      with llvm.create_pass_manager_builder() as builder:
-        builder.opt_level = 3; builder.size_level = 0; builder.loop_vectorize = True; builder.slp_vectorize = True # noqa: E702
-        builder.populate(self.optimizer)
-    super().__init__("compile_llvm_opt" if opt else "compile_llvm")
+  def __init__(self, host_arch:str):
+    for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()

-
-
-
-
-
+    triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
+    target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
+    # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
+    cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
+    if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
+    self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
+                                                       llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)

-
+    self.pbo = llvm.LLVMCreatePassBuilderOptions()
+    if (opt:=bool(getenv("LLVMOPT", "1"))):
+      self.passes = b'default<O2>'
+      llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
+      llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
+      llvm.LLVMPassBuilderOptionsSetSLPVectorization(self.pbo, True)
+      llvm.LLVMPassBuilderOptionsSetVerifyEach(self.pbo, True)
+    else:
+      self.passes = b'default<O0>'

-
-
-
-
-
-
+    super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
+
+  def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
+
+  def compile(self, src:str) -> bytes:
+    src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
+    mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
+    expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
+    expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
+    if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
+    obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
+                                                              ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
+    llvm.LLVMDisposeModule(mod)
+    obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
+    llvm.LLVMDisposeMemoryBuffer(obj_buf)
+    return jit_loader(obj)

-  def
-    if not hasattr(self, 'cfunc'):
-      self.cfunc = ctypes.CFUNCTYPE(ctypes.c_int, *([ctypes.c_void_p]*len(bufs)), *([ctypes.c_int32]*len(vals)))(self.fxn)
-    return cpu_time_execution(lambda: self.cfunc(*bufs, *vals), enable=wait)
+  def disassemble(self, lib:bytes): capstone_flatdump(lib)

 class LLVMDevice(Compiled):
   def __init__(self, device:str):
-
-
-    llvm.initialize_native_asmprinter()
-    llvm.initialize_native_asmparser()
-    # this opt actually can change things. ex: opt=3 means no FMA, opt=2 means FMA
-    self.target_machine: llvm.targets.TargetMachine = llvm.Target.from_triple(llvm.get_process_triple()).create_target_machine(opt=2)
-    backing_mod = llvm.parse_assembly(str())
-    backing_mod.triple = llvm.get_process_triple()
-    self.engine: llvm.executionengine.ExecutionEngine = llvm.create_mcjit_compiler(backing_mod, self.target_machine)
-    super().__init__(device, MallocAllocator, LLVMRenderer(), LLVMCompiler(self, getenv("LLVMOPT")), functools.partial(LLVMProgram, self))
+    compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
+    super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)