tinygrad 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +35 -37
- tinygrad/codegen/linearize.py +19 -10
- tinygrad/codegen/lowerer.py +31 -8
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +10 -0
- tinygrad/device.py +28 -11
- tinygrad/dtype.py +12 -3
- tinygrad/engine/jit.py +3 -2
- tinygrad/engine/multi.py +0 -1
- tinygrad/engine/realize.py +7 -4
- tinygrad/engine/schedule.py +227 -255
- tinygrad/engine/search.py +20 -27
- tinygrad/gradient.py +3 -0
- tinygrad/helpers.py +7 -4
- tinygrad/nn/state.py +2 -2
- tinygrad/ops.py +64 -329
- tinygrad/renderer/__init__.py +19 -3
- tinygrad/renderer/cstyle.py +39 -18
- tinygrad/renderer/llvmir.py +55 -18
- tinygrad/renderer/ptx.py +6 -2
- tinygrad/renderer/wgsl.py +20 -12
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/{libpciaccess.py → pci.py} +25 -715
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/metal.py +28 -29
- tinygrad/runtime/ops_amd.py +37 -34
- tinygrad/runtime/{ops_clang.py → ops_cpu.py} +4 -2
- tinygrad/runtime/ops_disk.py +1 -1
- tinygrad/runtime/ops_dsp.py +59 -33
- tinygrad/runtime/ops_llvm.py +14 -12
- tinygrad/runtime/ops_metal.py +78 -62
- tinygrad/runtime/ops_nv.py +9 -6
- tinygrad/runtime/ops_python.py +5 -5
- tinygrad/runtime/ops_webgpu.py +200 -38
- tinygrad/runtime/support/am/amdev.py +23 -11
- tinygrad/runtime/support/am/ip.py +10 -10
- tinygrad/runtime/support/elf.py +2 -0
- tinygrad/runtime/support/hcq.py +7 -5
- tinygrad/runtime/support/llvm.py +8 -14
- tinygrad/shape/shapetracker.py +3 -2
- tinygrad/shape/view.py +2 -3
- tinygrad/spec.py +21 -20
- tinygrad/tensor.py +150 -90
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/METADATA +20 -8
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- tinygrad/codegen/rewriter.py +0 -516
- tinygrad-0.10.1.dist-info/RECORD +0 -86
- {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +0 -0
- {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
tinygrad/runtime/graph/metal.py
CHANGED
@@ -22,16 +22,16 @@ class MetalGraph(GraphRunner):
 if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
 
 # create metal batch exec
-icb_descriptor = msg(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor")
-msg(
-msg(
-msg(
-msg(
-
-self.icb = msg(
-icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache
+icb_descriptor = msg("new", objc_instance)(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor"))
+msg("setCommandTypes:")(icb_descriptor, MTLIndirectCommandType.MTLIndirectCommandTypeConcurrentDispatch)
+msg("setInheritBuffers:")(icb_descriptor, False)
+msg("setInheritPipelineState:")(icb_descriptor, False)
+msg("setMaxKernelBufferBindCount:")(icb_descriptor, 31)
+
+self.icb = msg("newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", objc_instance)(self.dev.sysdevice,
+icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache)
 if self.icb.value is None: raise GraphException("create indirect command buffer failed, does your system support this?")
-icb_label = bytes(msg(msg(
+icb_label = bytes(msg("UTF8String", ctypes.c_char_p)(msg("description", objc_instance)(self.icb))).decode()
 self.needs_icb_fix = int("AGXG15XFamilyIndirectCommandBuffer" not in icb_label) # not required on M3
 
 if len(self.vars): self.int_buf = self.dev.allocator.alloc(len(self.vars)*dtypes.int32.itemsize)
@@ -39,18 +39,18 @@ class MetalGraph(GraphRunner):
 all_pipelines = []
 for j,ji in enumerate(jit_cache):
 prg: CompiledRunner = cast(CompiledRunner, ji.prg)
-icb_command = msg(
+icb_command = msg("indirectComputeCommandAtIndex:", objc_instance)(self.icb, j)
 all_pipelines.append(prg._prg.pipeline_state)
-msg(
+msg("setComputePipelineState:")(icb_command, prg._prg.pipeline_state)
 for i,b in enumerate(ji.bufs):
 if b is not None and b not in input_rawbuffers:
-msg(
+msg("setKernelBuffer:offset:atIndex:")(icb_command, b._buf.buf, b._buf.offset, i)
 all_resources.append(b._buf.buf)
-for i,v in enumerate(prg.p.vars): msg(
+for i,v in enumerate(prg.p.vars): msg("setKernelBuffer:offset:atIndex:")(icb_command, self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i)
 
 global_size, local_size = prg.p.launch_dims(var_vals)
-msg(
-msg(
+msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(icb_command, to_struct(*global_size), to_struct(*local_size))
+msg("setBarrier")(icb_command)
 
 self.all_resources = dedup(all_resources)
 self.all_pipelines = dedup(all_pipelines)
@@ -64,18 +64,17 @@ class MetalGraph(GraphRunner):
 all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers])
 
 for (j,i),input_idx in self.input_replace.items():
-computeCommand = msg(
-msg(
-input_rawbuffers[input_idx]._buf.offset, i)
+computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
+msg("setKernelBuffer:offset:atIndex:")(computeCommand, input_rawbuffers[input_idx]._buf.buf, input_rawbuffers[input_idx]._buf.offset, i)
 
 for j, global_dims, local_dims in self.updated_launch_dims(var_vals):
-computeCommand = msg(
-msg(
+computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
+msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(computeCommand, to_struct(*global_dims), to_struct(*local_dims))
 for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var]
 
-command_buffer = msg(
-encoder = msg(
-msg(
+command_buffer = msg("commandBuffer", objc_instance)(self.dev.mtl_queue)
+encoder = msg("computeCommandEncoder", objc_instance)(command_buffer)
+msg("useResources:count:usage:")(encoder, (objc_id * len(all_resources))(*all_resources), len(all_resources),
 MTLResourceUsage.MTLResourceUsageRead | MTLResourceUsage.MTLResourceUsageWrite)
 
 # NOTE: the pipelines likely need to be added to the used resources to fix the crash on M1/M2, but I haven't figured out how
@@ -85,13 +84,13 @@ class MetalGraph(GraphRunner):
 # to repro the crash (which can also crash other running GPU apps), run with FIX_METAL_ICB=0
 if getenv("FIX_METAL_ICB", self.needs_icb_fix):
 for ps in self.all_pipelines:
-msg(
-msg(
+msg("setComputePipelineState:")(encoder, ps)
+msg("dispatchThreadgroups:threadsPerThreadgroup:")(encoder, to_struct(0,0,0), to_struct(0,0,0))
 
-msg(
-msg(
-msg(
-msg(
+msg("executeCommandsInBuffer:withRange:")(encoder, self.icb, self.range)
+msg("endEncoding")(encoder)
+msg("setLabel:")(command_buffer, to_ns_str(f"batched {len(self.jit_cache)}"))
+msg("commit")(command_buffer)
 self.command_buffer = command_buffer
 
 self.dev.mtl_buffers_in_flight.append(command_buffer)
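Every Objective-C call in this file moves from the old form, which appears to have taken the receiver as the first argument, to a curried msg("selector", restype)(receiver, *args) form, so the selector and return type are bound once and the resulting callable is applied to a receiver. A minimal, hypothetical sketch of that shape using plain ctypes and the standard libobjc entry points (this is not tinygrad's actual helper in ops_metal.py, and it only runs on macOS):

import ctypes, ctypes.util

libobjc = ctypes.CDLL(ctypes.util.find_library("objc"))
libobjc.objc_getClass.restype = ctypes.c_void_p
libobjc.sel_registerName.restype = ctypes.c_void_p

def msg(selector: str, restype=ctypes.c_void_p):
  sel = libobjc.sel_registerName(selector.encode())   # resolve the selector once, up front
  def send(receiver, *args):
    # declare explicit argtypes so the objc_msgSend call is well-defined on arm64 too
    sender = libobjc.objc_msgSend
    sender.restype, sender.argtypes = restype, [ctypes.c_void_p] * (2 + len(args))
    return sender(receiver, sel, *args)
  return send

NSObject = libobjc.objc_getClass(b"NSObject")
obj = msg("new")(NSObject)                                            # same call shape as the graph code above
print(msg("UTF8String", ctypes.c_char_p)(msg("description")(obj)))    # e.g. b'<NSObject: 0x...>'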
tinygrad/runtime/ops_amd.py
CHANGED
@@ -1,14 +1,14 @@
 from __future__ import annotations
 from typing import Any, cast
-import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
+import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
 assert sys.platform != 'win32'
 from dataclasses import dataclass
 from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
 from tinygrad.ops import sint
-from tinygrad.device import BufferSpec
+from tinygrad.device import BufferSpec, CPUProgram
 from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
 from tinygrad.renderer.cstyle import AMDRenderer
-from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc,
+from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
 from tinygrad.runtime.autogen.am import am
 from tinygrad.runtime.support.compiler_hip import AMDCompiler
 from tinygrad.runtime.support.elf import elf_loader
@@ -151,13 +151,11 @@ class AMDComputeQueue(HWQueue):
 for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
 
 dev.compute_queue.put_value += len(cmds)
-dev.compute_queue.
-dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
+dev.compute_queue.signal_doorbell()
 
-SDMA_MAX_COPY_SIZE = 0x400000
 class AMDCopyQueue(HWQueue):
-def __init__(self):
-self.internal_cmd_sizes = []
+def __init__(self, max_copy_size=0x40000000):
+self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
 super().__init__()
 
 def q(self, *arr):
@@ -165,10 +163,10 @@ class AMDCopyQueue(HWQueue):
 self.internal_cmd_sizes.append(len(arr))
 
 def copy(self, dest:sint, src:sint, copy_size:int):
-copied, copy_commands = 0, (copy_size +
+copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
 
 for _ in range(copy_commands):
-step_copy_size = min(copy_size - copied,
+step_copy_size = min(copy_size - copied, self.max_copy_size)
 
 self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
 amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
@@ -237,8 +235,7 @@ class AMDCopyQueue(HWQueue):
 dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
 dev.sdma_queue.put_value += rem_packet_cnt * 4
 
-dev.sdma_queue.
-dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
+dev.sdma_queue.signal_doorbell()
 
 class AMDProgram(HCQProgram):
 def __init__(self, dev:AMDDevice, name:str, lib:bytes):
@@ -280,8 +277,6 @@ class AMDProgram(HCQProgram):
 if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
 
 class AMDAllocator(HCQAllocator['AMDDevice']):
-def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
-
 def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
 return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
 
@@ -301,6 +296,13 @@ class AMDQueueDesc:
 doorbell: memoryview
 put_value: int = 0
 
+def signal_doorbell(self):
+self.write_ptr[0] = self.put_value
+
+# Ensure all prior writes are visible to the GPU.
+if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
+self.doorbell[0] = self.put_value
+
 class KFDIface:
 kfd:HWInterface|None = None
 event_page:HCQBuffer|None = None
@@ -426,6 +428,7 @@ class KFDIface:
 class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
 
 class PCIIface:
+supported_devs:list[int] = [0x744c, 0x7480]
 vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
 vfio_fd:HWInterface
 gpus:list[Any] = []
@@ -434,25 +437,23 @@ class PCIIface:
 self.dev = dev
 
 if first_dev:=len(PCIIface.gpus) == 0:
-
-
-
-if
+for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
+vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
+device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
+if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
 
 # TODO: visible_devices should be handled layer above this?
 visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
 PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
 
-self.
-self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
+self.pcibus = PCIIface.gpus[dev_id]
 
 # Unbind the device from the kernel driver
 if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
 HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
-HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
 
-
-
+supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
+HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
 
 # Try to init vfio. Use it if success.
 if PCIIface.vfio:
@@ -485,16 +486,20 @@ class PCIIface:
 irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
 argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
 vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
-else:
+else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
 
 self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
-self.
+self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
+self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
+
+bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
+self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
 
 self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
 self.doorbell_cpu_addr = mv_address(dbell)
 
-
-
+pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
+self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
 
 array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
 simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
@@ -503,8 +508,9 @@ class PCIIface:
 'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
 
 def _map_pci_range(self, bar, off=0, addr=0, size=None):
-fd, sz = self.bar_fds[bar], size or self.
-
+fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
+libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
+return to_mv(loc, sz)
 
 def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
 if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
@@ -530,8 +536,7 @@ class PCIIface:
 if self.dev in mem.meta.mapped_devs: return
 mem.meta.mapped_devs.append(self.dev)
 
-
-paddrs = [(paddr if mem.meta.mapping.system else (paddr + owner_sys_base), size) for paddr, size in mem.meta.mapping.paddrs]
+paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
 self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
 
 def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
@@ -596,8 +601,6 @@ class AMDDevice(HCQCompiled):
 self.max_private_segment_size = 0
 self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
 
-atexit.register(self.device_fini)
-
 def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
 ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
 gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
@@ -627,6 +630,6 @@ class AMDDevice(HCQCompiled):
 
 def on_device_hang(self): self.dev_iface.on_device_hang()
 
-def
+def finalize(self):
 self.synchronize()
 if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()
tinygrad/runtime/{ops_clang.py → ops_cpu.py}
CHANGED
@@ -1,5 +1,5 @@
 import platform, subprocess, sys
-from tinygrad.helpers import capstone_flatdump
+from tinygrad.helpers import capstone_flatdump, getenv
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
 from tinygrad.runtime.support.elf import jit_loader
 from tinygrad.renderer.cstyle import ClangRenderer
@@ -13,10 +13,12 @@ class ClangJITCompiler(Compiler):
 target = 'x86_64' if sys.platform == 'win32' else platform.machine()
 args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
 arch_args = ['-ffixed-x18'] if target == 'arm64' else []
-obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
+obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
 return jit_loader(obj)
 
 def disassemble(self, lib:bytes): return capstone_flatdump(lib)
 
 class ClangDevice(Compiled):
 def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
+
+CPUDevice = ClangDevice
tinygrad/runtime/ops_disk.py
CHANGED
@@ -67,7 +67,7 @@ class DiskBuffer:
 self.device, self.size, self.offset = device, size, offset
 def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
 def _buf(self) -> memoryview:
-assert hasattr(self.device, "mem"), "DiskBuffer wasn't opened"
+assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
 return memoryview(self.device.mem)[self.offset:self.offset+self.size]
 
 MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
tinygrad/runtime/ops_dsp.py
CHANGED
@@ -1,6 +1,5 @@
 from __future__ import annotations
-
-import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, time, struct
+import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
 assert sys.platform != 'win32'
 from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
 from tinygrad.dtype import dtypes, DType, PtrDType
@@ -10,25 +9,45 @@ from tinygrad.renderer.cstyle import ClangRenderer
 from tinygrad.runtime.autogen import libc, qcom_dsp
 if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
 
+from tinygrad.ops import PatternMatcher, UPat
+
+dsp_pm = PatternMatcher([
+(((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
+lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
+arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
+(UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
+"__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
+])
+
+dsp_pm_late = PatternMatcher([
+(UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+(UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+(UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
+(UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
+lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
+])
+
 class DSPRenderer(ClangRenderer):
 device = "DSP"
-supports_float4 =
+supports_float4 = True
 buffer_suffix = " restrict __attribute__((align_value(128)))"
 kernel_prefix = "__attribute__((noinline)) "
+pre_matcher = dsp_pm
+extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
 type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
 code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
 Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
 Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
 
-def render_kernel(self, function_name:str, kernel:
+def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
 ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
-msrc = ['''struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
-
-
-
-
-
-
+msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
+_Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
+'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
+'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
+'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
+'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
+'HAP_power_set((void*)handle, (void*)&req);']
 msrc += ['if ((sc>>24) != 2) return 0;']
 msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
 msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
@@ -55,7 +74,7 @@ class DSPProgram:
 def __init__(self, dev:DSPDevice, name:str, lib:bytes):
 self.dev, self.lib = dev, lib
 
-def __call__(self, *bufs, vals:
+def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
 if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")
 
 pra, fds, attrs, _ = rpc_prep_args(ins=[var_vals_mv:=memoryview(bytearray((len(bufs)+len(vals))*4)), off_mv:=memoryview(bytearray(len(bufs)*4))],
@@ -66,7 +85,7 @@ class DSPProgram:
 return timer[0] / 1e6
 
 class DSPBuffer:
-def __init__(self, va_addr:int, size:int, share_info
+def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
 self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
 
 class DSPAllocator(Allocator):
@@ -81,9 +100,10 @@ class DSPAllocator(Allocator):
 return DSPBuffer(va_addr, size, share_info, offset=0)
 
 def _free(self, opaque:DSPBuffer, options:BufferSpec):
-libc
-
-
+if libc is not None and qcom_dsp is not None:
+libc.munmap(opaque.va_addr, opaque.size)
+os.close(opaque.share_info.fd)
+qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
 
 def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
 def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
@@ -99,7 +119,7 @@ class ClangCompiler(Compiler):
 def compile(self, src:str) -> bytes:
 # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
 with tempfile.NamedTemporaryFile(delete=True) as output_file:
-subprocess.check_output(['clang', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
+subprocess.check_output([getenv("CC", 'clang'), *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
 '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
 return pathlib.Path(output_file.name).read_bytes()
 
@@ -228,25 +248,32 @@ class RPCListener(threading.Thread):
 
 # ***** mock DSP *****
 
+mockdsp_boilerplate = '''/* DSP boilerplate */ static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
+long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = %7; trap0(#1); %0 = r0" : "=r" (retval)
+: "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "r" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
+static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
+static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
+static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
+static unsigned int inscount(void) {{ unsigned int ret; __asm__ volatile(".word 0x6a15c000; %0 = R0" : "=r" (ret) : : "r0"); return ret; }}
+static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
+return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
+
 class MockDSPRenderer(DSPRenderer):
-def render_kernel(self, function_name:str, kernel:
+def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
 ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
 # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
-
-
-: "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "i" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
-static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
-static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
-static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
-static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
-return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}''', 'void _start(void) {']
+# control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
+msrc = [mockdsp_boilerplate, 'void _start(void) {']
 for i,b in enumerate(bufs):
 if isinstance(b[1][0], PtrDType):
 sz = b[1][0].size*b[1][0].itemsize
-
+# for loop for big reads
+msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); for(int rd = 0; rd < {sz}; rd += read(0, buf{i}+rd, {sz}-rd));")
 else:
 msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
+msrc.append("unsigned int st = inscount();")
 msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
+msrc.append("unsigned int et = inscount() - st; write(1, &et, sizeof(et));")
 for i,b in enumerate(bufs):
 if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
 msrc.append('exit(0); }')
@@ -254,19 +281,18 @@ class MockDSPRenderer(DSPRenderer):
 
 class MockDSPProgram:
 def __init__(self, name:str, lib:bytes): self.lib = lib
-def __call__(self, *bufs, vals:
+def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
 with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
 dsp_lib.write(self.lib)
 dsp_lib.flush()
 os.chmod(dsp_lib.name, 0o0777)
 # NOTE: this timing includes a docker launch
-start = time.perf_counter()
 proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
-"qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >=
+"qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
 input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
-
-offset = 0
+offset = 4
 for x in bufs:
 x[:] = proc.stdout[offset:offset+len(x)]
 offset += len(x)
-
+assert offset == len(proc.stdout)
+return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time
tinygrad/runtime/ops_llvm.py
CHANGED
@@ -1,6 +1,6 @@
-import ctypes, platform
+import ctypes, platform
 from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
-from tinygrad.helpers import OSX, getenv, capstone_flatdump
+from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
 from tinygrad.renderer.llvmir import LLVMRenderer
 import tinygrad.runtime.autogen.llvm as llvm
 from tinygrad.runtime.support.elf import jit_loader
@@ -12,17 +12,19 @@ def expect(x, err, ret=None):
 return ret
 
 class LLVMCompiler(Compiler):
-def __init__(self, host_arch:str
-for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
+def __init__(self, host_arch:str):
+for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
 
 triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
 target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
-# +reserve-x18 here does the same thing as -ffixed-x18 in
-
+# +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
+cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
+if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
+self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
 llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)
 
 self.pbo = llvm.LLVMCreatePassBuilderOptions()
-if opt:
+if (opt:=bool(getenv("LLVMOPT", "1"))):
 self.passes = b'default<O2>'
 llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
 llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
@@ -33,18 +35,18 @@ class LLVMCompiler(Compiler):
 
 super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
 
-def __del__(self):
-llvm.LLVMDisposePassBuilderOptions(self.pbo)
+def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
 
 def compile(self, src:str) -> bytes:
 src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
 mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
 expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
 expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
+if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
 obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
 ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
-obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
 llvm.LLVMDisposeModule(mod)
+obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
 llvm.LLVMDisposeMemoryBuffer(obj_buf)
 return jit_loader(obj)
 
@@ -52,5 +54,5 @@ class LLVMCompiler(Compiler):
 
 class LLVMDevice(Compiled):
 def __init__(self, device:str):
-compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()]
-super().__init__(device, MallocAllocator, LLVMRenderer(
+compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
+super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)