tinygrad 0.10.1__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (62)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +35 -37
  4. tinygrad/codegen/linearize.py +19 -10
  5. tinygrad/codegen/lowerer.py +31 -8
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +10 -0
  8. tinygrad/device.py +28 -11
  9. tinygrad/dtype.py +12 -3
  10. tinygrad/engine/jit.py +3 -2
  11. tinygrad/engine/multi.py +0 -1
  12. tinygrad/engine/realize.py +7 -4
  13. tinygrad/engine/schedule.py +227 -255
  14. tinygrad/engine/search.py +20 -27
  15. tinygrad/gradient.py +3 -0
  16. tinygrad/helpers.py +7 -4
  17. tinygrad/nn/state.py +2 -2
  18. tinygrad/ops.py +64 -329
  19. tinygrad/renderer/__init__.py +19 -3
  20. tinygrad/renderer/cstyle.py +39 -18
  21. tinygrad/renderer/llvmir.py +55 -18
  22. tinygrad/renderer/ptx.py +6 -2
  23. tinygrad/renderer/wgsl.py +20 -12
  24. tinygrad/runtime/autogen/libc.py +404 -71
  25. tinygrad/runtime/autogen/{libpciaccess.py → pci.py} +25 -715
  26. tinygrad/runtime/autogen/webgpu.py +6985 -0
  27. tinygrad/runtime/graph/metal.py +28 -29
  28. tinygrad/runtime/ops_amd.py +37 -34
  29. tinygrad/runtime/{ops_clang.py → ops_cpu.py} +4 -2
  30. tinygrad/runtime/ops_disk.py +1 -1
  31. tinygrad/runtime/ops_dsp.py +59 -33
  32. tinygrad/runtime/ops_llvm.py +14 -12
  33. tinygrad/runtime/ops_metal.py +78 -62
  34. tinygrad/runtime/ops_nv.py +9 -6
  35. tinygrad/runtime/ops_python.py +5 -5
  36. tinygrad/runtime/ops_webgpu.py +200 -38
  37. tinygrad/runtime/support/am/amdev.py +23 -11
  38. tinygrad/runtime/support/am/ip.py +10 -10
  39. tinygrad/runtime/support/elf.py +2 -0
  40. tinygrad/runtime/support/hcq.py +7 -5
  41. tinygrad/runtime/support/llvm.py +8 -14
  42. tinygrad/shape/shapetracker.py +3 -2
  43. tinygrad/shape/view.py +2 -3
  44. tinygrad/spec.py +21 -20
  45. tinygrad/tensor.py +150 -90
  46. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  47. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  48. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  49. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  50. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  51. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  52. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  53. tinygrad/viz/index.html +544 -0
  54. tinygrad/viz/perfetto.html +178 -0
  55. tinygrad/viz/serve.py +205 -0
  56. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/METADATA +20 -8
  57. tinygrad-0.10.2.dist-info/RECORD +99 -0
  58. tinygrad/codegen/rewriter.py +0 -516
  59. tinygrad-0.10.1.dist-info/RECORD +0 -86
  60. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  61. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +0 -0
  62. {tinygrad-0.10.1.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
@@ -22,16 +22,16 @@ class MetalGraph(GraphRunner):
22
22
  if not all(isinstance(ji.prg, CompiledRunner) for ji in jit_cache): raise GraphException
23
23
 
24
24
  # create metal batch exec
25
- icb_descriptor = msg(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor"), "new", restype=objc_instance)
26
- msg(icb_descriptor, "setCommandTypes:", MTLIndirectCommandType.MTLIndirectCommandTypeConcurrentDispatch)
27
- msg(icb_descriptor, "setInheritBuffers:", False)
28
- msg(icb_descriptor, "setInheritPipelineState:", False)
29
- msg(icb_descriptor, "setMaxKernelBufferBindCount:", 31)
30
-
31
- self.icb = msg(self.dev.sysdevice, "newIndirectCommandBufferWithDescriptor:maxCommandCount:options:",
32
- icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache, restype=objc_instance)
25
+ icb_descriptor = msg("new", objc_instance)(libobjc.objc_getClass(b"MTLIndirectCommandBufferDescriptor"))
26
+ msg("setCommandTypes:")(icb_descriptor, MTLIndirectCommandType.MTLIndirectCommandTypeConcurrentDispatch)
27
+ msg("setInheritBuffers:")(icb_descriptor, False)
28
+ msg("setInheritPipelineState:")(icb_descriptor, False)
29
+ msg("setMaxKernelBufferBindCount:")(icb_descriptor, 31)
30
+
31
+ self.icb = msg("newIndirectCommandBufferWithDescriptor:maxCommandCount:options:", objc_instance)(self.dev.sysdevice,
32
+ icb_descriptor, len(jit_cache), MTLResourceOptions.MTLResourceCPUCacheModeDefaultCache)
33
33
  if self.icb.value is None: raise GraphException("create indirect command buffer failed, does your system support this?")
34
- icb_label = bytes(msg(msg(self.icb, "description", restype=objc_instance), "UTF8String", restype=ctypes.c_char_p)).decode()
34
+ icb_label = bytes(msg("UTF8String", ctypes.c_char_p)(msg("description", objc_instance)(self.icb))).decode()
35
35
  self.needs_icb_fix = int("AGXG15XFamilyIndirectCommandBuffer" not in icb_label) # not required on M3
36
36
 
37
37
  if len(self.vars): self.int_buf = self.dev.allocator.alloc(len(self.vars)*dtypes.int32.itemsize)
@@ -39,18 +39,18 @@ class MetalGraph(GraphRunner):
39
39
  all_pipelines = []
40
40
  for j,ji in enumerate(jit_cache):
41
41
  prg: CompiledRunner = cast(CompiledRunner, ji.prg)
42
- icb_command = msg(self.icb, "indirectComputeCommandAtIndex:", j, restype=objc_instance)
42
+ icb_command = msg("indirectComputeCommandAtIndex:", objc_instance)(self.icb, j)
43
43
  all_pipelines.append(prg._prg.pipeline_state)
44
- msg(icb_command, "setComputePipelineState:", prg._prg.pipeline_state)
44
+ msg("setComputePipelineState:")(icb_command, prg._prg.pipeline_state)
45
45
  for i,b in enumerate(ji.bufs):
46
46
  if b is not None and b not in input_rawbuffers:
47
- msg(icb_command, "setKernelBuffer:offset:atIndex:", b._buf.buf, b._buf.offset, i)
47
+ msg("setKernelBuffer:offset:atIndex:")(icb_command, b._buf.buf, b._buf.offset, i)
48
48
  all_resources.append(b._buf.buf)
49
- for i,v in enumerate(prg.p.vars): msg(icb_command, "setKernelBuffer:offset:atIndex:", self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i)
49
+ for i,v in enumerate(prg.p.vars): msg("setKernelBuffer:offset:atIndex:")(icb_command, self.int_buf.buf, self.vars.index(v)*4, len(ji.bufs)+i)
50
50
 
51
51
  global_size, local_size = prg.p.launch_dims(var_vals)
52
- msg(icb_command, "concurrentDispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_size), to_struct(*local_size))
53
- msg(icb_command, "setBarrier")
52
+ msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(icb_command, to_struct(*global_size), to_struct(*local_size))
53
+ msg("setBarrier")(icb_command)
54
54
 
55
55
  self.all_resources = dedup(all_resources)
56
56
  self.all_pipelines = dedup(all_pipelines)
@@ -64,18 +64,17 @@ class MetalGraph(GraphRunner):
64
64
  all_resources = dedup(self.all_resources + [x._buf.buf for x in input_rawbuffers])
65
65
 
66
66
  for (j,i),input_idx in self.input_replace.items():
67
- computeCommand = msg(self.icb, "indirectComputeCommandAtIndex:", j, restype=objc_id)
68
- msg(computeCommand, "setKernelBuffer:offset:atIndex:", input_rawbuffers[input_idx]._buf.buf,
69
- input_rawbuffers[input_idx]._buf.offset, i)
67
+ computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
68
+ msg("setKernelBuffer:offset:atIndex:")(computeCommand, input_rawbuffers[input_idx]._buf.buf, input_rawbuffers[input_idx]._buf.offset, i)
70
69
 
71
70
  for j, global_dims, local_dims in self.updated_launch_dims(var_vals):
72
- computeCommand = msg(self.icb, "indirectComputeCommandAtIndex:", j, restype=objc_id)
73
- msg(computeCommand, "concurrentDispatchThreadgroups:threadsPerThreadgroup:", to_struct(*global_dims), to_struct(*local_dims))
71
+ computeCommand = msg("indirectComputeCommandAtIndex:", objc_id)(self.icb, j)
72
+ msg("concurrentDispatchThreadgroups:threadsPerThreadgroup:")(computeCommand, to_struct(*global_dims), to_struct(*local_dims))
74
73
  for j, var in enumerate(self.vars): self.int_buf_view[j] = var_vals[var]
75
74
 
76
- command_buffer = msg(self.dev.mtl_queue, "commandBuffer", restype=objc_instance)
77
- encoder = msg(command_buffer, "computeCommandEncoder", restype=objc_instance)
78
- msg(encoder, "useResources:count:usage:", (objc_id * len(all_resources))(*all_resources), len(all_resources),
75
+ command_buffer = msg("commandBuffer", objc_instance)(self.dev.mtl_queue)
76
+ encoder = msg("computeCommandEncoder", objc_instance)(command_buffer)
77
+ msg("useResources:count:usage:")(encoder, (objc_id * len(all_resources))(*all_resources), len(all_resources),
79
78
  MTLResourceUsage.MTLResourceUsageRead | MTLResourceUsage.MTLResourceUsageWrite)
80
79
 
81
80
  # NOTE: the pipelines likely need to be added to the used resources to fix the crash on M1/M2, but I haven't figured out how
@@ -85,13 +84,13 @@ class MetalGraph(GraphRunner):
85
84
  # to repro the crash (which can also crash other running GPU apps), run with FIX_METAL_ICB=0
86
85
  if getenv("FIX_METAL_ICB", self.needs_icb_fix):
87
86
  for ps in self.all_pipelines:
88
- msg(encoder, "setComputePipelineState:", ps)
89
- msg(encoder, "dispatchThreadgroups:threadsPerThreadgroup:", to_struct(0,0,0), to_struct(0,0,0))
87
+ msg("setComputePipelineState:")(encoder, ps)
88
+ msg("dispatchThreadgroups:threadsPerThreadgroup:")(encoder, to_struct(0,0,0), to_struct(0,0,0))
90
89
 
91
- msg(encoder, "executeCommandsInBuffer:withRange:", self.icb, self.range)
92
- msg(encoder, "endEncoding")
93
- msg(command_buffer, "setLabel:", to_ns_str(f"batched {len(self.jit_cache)}"))
94
- msg(command_buffer, "commit")
90
+ msg("executeCommandsInBuffer:withRange:")(encoder, self.icb, self.range)
91
+ msg("endEncoding")(encoder)
92
+ msg("setLabel:")(command_buffer, to_ns_str(f"batched {len(self.jit_cache)}"))
93
+ msg("commit")(command_buffer)
95
94
  self.command_buffer = command_buffer
96
95
 
97
96
  self.dev.mtl_buffers_in_flight.append(command_buffer)
@@ -1,14 +1,14 @@
1
1
  from __future__ import annotations
2
2
  from typing import Any, cast
3
- import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select, atexit
3
+ import os, ctypes, ctypes.util, functools, mmap, errno, array, contextlib, sys, select
4
4
  assert sys.platform != 'win32'
5
5
  from dataclasses import dataclass
6
6
  from tinygrad.runtime.support.hcq import HCQCompiled, HCQAllocator, HCQBuffer, HWQueue, CLikeArgsState, HCQSignal, HCQProgram, HWInterface
7
7
  from tinygrad.ops import sint
8
- from tinygrad.device import BufferSpec
8
+ from tinygrad.device import BufferSpec, CPUProgram
9
9
  from tinygrad.helpers import getenv, to_mv, round_up, data64_le, mv_address, DEBUG, OSX
10
10
  from tinygrad.renderer.cstyle import AMDRenderer
11
- from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, libpciaccess, vfio
11
+ from tinygrad.runtime.autogen import kfd, hsa, amd_gpu, libc, pci, vfio
12
12
  from tinygrad.runtime.autogen.am import am
13
13
  from tinygrad.runtime.support.compiler_hip import AMDCompiler
14
14
  from tinygrad.runtime.support.elf import elf_loader
@@ -151,13 +151,11 @@ class AMDComputeQueue(HWQueue):
151
151
  for i, value in enumerate(cmds): dev.compute_queue.ring[(dev.compute_queue.put_value + i) % len(dev.compute_queue.ring)] = value
152
152
 
153
153
  dev.compute_queue.put_value += len(cmds)
154
- dev.compute_queue.write_ptr[0] = dev.compute_queue.put_value
155
- dev.compute_queue.doorbell[0] = dev.compute_queue.put_value
154
+ dev.compute_queue.signal_doorbell()
156
155
 
157
- SDMA_MAX_COPY_SIZE = 0x400000
158
156
  class AMDCopyQueue(HWQueue):
159
- def __init__(self):
160
- self.internal_cmd_sizes = []
157
+ def __init__(self, max_copy_size=0x40000000):
158
+ self.internal_cmd_sizes, self.max_copy_size = [], max_copy_size
161
159
  super().__init__()
162
160
 
163
161
  def q(self, *arr):
@@ -165,10 +163,10 @@ class AMDCopyQueue(HWQueue):
165
163
  self.internal_cmd_sizes.append(len(arr))
166
164
 
167
165
  def copy(self, dest:sint, src:sint, copy_size:int):
168
- copied, copy_commands = 0, (copy_size + SDMA_MAX_COPY_SIZE - 1) // SDMA_MAX_COPY_SIZE
166
+ copied, copy_commands = 0, (copy_size + self.max_copy_size - 1) // self.max_copy_size
169
167
 
170
168
  for _ in range(copy_commands):
171
- step_copy_size = min(copy_size - copied, SDMA_MAX_COPY_SIZE)
169
+ step_copy_size = min(copy_size - copied, self.max_copy_size)
172
170
 
173
171
  self.q(amd_gpu.SDMA_OP_COPY | amd_gpu.SDMA_PKT_COPY_LINEAR_HEADER_SUB_OP(amd_gpu.SDMA_SUBOP_COPY_LINEAR),
174
172
  amd_gpu.SDMA_PKT_COPY_LINEAR_COUNT_COUNT(step_copy_size - 1), 0, *data64_le(src + copied), *data64_le(dest + copied))
@@ -237,8 +235,7 @@ class AMDCopyQueue(HWQueue):
237
235
  dev.sdma_queue.ring[0:rem_packet_cnt] = array.array('I', cmds[tail_blit_dword:])
238
236
  dev.sdma_queue.put_value += rem_packet_cnt * 4
239
237
 
240
- dev.sdma_queue.write_ptr[0] = dev.sdma_queue.put_value
241
- dev.sdma_queue.doorbell[0] = dev.sdma_queue.put_value
238
+ dev.sdma_queue.signal_doorbell()
242
239
 
243
240
  class AMDProgram(HCQProgram):
244
241
  def __init__(self, dev:AMDDevice, name:str, lib:bytes):
@@ -280,8 +277,6 @@ class AMDProgram(HCQProgram):
280
277
  if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, BufferSpec(cpu_access=True, nolru=True))
281
278
 
282
279
  class AMDAllocator(HCQAllocator['AMDDevice']):
283
- def __init__(self, dev:AMDDevice): super().__init__(dev, batch_size=SDMA_MAX_COPY_SIZE)
284
-
285
280
  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
286
281
  return self.dev.dev_iface.alloc(size, host=options.host, uncached=options.uncached, cpu_access=options.cpu_access)
287
282
 
@@ -301,6 +296,13 @@ class AMDQueueDesc:
301
296
  doorbell: memoryview
302
297
  put_value: int = 0
303
298
 
299
+ def signal_doorbell(self):
300
+ self.write_ptr[0] = self.put_value
301
+
302
+ # Ensure all prior writes are visible to the GPU.
303
+ if CPUProgram.atomic_lib is not None: CPUProgram.atomic_lib.atomic_thread_fence(__ATOMIC_SEQ_CST:=5)
304
+ self.doorbell[0] = self.put_value
305
+
304
306
  class KFDIface:
305
307
  kfd:HWInterface|None = None
306
308
  event_page:HCQBuffer|None = None
@@ -426,6 +428,7 @@ class KFDIface:
426
428
  class AMAllocationMeta: owner:AMDDevice; mapped_devs:list[AMDDevice]; mapping:AMMapping # noqa: E702
427
429
 
428
430
  class PCIIface:
431
+ supported_devs:list[int] = [0x744c, 0x7480]
429
432
  vfio:bool = getenv("VFIO", 1) and HWInterface.exists("/dev/vfio/vfio")
430
433
  vfio_fd:HWInterface
431
434
  gpus:list[Any] = []
@@ -434,25 +437,23 @@ class PCIIface:
434
437
  self.dev = dev
435
438
 
436
439
  if first_dev:=len(PCIIface.gpus) == 0:
437
- libpciaccess.pci_system_init()
438
- pci_iter = libpciaccess.pci_id_match_iterator_create(None)
439
- while pcidev:=libpciaccess.pci_device_next(pci_iter):
440
- if pcidev.contents.vendor_id == 0x1002 and pcidev.contents.device_id == 0x744c: PCIIface.gpus.append(pcidev.contents)
440
+ for pcibus in HWInterface("/sys/bus/pci/devices").listdir():
441
+ vendor = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/vendor").read(), 16)
442
+ device = int(HWInterface(f"/sys/bus/pci/devices/{pcibus}/device").read(), 16)
443
+ if vendor == 0x1002 and device in PCIIface.supported_devs: PCIIface.gpus.append(pcibus)
441
444
 
442
445
  # TODO: visible_devices should be handled layer above this?
443
446
  visible_devices = [int(x) for x in (getenv('VISIBLE_DEVICES', getenv('HIP_VISIBLE_DEVICES', ''))).split(',') if x.strip()]
444
447
  PCIIface.gpus = [PCIIface.gpus[x] for x in visible_devices] if visible_devices else PCIIface.gpus
445
448
 
446
- self.pcidev = PCIIface.gpus[dev_id]
447
- self.pcibus = f"{self.pcidev.domain_16:04x}:{self.pcidev.bus:02x}:{self.pcidev.dev:02x}.{self.pcidev.func:d}"
449
+ self.pcibus = PCIIface.gpus[dev_id]
448
450
 
449
451
  # Unbind the device from the kernel driver
450
452
  if HWInterface.exists(f"/sys/bus/pci/devices/{self.pcibus}/driver"):
451
453
  HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/driver/unbind", os.O_WRONLY).write(self.pcibus)
452
- HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write("15")
453
454
 
454
- # Probe device
455
- libpciaccess.pci_device_probe(ctypes.byref(self.pcidev))
455
+ supported_sizes = int(HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDONLY).read(), 16)
456
+ HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource0_resize", os.O_RDWR).write(str(supported_sizes.bit_length() - 1))
456
457
 
457
458
  # Try to init vfio. Use it if success.
458
459
  if PCIIface.vfio:
@@ -485,16 +486,20 @@ class PCIIface:
485
486
  irqs = vfio.struct_vfio_irq_set(index=vfio.VFIO_PCI_MSI_IRQ_INDEX, flags=vfio.VFIO_IRQ_SET_DATA_EVENTFD|vfio.VFIO_IRQ_SET_ACTION_TRIGGER,
486
487
  argsz=ctypes.sizeof(vfio.struct_vfio_irq_set), count=1, data=(ctypes.c_int * 1)(self.irq_fd.fd))
487
488
  vfio.VFIO_DEVICE_SET_IRQS(self.vfio_dev, irqs)
488
- else: libpciaccess.pci_device_enable(ctypes.byref(self.pcidev))
489
+ else: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/enable", os.O_RDWR).write("1")
489
490
 
490
491
  self.pagemap = HWInterface("/proc/self/pagemap", os.O_RDONLY)
491
- self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC) for bar in [0, 2, 5]}
492
+ self.cfg_fd = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/config", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC)
493
+ self.bar_fds = {bar: HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource{bar}", os.O_RDWR | os.O_SYNC | os.O_CLOEXEC) for bar in [0, 2, 5]}
494
+
495
+ bar_info = HWInterface(f"/sys/bus/pci/devices/{self.pcibus}/resource", os.O_RDONLY).read().splitlines()
496
+ self.bar_info = {j:(int(start,16), int(end,16), int(flgs,16)) for j,(start,end,flgs) in enumerate(l.split() for l in bar_info)}
492
497
 
493
498
  self.adev = AMDev(self.pcibus, self._map_pci_range(0), dbell:=self._map_pci_range(2).cast('Q'), self._map_pci_range(5).cast('I'))
494
499
  self.doorbell_cpu_addr = mv_address(dbell)
495
500
 
496
- libpciaccess.pci_device_cfg_read_u16(self.pcidev, ctypes.byref(val:=ctypes.c_uint16()), libpciaccess.PCI_COMMAND)
497
- libpciaccess.pci_device_cfg_write_u16(self.pcidev, val.value | libpciaccess.PCI_COMMAND_MASTER, libpciaccess.PCI_COMMAND)
501
+ pci_cmd = int.from_bytes(self.cfg_fd.read(2, binary=True, offset=pci.PCI_COMMAND), byteorder='little') | pci.PCI_COMMAND_MASTER
502
+ self.cfg_fd.write(pci_cmd.to_bytes(2, byteorder='little'), binary=True, offset=pci.PCI_COMMAND)
498
503
 
499
504
  array_count = self.adev.gc_info.gc_num_sa_per_se * self.adev.gc_info.gc_num_se
500
505
  simd_count = 2 * array_count * (self.adev.gc_info.gc_num_wgp0_per_sa + self.adev.gc_info.gc_num_wgp1_per_sa)
@@ -503,8 +508,9 @@ class PCIIface:
503
508
  'simd_arrays_per_engine': self.adev.gc_info.gc_num_sa_per_se, 'lds_size_in_kb': self.adev.gc_info.gc_lds_size}
504
509
 
505
510
  def _map_pci_range(self, bar, off=0, addr=0, size=None):
506
- fd, sz = self.bar_fds[bar], size or self.pcidev.regions[bar].size
507
- return to_mv(fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz)
511
+ fd, sz = self.bar_fds[bar], size or (self.bar_info[bar][1] - self.bar_info[bar][0] + 1)
512
+ libc.madvise(loc:=fd.mmap(addr, sz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED | (MAP_FIXED if addr else 0), off), sz, libc.MADV_DONTFORK)
513
+ return to_mv(loc, sz)
508
514
 
509
515
  def alloc(self, size:int, host=False, uncached=False, cpu_access=False):
510
516
  if host or (not getenv("AMD_ALLOC_QUEUE_DEV_MEM", 1) and uncached and cpu_access): # host or gtt-like memory.
@@ -530,8 +536,7 @@ class PCIIface:
530
536
  if self.dev in mem.meta.mapped_devs: return
531
537
  mem.meta.mapped_devs.append(self.dev)
532
538
 
533
- owner_sys_base = mem.meta.owner.dev_iface.pcidev.regions[0].base_addr
534
- paddrs = [(paddr if mem.meta.mapping.system else (paddr + owner_sys_base), size) for paddr, size in mem.meta.mapping.paddrs]
539
+ paddrs = [(paddr if mem.meta.mapping.system else (paddr+mem.meta.owner.dev_iface.bar_info[0][0]), size) for paddr,size in mem.meta.mapping.paddrs]
535
540
  self.adev.mm.map_range(mem.va_addr, mem.size, paddrs, system=True, snooped=mem.meta.mapping.snooped, uncached=mem.meta.mapping.uncached)
536
541
 
537
542
  def create_queue(self, queue_type, ring, gart, eop_buffer=None, ctl_stack_size=0, ctx_save_restore_size=0, debug_memory_size=0):
@@ -596,8 +601,6 @@ class AMDDevice(HCQCompiled):
596
601
  self.max_private_segment_size = 0
597
602
  self._ensure_has_local_memory(128) # set default scratch size to 128 bytes per thread
598
603
 
599
- atexit.register(self.device_fini)
600
-
601
604
  def create_queue(self, queue_type, ring_size, ctx_save_restore_size=0, eop_buffer_size=0, ctl_stack_size=0, debug_memory_size=0):
602
605
  ring = self.dev_iface.alloc(ring_size, uncached=True, cpu_access=True)
603
606
  gart = self.dev_iface.alloc(0x1000, uncached=True, cpu_access=True)
@@ -627,6 +630,6 @@ class AMDDevice(HCQCompiled):
627
630
 
628
631
  def on_device_hang(self): self.dev_iface.on_device_hang()
629
632
 
630
- def device_fini(self):
633
+ def finalize(self):
631
634
  self.synchronize()
632
635
  if hasattr(self.dev_iface, 'device_fini'): self.dev_iface.device_fini()
@@ -1,5 +1,5 @@
1
1
  import platform, subprocess, sys
2
- from tinygrad.helpers import capstone_flatdump
2
+ from tinygrad.helpers import capstone_flatdump, getenv
3
3
  from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
4
4
  from tinygrad.runtime.support.elf import jit_loader
5
5
  from tinygrad.renderer.cstyle import ClangRenderer
@@ -13,10 +13,12 @@ class ClangJITCompiler(Compiler):
13
13
  target = 'x86_64' if sys.platform == 'win32' else platform.machine()
14
14
  args = ['-march=native', f'--target={target}-none-unknown-elf', '-O2', '-fPIC', '-ffreestanding', '-fno-math-errno', '-nostdlib']
15
15
  arch_args = ['-ffixed-x18'] if target == 'arm64' else []
16
- obj = subprocess.check_output(['clang', '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
16
+ obj = subprocess.check_output([getenv("CC", 'clang'), '-c', '-x', 'c', *args, *arch_args, '-', '-o', '-'], input=src.encode('utf-8'))
17
17
  return jit_loader(obj)
18
18
 
19
19
  def disassemble(self, lib:bytes): return capstone_flatdump(lib)
20
20
 
21
21
  class ClangDevice(Compiled):
22
22
  def __init__(self, device:str): super().__init__(device, MallocAllocator, ClangRenderer(), ClangJITCompiler(), CPUProgram)
23
+
24
+ CPUDevice = ClangDevice
@@ -67,7 +67,7 @@ class DiskBuffer:
67
67
  self.device, self.size, self.offset = device, size, offset
68
68
  def __repr__(self): return f"<DiskBuffer size={self.size} offset={self.offset}>"
69
69
  def _buf(self) -> memoryview:
70
- assert hasattr(self.device, "mem"), "DiskBuffer wasn't opened"
70
+ assert hasattr(self.device, "mem"), f"DiskBuffer wasn't opened: {self.device.device}"
71
71
  return memoryview(self.device.mem)[self.offset:self.offset+self.size]
72
72
 
73
73
  MAP_LOCKED, MAP_POPULATE = 0 if OSX else 0x2000, getattr(mmap, "MAP_POPULATE", 0 if OSX else 0x008000)
@@ -1,6 +1,5 @@
1
1
  from __future__ import annotations
2
- from typing import Tuple, Any, List
3
- import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, time, struct
2
+ import ctypes, os, mmap, tempfile, pathlib, array, functools, threading, contextlib, sys, subprocess, struct
4
3
  assert sys.platform != 'win32'
5
4
  from tinygrad.device import BufferSpec, Compiled, Allocator, Compiler, MallocAllocator
6
5
  from tinygrad.dtype import dtypes, DType, PtrDType
@@ -10,25 +9,45 @@ from tinygrad.renderer.cstyle import ClangRenderer
10
9
  from tinygrad.runtime.autogen import libc, qcom_dsp
11
10
  if getenv("IOCTL"): import extra.dsp.run # noqa: F401 # pylint: disable=unused-import
12
11
 
12
+ from tinygrad.ops import PatternMatcher, UPat
13
+
14
+ dsp_pm = PatternMatcher([
15
+ (((UPat.var('x').maximum(0) ^ -1).maximum(-256) ^ -1).cast(dtypes.uchar.vec(128)),
16
+ lambda x: UOp(Ops.CUSTOM, dtypes.uchar.vec(128), src=tuple(x.gep(tuple(range(i, i+32))) for i in range(0, 128, 32)),
17
+ arg="__builtin_HEXAGON_V6_vpackhub_sat_128B(__builtin_HEXAGON_V6_vpackwh_sat_128B({3}, {2}), __builtin_HEXAGON_V6_vpackwh_sat_128B({1}, {0}))")),
18
+ (UPat(Ops.GEP, name="x"), lambda x: UOp(Ops.CUSTOM, x.dtype, x.src+x.src,
19
+ "__builtin_shufflevector({0}, {1}, "+','.join([str(y) for y in x.arg])+")") if len(x.arg) > 1 else None),
20
+ ])
21
+
22
+ dsp_pm_late = PatternMatcher([
23
+ (UPat.var("x")+UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x+UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
24
+ (UPat.var("x")*UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x*UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
25
+ (UPat.var("x")//UPat(Ops.VECTORIZE, src=UPat.var("y")), lambda x,y: x//UOp(Ops.CUSTOM, x.dtype, (y,), arg="{0}")),
26
+ (UPat(Ops.DEFINE_ACC, src=(UPat(Ops.VECTORIZE, src=UPat(Ops.CONST, arg=0)),), dtype=dtypes.uchar.vec(128), name="d", allow_any_len=True),
27
+ lambda d: d.replace(src=(UOp(Ops.CUSTOM, d.dtype, arg="__builtin_HEXAGON_V6_vd0_128B()"),)+d.src[1:])),
28
+ ])
29
+
13
30
  class DSPRenderer(ClangRenderer):
14
31
  device = "DSP"
15
- supports_float4 = False
32
+ supports_float4 = True
16
33
  buffer_suffix = " restrict __attribute__((align_value(128)))"
17
34
  kernel_prefix = "__attribute__((noinline)) "
35
+ pre_matcher = dsp_pm
36
+ extra_matcher = dsp_pm_late+ClangRenderer.extra_matcher
18
37
  type_map = { **ClangRenderer.type_map, dtypes.uint64: "unsigned long long", dtypes.int64: "long long" }
19
38
  code_for_op = {**ClangRenderer.code_for_op, Ops.SIN: lambda x,dtype: f"__builtin_sin({x})",
20
39
  Ops.LOG2: lambda x,dtype: f"__builtin_log2l({x})" if dtype == dtypes.float64 else f"__builtin_log2f({x})",
21
40
  Ops.EXP2: lambda x,dtype: f"__builtin_exp2l({x})" if dtype == dtypes.float64 else f"__builtin_exp2f({x})"}
22
41
 
23
- def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,Tuple[DType,bool]]], uops:List[UOp], prefix=None) -> str:
42
+ def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
24
43
  ret = super().render_kernel(function_name, kernel, bufs, uops, prefix)
25
- msrc = ['''struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency; _Bool set_dcvs_params;
26
- short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3]; };''', 'int HAP_power_set(void*, void*);',
27
- 'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
28
- 'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
29
- 'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
30
- 'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
31
- 'HAP_power_set((void*)handle, (void*)&req);']
44
+ msrc = ['''/* DSP boilerplate */ struct dcvs_v2_req { int type; int _pad; _Bool dcvs_enable; char dcvs_option; _Bool set_latency; int latency;
45
+ _Bool set_dcvs_params; short _pad2; char target_corner; char min_corner; char max_corner; int _pad3[3];};''','int HAP_power_set(void*, void*);',
46
+ 'typedef union { struct { void *pv; unsigned int len; } buf; struct { int fd; unsigned int offset; } dma; } remote_arg;',
47
+ 'void* HAP_mmap(void *addr, int len, int prot, int flags, int fd, long offset);', 'int HAP_munmap(void *addr, int len);',
48
+ 'unsigned long long HAP_perf_get_time_us(void);', 'int entry(unsigned long long handle, unsigned int sc, remote_arg* pra) {',
49
+ 'struct dcvs_v2_req req = {.type=7, .dcvs_enable=0, .set_latency=1, .latency=100, .set_dcvs_params=1, .target_corner = 6 /* TURBO */};',
50
+ 'HAP_power_set((void*)handle, (void*)&req);']
32
51
  msrc += ['if ((sc>>24) != 2) return 0;']
33
52
  msrc += [f'int sz_or_val_{i} = ((int*)pra[0].buf.pv)[{i}];' for i,b in enumerate(bufs)]
34
53
  msrc += [f'int off{i} = ((int*)pra[1].buf.pv)[{i}];' for i,b in enumerate(bufs) if isinstance(b[1][0], PtrDType)]
@@ -55,7 +74,7 @@ class DSPProgram:
55
74
  def __init__(self, dev:DSPDevice, name:str, lib:bytes):
56
75
  self.dev, self.lib = dev, lib
57
76
 
58
- def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
77
+ def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
59
78
  if len(bufs) >= 16: raise RuntimeError(f"Too many buffers to execute: {len(bufs)}")
60
79
 
61
80
  pra, fds, attrs, _ = rpc_prep_args(ins=[var_vals_mv:=memoryview(bytearray((len(bufs)+len(vals))*4)), off_mv:=memoryview(bytearray(len(bufs)*4))],
@@ -66,7 +85,7 @@ class DSPProgram:
66
85
  return timer[0] / 1e6
67
86
 
68
87
  class DSPBuffer:
69
- def __init__(self, va_addr:int, size:int, share_info:Any, offset:int=0):
88
+ def __init__(self, va_addr:int, size:int, share_info, offset:int=0):
70
89
  self.va_addr, self.size, self.share_info, self.offset = va_addr, size, share_info, offset
71
90
 
72
91
  class DSPAllocator(Allocator):
@@ -81,9 +100,10 @@ class DSPAllocator(Allocator):
81
100
  return DSPBuffer(va_addr, size, share_info, offset=0)
82
101
 
83
102
  def _free(self, opaque:DSPBuffer, options:BufferSpec):
84
- libc.munmap(opaque.va_addr, opaque.size)
85
- os.close(opaque.share_info.fd)
86
- qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
103
+ if libc is not None and qcom_dsp is not None:
104
+ libc.munmap(opaque.va_addr, opaque.size)
105
+ os.close(opaque.share_info.fd)
106
+ qcom_dsp.ION_IOC_FREE(self.dev.ion_fd, handle=opaque.share_info.handle)
87
107
 
88
108
  def _as_buffer(self, src:DSPBuffer) -> memoryview: return to_mv(src.va_addr, src.size)
89
109
  def _copyin(self, dest:DSPBuffer, src:memoryview): ctypes.memmove(dest.va_addr, from_mv(src), src.nbytes)
@@ -99,7 +119,7 @@ class ClangCompiler(Compiler):
99
119
  def compile(self, src:str) -> bytes:
100
120
  # TODO: remove file write. sadly clang doesn't like the use of /dev/stdout here
101
121
  with tempfile.NamedTemporaryFile(delete=True) as output_file:
102
- subprocess.check_output(['clang', *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
122
+ subprocess.check_output([getenv("CC", 'clang'), *self.args, '-O2', '-Wall', '-Werror', '-x', 'c', '-fPIC', '-ffreestanding', '-nostdlib',
103
123
  '-', '-o', str(output_file.name)], input=src.encode('utf-8'))
104
124
  return pathlib.Path(output_file.name).read_bytes()
105
125
 
@@ -228,25 +248,32 @@ class RPCListener(threading.Thread):
228
248
 
229
249
  # ***** mock DSP *****
230
250
 
251
+ mockdsp_boilerplate = '''/* DSP boilerplate */ static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
252
+ long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = %7; trap0(#1); %0 = r0" : "=r" (retval)
253
+ : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "r" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
254
+ static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
255
+ static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
256
+ static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
257
+ static unsigned int inscount(void) {{ unsigned int ret; __asm__ volatile(".word 0x6a15c000; %0 = R0" : "=r" (ret) : : "r0"); return ret; }}
258
+ static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
259
+ return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}'''
260
+
231
261
  class MockDSPRenderer(DSPRenderer):
232
- def render_kernel(self, function_name:str, kernel:List[str], bufs:List[Tuple[str,Tuple[DType,bool]]], uops:List[UOp], prefix=None) -> str:
262
+ def render_kernel(self, function_name:str, kernel:list[str], bufs:list[tuple[str,tuple[DType,bool]]], uops:list[UOp], prefix=None) -> str:
233
263
  ret = ClangRenderer.render_kernel(self, function_name, kernel, bufs, uops, prefix)
234
264
  # https://gpages.juszkiewicz.com.pl/syscalls-table/syscalls.html
235
- msrc = ['''static long syscall(long r0, long r1, long r2, long r3, long r4, long r5, long r6) {
236
- long retval; __asm__ volatile("r0 = %1; r1 = %2; r2 = %3; r3 = %4; r4 = %5; r5 = %6; r6 = #%7; trap0(#1); %0 = r0" : "=r" (retval)
237
- : "r" (r0), "r" (r1), "r" (r2), "r" (r3), "r" (r4), "r" (r5), "i" (r6) : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); return retval; }
238
- static int read(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 63); }}
239
- static int write(int fd, void* buf, int len) {{ return syscall(fd, (long)buf, len, 0, 0, 0, 64); }}
240
- static int exit(int ret) {{ return syscall(ret, 0, 0, 0, 0, 0, 93); }}
241
- static void *mmap2(void *addr, unsigned int length, int prot, int flags, int fd, unsigned long offset) {{
242
- return (void*)syscall((long)addr, length, prot, flags, fd, offset, 222); }}''', 'void _start(void) {']
265
+ # control register 21 is HEX_REG_QEMU_INSN_CNT, 0x6a15c000 loads it
266
+ msrc = [mockdsp_boilerplate, 'void _start(void) {']
243
267
  for i,b in enumerate(bufs):
244
268
  if isinstance(b[1][0], PtrDType):
245
269
  sz = b[1][0].size*b[1][0].itemsize
246
- msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); read(0, buf{i}, {sz});")
270
+ # for loop for big reads
271
+ msrc.append(f"void *buf{i} = mmap2(0, {sz}, 3, 0x21, -1, 0); for(int rd = 0; rd < {sz}; rd += read(0, buf{i}+rd, {sz}-rd));")
247
272
  else:
248
273
  msrc.append(f"unsigned int val{i}; read(0, &val{i}, 4);")
274
+ msrc.append("unsigned int st = inscount();")
249
275
  msrc.append(f"{function_name}({', '.join([(f'(void*)buf{i}' if isinstance(b[1][0], PtrDType) else f'val{i}') for i,b in enumerate(bufs)])});")
276
+ msrc.append("unsigned int et = inscount() - st; write(1, &et, sizeof(et));")
250
277
  for i,b in enumerate(bufs):
251
278
  if isinstance(b[1][0], PtrDType): msrc.append(f"write(1, buf{i}, {b[1][0].size*b[1][0].itemsize});")
252
279
  msrc.append('exit(0); }')
@@ -254,19 +281,18 @@ class MockDSPRenderer(DSPRenderer):
254
281
 
255
282
  class MockDSPProgram:
256
283
  def __init__(self, name:str, lib:bytes): self.lib = lib
257
- def __call__(self, *bufs, vals:Tuple[int, ...]=(), wait=False):
284
+ def __call__(self, *bufs, vals:tuple[int, ...]=(), wait=False):
258
285
  with tempfile.NamedTemporaryFile(suffix=".out") as dsp_lib:
259
286
  dsp_lib.write(self.lib)
260
287
  dsp_lib.flush()
261
288
  os.chmod(dsp_lib.name, 0o0777)
262
289
  # NOTE: this timing includes a docker launch
263
- start = time.perf_counter()
264
290
  proc = subprocess.run(["docker", "run", "--rm", "-i", "-v", f"{os.path.abspath(os.path.dirname(dsp_lib.name))}:/work", "-w", "/work",
265
- "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 3 else ''} /work/"+os.path.basename(dsp_lib.name)],
291
+ "qemu-hexagon", "-c", f"qemu-hexagon {'-strace' if DEBUG >= 5 else ''} /work/"+os.path.basename(dsp_lib.name)],
266
292
  input=b''.join([bytes(x) for x in bufs] + [struct.pack("I", x) for x in vals]), stdout=subprocess.PIPE, check=True)
267
- elapsed = time.perf_counter() - start
268
- offset = 0
293
+ offset = 4
269
294
  for x in bufs:
270
295
  x[:] = proc.stdout[offset:offset+len(x)]
271
296
  offset += len(x)
272
- return elapsed
297
+ assert offset == len(proc.stdout)
298
+ return struct.unpack("I", proc.stdout[0:4])[0] / 1e9 # pretend it's 1 Ghz, but this is an inscount, not a time
@@ -1,6 +1,6 @@
1
- import ctypes, platform, sys
1
+ import ctypes, platform
2
2
  from tinygrad.device import Compiled, Compiler, MallocAllocator, CPUProgram
3
- from tinygrad.helpers import OSX, getenv, capstone_flatdump
3
+ from tinygrad.helpers import OSX, getenv, capstone_flatdump, DEBUG
4
4
  from tinygrad.renderer.llvmir import LLVMRenderer
5
5
  import tinygrad.runtime.autogen.llvm as llvm
6
6
  from tinygrad.runtime.support.elf import jit_loader
@@ -12,17 +12,19 @@ def expect(x, err, ret=None):
12
12
  return ret
13
13
 
14
14
  class LLVMCompiler(Compiler):
15
- def __init__(self, host_arch:str, opt:bool):
16
- for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
15
+ def __init__(self, host_arch:str):
16
+ for component in ['Target', 'TargetInfo', 'TargetMC', 'AsmParser', 'AsmPrinter']: getattr(llvm, f'LLVMInitialize{host_arch}{component}')()
17
17
 
18
18
  triple = {'AArch64': b'aarch64', 'X86': b'x86_64'}[host_arch] + b'-none-unknown-elf'
19
19
  target = expect(llvm.LLVMGetTargetFromTriple(triple, ctypes.pointer(tgt:=llvm.LLVMTargetRef()), err:=cerr()), err, tgt)
20
- # +reserve-x18 here does the same thing as -ffixed-x18 in ops_clang.py, see comments there for why it's needed on arm osx
21
- self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, b'', b'+reserve-x18' if OSX and host_arch == 'AArch64' else b'',
20
+ # +reserve-x18 here does the same thing as -ffixed-x18 in ops_cpu.py, see comments there for why it's needed on arm osx
21
+ cpu, feats = ctypes.string_at(llvm.LLVMGetHostCPUName()), (b'+reserve-x18,' if OSX else b'') + ctypes.string_at(llvm.LLVMGetHostCPUFeatures())
22
+ if DEBUG >= 2: print(f"LLVM init for {cpu!r} with {feats!r}")
23
+ self.target_machine = llvm.LLVMCreateTargetMachine(target, triple, cpu, feats,
22
24
  llvm.LLVMCodeGenLevelDefault, llvm.LLVMRelocPIC, llvm.LLVMCodeModelDefault)
23
25
 
24
26
  self.pbo = llvm.LLVMCreatePassBuilderOptions()
25
- if opt:
27
+ if (opt:=bool(getenv("LLVMOPT", "1"))):
26
28
  self.passes = b'default<O2>'
27
29
  llvm.LLVMPassBuilderOptionsSetLoopUnrolling(self.pbo, True)
28
30
  llvm.LLVMPassBuilderOptionsSetLoopVectorization(self.pbo, True)
@@ -33,18 +35,18 @@ class LLVMCompiler(Compiler):
33
35
 
34
36
  super().__init__(f"compile_llvm_jit{'_opt' if opt else ''}")
35
37
 
36
- def __del__(self):
37
- llvm.LLVMDisposePassBuilderOptions(self.pbo)
38
+ def __del__(self): llvm.LLVMDisposePassBuilderOptions(self.pbo)
38
39
 
39
40
  def compile(self, src:str) -> bytes:
40
41
  src_buf = llvm.LLVMCreateMemoryBufferWithMemoryRangeCopy(ctypes.create_string_buffer(src_bytes:=src.encode()), len(src_bytes), b'src')
41
42
  mod = expect(llvm.LLVMParseIRInContext(llvm.LLVMGetGlobalContext(), src_buf, ctypes.pointer(m:=llvm.LLVMModuleRef()), err:=cerr()), err, m)
42
43
  expect(llvm.LLVMVerifyModule(mod, llvm.LLVMReturnStatusAction, err:=cerr()), err)
43
44
  expect(llvm.LLVMRunPasses(mod, self.passes, self.target_machine, self.pbo), 'failed to run passes')
45
+ if DEBUG >= 7: print(ctypes.string_at(llvm.LLVMPrintModuleToString(mod)).decode())
44
46
  obj_buf = expect(llvm.LLVMTargetMachineEmitToMemoryBuffer(self.target_machine, mod, llvm.LLVMObjectFile, err:=cerr(),
45
47
  ctypes.pointer(buf:=llvm.LLVMMemoryBufferRef())), err, buf)
46
- obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
47
48
  llvm.LLVMDisposeModule(mod)
49
+ obj = ctypes.string_at(llvm.LLVMGetBufferStart(obj_buf), llvm.LLVMGetBufferSize(obj_buf))
48
50
  llvm.LLVMDisposeMemoryBuffer(obj_buf)
49
51
  return jit_loader(obj)
50
52
 
@@ -52,5 +54,5 @@ class LLVMCompiler(Compiler):
52
54
 
53
55
  class LLVMDevice(Compiled):
54
56
  def __init__(self, device:str):
55
- compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()], bool(getenv("LLVMOPT")))
56
- super().__init__(device, MallocAllocator, LLVMRenderer('win64cc' if sys.platform == 'win32' else None), compiler, CPUProgram)
57
+ compiler = LLVMCompiler({'arm64': 'AArch64', 'aarch64': 'AArch64', 'x86_64': 'X86', 'AMD64': 'X86'}[platform.machine()])
58
+ super().__init__(device, MallocAllocator, LLVMRenderer(), compiler, CPUProgram)