tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (88)
  1. tinygrad/codegen/devectorizer.py +247 -0
  2. tinygrad/codegen/expander.py +121 -0
  3. tinygrad/codegen/kernel.py +141 -201
  4. tinygrad/codegen/linearize.py +223 -84
  5. tinygrad/codegen/lowerer.py +60 -42
  6. tinygrad/codegen/symbolic.py +476 -0
  7. tinygrad/codegen/transcendental.py +22 -13
  8. tinygrad/device.py +187 -47
  9. tinygrad/dtype.py +39 -28
  10. tinygrad/engine/jit.py +83 -65
  11. tinygrad/engine/memory.py +4 -5
  12. tinygrad/engine/multi.py +161 -0
  13. tinygrad/engine/realize.py +62 -108
  14. tinygrad/engine/schedule.py +396 -357
  15. tinygrad/engine/search.py +55 -66
  16. tinygrad/gradient.py +73 -0
  17. tinygrad/helpers.py +81 -59
  18. tinygrad/nn/__init__.py +30 -32
  19. tinygrad/nn/datasets.py +1 -2
  20. tinygrad/nn/optim.py +22 -26
  21. tinygrad/nn/state.py +91 -66
  22. tinygrad/ops.py +492 -641
  23. tinygrad/renderer/__init__.py +95 -36
  24. tinygrad/renderer/cstyle.py +99 -92
  25. tinygrad/renderer/llvmir.py +83 -34
  26. tinygrad/renderer/ptx.py +83 -99
  27. tinygrad/renderer/wgsl.py +95 -0
  28. tinygrad/runtime/autogen/amd_gpu.py +39507 -12
  29. tinygrad/runtime/autogen/comgr.py +2 -0
  30. tinygrad/runtime/autogen/kfd.py +4 -3
  31. tinygrad/runtime/autogen/kgsl.py +1 -1
  32. tinygrad/runtime/autogen/libc.py +404 -71
  33. tinygrad/runtime/autogen/llvm.py +11379 -0
  34. tinygrad/runtime/autogen/pci.py +1333 -0
  35. tinygrad/runtime/autogen/vfio.py +891 -0
  36. tinygrad/runtime/autogen/webgpu.py +6985 -0
  37. tinygrad/runtime/graph/cuda.py +8 -9
  38. tinygrad/runtime/graph/hcq.py +84 -79
  39. tinygrad/runtime/graph/metal.py +40 -43
  40. tinygrad/runtime/ops_amd.py +498 -334
  41. tinygrad/runtime/ops_cloud.py +34 -34
  42. tinygrad/runtime/ops_cpu.py +24 -0
  43. tinygrad/runtime/ops_cuda.py +30 -27
  44. tinygrad/runtime/ops_disk.py +62 -63
  45. tinygrad/runtime/ops_dsp.py +159 -42
  46. tinygrad/runtime/ops_gpu.py +30 -30
  47. tinygrad/runtime/ops_hip.py +29 -31
  48. tinygrad/runtime/ops_llvm.py +48 -41
  49. tinygrad/runtime/ops_metal.py +149 -113
  50. tinygrad/runtime/ops_npy.py +2 -2
  51. tinygrad/runtime/ops_nv.py +238 -273
  52. tinygrad/runtime/ops_python.py +55 -50
  53. tinygrad/runtime/ops_qcom.py +129 -157
  54. tinygrad/runtime/ops_webgpu.py +225 -0
  55. tinygrad/runtime/support/allocator.py +94 -0
  56. tinygrad/runtime/support/am/__init__.py +0 -0
  57. tinygrad/runtime/support/am/amdev.py +396 -0
  58. tinygrad/runtime/support/am/ip.py +463 -0
  59. tinygrad/runtime/support/compiler_cuda.py +4 -2
  60. tinygrad/runtime/support/elf.py +28 -4
  61. tinygrad/runtime/support/hcq.py +256 -324
  62. tinygrad/runtime/support/llvm.py +26 -0
  63. tinygrad/shape/shapetracker.py +85 -53
  64. tinygrad/shape/view.py +104 -140
  65. tinygrad/spec.py +155 -0
  66. tinygrad/tensor.py +835 -527
  67. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
  68. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
  69. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
  70. tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
  71. tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
  72. tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
  73. tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
  74. tinygrad/viz/index.html +544 -0
  75. tinygrad/viz/perfetto.html +178 -0
  76. tinygrad/viz/serve.py +205 -0
  77. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
  78. tinygrad-0.10.2.dist-info/RECORD +99 -0
  79. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
  80. tinygrad/codegen/uopgraph.py +0 -506
  81. tinygrad/engine/lazy.py +0 -228
  82. tinygrad/function.py +0 -212
  83. tinygrad/multi.py +0 -177
  84. tinygrad/runtime/graph/clang.py +0 -39
  85. tinygrad/runtime/ops_clang.py +0 -35
  86. tinygrad-0.10.0.dist-info/RECORD +0 -77
  87. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
  88. {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
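The hunks below show two of the changed files. For local inspection, a similar file-level comparison can be rebuilt with the standard library after downloading both wheels (e.g. with pip download tinygrad==0.10.0 --no-deps). This is a minimal sketch, not how the registry produced this page; the wheel filenames are assumptions and the +/- counts will not necessarily match the table above exactly.

import zipfile, difflib

def wheel_sources(path):
  # read every .py file out of a wheel (wheels are just zip archives)
  with zipfile.ZipFile(path) as z:
    return {n: z.read(n).decode(errors="replace").splitlines() for n in z.namelist() if n.endswith(".py")}

old = wheel_sources("tinygrad-0.10.0-py3-none-any.whl")
new = wheel_sources("tinygrad-0.10.2-py3-none-any.whl")
for name in sorted(old.keys() | new.keys()):
  diff = list(difflib.unified_diff(old.get(name, []), new.get(name, []), fromfile=name, tofile=name, lineterm=""))
  if diff:
    added = sum(l.startswith("+") and not l.startswith("+++") for l in diff)
    removed = sum(l.startswith("-") and not l.startswith("---") for l in diff)
    print(f"{name} +{added} -{removed}")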
tinygrad/runtime/ops_python.py
@@ -2,10 +2,10 @@
  # a python uops emulator
  # works to test the tensor cores, and all the uops in general
  # this is the (living) definition of uops
- from typing import Tuple, List, Optional, Any, Dict
- import pickle, base64, itertools, time, struct
+ from typing import Optional, Any, TYPE_CHECKING
+ import pickle, base64, itertools, time, struct, sys
  from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate
- from tinygrad.helpers import all_same, getenv, flatten
+ from tinygrad.helpers import all_same, getenv, flatten, get_single_element
  from tinygrad.device import Compiled, Compiler, Allocator
  from tinygrad.ops import exec_alu, Ops, UOp, GroupOp
  from tinygrad.renderer import Renderer
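The typing changes above, and the matching Tuple/List/Dict replacements throughout the rest of this diff, move to the builtin generic syntax of PEP 585 (Python 3.9+), so the typing imports can be dropped. A trivial standalone illustration:

def pairs_to_names(xs: list[tuple[int, int]], names: dict[int, str]) -> tuple[str, ...]:
  # builtin containers are subscriptable directly; no "from typing import List, Tuple, Dict" needed
  return tuple(names.get(a, "?") for a, _ in xs)

print(pairs_to_names([(1, 2), (3, 4)], {1: "one", 3: "three"}))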
@@ -26,21 +26,21 @@ def _store(m, i, v):

  class PythonProgram:
  def __init__(self, name:str, lib:bytes):
- self.uops: List[Tuple[Ops, Optional[DType], List[int], Any]] = pickle.loads(lib)
- def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+ self.uops: list[tuple[Ops, Optional[DType], list[int], Any]] = pickle.loads(lib)
+ def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
  st = time.perf_counter()
  warp = list(itertools.product(*[range(x) for x in local_size[::-1]]))
  warp_size = len(warp)
  for idxs in itertools.product(*[range(x) for x in global_size[::-1]]):
- ul: Dict[int, Any] = {}
- dl: Dict[int, DType] = {}
- pbufs: List[memoryview] = list(bufs)
- pvals: List[int] = list(vals)
+ ul: dict[int, Any] = {}
+ dl: dict[int, DType] = {}
+ pbufs: list[memoryview] = list(bufs)
+ pvals: list[int] = list(vals)
  i = 0
- loop_ends: Dict[int, int] = {}
+ loop_ends: dict[int, int] = {}
  while i < len(self.uops):
  uop, dtype, idp, arg = self.uops[i]
- void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF}
+ void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF, Ops.NAME}
  if uop is Ops.DEFINE_ACC: idp = [idp[0]]
  inp = [ul[v] for v in idp if self.uops[v][0] not in void_ops]
  dtp = [dl[v] for v in idp if self.uops[v][0] not in void_ops]
@@ -60,19 +60,17 @@ class PythonProgram:
  loop_ends[idp[0]] = i
  i = idp[0]
  continue
- if uop in (Ops.BARRIER, Ops.IF, Ops.ENDIF):
+ if uop in (Ops.BARRIER, Ops.IF, Ops.ENDIF, Ops.NAME):
  # in the python emulator, the warp is always in sync
  i += 1
  continue
  assert dtype is not None, f"{uop} is missing a dtype"
  dl[i] = dtype
- if uop is Ops.DEFINE_GLOBAL:
- assert dtype.fmt is not None
- ul[i] = [pbufs.pop(0).cast(dtype.fmt)] * warp_size
- elif uop is Ops.DEFINE_LOCAL:
- assert dtype.fmt is not None
- lbuf = memoryview(bytearray(arg[1]*dtype.itemsize))
- ul[i] = [lbuf.cast(dtype.fmt)] * warp_size
+ if uop in {Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL}:
+ assert dtype.fmt is not None and isinstance(dtype, PtrDType)
+ if TYPE_CHECKING or sys.version_info < (3, 12): assert dtype.fmt != "e"
+ buf = memoryview(bytearray(dtype.size*dtype.itemsize)) if uop is Ops.DEFINE_LOCAL else pbufs.pop(0)
+ ul[i] = [buf.cast(dtype.fmt)] * warp_size
  elif uop is Ops.DEFINE_VAR:
  ul[i] = [pvals.pop(0)] * warp_size
  elif uop is Ops.SPECIAL:
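Context for the new fmt != "e" assert (a note, not part of the diff): memoryview.cast gained support for the half-precision format code "e" in CPython 3.12, which is why the emulator gates half-float buffers on the interpreter version instead of failing inside cast. For example:

import sys
if sys.version_info >= (3, 12):
  print(memoryview(bytearray(2)).cast("e")[0])   # 0.0 on 3.12+; older interpreters raise ValueError here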
@@ -115,18 +113,13 @@ class PythonProgram:
  elif uop is Ops.ASSIGN:
  for j in range(len(inp[0])): inp[0][j] = inp[1][j]
  ul[i] = inp[0]
- elif uop is Ops.GEP:
- assert len(arg) == 1
- ul[i] = inp[0][arg[0]]
+ elif uop is Ops.GEP: ul[i] = inp[0][get_single_element(arg)]
  elif uop is Ops.WMMA:
  # here are the models for the WMMA instruction on the different hardware
  def wmma_helper(WARP_THREADS, K, NUM_A, NUM_B, NUM_C, a_elem, b_elem, c_map):
- assert len(inp[0]) == NUM_A, f"A must have {NUM_A} elements per thread, it has {len(inp[0])}"
- assert len(inp[1]) == NUM_B, f"B must have {NUM_B} elements per thread, it has {len(inp[1])}"
- assert len(inp[2]) == NUM_C, f"C must have {NUM_C} elements per thread, it has {len(inp[2])}"
- assert len(flatten(inp[0])) == NUM_A * warp_size, f"WMMA must have {NUM_A * warp_size} total elements for A in WMMA"
- assert len(flatten(inp[1])) == NUM_B * warp_size, f"WMMA must have {NUM_B * warp_size} total elements for B in WMMA"
- assert len(flatten(inp[2])) == NUM_C * warp_size, f"WMMA must have {NUM_C * warp_size} total elements for C in WMMA"
+ for cc, tinp, num in zip(("A", "B", "C"), inp, (NUM_A, NUM_B, NUM_C)):
+ assert len(tinp) == num, f"{cc} must have {num} elements per thread, it has {len(tinp)}"
+ assert len(flatten(tinp)) == num * warp_size, f"WMMA must have {num * warp_size} total elements for {cc} in WMMA"
  assert warp_size > 0 and warp_size % WARP_THREADS == 0, f"must have multiples of {WARP_THREADS} warp threads"
  out = [inp[2][elem_idx][:] for elem_idx in range(NUM_C)]
  for goff in range(0, warp_size, WARP_THREADS):
@@ -145,31 +138,43 @@ class PythonProgram:
  ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map)
  elif arg[4] == "AMD":
  # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15
- def a_elem(x, i, j, goff):
- assert x[i][goff+j] == x[i][goff+j+16], "warp elements not duplicated properly across lanes"
- return x[i][goff+j]
+ def a_elem(x, k, row, goff):
+ assert x[k][goff+row] == x[k][goff+row+16], "warp elements not duplicated properly across lanes"
+ return x[k][goff+row]
  # B (16 elements on 32 threads): row major, lane 16-32 == lane 0-15
- def b_elem(x, i, j, goff): return a_elem(x, j, i, goff) # pylint: disable=arguments-out-of-order
+ def b_elem(x, col, k, goff): return a_elem(x, k, col, goff) # pylint: disable=arguments-out-of-order
  def c_map(lane, elem): return (lane%16, lane//16+elem*2) # (i, j), C, D (8 elements on 32 threads): row major
  ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
  elif arg[4] == "CUDA":
- # A (8 elements on 32 threads)
- def a_elem(x, i, j, goff): return x[(i%2)+(j//8)*2+(i//8)*4][goff+((i//2)%4)+(j%8)*4]
- # B (4 elements on 32 threads)
- def b_elem(x, i, j, goff): return x[(j%2)+(j//8)*2][goff+(j//2)%4+(i)*4]
- # (i, j), C, D (4 elements on 32 threads)
- def c_map(lane, elem): return ((elem%2)+(lane%4)*2, (lane//4)+(elem//2)*8)
- ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+ # (col, row) given (lane, elem) for C & D (4 elements on 32 threads); shared by all tc shapes with M=16 N=8
+ def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)
+
+ if arg[1] == (8,16,16):
+ def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2 + (k//8)*4][goff + (k//2)%4 + (row%8)*4]
+ def b_elem(x, col, k, goff): return x[k%2 + (k//8)*2][goff + (k//2)%4 + col*4]
+ ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+
+ elif arg[1] == (8,16,8) and arg[2] == dtypes.half:
+ def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2][goff + k//2 + (row%8)*4]
+ def b_elem(x, col, k, goff): return x[k%2][goff + k//2 + col*4]
+ ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+ elif arg[1] == (8,16,8) and arg[2] == dtypes.float:
+ def a_elem(x, k, row, goff): return x[(k//4)*2 + row//8][goff + k%4 + (row%8)*4]
+ def b_elem(x, col, k, goff): return x[k//4][goff + k%4 + col*4]
+ ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+ else: raise NotImplementedError(f"unimplemented tensor core {arg}")
  elif arg[4] == "INTEL":
  # A (16 elements on 8 threads)
- def a_elem(x, i, j, goff): return x[i%2+j*2][goff+i//2]
+ def a_elem(x, k, row, goff): return x[k%2+row*2][goff+k//2]
  # B (16 elements on 8 threads)
- def b_elem(x, i, j, goff): return x[j][goff+i]
+ def b_elem(x, col, k, goff): return x[k][goff+col]
  # C, D (8 elements on 8 threads)
  def c_map(lane, elem): return (lane, elem)
  ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map)
- elif arg[4] == "CLANG":
- def elem(x, i, j, _): return x[i+j][0]
+ elif arg[4] == "CPU":
+ def elem(x, col, row, _): return x[col+row][0] # k is always 0
  def c_map(_, elem): return (elem%16, elem//16)
  ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map)
  else: raise NotImplementedError(f"unimplemented tensor core {arg}")
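As a quick sanity check of the shared CUDA c_map above (a standalone snippet, not part of the package): over a 32-lane warp with 4 accumulator elements per lane, the (col, row) pairs it produces should tile the 8-wide, 16-tall C/D fragment exactly once each.

def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)
coords = [c_map(lane, elem) for lane in range(32) for elem in range(4)]
assert sorted(coords) == [(col, row) for col in range(8) for row in range(16)]
print("m16n8 C/D mapping covers each of the 128 output slots exactly once")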
@@ -186,11 +191,12 @@ class PythonRenderer(Renderer):
  def __init__(self):
  if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
  if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
- if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tensor_cores
+ if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80
+ if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75
  if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
- if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CLANG", ClangRenderer.tensor_cores
+ if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CPU", ClangRenderer.tensor_cores

- def render(self, name:str, uops:List[UOp]) -> str:
+ def render(self, uops:list[UOp]) -> str:
  lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops]
  return base64.b64encode(pickle.dumps(lops)).decode()
@@ -199,9 +205,8 @@ class PythonCompiler(Compiler):

  class PythonAllocator(Allocator):
  def _alloc(self, size, options): return memoryview(bytearray(size))
- def copyin(self, dest, src:memoryview): dest[:] = src
- def copyout(self, dest:memoryview, src): dest[:] = src
+ def _copyin(self, dest, src:memoryview): dest[:] = src
+ def _copyout(self, dest:memoryview, src): dest[:] = src

  class PythonDevice(Compiled):
- def __init__(self, device:str):
- super().__init__(device, PythonAllocator(), PythonRenderer(), PythonCompiler(), PythonProgram)
+ def __init__(self, device:str): super().__init__(device, PythonAllocator(), PythonRenderer(), PythonCompiler(), PythonProgram)
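For orientation, here is a usage sketch that is not part of the diff: the EMULATE_* variables above are read in PythonRenderer.__init__, so they must be set before tinygrad is imported, and the emulator itself is the PYTHON backend this file implements. Treat the device-selection variable ("PYTHON") as an assumption based on tinygrad's usual getenv-driven default-device choice.

import os
os.environ["EMULATE_AMD"] = "1"   # emulate AMD tensor cores without the hardware
os.environ["PYTHON"] = "1"        # assumption: makes the PYTHON (emulator) backend the default device

from tinygrad import Tensor
a, b = Tensor.rand(16, 16), Tensor.rand(16, 16)
print((a @ b).numpy())            # executed uop-by-uop by PythonProgram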
tinygrad/runtime/ops_qcom.py
@@ -1,14 +1,15 @@
  from __future__ import annotations
- import os, ctypes, functools, mmap, struct, array, decimal, math, sys
+ import os, ctypes, functools, mmap, struct, array, math, sys
  assert sys.platform != 'win32'
  from types import SimpleNamespace
- from typing import Tuple, List, Any, cast
- from tinygrad.device import BufferOptions
- from tinygrad.runtime.support.hcq import HCQBuffer, HWComputeQueue, HCQProgram, HCQCompiled, HCQSignal, HCQAllocator, HCQArgsState
- from tinygrad.runtime.autogen import kgsl, adreno, libc
+ from typing import Any, cast
+ from tinygrad.device import BufferSpec
+ from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
+ from tinygrad.runtime.support.hcq import HWInterface
+ from tinygrad.runtime.autogen import kgsl, adreno
  from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
  from tinygrad.renderer.cstyle import QCOMRenderer
- from tinygrad.helpers import getenv, from_mv, mv_address, to_mv, round_up, data64_le, prod, fromimport
+ from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
  if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import

  BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@@ -36,25 +37,25 @@ class QCOMCompiler(CLCompiler):
  def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)

  class QCOMSignal(HCQSignal):
- def __init__(self, value=0, is_timeline=False):
- self._signal = QCOMDevice.signals_pool.pop()
- super().__init__(value)
- def __del__(self): QCOMDevice.signals_pool.append(self._signal)
- def _get_value(self) -> int: return self._signal[0]
- def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer
- def _set_value(self, new_value:int): self._signal[0] = new_value
-
- class QCOMComputeQueue(HWComputeQueue):
- def __init__(self):
- self.cmd_idx_to_dims = {}
- super().__init__()
+ def __init__(self, base_addr:int|None=None, **kwargs):
+ super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2)

  def __del__(self):
- if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferOptions(cpu_access=True, nolru=True))
+ if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr)

- def cmd(self, opcode: int, *vals: int): self.q += [pkt7_hdr(opcode, len(vals)), *vals]
+ def _sleep(self, time_spent_waiting_ms:int):
+ # Sleep only for only timeline signals. Do it immediately to free cpu.
+ if self.timeline_for_device is not None:
+ kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx,
+ timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff)

- def reg(self, reg: int, *vals: int): self.q += [pkt4_hdr(reg, len(vals)), *vals]
+ class QCOMComputeQueue(HWQueue):
+ def __del__(self):
+ if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+ def cmd(self, opcode: int, *vals: int): self.q(pkt7_hdr(opcode, len(vals)), *vals)
+
+ def reg(self, reg: int, *vals: int): self.q(pkt4_hdr(reg, len(vals)), *vals)

  def _cache_flush(self, write_back=True, invalidate=False, sync=True, memsync=False):
  # TODO: 7xx support.
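One detail carried from the removed _get_timestamp into the new timestamp_divider argument: the Adreno always-on counter ticks at 19.2 MHz, so dividing raw ticks by 19.2 converts them to microseconds. A trivial check (not from the diff):

ticks = 19_200_000        # one second worth of 19.2 MHz ticks
print(ticks / 19.2)       # 1000000.0 microseconds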
@@ -63,54 +64,52 @@ class QCOMComputeQueue(HWComputeQueue):
  if memsync: self.cmd(adreno.CP_WAIT_MEM_WRITES)
  if sync: self.cmd(adreno.CP_WAIT_FOR_IDLE)

- def _memory_barrier(self): self._cache_flush(write_back=True, invalidate=True, sync=True, memsync=True)
+ def memory_barrier(self):
+ self._cache_flush(write_back=True, invalidate=True, sync=True, memsync=True)
+ return self

- def _signal(self, signal, value=0, ts=False):
+ def signal(self, signal:QCOMSignal, value=0, ts=False):
  self.cmd(adreno.CP_WAIT_FOR_IDLE)
  if QCOMDevice.gpu_id < 700:
  self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts),
- *data64_le(mv_address(signal._signal) + (0 if not ts else 8)), qreg.cp_event_write_3(value & 0xFFFFFFFF))
+ *data64_le(signal.timestamp_addr if ts else signal.value_addr), qreg.cp_event_write_3(value & 0xFFFFFFFF))
  self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
  else:
  # TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
  raise RuntimeError('CP_EVENT_WRITE7 is not supported')
+ return self

- def _timestamp(self, signal): return self._signal(signal, 0, ts=True)
+ def timestamp(self, signal:QCOMSignal): return self.signal(signal, 0, ts=True)

- def _wait(self, signal, value=0):
- self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(mv_address(signal._signal)),
+ def wait(self, signal:QCOMSignal, value=0):
+ self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(signal.value_addr),
  qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))
+ return self

- def _update_signal(self, cmd_idx, signal, value):
- if signal is not None: self._patch(cmd_idx, offset=3, data=data64_le(mv_address(signal._signal)))
- if value is not None: self._patch(cmd_idx, offset=5, data=[value & 0xFFFFFFFF])
-
- def _update_wait(self, cmd_idx, signal, value):
- if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
- if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF])
-
- def _build_gpu_command(self, device, hw_addr=None):
- to_mv((hw_page_addr:=hw_addr or device._alloc_cmd_buf(len(self.q) * 4)), len(self.q) * 4).cast('I')[:] = array.array('I', self.q)
- obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self.q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
- submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=device.ctx,
+ def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
+ to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
+ obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
+ submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
  cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
  return submit_req, obj

- def bind(self, device):
- self.binded_device = device
- self.hw_page = device.allocator.alloc(len(self.q) * 4, BufferOptions(cpu_access=True, nolru=True))
+ def bind(self, dev:QCOMDevice):
+ self.binded_device = dev
+ self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
  self.submit_req, self.obj = self._build_gpu_command(self.binded_device, self.hw_page.va_addr)
  # From now on, the queue is on the device for faster submission.
- self.q = to_mv(self.obj.gpuaddr, len(self.q) * 4).cast("I") # type: ignore
+ self._q = to_mv(self.obj.gpuaddr, len(self._q) * 4).cast("I")
+
+ def _submit(self, dev:QCOMDevice):
+ if self.binded_device == dev: submit_req = self.submit_req
+ else: submit_req, _ = self._build_gpu_command(dev)
+ dev.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(dev.fd, __payload=submit_req).timestamp

- def _submit(self, device):
- if self.binded_device == device: submit_req = self.submit_req
- else: submit_req, _ = self._build_gpu_command(device)
- device.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(device.fd, __payload=submit_req).timestamp
+ def exec(self, prg:QCOMProgram, args_state:QCOMArgsState, global_size, local_size):
+ self.bind_args_state(args_state)

- def _exec(self, prg, args_state, global_size, local_size):
- global_size_mp = [int(g*l) for g,l in zip(global_size, local_size)]
- self.cmd_idx_to_dims[self._cur_cmd_idx()] = [global_size, local_size]
+ def cast_int(x, ceil=False): return (math.ceil(x) if ceil else int(x)) if isinstance(x, float) else x
+ global_size_mp = [cast_int(g*l) for g,l in zip(global_size, local_size)]

  self.cmd(adreno.CP_SET_MARKER, qreg.a6xx_cp_set_marker_0(mode=adreno.RM6_COMPUTE))
  self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, qreg.a6xx_hlsq_invalidate_cmd(cs_state=True, cs_ibo=True))
@@ -126,12 +125,12 @@ class QCOMComputeQueue(HWComputeQueue):
  self.reg(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0,
  qreg.a6xx_hlsq_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
  global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_hlsq_cs_cntl_1(threadsize=adreno.THREAD64),
- int(math.ceil(global_size[0])), int(math.ceil(global_size[1])), int(math.ceil(global_size[2])))
+ cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))

  self.reg(adreno.REG_A6XX_SP_CS_CTRL_REG0,
  qreg.a6xx_sp_cs_ctrl_reg0(threadsize=adreno.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
  qreg.a6xx_sp_cs_unknown_a9b1(unk6=True, shared_size=prg.shared_size), 0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
- qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.device._stack.va_addr),
+ qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
  qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))

  self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
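A quick worked check of the cast_int helper introduced in the previous hunk (standalone snippet, not from the package): floats are truncated by default, rounded up with ceil=True, and ints pass through unchanged.

import math
def cast_int(x, ceil=False): return (math.ceil(x) if ceil else int(x)) if isinstance(x, float) else x
assert cast_int(2.5) == 2 and cast_int(2.5, ceil=True) == 3 and cast_int(4) == 4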
@@ -151,7 +150,7 @@ class QCOMComputeQueue(HWComputeQueue):
  state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
  *data64_le(args_state.ptr + args_state.prg.samp_off))
  self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
- self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.device._border_color_base()))
+ self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))

  if args_state.prg.tex_cnt > 0:
  self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
@@ -169,22 +168,10 @@ class QCOMComputeQueue(HWComputeQueue):
  qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
  self.cmd(adreno.CP_RUN_OPENCL, 0)
  self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
-
- def _update_exec(self, cmd_idx, global_size, local_size):
- if global_size is not None:
- self._patch(cmd_idx, offset=29, data=[int(math.ceil(global_size[0])), int(math.ceil(global_size[1])), int(math.ceil(global_size[2]))])
- self.cmd_idx_to_dims[cmd_idx][0] = global_size
-
- if local_size is not None:
- payload = qreg.a6xx_hlsq_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1)
- self._patch(cmd_idx, offset=20, data=[payload])
- self.cmd_idx_to_dims[cmd_idx][1] = local_size
-
- global_size_mp = [int(g*l) for g,l in zip(self.cmd_idx_to_dims[cmd_idx][0], self.cmd_idx_to_dims[cmd_idx][1])]
- self._patch(cmd_idx, offset=21, data=[global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0])
+ return self

  class QCOMArgsState(HCQArgsState):
- def __init__(self, ptr:int, prg:QCOMProgram, bufs:Tuple[HCQBuffer, ...], vals:Tuple[int, ...]=()):
+ def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
  super().__init__(ptr, prg, bufs, vals=vals)

  if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
@@ -195,44 +182,41 @@ class QCOMArgsState(HCQArgsState):
  for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')

  if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
- for i, b in enumerate(cast(List[QCOMBuffer], bufs)):
- if prg.buf_info[i].type is BUFTYPE_TEX: to_mv(self.ptr + prg.buf_info[i].offset, len(b.desc) * 4).cast('I')[:] = array.array('I', b.desc)
- elif prg.buf_info[i].type is BUFTYPE_IBO: to_mv(self.ptr + prg.buf_info[i].offset, len(b.ibo) * 4).cast('I')[:] = array.array('I', b.ibo)
- else: self.update_buffer(i, b)
- for i, v in enumerate(vals): self.update_var(i, v)
-
- def update_buffer(self, index:int, buf:HCQBuffer):
- if self.buf_info[index].type is not BUFTYPE_BUF: self.args_view[self.buf_info[index].offset//8 + 2] = buf.va_addr
- else: self.args_view[self.buf_info[index].offset//8] = buf.va_addr
+ for i, b in enumerate(bufs):
+ if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
+ obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
+ to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
+ self.bind_sints_to_ptr(b.va_addr, ptr=self.ptr + self.buf_info[i].offset + (0 if self.buf_info[i].type is BUFTYPE_BUF else 16), fmt='Q')

- def update_var(self, index:int, val:int): self.args_view[self.args_info[index].offset//8] = val
+ for i, v in enumerate(vals): self.bind_sints_to_ptr(v, ptr=self.ptr + self.args_info[i].offset, fmt='I')

  class QCOMProgram(HCQProgram):
- def __init__(self, device: QCOMDevice, name: str, lib: bytes):
- self.device, self.name, self.lib = device, name, lib
+ def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
+ self.dev: QCOMDevice = dev
+ self.name, self.lib = name, lib
  self._parse_lib()

- self.lib_gpu = self.device.allocator.alloc(self.image_size, options=BufferOptions(cpu_access=True, nolru=True))
- to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
+ self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True))
+ to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image

- self.pvtmem_size_per_item = round_up(self.pvtmem, 512) >> 9
- self.pvtmem_size_total = self.pvtmem_size_per_item * 128 * 2
- self.hw_stack_offset = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
- self.shared_size = max(1, (self.shmem - 1) // 1024)
+ self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
+ self.pvtmem_size_total: int = self.pvtmem_size_per_item * 128 * 2
+ self.hw_stack_offset: int = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
+ self.shared_size: int = max(1, (self.shmem - 1) // 1024)
  self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
- device._ensure_stack_size(self.hw_stack_offset * 4)
+ dev._ensure_stack_size(self.hw_stack_offset * 4)

  kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
- super().__init__(QCOMArgsState, self.device, self.name, kernargs_alloc_size=kernargs_alloc_size)
+ super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)

- def __call__(self, *bufs, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
+ def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
  if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
  if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
  raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
  return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)

  def _parse_lib(self):
- def _read_lib(off): return struct.unpack("I", self.lib[off:off+4])[0]
+ def _read_lib(off) -> int: return struct.unpack("I", self.lib[off:off+4])[0]

  # Extract image binary
  self.image_size = _read_lib(0x100)
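The bind_sints_to_ptr calls above replace the old update_buffer/update_var views: in effect they write little-endian integers into the kernel-arguments region, 64-bit ('Q') for buffer addresses and 32-bit ('I') for integer variables. A standalone illustration of those struct format codes (the offsets and values here are made up):

import struct
kernargs = bytearray(32)                               # stand-in for the GPU-visible args region
struct.pack_into('<Q', kernargs, 0, 0x1234_5678_9abc)  # hypothetical buffer va_addr at offset 0
struct.pack_into('<I', kernargs, 8, 42)                # hypothetical integer variable at offset 8
print(kernargs[:12].hex())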
@@ -282,17 +266,15 @@ class QCOMProgram(HCQProgram):
  self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)

  def __del__(self):
- if hasattr(self, 'lib_gpu'): self.device.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferOptions(cpu_access=True, nolru=True))
+ if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))

- class QCOMBuffer(HCQBuffer):
- def __init__(self, va_addr:int, size:int, info=None, mapped=False, desc=None, ibo=None, pitch=None, real_stride=None, **kwargs):
- self.va_addr, self.size, self.info, self.mapped = va_addr, size, info, mapped
+ class QCOMTextureInfo:
+ def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
+ self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo

- # Texture specific definitions
- self.desc, self.ibo, self.pitch, self.real_stride = [0] * 16, [0] * 16, pitch, real_stride
-
- class QCOMAllocator(HCQAllocator):
- def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+ class QCOMAllocator(HCQAllocatorBase):
+ def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+ # Recalculate real size for texture
  if options.image is not None:
  imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
  pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
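The pitch/real_stride pair kept by QCOMTextureInfo (and computed in the next hunk) describes a padded image layout: each row carries real_stride bytes of payload, but rows are spaced pitch bytes apart. A small standalone illustration of copying between the two layouts, stepping row by row the same way _do_copy below does (the sizes are made up):

rows, real_stride, pitch = 4, 12, 16
linear = bytearray(range(rows * real_stride))   # tightly packed rows, stride == real_stride
pitched = bytearray(rows * pitch)               # padded rows, stride == pitch
for r in range(rows):
  pitched[r*pitch : r*pitch + real_stride] = linear[r*real_stride : (r+1)*real_stride]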
@@ -301,99 +283,91 @@ class QCOMAllocator(HCQAllocator):
  granularity = 128 if options.image.itemsize == 4 else 256
  pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
  pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
+ size = pitch * imgh

- if options.external_ptr: texture = QCOMBuffer(options.external_ptr, size)
- else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)
-
- texture.pitch, texture.real_stride = pitch, real_stride
+ buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)

+ if options.image is not None:
  tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
- texture.desc[0] = qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt)
- texture.desc[1] = qreg.a6xx_tex_const_1(width=imgw, height=imgh)
- texture.desc[2] = qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=texture.pitch, pitchalign=pitchalign-6)
- texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
- texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
-
- return texture
+ desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
+ qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
+ *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]

- return QCOMBuffer(options.external_ptr, size) if options.external_ptr else self.device._gpu_alloc(size)
+ buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
+ return buf

  def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
  while src_off < src_size:
  ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
  src_off, dest_off = src_off+src_stride, dest_off+dest_stride

- def copyin(self, dest:HCQBuffer, src:memoryview):
- if (qd:=cast(QCOMBuffer, dest)).pitch is not None: self._do_copy(mv_address(src), qd.va_addr, len(src), qd.real_stride, qd.real_stride, qd.pitch)
- else: ctypes.memmove(dest.va_addr, mv_address(src), src.nbytes)
+ def _copyin(self, dest:HCQBuffer, src:memoryview):
+ stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
+ self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)
+
+ def _copyout(self, dest:memoryview, src:HCQBuffer):
+ self.dev.synchronize()

- def copyout(self, dest:memoryview, src:HCQBuffer):
- self.device.synchronize()
- if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride)
- else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+ stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
+ self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)

- def as_buffer(self, src:HCQBuffer) -> memoryview:
- self.device.synchronize()
- return to_mv(src.va_addr, src.size)
+ def _as_buffer(self, src:HCQBuffer) -> memoryview:
+ self.dev.synchronize()
+ return to_mv(cast(int, src.va_addr), src.size)

- def _free(self, opaque, options:BufferOptions):
- self.device.synchronize()
- self.device._gpu_free(opaque)
+ def _free(self, opaque, options:BufferSpec):
+ self.dev.synchronize()
+ self.dev._gpu_free(opaque)

  class QCOMDevice(HCQCompiled):
  signals_page: Any = None
- signals_pool: List[Any] = []
+ signals_pool: list[int] = []
  gpu_id: int = 0
  dummy_addr: int = 0

  def __init__(self, device:str=""):
- self.fd = os.open('/dev/kgsl-3d0', os.O_RDWR)
- QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
+ self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
+ QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
  QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
- QCOMDevice.signals_pool = [to_mv(self.signals_page.va_addr + off, 16).cast("Q") for off in range(0, self.signals_page.size, 16)]
- info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0
- QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
- if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
+ QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]

- super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
- QCOMSignal, QCOMComputeQueue, None)
+ flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
+ | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
+ self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id

- def _ctx_create(self):
- cr = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=(kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT |
- kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_PRIORITY(8) |
- kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)))
+ self.cmd_buf = self._gpu_alloc(16 << 20)
+ self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, base=cast(int, self.cmd_buf.va_addr), wrap=True)

- # Set power to maximum.
- struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, cr.drawctxt_id, mv_address(_:=memoryview(array.array('I', [1]))), 4)
+ self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)
+
+ self.last_cmd:int = 0
+
+ # Set max power
+ struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4)
  kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
- return cr.drawctxt_id

- def _info(self):
+ # Load info about qcom device
  info = kgsl.struct_kgsl_devinfo()
  kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
- return info
+ QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
+ if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
+
+ super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
+ QCOMSignal, QCOMComputeQueue, None)

- def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
+ def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
  flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
  if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)

  alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<<alignment_hint)), flags=flags, mmapsize=bosz)
- va_addr = libc.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, self.fd, alloc.id * 0x1000)
+ va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)

  if fill_zeroes: ctypes.memset(va_addr, 0, size)
- return QCOMBuffer(va_addr=va_addr, size=size, info=alloc)
+ return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)

- def _gpu_free(self, mem):
- kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.info.id)
- libc.munmap(mem.va_addr, mem.info.mmapsize)
-
- def _alloc_cmd_buf(self, sz: int):
- self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
- return self.cmd_buf.va_addr + cur_ptr
-
- def _border_color_base(self):
- if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True)
- return self._border_color_gpu.va_addr
+ def _gpu_free(self, mem:HCQBuffer):
+ kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
+ HWInterface.munmap(mem.va_addr, mem.meta.mmapsize)

  def _ensure_stack_size(self, sz):
  if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
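The removed _alloc_cmd_buf above is what the new cmd_buf_allocator (a BumpAllocator constructed with wrap=True earlier in this hunk) replaces. A minimal sketch of that behaviour, modelled on the removed code rather than on the real class in tinygrad/runtime/support/allocator.py, so the name and exact signature here are assumptions:

class WrappingBumpAllocator:
  def __init__(self, size:int, base:int=0, wrap:bool=False):
    self.size, self.base, self.wrap, self.ptr = size, base, wrap, 0
  def alloc(self, sz:int) -> int:
    if self.ptr + sz >= self.size:                  # region exhausted
      if not self.wrap: raise RuntimeError("bump allocator out of space")
      self.ptr = 0                                  # wrap back to the start; older command buffers get overwritten
    addr, self.ptr = self.base + self.ptr, self.ptr + sz
    return addr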
@@ -401,5 +375,3 @@ class QCOMDevice(HCQCompiled):
  self.synchronize()
  self._gpu_free(self._stack)
  self._stack = self._gpu_alloc(sz)
-
- def _syncdev(self): kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.fd, context_id=self.ctx, timestamp=self.last_cmd, timeout=0xffffffff)