tinygrad 0.10.0__py3-none-any.whl → 0.10.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/devectorizer.py +247 -0
- tinygrad/codegen/expander.py +121 -0
- tinygrad/codegen/kernel.py +141 -201
- tinygrad/codegen/linearize.py +223 -84
- tinygrad/codegen/lowerer.py +60 -42
- tinygrad/codegen/symbolic.py +476 -0
- tinygrad/codegen/transcendental.py +22 -13
- tinygrad/device.py +187 -47
- tinygrad/dtype.py +39 -28
- tinygrad/engine/jit.py +83 -65
- tinygrad/engine/memory.py +4 -5
- tinygrad/engine/multi.py +161 -0
- tinygrad/engine/realize.py +62 -108
- tinygrad/engine/schedule.py +396 -357
- tinygrad/engine/search.py +55 -66
- tinygrad/gradient.py +73 -0
- tinygrad/helpers.py +81 -59
- tinygrad/nn/__init__.py +30 -32
- tinygrad/nn/datasets.py +1 -2
- tinygrad/nn/optim.py +22 -26
- tinygrad/nn/state.py +91 -66
- tinygrad/ops.py +492 -641
- tinygrad/renderer/__init__.py +95 -36
- tinygrad/renderer/cstyle.py +99 -92
- tinygrad/renderer/llvmir.py +83 -34
- tinygrad/renderer/ptx.py +83 -99
- tinygrad/renderer/wgsl.py +95 -0
- tinygrad/runtime/autogen/amd_gpu.py +39507 -12
- tinygrad/runtime/autogen/comgr.py +2 -0
- tinygrad/runtime/autogen/kfd.py +4 -3
- tinygrad/runtime/autogen/kgsl.py +1 -1
- tinygrad/runtime/autogen/libc.py +404 -71
- tinygrad/runtime/autogen/llvm.py +11379 -0
- tinygrad/runtime/autogen/pci.py +1333 -0
- tinygrad/runtime/autogen/vfio.py +891 -0
- tinygrad/runtime/autogen/webgpu.py +6985 -0
- tinygrad/runtime/graph/cuda.py +8 -9
- tinygrad/runtime/graph/hcq.py +84 -79
- tinygrad/runtime/graph/metal.py +40 -43
- tinygrad/runtime/ops_amd.py +498 -334
- tinygrad/runtime/ops_cloud.py +34 -34
- tinygrad/runtime/ops_cpu.py +24 -0
- tinygrad/runtime/ops_cuda.py +30 -27
- tinygrad/runtime/ops_disk.py +62 -63
- tinygrad/runtime/ops_dsp.py +159 -42
- tinygrad/runtime/ops_gpu.py +30 -30
- tinygrad/runtime/ops_hip.py +29 -31
- tinygrad/runtime/ops_llvm.py +48 -41
- tinygrad/runtime/ops_metal.py +149 -113
- tinygrad/runtime/ops_npy.py +2 -2
- tinygrad/runtime/ops_nv.py +238 -273
- tinygrad/runtime/ops_python.py +55 -50
- tinygrad/runtime/ops_qcom.py +129 -157
- tinygrad/runtime/ops_webgpu.py +225 -0
- tinygrad/runtime/support/allocator.py +94 -0
- tinygrad/runtime/support/am/__init__.py +0 -0
- tinygrad/runtime/support/am/amdev.py +396 -0
- tinygrad/runtime/support/am/ip.py +463 -0
- tinygrad/runtime/support/compiler_cuda.py +4 -2
- tinygrad/runtime/support/elf.py +28 -4
- tinygrad/runtime/support/hcq.py +256 -324
- tinygrad/runtime/support/llvm.py +26 -0
- tinygrad/shape/shapetracker.py +85 -53
- tinygrad/shape/view.py +104 -140
- tinygrad/spec.py +155 -0
- tinygrad/tensor.py +835 -527
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/highlight.min.js +1232 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/cpp.min.js +47 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/languages/python.min.js +42 -0
- tinygrad/viz/assets/cdnjs.cloudflare.com/ajax/libs/highlight.js/11.10.0/styles/default.min.css +9 -0
- tinygrad/viz/assets/d3js.org/d3.v5.min.js +2 -0
- tinygrad/viz/assets/dagrejs.github.io/project/dagre-d3/latest/dagre-d3.min.js +4816 -0
- tinygrad/viz/assets/unpkg.com/@highlightjs/cdn-assets@11.10.0/styles/tokyo-night-dark.min.css +8 -0
- tinygrad/viz/index.html +544 -0
- tinygrad/viz/perfetto.html +178 -0
- tinygrad/viz/serve.py +205 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/METADATA +48 -25
- tinygrad-0.10.2.dist-info/RECORD +99 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/WHEEL +1 -1
- tinygrad/codegen/uopgraph.py +0 -506
- tinygrad/engine/lazy.py +0 -228
- tinygrad/function.py +0 -212
- tinygrad/multi.py +0 -177
- tinygrad/runtime/graph/clang.py +0 -39
- tinygrad/runtime/ops_clang.py +0 -35
- tinygrad-0.10.0.dist-info/RECORD +0 -77
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/LICENSE +0 -0
- {tinygrad-0.10.0.dist-info → tinygrad-0.10.2.dist-info}/top_level.txt +0 -0
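The listing above drops tinygrad/runtime/ops_clang.py and tinygrad/runtime/graph/clang.py while adding tinygrad/runtime/ops_cpu.py, i.e. the CLANG backend of 0.10.0 is addressed as CPU in 0.10.2 (the EMULATE_AMX line in the ops_python.py diff below reflects the same rename). A minimal sketch, assuming tinygrad's usual Tensor API; nothing here is taken from the wheel itself:

# Hedged sketch: pin a tensor to the renamed CPU backend (formerly CLANG).
from tinygrad import Tensor

x = Tensor([1.0, 2.0, 3.0], device="CPU")   # "CLANG" in 0.10.0 wheels
print((x * 2).numpy())                      # [2. 4. 6.]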
tinygrad/runtime/ops_python.py
CHANGED
@@ -2,10 +2,10 @@
 # a python uops emulator
 # works to test the tensor cores, and all the uops in general
 # this is the (living) definition of uops
-from typing import
-import pickle, base64, itertools, time, struct
+from typing import Optional, Any, TYPE_CHECKING
+import pickle, base64, itertools, time, struct, sys
 from tinygrad.dtype import DType, dtypes, ImageDType, PtrDType, truncate
-from tinygrad.helpers import all_same, getenv, flatten
+from tinygrad.helpers import all_same, getenv, flatten, get_single_element
 from tinygrad.device import Compiled, Compiler, Allocator
 from tinygrad.ops import exec_alu, Ops, UOp, GroupOp
 from tinygrad.renderer import Renderer
@@ -26,21 +26,21 @@ def _store(m, i, v):
 
 class PythonProgram:
   def __init__(self, name:str, lib:bytes):
-    self.uops:
-  def __call__(self, *bufs, global_size:
+    self.uops: list[tuple[Ops, Optional[DType], list[int], Any]] = pickle.loads(lib)
+  def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
     st = time.perf_counter()
     warp = list(itertools.product(*[range(x) for x in local_size[::-1]]))
     warp_size = len(warp)
     for idxs in itertools.product(*[range(x) for x in global_size[::-1]]):
-      ul:
-      dl:
-      pbufs:
-      pvals:
+      ul: dict[int, Any] = {}
+      dl: dict[int, DType] = {}
+      pbufs: list[memoryview] = list(bufs)
+      pvals: list[int] = list(vals)
       i = 0
-      loop_ends:
+      loop_ends: dict[int, int] = {}
      while i < len(self.uops):
        uop, dtype, idp, arg = self.uops[i]
-        void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF}
+        void_ops = {Ops.STORE, Ops.ENDRANGE, Ops.BARRIER, Ops.IF, Ops.ENDIF, Ops.NAME}
        if uop is Ops.DEFINE_ACC: idp = [idp[0]]
        inp = [ul[v] for v in idp if self.uops[v][0] not in void_ops]
        dtp = [dl[v] for v in idp if self.uops[v][0] not in void_ops]
@@ -60,19 +60,17 @@ class PythonProgram:
           loop_ends[idp[0]] = i
           i = idp[0]
           continue
-        if uop in (Ops.BARRIER, Ops.IF, Ops.ENDIF):
+        if uop in (Ops.BARRIER, Ops.IF, Ops.ENDIF, Ops.NAME):
           # in the python emulator, the warp is always in sync
           i += 1
           continue
         assert dtype is not None, f"{uop} is missing a dtype"
         dl[i] = dtype
-        if uop
-          assert dtype.fmt is not None
-
-
-
-          lbuf = memoryview(bytearray(arg[1]*dtype.itemsize))
-          ul[i] = [lbuf.cast(dtype.fmt)] * warp_size
+        if uop in {Ops.DEFINE_GLOBAL, Ops.DEFINE_LOCAL}:
+          assert dtype.fmt is not None and isinstance(dtype, PtrDType)
+          if TYPE_CHECKING or sys.version_info < (3, 12): assert dtype.fmt != "e"
+          buf = memoryview(bytearray(dtype.size*dtype.itemsize)) if uop is Ops.DEFINE_LOCAL else pbufs.pop(0)
+          ul[i] = [buf.cast(dtype.fmt)] * warp_size
         elif uop is Ops.DEFINE_VAR:
           ul[i] = [pvals.pop(0)] * warp_size
         elif uop is Ops.SPECIAL:
@@ -115,18 +113,13 @@ class PythonProgram:
         elif uop is Ops.ASSIGN:
           for j in range(len(inp[0])): inp[0][j] = inp[1][j]
           ul[i] = inp[0]
-        elif uop is Ops.GEP:
-          assert len(arg) == 1
-          ul[i] = inp[0][arg[0]]
+        elif uop is Ops.GEP: ul[i] = inp[0][get_single_element(arg)]
         elif uop is Ops.WMMA:
           # here are the models for the WMMA instruction on the different hardware
           def wmma_helper(WARP_THREADS, K, NUM_A, NUM_B, NUM_C, a_elem, b_elem, c_map):
-
-
-
-            assert len(flatten(inp[0])) == NUM_A * warp_size, f"WMMA must have {NUM_A * warp_size} total elements for A in WMMA"
-            assert len(flatten(inp[1])) == NUM_B * warp_size, f"WMMA must have {NUM_B * warp_size} total elements for B in WMMA"
-            assert len(flatten(inp[2])) == NUM_C * warp_size, f"WMMA must have {NUM_C * warp_size} total elements for C in WMMA"
+            for cc, tinp, num in zip(("A", "B", "C"), inp, (NUM_A, NUM_B, NUM_C)):
+              assert len(tinp) == num, f"{cc} must have {num} elements per thread, it has {len(tinp)}"
+              assert len(flatten(tinp)) == num * warp_size, f"WMMA must have {num * warp_size} total elements for {cc} in WMMA"
             assert warp_size > 0 and warp_size % WARP_THREADS == 0, f"must have multiples of {WARP_THREADS} warp threads"
             out = [inp[2][elem_idx][:] for elem_idx in range(NUM_C)]
             for goff in range(0, warp_size, WARP_THREADS):
@@ -145,31 +138,43 @@ class PythonProgram:
             ul[i] = wmma_helper(32, 8, 2, 2, 2, a_b_elem, a_b_elem, c_map)
           elif arg[4] == "AMD":
             # A (16 elements on 32 threads): col major, lane 16-32 == lane 0-15
-            def a_elem(x,
-              assert x[
-              return x[
+            def a_elem(x, k, row, goff):
+              assert x[k][goff+row] == x[k][goff+row+16], "warp elements not duplicated properly across lanes"
+              return x[k][goff+row]
             # B (16 elements on 32 threads): row major, lane 16-32 == lane 0-15
-            def b_elem(x,
+            def b_elem(x, col, k, goff): return a_elem(x, k, col, goff) # pylint: disable=arguments-out-of-order
             def c_map(lane, elem): return (lane%16, lane//16+elem*2) # (i, j), C, D (8 elements on 32 threads): row major
             ul[i] = wmma_helper(32, 16, 16, 16, 8, a_elem, b_elem, c_map)
           elif arg[4] == "CUDA":
-            #
-            def
-
-
-
-
-
+            # (col, row) given (lane, elem) for C & D (4 elements on 32 threads); shared by all tc shapes with M=16 N=8
+            def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)
+
+            if arg[1] == (8,16,16):
+              def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2 + (k//8)*4][goff + (k//2)%4 + (row%8)*4]
+              def b_elem(x, col, k, goff): return x[k%2 + (k//8)*2][goff + (k//2)%4 + col*4]
+              ul[i] = wmma_helper(32, 16, 8, 4, 4, a_elem, b_elem, c_map)
+
+            elif arg[1] == (8,16,8) and arg[2] == dtypes.half:
+              def a_elem(x, k, row, goff): return x[k%2 + (row//8)*2][goff + k//2 + (row%8)*4]
+              def b_elem(x, col, k, goff): return x[k%2][goff + k//2 + col*4]
+              ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+            elif arg[1] == (8,16,8) and arg[2] == dtypes.float:
+              def a_elem(x, k, row, goff): return x[(k//4)*2 + row//8][goff + k%4 + (row%8)*4]
+              def b_elem(x, col, k, goff): return x[k//4][goff + k%4 + col*4]
+              ul[i] = wmma_helper(32, 8, 4, 2, 4, a_elem, b_elem, c_map)
+
+            else: raise NotImplementedError(f"unimplemented tensor core {arg}")
           elif arg[4] == "INTEL":
             # A (16 elements on 8 threads)
-            def a_elem(x,
+            def a_elem(x, k, row, goff): return x[k%2+row*2][goff+k//2]
             # B (16 elements on 8 threads)
-            def b_elem(x,
+            def b_elem(x, col, k, goff): return x[k][goff+col]
             # C, D (8 elements on 8 threads)
             def c_map(lane, elem): return (lane, elem)
             ul[i] = wmma_helper(8, 16, 16, 16, 8, a_elem, b_elem, c_map)
-          elif arg[4] == "
-            def elem(x,
+          elif arg[4] == "CPU":
+            def elem(x, col, row, _): return x[col+row][0] # k is always 0
             def c_map(_, elem): return (elem%16, elem//16)
             ul[i] = wmma_helper(1, 1, 16, 16, 256, elem, elem, c_map)
           else: raise NotImplementedError(f"unimplemented tensor core {arg}")
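The CUDA branch above spreads one M=16, N=8 C/D tile over 32 lanes with 4 elements per lane through the shared c_map. A standalone check (not part of the diff) that this mapping tiles the fragment exactly once:

# Standalone check of the shared CUDA c_map from the hunk above: 32 lanes x 4 elements
# per lane must cover the 8-wide x 16-tall (col, row) C/D fragment with no collisions.
def c_map(lane, elem): return (elem%2 + (lane%4)*2, lane//4 + (elem//2)*8)

coords = {c_map(lane, elem) for lane in range(32) for elem in range(4)}
assert len(coords) == 32*4
assert coords == {(col, row) for col in range(8) for row in range(16)}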
@@ -186,11 +191,12 @@ class PythonRenderer(Renderer):
   def __init__(self):
     if getenv("EMULATE_METAL"): self.device, self.tensor_cores = "METAL", MetalRenderer.tensor_cores
     if getenv("EMULATE_AMD"): self.device, self.tensor_cores = "AMD", AMDRenderer.tensor_cores
-    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.
+    if getenv("EMULATE_CUDA"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm80
+    if getenv("EMULATE_CUDA_SM75"): self.device, self.tensor_cores = "CUDA", CUDARenderer.tc_sm75
     if getenv("EMULATE_INTEL"): self.device, self.suffix, self.tensor_cores = "INTEL", "INTEL", IntelRenderer.tensor_cores
-    if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "
+    if getenv("EMULATE_AMX"): self.device, self.tensor_cores = "CPU", ClangRenderer.tensor_cores
 
-  def render(self,
+  def render(self, uops:list[UOp]) -> str:
     lops = [(u.op, u.dtype, [uops.index(v) for v in u.src], u.arg) for u in uops]
     return base64.b64encode(pickle.dumps(lops)).decode()
 
@@ -199,9 +205,8 @@ class PythonCompiler(Compiler):
 
 class PythonAllocator(Allocator):
   def _alloc(self, size, options): return memoryview(bytearray(size))
-  def
-  def
+  def _copyin(self, dest, src:memoryview): dest[:] = src
+  def _copyout(self, dest:memoryview, src): dest[:] = src
 
 class PythonDevice(Compiled):
-  def __init__(self, device:str):
-    super().__init__(device, PythonAllocator(), PythonRenderer(), PythonCompiler(), PythonProgram)
+  def __init__(self, device:str): super().__init__(device, PythonAllocator(), PythonRenderer(), PythonCompiler(), PythonProgram)
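The EMULATE_* flags above are read when the PYTHON device's renderer is constructed, so the emulator can be pointed at any of the modelled tensor-core layouts. A hedged usage sketch; the env-var device selection and the shapes/dtype here are assumptions, not something this diff documents:

# Illustrative only: drive the pure-Python uop emulator with the newly split-out
# sm_75 tensor-core layout. Assumes tinygrad's usual env-var device selection.
import os
os.environ["PYTHON"] = "1"             # select the Python emulator as the device
os.environ["EMULATE_CUDA_SM75"] = "1"  # model CUDARenderer.tc_sm75 tensor cores

from tinygrad import Tensor, dtypes
a = Tensor.rand(16, 16, dtype=dtypes.half)
b = Tensor.rand(16, 16, dtype=dtypes.half)
print((a @ b).numpy())

Whether the WMMA path is actually exercised still depends on the usual kernel-optimization heuristics; the snippet only shows how the flags are wired.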
tinygrad/runtime/ops_qcom.py
CHANGED
@@ -1,14 +1,15 @@
 from __future__ import annotations
-import os, ctypes, functools, mmap, struct, array,
+import os, ctypes, functools, mmap, struct, array, math, sys
 assert sys.platform != 'win32'
 from types import SimpleNamespace
-from typing import
-from tinygrad.device import
-from tinygrad.runtime.support.hcq import HCQBuffer,
-from tinygrad.runtime.
+from typing import Any, cast
+from tinygrad.device import BufferSpec
+from tinygrad.runtime.support.hcq import HCQBuffer, HWQueue, HCQProgram, HCQCompiled, HCQAllocatorBase, HCQSignal, HCQArgsState, BumpAllocator
+from tinygrad.runtime.support.hcq import HWInterface
+from tinygrad.runtime.autogen import kgsl, adreno
 from tinygrad.runtime.ops_gpu import CLCompiler, CLDevice
 from tinygrad.renderer.cstyle import QCOMRenderer
-from tinygrad.helpers import getenv,
+from tinygrad.helpers import getenv, mv_address, to_mv, round_up, data64_le, prod, fromimport
 if getenv("IOCTL"): import extra.qcom_gpu_driver.opencl_ioctl # noqa: F401 # pylint: disable=unused-import
 
 BUFTYPE_BUF, BUFTYPE_TEX, BUFTYPE_IBO = 0, 1, 2
@@ -36,25 +37,25 @@ class QCOMCompiler(CLCompiler):
   def disassemble(self, lib:bytes): fromimport('extra.disassemblers.adreno', 'disasm')(lib)
 
 class QCOMSignal(HCQSignal):
-  def __init__(self,
-
-    super().__init__(value)
-  def __del__(self): QCOMDevice.signals_pool.append(self._signal)
-  def _get_value(self) -> int: return self._signal[0]
-  def _get_timestamp(self) -> decimal.Decimal: return decimal.Decimal(self._signal[1]) / decimal.Decimal(19.2) # based on the 19.2MHz always-on timer
-  def _set_value(self, new_value:int): self._signal[0] = new_value
-
-class QCOMComputeQueue(HWComputeQueue):
-  def __init__(self):
-    self.cmd_idx_to_dims = {}
-    super().__init__()
+  def __init__(self, base_addr:int|None=None, **kwargs):
+    super().__init__(QCOMDevice.signals_pool.pop() if base_addr is None else base_addr, **kwargs, timestamp_divider=19.2)
 
   def __del__(self):
-    if self.
+    if isinstance(self.base_addr, int): QCOMDevice.signals_pool.append(self.base_addr)
 
-  def
+  def _sleep(self, time_spent_waiting_ms:int):
+    # Sleep only for only timeline signals. Do it immediately to free cpu.
+    if self.timeline_for_device is not None:
+      kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.timeline_for_device.fd, context_id=self.timeline_for_device.ctx,
+                                                  timestamp=self.timeline_for_device.last_cmd, timeout=0xffffffff)
 
-
+class QCOMComputeQueue(HWQueue):
+  def __del__(self):
+    if self.binded_device is not None: self.binded_device.allocator.free(self.hw_page, self.hw_page.size, BufferSpec(cpu_access=True, nolru=True))
+
+  def cmd(self, opcode: int, *vals: int): self.q(pkt7_hdr(opcode, len(vals)), *vals)
+
+  def reg(self, reg: int, *vals: int): self.q(pkt4_hdr(reg, len(vals)), *vals)
 
   def _cache_flush(self, write_back=True, invalidate=False, sync=True, memsync=False):
     # TODO: 7xx support.
@@ -63,54 +64,52 @@ class QCOMComputeQueue(HWComputeQueue):
     if memsync: self.cmd(adreno.CP_WAIT_MEM_WRITES)
     if sync: self.cmd(adreno.CP_WAIT_FOR_IDLE)
 
-  def
+  def memory_barrier(self):
+    self._cache_flush(write_back=True, invalidate=True, sync=True, memsync=True)
+    return self
 
-  def
+  def signal(self, signal:QCOMSignal, value=0, ts=False):
     self.cmd(adreno.CP_WAIT_FOR_IDLE)
     if QCOMDevice.gpu_id < 700:
       self.cmd(adreno.CP_EVENT_WRITE, qreg.cp_event_write_0(event=adreno.CACHE_FLUSH_TS, timestamp=ts),
-               *data64_le(
+               *data64_le(signal.timestamp_addr if ts else signal.value_addr), qreg.cp_event_write_3(value & 0xFFFFFFFF))
       self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
     else:
       # TODO: support devices starting with 8 Gen 1. Also, 700th series have convenient CP_GLOBAL_TIMESTAMP and CP_LOCAL_TIMESTAMP
       raise RuntimeError('CP_EVENT_WRITE7 is not supported')
+    return self
 
-  def
+  def timestamp(self, signal:QCOMSignal): return self.signal(signal, 0, ts=True)
 
-  def
-    self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(
+  def wait(self, signal:QCOMSignal, value=0):
+    self.cmd(adreno.CP_WAIT_REG_MEM, qreg.cp_wait_reg_mem_0(function=adreno.WRITE_GE, poll=adreno.POLL_MEMORY),*data64_le(signal.value_addr),
             qreg.cp_wait_reg_mem_3(ref=value&0xFFFFFFFF), qreg.cp_wait_reg_mem_4(mask=0xFFFFFFFF), qreg.cp_wait_reg_mem_5(delay_loop_cycles=32))
+    return self
 
-  def
-
-
-
-  def _update_wait(self, cmd_idx, signal, value):
-    if signal is not None: self._patch(cmd_idx, offset=2, data=data64_le(mv_address(signal._signal)))
-    if value is not None: self._patch(cmd_idx, offset=4, data=[value & 0xFFFFFFFF])
-
-  def _build_gpu_command(self, device, hw_addr=None):
-    to_mv((hw_page_addr:=hw_addr or device._alloc_cmd_buf(len(self.q) * 4)), len(self.q) * 4).cast('I')[:] = array.array('I', self.q)
-    obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self.q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
-    submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=device.ctx,
+  def _build_gpu_command(self, dev:QCOMDevice, hw_addr=None):
+    to_mv((hw_page_addr:=hw_addr or dev.cmd_buf_allocator.alloc(len(self._q) * 4)), len(self._q) * 4).cast('I')[:] = array.array('I', self._q)
+    obj = kgsl.struct_kgsl_command_object(gpuaddr=hw_page_addr, size=len(self._q) * 4, flags=kgsl.KGSL_CMDLIST_IB)
+    submit_req = kgsl.struct_kgsl_gpu_command(cmdlist=ctypes.addressof(obj), numcmds=1, context_id=dev.ctx,
                                               cmdsize=ctypes.sizeof(kgsl.struct_kgsl_command_object))
     return submit_req, obj
 
-  def bind(self,
-    self.binded_device =
-    self.hw_page =
+  def bind(self, dev:QCOMDevice):
+    self.binded_device = dev
+    self.hw_page = dev.allocator.alloc(len(self._q) * 4, BufferSpec(cpu_access=True, nolru=True))
     self.submit_req, self.obj = self._build_gpu_command(self.binded_device, self.hw_page.va_addr)
     # From now on, the queue is on the device for faster submission.
-    self.
+    self._q = to_mv(self.obj.gpuaddr, len(self._q) * 4).cast("I")
+
+  def _submit(self, dev:QCOMDevice):
+    if self.binded_device == dev: submit_req = self.submit_req
+    else: submit_req, _ = self._build_gpu_command(dev)
+    dev.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(dev.fd, __payload=submit_req).timestamp
 
-  def
-
-    else: submit_req, _ = self._build_gpu_command(device)
-    device.last_cmd = kgsl.IOCTL_KGSL_GPU_COMMAND(device.fd, __payload=submit_req).timestamp
+  def exec(self, prg:QCOMProgram, args_state:QCOMArgsState, global_size, local_size):
+    self.bind_args_state(args_state)
 
-
-    global_size_mp = [
-    self.cmd_idx_to_dims[self._cur_cmd_idx()] = [global_size, local_size]
+    def cast_int(x, ceil=False): return (math.ceil(x) if ceil else int(x)) if isinstance(x, float) else x
+    global_size_mp = [cast_int(g*l) for g,l in zip(global_size, local_size)]
 
     self.cmd(adreno.CP_SET_MARKER, qreg.a6xx_cp_set_marker_0(mode=adreno.RM6_COMPUTE))
     self.reg(adreno.REG_A6XX_HLSQ_INVALIDATE_CMD, qreg.a6xx_hlsq_invalidate_cmd(cs_state=True, cs_ibo=True))
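The cast_int helper introduced in exec() above is what lets fractional global sizes flow through while the emitted NDRANGE values stay integral. A standalone restatement of its rounding behaviour, copied from the hunk for illustration:

# cast_int from the hunk above: floats are truncated by default, rounded up with
# ceil=True, and ints pass through untouched.
import math
def cast_int(x, ceil=False): return (math.ceil(x) if ceil else int(x)) if isinstance(x, float) else x

assert cast_int(7) == 7
assert cast_int(2.5) == 2                  # work-group product, truncated
assert cast_int(2.5, ceil=True) == 3       # per-axis global size, rounded up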
@@ -126,12 +125,12 @@ class QCOMComputeQueue(HWComputeQueue):
     self.reg(adreno.REG_A6XX_HLSQ_CS_NDRANGE_0,
              qreg.a6xx_hlsq_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1),
              global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0, 0xccc0cf, 0xfc | qreg.a6xx_hlsq_cs_cntl_1(threadsize=adreno.THREAD64),
-
+             cast_int(global_size[0], ceil=True), cast_int(global_size[1], ceil=True), cast_int(global_size[2], ceil=True))
 
     self.reg(adreno.REG_A6XX_SP_CS_CTRL_REG0,
              qreg.a6xx_sp_cs_ctrl_reg0(threadsize=adreno.THREAD64, halfregfootprint=prg.hregs, fullregfootprint=prg.fregs, branchstack=prg.brnchstck),
              qreg.a6xx_sp_cs_unknown_a9b1(unk6=True, shared_size=prg.shared_size), 0, prg.prg_offset, *data64_le(prg.lib_gpu.va_addr),
-             qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.
+             qreg.a6xx_sp_cs_pvt_mem_param(memsizeperitem=prg.pvtmem_size_per_item), *data64_le(prg.dev._stack.va_addr),
              qreg.a6xx_sp_cs_pvt_mem_size(totalpvtmemsize=prg.pvtmem_size_total))
 
     self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
@@ -151,7 +150,7 @@ class QCOMComputeQueue(HWComputeQueue):
                                           state_block=adreno.SB6_CS_TEX, num_unit=args_state.prg.samp_cnt),
              *data64_le(args_state.ptr + args_state.prg.samp_off))
     self.reg(adreno.REG_A6XX_SP_CS_TEX_SAMP, *data64_le(args_state.ptr + args_state.prg.samp_off))
-    self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.
+    self.reg(adreno.REG_A6XX_SP_PS_TP_BORDER_COLOR_BASE_ADDR, *data64_le(prg.dev.border_color_buf.va_addr))
 
     if args_state.prg.tex_cnt > 0:
       self.cmd(adreno.CP_LOAD_STATE6_FRAG, qreg.cp_load_state6_0(state_type=adreno.ST_CONSTANTS, state_src=adreno.SS6_INDIRECT,
|
|
169
168
|
qreg.a6xx_sp_cs_config(enabled=True, nsamp=args_state.prg.samp_cnt, ntex=args_state.prg.tex_cnt, nibo=args_state.prg.ibo_cnt))
|
170
169
|
self.cmd(adreno.CP_RUN_OPENCL, 0)
|
171
170
|
self._cache_flush(write_back=True, invalidate=False, sync=False, memsync=False)
|
172
|
-
|
173
|
-
def _update_exec(self, cmd_idx, global_size, local_size):
|
174
|
-
if global_size is not None:
|
175
|
-
self._patch(cmd_idx, offset=29, data=[int(math.ceil(global_size[0])), int(math.ceil(global_size[1])), int(math.ceil(global_size[2]))])
|
176
|
-
self.cmd_idx_to_dims[cmd_idx][0] = global_size
|
177
|
-
|
178
|
-
if local_size is not None:
|
179
|
-
payload = qreg.a6xx_hlsq_cs_ndrange_0(kerneldim=3, localsizex=local_size[0] - 1, localsizey=local_size[1] - 1, localsizez=local_size[2] - 1)
|
180
|
-
self._patch(cmd_idx, offset=20, data=[payload])
|
181
|
-
self.cmd_idx_to_dims[cmd_idx][1] = local_size
|
182
|
-
|
183
|
-
global_size_mp = [int(g*l) for g,l in zip(self.cmd_idx_to_dims[cmd_idx][0], self.cmd_idx_to_dims[cmd_idx][1])]
|
184
|
-
self._patch(cmd_idx, offset=21, data=[global_size_mp[0], 0, global_size_mp[1], 0, global_size_mp[2], 0])
|
171
|
+
return self
|
185
172
|
|
186
173
|
class QCOMArgsState(HCQArgsState):
|
187
|
-
def __init__(self, ptr:int, prg:QCOMProgram, bufs:
|
174
|
+
def __init__(self, ptr:int, prg:QCOMProgram, bufs:tuple[HCQBuffer, ...], vals:tuple[int, ...]=()):
|
188
175
|
super().__init__(ptr, prg, bufs, vals=vals)
|
189
176
|
|
190
177
|
if len(bufs) + len(vals) != len(prg.buf_info): raise RuntimeError(f'incorrect args size given={len(bufs)+len(vals)} != want={len(prg.buf_info)}')
|
@@ -195,44 +182,41 @@ class QCOMArgsState(HCQArgsState):
|
|
195
182
|
for cnst_val, cnst_off, cnst_sz in prg.consts_info: to_mv(self.ptr + cnst_off, cnst_sz)[:] = cnst_val.to_bytes(cnst_sz, byteorder='little')
|
196
183
|
|
197
184
|
if prg.samp_cnt > 0: to_mv(self.ptr + prg.samp_off, len(prg.samplers) * 4).cast('I')[:] = array.array('I', prg.samplers)
|
198
|
-
for i, b in enumerate(
|
199
|
-
if prg.buf_info[i].type
|
200
|
-
|
201
|
-
|
202
|
-
|
203
|
-
|
204
|
-
def update_buffer(self, index:int, buf:HCQBuffer):
|
205
|
-
if self.buf_info[index].type is not BUFTYPE_BUF: self.args_view[self.buf_info[index].offset//8 + 2] = buf.va_addr
|
206
|
-
else: self.args_view[self.buf_info[index].offset//8] = buf.va_addr
|
185
|
+
for i, b in enumerate(bufs):
|
186
|
+
if prg.buf_info[i].type in {BUFTYPE_TEX, BUFTYPE_IBO}:
|
187
|
+
obj = b.texture_info.desc if prg.buf_info[i].type is BUFTYPE_TEX else b.texture_info.ibo
|
188
|
+
to_mv(self.ptr + prg.buf_info[i].offset, len(obj) * 4).cast('I')[:] = array.array('I', obj)
|
189
|
+
self.bind_sints_to_ptr(b.va_addr, ptr=self.ptr + self.buf_info[i].offset + (0 if self.buf_info[i].type is BUFTYPE_BUF else 16), fmt='Q')
|
207
190
|
|
208
|
-
|
191
|
+
for i, v in enumerate(vals): self.bind_sints_to_ptr(v, ptr=self.ptr + self.args_info[i].offset, fmt='I')
|
209
192
|
|
210
193
|
class QCOMProgram(HCQProgram):
|
211
|
-
def __init__(self,
|
212
|
-
self.
|
194
|
+
def __init__(self, dev: QCOMDevice, name: str, lib: bytes):
|
195
|
+
self.dev: QCOMDevice = dev
|
196
|
+
self.name, self.lib = name, lib
|
213
197
|
self._parse_lib()
|
214
198
|
|
215
|
-
self.lib_gpu = self.
|
216
|
-
to_mv(self.lib_gpu.va_addr, self.image_size)[:] = self.image
|
199
|
+
self.lib_gpu: HCQBuffer = self.dev.allocator.alloc(self.image_size, options=BufferSpec(cpu_access=True, nolru=True))
|
200
|
+
to_mv(cast(int, self.lib_gpu.va_addr), self.image_size)[:] = self.image
|
217
201
|
|
218
|
-
self.pvtmem_size_per_item = round_up(self.pvtmem, 512) >> 9
|
219
|
-
self.pvtmem_size_total = self.pvtmem_size_per_item * 128 * 2
|
220
|
-
self.hw_stack_offset = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
|
221
|
-
self.shared_size = max(1, (self.shmem - 1) // 1024)
|
202
|
+
self.pvtmem_size_per_item: int = round_up(self.pvtmem, 512) >> 9
|
203
|
+
self.pvtmem_size_total: int = self.pvtmem_size_per_item * 128 * 2
|
204
|
+
self.hw_stack_offset: int = round_up(next_power2(round_up(self.pvtmem, 512)) * 128 * 16, 0x1000)
|
205
|
+
self.shared_size: int = max(1, (self.shmem - 1) // 1024)
|
222
206
|
self.max_threads = min(1024, ((384 * 32) // (max(1, (self.fregs + round_up(self.hregs, 2) // 2)) * 128)) * 128)
|
223
|
-
|
207
|
+
dev._ensure_stack_size(self.hw_stack_offset * 4)
|
224
208
|
|
225
209
|
kernargs_alloc_size = round_up(2048 + (self.tex_cnt + self.ibo_cnt) * 0x40 + self.samp_cnt * 0x10, 0x100)
|
226
|
-
super().__init__(QCOMArgsState, self.
|
210
|
+
super().__init__(QCOMArgsState, self.dev, self.name, kernargs_alloc_size=kernargs_alloc_size)
|
227
211
|
|
228
|
-
def __call__(self, *bufs, global_size:
|
212
|
+
def __call__(self, *bufs, global_size:tuple[int,int,int]=(1,1,1), local_size:tuple[int,int,int]=(1,1,1), vals:tuple[int, ...]=(), wait=False):
|
229
213
|
if self.max_threads < prod(local_size): raise RuntimeError("Too many resources requested for launch")
|
230
214
|
if any(g*l>mx for g,l,mx in zip(global_size, local_size, [65536, 65536, 65536])) and any(l>mx for l,mx in zip(local_size, [1024, 1024, 1024])):
|
231
215
|
raise RuntimeError(f"Invalid global/local dims {global_size=}, {local_size=}")
|
232
216
|
return super().__call__(*bufs, global_size=global_size, local_size=local_size, vals=vals, wait=wait)
|
233
217
|
|
234
218
|
def _parse_lib(self):
|
235
|
-
def _read_lib(off): return struct.unpack("I", self.lib[off:off+4])[0]
|
219
|
+
def _read_lib(off) -> int: return struct.unpack("I", self.lib[off:off+4])[0]
|
236
220
|
|
237
221
|
# Extract image binary
|
238
222
|
self.image_size = _read_lib(0x100)
|
@@ -282,17 +266,15 @@ class QCOMProgram(HCQProgram):
     self.fregs, self.hregs = _read_lib(reg_desc_off + 0x14), _read_lib(reg_desc_off + 0x18)
 
   def __del__(self):
-    if hasattr(self, 'lib_gpu'): self.
+    if hasattr(self, 'lib_gpu'): self.dev.allocator.free(self.lib_gpu, self.lib_gpu.size, options=BufferSpec(cpu_access=True, nolru=True))
 
-class
-  def __init__(self,
-    self.
+class QCOMTextureInfo:
+  def __init__(self, pitch:int, real_stride:int, desc:list[int], ibo:list[int]):
+    self.pitch, self.real_stride, self.desc, self.ibo = pitch, real_stride, desc, ibo
 
-
-
-
-class QCOMAllocator(HCQAllocator):
-  def _alloc(self, size:int, options:BufferOptions) -> HCQBuffer:
+class QCOMAllocator(HCQAllocatorBase):
+  def _alloc(self, size:int, options:BufferSpec) -> HCQBuffer:
+    # Recalculate real size for texture
     if options.image is not None:
       imgw, imgh, itemsize_log = options.image.shape[1], options.image.shape[0], int(math.log2(options.image.itemsize))
       pitchalign = max(6, 11 - int(math.log2(imgh))) if imgh > 1 else 6
@@ -301,99 +283,91 @@ class QCOMAllocator(HCQAllocator):
       granularity = 128 if options.image.itemsize == 4 else 256
       pitch_add = (1 << pitchalign) if min(next_power2(imgw), round_up(imgw, granularity)) - align_up + 1 <= imgw and imgw > granularity//2 else 0
       pitch = round_up((real_stride:=imgw * 4 * options.image.itemsize), 1 << pitchalign) + pitch_add
+      size = pitch * imgh
 
-
-    else: texture = self.device._gpu_alloc(pitch * imgh, kgsl.KGSL_MEMTYPE_TEXTURE)
-
-      texture.pitch, texture.real_stride = pitch, real_stride
+    buf = HCQBuffer(options.external_ptr, size) if options.external_ptr else self.dev._gpu_alloc(size)
 
+    if options.image is not None:
       tex_fmt = adreno.FMT6_32_32_32_32_FLOAT if options.image.itemsize == 4 else adreno.FMT6_16_16_16_16_FLOAT
-
-
-
-      texture.desc[4:8] = [*data64_le(texture.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
-      texture.ibo = [texture.desc[0] & (~0xffff), *texture.desc[1:len(texture.desc)]]
-
-      return texture
+      desc = [qreg.a6xx_tex_const_0(0x8, swiz_x=0, swiz_y=1, swiz_z=2, swiz_w=3, fmt=tex_fmt), qreg.a6xx_tex_const_1(width=imgw, height=imgh),
+              qreg.a6xx_tex_const_2(type=adreno.A6XX_TEX_2D, pitch=pitch, pitchalign=pitchalign-6), 0,
+              *data64_le(buf.va_addr), qreg.a6xx_tex_const_6(plane_pitch=0x400000), qreg.a6xx_tex_const_7(13)]
 
-
+      buf.texture_info = QCOMTextureInfo(pitch, real_stride, desc, [desc[0] & (~0xffff), *desc[1:len(desc)]])
+    return buf
 
   def _do_copy(self, src_addr, dest_addr, src_size, real_size, src_stride, dest_stride, dest_off=0, src_off=0):
     while src_off < src_size:
       ctypes.memmove(dest_addr+dest_off, src_addr+src_off, real_size)
       src_off, dest_off = src_off+src_stride, dest_off+dest_stride
 
-  def
-    if (
-
+  def _copyin(self, dest:HCQBuffer, src:memoryview):
+    stride, pitch = (src.nbytes, src.nbytes) if (ti:=cast(QCOMTextureInfo, dest.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(mv_address(src), dest.va_addr, src.nbytes, stride, stride, pitch)
+
+  def _copyout(self, dest:memoryview, src:HCQBuffer):
+    self.dev.synchronize()
 
-
-    self.
-    if (qs:=cast(QCOMBuffer, src)).pitch is not None: self._do_copy(qs.va_addr, mv_address(dest), qs.size, qs.real_stride, qs.pitch, qs.real_stride)
-    else: ctypes.memmove(from_mv(dest), src.va_addr, dest.nbytes)
+    stride, pitch = (src.size, src.size) if (ti:=cast(QCOMTextureInfo, src.texture_info)) is None else (ti.real_stride, ti.pitch)
+    self._do_copy(src.va_addr, mv_address(dest), src.size, stride, pitch, stride)
 
-  def
-    self.
-    return to_mv(src.va_addr, src.size)
+  def _as_buffer(self, src:HCQBuffer) -> memoryview:
+    self.dev.synchronize()
+    return to_mv(cast(int, src.va_addr), src.size)
 
-  def _free(self, opaque, options:
-    self.
-    self.
+  def _free(self, opaque, options:BufferSpec):
+    self.dev.synchronize()
+    self.dev._gpu_free(opaque)
 
 class QCOMDevice(HCQCompiled):
   signals_page: Any = None
-  signals_pool:
+  signals_pool: list[int] = []
   gpu_id: int = 0
   dummy_addr: int = 0
 
   def __init__(self, device:str=""):
-    self.fd =
-    QCOMDevice.dummy_addr = self._gpu_alloc(0x1000).va_addr
+    self.fd = HWInterface('/dev/kgsl-3d0', os.O_RDWR)
+    QCOMDevice.dummy_addr = cast(int, self._gpu_alloc(0x1000).va_addr)
     QCOMDevice.signals_page = self._gpu_alloc(16 * 65536, uncached=True)
-    QCOMDevice.signals_pool = [
-    info, self.ctx, self.cmd_buf, self.cmd_buf_ptr, self.last_cmd = self._info(), self._ctx_create(), self._gpu_alloc(16 << 20), 0,0
-    QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
-    if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
+    QCOMDevice.signals_pool = [self.signals_page.va_addr + off for off in range(0, self.signals_page.size, 16)]
 
-
-
+    flags = kgsl.KGSL_CONTEXT_PREAMBLE | kgsl.KGSL_CONTEXT_PWR_CONSTRAINT | kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC \
+            | kgsl.KGSL_CONTEXT_PRIORITY(8) | kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)
+    self.ctx = kgsl.IOCTL_KGSL_DRAWCTXT_CREATE(self.fd, flags=flags).drawctxt_id
 
-
-
-                          kgsl.KGSL_CONTEXT_NO_FAULT_TOLERANCE | kgsl.KGSL_CONTEXT_NO_GMEM_ALLOC | kgsl.KGSL_CONTEXT_PRIORITY(8) |
-                          kgsl.KGSL_CONTEXT_PREEMPT_STYLE(kgsl.KGSL_CONTEXT_PREEMPT_STYLE_FINEGRAIN)))
+    self.cmd_buf = self._gpu_alloc(16 << 20)
+    self.cmd_buf_allocator = BumpAllocator(size=self.cmd_buf.size, base=cast(int, self.cmd_buf.va_addr), wrap=True)
 
-
-
+    self.border_color_buf = self._gpu_alloc(0x1000, fill_zeroes=True)
+
+    self.last_cmd:int = 0
+
+    # Set max power
+    struct.pack_into('IIQQ', pwr:=memoryview(bytearray(0x18)), 0, 1, self.ctx, mv_address(_:=memoryview(array.array('I', [1]))), 4)
     kgsl.IOCTL_KGSL_SETPROPERTY(self.fd, type=kgsl.KGSL_PROP_PWR_CONSTRAINT, value=mv_address(pwr), sizebytes=pwr.nbytes)
-    return cr.drawctxt_id
 
-
+    # Load info about qcom device
     info = kgsl.struct_kgsl_devinfo()
     kgsl.IOCTL_KGSL_DEVICE_GETPROPERTY(self.fd, type=kgsl.KGSL_PROP_DEVICE_INFO, value=ctypes.addressof(info), sizebytes=ctypes.sizeof(info))
-
+    QCOMDevice.gpu_id = ((info.chip_id >> 24) & 0xFF) * 100 + ((info.chip_id >> 16) & 0xFF) * 10 + ((info.chip_id >> 8) & 0xFF)
+    if QCOMDevice.gpu_id >= 700: raise RuntimeError(f"Unsupported GPU: {QCOMDevice.gpu_id}")
+
+    super().__init__(device, QCOMAllocator(self), QCOMRenderer(), QCOMCompiler(device), functools.partial(QCOMProgram, self),
+                     QCOMSignal, QCOMComputeQueue, None)
 
-  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False):
+  def _gpu_alloc(self, size:int, flags:int=0, uncached=False, fill_zeroes=False) -> HCQBuffer:
    flags |= kgsl.KGSL_MEMALIGN(alignment_hint:=12) | kgsl.KGSL_MEMFLAGS_USE_CPU_MAP
    if uncached: flags |= kgsl.KGSL_CACHEMODE(kgsl.KGSL_CACHEMODE_UNCACHED)
 
    alloc = kgsl.IOCTL_KGSL_GPUOBJ_ALLOC(self.fd, size=(bosz:=round_up(size, 1<<alignment_hint)), flags=flags, mmapsize=bosz)
-    va_addr =
+    va_addr = self.fd.mmap(0, bosz, mmap.PROT_READ | mmap.PROT_WRITE, mmap.MAP_SHARED, alloc.id * 0x1000)
 
    if fill_zeroes: ctypes.memset(va_addr, 0, size)
-    return
+    return HCQBuffer(va_addr=va_addr, size=size, meta=alloc)
 
-  def _gpu_free(self, mem):
-    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.
-
-
-  def _alloc_cmd_buf(self, sz: int):
-    self.cmd_buf_ptr = (cur_ptr:=self.cmd_buf_ptr if self.cmd_buf_ptr + sz < self.cmd_buf.size else 0) + sz
-    return self.cmd_buf.va_addr + cur_ptr
-
-  def _border_color_base(self):
-    if not hasattr(self, '_border_color_gpu'): self._border_color_gpu = self._gpu_alloc(0x1000, fill_zeroes=True)
-    return self._border_color_gpu.va_addr
+  def _gpu_free(self, mem:HCQBuffer):
+    kgsl.IOCTL_KGSL_GPUOBJ_FREE(self.fd, id=mem.meta.id)
+    HWInterface.munmap(mem.va_addr, mem.meta.mmapsize)
 
  def _ensure_stack_size(self, sz):
    if not hasattr(self, '_stack'): self._stack = self._gpu_alloc(sz)
@@ -401,5 +375,3 @@ class QCOMDevice(HCQCompiled):
       self.synchronize()
       self._gpu_free(self._stack)
       self._stack = self._gpu_alloc(sz)
-
-  def _syncdev(self): kgsl.IOCTL_KGSL_DEVICE_WAITTIMESTAMP_CTXTID(self.fd, context_id=self.ctx, timestamp=self.last_cmd, timeout=0xffffffff)
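Across this file the old inline _alloc_cmd_buf wrap-around pointer (removed above) is replaced by the shared BumpAllocator imported from tinygrad.runtime.support.hcq. A minimal sketch of the same wrap-around bump allocation idea; this is an illustration, not the actual BumpAllocator class:

# Hedged sketch of wrap-around bump allocation over a fixed command-buffer ring.
class TinyBumpAllocator:
  def __init__(self, base:int, size:int): self.base, self.size, self.ptr = base, size, 0
  def alloc(self, sz:int) -> int:
    if self.ptr + sz > self.size: self.ptr = 0   # wrap back to the start of the ring
    addr = self.base + self.ptr
    self.ptr += sz
    return addr

ring = TinyBumpAllocator(base=0x1000, size=64)
assert ring.alloc(48) == 0x1000
assert ring.alloc(48) == 0x1000                  # second allocation wrapped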