tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- tinygrad/codegen/__init__.py +0 -0
- tinygrad/codegen/kernel.py +78 -90
- tinygrad/codegen/linearizer.py +237 -169
- tinygrad/codegen/uops.py +278 -242
- tinygrad/device.py +147 -10
- tinygrad/dtype.py +7 -7
- tinygrad/engine/graph.py +16 -16
- tinygrad/engine/jit.py +39 -36
- tinygrad/engine/realize.py +6 -5
- tinygrad/engine/schedule.py +15 -7
- tinygrad/engine/search.py +6 -3
- tinygrad/function.py +17 -23
- tinygrad/helpers.py +77 -8
- tinygrad/lazy.py +26 -26
- tinygrad/multi.py +13 -9
- tinygrad/nn/__init__.py +1 -1
- tinygrad/nn/datasets.py +2 -1
- tinygrad/nn/state.py +3 -4
- tinygrad/ops.py +49 -16
- tinygrad/renderer/__init__.py +8 -4
- tinygrad/renderer/assembly.py +93 -100
- tinygrad/renderer/cstyle.py +47 -42
- tinygrad/renderer/llvmir.py +30 -30
- tinygrad/runtime/__init__.py +0 -0
- tinygrad/runtime/autogen/amd_gpu.py +11504 -1
- tinygrad/runtime/autogen/comgr.py +36 -10
- tinygrad/runtime/autogen/hsa.py +146 -14
- tinygrad/runtime/autogen/io_uring.py +1486 -0
- tinygrad/runtime/autogen/nv_gpu.py +269 -0
- tinygrad/runtime/driver/__init__.py +0 -0
- tinygrad/runtime/driver/hip_comgr.py +20 -11
- tinygrad/runtime/graph/__init__.py +0 -0
- tinygrad/runtime/graph/clang.py +3 -2
- tinygrad/runtime/graph/cuda.py +2 -2
- tinygrad/runtime/graph/hcq.py +122 -78
- tinygrad/runtime/ops_amd.py +302 -316
- tinygrad/runtime/ops_cuda.py +3 -3
- tinygrad/runtime/ops_disk.py +70 -5
- tinygrad/runtime/ops_gpu.py +2 -2
- tinygrad/runtime/ops_metal.py +5 -6
- tinygrad/runtime/ops_npy.py +1 -1
- tinygrad/runtime/ops_nv.py +161 -166
- tinygrad/runtime/ops_python.py +20 -16
- tinygrad/shape/__init__.py +0 -0
- tinygrad/shape/shapetracker.py +5 -2
- tinygrad/shape/symbolic.py +1 -3
- tinygrad/shape/view.py +34 -19
- tinygrad/tensor.py +219 -135
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
- tinygrad-0.9.1.dist-info/RECORD +63 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
- tinygrad/runtime/driver/hsa.py +0 -143
- tinygrad/runtime/graph/hsa.py +0 -171
- tinygrad/runtime/ops_hsa.py +0 -278
- tinygrad-0.9.0.dist-info/RECORD +0 -60
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
- {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/ops_hsa.py
DELETED
@@ -1,278 +0,0 @@
|
|
1
|
-
from __future__ import annotations
|
2
|
-
import ctypes, functools, subprocess, io, atexit, collections, json
|
3
|
-
from typing import Tuple, TypeVar, List, Dict, Any
|
4
|
-
import tinygrad.runtime.autogen.hsa as hsa
|
5
|
-
from tinygrad.helpers import DEBUG, init_c_var, from_mv, round_up, to_mv, init_c_struct_t, getenv
|
6
|
-
from tinygrad.device import Compiled, Compiler, CompileError, BufferOptions, LRUAllocator
|
7
|
-
from tinygrad.renderer.cstyle import HIPRenderer
|
8
|
-
from tinygrad.runtime.driver.hsa import check, scan_agents, find_memory_pool, AQLQueue
|
9
|
-
from tinygrad.runtime.driver.hip_comgr import compile_hip
|
10
|
-
if getenv("IOCTL"): import extra.hip_gpu_driver.hip_ioctl # noqa: F401
|
11
|
-
|
12
|
-
PROFILE = getenv("PROFILE", 0)
|
13
|
-
|
14
|
-
class HSAProfiler:
|
15
|
-
def __init__(self):
|
16
|
-
self.tracked_signals = collections.defaultdict(list)
|
17
|
-
self.collected_events: List[Tuple[Any, ...]] = []
|
18
|
-
self.copy_timings = hsa.hsa_amd_profiling_async_copy_time_t()
|
19
|
-
self.disp_timings = hsa.hsa_amd_profiling_dispatch_time_t()
|
20
|
-
|
21
|
-
def track(self, signal, device, name, is_copy=False): self.tracked_signals[device].append((signal, name, is_copy))
|
22
|
-
def process(self, device):
|
23
|
-
# Process all tracked signals, should be called before any of tracked signals are reused.
|
24
|
-
for sig,name,is_copy in self.tracked_signals[device]:
|
25
|
-
if is_copy: check(hsa.hsa_amd_profiling_get_async_copy_time(sig, ctypes.byref(timings := self.copy_timings)))
|
26
|
-
else: check(hsa.hsa_amd_profiling_get_dispatch_time(device.agent, sig, ctypes.byref(timings := self.disp_timings))) #type:ignore
|
27
|
-
self.collected_events.append((device.device_id, 1 if is_copy else 0, name, timings.start, timings.end))
|
28
|
-
self.tracked_signals.pop(device)
|
29
|
-
|
30
|
-
def save(self, path):
|
31
|
-
mjson = []
|
32
|
-
for i in range(len(HSADevice.devices)):
|
33
|
-
mjson.append({"name": "process_name", "ph": "M", "pid": i, "args": {"name": "HSA"}})
|
34
|
-
mjson.append({"name": "thread_name", "ph": "M", "pid": i, "tid": 0, "args": {"name": "AQL"}})
|
35
|
-
mjson.append({"name": "thread_name", "ph": "M", "pid": i, "tid": 1, "args": {"name": "SDMA"}})
|
36
|
-
|
37
|
-
for dev_id,queue_id,name,st,et in self.collected_events:
|
38
|
-
mjson.append({"name": name, "ph": "B", "pid": dev_id, "tid": queue_id, "ts": st*1e-3})
|
39
|
-
mjson.append({"name": name, "ph": "E", "pid": dev_id, "tid": queue_id, "ts": et*1e-3})
|
40
|
-
with open(path, "w") as f: f.write(json.dumps({"traceEvents": mjson}))
|
41
|
-
print(f"Saved HSA profile to {path}")
|
42
|
-
Profiler = HSAProfiler()
|
43
|
-
|
44
|
-
class HSACompiler(Compiler):
|
45
|
-
def __init__(self, arch:str):
|
46
|
-
self.arch = arch
|
47
|
-
super().__init__(f"compile_hip_{self.arch}")
|
48
|
-
def compile(self, src:str) -> bytes:
|
49
|
-
try: return compile_hip(src, self.arch)
|
50
|
-
except RuntimeError as e: raise CompileError(e)
|
51
|
-
|
52
|
-
class HSAProgram:
|
53
|
-
def __init__(self, device:HSADevice, name:str, lib:bytes):
|
54
|
-
self.device, self.name, self.lib = device, name, lib
|
55
|
-
|
56
|
-
if DEBUG >= 6:
|
57
|
-
asm = subprocess.check_output(["/opt/rocm/llvm/bin/llvm-objdump", '-d', '-'], input=lib)
|
58
|
-
print('\n'.join([x for x in asm.decode('utf-8').split("\n") if 's_code_end' not in x]))
|
59
|
-
|
60
|
-
self.exec = init_c_var(hsa.hsa_executable_t(), lambda x: check(hsa.hsa_executable_create_alt(hsa.HSA_PROFILE_FULL, hsa.HSA_DEFAULT_FLOAT_ROUNDING_MODE_DEFAULT, None, ctypes.byref(x)))) # noqa: E501
|
61
|
-
self.code_reader = init_c_var(hsa.hsa_code_object_reader_t(),
|
62
|
-
lambda x: check(hsa.hsa_code_object_reader_create_from_memory(lib, len(lib), ctypes.byref(x))))
|
63
|
-
check(hsa.hsa_executable_load_agent_code_object(self.exec, self.device.agent, self.code_reader, None, None))
|
64
|
-
check(hsa.hsa_executable_freeze(self.exec, None))
|
65
|
-
|
66
|
-
self.kernel = init_c_var(hsa.hsa_executable_symbol_t(), lambda x: check(hsa.hsa_executable_get_symbol_by_name(self.exec, (name+".kd").encode("utf-8"), ctypes.byref(self.device.agent), ctypes.byref(x)))) # noqa: E501
|
67
|
-
self.handle = init_c_var(ctypes.c_uint64(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_OBJECT, ctypes.byref(x)))) # noqa: E501
|
68
|
-
self.kernargs_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_KERNARG_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
|
69
|
-
self.group_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_GROUP_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
|
70
|
-
self.private_segment_size = init_c_var(ctypes.c_uint32(), lambda x: check(hsa.hsa_executable_symbol_get_info(self.kernel, hsa.HSA_EXECUTABLE_SYMBOL_INFO_KERNEL_PRIVATE_SEGMENT_SIZE, ctypes.byref(x)))).value # noqa: E501
|
71
|
-
|
72
|
-
def __del__(self):
|
73
|
-
self.device.synchronize()
|
74
|
-
if hasattr(self, 'code_reader'): check(hsa.hsa_code_object_reader_destroy(self.code_reader))
|
75
|
-
if hasattr(self, 'exec'): check(hsa.hsa_executable_destroy(self.exec))
|
76
|
-
|
77
|
-
def __call__(self, *args, global_size:Tuple[int,int,int]=(1,1,1), local_size:Tuple[int,int,int]=(1,1,1), vals:Tuple[int, ...]=(), wait=False):
|
78
|
-
if not hasattr(self, "args_struct_t"):
|
79
|
-
self.args_struct_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(args))] +
|
80
|
-
[(f'v{i}', ctypes.c_int) for i in range(len(vals))]))
|
81
|
-
if ctypes.sizeof(self.args_struct_t) != self.kernargs_segment_size:
|
82
|
-
raise RuntimeError(f"HSAProgram.__call__: incorrect args struct size {ctypes.sizeof(self.args_struct_t)} != {self.kernargs_segment_size}")
|
83
|
-
|
84
|
-
kernargs = None
|
85
|
-
if self.kernargs_segment_size > 0:
|
86
|
-
kernargs = self.device.alloc_kernargs(self.kernargs_segment_size)
|
87
|
-
args_st = self.args_struct_t.from_address(kernargs)
|
88
|
-
for i in range(len(args)): args_st.__setattr__(f'f{i}', args[i])
|
89
|
-
for i in range(len(vals)): args_st.__setattr__(f'v{i}', vals[i])
|
90
|
-
self.device.flush_hdp()
|
91
|
-
|
92
|
-
signal = self.device.alloc_signal(reusable=True) if wait or PROFILE else None
|
93
|
-
self.device.hw_queue.submit_kernel(self, global_size, local_size, kernargs, completion_signal=signal)
|
94
|
-
if PROFILE: Profiler.track(signal, self.device, self.name)
|
95
|
-
if wait:
|
96
|
-
hsa.hsa_signal_wait_scacquire(signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
|
97
|
-
check(hsa.hsa_amd_profiling_get_dispatch_time(self.device.agent, signal, ctypes.byref(timings := hsa.hsa_amd_profiling_dispatch_time_t())))
|
98
|
-
return (timings.end - timings.start) * self.device.clocks_to_time
|
99
|
-
|
100
|
-
T = TypeVar("T")
|
101
|
-
CHUNK_SIZE, PAGE_SIZE = 256*1024*1024, 0x1000
|
102
|
-
class HSAAllocator(LRUAllocator):
|
103
|
-
def __init__(self, device:HSADevice):
|
104
|
-
self.device = device
|
105
|
-
super().__init__()
|
106
|
-
|
107
|
-
def _alloc(self, size:int, options:BufferOptions):
|
108
|
-
if options.host:
|
109
|
-
check(hsa.hsa_amd_memory_pool_allocate(HSADevice.cpu_mempool, size, 0, ctypes.byref(mem := ctypes.c_void_p())))
|
110
|
-
check(hsa.hsa_amd_agents_allow_access(2, (hsa.hsa_agent_t*2)(HSADevice.cpu_agent, self.device.agent), None, mem))
|
111
|
-
return mem.value
|
112
|
-
else:
|
113
|
-
c_agents = (hsa.hsa_agent_t * len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]))(*HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU])
|
114
|
-
check(hsa.hsa_amd_memory_pool_allocate(self.device.gpu_mempool, size, 0, ctypes.byref(buf := ctypes.c_void_p())))
|
115
|
-
check(hsa.hsa_amd_agents_allow_access(len(HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU]), c_agents, None, buf))
|
116
|
-
return buf.value
|
117
|
-
|
118
|
-
def _free(self, opaque:T, options:BufferOptions):
|
119
|
-
HSADevice.synchronize_system()
|
120
|
-
check(hsa.hsa_amd_memory_pool_free(opaque))
|
121
|
-
|
122
|
-
def copyin(self, dest:T, src: memoryview):
|
123
|
-
# Async copyin sync model uses barriers on the main hw queue, since barriers are guaranteed to execute in order with all other packets.
|
124
|
-
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
|
125
|
-
mem = self._alloc(src.nbytes, BufferOptions(host=True))
|
126
|
-
ctypes.memmove(mem, from_mv(src), src.nbytes)
|
127
|
-
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, self.device.agent, mem, HSADevice.cpu_agent, src.nbytes, 1, ctypes.byref(sync_signal),
|
128
|
-
copy_signal := self.device.alloc_signal(reusable=True), hsa.HSA_AMD_SDMA_ENGINE_0, True))
|
129
|
-
self.device.hw_queue.submit_barrier([copy_signal])
|
130
|
-
self.device.delayed_free.append(mem)
|
131
|
-
if PROFILE: Profiler.track(copy_signal, self.device, f"copyin: CPU -> HSA:{self.device.device_id}", is_copy=True)
|
132
|
-
|
133
|
-
def copy_from_fd(self, dest, fd, offset, size):
|
134
|
-
self.device.hw_queue.submit_barrier([], sync_signal := self.device.alloc_signal(reusable=True))
|
135
|
-
|
136
|
-
if not hasattr(self, 'hb'):
|
137
|
-
self.hb = [self._alloc(CHUNK_SIZE, BufferOptions(host=True)) for _ in range(2)]
|
138
|
-
self.hb_signals = [self.device.alloc_signal(reusable=False) for _ in range(2)]
|
139
|
-
self.hb_polarity = 0
|
140
|
-
self.sdma = [hsa.HSA_AMD_SDMA_ENGINE_0, hsa.HSA_AMD_SDMA_ENGINE_1]
|
141
|
-
for sig in self.hb_signals: hsa.hsa_signal_store_relaxed(sig, 0)
|
142
|
-
|
143
|
-
fo = io.FileIO(fd, "a+b", closefd=False)
|
144
|
-
fo.seek(offset - (minor_offset:=offset % PAGE_SIZE))
|
145
|
-
|
146
|
-
copies_called = 0
|
147
|
-
copied_in = 0
|
148
|
-
for local_offset in range(0, size+minor_offset, CHUNK_SIZE):
|
149
|
-
local_size = min(round_up(size+minor_offset, PAGE_SIZE)-local_offset, CHUNK_SIZE)
|
150
|
-
copy_size = min(local_size-minor_offset, size-copied_in)
|
151
|
-
if copy_size == 0: break
|
152
|
-
|
153
|
-
hsa.hsa_signal_wait_scacquire(self.hb_signals[self.hb_polarity], hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
|
154
|
-
self.device.reusable_signals.append(self.hb_signals[self.hb_polarity]) # it's free now and can be reused
|
155
|
-
self.hb_signals[self.hb_polarity] = self.device.alloc_signal(reusable=False)
|
156
|
-
|
157
|
-
fo.readinto(to_mv(self.hb[self.hb_polarity], local_size))
|
158
|
-
check(hsa.hsa_amd_memory_async_copy_on_engine(dest+copied_in, self.device.agent, self.hb[self.hb_polarity]+minor_offset, HSADevice.cpu_agent,
|
159
|
-
copy_size, 1, ctypes.byref(sync_signal), self.hb_signals[self.hb_polarity],
|
160
|
-
self.sdma[self.hb_polarity], True))
|
161
|
-
copied_in += copy_size
|
162
|
-
self.hb_polarity = (self.hb_polarity + 1) % len(self.hb)
|
163
|
-
minor_offset = 0 # only on the first
|
164
|
-
copies_called += 1
|
165
|
-
|
166
|
-
wait_signals = [self.hb_signals[self.hb_polarity - 1]]
|
167
|
-
if copies_called > 1: wait_signals.append(self.hb_signals[self.hb_polarity])
|
168
|
-
self.device.hw_queue.submit_barrier(wait_signals)
|
169
|
-
|
170
|
-
def copyout(self, dest:memoryview, src:T):
|
171
|
-
HSADevice.synchronize_system()
|
172
|
-
copy_signal = self.device.alloc_signal(reusable=True)
|
173
|
-
c_agents = (hsa.hsa_agent_t*2)(self.device.agent, HSADevice.cpu_agent)
|
174
|
-
check(hsa.hsa_amd_memory_lock_to_pool(from_mv(dest), dest.nbytes, c_agents, 2, HSADevice.cpu_mempool, 0, ctypes.byref(addr:=ctypes.c_void_p())))
|
175
|
-
check(hsa.hsa_amd_memory_async_copy(addr, HSADevice.cpu_agent, src, self.device.agent, dest.nbytes, 0, None, copy_signal))
|
176
|
-
hsa.hsa_signal_wait_scacquire(copy_signal, hsa.HSA_SIGNAL_CONDITION_LT, 1, (1 << 64) - 1, hsa.HSA_WAIT_STATE_ACTIVE)
|
177
|
-
check(hsa.hsa_amd_memory_unlock(from_mv(dest)))
|
178
|
-
if PROFILE: Profiler.track(copy_signal, self.device, f"copyout: HSA:{self.device.device_id} -> CPU", is_copy=True)
|
179
|
-
|
180
|
-
def transfer(self, dest:T, src:T, sz:int, src_dev=None, dest_dev=None):
|
181
|
-
src_dev.hw_queue.submit_barrier([], sync_signal_1 := src_dev.alloc_signal(reusable=True))
|
182
|
-
dest_dev.hw_queue.submit_barrier([], sync_signal_2 := dest_dev.alloc_signal(reusable=True))
|
183
|
-
c_wait_signal = (hsa.hsa_signal_t*2)(sync_signal_1, sync_signal_2)
|
184
|
-
check(hsa.hsa_amd_memory_async_copy_on_engine(dest, dest_dev.agent, src, src_dev.agent, sz, 2, c_wait_signal,
|
185
|
-
copy_signal := dest_dev.alloc_signal(reusable=False), hsa.HSA_AMD_SDMA_ENGINE_0, True))
|
186
|
-
src_dev.hw_queue.submit_barrier([copy_signal])
|
187
|
-
dest_dev.hw_queue.submit_barrier([copy_signal])
|
188
|
-
if PROFILE: Profiler.track(copy_signal, src_dev, f"transfer: HSA:{src_dev.device_id} -> HSA:{dest_dev.device_id}", is_copy=True)
|
189
|
-
|
190
|
-
class HSADevice(Compiled):
|
191
|
-
devices: List[HSADevice] = []
|
192
|
-
agents: Dict[int, List[hsa.hsa_agent_t]] = {}
|
193
|
-
cpu_agent: hsa.hsa_agent_t
|
194
|
-
cpu_mempool: hsa.hsa_amd_memory_pool_t
|
195
|
-
def __init__(self, device:str=""):
|
196
|
-
if not HSADevice.agents:
|
197
|
-
check(hsa.hsa_init())
|
198
|
-
atexit.register(hsa_terminate)
|
199
|
-
HSADevice.agents = scan_agents()
|
200
|
-
HSADevice.cpu_agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_CPU][0]
|
201
|
-
HSADevice.cpu_mempool = find_memory_pool(HSADevice.cpu_agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_CPU)
|
202
|
-
if PROFILE: check(hsa.hsa_amd_profiling_async_copy_enable(1))
|
203
|
-
|
204
|
-
self.device_id = int(device.split(":")[1]) if ":" in device else 0
|
205
|
-
self.agent = HSADevice.agents[hsa.HSA_DEVICE_TYPE_GPU][self.device_id]
|
206
|
-
self.gpu_mempool = find_memory_pool(self.agent, segtyp=hsa.HSA_AMD_SEGMENT_GLOBAL, location=hsa.HSA_AMD_MEMORY_POOL_LOCATION_GPU)
|
207
|
-
self.hw_queue = AQLQueue(self)
|
208
|
-
HSADevice.devices.append(self)
|
209
|
-
|
210
|
-
check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AGENT_INFO_NAME, ctypes.byref(agent_name_buf := ctypes.create_string_buffer(256))))
|
211
|
-
self.arch = ctypes.string_at(agent_name_buf).decode()
|
212
|
-
|
213
|
-
check(hsa.hsa_system_get_info(hsa.HSA_SYSTEM_INFO_TIMESTAMP_FREQUENCY, ctypes.byref(gpu_freq := ctypes.c_uint64())))
|
214
|
-
self.clocks_to_time: float = 1 / gpu_freq.value
|
215
|
-
|
216
|
-
check(hsa.hsa_agent_get_info(self.agent, hsa.HSA_AMD_AGENT_INFO_HDP_FLUSH, ctypes.byref(hdp_flush := hsa.hsa_amd_hdp_flush_t())))
|
217
|
-
self.hdp_flush = hdp_flush
|
218
|
-
|
219
|
-
self.delayed_free: List[int] = []
|
220
|
-
self.reusable_signals: List[hsa.hsa_signal_t] = []
|
221
|
-
|
222
|
-
from tinygrad.runtime.graph.hsa import HSAGraph
|
223
|
-
super().__init__(device, HSAAllocator(self), HIPRenderer(), HSACompiler(self.arch), functools.partial(HSAProgram, self), HSAGraph)
|
224
|
-
|
225
|
-
# Finish init: preallocate some signals + space for kernargs
|
226
|
-
self.signal_pool = [init_c_var(hsa.hsa_signal_t(), lambda x: check(hsa.hsa_signal_create(1, 0, None, ctypes.byref(x)))) for _ in range(4096)]
|
227
|
-
self._new_kernargs_region(16 << 20) # initial region size is 16mb
|
228
|
-
|
229
|
-
def synchronize(self):
|
230
|
-
self.hw_queue.wait()
|
231
|
-
|
232
|
-
for sig in self.reusable_signals: hsa.hsa_signal_silent_store_relaxed(sig, 1)
|
233
|
-
self.signal_pool.extend(self.reusable_signals)
|
234
|
-
self.reusable_signals.clear()
|
235
|
-
|
236
|
-
for opaque_to_free in self.delayed_free: check(hsa.hsa_amd_memory_pool_free(opaque_to_free))
|
237
|
-
self.delayed_free.clear()
|
238
|
-
|
239
|
-
self.kernarg_next_addr = self.kernarg_start_addr
|
240
|
-
Profiler.process(self)
|
241
|
-
|
242
|
-
@staticmethod
|
243
|
-
def synchronize_system():
|
244
|
-
for d in HSADevice.devices: d.synchronize()
|
245
|
-
|
246
|
-
def alloc_signal(self, reusable=False):
|
247
|
-
if len(self.signal_pool): signal = self.signal_pool.pop()
|
248
|
-
else: check(hsa.hsa_amd_signal_create(1, 0, None, 0, ctypes.byref(signal := hsa.hsa_signal_t())))
|
249
|
-
|
250
|
-
# reusable means a signal could be reused after synchronize for the device it's allocated from is called.
|
251
|
-
if reusable: self.reusable_signals.append(signal)
|
252
|
-
return signal
|
253
|
-
|
254
|
-
def alloc_kernargs(self, sz):
|
255
|
-
if self.kernarg_next_addr + sz >= self.kernarg_start_addr + self.kernarg_pool_sz: self._new_kernargs_region(int(self.kernarg_pool_sz * 2))
|
256
|
-
result = self.kernarg_next_addr
|
257
|
-
self.kernarg_next_addr = round_up(self.kernarg_next_addr + sz, 16)
|
258
|
-
return result
|
259
|
-
|
260
|
-
def _new_kernargs_region(self, sz:int):
|
261
|
-
if hasattr(self, 'kernarg_start_addr'): self.delayed_free.append(self.kernarg_start_addr)
|
262
|
-
self.kernarg_start_addr: int = self.allocator._alloc(sz, BufferOptions())
|
263
|
-
self.kernarg_next_addr = self.kernarg_start_addr
|
264
|
-
self.kernarg_pool_sz: int = sz
|
265
|
-
|
266
|
-
def flush_hdp(self): self.hdp_flush.HDP_MEM_FLUSH_CNTL[0] = 1
|
267
|
-
|
268
|
-
def hsa_terminate():
|
269
|
-
# Need to stop/delete aql queue before hsa shut down, this leads to gpu hangs.
|
270
|
-
for dev in HSADevice.devices:
|
271
|
-
Profiler.process(dev)
|
272
|
-
del dev.hw_queue
|
273
|
-
|
274
|
-
# hsa_shut_down cleans up all hsa-related resources.
|
275
|
-
hsa.hsa_shut_down()
|
276
|
-
HSADevice.synchronize = lambda: None #type:ignore
|
277
|
-
HSAProgram.__del__ = lambda _: None #type:ignore
|
278
|
-
if Profiler.collected_events: Profiler.save("/tmp/profile.json")
|
tinygrad-0.9.0.dist-info/RECORD
DELETED
@@ -1,60 +0,0 @@
|
|
1
|
-
tinygrad/__init__.py,sha256=jC-35zswLSXLuRRThG_o6yar6qQjLCqmeaFCj_XKN08,449
|
2
|
-
tinygrad/device.py,sha256=zXcrFjBsiV1rW0aXupszDjD98TWLHin7u8pBd5fdJqo,10446
|
3
|
-
tinygrad/dtype.py,sha256=xg2BlFIPcQw0onHW_0ktGXjved9SXgQcLNrqe6gCXto,6221
|
4
|
-
tinygrad/function.py,sha256=0xkWst2tRsOeN6YcQS65MfVfWwKQYFkAacgkTys0VdQ,9616
|
5
|
-
tinygrad/helpers.py,sha256=XI8MIeBE35wQ4q0NEsUCvkj3QdY0adI80SfCbOySOVI,12773
|
6
|
-
tinygrad/lazy.py,sha256=xqaEqXaIpt_77SP_2U6Pyfw8YeGd_0PzNDXJOOnRJ24,13379
|
7
|
-
tinygrad/multi.py,sha256=gyGXYVviaPfzAkoJjLFUiusVd3no6HRJunOOxD0DaaY,11362
|
8
|
-
tinygrad/ops.py,sha256=aNk1jLuJl--Z_u8DE8Du1iN1AFhMH-6Le5mnvRHLDvI,7124
|
9
|
-
tinygrad/tensor.py,sha256=nznRGHH7-64bMpFeu8gvdSDfJ5CEEoezIcHSxPgeJ7k,129223
|
10
|
-
tinygrad/codegen/kernel.py,sha256=RRRmOX3iOOgu5ISABW_UTVh5vfGdLFfK1UOtKtpghuY,38169
|
11
|
-
tinygrad/codegen/linearizer.py,sha256=jxwEcxxcpWOvYlIgXmlGkgNYZ4sDykiY_5Ve3a8tpYg,27622
|
12
|
-
tinygrad/codegen/uops.py,sha256=yKS-3w9teuS_3BLnHAN4vWtSRvZHmsx194YRBXMOhFI,21872
|
13
|
-
tinygrad/engine/__init__.py,sha256=47DEQpj8HBSa-_TImW-5JCeuQeRkm5NMpJWZG3hSuFU,0
|
14
|
-
tinygrad/engine/graph.py,sha256=eEbb17qbJ0A-2VjN4l7SCbA4yI7jh6YN6EybPShmcbg,5221
|
15
|
-
tinygrad/engine/jit.py,sha256=TrdZQEXnF-SowCAyy69iEp85NPFFtOVaIz_2i1cwCvQ,11049
|
16
|
-
tinygrad/engine/realize.py,sha256=H2CgiLWRqTQSr9udJb1iLX7DFdLcPLwxSJZ2bUfHXDs,11077
|
17
|
-
tinygrad/engine/schedule.py,sha256=I3OxiNwveWbVuhhFWdR1B5G8goEyqXOHWe7UQQ3Ogz8,18487
|
18
|
-
tinygrad/engine/search.py,sha256=M11qHlufIffBS9b7mjk8gniyoGRaengwXnMfUQTHmyw,11289
|
19
|
-
tinygrad/nn/__init__.py,sha256=DoHrq9pUFs1vm9FUA5eet5_tvhlZJlC_uC4kBqo98kI,12932
|
20
|
-
tinygrad/nn/datasets.py,sha256=Mvf_0eCEEqtB9-8iiyDFhm-B7rTyWXwmSY_3xjeHleo,458
|
21
|
-
tinygrad/nn/optim.py,sha256=zf85kwumpS17fk1NBjUfe7tOUE7-XH37SL-LjgAfva8,6799
|
22
|
-
tinygrad/nn/state.py,sha256=nGR05s3kuDNp9lliCIr4-6Ek7Korha7jCAWie5S2rB4,10138
|
23
|
-
tinygrad/renderer/__init__.py,sha256=-LjQ9tC2rI8fveaS_xn24X_knXKILFj-iZFcRTk8fNM,2672
|
24
|
-
tinygrad/renderer/assembly.py,sha256=MD-SSC7-Nqwt3zrwe0aDXVX08W9Ox6Vj_byPS1k1bAQ,17923
|
25
|
-
tinygrad/renderer/cstyle.py,sha256=tFWWW-egorLFEDwX6fA9-rYxvNLc67LjxlZ6JzrWCF0,24043
|
26
|
-
tinygrad/renderer/llvmir.py,sha256=BZViWXj2G6JtEcOgc-CtnIj-d9xP0ZjgNXdTKQT_PJ8,10315
|
27
|
-
tinygrad/runtime/ops_amd.py,sha256=3jOrFqxk8JkPX043tEUFLvyKSgX7Fls785g_gOkdzVM,31811
|
28
|
-
tinygrad/runtime/ops_clang.py,sha256=XWqwobReRdu-Tj-chbWEJFMx6AQfgdGCcpdWcLWUTOQ,1468
|
29
|
-
tinygrad/runtime/ops_cuda.py,sha256=cgeoVpY9bOGU22Eh78XR5YOYY2cgsJt4Vnxl6u8N6co,10840
|
30
|
-
tinygrad/runtime/ops_disk.py,sha256=75-iihZxkhNvA5O3VaW61LOXwmlSX4XwegpnV1C4D5A,2738
|
31
|
-
tinygrad/runtime/ops_gpu.py,sha256=FB3Fp-VVEDGEt_6CfJxsM_TWzhp5giXCP1TSSRMXE80,7532
|
32
|
-
tinygrad/runtime/ops_hsa.py,sha256=YNQLqZjJ9twTJRKS41l2oIrncOAu3wdOdBegs9zYlgo,16188
|
33
|
-
tinygrad/runtime/ops_llvm.py,sha256=dODiyVSlPofHyDIZrD-V74h8W1d94VPnr_-A4gNbSO4,2229
|
34
|
-
tinygrad/runtime/ops_metal.py,sha256=fGSNpwmYIHaif9a5SiwyMX2bub-r5hTNpnrqlaPMeUc,5815
|
35
|
-
tinygrad/runtime/ops_npy.py,sha256=qaAi0AEo6nt7iZ-eWqM8z2aQfNJgZUpmBCEDmrIzWL0,369
|
36
|
-
tinygrad/runtime/ops_nv.py,sha256=PCMAHMrW4J7esgnkpwq3bB91Q3h6hBATr8JuykR9vcA,37633
|
37
|
-
tinygrad/runtime/ops_python.py,sha256=mmsDj1hJ3BtreAq5dfCuuUGbgoIhCKlVwNqMDmXBISs,10871
|
38
|
-
tinygrad/runtime/autogen/amd_gpu.py,sha256=1NDH0ualiZ8OtgTjaYcQ1HjKs_SQ7eUHuJvdrDodvCk,65022
|
39
|
-
tinygrad/runtime/autogen/comgr.py,sha256=Z99Y6K8D_nuMpOs0qDWiA0MV-RxueV65o2OyPFdcsHE,38563
|
40
|
-
tinygrad/runtime/autogen/cuda.py,sha256=GgRl4AfU54JG0G1XJj2dq2FbrUZ8XG_AnFrPAZJpSSg,250380
|
41
|
-
tinygrad/runtime/autogen/hip.py,sha256=1yUHDCwL3KkD15if2Q1Ud3GbJiR7DxsNorKZTCINw54,245532
|
42
|
-
tinygrad/runtime/autogen/hsa.py,sha256=tGpnXUhhQkAIEr0yyCxRImzajmt-nN0KzJn4KnT_bH8,270073
|
43
|
-
tinygrad/runtime/autogen/kfd.py,sha256=dDmLFL6HL_QXRW2rZOCArY55PRuXuLN9563XCipV2jM,29935
|
44
|
-
tinygrad/runtime/autogen/nv_gpu.py,sha256=K9WwwdIitHrY2AXpYy8bbdD9aEwdbz9vL7748pz6Re0,1672024
|
45
|
-
tinygrad/runtime/autogen/opencl.py,sha256=aW-luGFF5PXFEZ6SgrGANhA9qpkI-fZyEsmDfpez2Ss,82654
|
46
|
-
tinygrad/runtime/driver/hip_comgr.py,sha256=rFQRsOYo4XquwcHFTe2mGzMfozdL9hitO3DRYBDFSuM,3376
|
47
|
-
tinygrad/runtime/driver/hsa.py,sha256=PoNy8gHBPoRUhUZligFp0z_Le9fyEXbJrnlgwInt_R0,7218
|
48
|
-
tinygrad/runtime/graph/clang.py,sha256=10Bs64J0r12g6upqCHVoK3LoTrdbBBHQD43efMhlBjo,1957
|
49
|
-
tinygrad/runtime/graph/cuda.py,sha256=LNx6RQLcQSKMlHfVK5r_efujN0lRPhKqi8yp249OAIs,5265
|
50
|
-
tinygrad/runtime/graph/hcq.py,sha256=mspwzroBTwNNHDob7oK-JCt48mhuIhX_G0qNYvFVuVM,8089
|
51
|
-
tinygrad/runtime/graph/hsa.py,sha256=UJgSg2irrKT87LBZ3DfaGmoK7rJk8OZhIHEHhtF8rUE,10035
|
52
|
-
tinygrad/runtime/graph/metal.py,sha256=bwB6uAsqjEbwv5ML5ziWduBtmTpseJboo6J9ssVa4v4,4579
|
53
|
-
tinygrad/shape/shapetracker.py,sha256=hWqh2uWsbBp3lKlRpY8Fj1oTWvEx1YwVsKl0QiA-QnU,6334
|
54
|
-
tinygrad/shape/symbolic.py,sha256=hn2khLoHAJSwyZ91i679oJZCLTaz0Sf2dUG-HRJMtVw,16688
|
55
|
-
tinygrad/shape/view.py,sha256=KMf_KzNwXmcX1NbFPq862-Jv_E6TgeO27lcPjrAweF4,17092
|
56
|
-
tinygrad-0.9.0.dist-info/LICENSE,sha256=ABRhUPEILzINYIukgazD-_rPipkUNUwslrb0RxnV6Xc,1058
|
57
|
-
tinygrad-0.9.0.dist-info/METADATA,sha256=oyGO3WSmMQ7NTAK3RGk0ZXCkr-L3XKltKYhYrKEuifk,10227
|
58
|
-
tinygrad-0.9.0.dist-info/WHEEL,sha256=GJ7t_kWBFywbagK5eo9IoUwLW6oyOeTKmQ-9iHFVNxQ,92
|
59
|
-
tinygrad-0.9.0.dist-info/top_level.txt,sha256=vDABMCWBFQnx2kn9Azueu88FP-1klQdePoHikQhHymc,9
|
60
|
-
tinygrad-0.9.0.dist-info/RECORD,,
|
File without changes
|
File without changes
|