tinygrad 0.9.0__py3-none-any.whl → 0.9.1__py3-none-any.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
Files changed (57)
  1. tinygrad/codegen/__init__.py +0 -0
  2. tinygrad/codegen/kernel.py +78 -90
  3. tinygrad/codegen/linearizer.py +237 -169
  4. tinygrad/codegen/uops.py +278 -242
  5. tinygrad/device.py +147 -10
  6. tinygrad/dtype.py +7 -7
  7. tinygrad/engine/graph.py +16 -16
  8. tinygrad/engine/jit.py +39 -36
  9. tinygrad/engine/realize.py +6 -5
  10. tinygrad/engine/schedule.py +15 -7
  11. tinygrad/engine/search.py +6 -3
  12. tinygrad/function.py +17 -23
  13. tinygrad/helpers.py +77 -8
  14. tinygrad/lazy.py +26 -26
  15. tinygrad/multi.py +13 -9
  16. tinygrad/nn/__init__.py +1 -1
  17. tinygrad/nn/datasets.py +2 -1
  18. tinygrad/nn/state.py +3 -4
  19. tinygrad/ops.py +49 -16
  20. tinygrad/renderer/__init__.py +8 -4
  21. tinygrad/renderer/assembly.py +93 -100
  22. tinygrad/renderer/cstyle.py +47 -42
  23. tinygrad/renderer/llvmir.py +30 -30
  24. tinygrad/runtime/__init__.py +0 -0
  25. tinygrad/runtime/autogen/amd_gpu.py +11504 -1
  26. tinygrad/runtime/autogen/comgr.py +36 -10
  27. tinygrad/runtime/autogen/hsa.py +146 -14
  28. tinygrad/runtime/autogen/io_uring.py +1486 -0
  29. tinygrad/runtime/autogen/nv_gpu.py +269 -0
  30. tinygrad/runtime/driver/__init__.py +0 -0
  31. tinygrad/runtime/driver/hip_comgr.py +20 -11
  32. tinygrad/runtime/graph/__init__.py +0 -0
  33. tinygrad/runtime/graph/clang.py +3 -2
  34. tinygrad/runtime/graph/cuda.py +2 -2
  35. tinygrad/runtime/graph/hcq.py +122 -78
  36. tinygrad/runtime/ops_amd.py +302 -316
  37. tinygrad/runtime/ops_cuda.py +3 -3
  38. tinygrad/runtime/ops_disk.py +70 -5
  39. tinygrad/runtime/ops_gpu.py +2 -2
  40. tinygrad/runtime/ops_metal.py +5 -6
  41. tinygrad/runtime/ops_npy.py +1 -1
  42. tinygrad/runtime/ops_nv.py +161 -166
  43. tinygrad/runtime/ops_python.py +20 -16
  44. tinygrad/shape/__init__.py +0 -0
  45. tinygrad/shape/shapetracker.py +5 -2
  46. tinygrad/shape/symbolic.py +1 -3
  47. tinygrad/shape/view.py +34 -19
  48. tinygrad/tensor.py +219 -135
  49. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/METADATA +14 -6
  50. tinygrad-0.9.1.dist-info/RECORD +63 -0
  51. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/WHEEL +1 -1
  52. tinygrad/runtime/driver/hsa.py +0 -143
  53. tinygrad/runtime/graph/hsa.py +0 -171
  54. tinygrad/runtime/ops_hsa.py +0 -278
  55. tinygrad-0.9.0.dist-info/RECORD +0 -60
  56. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/LICENSE +0 -0
  57. {tinygrad-0.9.0.dist-info → tinygrad-0.9.1.dist-info}/top_level.txt +0 -0
tinygrad/runtime/autogen/nv_gpu.py
@@ -33326,3 +33326,272 @@ __all__ = \
  'union_NV2080_CTRL_NVLINK_CALLBACK_TYPE_callbackParams',
  'union_NV2080_CTRL_NVLINK_INJECT_TLC_ERROR_TYPE',
  'union_RM_GSP_SPDM_CMD', 'union_c__SA_NVOS32_PARAMETERS_data']
+ nv_status_codes = {}
+ NV_OK = 0x00000000
+ nv_status_codes[NV_OK] = "Success"
+ NV_ERR_GENERIC = 0x0000FFFF
+ nv_status_codes[NV_ERR_GENERIC] = "Failure: Generic Error"
+ NV_ERR_BROKEN_FB = 0x00000001
+ nv_status_codes[NV_ERR_BROKEN_FB] = "Frame-Buffer broken"
+ NV_ERR_BUFFER_TOO_SMALL = 0x00000002
+ nv_status_codes[NV_ERR_BUFFER_TOO_SMALL] = "Buffer passed in is too small"
+ NV_ERR_BUSY_RETRY = 0x00000003
+ nv_status_codes[NV_ERR_BUSY_RETRY] = "System is busy, retry later"
+ NV_ERR_CALLBACK_NOT_SCHEDULED = 0x00000004
+ nv_status_codes[NV_ERR_CALLBACK_NOT_SCHEDULED] = "The requested callback API not scheduled"
+ NV_ERR_CARD_NOT_PRESENT = 0x00000005
+ nv_status_codes[NV_ERR_CARD_NOT_PRESENT] = "Card not detected"
+ NV_ERR_CYCLE_DETECTED = 0x00000006
+ nv_status_codes[NV_ERR_CYCLE_DETECTED] = "Call cycle detected"
+ NV_ERR_DMA_IN_USE = 0x00000007
+ nv_status_codes[NV_ERR_DMA_IN_USE] = "Requested DMA is in use"
+ NV_ERR_DMA_MEM_NOT_LOCKED = 0x00000008
+ nv_status_codes[NV_ERR_DMA_MEM_NOT_LOCKED] = "Requested DMA memory is not locked"
+ NV_ERR_DMA_MEM_NOT_UNLOCKED = 0x00000009
+ nv_status_codes[NV_ERR_DMA_MEM_NOT_UNLOCKED] = "Requested DMA memory is not unlocked"
+ NV_ERR_DUAL_LINK_INUSE = 0x0000000A
+ nv_status_codes[NV_ERR_DUAL_LINK_INUSE] = "Dual-Link is in use"
+ NV_ERR_ECC_ERROR = 0x0000000B
+ nv_status_codes[NV_ERR_ECC_ERROR] = "Generic ECC error"
+ NV_ERR_FIFO_BAD_ACCESS = 0x0000000C
+ nv_status_codes[NV_ERR_FIFO_BAD_ACCESS] = "FIFO: Invalid access"
+ NV_ERR_FREQ_NOT_SUPPORTED = 0x0000000D
+ nv_status_codes[NV_ERR_FREQ_NOT_SUPPORTED] = "Requested frequency is not supported"
+ NV_ERR_GPU_DMA_NOT_INITIALIZED = 0x0000000E
+ nv_status_codes[NV_ERR_GPU_DMA_NOT_INITIALIZED] = "Requested DMA not initialized"
+ NV_ERR_GPU_IS_LOST = 0x0000000F
+ nv_status_codes[NV_ERR_GPU_IS_LOST] = "GPU lost from the bus"
+ NV_ERR_GPU_IN_FULLCHIP_RESET = 0x00000010
+ nv_status_codes[NV_ERR_GPU_IN_FULLCHIP_RESET] = "GPU currently in full-chip reset"
+ NV_ERR_GPU_NOT_FULL_POWER = 0x00000011
+ nv_status_codes[NV_ERR_GPU_NOT_FULL_POWER] = "GPU not in full power"
+ NV_ERR_GPU_UUID_NOT_FOUND = 0x00000012
+ nv_status_codes[NV_ERR_GPU_UUID_NOT_FOUND] = "GPU UUID not found"
+ NV_ERR_HOT_SWITCH = 0x00000013
+ nv_status_codes[NV_ERR_HOT_SWITCH] = "System in hot switch"
+ NV_ERR_I2C_ERROR = 0x00000014
+ nv_status_codes[NV_ERR_I2C_ERROR] = "I2C Error"
+ NV_ERR_I2C_SPEED_TOO_HIGH = 0x00000015
+ nv_status_codes[NV_ERR_I2C_SPEED_TOO_HIGH] = "I2C Error: Speed too high"
+ NV_ERR_ILLEGAL_ACTION = 0x00000016
+ nv_status_codes[NV_ERR_ILLEGAL_ACTION] = "Current action is not allowed"
+ NV_ERR_IN_USE = 0x00000017
+ nv_status_codes[NV_ERR_IN_USE] = "Generic busy error"
+ NV_ERR_INFLATE_COMPRESSED_DATA_FAILED = 0x00000018
+ nv_status_codes[NV_ERR_INFLATE_COMPRESSED_DATA_FAILED] = "Failed to inflate compressed data"
+ NV_ERR_INSERT_DUPLICATE_NAME = 0x00000019
+ nv_status_codes[NV_ERR_INSERT_DUPLICATE_NAME] = "Found a duplicate entry in the requested btree"
+ NV_ERR_INSUFFICIENT_RESOURCES = 0x0000001A
+ nv_status_codes[NV_ERR_INSUFFICIENT_RESOURCES] = "Ran out of a critical resource, other than memory"
+ NV_ERR_INSUFFICIENT_PERMISSIONS = 0x0000001B
+ nv_status_codes[NV_ERR_INSUFFICIENT_PERMISSIONS] = "The requester does not have sufficient permissions"
+ NV_ERR_INSUFFICIENT_POWER = 0x0000001C
+ nv_status_codes[NV_ERR_INSUFFICIENT_POWER] = "Generic Error: Low power"
+ NV_ERR_INVALID_ACCESS_TYPE = 0x0000001D
+ nv_status_codes[NV_ERR_INVALID_ACCESS_TYPE] = "This type of access is not allowed"
+ NV_ERR_INVALID_ADDRESS = 0x0000001E
+ nv_status_codes[NV_ERR_INVALID_ADDRESS] = "Address not valid"
+ NV_ERR_INVALID_ARGUMENT = 0x0000001F
+ nv_status_codes[NV_ERR_INVALID_ARGUMENT] = "Invalid argument to call"
+ NV_ERR_INVALID_BASE = 0x00000020
+ nv_status_codes[NV_ERR_INVALID_BASE] = "Invalid base"
+ NV_ERR_INVALID_CHANNEL = 0x00000021
+ nv_status_codes[NV_ERR_INVALID_CHANNEL] = "Given channel-id not valid"
+ NV_ERR_INVALID_CLASS = 0x00000022
+ nv_status_codes[NV_ERR_INVALID_CLASS] = "Given class-id not valid"
+ NV_ERR_INVALID_CLIENT = 0x00000023
+ nv_status_codes[NV_ERR_INVALID_CLIENT] = "Given client not valid"
+ NV_ERR_INVALID_COMMAND = 0x00000024
+ nv_status_codes[NV_ERR_INVALID_COMMAND] = "Command passed is not valid"
+ NV_ERR_INVALID_DATA = 0x00000025
+ nv_status_codes[NV_ERR_INVALID_DATA] = "Invalid data passed"
+ NV_ERR_INVALID_DEVICE = 0x00000026
+ nv_status_codes[NV_ERR_INVALID_DEVICE] = "Current device is not valid"
+ NV_ERR_INVALID_DMA_SPECIFIER = 0x00000027
+ nv_status_codes[NV_ERR_INVALID_DMA_SPECIFIER] = "The requested DMA specifier is not valid"
+ NV_ERR_INVALID_EVENT = 0x00000028
+ nv_status_codes[NV_ERR_INVALID_EVENT] = "Invalid event occurred"
+ NV_ERR_INVALID_FLAGS = 0x00000029
+ nv_status_codes[NV_ERR_INVALID_FLAGS] = "Invalid flags passed"
+ NV_ERR_INVALID_FUNCTION = 0x0000002A
+ nv_status_codes[NV_ERR_INVALID_FUNCTION] = "Called function is not valid"
+ NV_ERR_INVALID_HEAP = 0x0000002B
+ nv_status_codes[NV_ERR_INVALID_HEAP] = "Heap corrupted"
+ NV_ERR_INVALID_INDEX = 0x0000002C
+ nv_status_codes[NV_ERR_INVALID_INDEX] = "Index invalid"
+ NV_ERR_INVALID_IRQ_LEVEL = 0x0000002D
+ nv_status_codes[NV_ERR_INVALID_IRQ_LEVEL] = "Requested IRQ level is not valid"
+ NV_ERR_INVALID_LIMIT = 0x0000002E
+ nv_status_codes[NV_ERR_INVALID_LIMIT] = "Generic Error: Invalid limit"
+ NV_ERR_INVALID_LOCK_STATE = 0x0000002F
+ nv_status_codes[NV_ERR_INVALID_LOCK_STATE] = "Requested lock state not valid"
+ NV_ERR_INVALID_METHOD = 0x00000030
+ nv_status_codes[NV_ERR_INVALID_METHOD] = "Requested method not valid"
+ NV_ERR_INVALID_OBJECT = 0x00000031
+ nv_status_codes[NV_ERR_INVALID_OBJECT] = "Object not valid"
+ NV_ERR_INVALID_OBJECT_BUFFER = 0x00000032
+ nv_status_codes[NV_ERR_INVALID_OBJECT_BUFFER] = "Object buffer passed is not valid"
+ NV_ERR_INVALID_OBJECT_HANDLE = 0x00000033
+ nv_status_codes[NV_ERR_INVALID_OBJECT_HANDLE] = "Object handle is not valid"
+ NV_ERR_INVALID_OBJECT_NEW = 0x00000034
+ nv_status_codes[NV_ERR_INVALID_OBJECT_NEW] = "New object is not valid"
+ NV_ERR_INVALID_OBJECT_OLD = 0x00000035
+ nv_status_codes[NV_ERR_INVALID_OBJECT_OLD] = "Old object is not valid"
+ NV_ERR_INVALID_OBJECT_PARENT = 0x00000036
+ nv_status_codes[NV_ERR_INVALID_OBJECT_PARENT] = "Object parent is not valid"
+ NV_ERR_INVALID_OFFSET = 0x00000037
+ nv_status_codes[NV_ERR_INVALID_OFFSET] = "The offset passed is not valid"
+ NV_ERR_INVALID_OPERATION = 0x00000038
+ nv_status_codes[NV_ERR_INVALID_OPERATION] = "Requested operation is not valid"
+ NV_ERR_INVALID_OWNER = 0x00000039
+ nv_status_codes[NV_ERR_INVALID_OWNER] = "Owner not valid"
+ NV_ERR_INVALID_PARAM_STRUCT = 0x0000003A
+ nv_status_codes[NV_ERR_INVALID_PARAM_STRUCT] = "Invalid structure parameter"
+ NV_ERR_INVALID_PARAMETER = 0x0000003B
+ nv_status_codes[NV_ERR_INVALID_PARAMETER] = "At least one of the parameters passed is not valid"
+ NV_ERR_INVALID_PATH = 0x0000003C
+ nv_status_codes[NV_ERR_INVALID_PATH] = "The requested path is not valid"
+ NV_ERR_INVALID_POINTER = 0x0000003D
+ nv_status_codes[NV_ERR_INVALID_POINTER] = "Pointer not valid"
+ NV_ERR_INVALID_REGISTRY_KEY = 0x0000003E
+ nv_status_codes[NV_ERR_INVALID_REGISTRY_KEY] = "Found an invalid registry key"
+ NV_ERR_INVALID_REQUEST = 0x0000003F
+ nv_status_codes[NV_ERR_INVALID_REQUEST] = "Generic Error: Invalid request"
+ NV_ERR_INVALID_STATE = 0x00000040
+ nv_status_codes[NV_ERR_INVALID_STATE] = "Generic Error: Invalid state"
+ NV_ERR_INVALID_STRING_LENGTH = 0x00000041
+ nv_status_codes[NV_ERR_INVALID_STRING_LENGTH] = "The string length is not valid"
+ NV_ERR_INVALID_READ = 0x00000042
+ nv_status_codes[NV_ERR_INVALID_READ] = "The requested read operation is not valid"
+ NV_ERR_INVALID_WRITE = 0x00000043
+ nv_status_codes[NV_ERR_INVALID_WRITE] = "The requested write operation is not valid"
+ NV_ERR_INVALID_XLATE = 0x00000044
+ nv_status_codes[NV_ERR_INVALID_XLATE] = "The requested translate operation is not valid"
+ NV_ERR_IRQ_NOT_FIRING = 0x00000045
+ nv_status_codes[NV_ERR_IRQ_NOT_FIRING] = "Requested IRQ is not firing"
+ NV_ERR_IRQ_EDGE_TRIGGERED = 0x00000046
+ nv_status_codes[NV_ERR_IRQ_EDGE_TRIGGERED] = "IRQ is edge triggered"
+ NV_ERR_MEMORY_TRAINING_FAILED = 0x00000047
+ nv_status_codes[NV_ERR_MEMORY_TRAINING_FAILED] = "Failed memory training sequence"
+ NV_ERR_MISMATCHED_SLAVE = 0x00000048
+ nv_status_codes[NV_ERR_MISMATCHED_SLAVE] = "Slave mismatch"
+ NV_ERR_MISMATCHED_TARGET = 0x00000049
+ nv_status_codes[NV_ERR_MISMATCHED_TARGET] = "Target mismatch"
+ NV_ERR_MISSING_TABLE_ENTRY = 0x0000004A
+ nv_status_codes[NV_ERR_MISSING_TABLE_ENTRY] = "Requested entry missing not found in the table"
+ NV_ERR_MODULE_LOAD_FAILED = 0x0000004B
+ nv_status_codes[NV_ERR_MODULE_LOAD_FAILED] = "Failed to load the requested module"
+ NV_ERR_MORE_DATA_AVAILABLE = 0x0000004C
+ nv_status_codes[NV_ERR_MORE_DATA_AVAILABLE] = "There is more data available"
+ NV_ERR_MORE_PROCESSING_REQUIRED = 0x0000004D
+ nv_status_codes[NV_ERR_MORE_PROCESSING_REQUIRED] = "More processing required for the given call"
+ NV_ERR_MULTIPLE_MEMORY_TYPES = 0x0000004E
+ nv_status_codes[NV_ERR_MULTIPLE_MEMORY_TYPES] = "Multiple memory types found"
+ NV_ERR_NO_FREE_FIFOS = 0x0000004F
+ nv_status_codes[NV_ERR_NO_FREE_FIFOS] = "No more free FIFOs found"
+ NV_ERR_NO_INTR_PENDING = 0x00000050
+ nv_status_codes[NV_ERR_NO_INTR_PENDING] = "No interrupt pending"
+ NV_ERR_NO_MEMORY = 0x00000051
+ nv_status_codes[NV_ERR_NO_MEMORY] = "Out of memory"
+ NV_ERR_NO_SUCH_DOMAIN = 0x00000052
+ nv_status_codes[NV_ERR_NO_SUCH_DOMAIN] = "Requested domain does not exist"
+ NV_ERR_NO_VALID_PATH = 0x00000053
+ nv_status_codes[NV_ERR_NO_VALID_PATH] = "Caller did not specify a valid path"
+ NV_ERR_NOT_COMPATIBLE = 0x00000054
+ nv_status_codes[NV_ERR_NOT_COMPATIBLE] = "Generic Error: Incompatible types"
+ NV_ERR_NOT_READY = 0x00000055
+ nv_status_codes[NV_ERR_NOT_READY] = "Generic Error: Not ready"
+ NV_ERR_NOT_SUPPORTED = 0x00000056
+ nv_status_codes[NV_ERR_NOT_SUPPORTED] = "Call not supported"
+ NV_ERR_OBJECT_NOT_FOUND = 0x00000057
+ nv_status_codes[NV_ERR_OBJECT_NOT_FOUND] = "Requested object not found"
+ NV_ERR_OBJECT_TYPE_MISMATCH = 0x00000058
+ nv_status_codes[NV_ERR_OBJECT_TYPE_MISMATCH] = "Specified objects do not match"
+ NV_ERR_OPERATING_SYSTEM = 0x00000059
+ nv_status_codes[NV_ERR_OPERATING_SYSTEM] = "Generic operating system error"
+ NV_ERR_OTHER_DEVICE_FOUND = 0x0000005A
+ nv_status_codes[NV_ERR_OTHER_DEVICE_FOUND] = "Found other device instead of the requested one"
+ NV_ERR_OUT_OF_RANGE = 0x0000005B
+ nv_status_codes[NV_ERR_OUT_OF_RANGE] = "The specified value is out of bounds"
+ NV_ERR_OVERLAPPING_UVM_COMMIT = 0x0000005C
+ nv_status_codes[NV_ERR_OVERLAPPING_UVM_COMMIT] = "Overlapping unified virtual memory commit"
+ NV_ERR_PAGE_TABLE_NOT_AVAIL = 0x0000005D
+ nv_status_codes[NV_ERR_PAGE_TABLE_NOT_AVAIL] = "Requested page table not available"
+ NV_ERR_PID_NOT_FOUND = 0x0000005E
+ nv_status_codes[NV_ERR_PID_NOT_FOUND] = "Process-Id not found"
+ NV_ERR_PROTECTION_FAULT = 0x0000005F
+ nv_status_codes[NV_ERR_PROTECTION_FAULT] = "Protection fault"
+ NV_ERR_RC_ERROR = 0x00000060
+ nv_status_codes[NV_ERR_RC_ERROR] = "Generic RC error"
+ NV_ERR_REJECTED_VBIOS = 0x00000061
+ nv_status_codes[NV_ERR_REJECTED_VBIOS] = "Given Video BIOS rejected/invalid"
+ NV_ERR_RESET_REQUIRED = 0x00000062
+ nv_status_codes[NV_ERR_RESET_REQUIRED] = "Reset required"
+ NV_ERR_STATE_IN_USE = 0x00000063
+ nv_status_codes[NV_ERR_STATE_IN_USE] = "State in use"
+ NV_ERR_SIGNAL_PENDING = 0x00000064
+ nv_status_codes[NV_ERR_SIGNAL_PENDING] = "Signal pending"
+ NV_ERR_TIMEOUT = 0x00000065
+ nv_status_codes[NV_ERR_TIMEOUT] = "Call timed out"
+ NV_ERR_TIMEOUT_RETRY = 0x00000066
+ nv_status_codes[NV_ERR_TIMEOUT_RETRY] = "Call timed out, please retry later"
+ NV_ERR_TOO_MANY_PRIMARIES = 0x00000067
+ nv_status_codes[NV_ERR_TOO_MANY_PRIMARIES] = "Too many primaries"
+ NV_ERR_UVM_ADDRESS_IN_USE = 0x00000068
+ nv_status_codes[NV_ERR_UVM_ADDRESS_IN_USE] = "Unified virtual memory requested address already in use"
+ NV_ERR_MAX_SESSION_LIMIT_REACHED = 0x00000069
+ nv_status_codes[NV_ERR_MAX_SESSION_LIMIT_REACHED] = "Maximum number of sessions reached"
+ NV_ERR_LIB_RM_VERSION_MISMATCH = 0x0000006A
+ nv_status_codes[NV_ERR_LIB_RM_VERSION_MISMATCH] = "Library version doesn't match driver version"
+ NV_ERR_PRIV_SEC_VIOLATION = 0x0000006B
+ nv_status_codes[NV_ERR_PRIV_SEC_VIOLATION] = "Priv security violation"
+ NV_ERR_GPU_IN_DEBUG_MODE = 0x0000006C
+ nv_status_codes[NV_ERR_GPU_IN_DEBUG_MODE] = "GPU currently in debug mode"
+ NV_ERR_FEATURE_NOT_ENABLED = 0x0000006D
+ nv_status_codes[NV_ERR_FEATURE_NOT_ENABLED] = "Requested Feature functionality is not enabled"
+ NV_ERR_RESOURCE_LOST = 0x0000006E
+ nv_status_codes[NV_ERR_RESOURCE_LOST] = "Requested resource has been destroyed"
+ NV_ERR_PMU_NOT_READY = 0x0000006F
+ nv_status_codes[NV_ERR_PMU_NOT_READY] = "PMU is not ready or has not yet been initialized"
+ NV_ERR_FLCN_ERROR = 0x00000070
+ nv_status_codes[NV_ERR_FLCN_ERROR] = "Generic falcon assert or halt"
+ NV_ERR_FATAL_ERROR = 0x00000071
+ nv_status_codes[NV_ERR_FATAL_ERROR] = "Fatal/unrecoverable error"
+ NV_ERR_MEMORY_ERROR = 0x00000072
+ nv_status_codes[NV_ERR_MEMORY_ERROR] = "Generic memory error"
+ NV_ERR_INVALID_LICENSE = 0x00000073
+ nv_status_codes[NV_ERR_INVALID_LICENSE] = "License provided is rejected or invalid"
+ NV_ERR_NVLINK_INIT_ERROR = 0x00000074
+ nv_status_codes[NV_ERR_NVLINK_INIT_ERROR] = "Nvlink Init Error"
+ NV_ERR_NVLINK_MINION_ERROR = 0x00000075
+ nv_status_codes[NV_ERR_NVLINK_MINION_ERROR] = "Nvlink Minion Error"
+ NV_ERR_NVLINK_CLOCK_ERROR = 0x00000076
+ nv_status_codes[NV_ERR_NVLINK_CLOCK_ERROR] = "Nvlink Clock Error"
+ NV_ERR_NVLINK_TRAINING_ERROR = 0x00000077
+ nv_status_codes[NV_ERR_NVLINK_TRAINING_ERROR] = "Nvlink Training Error"
+ NV_ERR_NVLINK_CONFIGURATION_ERROR = 0x00000078
+ nv_status_codes[NV_ERR_NVLINK_CONFIGURATION_ERROR] = "Nvlink Configuration Error"
+ NV_ERR_RISCV_ERROR = 0x00000079
+ nv_status_codes[NV_ERR_RISCV_ERROR] = "Generic RISC-V assert or halt"
+ NV_ERR_FABRIC_MANAGER_NOT_PRESENT = 0x0000007A
+ nv_status_codes[NV_ERR_FABRIC_MANAGER_NOT_PRESENT] = "Fabric Manager is not loaded"
+ NV_ERR_ALREADY_SIGNALLED = 0x0000007B
+ nv_status_codes[NV_ERR_ALREADY_SIGNALLED] = "Semaphore Surface value already >= requested wait value"
+ NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE = 0x0000007C
+ nv_status_codes[NV_ERR_QUEUE_TASK_SLOT_NOT_AVAILABLE] = "PMU RPC error due to no queue slot available for this event"
+ NV_WARN_HOT_SWITCH = 0x00010001
+ nv_status_codes[NV_WARN_HOT_SWITCH] = "WARNING Hot switch"
+ NV_WARN_INCORRECT_PERFMON_DATA = 0x00010002
+ nv_status_codes[NV_WARN_INCORRECT_PERFMON_DATA] = "WARNING Incorrect performance monitor data"
+ NV_WARN_MISMATCHED_SLAVE = 0x00010003
+ nv_status_codes[NV_WARN_MISMATCHED_SLAVE] = "WARNING Slave mismatch"
+ NV_WARN_MISMATCHED_TARGET = 0x00010004
+ nv_status_codes[NV_WARN_MISMATCHED_TARGET] = "WARNING Target mismatch"
+ NV_WARN_MORE_PROCESSING_REQUIRED = 0x00010005
+ nv_status_codes[NV_WARN_MORE_PROCESSING_REQUIRED] = "WARNING More processing required for the call"
+ NV_WARN_NOTHING_TO_DO = 0x00010006
+ nv_status_codes[NV_WARN_NOTHING_TO_DO] = "WARNING Nothing to do"
+ NV_WARN_NULL_OBJECT = 0x00010007
+ nv_status_codes[NV_WARN_NULL_OBJECT] = "WARNING NULL object found"
+ NV_WARN_OUT_OF_RANGE = 0x00010008
+ nv_status_codes[NV_WARN_OUT_OF_RANGE] = "WARNING value out of range"
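
Note: the block above adds a human-readable lookup table for NVIDIA RM status codes to the autogenerated nv_gpu module. A minimal sketch of how a caller might use it to turn a raw status into a Python exception (check_nv_status and the op string are hypothetical helpers, not part of the diff; only nv_status_codes and NV_OK are):

from tinygrad.runtime.autogen.nv_gpu import nv_status_codes, NV_OK

def check_nv_status(status: int, op: str = "rm ioctl") -> None:
  # Map a raw NV status to its message; unknown codes fall back to hex
  # so nothing is silently swallowed.
  if status != NV_OK:
    msg = nv_status_codes.get(status, "unknown status")
    raise RuntimeError(f"{op} failed: {msg} ({status:#010x})")

check_nv_status(NV_OK)          # no-op on success
# check_nv_status(0x00000051)   # would raise: "Out of memory"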
tinygrad/runtime/driver/hip_comgr.py
@@ -14,7 +14,7 @@ def _get_comgr_data(data_set, data_type):
  return bytes(dat)

  # AMD_COMGR_SAVE_TEMPS=1 AMD_COMGR_REDIRECT_LOGS=stdout AMD_COMGR_EMIT_VERBOSE_LOGS=1
- def compile_hip(prg:str, arch="gfx1100") -> bytes:
+ def compile_hip(prg:str, arch="gfx1100", asm=False) -> bytes:
  check(comgr.amd_comgr_create_action_info(ctypes.byref(action_info := comgr.amd_comgr_action_info_t())))
  check(comgr.amd_comgr_action_info_set_language(action_info, comgr.AMD_COMGR_LANGUAGE_HIP))
  check(comgr.amd_comgr_action_info_set_isa_name(action_info, b"amdgcn-amd-amdhsa--" + arch.encode()))
@@ -27,17 +27,26 @@ def compile_hip(prg:str, arch="gfx1100") -> bytes:

  check(comgr.amd_comgr_create_data(comgr.AMD_COMGR_DATA_KIND_SOURCE, ctypes.byref(data_src := comgr.amd_comgr_data_t())))
  check(comgr.amd_comgr_set_data(data_src, len(rprg := prg.encode()), rprg))
- check(comgr.amd_comgr_set_data_name(data_src, b"<null>"))

- check(comgr.amd_comgr_data_set_add(data_set_src, data_src))
- # -include hiprtc_runtime.h was removed
- check(comgr.amd_comgr_action_info_set_options(action_info, f"-O3 -mcumode --hip-version=6.0.32830 -DHIP_VERSION_MAJOR=6 -DHIP_VERSION_MINOR=0 -DHIP_VERSION_PATCH=32830 -D__HIPCC_RTC__ -std=c++14 -nogpuinc -Wno-gnu-line-marker -Wno-missing-prototypes --offload-arch={arch} -I/opt/rocm/include -Xclang -disable-llvm-passes".encode())) # noqa: E501
- status = comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, action_info, data_set_src, data_set_bc)
- if status != 0:
- print(_get_comgr_data(data_set_bc, comgr.AMD_COMGR_DATA_KIND_LOG).decode())
- raise RuntimeError("compile failed")
- check(comgr.amd_comgr_action_info_set_options(action_info, b"-O3 -mllvm -amdgpu-internalize-symbols"))
- check(comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, action_info, data_set_bc, data_set_reloc))
+ if asm:
+ check(comgr.amd_comgr_set_data_name(data_src, b"<null>.s"))
+ check(comgr.amd_comgr_data_set_add(data_set_src, data_src))
+ status = comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_ASSEMBLE_SOURCE_TO_RELOCATABLE, action_info, data_set_src, data_set_reloc)
+ if status != 0:
+ print(_get_comgr_data(data_set_reloc, comgr.AMD_COMGR_DATA_KIND_LOG).decode())
+ raise RuntimeError("assemble failed")
+ else:
+ check(comgr.amd_comgr_set_data_name(data_src, b"<null>"))
+ check(comgr.amd_comgr_data_set_add(data_set_src, data_src))
+ # -include hiprtc_runtime.h was removed
+ check(comgr.amd_comgr_action_info_set_options(action_info, f"-O3 -mcumode --hip-version=6.0.32830 -DHIP_VERSION_MAJOR=6 -DHIP_VERSION_MINOR=0 -DHIP_VERSION_PATCH=32830 -D__HIPCC_RTC__ -std=c++14 -nogpuinc -Wno-gnu-line-marker -Wno-missing-prototypes --offload-arch={arch} -I/opt/rocm/include -Xclang -disable-llvm-passes".encode())) # noqa: E501
+ status = comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_COMPILE_SOURCE_WITH_DEVICE_LIBS_TO_BC, action_info, data_set_src, data_set_bc)
+ if status != 0:
+ print(_get_comgr_data(data_set_bc, comgr.AMD_COMGR_DATA_KIND_LOG).decode())
+ raise RuntimeError("compile failed")
+ check(comgr.amd_comgr_action_info_set_options(action_info, b"-O3 -mllvm -amdgpu-internalize-symbols"))
+ check(comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, action_info, data_set_bc, data_set_reloc))
+
  check(comgr.amd_comgr_action_info_set_options(action_info, b""))
  check(comgr.amd_comgr_do_action(comgr.AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action_info, data_set_reloc, data_set_exec))
  ret = _get_comgr_data(data_set_exec, comgr.AMD_COMGR_DATA_KIND_EXECUTABLE)
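
Note: compile_hip now takes an asm flag. When set, the source is treated as AMD GCN assembly and run through comgr's ASSEMBLE_SOURCE_TO_RELOCATABLE action (note the ".s" data name) instead of the HIP C++ compile-to-bitcode path; both paths then share the same link step. A hedged usage sketch, assuming a working ROCm comgr install (the kernel source is illustrative, not from the diff):

from tinygrad.runtime.driver.hip_comgr import compile_hip

# Default path: HIP C++ source -> device libs -> bitcode -> executable.
hip_src = 'extern "C" __global__ void add1(float* x) { x[0] += 1.0f; }'
lib = compile_hip(hip_src, arch="gfx1100")
print(len(lib), "bytes of code object")

# New path: hand-written assembly is assembled, not compiled. A real
# kernel also needs target/metadata directives; this only shows how the
# flag is wired.
# lib = compile_hip(asm_src, arch="gfx1100", asm=True)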
tinygrad/runtime/graph/clang.py
@@ -16,7 +16,7 @@ class ClangGraph(GraphRunner):

  prgs = '\n'.join(dedup([cast(CompiledRunner, ji.prg).p.src for ji in jit_cache]))
  args = [f"{render_dtype(x.dtype)}* arg{i}" for i,x in enumerate(input_rawbuffers)]
- args += [f"int {v.expr}" for v in var_vals]
+ args += sorted([f"int {v.expr}" for v in var_vals])
  code = ["void batched("+','.join(args)+") {"]
  for ji in jit_cache:
  args = []
@@ -35,4 +35,5 @@ class ClangGraph(GraphRunner):
  self.clprg = ClangProgram("batched", compiler.compile(prgs+"\n"+"\n".join(code))) # no point in caching the pointers

  def __call__(self, rawbufs: List[Buffer], var_vals: Dict[Variable, int], wait=False):
- return cpu_time_execution(lambda: self.clprg(*[x._buf for x in rawbufs], *[x for x in var_vals.values()]), enable=wait)
+ return cpu_time_execution(
+ lambda: self.clprg(*[x._buf for x in rawbufs], *[x[1] for x in sorted(var_vals.items(), key=lambda x: x[0].expr)]), enable=wait)
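
Note: both sides of this ClangGraph change sort by the variable's expr. The batched(...) signature sorts its int parameters, and the call in __call__ sorts var_vals.items() the same way, so values always line up with parameters regardless of dict insertion order. A small sketch of the invariant (plain strings stand in for Variable objects keyed by .expr):

var_vals = {"j": 7, "i": 3}                           # insertion order: j, i

params = sorted(f"int {name}" for name in var_vals)   # definition order
values = [v for _, v in sorted(var_vals.items())]     # call-site order
assert params == ["int i", "int j"] and values == [3, 7]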
tinygrad/runtime/graph/cuda.py
@@ -1,7 +1,7 @@
  import ctypes
  from typing import Any, Optional, Tuple, Dict, List, cast
  import tinygrad.runtime.autogen.cuda as cuda
- from tinygrad.helpers import init_c_var, GraphException
+ from tinygrad.helpers import init_c_var, GraphException, dedup
  from tinygrad.device import Buffer, Device
  from tinygrad.runtime.ops_cuda import CUDADevice, check, encode_args, cu_time_execution
  from tinygrad.shape.symbolic import Variable
@@ -15,7 +15,7 @@ class CUDAGraph(MultiGraphRunner):
  # Check all jit items are compatible.
  if not all(isinstance(ji.prg, (CompiledRunner, BufferXfer)) for ji in jit_cache): raise GraphException

- self.jc_idx_with_updatable_rawbufs = list(set([x[0] for x in self.input_replace.keys()]))
+ self.jc_idx_with_updatable_rawbufs = dedup([x[0] for x in self.input_replace.keys()])
  self.updatable_nodes: Dict[int, Tuple[Any, Any, Any, bool]] = {} # Dict[jc index] = tuple(graph node, node params, input kernel params, is memcpy)

  self.graph = init_c_var(cuda.CUgraph(), lambda x: check(cuda.cuGraphCreate(ctypes.byref(x), 0)))
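
Note: list(set(...)) discards the original order (set iteration order is arbitrary), so the list of jit-cache indices could differ between runs; dedup keeps first-seen order. A sketch of the behaviour, assuming dedup is the usual dict.fromkeys idiom from tinygrad/helpers.py:

def dedup(x):  # order-preserving de-duplication, as in tinygrad.helpers
  return list(dict.fromkeys(x))

jc_idxs = [2, 0, 2, 1, 0]
assert dedup(jc_idxs) == [2, 0, 1]   # stable: first occurrence wins
assert set(jc_idxs) == {0, 1, 2}     # a bare set has no defined order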
tinygrad/runtime/graph/hcq.py
@@ -1,134 +1,160 @@
- import ctypes, collections, array, time
+ import collections, array, time
  from typing import List, Any, Dict, cast, Optional, Tuple, Set
- from tinygrad.helpers import GraphException, round_up, to_mv, init_c_struct_t
+ from tinygrad.helpers import round_up, to_mv, PROFILE
  from tinygrad.device import Buffer, BufferOptions, Compiled, Device
  from tinygrad.shape.symbolic import Variable
  from tinygrad.engine.realize import ExecItem, BufferXfer, CompiledRunner
  from tinygrad.engine.jit import MultiGraphRunner

  class HCQGraph(MultiGraphRunner):
- def __init__(self, device_t, comp_hcq_t, copy_hcq_t, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
+ def __init__(self, jit_cache: List[ExecItem], input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int]):
  super().__init__(jit_cache, input_rawbuffers, var_vals)
- self.device_t, self.comp_hcq_t, self.copy_hcq_t = device_t, comp_hcq_t, copy_hcq_t
-
- # Check all jit items are compatible.
- self.devices = list(set(cast(self.device_t, d) for ji in jit_cache for d in [Device[cast(Buffer, x).device] for x in ji.bufs])) #type: ignore
- if any(not isinstance(d, self.device_t) for d in self.devices): raise GraphException
+ self.devices = list(set(cast(Any, d) for ji in jit_cache for d in [Device[cast(Buffer, x).device] for x in ji.bufs]))

  # Allocate kernel args.
  kernargs_size: Dict[Compiled, int] = collections.defaultdict(int)
  for ji in self.jit_cache:
  if not isinstance(ji.prg, CompiledRunner): continue
- kernargs_size[ji.prg.device] += round_up(ji.prg.clprg.kernargs_segment_size, 16)
- kernargs_ptrs: Dict[Compiled, int] = {dev:dev.allocator._alloc(sz, BufferOptions(cpu_access=True)).va_addr for dev,sz in kernargs_size.items()}
+ kernargs_size[ji.prg.device] += round_up(ji.prg.clprg.kernargs_alloc_size, 16)
+ self.kernargs_bufs: Dict[Compiled, Any] = {dev:dev.allocator._alloc(sz, BufferOptions(cpu_access=True)) for dev,sz in kernargs_size.items()}
+ kernargs_ptrs: Dict[Compiled, int] = {dev:buf.va_addr for dev,buf in self.kernargs_bufs.items()}

  # Fill initial arguments.
  self.kargs_addrs: Dict[int, int] = {}
- self.ji_kargs_structs: Dict[int, ctypes.Structure] = {}
+ self.ji_args_bufs: Dict[int, memoryview] = {}
+ self.ji_args_vars: Dict[int, memoryview] = {}
  for j,ji in enumerate(self.jit_cache):
  if not isinstance(ji.prg, CompiledRunner): continue
  self.kargs_addrs[j] = kernargs_ptrs[ji.prg.device]
- kernargs_ptrs[ji.prg.device] += round_up(ji.prg.clprg.kernargs_segment_size, 16)
+ kernargs_ptrs[ji.prg.device] += round_up(ji.prg.clprg.kernargs_alloc_size, 16)

- args_t = init_c_struct_t(tuple([(f'f{i}', ctypes.c_void_p) for i in range(len(ji.bufs))] +
- [(f'v{i}', ctypes.c_int) for i in range(len(ji.prg.p.vars))]))
- self.ji_kargs_structs[j] = args_t.from_address(self.kargs_addrs[j] + ji.prg.clprg.kernargs_offset)
- for i in range(len(ji.bufs)): self.ji_kargs_structs[j].__setattr__(f'f{i}', cast(Buffer, ji.bufs[i])._buf.va_addr)
- for i in range(len(ji.prg.p.vars)): self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[ji.prg.p.vars[i]])
+ self.ji_args_bufs[j] = to_mv(self.kargs_addrs[j] + ji.prg.clprg.kernargs_offset, len(ji.bufs) * 8).cast('Q')
+ self.ji_args_vars[j] = to_mv(self.kargs_addrs[j] + ji.prg.clprg.kernargs_offset + len(ji.bufs) * 8, len(ji.prg.p.vars) * 4).cast('I')
+ for i in range(len(ji.bufs)): self.ji_args_bufs[j][i] = cast(Buffer, ji.bufs[i])._buf.va_addr
+ for i in range(len(ji.prg.p.vars)): self.ji_args_vars[j][i] = var_vals[ji.prg.p.vars[i]]

  # NV needs constbuffer to be set
  if ji.prg.device.dname.startswith("NV"): to_mv(self.kargs_addrs[j], 0x160).cast('I')[:] = array.array('I', ji.prg.clprg.constbuffer_0)

- # Build queues.
- self.comp_queues: Dict[Compiled, Any] = collections.defaultdict(self.comp_hcq_t)
- self.comp_signal = {dev: dev._get_signal(value=0) for dev in self.devices}
- self.comp_signal_val = {dev: 0 for dev in self.devices}
+ # Schedule Dependencies.
+ # There are two types of queues on each device: copy and compute. Both must synchronize with all external operations before launching any
+ # graph-related tasks. This synchronization uses a global timeline signal per device. Within the graph, the compute queue coordinates with
+ # global operations and sets a kickoff signal. Any queue accessing a buffer from another device waits for this signal from the device’s
+ # compute queue to ensure exclusive access. The compute queue signals the completion of the graph, synchronizing with the device's copy queue.
+ self.comp_queues: Dict[Compiled, Any] = {dev: dev.hw_compute_queue_t() for dev in self.devices}
+ self.copy_queues: Dict[Compiled, Any] = {dev: dev.hw_copy_queue_t() for dev in self.devices}
+
+ self.signal_sched: Dict[int, Tuple[List, Optional[int], Optional[List]]] = {} # Dict[ji_idx, (deps, sigval, prof_info)]
+ self.signals: Dict[Any, Any] = {q: self.devices[0]._get_signal(value=0) for q in list(self.comp_queues.values())+list(self.copy_queues.values())}
+ self.dev_kickoff_signal = {dev: self.devices[0]._get_signal(value=0) for dev in self.devices + ['CPU']} # Dict[dev, signal]
+ self.kickoff_value = 0
+
+ self.save_devs: Dict[Any, Set] = {q: set() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
+ for dev in self.devices: self.save_devs[self.comp_queues[dev]].add(dev)

- self.copy_queues: Dict[Compiled, Any] = collections.defaultdict(self.copy_hcq_t)
- self.copy_signal = {dev: dev._get_signal(value=0) for dev in self.devices}
- self.copy_signal_val = {dev: 0 for dev in self.devices}
+ self.graph_timeline = {dev: 0 for dev in self.devices} # Dict[dev, last graph sigval]
+ self.last_ji: Dict[Any, Any] = {q: None for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}

- self.kickoff_signal = self.devices[0]._get_signal(value=0)
- self.kickoff_value = 0
- self.graph_timeline = {dev: 0 for dev in self.devices}
+ for j,ji in enumerate(self.jit_cache):
+ enqueue_dev = ji.prg.device if isinstance(ji.prg, CompiledRunner) else Device[ji.bufs[1].device] #type:ignore
+ enqueue_queue = self.comp_queues[enqueue_dev] if isinstance(ji.prg, CompiledRunner) else self.copy_queues[enqueue_dev]
+ out_signal = self.signals[enqueue_queue]
+ writable_buffers = ji.prg.p.outcount if isinstance(ji.prg, CompiledRunner) else 1
+ deps = self.access_resources(enqueue_queue, ji.bufs[writable_buffers:], ji.bufs[:writable_buffers], j + 1)
+
+ if isinstance(ji.prg, CompiledRunner):
+ # Update signal on compute kernel to depend on the previous kernel.
+ if (last_j:=self.last_ji[enqueue_queue]) is not None: deps = [x for x in deps if id(x[0]) != id(out_signal)] + [(out_signal, last_j + 1)]

+ # Remove self-dependency for AMD or NV with only 1 same-queue dep, since NV chains 2+ execs in this case, eliminating dep need.
+ if (dname:=enqueue_dev.dname.split(":", 1)[0]) == "AMD" or (dname == "NV" and len(deps) == 1 and id(deps[0][0]) == id(out_signal)):
+ deps = [x for x in deps if id(x[0]) != id(out_signal)]
+ elif isinstance(ji.prg, BufferXfer): deps = [x for x in deps if id(x[0]) != id(out_signal)]
+
+ # Go through all dependencies and, if we need the signal from that ji, enable it by setting the signal value in the signal schedule.
+ for sig, val in deps:
+ if id(sig) in [id(x) for x in self.signals.values()]:
+ self.signal_sched[val - 1] = self.signal_sched[val - 1][:1] + (val,) + self.signal_sched[val - 1][2:]
+
+ prof_ji_desc = ji.prg.clprg.name if isinstance(ji.prg, CompiledRunner) else f"{ji.bufs[1].device} -> {ji.bufs[0].device}" # type: ignore
+ prof_info = ([enqueue_dev._get_signal() for _ in range(2)] + [enqueue_dev, prof_ji_desc, isinstance(ji.prg, BufferXfer)]) if PROFILE else None
+ self.signal_sched[j] = (deps, None if isinstance(ji.prg, CompiledRunner) else (j + 1), prof_info)
+ self.last_ji[enqueue_queue] = j
+
+ # Build hardware queues.
  self.exec_ptrs: Dict[int, Tuple[Any, int]] = {}
  self.copy_to_devs: Dict[Compiled, Set[Compiled]] = {dev: set() for dev in self.devices}
+ self.kickoff_wait_cmds: Dict[Any, List] = {q: list() for q in list(self.comp_queues.values()) + list(self.copy_queues.values())}
+
+ for dev in self.devices:
+ self.comp_queues[dev].memory_barrier().wait(dev.timeline_signal, dev.timeline_value - 1) \
+ .wait(self.dev_kickoff_signal['CPU'], self.kickoff_value).signal(self.dev_kickoff_signal[dev], self.kickoff_value)

  for j,ji in enumerate(self.jit_cache):
+ deps, signal_value, prof_info = self.signal_sched[j]
+ enqueue_queue = self.copy_queues[Device[ji.bufs[1].device]] if isinstance(ji.prg, BufferXfer) else self.comp_queues[ji.prg.device] #type:ignore
+
+ # Encode waits and start profile timestamp (if needed).
+ for sig, val in deps:
+ enqueue_queue.wait(sig, val)
+ if id(sig) in [id(x) for x in self.dev_kickoff_signal.values()]: self.kickoff_wait_cmds[enqueue_queue].append(len(enqueue_queue) - 1)
+ if prof_info: enqueue_queue.timestamp(prof_info[0])
+
+ # Encode main commands based on ji type.
  if isinstance(ji.prg, CompiledRunner):
- exec_params = {}
- deps = self.access_resources(ji.bufs[(outs:=ji.prg.p.outcount):], ji.bufs[:outs], (self.comp_signal[ji.prg.device], sig_val:=j+1))
- deps = [x for x in deps if id(x[0]) != id(self.comp_signal[ji.prg.device])]
-
- # On NV, to synchronize kernel execution, we must either issue a wait or chain executions to schedule them in order.
- # Chaining executions is preferred when possible, as it is faster.
- if ji.prg.device.dname.startswith("NV"):
- if len(deps) == 0 and self.comp_signal_val[ji.prg.device] > 0:
- exec_params['chain_exec_ptr'] = self.exec_ptrs[self.comp_signal_val[ji.prg.device] - 1][1]
- else: deps.append((self.comp_signal[ji.prg.device], self.comp_signal_val[ji.prg.device]))
-
- for sig, val in deps: self.comp_queues[ji.prg.device].wait(sig, val)
-
- self.exec_ptrs[j] = (self.comp_queues[ji.prg.device], self.comp_queues[ji.prg.device].ptr())
- self.comp_queues[ji.prg.device].exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals),
- signal=self.comp_signal[ji.prg.device], signal_value=sig_val, **exec_params)
- self.comp_signal_val[ji.prg.device] = sig_val
+ enqueue_queue.exec(ji.prg.clprg, self.kargs_addrs[j], *ji.prg.p.launch_dims(var_vals),
+ signal=self.signals[enqueue_queue] if signal_value is not None else None, signal_value=signal_value)
+ self.exec_ptrs[j] = (enqueue_queue, len(enqueue_queue) - 1)
  elif isinstance(ji.prg, BufferXfer):
  dest, src = [cast(Buffer, x) for x in ji.bufs[0:2]]
  Device[src.device]._gpu_map(dest._buf) #type: ignore
-
- deps = self.access_resources([src], [dest], (self.copy_signal[Device[src.device]], sig_val:=j+1))
- deps.append((self.copy_signal[Device[src.device]], self.copy_signal_val[Device[src.device]]))
- self.copy_signal_val[Device[src.device]] = sig_val
-
- for sig,val in deps: self.copy_queues[Device[src.device]].wait(sig, val)
- self.copy_queues[Device[src.device]].copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes) \
- .signal(self.copy_signal[Device[src.device]], sig_val)
+ enqueue_queue.copy(dest._buf.va_addr, src._buf.va_addr, dest.nbytes).signal(self.signals[enqueue_queue], signal_value)
  self.copy_to_devs[Device[dest.device]].add(Device[src.device])

+ # Encode finish profile timestamp (if needed).
+ if prof_info: enqueue_queue.timestamp(prof_info[1])
+
  for dev in self.devices:
- if self.copy_signal_val[dev] > 0: self.comp_queues[dev].wait(self.copy_signal[dev], self.copy_signal_val[dev])
- for dep_dev in self.copy_to_devs[dev]: self.comp_queues[dev].wait(self.copy_signal[dep_dev], self.copy_signal_val[dep_dev])
+ for dep_dev in list(self.copy_to_devs[dev]) + [dev]:
+ if (last_j:=self.last_ji[self.copy_queues[dep_dev]]) is None: continue
+ self.comp_queues[dev].wait(self.signals[self.copy_queues[dep_dev]], self.signal_sched[last_j][1])

+ self.comp_queues[dev].signal(dev.timeline_signal, dev.timeline_value)
  if hasattr(self.comp_queues[dev], 'bind'): self.comp_queues[dev].bind(dev)
- if hasattr(self.copy_queues[dev], 'bind') and self.copy_signal_val[dev] > 0: self.copy_queues[dev].bind(dev)
+ if hasattr(self.copy_queues[dev], 'bind') and self.last_ji[self.copy_queues[dev]] is not None: self.copy_queues[dev].bind(dev)

  def __call__(self, input_rawbuffers: List[Buffer], var_vals: Dict[Variable, int], wait=False) -> Optional[float]:
  # Wait and restore signals
  self.kickoff_value += 1
  for dev in self.devices: dev._wait_signal(dev.timeline_signal, self.graph_timeline[dev])
- for dev in self.devices:
- dev._set_signal(self.comp_signal[dev], 0)
- dev._set_signal(self.copy_signal[dev], 0)
- dev._set_signal(self.kickoff_signal, self.kickoff_value)
+ for queue in self.comp_queues.values(): self.devices[0]._set_signal(self.signals[queue], 0)
+ for queue in self.copy_queues.values(): self.devices[0]._set_signal(self.signals[queue], 0)
+ self.devices[0]._set_signal(self.dev_kickoff_signal['CPU'], self.kickoff_value)
+
+ if PROFILE and self.kickoff_value > 1:
+ for _,_,(st,en,dev,desc,is_cp) in self.signal_sched.values(): #type: ignore
+ dev.raw_prof_records += [(dev._read_timestamp(st), dev._read_timestamp(en), desc, is_cp)]

  # Update rawbuffers
- for (j,i),input_idx in self.input_replace.items():
- self.ji_kargs_structs[j].__setattr__(f'f{i}', input_rawbuffers[input_idx]._buf.va_addr)
+ for (j,i),input_idx in self.input_replace.items(): self.ji_args_bufs[j][i] = input_rawbuffers[input_idx]._buf.va_addr

  # Update var_vals
  for j in self.jc_idx_with_updatable_var_vals:
- for i,v in enumerate(cast(CompiledRunner, self.jit_cache[j].prg).p.vars):
- self.ji_kargs_structs[j].__setattr__(f'v{i}', var_vals[v])
+ for i,v in enumerate(cast(CompiledRunner, self.jit_cache[j].prg).p.vars): self.ji_args_vars[j][i] = var_vals[v]

  for j in self.jc_idx_with_updatable_launch_dims:
  queue, cmd_ptr = self.exec_ptrs[j]
  queue.update_exec(cmd_ptr, *cast(CompiledRunner, self.jit_cache[j].prg).p.launch_dims(var_vals))

  for dev in self.devices:
- # Submit sync with world and queues.
- self.comp_hcq_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
- .wait(self.kickoff_signal, self.kickoff_value).submit(dev)
- self.comp_queues[dev].submit(dev)
-
- if self.copy_signal_val[dev] > 0:
- self.copy_hcq_t().wait(dev.timeline_signal, dev.timeline_value - 1) \
- .wait(self.kickoff_signal, self.kickoff_value).submit(dev)
- self.copy_queues[dev].submit(dev)
-
- # Signal the final value
- self.comp_hcq_t().signal(dev.timeline_signal, dev.timeline_value).submit(dev)
+ self.comp_queues[dev].update_wait(1, dev.timeline_signal, dev.timeline_value - 1).update_wait(2, value=self.kickoff_value) \
+ .update_signal(3, value=self.kickoff_value) \
+ .update_signal(len(self.comp_queues[dev]) - 1, dev.timeline_signal, dev.timeline_value).submit(dev)
+
+ if self.last_ji[(cp_queue:=self.copy_queues[dev])] is not None:
+ for cmd_idx in self.kickoff_wait_cmds[cp_queue]: cp_queue.update_wait(cmd_idx, value=self.kickoff_value)
+ cp_queue.submit(dev)
+
  self.graph_timeline[dev] = dev.timeline_value
  dev.timeline_value += 1

@@ -138,6 +164,24 @@ class HCQGraph(MultiGraphRunner):
  return time.perf_counter() - st
  return None

- def access_resources(self, read, write, new_dependency):
- deps = self._access_resources(read, write, new_dependency)
- return [(k, max(v for x, v in deps if id(x) == idk)) for idk, k in {id(x[0]): x[0] for x in deps}.items()]
+ def access_resources(self, queue, read, write, new_val):
+ deps = self._access_resources(read, write, (queue, new_val))
+
+ sync_signals = []
+ for dep_queue,_ in deps: self.save_devs[queue].update(self.save_devs[dep_queue])
+ for buf in read+write:
+ if buf.device not in self.save_devs[queue]:
+ self.save_devs[queue].add(buf.device)
+ sync_signals += [(self.dev_kickoff_signal[Device[buf.device]], self.kickoff_value)]
+
+ return [(self.signals[k], max(v for x, v in deps if id(x) == idk)) for idk, k in {id(x[0]): x[0] for x in deps}.items()] + sync_signals
+
+ def __del__(self):
+ for dev in self.devices: dev._wait_signal(dev.timeline_signal, self.graph_timeline[dev])
+
+ # Graph is destructed. No need to keep signals any more, so return them as part of profiling.
+ if PROFILE and self.kickoff_value > 1:
+ for _,_,(st,en,dev,desc,is_cp) in self.signal_sched.values(): dev.sig_prof_records += [(st, en, desc, is_cp)] #type: ignore
+
+ self.devices[0].signals_pool += list(self.dev_kickoff_signal.values()) + list(self.signals.values()) # type: ignore
+ for dev, buf in self.kernargs_bufs.items(): dev.allocator._free(buf, BufferOptions(cpu_access=True))
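
Note: the rewritten HCQGraph records each device's compute and copy queues once at build time, then on every replay patches the existing wait/signal slots in place (update_wait/update_signal) and resubmits, instead of re-recording fresh comp_hcq_t()/copy_hcq_t() queues as the old code did. A toy sketch of why in-place patching works (ToyQueue and its methods are illustrative, not the real HCQ API):

class ToyQueue:
  # Commands keep stable indices, so a replay only rewrites the values
  # of existing wait/signal slots rather than rebuilding the queue.
  def __init__(self): self.cmds = []
  def wait(self, sig, val): self.cmds.append(["wait", sig, val]); return self
  def signal(self, sig, val): self.cmds.append(["signal", sig, val]); return self
  def update_wait(self, i, sig=None, value=None):
    if sig is not None: self.cmds[i][1] = sig
    if value is not None: self.cmds[i][2] = value
    return self

q = ToyQueue().wait("timeline", 0).wait("kickoff", 0).signal("kickoff", 0)
q.update_wait(0, value=41).update_wait(1, value=1)   # patch for the next replay
assert q.cmds[0] == ["wait", "timeline", 41]
assert q.cmds[1] == ["wait", "kickoff", 1]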