tinygrad 0.9.1__py3-none-any.whl → 0.9.2__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (56)
  1. tinygrad/codegen/kernel.py +248 -115
  2. tinygrad/codegen/lowerer.py +215 -0
  3. tinygrad/codegen/transcendental.py +310 -0
  4. tinygrad/codegen/uopgraph.py +622 -0
  5. tinygrad/codegen/uops.py +235 -393
  6. tinygrad/device.py +428 -69
  7. tinygrad/dtype.py +18 -4
  8. tinygrad/engine/graph.py +19 -32
  9. tinygrad/engine/jit.py +148 -70
  10. tinygrad/engine/realize.py +127 -51
  11. tinygrad/engine/schedule.py +259 -216
  12. tinygrad/engine/search.py +29 -22
  13. tinygrad/function.py +9 -0
  14. tinygrad/helpers.py +87 -49
  15. tinygrad/lazy.py +34 -35
  16. tinygrad/multi.py +41 -36
  17. tinygrad/nn/__init__.py +39 -22
  18. tinygrad/nn/state.py +3 -3
  19. tinygrad/ops.py +63 -62
  20. tinygrad/renderer/__init__.py +43 -21
  21. tinygrad/renderer/assembly.py +104 -106
  22. tinygrad/renderer/cstyle.py +87 -60
  23. tinygrad/renderer/llvmir.py +21 -30
  24. tinygrad/runtime/autogen/amd_gpu.py +25208 -5753
  25. tinygrad/runtime/autogen/cuda.py +6 -162
  26. tinygrad/runtime/autogen/kfd.py +32 -0
  27. tinygrad/runtime/autogen/libc.py +4260 -0
  28. tinygrad/runtime/autogen/nvrtc.py +579 -0
  29. tinygrad/runtime/graph/clang.py +2 -2
  30. tinygrad/runtime/graph/cuda.py +8 -11
  31. tinygrad/runtime/graph/hcq.py +120 -107
  32. tinygrad/runtime/graph/metal.py +18 -15
  33. tinygrad/runtime/ops_amd.py +197 -305
  34. tinygrad/runtime/ops_clang.py +2 -2
  35. tinygrad/runtime/ops_cuda.py +36 -94
  36. tinygrad/runtime/ops_disk.py +3 -7
  37. tinygrad/runtime/ops_gpu.py +4 -2
  38. tinygrad/runtime/ops_hip.py +70 -0
  39. tinygrad/runtime/ops_metal.py +38 -27
  40. tinygrad/runtime/ops_nv.py +283 -363
  41. tinygrad/runtime/ops_python.py +26 -30
  42. tinygrad/runtime/support/compiler_cuda.py +78 -0
  43. tinygrad/runtime/{driver/hip_comgr.py → support/compiler_hip.py} +15 -1
  44. tinygrad/runtime/support/elf.py +38 -0
  45. tinygrad/shape/shapetracker.py +5 -14
  46. tinygrad/shape/symbolic.py +4 -8
  47. tinygrad/shape/view.py +34 -22
  48. tinygrad/tensor.py +399 -97
  49. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/METADATA +49 -48
  50. tinygrad-0.9.2.dist-info/RECORD +70 -0
  51. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/WHEEL +1 -1
  52. tinygrad/codegen/linearizer.py +0 -528
  53. tinygrad-0.9.1.dist-info/RECORD +0 -63
  54. tinygrad/runtime/{driver → support}/__init__.py +0 -0
  55. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/LICENSE +0 -0
  56. {tinygrad-0.9.1.dist-info → tinygrad-0.9.2.dist-info}/top_level.txt +0 -0
@@ -146,7 +146,6 @@ def char_pointer_cast(string, encoding='utf-8'):
146
146
 
147
147
  _libraries = {}
148
148
  _libraries['libcuda.so'] = ctypes.CDLL(ctypes.util.find_library('cuda'))
149
- _libraries['libnvrtc.so'] = ctypes.CDLL(ctypes.util.find_library('nvrtc'))
150
149
 
151
150
 
152
151
  cuuint32_t = ctypes.c_uint32
@@ -4921,143 +4920,6 @@ try:
4921
4920
  cuGetExportTable.argtypes = [ctypes.POINTER(ctypes.POINTER(None)), ctypes.POINTER(struct_CUuuid_st)]
4922
4921
  except AttributeError:
4923
4922
  pass
4924
-
4925
- # values for enumeration 'c__EA_nvrtcResult'
4926
- c__EA_nvrtcResult__enumvalues = {
4927
- 0: 'NVRTC_SUCCESS',
4928
- 1: 'NVRTC_ERROR_OUT_OF_MEMORY',
4929
- 2: 'NVRTC_ERROR_PROGRAM_CREATION_FAILURE',
4930
- 3: 'NVRTC_ERROR_INVALID_INPUT',
4931
- 4: 'NVRTC_ERROR_INVALID_PROGRAM',
4932
- 5: 'NVRTC_ERROR_INVALID_OPTION',
4933
- 6: 'NVRTC_ERROR_COMPILATION',
4934
- 7: 'NVRTC_ERROR_BUILTIN_OPERATION_FAILURE',
4935
- 8: 'NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION',
4936
- 9: 'NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION',
4937
- 10: 'NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID',
4938
- 11: 'NVRTC_ERROR_INTERNAL_ERROR',
4939
- }
4940
- NVRTC_SUCCESS = 0
4941
- NVRTC_ERROR_OUT_OF_MEMORY = 1
4942
- NVRTC_ERROR_PROGRAM_CREATION_FAILURE = 2
4943
- NVRTC_ERROR_INVALID_INPUT = 3
4944
- NVRTC_ERROR_INVALID_PROGRAM = 4
4945
- NVRTC_ERROR_INVALID_OPTION = 5
4946
- NVRTC_ERROR_COMPILATION = 6
4947
- NVRTC_ERROR_BUILTIN_OPERATION_FAILURE = 7
4948
- NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION = 8
4949
- NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION = 9
4950
- NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID = 10
4951
- NVRTC_ERROR_INTERNAL_ERROR = 11
4952
- c__EA_nvrtcResult = ctypes.c_uint32 # enum
4953
- nvrtcResult = c__EA_nvrtcResult
4954
- nvrtcResult__enumvalues = c__EA_nvrtcResult__enumvalues
4955
- try:
4956
- nvrtcGetErrorString = _libraries['libnvrtc.so'].nvrtcGetErrorString
4957
- nvrtcGetErrorString.restype = ctypes.POINTER(ctypes.c_char)
4958
- nvrtcGetErrorString.argtypes = [nvrtcResult]
4959
- except AttributeError:
4960
- pass
4961
- try:
4962
- nvrtcVersion = _libraries['libnvrtc.so'].nvrtcVersion
4963
- nvrtcVersion.restype = nvrtcResult
4964
- nvrtcVersion.argtypes = [ctypes.POINTER(ctypes.c_int32), ctypes.POINTER(ctypes.c_int32)]
4965
- except AttributeError:
4966
- pass
4967
- try:
4968
- nvrtcGetNumSupportedArchs = _libraries['libnvrtc.so'].nvrtcGetNumSupportedArchs
4969
- nvrtcGetNumSupportedArchs.restype = nvrtcResult
4970
- nvrtcGetNumSupportedArchs.argtypes = [ctypes.POINTER(ctypes.c_int32)]
4971
- except AttributeError:
4972
- pass
4973
- try:
4974
- nvrtcGetSupportedArchs = _libraries['libnvrtc.so'].nvrtcGetSupportedArchs
4975
- nvrtcGetSupportedArchs.restype = nvrtcResult
4976
- nvrtcGetSupportedArchs.argtypes = [ctypes.POINTER(ctypes.c_int32)]
4977
- except AttributeError:
4978
- pass
4979
- class struct__nvrtcProgram(Structure):
4980
- pass
4981
-
4982
- nvrtcProgram = ctypes.POINTER(struct__nvrtcProgram)
4983
- try:
4984
- nvrtcCreateProgram = _libraries['libnvrtc.so'].nvrtcCreateProgram
4985
- nvrtcCreateProgram.restype = nvrtcResult
4986
- nvrtcCreateProgram.argtypes = [ctypes.POINTER(ctypes.POINTER(struct__nvrtcProgram)), ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.c_char), ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char)), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
4987
- except AttributeError:
4988
- pass
4989
- try:
4990
- nvrtcDestroyProgram = _libraries['libnvrtc.so'].nvrtcDestroyProgram
4991
- nvrtcDestroyProgram.restype = nvrtcResult
4992
- nvrtcDestroyProgram.argtypes = [ctypes.POINTER(ctypes.POINTER(struct__nvrtcProgram))]
4993
- except AttributeError:
4994
- pass
4995
- try:
4996
- nvrtcCompileProgram = _libraries['libnvrtc.so'].nvrtcCompileProgram
4997
- nvrtcCompileProgram.restype = nvrtcResult
4998
- nvrtcCompileProgram.argtypes = [nvrtcProgram, ctypes.c_int32, ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
4999
- except AttributeError:
5000
- pass
5001
- try:
5002
- nvrtcGetPTXSize = _libraries['libnvrtc.so'].nvrtcGetPTXSize
5003
- nvrtcGetPTXSize.restype = nvrtcResult
5004
- nvrtcGetPTXSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
5005
- except AttributeError:
5006
- pass
5007
- try:
5008
- nvrtcGetPTX = _libraries['libnvrtc.so'].nvrtcGetPTX
5009
- nvrtcGetPTX.restype = nvrtcResult
5010
- nvrtcGetPTX.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
5011
- except AttributeError:
5012
- pass
5013
- try:
5014
- nvrtcGetCUBINSize = _libraries['libnvrtc.so'].nvrtcGetCUBINSize
5015
- nvrtcGetCUBINSize.restype = nvrtcResult
5016
- nvrtcGetCUBINSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
5017
- except AttributeError:
5018
- pass
5019
- try:
5020
- nvrtcGetCUBIN = _libraries['libnvrtc.so'].nvrtcGetCUBIN
5021
- nvrtcGetCUBIN.restype = nvrtcResult
5022
- nvrtcGetCUBIN.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
5023
- except AttributeError:
5024
- pass
5025
- try:
5026
- nvrtcGetNVVMSize = _libraries['libnvrtc.so'].nvrtcGetNVVMSize
5027
- nvrtcGetNVVMSize.restype = nvrtcResult
5028
- nvrtcGetNVVMSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
5029
- except AttributeError:
5030
- pass
5031
- try:
5032
- nvrtcGetNVVM = _libraries['libnvrtc.so'].nvrtcGetNVVM
5033
- nvrtcGetNVVM.restype = nvrtcResult
5034
- nvrtcGetNVVM.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
5035
- except AttributeError:
5036
- pass
5037
- try:
5038
- nvrtcGetProgramLogSize = _libraries['libnvrtc.so'].nvrtcGetProgramLogSize
5039
- nvrtcGetProgramLogSize.restype = nvrtcResult
5040
- nvrtcGetProgramLogSize.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_uint64)]
5041
- except AttributeError:
5042
- pass
5043
- try:
5044
- nvrtcGetProgramLog = _libraries['libnvrtc.so'].nvrtcGetProgramLog
5045
- nvrtcGetProgramLog.restype = nvrtcResult
5046
- nvrtcGetProgramLog.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
5047
- except AttributeError:
5048
- pass
5049
- try:
5050
- nvrtcAddNameExpression = _libraries['libnvrtc.so'].nvrtcAddNameExpression
5051
- nvrtcAddNameExpression.restype = nvrtcResult
5052
- nvrtcAddNameExpression.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char)]
5053
- except AttributeError:
5054
- pass
5055
- try:
5056
- nvrtcGetLoweredName = _libraries['libnvrtc.so'].nvrtcGetLoweredName
5057
- nvrtcGetLoweredName.restype = nvrtcResult
5058
- nvrtcGetLoweredName.argtypes = [nvrtcProgram, ctypes.POINTER(ctypes.c_char), ctypes.POINTER(ctypes.POINTER(ctypes.c_char))]
5059
- except AttributeError:
5060
- pass
5061
4923
  __all__ = \
5062
4924
  ['CUDA_ARRAY3D_DESCRIPTOR', 'CUDA_ARRAY3D_DESCRIPTOR_v2',
5063
4925
  'CUDA_ARRAY_DESCRIPTOR', 'CUDA_ARRAY_DESCRIPTOR_v2',
@@ -5673,21 +5535,11 @@ __all__ = \
5673
5535
  'CUuserObjectRetain_flags__enumvalues',
5674
5536
  'CUuserObjectRetain_flags_enum', 'CUuserObject_flags',
5675
5537
  'CUuserObject_flags__enumvalues', 'CUuserObject_flags_enum',
5676
- 'CUuuid', 'NVRTC_ERROR_BUILTIN_OPERATION_FAILURE',
5677
- 'NVRTC_ERROR_COMPILATION', 'NVRTC_ERROR_INTERNAL_ERROR',
5678
- 'NVRTC_ERROR_INVALID_INPUT', 'NVRTC_ERROR_INVALID_OPTION',
5679
- 'NVRTC_ERROR_INVALID_PROGRAM',
5680
- 'NVRTC_ERROR_NAME_EXPRESSION_NOT_VALID',
5681
- 'NVRTC_ERROR_NO_LOWERED_NAMES_BEFORE_COMPILATION',
5682
- 'NVRTC_ERROR_NO_NAME_EXPRESSIONS_AFTER_COMPILATION',
5683
- 'NVRTC_ERROR_OUT_OF_MEMORY',
5684
- 'NVRTC_ERROR_PROGRAM_CREATION_FAILURE', 'NVRTC_SUCCESS',
5685
- 'c__EA_nvrtcResult', 'cuArray3DCreate_v2',
5686
- 'cuArray3DGetDescriptor_v2', 'cuArrayCreate_v2', 'cuArrayDestroy',
5687
- 'cuArrayGetDescriptor_v2', 'cuArrayGetPlane',
5688
- 'cuArrayGetSparseProperties', 'cuCtxAttach', 'cuCtxCreate_v2',
5689
- 'cuCtxCreate_v3', 'cuCtxDestroy_v2', 'cuCtxDetach',
5690
- 'cuCtxDisablePeerAccess', 'cuCtxEnablePeerAccess',
5538
+ 'CUuuid', 'cuArray3DCreate_v2', 'cuArray3DGetDescriptor_v2',
5539
+ 'cuArrayCreate_v2', 'cuArrayDestroy', 'cuArrayGetDescriptor_v2',
5540
+ 'cuArrayGetPlane', 'cuArrayGetSparseProperties', 'cuCtxAttach',
5541
+ 'cuCtxCreate_v2', 'cuCtxCreate_v3', 'cuCtxDestroy_v2',
5542
+ 'cuCtxDetach', 'cuCtxDisablePeerAccess', 'cuCtxEnablePeerAccess',
5691
5543
  'cuCtxGetApiVersion', 'cuCtxGetCacheConfig', 'cuCtxGetCurrent',
5692
5544
  'cuCtxGetDevice', 'cuCtxGetExecAffinity', 'cuCtxGetFlags',
5693
5545
  'cuCtxGetLimit', 'cuCtxGetSharedMemConfig',
@@ -5844,14 +5696,7 @@ __all__ = \
5844
5696
  'cuTexRefSetMipmappedArray', 'cuThreadExchangeStreamCaptureMode',
5845
5697
  'cuUserObjectCreate', 'cuUserObjectRelease', 'cuUserObjectRetain',
5846
5698
  'cuWaitExternalSemaphoresAsync', 'cudaError_enum', 'cuuint32_t',
5847
- 'cuuint64_t', 'nvrtcAddNameExpression', 'nvrtcCompileProgram',
5848
- 'nvrtcCreateProgram', 'nvrtcDestroyProgram', 'nvrtcGetCUBIN',
5849
- 'nvrtcGetCUBINSize', 'nvrtcGetErrorString', 'nvrtcGetLoweredName',
5850
- 'nvrtcGetNVVM', 'nvrtcGetNVVMSize', 'nvrtcGetNumSupportedArchs',
5851
- 'nvrtcGetPTX', 'nvrtcGetPTXSize', 'nvrtcGetProgramLog',
5852
- 'nvrtcGetProgramLogSize', 'nvrtcGetSupportedArchs',
5853
- 'nvrtcProgram', 'nvrtcResult', 'nvrtcResult__enumvalues',
5854
- 'nvrtcVersion', 'size_t', 'struct_CUDA_ARRAY3D_DESCRIPTOR_st',
5699
+ 'cuuint64_t', 'size_t', 'struct_CUDA_ARRAY3D_DESCRIPTOR_st',
5855
5700
  'struct_CUDA_ARRAY_DESCRIPTOR_st',
5856
5701
  'struct_CUDA_ARRAY_SPARSE_PROPERTIES_st',
5857
5702
  'struct_CUDA_ARRAY_SPARSE_PROPERTIES_st_tileExtent',
@@ -5906,7 +5751,6 @@ __all__ = \
5906
5751
  'struct_CUstreamMemOpWriteValueParams_st', 'struct_CUstream_st',
5907
5752
  'struct_CUsurfref_st', 'struct_CUtexref_st',
5908
5753
  'struct_CUuserObject_st', 'struct_CUuuid_st',
5909
- 'struct__nvrtcProgram',
5910
5754
  'union_CUDA_EXTERNAL_MEMORY_HANDLE_DESC_st_handle',
5911
5755
  'union_CUDA_EXTERNAL_SEMAPHORE_HANDLE_DESC_st_handle',
5912
5756
  'union_CUDA_EXTERNAL_SEMAPHORE_SIGNAL_PARAMS_st_0_nvSciSync',
@@ -810,3 +810,35 @@ __all__ = \
810
810
  'struct_kfd_ioctl_wait_events_args',
811
811
  'struct_kfd_memory_exception_failure',
812
812
  'struct_kfd_process_device_apertures', 'union_kfd_event_data_0']
813
+ AMDKFD_IOC_GET_VERSION = ("IOR", 0x01, struct_kfd_ioctl_get_version_args)
814
+ AMDKFD_IOC_CREATE_QUEUE = ("IOWR", 0x02, struct_kfd_ioctl_create_queue_args)
815
+ AMDKFD_IOC_DESTROY_QUEUE = ("IOWR", 0x03, struct_kfd_ioctl_destroy_queue_args)
816
+ AMDKFD_IOC_SET_MEMORY_POLICY = ("IOW", 0x04, struct_kfd_ioctl_set_memory_policy_args)
817
+ AMDKFD_IOC_GET_CLOCK_COUNTERS = ("IOWR", 0x05, struct_kfd_ioctl_get_clock_counters_args)
818
+ AMDKFD_IOC_GET_PROCESS_APERTURES = ("IOR", 0x06, struct_kfd_ioctl_get_process_apertures_args)
819
+ AMDKFD_IOC_UPDATE_QUEUE = ("IOW", 0x07, struct_kfd_ioctl_update_queue_args)
820
+ AMDKFD_IOC_CREATE_EVENT = ("IOWR", 0x08, struct_kfd_ioctl_create_event_args)
821
+ AMDKFD_IOC_DESTROY_EVENT = ("IOW", 0x09, struct_kfd_ioctl_destroy_event_args)
822
+ AMDKFD_IOC_SET_EVENT = ("IOW", 0x0A, struct_kfd_ioctl_set_event_args)
823
+ AMDKFD_IOC_RESET_EVENT = ("IOW", 0x0B, struct_kfd_ioctl_reset_event_args)
824
+ AMDKFD_IOC_WAIT_EVENTS = ("IOWR", 0x0C, struct_kfd_ioctl_wait_events_args)
825
+ AMDKFD_IOC_DBG_REGISTER = ("IOW", 0x0D, struct_kfd_ioctl_dbg_register_args)
826
+ AMDKFD_IOC_DBG_UNREGISTER = ("IOW", 0x0E, struct_kfd_ioctl_dbg_unregister_args)
827
+ AMDKFD_IOC_DBG_ADDRESS_WATCH = ("IOW", 0x0F, struct_kfd_ioctl_dbg_address_watch_args)
828
+ AMDKFD_IOC_DBG_WAVE_CONTROL = ("IOW", 0x10, struct_kfd_ioctl_dbg_wave_control_args)
829
+ AMDKFD_IOC_SET_SCRATCH_BACKING_VA = ("IOWR", 0x11, struct_kfd_ioctl_set_scratch_backing_va_args)
830
+ AMDKFD_IOC_GET_TILE_CONFIG = ("IOWR", 0x12, struct_kfd_ioctl_get_tile_config_args)
831
+ AMDKFD_IOC_SET_TRAP_HANDLER = ("IOW", 0x13, struct_kfd_ioctl_set_trap_handler_args)
832
+ AMDKFD_IOC_ACQUIRE_VM = ("IOW", 0x15, struct_kfd_ioctl_acquire_vm_args)
833
+ AMDKFD_IOC_ALLOC_MEMORY_OF_GPU = ("IOWR", 0x16, struct_kfd_ioctl_alloc_memory_of_gpu_args)
834
+ AMDKFD_IOC_FREE_MEMORY_OF_GPU = ("IOW", 0x17, struct_kfd_ioctl_free_memory_of_gpu_args)
835
+ AMDKFD_IOC_MAP_MEMORY_TO_GPU = ("IOWR", 0x18, struct_kfd_ioctl_map_memory_to_gpu_args)
836
+ AMDKFD_IOC_UNMAP_MEMORY_FROM_GPU = ("IOWR", 0x19, struct_kfd_ioctl_unmap_memory_from_gpu_args)
837
+ AMDKFD_IOC_SET_CU_MASK = ("IOW", 0x1A, struct_kfd_ioctl_set_cu_mask_args)
838
+ AMDKFD_IOC_GET_QUEUE_WAVE_STATE = ("IOWR", 0x1B, struct_kfd_ioctl_get_queue_wave_state_args)
839
+ AMDKFD_IOC_GET_DMABUF_INFO = ("IOWR", 0x1C, struct_kfd_ioctl_get_dmabuf_info_args)
840
+ AMDKFD_IOC_IMPORT_DMABUF = ("IOWR", 0x1D, struct_kfd_ioctl_import_dmabuf_args)
841
+ AMDKFD_IOC_ALLOC_QUEUE_GWS = ("IOWR", 0x1E, struct_kfd_ioctl_alloc_queue_gws_args)
842
+ AMDKFD_IOC_SMI_EVENTS = ("IOWR", 0x1F, struct_kfd_ioctl_smi_events_args)
843
+ AMDKFD_IOC_SVM = ("IOWR", 0x20, struct_kfd_ioctl_svm_args)
844
+ AMDKFD_IOC_SET_XNACK_MODE = ("IOWR", 0x21, struct_kfd_ioctl_set_xnack_mode_args)