locomp 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- locomp/__init__.py +158 -0
- locomp/_native/__init__.py +0 -0
- locomp/_native/fast_dispatch.m +162 -0
- locomp/api.py +774 -0
- locomp/autotune.py +248 -0
- locomp/backends/__init__.py +1 -0
- locomp/backends/metal_codegen.py +811 -0
- locomp/backends/metal_runtime.py +450 -0
- locomp/frontend.py +861 -0
- locomp/ir.py +259 -0
- locomp/optimizer.py +286 -0
- locomp-0.1.0.dist-info/METADATA +460 -0
- locomp-0.1.0.dist-info/RECORD +16 -0
- locomp-0.1.0.dist-info/WHEEL +5 -0
- locomp-0.1.0.dist-info/licenses/LICENSE +201 -0
- locomp-0.1.0.dist-info/top_level.txt +1 -0
locomp/__init__.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
"""
|
|
2
|
+
Locomp — A GPU compute compiler for Apple Metal.
|
|
3
|
+
Write GPU kernels in Python, compile to native Metal shaders.
|
|
4
|
+
"""
|
|
5
|
+
|
|
6
|
+
from locomp.autotune import Config, autotune, clear_cache
|
|
7
|
+
from locomp.api import (
|
|
8
|
+
kernel,
|
|
9
|
+
program_id,
|
|
10
|
+
thread_id,
|
|
11
|
+
local_id,
|
|
12
|
+
group_size,
|
|
13
|
+
num_groups,
|
|
14
|
+
barrier,
|
|
15
|
+
shared_memory,
|
|
16
|
+
shared_load,
|
|
17
|
+
shared_store,
|
|
18
|
+
simd_sum,
|
|
19
|
+
simd_max,
|
|
20
|
+
simd_min,
|
|
21
|
+
simd_broadcast,
|
|
22
|
+
simd_shuffle_down,
|
|
23
|
+
simd_lane_id,
|
|
24
|
+
simd_group_id,
|
|
25
|
+
simdgroup_matrix_load,
|
|
26
|
+
simdgroup_matrix_load_device,
|
|
27
|
+
simdgroup_matrix_store,
|
|
28
|
+
simdgroup_matrix_store_device,
|
|
29
|
+
simdgroup_mac,
|
|
30
|
+
simdgroup_matrix,
|
|
31
|
+
atomic_add,
|
|
32
|
+
atomic_max,
|
|
33
|
+
atomic_min,
|
|
34
|
+
arange,
|
|
35
|
+
load,
|
|
36
|
+
store,
|
|
37
|
+
exp,
|
|
38
|
+
log,
|
|
39
|
+
sqrt,
|
|
40
|
+
abs,
|
|
41
|
+
tanh,
|
|
42
|
+
sin,
|
|
43
|
+
cos,
|
|
44
|
+
asin,
|
|
45
|
+
acos,
|
|
46
|
+
atan,
|
|
47
|
+
atan2,
|
|
48
|
+
sinh,
|
|
49
|
+
cosh,
|
|
50
|
+
exp2,
|
|
51
|
+
log2,
|
|
52
|
+
log10,
|
|
53
|
+
rsqrt,
|
|
54
|
+
ceil,
|
|
55
|
+
floor,
|
|
56
|
+
round,
|
|
57
|
+
sigmoid,
|
|
58
|
+
fma,
|
|
59
|
+
pow,
|
|
60
|
+
clamp,
|
|
61
|
+
copysign,
|
|
62
|
+
fmod,
|
|
63
|
+
step,
|
|
64
|
+
where,
|
|
65
|
+
tensor,
|
|
66
|
+
empty,
|
|
67
|
+
zeros,
|
|
68
|
+
ones,
|
|
69
|
+
hardware_info,
|
|
70
|
+
set_device,
|
|
71
|
+
constexpr,
|
|
72
|
+
Tensor,
|
|
73
|
+
Float16,
|
|
74
|
+
UInt8,
|
|
75
|
+
Int8,
|
|
76
|
+
Int32,
|
|
77
|
+
Bool,
|
|
78
|
+
)
|
|
79
|
+
|
|
80
|
+
__version__ = "0.1.0"

# Names exported by ``from locomp import *``.
# Every entry appears exactly once; the atomic_* names were previously listed
# twice (once after the simdgroup helpers and again after "ones").
__all__ = [
    # Autotuning
    "Config",
    "autotune",
    "clear_cache",
    # Kernel decorator and thread/group indexing
    "kernel",
    "program_id",
    "thread_id",
    "local_id",
    "group_size",
    "num_groups",
    # Synchronisation and shared memory
    "barrier",
    "shared_memory",
    "shared_load",
    "shared_store",
    # SIMD-group operations
    "simd_sum",
    "simd_max",
    "simd_min",
    "simd_broadcast",
    "simd_shuffle_down",
    "simd_lane_id",
    "simd_group_id",
    "simdgroup_matrix_load",
    "simdgroup_matrix_load_device",
    "simdgroup_matrix_store",
    "simdgroup_matrix_store_device",
    "simdgroup_mac",
    "simdgroup_matrix",
    # Atomics
    "atomic_add",
    "atomic_max",
    "atomic_min",
    # Memory access
    "arange",
    "load",
    "store",
    # Math
    "exp",
    "log",
    "sqrt",
    "abs",
    "tanh",
    "sin",
    "cos",
    "asin",
    "acos",
    "atan",
    "atan2",
    "sinh",
    "cosh",
    "exp2",
    "log2",
    "log10",
    "rsqrt",
    "ceil",
    "floor",
    "round",
    "sigmoid",
    "fma",
    "pow",
    "clamp",
    "copysign",
    "fmod",
    "step",
    "where",
    # Tensor construction
    "tensor",
    "empty",
    "zeros",
    "ones",
    # Device and types
    "hardware_info",
    "set_device",
    "constexpr",
    "Tensor",
    "Float16",
    "UInt8",
    "Int8",
    "Int32",
    "Bool",
]
|
|
File without changes
|
|
@@ -0,0 +1,162 @@
|
|
|
1
|
+
// locomp/_native/fast_dispatch.m
|
|
2
|
+
// Minimal C bridge for Metal kernel dispatch.
|
|
3
|
+
// Replaces 8+ PyObjC round-trips with a single ctypes call.
|
|
4
|
+
|
|
5
|
+
#import <Metal/Metal.h>
|
|
6
|
+
|
|
7
|
+
// Pending command buffer for async dispatch
|
|
8
|
+
// Written by locust_dispatch_async after it commits a command buffer; waited
// on and reset to nil by locust_dispatch_async, locust_sync and
// locust_batch_begin. nil means no async work is being tracked.
static id<MTLCommandBuffer> _pending_cmd_buf = nil;
|
|
9
|
+
|
|
10
|
+
// Encode a single compute dispatch, submit it, and block until it completes.
//
// queue_ptr     bridged (no ownership transfer) to id<MTLCommandQueue>.
// pipeline_ptr  bridged (no ownership transfer) to id<MTLComputePipelineState>.
// buffer_ptrs   array of num_buffers pointers, each bridged to id<MTLBuffer>
//               and bound at offset 0 to the argument index equal to its
//               position in the array.
// gx, gy, gz    number of threadgroups per grid dimension.
// tx, ty, tz    threads per threadgroup in each dimension.
//
// Returns the command buffer's (GPUEndTime - GPUStartTime) multiplied by
// 1000, i.e. the GPU time in milliseconds.
double locust_dispatch(void *queue_ptr, void *pipeline_ptr,
                       void **buffer_ptrs, int num_buffers,
                       int gx, int gy, int gz,
                       int tx, int ty, int tz) {
    @autoreleasepool {
        id<MTLCommandQueue> commandQueue = (__bridge id<MTLCommandQueue>)queue_ptr;
        id<MTLComputePipelineState> pipelineState =
            (__bridge id<MTLComputePipelineState>)pipeline_ptr;

        const MTLSize threadgroupCount = MTLSizeMake(gx, gy, gz);
        const MTLSize threadsPerGroup = MTLSizeMake(tx, ty, tz);

        id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
        [computeEncoder setComputePipelineState:pipelineState];

        // Argument slot N receives buffer_ptrs[N].
        for (int slot = 0; slot < num_buffers; ++slot) {
            [computeEncoder setBuffer:(__bridge id<MTLBuffer>)buffer_ptrs[slot]
                               offset:0
                              atIndex:slot];
        }

        [computeEncoder dispatchThreadgroups:threadgroupCount
                       threadsPerThreadgroup:threadsPerGroup];
        [computeEncoder endEncoding];

        // Submit and block the calling thread until the GPU is done.
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];

        const double elapsed = commandBuffer.GPUEndTime - commandBuffer.GPUStartTime;
        return elapsed * 1000.0;
    }
}
|
|
39
|
+
|
|
40
|
+
// Submit a single compute dispatch and return without waiting for it.
//
// Only one async command buffer is tracked at a time: if _pending_cmd_buf is
// still set from an earlier call, this function first blocks until that
// buffer completes and clears it. After committing the new command buffer it
// is stored in _pending_cmd_buf.
//
// queue_ptr     bridged (no ownership transfer) to id<MTLCommandQueue>.
// pipeline_ptr  bridged (no ownership transfer) to id<MTLComputePipelineState>.
// buffer_ptrs   array of num_buffers pointers, each bridged to id<MTLBuffer>
//               and bound at offset 0 to the argument index equal to its
//               position in the array.
// gx, gy, gz    number of threadgroups per grid dimension.
// tx, ty, tz    threads per threadgroup in each dimension.
//
// NOTE(review): unlike locust_dispatch, this body is not wrapped in
// @autoreleasepool — confirm that is intentional for the build's
// memory-management mode.
void locust_dispatch_async(void *queue_ptr, void *pipeline_ptr,
                           void **buffer_ptrs, int num_buffers,
                           int gx, int gy, int gz,
                           int tx, int ty, int tz) {
    // Drain the previously tracked submission, if any.
    if (_pending_cmd_buf) {
        [_pending_cmd_buf waitUntilCompleted];
        _pending_cmd_buf = nil;
    }

    id<MTLCommandQueue> commandQueue = (__bridge id<MTLCommandQueue>)queue_ptr;
    id<MTLComputePipelineState> pipelineState =
        (__bridge id<MTLComputePipelineState>)pipeline_ptr;

    const MTLSize threadgroupCount = MTLSizeMake(gx, gy, gz);
    const MTLSize threadsPerGroup = MTLSizeMake(tx, ty, tz);

    id<MTLCommandBuffer> commandBuffer = [commandQueue commandBuffer];
    id<MTLComputeCommandEncoder> computeEncoder = [commandBuffer computeCommandEncoder];
    [computeEncoder setComputePipelineState:pipelineState];

    // Argument slot N receives buffer_ptrs[N].
    for (int slot = 0; slot < num_buffers; ++slot) {
        [computeEncoder setBuffer:(__bridge id<MTLBuffer>)buffer_ptrs[slot]
                           offset:0
                          atIndex:slot];
    }

    [computeEncoder dispatchThreadgroups:threadgroupCount
                   threadsPerThreadgroup:threadsPerGroup];
    [computeEncoder endEncoding];

    // Commit only; the caller collects the result later.
    [commandBuffer commit];
    _pending_cmd_buf = commandBuffer;
}
|
|
71
|
+
|
|
72
|
+
// Block until the command buffer tracked in _pending_cmd_buf has completed,
// then stop tracking it.
//
// Returns that buffer's (GPUEndTime - GPUStartTime) multiplied by 1000, or
// 0.0 when no async command buffer is being tracked.
double locust_sync(void) {
    double elapsed_ms = 0.0;

    if (_pending_cmd_buf != nil) {
        [_pending_cmd_buf waitUntilCompleted];
        elapsed_ms =
            (_pending_cmd_buf.GPUEndTime - _pending_cmd_buf.GPUStartTime) * 1000.0;
        _pending_cmd_buf = nil;
    }

    return elapsed_ms;
}
|
|
80
|
+
|
|
81
|
+
// Encode the same compute dispatch `repeat` times into one command buffer,
// submit it, and block until it completes.
//
// queue_ptr     bridged (no ownership transfer) to id<MTLCommandQueue>.
// pipeline_ptr  bridged (no ownership transfer) to id<MTLComputePipelineState>.
// buffer_ptrs   array of num_buffers pointers, each bridged to id<MTLBuffer>
//               and bound at offset 0 to the argument index equal to its
//               position in the array.
// gx, gy, gz    number of threadgroups per grid dimension.
// tx, ty, tz    threads per threadgroup in each dimension.
// repeat        number of times the dispatch is encoded.
//
// Returns the command buffer's total (GPUEndTime - GPUStartTime) in
// milliseconds divided by `repeat`. When `repeat` is zero or negative nothing
// is encoded or submitted and 0.0 is returned.
double locust_dispatch_repeat(void *queue_ptr, void *pipeline_ptr,
                              void **buffer_ptrs, int num_buffers,
                              int gx, int gy, int gz,
                              int tx, int ty, int tz,
                              int repeat) {
    // Without this guard an empty command buffer would be committed and the
    // average below would divide by zero (NaN/inf) or by a negative count.
    if (repeat <= 0) {
        return 0.0;
    }

    @autoreleasepool {
        id<MTLCommandQueue> queue = (__bridge id<MTLCommandQueue>)queue_ptr;
        id<MTLComputePipelineState> pipeline =
            (__bridge id<MTLComputePipelineState>)pipeline_ptr;

        id<MTLCommandBuffer> cmdBuf = [queue commandBuffer];

        MTLSize grid = MTLSizeMake(gx, gy, gz);
        MTLSize tgSize = MTLSizeMake(tx, ty, tz);

        // Each repetition gets its own encoder within the shared command buffer.
        for (int r = 0; r < repeat; r++) {
            id<MTLComputeCommandEncoder> encoder = [cmdBuf computeCommandEncoder];
            [encoder setComputePipelineState:pipeline];
            for (int i = 0; i < num_buffers; i++) {
                id<MTLBuffer> buf = (__bridge id<MTLBuffer>)buffer_ptrs[i];
                [encoder setBuffer:buf offset:0 atIndex:i];
            }
            [encoder dispatchThreadgroups:grid threadsPerThreadgroup:tgSize];
            [encoder endEncoding];
        }

        [cmdBuf commit];
        [cmdBuf waitUntilCompleted];

        double total_ms = (cmdBuf.GPUEndTime - cmdBuf.GPUStartTime) * 1000.0;
        return total_ms / repeat;
    }
}
|
|
114
|
+
|
|
115
|
+
// --- Batch mode ---
|
|
116
|
+
// Multiple different kernels in one command buffer.
|
|
117
|
+
// Created by locust_batch_begin, encoded into by locust_batch_dispatch, and
// committed then reset to nil by locust_batch_end. nil means no batch is open.
static id<MTLCommandBuffer> _batch_cmd_buf = nil;
|
|
118
|
+
|
|
119
|
+
// Open a new batch: create a fresh command buffer on the given queue and
// store it in _batch_cmd_buf so that locust_batch_dispatch can encode into it.
//
// queue_ptr  bridged (no ownership transfer) to id<MTLCommandQueue>.
//
// Before the new buffer is created, any batch left open by an earlier
// locust_batch_begin is flushed (committed if necessary, then waited on), and
// any async command buffer tracked in _pending_cmd_buf is waited on and
// cleared.
void locust_batch_begin(void *queue_ptr) {
    if (_batch_cmd_buf != nil) {
        // locust_batch_end resets _batch_cmd_buf to nil right after committing,
        // so a non-nil buffer here has not been committed yet. Waiting on an
        // uncommitted command buffer can never finish, so commit it first.
        MTLCommandBufferStatus status = _batch_cmd_buf.status;
        if (status == MTLCommandBufferStatusNotEnqueued ||
            status == MTLCommandBufferStatusEnqueued) {
            [_batch_cmd_buf commit];
        }
        [_batch_cmd_buf waitUntilCompleted];
        _batch_cmd_buf = nil;
    }

    // Also drain any pending async work.
    if (_pending_cmd_buf != nil) {
        [_pending_cmd_buf waitUntilCompleted];
        _pending_cmd_buf = nil;
    }

    id<MTLCommandQueue> queue = (__bridge id<MTLCommandQueue>)queue_ptr;
    _batch_cmd_buf = [queue commandBuffer];
}
|
|
132
|
+
|
|
133
|
+
// Encode one compute dispatch into the currently open batch command buffer.
// Does nothing when no batch is open (_batch_cmd_buf is nil). Nothing is
// committed here.
//
// pipeline_ptr  bridged (no ownership transfer) to id<MTLComputePipelineState>.
// buffer_ptrs   array of num_buffers pointers, each bridged to id<MTLBuffer>
//               and bound at offset 0 to the argument index equal to its
//               position in the array.
// gx, gy, gz    number of threadgroups per grid dimension.
// tx, ty, tz    threads per threadgroup in each dimension.
void locust_batch_dispatch(void *pipeline_ptr,
                           void **buffer_ptrs, int num_buffers,
                           int gx, int gy, int gz,
                           int tx, int ty, int tz) {
    if (!_batch_cmd_buf) {
        return;
    }

    id<MTLComputePipelineState> pipelineState =
        (__bridge id<MTLComputePipelineState>)pipeline_ptr;
    const MTLSize threadgroupCount = MTLSizeMake(gx, gy, gz);
    const MTLSize threadsPerGroup = MTLSizeMake(tx, ty, tz);

    // One encoder per dispatch; it is closed before returning.
    id<MTLComputeCommandEncoder> computeEncoder = [_batch_cmd_buf computeCommandEncoder];
    [computeEncoder setComputePipelineState:pipelineState];

    // Argument slot N receives buffer_ptrs[N].
    for (int slot = 0; slot < num_buffers; ++slot) {
        [computeEncoder setBuffer:(__bridge id<MTLBuffer>)buffer_ptrs[slot]
                           offset:0
                          atIndex:slot];
    }

    [computeEncoder dispatchThreadgroups:threadgroupCount
                   threadsPerThreadgroup:threadsPerGroup];
    [computeEncoder endEncoding];
}
|
|
154
|
+
|
|
155
|
+
// Close the open batch: commit _batch_cmd_buf, block until it completes, and
// reset _batch_cmd_buf to nil.
//
// Returns the batch's (GPUEndTime - GPUStartTime) multiplied by 1000, or 0.0
// when no batch is open.
double locust_batch_end(void) {
    double elapsed_ms = 0.0;

    if (_batch_cmd_buf != nil) {
        [_batch_cmd_buf commit];
        [_batch_cmd_buf waitUntilCompleted];
        elapsed_ms =
            (_batch_cmd_buf.GPUEndTime - _batch_cmd_buf.GPUStartTime) * 1000.0;
        _batch_cmd_buf = nil;
    }

    return elapsed_ms;
}
|