@rlabs-inc/sparse 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,734 @@
1
+ // ============================================================================
2
+ // SPARSE - Metal Implementation
3
+ // Objective-C bridge between C API and Metal compute
4
+ // ============================================================================
5
+
6
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

#include "sparse.h"

#include <math.h>
#include <stdlib.h>
#include <string.h>
10
+
11
+ // ============================================================================
12
+ // INTERNAL STRUCTURES
13
+ // ============================================================================
14
+
15
// One sparse-compute context: a Metal device, its command queue, the
// library compiled from sparse.metal, and one pre-built pipeline per
// kernel.  Allocated with calloc() in sparse_init(); released in
// sparse_cleanup().
struct SparseContext {
    id<MTLDevice> device;              // system default Metal device
    id<MTLCommandQueue> commandQueue;  // queue every operation submits to
    id<MTLLibrary> library;            // compiled from sparse.metal source

    // Compiled kernel functions (one pipeline per kernel in sparse.metal).
    id<MTLComputePipelineState> scatterAddPipeline;
    id<MTLComputePipelineState> addArraysPipeline;
    id<MTLComputePipelineState> addScalarPipeline;
    id<MTLComputePipelineState> multiplyArraysPipeline;
    id<MTLComputePipelineState> multiplyScalarPipeline;
    id<MTLComputePipelineState> squarePipeline;
    id<MTLComputePipelineState> greaterEqualPipeline;
    id<MTLComputePipelineState> whereSelectPipeline;
    id<MTLComputePipelineState> whereScalarPipeline;
    id<MTLComputePipelineState> gatherPipeline;
    id<MTLComputePipelineState> gatherBoolPipeline;
    id<MTLComputePipelineState> fillFloatPipeline;
    // NOTE(review): fillZerosPipeline and sumReducePipeline are created by
    // sparse_init() but not dispatched by any function visible in this file
    // (sparse_zeros uses memset; sparse_sum reduces on the CPU).
    id<MTLComputePipelineState> fillZerosPipeline;
    id<MTLComputePipelineState> sumReducePipeline;

    char deviceName[256];  // NUL-terminated UTF-8 copy of device.name
};
38
+
39
// A GPU-resident array.  `buffer` uses MTLResourceStorageModeShared, so
// its contents are directly CPU-addressable after each (synchronous)
// operation completes.
struct SparseBuffer {
    id<MTLBuffer> buffer;   // element storage in shared CPU/GPU memory
    uint32_t count;         // number of elements (not bytes)
    SparseDataType dtype;   // SPARSE_FLOAT32 or SPARSE_UINT32
    SparseContextRef ctx;   // owning context; must outlive this buffer
};
45
+
46
+ // ============================================================================
47
+ // HELPER FUNCTIONS
48
+ // ============================================================================
49
+
50
// Compiles one compute pipeline for the named kernel in ctx->library.
// Returns nil (after logging) if the function is missing or pipeline
// creation fails.
static id<MTLComputePipelineState> createPipeline(SparseContextRef ctx, NSString* functionName) {
    NSError* error = nil;
    id<MTLFunction> function = [ctx->library newFunctionWithName:functionName];
    if (!function) {
        NSLog(@"Failed to find function: %@", functionName);
        return nil;
    }

    // Check the returned pipeline, not `error`: per Cocoa convention the
    // error out-parameter is only meaningful when the call actually fails.
    id<MTLComputePipelineState> pipeline =
        [ctx->device newComputePipelineStateWithFunction:function error:&error];
    if (!pipeline) {
        NSLog(@"Failed to create pipeline for %@: %@", functionName, error);
        return nil;
    }

    return pipeline;
}
66
+
67
// Dispatches `pipeline` over `count` threads and blocks until completion.
//
// NOTE(review): this helper creates its own encoder and binds NO buffers
// or arguments, so it can only drive kernels that take no bindings.  No
// function visible in this file calls it — every operation inlines its
// own dispatch instead.  Candidate for removal or for a redesign that
// accepts a binding callback; confirm against the rest of the project.
static void dispatchCompute(SparseContextRef ctx, id<MTLComputePipelineState> pipeline, uint32_t count) {
    id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
    id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

    [encoder setComputePipelineState:pipeline];

    // Cap the threadgroup width at 256 (and at the pipeline's own limit).
    NSUInteger threadGroupSize = pipeline.maxTotalThreadsPerThreadgroup;
    if (threadGroupSize > 256) threadGroupSize = 256;

    MTLSize gridSize = MTLSizeMake(count, 1, 1);
    MTLSize groupSize = MTLSizeMake(threadGroupSize, 1, 1);

    // dispatchThreads handles a grid that is not a multiple of the
    // threadgroup size (requires non-uniform threadgroup support).
    [encoder dispatchThreads:gridSize threadsPerThreadgroup:groupSize];
    [encoder endEncoding];

    // Synchronous: submit and wait, like every other op in this file.
    [commandBuffer commit];
    [commandBuffer waitUntilCompleted];
}
85
+
86
+ // ============================================================================
87
+ // CONTEXT MANAGEMENT
88
+ // ============================================================================
89
+
90
// Creates and fully initializes a Metal compute context, or returns NULL.
//
// The shader source is loaded from the main bundle's sparse.metal, then —
// as fallbacks — from ./native/sparse.metal and ./sparse.metal relative
// to the current working directory.  All kernels are compiled eagerly and
// verified, so every later dispatch may assume its pipeline is non-nil.
SparseContextRef sparse_init(void) {
    @autoreleasepool {
        SparseContextRef ctx = (SparseContextRef)calloc(1, sizeof(struct SparseContext));
        if (!ctx) return NULL;

        ctx->device = MTLCreateSystemDefaultDevice();
        if (!ctx->device) {
            NSLog(@"Metal is not supported on this device");
            sparse_cleanup(ctx);
            return NULL;
        }

        // Store device name.  calloc zeroed the array, so copying at most
        // sizeof-1 bytes always leaves it NUL-terminated.
        const char* name = [ctx->device.name UTF8String];
        if (name) {
            strncpy(ctx->deviceName, name, sizeof(ctx->deviceName) - 1);
        }

        ctx->commandQueue = [ctx->device newCommandQueue];
        if (!ctx->commandQueue) {
            NSLog(@"Failed to create Metal command queue");
            sparse_cleanup(ctx);
            return NULL;
        }

        // Locate the shader source (bundle first, then cwd fallbacks).
        NSString* shaderPath = [[NSBundle mainBundle] pathForResource:@"sparse" ofType:@"metal"];
        NSString* shaderSource = nil;

        if (shaderPath) {
            shaderSource = [NSString stringWithContentsOfFile:shaderPath encoding:NSUTF8StringEncoding error:nil];
        }

        if (!shaderSource) {
            NSString* currentDir = [[NSFileManager defaultManager] currentDirectoryPath];
            NSString* localPath = [currentDir stringByAppendingPathComponent:@"native/sparse.metal"];
            shaderSource = [NSString stringWithContentsOfFile:localPath encoding:NSUTF8StringEncoding error:nil];

            if (!shaderSource) {
                localPath = [currentDir stringByAppendingPathComponent:@"sparse.metal"];
                shaderSource = [NSString stringWithContentsOfFile:localPath encoding:NSUTF8StringEncoding error:nil];
            }
        }

        if (!shaderSource) {
            NSLog(@"Could not load sparse.metal shader file");
            sparse_cleanup(ctx);
            return NULL;
        }

        NSError* error = nil;
        MTLCompileOptions* options = [[MTLCompileOptions alloc] init];
        ctx->library = [ctx->device newLibraryWithSource:shaderSource options:options error:&error];

        // Check the result, not `error` — the error object is only
        // meaningful when the returned library is nil.
        if (!ctx->library) {
            NSLog(@"Failed to compile Metal shaders: %@", error);
            sparse_cleanup(ctx);
            return NULL;
        }

        // Create all compute pipelines.
        ctx->scatterAddPipeline = createPipeline(ctx, @"scatter_add");
        ctx->addArraysPipeline = createPipeline(ctx, @"add_arrays");
        ctx->addScalarPipeline = createPipeline(ctx, @"add_scalar");
        ctx->multiplyArraysPipeline = createPipeline(ctx, @"multiply_arrays");
        ctx->multiplyScalarPipeline = createPipeline(ctx, @"multiply_scalar");
        ctx->squarePipeline = createPipeline(ctx, @"square");
        ctx->greaterEqualPipeline = createPipeline(ctx, @"greater_equal");
        ctx->whereSelectPipeline = createPipeline(ctx, @"where_select");
        ctx->whereScalarPipeline = createPipeline(ctx, @"where_scalar");
        ctx->gatherPipeline = createPipeline(ctx, @"gather");
        ctx->gatherBoolPipeline = createPipeline(ctx, @"gather_bool");
        ctx->fillFloatPipeline = createPipeline(ctx, @"fill_float");
        ctx->fillZerosPipeline = createPipeline(ctx, @"fill_zeros");
        ctx->sumReducePipeline = createPipeline(ctx, @"sum_reduce");

        // Fail initialization if any kernel is missing: dispatching with a
        // nil pipeline state later would crash the encoder.
        if (!ctx->scatterAddPipeline || !ctx->addArraysPipeline ||
            !ctx->addScalarPipeline || !ctx->multiplyArraysPipeline ||
            !ctx->multiplyScalarPipeline || !ctx->squarePipeline ||
            !ctx->greaterEqualPipeline || !ctx->whereSelectPipeline ||
            !ctx->whereScalarPipeline || !ctx->gatherPipeline ||
            !ctx->gatherBoolPipeline || !ctx->fillFloatPipeline ||
            !ctx->fillZerosPipeline || !ctx->sumReducePipeline) {
            NSLog(@"One or more sparse kernels failed to compile");
            sparse_cleanup(ctx);
            return NULL;
        }

        return ctx;
    }
}
166
+
167
// Releases a context created by sparse_init().
//
// The struct is plain calloc'd memory, so free() alone does NOT release
// the __strong Objective-C fields inside it — ARC only destroys struct
// fields whose lifetime it can see, never memory handed to free().
// Explicitly nil each object field first so the Metal objects are
// released, then free the struct.  (Assumes this file is compiled with
// ARC, as the absence of retain/release elsewhere suggests — confirm.)
void sparse_cleanup(SparseContextRef ctx) {
    if (!ctx) return;

    ctx->device = nil;
    ctx->commandQueue = nil;
    ctx->library = nil;
    ctx->scatterAddPipeline = nil;
    ctx->addArraysPipeline = nil;
    ctx->addScalarPipeline = nil;
    ctx->multiplyArraysPipeline = nil;
    ctx->multiplyScalarPipeline = nil;
    ctx->squarePipeline = nil;
    ctx->greaterEqualPipeline = nil;
    ctx->whereSelectPipeline = nil;
    ctx->whereScalarPipeline = nil;
    ctx->gatherPipeline = nil;
    ctx->gatherBoolPipeline = nil;
    ctx->fillFloatPipeline = nil;
    ctx->fillZerosPipeline = nil;
    ctx->sumReducePipeline = nil;

    free(ctx);
}
172
+
173
// Blocks until all previously committed GPU work has drained, by pushing
// an empty command buffer through the queue and waiting on it.
void sparse_sync(SparseContextRef ctx) {
    if (!ctx) return;
    @autoreleasepool {
        id<MTLCommandBuffer> fence = [ctx->commandQueue commandBuffer];
        [fence commit];
        [fence waitUntilCompleted];
    }
}
180
+
181
// Human-readable name of the Metal device, or "Unknown" for a NULL context.
// The returned pointer stays valid for the lifetime of the context.
const char* sparse_device_name(SparseContextRef ctx) {
    if (!ctx) return "Unknown";
    return ctx->deviceName;
}
184
+
185
// Recommended maximum working-set size of the device in bytes (0 for NULL).
uint64_t sparse_device_memory(SparseContextRef ctx) {
    return ctx ? ctx->device.recommendedMaxWorkingSetSize : 0;
}
189
+
190
+ // ============================================================================
191
+ // BUFFER MANAGEMENT
192
+ // ============================================================================
193
+
194
// Allocates a zero-filled buffer of `count` elements, or NULL on failure.
// Element size is 4 bytes for both supported dtypes.
SparseBufferRef sparse_zeros(SparseContextRef ctx, uint32_t count, SparseDataType dtype) {
    if (!ctx || count == 0) return NULL;

    size_t elementSize = (dtype == SPARSE_FLOAT32) ? sizeof(float) : sizeof(uint32_t);
    size_t bufferSize = (size_t)count * elementSize;

    SparseBufferRef buf = (SparseBufferRef)calloc(1, sizeof(struct SparseBuffer));
    if (!buf) return NULL;

    buf->buffer = [ctx->device newBufferWithLength:bufferSize options:MTLResourceStorageModeShared];
    if (!buf->buffer) {
        // Metal allocation failed (e.g. out of memory) — don't hand back a
        // struct whose contents pointer would be nil.
        free(buf);
        return NULL;
    }
    buf->count = count;
    buf->dtype = dtype;
    buf->ctx = ctx;

    // Shared-mode contents are CPU-visible; zero them explicitly since a
    // fresh MTLBuffer's contents are not guaranteed to be zeroed.
    memset(buf->buffer.contents, 0, bufferSize);

    return buf;
}
211
+
212
// Allocates a FLOAT32 buffer of `count` elements, each set to `value`,
// using the fill_float kernel.  Returns NULL on allocation failure.
// Blocks until the GPU fill completes.
SparseBufferRef sparse_full(SparseContextRef ctx, uint32_t count, float value) {
    if (!ctx || count == 0) return NULL;

    SparseBufferRef buf = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!buf) return NULL;

    // Wrap Metal temporaries in a pool, consistent with every other op in
    // this file, so they don't accumulate in the caller's pool.
    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->fillFloatPipeline];
        [encoder setBuffer:buf->buffer offset:0 atIndex:0];
        [encoder setBytes:&value length:sizeof(float) atIndex:1];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:2];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return buf;
}
238
+
239
// Creates a FLOAT32 buffer initialized from `data` (copied at creation).
// Returns NULL on invalid arguments or allocation failure.
SparseBufferRef sparse_from_float(SparseContextRef ctx, const float* data, uint32_t count) {
    if (!ctx || !data || count == 0) return NULL;

    SparseBufferRef buf = (SparseBufferRef)calloc(1, sizeof(struct SparseBuffer));
    if (!buf) return NULL;

    buf->buffer = [ctx->device newBufferWithBytes:data
                                           length:(size_t)count * sizeof(float)
                                          options:MTLResourceStorageModeShared];
    if (!buf->buffer) {
        free(buf);
        return NULL;
    }
    buf->count = count;
    buf->dtype = SPARSE_FLOAT32;
    buf->ctx = ctx;

    return buf;
}
252
+
253
// Creates a UINT32 buffer initialized from `data` (copied at creation).
// Returns NULL on invalid arguments or allocation failure.
SparseBufferRef sparse_from_uint(SparseContextRef ctx, const uint32_t* data, uint32_t count) {
    if (!ctx || !data || count == 0) return NULL;

    SparseBufferRef buf = (SparseBufferRef)calloc(1, sizeof(struct SparseBuffer));
    if (!buf) return NULL;

    buf->buffer = [ctx->device newBufferWithBytes:data
                                           length:(size_t)count * sizeof(uint32_t)
                                          options:MTLResourceStorageModeShared];
    if (!buf->buffer) {
        free(buf);
        return NULL;
    }
    buf->count = count;
    buf->dtype = SPARSE_UINT32;
    buf->ctx = ctx;

    return buf;
}
266
+
267
// Copies up to `count` float elements out of the buffer into `out`.
// The copy is clamped to the buffer's own length.  No extra GPU sync is
// needed: every operation in this file waits for completion and buffers
// use shared storage.
void sparse_to_float(SparseBufferRef buf, float* out, uint32_t count) {
    if (!buf || !out) return;
    uint32_t n = buf->count;
    if (count < n) n = count;
    memcpy(out, buf->buffer.contents, (size_t)n * sizeof(float));
}
272
+
273
// Copies up to `count` uint32 elements out of the buffer into `out`,
// clamped to the buffer's own length.
void sparse_to_uint(SparseBufferRef buf, uint32_t* out, uint32_t count) {
    if (!buf || !out) return;
    uint32_t n = buf->count;
    if (count < n) n = count;
    memcpy(out, buf->buffer.contents, (size_t)n * sizeof(uint32_t));
}
278
+
279
// Number of elements in the buffer (0 for NULL).
uint32_t sparse_buffer_count(SparseBufferRef buf) {
    if (!buf) return 0;
    return buf->count;
}
282
+
283
// Element type of the buffer; SPARSE_FLOAT32 is the fallback for NULL.
SparseDataType sparse_buffer_dtype(SparseBufferRef buf) {
    if (!buf) return SPARSE_FLOAT32;
    return buf->dtype;
}
286
+
287
// Releases a buffer created by this library.
//
// free() does not release __strong fields inside calloc'd memory, so nil
// the MTLBuffer reference first; otherwise the GPU allocation leaks.
// (Assumes ARC compilation — confirm against the build flags.)
void sparse_buffer_free(SparseBufferRef buf) {
    if (!buf) return;
    buf->buffer = nil;  // ARC releases the MTLBuffer here
    free(buf);
}
292
+
293
+ // ============================================================================
294
+ // CORE OPERATIONS
295
+ // ============================================================================
296
+
297
// Dispatches the scatter_add kernel: accumulates values into `target` at
// the given indices (exact semantics defined in sparse.metal; presumably
// target[indices[i]] += values[i]).  Blocks until the GPU work completes.
//
// `count` is clamped to the lengths of the index and value buffers so the
// kernel cannot read past either one, even for an oversized argument.
void sparse_scatter_add(
    SparseContextRef ctx,
    SparseBufferRef target,
    SparseBufferRef indices,
    SparseBufferRef values,
    uint32_t count
) {
    if (!ctx || !target || !indices || !values || count == 0) return;
    if (count > indices->count) count = indices->count;
    if (count > values->count) count = values->count;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->scatterAddPipeline];
        [encoder setBuffer:target->buffer offset:0 atIndex:0];
        [encoder setBuffer:indices->buffer offset:0 atIndex:1];
        [encoder setBuffer:values->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }
}
326
+
327
// Dispatches the gather kernel into a new FLOAT32 buffer (presumably
// result[i] = source[indices[i]]; semantics defined in sparse.metal).
// Returns NULL on failure.  `count` is clamped to the number of available
// indices to avoid out-of-bounds reads.
SparseBufferRef sparse_gather(
    SparseContextRef ctx,
    SparseBufferRef source,
    SparseBufferRef indices,
    uint32_t count
) {
    if (!ctx || !source || !indices || count == 0) return NULL;
    if (count > indices->count) count = indices->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;  // avoid dereferencing a failed allocation

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->gatherPipeline];
        [encoder setBuffer:source->buffer offset:0 atIndex:0];
        [encoder setBuffer:indices->buffer offset:0 atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
359
+
360
// Dispatches the gather_bool kernel into a new UINT32 buffer (presumably
// result[i] = source[indices[i]]; semantics defined in sparse.metal).
// Returns NULL on failure.  `count` is clamped to the number of available
// indices to avoid out-of-bounds reads.
SparseBufferRef sparse_gather_bool(
    SparseContextRef ctx,
    SparseBufferRef source,
    SparseBufferRef indices,
    uint32_t count
) {
    if (!ctx || !source || !indices || count == 0) return NULL;
    if (count > indices->count) count = indices->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_UINT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->gatherBoolPipeline];
        [encoder setBuffer:source->buffer offset:0 atIndex:0];
        [encoder setBuffer:indices->buffer offset:0 atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
392
+
393
+ // ============================================================================
394
+ // ELEMENT-WISE OPERATIONS
395
+ // ============================================================================
396
+
397
// Element-wise addition via the add_arrays kernel.  Returns a new FLOAT32
// buffer of min(a->count, b->count) elements, or NULL on failure.
// Clamping to the shorter input prevents the kernel from reading past the
// end of `b` when the two counts disagree (previously undefined behavior).
SparseBufferRef sparse_add(SparseContextRef ctx, SparseBufferRef a, SparseBufferRef b) {
    if (!ctx || !a || !b) return NULL;
    uint32_t count = (a->count < b->count) ? a->count : b->count;
    if (count == 0) return NULL;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->addArraysPipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBuffer:b->buffer offset:0 atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
425
+
426
// Adds `scalar` to every element via the add_scalar kernel.  Returns a
// new FLOAT32 buffer, or NULL on failure (including an empty input, for
// which sparse_zeros returns NULL — previously this crashed).
SparseBufferRef sparse_add_scalar(SparseContextRef ctx, SparseBufferRef a, float scalar) {
    if (!ctx || !a) return NULL;
    uint32_t count = a->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->addScalarPipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBytes:&scalar length:sizeof(float) atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
454
+
455
// Element-wise multiplication via the multiply_arrays kernel.  Returns a
// new FLOAT32 buffer of min(a->count, b->count) elements, or NULL on
// failure.  Clamping prevents out-of-bounds reads on mismatched inputs.
SparseBufferRef sparse_multiply(SparseContextRef ctx, SparseBufferRef a, SparseBufferRef b) {
    if (!ctx || !a || !b) return NULL;
    uint32_t count = (a->count < b->count) ? a->count : b->count;
    if (count == 0) return NULL;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->multiplyArraysPipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBuffer:b->buffer offset:0 atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
483
+
484
// Multiplies every element by `scalar` via the multiply_scalar kernel.
// Returns a new FLOAT32 buffer, or NULL on failure.
SparseBufferRef sparse_multiply_scalar(SparseContextRef ctx, SparseBufferRef a, float scalar) {
    if (!ctx || !a) return NULL;
    uint32_t count = a->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->multiplyScalarPipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBytes:&scalar length:sizeof(float) atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
512
+
513
// Squares every element via the square kernel.  Returns a new FLOAT32
// buffer, or NULL on failure.
SparseBufferRef sparse_square(SparseContextRef ctx, SparseBufferRef a) {
    if (!ctx || !a) return NULL;
    uint32_t count = a->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->squarePipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBuffer:result->buffer offset:0 atIndex:1];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:2];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
540
+
541
+ // ============================================================================
542
+ // CONDITIONAL OPERATIONS
543
+ // ============================================================================
544
+
545
// Element-wise comparison via the greater_equal kernel.  Returns a new
// UINT32 mask buffer (presumably 1 where a[i] >= threshold, else 0 —
// defined by sparse.metal), or NULL on failure.
SparseBufferRef sparse_greater_equal(SparseContextRef ctx, SparseBufferRef a, float threshold) {
    if (!ctx || !a) return NULL;
    uint32_t count = a->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_UINT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->greaterEqualPipeline];
        [encoder setBuffer:a->buffer offset:0 atIndex:0];
        [encoder setBytes:&threshold length:sizeof(float) atIndex:1];
        [encoder setBuffer:result->buffer offset:0 atIndex:2];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:3];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
573
+
574
// Conditional select via the where_select kernel (presumably
// result[i] = condition[i] ? if_true[i] : if_false[i]).  Returns a new
// FLOAT32 buffer of min(condition, if_true, if_false) elements, or NULL
// on failure.  Clamping prevents out-of-bounds reads when the three
// inputs have different lengths.
SparseBufferRef sparse_where(
    SparseContextRef ctx,
    SparseBufferRef condition,
    SparseBufferRef if_true,
    SparseBufferRef if_false
) {
    if (!ctx || !condition || !if_true || !if_false) return NULL;
    uint32_t count = condition->count;
    if (count > if_true->count) count = if_true->count;
    if (count > if_false->count) count = if_false->count;
    if (count == 0) return NULL;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->whereSelectPipeline];
        [encoder setBuffer:condition->buffer offset:0 atIndex:0];
        [encoder setBuffer:if_true->buffer offset:0 atIndex:1];
        [encoder setBuffer:if_false->buffer offset:0 atIndex:2];
        [encoder setBuffer:result->buffer offset:0 atIndex:3];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:4];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
608
+
609
// Conditional select with scalar branches via the where_scalar kernel
// (presumably result[i] = condition[i] ? if_true : if_false).  Returns a
// new FLOAT32 buffer, or NULL on failure.
SparseBufferRef sparse_where_scalar(
    SparseContextRef ctx,
    SparseBufferRef condition,
    float if_true,
    float if_false
) {
    if (!ctx || !condition) return NULL;
    uint32_t count = condition->count;

    SparseBufferRef result = sparse_zeros(ctx, count, SPARSE_FLOAT32);
    if (!result) return NULL;

    @autoreleasepool {
        id<MTLCommandBuffer> commandBuffer = [ctx->commandQueue commandBuffer];
        id<MTLComputeCommandEncoder> encoder = [commandBuffer computeCommandEncoder];

        [encoder setComputePipelineState:ctx->whereScalarPipeline];
        [encoder setBuffer:condition->buffer offset:0 atIndex:0];
        [encoder setBytes:&if_true length:sizeof(float) atIndex:1];
        [encoder setBytes:&if_false length:sizeof(float) atIndex:2];
        [encoder setBuffer:result->buffer offset:0 atIndex:3];
        [encoder setBytes:&count length:sizeof(uint32_t) atIndex:4];

        [encoder dispatchThreads:MTLSizeMake(count, 1, 1)
           threadsPerThreadgroup:MTLSizeMake(256, 1, 1)];
        [encoder endEncoding];
        [commandBuffer commit];
        [commandBuffer waitUntilCompleted];
    }

    return result;
}
643
+
644
+ // ============================================================================
645
+ // REDUCTION OPERATIONS
646
+ // ============================================================================
647
+
648
// Sums all elements of a FLOAT32 buffer on the CPU.
//
// Buffers use shared storage and every GPU op here already waits for
// completion, so the contents can be read in place — the previous
// unchecked malloc + full copy was unnecessary.
// TODO: use the sum_reduce kernel for large arrays.
float sparse_sum(SparseContextRef ctx, SparseBufferRef a) {
    if (!ctx || !a) return 0.0f;

    const float* data = (const float*)a->buffer.contents;
    float sum = 0.0f;
    for (uint32_t i = 0; i < a->count; i++) {
        sum += data[i];
    }
    return sum;
}
664
+
665
// Sums all elements of a UINT32 (mask) buffer on the CPU, reading the
// shared-storage contents in place instead of via an unchecked
// malloc + copy.  Commonly used to count nonzero mask entries.
uint32_t sparse_sum_bool(SparseContextRef ctx, SparseBufferRef a) {
    if (!ctx || !a) return 0;

    const uint32_t* data = (const uint32_t*)a->buffer.contents;
    uint32_t sum = 0;
    for (uint32_t i = 0; i < a->count; i++) {
        sum += data[i];
    }
    return sum;
}
679
+
680
+ // ============================================================================
681
+ // RANDOM OPERATIONS
682
+ // ============================================================================
683
+
684
// Fills a new FLOAT32 buffer with uniform random values in [low, high].
// Values are generated on the CPU with arc4random (GPU-side RNG is not
// implemented).  Returns NULL on allocation failure.
SparseBufferRef sparse_random_uniform(
    SparseContextRef ctx,
    uint32_t count,
    float low,
    float high
) {
    if (!ctx || count == 0) return NULL;

    float* data = (float*)malloc((size_t)count * sizeof(float));
    if (!data) return NULL;

    float range = high - low;
    for (uint32_t i = 0; i < count; i++) {
        data[i] = low + ((float)arc4random() / (float)UINT32_MAX) * range;
    }

    SparseBufferRef result = sparse_from_float(ctx, data, count);
    free(data);

    return result;
}
705
+
706
// Fills a new FLOAT32 buffer with normally distributed values
// (Box-Muller transform on the CPU).  Returns NULL on allocation failure.
SparseBufferRef sparse_random_normal(
    SparseContextRef ctx,
    uint32_t count,
    float mean,
    float std
) {
    if (!ctx || count == 0) return NULL;

    float* data = (float*)malloc((size_t)count * sizeof(float));
    if (!data) return NULL;

    // Each Box-Muller step yields two independent normals from two
    // uniforms; the +1 bias keeps u1 strictly positive so logf is finite.
    for (uint32_t i = 0; i < count; i += 2) {
        float u1 = ((float)arc4random() + 1.0f) / ((float)UINT32_MAX + 1.0f);
        float u2 = ((float)arc4random() + 1.0f) / ((float)UINT32_MAX + 1.0f);

        float z0 = sqrtf(-2.0f * logf(u1)) * cosf(2.0f * M_PI * u2);
        float z1 = sqrtf(-2.0f * logf(u1)) * sinf(2.0f * M_PI * u2);

        data[i] = mean + z0 * std;
        if (i + 1 < count) {   // odd count: discard the second normal
            data[i + 1] = mean + z1 * std;
        }
    }

    SparseBufferRef result = sparse_from_float(ctx, data, count);
    free(data);

    return result;
}