npm - @lgrammel/ds4-provider - Versions diffs - 0.0.1 - Mend

@lgrammel/ds4-provider 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46) hide show

package/README.md +96 -0
package/binding.gyp +75 -0
package/dist/ds4-language-model.d.ts +71 -0
package/dist/ds4-language-model.d.ts.map +1 -0
package/dist/ds4-language-model.js +888 -0
package/dist/ds4-language-model.js.map +1 -0
package/dist/ds4-provider.d.ts +13 -0
package/dist/ds4-provider.d.ts.map +1 -0
package/dist/ds4-provider.js +20 -0
package/dist/ds4-provider.js.map +1 -0
package/dist/index.d.ts +4 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +4 -0
package/dist/index.js.map +1 -0
package/dist/native-binding.d.ts +42 -0
package/dist/native-binding.d.ts.map +1 -0
package/dist/native-binding.js +157 -0
package/dist/native-binding.js.map +1 -0
package/ds4/LICENSE +22 -0
package/ds4/ds4.c +18268 -0
package/ds4/ds4.h +196 -0
package/ds4/ds4_gpu.h +804 -0
package/ds4/ds4_metal.m +14657 -0
package/ds4/metal/argsort.metal +266 -0
package/ds4/metal/bin.metal +192 -0
package/ds4/metal/concat.metal +62 -0
package/ds4/metal/cpy.metal +57 -0
package/ds4/metal/dense.metal +1121 -0
package/ds4/metal/dsv4_hc.metal +861 -0
package/ds4/metal/dsv4_kv.metal +227 -0
package/ds4/metal/dsv4_misc.metal +1088 -0
package/ds4/metal/dsv4_rope.metal +155 -0
package/ds4/metal/flash_attn.metal +1426 -0
package/ds4/metal/get_rows.metal +54 -0
package/ds4/metal/glu.metal +36 -0
package/ds4/metal/moe.metal +1737 -0
package/ds4/metal/norm.metal +153 -0
package/ds4/metal/repeat.metal +52 -0
package/ds4/metal/set_rows.metal +55 -0
package/ds4/metal/softmax.metal +241 -0
package/ds4/metal/sum_rows.metal +102 -0
package/ds4/metal/unary.metal +312 -0
package/native/binding.cpp +621 -0
package/package.json +66 -0
package/scripts/postinstall.cjs +13 -0
package/scripts/vendor-ds4.cjs +67 -0

package/ds4/metal/dsv4_kv.metal ADDED Viewed

@@ -0,0 +1,227 @@
+// Power-of-two scale for each 4-bit E4M3FN exponent field, indexed by the raw
+// exponent bits. Entries 1..15 hold 2^(e - 7) (0.015625 = 2^-6 up to 256 = 2^8).
+// Entry 0 is a placeholder: dsv4_e4m3fn_value() handles exponent 0 through its
+// separate subnormal branch and never multiplies by this slot.
+constant float dsv4_e4m3fn_exp_scale[16] = {
+    0.0f, 0.015625f, 0.03125f, 0.0625f,
+    0.125f, 0.25f, 0.5f, 1.0f,
+    2.0f, 4.0f, 8.0f, 16.0f,
+    32.0f, 64.0f, 128.0f, 256.0f,
+};
+// Arguments for kernel_dsv4_fp8_kv_quantize_f32.
+// NOTE(review): presumably mirrored by a host-side struct with the same field
+// order and types; do not reorder without confirming the encoder side.
+struct ds4_metal_args_dsv4_fp8_kv_quantize {
+    // Elements per row; the kernel treats the first (ne00 - n_rot) as the
+    // quantized part and the rest as pass-through.
+    int64_t ne00;
+    // Extents of the three outer dimensions; ne01 * ne02 * ne03 is the number
+    // of rows (one threadgroup each).
+    int64_t ne01;
+    int64_t ne02;
+    int64_t ne03;
+    // Source strides in bytes: nb00 between consecutive elements of a row,
+    // nb01/nb02/nb03 between consecutive indices of the outer dimensions.
+    ulong nb00;
+    ulong nb01;
+    ulong nb02;
+    ulong nb03;
+    // Destination strides in bytes, same meaning as the source strides.
+    ulong nb0;
+    ulong nb1;
+    ulong nb2;
+    ulong nb3;
+    // Number of trailing elements per row that are copied without quantization.
+    int n_rot;
+};
+// Arguments for kernel_dsv4_kv_fp8_store_f32.
+struct ds4_metal_args_dsv4_kv_fp8_store {
+    // Number of floats in the KV row being finalized.
+    int32_t head_dim;
+    // Number of trailing floats that skip the FP8 round-trip and are only
+    // rounded through half when written to the raw cache.
+    int32_t n_rot;
+    // Destination row index in raw_cache; the row starts at raw_row * head_dim.
+    int32_t raw_row;
+};
+// Arguments for kernel_dsv4_ratio4_shift_f32.
+struct ds4_metal_args_dsv4_ratio4_shift {
+    // Floats per state row; the kernel moves 4 * width floats in each tensor.
+    uint32_t width;
+};
+// Arguments for kernel_dsv4_compressor_store_one.
+struct ds4_metal_args_dsv4_compressor_store_one {
+    // Floats per state row; also the number of threads that do work.
+    uint32_t width;
+    // Compression ratio; pos % ratio selects the row, and ratio == 4 adds an
+    // extra offset of `ratio` rows to the destination.
+    uint32_t ratio;
+    // Token position; only pos % ratio is used by the kernel.
+    uint32_t pos;
+    // Element type of the `ape` buffer: 1 means half, any other value float.
+    uint32_t ape_type;
+};
+// Decodes the magnitude of an E4M3FN code: bits 3..6 of `i` are the exponent
+// field and bits 0..2 the mantissa. Higher bits (including a sign bit) are
+// ignored, so the result is always non-negative.
+static inline float dsv4_e4m3fn_value(int i) {
+    const int exp_bits  = (i >> 3) & 0x0f;
+    const int frac_bits = i & 0x07;
+    if (exp_bits == 0) {
+        // Subnormal range: mantissa * 2^-9, with no implicit leading one.
+        return float(frac_bits) * 0.001953125f;
+    }
+    // Normal range: (1 + mantissa / 8) scaled by the exponent's power of two.
+    const float significand = 1.0f + float(frac_bits) * 0.125f;
+    return significand * dsv4_e4m3fn_exp_scale[exp_bits];
+}
+// Rounds `x` to the nearest value representable by the E4M3FN codes 0..126
+// (magnitude capped at 448) and returns that value as a float with the sign of
+// `x`. Exact ties go to the code whose low bit is zero.
+static inline float dsv4_e4m3fn_dequant(float x) {
+    const float sign = x < 0.0f ? -1.0f : 1.0f;
+    const float magnitude = min(abs(x), 448.0f);
+
+    // Binary search for the largest code whose value does not exceed magnitude.
+    int floor_code = 0;
+    int ceil_code = 126;
+    while (floor_code < ceil_code) {
+        const int probe = (floor_code + ceil_code + 1) >> 1;
+        if (dsv4_e4m3fn_value(probe) <= magnitude) {
+            floor_code = probe;
+        } else {
+            ceil_code = probe - 1;
+        }
+    }
+
+    // Decide between that code and its upper neighbour.
+    int code = floor_code;
+    if (code < 126) {
+        const float err_below = abs(magnitude - dsv4_e4m3fn_value(code));
+        const float err_above = abs(magnitude - dsv4_e4m3fn_value(code + 1));
+        const bool above_is_closer = err_above < err_below;
+        // On a tie, move up only when the lower code is odd, i.e. the upper
+        // neighbour is the even one.
+        const bool tie_prefers_above = err_above == err_below && (code & 1) != 0;
+        if (above_is_closer || tie_prefers_above) {
+            code += 1;
+        }
+    }
+    return sign * dsv4_e4m3fn_value(code);
+}
+// Quantizes the non-RoPE part of a KV row through E4M3FN and writes the
+// dequantized value back as float. DS4 uses this to match the FP8 KV-cache
+// semantics while keeping the Metal graph's cache buffers float-addressable.
+//
+// Layout as used by the code:
+//   - one threadgroup per row (row = threadgroup_position_in_grid); rows at or
+//     beyond ne01 * ne02 * ne03 exit immediately;
+//   - only threads 0..63 touch memory in the quantized part, and `scratch`
+//     must hold at least 64 floats;
+//   - the first (ne00 - n_rot) elements are processed in blocks of 64, each
+//     block sharing one power-of-two scale derived from its absolute maximum;
+//   - the trailing n_rot elements are copied from src0 to dst unchanged.
+// src0 and dst are addressed with the byte strides in `args`.
+kernel void kernel_dsv4_fp8_kv_quantize_f32(
+        constant ds4_metal_args_dsv4_fp8_kv_quantize & args,
+        device  const char * src0,
+        device        char * dst,
+        threadgroup  float * scratch [[threadgroup(0)]],
+        uint row [[threadgroup_position_in_grid]],
+        uint tid [[thread_position_in_threadgroup]]) {
+    const int64_t n_rows = args.ne01 * args.ne02 * args.ne03;
+    if ((int64_t) row >= n_rows) {
+        return;
+    }
+    const int64_t n_nope = args.ne00 - args.n_rot;
+    // Reject inconsistent sizes before any barrier. Both conditions are
+    // uniform across the threadgroup. A negative n_rot would make the block
+    // loop run past the row; a negative n_nope would start the pass-through
+    // copy at a negative element index.
+    if (args.n_rot < 0 || n_nope < 0) {
+        return;
+    }
+    const int64_t i1 = row % args.ne01;
+    const int64_t i2 = (row / args.ne01) % args.ne02;
+    const int64_t i3 = row / (args.ne01 * args.ne02);
+    device const char * src_base = src0 + i1*args.nb01 + i2*args.nb02 + i3*args.nb03;
+    device       char * dst_base = dst  + i1*args.nb1  + i2*args.nb2  + i3*args.nb3;
+    const bool lane = tid < 64;
+    for (int64_t off = 0; off < n_nope; off += 64) {
+        // A lane is active only while its element is inside the non-RoPE
+        // prefix. Without this, a tail block (n_nope not a multiple of 64)
+        // would fold RoPE values into amax and overwrite them with quantized
+        // data.
+        const bool active = lane && off + (int64_t) tid < n_nope;
+        float v = 0.0f;
+        if (active) {
+            v = *((device const float *) (src_base + (off + tid)*args.nb00));
+            scratch[tid] = abs(v);
+        } else if (lane) {
+            // Neutral element for the max reduction.
+            scratch[tid] = 0.0f;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        // Tree reduction of the 64 absolute values into scratch[0].
+        for (uint stride = 32; stride > 0; stride >>= 1) {
+            if (tid < stride) {
+                scratch[tid] = max(scratch[tid], scratch[tid + stride]);
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }
+        // Floor amax so an all-zero block still yields a finite scale, then
+        // pick the smallest power of two that maps amax into [-448, 448].
+        const float amax = max(scratch[0], 1.0e-4f);
+        const float scale = exp2(ceil(log2(amax / 448.0f)));
+        if (active) {
+            const float q = dsv4_e4m3fn_dequant(clamp(v / scale, -448.0f, 448.0f)) * scale;
+            *((device float *) (dst_base + (off + tid)*args.nb0)) = q;
+        }
+        // Keep scratch[0] stable until every thread has read it.
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    // Pass the trailing n_rot elements through untouched.
+    for (int64_t i = n_nope + tid; i < args.ne00; i += 64) {
+        *((device float *) (dst_base + i*args.nb0)) = *((device const float *) (src_base + i*args.nb00));
+    }
+}
+// Decode-side KV finalizer after RoPE. The normal RoPE kernel intentionally
+// remains separate because tiny trigonometric codegen changes can flip later
+// sampled tokens. This kernel only fuses the FP8 round-trip for the non-RoPE
+// prefix with the F16-rounded raw-cache row used by FlashAttention.
+//
+// Behavior as written:
+//   - operates on a single row of head_dim floats at `kv`, updated in place;
+//   - the first (head_dim - n_rot) floats go through a per-64-element E4M3FN
+//     round-trip and are stored to both `kv` and the raw-cache row;
+//   - the trailing n_rot floats are only rounded through half into the raw
+//     cache row and left unchanged in `kv`;
+//   - only threads 0..63 access memory, and `scratch` must hold at least 64
+//     floats. Extra threads still execute every barrier so the barriers are
+//     reached by the whole threadgroup.
+kernel void kernel_dsv4_kv_fp8_store_f32(
+        constant ds4_metal_args_dsv4_kv_fp8_store & args,
+        device        float * kv,
+        device        float * raw_cache,
+        threadgroup   float * scratch [[threadgroup(0)]],
+        uint tid [[thread_position_in_threadgroup]]) {
+    const int head_dim = args.head_dim;
+    const int n_rot = args.n_rot;
+    const int n_nope = head_dim - n_rot;
+    // Argument validation only: these conditions are identical for every
+    // thread, so returning here cannot split the threadgroup around a barrier.
+    if (head_dim <= 0 || n_rot < 0 || n_nope < 0) {
+        return;
+    }
+    // Threads beyond the first 64 do no memory work but must not return early,
+    // otherwise the barriers below would sit in divergent control flow.
+    const bool lane = tid < 64;
+    device float * raw = raw_cache + (int64_t)args.raw_row * head_dim;
+    for (int off = 0; off < n_nope; off += 64) {
+        const bool active = lane && off + (int)tid < n_nope;
+        float v = 0.0f;
+        if (active) {
+            v = kv[off + tid];
+            scratch[tid] = abs(v);
+        } else if (lane) {
+            // Neutral element for the max reduction in a partial tail block.
+            scratch[tid] = 0.0f;
+        }
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+        // Tree reduction of the 64 absolute values into scratch[0].
+        for (uint stride = 32; stride > 0; stride >>= 1) {
+            if (tid < stride) {
+                scratch[tid] = max(scratch[tid], scratch[tid + stride]);
+            }
+            threadgroup_barrier(mem_flags::mem_threadgroup);
+        }
+        // Floor amax so an all-zero block still yields a finite scale, then
+        // pick the smallest power of two that maps amax into [-448, 448].
+        const float amax = max(scratch[0], 1.0e-4f);
+        const float fp8_scale = exp2(ceil(log2(amax / 448.0f)));
+        if (active) {
+            const float q = dsv4_e4m3fn_dequant(clamp(v / fp8_scale, -448.0f, 448.0f)) * fp8_scale;
+            kv[off + tid] = q;
+            raw[off + tid] = (float)((half)q);
+        }
+        // Keep scratch[0] stable until every thread has read it.
+        threadgroup_barrier(mem_flags::mem_threadgroup);
+    }
+    // RoPE tail: no FP8 round-trip, just the half rounding for the raw cache.
+    if (lane) {
+        for (int i = n_nope + (int)tid; i < head_dim; i += 64) {
+            raw[i] = (float)((half)kv[i]);
+        }
+    }
+}
+// Ratio-4 compression keeps two 4-row halves of recurrent state. Once a
+// compressed row has been emitted, the second half becomes the previous half
+// of the next window. The old encoder expressed this as four generic copies;
+// this DS4-specific kernel moves the KV and score halves in one dispatch.
+//
+// One thread per float: thread `gid` copies element (4 * width + gid) onto
+// element `gid` in both tensors; threads at or beyond 4 * width do nothing.
+kernel void kernel_dsv4_ratio4_shift_f32(
+        constant ds4_metal_args_dsv4_ratio4_shift & args,
+        device float * state_kv,
+        device float * state_score,
+        uint gid [[thread_position_in_grid]]) {
+    const uint half_elems = 4u * args.width;
+    if (gid < half_elems) {
+        const uint src = half_elems + gid;
+        state_kv[gid] = state_kv[src];
+        state_score[gid] = state_score[src];
+    }
+}
+// One-token compressor frontier update. Decode appends exactly one projected KV
+// row and one score row into a small recurrent state. The generic batch helper
+// expresses this as APE copy, score add, and two set_rows operations; this
+// kernel writes both state tensors directly while keeping the same
+// score + APE arithmetic.
+//
+// One thread per column: thread `gid` handles column `gid` of the row chosen
+// by pos % ratio. When ratio == 4 the destination row is shifted by `ratio`
+// rows. `ape` is read as half when ape_type == 1 and as float otherwise.
+kernel void kernel_dsv4_compressor_store_one(
+        constant ds4_metal_args_dsv4_compressor_store_one & args,
+        device const float * kv,
+        device const float * score,
+        device const char  * ape,
+        device       float * state_kv,
+        device       float * state_score,
+        uint gid [[thread_position_in_grid]]) {
+    // Nothing to do for degenerate sizes or for threads past the row width.
+    if (args.width == 0 || args.ratio == 0 || gid >= args.width) {
+        return;
+    }
+    const uint slot = args.pos % args.ratio;
+    uint target_row = slot;
+    if (args.ratio == 4u) {
+        target_row += args.ratio;
+    }
+    const uint out_idx = target_row * args.width + gid;
+    const uint ape_idx = slot * args.width + gid;
+    const float bias = (args.ape_type == 1u)
+        ? (float)(((device const half *)ape)[ape_idx])
+        : ((device const float *)ape)[ape_idx];
+    state_kv[out_idx] = kv[gid];
+    state_score[out_idx] = score[gid] + bias;
+}