npm - @simulatte/doppler - Versions diffs - 0.1.4 → 0.1.6 - Mend

@simulatte/doppler 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (199)

package/README.md +26 -10
package/package.json +30 -6
package/src/client/doppler-api.browser.d.ts +1 -0
package/src/client/doppler-api.browser.js +288 -0
package/src/client/doppler-api.js +1 -1
package/src/client/doppler-provider/types.js +1 -1
package/src/config/execution-contract-check.d.ts +33 -0
package/src/config/execution-contract-check.js +72 -0
package/src/config/execution-v0-contract-check.d.ts +94 -0
package/src/config/execution-v0-contract-check.js +251 -0
package/src/config/execution-v0-graph-contract-check.d.ts +20 -0
package/src/config/execution-v0-graph-contract-check.js +64 -0
package/src/config/kernel-path-contract-check.d.ts +76 -0
package/src/config/kernel-path-contract-check.js +479 -0
package/src/config/kernel-path-loader.d.ts +16 -0
package/src/config/kernel-path-loader.js +54 -0
package/src/config/kernels/kernel-ref-digests.js +39 -27
package/src/config/kernels/registry.json +598 -2
package/src/config/loader.js +81 -48
package/src/config/merge-contract-check.d.ts +16 -0
package/src/config/merge-contract-check.js +321 -0
package/src/config/merge-helpers.d.ts +58 -0
package/src/config/merge-helpers.js +54 -0
package/src/config/merge.js +21 -6
package/src/config/presets/models/janus-text.json +2 -0
package/src/config/presets/models/qwen3.json +9 -2
package/src/config/presets/models/transformer.json +5 -0
package/src/config/quantization-contract-check.d.ts +12 -0
package/src/config/quantization-contract-check.js +91 -0
package/src/config/required-inference-fields-contract-check.d.ts +24 -0
package/src/config/required-inference-fields-contract-check.js +237 -0
package/src/config/schema/browser-suite-metrics.schema.d.ts +17 -0
package/src/config/schema/browser-suite-metrics.schema.js +46 -0
package/src/config/schema/conversion-report.schema.d.ts +40 -0
package/src/config/schema/conversion-report.schema.js +108 -0
package/src/config/schema/doppler.schema.js +12 -18
package/src/config/schema/index.d.ts +22 -0
package/src/config/schema/index.js +18 -0
package/src/config/schema/inference-defaults.schema.js +3 -0
package/src/config/schema/inference.schema.d.ts +9 -0
package/src/config/schema/kernel-path.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.js +3 -0
package/src/converter/core.d.ts +10 -0
package/src/converter/core.js +27 -2
package/src/converter/parsers/diffusion.js +63 -3
package/src/converter/rope-config.js +42 -0
package/src/gpu/device.js +58 -0
package/src/gpu/kernels/attention.js +98 -0
package/src/gpu/kernels/bias_add.wgsl +8 -6
package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
package/src/gpu/kernels/conv2d.js +1 -1
package/src/gpu/kernels/conv2d.wgsl +7 -8
package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
package/src/gpu/kernels/depthwise_conv2d.d.ts +29 -0
package/src/gpu/kernels/depthwise_conv2d.js +99 -0
package/src/gpu/kernels/depthwise_conv2d.wgsl +55 -0
package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +59 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.d.ts +27 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.js +93 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +44 -0
package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +48 -0
package/src/gpu/kernels/index.d.ts +30 -0
package/src/gpu/kernels/index.js +25 -0
package/src/gpu/kernels/matmul.js +25 -0
package/src/gpu/kernels/pixel_shuffle.js +1 -1
package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
package/src/gpu/kernels/relu.d.ts +18 -0
package/src/gpu/kernels/relu.js +58 -0
package/src/gpu/kernels/relu.wgsl +22 -0
package/src/gpu/kernels/relu_f16.wgsl +24 -0
package/src/gpu/kernels/repeat_channels.d.ts +21 -0
package/src/gpu/kernels/repeat_channels.js +60 -0
package/src/gpu/kernels/repeat_channels.wgsl +28 -0
package/src/gpu/kernels/repeat_channels_f16.wgsl +30 -0
package/src/gpu/kernels/residual.js +44 -8
package/src/gpu/kernels/residual.wgsl +6 -3
package/src/gpu/kernels/residual_f16.wgsl +2 -1
package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
package/src/gpu/kernels/residual_vec4.wgsl +2 -1
package/src/gpu/kernels/rmsnorm.js +58 -6
package/src/gpu/kernels/rmsnorm.wgsl +14 -6
package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
package/src/gpu/kernels/rope.d.ts +2 -0
package/src/gpu/kernels/rope.js +11 -1
package/src/gpu/kernels/rope.wgsl +56 -40
package/src/gpu/kernels/sana_linear_attention.d.ts +27 -0
package/src/gpu/kernels/sana_linear_attention.js +121 -0
package/src/gpu/kernels/sana_linear_attention_apply.wgsl +43 -0
package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +46 -0
package/src/gpu/kernels/sana_linear_attention_summary.wgsl +51 -0
package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +53 -0
package/src/gpu/kernels/silu.d.ts +1 -0
package/src/gpu/kernels/silu.js +32 -14
package/src/gpu/kernels/silu.wgsl +19 -9
package/src/gpu/kernels/silu_f16.wgsl +19 -9
package/src/gpu/kernels/transpose.js +15 -2
package/src/gpu/kernels/transpose.wgsl +5 -6
package/src/gpu/kernels/upsample2d.js +2 -1
package/src/gpu/kernels/upsample2d.wgsl +6 -9
package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
package/src/gpu/kernels/utils.js +16 -1
package/src/index-browser.d.ts +1 -1
package/src/index-browser.js +2 -2
package/src/index.js +1 -1
package/src/inference/browser-harness.js +109 -23
package/src/inference/pipelines/diffusion/init.js +14 -0
package/src/inference/pipelines/diffusion/pipeline.js +215 -77
package/src/inference/pipelines/diffusion/sana-transformer.d.ts +53 -0
package/src/inference/pipelines/diffusion/sana-transformer.js +738 -0
package/src/inference/pipelines/diffusion/scheduler.d.ts +17 -1
package/src/inference/pipelines/diffusion/scheduler.js +91 -3
package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +11 -4
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +282 -0
package/src/inference/pipelines/diffusion/text-encoder.js +18 -1
package/src/inference/pipelines/diffusion/types.d.ts +4 -0
package/src/inference/pipelines/diffusion/vae.js +782 -78
package/src/inference/pipelines/text/attention/record.js +11 -2
package/src/inference/pipelines/text/attention/run.js +11 -2
package/src/inference/pipelines/text/chat-format.js +25 -1
package/src/inference/pipelines/text/config.d.ts +9 -0
package/src/inference/pipelines/text/config.js +69 -2
package/src/inference/pipelines/text/execution-plan.js +23 -31
package/src/inference/pipelines/text/execution-v0.js +43 -95
package/src/inference/pipelines/text/ffn/standard.js +3 -0
package/src/inference/pipelines/text/init.d.ts +4 -0
package/src/inference/pipelines/text/init.js +56 -9
package/src/inference/pipelines/text/layer.js +11 -0
package/src/inference/pipelines/text.js +4 -0
package/src/inference/tokenizers/bundled.js +156 -33
package/src/rules/execution-rules-contract-check.d.ts +17 -0
package/src/rules/execution-rules-contract-check.js +245 -0
package/src/rules/kernels/depthwise-conv2d.rules.json +6 -0
package/src/rules/kernels/grouped-pointwise-conv2d.rules.json +6 -0
package/src/rules/kernels/relu.rules.json +6 -0
package/src/rules/kernels/repeat-channels.rules.json +6 -0
package/src/rules/kernels/sana-linear-attention.rules.json +6 -0
package/src/rules/layer-pattern-contract-check.d.ts +17 -0
package/src/rules/layer-pattern-contract-check.js +231 -0
package/src/rules/rule-registry.d.ts +28 -0
package/src/rules/rule-registry.js +38 -0
package/src/rules/tooling/command-runtime.rules.json +18 -0
package/src/tooling/command-api.d.ts +27 -1
package/src/tooling/command-api.js +142 -3
package/src/tooling/conversion-config-materializer.d.ts +24 -0
package/src/tooling/conversion-config-materializer.js +99 -0
package/src/tooling/lean-execution-contract-runner.d.ts +43 -0
package/src/tooling/lean-execution-contract-runner.js +158 -0
package/src/tooling/node-browser-command-runner.d.ts +4 -0
package/src/tooling/node-browser-command-runner.js +58 -3
package/src/tooling/node-command-runner.js +15 -0
package/src/tooling/node-convert.d.ts +10 -0
package/src/tooling/node-converter.js +59 -0
package/src/tooling/node-webgpu.js +11 -89
package/src/training/checkpoint-watch.d.ts +7 -0
package/src/training/checkpoint-watch.js +106 -0
package/src/training/checkpoint.d.ts +6 -1
package/src/training/checkpoint.js +12 -2
package/src/training/distillation/artifacts.d.ts +71 -0
package/src/training/distillation/artifacts.js +132 -0
package/src/training/distillation/checkpoint-watch.d.ts +10 -0
package/src/training/distillation/checkpoint-watch.js +57 -0
package/src/training/distillation/dataset.d.ts +59 -0
package/src/training/distillation/dataset.js +337 -0
package/src/training/distillation/eval.d.ts +34 -0
package/src/training/distillation/eval.js +310 -0
package/src/training/distillation/index.d.ts +29 -0
package/src/training/distillation/index.js +29 -0
package/src/training/distillation/runtime.d.ts +20 -0
package/src/training/distillation/runtime.js +121 -0
package/src/training/distillation/scoreboard.d.ts +6 -0
package/src/training/distillation/scoreboard.js +8 -0
package/src/training/distillation/stage-a.d.ts +45 -0
package/src/training/distillation/stage-a.js +338 -0
package/src/training/distillation/stage-b.d.ts +24 -0
package/src/training/distillation/stage-b.js +20 -0
package/src/training/index.d.ts +10 -0
package/src/training/index.js +10 -0
package/src/training/lora-pipeline.d.ts +40 -0
package/src/training/lora-pipeline.js +796 -0
package/src/training/operator-artifacts.d.ts +62 -0
package/src/training/operator-artifacts.js +140 -0
package/src/training/operator-command.d.ts +5 -0
package/src/training/operator-command.js +453 -0
package/src/training/operator-eval.d.ts +48 -0
package/src/training/operator-eval.js +230 -0
package/src/training/operator-scoreboard.d.ts +5 -0
package/src/training/operator-scoreboard.js +44 -0
package/src/training/runner.d.ts +52 -0
package/src/training/runner.js +29 -4
package/src/training/suite.d.ts +112 -0
package/src/training/suite.js +9 -9
package/src/training/workloads.d.ts +164 -0
package/src/training/workloads.js +539 -0
package/src/version.d.ts +2 -0
package/src/version.js +2 -0
package/tools/convert-safetensors-node.js +47 -0
package/tools/doppler-cli.js +252 -41

package/src/gpu/kernels/sana_linear_attention_apply.wgsl ADDED Viewed

@@ -0,0 +1,43 @@
+override WORKGROUP_SIZE: u32 = 256u; // Linear-attention "apply" pass (f32 I/O): one invocation per (token, hidden) output element; workgroup width is pipeline-overridable
+struct Uniforms {
+    num_heads: u32, // not read by this shader
+    head_dim: u32, // channels per head; also the row length inside `summary`
+    num_tokens: u32, // upper bound for gid.y
+    hidden_size: u32, // upper bound for gid.x and the per-token stride of query/output
+    eps: f32, // added to the denominator before dividing
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> query: array<f32>; // indexed as [token * hidden_size + channel]
+@group(0) @binding(2) var<storage, read> summary: array<f32>; // per head: (head_dim + 1) rows of head_dim values
+@group(0) @binding(3) var<storage, read_write> output: array<f32>; // same indexing as query
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let hidden = gid.x; // hidden channel handled by this invocation
+    let token = gid.y; // token handled by this invocation
+    if (token >= u.num_tokens || hidden >= u.hidden_size) {
+        return;
+    }
+    let idx = token * u.hidden_size + hidden; // flat output index
+    let head = hidden / u.head_dim;
+    let dim = hidden - head * u.head_dim; // channel index within the head
+    let rows_per_head = u.head_dim + 1u; // head_dim rows plus one extra row used for the denominator
+    let head_offset = head * rows_per_head * u.head_dim; // start of this head's block in summary
+    let hidden_base = head * u.head_dim; // first hidden channel of this head
+    var numerator: f32 = 0.0;
+    var denominator: f32 = 0.0;
+    for (var i: u32 = 0u; i < u.head_dim; i = i + 1u) {
+        let q_value = max(query[token * u.hidden_size + hidden_base + i], 0.0); // negative query values are clamped to zero
+        numerator = numerator + summary[head_offset + dim * u.head_dim + i] * q_value; // row `dim` of this head's block
+        denominator = denominator + summary[head_offset + u.head_dim * u.head_dim + i] * q_value; // extra row (index head_dim) of this head's block
+    }
+    output[idx] = numerator / (denominator + u.eps);
+}

package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl ADDED Viewed

@@ -0,0 +1,46 @@
+enable f16; // Linear-attention "apply" pass with f16 query/output; summary and accumulation stay f32
+override WORKGROUP_SIZE: u32 = 256u; // pipeline-overridable workgroup width along x
+struct Uniforms {
+    num_heads: u32, // not read by this shader
+    head_dim: u32, // channels per head; also the row length inside `summary`
+    num_tokens: u32, // upper bound for gid.y
+    hidden_size: u32, // upper bound for gid.x and the per-token stride of query/output
+    eps: f32, // added to the denominator before dividing
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> query: array<f16>; // indexed as [token * hidden_size + channel]
+@group(0) @binding(2) var<storage, read> summary: array<f32>; // per head: (head_dim + 1) rows of head_dim values
+@group(0) @binding(3) var<storage, read_write> output: array<f16>; // same indexing as query
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let hidden = gid.x; // hidden channel handled by this invocation
+    let token = gid.y; // token handled by this invocation
+    if (token >= u.num_tokens || hidden >= u.hidden_size) {
+        return;
+    }
+    let idx = token * u.hidden_size + hidden; // flat output index
+    let head = hidden / u.head_dim;
+    let dim = hidden - head * u.head_dim; // channel index within the head
+    let rows_per_head = u.head_dim + 1u; // head_dim rows plus one extra row used for the denominator
+    let head_offset = head * rows_per_head * u.head_dim; // start of this head's block in summary
+    let hidden_base = head * u.head_dim; // first hidden channel of this head
+    var numerator: f32 = 0.0;
+    var denominator: f32 = 0.0;
+    for (var i: u32 = 0u; i < u.head_dim; i = i + 1u) {
+        let q_value = max(f32(query[token * u.hidden_size + hidden_base + i]), 0.0); // widen to f32, then clamp negatives to zero
+        numerator = numerator + summary[head_offset + dim * u.head_dim + i] * q_value; // row `dim` of this head's block
+        denominator = denominator + summary[head_offset + u.head_dim * u.head_dim + i] * q_value; // extra row (index head_dim) of this head's block
+    }
+    let result = numerator / (denominator + u.eps);
+    output[idx] = f16(clamp(result, -65504.0, 65504.0)); // 65504 is the largest finite f16 value; clamp before narrowing
+}

package/src/gpu/kernels/sana_linear_attention_summary.wgsl ADDED Viewed

@@ -0,0 +1,51 @@
+override WORKGROUP_SIZE: u32 = 256u; // Linear-attention "summary" pass (f32): one invocation per summary element, each reducing over all tokens
+struct Uniforms {
+    num_heads: u32, // number of per-head blocks written to summary
+    head_dim: u32, // channels per head; summary rows are head_dim wide
+    num_tokens: u32, // length of the per-element reduction loop
+    hidden_size: u32, // per-token stride of query/key/value
+    _pad0: u32,
+    _pad1: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> query: array<f32>; // only read inside the hidden_size == 0 branch below
+@group(0) @binding(2) var<storage, read> key: array<f32>;
+@group(0) @binding(3) var<storage, read> value: array<f32>;
+@group(0) @binding(4) var<storage, read_write> summary: array<f32>; // per head: (head_dim + 1) rows of head_dim values
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x; // flat index into summary
+    let rows_per_head = u.head_dim + 1u; // head_dim value rows plus one key-sum row
+    let head_span = rows_per_head * u.head_dim; // summary elements per head
+    let total = u.num_heads * head_span;
+    if (idx >= total) {
+        return;
+    }
+    let head = idx / head_span;
+    let rem = idx - head * head_span; // offset inside this head's block
+    let row = rem / u.head_dim; // 0..head_dim inclusive; row == head_dim is the key-sum row
+    let col = rem - row * u.head_dim; // key channel within the head
+    let hidden_base = head * u.head_dim; // first hidden channel of this head
+    var acc: f32 = 0.0;
+    for (var token: u32 = 0u; token < u.num_tokens; token = token + 1u) {
+        let query_value = query[token * u.hidden_size + hidden_base + col];
+        let key_idx = token * u.hidden_size + hidden_base + col;
+        let key_value = max(key[key_idx], 0.0); // negative key values are clamped to zero
+        let value_value = select(
+            value[token * u.hidden_size + hidden_base + row], // NOTE(review): also indexed when row == head_dim, i.e. one past this head's slice; the loaded value is then discarded by select
+            1.0, // the extra row accumulates the plain sum of clamped keys
+            row == u.head_dim
+        );
+        if (u.hidden_size == 0u) { // NOTE(review): presumably only keeps the `query` binding statically used; confirm
+            acc = acc + query_value;
+        }
+        acc = acc + value_value * key_value;
+    }
+    summary[idx] = acc;
+}

package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl ADDED Viewed

@@ -0,0 +1,53 @@
+enable f16; // Linear-attention "summary" pass with f16 inputs; accumulation and the summary buffer stay f32
+override WORKGROUP_SIZE: u32 = 256u; // pipeline-overridable workgroup width along x
+struct Uniforms {
+    num_heads: u32, // number of per-head blocks written to summary
+    head_dim: u32, // channels per head; summary rows are head_dim wide
+    num_tokens: u32, // length of the per-element reduction loop
+    hidden_size: u32, // per-token stride of query/key/value
+    _pad0: u32,
+    _pad1: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> query: array<f16>; // only read inside the hidden_size == 0 branch below
+@group(0) @binding(2) var<storage, read> key: array<f16>;
+@group(0) @binding(3) var<storage, read> value: array<f16>;
+@group(0) @binding(4) var<storage, read_write> summary: array<f32>; // per head: (head_dim + 1) rows of head_dim values
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let idx = gid.x; // flat index into summary
+    let rows_per_head = u.head_dim + 1u; // head_dim value rows plus one key-sum row
+    let head_span = rows_per_head * u.head_dim; // summary elements per head
+    let total = u.num_heads * head_span;
+    if (idx >= total) {
+        return;
+    }
+    let head = idx / head_span;
+    let rem = idx - head * head_span; // offset inside this head's block
+    let row = rem / u.head_dim; // 0..head_dim inclusive; row == head_dim is the key-sum row
+    let col = rem - row * u.head_dim; // key channel within the head
+    let hidden_base = head * u.head_dim; // first hidden channel of this head
+    var acc: f32 = 0.0;
+    for (var token: u32 = 0u; token < u.num_tokens; token = token + 1u) {
+        let query_value = f32(query[token * u.hidden_size + hidden_base + col]);
+        let key_idx = token * u.hidden_size + hidden_base + col;
+        let key_value = max(f32(key[key_idx]), 0.0); // widen to f32, then clamp negatives to zero
+        let value_value = select(
+            f32(value[token * u.hidden_size + hidden_base + row]), // NOTE(review): also indexed when row == head_dim, i.e. one past this head's slice; the loaded value is then discarded by select
+            1.0, // the extra row accumulates the plain sum of clamped keys
+            row == u.head_dim
+        );
+        if (u.hidden_size == 0u) { // NOTE(review): presumably only keeps the `query` binding statically used; confirm
+            acc = acc + query_value;
+        }
+        acc = acc + value_value * key_value;
+    }
+    summary[idx] = acc;
+}

package/src/gpu/kernels/silu.d.ts CHANGED Viewed

@@ -16,6 +16,7 @@ export interface SiLUOptions extends OutputBufferOptions {
   size?: number | null;
   gate?: Tensor | null;
   gateActivation?: 'silu' | 'sigmoid';
+  inputActivation?: 'silu' | 'identity';
   useVec4?: boolean;
   biasOffset?: number;
   swigluLimit: number | null;

package/src/gpu/kernels/silu.js CHANGED Viewed

@@ -47,6 +47,18 @@ function createSiLUBindGroupEntries(uniformBuffer, input, output, gate) {
   ];
 }
+function planSiLUDispatch(device, size, useVec4) { // Splits `size` elements over a 2D grid so the x workgroup count never exceeds maxPerDim
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535; // fallback when the device or its limit is missing or non-finite
+  const laneWidth = useVec4 ? 4 : 1; // elements covered by one invocation
+  const chunkSize = maxPerDim * WORKGROUP_SIZES.DEFAULT * laneWidth; // most elements a single y-slice can cover
+  const dispatchStride = Math.min(size, chunkSize); // elements per y-slice; callers write this into the uniform at byte offset 4
+  const x = Math.min(maxPerDim, Math.ceil(dispatchStride / (WORKGROUP_SIZES.DEFAULT * laneWidth)));
+  const y = Math.max(1, Math.ceil(size / chunkSize)); // at least one slice, even when size is 0
+  return { dispatchStride, workgroups: [x, y, 1] };
+}
 export async function runSiLU(
   input,
@@ -60,6 +72,7 @@ export async function runSiLU(
     useVec4 = false,
     swigluLimit,
     gateActivation = 'silu',
+    inputActivation = 'silu',
   } = options;
   const resolvedSwigluLimit = resolveSwigluLimit(swigluLimit, 'SiLU');
@@ -74,14 +87,17 @@ export async function runSiLU(
     useSplit: false,
     useRowsplit: false,
   });
-  const constants = gate && gateActivation === 'sigmoid'
-    ? { ...(overrides || {}), GATE_USE_SIGMOID: true }
-    : overrides;
+  const constants = {
+    ...(overrides || {}),
+    ...(gate && gateActivation === 'sigmoid' ? { GATE_USE_SIGMOID: true } : {}),
+    ...(inputActivation === 'identity' ? { INPUT_USE_IDENTITY: true } : {}),
+  };
   const pipeline = await getPipelineFast('silu', variant, null, constants);
   const inferredSize = size || (input.buffer.size / bytesPerElement);
   const outputSize = inferredSize * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'silu_output');
+  const dispatchPlan = planSiLUDispatch(device, inferredSize, useVec4);
   // Create uniform buffer
   const uniformBuffer = createUniformBufferWithView(
@@ -89,7 +105,7 @@ export async function runSiLU(
     16,
     (view) => {
       view.setUint32(0, inferredSize, true);
-      view.setUint32(4, 0, true);
+      view.setUint32(4, dispatchPlan.dispatchStride, true);
       view.setFloat32(8, gate ? resolvedSwigluLimit : 0, true);
       view.setFloat32(12, 0, true);
     },
@@ -106,8 +122,7 @@ export async function runSiLU(
     entries,
   });
-  const workgroups = Math.ceil(inferredSize / WORKGROUP_SIZES.DEFAULT);
-  dispatch(device, pipeline, bindGroup, workgroups, 'silu');
+  dispatch(device, pipeline, bindGroup, dispatchPlan.workgroups, 'silu');
   uniformBuffer.destroy();
@@ -215,7 +230,7 @@ export async function runSiLURowSplit(
     ],
   });
-  const workgroups = Math.ceil((numTokens * dim) / WORKGROUP_SIZES.DEFAULT);
+  const workgroups = [Math.ceil(dim / WORKGROUP_SIZES.DEFAULT), numTokens, 1];
   dispatch(device, pipeline, bindGroup, workgroups, 'silu_rowsplit');
   uniformBuffer.destroy();
@@ -269,7 +284,7 @@ export async function recordSiLURowSplit(
     ],
   });
-  const workgroups = Math.ceil((numTokens * dim) / WORKGROUP_SIZES.DEFAULT);
+  const workgroups = [Math.ceil(dim / WORKGROUP_SIZES.DEFAULT), numTokens, 1];
   recordDispatch(recorder, pipeline, bindGroup, workgroups, 'silu_rowsplit');
   return createTensor(output, input.dtype, [numTokens, dim], 'silu_rowsplit_output');
@@ -288,6 +303,7 @@ export async function recordSiLU(
     outputBuffer = null,
     swigluLimit,
     gateActivation = 'silu',
+    inputActivation = 'silu',
   } = options;
   const resolvedSwigluLimit = resolveSwigluLimit(swigluLimit, 'SiLU');
@@ -302,14 +318,17 @@ export async function recordSiLU(
     useSplit: false,
     useRowsplit: false,
   });
-  const constants = gate && gateActivation === 'sigmoid'
-    ? { ...(overrides || {}), GATE_USE_SIGMOID: true }
-    : overrides;
+  const constants = {
+    ...(overrides || {}),
+    ...(gate && gateActivation === 'sigmoid' ? { GATE_USE_SIGMOID: true } : {}),
+    ...(inputActivation === 'identity' ? { INPUT_USE_IDENTITY: true } : {}),
+  };
   const pipeline = await getPipelineFast('silu', variant, null, constants);
   const inferredSize = size || (input.buffer.size / bytesPerElement);
   const outputSize = inferredSize * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'silu_output');
+  const dispatchPlan = planSiLUDispatch(device, inferredSize, false);
   // Uniform buffer
   const uniformBuffer = createUniformBufferWithView(
@@ -317,7 +336,7 @@ export async function recordSiLU(
     16,
     (view) => {
       view.setUint32(0, inferredSize, true);
-      view.setUint32(4, 0, true);
+      view.setUint32(4, dispatchPlan.dispatchStride, true);
       view.setFloat32(8, gate ? resolvedSwigluLimit : 0, true);
       view.setFloat32(12, 0, true);
     },
@@ -333,8 +352,7 @@ export async function recordSiLU(
     entries,
   });
-  const workgroups = Math.ceil(inferredSize / WORKGROUP_SIZES.DEFAULT);
-  recordDispatch(recorder, pipeline, bindGroup, workgroups, 'silu');
+  recordDispatch(recorder, pipeline, bindGroup, dispatchPlan.workgroups, 'silu');
   return createTensor(output, input.dtype, [inferredSize], 'silu_output');
 }

package/src/gpu/kernels/silu.wgsl CHANGED Viewed

@@ -10,13 +10,14 @@
 override WORKGROUP_SIZE: u32 = 256u;
 override HAS_GATE: bool = false;
 override GATE_USE_SIGMOID: bool = false;
+override INPUT_USE_IDENTITY: bool = false;
 override USE_SPLIT: bool = false;
 override USE_VEC4: bool = false;
 override USE_ROWSPLIT: bool = false;
 struct Uniforms {
     size: u32,          // Total output elements
-    rowsplit_dim: u32,  // Dim for rowsplit variants (0 when unused)
+    rowsplit_dim: u32,  // Row-split dim or dispatch stride for non-row-split variants
     clamp_max: f32,     // SwiGLU clamp (0 = disabled)
     _pad1: f32,
 }
@@ -35,6 +36,10 @@ fn silu(x: f32) -> f32 {
     return x * sigmoid(x);
 }
+fn apply_input_activation(x: f32) -> f32 {
+    return select(silu(x), x, INPUT_USE_IDENTITY);
+}
 fn clamp_swiglu(x: f32) -> f32 {
     if (u.clamp_max <= 0.0) {
         return x;
@@ -46,8 +51,9 @@ fn clamp_swiglu(x: f32) -> f32 {
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>
 ) {
+    let dispatch_stride = max(u.rowsplit_dim, 1u);
     if (USE_VEC4) {
-        let base_idx = global_id.x * 4u;
+        let base_idx = global_id.y * dispatch_stride + global_id.x * 4u;
         if (base_idx >= u.size) {
             return;
         }
@@ -55,12 +61,12 @@ fn main(
         let remaining = min(4u, u.size - base_idx);
         for (var i: u32 = 0u; i < remaining; i = i + 1u) {
             let x = input[base_idx + i];
-            output[base_idx + i] = silu(x);
+            output[base_idx + i] = apply_input_activation(x);
         }
         return;
     }
-    let idx = global_id.x;
+    let idx = global_id.y * dispatch_stride + global_id.x;
     if (idx >= u.size) {
         return;
     }
@@ -70,12 +76,16 @@ fn main(
             return;
         }
         let dim = u.rowsplit_dim;
-        let token_idx = idx / dim;
-        let dim_idx = idx % dim;
+        let num_tokens = u.size / dim;
+        let token_idx = global_id.y;
+        let dim_idx = global_id.x;
+        if (token_idx >= num_tokens || dim_idx >= dim) {
+            return;
+        }
         let row_base = token_idx * dim * 2u;
         let g = input[row_base + dim_idx];
         let up = input[row_base + dim + dim_idx];
-        output[idx] = clamp_swiglu(silu(g) * up);
+        output[token_idx * dim + dim_idx] = clamp_swiglu(silu(g) * up);
         return;
     }
@@ -83,7 +93,7 @@ fn main(
         let up = input[idx];
         let g = gate[idx];
         let gateAct = select(silu(g), sigmoid(g), GATE_USE_SIGMOID);
-        output[idx] = clamp_swiglu(gateAct * up);
+        output[idx] = clamp_swiglu(gateAct * apply_input_activation(up));
         return;
     }
@@ -95,5 +105,5 @@ fn main(
     }
     let x = input[idx];
-    output[idx] = silu(x);
+    output[idx] = apply_input_activation(x);
 }

package/src/gpu/kernels/silu_f16.wgsl CHANGED Viewed

@@ -9,13 +9,14 @@ enable f16;
 override WORKGROUP_SIZE: u32 = 256u;
 override HAS_GATE: bool = false;
 override GATE_USE_SIGMOID: bool = false;
+override INPUT_USE_IDENTITY: bool = false;
 override USE_SPLIT: bool = false;
 override USE_VEC4: bool = false;
 override USE_ROWSPLIT: bool = false;
 struct Uniforms {
     size: u32,          // Total output elements
-    rowsplit_dim: u32,  // Dim for rowsplit variants (0 when unused)
+    rowsplit_dim: u32,  // Row-split dim or dispatch stride for non-row-split variants
     clamp_max: f32,     // SwiGLU clamp (0 = disabled)
     _pad1: f32,
 }
@@ -34,6 +35,10 @@ fn silu(x: f32) -> f32 {
     return x * sigmoid(x);
 }
+fn apply_input_activation(x: f32) -> f32 {
+    return select(silu(x), x, INPUT_USE_IDENTITY);
+}
 fn clamp_swiglu(x: f32) -> f32 {
     if (u.clamp_max <= 0.0) {
         return x;
@@ -45,8 +50,9 @@ fn clamp_swiglu(x: f32) -> f32 {
 fn main(
     @builtin(global_invocation_id) global_id: vec3<u32>
 ) {
+    let dispatch_stride = max(u.rowsplit_dim, 1u);
     if (USE_VEC4) {
-        let base_idx = global_id.x * 4u;
+        let base_idx = global_id.y * dispatch_stride + global_id.x * 4u;
         if (base_idx >= u.size) {
             return;
         }
@@ -54,12 +60,12 @@ fn main(
         let remaining = min(4u, u.size - base_idx);
         for (var i: u32 = 0u; i < remaining; i = i + 1u) {
             let x = f32(input[base_idx + i]);
-            output[base_idx + i] = f16(silu(x));
+            output[base_idx + i] = f16(apply_input_activation(x));
         }
         return;
     }
-    let idx = global_id.x;
+    let idx = global_id.y * dispatch_stride + global_id.x;
     if (idx >= u.size) {
         return;
     }
@@ -69,12 +75,16 @@ fn main(
             return;
         }
         let dim = u.rowsplit_dim;
-        let token_idx = idx / dim;
-        let dim_idx = idx % dim;
+        let num_tokens = u.size / dim;
+        let token_idx = global_id.y;
+        let dim_idx = global_id.x;
+        if (token_idx >= num_tokens || dim_idx >= dim) {
+            return;
+        }
         let row_base = token_idx * dim * 2u;
         let g = f32(input[row_base + dim_idx]);
         let up = f32(input[row_base + dim + dim_idx]);
-        output[idx] = f16(clamp_swiglu(silu(g) * up));
+        output[token_idx * dim + dim_idx] = f16(clamp_swiglu(silu(g) * up));
         return;
     }
@@ -82,7 +92,7 @@ fn main(
         let up = f32(input[idx]);
         let g = f32(gate[idx]);
         let gateAct = select(silu(g), sigmoid(g), GATE_USE_SIGMOID);
-        output[idx] = f16(clamp_swiglu(gateAct * up));
+        output[idx] = f16(clamp_swiglu(gateAct * apply_input_activation(up)));
         return;
     }
@@ -94,5 +104,5 @@ fn main(
     }
     let x = f32(input[idx]);
-    output[idx] = f16(silu(x));
+    output[idx] = f16(apply_input_activation(x));
 }

package/src/gpu/kernels/transpose.js CHANGED Viewed

@@ -3,19 +3,32 @@ import { createTensor, dtypeBytes } from '../tensor.js';
 import { WORKGROUP_SIZES } from './constants.js';
 import { unifiedKernelWrapper } from './utils.js';
+function planTransposeDispatch(target, cols) { // Plans the x extent of the transpose dispatch; the caller supplies `rows` as the y extent
+  const device = target?.device; // NOTE(review): assumes `target` exposes `.device`; otherwise the 65535 fallback below is used
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535; // fallback when the device or its limit is missing or non-finite
+  const dispatchStride = Math.min(cols, maxPerDim * WORKGROUP_SIZES.DEFAULT); // columns covered along x, capped by the per-dimension limit
+  return {
+    dispatchStride,
+    workgroups: [Math.ceil(dispatchStride / WORKGROUP_SIZES.DEFAULT), 1, 1], // only element 0 is consumed by _transpose
+  };
+}
 async function _transpose(target, input, rows, cols, options = {}) {
   const { outputBuffer = null } = options;
   const bytesPerElement = dtypeBytes(input.dtype);
   const outputSize = rows * cols * bytesPerElement;
   const outputBuf = outputBuffer || acquireBuffer(outputSize, undefined, 'transpose_output');
+  const dispatchPlan = planTransposeDispatch(target, cols);
   await unifiedKernelWrapper(
     'transpose',
     target,
     'default',
     [input, outputBuf],
-    { rows, cols },
-    Math.ceil((rows * cols) / WORKGROUP_SIZES.DEFAULT)
+    { rows, cols, _pad0: dispatchPlan.dispatchStride, _pad1: 0 },
+    [dispatchPlan.workgroups[0], rows, 1]
   );
   return createTensor(outputBuf, input.dtype, [cols, rows], 'transpose_output');

package/src/gpu/kernels/transpose.wgsl CHANGED Viewed

@@ -19,14 +19,13 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
-    let total = u.rows * u.cols;
-    if (idx >= total) {
+    // Transposes a rows x cols matrix with a 2D dispatch: gid.y selects the
+    // input row and gid.x the input column.
+    // The column must not be offset by the row. The host sets the stride to
+    // min(cols, per-dimension limit), so `gid.x + row * stride` is >= cols for
+    // every row > 0 and only row 0 would ever be written.
+    let row = gid.y;
+    let col = gid.x;
+    if (row >= u.rows || col >= u.cols) {
         return;
     }
-    let row = idx / u.cols;
-    let col = idx % u.cols;
+    let idx = row * u.cols + col; // row-major index into input
     let out_idx = col * u.rows + row; // row-major index into the cols x rows output
     output[out_idx] = input[idx];
 }

package/src/gpu/kernels/upsample2d.js CHANGED Viewed

@@ -31,6 +31,7 @@ async function _upsample2d(target, input, options = {}) {
   const outHeight = resolvedHeight * scale;
   const outWidth = resolvedWidth * scale;
+  const outSpatial = outHeight * outWidth;
   const bytesPerElement = dtypeBytes(input.dtype);
   const outputSize = channels * outHeight * outWidth * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'upsample2d_output');
@@ -43,7 +44,7 @@ async function _upsample2d(target, input, options = {}) {
       out_height: outHeight, out_width: outWidth, scale,
       _pad0: 0, _pad1: 0,
     },
-    Math.ceil((channels * outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil(outSpatial / WORKGROUP_SIZES.DEFAULT), channels, 1]
   );
   return createTensor(output, input.dtype, [channels, outHeight, outWidth], 'upsample2d_output');

package/src/gpu/kernels/upsample2d.wgsl CHANGED Viewed

@@ -19,19 +19,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_spatial = u.out_height * u.out_width;
-    let total = u.channels * out_spatial;
-    if (idx >= total) {
+    let spatial_idx = gid.x; // flat pixel index within one output channel
+    let channel = gid.y; // one y-slice of the dispatch per channel
+    if (spatial_idx >= out_spatial || channel >= u.channels) {
         return;
     }
-    let channel = idx / out_spatial;
-    let rem = idx - channel * out_spatial;
-    let out_y = rem / u.out_width;
-    let out_x = rem - out_y * u.out_width;
+    let out_y = spatial_idx / u.out_width;
+    let out_x = spatial_idx - out_y * u.out_width;
     let in_y = out_y / u.scale;
     let in_x = out_x / u.scale;
     let in_idx = (channel * u.in_height + in_y) * u.in_width + in_x;
-    output[idx] = input[in_idx];
+    output[channel * out_spatial + spatial_idx] = input[in_idx]; // integer division above maps each scale x scale output patch to one input pixel
 }

package/src/gpu/kernels/upsample2d_f16.wgsl CHANGED Viewed

@@ -23,19 +23,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let out_spatial = u.out_height * u.out_width;
-    let total = u.channels * out_spatial;
-    if (idx >= total) {
+    let spatial_idx = gid.x; // flat pixel index within one output channel
+    let channel = gid.y; // one y-slice of the dispatch per channel
+    if (spatial_idx >= out_spatial || channel >= u.channels) {
         return;
     }
-    let channel = idx / out_spatial;
-    let rem = idx - channel * out_spatial;
-    let out_y = rem / u.out_width;
-    let out_x = rem - out_y * u.out_width;
+    let out_y = spatial_idx / u.out_width;
+    let out_x = spatial_idx - out_y * u.out_width;
     let in_y = out_y / u.scale;
     let in_x = out_x / u.scale;
     let in_idx = (channel * u.in_height + in_y) * u.in_width + in_x;
-    output[idx] = input[in_idx];
+    output[channel * out_spatial + spatial_idx] = input[in_idx]; // integer division above maps each scale x scale output patch to one input pixel
 }

package/src/gpu/kernels/utils.js CHANGED Viewed

@@ -116,9 +116,24 @@ export async function unifiedKernelWrapper(opName, target, variant, bindings, un
       index = config.variantMetadata.outputBinding;
     }
+    const buffer = binding?.buffer || binding;
+    const isGpuBuffer = buffer && (
+      typeof GPUBuffer === 'undefined'
+        ? true
+        : buffer instanceof GPUBuffer
+    );
+    if (!isGpuBuffer) {
+      const bindingLabel = binding?.label ?? buffer?.label ?? 'unknown';
+      const bufferType = buffer === null ? 'null' : buffer === undefined ? 'undefined' : buffer.constructor?.name || typeof buffer;
+      throw new Error(
+        `Kernel "${opName}/${variant}" binding "${bindingConfig.name}" (index ${index}) requires a GPUBuffer ` +
+        `(label=${bindingLabel}, type=${bufferType}).`
+      );
+    }
     bindGroupEntries.push({
       binding: index,
-      resource: { buffer: binding?.buffer || binding }
+      resource: { buffer }
     });
   }

package/src/index-browser.d.ts CHANGED Viewed

@@ -1,5 +1,5 @@
 export declare const DOPPLER_VERSION: string;
-export { doppler } from './client/doppler-api.js';
+export { doppler } from './client/doppler-api.browser.js';
 export {
   DopplerLoader,