npm - @simulatte/doppler - Versions diffs - 0.1.4 → 0.1.6 - Mend

@simulatte/doppler 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (199) hide show

package/README.md +26 -10
package/package.json +30 -6
package/src/client/doppler-api.browser.d.ts +1 -0
package/src/client/doppler-api.browser.js +288 -0
package/src/client/doppler-api.js +1 -1
package/src/client/doppler-provider/types.js +1 -1
package/src/config/execution-contract-check.d.ts +33 -0
package/src/config/execution-contract-check.js +72 -0
package/src/config/execution-v0-contract-check.d.ts +94 -0
package/src/config/execution-v0-contract-check.js +251 -0
package/src/config/execution-v0-graph-contract-check.d.ts +20 -0
package/src/config/execution-v0-graph-contract-check.js +64 -0
package/src/config/kernel-path-contract-check.d.ts +76 -0
package/src/config/kernel-path-contract-check.js +479 -0
package/src/config/kernel-path-loader.d.ts +16 -0
package/src/config/kernel-path-loader.js +54 -0
package/src/config/kernels/kernel-ref-digests.js +39 -27
package/src/config/kernels/registry.json +598 -2
package/src/config/loader.js +81 -48
package/src/config/merge-contract-check.d.ts +16 -0
package/src/config/merge-contract-check.js +321 -0
package/src/config/merge-helpers.d.ts +58 -0
package/src/config/merge-helpers.js +54 -0
package/src/config/merge.js +21 -6
package/src/config/presets/models/janus-text.json +2 -0
package/src/config/presets/models/qwen3.json +9 -2
package/src/config/presets/models/transformer.json +5 -0
package/src/config/quantization-contract-check.d.ts +12 -0
package/src/config/quantization-contract-check.js +91 -0
package/src/config/required-inference-fields-contract-check.d.ts +24 -0
package/src/config/required-inference-fields-contract-check.js +237 -0
package/src/config/schema/browser-suite-metrics.schema.d.ts +17 -0
package/src/config/schema/browser-suite-metrics.schema.js +46 -0
package/src/config/schema/conversion-report.schema.d.ts +40 -0
package/src/config/schema/conversion-report.schema.js +108 -0
package/src/config/schema/doppler.schema.js +12 -18
package/src/config/schema/index.d.ts +22 -0
package/src/config/schema/index.js +18 -0
package/src/config/schema/inference-defaults.schema.js +3 -0
package/src/config/schema/inference.schema.d.ts +9 -0
package/src/config/schema/kernel-path.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.d.ts +6 -0
package/src/config/schema/manifest.schema.js +3 -0
package/src/converter/core.d.ts +10 -0
package/src/converter/core.js +27 -2
package/src/converter/parsers/diffusion.js +63 -3
package/src/converter/rope-config.js +42 -0
package/src/gpu/device.js +58 -0
package/src/gpu/kernels/attention.js +98 -0
package/src/gpu/kernels/bias_add.wgsl +8 -6
package/src/gpu/kernels/bias_add_f16.wgsl +8 -5
package/src/gpu/kernels/conv2d.js +1 -1
package/src/gpu/kernels/conv2d.wgsl +7 -8
package/src/gpu/kernels/conv2d_f16.wgsl +7 -8
package/src/gpu/kernels/depthwise_conv2d.d.ts +29 -0
package/src/gpu/kernels/depthwise_conv2d.js +99 -0
package/src/gpu/kernels/depthwise_conv2d.wgsl +55 -0
package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +59 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.d.ts +27 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.js +93 -0
package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +44 -0
package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +48 -0
package/src/gpu/kernels/index.d.ts +30 -0
package/src/gpu/kernels/index.js +25 -0
package/src/gpu/kernels/matmul.js +25 -0
package/src/gpu/kernels/pixel_shuffle.js +1 -1
package/src/gpu/kernels/pixel_shuffle.wgsl +4 -5
package/src/gpu/kernels/pixel_shuffle_f16.wgsl +4 -5
package/src/gpu/kernels/relu.d.ts +18 -0
package/src/gpu/kernels/relu.js +58 -0
package/src/gpu/kernels/relu.wgsl +22 -0
package/src/gpu/kernels/relu_f16.wgsl +24 -0
package/src/gpu/kernels/repeat_channels.d.ts +21 -0
package/src/gpu/kernels/repeat_channels.js +60 -0
package/src/gpu/kernels/repeat_channels.wgsl +28 -0
package/src/gpu/kernels/repeat_channels_f16.wgsl +30 -0
package/src/gpu/kernels/residual.js +44 -8
package/src/gpu/kernels/residual.wgsl +6 -3
package/src/gpu/kernels/residual_f16.wgsl +2 -1
package/src/gpu/kernels/residual_f16_vec4.wgsl +2 -1
package/src/gpu/kernels/residual_vec4.wgsl +2 -1
package/src/gpu/kernels/rmsnorm.js +58 -6
package/src/gpu/kernels/rmsnorm.wgsl +14 -6
package/src/gpu/kernels/rmsnorm_f16.wgsl +10 -2
package/src/gpu/kernels/rope.d.ts +2 -0
package/src/gpu/kernels/rope.js +11 -1
package/src/gpu/kernels/rope.wgsl +56 -40
package/src/gpu/kernels/sana_linear_attention.d.ts +27 -0
package/src/gpu/kernels/sana_linear_attention.js +121 -0
package/src/gpu/kernels/sana_linear_attention_apply.wgsl +43 -0
package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +46 -0
package/src/gpu/kernels/sana_linear_attention_summary.wgsl +51 -0
package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +53 -0
package/src/gpu/kernels/silu.d.ts +1 -0
package/src/gpu/kernels/silu.js +32 -14
package/src/gpu/kernels/silu.wgsl +19 -9
package/src/gpu/kernels/silu_f16.wgsl +19 -9
package/src/gpu/kernels/transpose.js +15 -2
package/src/gpu/kernels/transpose.wgsl +5 -6
package/src/gpu/kernels/upsample2d.js +2 -1
package/src/gpu/kernels/upsample2d.wgsl +6 -9
package/src/gpu/kernels/upsample2d_f16.wgsl +6 -9
package/src/gpu/kernels/utils.js +16 -1
package/src/index-browser.d.ts +1 -1
package/src/index-browser.js +2 -2
package/src/index.js +1 -1
package/src/inference/browser-harness.js +109 -23
package/src/inference/pipelines/diffusion/init.js +14 -0
package/src/inference/pipelines/diffusion/pipeline.js +215 -77
package/src/inference/pipelines/diffusion/sana-transformer.d.ts +53 -0
package/src/inference/pipelines/diffusion/sana-transformer.js +738 -0
package/src/inference/pipelines/diffusion/scheduler.d.ts +17 -1
package/src/inference/pipelines/diffusion/scheduler.js +91 -3
package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +11 -4
package/src/inference/pipelines/diffusion/text-encoder-gpu.js +282 -0
package/src/inference/pipelines/diffusion/text-encoder.js +18 -1
package/src/inference/pipelines/diffusion/types.d.ts +4 -0
package/src/inference/pipelines/diffusion/vae.js +782 -78
package/src/inference/pipelines/text/attention/record.js +11 -2
package/src/inference/pipelines/text/attention/run.js +11 -2
package/src/inference/pipelines/text/chat-format.js +25 -1
package/src/inference/pipelines/text/config.d.ts +9 -0
package/src/inference/pipelines/text/config.js +69 -2
package/src/inference/pipelines/text/execution-plan.js +23 -31
package/src/inference/pipelines/text/execution-v0.js +43 -95
package/src/inference/pipelines/text/ffn/standard.js +3 -0
package/src/inference/pipelines/text/init.d.ts +4 -0
package/src/inference/pipelines/text/init.js +56 -9
package/src/inference/pipelines/text/layer.js +11 -0
package/src/inference/pipelines/text.js +4 -0
package/src/inference/tokenizers/bundled.js +156 -33
package/src/rules/execution-rules-contract-check.d.ts +17 -0
package/src/rules/execution-rules-contract-check.js +245 -0
package/src/rules/kernels/depthwise-conv2d.rules.json +6 -0
package/src/rules/kernels/grouped-pointwise-conv2d.rules.json +6 -0
package/src/rules/kernels/relu.rules.json +6 -0
package/src/rules/kernels/repeat-channels.rules.json +6 -0
package/src/rules/kernels/sana-linear-attention.rules.json +6 -0
package/src/rules/layer-pattern-contract-check.d.ts +17 -0
package/src/rules/layer-pattern-contract-check.js +231 -0
package/src/rules/rule-registry.d.ts +28 -0
package/src/rules/rule-registry.js +38 -0
package/src/rules/tooling/command-runtime.rules.json +18 -0
package/src/tooling/command-api.d.ts +27 -1
package/src/tooling/command-api.js +142 -3
package/src/tooling/conversion-config-materializer.d.ts +24 -0
package/src/tooling/conversion-config-materializer.js +99 -0
package/src/tooling/lean-execution-contract-runner.d.ts +43 -0
package/src/tooling/lean-execution-contract-runner.js +158 -0
package/src/tooling/node-browser-command-runner.d.ts +4 -0
package/src/tooling/node-browser-command-runner.js +58 -3
package/src/tooling/node-command-runner.js +15 -0
package/src/tooling/node-convert.d.ts +10 -0
package/src/tooling/node-converter.js +59 -0
package/src/tooling/node-webgpu.js +11 -89
package/src/training/checkpoint-watch.d.ts +7 -0
package/src/training/checkpoint-watch.js +106 -0
package/src/training/checkpoint.d.ts +6 -1
package/src/training/checkpoint.js +12 -2
package/src/training/distillation/artifacts.d.ts +71 -0
package/src/training/distillation/artifacts.js +132 -0
package/src/training/distillation/checkpoint-watch.d.ts +10 -0
package/src/training/distillation/checkpoint-watch.js +57 -0
package/src/training/distillation/dataset.d.ts +59 -0
package/src/training/distillation/dataset.js +337 -0
package/src/training/distillation/eval.d.ts +34 -0
package/src/training/distillation/eval.js +310 -0
package/src/training/distillation/index.d.ts +29 -0
package/src/training/distillation/index.js +29 -0
package/src/training/distillation/runtime.d.ts +20 -0
package/src/training/distillation/runtime.js +121 -0
package/src/training/distillation/scoreboard.d.ts +6 -0
package/src/training/distillation/scoreboard.js +8 -0
package/src/training/distillation/stage-a.d.ts +45 -0
package/src/training/distillation/stage-a.js +338 -0
package/src/training/distillation/stage-b.d.ts +24 -0
package/src/training/distillation/stage-b.js +20 -0
package/src/training/index.d.ts +10 -0
package/src/training/index.js +10 -0
package/src/training/lora-pipeline.d.ts +40 -0
package/src/training/lora-pipeline.js +796 -0
package/src/training/operator-artifacts.d.ts +62 -0
package/src/training/operator-artifacts.js +140 -0
package/src/training/operator-command.d.ts +5 -0
package/src/training/operator-command.js +453 -0
package/src/training/operator-eval.d.ts +48 -0
package/src/training/operator-eval.js +230 -0
package/src/training/operator-scoreboard.d.ts +5 -0
package/src/training/operator-scoreboard.js +44 -0
package/src/training/runner.d.ts +52 -0
package/src/training/runner.js +29 -4
package/src/training/suite.d.ts +112 -0
package/src/training/suite.js +9 -9
package/src/training/workloads.d.ts +164 -0
package/src/training/workloads.js +539 -0
package/src/version.d.ts +2 -0
package/src/version.js +2 -0
package/tools/convert-safetensors-node.js +47 -0
package/tools/doppler-cli.js +252 -41

package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl ADDED Viewed

@@ -0,0 +1,48 @@
+// Grouped pointwise (1x1) Conv2D kernel, f16 storage.
+// Input and output are indexed as channel-major planes of height * width
+// elements; weight is indexed as out_channels rows of (in_channels / groups) taps.
+enable f16;
+override WORKGROUP_SIZE: u32 = 256u;
+struct Uniforms {
+    in_channels: u32,
+    out_channels: u32,
+    height: u32,
+    width: u32,
+    groups: u32,
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> input: array<f16>;
+@group(0) @binding(2) var<storage, read> weight: array<f16>;
+@group(0) @binding(3) var<storage, read> bias: array<f16>;
+@group(0) @binding(4) var<storage, read_write> output: array<f16>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // gid.x walks the flattened spatial plane; gid.y selects the output channel.
+    let plane = u.height * u.width;
+    let pixel = gid.x;
+    let oc = gid.y;
+    if (pixel >= plane || oc >= u.out_channels) {
+        return;
+    }
+    // Each output channel reads only the input channels of its own group.
+    let taps = u.in_channels / u.groups;
+    let group = oc / (u.out_channels / u.groups);
+    let first_in = group * taps;
+    let weight_row = oc * taps;
+    // Accumulate in f32, seeded with the bias, and narrow to f16 only on store.
+    var acc: f32 = f32(bias[oc]);
+    for (var k: u32 = 0u; k < taps; k = k + 1u) {
+        // Same element as ((first_in + k) * height + y) * width + x with
+        // y * width + x == pixel.
+        let src = (first_in + k) * plane + pixel;
+        acc = acc + f32(input[src]) * f32(weight[weight_row + k]);
+    }
+    output[oc * plane + pixel] = f16(acc);
+}

package/src/gpu/kernels/index.d.ts CHANGED Viewed

@@ -174,6 +174,18 @@ export {
   type Conv2DOptions,
 } from './conv2d.js';
+export {
+  runDepthwiseConv2D,
+  recordDepthwiseConv2D,
+  type DepthwiseConv2DOptions,
+} from './depthwise_conv2d.js';
+export {
+  runGroupedPointwiseConv2D,
+  recordGroupedPointwiseConv2D,
+  type GroupedPointwiseConv2DOptions,
+} from './grouped_pointwise_conv2d.js';
 // Gather (Embedding Lookup)
 export {
   runGather,
@@ -250,6 +262,24 @@ export {
   type SampleResult,
 } from './sample.js';
+export {
+  runSanaLinearAttention,
+  recordSanaLinearAttention,
+  type SanaLinearAttentionOptions,
+} from './sana_linear_attention.js';
+export {
+  runRepeatChannels,
+  recordRepeatChannels,
+  type RepeatChannelsOptions,
+} from './repeat_channels.js';
+export {
+  runReLU,
+  recordReLU,
+  type ReLUOptions,
+} from './relu.js';
 // Fused FFN (Tier 2 P0)
 export {
   runFusedFFN,

package/src/gpu/kernels/index.js CHANGED Viewed

@@ -139,6 +139,16 @@ export {
   recordConv2D,
 } from './conv2d.js';
+export {
+  runDepthwiseConv2D,
+  recordDepthwiseConv2D,
+} from './depthwise_conv2d.js';
+export {
+  runGroupedPointwiseConv2D,
+  recordGroupedPointwiseConv2D,
+} from './grouped_pointwise_conv2d.js';
 // Gather (Embedding Lookup)
 export {
   runGather,
@@ -205,6 +215,21 @@ export {
   isGPUSamplingAvailable,
 } from './sample.js';
+export {
+  runSanaLinearAttention,
+  recordSanaLinearAttention,
+} from './sana_linear_attention.js';
+export {
+  runRepeatChannels,
+  recordRepeatChannels,
+} from './repeat_channels.js';
+export {
+  runReLU,
+  recordReLU,
+} from './relu.js';
 // Fused FFN (Tier 2 P0)
 export {
   runFusedFFN,

package/src/gpu/kernels/matmul.js CHANGED Viewed

@@ -52,6 +52,23 @@ function buildProfileLabel(options = {}) {
   return `matmul${roleLabel}${layerLabel}`;
 }
+/**
+ * Throws unless `buffer` is usable as a bind-group buffer resource.
+ *
+ * When a global `GPUBuffer` constructor exists the value must be an instance
+ * of it; when it does not, any truthy value is accepted.
+ *
+ * @param {string} kernelName - Kernel name placed in the error prefix.
+ * @param {string} variant - Kernel variant reported in the error.
+ * @param {number} bindingIndex - Binding slot reported in the error.
+ * @param {string} bindingLabel - Human-readable binding name reported in the error.
+ * @param {*} buffer - Value to validate.
+ * @param {Array<string>} [details] - Extra context; falsy entries are dropped.
+ * @throws {Error} If `buffer` fails the check.
+ */
+function assertBindGroupBuffer(kernelName, variant, bindingIndex, bindingLabel, buffer, details = []) {
+  const canTypeCheck = typeof GPUBuffer !== 'undefined';
+  if (buffer && (!canTypeCheck || buffer instanceof GPUBuffer)) {
+    return;
+  }
+  const extras = details.filter(Boolean).join(', ');
+  const suffix = extras ? ` (${extras})` : '';
+  throw new Error(
+    `[${kernelName}] variant="${variant}" binding ${bindingIndex} "${bindingLabel}" requires a GPUBuffer${suffix}.`
+  );
+}
 function createMatmulBindGroupEntries(variant, uniformBuffer, matmulInput, bBuffer, outputBuffer, offsets, bindingSizes) {
   const isQ4KF16 = variant === 'q4_fused_multicol_f16'
     || variant === 'q4_fused_f16a'
@@ -59,6 +76,14 @@ function createMatmulBindGroupEntries(variant, uniformBuffer, matmulInput, bBuff
     || variant === 'q4_fused_multicol_f16a'
     || variant === 'q4_fused_batched_f16a';
+  assertBindGroupBuffer('matmul', variant, 0, 'uniforms', uniformBuffer);
+  assertBindGroupBuffer('matmul', variant, 1, 'input', matmulInput?.buffer, [
+    `inputLabel=${matmulInput?.label ?? 'unknown'}`,
+    `inputDtype=${matmulInput?.dtype ?? 'unknown'}`,
+  ]);
+  assertBindGroupBuffer('matmul', variant, 2, 'weights', bBuffer);
+  assertBindGroupBuffer('matmul', variant, isQ4KF16 ? 4 : 3, 'output', outputBuffer);
   const entries = [
     { binding: 0, resource: { buffer: uniformBuffer } },
     { binding: 1, resource: { buffer: matmulInput.buffer, offset: offsets.aOffset, size: bindingSizes.aBindingSize } },

package/src/gpu/kernels/pixel_shuffle.js CHANGED Viewed

@@ -34,7 +34,7 @@ async function _pixelShuffle(target, input, options = {}) {
       grid_width: gridWidth, grid_height: gridHeight, patch_size: patchSize,
       patch_channels: inferredPatchChannels, _pad0: 0,
     },
-    Math.ceil((outChannels * outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT)
+    [Math.ceil((outHeight * outWidth) / WORKGROUP_SIZES.DEFAULT), outChannels, 1]
   );
   return createTensor(output, input.dtype, [outChannels, outHeight, outWidth], 'pixel_shuffle_output');

package/src/gpu/kernels/pixel_shuffle.wgsl CHANGED Viewed

@@ -19,17 +19,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial_size = u.out_height * u.out_width;
-    let total = u.out_channels * spatial_size;
-    if (idx >= total) {
+    let spatial = gid.x;
+    let c = gid.y;
+    if (c >= u.out_channels || spatial >= spatial_size) {
         return;
     }
-    let c = idx / spatial_size;
-    let spatial = idx - c * spatial_size;
     let y = spatial / u.out_width;
     let x = spatial - y * u.out_width;
+    let idx = c * spatial_size + spatial;
     let grid_y = y / u.patch_size;
     let grid_x = x / u.patch_size;

package/src/gpu/kernels/pixel_shuffle_f16.wgsl CHANGED Viewed

@@ -22,17 +22,16 @@ struct Uniforms {
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
     let spatial_size = u.out_height * u.out_width;
-    let total = u.out_channels * spatial_size;
-    if (idx >= total) {
+    let spatial = gid.x;
+    let c = gid.y;
+    if (c >= u.out_channels || spatial >= spatial_size) {
         return;
     }
-    let c = idx / spatial_size;
-    let spatial = idx - c * spatial_size;
     let y = spatial / u.out_width;
     let x = spatial - y * u.out_width;
+    let idx = c * spatial_size + spatial;
     let grid_y = y / u.patch_size;
     let grid_x = x / u.patch_size;

package/src/gpu/kernels/relu.d.ts ADDED Viewed

@@ -0,0 +1,18 @@
+import type { Tensor } from '../tensor.js';
+import type { CommandRecorder } from '../command-recorder.js';
+import type { OutputBufferOptions } from './types.js';
+/** Options accepted by the ReLU kernel entry points. */
+export interface ReLUOptions extends OutputBufferOptions {
+  /** Optional element-count override; may be omitted or `null`. */
+  count?: number | null;
+}
+/** Applies ReLU to `input` and resolves to the result tensor. */
+export declare function runReLU(
+  input: Tensor,
+  options?: ReLUOptions
+): Promise<Tensor>;
+/** Same operation as `runReLU`, but takes a `CommandRecorder` as its first argument. */
+export declare function recordReLU(
+  recorder: CommandRecorder,
+  input: Tensor,
+  options?: ReLUOptions
+): Promise<Tensor>;

package/src/gpu/kernels/relu.js ADDED Viewed

@@ -0,0 +1,58 @@
+import { acquireBuffer } from '../../memory/buffer-pool.js';
+import { createTensor, dtypeBytes } from '../tensor.js';
+import { unifiedKernelWrapper } from './utils.js';
+import { selectRuleValue } from './rule-registry.js';
+import { WORKGROUP_SIZES } from './constants.js';
+/**
+ * Looks up the 'variant' value of the 'relu' rule set for the given dtype.
+ * The result is passed to the kernel wrapper as the variant to run.
+ */
+function selectReluVariant(dtype) {
+  return selectRuleValue('relu', 'variant', { dtype });
+}
+/**
+ * Determines how many elements the kernel should process.
+ *
+ * Precedence: a finite, positive `countOverride` (floored); otherwise the
+ * product of a non-empty `input.shape` array; otherwise the buffer byte size
+ * divided by the element width (floored).
+ */
+function resolveCount(input, countOverride) {
+  const hasOverride = Number.isFinite(countOverride) && countOverride > 0;
+  if (hasOverride) {
+    return Math.floor(countOverride);
+  }
+  const dims = input.shape;
+  if (!Array.isArray(dims) || dims.length === 0) {
+    return Math.floor(input.buffer.size / dtypeBytes(input.dtype));
+  }
+  return dims.reduce((product, dim) => product * dim, 1);
+}
+/**
+ * Plans the workgroup grid for the ReLU kernels.
+ *
+ * The shaders compute `idx = gid.y * dispatch_stride + gid.x`, so X covers one
+ * row of `dispatchStride` elements and Y must supply enough rows to reach
+ * `size`.
+ *
+ * @param {*} target - Dispatch target; `target.device.limits` is consulted when present.
+ * @param {number} size - Number of elements to process.
+ * @returns {{dispatchStride: number, workgroups: number[]}} Row stride in
+ *   elements and the [x, y, z] workgroup counts.
+ */
+function planReluDispatch(target, size) {
+  const device = target?.device;
+  // Fall back to 65535 when the target exposes no finite device limit.
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535;
+  const dispatchStride = Math.min(size, maxPerDim * WORKGROUP_SIZES.DEFAULT);
+  // A fixed Y count of 1 would leave every element past dispatchStride
+  // untouched once size exceeds a single row. The zero guard avoids 0 / 0.
+  const rows = dispatchStride > 0 ? Math.ceil(size / dispatchStride) : 1;
+  return {
+    dispatchStride,
+    workgroups: [Math.ceil(dispatchStride / WORKGROUP_SIZES.DEFAULT), rows, 1],
+  };
+}
+/**
+ * Shared implementation behind runReLU and recordReLU.
+ *
+ * @param {*} target - Passed through to the kernel wrapper and dispatch planner.
+ * @param {*} input - Input tensor; its dtype selects the kernel variant.
+ * @param {{count?: number|null, outputBuffer?: *}} [options] - Optional
+ *   element-count override and a caller-supplied output buffer.
+ * @returns {Promise<*>} Tensor wrapping the output buffer, labelled 'relu_output'.
+ */
+async function _relu(target, input, options = {}) {
+  const { count = null, outputBuffer = null } = options;
+  const size = resolveCount(input, count);
+  const variant = selectReluVariant(input.dtype);
+  // Acquire a buffer only when the caller did not supply one.
+  const output = outputBuffer || acquireBuffer(size * dtypeBytes(input.dtype), undefined, 'relu_output');
+  const dispatchPlan = planReluDispatch(target, size);
+  await unifiedKernelWrapper(
+    'relu',
+    target,
+    variant,
+    [input, output],
+    // The shaders read _pad0 as the element stride between gid.y rows.
+    { size, _pad0: dispatchPlan.dispatchStride, _pad1: 0, _pad2: 0 },
+    dispatchPlan.workgroups
+  );
+  // resolveCount accepts a tensor without a shape array, so spreading
+  // input.shape unconditionally would throw after the kernel was dispatched.
+  const outputShape = Array.isArray(input.shape) ? [...input.shape] : [size];
+  return createTensor(output, input.dtype, outputShape, 'relu_output');
+}
+/**
+ * Public entry point without a recorder: forwards to _relu with a null target.
+ */
+export async function runReLU(input, options = {}) {
+  return _relu(null, input, options);
+}
+/**
+ * Public entry point with a recorder: forwards to _relu using `recorder` as the target.
+ */
+export async function recordReLU(recorder, input, options = {}) {
+  return _relu(recorder, input, options);
+}

package/src/gpu/kernels/relu.wgsl ADDED Viewed

@@ -0,0 +1,22 @@
+// ReLU kernel (f32): output[i] = max(input[i], 0.0) for every i < size.
+override WORKGROUP_SIZE: u32 = 256u;
+struct Uniforms {
+    size: u32,
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> input: array<f32>;
+@group(0) @binding(2) var<storage, read_write> output: array<f32>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // _pad0 is read as the element stride between gid.y rows; zero is treated as 1.
+    let row_stride = max(u._pad0, 1u);
+    let i = gid.x + gid.y * row_stride;
+    if (i < u.size) {
+        output[i] = max(input[i], 0.0);
+    }
+}

package/src/gpu/kernels/relu_f16.wgsl ADDED Viewed

@@ -0,0 +1,24 @@
+// ReLU kernel (f16): output[i] = max(input[i], 0.0h) for every i < size.
+enable f16;
+override WORKGROUP_SIZE: u32 = 256u;
+struct Uniforms {
+    size: u32,
+    _pad0: u32,
+    _pad1: u32,
+    _pad2: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> input: array<f16>;
+@group(0) @binding(2) var<storage, read_write> output: array<f16>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // _pad0 is read as the element stride between gid.y rows; zero is treated as 1.
+    let row_stride = max(u._pad0, 1u);
+    let i = gid.x + gid.y * row_stride;
+    if (i < u.size) {
+        output[i] = max(input[i], 0.0h);
+    }
+}

package/src/gpu/kernels/repeat_channels.d.ts ADDED Viewed

@@ -0,0 +1,21 @@
+import type { Tensor } from '../tensor.js';
+import type { CommandRecorder } from '../command-recorder.js';
+import type { OutputBufferOptions } from './types.js';
+/** Options accepted by the repeat-channels kernel entry points. All four dimensions are required. */
+export interface RepeatChannelsOptions extends OutputBufferOptions {
+  /** Number of channels in the input. */
+  inChannels: number;
+  /** Spatial height. */
+  height: number;
+  /** Spatial width. */
+  width: number;
+  /** Repeat factor applied to the channel count. */
+  repeats: number;
+}
+/** Repeats the channels of `input` and resolves to the result tensor. */
+export declare function runRepeatChannels(
+  input: Tensor,
+  options: RepeatChannelsOptions
+): Promise<Tensor>;
+/** Same operation as `runRepeatChannels`, but takes a `CommandRecorder` as its first argument. */
+export declare function recordRepeatChannels(
+  recorder: CommandRecorder,
+  input: Tensor,
+  options: RepeatChannelsOptions
+): Promise<Tensor>;

package/src/gpu/kernels/repeat_channels.js ADDED Viewed

@@ -0,0 +1,60 @@
+import { acquireBuffer } from '../../memory/buffer-pool.js';
+import { createTensor, dtypeBytes } from '../tensor.js';
+import { unifiedKernelWrapper } from './utils.js';
+import { selectRuleValue } from './rule-registry.js';
+import { WORKGROUP_SIZES } from './constants.js';
+/**
+ * Looks up the 'variant' value of the 'repeatChannels' rule set for the given dtype.
+ * The result is passed to the kernel wrapper as the variant to run.
+ */
+function selectRepeatChannelsVariant(dtype) {
+  return selectRuleValue('repeatChannels', 'variant', { dtype });
+}
+/**
+ * Shared implementation behind runRepeatChannels and recordRepeatChannels.
+ *
+ * Produces a tensor of shape [inChannels * repeats, height, width]. The grid
+ * uses X for the flattened spatial plane and Y for the output channel.
+ *
+ * @throws {Error} If any of inChannels/height/width/repeats is not a finite
+ *   number, or repeats is below 1.
+ */
+async function _repeatChannels(target, input, options = {}) {
+  const { inChannels, height, width, repeats, outputBuffer = null } = options;
+  const allFinite = [inChannels, height, width, repeats].every((value) => Number.isFinite(value));
+  if (!allFinite || repeats < 1) {
+    throw new Error('RepeatChannels requires inChannels, height, width, and repeats.');
+  }
+
+  const outChannels = inChannels * repeats;
+  const variant = selectRepeatChannelsVariant(input.dtype);
+  const elementBytes = dtypeBytes(input.dtype);
+  const byteLength = outChannels * height * width * elementBytes;
+  // Acquire a buffer only when the caller did not supply one.
+  const output = outputBuffer || acquireBuffer(byteLength, undefined, 'repeat_channels_output');
+
+  const uniforms = {
+    in_channels: inChannels,
+    height,
+    width,
+    repeats,
+    _pad0: 0,
+  };
+  const grid = [Math.ceil((height * width) / WORKGROUP_SIZES.DEFAULT), outChannels, 1];
+  await unifiedKernelWrapper('repeat_channels', target, variant, [input, output], uniforms, grid);
+
+  return createTensor(output, input.dtype, [outChannels, height, width], 'repeat_channels_output');
+}
+/**
+ * Public entry point without a recorder: forwards to _repeatChannels with a null target.
+ */
+export async function runRepeatChannels(input, options = {}) {
+  return _repeatChannels(null, input, options);
+}
+/**
+ * Public entry point with a recorder: forwards to _repeatChannels using `recorder` as the target.
+ */
+export async function recordRepeatChannels(recorder, input, options = {}) {
+  return _repeatChannels(recorder, input, options);
+}

package/src/gpu/kernels/repeat_channels.wgsl ADDED Viewed

@@ -0,0 +1,28 @@
+// Repeat-channels kernel (f32): output channel c copies input channel c / repeats.
+override WORKGROUP_SIZE: u32 = 256u;
+struct Uniforms {
+    in_channels: u32,
+    height: u32,
+    width: u32,
+    repeats: u32,
+    _pad0: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> input: array<f32>;
+@group(0) @binding(2) var<storage, read_write> output: array<f32>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // gid.x walks the flattened spatial plane; gid.y selects the output channel.
+    let plane = u.height * u.width;
+    let pixel = gid.x;
+    let dst_channel = gid.y;
+    if (dst_channel >= u.in_channels * u.repeats || pixel >= plane) {
+        return;
+    }
+    // Integer division maps each run of `repeats` consecutive output channels
+    // onto a single input channel.
+    let src_channel = dst_channel / u.repeats;
+    output[dst_channel * plane + pixel] = input[src_channel * plane + pixel];
+}

package/src/gpu/kernels/repeat_channels_f16.wgsl ADDED Viewed

@@ -0,0 +1,30 @@
+// Repeat-channels kernel (f16): output channel c copies input channel c / repeats.
+enable f16;
+override WORKGROUP_SIZE: u32 = 256u;
+struct Uniforms {
+    in_channels: u32,
+    height: u32,
+    width: u32,
+    repeats: u32,
+    _pad0: u32,
+}
+@group(0) @binding(0) var<uniform> u: Uniforms;
+@group(0) @binding(1) var<storage, read> input: array<f16>;
+@group(0) @binding(2) var<storage, read_write> output: array<f16>;
+@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    // gid.x walks the flattened spatial plane; gid.y selects the output channel.
+    let plane = u.height * u.width;
+    let pixel = gid.x;
+    let dst_channel = gid.y;
+    if (dst_channel >= u.in_channels * u.repeats || pixel >= plane) {
+        return;
+    }
+    // Integer division maps each run of `repeats` consecutive output channels
+    // onto a single input channel.
+    let src_channel = dst_channel / u.repeats;
+    output[dst_channel * plane + pixel] = input[src_channel * plane + pixel];
+}

package/src/gpu/kernels/residual.js CHANGED Viewed

@@ -63,6 +63,22 @@ function cleanupTemps(temps, recorder) {
   }
 }
+/**
+ * Plans a 2D workgroup grid for the residual kernels so that element counts
+ * beyond one dispatch dimension are still covered.
+ *
+ * X covers one row of `dispatchStride` elements; Y supplies as many rows as
+ * are needed to reach `size`.
+ *
+ * @param {*} target - Dispatch target; `target.device.limits` is consulted when present.
+ * @param {number} size - Number of elements to process.
+ * @param {number} elementsPerWorkgroup - Elements handled by one workgroup.
+ * @returns {{dispatchStride: number, workgroups: number[]}} Row stride in
+ *   elements and the [x, y, z] workgroup counts.
+ */
+function planResidualDispatch(target, size, elementsPerWorkgroup) {
+  const device = target?.device;
+  // Fall back to 65535 when the target exposes no finite device limit.
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535;
+  const dispatchStride = Math.min(size, maxPerDim * elementsPerWorkgroup);
+  // size === 0 makes dispatchStride 0; dividing by it would yield NaN as the
+  // Y workgroup count, so an empty input is pinned to a single (empty) row.
+  const rows = dispatchStride > 0 ? Math.ceil(size / dispatchStride) : 1;
+  return {
+    dispatchStride,
+    workgroups: [
+      Math.ceil(dispatchStride / elementsPerWorkgroup),
+      rows,
+      1,
+    ],
+  };
+}
 async function _residualAdd(target, a, b, size, options = {}) {
   const recorder = target && typeof target.beginComputePass === 'function' ? target : null;
   const { useVec4 = true, outputBuffer = null } = options;
@@ -75,15 +91,17 @@ async function _residualAdd(target, a, b, size, options = {}) {
   const outputSize = size * bytesPerElement;
   const output = outputBuffer || acquireBuffer(outputSize, undefined, 'residual_output');
-  const workgroups = useVec4
-    ? Math.ceil(size / VEC4_ELEMENTS_PER_WG)
-    : Math.ceil(size / WORKGROUP_SIZES.DEFAULT);
+  const dispatchPlan = planResidualDispatch(
+    target,
+    size,
+    useVec4 ? VEC4_ELEMENTS_PER_WG : WORKGROUP_SIZES.DEFAULT
+  );
   await unifiedKernelWrapper(
     'residual', target, variant,
     [aAligned, bAligned, output],
-    { size },
-    workgroups
+    { size, scale: 1, _pad1: dispatchPlan.dispatchStride, _pad2: 0 },
+    dispatchPlan.workgroups
   );
   cleanupTemps(temps, recorder);
@@ -96,13 +114,31 @@ async function _biasAdd(target, data, bias, numTokens, dim, options = {}) {
   const { bias: biasAligned, temps } = await alignBiasTensor(data, bias, recorder);
   const variant = selectBiasAddVariant(data.dtype, biasAligned.dtype);
-  const workgroups = Math.ceil((numTokens * dim) / WORKGROUP_SIZES.DEFAULT);
+  const device = target?.device;
+  const maxPerDim = Number.isFinite(device?.limits?.maxComputeWorkgroupsPerDimension)
+    ? device.limits.maxComputeWorkgroupsPerDimension
+    : 65535;
+  const tokenStride = Math.min(numTokens, maxPerDim);
+  const workgroups = [
+    Math.ceil(dim / WORKGROUP_SIZES.DEFAULT),
+    tokenStride,
+    Math.ceil(numTokens / tokenStride),
+  ];
   await unifiedKernelWrapper(
     'bias_add', target, variant,
     [data, biasAligned],
-    { num_tokens: numTokens, dim, data_offset: dataOffset, bias_offset: biasOffset },
+    {
+      num_tokens: numTokens,
+      dim,
+      data_offset: dataOffset,
+      bias_offset: biasOffset,
+      token_stride: tokenStride,
+      _pad0: 0,
+      _pad1: 0,
+      _pad2: 0,
+    },
     workgroups
   );

package/src/gpu/kernels/residual.wgsl CHANGED Viewed

@@ -23,7 +23,8 @@ override WORKGROUP_SIZE: u32 = 256u;
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad1, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }
@@ -35,7 +36,8 @@ fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
 // This avoids requiring a different bind group layout with read_write on 'a'
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn add_inplace(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad1, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }
@@ -45,7 +47,8 @@ fn add_inplace(@builtin(global_invocation_id) gid: vec3<u32>) {
 // Fused residual + scale: output = a + scale * b
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn add_scaled(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad1, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }

package/src/gpu/kernels/residual_f16.wgsl CHANGED Viewed

@@ -27,7 +27,8 @@ override WORKGROUP_SIZE: u32 = 256u;
 @compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
 fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x;
+    let dispatch_stride = max(u._pad1, 1u);
+    let idx = gid.y * dispatch_stride + gid.x;
     if (idx >= u.size) {
         return;
     }

package/src/gpu/kernels/residual_f16_vec4.wgsl CHANGED Viewed

@@ -25,7 +25,8 @@ override WORKGROUP_SIZE_VEC4: u32 = 64u;
 // Vectorized version for better throughput
 @compute @workgroup_size(WORKGROUP_SIZE_VEC4, 1, 1)
 fn add_vec4(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x * 4u;
+    let dispatch_stride = max(u._pad1, 4u);
+    let idx = gid.y * dispatch_stride + gid.x * 4u;
     let size = u.size;
     if (idx >= size) {

package/src/gpu/kernels/residual_vec4.wgsl CHANGED Viewed

@@ -23,7 +23,8 @@ override WORKGROUP_SIZE_VEC4: u32 = 64u;
 // Vectorized version for better throughput
 @compute @workgroup_size(WORKGROUP_SIZE_VEC4, 1, 1)
 fn add_vec4(@builtin(global_invocation_id) gid: vec3<u32>) {
-    let idx = gid.x * 4u;
+    let dispatch_stride = max(u._pad1, 4u);
+    let idx = gid.y * dispatch_stride + gid.x * 4u;
     let size = u.size;
     if (idx >= size) {