@simulatte/doppler 0.1.3 → 0.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (114)
  1. package/README.md +11 -5
  2. package/package.json +27 -4
  3. package/src/client/doppler-api.browser.d.ts +1 -0
  4. package/src/client/doppler-api.browser.js +288 -0
  5. package/src/client/doppler-api.d.ts +80 -0
  6. package/src/client/doppler-api.js +298 -0
  7. package/src/client/doppler-provider/types.js +1 -1
  8. package/src/client/doppler-registry.d.ts +23 -0
  9. package/src/client/doppler-registry.js +88 -0
  10. package/src/client/doppler-registry.json +39 -0
  11. package/src/config/execution-contract-check.d.ts +82 -0
  12. package/src/config/execution-contract-check.js +317 -0
  13. package/src/config/execution-v0-contract-check.d.ts +94 -0
  14. package/src/config/execution-v0-contract-check.js +251 -0
  15. package/src/config/execution-v0-graph-contract-check.d.ts +20 -0
  16. package/src/config/execution-v0-graph-contract-check.js +64 -0
  17. package/src/config/kernel-path-contract-check.d.ts +76 -0
  18. package/src/config/kernel-path-contract-check.js +479 -0
  19. package/src/config/kernel-path-loader.d.ts +16 -0
  20. package/src/config/kernel-path-loader.js +54 -0
  21. package/src/config/kernels/kernel-ref-digests.js +12 -0
  22. package/src/config/kernels/registry.json +556 -0
  23. package/src/config/loader.js +90 -67
  24. package/src/config/merge-contract-check.d.ts +16 -0
  25. package/src/config/merge-contract-check.js +321 -0
  26. package/src/config/merge-helpers.d.ts +58 -0
  27. package/src/config/merge-helpers.js +54 -0
  28. package/src/config/merge.js +3 -6
  29. package/src/config/presets/models/janus-text.json +27 -0
  30. package/src/config/quantization-contract-check.d.ts +12 -0
  31. package/src/config/quantization-contract-check.js +91 -0
  32. package/src/config/required-inference-fields-contract-check.d.ts +24 -0
  33. package/src/config/required-inference-fields-contract-check.js +231 -0
  34. package/src/config/schema/browser-suite-metrics.schema.d.ts +17 -0
  35. package/src/config/schema/browser-suite-metrics.schema.js +46 -0
  36. package/src/config/schema/conversion-report.schema.d.ts +40 -0
  37. package/src/config/schema/conversion-report.schema.js +108 -0
  38. package/src/config/schema/doppler.schema.js +12 -18
  39. package/src/config/schema/index.d.ts +22 -0
  40. package/src/config/schema/index.js +18 -0
  41. package/src/converter/core.d.ts +10 -0
  42. package/src/converter/core.js +49 -11
  43. package/src/converter/parsers/diffusion.js +63 -3
  44. package/src/converter/tokenizer-utils.js +17 -3
  45. package/src/formats/rdrr/validation.js +13 -0
  46. package/src/gpu/kernels/depthwise_conv2d.d.ts +29 -0
  47. package/src/gpu/kernels/depthwise_conv2d.js +98 -0
  48. package/src/gpu/kernels/depthwise_conv2d.wgsl +58 -0
  49. package/src/gpu/kernels/depthwise_conv2d_f16.wgsl +62 -0
  50. package/src/gpu/kernels/grouped_pointwise_conv2d.d.ts +27 -0
  51. package/src/gpu/kernels/grouped_pointwise_conv2d.js +92 -0
  52. package/src/gpu/kernels/grouped_pointwise_conv2d.wgsl +47 -0
  53. package/src/gpu/kernels/grouped_pointwise_conv2d_f16.wgsl +51 -0
  54. package/src/gpu/kernels/index.d.ts +30 -0
  55. package/src/gpu/kernels/index.js +25 -0
  56. package/src/gpu/kernels/relu.d.ts +18 -0
  57. package/src/gpu/kernels/relu.js +45 -0
  58. package/src/gpu/kernels/relu.wgsl +21 -0
  59. package/src/gpu/kernels/relu_f16.wgsl +23 -0
  60. package/src/gpu/kernels/repeat_channels.d.ts +21 -0
  61. package/src/gpu/kernels/repeat_channels.js +60 -0
  62. package/src/gpu/kernels/repeat_channels.wgsl +29 -0
  63. package/src/gpu/kernels/repeat_channels_f16.wgsl +31 -0
  64. package/src/gpu/kernels/sana_linear_attention.d.ts +27 -0
  65. package/src/gpu/kernels/sana_linear_attention.js +122 -0
  66. package/src/gpu/kernels/sana_linear_attention_apply.wgsl +44 -0
  67. package/src/gpu/kernels/sana_linear_attention_apply_f16.wgsl +47 -0
  68. package/src/gpu/kernels/sana_linear_attention_summary.wgsl +47 -0
  69. package/src/gpu/kernels/sana_linear_attention_summary_f16.wgsl +49 -0
  70. package/src/index-browser.d.ts +1 -0
  71. package/src/index-browser.js +2 -1
  72. package/src/index.d.ts +1 -0
  73. package/src/index.js +2 -1
  74. package/src/inference/browser-harness.js +164 -38
  75. package/src/inference/pipelines/diffusion/init.js +14 -0
  76. package/src/inference/pipelines/diffusion/pipeline.js +206 -77
  77. package/src/inference/pipelines/diffusion/sana-transformer.d.ts +53 -0
  78. package/src/inference/pipelines/diffusion/sana-transformer.js +738 -0
  79. package/src/inference/pipelines/diffusion/scheduler.d.ts +17 -1
  80. package/src/inference/pipelines/diffusion/scheduler.js +91 -3
  81. package/src/inference/pipelines/diffusion/text-encoder-gpu.d.ts +6 -4
  82. package/src/inference/pipelines/diffusion/text-encoder-gpu.js +270 -0
  83. package/src/inference/pipelines/diffusion/text-encoder.js +18 -1
  84. package/src/inference/pipelines/diffusion/types.d.ts +4 -0
  85. package/src/inference/pipelines/diffusion/vae.js +782 -78
  86. package/src/inference/pipelines/text/config.d.ts +5 -0
  87. package/src/inference/pipelines/text/config.js +1 -1
  88. package/src/inference/pipelines/text/execution-v0.js +141 -101
  89. package/src/inference/pipelines/text/init.js +41 -10
  90. package/src/inference/pipelines/text.js +7 -1
  91. package/src/rules/execution-rules-contract-check.d.ts +17 -0
  92. package/src/rules/execution-rules-contract-check.js +245 -0
  93. package/src/rules/kernels/depthwise-conv2d.rules.json +6 -0
  94. package/src/rules/kernels/grouped-pointwise-conv2d.rules.json +6 -0
  95. package/src/rules/kernels/relu.rules.json +6 -0
  96. package/src/rules/kernels/repeat-channels.rules.json +6 -0
  97. package/src/rules/kernels/sana-linear-attention.rules.json +6 -0
  98. package/src/rules/layer-pattern-contract-check.d.ts +17 -0
  99. package/src/rules/layer-pattern-contract-check.js +231 -0
  100. package/src/rules/rule-registry.d.ts +28 -0
  101. package/src/rules/rule-registry.js +38 -0
  102. package/src/tooling/conversion-config-materializer.d.ts +24 -0
  103. package/src/tooling/conversion-config-materializer.js +99 -0
  104. package/src/tooling/lean-execution-contract-runner.d.ts +43 -0
  105. package/src/tooling/lean-execution-contract-runner.js +158 -0
  106. package/src/tooling/lean-execution-contract.d.ts +16 -0
  107. package/src/tooling/lean-execution-contract.js +81 -0
  108. package/src/tooling/node-convert.d.ts +10 -0
  109. package/src/tooling/node-converter.js +59 -0
  110. package/src/tooling/node-webgpu.js +30 -9
  111. package/src/version.d.ts +2 -0
  112. package/src/version.js +2 -0
  113. package/tools/convert-safetensors-node.js +47 -0
  114. package/tools/doppler-cli.js +167 -6
// Type declarations for the grouped pointwise (1x1) Conv2D GPU kernel.
import type { Tensor } from '../tensor.js';
import type { CommandRecorder } from '../command-recorder.js';
import type { OutputBufferOptions } from './types.js';
import type { WeightBuffer } from '../weight-buffer.js';

/** Dimensions for a grouped 1x1 convolution over a single NCHW image. */
export interface GroupedPointwiseConv2DOptions extends OutputBufferOptions {
  /** Total input channels; must be divisible by `groups`. */
  inChannels: number;
  /** Total output channels; must be divisible by `groups`. */
  outChannels: number;
  height: number;
  width: number;
  /** Number of convolution groups (1 = ordinary pointwise conv). */
  groups: number;
}

/**
 * Runs the grouped pointwise conv2d kernel immediately.
 * `bias` may be null; a zero-filled bias is substituted internally.
 * Returns a tensor of shape [outChannels, height, width].
 */
export declare function runGroupedPointwiseConv2D(
  input: Tensor,
  weight: GPUBuffer | WeightBuffer,
  bias: GPUBuffer | WeightBuffer | null,
  options: GroupedPointwiseConv2DOptions
): Promise<Tensor>;

/**
 * Records the same dispatch into an existing command recorder instead of
 * submitting immediately.
 */
export declare function recordGroupedPointwiseConv2D(
  recorder: CommandRecorder,
  input: Tensor,
  weight: GPUBuffer | WeightBuffer,
  bias: GPUBuffer | WeightBuffer | null,
  options: GroupedPointwiseConv2DOptions
): Promise<Tensor>;
import { getDevice } from '../device.js';
import { acquireBuffer, releaseBuffer } from '../../memory/buffer-pool.js';
import { createTensor, dtypeBytes } from '../tensor.js';
import { getBuffer } from '../weight-buffer.js';
import { unifiedKernelWrapper } from './utils.js';
import { selectRuleValue } from './rule-registry.js';
import { WORKGROUP_SIZES } from './constants.js';

// Resolves the f16/f32 shader variant from the rule registry.
function selectGroupedPointwiseConv2DVariant(isF16) {
  return selectRuleValue('groupedPointwiseConv2d', 'variant', { isF16 });
}

/**
 * Grouped pointwise (1x1) conv2d over a single NCHW image.
 *
 * @param target - CommandRecorder to record into, or null to run immediately.
 * @param input - Tensor; assumed shape [inChannels, height, width] — the
 *   output is created with the matching [outChannels, height, width] shape.
 * @param weight - GPUBuffer | WeightBuffer holding the conv weights.
 * @param bias - Optional GPUBuffer | WeightBuffer; when null a zero-filled
 *   temporary bias buffer is substituted.
 * @param options - Explicit dimensions; see GroupedPointwiseConv2DOptions.
 * @returns Tensor of shape [outChannels, height, width], same dtype as input.
 * @throws Error when a dimension is missing/non-finite or channels are not
 *   divisible by `groups`.
 */
async function _groupedPointwiseConv2D(target, input, weight, bias, options = {}) {
  // Duck-type the recorder: anything with beginComputePass is treated as one.
  const recorder = target && typeof target.beginComputePass === 'function' ? target : null;
  const device = target?.device || getDevice();
  const {
    inChannels,
    outChannels,
    height,
    width,
    groups,
    outputBuffer = null,
  } = options;

  if (
    !Number.isFinite(inChannels) ||
    !Number.isFinite(outChannels) ||
    !Number.isFinite(height) ||
    !Number.isFinite(width) ||
    !Number.isFinite(groups)
  ) {
    throw new Error('GroupedPointwiseConv2D requires explicit dimensions.');
  }
  if (groups <= 0 || inChannels % groups !== 0 || outChannels % groups !== 0) {
    throw new Error(
      `GroupedPointwiseConv2D requires inChannels/outChannels divisible by groups. Got ${inChannels}/${outChannels}/${groups}.`
    );
  }

  const isF16 = input.dtype === 'f16';
  const variant = selectGroupedPointwiseConv2DVariant(isF16);
  const bytesPerElement = dtypeBytes(input.dtype);
  const outputSize = outChannels * height * width * bytesPerElement;
  const output = outputBuffer || acquireBuffer(outputSize, undefined, 'grouped_pointwise_conv2d_output');

  const weightBuffer = getBuffer(weight);
  let biasBuffer = getBuffer(bias);
  let tempBias = null;
  if (!biasBuffer) {
    // No bias supplied: substitute a zero-filled buffer so the shader can
    // always read bias[out_channel].
    // GPUQueue.writeBuffer requires the write size to be a multiple of 4
    // bytes, so the allocation must be the padded size as well — acquiring
    // only `biasSize` would make the padded write run past the end of the
    // buffer whenever biasSize % 4 !== 0 (e.g. f16 with an odd outChannels).
    const biasSize = outChannels * bytesPerElement;
    const paddedSize = Math.ceil(biasSize / 4) * 4;
    tempBias = acquireBuffer(paddedSize, undefined, 'grouped_pointwise_conv2d_bias_zero');
    biasBuffer = tempBias;
    device.queue.writeBuffer(biasBuffer, 0, new Uint8Array(paddedSize));
  }

  // Uniforms mirror the WGSL Uniforms struct (8 x u32, padded to 32 bytes).
  await unifiedKernelWrapper(
    'grouped_pointwise_conv2d',
    target,
    variant,
    [input, weightBuffer, biasBuffer, output],
    {
      in_channels: inChannels,
      out_channels: outChannels,
      height,
      width,
      groups,
      _pad0: 0,
      _pad1: 0,
      _pad2: 0,
    },
    // One thread per output element.
    Math.ceil((outChannels * height * width) / WORKGROUP_SIZES.DEFAULT)
  );

  if (tempBias) {
    if (recorder) {
      // Recorded path: the temp bias must stay alive until the recorded
      // commands are submitted, so hand ownership to the recorder.
      recorder.trackTemporaryBuffer(tempBias);
    } else {
      releaseBuffer(tempBias);
    }
  }

  return createTensor(output, input.dtype, [outChannels, height, width], 'grouped_pointwise_conv2d_output');
}

/** Immediate-mode entry point; see _groupedPointwiseConv2D. */
export async function runGroupedPointwiseConv2D(input, weight, bias, options = {}) {
  return _groupedPointwiseConv2D(null, input, weight, bias, options);
}

/** Recording entry point; see _groupedPointwiseConv2D. */
export async function recordGroupedPointwiseConv2D(recorder, input, weight, bias, options = {}) {
  return _groupedPointwiseConv2D(recorder, input, weight, bias, options);
}
// Grouped Pointwise Conv2D Kernel (NCHW, f32).
// One invocation computes one output element (out_channel, y, x);
// the dispatch covers out_channels * height * width threads.

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  in_channels: u32,   // divisible by `groups` (validated host-side)
  out_channels: u32,  // divisible by `groups` (validated host-side)
  height: u32,
  width: u32,
  groups: u32,
  _pad0: u32,  // padding up to a 32-byte uniform block
  _pad1: u32,
  _pad2: u32,
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f32>;
@group(0) @binding(2) var<storage, read> weight: array<f32>;
@group(0) @binding(3) var<storage, read> bias: array<f32>;
@group(0) @binding(4) var<storage, read_write> output: array<f32>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  let spatial = u.height * u.width;
  let out_size = u.out_channels * spatial;
  // Dispatch is rounded up to WORKGROUP_SIZE; excess threads exit early.
  if (idx >= out_size) {
    return;
  }

  // Decompose the flat index into (out_channel, y, x), NCHW order.
  let out_channel = idx / spatial;
  let rem = idx - out_channel * spatial;
  let y = rem / u.width;
  let x = rem - y * u.width;

  // Each output channel reads only the input channels of its own group.
  let in_per_group = u.in_channels / u.groups;
  let out_per_group = u.out_channels / u.groups;
  let group_idx = out_channel / out_per_group;
  let in_offset = group_idx * in_per_group;

  // 1x1 conv: dot product over the group's input channels, plus bias.
  var sum: f32 = bias[out_channel];
  for (var i: u32 = 0u; i < in_per_group; i = i + 1u) {
    let input_idx = ((in_offset + i) * u.height + y) * u.width + x;
    let weight_idx = out_channel * in_per_group + i;
    sum = sum + input[input_idx] * weight[weight_idx];
  }

  output[idx] = sum;
}
// Grouped Pointwise Conv2D Kernel (NCHW, f16)
// Same layout and indexing as the f32 variant; storage is f16 but the
// accumulator is widened to f32 to limit rounding error over long dot
// products, then narrowed back to f16 on store.

enable f16;

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  in_channels: u32,   // divisible by `groups` (validated host-side)
  out_channels: u32,  // divisible by `groups` (validated host-side)
  height: u32,
  width: u32,
  groups: u32,
  _pad0: u32,  // padding up to a 32-byte uniform block
  _pad1: u32,
  _pad2: u32,
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f16>;
@group(0) @binding(2) var<storage, read> weight: array<f16>;
@group(0) @binding(3) var<storage, read> bias: array<f16>;
@group(0) @binding(4) var<storage, read_write> output: array<f16>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  let spatial = u.height * u.width;
  let out_size = u.out_channels * spatial;
  // Dispatch is rounded up to WORKGROUP_SIZE; excess threads exit early.
  if (idx >= out_size) {
    return;
  }

  // Decompose the flat index into (out_channel, y, x), NCHW order.
  let out_channel = idx / spatial;
  let rem = idx - out_channel * spatial;
  let y = rem / u.width;
  let x = rem - y * u.width;

  // Each output channel reads only the input channels of its own group.
  let in_per_group = u.in_channels / u.groups;
  let out_per_group = u.out_channels / u.groups;
  let group_idx = out_channel / out_per_group;
  let in_offset = group_idx * in_per_group;

  // Accumulate in f32 for precision.
  var sum: f32 = f32(bias[out_channel]);
  for (var i: u32 = 0u; i < in_per_group; i = i + 1u) {
    let input_idx = ((in_offset + i) * u.height + y) * u.width + x;
    let weight_idx = out_channel * in_per_group + i;
    sum = sum + f32(input[input_idx]) * f32(weight[weight_idx]);
  }

  output[idx] = f16(sum);
}
@@ -174,6 +174,18 @@ export {
174
174
  type Conv2DOptions,
175
175
  } from './conv2d.js';
176
176
 
177
+ export {
178
+ runDepthwiseConv2D,
179
+ recordDepthwiseConv2D,
180
+ type DepthwiseConv2DOptions,
181
+ } from './depthwise_conv2d.js';
182
+
183
+ export {
184
+ runGroupedPointwiseConv2D,
185
+ recordGroupedPointwiseConv2D,
186
+ type GroupedPointwiseConv2DOptions,
187
+ } from './grouped_pointwise_conv2d.js';
188
+
177
189
  // Gather (Embedding Lookup)
178
190
  export {
179
191
  runGather,
@@ -250,6 +262,24 @@ export {
250
262
  type SampleResult,
251
263
  } from './sample.js';
252
264
 
265
+ export {
266
+ runSanaLinearAttention,
267
+ recordSanaLinearAttention,
268
+ type SanaLinearAttentionOptions,
269
+ } from './sana_linear_attention.js';
270
+
271
+ export {
272
+ runRepeatChannels,
273
+ recordRepeatChannels,
274
+ type RepeatChannelsOptions,
275
+ } from './repeat_channels.js';
276
+
277
+ export {
278
+ runReLU,
279
+ recordReLU,
280
+ type ReLUOptions,
281
+ } from './relu.js';
282
+
253
283
  // Fused FFN (Tier 2 P0)
254
284
  export {
255
285
  runFusedFFN,
@@ -139,6 +139,16 @@ export {
139
139
  recordConv2D,
140
140
  } from './conv2d.js';
141
141
 
142
+ export {
143
+ runDepthwiseConv2D,
144
+ recordDepthwiseConv2D,
145
+ } from './depthwise_conv2d.js';
146
+
147
+ export {
148
+ runGroupedPointwiseConv2D,
149
+ recordGroupedPointwiseConv2D,
150
+ } from './grouped_pointwise_conv2d.js';
151
+
142
152
  // Gather (Embedding Lookup)
143
153
  export {
144
154
  runGather,
@@ -205,6 +215,21 @@ export {
205
215
  isGPUSamplingAvailable,
206
216
  } from './sample.js';
207
217
 
218
+ export {
219
+ runSanaLinearAttention,
220
+ recordSanaLinearAttention,
221
+ } from './sana_linear_attention.js';
222
+
223
+ export {
224
+ runRepeatChannels,
225
+ recordRepeatChannels,
226
+ } from './repeat_channels.js';
227
+
228
+ export {
229
+ runReLU,
230
+ recordReLU,
231
+ } from './relu.js';
232
+
208
233
  // Fused FFN (Tier 2 P0)
209
234
  export {
210
235
  runFusedFFN,
// Type declarations for the elementwise ReLU GPU kernel.
import type { Tensor } from '../tensor.js';
import type { CommandRecorder } from '../command-recorder.js';
import type { OutputBufferOptions } from './types.js';

export interface ReLUOptions extends OutputBufferOptions {
  /**
   * Number of elements to process. When omitted or null, the count is
   * derived from the tensor shape, falling back to the buffer capacity.
   */
  count?: number | null;
}

/** Runs ReLU (output[i] = max(input[i], 0)) immediately. */
export declare function runReLU(
  input: Tensor,
  options?: ReLUOptions
): Promise<Tensor>;

/** Records the ReLU dispatch into an existing command recorder. */
export declare function recordReLU(
  recorder: CommandRecorder,
  input: Tensor,
  options?: ReLUOptions
): Promise<Tensor>;
import { acquireBuffer } from '../../memory/buffer-pool.js';
import { createTensor, dtypeBytes } from '../tensor.js';
import { unifiedKernelWrapper } from './utils.js';
import { selectRuleValue } from './rule-registry.js';
import { WORKGROUP_SIZES } from './constants.js';

// Chooses the f16/f32 shader variant for the given dtype.
const pickReluVariant = (dtype) => selectRuleValue('relu', 'variant', { dtype });

/**
 * Determines how many elements the kernel should touch: an explicit
 * positive override wins, then the product of the tensor's shape, and
 * finally the raw buffer capacity in elements.
 */
function elementCount(tensor, override) {
  if (Number.isFinite(override) && override > 0) {
    return Math.floor(override);
  }
  const { shape } = tensor;
  if (Array.isArray(shape) && shape.length > 0) {
    let total = 1;
    for (const dim of shape) {
      total *= dim;
    }
    return total;
  }
  return Math.floor(tensor.buffer.size / dtypeBytes(tensor.dtype));
}

/**
 * Elementwise ReLU. `target` is a CommandRecorder to record into, or null
 * to execute immediately. Returns a tensor with the input's dtype/shape.
 */
async function _relu(target, input, options = {}) {
  const { count = null, outputBuffer = null } = options;
  const numElements = elementCount(input, count);
  const shaderVariant = pickReluVariant(input.dtype);
  const resultBuffer =
    outputBuffer || acquireBuffer(numElements * dtypeBytes(input.dtype), undefined, 'relu_output');

  // Uniforms mirror the WGSL struct: size plus three padding words.
  await unifiedKernelWrapper(
    'relu',
    target,
    shaderVariant,
    [input, resultBuffer],
    { size: numElements, _pad0: 0, _pad1: 0, _pad2: 0 },
    Math.ceil(numElements / WORKGROUP_SIZES.DEFAULT)
  );

  return createTensor(resultBuffer, input.dtype, [...input.shape], 'relu_output');
}

export async function runReLU(input, options = {}) {
  return _relu(null, input, options);
}

export async function recordReLU(recorder, input, options = {}) {
  return _relu(recorder, input, options);
}
// Elementwise ReLU kernel, f32: output[i] = max(input[i], 0).
// One invocation per element; the dispatch is rounded up to
// WORKGROUP_SIZE, so out-of-range threads exit early.

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  size: u32,   // number of elements to process
  _pad0: u32,  // padding up to a 16-byte uniform block
  _pad1: u32,
  _pad2: u32,
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f32>;
@group(0) @binding(2) var<storage, read_write> output: array<f32>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  if (idx >= u.size) {
    return;
  }
  output[idx] = max(input[idx], 0.0);
}
// Elementwise ReLU kernel, f16: output[i] = max(input[i], 0).
// Identical structure to the f32 variant; only the storage type differs.

enable f16;

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  size: u32,   // number of elements to process
  _pad0: u32,  // padding up to a 16-byte uniform block
  _pad1: u32,
  _pad2: u32,
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f16>;
@group(0) @binding(2) var<storage, read_write> output: array<f16>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  if (idx >= u.size) {
    return;
  }
  output[idx] = max(input[idx], 0.0h);
}
// Type declarations for the channel-repeat GPU kernel (NCHW).
import type { Tensor } from '../tensor.js';
import type { CommandRecorder } from '../command-recorder.js';
import type { OutputBufferOptions } from './types.js';

/**
 * Dimensions for repeating channels. The input is treated as
 * [inChannels, height, width]; the output is
 * [inChannels * repeats, height, width], with each input channel
 * duplicated `repeats` times consecutively.
 */
export interface RepeatChannelsOptions extends OutputBufferOptions {
  inChannels: number;
  height: number;
  width: number;
  /** Duplication factor; must be >= 1. */
  repeats: number;
}

/** Runs the channel-repeat kernel immediately. */
export declare function runRepeatChannels(
  input: Tensor,
  options: RepeatChannelsOptions
): Promise<Tensor>;

/** Records the channel-repeat dispatch into an existing command recorder. */
export declare function recordRepeatChannels(
  recorder: CommandRecorder,
  input: Tensor,
  options: RepeatChannelsOptions
): Promise<Tensor>;
import { acquireBuffer } from '../../memory/buffer-pool.js';
import { createTensor, dtypeBytes } from '../tensor.js';
import { unifiedKernelWrapper } from './utils.js';
import { selectRuleValue } from './rule-registry.js';
import { WORKGROUP_SIZES } from './constants.js';

// Resolves the f16/f32 shader variant for the tensor dtype.
const pickRepeatChannelsVariant = (dtype) =>
  selectRuleValue('repeatChannels', 'variant', { dtype });

/**
 * Duplicates each input channel `repeats` times (NCHW, single image),
 * producing an [inChannels * repeats, height, width] tensor.
 * `target` is a CommandRecorder to record into, or null to run immediately.
 * Throws when any dimension is missing/non-finite or repeats < 1.
 */
async function _repeatChannels(target, input, options = {}) {
  const { inChannels, height, width, repeats, outputBuffer = null } = options;

  const dimensionsValid =
    [inChannels, height, width, repeats].every((value) => Number.isFinite(value)) &&
    repeats >= 1;
  if (!dimensionsValid) {
    throw new Error('RepeatChannels requires inChannels, height, width, and repeats.');
  }

  const outChannels = inChannels * repeats;
  const shaderVariant = pickRepeatChannelsVariant(input.dtype);
  const elementBytes = dtypeBytes(input.dtype);
  const resultBuffer =
    outputBuffer ||
    acquireBuffer(outChannels * height * width * elementBytes, undefined, 'repeat_channels_output');

  // Uniforms mirror the WGSL struct: four dimensions plus one padding word.
  await unifiedKernelWrapper(
    'repeat_channels',
    target,
    shaderVariant,
    [input, resultBuffer],
    {
      in_channels: inChannels,
      height,
      width,
      repeats,
      _pad0: 0,
    },
    // One thread per output element.
    Math.ceil((outChannels * height * width) / WORKGROUP_SIZES.DEFAULT)
  );

  return createTensor(resultBuffer, input.dtype, [outChannels, height, width], 'repeat_channels_output');
}

export async function runRepeatChannels(input, options = {}) {
  return _repeatChannels(null, input, options);
}

export async function recordRepeatChannels(recorder, input, options = {}) {
  return _repeatChannels(recorder, input, options);
}
// Channel-repeat kernel (NCHW, f32).
// Output channel c reads input channel c / repeats, so each input channel
// is duplicated `repeats` times consecutively along the channel axis.
// One invocation per output element.

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  in_channels: u32,
  height: u32,
  width: u32,
  repeats: u32,  // duplication factor (>= 1, validated host-side)
  _pad0: u32,    // padding word
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f32>;
@group(0) @binding(2) var<storage, read_write> output: array<f32>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  let spatial = u.height * u.width;
  let out_channels = u.in_channels * u.repeats;
  let total = out_channels * spatial;
  // Dispatch is rounded up to WORKGROUP_SIZE; excess threads exit early.
  if (idx >= total) {
    return;
  }

  // Map output channel back to its source input channel.
  let out_channel = idx / spatial;
  let channel = out_channel / u.repeats;
  let spatial_idx = idx - out_channel * spatial;
  output[idx] = input[channel * spatial + spatial_idx];
}
// Channel-repeat kernel (NCHW, f16).
// Identical indexing to the f32 variant; only the storage type differs.
// Output channel c reads input channel c / repeats.

enable f16;

override WORKGROUP_SIZE: u32 = 256u;

struct Uniforms {
  in_channels: u32,
  height: u32,
  width: u32,
  repeats: u32,  // duplication factor (>= 1, validated host-side)
  _pad0: u32,    // padding word
}

@group(0) @binding(0) var<uniform> u: Uniforms;
@group(0) @binding(1) var<storage, read> input: array<f16>;
@group(0) @binding(2) var<storage, read_write> output: array<f16>;

@compute @workgroup_size(WORKGROUP_SIZE, 1, 1)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
  let idx = gid.x;
  let spatial = u.height * u.width;
  let out_channels = u.in_channels * u.repeats;
  let total = out_channels * spatial;
  // Dispatch is rounded up to WORKGROUP_SIZE; excess threads exit early.
  if (idx >= total) {
    return;
  }

  // Map output channel back to its source input channel.
  let out_channel = idx / spatial;
  let channel = out_channel / u.repeats;
  let spatial_idx = idx - out_channel * spatial;
  output[idx] = input[channel * spatial + spatial_idx];
}
// Type declarations for the SANA linear-attention GPU kernel.
// NOTE(review): the implementation (sana_linear_attention.js and the
// summary/apply WGSL pair) is not visible from this file; the field notes
// below are assumptions to confirm against it.
import type { Tensor } from '../tensor.js';
import type { CommandRecorder } from '../command-recorder.js';
import type { OutputBufferOptions } from './types.js';

export interface SanaLinearAttentionOptions extends OutputBufferOptions {
  numHeads: number;
  headDim: number;
  // Presumably the number of tokens; derived from the input when omitted — TODO confirm.
  numTokens?: number;
  // Presumably numHeads * headDim when omitted — TODO confirm.
  hiddenSize?: number;
  // Numerical-stability epsilon; default set by the implementation.
  eps?: number;
  /** Optional pre-allocated buffer for the intermediate summary pass. */
  summaryBuffer?: GPUBuffer | null;
}

/** Runs the SANA linear-attention kernel immediately. */
export declare function runSanaLinearAttention(
  query: Tensor,
  key: Tensor,
  value: Tensor,
  options: SanaLinearAttentionOptions
): Promise<Tensor>;

/** Records the SANA linear-attention dispatches into an existing recorder. */
export declare function recordSanaLinearAttention(
  recorder: CommandRecorder,
  query: Tensor,
  key: Tensor,
  value: Tensor,
  options: SanaLinearAttentionOptions
): Promise<Tensor>;