@mni-ml/framework 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73) hide show
  1. package/dist/autodiff.d.ts +13 -0
  2. package/dist/autodiff.d.ts.map +1 -0
  3. package/dist/autodiff.js +91 -0
  4. package/dist/autodiff.js.map +1 -0
  5. package/dist/datasets.d.ts +16 -0
  6. package/dist/datasets.d.ts.map +1 -0
  7. package/dist/datasets.js +64 -0
  8. package/dist/datasets.js.map +1 -0
  9. package/dist/fast_ops.d.ts +23 -0
  10. package/dist/fast_ops.d.ts.map +1 -0
  11. package/dist/fast_ops.js +263 -0
  12. package/dist/fast_ops.js.map +1 -0
  13. package/dist/fast_ops_worker.d.ts +2 -0
  14. package/dist/fast_ops_worker.d.ts.map +1 -0
  15. package/dist/fast_ops_worker.js +119 -0
  16. package/dist/fast_ops_worker.js.map +1 -0
  17. package/dist/gpu_backend.d.ts +37 -0
  18. package/dist/gpu_backend.d.ts.map +1 -0
  19. package/dist/gpu_backend.js +163 -0
  20. package/dist/gpu_backend.js.map +1 -0
  21. package/dist/gpu_kernels.d.ts +74 -0
  22. package/dist/gpu_kernels.d.ts.map +1 -0
  23. package/dist/gpu_kernels.js +571 -0
  24. package/dist/gpu_kernels.js.map +1 -0
  25. package/dist/gpu_ops.d.ts +43 -0
  26. package/dist/gpu_ops.d.ts.map +1 -0
  27. package/dist/gpu_ops.js +365 -0
  28. package/dist/gpu_ops.js.map +1 -0
  29. package/dist/index.d.ts +15 -0
  30. package/dist/index.d.ts.map +1 -0
  31. package/dist/index.js +20 -0
  32. package/dist/index.js.map +1 -0
  33. package/dist/module.d.ts +23 -0
  34. package/dist/module.d.ts.map +1 -0
  35. package/dist/module.js +97 -0
  36. package/dist/module.js.map +1 -0
  37. package/dist/nn.d.ts +63 -0
  38. package/dist/nn.d.ts.map +1 -0
  39. package/dist/nn.js +234 -0
  40. package/dist/nn.js.map +1 -0
  41. package/dist/operators.d.ts +29 -0
  42. package/dist/operators.d.ts.map +1 -0
  43. package/dist/operators.js +91 -0
  44. package/dist/operators.js.map +1 -0
  45. package/dist/optimizer.d.ts +15 -0
  46. package/dist/optimizer.d.ts.map +1 -0
  47. package/dist/optimizer.js +62 -0
  48. package/dist/optimizer.js.map +1 -0
  49. package/dist/scalar.d.ts +42 -0
  50. package/dist/scalar.d.ts.map +1 -0
  51. package/dist/scalar.js +126 -0
  52. package/dist/scalar.js.map +1 -0
  53. package/dist/scalar_functions.d.ts +62 -0
  54. package/dist/scalar_functions.d.ts.map +1 -0
  55. package/dist/scalar_functions.js +127 -0
  56. package/dist/scalar_functions.js.map +1 -0
  57. package/dist/tensor.d.ts +58 -0
  58. package/dist/tensor.d.ts.map +1 -0
  59. package/dist/tensor.js +288 -0
  60. package/dist/tensor.js.map +1 -0
  61. package/dist/tensor_data.d.ts +29 -0
  62. package/dist/tensor_data.d.ts.map +1 -0
  63. package/dist/tensor_data.js +131 -0
  64. package/dist/tensor_data.js.map +1 -0
  65. package/dist/tensor_functions.d.ts +97 -0
  66. package/dist/tensor_functions.d.ts.map +1 -0
  67. package/dist/tensor_functions.js +465 -0
  68. package/dist/tensor_functions.js.map +1 -0
  69. package/dist/tensor_ops.d.ts +47 -0
  70. package/dist/tensor_ops.d.ts.map +1 -0
  71. package/dist/tensor_ops.js +249 -0
  72. package/dist/tensor_ops.js.map +1 -0
  73. package/package.json +45 -0
@@ -0,0 +1,37 @@
1
/** Lazily create and cache the process-wide default GPUDevice. */
export declare function getDevice(): Promise<GPUDevice>;
/** Destroy the cached default device and clear the pipeline and tensor-core caches. */
export declare function destroyDevice(): void;
/**
 * Try to obtain a GPUDevice with experimental subgroup matrix support.
 * Returns null when the feature is not available on this adapter/platform.
 * Dawn (the backend for the `webgpu` npm package) exposes Apple simdgroup_matrix
 * (8x8 f32 on Metal/Apple7+) and Vulkan VK_KHR_cooperative_matrix behind the
 * `allow_unsafe_apis` toggle and the `chromium-experimental-subgroup-matrix`
 * device feature.
 */
export declare function getTensorCoreDevice(): Promise<GPUDevice | null>;
/** Drop the cached tensor-core device without calling destroy() (see gpu_backend.js for why). */
export declare function destroyTensorCoreDevice(): void;
/**
 * Upload a Float64Array to the GPU as f32. Returns a STORAGE | COPY_SRC buffer.
 */
export declare function uploadBuffer(device: GPUDevice, data: Float64Array): GPUBuffer;
/**
 * Create a GPU buffer for compute output (read-write storage, copyable for readback).
 */
export declare function createOutputBuffer(device: GPUDevice, count: number): GPUBuffer;
/**
 * Read a GPU buffer back to a Float64Array (f32 -> f64).
 */
export declare function readbackBuffer(device: GPUDevice, srcBuffer: GPUBuffer, count: number): Promise<Float64Array>;
/**
 * Create a uniform buffer from an ArrayBuffer of packed metadata.
 * Only suitable for simple structs without array members (WGSL alignment rules).
 */
export declare function createUniformBuffer(device: GPUDevice, data: ArrayBuffer): GPUBuffer;
/**
 * Create a read-only storage buffer for params that contain arrays.
 * Storage buffers don't have the 16-byte array element alignment requirement
 * that uniform buffers do in WGSL.
 */
export declare function createStorageParamsBuffer(device: GPUDevice, data: ArrayBuffer): GPUBuffer;
/** Compile (or fetch from a module-level cache) a compute pipeline for WGSL source. */
export declare function getOrCreatePipeline(device: GPUDevice, shaderCode: string, entryPoint?: string): GPUComputePipeline;
//# sourceMappingURL=gpu_backend.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"gpu_backend.d.ts","sourceRoot":"","sources":["../src/gpu_backend.ts"],"names":[],"mappings":"AASA,wBAAsB,SAAS,IAAI,OAAO,CAAC,SAAS,CAAC,CAOpD;AAED,wBAAgB,aAAa,IAAI,IAAI,CAMpC;AAOD;;;;;;;GAOG;AACH,wBAAsB,mBAAmB,IAAI,OAAO,CAAC,SAAS,GAAG,IAAI,CAAC,CA2BrE;AAED,wBAAgB,uBAAuB,IAAI,IAAI,CAM9C;AAID;;GAEG;AACH,wBAAgB,YAAY,CAAC,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,YAAY,GAAG,SAAS,CAa7E;AAED;;GAEG;AACH,wBAAgB,kBAAkB,CAAC,MAAM,EAAE,SAAS,EAAE,KAAK,EAAE,MAAM,GAAG,SAAS,CAK9E;AAED;;GAEG;AACH,wBAAsB,cAAc,CAChC,MAAM,EAAE,SAAS,EACjB,SAAS,EAAE,SAAS,EACpB,KAAK,EAAE,MAAM,GACd,OAAO,CAAC,YAAY,CAAC,CAkBvB;AAED;;;GAGG;AACH,wBAAgB,mBAAmB,CAAC,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,GAAG,SAAS,CAUnF;AAED;;;;GAIG;AACH,wBAAgB,yBAAyB,CAAC,MAAM,EAAE,SAAS,EAAE,IAAI,EAAE,WAAW,GAAG,SAAS,CASzF;AAID,wBAAgB,mBAAmB,CAC/B,MAAM,EAAE,SAAS,EACjB,UAAU,EAAE,MAAM,EAClB,UAAU,GAAE,MAAe,GAC5B,kBAAkB,CAYpB"}
@@ -0,0 +1,163 @@
1
// WebGPU bootstrap: the `webgpu` npm package (Dawn) supplies the GPU* globals
// (GPUBufferUsage, GPUMapMode, ...) that browsers normally expose.
import { create, globals } from 'webgpu';
Object.assign(globalThis, globals);
// Lazily-created default device and its GPU instance (see getDevice/destroyDevice).
let _device = null;
let _gpu = null;
// shader source -> compiled compute pipeline; cleared in destroyDevice()
const pipelineCache = new Map();
6
/**
 * Lazily create (and then reuse) the default GPUDevice.
 *
 * @returns the shared device singleton
 * @throws Error when no WebGPU adapter is available
 */
export async function getDevice() {
    if (_device === null) {
        _gpu = create([]);
        const adapter = await _gpu.requestAdapter();
        if (!adapter) {
            throw new Error('No WebGPU adapter found');
        }
        _device = await adapter.requestDevice();
    }
    return _device;
}
16
/**
 * Tear down the cached default device, the tensor-core device, and the
 * compiled-pipeline cache. Safe to call when nothing was ever created.
 */
export function destroyDevice() {
    if (_device) {
        _device.destroy();
    }
    _device = null;
    _gpu = null;
    destroyTensorCoreDevice();
    pipelineCache.clear();
}
23
// ---- Experimental tensor core (subgroup matrix) device ----
// Cached experimental device; _tcProbed latches after the first probe so a
// failed feature detection is not retried on every call.
let _tcDevice = null;
let _tcProbed = false;
26
/**
 * Try to obtain a GPUDevice with experimental subgroup matrix support.
 * Resolves to null when the feature is unavailable on this adapter/platform.
 * Dawn (the backend of the `webgpu` npm package) exposes Apple simdgroup_matrix
 * (8x8 f32 on Metal/Apple7+) and Vulkan VK_KHR_cooperative_matrix behind the
 * `allow_unsafe_apis` toggle and the `chromium-experimental-subgroup-matrix`
 * device feature.
 *
 * @returns the cached tensor-core device, or null when unsupported
 */
export async function getTensorCoreDevice() {
    if (_tcDevice) {
        return _tcDevice;
    }
    if (_tcProbed) {
        return null;
    }
    _tcProbed = true;
    try {
        const instance = create([
            'enable-dawn-features=allow_unsafe_apis',
        ]);
        const adapter = await instance.requestAdapter();
        if (!adapter) {
            return null;
        }
        // Prefer the Chromium-prefixed feature name, then the unprefixed one.
        let featureName = null;
        for (const candidate of ['chromium-experimental-subgroup-matrix', 'subgroup-matrix']) {
            if (adapter.features.has(candidate)) {
                featureName = candidate;
                break;
            }
        }
        if (!featureName) {
            return null;
        }
        _tcDevice = await adapter.requestDevice({
            requiredFeatures: [featureName],
        });
        return _tcDevice;
    }
    catch {
        return null;
    }
}
63
/** Forget the cached tensor-core device and re-arm the feature probe. */
export function destroyTensorCoreDevice() {
    // Intentionally no device.destroy(): Dawn's experimental subgroup matrix
    // path can SIGSEGV during teardown even though compute results are correct.
    // Dropping the reference and letting GC reclaim it avoids the crash in
    // test/process-exit scenarios.
    _tcProbed = false;
    _tcDevice = null;
}
70
// Buffer helpers
/**
 * Upload a Float64Array to the GPU as f32. Returns a STORAGE | COPY_SRC buffer.
 *
 * @param device target GPUDevice
 * @param data   values to upload; each f64 is rounded to the nearest f32
 * @returns a storage buffer holding data.length f32 values
 */
export function uploadBuffer(device, data) {
    const buf = device.createBuffer({
        size: data.length * Float32Array.BYTES_PER_ELEMENT,
        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC,
        mappedAtCreation: true,
    });
    // Writing f64 values into a Float32Array performs the round-to-f32
    // conversion itself, so the previous explicit Math.fround pass and the
    // intermediate Float32Array copy are unnecessary: set() converts while
    // copying straight into the mapped range.
    new Float32Array(buf.getMappedRange()).set(data);
    buf.unmap();
    return buf;
}
88
/**
 * Create a GPU buffer for compute output (read-write storage, copyable for readback).
 *
 * @param device target GPUDevice
 * @param count  number of f32 elements the buffer must hold
 */
export function createOutputBuffer(device, count) {
    const byteLength = count * Float32Array.BYTES_PER_ELEMENT;
    const usage = GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_SRC;
    return device.createBuffer({ size: byteLength, usage });
}
97
/**
 * Read a GPU buffer back to a Float64Array (f32 -> f64).
 *
 * @param device    device owning the buffers
 * @param srcBuffer COPY_SRC-capable buffer holding f32 data
 * @param count     number of f32 elements to read
 */
export async function readbackBuffer(device, srcBuffer, count) {
    const byteLen = count * Float32Array.BYTES_PER_ELEMENT;
    // Storage buffers cannot be mapped directly; route through a MAP_READ staging buffer.
    const staging = device.createBuffer({
        size: byteLen,
        usage: GPUBufferUsage.MAP_READ | GPUBufferUsage.COPY_DST,
    });
    const encoder = device.createCommandEncoder();
    encoder.copyBufferToBuffer(srcBuffer, 0, staging, 0, byteLen);
    device.queue.submit([encoder.finish()]);
    await staging.mapAsync(GPUMapMode.READ);
    // Copy out (f32 widened to f64) before unmap invalidates the mapped view.
    const mapped = new Float32Array(staging.getMappedRange());
    const result = Float64Array.from(mapped);
    staging.unmap();
    staging.destroy();
    return result;
}
119
/**
 * Create a uniform buffer from an ArrayBuffer of packed metadata.
 * Only suitable for simple structs without array members (WGSL alignment rules).
 *
 * @param device target GPUDevice
 * @param data   packed struct bytes; the buffer is padded up to a 16-byte multiple
 * @returns a UNIFORM | COPY_DST buffer containing data (zero-padded tail)
 */
export function createUniformBuffer(device, data) {
    // Uniform bindings are sized in 16-byte units; also enforce a non-zero size.
    const aligned = Math.ceil(data.byteLength / 16) * 16;
    const buf = device.createBuffer({
        size: Math.max(aligned, 16),
        usage: GPUBufferUsage.UNIFORM | GPUBufferUsage.COPY_DST,
        mappedAtCreation: true,
    });
    // Bug fix: getMappedRange(offset, size) requires size to be a multiple of 4,
    // so the old getMappedRange(0, data.byteLength) threw for payloads whose
    // byteLength is not 4-aligned. Map the whole (16-aligned) buffer instead;
    // set() still writes only data.byteLength bytes and the padding stays zeroed.
    new Uint8Array(buf.getMappedRange()).set(new Uint8Array(data));
    buf.unmap();
    return buf;
}
134
/**
 * Create a read-only storage buffer for params that contain arrays.
 * Storage buffers don't have the 16-byte array element alignment requirement
 * that uniform buffers do in WGSL.
 *
 * @param device target GPUDevice
 * @param data   packed params bytes
 * @returns a STORAGE | COPY_DST buffer containing data (zero-padded to 4 bytes)
 */
export function createStorageParamsBuffer(device, data) {
    // Bug fix: createBuffer with mappedAtCreation: true requires size to be a
    // multiple of 4, but Math.max(data.byteLength, 4) is not 4-aligned for e.g.
    // a 6-byte payload. Round up, keeping a 4-byte minimum for empty payloads.
    const size = Math.max(Math.ceil(data.byteLength / 4) * 4, 4);
    const buf = device.createBuffer({
        size,
        usage: GPUBufferUsage.STORAGE | GPUBufferUsage.COPY_DST,
        mappedAtCreation: true,
    });
    // Bug fix: getMappedRange sizes must also be 4-byte multiples, so map the
    // whole buffer rather than getMappedRange(0, data.byteLength).
    new Uint8Array(buf.getMappedRange()).set(new Uint8Array(data));
    buf.unmap();
    return buf;
}
149
// Pipeline cache
/**
 * Compile (or fetch from the module-level cache) a compute pipeline for the
 * given WGSL source.
 *
 * @param device     device to compile on
 * @param shaderCode WGSL source text
 * @param entryPoint compute entry point name (default 'main')
 * @returns the cached or freshly created GPUComputePipeline
 */
export function getOrCreatePipeline(device, shaderCode, entryPoint = 'main') {
    // Bug fix: the cache key previously ignored entryPoint, so two entry points
    // declared in the same WGSL module collided and returned the first-compiled
    // pipeline. Key on both; NUL is a safe separator since it cannot appear in
    // an entry point identifier.
    // NOTE(review): the key also ignores `device`. destroyDevice() clears the
    // cache, but a shader compiled on the tensor-core device could collide with
    // an identical one compiled on the default device — confirm callers never
    // share source across devices.
    const key = `${entryPoint}\u0000${shaderCode}`;
    let pipeline = pipelineCache.get(key);
    if (pipeline)
        return pipeline;
    const module = device.createShaderModule({ code: shaderCode });
    pipeline = device.createComputePipeline({
        layout: 'auto',
        compute: { module, entryPoint },
    });
    pipelineCache.set(key, pipeline);
    return pipeline;
}
//# sourceMappingURL=gpu_backend.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"gpu_backend.js","sourceRoot":"","sources":["../src/gpu_backend.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,MAAM,EAAE,OAAO,EAAE,MAAM,QAAQ,CAAC;AAEzC,MAAM,CAAC,MAAM,CAAC,UAAU,EAAE,OAAO,CAAC,CAAC;AAEnC,IAAI,OAAO,GAAqB,IAAI,CAAC;AACrC,IAAI,IAAI,GAAe,IAAI,CAAC;AAE5B,MAAM,aAAa,GAAG,IAAI,GAAG,EAA8B,CAAC;AAE5D,MAAM,CAAC,KAAK,UAAU,SAAS;IAC3B,IAAI,OAAO;QAAE,OAAO,OAAO,CAAC;IAC5B,IAAI,GAAG,MAAM,CAAC,EAAE,CAAmB,CAAC;IACpC,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,cAAc,EAAE,CAAC;IAC5C,IAAI,CAAC,OAAO;QAAE,MAAM,IAAI,KAAK,CAAC,yBAAyB,CAAC,CAAC;IACzD,OAAO,GAAG,MAAM,OAAO,CAAC,aAAa,EAAE,CAAC;IACxC,OAAO,OAAO,CAAC;AACnB,CAAC;AAED,MAAM,UAAU,aAAa;IACzB,OAAO,EAAE,OAAO,EAAE,CAAC;IACnB,OAAO,GAAG,IAAI,CAAC;IACf,IAAI,GAAG,IAAI,CAAC;IACZ,uBAAuB,EAAE,CAAC;IAC1B,aAAa,CAAC,KAAK,EAAE,CAAC;AAC1B,CAAC;AAED,8DAA8D;AAE9D,IAAI,SAAS,GAAqB,IAAI,CAAC;AACvC,IAAI,SAAS,GAAG,KAAK,CAAC;AAEtB;;;;;;;GAOG;AACH,MAAM,CAAC,KAAK,UAAU,mBAAmB;IACrC,IAAI,SAAS;QAAE,OAAO,SAAS,CAAC;IAChC,IAAI,SAAS;QAAE,OAAO,IAAI,CAAC;IAC3B,SAAS,GAAG,IAAI,CAAC;IAEjB,IAAI,CAAC;QACD,MAAM,GAAG,GAAG,MAAM,CAAC;YACf,wCAAwC;SAC3C,CAAmB,CAAC;QACrB,MAAM,OAAO,GAAG,MAAM,GAAG,CAAC,cAAc,EAAE,CAAC;QAC3C,IAAI,CAAC,OAAO;YAAE,OAAO,IAAI,CAAC;QAE1B,MAAM,WAAW,GACb,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,uCAAuC,CAAC;YACzD,CAAC,CAAC,uCAAuC;YACzC,CAAC,CAAC,OAAO,CAAC,QAAQ,CAAC,GAAG,CAAC,iBAAiB,CAAC;gBACrC,CAAC,CAAC,iBAAiB;gBACnB,CAAC,CAAC,IAAI,CAAC;QACnB,IAAI,CAAC,WAAW;YAAE,OAAO,IAAI,CAAC;QAE9B,SAAS,GAAG,MAAM,OAAO,CAAC,aAAa,CAAC;YACpC,gBAAgB,EAAE,CAAC,WAA6B,CAAC;SACpD,CAAC,CAAC;QACH,OAAO,SAAS,CAAC;IACrB,CAAC;IAAC,MAAM,CAAC;QACL,OAAO,IAAI,CAAC;IAChB,CAAC;AACL,CAAC;AAED,MAAM,UAAU,uBAAuB;IACnC,6EAA6E;IAC7E,sEAAsE;IACtE,8EAA8E;IAC9E,SAAS,GAAG,IAAI,CAAC;IACjB,SAAS,GAAG,KAAK,CAAC;AACtB,CAAC;AAED,kBAAkB;AAElB;;GAEG;AACH,MAAM,UAAU,YAAY,CAAC,MAAiB,EAAE,IAAkB;IAC9D,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,IAAI,CAAC,MAAM,CAAC,CAAC;IAC1C,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACnC,GAAG,CAAC,CAAC,CAAC,GAAG,IAAI,CAAC,MAAM,CAAC,IAAI,CAAC,CAAC,CAAE,CA
AC,CAAC;IACnC,CAAC;IACD,MAAM,GAAG,GAAG,MAAM,CAAC,YAAY,CAAC;QAC5B,IAAI,EAAE,GAAG,CAAC,UAAU;QACpB,KAAK,EAAE,cAAc,CAAC,OAAO,GAAG,cAAc,CAAC,QAAQ;QACvD,gBAAgB,EAAE,IAAI;KACzB,CAAC,CAAC;IACH,IAAI,YAAY,CAAC,GAAG,CAAC,cAAc,EAAE,CAAC,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IAChD,GAAG,CAAC,KAAK,EAAE,CAAC;IACZ,OAAO,GAAG,CAAC;AACf,CAAC;AAED;;GAEG;AACH,MAAM,UAAU,kBAAkB,CAAC,MAAiB,EAAE,KAAa;IAC/D,OAAO,MAAM,CAAC,YAAY,CAAC;QACvB,IAAI,EAAE,KAAK,GAAG,YAAY,CAAC,iBAAiB;QAC5C,KAAK,EAAE,cAAc,CAAC,OAAO,GAAG,cAAc,CAAC,QAAQ;KAC1D,CAAC,CAAC;AACP,CAAC;AAED;;GAEG;AACH,MAAM,CAAC,KAAK,UAAU,cAAc,CAChC,MAAiB,EACjB,SAAoB,EACpB,KAAa;IAEb,MAAM,OAAO,GAAG,KAAK,GAAG,YAAY,CAAC,iBAAiB,CAAC;IACvD,MAAM,OAAO,GAAG,MAAM,CAAC,YAAY,CAAC;QAChC,IAAI,EAAE,OAAO;QACb,KAAK,EAAE,cAAc,CAAC,QAAQ,GAAG,cAAc,CAAC,QAAQ;KAC3D,CAAC,CAAC;IACH,MAAM,OAAO,GAAG,MAAM,CAAC,oBAAoB,EAAE,CAAC;IAC9C,OAAO,CAAC,kBAAkB,CAAC,SAAS,EAAE,CAAC,EAAE,OAAO,EAAE,CAAC,EAAE,OAAO,CAAC,CAAC;IAC9D,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,CAAC,OAAO,CAAC,MAAM,EAAE,CAAC,CAAC,CAAC;IACxC,MAAM,OAAO,CAAC,QAAQ,CAAC,UAAU,CAAC,IAAI,CAAC,CAAC;IACxC,MAAM,GAAG,GAAG,IAAI,YAAY,CAAC,OAAO,CAAC,cAAc,EAAE,CAAC,CAAC;IACvD,MAAM,MAAM,GAAG,IAAI,YAAY,CAAC,KAAK,CAAC,CAAC;IACvC,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,KAAK,EAAE,CAAC,EAAE,EAAE,CAAC;QAC7B,MAAM,CAAC,CAAC,CAAC,GAAG,GAAG,CAAC,CAAC,CAAE,CAAC;IACxB,CAAC;IACD,OAAO,CAAC,KAAK,EAAE,CAAC;IAChB,OAAO,CAAC,OAAO,EAAE,CAAC;IAClB,OAAO,MAAM,CAAC;AAClB,CAAC;AAED;;;GAGG;AACH,MAAM,UAAU,mBAAmB,CAAC,MAAiB,EAAE,IAAiB;IACpE,MAAM,OAAO,GAAG,IAAI,CAAC,IAAI,CAAC,IAAI,CAAC,UAAU,GAAG,EAAE,CAAC,GAAG,EAAE,CAAC;IACrD,MAAM,GAAG,GAAG,MAAM,CAAC,YAAY,CAAC;QAC5B,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,OAAO,EAAE,EAAE,CAAC;QAC3B,KAAK,EAAE,cAAc,CAAC,OAAO,GAAG,cAAc,CAAC,QAAQ;QACvD,gBAAgB,EAAE,IAAI;KACzB,CAAC,CAAC;IACH,IAAI,UAAU,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;IACjF,GAAG,CAAC,KAAK,EAAE,CAAC;IACZ,OAAO,GAAG,CAAC;AACf,CAAC;AAED;;;;GAIG;AACH,MAAM,UAAU,yBAAyB,CAAC,MAAiB,EAAE,IAAiB;IAC1E,MAAM,GAAG,GAAG,MAAM,CAAC,YAAY,
CAAC;QAC5B,IAAI,EAAE,IAAI,CAAC,GAAG,CAAC,IAAI,CAAC,UAAU,EAAE,CAAC,CAAC;QAClC,KAAK,EAAE,cAAc,CAAC,OAAO,GAAG,cAAc,CAAC,QAAQ;QACvD,gBAAgB,EAAE,IAAI;KACzB,CAAC,CAAC;IACH,IAAI,UAAU,CAAC,GAAG,CAAC,cAAc,CAAC,CAAC,EAAE,IAAI,CAAC,UAAU,CAAC,CAAC,CAAC,GAAG,CAAC,IAAI,UAAU,CAAC,IAAI,CAAC,CAAC,CAAC;IACjF,GAAG,CAAC,KAAK,EAAE,CAAC;IACZ,OAAO,GAAG,CAAC;AACf,CAAC;AAED,iBAAiB;AAEjB,MAAM,UAAU,mBAAmB,CAC/B,MAAiB,EACjB,UAAkB,EAClB,aAAqB,MAAM;IAE3B,MAAM,GAAG,GAAG,UAAU,CAAC;IACvB,IAAI,QAAQ,GAAG,aAAa,CAAC,GAAG,CAAC,GAAG,CAAC,CAAC;IACtC,IAAI,QAAQ;QAAE,OAAO,QAAQ,CAAC;IAE9B,MAAM,MAAM,GAAG,MAAM,CAAC,kBAAkB,CAAC,EAAE,IAAI,EAAE,UAAU,EAAE,CAAC,CAAC;IAC/D,QAAQ,GAAG,MAAM,CAAC,qBAAqB,CAAC;QACpC,MAAM,EAAE,MAAM;QACd,OAAO,EAAE,EAAE,MAAM,EAAE,UAAU,EAAE;KAClC,CAAC,CAAC;IACH,aAAa,CAAC,GAAG,CAAC,GAAG,EAAE,QAAQ,CAAC,CAAC;IACjC,OAAO,QAAQ,CAAC;AACpB,CAAC"}
@@ -0,0 +1,74 @@
1
// Threads per workgroup for the 1-D map/zip/reduce dispatches below.
export declare const WORKGROUP_SIZE = 256;
// Tile edge length for the shared-memory matmul (16x16 workgroups).
export declare const BLOCK_SIZE = 16;
// NOTE(review): presumably op-name -> WGSL snippet tables consumed by the
// build*Shader functions below — confirm the exact value format in gpu_kernels.js.
export declare const UNARY_OPS: Record<string, string>;
export declare const BINARY_OPS: Record<string, string>;
// Map a JS scalar function to its registered WGSL op body.
// NOTE(review): lookup presumably keys off fn.name — confirm in gpu_kernels.js.
export declare function resolveUnaryOp(fn: Function): string;
export declare function resolveBinaryOp(fn: Function): string;
// NOTE(review): presumably the WGSL literal for each reduce op's identity
// element (e.g. 0 for add, 1 for mul) — confirm in gpu_kernels.js.
export declare const REDUCE_IDENTITY: Record<string, string>;
/**
 * Aligned map: shapes & strides match, simple 1:1 element mapping.
 */
export declare function buildAlignedMapShader(opBody: string): string;
/**
 * Broadcast map: output and input may differ in shape.
 * Uses storage buffer for params to avoid WGSL uniform array alignment rules.
 */
export declare function buildBroadcastMapShader(opBody: string): string;
/**
 * Aligned zip: all three tensors share shape & strides.
 */
export declare function buildAlignedZipShader(opBody: string): string;
/**
 * Broadcast zip: output, a, b may differ in shape.
 * Uses storage buffer for params to avoid WGSL uniform array alignment rules.
 */
export declare function buildBroadcastZipShader(opBody: string): string;
/**
 * Sum practice: block-level partial sums using shared memory.
 * Input: array of length size. Output: array of length ceil(size / WORKGROUP_SIZE).
 * Each workgroup sums WORKGROUP_SIZE contiguous elements into one output cell.
 */
export declare function buildSumPracticeShader(): string;
/**
 * General reduce along one dimension.
 * One workgroup per output element. Threads cooperatively reduce
 * the reduction dimension using shared memory.
 * Uses storage buffer for params to avoid WGSL uniform array alignment rules.
 */
export declare function buildReduceShader(opBody: string, identity: string): string;
/**
 * Tiled matrix multiplication using workgroup shared memory.
 * Dispatched as 3D: (ceil(N/BLOCK), ceil(M/BLOCK), batchSize).
 * Each 16x16 workgroup computes one output tile, loading tiles of A and B
 * into shared memory to satisfy:
 * - all data read from shared memory (not global) during accumulation
 * - each global cell of A and B read exactly once
 * - each thread writes to global memory exactly once
 * Supports arbitrary broadcast batch dimensions via stride-based indexing.
 */
export declare function buildMatMulShader(): string;
// Tile edge for subgroup-matrix (tensor core) matmul: 8x8 f32 tiles.
export declare const TC_TILE = 8;
/**
 * Experimental tensor-core-accelerated matmul shader using the Dawn/Chrome
 * `chromium_experimental_subgroup_matrix` extension.
 *
 * On Apple Silicon (Metal), this maps to `simdgroup_matrix` 8x8 f32 hardware
 * instructions. On Vulkan with VK_KHR_cooperative_matrix it maps to the
 * equivalent SPIR-V ops.
 *
 * @param workgroupX - must equal the device's maxSubgroupSize (e.g. 64 on
 *   Apple M-series via Dawn, 32 or 64 on Vulkan). The WebGPU spec requires
 *   the x-dimension of workgroup_size to be a multiple of maxSubgroupSize
 *   when the shader uses subgroup matrices.
 *
 * Constraints (checked by the caller on the TS side):
 * - All tensors must be contiguous (natural row-major strides)
 * - M, N, K must all be multiples of TC_TILE (8)
 * - Batch dimensions of A and B must match exactly (no broadcasting)
 *
 * Dispatch: (N/TC_TILE, M/TC_TILE, batchSize).
 * Each workgroup computes one 8x8 output tile by tiling over the K dimension
 * with subgroupMatrixLoad / MultiplyAccumulate / Store.
 */
export declare function buildTensorCoreMatMulShader(workgroupX: number): string;
//# sourceMappingURL=gpu_kernels.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"gpu_kernels.d.ts","sourceRoot":"","sources":["../src/gpu_kernels.ts"],"names":[],"mappings":"AAEA,eAAO,MAAM,cAAc,MAAM,CAAC;AAClC,eAAO,MAAM,UAAU,KAAK,CAAC;AAK7B,eAAO,MAAM,SAAS,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAQ5C,CAAC;AAEF,eAAO,MAAM,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAO7C,CAAC;AAoBF,wBAAgB,cAAc,CAAC,EAAE,EAAE,QAAQ,GAAG,MAAM,CAInD;AAED,wBAAgB,eAAe,CAAC,EAAE,EAAE,QAAQ,GAAG,MAAM,CAIpD;AAED,eAAO,MAAM,eAAe,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAIlD,CAAC;AA4CF;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAiB5D;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAqC9D;AAED;;GAEG;AACH,wBAAgB,qBAAqB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CAkB5D;AAED;;;GAGG;AACH,wBAAgB,uBAAuB,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,CA4C9D;AAED;;;;GAIG;AACH,wBAAgB,sBAAsB,IAAI,MAAM,CAqC/C;AAED;;;;;GAKG;AACH,wBAAgB,iBAAiB,CAAC,MAAM,EAAE,MAAM,EAAE,QAAQ,EAAE,MAAM,GAAG,MAAM,CAkE1E;AAED;;;;;;;;;GASG;AACH,wBAAgB,iBAAiB,IAAI,MAAM,CAsI1C;AAID,eAAO,MAAM,OAAO,IAAI,CAAC;AAEzB;;;;;;;;;;;;;;;;;;;;;GAqBG;AACH,wBAAgB,2BAA2B,CAAC,UAAU,EAAE,MAAM,GAAG,MAAM,CAwDtE"}