@genai-fi/nanogpt 0.6.3 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.js +11 -11
- package/dist/NanoGPTModel.d.ts +2 -2
- package/dist/NanoGPTModel.js +104 -136
- package/dist/{RealDiv-BYViZwhN.js → RealDiv-C4hOvYOZ.js} +26 -25
- package/dist/{Reshape-t7Kcikjk.js → Reshape-BLijOA8h.js} +5 -5
- package/dist/TeachableLLM.js +5 -5
- package/dist/{TiedEmbedding-9WeDwvjO.js → TiedEmbedding-BLltddza.js} +4 -4
- package/dist/{axis_util-Bu4h7XWV.js → axis_util-DaAl5MER.js} +3 -3
- package/dist/backend.d.ts +1 -0
- package/dist/backend.js +7 -0
- package/dist/backend_util-DWiwsi2N.js +749 -0
- package/dist/{broadcast_to-DARN-DBD.js → broadcast_to-C4v-j9yA.js} +2 -2
- package/dist/{concat-5aPGqw3Z.js → concat-CsHeR4zV.js} +8 -8
- package/dist/{dataset-pgqp-YfL.js → dataset-JDyjG3QR.js} +3 -3
- package/dist/{dropout-Bciw46HT.js → dropout-hpDwECTe.js} +7 -7
- package/dist/{gather-DjyCjmOD.js → gather-D0_gPiBz.js} +4 -4
- package/dist/gelu-uyHP1x1f.js +26 -0
- package/dist/gpgpu_math-DJm3ZTAf.js +2371 -0
- package/dist/index-BPPzKVdR.js +12099 -0
- package/dist/{index-BAzbokzv.js → index-C0dhsYom.js} +405 -389
- package/dist/{kernel_funcs_utils-CUxJCg0g.js → kernel_funcs_utils-CwRTFqrc.js} +31 -30
- package/dist/layers/BaseLayer.js +2 -2
- package/dist/layers/CausalSelfAttention.js +6 -6
- package/dist/layers/MLP.js +5 -5
- package/dist/layers/RMSNorm.js +3 -3
- package/dist/layers/RoPECache.js +4 -4
- package/dist/layers/TiedEmbedding.js +5 -5
- package/dist/layers/TransformerBlock.js +1 -1
- package/dist/loader/loadTransformers.js +1 -1
- package/dist/loader/oldZipLoad.js +5 -5
- package/dist/{log_sum_exp-YEo2h3gb.js → log_sum_exp-D086OgZJ.js} +15 -15
- package/dist/main.d.ts +2 -0
- package/dist/main.js +9 -5
- package/dist/{mat_mul-7121rsJk.js → mat_mul-1nwdPkQ_.js} +4 -4
- package/dist/{max-DtlIuVeW.js → max-BQc2Aj-I.js} +4 -4
- package/dist/{mulmat_packed_gpu-D4nKF7Je.js → mulmat_packed_gpu-Gzf3I9UV.js} +1 -1
- package/dist/non_max_suppression_impl-CsEgBuMA.js +134 -0
- package/dist/{ones-BBlSRqn1.js → ones-D63HpSF_.js} +2 -2
- package/dist/ops/appendCache.js +3 -3
- package/dist/ops/attentionMask.js +1 -1
- package/dist/ops/cpu/appendCache.js +8 -8
- package/dist/ops/cpu/attentionMask.js +9 -9
- package/dist/ops/cpu/fusedSoftmax.js +17 -11
- package/dist/ops/cpu/gatherSub.js +7 -7
- package/dist/ops/cpu/gelu.js +13 -13
- package/dist/ops/cpu/matMulGelu.js +36 -24
- package/dist/ops/cpu/matMulMul.js +14 -8
- package/dist/ops/cpu/mulDropout.js +9 -3
- package/dist/ops/cpu/normRMS.js +5 -5
- package/dist/ops/cpu/qkv.js +3 -3
- package/dist/ops/cpu/rope.js +5 -5
- package/dist/ops/cpu/scatterSub.js +11 -11
- package/dist/ops/fusedSoftmax.js +1 -1
- package/dist/ops/gatherSub.js +1 -1
- package/dist/ops/gelu.js +2 -2
- package/dist/ops/grads/attentionMask.js +1 -1
- package/dist/ops/grads/fusedSoftmax.js +2 -2
- package/dist/ops/grads/gelu.js +3 -24
- package/dist/ops/grads/matMulGelu.js +5 -5
- package/dist/ops/grads/normRMS.js +6 -6
- package/dist/ops/grads/qkv.js +1 -1
- package/dist/ops/grads/rope.js +3 -3
- package/dist/ops/matMulGelu.js +1 -1
- package/dist/ops/matMulMul.js +1 -1
- package/dist/ops/mulDrop.js +1 -1
- package/dist/ops/normRMS.js +1 -1
- package/dist/ops/qkv.js +1 -1
- package/dist/ops/rope.js +4 -4
- package/dist/ops/scatterSub.js +1 -1
- package/dist/ops/webgl/appendCache.js +1 -1
- package/dist/ops/webgl/attentionMask.js +1 -1
- package/dist/ops/webgl/fusedSoftmax.js +4 -4
- package/dist/ops/webgl/gatherSub.js +1 -1
- package/dist/ops/webgl/gelu.js +2 -2
- package/dist/ops/webgl/log.js +5 -5
- package/dist/ops/webgl/matMulGelu.js +17 -17
- package/dist/ops/webgl/matMulMul.js +1 -1
- package/dist/ops/webgl/mulDropout.js +4 -4
- package/dist/ops/webgl/normRMS.js +2 -2
- package/dist/ops/webgl/qkv.js +1 -1
- package/dist/ops/webgl/rope.js +1 -1
- package/dist/ops/webgl/scatterSub.js +1 -1
- package/dist/ops/webgpu/appendCache.d.ts +1 -0
- package/dist/ops/webgpu/appendCache.js +56 -0
- package/dist/ops/webgpu/attentionMask.d.ts +1 -0
- package/dist/ops/webgpu/attentionMask.js +64 -0
- package/dist/ops/webgpu/gatherSub.d.ts +1 -0
- package/dist/ops/webgpu/gatherSub.js +37 -0
- package/dist/ops/webgpu/gelu.d.ts +14 -0
- package/dist/ops/webgpu/gelu.js +86 -0
- package/dist/ops/webgpu/index.d.ts +0 -0
- package/dist/ops/webgpu/index.js +8 -0
- package/dist/ops/webgpu/normRMS.d.ts +1 -0
- package/dist/ops/webgpu/normRMS.js +115 -0
- package/dist/ops/webgpu/qkv.d.ts +1 -0
- package/dist/ops/webgpu/qkv.js +56 -0
- package/dist/ops/webgpu/rope.d.ts +1 -0
- package/dist/ops/webgpu/rope.js +68 -0
- package/dist/ops/webgpu/scatterSub.d.ts +1 -0
- package/dist/ops/webgpu/scatterSub.js +37 -0
- package/dist/{ops-C0sQEcPw.js → ops-CIQLNshk.js} +452 -503
- package/dist/{random_width-DWzaOgrn.js → random_width-DkYP8W8N.js} +143 -144
- package/dist/{range-DYsrnfiy.js → range-CYzpQY53.js} +1 -1
- package/dist/{reciprocal-CJQeasVa.js → reciprocal-_A9yv27J.js} +1 -1
- package/dist/{register_all_kernels-BfFCQAqs.js → register_all_kernels-guvSxp7M.js} +202 -200
- package/dist/{reshape-krWGKraP.js → reshape-BMUzc1UY.js} +3 -3
- package/dist/{scatter_nd_util-93ln7Hut.js → scatter_nd_util-IRBqKz_b.js} +3 -3
- package/dist/{selu_util-sntGesxr.js → selu_util-Dt_iuXaq.js} +6 -6
- package/dist/shared-BNa2q6jD.js +69 -0
- package/dist/{shared-Ca6iDobD.js → shared-CDu9S76h.js} +541 -606
- package/dist/{sin-D_h-qCSx.js → sin-Cocju-BY.js} +6 -6
- package/dist/{softmax-fsdtf6JC.js → softmax-GPNK3o-U.js} +3 -3
- package/dist/{split-eiktj-6L.js → split-CHzJjxDv.js} +4 -4
- package/dist/{stack-dfEEz2OY.js → stack-Dpgg_1W1.js} +2 -2
- package/dist/{sum-BE_Irnim.js → sum-B8wEpKsg.js} +5 -5
- package/dist/{tensor-Xyi595sG.js → tensor-RvZVNmg0.js} +1 -1
- package/dist/{tensor2d-CPEkynbH.js → tensor2d-B_kyod7_.js} +1 -1
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/DatasetBuilder.js +2 -2
- package/dist/training/Evaluator.js +1 -1
- package/dist/training/FullTrainer.js +20 -20
- package/dist/training/Trainer.d.ts +5 -6
- package/dist/training/Trainer.js +59 -60
- package/dist/training/sparseCrossEntropy.js +4 -4
- package/dist/utilities/dummy.js +19 -19
- package/dist/utilities/generate.js +15 -16
- package/dist/utilities/multinomialCPU.d.ts +2 -0
- package/dist/utilities/multinomialCPU.js +13 -0
- package/dist/utilities/performance.d.ts +2 -0
- package/dist/utilities/performance.js +16 -0
- package/dist/utilities/profile.d.ts +1 -0
- package/dist/utilities/profile.js +9 -6
- package/dist/utilities/safetensors.js +2 -2
- package/dist/utilities/weights.js +2 -2
- package/dist/{variable-wSS22xj5.js → variable-DXEUOwew.js} +1 -1
- package/dist/webgpu_util-g13LvDIv.js +625 -0
- package/dist/{zeros-YJDE7oRb.js → zeros-DCPCdFGq.js} +8 -8
- package/package.json +2 -1
- package/dist/gpgpu_math-CNslybmD.js +0 -3115
- package/dist/norm-CzltS9Fz.js +0 -86
package/dist/ops/webgl/gelu.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { u as s, C as x } from "../../kernel_funcs_utils-
|
|
1
|
+
import { f as a } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { u as s, C as x } from "../../kernel_funcs_utils-CwRTFqrc.js";
|
|
3
3
|
const t = 0.7978845608028654, r = 0.044715, c = x + `
|
|
4
4
|
float x3 = x * x * x;
|
|
5
5
|
float inner = x + ${r} * x3;
|
package/dist/ops/webgl/log.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { u as s, l as N } from "../../kernel_funcs_utils-
|
|
3
|
-
import {
|
|
1
|
+
import { f as e, a8 as r } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { u as s, l as N } from "../../kernel_funcs_utils-CwRTFqrc.js";
|
|
3
|
+
import { y as l } from "../../shared-BNa2q6jD.js";
|
|
4
4
|
/**
|
|
5
5
|
* @license
|
|
6
6
|
* Copyright 2020 Google LLC. All Rights Reserved.
|
|
@@ -32,8 +32,8 @@ const a = N + `
|
|
|
32
32
|
packedOpSnippet: t,
|
|
33
33
|
cpuKernelImpl: l
|
|
34
34
|
}), o = {
|
|
35
|
-
kernelName:
|
|
35
|
+
kernelName: r,
|
|
36
36
|
backendName: "webgl",
|
|
37
37
|
kernelFunc: n
|
|
38
38
|
};
|
|
39
|
-
|
|
39
|
+
e(o);
|
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { r as f } from "../../Reshape-
|
|
3
|
-
import { M as U } from "../../mulmat_packed_gpu-
|
|
4
|
-
import { m as E } from "../../mat_mul-
|
|
5
|
-
const M = 0.7978845608028654, x = 0.044715,
|
|
1
|
+
import { f as _, t as R, e as C, j as A, k as N, l as H, u as O } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { r as f } from "../../Reshape-BLijOA8h.js";
|
|
3
|
+
import { M as U } from "../../mulmat_packed_gpu-Gzf3I9UV.js";
|
|
4
|
+
import { m as E } from "../../mat_mul-1nwdPkQ_.js";
|
|
5
|
+
const M = 0.7978845608028654, x = 0.044715, j = `
|
|
6
6
|
vec4 x3 = x * x * x;
|
|
7
7
|
vec4 inner = x + ${x} * x3;
|
|
8
8
|
inner = ${M} * inner;
|
|
@@ -10,7 +10,7 @@ const M = 0.7978845608028654, x = 0.044715, q = `
|
|
|
10
10
|
inner = 0.5 * (1.0 + inner);
|
|
11
11
|
vec4 result = x * inner;
|
|
12
12
|
return result;
|
|
13
|
-
`,
|
|
13
|
+
`, q = `
|
|
14
14
|
vec4 a2 = a * a;
|
|
15
15
|
vec4 a3 = a2 * a;
|
|
16
16
|
vec4 u = ${M} * (a + ${x} * a3);
|
|
@@ -34,7 +34,7 @@ function w({
|
|
|
34
34
|
i === p,
|
|
35
35
|
() => `Error in matMul: inner shapes (${i}) and (${p}) of Tensors with shapes ${e.shape} and ${t.shape} and transposeA=${s} and transposeB=${n} must match.`
|
|
36
36
|
);
|
|
37
|
-
const v = s ? [d, i, h] : [d, h, i], S = n ? [m, l, p] : [m, p, l],
|
|
37
|
+
const v = s ? [d, i, h] : [d, h, i], S = n ? [m, l, p] : [m, p, l], k = f({ inputs: { x: e }, backend: a, attrs: { shape: v } }), D = f({ inputs: { x: t }, backend: a, attrs: { shape: S } }), G = [k, D], y = Math.max(d, m), L = c, B = O(e.dtype, t.dtype), F = new U(
|
|
38
38
|
v,
|
|
39
39
|
S,
|
|
40
40
|
[y, h, l],
|
|
@@ -44,15 +44,15 @@ function w({
|
|
|
44
44
|
L,
|
|
45
45
|
!!o,
|
|
46
46
|
!1
|
|
47
|
-
),
|
|
48
|
-
o &&
|
|
49
|
-
const $ = a.runWebGLProgram(F,
|
|
47
|
+
), g = [k, D];
|
|
48
|
+
o && g.push(o);
|
|
49
|
+
const $ = a.runWebGLProgram(F, g, B), I = f({ inputs: { x: $ }, backend: a, attrs: { shape: b } });
|
|
50
50
|
G.push($);
|
|
51
51
|
for (const P of G)
|
|
52
52
|
a.disposeIntermediateTensorInfo(P);
|
|
53
53
|
return I;
|
|
54
54
|
}
|
|
55
|
-
function
|
|
55
|
+
function z(e) {
|
|
56
56
|
const { inputs: t, backend: s } = e, { x: n, kernel: a } = t;
|
|
57
57
|
if (n === void 0 || a === void 0)
|
|
58
58
|
throw new Error("BatchMatMul requires two input tensors.");
|
|
@@ -62,15 +62,15 @@ function W(e) {
|
|
|
62
62
|
transposeA: !1,
|
|
63
63
|
transposeB: !1,
|
|
64
64
|
backend: s,
|
|
65
|
-
activationSnippet:
|
|
65
|
+
activationSnippet: j
|
|
66
66
|
});
|
|
67
67
|
}
|
|
68
|
-
const
|
|
68
|
+
const W = {
|
|
69
69
|
kernelName: "MatMulGelu",
|
|
70
70
|
backendName: "webgl",
|
|
71
|
-
kernelFunc:
|
|
71
|
+
kernelFunc: z
|
|
72
72
|
};
|
|
73
|
-
_(
|
|
73
|
+
_(W);
|
|
74
74
|
function J(e) {
|
|
75
75
|
const { dy: t, x: s, kernel: n } = e.inputs, a = e.backend;
|
|
76
76
|
return R(() => {
|
|
@@ -81,7 +81,7 @@ function J(e) {
|
|
|
81
81
|
transposeA: !1,
|
|
82
82
|
transposeB: !1,
|
|
83
83
|
backend: a,
|
|
84
|
-
activationSnippet:
|
|
84
|
+
activationSnippet: q,
|
|
85
85
|
multiplier: t
|
|
86
86
|
})
|
|
87
87
|
), o = E(c, n, !1, !0), r = E(s, c, !0, !1);
|
|
@@ -97,5 +97,5 @@ _(Q);
|
|
|
97
97
|
export {
|
|
98
98
|
te as MATMUL_SHARED_DIM_THRESHOLD,
|
|
99
99
|
w as batchMatMulGeluImpl,
|
|
100
|
-
|
|
100
|
+
z as batchMatMulKernel
|
|
101
101
|
};
|
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { f as m } from "../../index-C0dhsYom.js";
|
|
2
2
|
class f {
|
|
3
3
|
variableNames = ["a", "b"];
|
|
4
4
|
outputShape;
|
|
@@ -7,8 +7,8 @@ class f {
|
|
|
7
7
|
{ name: "dropoutRate", type: "float" },
|
|
8
8
|
{ name: "seed", type: "float" }
|
|
9
9
|
];
|
|
10
|
-
constructor(
|
|
11
|
-
this.outputShape = [
|
|
10
|
+
constructor(t, r, o) {
|
|
11
|
+
this.outputShape = [t, r, o, o], this.userCode = `
|
|
12
12
|
float random(ivec4 coords) {
|
|
13
13
|
float x = float(coords.x * 4096 + coords.y * 256 + coords.z * 16 + coords.w);
|
|
14
14
|
return fract(sin(seed + x) * 43758.5453123);
|
|
@@ -27,7 +27,7 @@ class f {
|
|
|
27
27
|
}
|
|
28
28
|
}
|
|
29
29
|
function b(e) {
|
|
30
|
-
const { inputs:
|
|
30
|
+
const { inputs: t, attrs: r } = e, { a: o, b: s } = t, { dropoutRate: a, seed: c } = r, n = e.backend, d = o.shape[0], u = o.shape[2], p = o.shape[1], l = new f(d, p, u);
|
|
31
31
|
return n.runWebGLProgram(l, [o, s], "float32", [
|
|
32
32
|
[a ?? 0],
|
|
33
33
|
[c ?? Math.random() * 1e4]
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import {
|
|
2
|
-
import { s as x } from "../../sum-
|
|
1
|
+
import { f as p, e as G } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { s as x } from "../../sum-B8wEpKsg.js";
|
|
3
3
|
class y {
|
|
4
4
|
variableNames = ["x", "meanSquare", "gamma"];
|
|
5
5
|
outputShape;
|
package/dist/ops/webgl/qkv.js
CHANGED
package/dist/ops/webgl/rope.js
CHANGED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { f as u, c as d, g as l } from "../../webgpu_util-g13LvDIv.js";
|
|
2
|
+
import { f as m } from "../../index-C0dhsYom.js";
|
|
3
|
+
class f {
|
|
4
|
+
variableNames = ["cache", "item"];
|
|
5
|
+
outputShape;
|
|
6
|
+
shaderKey = "AppendCache";
|
|
7
|
+
dispatchLayout;
|
|
8
|
+
dispatch;
|
|
9
|
+
workgroupSize = [64, 1, 1];
|
|
10
|
+
size = !0;
|
|
11
|
+
uniforms = "cacheT: i32";
|
|
12
|
+
constructor(t, a, s, o, c) {
|
|
13
|
+
const i = Math.min(s + 1, c);
|
|
14
|
+
this.outputShape = [t, a, i, o], this.dispatchLayout = u(this.outputShape), this.dispatch = d(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
15
|
+
}
|
|
16
|
+
getUserCode() {
|
|
17
|
+
const t = this.outputShape[2];
|
|
18
|
+
return `
|
|
19
|
+
${l("index")} {
|
|
20
|
+
if (index < uniforms.size) {
|
|
21
|
+
let coords = getCoordsFromIndex(index); // [b, h, t, d]
|
|
22
|
+
let b = coords[0];
|
|
23
|
+
let h = coords[1];
|
|
24
|
+
let t = coords[2];
|
|
25
|
+
let d = coords[3];
|
|
26
|
+
|
|
27
|
+
let itemT = 1;
|
|
28
|
+
let maxSize = ${t};
|
|
29
|
+
let totalT = uniforms.cacheT + itemT;
|
|
30
|
+
let start = select(0, 1, totalT >= maxSize);
|
|
31
|
+
|
|
32
|
+
let srcT = t + start;
|
|
33
|
+
var val = 0.0;
|
|
34
|
+
if (srcT < uniforms.cacheT) {
|
|
35
|
+
val = getCache(b, h, srcT, d);
|
|
36
|
+
} else if (srcT == uniforms.cacheT) {
|
|
37
|
+
val = getItem(b, h, 0, d);
|
|
38
|
+
} else {
|
|
39
|
+
val = 0.0;
|
|
40
|
+
}
|
|
41
|
+
setOutputAtIndex(index, val);
|
|
42
|
+
}
|
|
43
|
+
}
|
|
44
|
+
`;
|
|
45
|
+
}
|
|
46
|
+
}
|
|
47
|
+
function T(e) {
|
|
48
|
+
const { cache: t, item: a } = e.inputs, { maxSize: s, pastLen: o } = e.attrs, c = e.backend, i = t.shape[0], r = t.shape[2], n = t.shape[1], h = new f(i, n, r, a.shape[3], s), p = [{ type: "int32", data: [o] }];
|
|
49
|
+
return c.runWebGPUProgram(h, [t, a], "float32", p);
|
|
50
|
+
}
|
|
51
|
+
const g = {
|
|
52
|
+
kernelName: "AppendCache",
|
|
53
|
+
backendName: "webgpu",
|
|
54
|
+
kernelFunc: T
|
|
55
|
+
};
|
|
56
|
+
m(g);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,64 @@
|
|
|
1
|
+
import { f } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { f as m, c as k, g as l } from "../../webgpu_util-g13LvDIv.js";
|
|
3
|
+
class g {
|
|
4
|
+
variableNames = ["q", "k"];
|
|
5
|
+
outputShape;
|
|
6
|
+
shaderKey = "AttentionMask";
|
|
7
|
+
dispatchLayout;
|
|
8
|
+
dispatch;
|
|
9
|
+
uniforms = "divisor: f32, pastLen: i32, inf: f32";
|
|
10
|
+
workgroupSize = [64, 1, 1];
|
|
11
|
+
size = !0;
|
|
12
|
+
hs;
|
|
13
|
+
nh;
|
|
14
|
+
T1;
|
|
15
|
+
T2;
|
|
16
|
+
constructor(t, e, o, i, a) {
|
|
17
|
+
if (this.outputShape = [t, e, o, i], this.hs = a, this.nh = e, this.T1 = o, this.T2 = i, this.dispatchLayout = m(this.outputShape), this.dispatch = k(this.dispatchLayout, this.outputShape, this.workgroupSize), a % 4 !== 0)
|
|
18
|
+
throw new Error("Head size must be a multiple of 4 for AttentionMaskProgram");
|
|
19
|
+
}
|
|
20
|
+
getUserCode() {
|
|
21
|
+
return `
|
|
22
|
+
${l("index")} {
|
|
23
|
+
|
|
24
|
+
let coords = getCoordsFromIndex(index);
|
|
25
|
+
let b = coords[0];
|
|
26
|
+
let h = coords[1];
|
|
27
|
+
let t1 = coords[2];
|
|
28
|
+
let t2 = coords[3];
|
|
29
|
+
|
|
30
|
+
if (index < uniforms.size) {
|
|
31
|
+
if (t2 > t1 + uniforms.pastLen) {
|
|
32
|
+
setOutputAtIndex(index, uniforms.inf);
|
|
33
|
+
return;
|
|
34
|
+
}
|
|
35
|
+
|
|
36
|
+
var sum: f32 = 0.0;
|
|
37
|
+
for (var i: i32 = 0; i < ${this.hs}; i = i + 4) {
|
|
38
|
+
let q0 = getIndexFromCoords4D(vec4<i32>(b, h, t1, i), uniforms.qShape);
|
|
39
|
+
let qv = vec4<f32>(q[q0], q[q0 + 1], q[q0 + 2], q[q0 + 3]);
|
|
40
|
+
let k0 = getIndexFromCoords4D(vec4<i32>(b, h, t2, i), uniforms.kShape);
|
|
41
|
+
let kv = vec4<f32>(k[k0], k[k0 + 1], k[k0 + 2], k[k0 + 3]);
|
|
42
|
+
sum = sum + dot(qv, kv);
|
|
43
|
+
}
|
|
44
|
+
let scaled = sum * uniforms.divisor;
|
|
45
|
+
setOutputAtIndex(index, scaled);
|
|
46
|
+
}
|
|
47
|
+
}
|
|
48
|
+
`;
|
|
49
|
+
}
|
|
50
|
+
}
|
|
51
|
+
function q(s) {
|
|
52
|
+
const { q: t, k: e } = s.inputs, { divisor: o, pastLen: i } = s.attrs, a = s.backend, n = t.shape[0], r = t.shape[2], u = e.shape[2], c = t.shape[1], d = t.shape[3], h = new g(n, c, r, u, d), p = [
|
|
53
|
+
{ type: "float32", data: [o] },
|
|
54
|
+
{ type: "int32", data: [i] },
|
|
55
|
+
{ type: "float32", data: [Number.NEGATIVE_INFINITY] }
|
|
56
|
+
];
|
|
57
|
+
return a.runWebGPUProgram(h, [t, e], "float32", p);
|
|
58
|
+
}
|
|
59
|
+
const v = {
|
|
60
|
+
kernelName: "AttentionMask",
|
|
61
|
+
backendName: "webgpu",
|
|
62
|
+
kernelFunc: q
|
|
63
|
+
};
|
|
64
|
+
f(v);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
import { f as u, c as n, g as c } from "../../webgpu_util-g13LvDIv.js";
|
|
2
|
+
import { f as p } from "../../index-C0dhsYom.js";
|
|
3
|
+
class d {
|
|
4
|
+
variableNames = ["labels", "logits", "values"];
|
|
5
|
+
outputShape;
|
|
6
|
+
shaderKey = "GatherSub";
|
|
7
|
+
dispatchLayout;
|
|
8
|
+
dispatch;
|
|
9
|
+
workgroupSize = [64, 1, 1];
|
|
10
|
+
size = !0;
|
|
11
|
+
constructor(e) {
|
|
12
|
+
this.outputShape = [e], this.dispatchLayout = u(this.outputShape), this.dispatch = n(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
13
|
+
}
|
|
14
|
+
getUserCode() {
|
|
15
|
+
return `
|
|
16
|
+
${c("index")} {
|
|
17
|
+
if (index < uniforms.size) {
|
|
18
|
+
let coords = getCoordsFromIndex(index);
|
|
19
|
+
let idx = i32(getLabelsByOutputIndex(index));
|
|
20
|
+
let val = getValuesByOutputIndex(index);
|
|
21
|
+
let logit = getLogits(coords, idx);
|
|
22
|
+
setOutputAtIndex(index, val - logit);
|
|
23
|
+
}
|
|
24
|
+
}
|
|
25
|
+
`;
|
|
26
|
+
}
|
|
27
|
+
}
|
|
28
|
+
function l(t) {
|
|
29
|
+
const { logits: e, labels: a, values: s } = t.inputs, i = t.backend, o = a.shape[0], r = new d(o);
|
|
30
|
+
return i.runWebGPUProgram(r, [a, e, s], "float32");
|
|
31
|
+
}
|
|
32
|
+
const h = {
|
|
33
|
+
kernelName: "EfficientGatherSub",
|
|
34
|
+
backendName: "webgpu",
|
|
35
|
+
kernelFunc: l
|
|
36
|
+
};
|
|
37
|
+
p(h);
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { WebGPUProgram } from '@tensorflow/tfjs-backend-webgpu';
|
|
2
|
+
export declare class GeluProgram implements WebGPUProgram {
|
|
3
|
+
outputShape: number[];
|
|
4
|
+
shaderKey: string;
|
|
5
|
+
dispatchLayout: {
|
|
6
|
+
x: number[];
|
|
7
|
+
};
|
|
8
|
+
dispatch: [number, number, number];
|
|
9
|
+
variableNames: string[];
|
|
10
|
+
workgroupSize: [number, number, number];
|
|
11
|
+
size: boolean;
|
|
12
|
+
constructor(outputShape: number[]);
|
|
13
|
+
getUserCode(): string;
|
|
14
|
+
}
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
import { f as i } from "../../index-C0dhsYom.js";
|
|
2
|
+
import { f as o, c as s, g as p } from "../../webgpu_util-g13LvDIv.js";
|
|
3
|
+
const u = 0.7978845608028654, a = 0.044715;
|
|
4
|
+
class c {
|
|
5
|
+
outputShape;
|
|
6
|
+
shaderKey;
|
|
7
|
+
dispatchLayout;
|
|
8
|
+
dispatch;
|
|
9
|
+
variableNames = ["A"];
|
|
10
|
+
workgroupSize;
|
|
11
|
+
size = !0;
|
|
12
|
+
constructor(e) {
|
|
13
|
+
this.workgroupSize = [128, 1, 1], this.outputShape = e, this.dispatchLayout = o(this.outputShape), this.dispatch = s(this.dispatchLayout, this.outputShape, this.workgroupSize), this.shaderKey = "unary_gelu";
|
|
14
|
+
}
|
|
15
|
+
getUserCode() {
|
|
16
|
+
return `
|
|
17
|
+
fn unaryOperation(x : f32) -> f32 {
|
|
18
|
+
let x3 = x * x * x;
|
|
19
|
+
var inner = fma(${a}, x3, x);
|
|
20
|
+
inner = ${u} * inner;
|
|
21
|
+
inner = tanh(inner);
|
|
22
|
+
inner = 0.5 * (1.0 + inner);
|
|
23
|
+
return x * inner;
|
|
24
|
+
}
|
|
25
|
+
${p("index")} {
|
|
26
|
+
if (index < uniforms.size) {
|
|
27
|
+
let a = getAByOutputIndex(index);
|
|
28
|
+
setOutputAtIndex(index, unaryOperation(a));
|
|
29
|
+
}
|
|
30
|
+
}
|
|
31
|
+
`;
|
|
32
|
+
}
|
|
33
|
+
}
|
|
34
|
+
function h(t) {
|
|
35
|
+
const { x: e } = t.inputs, n = t.backend, r = new c(e.shape);
|
|
36
|
+
return n.runWebGPUProgram(r, [e], "float32");
|
|
37
|
+
}
|
|
38
|
+
const l = {
|
|
39
|
+
kernelName: "Gelu",
|
|
40
|
+
backendName: "webgpu",
|
|
41
|
+
kernelFunc: h
|
|
42
|
+
};
|
|
43
|
+
i(l);
|
|
44
|
+
class x {
|
|
45
|
+
// Inputs: dy, x
|
|
46
|
+
variableNames = ["dy", "x"];
|
|
47
|
+
outputShape;
|
|
48
|
+
shaderKey = "GeluGrad";
|
|
49
|
+
dispatchLayout;
|
|
50
|
+
dispatch;
|
|
51
|
+
workgroupSize = [128, 1, 1];
|
|
52
|
+
size = !0;
|
|
53
|
+
constructor(e) {
|
|
54
|
+
this.outputShape = e, this.dispatchLayout = o(this.outputShape), this.dispatch = s(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
55
|
+
}
|
|
56
|
+
getUserCode() {
|
|
57
|
+
return `
|
|
58
|
+
${p("index")} {
|
|
59
|
+
if (index < uniforms.size) {
|
|
60
|
+
let X = getXByOutputIndex(index);
|
|
61
|
+
let x2 = X * X;
|
|
62
|
+
let x3 = x2 * X;
|
|
63
|
+
let u = ${u} * (X + ${a} * x3);
|
|
64
|
+
let t = tanh(u);
|
|
65
|
+
let sech2 = 1.0 - t * t;
|
|
66
|
+
let du_dx = ${u} * (1.0 + 3.0 * ${a} * x2);
|
|
67
|
+
let dgelu = 0.5 * (1.0 + t) + 0.5 * X * sech2 * du_dx;
|
|
68
|
+
let DY = getDyByOutputIndex(index);
|
|
69
|
+
setOutputAtIndex(index, DY * dgelu);
|
|
70
|
+
}
|
|
71
|
+
}`;
|
|
72
|
+
}
|
|
73
|
+
}
|
|
74
|
+
function g(t) {
|
|
75
|
+
const { dy: e, x: n } = t.inputs, r = t.backend, d = new x(n.shape);
|
|
76
|
+
return r.runWebGPUProgram(d, [e, n], "float32");
|
|
77
|
+
}
|
|
78
|
+
const m = {
|
|
79
|
+
kernelName: "GeluGrad",
|
|
80
|
+
backendName: "webgpu",
|
|
81
|
+
kernelFunc: g
|
|
82
|
+
};
|
|
83
|
+
i(m);
|
|
84
|
+
export {
|
|
85
|
+
c as GeluProgram
|
|
86
|
+
};
|
|
File without changes
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,115 @@
|
|
|
1
|
+
import { f as m, c as p, g as c } from "../../webgpu_util-g13LvDIv.js";
|
|
2
|
+
import { f as l, e as k } from "../../index-C0dhsYom.js";
|
|
3
|
+
import { s as M } from "../../sum-B8wEpKsg.js";
|
|
4
|
+
class N {
|
|
5
|
+
variableNames = ["x", "meanSquare", "gamma"];
|
|
6
|
+
outputShape;
|
|
7
|
+
shaderKey = "RMSNorm";
|
|
8
|
+
dispatchLayout;
|
|
9
|
+
dispatch;
|
|
10
|
+
workgroupSize = [64, 1, 1];
|
|
11
|
+
size = !0;
|
|
12
|
+
constructor(t, e, a) {
|
|
13
|
+
this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
14
|
+
}
|
|
15
|
+
getUserCode() {
|
|
16
|
+
return `
|
|
17
|
+
${c("index")} {
|
|
18
|
+
if (index < uniforms.size) {
|
|
19
|
+
let coords = getCoordsFromIndex(index);
|
|
20
|
+
let x = getXByOutputIndex(index);
|
|
21
|
+
let meanSquare = getMeanSquare(coords[0], coords[1], 0);
|
|
22
|
+
let gamma = getGammaByOutputIndex(index);
|
|
23
|
+
let invRms = inverseSqrt(meanSquare + 1e-8);
|
|
24
|
+
let normalized = x * invRms;
|
|
25
|
+
let outVal = normalized * gamma;
|
|
26
|
+
setOutputAtIndex(index, outVal);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
29
|
+
`;
|
|
30
|
+
}
|
|
31
|
+
}
|
|
32
|
+
function b(s) {
|
|
33
|
+
const { x: t, gamma: e } = s.inputs, a = s.backend, o = t.shape[0], n = t.shape[1], i = t.shape[2], u = t.square().mean(-1, !0), r = new N(o, n, i);
|
|
34
|
+
return a.runWebGPUProgram(r, [t, u, e], "float32");
|
|
35
|
+
}
|
|
36
|
+
const z = {
|
|
37
|
+
kernelName: "RMSNorm",
|
|
38
|
+
backendName: "webgpu",
|
|
39
|
+
kernelFunc: b
|
|
40
|
+
};
|
|
41
|
+
l(z);
|
|
42
|
+
class R {
|
|
43
|
+
variableNames = ["x", "meanSquare", "dyGamma", "dyXMean"];
|
|
44
|
+
outputShape;
|
|
45
|
+
shaderKey = "RMSNormGradX";
|
|
46
|
+
dispatchLayout;
|
|
47
|
+
dispatch;
|
|
48
|
+
workgroupSize = [64, 1, 1];
|
|
49
|
+
size = !0;
|
|
50
|
+
C;
|
|
51
|
+
constructor(t, e, a) {
|
|
52
|
+
this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize), this.C = a;
|
|
53
|
+
}
|
|
54
|
+
getUserCode() {
|
|
55
|
+
return `
|
|
56
|
+
${c("index")} {
|
|
57
|
+
if (index < uniforms.size) {
|
|
58
|
+
let coords = getCoordsFromIndex(index);
|
|
59
|
+
let x = getXByOutputIndex(index);
|
|
60
|
+
let meanSquare = getMeanSquare(coords[0], coords[1], 0) + 1e-8;
|
|
61
|
+
let dyGamma = getDyGammaByOutputIndex(index);
|
|
62
|
+
let dyXMean = getDyXMean(coords[0], coords[1], 0) / ${this.C}.0;
|
|
63
|
+
let invRms = inverseSqrt(meanSquare);
|
|
64
|
+
let dx = dyGamma * invRms - x * dyXMean * invRms / meanSquare;
|
|
65
|
+
setOutputAtIndex(index, dx);
|
|
66
|
+
}
|
|
67
|
+
}
|
|
68
|
+
`;
|
|
69
|
+
}
|
|
70
|
+
}
|
|
71
|
+
class v {
|
|
72
|
+
variableNames = ["x", "meanSquare", "dy"];
|
|
73
|
+
outputShape;
|
|
74
|
+
shaderKey = "RMSNormGradGamma";
|
|
75
|
+
dispatchLayout;
|
|
76
|
+
dispatch;
|
|
77
|
+
workgroupSize = [64, 1, 1];
|
|
78
|
+
size = !0;
|
|
79
|
+
constructor(t, e, a) {
|
|
80
|
+
this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
81
|
+
}
|
|
82
|
+
getUserCode() {
|
|
83
|
+
return `
|
|
84
|
+
${c("index")} {
|
|
85
|
+
if (index < uniforms.size) {
|
|
86
|
+
let coords = getCoordsFromIndex(index);
|
|
87
|
+
let x = getXByOutputIndex(index);
|
|
88
|
+
let meanSquare = getMeanSquare(coords[0], coords[1], 0) + 1e-8;
|
|
89
|
+
let dy = getDyByOutputIndex(index);
|
|
90
|
+
let invRms = inverseSqrt(meanSquare);
|
|
91
|
+
let dGamma = dy * (x * invRms);
|
|
92
|
+
setOutputAtIndex(index,dGamma);
|
|
93
|
+
}
|
|
94
|
+
}
|
|
95
|
+
`;
|
|
96
|
+
}
|
|
97
|
+
}
|
|
98
|
+
function I(s) {
|
|
99
|
+
const { dy: t, x: e, gamma: a } = s.inputs, o = s.backend, n = e.shape[0], i = e.shape[1], u = e.shape[2], r = t.mul(a), h = r.mul(e), g = h.sum(-1, !0);
|
|
100
|
+
h.dispose();
|
|
101
|
+
const S = e.square(), d = S.mean(-1, !0);
|
|
102
|
+
S.dispose();
|
|
103
|
+
const y = new R(n, i, u), G = o.runWebGPUProgram(y, [e, d, r, g], "float32");
|
|
104
|
+
r.dispose(), g.dispose();
|
|
105
|
+
const q = new v(n, i, u), x = o.runWebGPUProgram(q, [e, d, t], "float32");
|
|
106
|
+
d.dispose();
|
|
107
|
+
const f = M(k().makeTensorFromTensorInfo(x), [0, 1]);
|
|
108
|
+
return o.disposeData(x), [G, f];
|
|
109
|
+
}
|
|
110
|
+
const P = {
|
|
111
|
+
kernelName: "RMSNormGrad",
|
|
112
|
+
backendName: "webgpu",
|
|
113
|
+
kernelFunc: I
|
|
114
|
+
};
|
|
115
|
+
l(P);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|
|
@@ -0,0 +1,56 @@
|
|
|
1
|
+
import { f as c, c as d, g as h } from "../../webgpu_util-g13LvDIv.js";
|
|
2
|
+
import { f as p } from "../../index-C0dhsYom.js";
|
|
3
|
+
class l {
|
|
4
|
+
variableNames = ["x", "kernel"];
|
|
5
|
+
outputShape;
|
|
6
|
+
shaderKey = "QKV";
|
|
7
|
+
dispatchLayout;
|
|
8
|
+
dispatch;
|
|
9
|
+
uniforms = "mode: i32";
|
|
10
|
+
workgroupSize = [64, 1, 1];
|
|
11
|
+
size = !0;
|
|
12
|
+
constructor(t, e, o, s) {
|
|
13
|
+
const r = s / e;
|
|
14
|
+
this.outputShape = [t, e, o, r], this.dispatchLayout = c(this.outputShape), this.dispatch = d(this.dispatchLayout, this.outputShape, this.workgroupSize);
|
|
15
|
+
}
|
|
16
|
+
getUserCode() {
|
|
17
|
+
const t = this.outputShape[1], e = this.outputShape[3], o = t * e;
|
|
18
|
+
return `
|
|
19
|
+
${h("index")} {
|
|
20
|
+
if (index < uniforms.size) {
|
|
21
|
+
let coords = getCoordsFromIndex(index); // [b, h, t, d]
|
|
22
|
+
let b = coords[0];
|
|
23
|
+
let h = coords[1];
|
|
24
|
+
let t = coords[2];
|
|
25
|
+
let d = coords[3];
|
|
26
|
+
|
|
27
|
+
// Compute output channel index in fused kernel
|
|
28
|
+
let out_offset = uniforms.mode * ${t} * ${e} + h * ${e} + d;
|
|
29
|
+
|
|
30
|
+
var sum = 0.0;
|
|
31
|
+
for (var c = 0; c < ${o}; c += 1) {
|
|
32
|
+
let xval = getX(b, t, c); // fetch from x
|
|
33
|
+
let kval = getKernel(c, out_offset); // fetch from kernel
|
|
34
|
+
sum += xval * kval;
|
|
35
|
+
}
|
|
36
|
+
|
|
37
|
+
setOutputAtIndex(index, sum);
|
|
38
|
+
}
|
|
39
|
+
}
|
|
40
|
+
`;
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
function m(a) {
|
|
44
|
+
const { x: t, kernel: e } = a.inputs, { heads: o } = a.attrs, s = a.backend, r = t.shape[0], i = t.shape[1], u = t.shape[2], n = new l(r, o, i, u);
|
|
45
|
+
return [
|
|
46
|
+
s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [0] }]),
|
|
47
|
+
s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [1] }]),
|
|
48
|
+
s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [2] }])
|
|
49
|
+
];
|
|
50
|
+
}
|
|
51
|
+
const f = {
|
|
52
|
+
kernelName: "QKV",
|
|
53
|
+
backendName: "webgpu",
|
|
54
|
+
kernelFunc: m
|
|
55
|
+
};
|
|
56
|
+
p(f);
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
export {};
|