@genai-fi/nanogpt 0.6.2 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (142)
  1. package/dist/Generator.js +11 -11
  2. package/dist/NanoGPTModel.d.ts +2 -2
  3. package/dist/NanoGPTModel.js +104 -136
  4. package/dist/{RealDiv-BYViZwhN.js → RealDiv-C4hOvYOZ.js} +26 -25
  5. package/dist/{Reshape-t7Kcikjk.js → Reshape-BLijOA8h.js} +5 -5
  6. package/dist/TeachableLLM.d.ts +3 -0
  7. package/dist/TeachableLLM.js +50 -47
  8. package/dist/{TiedEmbedding-9WeDwvjO.js → TiedEmbedding-BLltddza.js} +4 -4
  9. package/dist/{axis_util-Bu4h7XWV.js → axis_util-DaAl5MER.js} +3 -3
  10. package/dist/backend.d.ts +1 -0
  11. package/dist/backend.js +7 -0
  12. package/dist/backend_util-DWiwsi2N.js +749 -0
  13. package/dist/{broadcast_to-DARN-DBD.js → broadcast_to-C4v-j9yA.js} +2 -2
  14. package/dist/{concat-5aPGqw3Z.js → concat-CsHeR4zV.js} +8 -8
  15. package/dist/{dataset-pgqp-YfL.js → dataset-JDyjG3QR.js} +3 -3
  16. package/dist/{dropout-Bciw46HT.js → dropout-hpDwECTe.js} +7 -7
  17. package/dist/{gather-DjyCjmOD.js → gather-D0_gPiBz.js} +4 -4
  18. package/dist/gelu-uyHP1x1f.js +26 -0
  19. package/dist/gpgpu_math-DJm3ZTAf.js +2371 -0
  20. package/dist/index-BPPzKVdR.js +12099 -0
  21. package/dist/{index-BAzbokzv.js → index-C0dhsYom.js} +405 -389
  22. package/dist/{kernel_funcs_utils-CUxJCg0g.js → kernel_funcs_utils-CwRTFqrc.js} +31 -30
  23. package/dist/layers/BaseLayer.js +2 -2
  24. package/dist/layers/CausalSelfAttention.js +6 -6
  25. package/dist/layers/MLP.js +5 -5
  26. package/dist/layers/RMSNorm.js +3 -3
  27. package/dist/layers/RoPECache.js +4 -4
  28. package/dist/layers/TiedEmbedding.js +5 -5
  29. package/dist/layers/TransformerBlock.js +1 -1
  30. package/dist/loader/loadTransformers.js +1 -1
  31. package/dist/loader/oldZipLoad.js +5 -5
  32. package/dist/{log_sum_exp-YEo2h3gb.js → log_sum_exp-D086OgZJ.js} +15 -15
  33. package/dist/main.d.ts +2 -0
  34. package/dist/main.js +9 -5
  35. package/dist/{mat_mul-7121rsJk.js → mat_mul-1nwdPkQ_.js} +4 -4
  36. package/dist/{max-DtlIuVeW.js → max-BQc2Aj-I.js} +4 -4
  37. package/dist/{mulmat_packed_gpu-D4nKF7Je.js → mulmat_packed_gpu-Gzf3I9UV.js} +1 -1
  38. package/dist/non_max_suppression_impl-CsEgBuMA.js +134 -0
  39. package/dist/{ones-BBlSRqn1.js → ones-D63HpSF_.js} +2 -2
  40. package/dist/ops/appendCache.js +3 -3
  41. package/dist/ops/attentionMask.js +1 -1
  42. package/dist/ops/cpu/appendCache.js +8 -8
  43. package/dist/ops/cpu/attentionMask.js +9 -9
  44. package/dist/ops/cpu/fusedSoftmax.js +17 -11
  45. package/dist/ops/cpu/gatherSub.js +7 -7
  46. package/dist/ops/cpu/gelu.js +13 -13
  47. package/dist/ops/cpu/matMulGelu.js +36 -24
  48. package/dist/ops/cpu/matMulMul.js +14 -8
  49. package/dist/ops/cpu/mulDropout.js +9 -3
  50. package/dist/ops/cpu/normRMS.js +5 -5
  51. package/dist/ops/cpu/qkv.js +3 -3
  52. package/dist/ops/cpu/rope.js +5 -5
  53. package/dist/ops/cpu/scatterSub.js +11 -11
  54. package/dist/ops/fusedSoftmax.js +1 -1
  55. package/dist/ops/gatherSub.js +1 -1
  56. package/dist/ops/gelu.js +2 -2
  57. package/dist/ops/grads/attentionMask.js +1 -1
  58. package/dist/ops/grads/fusedSoftmax.js +2 -2
  59. package/dist/ops/grads/gelu.js +3 -24
  60. package/dist/ops/grads/matMulGelu.js +5 -5
  61. package/dist/ops/grads/normRMS.js +6 -6
  62. package/dist/ops/grads/qkv.js +1 -1
  63. package/dist/ops/grads/rope.js +3 -3
  64. package/dist/ops/matMulGelu.js +1 -1
  65. package/dist/ops/matMulMul.js +1 -1
  66. package/dist/ops/mulDrop.js +1 -1
  67. package/dist/ops/normRMS.js +1 -1
  68. package/dist/ops/qkv.js +1 -1
  69. package/dist/ops/rope.js +4 -4
  70. package/dist/ops/scatterSub.js +1 -1
  71. package/dist/ops/webgl/appendCache.js +1 -1
  72. package/dist/ops/webgl/attentionMask.js +1 -1
  73. package/dist/ops/webgl/fusedSoftmax.js +4 -4
  74. package/dist/ops/webgl/gatherSub.js +1 -1
  75. package/dist/ops/webgl/gelu.js +2 -2
  76. package/dist/ops/webgl/log.js +5 -5
  77. package/dist/ops/webgl/matMulGelu.js +17 -17
  78. package/dist/ops/webgl/matMulMul.js +1 -1
  79. package/dist/ops/webgl/mulDropout.js +4 -4
  80. package/dist/ops/webgl/normRMS.js +2 -2
  81. package/dist/ops/webgl/qkv.js +1 -1
  82. package/dist/ops/webgl/rope.js +1 -1
  83. package/dist/ops/webgl/scatterSub.js +1 -1
  84. package/dist/ops/webgpu/appendCache.js +56 -0
  85. package/dist/ops/webgpu/attentionMask.d.ts +1 -0
  86. package/dist/ops/webgpu/attentionMask.js +64 -0
  87. package/dist/ops/webgpu/gatherSub.d.ts +1 -0
  88. package/dist/ops/webgpu/gatherSub.js +37 -0
  89. package/dist/ops/webgpu/gelu.d.ts +14 -0
  90. package/dist/ops/webgpu/gelu.js +86 -0
  91. package/dist/ops/webgpu/index.d.ts +0 -0
  92. package/dist/ops/webgpu/index.js +8 -0
  93. package/dist/ops/webgpu/normRMS.d.ts +1 -0
  94. package/dist/ops/webgpu/normRMS.js +115 -0
  95. package/dist/ops/webgpu/qkv.d.ts +1 -0
  96. package/dist/ops/webgpu/qkv.js +56 -0
  97. package/dist/ops/webgpu/rope.d.ts +1 -0
  98. package/dist/ops/webgpu/rope.js +68 -0
  99. package/dist/ops/webgpu/scatterSub.d.ts +1 -0
  100. package/dist/ops/webgpu/scatterSub.js +37 -0
  101. package/dist/{ops-C0sQEcPw.js → ops-CIQLNshk.js} +452 -503
  102. package/dist/{random_width-DWzaOgrn.js → random_width-DkYP8W8N.js} +143 -144
  103. package/dist/{range-DYsrnfiy.js → range-CYzpQY53.js} +1 -1
  104. package/dist/{reciprocal-CJQeasVa.js → reciprocal-_A9yv27J.js} +1 -1
  105. package/dist/{register_all_kernels-BfFCQAqs.js → register_all_kernels-guvSxp7M.js} +202 -200
  106. package/dist/{reshape-krWGKraP.js → reshape-BMUzc1UY.js} +3 -3
  107. package/dist/{scatter_nd_util-93ln7Hut.js → scatter_nd_util-IRBqKz_b.js} +3 -3
  108. package/dist/{selu_util-sntGesxr.js → selu_util-Dt_iuXaq.js} +6 -6
  109. package/dist/shared-BNa2q6jD.js +69 -0
  110. package/dist/{shared-Ca6iDobD.js → shared-CDu9S76h.js} +541 -606
  111. package/dist/{sin-D_h-qCSx.js → sin-Cocju-BY.js} +6 -6
  112. package/dist/{softmax-fsdtf6JC.js → softmax-GPNK3o-U.js} +3 -3
  113. package/dist/{split-eiktj-6L.js → split-CHzJjxDv.js} +4 -4
  114. package/dist/{stack-dfEEz2OY.js → stack-Dpgg_1W1.js} +2 -2
  115. package/dist/{sum-BE_Irnim.js → sum-B8wEpKsg.js} +5 -5
  116. package/dist/{tensor-Xyi595sG.js → tensor-RvZVNmg0.js} +1 -1
  117. package/dist/{tensor2d-CPEkynbH.js → tensor2d-B_kyod7_.js} +1 -1
  118. package/dist/training/AdamExt.js +1 -1
  119. package/dist/training/DatasetBuilder.js +2 -2
  120. package/dist/training/Evaluator.js +1 -1
  121. package/dist/training/FullTrainer.js +20 -20
  122. package/dist/training/Trainer.d.ts +5 -6
  123. package/dist/training/Trainer.js +59 -60
  124. package/dist/training/sparseCrossEntropy.js +19 -26
  125. package/dist/utilities/dummy.js +19 -19
  126. package/dist/utilities/generate.js +15 -16
  127. package/dist/utilities/multinomialCPU.d.ts +2 -0
  128. package/dist/utilities/multinomialCPU.js +13 -0
  129. package/dist/utilities/performance.d.ts +2 -0
  130. package/dist/utilities/performance.js +16 -0
  131. package/dist/utilities/profile.d.ts +1 -0
  132. package/dist/utilities/profile.js +9 -6
  133. package/dist/utilities/safetensors.js +2 -2
  134. package/dist/utilities/weights.js +2 -2
  135. package/dist/{variable-wSS22xj5.js → variable-DXEUOwew.js} +1 -1
  136. package/dist/webgpu_util-g13LvDIv.js +625 -0
  137. package/dist/{zeros-YJDE7oRb.js → zeros-DCPCdFGq.js} +8 -8
  138. package/package.json +2 -1
  139. package/dist/gpgpu_math-CNslybmD.js +0 -3115
  140. package/dist/norm-CzltS9Fz.js +0 -86
  141. package/dist/ops/node/sparseCrossEntropy.js +0 -11
  142. package/dist/ops/{node/sparseCrossEntropy.d.ts → webgpu/appendCache.d.ts} +0 -0
@@ -1,5 +1,5 @@
1
- import { r as a } from "../../index-BAzbokzv.js";
2
- import { u as s, C as x } from "../../kernel_funcs_utils-CUxJCg0g.js";
1
+ import { f as a } from "../../index-C0dhsYom.js";
2
+ import { u as s, C as x } from "../../kernel_funcs_utils-CwRTFqrc.js";
3
3
  const t = 0.7978845608028654, r = 0.044715, c = x + `
4
4
  float x3 = x * x * x;
5
5
  float inner = x + ${r} * x3;
@@ -1,6 +1,6 @@
1
- import { r, a9 as e } from "../../index-BAzbokzv.js";
2
- import { u as s, l as N } from "../../kernel_funcs_utils-CUxJCg0g.js";
3
- import { aG as l } from "../../shared-Ca6iDobD.js";
1
+ import { f as e, a8 as r } from "../../index-C0dhsYom.js";
2
+ import { u as s, l as N } from "../../kernel_funcs_utils-CwRTFqrc.js";
3
+ import { y as l } from "../../shared-BNa2q6jD.js";
4
4
  /**
5
5
  * @license
6
6
  * Copyright 2020 Google LLC. All Rights Reserved.
@@ -32,8 +32,8 @@ const a = N + `
32
32
  packedOpSnippet: t,
33
33
  cpuKernelImpl: l
34
34
  }), o = {
35
- kernelName: e,
35
+ kernelName: r,
36
36
  backendName: "webgl",
37
37
  kernelFunc: n
38
38
  };
39
- r(o);
39
+ e(o);
@@ -1,8 +1,8 @@
1
- import { r as _, t as R, e as C, g as A, h as N, i as H, u as O } from "../../index-BAzbokzv.js";
2
- import { r as f } from "../../Reshape-t7Kcikjk.js";
3
- import { M as U } from "../../mulmat_packed_gpu-D4nKF7Je.js";
4
- import { m as E } from "../../mat_mul-7121rsJk.js";
5
- const M = 0.7978845608028654, x = 0.044715, q = `
1
+ import { f as _, t as R, e as C, j as A, k as N, l as H, u as O } from "../../index-C0dhsYom.js";
2
+ import { r as f } from "../../Reshape-BLijOA8h.js";
3
+ import { M as U } from "../../mulmat_packed_gpu-Gzf3I9UV.js";
4
+ import { m as E } from "../../mat_mul-1nwdPkQ_.js";
5
+ const M = 0.7978845608028654, x = 0.044715, j = `
6
6
  vec4 x3 = x * x * x;
7
7
  vec4 inner = x + ${x} * x3;
8
8
  inner = ${M} * inner;
@@ -10,7 +10,7 @@ const M = 0.7978845608028654, x = 0.044715, q = `
10
10
  inner = 0.5 * (1.0 + inner);
11
11
  vec4 result = x * inner;
12
12
  return result;
13
- `, z = `
13
+ `, q = `
14
14
  vec4 a2 = a * a;
15
15
  vec4 a3 = a2 * a;
16
16
  vec4 u = ${M} * (a + ${x} * a3);
@@ -34,7 +34,7 @@ function w({
34
34
  i === p,
35
35
  () => `Error in matMul: inner shapes (${i}) and (${p}) of Tensors with shapes ${e.shape} and ${t.shape} and transposeA=${s} and transposeB=${n} must match.`
36
36
  );
37
- const v = s ? [d, i, h] : [d, h, i], S = n ? [m, l, p] : [m, p, l], g = f({ inputs: { x: e }, backend: a, attrs: { shape: v } }), D = f({ inputs: { x: t }, backend: a, attrs: { shape: S } }), G = [g, D], y = Math.max(d, m), L = c, B = O(e.dtype, t.dtype), F = new U(
37
+ const v = s ? [d, i, h] : [d, h, i], S = n ? [m, l, p] : [m, p, l], k = f({ inputs: { x: e }, backend: a, attrs: { shape: v } }), D = f({ inputs: { x: t }, backend: a, attrs: { shape: S } }), G = [k, D], y = Math.max(d, m), L = c, B = O(e.dtype, t.dtype), F = new U(
38
38
  v,
39
39
  S,
40
40
  [y, h, l],
@@ -44,15 +44,15 @@ function w({
44
44
  L,
45
45
  !!o,
46
46
  !1
47
- ), k = [g, D];
48
- o && k.push(o);
49
- const $ = a.runWebGLProgram(F, k, B), I = f({ inputs: { x: $ }, backend: a, attrs: { shape: b } });
47
+ ), g = [k, D];
48
+ o && g.push(o);
49
+ const $ = a.runWebGLProgram(F, g, B), I = f({ inputs: { x: $ }, backend: a, attrs: { shape: b } });
50
50
  G.push($);
51
51
  for (const P of G)
52
52
  a.disposeIntermediateTensorInfo(P);
53
53
  return I;
54
54
  }
55
- function W(e) {
55
+ function z(e) {
56
56
  const { inputs: t, backend: s } = e, { x: n, kernel: a } = t;
57
57
  if (n === void 0 || a === void 0)
58
58
  throw new Error("BatchMatMul requires two input tensors.");
@@ -62,15 +62,15 @@ function W(e) {
62
62
  transposeA: !1,
63
63
  transposeB: !1,
64
64
  backend: s,
65
- activationSnippet: q
65
+ activationSnippet: j
66
66
  });
67
67
  }
68
- const j = {
68
+ const W = {
69
69
  kernelName: "MatMulGelu",
70
70
  backendName: "webgl",
71
- kernelFunc: W
71
+ kernelFunc: z
72
72
  };
73
- _(j);
73
+ _(W);
74
74
  function J(e) {
75
75
  const { dy: t, x: s, kernel: n } = e.inputs, a = e.backend;
76
76
  return R(() => {
@@ -81,7 +81,7 @@ function J(e) {
81
81
  transposeA: !1,
82
82
  transposeB: !1,
83
83
  backend: a,
84
- activationSnippet: z,
84
+ activationSnippet: q,
85
85
  multiplier: t
86
86
  })
87
87
  ), o = E(c, n, !1, !0), r = E(s, c, !0, !1);
@@ -97,5 +97,5 @@ _(Q);
97
97
  export {
98
98
  te as MATMUL_SHARED_DIM_THRESHOLD,
99
99
  w as batchMatMulGeluImpl,
100
- W as batchMatMulKernel
100
+ z as batchMatMulKernel
101
101
  };
@@ -1,4 +1,4 @@
1
- import { r as u } from "../../index-BAzbokzv.js";
1
+ import { f as u } from "../../index-C0dhsYom.js";
2
2
  import { batchMatMulGeluImpl as c } from "./matMulGelu.js";
3
3
  const M = `
4
4
  return a * b;
@@ -1,4 +1,4 @@
1
- import { r as m } from "../../index-BAzbokzv.js";
1
+ import { f as m } from "../../index-C0dhsYom.js";
2
2
  class f {
3
3
  variableNames = ["a", "b"];
4
4
  outputShape;
@@ -7,8 +7,8 @@ class f {
7
7
  { name: "dropoutRate", type: "float" },
8
8
  { name: "seed", type: "float" }
9
9
  ];
10
- constructor(r, t, o) {
11
- this.outputShape = [r, t, o, o], this.userCode = `
10
+ constructor(t, r, o) {
11
+ this.outputShape = [t, r, o, o], this.userCode = `
12
12
  float random(ivec4 coords) {
13
13
  float x = float(coords.x * 4096 + coords.y * 256 + coords.z * 16 + coords.w);
14
14
  return fract(sin(seed + x) * 43758.5453123);
@@ -27,7 +27,7 @@ class f {
27
27
  }
28
28
  }
29
29
  function b(e) {
30
- const { inputs: r, attrs: t } = e, { a: o, b: s } = r, { dropoutRate: a, seed: c } = t, n = e.backend, d = o.shape[0], u = o.shape[2], p = o.shape[1], l = new f(d, p, u);
30
+ const { inputs: t, attrs: r } = e, { a: o, b: s } = t, { dropoutRate: a, seed: c } = r, n = e.backend, d = o.shape[0], u = o.shape[2], p = o.shape[1], l = new f(d, p, u);
31
31
  return n.runWebGLProgram(l, [o, s], "float32", [
32
32
  [a ?? 0],
33
33
  [c ?? Math.random() * 1e4]
@@ -1,5 +1,5 @@
1
- import { r as p, e as G } from "../../index-BAzbokzv.js";
2
- import { s as x } from "../../sum-BE_Irnim.js";
1
+ import { f as p, e as G } from "../../index-C0dhsYom.js";
2
+ import { s as x } from "../../sum-B8wEpKsg.js";
3
3
  class y {
4
4
  variableNames = ["x", "meanSquare", "gamma"];
5
5
  outputShape;
@@ -1,4 +1,4 @@
1
- import { r as i } from "../../index-BAzbokzv.js";
1
+ import { f as i } from "../../index-C0dhsYom.js";
2
2
  class l {
3
3
  variableNames = ["x", "kernel"];
4
4
  outputShape;
@@ -1,4 +1,4 @@
1
- import { r as u } from "../../index-BAzbokzv.js";
1
+ import { f as u } from "../../index-C0dhsYom.js";
2
2
  class l {
3
3
  variableNames = ["x", "sin", "cos"];
4
4
  outputShape;
@@ -1,4 +1,4 @@
1
- import { r as i } from "../../index-BAzbokzv.js";
1
+ import { f as i } from "../../index-C0dhsYom.js";
2
2
  class u {
3
3
  variableNames = ["labels", "softmaxProbs", "dy"];
4
4
  outputShape;
@@ -0,0 +1,56 @@
1
+ import { f as u, c as d, g as l } from "../../webgpu_util-g13LvDIv.js";
2
+ import { f as m } from "../../index-C0dhsYom.js";
3
+ class f {
4
+ variableNames = ["cache", "item"];
5
+ outputShape;
6
+ shaderKey = "AppendCache";
7
+ dispatchLayout;
8
+ dispatch;
9
+ workgroupSize = [64, 1, 1];
10
+ size = !0;
11
+ uniforms = "cacheT: i32";
12
+ constructor(t, a, s, o, c) {
13
+ const i = Math.min(s + 1, c);
14
+ this.outputShape = [t, a, i, o], this.dispatchLayout = u(this.outputShape), this.dispatch = d(this.dispatchLayout, this.outputShape, this.workgroupSize);
15
+ }
16
+ getUserCode() {
17
+ const t = this.outputShape[2];
18
+ return `
19
+ ${l("index")} {
20
+ if (index < uniforms.size) {
21
+ let coords = getCoordsFromIndex(index); // [b, h, t, d]
22
+ let b = coords[0];
23
+ let h = coords[1];
24
+ let t = coords[2];
25
+ let d = coords[3];
26
+
27
+ let itemT = 1;
28
+ let maxSize = ${t};
29
+ let totalT = uniforms.cacheT + itemT;
30
+ let start = select(0, 1, totalT >= maxSize);
31
+
32
+ let srcT = t + start;
33
+ var val = 0.0;
34
+ if (srcT < uniforms.cacheT) {
35
+ val = getCache(b, h, srcT, d);
36
+ } else if (srcT == uniforms.cacheT) {
37
+ val = getItem(b, h, 0, d);
38
+ } else {
39
+ val = 0.0;
40
+ }
41
+ setOutputAtIndex(index, val);
42
+ }
43
+ }
44
+ `;
45
+ }
46
+ }
47
+ function T(e) {
48
+ const { cache: t, item: a } = e.inputs, { maxSize: s, pastLen: o } = e.attrs, c = e.backend, i = t.shape[0], r = t.shape[2], n = t.shape[1], h = new f(i, n, r, a.shape[3], s), p = [{ type: "int32", data: [o] }];
49
+ return c.runWebGPUProgram(h, [t, a], "float32", p);
50
+ }
51
+ const g = {
52
+ kernelName: "AppendCache",
53
+ backendName: "webgpu",
54
+ kernelFunc: T
55
+ };
56
+ m(g);
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,64 @@
1
+ import { f } from "../../index-C0dhsYom.js";
2
+ import { f as m, c as k, g as l } from "../../webgpu_util-g13LvDIv.js";
3
+ class g {
4
+ variableNames = ["q", "k"];
5
+ outputShape;
6
+ shaderKey = "AttentionMask";
7
+ dispatchLayout;
8
+ dispatch;
9
+ uniforms = "divisor: f32, pastLen: i32, inf: f32";
10
+ workgroupSize = [64, 1, 1];
11
+ size = !0;
12
+ hs;
13
+ nh;
14
+ T1;
15
+ T2;
16
+ constructor(t, e, o, i, a) {
17
+ if (this.outputShape = [t, e, o, i], this.hs = a, this.nh = e, this.T1 = o, this.T2 = i, this.dispatchLayout = m(this.outputShape), this.dispatch = k(this.dispatchLayout, this.outputShape, this.workgroupSize), a % 4 !== 0)
18
+ throw new Error("Head size must be a multiple of 4 for AttentionMaskProgram");
19
+ }
20
+ getUserCode() {
21
+ return `
22
+ ${l("index")} {
23
+
24
+ let coords = getCoordsFromIndex(index);
25
+ let b = coords[0];
26
+ let h = coords[1];
27
+ let t1 = coords[2];
28
+ let t2 = coords[3];
29
+
30
+ if (index < uniforms.size) {
31
+ if (t2 > t1 + uniforms.pastLen) {
32
+ setOutputAtIndex(index, uniforms.inf);
33
+ return;
34
+ }
35
+
36
+ var sum: f32 = 0.0;
37
+ for (var i: i32 = 0; i < ${this.hs}; i = i + 4) {
38
+ let q0 = getIndexFromCoords4D(vec4<i32>(b, h, t1, i), uniforms.qShape);
39
+ let qv = vec4<f32>(q[q0], q[q0 + 1], q[q0 + 2], q[q0 + 3]);
40
+ let k0 = getIndexFromCoords4D(vec4<i32>(b, h, t2, i), uniforms.kShape);
41
+ let kv = vec4<f32>(k[k0], k[k0 + 1], k[k0 + 2], k[k0 + 3]);
42
+ sum = sum + dot(qv, kv);
43
+ }
44
+ let scaled = sum * uniforms.divisor;
45
+ setOutputAtIndex(index, scaled);
46
+ }
47
+ }
48
+ `;
49
+ }
50
+ }
51
+ function q(s) {
52
+ const { q: t, k: e } = s.inputs, { divisor: o, pastLen: i } = s.attrs, a = s.backend, n = t.shape[0], r = t.shape[2], u = e.shape[2], c = t.shape[1], d = t.shape[3], h = new g(n, c, r, u, d), p = [
53
+ { type: "float32", data: [o] },
54
+ { type: "int32", data: [i] },
55
+ { type: "float32", data: [Number.NEGATIVE_INFINITY] }
56
+ ];
57
+ return a.runWebGPUProgram(h, [t, e], "float32", p);
58
+ }
59
+ const v = {
60
+ kernelName: "AttentionMask",
61
+ backendName: "webgpu",
62
+ kernelFunc: q
63
+ };
64
+ f(v);
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,37 @@
1
+ import { f as u, c as n, g as c } from "../../webgpu_util-g13LvDIv.js";
2
+ import { f as p } from "../../index-C0dhsYom.js";
3
+ class d {
4
+ variableNames = ["labels", "logits", "values"];
5
+ outputShape;
6
+ shaderKey = "GatherSub";
7
+ dispatchLayout;
8
+ dispatch;
9
+ workgroupSize = [64, 1, 1];
10
+ size = !0;
11
+ constructor(e) {
12
+ this.outputShape = [e], this.dispatchLayout = u(this.outputShape), this.dispatch = n(this.dispatchLayout, this.outputShape, this.workgroupSize);
13
+ }
14
+ getUserCode() {
15
+ return `
16
+ ${c("index")} {
17
+ if (index < uniforms.size) {
18
+ let coords = getCoordsFromIndex(index);
19
+ let idx = i32(getLabelsByOutputIndex(index));
20
+ let val = getValuesByOutputIndex(index);
21
+ let logit = getLogits(coords, idx);
22
+ setOutputAtIndex(index, val - logit);
23
+ }
24
+ }
25
+ `;
26
+ }
27
+ }
28
+ function l(t) {
29
+ const { logits: e, labels: a, values: s } = t.inputs, i = t.backend, o = a.shape[0], r = new d(o);
30
+ return i.runWebGPUProgram(r, [a, e, s], "float32");
31
+ }
32
+ const h = {
33
+ kernelName: "EfficientGatherSub",
34
+ backendName: "webgpu",
35
+ kernelFunc: l
36
+ };
37
+ p(h);
@@ -0,0 +1,14 @@
1
+ import { WebGPUProgram } from '@tensorflow/tfjs-backend-webgpu';
2
+ export declare class GeluProgram implements WebGPUProgram {
3
+ outputShape: number[];
4
+ shaderKey: string;
5
+ dispatchLayout: {
6
+ x: number[];
7
+ };
8
+ dispatch: [number, number, number];
9
+ variableNames: string[];
10
+ workgroupSize: [number, number, number];
11
+ size: boolean;
12
+ constructor(outputShape: number[]);
13
+ getUserCode(): string;
14
+ }
@@ -0,0 +1,86 @@
1
+ import { f as i } from "../../index-C0dhsYom.js";
2
+ import { f as o, c as s, g as p } from "../../webgpu_util-g13LvDIv.js";
3
+ const u = 0.7978845608028654, a = 0.044715;
4
+ class c {
5
+ outputShape;
6
+ shaderKey;
7
+ dispatchLayout;
8
+ dispatch;
9
+ variableNames = ["A"];
10
+ workgroupSize;
11
+ size = !0;
12
+ constructor(e) {
13
+ this.workgroupSize = [128, 1, 1], this.outputShape = e, this.dispatchLayout = o(this.outputShape), this.dispatch = s(this.dispatchLayout, this.outputShape, this.workgroupSize), this.shaderKey = "unary_gelu";
14
+ }
15
+ getUserCode() {
16
+ return `
17
+ fn unaryOperation(x : f32) -> f32 {
18
+ let x3 = x * x * x;
19
+ var inner = fma(${a}, x3, x);
20
+ inner = ${u} * inner;
21
+ inner = tanh(inner);
22
+ inner = 0.5 * (1.0 + inner);
23
+ return x * inner;
24
+ }
25
+ ${p("index")} {
26
+ if (index < uniforms.size) {
27
+ let a = getAByOutputIndex(index);
28
+ setOutputAtIndex(index, unaryOperation(a));
29
+ }
30
+ }
31
+ `;
32
+ }
33
+ }
34
+ function h(t) {
35
+ const { x: e } = t.inputs, n = t.backend, r = new c(e.shape);
36
+ return n.runWebGPUProgram(r, [e], "float32");
37
+ }
38
+ const l = {
39
+ kernelName: "Gelu",
40
+ backendName: "webgpu",
41
+ kernelFunc: h
42
+ };
43
+ i(l);
44
+ class x {
45
+ // Inputs: dy, x
46
+ variableNames = ["dy", "x"];
47
+ outputShape;
48
+ shaderKey = "GeluGrad";
49
+ dispatchLayout;
50
+ dispatch;
51
+ workgroupSize = [128, 1, 1];
52
+ size = !0;
53
+ constructor(e) {
54
+ this.outputShape = e, this.dispatchLayout = o(this.outputShape), this.dispatch = s(this.dispatchLayout, this.outputShape, this.workgroupSize);
55
+ }
56
+ getUserCode() {
57
+ return `
58
+ ${p("index")} {
59
+ if (index < uniforms.size) {
60
+ let X = getXByOutputIndex(index);
61
+ let x2 = X * X;
62
+ let x3 = x2 * X;
63
+ let u = ${u} * (X + ${a} * x3);
64
+ let t = tanh(u);
65
+ let sech2 = 1.0 - t * t;
66
+ let du_dx = ${u} * (1.0 + 3.0 * ${a} * x2);
67
+ let dgelu = 0.5 * (1.0 + t) + 0.5 * X * sech2 * du_dx;
68
+ let DY = getDyByOutputIndex(index);
69
+ setOutputAtIndex(index, DY * dgelu);
70
+ }
71
+ }`;
72
+ }
73
+ }
74
+ function g(t) {
75
+ const { dy: e, x: n } = t.inputs, r = t.backend, d = new x(n.shape);
76
+ return r.runWebGPUProgram(d, [e, n], "float32");
77
+ }
78
+ const m = {
79
+ kernelName: "GeluGrad",
80
+ backendName: "webgpu",
81
+ kernelFunc: g
82
+ };
83
+ i(m);
84
+ export {
85
+ c as GeluProgram
86
+ };
File without changes
@@ -0,0 +1,8 @@
1
+ import "./attentionMask.js";
2
+ import "./normRMS.js";
3
+ import "./rope.js";
4
+ import "./appendCache.js";
5
+ import "./scatterSub.js";
6
+ import "./gatherSub.js";
7
+ import "./qkv.js";
8
+ import "./gelu.js";
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,115 @@
1
+ import { f as m, c as p, g as c } from "../../webgpu_util-g13LvDIv.js";
2
+ import { f as l, e as k } from "../../index-C0dhsYom.js";
3
+ import { s as M } from "../../sum-B8wEpKsg.js";
4
+ class N {
5
+ variableNames = ["x", "meanSquare", "gamma"];
6
+ outputShape;
7
+ shaderKey = "RMSNorm";
8
+ dispatchLayout;
9
+ dispatch;
10
+ workgroupSize = [64, 1, 1];
11
+ size = !0;
12
+ constructor(t, e, a) {
13
+ this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize);
14
+ }
15
+ getUserCode() {
16
+ return `
17
+ ${c("index")} {
18
+ if (index < uniforms.size) {
19
+ let coords = getCoordsFromIndex(index);
20
+ let x = getXByOutputIndex(index);
21
+ let meanSquare = getMeanSquare(coords[0], coords[1], 0);
22
+ let gamma = getGammaByOutputIndex(index);
23
+ let invRms = inverseSqrt(meanSquare + 1e-8);
24
+ let normalized = x * invRms;
25
+ let outVal = normalized * gamma;
26
+ setOutputAtIndex(index, outVal);
27
+ }
28
+ }
29
+ `;
30
+ }
31
+ }
32
+ function b(s) {
33
+ const { x: t, gamma: e } = s.inputs, a = s.backend, o = t.shape[0], n = t.shape[1], i = t.shape[2], u = t.square().mean(-1, !0), r = new N(o, n, i);
34
+ return a.runWebGPUProgram(r, [t, u, e], "float32");
35
+ }
36
+ const z = {
37
+ kernelName: "RMSNorm",
38
+ backendName: "webgpu",
39
+ kernelFunc: b
40
+ };
41
+ l(z);
42
+ class R {
43
+ variableNames = ["x", "meanSquare", "dyGamma", "dyXMean"];
44
+ outputShape;
45
+ shaderKey = "RMSNormGradX";
46
+ dispatchLayout;
47
+ dispatch;
48
+ workgroupSize = [64, 1, 1];
49
+ size = !0;
50
+ C;
51
+ constructor(t, e, a) {
52
+ this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize), this.C = a;
53
+ }
54
+ getUserCode() {
55
+ return `
56
+ ${c("index")} {
57
+ if (index < uniforms.size) {
58
+ let coords = getCoordsFromIndex(index);
59
+ let x = getXByOutputIndex(index);
60
+ let meanSquare = getMeanSquare(coords[0], coords[1], 0) + 1e-8;
61
+ let dyGamma = getDyGammaByOutputIndex(index);
62
+ let dyXMean = getDyXMean(coords[0], coords[1], 0) / ${this.C}.0;
63
+ let invRms = inverseSqrt(meanSquare);
64
+ let dx = dyGamma * invRms - x * dyXMean * invRms / meanSquare;
65
+ setOutputAtIndex(index, dx);
66
+ }
67
+ }
68
+ `;
69
+ }
70
+ }
71
+ class v {
72
+ variableNames = ["x", "meanSquare", "dy"];
73
+ outputShape;
74
+ shaderKey = "RMSNormGradGamma";
75
+ dispatchLayout;
76
+ dispatch;
77
+ workgroupSize = [64, 1, 1];
78
+ size = !0;
79
+ constructor(t, e, a) {
80
+ this.outputShape = [t, e, a], this.dispatchLayout = m(this.outputShape), this.dispatch = p(this.dispatchLayout, this.outputShape, this.workgroupSize);
81
+ }
82
+ getUserCode() {
83
+ return `
84
+ ${c("index")} {
85
+ if (index < uniforms.size) {
86
+ let coords = getCoordsFromIndex(index);
87
+ let x = getXByOutputIndex(index);
88
+ let meanSquare = getMeanSquare(coords[0], coords[1], 0) + 1e-8;
89
+ let dy = getDyByOutputIndex(index);
90
+ let invRms = inverseSqrt(meanSquare);
91
+ let dGamma = dy * (x * invRms);
92
+ setOutputAtIndex(index,dGamma);
93
+ }
94
+ }
95
+ `;
96
+ }
97
+ }
98
+ function I(s) {
99
+ const { dy: t, x: e, gamma: a } = s.inputs, o = s.backend, n = e.shape[0], i = e.shape[1], u = e.shape[2], r = t.mul(a), h = r.mul(e), g = h.sum(-1, !0);
100
+ h.dispose();
101
+ const S = e.square(), d = S.mean(-1, !0);
102
+ S.dispose();
103
+ const y = new R(n, i, u), G = o.runWebGPUProgram(y, [e, d, r, g], "float32");
104
+ r.dispose(), g.dispose();
105
+ const q = new v(n, i, u), x = o.runWebGPUProgram(q, [e, d, t], "float32");
106
+ d.dispose();
107
+ const f = M(k().makeTensorFromTensorInfo(x), [0, 1]);
108
+ return o.disposeData(x), [G, f];
109
+ }
110
+ const P = {
111
+ kernelName: "RMSNormGrad",
112
+ backendName: "webgpu",
113
+ kernelFunc: I
114
+ };
115
+ l(P);
@@ -0,0 +1 @@
1
+ export {};
@@ -0,0 +1,56 @@
1
+ import { f as c, c as d, g as h } from "../../webgpu_util-g13LvDIv.js";
2
+ import { f as p } from "../../index-C0dhsYom.js";
3
+ class l {
4
+ variableNames = ["x", "kernel"];
5
+ outputShape;
6
+ shaderKey = "QKV";
7
+ dispatchLayout;
8
+ dispatch;
9
+ uniforms = "mode: i32";
10
+ workgroupSize = [64, 1, 1];
11
+ size = !0;
12
+ constructor(t, e, o, s) {
13
+ const r = s / e;
14
+ this.outputShape = [t, e, o, r], this.dispatchLayout = c(this.outputShape), this.dispatch = d(this.dispatchLayout, this.outputShape, this.workgroupSize);
15
+ }
16
+ getUserCode() {
17
+ const t = this.outputShape[1], e = this.outputShape[3], o = t * e;
18
+ return `
19
+ ${h("index")} {
20
+ if (index < uniforms.size) {
21
+ let coords = getCoordsFromIndex(index); // [b, h, t, d]
22
+ let b = coords[0];
23
+ let h = coords[1];
24
+ let t = coords[2];
25
+ let d = coords[3];
26
+
27
+ // Compute output channel index in fused kernel
28
+ let out_offset = uniforms.mode * ${t} * ${e} + h * ${e} + d;
29
+
30
+ var sum = 0.0;
31
+ for (var c = 0; c < ${o}; c += 1) {
32
+ let xval = getX(b, t, c); // fetch from x
33
+ let kval = getKernel(c, out_offset); // fetch from kernel
34
+ sum += xval * kval;
35
+ }
36
+
37
+ setOutputAtIndex(index, sum);
38
+ }
39
+ }
40
+ `;
41
+ }
42
+ }
43
+ function m(a) {
44
+ const { x: t, kernel: e } = a.inputs, { heads: o } = a.attrs, s = a.backend, r = t.shape[0], i = t.shape[1], u = t.shape[2], n = new l(r, o, i, u);
45
+ return [
46
+ s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [0] }]),
47
+ s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [1] }]),
48
+ s.runWebGPUProgram(n, [t, e], "float32", [{ type: "int32", data: [2] }])
49
+ ];
50
+ }
51
+ const f = {
52
+ kernelName: "QKV",
53
+ backendName: "webgpu",
54
+ kernelFunc: m
55
+ };
56
+ p(f);
@@ -0,0 +1 @@
1
+ export {};