npm - @genai-fi/nanogpt - Versions diffs - 0.8.4 → 0.8.5 - Mend

@genai-fi/nanogpt 0.8.4 → 0.8.5

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Files changed (4) hide show

package/dist/ops/webgl/gelu.js +7 -7
package/dist/ops/webgl/matMulGelu.js +47 -37
package/dist/ops/webgpu/gelu.js +20 -14
package/package.json +1 -1

package/dist/ops/webgl/gelu.js CHANGED Viewed

@@ -1,17 +1,17 @@
 import { f as a } from "../../index-DdmHGZjq.js";
-import { u as s, C as c } from "../../kernel_funcs_utils-CDfFpUab.js";
-const t = 0.7978845608028654, r = 0.044715, d = c + `
+import { u as s, C as i } from "../../kernel_funcs_utils-CDfFpUab.js";
+const t = 0.7978845608028654, r = 0.044715, c = i + `
     float x3 = x * x * x;
     float inner = x + ${r} * x3;
     inner = ${t} * inner;
-    inner = tanh(inner);
+    inner = abs(inner) > 15.0 ? sign(inner) : tanh(inner);
     inner = 0.5 * (1.0 + inner);
     inner = x * inner;
     return inner;
-`, i = s({ opSnippet: d }), x = {
+`, d = s({ opSnippet: c }), x = {
   kernelName: "Gelu",
   backendName: "webgl",
-  kernelFunc: i
+  kernelFunc: d
 };
 a(x);
 class f {
@@ -27,7 +27,7 @@ class f {
                 float x2 = x * x;
                 float x3 = x2 * x;
                 float u  = ${t} * (x + ${r} * x3);
-                float t  = tanh(u);
+                float t = abs(u) > 15.0 ? sign(u) : tanh(u);
                 float sech2 = 1.0 - t * t;
                 float du_dx = ${t} * (1.0 + 3.0 * ${r} * x2);
                 float dgelu = 0.5 * (1.0 + t) + 0.5 * x * sech2 * du_dx;
@@ -46,5 +46,5 @@ const p = {
 };
 a(p);
 export {
-  i as gelu
+  d as gelu
 };

package/dist/ops/webgl/matMulGelu.js CHANGED Viewed

@@ -1,63 +1,73 @@
-import { f as _, t as R, e as C, j as A, l as N, n as H, u as O } from "../../index-DdmHGZjq.js";
+import { f as E, t as R, e as C, j as $, l as N, n as H, u as O } from "../../index-DdmHGZjq.js";
 import { r as f } from "../../Reshape-Bh_jzKzV.js";
 import { M as U } from "../../mulmat_packed_gpu-q_Gmwyld.js";
-import { m as E } from "../../mat_mul-Dpy2mMRu.js";
-const M = 0.7978845608028654, x = 0.044715, j = `
+import { m as A } from "../../mat_mul-Dpy2mMRu.js";
+const M = 0.7978845608028654, g = 0.044715, j = `
     vec4 x3 = x * x * x;
-    vec4 inner = x + ${x} * x3;
+    vec4 inner = x + ${g} * x3;
     inner = ${M} * inner;
-    inner = tanh(inner);
+    inner = vec4(
+        abs(inner[0]) > 15.0 ? sign(inner[0]) : tanh(inner[0]),
+        abs(inner[1]) > 15.0 ? sign(inner[1]) : tanh(inner[1]),
+        abs(inner[2]) > 15.0 ? sign(inner[2]) : tanh(inner[2]),
+        abs(inner[3]) > 15.0 ? sign(inner[3]) : tanh(inner[3])
+    );
     inner = 0.5 * (1.0 + inner);
     vec4 result = x * inner;
     return result;
 `, q = `
     vec4 a2 = a * a;
     vec4 a3 = a2 * a;
-    vec4 u  = ${M} * (a + ${x} * a3);
-    vec4 t  = tanh(u);
+    vec4 u  = ${M} * (a + ${g} * a3);
+    vec4 t = vec4(
+        abs(u[0]) > 15.0 ? sign(u[0]) : tanh(u[0]),
+        abs(u[1]) > 15.0 ? sign(u[1]) : tanh(u[1]),
+        abs(u[2]) > 15.0 ? sign(u[2]) : tanh(u[2]),
+        abs(u[3]) > 15.0 ? sign(u[3]) : tanh(u[3])
+    );
     vec4 sech2 = 1.0 - t * t;
-    vec4 du_dx = ${M} * (1.0 + 3.0 * ${x} * a2);
+    vec4 du_dx = ${M} * (1.0 + 3.0 * ${g} * a2);
     vec4 dgelu = 0.5 * (1.0 + t) + 0.5 * a * sech2 * du_dx;
     return dgelu * b;
-`, te = 1e3;
-function w({
+`, ne = 1e3;
+function _({
   a: e,
-  b: t,
+  b: n,
   transposeA: s,
-  transposeB: n,
+  transposeB: t,
   backend: a,
   activationSnippet: c,
   multiplier: o
 }) {
-  const r = e.shape.length, u = t.shape.length, i = s ? e.shape[r - 2] : e.shape[r - 1], p = n ? t.shape[u - 1] : t.shape[u - 2], h = s ? e.shape[r - 1] : e.shape[r - 2], l = n ? t.shape[u - 2] : t.shape[u - 1], K = e.shape.slice(0, -2), T = t.shape.slice(0, -2), d = A(K), m = A(T), b = N(e.shape.slice(0, -2), t.shape.slice(0, -2)).concat([h, l]);
+  const r = e.shape.length, i = n.shape.length, u = s ? e.shape[r - 2] : e.shape[r - 1], h = t ? n.shape[i - 1] : n.shape[i - 2], p = s ? e.shape[r - 1] : e.shape[r - 2], l = t ? n.shape[i - 2] : n.shape[i - 1], w = e.shape.slice(0, -2), K = n.shape.slice(0, -2), d = $(w), m = $(K), T = N(e.shape.slice(0, -2), n.shape.slice(0, -2)).concat([p, l]);
   H(
-    i === p,
-    () => `Error in matMul: inner shapes (${i}) and (${p}) of Tensors with shapes ${e.shape} and ${t.shape} and transposeA=${s} and transposeB=${n} must match.`
+    u === h,
+    () => `Error in matMul: inner shapes (${u}) and (${h}) of Tensors with shapes ${e.shape} and ${n.shape} and transposeA=${s} and transposeB=${t} must match.`
   );
-  const v = s ? [d, i, h] : [d, h, i], S = n ? [m, l, p] : [m, p, l], D = f({ inputs: { x: e }, backend: a, attrs: { shape: v } }), G = f({ inputs: { x: t }, backend: a, attrs: { shape: S } }), g = [D, G], y = Math.max(d, m), L = c, B = O(e.dtype, t.dtype), F = new U(
+  const v = s ? [d, u, p] : [d, p, u], x = t ? [m, l, h] : [m, h, l], S = f({ inputs: { x: e }, backend: a, attrs: { shape: v } }), b = f({ inputs: { x: n }, backend: a, attrs: { shape: x } }), D = [S, b], y = Math.max(d, m), L = c, B = O(e.dtype, n.dtype), F = new U(
     v,
-    S,
-    [y, h, l],
+    x,
+    [y, p, l],
     s,
-    n,
+    t,
     !1,
     L,
     !!o,
     !1
-  ), k = [D, G];
-  o && k.push(o);
-  const $ = a.runWebGLProgram(F, k, B), I = f({ inputs: { x: $ }, backend: a, attrs: { shape: b } });
-  g.push($);
-  for (const P of g)
+  ), G = [S, b];
+  o && G.push(o);
+  const k = a.runWebGLProgram(F, G, B), I = f({ inputs: { x: k }, backend: a, attrs: { shape: T } });
+  D.push(k);
+  for (const P of D)
     a.disposeIntermediateTensorInfo(P);
   return I;
 }
 function z(e) {
-  const { inputs: t, backend: s } = e, { x: n, kernel: a } = t;
-  if (n === void 0 || a === void 0)
+  const { inputs: n, backend: s } = e, { x: t, kernel: a } = n;
+  if (t === void 0 || a === void 0)
     throw new Error("BatchMatMul requires two input tensors.");
-  return w({
-    a: n,
+  return _({
+    a: t,
     b: a,
     transposeA: !1,
     transposeB: !1,
@@ -70,21 +80,21 @@ const W = {
   backendName: "webgl",
   kernelFunc: z
 };
-_(W);
+E(W);
 function J(e) {
-  const { dy: t, x: s, kernel: n } = e.inputs, a = e.backend;
+  const { dy: n, x: s, kernel: t } = e.inputs, a = e.backend;
   return R(() => {
     const c = C().makeTensorFromTensorInfo(
-      w({
+      _({
         a: s,
-        b: n,
+        b: t,
         transposeA: !1,
         transposeB: !1,
         backend: a,
         activationSnippet: q,
-        multiplier: t
+        multiplier: n
       })
-    ), o = E(c, n, !1, !0), r = E(s, c, !0, !1);
+    ), o = A(c, t, !1, !0), r = A(s, c, !0, !1);
     return [o, r];
   });
 }
@@ -93,9 +103,9 @@ const Q = {
   backendName: "webgl",
   kernelFunc: J
 };
-_(Q);
+E(Q);
 export {
-  te as MATMUL_SHARED_DIM_THRESHOLD,
-  w as batchMatMulGeluImpl,
+  ne as MATMUL_SHARED_DIM_THRESHOLD,
+  _ as batchMatMulGeluImpl,
   z as batchMatMulKernel
 };

package/dist/ops/webgpu/gelu.js CHANGED Viewed

@@ -1,8 +1,8 @@
 import { f as i } from "../../index-DdmHGZjq.js";
 import { g as o } from "../../webgpu_program-Dhk9R5aG.js";
 import { f as s, c as p } from "../../webgpu_util-BqGnZg8t.js";
-const u = 0.7978845608028654, a = 0.044715;
-class c {
+const a = 0.7978845608028654, u = 0.044715;
+class h {
   outputShape;
   shaderKey;
   dispatchLayout;
@@ -15,11 +15,14 @@ class c {
   }
   getUserCode() {
     return `
+      fn polyTanh(x: f32) -> f32 {
+         return select(tanh(x), sign(x), abs(x) > 15.0);
+      }
       fn unaryOperation(x : f32) -> f32 {
         let x3 = x * x * x;
-        var inner = fma(${a}, x3, x);
-        inner = ${u} * inner;
-        inner = tanh(inner);
+        var inner = fma(${u}, x3, x);
+        inner = ${a} * inner;
+        inner = polyTanh(inner);
         inner = 0.5 * (1.0 + inner);
         return x * inner;
       }
@@ -32,14 +35,14 @@ class c {
       `;
   }
 }
-function h(t) {
-  const { x: e } = t.inputs, n = t.backend, r = new c(e.shape);
+function c(t) {
+  const { x: e } = t.inputs, n = t.backend, r = new h(e.shape);
   return n.runWebGPUProgram(r, [e], "float32");
 }
 const l = {
   kernelName: "Gelu",
   backendName: "webgpu",
-  kernelFunc: h
+  kernelFunc: c
 };
 i(l);
 class x {
@@ -56,15 +59,18 @@ class x {
   }
   getUserCode() {
     return `
+            fn polyTanh(x: f32) -> f32 {
+                return select(tanh(x), sign(x), abs(x) > 15.0);
+            }
             ${o("index")} {
                 if (index < uniforms.size) {
                     let X  = getXByOutputIndex(index);
                     let x2 = X * X;
                     let x3 = x2 * X;
-                    let u  = ${u} * (X + ${a} * x3);
-                    let t  = tanh(u);
+                    let u  = ${a} * (X + ${u} * x3);
+                    let t  = polyTanh(u);
                     let sech2 = 1.0 - t * t;
-                    let du_dx = ${u} * (1.0 + 3.0 * ${a} * x2);
+                    let du_dx = ${a} * (1.0 + 3.0 * ${u} * x2);
                     let dgelu = 0.5 * (1.0 + t) + 0.5 * X * sech2 * du_dx;
                     let DY = getDyByOutputIndex(index);
                     setOutputAtIndex(index, DY * dgelu);
@@ -76,12 +82,12 @@ function g(t) {
   const { dy: e, x: n } = t.inputs, r = t.backend, d = new x(n.shape);
   return r.runWebGPUProgram(d, [e, n], "float32");
 }
-const m = {
+const f = {
   kernelName: "GeluGrad",
   backendName: "webgpu",
   kernelFunc: g
 };
-i(m);
+i(f);
 export {
-  c as GeluProgram
+  h as GeluProgram
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.8.4",
+    "version": "0.8.5",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",