npm - @genai-fi/nanogpt - Versions diffs - 0.15.7 → 0.15.8 - Mend

@genai-fi/nanogpt 0.15.7 → 0.15.8

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (5)

package/dist/ops/webgpu/clipScale.js +7 -7
package/dist/ops/webgpu/norm2.js +34 -21
package/dist/ops/webgpu/utils/reductions.d.ts +1 -0
package/dist/ops/webgpu/utils/reductions.js +35 -29
package/package.json +1 -1

package/dist/ops/webgpu/clipScale.js CHANGED Viewed

@@ -15,16 +15,16 @@ class k extends g {
       !1
     ), this.uniforms += "scaling: f32, clipNorm: f32";
   }
-  getPreprocessSnippet() {
+  getReadSnippet() {
     return `
-            candidate = candidate / 100.0f;
+            return bitcast<f32>(u32(x[index]));
         `;
   }
   getWriteSnippet() {
     return `
             if (tid == 0) {
                 let cnorm = uniforms.clipNorm;
-                let gradNorm = sqrt(bestValue);
+                let gradNorm = sqrt(max(bestValue, 0.0));
                 result[0] = (cnorm / max(cnorm, gradNorm)) * uniforms.scaling;
                 result[1] = gradNorm;
             }
@@ -44,15 +44,15 @@ function w(o) {
     outSize: 2,
     batchSize: 1,
     windowSize: r
-  }, m = new k(l, p, r), u = d(m, [e], c, [
+  }, u = new k(l, p, r), m = d(u, [e], c, [
     { type: "float32", data: [i] },
     { type: "float32", data: [n] }
   ]);
-  return a.forEach((f) => f.dispose()), u;
+  return a.forEach((f) => f.dispose()), m;
 }
-const N = {
+const b = {
   kernelName: "ClipScale",
   backendName: "webgpu",
   kernelFunc: w
 };
-S(N);
+S(b);

package/dist/ops/webgpu/norm2.js CHANGED Viewed

@@ -1,13 +1,26 @@
-import { reduce as g, ReduceProgram as S } from "./utils/reductions.js";
-import { c as w, U as h } from "../../index-CUXkjxiT.js";
+import { reduce as g, ReduceProgram as h } from "./utils/reductions.js";
+import { c as w, U as S } from "../../index-CUXkjxiT.js";
 import k from "./utils/deviceInfo.js";
-class z extends S {
+class v extends h {
   shaderKey = "norm2";
   atomic = !0;
-  constructor(o, t, i) {
+  utilityFunctions = `
+        fn atomicAddF32(sum: ptr<storage, atomic<i32>, read_write>, value: f32) -> f32 {
+            var old = atomicLoad(sum);
+            loop {
+                let new_value = value + bitcast<f32>(old);
+                let exchange_result = atomicCompareExchangeWeak(sum, old, bitcast<i32>(new_value));
+                if (exchange_result.exchanged) {
+                    return new_value;
+                }
+                old = exchange_result.old_value;
+            }
+        }
+    `;
+  constructor(o, r, i) {
     super(
       o,
-      t,
+      r,
       {
         reductionOp: "sum",
         elementwise: !1,
@@ -25,39 +38,39 @@ class z extends S {
   getWriteSnippet() {
     return `
             if (tid == 0) {
-                atomicAdd(&result[uniforms.index], i32(bestValue * 100.0f));
+                atomicAddF32(&result[uniforms.index], bestValue);
             }
         `;
   }
 }
-function b(r) {
-  const { x: o, output: t } = r.inputs, { invLossScaling: i, index: c } = r.attrs, n = r.backend, d = [], u = k(n);
+function x(t) {
+  const { x: o, output: r } = t.inputs, { invLossScaling: i, index: c } = t.attrs, n = t.backend, u = [], d = k(n);
   let e = Math.min(512, n.device.limits.maxComputeWorkgroupSizeX);
-  const s = 4, a = h(o.shape);
-  for (; a % (e * s) !== 0 && e > 1; )
+  const a = 4, s = S(o.shape);
+  for (; s % (e * a) !== 0 && e > 1; )
     e /= 2;
   if (e === 1)
-    throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${a}`);
-  const m = {
-    inSize: e * s,
+    throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${s}`);
+  const l = {
+    inSize: e * a,
     outSize: 1,
-    batchSize: a / (e * s),
+    batchSize: s / (e * a),
     windowSize: e
-  }, p = new z(u, m, e), f = g(
-    p,
+  }, m = new v(d, l, e), p = g(
+    m,
     [o],
     n,
     [
       { type: "float32", data: [i] },
       { type: "int32", data: [c] }
     ],
-    t
+    r
   );
-  return d.forEach((l) => l.dispose()), f;
+  return u.forEach((f) => f.dispose()), p;
 }
-const x = {
+const z = {
   kernelName: "Norm2",
   backendName: "webgpu",
-  kernelFunc: b
+  kernelFunc: x
 };
-w(x);
+w(z);

package/dist/ops/webgpu/utils/reductions.d.ts CHANGED Viewed

@@ -29,6 +29,7 @@ export declare class ReduceProgram implements WebGPUProgram {
     subgroupBuiltins: boolean;
     deviceInfo: DeviceInformation;
     params: ReduceParams;
+    utilityFunctions?: string;
     constructor(deviceInfo: DeviceInformation, reduceInfo: backend_util.ReduceInfo, params: ReduceParams, packed: boolean);
     protected getWriteSnippet(): string;
     protected getPreprocessSnippet(): string;

package/dist/ops/webgpu/utils/reductions.js CHANGED Viewed

@@ -1,10 +1,10 @@
-import { ah as f, U as S, h } from "../../../index-CUXkjxiT.js";
+import { ah as h, U as S, h as f } from "../../../index-CUXkjxiT.js";
 import { e as d } from "../../../webgpu_program-B4HmApL1.js";
 import { reshape16 as g } from "../../reshape16.js";
 import { f as z } from "../../../webgpu_util-DYlGSwOJ.js";
 import { c as k } from "../../../axis_util-GTVlo58H.js";
 import { z as x } from "../../../zeros-DvZpK8s6.js";
-function a(e, u, t, i) {
+function c(e, u, t, i) {
   return e && !u ? `
             bestValue = subgroupAdd(bestValue);
         ` : e ? `
@@ -37,10 +37,10 @@ function a(e, u, t, i) {
             bestValue = bestValues[0];
         `;
 }
-function v(e) {
+function $(e) {
   const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
              var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
-           `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
+           `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
   return `
            fn DIV_CEIL(a : u32, b : u32) -> u32 {
             return ((a - 1u) / b + 1u);
@@ -54,6 +54,7 @@ function v(e) {
             }
            ${t}
+           ${e.utilityFunctions ?? ""}
            ${d("index")} {
                 let outputIndex = index / ${u};
@@ -81,10 +82,10 @@ function v(e) {
            }
          `;
 }
-function $(e) {
+function v(e) {
   const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
              var<workgroup> bestValues : array<vec2<f32>, ${e.workgroupSizeX}>;
-           `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !0);
+           `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !0);
   return `
            fn DIV_CEIL(a : u32, b : u32) -> u32 {
             return ((a - 1u) / b + 1u);
@@ -97,7 +98,8 @@ function $(e) {
                 `}
             }
-           ${t}
+            ${t}
+            ${e.utilityFunctions ?? ""}
            ${d("index")} {
                 let outputIndex = index / ${u};
@@ -128,12 +130,12 @@ function $(e) {
          `;
 }
 function V(e) {
-  return e.elementwise ? v(e) : $(e);
+  return e.elementwise ? $(e) : v(e);
 }
 function w(e) {
   const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
              var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
-           `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
+           `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
   return `
            fn DIV_CEIL(a : u32, b : u32) -> u32 {
             return ((a - 1u) / b + 1u);
@@ -146,6 +148,7 @@ function w(e) {
             }
            ${t}
+              ${e.utilityFunctions ?? ""}
            ${d("index")} {
                 let outputIndex = index / ${e.workgroupSizeX};
@@ -173,11 +176,11 @@ function w(e) {
            }
          `;
 }
-function P(e, u) {
-  const t = e[0], r = f(u, t.shape), [, n] = k(t.shape, r), s = S(n), o = S(t.shape) / s;
-  return { windowSize: s, inSize: s, batchSize: o, outSize: o };
+function X(e, u) {
+  const t = e[0], o = h(u, t.shape), [, n] = k(t.shape, o), s = S(n), r = S(t.shape) / s;
+  return { windowSize: s, inSize: s, batchSize: r, outSize: r };
 }
-class A {
+class P {
   atomic = !1;
   outputShape;
   shaderKey = "reduce16";
@@ -196,11 +199,12 @@ class A {
   subgroupBuiltins = !1;
   deviceInfo;
   params;
-  constructor(u, t, i, r) {
-    this.params = i, this.inputShape = [t.batchSize, t.inSize], this.deviceInfo = u, this.packed = r;
+  utilityFunctions;
+  constructor(u, t, i, o) {
+    this.params = i, this.inputShape = [t.batchSize, t.inSize], this.deviceInfo = u, this.packed = o;
     const n = i.forceWorkgroupSize ? i.forceWorkgroupSize : t.inSize % 64 === 0 ? 64 : 32;
-    u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] : r ? [t.outSize / 2] : [t.outSize], this.dispatchLayout = z(this.outputShape), this.dispatch = [
-      i.elementwise ? t.batchSize : r ? t.batchSize / 2 : t.batchSize,
+    u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] : o ? [t.outSize / 2] : [t.outSize], this.dispatchLayout = z(this.outputShape), this.dispatch = [
+      i.elementwise ? t.batchSize : o ? t.batchSize / 2 : t.batchSize,
       1,
       1
     ], this.outputComponent = 1, this.variableComponents = [1], this.elementwise = i.elementwise === !0;
@@ -230,7 +234,8 @@ class A {
       inputReadSnippet: this.getReadSnippet(),
       inputSnippet: this.getPreprocessSnippet(),
       outputSnippet: this.getWriteSnippet(),
-      reducedSnippet: this.getPostprocessSnippet()
+      reducedSnippet: this.getPostprocessSnippet(),
+      utilityFunctions: this.utilityFunctions
     }) : w({
       ...this.params,
       workgroupSizeX: u,
@@ -239,21 +244,22 @@ class A {
       inputReadSnippet: this.getReadSnippet(),
       inputSnippet: this.getPreprocessSnippet(),
       outputSnippet: this.getWriteSnippet(),
-      reducedSnippet: this.getPostprocessSnippet()
+      reducedSnippet: this.getPostprocessSnippet(),
+      utilityFunctions: this.utilityFunctions
     });
   }
 }
-function W(e, u, t, i, r) {
-  const n = u[0], c = [{ type: "int32", data: [e.inputShape[e.inputShape.length - 1]] }, ...i ?? []];
-  let o = r;
-  !r && e.atomic && (o = x(e.outputShape, "int32"));
+function A(e, u, t, i, o) {
+  const n = u[0], a = [{ type: "int32", data: [e.inputShape[e.inputShape.length - 1]] }, ...i ?? []];
+  let r = o;
+  !o && e.atomic && (r = x(e.outputShape, "int32"));
   const l = t.runWebGPUProgram(
     e,
     u,
     e.packed ? "packedF16" : e.atomic ? "int32" : "float32",
-    c,
-    o
-  ), p = h().makeTensorFromTensorInfo(l);
+    a,
+    r
+  ), p = f().makeTensorFromTensorInfo(l);
   if (e.outputShape.length === 1 && e.outputShape[0] <= 2)
     return p;
   const b = g(
@@ -263,7 +269,7 @@ function W(e, u, t, i, r) {
   return p.dispose(), b;
 }
 export {
-  A as ReduceProgram,
-  P as createReduceInfo,
-  W as reduce
+  P as ReduceProgram,
+  X as createReduceInfo,
+  A as reduce
 };

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
     "name": "@genai-fi/nanogpt",
-    "version": "0.15.7",
+    "version": "0.15.8",
     "type": "module",
     "main": "dist/main.js",
     "types": "dist/main.d.ts",