@genai-fi/nanogpt 0.15.7 → 0.15.8

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -15,16 +15,16 @@ class k extends g {
15
15
  !1
16
16
  ), this.uniforms += "scaling: f32, clipNorm: f32";
17
17
  }
18
- getPreprocessSnippet() {
18
+ getReadSnippet() {
19
19
  return `
20
- candidate = candidate / 100.0f;
20
+ return bitcast<f32>(u32(x[index]));
21
21
  `;
22
22
  }
23
23
  getWriteSnippet() {
24
24
  return `
25
25
  if (tid == 0) {
26
26
  let cnorm = uniforms.clipNorm;
27
- let gradNorm = sqrt(bestValue);
27
+ let gradNorm = sqrt(max(bestValue, 0.0));
28
28
  result[0] = (cnorm / max(cnorm, gradNorm)) * uniforms.scaling;
29
29
  result[1] = gradNorm;
30
30
  }
@@ -44,15 +44,15 @@ function w(o) {
44
44
  outSize: 2,
45
45
  batchSize: 1,
46
46
  windowSize: r
47
- }, m = new k(l, p, r), u = d(m, [e], c, [
47
+ }, u = new k(l, p, r), m = d(u, [e], c, [
48
48
  { type: "float32", data: [i] },
49
49
  { type: "float32", data: [n] }
50
50
  ]);
51
- return a.forEach((f) => f.dispose()), u;
51
+ return a.forEach((f) => f.dispose()), m;
52
52
  }
53
- const N = {
53
+ const b = {
54
54
  kernelName: "ClipScale",
55
55
  backendName: "webgpu",
56
56
  kernelFunc: w
57
57
  };
58
- S(N);
58
+ S(b);
@@ -1,13 +1,26 @@
1
- import { reduce as g, ReduceProgram as S } from "./utils/reductions.js";
2
- import { c as w, U as h } from "../../index-CUXkjxiT.js";
1
+ import { reduce as g, ReduceProgram as h } from "./utils/reductions.js";
2
+ import { c as w, U as S } from "../../index-CUXkjxiT.js";
3
3
  import k from "./utils/deviceInfo.js";
4
- class z extends S {
4
+ class v extends h {
5
5
  shaderKey = "norm2";
6
6
  atomic = !0;
7
- constructor(o, t, i) {
7
+ utilityFunctions = `
8
+ fn atomicAddF32(sum: ptr<storage, atomic<i32>, read_write>, value: f32) -> f32 {
9
+ var old = atomicLoad(sum);
10
+ loop {
11
+ let new_value = value + bitcast<f32>(old);
12
+ let exchange_result = atomicCompareExchangeWeak(sum, old, bitcast<i32>(new_value));
13
+ if (exchange_result.exchanged) {
14
+ return new_value;
15
+ }
16
+ old = exchange_result.old_value;
17
+ }
18
+ }
19
+ `;
20
+ constructor(o, r, i) {
8
21
  super(
9
22
  o,
10
- t,
23
+ r,
11
24
  {
12
25
  reductionOp: "sum",
13
26
  elementwise: !1,
@@ -25,39 +38,39 @@ class z extends S {
25
38
  getWriteSnippet() {
26
39
  return `
27
40
  if (tid == 0) {
28
- atomicAdd(&result[uniforms.index], i32(bestValue * 100.0f));
41
+ atomicAddF32(&result[uniforms.index], bestValue);
29
42
  }
30
43
  `;
31
44
  }
32
45
  }
33
- function b(r) {
34
- const { x: o, output: t } = r.inputs, { invLossScaling: i, index: c } = r.attrs, n = r.backend, d = [], u = k(n);
46
+ function x(t) {
47
+ const { x: o, output: r } = t.inputs, { invLossScaling: i, index: c } = t.attrs, n = t.backend, u = [], d = k(n);
35
48
  let e = Math.min(512, n.device.limits.maxComputeWorkgroupSizeX);
36
- const s = 4, a = h(o.shape);
37
- for (; a % (e * s) !== 0 && e > 1; )
49
+ const a = 4, s = S(o.shape);
50
+ for (; s % (e * a) !== 0 && e > 1; )
38
51
  e /= 2;
39
52
  if (e === 1)
40
- throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${a}`);
41
- const m = {
42
- inSize: e * s,
53
+ throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${s}`);
54
+ const l = {
55
+ inSize: e * a,
43
56
  outSize: 1,
44
- batchSize: a / (e * s),
57
+ batchSize: s / (e * a),
45
58
  windowSize: e
46
- }, p = new z(u, m, e), f = g(
47
- p,
59
+ }, m = new v(d, l, e), p = g(
60
+ m,
48
61
  [o],
49
62
  n,
50
63
  [
51
64
  { type: "float32", data: [i] },
52
65
  { type: "int32", data: [c] }
53
66
  ],
54
- t
67
+ r
55
68
  );
56
- return d.forEach((l) => l.dispose()), f;
69
+ return u.forEach((f) => f.dispose()), p;
57
70
  }
58
- const x = {
71
+ const z = {
59
72
  kernelName: "Norm2",
60
73
  backendName: "webgpu",
61
- kernelFunc: b
74
+ kernelFunc: x
62
75
  };
63
- w(x);
76
+ w(z);
@@ -29,6 +29,7 @@ export declare class ReduceProgram implements WebGPUProgram {
29
29
  subgroupBuiltins: boolean;
30
30
  deviceInfo: DeviceInformation;
31
31
  params: ReduceParams;
32
+ utilityFunctions?: string;
32
33
  constructor(deviceInfo: DeviceInformation, reduceInfo: backend_util.ReduceInfo, params: ReduceParams, packed: boolean);
33
34
  protected getWriteSnippet(): string;
34
35
  protected getPreprocessSnippet(): string;
@@ -1,10 +1,10 @@
1
- import { ah as f, U as S, h } from "../../../index-CUXkjxiT.js";
1
+ import { ah as h, U as S, h as f } from "../../../index-CUXkjxiT.js";
2
2
  import { e as d } from "../../../webgpu_program-B4HmApL1.js";
3
3
  import { reshape16 as g } from "../../reshape16.js";
4
4
  import { f as z } from "../../../webgpu_util-DYlGSwOJ.js";
5
5
  import { c as k } from "../../../axis_util-GTVlo58H.js";
6
6
  import { z as x } from "../../../zeros-DvZpK8s6.js";
7
- function a(e, u, t, i) {
7
+ function c(e, u, t, i) {
8
8
  return e && !u ? `
9
9
  bestValue = subgroupAdd(bestValue);
10
10
  ` : e ? `
@@ -37,10 +37,10 @@ function a(e, u, t, i) {
37
37
  bestValue = bestValues[0];
38
38
  `;
39
39
  }
40
- function v(e) {
40
+ function $(e) {
41
41
  const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
42
42
  var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
43
- `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
43
+ `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
44
44
  return `
45
45
  fn DIV_CEIL(a : u32, b : u32) -> u32 {
46
46
  return ((a - 1u) / b + 1u);
@@ -54,6 +54,7 @@ function v(e) {
54
54
  }
55
55
 
56
56
  ${t}
57
+ ${e.utilityFunctions ?? ""}
57
58
 
58
59
  ${d("index")} {
59
60
  let outputIndex = index / ${u};
@@ -81,10 +82,10 @@ function v(e) {
81
82
  }
82
83
  `;
83
84
  }
84
- function $(e) {
85
+ function v(e) {
85
86
  const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
86
87
  var<workgroup> bestValues : array<vec2<f32>, ${e.workgroupSizeX}>;
87
- `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !0);
88
+ `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !0);
88
89
  return `
89
90
  fn DIV_CEIL(a : u32, b : u32) -> u32 {
90
91
  return ((a - 1u) / b + 1u);
@@ -97,7 +98,8 @@ function $(e) {
97
98
  `}
98
99
  }
99
100
 
100
- ${t}
101
+ ${t}
102
+ ${e.utilityFunctions ?? ""}
101
103
 
102
104
  ${d("index")} {
103
105
  let outputIndex = index / ${u};
@@ -128,12 +130,12 @@ function $(e) {
128
130
  `;
129
131
  }
130
132
  function V(e) {
131
- return e.elementwise ? v(e) : $(e);
133
+ return e.elementwise ? $(e) : v(e);
132
134
  }
133
135
  function w(e) {
134
136
  const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
135
137
  var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
136
- `, i = a(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
138
+ `, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
137
139
  return `
138
140
  fn DIV_CEIL(a : u32, b : u32) -> u32 {
139
141
  return ((a - 1u) / b + 1u);
@@ -146,6 +148,7 @@ function w(e) {
146
148
  }
147
149
 
148
150
  ${t}
151
+ ${e.utilityFunctions ?? ""}
149
152
 
150
153
  ${d("index")} {
151
154
  let outputIndex = index / ${e.workgroupSizeX};
@@ -173,11 +176,11 @@ function w(e) {
173
176
  }
174
177
  `;
175
178
  }
176
- function P(e, u) {
177
- const t = e[0], r = f(u, t.shape), [, n] = k(t.shape, r), s = S(n), o = S(t.shape) / s;
178
- return { windowSize: s, inSize: s, batchSize: o, outSize: o };
179
+ function X(e, u) {
180
+ const t = e[0], o = h(u, t.shape), [, n] = k(t.shape, o), s = S(n), r = S(t.shape) / s;
181
+ return { windowSize: s, inSize: s, batchSize: r, outSize: r };
179
182
  }
180
- class A {
183
+ class P {
181
184
  atomic = !1;
182
185
  outputShape;
183
186
  shaderKey = "reduce16";
@@ -196,11 +199,12 @@ class A {
196
199
  subgroupBuiltins = !1;
197
200
  deviceInfo;
198
201
  params;
199
- constructor(u, t, i, r) {
200
- this.params = i, this.inputShape = [t.batchSize, t.inSize], this.deviceInfo = u, this.packed = r;
202
+ utilityFunctions;
203
+ constructor(u, t, i, o) {
204
+ this.params = i, this.inputShape = [t.batchSize, t.inSize], this.deviceInfo = u, this.packed = o;
201
205
  const n = i.forceWorkgroupSize ? i.forceWorkgroupSize : t.inSize % 64 === 0 ? 64 : 32;
202
- u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] : r ? [t.outSize / 2] : [t.outSize], this.dispatchLayout = z(this.outputShape), this.dispatch = [
203
- i.elementwise ? t.batchSize : r ? t.batchSize / 2 : t.batchSize,
206
+ u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] : o ? [t.outSize / 2] : [t.outSize], this.dispatchLayout = z(this.outputShape), this.dispatch = [
207
+ i.elementwise ? t.batchSize : o ? t.batchSize / 2 : t.batchSize,
204
208
  1,
205
209
  1
206
210
  ], this.outputComponent = 1, this.variableComponents = [1], this.elementwise = i.elementwise === !0;
@@ -230,7 +234,8 @@ class A {
230
234
  inputReadSnippet: this.getReadSnippet(),
231
235
  inputSnippet: this.getPreprocessSnippet(),
232
236
  outputSnippet: this.getWriteSnippet(),
233
- reducedSnippet: this.getPostprocessSnippet()
237
+ reducedSnippet: this.getPostprocessSnippet(),
238
+ utilityFunctions: this.utilityFunctions
234
239
  }) : w({
235
240
  ...this.params,
236
241
  workgroupSizeX: u,
@@ -239,21 +244,22 @@ class A {
239
244
  inputReadSnippet: this.getReadSnippet(),
240
245
  inputSnippet: this.getPreprocessSnippet(),
241
246
  outputSnippet: this.getWriteSnippet(),
242
- reducedSnippet: this.getPostprocessSnippet()
247
+ reducedSnippet: this.getPostprocessSnippet(),
248
+ utilityFunctions: this.utilityFunctions
243
249
  });
244
250
  }
245
251
  }
246
- function W(e, u, t, i, r) {
247
- const n = u[0], c = [{ type: "int32", data: [e.inputShape[e.inputShape.length - 1]] }, ...i ?? []];
248
- let o = r;
249
- !r && e.atomic && (o = x(e.outputShape, "int32"));
252
+ function A(e, u, t, i, o) {
253
+ const n = u[0], a = [{ type: "int32", data: [e.inputShape[e.inputShape.length - 1]] }, ...i ?? []];
254
+ let r = o;
255
+ !o && e.atomic && (r = x(e.outputShape, "int32"));
250
256
  const l = t.runWebGPUProgram(
251
257
  e,
252
258
  u,
253
259
  e.packed ? "packedF16" : e.atomic ? "int32" : "float32",
254
- c,
255
- o
256
- ), p = h().makeTensorFromTensorInfo(l);
260
+ a,
261
+ r
262
+ ), p = f().makeTensorFromTensorInfo(l);
257
263
  if (e.outputShape.length === 1 && e.outputShape[0] <= 2)
258
264
  return p;
259
265
  const b = g(
@@ -263,7 +269,7 @@ function W(e, u, t, i, r) {
263
269
  return p.dispose(), b;
264
270
  }
265
271
  export {
266
- A as ReduceProgram,
267
- P as createReduceInfo,
268
- W as reduce
272
+ P as ReduceProgram,
273
+ X as createReduceInfo,
274
+ A as reduce
269
275
  };
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@genai-fi/nanogpt",
3
- "version": "0.15.7",
3
+ "version": "0.15.8",
4
4
  "type": "module",
5
5
  "main": "dist/main.js",
6
6
  "types": "dist/main.d.ts",