@genai-fi/nanogpt 0.15.7 → 0.15.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
|
@@ -15,16 +15,16 @@ class k extends g {
|
|
|
15
15
|
!1
|
|
16
16
|
), this.uniforms += "scaling: f32, clipNorm: f32";
|
|
17
17
|
}
|
|
18
|
-
|
|
18
|
+
getReadSnippet() {
|
|
19
19
|
return `
|
|
20
|
-
|
|
20
|
+
return bitcast<f32>(u32(x[index]));
|
|
21
21
|
`;
|
|
22
22
|
}
|
|
23
23
|
getWriteSnippet() {
|
|
24
24
|
return `
|
|
25
25
|
if (tid == 0) {
|
|
26
26
|
let cnorm = uniforms.clipNorm;
|
|
27
|
-
let gradNorm = sqrt(bestValue);
|
|
27
|
+
let gradNorm = sqrt(max(bestValue, 0.0));
|
|
28
28
|
result[0] = (cnorm / max(cnorm, gradNorm)) * uniforms.scaling;
|
|
29
29
|
result[1] = gradNorm;
|
|
30
30
|
}
|
|
@@ -44,15 +44,15 @@ function w(o) {
|
|
|
44
44
|
outSize: 2,
|
|
45
45
|
batchSize: 1,
|
|
46
46
|
windowSize: r
|
|
47
|
-
},
|
|
47
|
+
}, u = new k(l, p, r), m = d(u, [e], c, [
|
|
48
48
|
{ type: "float32", data: [i] },
|
|
49
49
|
{ type: "float32", data: [n] }
|
|
50
50
|
]);
|
|
51
|
-
return a.forEach((f) => f.dispose()),
|
|
51
|
+
return a.forEach((f) => f.dispose()), m;
|
|
52
52
|
}
|
|
53
|
-
const
|
|
53
|
+
const b = {
|
|
54
54
|
kernelName: "ClipScale",
|
|
55
55
|
backendName: "webgpu",
|
|
56
56
|
kernelFunc: w
|
|
57
57
|
};
|
|
58
|
-
S(
|
|
58
|
+
S(b);
|
package/dist/ops/webgpu/norm2.js
CHANGED
|
@@ -1,13 +1,26 @@
|
|
|
1
|
-
import { reduce as g, ReduceProgram as
|
|
2
|
-
import { c as w, U as
|
|
1
|
+
import { reduce as g, ReduceProgram as h } from "./utils/reductions.js";
|
|
2
|
+
import { c as w, U as S } from "../../index-CUXkjxiT.js";
|
|
3
3
|
import k from "./utils/deviceInfo.js";
|
|
4
|
-
class
|
|
4
|
+
class v extends h {
|
|
5
5
|
shaderKey = "norm2";
|
|
6
6
|
atomic = !0;
|
|
7
|
-
|
|
7
|
+
utilityFunctions = `
|
|
8
|
+
fn atomicAddF32(sum: ptr<storage, atomic<i32>, read_write>, value: f32) -> f32 {
|
|
9
|
+
var old = atomicLoad(sum);
|
|
10
|
+
loop {
|
|
11
|
+
let new_value = value + bitcast<f32>(old);
|
|
12
|
+
let exchange_result = atomicCompareExchangeWeak(sum, old, bitcast<i32>(new_value));
|
|
13
|
+
if (exchange_result.exchanged) {
|
|
14
|
+
return new_value;
|
|
15
|
+
}
|
|
16
|
+
old = exchange_result.old_value;
|
|
17
|
+
}
|
|
18
|
+
}
|
|
19
|
+
`;
|
|
20
|
+
constructor(o, r, i) {
|
|
8
21
|
super(
|
|
9
22
|
o,
|
|
10
|
-
|
|
23
|
+
r,
|
|
11
24
|
{
|
|
12
25
|
reductionOp: "sum",
|
|
13
26
|
elementwise: !1,
|
|
@@ -25,39 +38,39 @@ class z extends S {
|
|
|
25
38
|
getWriteSnippet() {
|
|
26
39
|
return `
|
|
27
40
|
if (tid == 0) {
|
|
28
|
-
|
|
41
|
+
atomicAddF32(&result[uniforms.index], bestValue);
|
|
29
42
|
}
|
|
30
43
|
`;
|
|
31
44
|
}
|
|
32
45
|
}
|
|
33
|
-
function
|
|
34
|
-
const { x: o, output:
|
|
46
|
+
function x(t) {
|
|
47
|
+
const { x: o, output: r } = t.inputs, { invLossScaling: i, index: c } = t.attrs, n = t.backend, u = [], d = k(n);
|
|
35
48
|
let e = Math.min(512, n.device.limits.maxComputeWorkgroupSizeX);
|
|
36
|
-
const
|
|
37
|
-
for (;
|
|
49
|
+
const a = 4, s = S(o.shape);
|
|
50
|
+
for (; s % (e * a) !== 0 && e > 1; )
|
|
38
51
|
e /= 2;
|
|
39
52
|
if (e === 1)
|
|
40
|
-
throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${
|
|
41
|
-
const
|
|
42
|
-
inSize: e *
|
|
53
|
+
throw new Error(`Cannot find suitable workgroup size for Norm2Program with reduce size ${s}`);
|
|
54
|
+
const l = {
|
|
55
|
+
inSize: e * a,
|
|
43
56
|
outSize: 1,
|
|
44
|
-
batchSize:
|
|
57
|
+
batchSize: s / (e * a),
|
|
45
58
|
windowSize: e
|
|
46
|
-
},
|
|
47
|
-
|
|
59
|
+
}, m = new v(d, l, e), p = g(
|
|
60
|
+
m,
|
|
48
61
|
[o],
|
|
49
62
|
n,
|
|
50
63
|
[
|
|
51
64
|
{ type: "float32", data: [i] },
|
|
52
65
|
{ type: "int32", data: [c] }
|
|
53
66
|
],
|
|
54
|
-
|
|
67
|
+
r
|
|
55
68
|
);
|
|
56
|
-
return
|
|
69
|
+
return u.forEach((f) => f.dispose()), p;
|
|
57
70
|
}
|
|
58
|
-
const
|
|
71
|
+
const z = {
|
|
59
72
|
kernelName: "Norm2",
|
|
60
73
|
backendName: "webgpu",
|
|
61
|
-
kernelFunc:
|
|
74
|
+
kernelFunc: x
|
|
62
75
|
};
|
|
63
|
-
w(
|
|
76
|
+
w(z);
|
|
@@ -29,6 +29,7 @@ export declare class ReduceProgram implements WebGPUProgram {
|
|
|
29
29
|
subgroupBuiltins: boolean;
|
|
30
30
|
deviceInfo: DeviceInformation;
|
|
31
31
|
params: ReduceParams;
|
|
32
|
+
utilityFunctions?: string;
|
|
32
33
|
constructor(deviceInfo: DeviceInformation, reduceInfo: backend_util.ReduceInfo, params: ReduceParams, packed: boolean);
|
|
33
34
|
protected getWriteSnippet(): string;
|
|
34
35
|
protected getPreprocessSnippet(): string;
|
|
@@ -1,10 +1,10 @@
|
|
|
1
|
-
import { ah as
|
|
1
|
+
import { ah as h, U as S, h as f } from "../../../index-CUXkjxiT.js";
|
|
2
2
|
import { e as d } from "../../../webgpu_program-B4HmApL1.js";
|
|
3
3
|
import { reshape16 as g } from "../../reshape16.js";
|
|
4
4
|
import { f as z } from "../../../webgpu_util-DYlGSwOJ.js";
|
|
5
5
|
import { c as k } from "../../../axis_util-GTVlo58H.js";
|
|
6
6
|
import { z as x } from "../../../zeros-DvZpK8s6.js";
|
|
7
|
-
function
|
|
7
|
+
function c(e, u, t, i) {
|
|
8
8
|
return e && !u ? `
|
|
9
9
|
bestValue = subgroupAdd(bestValue);
|
|
10
10
|
` : e ? `
|
|
@@ -37,10 +37,10 @@ function a(e, u, t, i) {
|
|
|
37
37
|
bestValue = bestValues[0];
|
|
38
38
|
`;
|
|
39
39
|
}
|
|
40
|
-
function
|
|
40
|
+
function $(e) {
|
|
41
41
|
const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
|
|
42
42
|
var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
|
|
43
|
-
`, i =
|
|
43
|
+
`, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
|
|
44
44
|
return `
|
|
45
45
|
fn DIV_CEIL(a : u32, b : u32) -> u32 {
|
|
46
46
|
return ((a - 1u) / b + 1u);
|
|
@@ -54,6 +54,7 @@ function v(e) {
|
|
|
54
54
|
}
|
|
55
55
|
|
|
56
56
|
${t}
|
|
57
|
+
${e.utilityFunctions ?? ""}
|
|
57
58
|
|
|
58
59
|
${d("index")} {
|
|
59
60
|
let outputIndex = index / ${u};
|
|
@@ -81,10 +82,10 @@ function v(e) {
|
|
|
81
82
|
}
|
|
82
83
|
`;
|
|
83
84
|
}
|
|
84
|
-
function
|
|
85
|
+
function v(e) {
|
|
85
86
|
const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
|
|
86
87
|
var<workgroup> bestValues : array<vec2<f32>, ${e.workgroupSizeX}>;
|
|
87
|
-
`, i =
|
|
88
|
+
`, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !0);
|
|
88
89
|
return `
|
|
89
90
|
fn DIV_CEIL(a : u32, b : u32) -> u32 {
|
|
90
91
|
return ((a - 1u) / b + 1u);
|
|
@@ -97,7 +98,8 @@ function $(e) {
|
|
|
97
98
|
`}
|
|
98
99
|
}
|
|
99
100
|
|
|
100
|
-
|
|
101
|
+
${t}
|
|
102
|
+
${e.utilityFunctions ?? ""}
|
|
101
103
|
|
|
102
104
|
${d("index")} {
|
|
103
105
|
let outputIndex = index / ${u};
|
|
@@ -128,12 +130,12 @@ function $(e) {
|
|
|
128
130
|
`;
|
|
129
131
|
}
|
|
130
132
|
function V(e) {
|
|
131
|
-
return e.elementwise ?
|
|
133
|
+
return e.elementwise ? $(e) : v(e);
|
|
132
134
|
}
|
|
133
135
|
function w(e) {
|
|
134
136
|
const u = `${e.workgroupSizeX}`, t = e.subgroups && !e.variableSubgroups ? "" : `
|
|
135
137
|
var<workgroup> bestValues : array<f32, ${e.workgroupSizeX}>;
|
|
136
|
-
`, i =
|
|
138
|
+
`, i = c(e.subgroups, e.variableSubgroups, e.workgroupSizeX, !1);
|
|
137
139
|
return `
|
|
138
140
|
fn DIV_CEIL(a : u32, b : u32) -> u32 {
|
|
139
141
|
return ((a - 1u) / b + 1u);
|
|
@@ -146,6 +148,7 @@ function w(e) {
|
|
|
146
148
|
}
|
|
147
149
|
|
|
148
150
|
${t}
|
|
151
|
+
${e.utilityFunctions ?? ""}
|
|
149
152
|
|
|
150
153
|
${d("index")} {
|
|
151
154
|
let outputIndex = index / ${e.workgroupSizeX};
|
|
@@ -173,11 +176,11 @@ function w(e) {
|
|
|
173
176
|
}
|
|
174
177
|
`;
|
|
175
178
|
}
|
|
176
|
-
function
|
|
177
|
-
const t = e[0],
|
|
178
|
-
return { windowSize: s, inSize: s, batchSize:
|
|
179
|
+
function X(e, u) {
|
|
180
|
+
const t = e[0], o = h(u, t.shape), [, n] = k(t.shape, o), s = S(n), r = S(t.shape) / s;
|
|
181
|
+
return { windowSize: s, inSize: s, batchSize: r, outSize: r };
|
|
179
182
|
}
|
|
180
|
-
class
|
|
183
|
+
class P {
|
|
181
184
|
atomic = !1;
|
|
182
185
|
outputShape;
|
|
183
186
|
shaderKey = "reduce16";
|
|
@@ -196,11 +199,12 @@ class A {
|
|
|
196
199
|
subgroupBuiltins = !1;
|
|
197
200
|
deviceInfo;
|
|
198
201
|
params;
|
|
199
|
-
|
|
200
|
-
|
|
202
|
+
utilityFunctions;
|
|
203
|
+
constructor(u, t, i, o) {
|
|
204
|
+
this.params = i, this.inputShape = [t.batchSize, t.inSize], this.deviceInfo = u, this.packed = o;
|
|
201
205
|
const n = i.forceWorkgroupSize ? i.forceWorkgroupSize : t.inSize % 64 === 0 ? 64 : 32;
|
|
202
|
-
u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] :
|
|
203
|
-
i.elementwise ? t.batchSize :
|
|
206
|
+
u.subgroupsSupported && !i.forceWorkgroupSize ? (this.workgroupSize = [Math.min(n, u.subgroupMaxSize), 1, 1], this.subgroups = !0, u.variableSubgroups && (this.subgroupBuiltins = !0)) : this.workgroupSize[0] = n, this.outputShape = i.elementwise ? [t.batchSize, t.inSize] : o ? [t.outSize / 2] : [t.outSize], this.dispatchLayout = z(this.outputShape), this.dispatch = [
|
|
207
|
+
i.elementwise ? t.batchSize : o ? t.batchSize / 2 : t.batchSize,
|
|
204
208
|
1,
|
|
205
209
|
1
|
|
206
210
|
], this.outputComponent = 1, this.variableComponents = [1], this.elementwise = i.elementwise === !0;
|
|
@@ -230,7 +234,8 @@ class A {
|
|
|
230
234
|
inputReadSnippet: this.getReadSnippet(),
|
|
231
235
|
inputSnippet: this.getPreprocessSnippet(),
|
|
232
236
|
outputSnippet: this.getWriteSnippet(),
|
|
233
|
-
reducedSnippet: this.getPostprocessSnippet()
|
|
237
|
+
reducedSnippet: this.getPostprocessSnippet(),
|
|
238
|
+
utilityFunctions: this.utilityFunctions
|
|
234
239
|
}) : w({
|
|
235
240
|
...this.params,
|
|
236
241
|
workgroupSizeX: u,
|
|
@@ -239,21 +244,22 @@ class A {
|
|
|
239
244
|
inputReadSnippet: this.getReadSnippet(),
|
|
240
245
|
inputSnippet: this.getPreprocessSnippet(),
|
|
241
246
|
outputSnippet: this.getWriteSnippet(),
|
|
242
|
-
reducedSnippet: this.getPostprocessSnippet()
|
|
247
|
+
reducedSnippet: this.getPostprocessSnippet(),
|
|
248
|
+
utilityFunctions: this.utilityFunctions
|
|
243
249
|
});
|
|
244
250
|
}
|
|
245
251
|
}
|
|
246
|
-
function
|
|
247
|
-
const n = u[0],
|
|
248
|
-
let
|
|
249
|
-
!
|
|
252
|
+
function A(e, u, t, i, o) {
|
|
253
|
+
const n = u[0], a = [{ type: "int32", data: [e.inputShape[e.inputShape.length - 1]] }, ...i ?? []];
|
|
254
|
+
let r = o;
|
|
255
|
+
!o && e.atomic && (r = x(e.outputShape, "int32"));
|
|
250
256
|
const l = t.runWebGPUProgram(
|
|
251
257
|
e,
|
|
252
258
|
u,
|
|
253
259
|
e.packed ? "packedF16" : e.atomic ? "int32" : "float32",
|
|
254
|
-
|
|
255
|
-
|
|
256
|
-
), p =
|
|
260
|
+
a,
|
|
261
|
+
r
|
|
262
|
+
), p = f().makeTensorFromTensorInfo(l);
|
|
257
263
|
if (e.outputShape.length === 1 && e.outputShape[0] <= 2)
|
|
258
264
|
return p;
|
|
259
265
|
const b = g(
|
|
@@ -263,7 +269,7 @@ function W(e, u, t, i, r) {
|
|
|
263
269
|
return p.dispose(), b;
|
|
264
270
|
}
|
|
265
271
|
export {
|
|
266
|
-
|
|
267
|
-
|
|
268
|
-
|
|
272
|
+
P as ReduceProgram,
|
|
273
|
+
X as createReduceInfo,
|
|
274
|
+
A as reduce
|
|
269
275
|
};
|