@genai-fi/nanogpt 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Generator.d.ts +36 -4
- package/dist/Generator.js +183 -69
- package/dist/{RealDiv-Dy0p8Bvo.js → RealDiv-N8TpOMYv.js} +14 -14
- package/dist/{Reshape-DvudQDvJ.js → Reshape-B-lWQRnF.js} +1 -1
- package/dist/{Reshape-DH5srBP0.js → Reshape-Bo8HzP8V.js} +5 -5
- package/dist/TeachableLLM.d.ts +6 -6
- package/dist/TeachableLLM.js +51 -50
- package/dist/Trainer.d.ts +19 -3
- package/dist/Trainer.js +71 -28
- package/dist/{axis_util-BzbKo31C.js → axis_util-DubwyOhW.js} +3 -3
- package/dist/backend.js +2 -2
- package/dist/{backend_util-TE7aTPhZ.js → backend_util-BJ-_jSeK.js} +46 -46
- package/dist/{broadcast_to-CdbwV-Dj.js → broadcast_to-BYfCp5iL.js} +2 -2
- package/dist/{concat-CsxrgovM.js → concat-BmDqqFsa.js} +1 -1
- package/dist/{dataset-CtdBYwjo.js → dataset-CJmEGu6D.js} +5 -5
- package/dist/{dropout-DYs5QFGQ.js → dropout-sx0sjVAT.js} +8 -8
- package/dist/exports_initializers-DAKM8UO9.js +16 -0
- package/dist/{gather-CMMy2KEG.js → gather-C1siEkdp.js} +1 -1
- package/dist/{gelu-C-dPj6Ku.js → gelu-Bd3UBBxg.js} +1 -1
- package/dist/{gpgpu_math-DGNLNL4I.js → gpgpu_math-TFLxaLkw.js} +26 -26
- package/dist/{index-CLthM0TO.js → index-BaPo_0H8.js} +185 -185
- package/dist/{index-BoWRt-10.js → index-CUQrfsw_.js} +266 -265
- package/dist/{kernel_funcs_utils-BYKWV8Aa.js → kernel_funcs_utils-P9aFa232.js} +9 -9
- package/dist/layers/BaseLayer.d.ts +8 -13
- package/dist/layers/BaseLayer.js +25 -13
- package/dist/layers/CausalSelfAttention.d.ts +3 -2
- package/dist/layers/CausalSelfAttention.js +28 -28
- package/dist/layers/MLP.d.ts +3 -2
- package/dist/layers/MLP.js +16 -20
- package/dist/layers/PositionEmbedding.d.ts +9 -0
- package/dist/layers/PositionEmbedding.js +45 -0
- package/dist/layers/RMSNorm.d.ts +3 -2
- package/dist/layers/RMSNorm.js +6 -6
- package/dist/layers/RoPECache.d.ts +1 -1
- package/dist/layers/RoPECache.js +4 -4
- package/dist/layers/TiedEmbedding.d.ts +3 -2
- package/dist/layers/TiedEmbedding.js +29 -7
- package/dist/layers/TransformerBlock.d.ts +3 -2
- package/dist/layers/TransformerBlock.js +1 -1
- package/dist/loader/load.d.ts +2 -2
- package/dist/loader/loadHF.d.ts +2 -2
- package/dist/loader/loadTransformers.d.ts +4 -2
- package/dist/loader/loadTransformers.js +10 -9
- package/dist/loader/newZipLoad.d.ts +2 -2
- package/dist/loader/oldZipLoad.d.ts +2 -2
- package/dist/loader/oldZipLoad.js +42 -51
- package/dist/loader/save.d.ts +8 -0
- package/dist/loader/save.js +62 -0
- package/dist/{log_sum_exp-DbjkV734.js → log_sum_exp-C142qZqY.js} +14 -14
- package/dist/main.d.ts +5 -4
- package/dist/main.js +22 -18
- package/dist/{mat_mul-8m8pfdcx.js → mat_mul-DMkduNJu.js} +1 -1
- package/dist/{max-Ddnnb5xe.js → max-B3JOcNGb.js} +1 -1
- package/dist/mod-uUuj4gSb.js +27 -0
- package/dist/models/NanoGPTV1.d.ts +15 -0
- package/dist/models/NanoGPTV1.js +71 -0
- package/dist/{config.d.ts → models/config.d.ts} +1 -0
- package/dist/{config.js → models/config.js} +1 -0
- package/dist/models/factory.d.ts +3 -0
- package/dist/models/factory.js +14 -0
- package/dist/models/model.d.ts +26 -0
- package/dist/models/model.js +68 -0
- package/dist/{mulmat_packed_gpu-VSekgsNv.js → mulmat_packed_gpu-Cm2gw-c8.js} +1 -1
- package/dist/{ones-Dj0SDhHf.js → ones-ZdgQGBCP.js} +2 -2
- package/dist/ops/adamAdjust.js +1 -1
- package/dist/ops/adamMoments.js +1 -1
- package/dist/ops/appendCache.js +3 -3
- package/dist/ops/attentionMask.js +1 -1
- package/dist/ops/cpu/adamAdjust.js +9 -9
- package/dist/ops/cpu/adamMoments.js +2 -2
- package/dist/ops/cpu/appendCache.js +2 -2
- package/dist/ops/cpu/attentionMask.js +5 -5
- package/dist/ops/cpu/fusedSoftmax.js +2 -2
- package/dist/ops/cpu/gatherSub.js +3 -3
- package/dist/ops/cpu/gelu.js +1 -1
- package/dist/ops/cpu/matMulGelu.js +2 -2
- package/dist/ops/cpu/matMulMul.js +1 -1
- package/dist/ops/cpu/mulDropout.js +1 -1
- package/dist/ops/cpu/normRMS.js +1 -1
- package/dist/ops/cpu/qkv.js +3 -3
- package/dist/ops/cpu/rope.js +5 -5
- package/dist/ops/cpu/scatterSub.js +11 -11
- package/dist/ops/fusedSoftmax.js +1 -1
- package/dist/ops/gatherSub.js +1 -1
- package/dist/ops/gelu.js +2 -2
- package/dist/ops/grads/attentionMask.js +1 -1
- package/dist/ops/grads/fusedSoftmax.js +2 -2
- package/dist/ops/grads/gelu.js +2 -2
- package/dist/ops/grads/matMulGelu.js +1 -1
- package/dist/ops/grads/normRMS.js +1 -1
- package/dist/ops/grads/qkv.js +1 -1
- package/dist/ops/grads/rope.js +1 -1
- package/dist/ops/matMulGelu.js +1 -1
- package/dist/ops/matMulMul.js +1 -1
- package/dist/ops/mulDrop.js +1 -1
- package/dist/ops/normRMS.js +1 -1
- package/dist/ops/qkv.js +1 -1
- package/dist/ops/rope.js +4 -4
- package/dist/ops/scatterSub.js +1 -1
- package/dist/ops/webgl/adamAdjust.js +2 -2
- package/dist/ops/webgl/adamMoments.js +1 -1
- package/dist/ops/webgl/appendCache.js +1 -1
- package/dist/ops/webgl/attentionMask.js +1 -1
- package/dist/ops/webgl/fusedSoftmax.js +4 -4
- package/dist/ops/webgl/gatherSub.js +1 -1
- package/dist/ops/webgl/gelu.js +2 -2
- package/dist/ops/webgl/log.js +3 -3
- package/dist/ops/webgl/matMulGelu.js +10 -10
- package/dist/ops/webgl/matMulMul.js +1 -1
- package/dist/ops/webgl/mulDropout.js +1 -1
- package/dist/ops/webgl/normRMS.js +2 -2
- package/dist/ops/webgl/qkv.js +1 -1
- package/dist/ops/webgl/rope.js +1 -1
- package/dist/ops/webgl/scatterSub.js +1 -1
- package/dist/ops/webgpu/adamAdjust.js +3 -3
- package/dist/ops/webgpu/adamMoments.js +3 -3
- package/dist/ops/webgpu/appendCache.js +3 -3
- package/dist/ops/webgpu/attentionMask.js +3 -3
- package/dist/ops/webgpu/gatherSub.js +3 -3
- package/dist/ops/webgpu/gelu.js +3 -3
- package/dist/ops/webgpu/normRMS.js +2 -2
- package/dist/ops/webgpu/normRMSGrad.js +5 -5
- package/dist/ops/webgpu/qkv.js +3 -3
- package/dist/ops/webgpu/rope.js +3 -3
- package/dist/ops/webgpu/scatterSub.js +3 -3
- package/dist/ops/webgpu/utils/reductions.js +4 -4
- package/dist/{ops-BFGCx8Ri.js → ops-C_1K_-35.js} +103 -103
- package/dist/{random_width-sZORGo5k.js → random_width-D8Pwy_na.js} +136 -136
- package/dist/{range-CRuAh-gd.js → range-LVHrSLdi.js} +1 -1
- package/dist/{reciprocal-BvGAyKyu.js → reciprocal-CaR9e67G.js} +1 -1
- package/dist/{register_all_kernels-BwDSRN-f.js → register_all_kernels-DUshvVWP.js} +2026 -2049
- package/dist/{reshape-CdBq1WJ6.js → reshape-DEfQGSin.js} +1 -1
- package/dist/{scatter_nd_util-DUstGbU1.js → scatter_nd_util-CUPPNLaA.js} +1 -1
- package/dist/{selu_util-BJEXVvjX.js → selu_util-8vv5JxQV.js} +3 -3
- package/dist/{shared-B8ztnyEk.js → shared-CkNorDcU.js} +83 -83
- package/dist/{shared-wS99K7_n.js → shared-D1elLckx.js} +1 -1
- package/dist/{sin-BeA3tsEd.js → sin-D2CKKmyR.js} +1 -1
- package/dist/{slice-BiOsknYS.js → slice-BnyE-M_7.js} +1 -1
- package/dist/{softmax-Bv_6lyMX.js → softmax-DLoZWYBx.js} +1 -1
- package/dist/{split-B-dikLRw.js → split-By_n4TKP.js} +1 -1
- package/dist/{stack-B17UN2nn.js → stack-DkdFLq37.js} +1 -1
- package/dist/{sum-66ew2byf.js → sum-l_0SqM4h.js} +3 -3
- package/dist/{tensor-JwS7ZYY6.js → tensor-BAQdLqoU.js} +1 -1
- package/dist/{tensor2d-wxPAnDQy.js → tensor2d-BHy261cI.js} +1 -1
- package/dist/training/Adam.js +2 -2
- package/dist/training/AdamExt.js +1 -1
- package/dist/training/DatasetBuilder.js +2 -2
- package/dist/training/Evaluator.d.ts +2 -2
- package/dist/training/FullTrainer.d.ts +16 -3
- package/dist/training/FullTrainer.js +91 -53
- package/dist/training/Trainer.d.ts +25 -3
- package/dist/training/Trainer.js +39 -47
- package/dist/training/sparseCrossEntropy.js +9 -9
- package/dist/utilities/dummy.d.ts +4 -4
- package/dist/utilities/dummy.js +13 -13
- package/dist/utilities/multinomialCPU.js +2 -2
- package/dist/utilities/parameters.d.ts +1 -1
- package/dist/utilities/performance.js +1 -1
- package/dist/utilities/profile.js +1 -1
- package/dist/utilities/safetensors.js +2 -2
- package/dist/utilities/weights.js +2 -2
- package/dist/{variable-BuddVFLa.js → variable-C9hihzDB.js} +1 -1
- package/dist/{webgpu_program-PFzf1hAQ.js → webgpu_program-dFEVbDPL.js} +1 -1
- package/dist/{webgpu_util-D____QpY.js → webgpu_util-DLImlSc6.js} +27 -27
- package/dist/{zeros--BdLQ3oG.js → zeros-VZ72lWXM.js} +1 -1
- package/package.json +2 -3
- package/dist/NanoGPTModel.d.ts +0 -52
- package/dist/NanoGPTModel.js +0 -203
- package/dist/TiedEmbedding-BxOerUmB.js +0 -43
- package/dist/utilities/generate.d.ts +0 -3
- package/dist/utilities/generate.js +0 -22
- package/dist/utilities/save.d.ts +0 -9
- package/dist/utilities/save.js +0 -61
package/dist/layers/BaseLayer.js
CHANGED
|
@@ -1,22 +1,28 @@
|
|
|
1
|
-
import { T as
|
|
2
|
-
import { v as _ } from "../variable-
|
|
3
|
-
class
|
|
1
|
+
import { T as p, I as g, e as o, J as v } from "../index-CUQrfsw_.js";
|
|
2
|
+
import { v as _ } from "../variable-C9hihzDB.js";
|
|
3
|
+
class T {
|
|
4
4
|
parent;
|
|
5
5
|
config;
|
|
6
6
|
_variables = /* @__PURE__ */ new Map();
|
|
7
7
|
_trainable = !0;
|
|
8
8
|
children = [];
|
|
9
|
+
profiler;
|
|
9
10
|
constructor(t, r) {
|
|
10
11
|
this.config = t, this.parent = r, this.parent && this.parent.children.push(this);
|
|
11
12
|
}
|
|
12
13
|
getProfiler() {
|
|
13
|
-
return this.
|
|
14
|
+
return this.profiler;
|
|
15
|
+
}
|
|
16
|
+
setProfiler(t) {
|
|
17
|
+
this.profiler = t || void 0, this.children.forEach((r) => {
|
|
18
|
+
r.setProfiler(t);
|
|
19
|
+
});
|
|
14
20
|
}
|
|
15
21
|
startMemory() {
|
|
16
|
-
this.
|
|
22
|
+
this.profiler?.startMemory();
|
|
17
23
|
}
|
|
18
24
|
endMemory(t) {
|
|
19
|
-
this.
|
|
25
|
+
this.profiler?.endMemory(t);
|
|
20
26
|
}
|
|
21
27
|
addVariable(t, r) {
|
|
22
28
|
this._variables.set(t, r || null);
|
|
@@ -41,11 +47,17 @@ class M {
|
|
|
41
47
|
r.trainable = t;
|
|
42
48
|
});
|
|
43
49
|
}
|
|
44
|
-
getVariable(t) {
|
|
45
|
-
const
|
|
46
|
-
if (!r)
|
|
50
|
+
getVariable(t, r = !1) {
|
|
51
|
+
const e = this._variables.get(t);
|
|
52
|
+
if (!e && r)
|
|
53
|
+
for (const i of this.children) {
|
|
54
|
+
const s = i.getVariable(t, !0);
|
|
55
|
+
if (s)
|
|
56
|
+
return s;
|
|
57
|
+
}
|
|
58
|
+
if (!e)
|
|
47
59
|
throw new Error(`Variable ${t} not found`);
|
|
48
|
-
return
|
|
60
|
+
return e;
|
|
49
61
|
}
|
|
50
62
|
hasVariable(t) {
|
|
51
63
|
return this._variables.get(t) !== null;
|
|
@@ -85,7 +97,7 @@ class M {
|
|
|
85
97
|
call(t, ...r) {
|
|
86
98
|
this.build();
|
|
87
99
|
const e = this.forward(t, ...r);
|
|
88
|
-
if (t.training && e instanceof
|
|
100
|
+
if (t.training && e instanceof p) {
|
|
89
101
|
const i = this.dropout(e);
|
|
90
102
|
return i !== e && e.dispose(), i;
|
|
91
103
|
} else
|
|
@@ -95,7 +107,7 @@ class M {
|
|
|
95
107
|
return this.build(), this.checkpointingFn(t, ...r);
|
|
96
108
|
}
|
|
97
109
|
checkpointingFn(t, ...r) {
|
|
98
|
-
const e = this.trainableVariables, s =
|
|
110
|
+
const e = this.trainableVariables, s = g((...a) => {
|
|
99
111
|
const l = a[a.length - 1], n = a.slice(0, r.length), h = this.forward(t, ...n);
|
|
100
112
|
return l(n), { value: h, gradFunc: (c, f) => {
|
|
101
113
|
const u = o().state.activeTape;
|
|
@@ -112,5 +124,5 @@ class M {
|
|
|
112
124
|
}
|
|
113
125
|
}
|
|
114
126
|
export {
|
|
115
|
-
|
|
127
|
+
T as default
|
|
116
128
|
};
|
|
@@ -1,5 +1,6 @@
|
|
|
1
|
-
import { default as BaseLayer, ForwardAttributes
|
|
1
|
+
import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
|
|
2
2
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
3
|
+
import { GPTConfig } from '../models/config';
|
|
3
4
|
export type KVCache = {
|
|
4
5
|
k?: Tensor;
|
|
5
6
|
v?: Tensor;
|
|
@@ -22,7 +23,7 @@ export default class CausalSelfAttention extends BaseLayer<AttentionForwardAttri
|
|
|
22
23
|
private projUnits;
|
|
23
24
|
private ATTN;
|
|
24
25
|
private PROJ;
|
|
25
|
-
constructor(index: number, config:
|
|
26
|
+
constructor(index: number, config: GPTConfig, parent?: BaseLayer);
|
|
26
27
|
protected build(): void;
|
|
27
28
|
private getAttentionScores;
|
|
28
29
|
private getAttentionScoresWithPast;
|
|
@@ -3,14 +3,14 @@ import O from "./BaseLayer.js";
|
|
|
3
3
|
import { qkv as P } from "../ops/qkv.js";
|
|
4
4
|
import { rope as v } from "../ops/rope.js";
|
|
5
5
|
import { appendCache as V } from "../ops/appendCache.js";
|
|
6
|
-
import {
|
|
6
|
+
import { k as c, t as C } from "../index-CUQrfsw_.js";
|
|
7
7
|
import { fusedSoftmax as T } from "../ops/fusedSoftmax.js";
|
|
8
|
-
import { d as
|
|
9
|
-
import { v as b } from "../variable-
|
|
10
|
-
import { r as k, d as
|
|
11
|
-
import { r as N } from "../reshape-
|
|
12
|
-
import { m as R } from "../mat_mul-
|
|
13
|
-
class
|
|
8
|
+
import { d as L } from "../random_width-D8Pwy_na.js";
|
|
9
|
+
import { v as b } from "../variable-C9hihzDB.js";
|
|
10
|
+
import { r as k, d as y } from "../dropout-sx0sjVAT.js";
|
|
11
|
+
import { r as N } from "../reshape-DEfQGSin.js";
|
|
12
|
+
import { m as R } from "../mat_mul-DMkduNJu.js";
|
|
13
|
+
class $ extends O {
|
|
14
14
|
divisor;
|
|
15
15
|
index;
|
|
16
16
|
units;
|
|
@@ -18,27 +18,27 @@ class W extends O {
|
|
|
18
18
|
ATTN;
|
|
19
19
|
PROJ;
|
|
20
20
|
constructor(t, i, s) {
|
|
21
|
-
super(i, s), this.index = t, this.units = i.
|
|
21
|
+
super(i, s), this.index = t, this.units = i.nEmbed * 3, this.projUnits = i.nEmbed, this.ATTN = `block_${this.index}_cAttn`, this.PROJ = `block_${this.index}_cProj`, this.addVariable(this.ATTN), this.addVariable(this.PROJ), this.divisor = 1 / Math.sqrt(i.nEmbed / i.nHead);
|
|
22
22
|
}
|
|
23
23
|
build() {
|
|
24
24
|
this.hasVariable(this.ATTN) === !1 && this.setVariable(
|
|
25
25
|
this.ATTN,
|
|
26
26
|
b(
|
|
27
|
-
k([this.config.
|
|
27
|
+
k([this.config.nEmbed, this.units], 0, 0.02),
|
|
28
28
|
!0
|
|
29
29
|
//`block_${this.index}_attn_cAttn_kernel`
|
|
30
30
|
)
|
|
31
31
|
), this.hasVariable(this.PROJ) === !1 && this.setVariable(
|
|
32
32
|
this.PROJ,
|
|
33
33
|
b(
|
|
34
|
-
k([this.projUnits, this.config.
|
|
34
|
+
k([this.projUnits, this.config.nEmbed], 0, 0.02),
|
|
35
35
|
!0
|
|
36
36
|
//`block_${this.index}_attn_cProj_kernel`
|
|
37
37
|
)
|
|
38
38
|
);
|
|
39
39
|
}
|
|
40
40
|
getAttentionScores(t, i, s, o) {
|
|
41
|
-
const e = g(t, i, this.divisor), n = T(e, s ? this.config.
|
|
41
|
+
const e = g(t, i, this.divisor), n = T(e, s ? this.config.dropout : 0, o);
|
|
42
42
|
return e.dispose(), n;
|
|
43
43
|
}
|
|
44
44
|
// Attention with optional past. If pastLen > 0 and T_cur == 1, no mask needed.
|
|
@@ -47,50 +47,50 @@ class W extends O {
|
|
|
47
47
|
return o.dispose(), e;
|
|
48
48
|
}
|
|
49
49
|
getQKV(t) {
|
|
50
|
-
return P(t, this.getVariable(this.ATTN), this.config.
|
|
50
|
+
return P(t, this.getVariable(this.ATTN), this.config.nHead);
|
|
51
51
|
}
|
|
52
52
|
getOutputProjection(t) {
|
|
53
|
-
const i = t.shape[0], s = t.shape[2], o = this.config.
|
|
54
|
-
return n.dispose(), e.dispose(),
|
|
53
|
+
const i = t.shape[0], s = t.shape[2], o = this.config.nEmbed, e = t.transpose([0, 2, 1, 3]), n = N(e, [i, s, o]), r = L(n, this.getVariable(this.PROJ));
|
|
54
|
+
return n.dispose(), e.dispose(), r;
|
|
55
55
|
}
|
|
56
56
|
updateCache(t, i, s) {
|
|
57
|
-
const o = this.config.
|
|
57
|
+
const o = this.config.blockSize, e = t.shape[2], n = s.length || 0, r = V(t, o, n, s.k);
|
|
58
58
|
t.dispose(), s.k && s.k.dispose();
|
|
59
|
-
const
|
|
59
|
+
const p = V(i, o, n, s.v);
|
|
60
60
|
i.dispose(), s.v && s.v.dispose();
|
|
61
61
|
const d = Math.min(n + e, o), h = s.cumulativeLength + e;
|
|
62
|
-
s.length = d, s.cumulativeLength = h, s.k = c(
|
|
62
|
+
s.length = d, s.cumulativeLength = h, s.k = c(r), s.v = c(p);
|
|
63
63
|
}
|
|
64
64
|
forward(t, i) {
|
|
65
65
|
return C(() => {
|
|
66
66
|
this.startMemory();
|
|
67
|
-
const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0,
|
|
68
|
-
|
|
67
|
+
const [s, o, e] = this.getQKV(i), n = t.pastKV ? t.pastKV.cumulativeLength : 0, r = t.ropeCache, p = r ? v(s, r, n) : s, d = r ? v(o, r, n) : o;
|
|
68
|
+
r && (s.dispose(), o.dispose());
|
|
69
69
|
const h = t.pastKV ? t.pastKV.length : 0;
|
|
70
70
|
t.pastKV && !t.training && this.updateCache(d, e, t.pastKV);
|
|
71
71
|
const u = t.pastKV?.k ? t.pastKV.k : d, m = t.pastKV?.v ? t.pastKV.v : e;
|
|
72
|
-
let
|
|
73
|
-
h > 0 ?
|
|
74
|
-
const l = R(
|
|
75
|
-
f ||
|
|
72
|
+
let a;
|
|
73
|
+
h > 0 ? a = this.getAttentionScoresWithPast(p, u, h) : a = this.getAttentionScores(p, u, t.training, t.seed || 0), p.dispose(), t.pastKV || u.dispose();
|
|
74
|
+
const l = R(a, m), f = t.attentionScores !== void 0 && t.attentionScores.attentionOut !== void 0;
|
|
75
|
+
f || a.dispose(), t.pastKV || m.dispose();
|
|
76
76
|
const A = this.getOutputProjection(l);
|
|
77
77
|
if (l.dispose(), f && t.attentionScores && t.attentionScores.attentionOut !== void 0) {
|
|
78
|
-
const K =
|
|
78
|
+
const K = a.shape[1], S = a.shape[2];
|
|
79
79
|
t.attentionScores.attentionOut?.push(
|
|
80
|
-
c(
|
|
80
|
+
c(a.slice([0, 0, 0, 0], [1, -1, -1, -1]).reshape([K, S, -1]))
|
|
81
81
|
);
|
|
82
82
|
}
|
|
83
83
|
return this.endMemory("CausalSelfAttention"), A;
|
|
84
84
|
});
|
|
85
85
|
}
|
|
86
86
|
dropout(t) {
|
|
87
|
-
if (this.config.
|
|
88
|
-
const i =
|
|
87
|
+
if (this.config.dropout > 0) {
|
|
88
|
+
const i = y(t, this.config.dropout);
|
|
89
89
|
return t.dispose(), i;
|
|
90
90
|
} else
|
|
91
91
|
return t;
|
|
92
92
|
}
|
|
93
93
|
}
|
|
94
94
|
export {
|
|
95
|
-
|
|
95
|
+
$ as default
|
|
96
96
|
};
|
package/dist/layers/MLP.d.ts
CHANGED
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
-
import { default as BaseLayer, ForwardAttributes
|
|
2
|
+
import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
|
|
3
|
+
import { GPTConfig } from '../main';
|
|
3
4
|
export default class MLP extends BaseLayer {
|
|
4
5
|
private index;
|
|
5
6
|
private hiddenUnits;
|
|
6
7
|
private MLPHIDDEN;
|
|
7
8
|
private MLPOUT;
|
|
8
|
-
constructor(index: number, config:
|
|
9
|
+
constructor(index: number, config: GPTConfig, parent?: BaseLayer);
|
|
9
10
|
protected build(): void;
|
|
10
11
|
forward(_: ForwardAttributes, x: Tensor): Tensor;
|
|
11
12
|
protected dropout(x: Tensor): Tensor;
|
package/dist/layers/MLP.js
CHANGED
|
@@ -1,56 +1,52 @@
|
|
|
1
|
-
import { t as
|
|
1
|
+
import { t as p } from "../index-CUQrfsw_.js";
|
|
2
2
|
import u from "./BaseLayer.js";
|
|
3
3
|
import { matMulGelu as M } from "../ops/matMulGelu.js";
|
|
4
|
-
import { v as o } from "../variable-
|
|
5
|
-
import { r as h, d as f } from "../dropout-
|
|
6
|
-
import { r as d } from "../reshape-
|
|
7
|
-
import { m as c } from "../mat_mul-
|
|
8
|
-
class
|
|
4
|
+
import { v as o } from "../variable-C9hihzDB.js";
|
|
5
|
+
import { r as h, d as f } from "../dropout-sx0sjVAT.js";
|
|
6
|
+
import { r as d } from "../reshape-DEfQGSin.js";
|
|
7
|
+
import { m as c } from "../mat_mul-DMkduNJu.js";
|
|
8
|
+
class H extends u {
|
|
9
9
|
index;
|
|
10
10
|
hiddenUnits;
|
|
11
11
|
MLPHIDDEN;
|
|
12
12
|
MLPOUT;
|
|
13
13
|
constructor(i, t, s) {
|
|
14
|
-
super(t, s), this.index = i, this.hiddenUnits = t.
|
|
14
|
+
super(t, s), this.index = i, this.hiddenUnits = t.mlpFactor * t.nEmbed, this.MLPHIDDEN = `block_${this.index}_mlpHidden`, this.MLPOUT = `block_${this.index}_mlpOut`, this.addVariable(this.MLPHIDDEN), this.addVariable(this.MLPOUT);
|
|
15
15
|
}
|
|
16
16
|
build() {
|
|
17
17
|
this.hasVariable(this.MLPHIDDEN) === !1 && this.setVariable(
|
|
18
18
|
this.MLPHIDDEN,
|
|
19
19
|
o(
|
|
20
|
-
h([this.config.
|
|
20
|
+
h([this.config.nEmbed, this.hiddenUnits], 0, 0.02),
|
|
21
21
|
!0
|
|
22
22
|
//`block_${this.index}_attn_cAttn_kernel`
|
|
23
23
|
)
|
|
24
24
|
), this.hasVariable(this.MLPOUT) === !1 && this.setVariable(
|
|
25
25
|
this.MLPOUT,
|
|
26
26
|
o(
|
|
27
|
-
h(
|
|
28
|
-
[this.hiddenUnits, this.config.gpt.nEmbed],
|
|
29
|
-
0,
|
|
30
|
-
0.02 / Math.sqrt(2 * this.config.gpt.nLayer)
|
|
31
|
-
),
|
|
27
|
+
h([this.hiddenUnits, this.config.nEmbed], 0, 0.02 / Math.sqrt(2 * this.config.nLayer)),
|
|
32
28
|
!0
|
|
33
29
|
//`block_${this.index}_attn_cProj_kernel`
|
|
34
30
|
)
|
|
35
31
|
);
|
|
36
32
|
}
|
|
37
33
|
forward(i, t) {
|
|
38
|
-
return
|
|
34
|
+
return p(() => {
|
|
39
35
|
this.startMemory();
|
|
40
|
-
const [s, r, e] = t.shape, n = d(t, [s * r, e]), a = M(n, this.getVariable(this.MLPHIDDEN)),
|
|
36
|
+
const [s, r, e] = t.shape, n = d(t, [s * r, e]), a = M(n, this.getVariable(this.MLPHIDDEN)), m = c(a, this.getVariable(this.MLPOUT));
|
|
41
37
|
a.dispose();
|
|
42
|
-
const
|
|
43
|
-
return this.endMemory("MLP"),
|
|
38
|
+
const l = d(m, [s, r, e]);
|
|
39
|
+
return this.endMemory("MLP"), l;
|
|
44
40
|
});
|
|
45
41
|
}
|
|
46
42
|
dropout(i) {
|
|
47
|
-
if (this.config.
|
|
48
|
-
const t = f(i, this.config.
|
|
43
|
+
if (this.config.dropout > 0) {
|
|
44
|
+
const t = f(i, this.config.dropout);
|
|
49
45
|
return i.dispose(), t;
|
|
50
46
|
}
|
|
51
47
|
return i;
|
|
52
48
|
}
|
|
53
49
|
}
|
|
54
50
|
export {
|
|
55
|
-
|
|
51
|
+
H as default
|
|
56
52
|
};
|
|
@@ -0,0 +1,9 @@
|
|
|
1
|
+
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
+
import { default as BaseLayer } from './BaseLayer';
|
|
3
|
+
import { GPTConfig, ModelForwardAttributes } from '../main';
|
|
4
|
+
export default class PositionEmbedding extends BaseLayer {
|
|
5
|
+
private wpe?;
|
|
6
|
+
private drop;
|
|
7
|
+
constructor(config: GPTConfig, name?: string, parent?: BaseLayer);
|
|
8
|
+
forward(attrs: ModelForwardAttributes, x: Tensor): Tensor;
|
|
9
|
+
}
|
|
@@ -0,0 +1,45 @@
|
|
|
1
|
+
import { t as c, a8 as u, b as i } from "../index-CUQrfsw_.js";
|
|
2
|
+
import f from "./BaseLayer.js";
|
|
3
|
+
import { E as g, D as h } from "../random_width-D8Pwy_na.js";
|
|
4
|
+
import { r as b } from "../exports_initializers-DAKM8UO9.js";
|
|
5
|
+
import { m as l } from "../mod-uUuj4gSb.js";
|
|
6
|
+
import { r as w } from "../range-LVHrSLdi.js";
|
|
7
|
+
/**
|
|
8
|
+
* @license
|
|
9
|
+
* Copyright 2018 Google LLC
|
|
10
|
+
*
|
|
11
|
+
* Use of this source code is governed by an MIT-style
|
|
12
|
+
* license that can be found in the LICENSE file or at
|
|
13
|
+
* https://opensource.org/licenses/MIT.
|
|
14
|
+
* =============================================================================
|
|
15
|
+
*/
|
|
16
|
+
function E(t) {
|
|
17
|
+
return new h(t);
|
|
18
|
+
}
|
|
19
|
+
function x(t) {
|
|
20
|
+
return new g(t);
|
|
21
|
+
}
|
|
22
|
+
class q extends f {
|
|
23
|
+
wpe;
|
|
24
|
+
// Position embeddings
|
|
25
|
+
drop;
|
|
26
|
+
// Dropout
|
|
27
|
+
constructor(o, n = "", r) {
|
|
28
|
+
super(o, r), this.wpe = x({
|
|
29
|
+
inputDim: this.config.blockSize,
|
|
30
|
+
outputDim: this.config.nEmbed,
|
|
31
|
+
name: n,
|
|
32
|
+
embeddingsInitializer: b({ mean: 0, stddev: 0.02 })
|
|
33
|
+
}), this.drop = E({ rate: this.config.dropout });
|
|
34
|
+
}
|
|
35
|
+
forward(o, n) {
|
|
36
|
+
const r = o.cache?.[0]?.length ?? 0;
|
|
37
|
+
return c(() => {
|
|
38
|
+
const [, s] = n.shape, e = this.config.blockSize, a = w(0, s, 1, "int32"), m = l(u(a, i(r, "int32")), i(e, "int32")), d = this.wpe.apply(m), p = n.add(d);
|
|
39
|
+
return this.drop.apply(p, { training: o.training });
|
|
40
|
+
});
|
|
41
|
+
}
|
|
42
|
+
}
|
|
43
|
+
export {
|
|
44
|
+
q as default
|
|
45
|
+
};
|
package/dist/layers/RMSNorm.d.ts
CHANGED
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
-
import { default as BaseLayer, ForwardAttributes
|
|
2
|
+
import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
|
|
3
|
+
import { GPTConfig } from '../main';
|
|
3
4
|
export default class RMSNorm extends BaseLayer {
|
|
4
5
|
private GAMMA;
|
|
5
|
-
constructor(config:
|
|
6
|
+
constructor(config: GPTConfig, name?: string, parent?: BaseLayer);
|
|
6
7
|
forward(_: ForwardAttributes, x: Tensor): Tensor;
|
|
7
8
|
}
|
package/dist/layers/RMSNorm.js
CHANGED
|
@@ -1,12 +1,12 @@
|
|
|
1
|
-
import { t as s } from "../index-
|
|
1
|
+
import { t as s } from "../index-CUQrfsw_.js";
|
|
2
2
|
import e from "./BaseLayer.js";
|
|
3
3
|
import { normRMS as a } from "../ops/normRMS.js";
|
|
4
|
-
import { v as i } from "../variable-
|
|
5
|
-
import { o as m } from "../ones-
|
|
6
|
-
class
|
|
4
|
+
import { v as i } from "../variable-C9hihzDB.js";
|
|
5
|
+
import { o as m } from "../ones-ZdgQGBCP.js";
|
|
6
|
+
class l extends e {
|
|
7
7
|
GAMMA;
|
|
8
8
|
constructor(r, t = "", o) {
|
|
9
|
-
super(r, o), this.GAMMA = t, this.addVariable(this.GAMMA, i(m([r.
|
|
9
|
+
super(r, o), this.GAMMA = t, this.addVariable(this.GAMMA, i(m([r.nEmbed]), !0, this.GAMMA, "float32"));
|
|
10
10
|
}
|
|
11
11
|
forward(r, t) {
|
|
12
12
|
return s(() => {
|
|
@@ -17,5 +17,5 @@ class f extends e {
|
|
|
17
17
|
}
|
|
18
18
|
}
|
|
19
19
|
export {
|
|
20
|
-
|
|
20
|
+
l as default
|
|
21
21
|
};
|
package/dist/layers/RoPECache.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { b as t, x as h, t as n,
|
|
2
|
-
import { r as c } from "../reciprocal-
|
|
3
|
-
import { c as f, s as m } from "../sin-
|
|
4
|
-
import { r as a } from "../range-
|
|
1
|
+
import { b as t, x as h, t as n, k as p } from "../index-CUQrfsw_.js";
|
|
2
|
+
import { r as c } from "../reciprocal-CaR9e67G.js";
|
|
3
|
+
import { c as f, s as m } from "../sin-D2CKKmyR.js";
|
|
4
|
+
import { r as a } from "../range-LVHrSLdi.js";
|
|
5
5
|
class D {
|
|
6
6
|
constructor(o) {
|
|
7
7
|
this.config = o;
|
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
-
import { default as BaseLayer, ForwardAttributes
|
|
2
|
+
import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
|
|
3
|
+
import { GPTConfig } from '../models/config';
|
|
3
4
|
export default class TiedEmbeddingOutputLayer extends BaseLayer {
|
|
4
5
|
private vocabSize;
|
|
5
6
|
private embedDim;
|
|
6
7
|
private initializer;
|
|
7
8
|
private WEIGHTS;
|
|
8
|
-
constructor(config:
|
|
9
|
+
constructor(config: GPTConfig, name: string, parent?: BaseLayer);
|
|
9
10
|
embed(inputs: Tensor): Tensor;
|
|
10
11
|
project(inputs: Tensor): Tensor;
|
|
11
12
|
forward(_: ForwardAttributes, x: Tensor): Tensor;
|
|
@@ -1,9 +1,31 @@
|
|
|
1
|
-
import "../random_width-
|
|
2
|
-
import "../index-
|
|
3
|
-
import {
|
|
4
|
-
import "./BaseLayer.js";
|
|
5
|
-
import "../variable-
|
|
6
|
-
import "../gather-
|
|
1
|
+
import { d as r } from "../random_width-D8Pwy_na.js";
|
|
2
|
+
import "../index-CUQrfsw_.js";
|
|
3
|
+
import { r as a } from "../exports_initializers-DAKM8UO9.js";
|
|
4
|
+
import s from "./BaseLayer.js";
|
|
5
|
+
import { v as m } from "../variable-C9hihzDB.js";
|
|
6
|
+
import { g as o } from "../gather-C1siEkdp.js";
|
|
7
|
+
class S extends s {
|
|
8
|
+
vocabSize;
|
|
9
|
+
embedDim;
|
|
10
|
+
initializer;
|
|
11
|
+
WEIGHTS;
|
|
12
|
+
constructor(i, e, t) {
|
|
13
|
+
super(i, t), this.WEIGHTS = e, this.vocabSize = i.vocabSize, this.embedDim = i.nEmbed, this.initializer = a({
|
|
14
|
+
mean: 0,
|
|
15
|
+
stddev: 0.02
|
|
16
|
+
}), this.addVariable(this.WEIGHTS, m(this.initializer.apply([this.vocabSize, this.embedDim]), !0));
|
|
17
|
+
}
|
|
18
|
+
embed(i) {
|
|
19
|
+
return o(this.getVariable(this.WEIGHTS), i, 0);
|
|
20
|
+
}
|
|
21
|
+
project(i) {
|
|
22
|
+
return r(i, this.getVariable(this.WEIGHTS).transpose());
|
|
23
|
+
}
|
|
24
|
+
// Dummy, should not be used.
|
|
25
|
+
forward(i, e) {
|
|
26
|
+
return this.project(e);
|
|
27
|
+
}
|
|
28
|
+
}
|
|
7
29
|
export {
|
|
8
|
-
|
|
30
|
+
S as default
|
|
9
31
|
};
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { AttentionScores, KVCache } from './CausalSelfAttention';
|
|
2
|
-
import { default as BaseLayer, ForwardAttributes
|
|
2
|
+
import { default as BaseLayer, ForwardAttributes } from './BaseLayer';
|
|
3
3
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
4
|
+
import { GPTConfig } from '../models/config';
|
|
4
5
|
interface BlockAttributes extends ForwardAttributes {
|
|
5
6
|
pastKV?: KVCache;
|
|
6
7
|
seed?: number;
|
|
@@ -13,7 +14,7 @@ export default class Block extends BaseLayer<BlockAttributes> {
|
|
|
13
14
|
private mlp;
|
|
14
15
|
private index;
|
|
15
16
|
skipped: boolean;
|
|
16
|
-
constructor(index: number, config:
|
|
17
|
+
constructor(index: number, config: GPTConfig, parent?: BaseLayer);
|
|
17
18
|
private getMLPOutput;
|
|
18
19
|
forward(attrs: BlockAttributes, x: Tensor): Tensor;
|
|
19
20
|
dispose(): void;
|
|
@@ -2,7 +2,7 @@ import l from "./CausalSelfAttention.js";
|
|
|
2
2
|
import r from "./MLP.js";
|
|
3
3
|
import o from "./RMSNorm.js";
|
|
4
4
|
import d from "./BaseLayer.js";
|
|
5
|
-
import { t as p } from "../index-
|
|
5
|
+
import { t as p } from "../index-CUQrfsw_.js";
|
|
6
6
|
class k extends d {
|
|
7
7
|
ln1;
|
|
8
8
|
attn;
|
package/dist/loader/load.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { default as NanoGPT } from '../NanoGPTModel';
|
|
2
1
|
import { ITokeniser } from '../tokeniser/type';
|
|
2
|
+
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
3
3
|
export declare const VERSION = 2;
|
|
4
4
|
export interface Metadata {
|
|
5
5
|
version: string;
|
|
@@ -7,7 +7,7 @@ export interface Metadata {
|
|
|
7
7
|
name?: string;
|
|
8
8
|
}
|
|
9
9
|
export declare function loadModel(data: Blob | Buffer | string): Promise<{
|
|
10
|
-
model:
|
|
10
|
+
model: Model<ModelForwardAttributes>;
|
|
11
11
|
tokeniser: ITokeniser;
|
|
12
12
|
name?: string;
|
|
13
13
|
}>;
|
package/dist/loader/loadHF.d.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
|
-
import { default as NanoGPT } from '../NanoGPTModel';
|
|
2
1
|
import { ITokeniser } from '../main';
|
|
2
|
+
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
3
3
|
export default function loadHuggingFace(name: string): Promise<{
|
|
4
|
-
model:
|
|
4
|
+
model: Model<ModelForwardAttributes>;
|
|
5
5
|
tokeniser: ITokeniser;
|
|
6
6
|
name?: string;
|
|
7
7
|
}>;
|
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import { ITokeniser } from '../tokeniser/type';
|
|
2
|
-
import { default as
|
|
2
|
+
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
3
|
+
import { TrainingState } from '../training/Trainer';
|
|
3
4
|
export interface TransformersConfig {
|
|
4
5
|
model_type: string;
|
|
5
6
|
vocab_size: number;
|
|
@@ -22,10 +23,11 @@ export interface TransformersMetadata {
|
|
|
22
23
|
name?: string;
|
|
23
24
|
version: number;
|
|
24
25
|
application: string;
|
|
26
|
+
training?: TrainingState;
|
|
25
27
|
[key: string]: unknown;
|
|
26
28
|
}
|
|
27
29
|
export default function loadTransformers(config: TransformersConfig, tokeniser: TransformersTokeniser, metadata: TransformersMetadata, weightData: ArrayBuffer): Promise<{
|
|
28
|
-
model:
|
|
30
|
+
model: Model<ModelForwardAttributes>;
|
|
29
31
|
tokeniser: ITokeniser;
|
|
30
32
|
name?: string;
|
|
31
33
|
}>;
|
|
@@ -1,11 +1,12 @@
|
|
|
1
|
-
import
|
|
2
|
-
import c from "../tokeniser/
|
|
3
|
-
import
|
|
4
|
-
import {
|
|
5
|
-
import {
|
|
6
|
-
import
|
|
1
|
+
import l from "../tokeniser/CharTokeniser.js";
|
|
2
|
+
import c from "../tokeniser/bpe.js";
|
|
3
|
+
import { load_safetensors as b } from "../utilities/safetensors.js";
|
|
4
|
+
import { a1 as y } from "../index-CUQrfsw_.js";
|
|
5
|
+
import { dummyPassAsync as u } from "../utilities/dummy.js";
|
|
6
|
+
import _ from "../models/factory.js";
|
|
7
7
|
async function L(e, a, r, t) {
|
|
8
8
|
const n = {
|
|
9
|
+
modelType: e.model_type || "GenAI_NanoGPT_v1",
|
|
9
10
|
vocabSize: e.vocab_size,
|
|
10
11
|
blockSize: e.block_size,
|
|
11
12
|
nLayer: e.num_hidden_layers,
|
|
@@ -16,12 +17,12 @@ async function L(e, a, r, t) {
|
|
|
16
17
|
biasInLayerNorm: e.biasInLayerNorm,
|
|
17
18
|
mlpFactor: e.mlpFactor,
|
|
18
19
|
useRope: e.useRope
|
|
19
|
-
}, m = (a.type ?? "char") === "char" ? new
|
|
20
|
+
}, m = (a.type ?? "char") === "char" ? new l(a.vocab) : new c(a.vocab, a.merges), i = await b(t), s = /* @__PURE__ */ new Map();
|
|
20
21
|
for (const [p, d] of Object.entries(i))
|
|
21
22
|
s.set(p, [d]);
|
|
22
23
|
y();
|
|
23
|
-
const o =
|
|
24
|
-
return await
|
|
24
|
+
const o = _(n);
|
|
25
|
+
return await u(o), o.loadWeights(s), { model: o, tokeniser: m, name: r.name };
|
|
25
26
|
}
|
|
26
27
|
export {
|
|
27
28
|
L as default
|
|
@@ -1,8 +1,8 @@
|
|
|
1
1
|
import { ITokeniser } from '../main';
|
|
2
|
-
import { default as NanoGPT } from '../NanoGPTModel';
|
|
3
2
|
import { default as zip } from 'jszip';
|
|
3
|
+
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
4
4
|
export default function loadZipFile(zipFile: zip): Promise<{
|
|
5
|
-
model:
|
|
5
|
+
model: Model<ModelForwardAttributes>;
|
|
6
6
|
tokeniser: ITokeniser;
|
|
7
7
|
name?: string;
|
|
8
8
|
}>;
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { default as zip } from 'jszip';
|
|
2
2
|
import { ITokeniser } from '../main';
|
|
3
|
-
import { default as
|
|
3
|
+
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
4
4
|
export default function loadOldModel(zipFile: zip): Promise<{
|
|
5
|
-
model:
|
|
5
|
+
model: Model<ModelForwardAttributes>;
|
|
6
6
|
tokeniser: ITokeniser;
|
|
7
7
|
}>;
|