@genai-fi/nanogpt 0.13.0 → 0.13.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/TeachableLLM.js +2 -2
- package/dist/main.d.ts +2 -0
- package/dist/main.js +34 -30
- package/dist/training/BasicTrainer.js +36 -36
- package/dist/training/Evaluator.d.ts +8 -3
- package/dist/training/Evaluator.js +41 -17
- package/dist/training/SFTDatasetBuilder.d.ts +5 -1
- package/dist/training/SFTDatasetBuilder.js +32 -28
- package/dist/training/loss.d.ts +1 -1
- package/dist/training/loss.js +4 -4
- package/dist/training/sparseCrossEntropy.d.ts +2 -2
- package/dist/training/sparseCrossEntropy.js +35 -31
- package/package.json +1 -1
package/dist/TeachableLLM.js
CHANGED
|
@@ -36,10 +36,10 @@ import c from "./tokeniser/CharTokeniser.js";
|
|
|
36
36
|
import g from "./tokeniser/bpe.js";
|
|
37
37
|
import "./papaparse.min-C0cScC2i.js";
|
|
38
38
|
import "./jszip.min-Bz5-11Bk.js";
|
|
39
|
-
import "./ops/cpu/scatterSub.js";
|
|
40
|
-
import "./ops/webgl/scatterSub.js";
|
|
41
39
|
import "./ops/cpu/gatherSub.js";
|
|
42
40
|
import "./ops/webgl/gatherSub.js";
|
|
41
|
+
import "./ops/cpu/scatterSub.js";
|
|
42
|
+
import "./ops/webgl/scatterSub.js";
|
|
43
43
|
import "./ops/cpu/matMulGelu.js";
|
|
44
44
|
import "./matMulGelu-CoUYwB2k.js";
|
|
45
45
|
import "./ops/grads/matMulGelu.js";
|
package/dist/main.d.ts
CHANGED
|
@@ -14,7 +14,9 @@ export { default as BPETokeniser } from './tokeniser/bpe';
|
|
|
14
14
|
export { default as waitForModel } from './utilities/waitForModel';
|
|
15
15
|
export { default as loadTextData } from './data/textLoader';
|
|
16
16
|
export { default as Generator } from './Generator';
|
|
17
|
+
export { default as Evaluator } from './training/Evaluator';
|
|
17
18
|
export type { ITrainerOptions } from './Trainer';
|
|
19
|
+
export { default as Trainer } from './Trainer';
|
|
18
20
|
export type { IGenerateOptions } from './Generator';
|
|
19
21
|
export { type ModelForwardAttributes, default as Model } from './models/model';
|
|
20
22
|
export type { ITokeniser, Conversation, Roles } from './tokeniser/type';
|
package/dist/main.js
CHANGED
|
@@ -6,14 +6,16 @@ import "./index-Cp39cXWe.js";
|
|
|
6
6
|
import "./dataset-BMe3pbsL.js";
|
|
7
7
|
import { default as fo } from "./models/NanoGPTV1.js";
|
|
8
8
|
import { default as lo } from "./TeachableLLM.js";
|
|
9
|
-
import { default as
|
|
9
|
+
import { default as uo } from "./tokeniser/CharTokeniser.js";
|
|
10
10
|
import { default as ko } from "./tokeniser/bpe.js";
|
|
11
11
|
import { default as go } from "./utilities/waitForModel.js";
|
|
12
12
|
import { default as Co } from "./data/textLoader.js";
|
|
13
|
-
import { default as
|
|
14
|
-
import {
|
|
15
|
-
import { default as
|
|
16
|
-
import {
|
|
13
|
+
import { default as Mo } from "./training/Evaluator.js";
|
|
14
|
+
import { default as Bo } from "./Trainer.js";
|
|
15
|
+
import { default as vo } from "./models/model.js";
|
|
16
|
+
import { estimateMemoryUsage as Lo, estimateParameterCount as So, estimateResources as Ao, estimateTrainingMemoryUsage as Fo, validateConfig as Go } from "./utilities/parameters.js";
|
|
17
|
+
import { default as Uo } from "./utilities/topP.js";
|
|
18
|
+
import { Task as Do, tokensFromTasks as No } from "./training/tasks/Task.js";
|
|
17
19
|
import o from "./training/tasks/PretrainingTask.js";
|
|
18
20
|
import r from "./training/tasks/StartSentenceTask.js";
|
|
19
21
|
import t from "./training/tasks/ConversationTask.js";
|
|
@@ -52,15 +54,15 @@ import "./matMul16-CH8D42Kx.js";
|
|
|
52
54
|
import "./ops/webgl/matMul16.js";
|
|
53
55
|
import "./ops/cpu/matMul16.js";
|
|
54
56
|
import "./ops/transpose16.js";
|
|
55
|
-
import { selectBackend as
|
|
56
|
-
import { default as
|
|
57
|
-
import
|
|
58
|
-
import
|
|
57
|
+
import { selectBackend as qo } from "./backend.js";
|
|
58
|
+
import { default as Ho } from "./utilities/performance.js";
|
|
59
|
+
import a from "./layers/CausalSelfAttention.js";
|
|
60
|
+
import p from "./layers/MLP.js";
|
|
59
61
|
import i from "./layers/TransformerBlock.js";
|
|
60
62
|
import s from "./layers/RoPECache.js";
|
|
61
|
-
import { default as
|
|
62
|
-
import { default as
|
|
63
|
-
import { sentenceEmbeddings as
|
|
63
|
+
import { default as Jo } from "./training/AdamExt.js";
|
|
64
|
+
import { default as Oo } from "./checks/index.js";
|
|
65
|
+
import { sentenceEmbeddings as Vo, sentenceEmbeddingsTensor as Wo } from "./utilities/sentences.js";
|
|
64
66
|
const to = {
|
|
65
67
|
PretrainingTask: o,
|
|
66
68
|
StartSentenceTask: r,
|
|
@@ -69,35 +71,37 @@ const to = {
|
|
|
69
71
|
pack16: m,
|
|
70
72
|
unpack16: e
|
|
71
73
|
}, mo = {
|
|
72
|
-
CausalSelfAttention:
|
|
73
|
-
MLP:
|
|
74
|
+
CausalSelfAttention: a,
|
|
75
|
+
MLP: p,
|
|
74
76
|
TransformerBlock: i,
|
|
75
77
|
RoPECache: s
|
|
76
78
|
};
|
|
77
79
|
export {
|
|
78
|
-
|
|
80
|
+
Jo as AdamExt,
|
|
79
81
|
ko as BPETokeniser,
|
|
80
|
-
|
|
82
|
+
uo as CharTokeniser,
|
|
83
|
+
Mo as Evaluator,
|
|
81
84
|
io as Generator,
|
|
82
|
-
|
|
85
|
+
vo as Model,
|
|
83
86
|
fo as NanoGPT,
|
|
84
|
-
|
|
87
|
+
Do as Task,
|
|
85
88
|
lo as TeachableLLM,
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
|
|
89
|
+
Bo as Trainer,
|
|
90
|
+
Oo as checks,
|
|
91
|
+
Lo as estimateMemoryUsage,
|
|
92
|
+
So as estimateParameterCount,
|
|
93
|
+
Ao as estimateResources,
|
|
94
|
+
Fo as estimateTrainingMemoryUsage,
|
|
91
95
|
mo as layers,
|
|
92
96
|
Co as loadTextData,
|
|
93
97
|
eo as ops,
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
+
Ho as performanceTest,
|
|
99
|
+
qo as selectBackend,
|
|
100
|
+
Vo as sentenceEmbeddings,
|
|
101
|
+
Wo as sentenceEmbeddingsTensor,
|
|
98
102
|
to as tasks,
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
103
|
+
No as tokensFromTasks,
|
|
104
|
+
Uo as topP,
|
|
105
|
+
Go as validateConfig,
|
|
102
106
|
go as waitForModel
|
|
103
107
|
};
|
package/dist/training/BasicTrainer.js
CHANGED
|
@@ -52,19 +52,19 @@ class R {
|
|
|
52
52
|
mixedPrecision: this._mixedPrecision
|
|
53
53
|
},
|
|
54
54
|
d
|
|
55
|
-
),
|
|
55
|
+
), t = T(e, l, this.maskedLoss);
|
|
56
56
|
e.dispose();
|
|
57
|
-
const g =
|
|
58
|
-
return
|
|
59
|
-
}, { value:
|
|
57
|
+
const g = t.mul(S(this.lossScaling));
|
|
58
|
+
return t.dispose(), g;
|
|
59
|
+
}, { value: s, grads: n } = L(o);
|
|
60
60
|
if (a)
|
|
61
61
|
this.model.getProfiler()?.endMemory("Training");
|
|
62
62
|
else {
|
|
63
63
|
this.optimizer.applyGradients(n);
|
|
64
64
|
const e = Object.keys(n);
|
|
65
|
-
this.model.weightStore.touchVariables(e), this.model.getProfiler()?.endMemory("Training"), c ? (i.gradients = n, Object.values(n).forEach((
|
|
65
|
+
this.model.weightStore.touchVariables(e), this.model.getProfiler()?.endMemory("Training"), c ? (i.gradients = n, Object.values(n).forEach((t) => P(t))) : m(n);
|
|
66
66
|
}
|
|
67
|
-
return
|
|
67
|
+
return s.mul(S(1 / this.lossScaling));
|
|
68
68
|
});
|
|
69
69
|
}
|
|
70
70
|
async dummyPass() {
|
|
@@ -99,13 +99,13 @@ class R {
|
|
|
99
99
|
...r
|
|
100
100
|
}, d = Date.now(), l = this.createEmptyState();
|
|
101
101
|
this.lastState = l, await this.dummyPass(), r?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new y())), this.running = !0, l.logStartTime = d;
|
|
102
|
-
const o = a ? new u(this.model, a) : void 0,
|
|
102
|
+
const o = a ? new u(this.model, a) : void 0, s = await i.iterator();
|
|
103
103
|
try {
|
|
104
104
|
for (; this.running; ) {
|
|
105
|
-
const n = await
|
|
105
|
+
const n = await s.next();
|
|
106
106
|
if (n.done) break;
|
|
107
|
-
const e = n.value,
|
|
108
|
-
e.xs.dispose(), e.ys.dispose(), l.step++, l.totalSteps++, l.step % c === 0 && await this.performLogging(
|
|
107
|
+
const e = n.value, t = this.trainStep(l, e, !1);
|
|
108
|
+
e.xs.dispose(), e.ys.dispose(), l.step++, l.totalSteps++, l.step % c === 0 && await this.performLogging(t, e.xs.shape[0], r, o), t.dispose();
|
|
109
109
|
}
|
|
110
110
|
} catch (n) {
|
|
111
111
|
throw console.error("Training error:", n), m(), n;
|
|
@@ -116,45 +116,45 @@ class R {
|
|
|
116
116
|
const { onStep: d } = {
|
|
117
117
|
...p,
|
|
118
118
|
...a
|
|
119
|
-
}, l = a?.gradientMetrics || !1, o = (await i.data())[0],
|
|
120
|
-
|
|
119
|
+
}, l = a?.gradientMetrics || !1, o = (await i.data())[0], s = this.lastState;
|
|
120
|
+
s.lastLoss = o;
|
|
121
121
|
const n = Date.now();
|
|
122
|
-
|
|
122
|
+
s.trainingDuration += n - s.logStartTime;
|
|
123
123
|
const e = {
|
|
124
|
-
loss:
|
|
125
|
-
step:
|
|
126
|
-
time: Date.now() -
|
|
124
|
+
loss: s.lastLoss,
|
|
125
|
+
step: s.step,
|
|
126
|
+
time: Date.now() - s.logStartTime,
|
|
127
127
|
batchSize: r,
|
|
128
128
|
learningRate: a?.advancedMetrics ? this.optimizer.lr : void 0
|
|
129
129
|
};
|
|
130
130
|
if (this.model.trainingState = {
|
|
131
|
-
steps:
|
|
131
|
+
steps: s.totalSteps,
|
|
132
132
|
learningRate: this.optimizer.lr,
|
|
133
133
|
batchSize: r,
|
|
134
|
-
loss:
|
|
135
|
-
}, a?.gradientMetrics && l &&
|
|
136
|
-
const
|
|
137
|
-
for (const [g, h] of Object.entries(
|
|
138
|
-
|
|
139
|
-
e.gradientMetrics =
|
|
134
|
+
loss: s.lastLoss
|
|
135
|
+
}, a?.gradientMetrics && l && s.gradients) {
|
|
136
|
+
const t = /* @__PURE__ */ new Map();
|
|
137
|
+
for (const [g, h] of Object.entries(s.gradients))
|
|
138
|
+
t.set(g, await k(h)), h.dispose();
|
|
139
|
+
e.gradientMetrics = t;
|
|
140
140
|
}
|
|
141
141
|
if (c)
|
|
142
142
|
try {
|
|
143
|
-
const
|
|
144
|
-
t.validationLosses.push(
|
|
145
|
-
} catch (
|
|
146
|
-
console.error("Validation error:",
|
|
143
|
+
const t = await c.evaluate(5);
|
|
144
|
+
Array.isArray(t) ? e.valLoss = t[0] : (s.validationLosses.push(t), e.valLoss = t);
|
|
145
|
+
} catch (t) {
|
|
146
|
+
console.error("Validation error:", t);
|
|
147
147
|
}
|
|
148
148
|
if (d) {
|
|
149
|
-
const
|
|
150
|
-
duration:
|
|
151
|
-
totalSamples:
|
|
152
|
-
samplesPerSecond:
|
|
149
|
+
const t = {
|
|
150
|
+
duration: s.trainingDuration,
|
|
151
|
+
totalSamples: s.totalSteps * e.batchSize,
|
|
152
|
+
samplesPerSecond: s.totalSteps * e.batchSize / (s.trainingDuration / 1e3),
|
|
153
153
|
memory: a?.advancedMetrics ? this.model.getProfiler()?.getPeakMemory() || 0 : void 0
|
|
154
154
|
};
|
|
155
|
-
await d(e,
|
|
155
|
+
await d(e, t);
|
|
156
156
|
}
|
|
157
|
-
|
|
157
|
+
s.logStartTime = Date.now();
|
|
158
158
|
}
|
|
159
159
|
async trainOnDataset(i, r, a) {
|
|
160
160
|
const { logInterval: c, maxSteps: d } = {
|
|
@@ -162,13 +162,13 @@ class R {
|
|
|
162
162
|
...r
|
|
163
163
|
}, l = Date.now(), o = this.createEmptyState();
|
|
164
164
|
this.lastState = o, await this.dummyPass(), r?.advancedMetrics && (this.model.getProfiler() || this.model.setProfiler(new y())), this.running = !0, o.logStartTime = l;
|
|
165
|
-
const
|
|
165
|
+
const s = a ? new u(this.model, a) : void 0, n = await i.iterator();
|
|
166
166
|
try {
|
|
167
167
|
for (; this.running; ) {
|
|
168
168
|
const e = await n.next();
|
|
169
169
|
if (e.done) break;
|
|
170
|
-
const
|
|
171
|
-
|
|
170
|
+
const t = e.value, g = o.step % c === 0, h = (r?.gradientMetrics || !1) && g, f = this.trainStep(o, t, !1, h);
|
|
171
|
+
t.xs.dispose(), t.ys.dispose(), o.step++, o.totalSteps++, g && await this.performLogging(f, t.xs.shape[0], r, s), f.dispose(), o.step >= d && this.stop();
|
|
172
172
|
}
|
|
173
173
|
} catch (e) {
|
|
174
174
|
throw console.error("Training error:", e), m(), e;
|
package/dist/training/Evaluator.d.ts
CHANGED
|
@@ -1,9 +1,14 @@
|
|
|
1
1
|
import { Dataset } from '@tensorflow/tfjs-data';
|
|
2
2
|
import { TensorContainer } from '@tensorflow/tfjs-core';
|
|
3
3
|
import { default as Model, ModelForwardAttributes } from '../models/model';
|
|
4
|
+
import { Conversation, ITokeniser } from '../main';
|
|
4
5
|
export default class Evaluator {
|
|
5
6
|
private model;
|
|
6
|
-
private iterator
|
|
7
|
-
|
|
8
|
-
|
|
7
|
+
private iterator?;
|
|
8
|
+
private xs?;
|
|
9
|
+
private ys?;
|
|
10
|
+
constructor(model: Model<ModelForwardAttributes>, dataset: Dataset<TensorContainer> | Conversation[][], tokeniser?: ITokeniser);
|
|
11
|
+
dispose(): void;
|
|
12
|
+
private calculateBatchLoss;
|
|
13
|
+
evaluate(maxBatches?: number): Promise<number | number[]>;
|
|
9
14
|
}
|
package/dist/training/Evaluator.js
CHANGED
|
@@ -1,23 +1,47 @@
|
|
|
1
|
-
import
|
|
2
|
-
|
|
3
|
-
|
|
4
|
-
|
|
1
|
+
import "../index-twYeuV3_.js";
|
|
2
|
+
import { calculateLoss as u } from "./loss.js";
|
|
3
|
+
import { buildSFTExample as p } from "./SFTDatasetBuilder.js";
|
|
4
|
+
import { t as c } from "../tensor-CO6h2H2F.js";
|
|
5
|
+
class b {
|
|
6
|
+
constructor(i, t, a) {
|
|
7
|
+
if (this.model = i, Array.isArray(t)) {
|
|
8
|
+
if (!a)
|
|
9
|
+
throw new Error("Tokeniser is required when dataset is an array of conversations");
|
|
10
|
+
const o = t.map((s) => p(s, -100, a, i.config.blockSize)).filter((s) => s !== null);
|
|
11
|
+
if (o.length === 0)
|
|
12
|
+
return;
|
|
13
|
+
this.xs = c(o.map((s) => s.xs)), this.ys = c(o.map((s) => s.ys));
|
|
14
|
+
} else
|
|
15
|
+
this.iterator = t.iterator();
|
|
5
16
|
}
|
|
6
17
|
iterator;
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
18
|
+
xs;
|
|
19
|
+
ys;
|
|
20
|
+
dispose() {
|
|
21
|
+
this.xs && this.xs.dispose(), this.ys && this.ys.dispose();
|
|
22
|
+
}
|
|
23
|
+
async calculateBatchLoss(i, t, a, o) {
|
|
24
|
+
const s = this.model.forward({ training: !1 }, i), r = u(s, t, o, a);
|
|
25
|
+
s.dispose();
|
|
26
|
+
const e = await r.array();
|
|
27
|
+
return r.dispose(), e;
|
|
28
|
+
}
|
|
29
|
+
async evaluate(i = 100) {
|
|
30
|
+
let t = 0, a = 0;
|
|
31
|
+
if (this.iterator) {
|
|
32
|
+
const o = await this.iterator;
|
|
33
|
+
for (let s = 0; s < i; s++) {
|
|
34
|
+
const r = await o.next();
|
|
35
|
+
if (r.done) break;
|
|
36
|
+
const l = r.value, { xs: e, ys: n } = l, h = this.model.forward({ training: !1 }, e), f = await this.calculateBatchLoss(h, n, !1, !1);
|
|
37
|
+
e.dispose(), n.dispose(), t += f, a++;
|
|
38
|
+
}
|
|
39
|
+
return t / a;
|
|
40
|
+
} else if (this.xs && this.ys)
|
|
41
|
+
return this.calculateBatchLoss(this.xs, this.ys, !0, !0);
|
|
42
|
+
throw new Error("No data available for evaluation");
|
|
19
43
|
}
|
|
20
44
|
}
|
|
21
45
|
export {
|
|
22
|
-
|
|
46
|
+
b as default
|
|
23
47
|
};
|
package/dist/training/SFTDatasetBuilder.d.ts
CHANGED
|
@@ -1,7 +1,11 @@
|
|
|
1
1
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
-
import { ITokeniser } from '../tokeniser/type';
|
|
2
|
+
import { Conversation, ITokeniser } from '../tokeniser/type';
|
|
3
3
|
import { Dataset } from '@tensorflow/tfjs-data';
|
|
4
4
|
import { Task } from './tasks/Task';
|
|
5
|
+
export declare function buildSFTExample(conversation: Conversation[], ignoreIndex: number, tokenizer: ITokeniser, blockSize: number): {
|
|
6
|
+
xs: Int32Array;
|
|
7
|
+
ys: Int32Array;
|
|
8
|
+
} | null;
|
|
5
9
|
export declare class SFTDatasetBuilder {
|
|
6
10
|
tokenizer: ITokeniser;
|
|
7
11
|
blockSize: number;
|
package/dist/training/SFTDatasetBuilder.js
CHANGED
|
@@ -1,41 +1,44 @@
|
|
|
1
1
|
import { t as x } from "../index-twYeuV3_.js";
|
|
2
2
|
import "../dataset-BMe3pbsL.js";
|
|
3
|
-
import { g as
|
|
3
|
+
import { g as I } from "../readers-C_41Nuv3.js";
|
|
4
4
|
import "../index-Cp39cXWe.js";
|
|
5
|
-
function
|
|
6
|
-
const s = [t.bosToken], n = [!1],
|
|
5
|
+
function w(u, a, t, l) {
|
|
6
|
+
const s = [t.bosToken], n = [!1], f = {
|
|
7
7
|
user: t.getSpecialTokenIndex("<|user_start|>"),
|
|
8
8
|
assistant: t.getSpecialTokenIndex("<|assistant_start|>"),
|
|
9
9
|
system: t.getSpecialTokenIndex("<|system_start|>")
|
|
10
|
-
},
|
|
10
|
+
}, i = {
|
|
11
11
|
user: t.getSpecialTokenIndex("<|user_end|>"),
|
|
12
12
|
assistant: t.getSpecialTokenIndex("<|assistant_end|>"),
|
|
13
13
|
system: t.getSpecialTokenIndex("<|system_end|>")
|
|
14
14
|
};
|
|
15
15
|
for (const e of u) {
|
|
16
|
-
const
|
|
17
|
-
if (
|
|
16
|
+
const c = f[e.role], h = i[e.role];
|
|
17
|
+
if (c == null || h == null)
|
|
18
18
|
throw new Error(`Missing special tokens for role: ${e.role}`);
|
|
19
|
-
s.push(
|
|
20
|
-
const
|
|
21
|
-
for (const
|
|
22
|
-
s.push(
|
|
23
|
-
const S = t.isSpecialToken(
|
|
24
|
-
n.push(
|
|
19
|
+
s.push(c), n.push(!1);
|
|
20
|
+
const k = t.encode(e.content);
|
|
21
|
+
for (const m of k) {
|
|
22
|
+
s.push(m);
|
|
23
|
+
const S = t.isSpecialToken(m), y = e.role === "assistant";
|
|
24
|
+
n.push(y && !S);
|
|
25
25
|
}
|
|
26
|
-
s.push(
|
|
26
|
+
s.push(h), n.push(!1);
|
|
27
27
|
}
|
|
28
28
|
s.push(t.eosToken), n.push(!1);
|
|
29
|
-
const o =
|
|
29
|
+
const o = l + 1;
|
|
30
30
|
if (s.length < o) {
|
|
31
|
-
const e = o - s.length,
|
|
32
|
-
for (let
|
|
33
|
-
s.push(
|
|
31
|
+
const e = o - s.length, c = t.getSpecialTokenIndex("<pad>");
|
|
32
|
+
for (let h = 0; h < e; h++)
|
|
33
|
+
s.push(c), n.push(!1);
|
|
34
34
|
} else s.length > o && (s.length = o, n.length = o);
|
|
35
|
-
const
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
35
|
+
const p = new Int32Array(s.slice(0, l)), r = s.slice(1, l + 1), T = n.slice(1, l + 1), d = new Int32Array(r.length);
|
|
36
|
+
let g = !1;
|
|
37
|
+
for (let e = 0; e < r.length; e++) {
|
|
38
|
+
const c = T[e] ? r[e] : a;
|
|
39
|
+
d[e] = c, c !== a && (g = !0);
|
|
40
|
+
}
|
|
41
|
+
return g ? { xs: p, ys: d } : null;
|
|
39
42
|
}
|
|
40
43
|
class A {
|
|
41
44
|
tokenizer;
|
|
@@ -49,17 +52,17 @@ class A {
|
|
|
49
52
|
* - Pads with eosToken and masks padding.
|
|
50
53
|
* - Masks non-assistant tokens in labels with ignoreIndex (default -100).
|
|
51
54
|
*/
|
|
52
|
-
async createSFTDataset(a, t = 32,
|
|
55
|
+
async createSFTDataset(a, t = 32, l = -100) {
|
|
53
56
|
if (!a.length)
|
|
54
57
|
throw new Error("No conversations provided.");
|
|
55
58
|
const s = this.tokenizer, n = this.blockSize;
|
|
56
|
-
return
|
|
59
|
+
return I(function* () {
|
|
57
60
|
for (; ; ) {
|
|
58
|
-
const
|
|
59
|
-
|
|
61
|
+
const i = Math.floor(Math.random() * a.length), p = a[i].getRandomConversation(), r = w(p, l, s, n);
|
|
62
|
+
r && (yield r);
|
|
60
63
|
}
|
|
61
|
-
}).batch(t).map((
|
|
62
|
-
const o =
|
|
64
|
+
}).batch(t).map((i) => {
|
|
65
|
+
const o = i;
|
|
63
66
|
return x(() => ({
|
|
64
67
|
xs: o.xs.cast("int32"),
|
|
65
68
|
ys: o.ys.cast("int32")
|
|
@@ -68,5 +71,6 @@ class A {
|
|
|
68
71
|
}
|
|
69
72
|
}
|
|
70
73
|
export {
|
|
71
|
-
A as SFTDatasetBuilder
|
|
74
|
+
A as SFTDatasetBuilder,
|
|
75
|
+
w as buildSFTExample
|
|
72
76
|
};
|
package/dist/training/loss.d.ts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { Tensor } from '@tensorflow/tfjs-core';
|
|
2
|
-
export declare function calculateLoss(logits: Tensor, targets: Tensor, masked?: boolean): Tensor;
|
|
2
|
+
export declare function calculateLoss(logits: Tensor, targets: Tensor, masked?: boolean, keepBatch?: boolean): Tensor;
|
package/dist/training/loss.js
CHANGED
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
import { createSoftmaxCrossEntropyWithGrad as
|
|
2
|
-
function
|
|
1
|
+
import { createSoftmaxCrossEntropyWithGrad as c } from "./sparseCrossEntropy.js";
|
|
2
|
+
function a(r, s, t, n) {
|
|
3
3
|
try {
|
|
4
|
-
return
|
|
4
|
+
return c(t, n)(r, s);
|
|
5
5
|
} catch (o) {
|
|
6
6
|
throw console.error("Error computing loss:", o), new Error(`Loss computation failed: ${o}`);
|
|
7
7
|
}
|
|
8
8
|
}
|
|
9
9
|
export {
|
|
10
|
-
|
|
10
|
+
a as calculateLoss
|
|
11
11
|
};
|
package/dist/training/sparseCrossEntropy.d.ts
CHANGED
|
@@ -3,5 +3,5 @@ import * as tf from '@tensorflow/tfjs-core';
|
|
|
3
3
|
* Numerically stable sparse cross-entropy with gradient support
|
|
4
4
|
* This version handles potential numerical issues better
|
|
5
5
|
*/
|
|
6
|
-
export declare function sparseSoftmaxCrossEntropy(logits: tf.Tensor, labels: tf.Tensor, validMask?: tf.Tensor): tf.Tensor;
|
|
7
|
-
export declare function createSoftmaxCrossEntropyWithGrad(masked?: boolean): (...args: tf.Tensor[]) => tf.Tensor<tf.Rank>;
|
|
6
|
+
export declare function sparseSoftmaxCrossEntropy(logits: tf.Tensor, labels: tf.Tensor, validMask?: tf.Tensor, keepBatch?: boolean, originalBatchShape?: number[]): tf.Tensor;
|
|
7
|
+
export declare function createSoftmaxCrossEntropyWithGrad(masked?: boolean, keepBatch?: boolean): (...args: tf.Tensor[]) => tf.Tensor<tf.Rank>;
|
package/dist/training/sparseCrossEntropy.js
CHANGED
|
@@ -1,42 +1,46 @@
|
|
|
1
|
-
import { gatherSub as
|
|
2
|
-
import { scatterSub as
|
|
3
|
-
import { t as
|
|
4
|
-
import { m as
|
|
5
|
-
import { s as
|
|
6
|
-
import { s as
|
|
7
|
-
function
|
|
8
|
-
return
|
|
9
|
-
const
|
|
10
|
-
let s =
|
|
11
|
-
if (
|
|
12
|
-
s =
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
1
|
+
import { gatherSub as P } from "../ops/gatherSub.js";
|
|
2
|
+
import { scatterSub as V } from "../ops/scatterSub.js";
|
|
3
|
+
import { t as z, c as W, m as w, j as C, a1 as $, a as L, a2 as E } from "../index-twYeuV3_.js";
|
|
4
|
+
import { m as A, l as D, a as G, n as H, w as J } from "../not_equal-DXJHGhGS.js";
|
|
5
|
+
import { s as m } from "../sum-CgGUPVhu.js";
|
|
6
|
+
import { s as K } from "../softmax-DpG1TdjZ.js";
|
|
7
|
+
function N(e, d, h, x, a) {
|
|
8
|
+
return z(() => {
|
|
9
|
+
const u = e.shape[e.shape.length - 1], c = a || e.shape.slice(0, -1), f = c.reduce((n, y) => n * y, 1), i = e.shape.length > 2 ? e.reshape([f, u]) : e, S = d.shape.length > 1 ? d.reshape([f]).cast("int32") : d.cast("int32"), p = A(i, -1, !0), t = W(i, p), r = D(t, -1);
|
|
10
|
+
let s = P(r, S, t);
|
|
11
|
+
if (h)
|
|
12
|
+
if (s = w(s, h), x) {
|
|
13
|
+
const n = m(h.reshape(c), -1);
|
|
14
|
+
s = C(m(s.reshape(c), -1), n);
|
|
15
|
+
} else {
|
|
16
|
+
const n = m(h);
|
|
17
|
+
s = C(m(s), n);
|
|
18
|
+
}
|
|
19
|
+
else
|
|
20
|
+
x ? s = G(s.reshape(c), -1) : s = G(s);
|
|
17
21
|
return s;
|
|
18
22
|
});
|
|
19
23
|
}
|
|
20
|
-
function
|
|
21
|
-
return
|
|
24
|
+
function k(e, d) {
|
|
25
|
+
return $(
|
|
22
26
|
// @ts-expect-error Invalid params
|
|
23
|
-
(
|
|
24
|
-
const
|
|
25
|
-
let
|
|
26
|
-
if (
|
|
27
|
-
const
|
|
28
|
-
|
|
27
|
+
(a, u, c) => {
|
|
28
|
+
const f = a.shape[a.shape.length - 1], i = a.shape.slice(0, -1), S = i.reduce((l, o) => l * o, 1), p = a.reshape([S, f]), t = u.reshape([S]).cast("int32");
|
|
29
|
+
let r, s = null;
|
|
30
|
+
if (e) {
|
|
31
|
+
const l = L(-100, "int32"), o = H(t, l);
|
|
32
|
+
s = o.cast("float32"), r = J(o, t, E(t));
|
|
29
33
|
} else
|
|
30
|
-
|
|
31
|
-
const
|
|
32
|
-
return
|
|
33
|
-
const
|
|
34
|
-
return [
|
|
34
|
+
r = t;
|
|
35
|
+
const n = N(p, r, s || void 0, d, i);
|
|
36
|
+
return c(s ? [p, r, s] : [p, r]), p.dispose(), t.dispose(), { value: n, gradFunc: (l, o) => z(() => {
|
|
37
|
+
const b = o[0], I = o[1], g = e ? o[2] : void 0, T = K(b), j = g ? m(g) : L(b.shape[0], "float32"), v = l.div(j).broadcastTo([b.shape[0]]), q = g && e ? w(v, g) : v, F = V(T, I, q), M = E(u);
|
|
38
|
+
return [F.reshape(a.shape), M];
|
|
35
39
|
}) };
|
|
36
40
|
}
|
|
37
41
|
);
|
|
38
42
|
}
|
|
39
43
|
export {
|
|
40
|
-
|
|
41
|
-
|
|
44
|
+
k as createSoftmaxCrossEntropyWithGrad,
|
|
45
|
+
N as sparseSoftmaxCrossEntropy
|
|
42
46
|
};
|