@mikugg/guidance 0.8.0 → 0.10.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +3 -3
- package/dist/index.d.ts.map +1 -1
- package/dist/lib/__test__/template.test.js +20 -3
- package/dist/lib/_llama-tokenizer.d.ts.map +1 -1
- package/dist/lib/_llama-tokenizer.js +107 -94
- package/dist/lib/_trie.js +1 -1
- package/dist/lib/template.d.ts +3 -2
- package/dist/lib/template.d.ts.map +1 -1
- package/dist/lib/template.js +87 -21
- package/dist/lib/token-generator.d.ts +6 -4
- package/dist/lib/token-generator.d.ts.map +1 -1
- package/dist/lib/token-generator.js +52 -14
- package/dist/lib/tokenizer.d.ts.map +1 -1
- package/dist/lib/tokenizer.js +6 -5
- package/package.json +2 -2
package/dist/index.d.ts
CHANGED
@@ -1,5 +1,5 @@
-import * as Tokenizer from './lib/tokenizer';
-import * as TokenGenerator from './lib/token-generator';
-import * as Template from './lib/template';
+import * as Tokenizer from "./lib/tokenizer";
+import * as TokenGenerator from "./lib/token-generator";
+import * as Template from "./lib/template";
 export { Tokenizer, TokenGenerator, Template };
 //# sourceMappingURL=index.d.ts.map
package/dist/index.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,
+{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,SAAS,MAAM,iBAAiB,CAAC;AAC7C,OAAO,KAAK,cAAc,MAAM,uBAAuB,CAAC;AACxD,OAAO,KAAK,QAAQ,MAAM,gBAAgB,CAAC;AAE3C,OAAO,EAAE,SAAS,EAAE,cAAc,EAAE,QAAQ,EAAE,CAAC"}
package/dist/lib/__test__/template.test.js
CHANGED
@@ -8,6 +8,18 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
+var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
+var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var g = generator.apply(thisArg, _arguments || []), i, q = [];
+    return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i;
+    function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }
+    function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
+    function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
+    function fulfill(value) { resume("next", value); }
+    function reject(value) { resume("throw", value); }
+    function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
+};
 Object.defineProperty(exports, "__esModule", { value: true });
 const template_1 = require("../template");
 const tokenizer_1 = require("../tokenizer"); // import paths as required
@@ -21,8 +33,8 @@ class MockTokenGenerator extends token_generator_1.AbstractTokenGenerator {
     }
     // eslint-disable-next-line @typescript-eslint/no-unused-vars
     generateString(prompt, options) {
-        return
-        return 'generated';
+        return __asyncGenerator(this, arguments, function* generateString_1() {
+            return yield __await('generated');
         });
     }
 }
@@ -69,9 +81,14 @@ describe('TemplateProcessor', () => {
         expect(result.get('weather')).toEqual(' rainy');
     }));
     it('should process template with SEL method in a JSON correctly', () => __awaiter(void 0, void 0, void 0, function* () {
+        const mockGenerateString = function () {
+            return __asyncGenerator(this, arguments, function* () {
+                yield yield __await('wizard');
+            });
+        };
         // 29879 = "s"
         const spyGenerateToken = jest.spyOn(generator, 'generateToken').mockReturnValue(new Promise((resolve) => resolve("s")));
-        const spyGenerateString = jest.spyOn(generator, 'generateString').
+        const spyGenerateString = jest.spyOn(generator, 'generateString').mockImplementation(mockGenerateString);
         const result = yield templateProcessor.processTemplate(`RPG Game Character specification
 {
     "name": "{{name}}",
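Decompiled back to source form, the updated mock amounts to an async generator: `generateString` no longer resolves to one string but yields chunks. A minimal sketch of the assumed TypeScript behind the emitted helpers above (the exact source may differ):

// Assumed source for the compiled mock; "wizard" and the spy come from the diff.
const mockGenerateString = async function* (): AsyncGenerator<string> {
    yield "wizard"; // emitted as: yield yield __await('wizard')
};
const spyGenerateString = jest
    .spyOn(generator, "generateString")
    .mockImplementation(mockGenerateString);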
package/dist/lib/_llama-tokenizer.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"_llama-tokenizer.d.ts","sourceRoot":"","sources":["../../src/lib/_llama-tokenizer.ts"],"names":[],"mappings":"AAEA;;;;;;;;;;;GAWG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,
+{"version":3,"file":"_llama-tokenizer.d.ts","sourceRoot":"","sources":["../../src/lib/_llama-tokenizer.ts"],"names":[],"mappings":"AAEA;;;;;;;;;;;GAWG;AAEH,MAAM,WAAW,cAAc;IAC7B,MAAM,EAAE,CACN,IAAI,EAAE,MAAM,EACZ,aAAa,CAAC,EAAE,OAAO,EACvB,mBAAmB,CAAC,EAAE,OAAO,EAC7B,eAAe,CAAC,EAAE,OAAO,KACtB,MAAM,EAAE,CAAC;IACd,MAAM,EAAE,CACN,QAAQ,EAAE,MAAM,EAAE,EAClB,aAAa,CAAC,EAAE,OAAO,EACvB,mBAAmB,CAAC,EAAE,OAAO,KAC1B,MAAM,CAAC;CACb;AAED,QAAA,MAAM,cAAc,EAAE,cAAmB,CAAC;AAqgB1C,eAAe,cAAc,CAAC"}
package/dist/lib/_llama-tokenizer.js
CHANGED
@@ -19,7 +19,9 @@ const base64decode = function (encodedString) {
     return atob(encodedString);
 };
 const getMergeIdentifierString = function (firstTokenId, secondTokenId) {
-    return llamaTokenizer.vocabById[firstTokenId] +
+    return (llamaTokenizer.vocabById[firstTokenId] +
+        " " +
+        llamaTokenizer.vocabById[secondTokenId]);
 };
 const decompressMerges = function (merges_binary) {
     // Base64 decode binary.
@@ -50,46 +52,46 @@ const decompressMerges = function (merges_binary) {
     return merges;
 };
 /**
- * Helper function to decode the vocabulary.
- *
- * vocab_base64 is base64-encoded string of tokens delimited by '\n' (line break) in utf-8.
- * The row number of the token (indexing from 0) represents the id of the token in LLaMA tokenizer.
- *
- * Most tokens look like this: "ic" (without the quotes) (representing the "i" character followed by the "c" character)
- * Some tokens are special. In particular, spaces are replaced with the "▁" character and line-break is represented as "<0x0A>".
- *
- * This helper function returns the vocabulary as an array that contains Strings representing tokens:
- *
- * "<unk>" // Special token: unknown token
- * "<s>" // Special token: beginning of string
- * "</s>" // Special token: end of string
- * "<0x00>" // Byte-level token representing the 0-byte
- * "<0x01>" // Byte-level token ...
- * "<0x02>" // Byte-level token ...
- * ... // More byte-level tokens
- * "<0x0A>" // Byte-level token representing '\n' (line break). This is one of the few byte-level tokens that appear to be actually needed in practice.
- * ... // More byte-level tokens
- * "<0xFF>" // Byte-level token ...
- * "▁▁" // Token representing 2 consecutive spaces.
- * "▁t" // Token representing the space character followed by the "t" character.
- * "er" // Token representing the "e" character followed by the "r" character. Most tokens look like this.
- * ... // 32000 tokens
- */
+ * Helper function to decode the vocabulary.
+ *
+ * vocab_base64 is base64-encoded string of tokens delimited by '\n' (line break) in utf-8.
+ * The row number of the token (indexing from 0) represents the id of the token in LLaMA tokenizer.
+ *
+ * Most tokens look like this: "ic" (without the quotes) (representing the "i" character followed by the "c" character)
+ * Some tokens are special. In particular, spaces are replaced with the "▁" character and line-break is represented as "<0x0A>".
+ *
+ * This helper function returns the vocabulary as an array that contains Strings representing tokens:
+ *
+ * "<unk>" // Special token: unknown token
+ * "<s>" // Special token: beginning of string
+ * "</s>" // Special token: end of string
+ * "<0x00>" // Byte-level token representing the 0-byte
+ * "<0x01>" // Byte-level token ...
+ * "<0x02>" // Byte-level token ...
+ * ... // More byte-level tokens
+ * "<0x0A>" // Byte-level token representing '\n' (line break). This is one of the few byte-level tokens that appear to be actually needed in practice.
+ * ... // More byte-level tokens
+ * "<0xFF>" // Byte-level token ...
+ * "▁▁" // Token representing 2 consecutive spaces.
+ * "▁t" // Token representing the space character followed by the "t" character.
+ * "er" // Token representing the "e" character followed by the "r" character. Most tokens look like this.
+ * ... // 32000 tokens
+ */
 const decodeVocabulary = function (vocab_base64) {
-    const byteArray = Uint8Array.from(base64decode(vocab_base64), c => c.charCodeAt(0));
-    const textDecoder = new TextDecoder('utf-8');
+    const byteArray = Uint8Array.from(base64decode(vocab_base64), (c) => c.charCodeAt(0));
+    const textDecoder = new TextDecoder("utf-8");
     return textDecoder.decode(byteArray).split("\n");
 };
 const utf8ByteToHex = (c) => {
-    const hexValue = c.toString(16).toUpperCase().padStart(2, '0');
+    const hexValue = c.toString(16).toUpperCase().padStart(2, "0");
     return `<0x${hexValue}>`;
 };
 const hexToUtf8Byte = (hex) => {
-    const strippedHex = hex.replace(/<0x|>/g, '');
+    const strippedHex = hex.replace(/<0x|>/g, "");
     return parseInt(strippedHex, 16);
 };
 const utf8Encoder = new TextEncoder();
-const utf8Decoder = new TextDecoder('utf-8');
+const utf8Decoder = new TextDecoder("utf-8");
 class PriorityQueue {
     // PriorityQueue implementation is copied from https://stackoverflow.com/a/42919752 with minor refactoring
     constructor(comparator = (a, b) => a > b) {
@@ -106,7 +108,7 @@ class PriorityQueue {
         return this._heap[0];
     }
     push(...values) {
-        values.forEach(value => {
+        values.forEach((value) => {
             this._heap.push(value);
             this._siftUp();
         });
@@ -152,9 +154,14 @@ class PriorityQueue {
     }
     _siftDown() {
         let node = 0;
-        while ((this._left(node) < this.size() &&
-
-
+        while ((this._left(node) < this.size() &&
+            this._greater(this._left(node), node)) ||
+            (this._right(node) < this.size() &&
+                this._greater(this._right(node), node))) {
+            let maxChild = this._right(node) < this.size() &&
+                this._greater(this._right(node), this._left(node))
+                ? this._right(node)
+                : this._left(node);
             this._swap(node, maxChild);
             node = maxChild;
         }
@@ -171,7 +178,7 @@ const mapCharactersToTokenIds = (prompt, add_bos_token, add_preceding_space) =>
         prompt = " " + prompt;
     }
     // Special: spaces are represented as thick underscore ▁ (id 29871)
-    const promptAltered = prompt.replaceAll(' ', llamaTokenizer.vocabById[29871]);
+    const promptAltered = prompt.replaceAll(" ", llamaTokenizer.vocabById[29871]);
     // We need to use Array.from to iterate over characters in order to support UTF-8 multipoint characters
     const charArray = Array.from(promptAltered);
     // Transform each character to its corresponding token
@@ -190,7 +197,13 @@ const mapCharactersToTokenIds = (prompt, add_bos_token, add_preceding_space) =>
     if (!(hex >= 0)) {
         // This is not supposed to happen because the LLaMA vocabulary has a token corresponding to each byte,
         // but if this happens regardless, let's follow the protocol and tokenize to <UNK> token instead of crashing.
-        console.log(
+        console.log("Encountered unknown character " +
+            c +
+            " (partial UTF-8 byte " +
+            bytes[j] +
+            " + hex + " +
+            utf8ByteToHex(bytes[j]) +
+            ")");
         tokenIds[tokenIds.length - 1] = 0;
     }
 }
@@ -203,8 +216,10 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
     if (log_performance) {
         startTime = performance.now();
     }
-    if (!llamaTokenizer.vocabById ||
-
+    if (!llamaTokenizer.vocabById ||
+        !llamaTokenizer.vocabByString ||
+        !llamaTokenizer.merges) {
+        console.log("Tokenizer not initialized properly!");
         return;
     }
     if (prompt.length === 0) {
@@ -221,7 +236,8 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
     // Merge priority is primarily determined by the location of the merge in the "merges" data,
     // secondarily determined by the relative position of the node in the linked list
     // (We want to perform equal merges from left to right)
-    const mergePrio = llamaTokenizer.merges.get(mergeIdentifierString) + leftNode.origPos / prompt.length;
+    const mergePrio = llamaTokenizer.merges.get(mergeIdentifierString) +
+        leftNode.origPos / prompt.length;
     if (mergePrio) {
         // If mergePrio not found in merges, that means this merge is not possible according to vocabulary.
         leftNode.mergePrio = mergePrio;
@@ -242,7 +258,7 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
         origPos: i,
         tokenId: tokenIds[i],
         prev: prevTokenNode,
-        next: null
+        next: null,
     };
     prevTokenNode.next = currTokenNode;
     addToMergeQueue(prevTokenNode);
@@ -271,7 +287,7 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
         origPos: oldPrev.origPos,
         tokenId: oldPrev.tokenId,
         prev: oldPrev.prev,
-        next: oldPrev.next
+        next: oldPrev.next,
     };
     leftOfMerge.prev = newPrev;
     // Update linked list reference of "prev of prev"
@@ -288,7 +304,7 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
         origPos: leftOfMerge.origPos,
         tokenId: llamaTokenizer.vocabByString.get(leftOfMerge.mergeToString),
         prev: leftOfMerge.prev,
-        next: leftOfMerge.next.next
+        next: leftOfMerge.next.next,
     };
     // Consider adding to merge queue: prev--resultOfMerge
     if (resultOfMerge.prev) {
@@ -313,7 +329,7 @@ const encode = (prompt, add_bos_token = true, add_preceding_space = true, log_pe
     }
     if (log_performance) {
         const endTime = performance.now();
-        console.log('Tokenizer running time: ' + (endTime - startTime) + ' milliseconds');
+        console.log("Tokenizer running time: " + (endTime - startTime) + " milliseconds");
     }
     return mergedTokenIds;
 };
@@ -331,7 +347,7 @@ const decode = function (tokenIds, add_bos_token = true, add_preceding_space = t
     else {
         // Typical case
         const utf8bytes = utf8Encoder.encode(tokenString);
-        utf8bytes.forEach(utf8Byte => utf8byteVals.push(utf8Byte));
+        utf8bytes.forEach((utf8Byte) => utf8byteVals.push(utf8Byte));
     }
 }
 const uint8Array = new Uint8Array(utf8byteVals);
@@ -342,7 +358,10 @@ const decode = function (tokenIds, add_bos_token = true, add_preceding_space = t
 };
 function runTests() {
     function isEqual(arr1, arr2) {
-        return arr1.length === arr2.length &&
+        return (arr1.length === arr2.length &&
+            arr1.every(function (value, index) {
+                return value === arr2[index];
+            }));
     }
     function testCase(inputString, expectedTokenIds) {
         const actualTokens = encode(inputString, true, true, true);
@@ -366,57 +385,51 @@ function runTests() {
     // Equal prio merges are performed left-to-right (fixed in 1.1.1)
     testCase("ax\n####\nboo", [1, 4853, 13, 4136, 13, 833, 29877]);
     // UTF-8 multipoint character that should be found in vocabulary
-    testCase('镇', [1, 29871, 30411]);
+    testCase("镇", [1, 29871, 30411]);
     // UTF-8 multipoint character that should NOT be found in vocabulary, fallback to MULTIPLE byte tokens
-    testCase('🦙', [1, 29871, 243, 162, 169, 156]);
+    testCase("🦙", [1, 29871, 243, 162, 169, 156]);
     // Consecutive UTF-8 multipoint characters that are NOT found in a vocabulary and use DIFFERENT number of bytes
-    testCase('🦙Ꙋ', [1, 29871, 243, 162, 169, 156, 237, 156, 141]);
-    testCase('Ꙋ🦙', [1, 29871, 237, 156, 141, 243, 162, 169, 156]);
+    testCase("🦙Ꙋ", [1, 29871, 243, 162, 169, 156, 237, 156, 141]);
+    testCase("Ꙋ🦙", [1, 29871, 237, 156, 141, 243, 162, 169, 156]);
     // Larger text input with various special characters sprinkled in
-    testCase(
-
-
-
-        3082, 3949, 295, 333, 29892, 17644, 1304, 408,
-        322, 4870, 13019, 491, 1126, 29872, 273, 4185, 1973, 1951,
-        278, 4721, 29899, 1625, 3774, 713, 3152, 29889, 365, 5288,
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        262, 1078, 408, 372, 1153, 1144, 7226, 29953, 29962, 7579,
-        304, 319, 962, 2518, 831, 13496, 3002, 29892, 11829, 294,
-        674, 736, 304, 278, 4094, 7689, 886, 322, 301, 4425,
-        787, 988, 896, 2041, 515, 472, 278, 1095, 310, 931,
-        7226, 29953, 29962]);
-    console.log('LLaMA Tokenizer tests passed successfully.');
+    testCase('The llama (/ˈlɑːmə/; 🦙Spanish pronunciation: [ˈʎama]) (Lama glama) is a domesticated South American camelid, widely used as a meat and pack animal by Andean cultures since the Pre-Columbian era. Llamas are social animals and live with others as a herd. Their wool is soft and contains only a small amount of lanolin.[2] Llamas can learn simple tasks after a few repetitions. When using a pack, they can carry about 25 to 30% of their body weight for 8 to 13 km (5–8 miles).[3] The name llama (in the past also spelled "lama" or "glama") was adopted by European settlers from native Peruvians.[4] The ancestors of llamas are thought to have originated from the Great Plains of North America about 40 million years ago, and subsequently migrated to South America about three million years ago during the Great American Interchange. By the end of the last ice age (10,000–12,000 years ago), camelids were extinct in North America.[3] As of 2007, there were over seven million llamas and alpacas in South America and over 158,000 llamas and 100,000Ꙋ🦙 alpacas, descended from progenitors imported late in the 20th century, in the United States and Canada.[5] In Aymara mythology, llamas are important beings. The Heavenly Llama is said to drink water from the ocean and urinates as it rains.[6] According to Aymara eschatology, llamas will return to the water springs and lagoons where they come from at the end of time.[6]', [
+        1, 450, 11148, 3304, 20374, 30176, 29880, 30426, 30215, 29885, 30184,
+        29914, 29936, 29871, 243, 162, 169, 156, 15495, 728, 11504, 11173, 362,
+        29901, 518, 30176, 31743, 3304, 2314, 313, 29931, 3304, 3144, 3304, 29897,
+        338, 263, 21849, 630, 4275, 3082, 3949, 295, 333, 29892, 17644, 1304, 408,
+        263, 27654, 322, 4870, 13019, 491, 1126, 29872, 273, 4185, 1973, 1951,
+        278, 4721, 29899, 1625, 3774, 713, 3152, 29889, 365, 5288, 294, 526, 5264,
+        15006, 322, 5735, 411, 4045, 408, 263, 902, 29881, 29889, 11275, 281,
+        1507, 338, 4964, 322, 3743, 871, 263, 2319, 5253, 310, 10906, 22878, 7226,
+        29906, 29962, 365, 5288, 294, 508, 5110, 2560, 9595, 1156, 263, 2846,
+        21159, 2187, 29889, 1932, 773, 263, 4870, 29892, 896, 508, 8677, 1048,
+        29871, 29906, 29945, 304, 29871, 29941, 29900, 29995, 310, 1009, 3573,
+        7688, 363, 29871, 29947, 304, 29871, 29896, 29941, 2383, 313, 29945,
+        29994, 29947, 7800, 467, 29961, 29941, 29962, 450, 1024, 11148, 3304, 313,
+        262, 278, 4940, 884, 805, 14356, 376, 29880, 3304, 29908, 470, 376, 3820,
+        3304, 1159, 471, 16356, 491, 7824, 3604, 9306, 515, 7531, 25493, 1403,
+        550, 7226, 29946, 29962, 450, 19525, 943, 310, 11829, 294, 526, 2714, 304,
+        505, 3978, 630, 515, 278, 7027, 13494, 1144, 310, 4644, 6813, 1048, 29871,
+        29946, 29900, 7284, 2440, 8020, 29892, 322, 17602, 9725, 630, 304, 4275,
+        6813, 1048, 2211, 7284, 2440, 8020, 2645, 278, 7027, 3082, 4124, 3167,
+        29889, 2648, 278, 1095, 310, 278, 1833, 14890, 5046, 313, 29896, 29900,
+        29892, 29900, 29900, 29900, 29994, 29896, 29906, 29892, 29900, 29900,
+        29900, 2440, 8020, 511, 3949, 295, 4841, 892, 1294, 5562, 297, 4644, 6813,
+        7226, 29941, 29962, 1094, 310, 29871, 29906, 29900, 29900, 29955, 29892,
+        727, 892, 975, 9881, 7284, 11829, 294, 322, 394, 29886, 562, 294, 297,
+        4275, 6813, 322, 975, 29871, 29896, 29945, 29947, 29892, 29900, 29900,
+        29900, 11829, 294, 322, 29871, 29896, 29900, 29900, 29892, 29900, 29900,
+        29900, 237, 156, 141, 243, 162, 169, 156, 394, 29886, 562, 294, 29892,
+        5153, 2760, 515, 410, 1885, 17259, 19673, 5683, 297, 278, 29871, 29906,
+        29900, 386, 6462, 29892, 297, 278, 3303, 3900, 322, 7400, 7226, 29945,
+        29962, 512, 319, 962, 2518, 22082, 3002, 29892, 11829, 294, 526, 4100,
+        367, 886, 29889, 450, 22977, 368, 365, 29880, 3304, 338, 1497, 304, 13748,
+        4094, 515, 278, 23474, 322, 5065, 262, 1078, 408, 372, 1153, 1144, 7226,
+        29953, 29962, 7579, 304, 319, 962, 2518, 831, 13496, 3002, 29892, 11829,
+        294, 674, 736, 304, 278, 4094, 7689, 886, 322, 301, 4425, 787, 988, 896,
+        2041, 515, 472, 278, 1095, 310, 931, 7226, 29953, 29962,
+    ]);
+    console.log("LLaMA Tokenizer tests passed successfully.");
     return true;
 }
 function initializeLlamaTokenizer() {
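The mergePrio hunk above is the substantive part of this file: the BPE merge's rank in the "merges" table is the primary priority, and origPos / prompt.length is a fractional tie-break so equal-rank merges apply left to right. Worked numbers (illustrative values, not from the package):

// Two candidate merges share rank 500; the prompt is 10 characters long.
//   merge starting at position 3 -> 500 + 3 / 10 = 500.3
//   merge starting at position 7 -> 500 + 7 / 10 = 500.7
// The fractional term is always < 1, so it can never reorder merges with
// different ranks; it only orders equal-rank merges by position.
const mergePriority = (rank: number, origPos: number, promptLength: number): number =>
    rank + origPos / promptLength;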
package/dist/lib/_trie.js
CHANGED
package/dist/lib/template.d.ts
CHANGED
@@ -1,5 +1,5 @@
-import { AbstractTokenizer } from './tokenizer';
-import { AbstractTokenGenerator } from './token-generator';
+import { AbstractTokenizer } from "./tokenizer";
+import { AbstractTokenGenerator } from "./token-generator";
 export declare enum TEMPLATE_METHODS {
     SEL = "SEL",
     GEN = "GEN"
@@ -11,6 +11,7 @@ export declare class TemplateProcessor {
     setTokenizer(tokenizer: AbstractTokenizer): void;
     setGenerator(generator: AbstractTokenGenerator): void;
     processTemplate(template: string, variables: Map<string, string | string[]>): Promise<Map<string, string>>;
+    processTemplateStream(template: string, variables: Map<string, string | string[]>): AsyncGenerator<Map<string, string>, void>;
     private findAllIndexes;
 }
 //# sourceMappingURL=template.d.ts.map
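The new processTemplateStream method yields the partially filled variable map as generation progresses, while processTemplate (per the template.js diff below) now just drains the stream and returns the final map. A minimal consumer sketch; the template string and the "weather" variable are illustrative, following the GEN syntax used in the package's tests:

import { Template } from "@mikugg/guidance";

async function run(processor: Template.TemplateProcessor) {
    const template = "The weather today is{{GEN weather stop=.}}";
    for await (const partial of processor.processTemplateStream(template, new Map())) {
        // Each yielded Map holds the variables filled in so far;
        // "weather" grows as completion chunks stream in.
        console.log(partial.get("weather"));
    }
}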
package/dist/lib/template.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"template.d.ts","sourceRoot":"","sources":["../../src/lib/template.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,EAAE,sBAAsB,EAAE,MAAM,mBAAmB,CAAC;AAE3D,oBAAY,gBAAgB;IAC1B,GAAG,QAAQ;IACX,GAAG,QAAQ;CACZ;AAED,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,SAAS,CAAoB;IACrC,OAAO,CAAC,SAAS,CAAyB;gBAE9B,SAAS,EAAE,iBAAiB,EAAE,SAAS,EAAE,sBAAsB;IAKpE,YAAY,CAAC,SAAS,EAAE,iBAAiB;IAIzC,YAAY,CAAC,SAAS,EAAE,sBAAsB;IAIxC,eAAe,
+{"version":3,"file":"template.d.ts","sourceRoot":"","sources":["../../src/lib/template.ts"],"names":[],"mappings":"AACA,OAAO,EAAE,iBAAiB,EAAE,MAAM,aAAa,CAAC;AAChD,OAAO,EAAE,sBAAsB,EAAE,MAAM,mBAAmB,CAAC;AAE3D,oBAAY,gBAAgB;IAC1B,GAAG,QAAQ;IACX,GAAG,QAAQ;CACZ;AAED,qBAAa,iBAAiB;IAC5B,OAAO,CAAC,SAAS,CAAoB;IACrC,OAAO,CAAC,SAAS,CAAyB;gBAE9B,SAAS,EAAE,iBAAiB,EAAE,SAAS,EAAE,sBAAsB;IAKpE,YAAY,CAAC,SAAS,EAAE,iBAAiB;IAIzC,YAAY,CAAC,SAAS,EAAE,sBAAsB;IAIxC,eAAe,CAC1B,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,GACxC,OAAO,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IAYjB,qBAAqB,CACjC,QAAQ,EAAE,MAAM,EAChB,SAAS,EAAE,GAAG,CAAC,MAAM,EAAE,MAAM,GAAG,MAAM,EAAE,CAAC,GACxC,cAAc,CAAC,GAAG,CAAC,MAAM,EAAE,MAAM,CAAC,EAAE,IAAI,CAAC;IA2G5C,OAAO,CAAC,cAAc;CAQvB"}
package/dist/lib/template.js
CHANGED
@@ -8,6 +8,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
+var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var g = generator.apply(thisArg, _arguments || []), i, q = [];
+    return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i;
+    function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }
+    function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
+    function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
+    function fulfill(value) { resume("next", value); }
+    function reject(value) { resume("throw", value); }
+    function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
+};
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
@@ -31,50 +50,94 @@ class TemplateProcessor {
         this.generator = generator;
     }
     processTemplate(template, variables) {
+        var _a, e_1, _b, _c;
         return __awaiter(this, void 0, void 0, function* () {
+            let finalResult = new Map();
+            try {
+                for (var _d = true, _e = __asyncValues(this.processTemplateStream(template, variables)), _f; _f = yield _e.next(), _a = _f.done, !_a; _d = true) {
+                    _c = _f.value;
+                    _d = false;
+                    const partialResult = _c;
+                    finalResult = partialResult;
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_d && !_a && (_b = _e.return)) yield _b.call(_e);
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
+            return finalResult;
+        });
+    }
+    processTemplateStream(template, variables) {
+        return __asyncGenerator(this, arguments, function* processTemplateStream_1() {
+            var _a, e_2, _b, _c;
             const result = new Map();
             // Replace {{val}} in template with variables[val]
             variables.forEach((value, key) => {
-                template = template.replace(new RegExp(`{{${key}}}`, 'g'), value.toString());
+                template = template.replace(new RegExp(`{{${key}}}`, "g"), value.toString());
             });
             // Replace {{method variableName methodArg1=methodArg1Value methodArg2=methodArg2Value}} in template
-            const indexes = this.findAllIndexes(template, '{{');
+            const indexes = this.findAllIndexes(template, "{{");
             let nextTemplateIndexForPrompt = 0;
-            let prompt = '';
+            let prompt = "";
             for (let i = 0; i < indexes.length; i++) {
                 prompt += template.substring(nextTemplateIndexForPrompt, indexes[i]);
                 const start = indexes[i] + 2;
-                const end = template.substring(start).indexOf('}}') + start;
+                const end = template.substring(start).indexOf("}}") + start;
                 const content = template.substring(start, end);
-                const args = content.split(' ');
+                const args = content.split(" ");
                 const method = args[0];
                 const variableName = args[1];
-                const methodArgs = args
-
+                const methodArgs = args
+                    .slice(2)
+                    .reduce((acc, arg) => {
+                    const [key, value] = arg.split("=");
                     acc[key] = value;
                     return acc;
                 }, {});
-                let completion = '';
+                let completion = "";
                 switch (method) {
                     case TEMPLATE_METHODS.GEN:
-
-
-
-
-
+                        const stream = this.generator.generateString(prompt, methodArgs);
+                        try {
+                            for (var _d = true, stream_1 = (e_2 = void 0, __asyncValues(stream)), stream_1_1; stream_1_1 = yield __await(stream_1.next()), _a = stream_1_1.done, !_a; _d = true) {
+                                _c = stream_1_1.value;
+                                _d = false;
+                                const chunk = _c;
+                                completion = chunk;
+                                // Remove string after stop
+                                if (methodArgs["stop"]) {
+                                    if (completion.indexOf(methodArgs["stop"]) >= 0) {
+                                        completion = completion.substring(0, completion.indexOf(methodArgs["stop"]));
+                                    }
+                                }
+                                result.set(variableName, completion);
+                                yield yield __await(result);
                             }
                         }
-
+                        catch (e_2_1) { e_2 = { error: e_2_1 }; }
+                        finally {
+                            try {
+                                if (!_d && !_a && (_b = stream_1.return)) yield __await(_b.call(stream_1));
+                            }
+                            finally { if (e_2) throw e_2.error; }
+                        }
                        break;
                     case TEMPLATE_METHODS.SEL:
                        const trie = new _trie_1.default();
                        // Get options from variables
-                        const options = variables.get(methodArgs['options']);
+                        const options = variables.get(methodArgs["options"]);
                        if (!options) {
-                            throw new Error(`${methodArgs['options']} variable not found`);
+                            throw new Error(`${methodArgs["options"]} variable not found`);
                        }
                        // Add all options to trie
-                        options.forEach(option =>
+                        options.forEach((option) => {
+                            const prefix = this.tokenizer.encodeString(prompt + option + this.tokenizer.getEOS());
+                            trie.addPrefix(prefix);
+                        });
                        let currentPrefixPrompt = prompt;
                        do {
                            const currentPrefix = trie.getNextPrefix(this.tokenizer.encodeString(currentPrefixPrompt));
@@ -82,19 +145,23 @@ class TemplateProcessor {
                            const nextChildren = trie.getNextChildren(currentPrefix);
                            if (nextChildren.length < 2) {
                                // If there is only one child, we complete
-                                completion = this.tokenizer
+                                completion = this.tokenizer
+                                    .decodeString(trie.getWord(currentPrefix))
+                                    .substring(prompt.length)
+                                    .replace(this.tokenizer.getEOS(), "");
                                break;
                            }
                            else {
                                // If there is more than one child, we generate the next token
-                                const nextToken = yield this.generator.generateToken(prompt, nextChildren.reduce((acc, child) => {
+                                const nextToken = yield __await(this.generator.generateToken(prompt, nextChildren.reduce((acc, child) => {
                                    acc[child.toString()] = 100;
                                    return acc;
-                                }, {}));
+                                }, {})));
                                currentPrefixPrompt = currentPrefixPrompt + nextToken;
                            }
                        } while (!completion);
                        result.set(variableName, completion);
+                        yield yield __await(result);
                        break;
                    default:
                        throw new Error(`Invalid method ${method} in template`);
@@ -102,7 +169,6 @@ class TemplateProcessor {
             prompt += completion;
             nextTemplateIndexForPrompt = end + 2;
         }
-        return result;
         });
     }
     findAllIndexes(str, substr) {
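Stripped of the downleveled async-iterator plumbing, the SEL branch above performs constrained selection: each option is encoded as a token-id path, the paths go into a trie, and the model is only ever asked to pick among tokens that keep it on a valid path. A simplified restatement, not the literal source; the trie type is the package-internal _trie module, so it is left untyped here:

import { Tokenizer, TokenGenerator } from "@mikugg/guidance";

async function select(
    prompt: string,
    options: string[],
    tokenizer: Tokenizer.AbstractTokenizer,
    generator: TokenGenerator.AbstractTokenGenerator,
    trie: any // package-internal Trie: addPrefix/getNextPrefix/getNextChildren/getWord
): Promise<string> {
    // Each option becomes a token-id path: tokens(prompt + option + EOS).
    options.forEach((option) =>
        trie.addPrefix(tokenizer.encodeString(prompt + option + tokenizer.getEOS())));
    let currentPrefixPrompt = prompt;
    let completion = "";
    do {
        const currentPrefix = trie.getNextPrefix(tokenizer.encodeString(currentPrefixPrompt));
        const nextChildren = trie.getNextChildren(currentPrefix);
        if (nextChildren.length < 2) {
            // A single remaining branch decides the option.
            completion = tokenizer
                .decodeString(trie.getWord(currentPrefix))
                .substring(prompt.length)
                .replace(tokenizer.getEOS(), "");
        }
        else {
            // Ambiguous: bias the model toward the valid next tokens only.
            const logitBias: Record<string, number> = {};
            nextChildren.forEach((child: number) => (logitBias[child.toString()] = 100));
            currentPrefixPrompt += await generator.generateToken(prompt, logitBias);
        }
    } while (!completion);
    return completion;
}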
package/dist/lib/token-generator.d.ts
CHANGED
@@ -1,7 +1,8 @@
-import { ClientOptions } from 'openai';
+import { ClientOptions } from "openai";
+import { CompletionCreateParams } from "openai/resources/completions.mjs";
 export declare abstract class AbstractTokenGenerator {
     abstract generateToken(prompt: string, logit_bias: Record<string, number>): Promise<string>;
-    abstract generateString(prompt: string, options: Record<string, string>):
+    abstract generateString(prompt: string, options: Record<string, string>): AsyncGenerator<string>;
 }
 /**
  * OpenAI Token Generator
@@ -10,12 +11,13 @@ export declare abstract class AbstractTokenGenerator {
 export declare class OpenAITokenGenerator extends AbstractTokenGenerator {
     private openai;
     private model;
+    private defaultCompletionParams?;
     constructor(params: {
         apiKey: string;
         model: string;
         baseURL?: string;
-    }, options?: ClientOptions);
+    }, options?: ClientOptions, defaultCompletionParams?: CompletionCreateParams);
     generateToken(prompt: string, logit_bias: Record<string, number>): Promise<string>;
-    generateString(prompt: string, options: Record<string, string>):
+    generateString(prompt: string, options: Record<string, string>): AsyncGenerator<string>;
 }
 //# sourceMappingURL=token-generator.d.ts.map
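The abstract contract change is the core of this release: generateString now returns AsyncGenerator<string> instead of a promise. A sketch of a custom generator against the new shape; the class and method signatures come from the .d.ts above, while the bodies are purely illustrative:

import { TokenGenerator } from "@mikugg/guidance";

class StaticTokenGenerator extends TokenGenerator.AbstractTokenGenerator {
    async generateToken(prompt: string, logit_bias: Record<string, number>): Promise<string> {
        // Return some token key permitted by the bias map.
        return Object.keys(logit_bias)[0] ?? "";
    }
    // Streaming contract: each yield is the accumulated completion so far
    // (matching how the OpenAI implementation below accumulates chunks).
    async *generateString(prompt: string, options: Record<string, string>): AsyncGenerator<string> {
        yield "partial";
        yield "partial completion";
    }
}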
package/dist/lib/token-generator.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"token-generator.d.ts","sourceRoot":"","sources":["../../src/lib/token-generator.ts"],"names":[],"mappings":"AAAA,OAAe,EAAE,aAAa,EAAE,MAAM,QAAQ,
+{"version":3,"file":"token-generator.d.ts","sourceRoot":"","sources":["../../src/lib/token-generator.ts"],"names":[],"mappings":"AAAA,OAAe,EAAE,aAAa,EAAE,MAAM,QAAQ,CAAC;AAC/C,OAAO,EAAE,sBAAsB,EAAE,MAAM,kCAAkC,CAAC;AAE1E,8BAAsB,sBAAsB;IAC1C,QAAQ,CAAC,aAAa,CACpB,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACjC,OAAO,CAAC,MAAM,CAAC;IAClB,QAAQ,CAAC,cAAc,CACrB,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAC9B,cAAc,CAAC,MAAM,CAAC;CAC1B;AACD;;;GAGG;AACH,qBAAa,oBAAqB,SAAQ,sBAAsB;IAC9D,OAAO,CAAC,MAAM,CAAS;IACvB,OAAO,CAAC,KAAK,CAAS;IACtB,OAAO,CAAC,uBAAuB,CAAC,CAAyB;gBAGvD,MAAM,EAAE;QACN,MAAM,EAAE,MAAM,CAAC;QACf,KAAK,EAAE,MAAM,CAAC;QACd,OAAO,CAAC,EAAE,MAAM,CAAC;KAClB,EACD,OAAO,CAAC,EAAE,aAAa,EACvB,uBAAuB,CAAC,EAAE,sBAAsB;IAYnC,aAAa,CAC1B,MAAM,EAAE,MAAM,EACd,UAAU,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GACjC,OAAO,CAAC,MAAM,CAAC;IAwCF,cAAc,CAC5B,MAAM,EAAE,MAAM,EACd,OAAO,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,GAC9B,cAAc,CAAC,MAAM,CAAC;CAkB1B"}
package/dist/lib/token-generator.js
CHANGED
@@ -8,6 +8,25 @@ var __awaiter = (this && this.__awaiter) || function (thisArg, _arguments, P, ge
     step((generator = generator.apply(thisArg, _arguments || [])).next());
 });
 };
+var __await = (this && this.__await) || function (v) { return this instanceof __await ? (this.v = v, this) : new __await(v); }
+var __asyncValues = (this && this.__asyncValues) || function (o) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var m = o[Symbol.asyncIterator], i;
+    return m ? m.call(o) : (o = typeof __values === "function" ? __values(o) : o[Symbol.iterator](), i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i);
+    function verb(n) { i[n] = o[n] && function (v) { return new Promise(function (resolve, reject) { v = o[n](v), settle(resolve, reject, v.done, v.value); }); }; }
+    function settle(resolve, reject, d, v) { Promise.resolve(v).then(function(v) { resolve({ value: v, done: d }); }, reject); }
+};
+var __asyncGenerator = (this && this.__asyncGenerator) || function (thisArg, _arguments, generator) {
+    if (!Symbol.asyncIterator) throw new TypeError("Symbol.asyncIterator is not defined.");
+    var g = generator.apply(thisArg, _arguments || []), i, q = [];
+    return i = {}, verb("next"), verb("throw"), verb("return"), i[Symbol.asyncIterator] = function () { return this; }, i;
+    function verb(n) { if (g[n]) i[n] = function (v) { return new Promise(function (a, b) { q.push([n, v, a, b]) > 1 || resume(n, v); }); }; }
+    function resume(n, v) { try { step(g[n](v)); } catch (e) { settle(q[0][3], e); } }
+    function step(r) { r.value instanceof __await ? Promise.resolve(r.value.v).then(fulfill, reject) : settle(q[0][2], r); }
+    function fulfill(value) { resume("next", value); }
+    function reject(value) { resume("throw", value); }
+    function settle(f, v) { if (f(v), q.shift(), q.length) resume(q[0][0], q[0][1]); }
+};
 var __importDefault = (this && this.__importDefault) || function (mod) {
     return (mod && mod.__esModule) ? mod : { "default": mod };
 };
@@ -22,26 +41,24 @@ exports.AbstractTokenGenerator = AbstractTokenGenerator;
  *
  */
 class OpenAITokenGenerator extends AbstractTokenGenerator {
-    constructor(params, options) {
+    constructor(params, options, defaultCompletionParams) {
         super();
         this.model = params.model;
         this.openai = new openai_1.default(Object.assign({ apiKey: params.apiKey, baseURL: params.baseURL }, options));
+        this.defaultCompletionParams = defaultCompletionParams;
     }
     generateToken(prompt, logit_bias) {
         var _a;
         return __awaiter(this, void 0, void 0, function* () {
-            const result = yield this.openai.completions.create({
-
-            prompt,
-            logit_bias,
-            logprobs: 10,
-            max_tokens: 1,
-            });
+            const result = yield this.openai.completions.create(Object.assign(Object.assign({}, this.defaultCompletionParams), { stream: false, model: this.model, prompt,
+                logit_bias, logprobs: 10, max_tokens: 1 }));
             const logprobsResult = ((_a = result.choices[0].logprobs) === null || _a === void 0 ? void 0 : _a.top_logprobs) || [];
-            const top_logprobs = logprobsResult
+            const top_logprobs = logprobsResult
+                ? logprobsResult[0]
+                : { "2": 0 };
             // get max top_logpobs that is in logit_bias
             let max = -Infinity;
-            let max_key = '';
+            let max_key = "";
             for (const key in top_logprobs) {
                 if (top_logprobs[key] > max && key in logit_bias) {
                     max = top_logprobs[key];
@@ -49,7 +66,7 @@ class OpenAITokenGenerator extends AbstractTokenGenerator {
                 }
             }
             // if no key in logit_bias, get max top_logprobs
-            if (max_key === '') {
+            if (max_key === "") {
                 // no key in logit_bias
                 max = -Infinity;
                 for (const key in top_logprobs) {
@@ -63,9 +80,30 @@ class OpenAITokenGenerator extends AbstractTokenGenerator {
         });
     }
     generateString(prompt, options) {
-
-
-
+        var _a, _b;
+        return __asyncGenerator(this, arguments, function* generateString_1() {
+            var _c, e_1, _d, _e;
+            const stream = yield __await(this.openai.completions.create(Object.assign(Object.assign(Object.assign({}, this.defaultCompletionParams), options), { stop: [
+                    ...(((_a = this.defaultCompletionParams) === null || _a === void 0 ? void 0 : _a.stop) || []),
+                    ...(options.stop ? options.stop.split(",") : []),
+                ], model: this.model, prompt, stream: true })));
+            let result = "";
+            try {
+                for (var _f = true, stream_1 = __asyncValues(stream), stream_1_1; stream_1_1 = yield __await(stream_1.next()), _c = stream_1_1.done, !_c; _f = true) {
+                    _e = stream_1_1.value;
+                    _f = false;
+                    const chunk = _e;
+                    result += (_b = chunk.choices[0]) === null || _b === void 0 ? void 0 : _b.text;
+                    yield yield __await(result);
+                }
+            }
+            catch (e_1_1) { e_1 = { error: e_1_1 }; }
+            finally {
+                try {
+                    if (!_f && !_c && (_d = stream_1.return)) yield __await(_d.call(stream_1));
+                }
+                finally { if (e_1) throw e_1.error; }
+            }
        });
    }
 }
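Two behavioral notes on the compiled output above: the constructor's new third argument is spread into every completions.create call (with stream: false forced for generateToken and stream: true for generateString, where per-call options override the defaults), and the stop list is the concatenation of defaultCompletionParams.stop with the comma-separated options.stop. A construction sketch; the model name and parameter values are illustrative:

import { TokenGenerator } from "@mikugg/guidance";

const generator = new TokenGenerator.OpenAITokenGenerator(
    { apiKey: process.env.OPENAI_API_KEY || "", model: "gpt-3.5-turbo-instruct" },
    undefined, // ClientOptions passthrough, unchanged from 0.8.0
    // Merged into every completion request; GEN-level options still win.
    { model: "gpt-3.5-turbo-instruct", prompt: "", temperature: 0.7, stop: ["\n"] }
);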
package/dist/lib/tokenizer.d.ts.map
CHANGED
@@ -1 +1 @@
-{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/lib/tokenizer.ts"],"names":[],"mappings":"AAGA,8BAAsB,iBAAiB;IACrC,QAAQ,CAAC,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IAC5C,QAAQ,CAAC,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,MAAM;IAC5C,QAAQ,CAAC,MAAM,IAAI,MAAM;CAC1B;AAED,qBAAa,cAAe,SAAQ,iBAAiB;IAC1C,YAAY,
+{"version":3,"file":"tokenizer.d.ts","sourceRoot":"","sources":["../../src/lib/tokenizer.ts"],"names":[],"mappings":"AAGA,8BAAsB,iBAAiB;IACrC,QAAQ,CAAC,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IAC5C,QAAQ,CAAC,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,MAAM;IAC5C,QAAQ,CAAC,MAAM,IAAI,MAAM;CAC1B;AAED,qBAAa,cAAe,SAAQ,iBAAiB;IAC1C,YAAY,CACnB,GAAG,EAAE,MAAM,EACX,aAAa,CAAC,EAAE,OAAO,EACvB,mBAAmB,CAAC,EAAE,OAAO,EAC7B,eAAe,CAAC,EAAE,OAAO,GACxB,MAAM,EAAE;IAqBF,YAAY,CACnB,GAAG,EAAE,MAAM,EAAE,EACb,aAAa,CAAC,EAAE,OAAO,EACvB,mBAAmB,CAAC,EAAE,OAAO,GAC5B,MAAM;IAWA,MAAM,IAAI,MAAM;CAG1B;AAED,qBAAa,YAAa,SAAQ,iBAAiB;IACxC,YAAY,CAAC,GAAG,EAAE,MAAM,GAAG,MAAM,EAAE;IAMnC,YAAY,CAAC,GAAG,EAAE,MAAM,EAAE,GAAG,MAAM;IAInC,MAAM,IAAI,MAAM;CAG1B"}
package/dist/lib/tokenizer.js
CHANGED
@@ -15,7 +15,7 @@ class LLaMATokenizer extends AbstractTokenizer {
         str = str.substring(0, str.length - this.getEOS().length);
         return [
             ..._llama_tokenizer_1.default.encode(str, add_bos_token, add_preceding_space, log_performance),
-            2 // EOS
+            2, // EOS
         ];
     }
     return _llama_tokenizer_1.default.encode(str, add_bos_token, add_preceding_space, log_performance);
@@ -23,26 +23,27 @@ class LLaMATokenizer extends AbstractTokenizer {
     decodeString(arr, add_bos_token, add_preceding_space) {
         if (arr[arr.length - 1] === 2) {
             arr = arr.slice(0, arr.length - 1);
-            return _llama_tokenizer_1.default.decode(arr, add_bos_token, add_preceding_space) + this.getEOS();
+            return (_llama_tokenizer_1.default.decode(arr, add_bos_token, add_preceding_space) +
+                this.getEOS());
         }
         return _llama_tokenizer_1.default.decode(arr, add_bos_token, add_preceding_space);
     }
     getEOS() {
-        return '</s>';
+        return "</s>";
     }
 }
 exports.LLaMATokenizer = LLaMATokenizer;
 class GTPTokenizer extends AbstractTokenizer {
     encodeString(str) {
         return (0, gpt_tokenizer_1.encode)(str, {
-            allowedSpecial: new Set([this.getEOS()])
+            allowedSpecial: new Set([this.getEOS()]),
         });
     }
     decodeString(arr) {
         return (0, gpt_tokenizer_1.decode)(arr);
     }
     getEOS() {
-        return '<|endoftext|>';
+        return "<|endoftext|>";
     }
 }
 exports.GTPTokenizer = GTPTokenizer;
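The LLaMA tokenizer's EOS handling is easiest to see end to end. A small usage sketch; the class names, the EOS string "</s>", and token id 2 all come from the diff above, while the input text is illustrative:

import { Tokenizer } from "@mikugg/guidance";

const tok = new Tokenizer.LLaMATokenizer();
// A trailing "</s>" is stripped from the text and encoded as the literal EOS token id 2:
const ids = tok.encodeString("Hello world" + tok.getEOS());
// decodeString detects a trailing 2 and appends "</s>" back:
const roundTrip = tok.decodeString(ids); // ≈ "Hello world</s>"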
package/package.json
CHANGED
@@ -1,6 +1,6 @@
 {
   "name": "@mikugg/guidance",
-  "version": "0.8.0",
+  "version": "0.10.0",
   "description": "",
   "main": "dist/index.js",
   "types": "dist/index.d.ts",
@@ -39,5 +39,5 @@
     "ts-node": "^10.9.1",
     "typescript": "^5.2.2"
   },
-  "gitHead": "
+  "gitHead": "73073416020c05b15d4f58805fe914743e7ffd6e"
 }