@zwa73/utils 1.0.8 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/UtilCodecs.js +21 -8
- package/package.json +1 -1
- package/publish.bat +2 -0
- package/src/UtilCodecs.ts +26 -10
package/dist/UtilCodecs.js
CHANGED
|
@@ -3,8 +3,8 @@ Object.defineProperty(exports, "__esModule", { value: true });
|
|
|
3
3
|
exports.decodeTokenDavinci = exports.decodeTokenTurbo = exports.encodeTokenDavinci = exports.encodeTokenTurbo = exports.tokenNumDavinci = exports.tokenNumTurbo = exports.encodeHtmlEntities = exports.decodeHtmlEntities = void 0;
|
|
4
4
|
const he = require("html-entities");
|
|
5
5
|
const tiktoken_1 = require("tiktoken");
|
|
6
|
-
|
|
7
|
-
|
|
6
|
+
// Cached tiktoken encoders; both start as null and are assigned by initTikTokenEncoder().
let encoderTurbo = null;
|
|
7
|
+
let encoderDavinci = null;
|
|
8
8
|
const textDecoder = new TextDecoder();
|
|
9
9
|
// 定义一个对象,存储常见的HTML实体和对应的字符
|
|
10
10
|
let htmlEntities = {
|
|
@@ -46,13 +46,21 @@ exports.encodeHtmlEntities = encodeHtmlEntities;
|
|
|
46
46
|
//cl100k_base ChatGPT models, text-embedding-ada-002
|
|
47
47
|
//p50k_base Code models, text-davinci-002, text-davinci-003
|
|
48
48
|
//r50k_base (or gpt2) GPT-3 models like davinci
|
|
49
|
+
/**
 * Lazily creates the module-level tiktoken encoders.
 *
 * The encoders are built on first use instead of at import time; the original
 * note on this function says this avoids errors when called from Next.js.
 *
 * Each encoder is checked and created independently, so if creating one of
 * them throws, a later call does not overwrite (and thereby leak) an encoder
 * that was already created successfully.
 */
function initTikTokenEncoder() {
    // cl100k_base backs the "Turbo" helpers.
    if (encoderTurbo == null)
        encoderTurbo = (0, tiktoken_1.get_encoding)("cl100k_base");
    // p50k_base backs the "Davinci" helpers.
    if (encoderDavinci == null)
        encoderDavinci = (0, tiktoken_1.get_encoding)("p50k_base");
}
|
|
49
56
|
/**
 * Counts the tokens in a string using the Turbo encoder.
 * @param {string} str text to tokenize
 * @returns {number} number of tokens produced for `str`
 * @throws {Error} if the Turbo encoder is still missing after initialization
 */
function tokenNumTurbo(str) {
    initTikTokenEncoder();
    // Fail loudly instead of letting `?.` return undefined to the caller.
    if (encoderTurbo == null)
        throw new Error("tokenNumTurbo: Turbo encoder is not initialized");
    return encoderTurbo.encode(str).length;
}
|
|
57
65
|
exports.tokenNumTurbo = tokenNumTurbo;
|
|
58
66
|
/**token长度计算器 Davinci模型
|
|
@@ -60,7 +68,8 @@ exports.tokenNumTurbo = tokenNumTurbo;
|
|
|
60
68
|
* @returns {number} 整数长度结果
|
|
61
69
|
*/
|
|
62
70
|
/**
 * Counts the tokens in a string using the Davinci encoder.
 * @param {string} str text to tokenize
 * @returns {number} number of tokens produced for `str`
 * @throws {Error} if the Davinci encoder is still missing after initialization
 */
function tokenNumDavinci(str) {
    initTikTokenEncoder();
    // Fail loudly instead of letting `?.` return undefined to the caller.
    if (encoderDavinci == null)
        throw new Error("tokenNumDavinci: Davinci encoder is not initialized");
    return encoderDavinci.encode(str).length;
}
|
|
65
74
|
exports.tokenNumDavinci = tokenNumDavinci;
|
|
66
75
|
/**token编码 Turbo模型
|
|
@@ -68,7 +77,8 @@ exports.tokenNumDavinci = tokenNumDavinci;
|
|
|
68
77
|
* @returns {Array<number>} Token数组
|
|
69
78
|
*/
|
|
70
79
|
/**
 * Encodes a string into tokens using the Turbo encoder.
 * @param {string} str text to tokenize
 * @returns {Uint32Array} the encoder's output for `str`
 * @throws {Error} if the Turbo encoder is still missing after initialization
 */
function encodeTokenTurbo(str) {
    initTikTokenEncoder();
    // Fail loudly instead of letting `?.` return undefined to the caller.
    if (encoderTurbo == null)
        throw new Error("encodeTokenTurbo: Turbo encoder is not initialized");
    return encoderTurbo.encode(str);
}
|
|
73
83
|
exports.encodeTokenTurbo = encodeTokenTurbo;
|
|
74
84
|
/**token编码 Davinci模型
|
|
@@ -76,7 +86,8 @@ exports.encodeTokenTurbo = encodeTokenTurbo;
|
|
|
76
86
|
* @returns {Array<number>} Token数组
|
|
77
87
|
*/
|
|
78
88
|
/**
 * Encodes a string into tokens using the Davinci encoder.
 * @param {string} str text to tokenize
 * @returns {Uint32Array} the encoder's output for `str`
 * @throws {Error} if the Davinci encoder is still missing after initialization
 */
function encodeTokenDavinci(str) {
    initTikTokenEncoder();
    // Fail loudly instead of letting `?.` return undefined to the caller.
    if (encoderDavinci == null)
        throw new Error("encodeTokenDavinci: Davinci encoder is not initialized");
    return encoderDavinci.encode(str);
}
|
|
81
92
|
exports.encodeTokenDavinci = encodeTokenDavinci;
|
|
82
93
|
/**token解码 Turbo模型
|
|
@@ -84,7 +95,8 @@ exports.encodeTokenDavinci = encodeTokenDavinci;
|
|
|
84
95
|
* @returns {string} 消息字符串
|
|
85
96
|
*/
|
|
86
97
|
/**
 * Decodes Turbo-encoder tokens back into a string.
 * @param {Uint32Array} arr tokens to decode
 * @returns {string} the decoded bytes run through the shared `textDecoder`
 * @throws {Error} if the Turbo encoder is still missing after initialization
 */
function decodeTokenTurbo(arr) {
    initTikTokenEncoder();
    // Without this guard a missing encoder would pass undefined to
    // textDecoder.decode and silently produce an empty string.
    if (encoderTurbo == null)
        throw new Error("decodeTokenTurbo: Turbo encoder is not initialized");
    const bytes = encoderTurbo.decode(arr);
    return textDecoder.decode(bytes);
}
|
|
89
101
|
exports.decodeTokenTurbo = decodeTokenTurbo;
|
|
90
102
|
/**token解码 Davinci模型
|
|
@@ -92,6 +104,7 @@ exports.decodeTokenTurbo = decodeTokenTurbo;
|
|
|
92
104
|
* @returns {string} 消息字符串
|
|
93
105
|
*/
|
|
94
106
|
/**
 * Decodes Davinci-encoder tokens back into a string.
 * @param {Uint32Array} arr tokens to decode
 * @returns {string} the decoded bytes run through the shared `textDecoder`
 * @throws {Error} if the Davinci encoder is still missing after initialization
 */
function decodeTokenDavinci(arr) {
    initTikTokenEncoder();
    // Without this guard a missing encoder would pass undefined to
    // textDecoder.decode and silently produce an empty string.
    if (encoderDavinci == null)
        throw new Error("decodeTokenDavinci: Davinci encoder is not initialized");
    const bytes = encoderDavinci.decode(arr);
    return textDecoder.decode(bytes);
}
|
|
97
110
|
exports.decodeTokenDavinci = decodeTokenDavinci;
|
package/package.json
CHANGED
package/publish.bat
ADDED
package/src/UtilCodecs.ts
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import * as he from 'html-entities';
|
|
2
|
-
import {get_encoding} from 'tiktoken';
|
|
3
|
-
|
|
4
|
-
|
|
2
|
+
import {get_encoding,Tiktoken} from 'tiktoken';
|
|
3
|
+
// Cached tiktoken encoders; both start as null and are assigned by initTikTokenEncoder().
let encoderTurbo:Tiktoken|null = null;
|
|
4
|
+
let encoderDavinci:Tiktoken|null = null;
|
|
5
5
|
const textDecoder = new TextDecoder();
|
|
6
6
|
|
|
7
7
|
|
|
@@ -48,20 +48,32 @@ export function encodeHtmlEntities(str:string) {
|
|
|
48
48
|
//p50k_base Code models, text-davinci-002, text-davinci-003
|
|
49
49
|
//r50k_base (or gpt2) GPT-3 models like davinci
|
|
50
50
|
|
|
51
|
+
|
|
52
|
+
/**
 * Lazily creates the module-level tiktoken encoders.
 *
 * The encoders are built on first use instead of at import time; the original
 * note on this function says this avoids errors when called from Next.js.
 *
 * Each encoder is checked and created independently, so if creating one of
 * them throws, a later call does not overwrite (and thereby leak) an encoder
 * that was already created successfully.
 */
function initTikTokenEncoder():void{
    // cl100k_base backs the "Turbo" helpers.
    if(encoderTurbo==null)
        encoderTurbo = get_encoding("cl100k_base");
    // p50k_base backs the "Davinci" helpers.
    if(encoderDavinci==null)
        encoderDavinci = get_encoding("p50k_base");
}
|
|
60
|
+
|
|
51
61
|
/**
 * Counts the tokens in a string using the Turbo encoder.
 * @param str - text to tokenize
 * @returns number of tokens produced for `str`
 * @throws Error if the Turbo encoder is still missing after initialization
 */
export function tokenNumTurbo(str:string):number{
    initTikTokenEncoder();
    // Narrow explicitly rather than using `?.` with an `as any as number`
    // cast, which could hand callers `undefined` typed as `number`.
    if(encoderTurbo==null)
        throw new Error("tokenNumTurbo: Turbo encoder is not initialized");
    return encoderTurbo.encode(str).length;
}
|
|
59
70
|
/**
 * Counts the tokens in a string using the Davinci encoder.
 * @param str - text to tokenize
 * @returns number of tokens produced for `str`
 * @throws Error if the Davinci encoder is still missing after initialization
 */
export function tokenNumDavinci(str:string):number{
    initTikTokenEncoder();
    // Narrow explicitly rather than using `?.` with an `as any as number`
    // cast, which could hand callers `undefined` typed as `number`.
    if(encoderDavinci==null)
        throw new Error("tokenNumDavinci: Davinci encoder is not initialized");
    return encoderDavinci.encode(str).length;
}
|
|
66
78
|
|
|
67
79
|
/**token编码 Turbo模型
|
|
@@ -69,26 +81,30 @@ export function tokenNumDavinci(str:string):number{
|
|
|
69
81
|
* @returns {Array<number>} Token数组
|
|
70
82
|
*/
|
|
71
83
|
/**
 * Encodes a string into tokens using the Turbo encoder.
 * @param str - text to tokenize
 * @returns the encoder's output for `str`
 * @throws Error if the Turbo encoder is still missing after initialization
 */
export function encodeTokenTurbo(str:string):Uint32Array{
    initTikTokenEncoder();
    // Narrow explicitly rather than using `?.` with an `as any as Uint32Array`
    // cast, which could hand callers `undefined` typed as `Uint32Array`.
    if(encoderTurbo==null)
        throw new Error("encodeTokenTurbo: Turbo encoder is not initialized");
    return encoderTurbo.encode(str);
}
|
|
74
87
|
/**
 * Encodes a string into tokens using the Davinci encoder.
 * @param str - text to tokenize
 * @returns the encoder's output for `str`
 * @throws Error if the Davinci encoder is still missing after initialization
 */
export function encodeTokenDavinci(str:string):Uint32Array{
    initTikTokenEncoder();
    // Narrow explicitly rather than using `?.` with an `as any as Uint32Array`
    // cast, which could hand callers `undefined` typed as `Uint32Array`.
    if(encoderDavinci==null)
        throw new Error("encodeTokenDavinci: Davinci encoder is not initialized");
    return encoderDavinci.encode(str);
}
|
|
81
95
|
/**
 * Decodes Turbo-encoder tokens back into a string.
 * @param arr - tokens to decode
 * @returns the decoded bytes run through the shared `textDecoder`
 * @throws Error if the Turbo encoder is still missing after initialization
 */
export function decodeTokenTurbo(arr:Uint32Array):string{
    initTikTokenEncoder();
    // Without this guard a missing encoder would pass undefined to
    // textDecoder.decode and silently produce an empty string.
    if(encoderTurbo==null)
        throw new Error("decodeTokenTurbo: Turbo encoder is not initialized");
    const bytes = encoderTurbo.decode(arr);
    return textDecoder.decode(bytes);
}
|
|
88
103
|
/**
 * Decodes Davinci-encoder tokens back into a string.
 * @param arr - tokens to decode
 * @returns the decoded bytes run through the shared `textDecoder`
 * @throws Error if the Davinci encoder is still missing after initialization
 */
export function decodeTokenDavinci(arr:Uint32Array):string{
    initTikTokenEncoder();
    // Without this guard a missing encoder would pass undefined to
    // textDecoder.decode and silently produce an empty string.
    if(encoderDavinci==null)
        throw new Error("decodeTokenDavinci: Davinci encoder is not initialized");
    const bytes = encoderDavinci.decode(arr);
    return textDecoder.decode(bytes);
}
|