hama-js 1.3.2 → 1.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/browser.js +1 -0
- package/dist/browser/browser.js.map +1 -1
- package/dist/browser/jamo.js +19 -20
- package/dist/browser/jamo.js.map +1 -1
- package/dist/browser/tokenizer.js +15 -6
- package/dist/browser/tokenizer.js.map +1 -1
- package/dist/node/index.js +1 -0
- package/dist/node/index.js.map +1 -1
- package/dist/node/jamo.js +19 -20
- package/dist/node/jamo.js.map +1 -1
- package/dist/node/tokenizer.js +15 -6
- package/dist/node/tokenizer.js.map +1 -1
- package/dist/types/tokenizer.d.ts +5 -0
- package/package.json +1 -1
package/dist/browser/browser.js
CHANGED
|
@@ -9,6 +9,7 @@ export class G2PBrowserModel {
|
|
|
9
9
|
const opts = {
|
|
10
10
|
modelUrl: options.modelUrl ?? DEFAULT_MODEL_URL,
|
|
11
11
|
maxInputLen: options.maxInputLen ?? 128,
|
|
12
|
+
// Retained for API compatibility; autoregressive ONNX sets output length in-graph.
|
|
12
13
|
maxOutputLen: options.maxOutputLen ?? 32,
|
|
13
14
|
};
|
|
14
15
|
const model = new G2PBrowserModel(opts);
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../../src/browser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,iBAAiB,CAAC;AAE3D,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAa,MAAM,gBAAgB,CAAC;AAQ1E,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,wBAAwB,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC;AAExF,MAAM,OAAO,eAAe;IAI1B,YAAoB,OAAiC;QACnD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAA0B,EAAE;QAC9C,MAAM,IAAI,GAA6B;YACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,iBAAiB;YAC/C,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,GAAG;YACvC,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,EAAE;SACzC,CAAC;QACF,MAAM,KAAK,GAAG,IAAI,eAAe,CAAC,IAAI,CAAC,CAAC;QACxC,KAAK,CAAC,OAAO,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE;YAC3D,kBAAkB,EAAE,CAAC,MAAM,CAAC;SAC7B,CAAC,CAAC;QACH,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,IAAY;QACxB,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;QAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjD,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtE,MAAM,KAAK,GAA2B;YACpC,SAAS,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;YACvE,aAAa,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;SACtD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,IAAqB,CAAC;QAC1D,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,CAAC,IAAqB,CAAC;QACxD,OAAO,iBAAiB,CAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC/D,CAAC;CACF"}
|
|
1
|
+
{"version":3,"file":"browser.js","sourceRoot":"","sources":["../../src/browser.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,iBAAiB,CAAC;AAE3D,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAa,MAAM,gBAAgB,CAAC;AAQ1E,MAAM,iBAAiB,GAAG,IAAI,GAAG,CAAC,wBAAwB,EAAE,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,QAAQ,EAAE,CAAC;AAExF,MAAM,OAAO,eAAe;IAI1B,YAAoB,OAAiC;QACnD,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;IACzB,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAA0B,EAAE;QAC9C,MAAM,IAAI,GAA6B;YACrC,QAAQ,EAAE,OAAO,CAAC,QAAQ,IAAI,iBAAiB;YAC/C,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,GAAG;YACvC,mFAAmF;YACnF,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,EAAE;SACzC,CAAC;QACF,MAAM,KAAK,GAAG,IAAI,eAAe,CAAC,IAAI,CAAC,CAAC;QACxC,KAAK,CAAC,OAAO,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,QAAQ,EAAE;YAC3D,kBAAkB,EAAE,CAAC,MAAM,CAAC;SAC7B,CAAC,CAAC;QACH,OAAO,KAAK,CAAC;IACf,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,IAAY;QACxB,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;QAC3D,MAAM,QAAQ,GAAG,aAAa,CAAC,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QACjD,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtE,MAAM,KAAK,GAA2B;YACpC,SAAS,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,OAAO,CAAC,WAAW,CAAC,CAAC;YACvE,aAAa,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;SACtD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,IAAqB,CAAC;QAC1D,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,CAAC,IAAqB,CAAC;QACxD,OAAO,iBAAiB,CAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC/D,CAAC;CACF"}
|
package/dist/browser/jamo.js
CHANGED
|
@@ -13,27 +13,26 @@ const WHITESPACE_REGEX = /\s/;
|
|
|
13
13
|
export const splitTextToJamo = (text) => {
|
|
14
14
|
const tokens = [];
|
|
15
15
|
const mapping = [];
|
|
16
|
-
|
|
17
|
-
let
|
|
18
|
-
while (
|
|
19
|
-
const code =
|
|
20
|
-
const
|
|
21
|
-
const charLen =
|
|
22
|
-
|
|
23
|
-
|
|
16
|
+
let offset = 0;
|
|
17
|
+
let charIndex = 0;
|
|
18
|
+
while (offset < text.length) {
|
|
19
|
+
const code = text.codePointAt(offset);
|
|
20
|
+
const ch = String.fromCodePoint(code);
|
|
21
|
+
const charLen = ch.length;
|
|
22
|
+
offset += charLen;
|
|
23
|
+
if (WHITESPACE_REGEX.test(ch)) {
|
|
24
|
+
charIndex += 1;
|
|
24
25
|
continue;
|
|
25
26
|
}
|
|
26
|
-
|
|
27
|
-
|
|
27
|
+
const normalizedPart = ch.toLocaleLowerCase("und");
|
|
28
|
+
let normalizedOffset = 0;
|
|
29
|
+
while (normalizedOffset < normalizedPart.length) {
|
|
30
|
+
const innerCode = normalizedPart.codePointAt(normalizedOffset);
|
|
28
31
|
const innerChar = String.fromCodePoint(innerCode);
|
|
29
|
-
|
|
30
|
-
if (WHITESPACE_REGEX.test(innerChar)) {
|
|
31
|
-
break;
|
|
32
|
-
}
|
|
32
|
+
normalizedOffset += innerChar.length;
|
|
33
33
|
if (!isHangulSyllable(innerCode)) {
|
|
34
34
|
tokens.push(innerChar);
|
|
35
|
-
mapping.push(
|
|
36
|
-
index += innerLen;
|
|
35
|
+
mapping.push(charIndex);
|
|
37
36
|
continue;
|
|
38
37
|
}
|
|
39
38
|
const syllableIndex = innerCode - S_BASE;
|
|
@@ -41,15 +40,15 @@ export const splitTextToJamo = (text) => {
|
|
|
41
40
|
const v = Math.floor((syllableIndex % N_COUNT) / T_COUNT);
|
|
42
41
|
const t = syllableIndex % T_COUNT;
|
|
43
42
|
tokens.push(String.fromCodePoint(L_BASE + l));
|
|
44
|
-
mapping.push(
|
|
43
|
+
mapping.push(charIndex);
|
|
45
44
|
tokens.push(String.fromCodePoint(V_BASE + v));
|
|
46
|
-
mapping.push(
|
|
45
|
+
mapping.push(charIndex);
|
|
47
46
|
if (t !== 0) {
|
|
48
47
|
tokens.push(String.fromCodePoint(T_BASE + t));
|
|
49
|
-
mapping.push(
|
|
48
|
+
mapping.push(charIndex);
|
|
50
49
|
}
|
|
51
|
-
index += innerLen;
|
|
52
50
|
}
|
|
51
|
+
charIndex += 1;
|
|
53
52
|
}
|
|
54
53
|
return { tokens, originalIndices: mapping };
|
|
55
54
|
};
|
package/dist/browser/jamo.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jamo.js","sourceRoot":"","sources":["../../src/jamo.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,cAAc,GAAG,MAAM,CAAC;AAO9B,MAAM,gBAAgB,GAAG,CAAC,IAAY,EAAE,EAAE,CACxC,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,CAAC;AAE5C,MAAM,gBAAgB,GAAG,IAAI,CAAC;AAE9B,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,IAAY,EAAgB,EAAE;IAC5D,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,MAAM,
|
|
1
|
+
{"version":3,"file":"jamo.js","sourceRoot":"","sources":["../../src/jamo.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,cAAc,GAAG,MAAM,CAAC;AAO9B,MAAM,gBAAgB,GAAG,CAAC,IAAY,EAAE,EAAE,CACxC,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,CAAC;AAE5C,MAAM,gBAAgB,GAAG,IAAI,CAAC;AAE9B,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,IAAY,EAAgB,EAAE;IAC5D,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAE,CAAC;QACvC,MAAM,EAAE,GAAG,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,OAAO,GAAG,EAAE,CAAC,MAAM,CAAC;QAC1B,MAAM,IAAI,OAAO,CAAC;QAClB,IAAI,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAC9B,SAAS,IAAI,CAAC,CAAC;YACf,SAAS;QACX,CAAC;QACD,MAAM,cAAc,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC;QACnD,IAAI,gBAAgB,GAAG,CAAC,CAAC;QACzB,OAAO,gBAAgB,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YAChD,MAAM,SAAS,GAAG,cAAc,CAAC,WAAW,CAAC,gBAAgB,CAAE,CAAC;YAChE,MAAM,SAAS,GAAG,MAAM,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;YAClD,gBAAgB,IAAI,SAAS,CAAC,MAAM,CAAC;YAErC,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,EAAE,CAAC;gBACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACvB,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACxB,SAAS;YACX,CAAC;YAED,MAAM,aAAa,GAAG,SAAS,GAAG,MAAM,CAAC;YACzC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,OAAO,CAAC,CAAC;YAC9C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC;YAC1D,MAAM,CAAC,GAAG,aAAa,GAAG,OAAO,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACZ,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;gBAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QACD,SAAS,IAAI,CAAC,CAAC;IACjB,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,eAAe,EAAE,OAAO,EAAE,CAAC;AAC9C,CAAC,CAAC;AAEF,MAAM,eAAe,GAAG,CAAC,OAAe,EAAE,MAAc,EAAE,KAAa,EAAE,EAAE,CACzE,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,GAAG,OAAO,GAAG,MAAM,GAAG,OAAO,GAAG,KAAK,CAAC,CAAC;AAE9E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,MAAgB,EAAU,EAAE;IACzD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAkB,IAAI,CAAC;IAClC,IAAI,MAAM,GAAkB,IAAI,CAAC;IACjC,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,MAAM,KAAK,GAAG,GAAG,EAAE;QACjB,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;QACvD,CAAC;QACD,OAAO,GAAG,IAAI,CAAC;QACf,MAAM,GAAG,IAAI,CAAC;QACd,KAAK,GAAG,CAAC,CAAC;IACZ,CAAC,CAAC;IAEF,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,EAAE;QACvB,MAAM,IAAI,GAAG,KAAK,CAAC,WAAW,CAAC,CAAC,CAAE,CAAC;QACnC,IAAI,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,EAAE,CAAC;YAC9C,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACxC,KAAK,EAAE,CAAC;YACV,CAAC;YACD,OAAO,GAAG,IAAI,GAAG,MAAM,CAAC;QAC1B,CAAC;aAAM,IAAI,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,EAAE,CAAC;YACrD,IAAI,OAAO,KAAK,IAAI;gBAAE,OAAO,GAAG,cAAc,GAAG,MAAM,CAAC;YACxD,IAAI,MAAM,KAAK,IAAI;gBAAE,KAAK,EAAE,CAAC;YAC7B,MAAM,GAAG,IAAI,GAAG,MAAM,CAAC;QACzB,CAAC;aAAM,IAAI,IAAI,GAAG,MAAM,IAAI,IAAI,IAAI,MAAM,GAAG,OAAO,EAAE,CAAC;YACrD,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACxC,KAAK,EAAE,CAAC;gBACR,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,IAAI,GAAG,MAAM,CAAC;gBACtB,KAAK,EAAE,CAAC;YACV,CAAC;QACH,CAAC;aAAM,CAAC;YACN,KAAK,EAAE,CAAC;YACR,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC,CAAC,CAAC;IACH,KAAK,EAAE,CAAC;IACR,OAAO,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACzB,CAAC,CAAC"}
|
|
@@ -6,23 +6,26 @@ const decoderTokenToId = new Map(VOCAB.decoder.map((token, idx) => [token, idx])
|
|
|
6
6
|
export const encodeText = (text, maxInputLen) => {
|
|
7
7
|
const jamoSeq = splitTextToJamo(text);
|
|
8
8
|
const tokens = jamoSeq.tokens.length ? jamoSeq.tokens : ["<unk>"];
|
|
9
|
+
const indices = jamoSeq.originalIndices.length ? jamoSeq.originalIndices : [-1];
|
|
9
10
|
const ids = tokens.map((token) => encoderTokenToId.get(token) ?? encoderTokenToId.get("<unk>"));
|
|
10
11
|
const length = Math.min(ids.length, maxInputLen);
|
|
11
12
|
const padded = new Array(maxInputLen).fill(BigInt(encoderTokenToId.get("<pad>")));
|
|
12
13
|
for (let i = 0; i < length; i++) {
|
|
13
14
|
padded[i] = BigInt(ids[i]);
|
|
14
15
|
}
|
|
15
|
-
const positionMap =
|
|
16
|
-
return { ids: padded, length, positionMap: positionMap.length ? positionMap : [
|
|
16
|
+
const positionMap = indices.slice(0, length);
|
|
17
|
+
return { ids: padded, length, positionMap: positionMap.length ? positionMap : [-1] };
|
|
17
18
|
};
|
|
18
19
|
export const decoderIds = {
|
|
19
20
|
pad: decoderTokenToId.get("<pad>"),
|
|
20
21
|
sos: decoderTokenToId.get("<sos>"),
|
|
21
22
|
eos: decoderTokenToId.get("<eos>"),
|
|
23
|
+
unk: decoderTokenToId.get("<unk>"),
|
|
22
24
|
};
|
|
23
25
|
export const decodeIdsToResult = (ids, attnIndices, positionMap) => {
|
|
24
26
|
const phonemes = [];
|
|
25
27
|
const alignments = [];
|
|
28
|
+
let outOfRangeTokenCount = 0;
|
|
26
29
|
for (let i = 0; i < ids.length; i++) {
|
|
27
30
|
const tokenId = Number(ids[i]);
|
|
28
31
|
if (tokenId === decoderIds.eos)
|
|
@@ -31,15 +34,21 @@ export const decodeIdsToResult = (ids, attnIndices, positionMap) => {
|
|
|
31
34
|
continue;
|
|
32
35
|
if (tokenId === decoderIds.sos && phonemes.length === 0)
|
|
33
36
|
continue;
|
|
34
|
-
const phoneme = VOCAB.decoder[tokenId]
|
|
37
|
+
const phoneme = VOCAB.decoder[tokenId];
|
|
38
|
+
if (phoneme === undefined) {
|
|
39
|
+
outOfRangeTokenCount += 1;
|
|
40
|
+
}
|
|
35
41
|
const srcPos = Math.max(0, Math.min(Number(attnIndices[i] ?? 0), positionMap.length > 0 ? positionMap.length - 1 : 0));
|
|
36
|
-
const charIndex = positionMap.length > 0 ? positionMap[srcPos] :
|
|
42
|
+
const charIndex = positionMap.length > 0 ? positionMap[srcPos] : -1;
|
|
37
43
|
alignments.push({
|
|
38
|
-
phoneme,
|
|
44
|
+
phoneme: phoneme ?? VOCAB.decoder[decoderIds.unk],
|
|
39
45
|
phonemeIndex: alignments.length,
|
|
40
46
|
charIndex,
|
|
41
47
|
});
|
|
42
|
-
phonemes.push(phoneme);
|
|
48
|
+
phonemes.push(phoneme ?? VOCAB.decoder[decoderIds.unk]);
|
|
49
|
+
}
|
|
50
|
+
if (outOfRangeTokenCount > 0 && typeof console !== "undefined") {
|
|
51
|
+
console.warn(`[hama-js] decodeIdsToResult saw ${outOfRangeTokenCount} out-of-range decoder ids; mapped to <unk>.`);
|
|
43
52
|
}
|
|
44
53
|
return { ipa: phonemes.join(""), alignments };
|
|
45
54
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAgB,MAAM,WAAW,CAAC;AAC1D,OAAO,SAAS,MAAM,yBAAyB,CAAC;AAOhD,MAAM,CAAC,MAAM,KAAK,GAAe,SAAuB,CAAC;AAEzD,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAEF,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAgB,MAAM,WAAW,CAAC;AAC1D,OAAO,SAAS,MAAM,yBAAyB,CAAC;AAOhD,MAAM,CAAC,MAAM,KAAK,GAAe,SAAuB,CAAC;AAEzD,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAEF,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAuBF,MAAM,CAAC,MAAM,UAAU,GAAG,CACxB,IAAY,EACZ,WAAmB,EACN,EAAE;IACf,MAAM,OAAO,GAAiB,eAAe,CAAC,IAAI,CAAC,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAClE,MAAM,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAChF,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CACpB,CAAC,KAAK,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE,CACzE,CAAC;IACF,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACjD,MAAM,MAAM,GAAG,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,IAAI,CAChD,MAAM,CAAC,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE,CAAC,CACvC,CAAC;IACF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAC7C,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;AACvF,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,UAAU,GAAG;IACxB,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;CACpC,CAAC;AAEF,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAC/B,GAA+B,EAC/B,WAAuC,EACvC,WAAqB,EACV,EAAE;IACb,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,IAAI,oBAAoB,GAAG,CAAC,CAAC;IAE7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG;YAAE,MAAM;QACtC,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG;YAAE,SAAS;QACzC,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAElE,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;YAC1B,oBAAoB,IAAI,CAAC,CAAC;QAC5B,CAAC;QACD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CACrB,CAAC,EACD,IAAI,CAAC,GAAG,CACN,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,EAC3B,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CACpD,CACF,CAAC;QACF,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACpE,UAAU,CAAC,IAAI,CAAC;YACd,OAAO,EAAE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;YACjD,YAAY,EAAE,UAAU,CAAC,MAAM;YAC/B,SAAS;SACV,CAAC,CAAC;QACH,QAAQ,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,oBAAoB,GAAG,CAAC,IAAI,OAAO,OAAO,KAAK,WAAW,EAAE,CAAC;QAC/D,OAAO,CAAC,IAAI,CACV,mCAAmC,oBAAoB,6CAA6C,CACrG,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC;AAChD,CAAC,CAAC"}
|
package/dist/node/index.js
CHANGED
|
@@ -14,6 +14,7 @@ export class G2PNodeModel {
|
|
|
14
14
|
const opts = {
|
|
15
15
|
modelPath: options.modelPath ?? defaultModelPath,
|
|
16
16
|
maxInputLen: options.maxInputLen ?? 128,
|
|
17
|
+
// Retained for API compatibility; autoregressive ONNX sets output length in-graph.
|
|
17
18
|
maxOutputLen: options.maxOutputLen ?? 32,
|
|
18
19
|
};
|
|
19
20
|
const session = await InferenceSession.create(opts.modelPath, {
|
package/dist/node/index.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAa,MAAM,gBAAgB,CAAC;AAQ1E,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC/D,MAAM,gBAAgB,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,EAAE,eAAe,CAAC,CAAC;AAEzE,MAAM,OAAO,YAAY;IAKvB,YAAoB,OAAyB,EAAE,IAA0B;QACvE,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;QACpC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC;IACxC,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAsB,EAAE;QAC1C,MAAM,IAAI,GAAyB;YACjC,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,gBAAgB;YAChD,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,GAAG;YACvC,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,EAAE;SACzC,CAAC;QACF,MAAM,OAAO,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,SAAS,EAAE;YAC5D,sBAAsB,EAAE,UAAU;SACnC,CAAC,CAAC;QACH,OAAO,IAAI,YAAY,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,IAAY;QACxB,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QACnD,MAAM,QAAQ,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAChD,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtE,MAAM,KAAK,GAA2B;YACpC,SAAS,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YAC/D,aAAa,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;SACtD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,IAAqB,CAAC;QAC1D,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,CAAC,IAAqB,CAAC;QACxD,OAAO,iBAAiB,CAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC/D,CAAC;CACF"}
|
|
1
|
+
{"version":3,"file":"index.js","sourceRoot":"","sources":["../../src/index.ts"],"names":[],"mappings":"AAAA,OAAO,IAAI,MAAM,WAAW,CAAC;AAC7B,OAAO,EAAE,aAAa,EAAE,MAAM,UAAU,CAAC;AACzC,OAAO,EAAE,gBAAgB,EAAE,MAAM,EAAE,MAAM,kBAAkB,CAAC;AAE5D,OAAO,EAAE,iBAAiB,EAAE,UAAU,EAAa,MAAM,gBAAgB,CAAC;AAQ1E,MAAM,SAAS,GAAG,IAAI,CAAC,OAAO,CAAC,aAAa,CAAC,MAAM,CAAC,IAAI,CAAC,GAAG,CAAC,CAAC,CAAC;AAC/D,MAAM,gBAAgB,GAAG,IAAI,CAAC,IAAI,CAAC,SAAS,EAAE,QAAQ,EAAE,eAAe,CAAC,CAAC;AAEzE,MAAM,OAAO,YAAY;IAKvB,YAAoB,OAAyB,EAAE,IAA0B;QACvE,IAAI,CAAC,OAAO,GAAG,OAAO,CAAC;QACvB,IAAI,CAAC,WAAW,GAAG,IAAI,CAAC,WAAW,CAAC;QACpC,IAAI,CAAC,YAAY,GAAG,IAAI,CAAC,YAAY,CAAC;IACxC,CAAC;IAED,MAAM,CAAC,KAAK,CAAC,MAAM,CAAC,UAAsB,EAAE;QAC1C,MAAM,IAAI,GAAyB;YACjC,SAAS,EAAE,OAAO,CAAC,SAAS,IAAI,gBAAgB;YAChD,WAAW,EAAE,OAAO,CAAC,WAAW,IAAI,GAAG;YACvC,mFAAmF;YACnF,YAAY,EAAE,OAAO,CAAC,YAAY,IAAI,EAAE;SACzC,CAAC;QACF,MAAM,OAAO,GAAG,MAAM,gBAAgB,CAAC,MAAM,CAAC,IAAI,CAAC,SAAS,EAAE;YAC5D,sBAAsB,EAAE,UAAU;SACnC,CAAC,CAAC;QACH,OAAO,IAAI,YAAY,CAAC,OAAO,EAAE,IAAI,CAAC,CAAC;IACzC,CAAC;IAED,KAAK,CAAC,OAAO,CAAC,IAAY;QACxB,MAAM,OAAO,GAAG,UAAU,CAAC,IAAI,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;QACnD,MAAM,QAAQ,GAAG,IAAI,aAAa,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC;QAChD,MAAM,YAAY,GAAG,IAAI,aAAa,CAAC,CAAC,MAAM,CAAC,OAAO,CAAC,MAAM,IAAI,CAAC,CAAC,CAAC,CAAC,CAAC;QAEtE,MAAM,KAAK,GAA2B;YACpC,SAAS,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,QAAQ,EAAE,CAAC,CAAC,EAAE,IAAI,CAAC,WAAW,CAAC,CAAC;YAC/D,aAAa,EAAE,IAAI,MAAM,CAAC,OAAO,EAAE,YAAY,EAAE,CAAC,CAAC,CAAC,CAAC;SACtD,CAAC;QAEF,MAAM,OAAO,GAAG,MAAM,IAAI,CAAC,OAAO,CAAC,GAAG,CAAC,KAAK,CAAC,CAAC;QAC9C,MAAM,OAAO,GAAG,OAAO,CAAC,WAAW,CAAC,IAAqB,CAAC;QAC1D,MAAM,IAAI,GAAG,OAAO,CAAC,YAAY,CAAC,IAAqB,CAAC;QACxD,OAAO,iBAAiB,CAAC,OAAO,EAAE,IAAI,EAAE,OAAO,CAAC,WAAW,CAAC,CAAC;IAC/D,CAAC;CACF"}
|
package/dist/node/jamo.js
CHANGED
|
@@ -13,27 +13,26 @@ const WHITESPACE_REGEX = /\s/;
|
|
|
13
13
|
export const splitTextToJamo = (text) => {
|
|
14
14
|
const tokens = [];
|
|
15
15
|
const mapping = [];
|
|
16
|
-
|
|
17
|
-
let
|
|
18
|
-
while (
|
|
19
|
-
const code =
|
|
20
|
-
const
|
|
21
|
-
const charLen =
|
|
22
|
-
|
|
23
|
-
|
|
16
|
+
let offset = 0;
|
|
17
|
+
let charIndex = 0;
|
|
18
|
+
while (offset < text.length) {
|
|
19
|
+
const code = text.codePointAt(offset);
|
|
20
|
+
const ch = String.fromCodePoint(code);
|
|
21
|
+
const charLen = ch.length;
|
|
22
|
+
offset += charLen;
|
|
23
|
+
if (WHITESPACE_REGEX.test(ch)) {
|
|
24
|
+
charIndex += 1;
|
|
24
25
|
continue;
|
|
25
26
|
}
|
|
26
|
-
|
|
27
|
-
|
|
27
|
+
const normalizedPart = ch.toLocaleLowerCase("und");
|
|
28
|
+
let normalizedOffset = 0;
|
|
29
|
+
while (normalizedOffset < normalizedPart.length) {
|
|
30
|
+
const innerCode = normalizedPart.codePointAt(normalizedOffset);
|
|
28
31
|
const innerChar = String.fromCodePoint(innerCode);
|
|
29
|
-
|
|
30
|
-
if (WHITESPACE_REGEX.test(innerChar)) {
|
|
31
|
-
break;
|
|
32
|
-
}
|
|
32
|
+
normalizedOffset += innerChar.length;
|
|
33
33
|
if (!isHangulSyllable(innerCode)) {
|
|
34
34
|
tokens.push(innerChar);
|
|
35
|
-
mapping.push(
|
|
36
|
-
index += innerLen;
|
|
35
|
+
mapping.push(charIndex);
|
|
37
36
|
continue;
|
|
38
37
|
}
|
|
39
38
|
const syllableIndex = innerCode - S_BASE;
|
|
@@ -41,15 +40,15 @@ export const splitTextToJamo = (text) => {
|
|
|
41
40
|
const v = Math.floor((syllableIndex % N_COUNT) / T_COUNT);
|
|
42
41
|
const t = syllableIndex % T_COUNT;
|
|
43
42
|
tokens.push(String.fromCodePoint(L_BASE + l));
|
|
44
|
-
mapping.push(
|
|
43
|
+
mapping.push(charIndex);
|
|
45
44
|
tokens.push(String.fromCodePoint(V_BASE + v));
|
|
46
|
-
mapping.push(
|
|
45
|
+
mapping.push(charIndex);
|
|
47
46
|
if (t !== 0) {
|
|
48
47
|
tokens.push(String.fromCodePoint(T_BASE + t));
|
|
49
|
-
mapping.push(
|
|
48
|
+
mapping.push(charIndex);
|
|
50
49
|
}
|
|
51
|
-
index += innerLen;
|
|
52
50
|
}
|
|
51
|
+
charIndex += 1;
|
|
53
52
|
}
|
|
54
53
|
return { tokens, originalIndices: mapping };
|
|
55
54
|
};
|
package/dist/node/jamo.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"jamo.js","sourceRoot":"","sources":["../../src/jamo.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,cAAc,GAAG,MAAM,CAAC;AAO9B,MAAM,gBAAgB,GAAG,CAAC,IAAY,EAAE,EAAE,CACxC,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,CAAC;AAE5C,MAAM,gBAAgB,GAAG,IAAI,CAAC;AAE9B,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,IAAY,EAAgB,EAAE;IAC5D,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,MAAM,
|
|
1
|
+
{"version":3,"file":"jamo.js","sourceRoot":"","sources":["../../src/jamo.ts"],"names":[],"mappings":"AAAA,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,MAAM,GAAG,MAAM,CAAC;AACtB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,EAAE,CAAC;AACnB,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,OAAO,GAAG,OAAO,GAAG,OAAO,CAAC;AAClC,MAAM,cAAc,GAAG,MAAM,CAAC;AAO9B,MAAM,gBAAgB,GAAG,CAAC,IAAY,EAAE,EAAE,CACxC,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,CAAC;AAE5C,MAAM,gBAAgB,GAAG,IAAI,CAAC;AAE9B,MAAM,CAAC,MAAM,eAAe,GAAG,CAAC,IAAY,EAAgB,EAAE;IAC5D,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,MAAM,OAAO,GAAa,EAAE,CAAC;IAC7B,IAAI,MAAM,GAAG,CAAC,CAAC;IACf,IAAI,SAAS,GAAG,CAAC,CAAC;IAClB,OAAO,MAAM,GAAG,IAAI,CAAC,MAAM,EAAE,CAAC;QAC5B,MAAM,IAAI,GAAG,IAAI,CAAC,WAAW,CAAC,MAAM,CAAE,CAAC;QACvC,MAAM,EAAE,GAAG,MAAM,CAAC,aAAa,CAAC,IAAI,CAAC,CAAC;QACtC,MAAM,OAAO,GAAG,EAAE,CAAC,MAAM,CAAC;QAC1B,MAAM,IAAI,OAAO,CAAC;QAClB,IAAI,gBAAgB,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,CAAC;YAC9B,SAAS,IAAI,CAAC,CAAC;YACf,SAAS;QACX,CAAC;QACD,MAAM,cAAc,GAAG,EAAE,CAAC,iBAAiB,CAAC,KAAK,CAAC,CAAC;QACnD,IAAI,gBAAgB,GAAG,CAAC,CAAC;QACzB,OAAO,gBAAgB,GAAG,cAAc,CAAC,MAAM,EAAE,CAAC;YAChD,MAAM,SAAS,GAAG,cAAc,CAAC,WAAW,CAAC,gBAAgB,CAAE,CAAC;YAChE,MAAM,SAAS,GAAG,MAAM,CAAC,aAAa,CAAC,SAAS,CAAC,CAAC;YAClD,gBAAgB,IAAI,SAAS,CAAC,MAAM,CAAC;YAErC,IAAI,CAAC,gBAAgB,CAAC,SAAS,CAAC,EAAE,CAAC;gBACjC,MAAM,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACvB,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;gBACxB,SAAS;YACX,CAAC;YAED,MAAM,aAAa,GAAG,SAAS,GAAG,MAAM,CAAC;YACzC,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,aAAa,GAAG,OAAO,CAAC,CAAC;YAC9C,MAAM,CAAC,GAAG,IAAI,CAAC,KAAK,CAAC,CAAC,aAAa,GAAG,OAAO,CAAC,GAAG,OAAO,CAAC,CAAC;YAC1D,MAAM,CAAC,GAAG,aAAa,GAAG,OAAO,CAAC;YAClC,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;YAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YACxB,IAAI,CAAC,KAAK,CAAC,EAAE,CAAC;gBACZ,MAAM,CAAC,IAAI,CAAC,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC;gBAC9C,OAAO,CAAC,IAAI,CAAC,SAAS,CAAC,CAAC;YAC1B,CAAC;QACH,CAAC;QACD,SAAS,IAAI,CAAC,CAAC;IACjB,CAAC;IACD,OAAO,EAAE,MAAM,EAAE,eAAe,EAAE,OAAO,EAAE,CAAC;AAC9C,CAAC,CAAC;AAEF,MAAM,eAAe,GAAG,CAAC,OAAe,EAAE,MAAc,EAAE,KAAa,EAAE,EAAE,CACzE,MAAM,CAAC,aAAa,CAAC,MAAM,GAAG,OAAO,GAAG,OAAO,GAAG,MAAM,GAAG,OAAO,GAAG,KAAK,CAAC,CAAC;AAE9E,MAAM,CAAC,MAAM,cAAc,GAAG,CAAC,MAAgB,EAAU,EAAE;IACzD,MAAM,MAAM,GAAa,EAAE,CAAC;IAC5B,IAAI,OAAO,GAAkB,IAAI,CAAC;IAClC,IAAI,MAAM,GAAkB,IAAI,CAAC;IACjC,IAAI,KAAK,GAAG,CAAC,CAAC;IAEd,MAAM,KAAK,GAAG,GAAG,EAAE;QACjB,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;YACxC,MAAM,CAAC,IAAI,CAAC,eAAe,CAAC,OAAO,EAAE,MAAM,EAAE,KAAK,CAAC,CAAC,CAAC;QACvD,CAAC;QACD,OAAO,GAAG,IAAI,CAAC;QACf,MAAM,GAAG,IAAI,CAAC;QACd,KAAK,GAAG,CAAC,CAAC;IACZ,CAAC,CAAC;IAEF,MAAM,CAAC,OAAO,CAAC,CAAC,KAAK,EAAE,EAAE;QACvB,MAAM,IAAI,GAAG,KAAK,CAAC,WAAW,CAAC,CAAC,CAAE,CAAC;QACnC,IAAI,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,EAAE,CAAC;YAC9C,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACxC,KAAK,EAAE,CAAC;YACV,CAAC;YACD,OAAO,GAAG,IAAI,GAAG,MAAM,CAAC;QAC1B,CAAC;aAAM,IAAI,IAAI,IAAI,MAAM,IAAI,IAAI,GAAG,MAAM,GAAG,OAAO,EAAE,CAAC;YACrD,IAAI,OAAO,KAAK,IAAI;gBAAE,OAAO,GAAG,cAAc,GAAG,MAAM,CAAC;YACxD,IAAI,MAAM,KAAK,IAAI;gBAAE,KAAK,EAAE,CAAC;YAC7B,MAAM,GAAG,IAAI,GAAG,MAAM,CAAC;QACzB,CAAC;aAAM,IAAI,IAAI,GAAG,MAAM,IAAI,IAAI,IAAI,MAAM,GAAG,OAAO,EAAE,CAAC;YACrD,IAAI,OAAO,KAAK,IAAI,IAAI,MAAM,KAAK,IAAI,EAAE,CAAC;gBACxC,KAAK,EAAE,CAAC;gBACR,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;YACrB,CAAC;iBAAM,CAAC;gBACN,KAAK,GAAG,IAAI,GAAG,MAAM,CAAC;gBACtB,KAAK,EAAE,CAAC;YACV,CAAC;QACH,CAAC;aAAM,CAAC;YACN,KAAK,EAAE,CAAC;YACR,MAAM,CAAC,IAAI,CAAC,KAAK,CAAC,CAAC;QACrB,CAAC;IACH,CAAC,CAAC,CAAC;IACH,KAAK,EAAE,CAAC;IACR,OAAO,MAAM,CAAC,IAAI,CAAC,EAAE,CAAC,CAAC;AACzB,CAAC,CAAC"}
|
package/dist/node/tokenizer.js
CHANGED
|
@@ -6,23 +6,26 @@ const decoderTokenToId = new Map(VOCAB.decoder.map((token, idx) => [token, idx])
|
|
|
6
6
|
export const encodeText = (text, maxInputLen) => {
|
|
7
7
|
const jamoSeq = splitTextToJamo(text);
|
|
8
8
|
const tokens = jamoSeq.tokens.length ? jamoSeq.tokens : ["<unk>"];
|
|
9
|
+
const indices = jamoSeq.originalIndices.length ? jamoSeq.originalIndices : [-1];
|
|
9
10
|
const ids = tokens.map((token) => encoderTokenToId.get(token) ?? encoderTokenToId.get("<unk>"));
|
|
10
11
|
const length = Math.min(ids.length, maxInputLen);
|
|
11
12
|
const padded = new Array(maxInputLen).fill(BigInt(encoderTokenToId.get("<pad>")));
|
|
12
13
|
for (let i = 0; i < length; i++) {
|
|
13
14
|
padded[i] = BigInt(ids[i]);
|
|
14
15
|
}
|
|
15
|
-
const positionMap =
|
|
16
|
-
return { ids: padded, length, positionMap: positionMap.length ? positionMap : [
|
|
16
|
+
const positionMap = indices.slice(0, length);
|
|
17
|
+
return { ids: padded, length, positionMap: positionMap.length ? positionMap : [-1] };
|
|
17
18
|
};
|
|
18
19
|
export const decoderIds = {
|
|
19
20
|
pad: decoderTokenToId.get("<pad>"),
|
|
20
21
|
sos: decoderTokenToId.get("<sos>"),
|
|
21
22
|
eos: decoderTokenToId.get("<eos>"),
|
|
23
|
+
unk: decoderTokenToId.get("<unk>"),
|
|
22
24
|
};
|
|
23
25
|
export const decodeIdsToResult = (ids, attnIndices, positionMap) => {
|
|
24
26
|
const phonemes = [];
|
|
25
27
|
const alignments = [];
|
|
28
|
+
let outOfRangeTokenCount = 0;
|
|
26
29
|
for (let i = 0; i < ids.length; i++) {
|
|
27
30
|
const tokenId = Number(ids[i]);
|
|
28
31
|
if (tokenId === decoderIds.eos)
|
|
@@ -31,15 +34,21 @@ export const decodeIdsToResult = (ids, attnIndices, positionMap) => {
|
|
|
31
34
|
continue;
|
|
32
35
|
if (tokenId === decoderIds.sos && phonemes.length === 0)
|
|
33
36
|
continue;
|
|
34
|
-
const phoneme = VOCAB.decoder[tokenId]
|
|
37
|
+
const phoneme = VOCAB.decoder[tokenId];
|
|
38
|
+
if (phoneme === undefined) {
|
|
39
|
+
outOfRangeTokenCount += 1;
|
|
40
|
+
}
|
|
35
41
|
const srcPos = Math.max(0, Math.min(Number(attnIndices[i] ?? 0), positionMap.length > 0 ? positionMap.length - 1 : 0));
|
|
36
|
-
const charIndex = positionMap.length > 0 ? positionMap[srcPos] :
|
|
42
|
+
const charIndex = positionMap.length > 0 ? positionMap[srcPos] : -1;
|
|
37
43
|
alignments.push({
|
|
38
|
-
phoneme,
|
|
44
|
+
phoneme: phoneme ?? VOCAB.decoder[decoderIds.unk],
|
|
39
45
|
phonemeIndex: alignments.length,
|
|
40
46
|
charIndex,
|
|
41
47
|
});
|
|
42
|
-
phonemes.push(phoneme);
|
|
48
|
+
phonemes.push(phoneme ?? VOCAB.decoder[decoderIds.unk]);
|
|
49
|
+
}
|
|
50
|
+
if (outOfRangeTokenCount > 0 && typeof console !== "undefined") {
|
|
51
|
+
console.warn(`[hama-js] decodeIdsToResult saw ${outOfRangeTokenCount} out-of-range decoder ids; mapped to <unk>.`);
|
|
43
52
|
}
|
|
44
53
|
return { ipa: phonemes.join(""), alignments };
|
|
45
54
|
};
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAgB,MAAM,WAAW,CAAC;AAC1D,OAAO,SAAS,MAAM,yBAAyB,CAAC;AAOhD,MAAM,CAAC,MAAM,KAAK,GAAe,SAAuB,CAAC;AAEzD,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAEF,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;
|
|
1
|
+
{"version":3,"file":"tokenizer.js","sourceRoot":"","sources":["../../src/tokenizer.ts"],"names":[],"mappings":"AAAA,OAAO,EAAE,eAAe,EAAgB,MAAM,WAAW,CAAC;AAC1D,OAAO,SAAS,MAAM,yBAAyB,CAAC;AAOhD,MAAM,CAAC,MAAM,KAAK,GAAe,SAAuB,CAAC;AAEzD,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAEF,MAAM,gBAAgB,GAAG,IAAI,GAAG,CAC9B,KAAK,CAAC,OAAO,CAAC,GAAG,CAAC,CAAC,KAAK,EAAE,GAAG,EAAE,EAAE,CAAC,CAAC,KAAK,EAAE,GAAG,CAAU,CAAC,CACzD,CAAC;AAuBF,MAAM,CAAC,MAAM,UAAU,GAAG,CACxB,IAAY,EACZ,WAAmB,EACN,EAAE;IACf,MAAM,OAAO,GAAiB,eAAe,CAAC,IAAI,CAAC,CAAC;IACpD,MAAM,MAAM,GAAG,OAAO,CAAC,MAAM,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,OAAO,CAAC,CAAC;IAClE,MAAM,OAAO,GAAG,OAAO,CAAC,eAAe,CAAC,MAAM,CAAC,CAAC,CAAC,OAAO,CAAC,eAAe,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;IAChF,MAAM,GAAG,GAAG,MAAM,CAAC,GAAG,CACpB,CAAC,KAAK,EAAE,EAAE,CAAC,gBAAgB,CAAC,GAAG,CAAC,KAAK,CAAC,IAAI,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE,CACzE,CAAC;IACF,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CAAC,GAAG,CAAC,MAAM,EAAE,WAAW,CAAC,CAAC;IACjD,MAAM,MAAM,GAAG,IAAI,KAAK,CAAS,WAAW,CAAC,CAAC,IAAI,CAChD,MAAM,CAAC,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE,CAAC,CACvC,CAAC;IACF,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QAChC,MAAM,CAAC,CAAC,CAAC,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;IAC7B,CAAC;IACD,MAAM,WAAW,GAAG,OAAO,CAAC,KAAK,CAAC,CAAC,EAAE,MAAM,CAAC,CAAC;IAC7C,OAAO,EAAE,GAAG,EAAE,MAAM,EAAE,MAAM,EAAE,WAAW,EAAE,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,WAAW,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,EAAE,CAAC;AACvF,CAAC,CAAC;AAEF,MAAM,CAAC,MAAM,UAAU,GAAG;IACxB,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;IACnC,GAAG,EAAE,gBAAgB,CAAC,GAAG,CAAC,OAAO,CAAE;CACpC,CAAC;AAEF,MAAM,CAAC,MAAM,iBAAiB,GAAG,CAC/B,GAA+B,EAC/B,WAAuC,EACvC,WAAqB,EACV,EAAE;IACb,MAAM,QAAQ,GAAa,EAAE,CAAC;IAC9B,MAAM,UAAU,GAAgB,EAAE,CAAC;IACnC,IAAI,oBAAoB,GAAG,CAAC,CAAC;IAE7B,KAAK,IAAI,CAAC,GAAG,CAAC,EAAE,CAAC,GAAG,GAAG,CAAC,MAAM,EAAE,CAAC,EAAE,EAAE,CAAC;QACpC,MAAM,OAAO,GAAG,MAAM,CAAC,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC;QAC/B,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG;YAAE,MAAM;QACtC,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG;YAAE,SAAS;QACzC,IAAI,OAAO,KAAK,UAAU,CAAC,GAAG,IAAI,QAAQ,CAAC,MAAM,KAAK,CAAC;YAAE,SAAS;QAElE,MAAM,OAAO,GAAG,KAAK,CAAC,OAAO,CAAC,OAAO,CAAC,CAAC;QACvC,IAAI,OAAO,KAAK,SAAS,EAAE,CAAC;YAC1B,oBAAoB,IAAI,CAAC,CAAC;QAC5B,CAAC;QACD,MAAM,MAAM,GAAG,IAAI,CAAC,GAAG,CACrB,CAAC,EACD,IAAI,CAAC,GAAG,CACN,MAAM,CAAC,WAAW,CAAC,CAAC,CAAC,IAAI,CAAC,CAAC,EAC3B,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,CAAC,CACpD,CACF,CAAC;QACF,MAAM,SAAS,GAAG,WAAW,CAAC,MAAM,GAAG,CAAC,CAAC,CAAC,CAAC,WAAW,CAAC,MAAM,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC,CAAC;QACpE,UAAU,CAAC,IAAI,CAAC;YACd,OAAO,EAAE,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC;YACjD,YAAY,EAAE,UAAU,CAAC,MAAM;YAC/B,SAAS;SACV,CAAC,CAAC;QACH,QAAQ,CAAC,IAAI,CAAC,OAAO,IAAI,KAAK,CAAC,OAAO,CAAC,UAAU,CAAC,GAAG,CAAC,CAAC,CAAC;IAC1D,CAAC;IAED,IAAI,oBAAoB,GAAG,CAAC,IAAI,OAAO,OAAO,KAAK,WAAW,EAAE,CAAC;QAC/D,OAAO,CAAC,IAAI,CACV,mCAAmC,oBAAoB,6CAA6C,CACrG,CAAC;IACJ,CAAC;IAED,OAAO,EAAE,GAAG,EAAE,QAAQ,CAAC,IAAI,CAAC,EAAE,CAAC,EAAE,UAAU,EAAE,CAAC;AAChD,CAAC,CAAC"}
|
|
@@ -6,6 +6,10 @@ export declare const VOCAB: Vocabulary;
|
|
|
6
6
|
export interface Alignment {
|
|
7
7
|
phoneme: string;
|
|
8
8
|
phonemeIndex: number;
|
|
9
|
+
/**
|
|
10
|
+
* Original input character index for this phoneme alignment.
|
|
11
|
+
* Uses -1 sentinel when the input has no non-whitespace characters.
|
|
12
|
+
*/
|
|
9
13
|
charIndex: number;
|
|
10
14
|
}
|
|
11
15
|
export interface G2PResult {
|
|
@@ -22,5 +26,6 @@ export declare const decoderIds: {
|
|
|
22
26
|
pad: number;
|
|
23
27
|
sos: number;
|
|
24
28
|
eos: number;
|
|
29
|
+
unk: number;
|
|
25
30
|
};
|
|
26
31
|
export declare const decodeIdsToResult: (ids: ArrayLike<number | bigint>, attnIndices: ArrayLike<number | bigint>, positionMap: number[]) => G2PResult;
|