hama-js 1.3.11 → 1.3.13
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/browser/browser.js +3 -0
- package/dist/browser/browser.js.map +1 -1
- package/dist/browser/pronunciation.js +404 -12
- package/dist/browser/pronunciation.js.map +1 -1
- package/dist/node/index.js +3 -0
- package/dist/node/index.js.map +1 -1
- package/dist/node/pronunciation.js +404 -12
- package/dist/node/pronunciation.js.map +1 -1
- package/dist/types/browser.d.ts +1 -0
- package/dist/types/index.d.ts +1 -0
- package/dist/types/pronunciation.d.ts +4 -0
- package/package.json +1 -1
|
@@ -1,5 +1,7 @@
|
|
|
1
|
+
import { splitTextToJamo } from "./jamo.js";
|
|
1
2
|
const DEFAULT_SCAN_OPTIONS = {
|
|
2
3
|
language: "en",
|
|
4
|
+
spanUnit: "character",
|
|
3
5
|
maxDistanceRatio: 0.2,
|
|
4
6
|
minDistance: 0,
|
|
5
7
|
maxDistance: null,
|
|
@@ -20,6 +22,7 @@ const DEFAULT_SCAN_OPTIONS = {
|
|
|
20
22
|
};
|
|
21
23
|
const DEFAULT_REPLACE_OPTIONS = {
|
|
22
24
|
language: "en",
|
|
25
|
+
spanUnit: "character",
|
|
23
26
|
maxDistanceRatio: 0.2,
|
|
24
27
|
minDistance: 0,
|
|
25
28
|
maxDistance: null,
|
|
@@ -56,11 +59,17 @@ export async function pronunciationScanWithModel(model, text, terms, options = {
|
|
|
56
59
|
const phoneEncoder = new PhoneEncoder();
|
|
57
60
|
const qgramEncoder = new QGramEncoder();
|
|
58
61
|
const tokens = await prepareTokens(text, model, merged, phoneEncoder);
|
|
59
|
-
const
|
|
62
|
+
const maxInputLen = resolvePredictorMaxInputLen(model);
|
|
63
|
+
const compiled = await compileVariants(terms, model, merged, phoneEncoder, qgramEncoder, maxInputLen);
|
|
64
|
+
const baseStats = emptyScanStats(tokens.length);
|
|
65
|
+
baseStats.rejectedByInputLimit = compiled.rejectedByInputLimit;
|
|
60
66
|
if (compiled.variants.length === 0) {
|
|
61
|
-
return { matches: [], stats:
|
|
67
|
+
return { matches: [], stats: baseStats };
|
|
62
68
|
}
|
|
63
|
-
const { matches, stats } =
|
|
69
|
+
const { matches, stats } = merged.spanUnit === "character"
|
|
70
|
+
? await scanCompiledByCharacters(text, tokens, compiled, merged, qgramEncoder, model, phoneEncoder, maxInputLen)
|
|
71
|
+
: scanCompiled(text, tokens, compiled, merged, qgramEncoder);
|
|
72
|
+
stats.rejectedByInputLimit = (stats.rejectedByInputLimit ?? 0) + compiled.rejectedByInputLimit;
|
|
64
73
|
const resolved = resolveScanMatches(matches, merged.resolveOverlaps);
|
|
65
74
|
stats.matchesReturned = resolved.length;
|
|
66
75
|
return { matches: resolved, stats };
|
|
@@ -124,6 +133,7 @@ const emptyScanStats = (tokenCount) => ({
|
|
|
124
133
|
candidateVariantsVerified: 0,
|
|
125
134
|
matchesReturned: 0,
|
|
126
135
|
rejectedByLength: 0,
|
|
136
|
+
rejectedByInputLimit: 0,
|
|
127
137
|
rejectedByQgram: 0,
|
|
128
138
|
rejectedByDistance: 0,
|
|
129
139
|
});
|
|
@@ -147,6 +157,7 @@ const normalizeForMatch = (text) => {
|
|
|
147
157
|
};
|
|
148
158
|
/** Removes every ASCII space, hyphen and apostrophe from `text`. */
const compactSurface = (text) => text.replace(/[-' ]/gu, "");
|
|
149
159
|
/** True when `ch` contains a Unicode letter (\p{L}) or number (\p{N}). */
const isWordChar = (ch) => /[\p{L}\p{N}]/u.test(ch);
|
|
160
|
+
/** Counts the code points of `text` that are word characters (letters or numbers). */
const characterUnitCount = (text) => {
    let wordChars = 0;
    for (const codePoint of toCodePoints(text)) {
        if (isWordChar(codePoint.ch)) {
            wordChars += 1;
        }
    }
    return wordChars;
};
|
|
150
161
|
const toCodePoints = (text) => {
|
|
151
162
|
const result = [];
|
|
152
163
|
let codeUnitOffset = 0;
|
|
@@ -211,10 +222,11 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
|
|
|
211
222
|
const cacheKey = normText || token.rawText;
|
|
212
223
|
let cached = tokenCache.get(cacheKey);
|
|
213
224
|
if (!cached) {
|
|
214
|
-
const
|
|
225
|
+
const aligned = await phonemizeTextAligned(normText || token.rawText, model);
|
|
215
226
|
cached = {
|
|
216
|
-
phoneTokens,
|
|
217
|
-
phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
|
|
227
|
+
phoneTokens: aligned.phoneTokens,
|
|
228
|
+
phones: aligned.phoneTokens.map((phone) => phoneEncoder.encode(phone)),
|
|
229
|
+
charIndexes: aligned.charIndexes,
|
|
218
230
|
};
|
|
219
231
|
tokenCache.set(cacheKey, cached);
|
|
220
232
|
}
|
|
@@ -227,14 +239,42 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
|
|
|
227
239
|
endCodeUnit: token.endCodeUnit,
|
|
228
240
|
phones: [...cached.phones],
|
|
229
241
|
phoneTokens: [...cached.phoneTokens],
|
|
242
|
+
charPhones: buildTokenCharPhones(token.rawText, normText, cached.phones, cached.phoneTokens, cached.charIndexes),
|
|
230
243
|
});
|
|
231
244
|
}
|
|
232
245
|
return prepared;
|
|
233
246
|
};
|
|
234
|
-
const
|
|
247
|
+
/**
 * Lists every word character of `text` that lies inside one of `tokens`,
 * one unit per code point, in text order.
 *
 * @param text Source text.
 * @param tokens Prepared tokens carrying `startChar` / `endChar` offsets.
 * @returns Units with character offsets, UTF-16 code-unit offsets and the
 *   index of the token that contains the character.
 */
const buildCharacterUnits = (text, tokens) => {
    const collected = [];
    let cursor = 0;
    for (const { ch, charIndex, codeUnitStart, codeUnitEnd } of toCodePoints(text)) {
        if (!isWordChar(ch)) {
            continue;
        }
        // Advance past tokens that end at or before this character.
        while (cursor < tokens.length && charIndex >= tokens[cursor].endChar) {
            cursor += 1;
        }
        if (cursor >= tokens.length) {
            // No token left that could contain this or any later character.
            break;
        }
        const owner = tokens[cursor];
        if (charIndex < owner.startChar || charIndex >= owner.endChar) {
            // Word character that falls in a gap between tokens.
            continue;
        }
        collected.push({
            startChar: charIndex,
            endChar: charIndex + 1,
            startCodeUnit: codeUnitStart,
            endCodeUnit: codeUnitEnd,
            tokenIndex: cursor,
        });
    }
    return collected;
};
|
|
273
|
+
const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder, maxInputLen) => {
|
|
235
274
|
const variants = [];
|
|
236
275
|
const byTokenCount = new Map();
|
|
237
276
|
const indexByTokenCount = new Map();
|
|
277
|
+
let rejectedByInputLimit = 0;
|
|
238
278
|
let variantId = 0;
|
|
239
279
|
for (const rawTerm of terms) {
|
|
240
280
|
const term = coerceTerm(rawTerm);
|
|
@@ -244,11 +284,19 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
|
|
|
244
284
|
}
|
|
245
285
|
for (const [surfaceText, aliasText] of surfaces) {
|
|
246
286
|
const surfaceNorm = normalizeForMatch(surfaceText);
|
|
247
|
-
const tokenCount =
|
|
287
|
+
const tokenCount = options.spanUnit === "character"
|
|
288
|
+
? Math.max(1, characterUnitCount(surfaceText))
|
|
289
|
+
: tokenizeWithOffsets(surfaceText).length || Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);
|
|
248
290
|
const pronunciationInputs = term.pronunciations.length > 0
|
|
249
291
|
? term.pronunciations.slice(0, options.maxTermPronunciations)
|
|
250
292
|
: [null];
|
|
251
293
|
for (const pronunciationInput of pronunciationInputs) {
|
|
294
|
+
if (pronunciationInput == null &&
|
|
295
|
+
maxInputLen != null &&
|
|
296
|
+
estimatePredictorInputLength(surfaceNorm) > maxInputLen) {
|
|
297
|
+
rejectedByInputLimit += 1;
|
|
298
|
+
continue;
|
|
299
|
+
}
|
|
252
300
|
const phoneTokens = pronunciationInput == null
|
|
253
301
|
? await phonemizeText(surfaceNorm, model)
|
|
254
302
|
: parseExplicitPronunciation(pronunciationInput);
|
|
@@ -289,7 +337,7 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
|
|
|
289
337
|
}
|
|
290
338
|
}
|
|
291
339
|
}
|
|
292
|
-
return { variants, byTokenCount, indexByTokenCount };
|
|
340
|
+
return { variants, byTokenCount, indexByTokenCount, rejectedByInputLimit };
|
|
293
341
|
};
|
|
294
342
|
const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
|
|
295
343
|
const stats = emptyScanStats(tokens.length);
|
|
@@ -388,6 +436,187 @@ const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
|
|
|
388
436
|
}
|
|
389
437
|
return { matches: dedupeScanMatches(rawMatches), stats };
|
|
390
438
|
};
|
|
439
|
+
/**
 * Character-span scan: slides windows measured in character units (word
 * characters inside tokens) over `text` and matches each window against the
 * compiled term variants.
 *
 * Per window the pipeline is:
 *   1. reject when the estimated predictor input exceeds `maxInputLen`;
 *   2. when every unit in the window has per-character phones, run a cheap
 *      prefilter on the concatenated approximate phones with slackened
 *      thresholds, and drop the window if no variant is plausible;
 *   3. phonemize the window text (cached) via `buildCharacterWindow`;
 *   4. apply the exact length, q-gram and distance filters, then score.
 *
 * @returns `{ matches, stats }`, where `matches` has been passed through
 *   `dedupeScanMatches`.
 */
const scanCompiledByCharacters = async (text, tokens, compiled, options, qgramEncoder, model, phoneEncoder, maxInputLen) => {
    const stats = emptyScanStats(tokens.length);
    const charUnits = buildCharacterUnits(text, tokens);
    if (compiled.variants.length === 0 || charUnits.length === 0) {
        return { matches: [], stats };
    }
    // Adds `amount` to a stats counter, treating a missing counter as 0.
    const bump = (field, amount = 1) => {
        stats[field] = (stats[field] ?? 0) + amount;
    };
    const variantsFor = (count) => compiled.byTokenCount.get(count) ?? [];
    const tokenCounts = [...compiled.byTokenCount.keys()].sort((a, b) => a - b);
    const variantById = new Map();
    for (const variant of compiled.variants) {
        variantById.set(variant.variantId, variant);
    }
    const lengths = windowLengths(tokenCounts, options);
    const rawMatches = [];
    const windowCache = new Map();
    // Approximate phones per character unit, sliced out of each token's
    // alignment buckets, plus prefix sums over them. They let us reject the
    // vast majority of candidate windows with pure-JS filters before paying
    // for a real G2P inference.
    const unitPhones = [];
    const phoneCountBefore = [0];
    const unmappableBefore = [0];
    for (const unit of charUnits) {
        const owner = tokens[unit.tokenIndex];
        const slice = owner.charPhones
            ? (owner.charPhones[unit.startChar - owner.startChar] ?? null)
            : null;
        unitPhones.push(slice);
        phoneCountBefore.push(phoneCountBefore[phoneCountBefore.length - 1] + (slice ? slice.length : 0));
        unmappableBefore.push(unmappableBefore[unmappableBefore.length - 1] + (slice ? 0 : 1));
    }
    // Concatenated per-character phones only approximate the true G2P output
    // of the window text, so the prefilter widens every variant threshold by
    // this slack before discarding a window without verification.
    const slackFor = (variant) => Math.max(2, Math.ceil(variant.phoneLen * 0.25));
    // Alignment-based prefilter for a fully mappable window. Reports whether
    // some variant plausibly matches and how many variants each approximate
    // filter rejected before the first plausible one was found.
    const prefilter = (startUnit, endUnit, relevantCounts) => {
        const tally = { plausible: false, considered: 0, byLength: 0, byQgram: 0, byDistance: 0 };
        const approxLen = phoneCountBefore[endUnit] - phoneCountBefore[startUnit];
        const survivors = [];
        for (const count of relevantCounts) {
            for (const variant of variantsFor(count)) {
                const slackK = variant.thresholdK + slackFor(variant);
                if (Math.abs(approxLen - variant.phoneLen) > slackK) {
                    tally.byLength += 1;
                }
                else {
                    survivors.push({ variant, slackK });
                }
            }
        }
        tally.considered = survivors.length;
        if (survivors.length === 0) {
            return tally;
        }
        const approxPhones = [];
        for (let unit = startUnit; unit < endUnit; unit += 1) {
            for (const phone of unitPhones[unit]) {
                approxPhones.push(phone);
            }
        }
        const approxQfreq = qgramFrequency(approxPhones, options.qgramSize, qgramEncoder);
        for (const { variant, slackK } of survivors) {
            const required = requiredOverlap(variant.phoneLen, approxPhones.length, options.qgramSize, slackK);
            if (qgramOverlap(approxQfreq, variant.qgramFreq) < required) {
                tally.byQgram += 1;
                continue;
            }
            if (verifyDistance(variant.phones, approxPhones, slackK, options.verifier) == null) {
                tally.byDistance += 1;
                continue;
            }
            tally.plausible = true;
            break;
        }
        return tally;
    };
    for (let startUnit = 0; startUnit < charUnits.length; startUnit += 1) {
        for (const windowLength of lengths) {
            const endUnit = startUnit + windowLength;
            if (endUnit > charUnits.length) {
                continue;
            }
            bump("windowCount");
            const windowText = text.slice(charUnits[startUnit].startCodeUnit, charUnits[endUnit - 1].endCodeUnit);
            const windowNorm = normalizeForMatch(windowText);
            if (maxInputLen != null &&
                estimatePredictorInputLength(windowNorm || windowText) > maxInputLen) {
                bump("rejectedByInputLimit");
                continue;
            }
            const relevantCounts = candidateTokenBuckets(windowLength, tokenCounts, options);
            if (relevantCounts.length === 0) {
                continue;
            }
            // Only windows that plausibly match some variant (under slackened
            // thresholds) are sent to the predictor. Windows containing a
            // unit without per-character phones skip the prefilter entirely.
            const fullyMappable = unmappableBefore[endUnit] - unmappableBefore[startUnit] === 0;
            if (fullyMappable) {
                const verdict = prefilter(startUnit, endUnit, relevantCounts);
                if (!verdict.plausible) {
                    bump("candidateVariantsConsidered", verdict.considered);
                    bump("rejectedByLength", verdict.byLength);
                    bump("rejectedByQgram", verdict.byQgram);
                    bump("rejectedByDistance", verdict.byDistance);
                    continue;
                }
            }
            const window = await buildCharacterWindow(text, charUnits, startUnit, endUnit, model, phoneEncoder, windowCache, maxInputLen);
            if (!window) {
                bump("rejectedByInputLimit");
                continue;
            }
            // Exact length filter on the real window phones.
            const lengthOkIds = new Set();
            for (const count of relevantCounts) {
                for (const variant of variantsFor(count)) {
                    if (Math.abs(window.phones.length - variant.phoneLen) > variant.thresholdK) {
                        bump("rejectedByLength");
                    }
                    else {
                        lengthOkIds.add(variant.variantId);
                    }
                }
            }
            if (lengthOkIds.size === 0) {
                continue;
            }
            bump("candidateVariantsConsidered", lengthOkIds.size);
            // Accumulate q-gram overlap per surviving variant from the postings index.
            const windowQfreq = qgramFrequency(window.phones, options.qgramSize, qgramEncoder);
            const candidateOverlap = new Map();
            for (const count of relevantCounts) {
                const postings = compiled.indexByTokenCount.get(count);
                if (postings == null) {
                    continue;
                }
                for (const [qgramId, windowCount] of windowQfreq.entries()) {
                    for (const [variantId, termCount] of postings.get(qgramId) ?? []) {
                        if (lengthOkIds.has(variantId)) {
                            const soFar = candidateOverlap.get(variantId) ?? 0;
                            candidateOverlap.set(variantId, soFar + Math.min(windowCount, termCount));
                        }
                    }
                }
            }
            const verifiedIds = [];
            for (const variantId of [...lengthOkIds].sort((a, b) => a - b)) {
                const variant = variantById.get(variantId);
                const required = requiredOverlap(variant.phoneLen, window.phones.length, options.qgramSize, variant.thresholdK);
                if ((candidateOverlap.get(variantId) ?? 0) < required) {
                    bump("rejectedByQgram");
                }
                else {
                    verifiedIds.push(variantId);
                }
            }
            if (verifiedIds.length === 0) {
                continue;
            }
            bump("candidateVariantsVerified", verifiedIds.length);
            for (const variantId of verifiedIds) {
                const variant = variantById.get(variantId);
                const distance = verifyDistance(variant.phones, window.phones, variant.thresholdK, options.verifier);
                if (distance == null) {
                    bump("rejectedByDistance");
                    continue;
                }
                const phonemeSimilarity = similarity(distance, variant.phones.length, window.phones.length);
                const textDistance = levenshteinDistance(variant.surfaceCompact, window.surfaceCompact);
                const textSimilarity = similarity(textDistance, variant.surfaceCompact.length, window.surfaceCompact.length);
                let score = phonemeSimilarity;
                if (options.scoring !== "phoneme") {
                    score = options.phonemeWeight * phonemeSimilarity + options.textWeight * textSimilarity;
                }
                if (score < options.minScore) {
                    continue;
                }
                rawMatches.push({
                    termId: variant.termId,
                    termText: variant.termText,
                    canonical: variant.canonical,
                    aliasText: variant.aliasText,
                    matchedText: window.matchedText,
                    startChar: window.startChar,
                    endChar: window.endChar,
                    startToken: window.startToken,
                    endToken: window.endToken,
                    score,
                    phonemeDistance: distance,
                    phonemeThreshold: variant.thresholdK,
                    phonemeSimilarity,
                    textDistance,
                    textSimilarity,
                    termPronunciation: options.returnPhonemes ? [...variant.phoneTokens] : null,
                    matchedPronunciation: options.returnPhonemes ? [...window.phoneTokens] : null,
                    metadata: variant.metadata,
                });
            }
        }
    }
    return { matches: dedupeScanMatches(rawMatches), stats };
};
|
|
391
620
|
const windowLengths = (termTokenCounts, options) => {
|
|
392
621
|
if (termTokenCounts.length === 0)
|
|
393
622
|
return [];
|
|
@@ -429,6 +658,62 @@ const buildWindow = (text, tokens, startToken, endToken) => {
|
|
|
429
658
|
phoneTokens,
|
|
430
659
|
};
|
|
431
660
|
};
|
|
661
|
+
/**
 * Builds the window covering character units `[startUnit, endUnit)` and
 * phonemizes its text, memoizing the result in `cache` by normalized text
 * (or the raw text when normalization yields an empty string).
 *
 * @returns The window descriptor, or `null` when the text's estimated
 *   predictor input length exceeds `maxInputLen`. That verdict is cached too.
 */
const buildCharacterWindow = async (text, charUnits, startUnit, endUnit, model, phoneEncoder, cache, maxInputLen) => {
    const head = charUnits[startUnit];
    const tail = charUnits[endUnit - 1];
    const matchedText = text.slice(head.startCodeUnit, tail.endCodeUnit);
    const surfaceNorm = normalizeForMatch(matchedText);
    const predictorInput = surfaceNorm || matchedText;
    let entry;
    if (cache.has(predictorInput)) {
        entry = cache.get(predictorInput);
    }
    else if (maxInputLen != null && estimatePredictorInputLength(predictorInput) > maxInputLen) {
        // Remember the rejection so identical windows skip the estimate.
        cache.set(predictorInput, null);
        return null;
    }
    else {
        const phoneTokens = await phonemizeText(predictorInput, model);
        entry = {
            surfaceNorm,
            phoneTokens,
            phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
        };
        cache.set(predictorInput, entry);
    }
    if (!entry) {
        return null;
    }
    return {
        startToken: head.tokenIndex,
        endToken: tail.tokenIndex + 1,
        startChar: head.startChar,
        endChar: tail.endChar,
        matchedText,
        surfaceNorm: entry.surfaceNorm,
        surfaceCompact: compactSurface(entry.surfaceNorm),
        // Copies, so callers cannot mutate the cached arrays.
        phones: [...entry.phones],
        phoneTokens: [...entry.phoneTokens],
    };
};
|
|
694
|
+
/**
 * Resolves the predictor's maximum input length from `model`.
 *
 * Sources are tried in order and the first usable one wins:
 *   1. `model.getMaxInputLen()` (when it is a function),
 *   2. `model.maxInputLen`,
 *   3. `model.options.maxInputLen`.
 *
 * A value is usable when it is a finite number whose integer part is at
 * least 1. Fractional values are truncated toward zero.
 *
 * @param model Predictor model object.
 * @returns The limit as a positive integer, or `null` when no source
 *   provides a usable value (callers treat `null` as "no limit").
 */
const resolvePredictorMaxInputLen = (model) => {
    const asLimit = (candidate) => {
        if (typeof candidate !== "number" || !Number.isFinite(candidate)) {
            return null;
        }
        // Validate after truncation: a value such as 0.5 is > 0 but would
        // truncate to 0, a limit that no input could ever satisfy.
        const whole = Math.trunc(candidate);
        return whole >= 1 ? whole : null;
    };
    if (typeof model.getMaxInputLen === "function") {
        const fromGetter = asLimit(model.getMaxInputLen());
        if (fromGetter != null) {
            return fromGetter;
        }
    }
    return asLimit(model.maxInputLen) ?? asLimit(model.options?.maxInputLen);
};
|
|
713
|
+
/**
 * Estimates the predictor input length of `text` as the number of tokens
 * produced by `splitTextToJamo`, never reporting less than 1.
 */
const estimatePredictorInputLength = (text) => {
    const { tokens } = splitTextToJamo(text);
    const tokenTotal = tokens.length;
    if (tokenTotal > 0) {
        return tokenTotal;
    }
    return 1;
};
|
|
432
717
|
/**
 * Minimum number of shared q-grams required between a term and a window:
 * `max(termLen, windowLen) - q + 1 - thresholdK * q`, clamped at 0.
 */
const requiredOverlap = (termLen, windowLen, q, thresholdK) => {
    const longer = Math.max(termLen, windowLen);
    const bound = longer - q + 1 - thresholdK * q;
    return Math.max(0, bound);
};
|
|
433
718
|
const verifyDistance = (pattern, text, thresholdK, verifier) => {
|
|
434
719
|
if (Math.abs(pattern.length - text.length) > thresholdK)
|
|
@@ -869,8 +1154,12 @@ const coerceTerm = (term) => {
|
|
|
869
1154
|
};
|
|
870
1155
|
};
|
|
871
1156
|
/** Phonemizes `text` with `model` and returns only the phone tokens, dropping the alignment. */
const phonemizeText = async (text, model) => (await phonemizeTextAligned(text, model)).phoneTokens;
|
|
1160
|
+
const phonemizeTextAligned = async (text, model) => {
|
|
872
1161
|
const normalized = normalizeForMatch(text);
|
|
873
|
-
const fallback =
|
|
1162
|
+
const fallback = pseudoPhonesAligned(normalized);
|
|
874
1163
|
if (!normalized)
|
|
875
1164
|
return fallback;
|
|
876
1165
|
try {
|
|
@@ -879,13 +1168,106 @@ const phonemizeText = async (text, model) => {
|
|
|
879
1168
|
outputDelimiter: "",
|
|
880
1169
|
preserveLiterals: "none",
|
|
881
1170
|
});
|
|
882
|
-
|
|
883
|
-
|
|
1171
|
+
// Some languages decode with literal separator tokens between phonemes.
|
|
1172
|
+
// They carry no pronunciation signal, so drop them before matching.
|
|
1173
|
+
const aligned = result.alignments.filter((alignment) => alignment.phoneme.trim() !== "");
|
|
1174
|
+
if (aligned.length === 0)
|
|
1175
|
+
return fallback;
|
|
1176
|
+
return {
|
|
1177
|
+
phoneTokens: aligned.map((alignment) => alignment.phoneme),
|
|
1178
|
+
charIndexes: aligned.map((alignment) => alignment.charIndex),
|
|
1179
|
+
};
|
|
884
1180
|
}
|
|
885
1181
|
catch {
|
|
886
1182
|
return fallback;
|
|
887
1183
|
}
|
|
888
1184
|
};
|
|
1185
|
+
/**
 * Mirrors pseudoPhones but keeps the source character index per pseudo-phone.
 *
 * Every code point of `text` that is not whitespace, "-" or "'" becomes one
 * pseudo-phone, paired with its code-point index in `text`. When nothing
 * survives, a single "<unk>" phone with index -1 is returned.
 */
const pseudoPhonesAligned = (text) => {
    const kept = [];
    for (const [position, ch] of Array.from(text).entries()) {
        const isSeparator = ch === "-" || ch === "'" || /\s/u.test(ch);
        if (!isSeparator) {
            kept.push({ ch, position });
        }
    }
    if (kept.length === 0) {
        return { phoneTokens: ["<unk>"], charIndexes: [-1] };
    }
    return {
        phoneTokens: kept.map((entry) => entry.ch),
        charIndexes: kept.map((entry) => entry.position),
    };
};
|
|
1200
|
+
/**
 * Buckets a token's encoded phones by raw character offset using the
 * predictor's normalized-input character alignments.
 *
 * @param rawText Token text as it appears in the source.
 * @param normText Normalized token text.
 * @param phones Encoded phones, parallel to `phoneTokens`.
 * @param phoneTokens Phone strings.
 * @param charIndexes Per-phone character index into the normalized text.
 * @returns One array of encoded phones per raw code point, or `null` when
 *   the raw text is empty, the phones merely spell out the characters, the
 *   prediction looks truncated, or the raw→normalized offset mapping is not
 *   compositional (per-character normalization lengths fail to add up to
 *   the normalized string).
 */
const buildTokenCharPhones = (rawText, normText, phones, phoneTokens, charIndexes) => {
    const rawChars = Array.from(rawText);
    if (rawChars.length === 0) {
        return null;
    }
    const normChars = Array.from(normText);
    const normLen = normChars.length;
    // Spell-out mode: when every phone is literally its source character the
    // predictor is naming letters, not pronouncing the word, so per-character
    // slices would not approximate how a sub-span is actually pronounced.
    let spelledOut = phoneTokens.length > 0;
    for (const [idx, phone] of phoneTokens.entries()) {
        const source = charIndexes[idx];
        if (source < 0 || source >= normLen || phone !== normChars[source]) {
            spelledOut = false;
            break;
        }
    }
    if (spelledOut) {
        return null;
    }
    // Trailing characters with no aligned phones usually mean the prediction
    // was truncated; windows over the tail would look spuriously short.
    let lastAligned = -1;
    for (const source of charIndexes) {
        if (source > lastAligned) {
            lastAligned = source;
        }
    }
    if (normLen - 1 - lastAligned >= 3) {
        return null;
    }
    // cum[i] is the normalized offset at which raw character i starts,
    // assuming each raw character normalizes independently.
    const cum = [0];
    let running = 0;
    for (const ch of rawChars) {
        running += Array.from(normalizeForMatch(ch)).length;
        cum.push(running);
    }
    if (running !== normLen) {
        return null;
    }
    const lastRaw = rawChars.length - 1;
    // Maps a normalized character index to the raw character whose
    // normalized span contains it. Out-of-range indexes clamp to the ends.
    const rawIndexFor = (charIndex) => {
        if (charIndex < 0) {
            return 0;
        }
        if (charIndex >= normLen) {
            return lastRaw;
        }
        let lo = 0;
        let hi = lastRaw;
        while (lo <= hi) {
            const mid = (lo + hi) >> 1;
            if (charIndex < cum[mid]) {
                hi = mid - 1;
            }
            else if (charIndex < cum[mid + 1]) {
                return mid;
            }
            else {
                lo = mid + 1;
            }
        }
        return lastRaw;
    };
    const buckets = rawChars.map(() => []);
    for (const [idx, phone] of phones.entries()) {
        buckets[rawIndexFor(charIndexes[idx] ?? -1)].push(phone);
    }
    return buckets;
};
|
|
889
1271
|
const pseudoPhones = (text) => {
|
|
890
1272
|
const compact = compactSurface(text);
|
|
891
1273
|
return Array.from(compact).filter((ch) => !/\s/u.test(ch)).length > 0
|
|
@@ -915,6 +1297,16 @@ const effectiveThreshold = (length, maxDistanceRatio, minDistance, maxDistance,
|
|
|
915
1297
|
}
|
|
916
1298
|
return threshold;
|
|
917
1299
|
};
|
|
1300
|
+
/**
 * Multiset overlap of two q-gram frequency maps: the sum, over q-grams
 * present in both, of the smaller of the two counts. Iterates whichever
 * map has fewer entries.
 */
const qgramOverlap = (left, right) => {
    const leftIsSmaller = left.size <= right.size;
    const probe = leftIsSmaller ? left : right;
    const lookup = leftIsSmaller ? right : left;
    let shared = 0;
    probe.forEach((count, qgramId) => {
        const counterpart = lookup.get(qgramId);
        if (counterpart != null) {
            shared += Math.min(count, counterpart);
        }
    });
    return shared;
};
|
|
918
1310
|
const qgramFrequency = (sequence, q, encoder) => {
|
|
919
1311
|
const freq = new Map();
|
|
920
1312
|
if (q <= 0 || sequence.length < q)
|