hama-js 1.3.11 → 1.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,7 @@
1
+ import { splitTextToJamo } from "./jamo.js";
1
2
  const DEFAULT_SCAN_OPTIONS = {
2
3
  language: "en",
4
+ spanUnit: "character",
3
5
  maxDistanceRatio: 0.2,
4
6
  minDistance: 0,
5
7
  maxDistance: null,
@@ -20,6 +22,7 @@ const DEFAULT_SCAN_OPTIONS = {
20
22
  };
21
23
  const DEFAULT_REPLACE_OPTIONS = {
22
24
  language: "en",
25
+ spanUnit: "character",
23
26
  maxDistanceRatio: 0.2,
24
27
  minDistance: 0,
25
28
  maxDistance: null,
@@ -56,11 +59,17 @@ export async function pronunciationScanWithModel(model, text, terms, options = {
56
59
  const phoneEncoder = new PhoneEncoder();
57
60
  const qgramEncoder = new QGramEncoder();
58
61
  const tokens = await prepareTokens(text, model, merged, phoneEncoder);
59
- const compiled = await compileVariants(terms, model, merged, phoneEncoder, qgramEncoder);
62
+ const maxInputLen = resolvePredictorMaxInputLen(model);
63
+ const compiled = await compileVariants(terms, model, merged, phoneEncoder, qgramEncoder, maxInputLen);
64
+ const baseStats = emptyScanStats(tokens.length);
65
+ baseStats.rejectedByInputLimit = compiled.rejectedByInputLimit;
60
66
  if (compiled.variants.length === 0) {
61
- return { matches: [], stats: emptyScanStats(tokens.length) };
67
+ return { matches: [], stats: baseStats };
62
68
  }
63
- const { matches, stats } = scanCompiled(text, tokens, compiled, merged, qgramEncoder);
69
+ const { matches, stats } = merged.spanUnit === "character"
70
+ ? await scanCompiledByCharacters(text, tokens, compiled, merged, qgramEncoder, model, phoneEncoder, maxInputLen)
71
+ : scanCompiled(text, tokens, compiled, merged, qgramEncoder);
72
+ stats.rejectedByInputLimit = (stats.rejectedByInputLimit ?? 0) + compiled.rejectedByInputLimit;
64
73
  const resolved = resolveScanMatches(matches, merged.resolveOverlaps);
65
74
  stats.matchesReturned = resolved.length;
66
75
  return { matches: resolved, stats };
@@ -124,6 +133,7 @@ const emptyScanStats = (tokenCount) => ({
124
133
  candidateVariantsVerified: 0,
125
134
  matchesReturned: 0,
126
135
  rejectedByLength: 0,
136
+ rejectedByInputLimit: 0,
127
137
  rejectedByQgram: 0,
128
138
  rejectedByDistance: 0,
129
139
  });
@@ -147,6 +157,7 @@ const normalizeForMatch = (text) => {
147
157
  };
148
158
  const compactSurface = (text) => text.replace(/ /gu, "").replace(/-/gu, "").replace(/'/gu, "");
149
159
  const isWordChar = (ch) => /\p{L}|\p{N}/u.test(ch);
160
/** Counts the code points of `text` that `isWordChar` accepts. */
const characterUnitCount = (text) => {
  let wordChars = 0;
  for (const codePoint of toCodePoints(text)) {
    if (isWordChar(codePoint.ch)) {
      wordChars += 1;
    }
  }
  return wordChars;
};
150
161
  const toCodePoints = (text) => {
151
162
  const result = [];
152
163
  let codeUnitOffset = 0;
@@ -211,10 +222,11 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
211
222
  const cacheKey = normText || token.rawText;
212
223
  let cached = tokenCache.get(cacheKey);
213
224
  if (!cached) {
214
- const phoneTokens = await phonemizeText(normText || token.rawText, model);
225
+ const aligned = await phonemizeTextAligned(normText || token.rawText, model);
215
226
  cached = {
216
- phoneTokens,
217
- phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
227
+ phoneTokens: aligned.phoneTokens,
228
+ phones: aligned.phoneTokens.map((phone) => phoneEncoder.encode(phone)),
229
+ charIndexes: aligned.charIndexes,
218
230
  };
219
231
  tokenCache.set(cacheKey, cached);
220
232
  }
@@ -227,14 +239,42 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
227
239
  endCodeUnit: token.endCodeUnit,
228
240
  phones: [...cached.phones],
229
241
  phoneTokens: [...cached.phoneTokens],
242
+ charPhones: buildTokenCharPhones(token.rawText, normText, cached.phones, cached.phoneTokens, cached.charIndexes),
230
243
  });
231
244
  }
232
245
  return prepared;
233
246
  };
234
- const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder) => {
247
/**
 * Produces one unit per word character of `text` that falls inside a token.
 *
 * Code points rejected by `isWordChar` are skipped, as are word characters
 * that sit before the start of the token currently under the cursor. Scanning
 * stops once the token list is exhausted.
 *
 * @param text Source text the tokens were cut from.
 * @param tokens Tokens carrying `startChar` / `endChar` offsets, in text order.
 * @returns Units with character offsets, code-unit offsets and the index of
 *   the owning token.
 */
const buildCharacterUnits = (text, tokens) => {
  const units = [];
  let cursor = 0;
  for (const { ch, charIndex, codeUnitStart, codeUnitEnd } of toCodePoints(text)) {
    if (!isWordChar(ch)) {
      continue;
    }
    // Advance past every token that ends at or before this character.
    while (cursor < tokens.length && charIndex >= tokens[cursor].endChar) {
      cursor += 1;
    }
    if (cursor >= tokens.length) {
      break;
    }
    const owner = tokens[cursor];
    const insideOwner = !(charIndex < owner.startChar || charIndex >= owner.endChar);
    if (insideOwner) {
      units.push({
        startChar: charIndex,
        endChar: charIndex + 1,
        startCodeUnit: codeUnitStart,
        endCodeUnit: codeUnitEnd,
        tokenIndex: cursor,
      });
    }
  }
  return units;
};
273
+ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder, maxInputLen) => {
235
274
  const variants = [];
236
275
  const byTokenCount = new Map();
237
276
  const indexByTokenCount = new Map();
277
+ let rejectedByInputLimit = 0;
238
278
  let variantId = 0;
239
279
  for (const rawTerm of terms) {
240
280
  const term = coerceTerm(rawTerm);
@@ -244,11 +284,19 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
244
284
  }
245
285
  for (const [surfaceText, aliasText] of surfaces) {
246
286
  const surfaceNorm = normalizeForMatch(surfaceText);
247
- const tokenCount = tokenizeWithOffsets(surfaceText).length || Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);
287
+ const tokenCount = options.spanUnit === "character"
288
+ ? Math.max(1, characterUnitCount(surfaceText))
289
+ : tokenizeWithOffsets(surfaceText).length || Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);
248
290
  const pronunciationInputs = term.pronunciations.length > 0
249
291
  ? term.pronunciations.slice(0, options.maxTermPronunciations)
250
292
  : [null];
251
293
  for (const pronunciationInput of pronunciationInputs) {
294
+ if (pronunciationInput == null &&
295
+ maxInputLen != null &&
296
+ estimatePredictorInputLength(surfaceNorm) > maxInputLen) {
297
+ rejectedByInputLimit += 1;
298
+ continue;
299
+ }
252
300
  const phoneTokens = pronunciationInput == null
253
301
  ? await phonemizeText(surfaceNorm, model)
254
302
  : parseExplicitPronunciation(pronunciationInput);
@@ -289,7 +337,7 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
289
337
  }
290
338
  }
291
339
  }
292
- return { variants, byTokenCount, indexByTokenCount };
340
+ return { variants, byTokenCount, indexByTokenCount, rejectedByInputLimit };
293
341
  };
294
342
  const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
295
343
  const stats = emptyScanStats(tokens.length);
@@ -388,6 +436,187 @@ const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
388
436
  }
389
437
  return { matches: dedupeScanMatches(rawMatches), stats };
390
438
  };
439
/** Adds `amount` to the named scan counter, treating a missing counter as zero. */
const bumpCharScanStat = (stats, key, amount = 1) => {
  stats[key] = (stats[key] ?? 0) + amount;
};

/**
 * Extra distance allowance used only by the approximate prefilter.
 * Concatenated per-character phones merely approximate the true G2P output of
 * a window, so every variant threshold is widened by this slack before a
 * window is discarded without verification.
 */
const approxPhoneSlack = (variant) => Math.max(2, Math.ceil(variant.phoneLen * 0.25));

/**
 * Looks up the approximate phones of every character unit and builds prefix
 * sums over them.
 *
 * @returns `unitPhones[i]` is the phone bucket of unit `i`, or null when the
 *   owning token has no `charPhones` or no bucket at that offset;
 *   `prefixPhoneCounts[i]` / `prefixUnmappable[i]` are the number of phones /
 *   null units among the first `i` units.
 */
const indexCharUnitPhones = (charUnits, tokens) => {
  const unitPhones = charUnits.map((unit) => {
    const token = tokens[unit.tokenIndex];
    if (!token.charPhones) {
      return null;
    }
    return token.charPhones[unit.startChar - token.startChar] ?? null;
  });
  const prefixPhoneCounts = [0];
  const prefixUnmappable = [0];
  for (let idx = 0; idx < charUnits.length; idx += 1) {
    const phones = unitPhones[idx];
    prefixPhoneCounts.push(prefixPhoneCounts[idx] + (phones ? phones.length : 0));
    prefixUnmappable.push(prefixUnmappable[idx] + (phones ? 0 : 1));
  }
  return { unitPhones, prefixPhoneCounts, prefixUnmappable };
};

/**
 * Alignment-based prefilter for a fully mappable window: decides, under
 * slackened thresholds and without calling the predictor, whether any variant
 * in the relevant buckets could plausibly match.
 *
 * @returns `plausible` plus the tallies the caller records when the window is
 *   dropped: `considered` (variants that passed the length check) and the
 *   per-filter rejection counts.
 */
const prefilterCharWindow = (startUnit, endUnit, phoneIndex, relevantCounts, compiled, options, qgramEncoder) => {
  const { unitPhones, prefixPhoneCounts } = phoneIndex;
  const approxLen = prefixPhoneCounts[endUnit] - prefixPhoneCounts[startUnit];
  const outcome = {
    plausible: false,
    considered: 0,
    rejectedByLength: 0,
    rejectedByQgram: 0,
    rejectedByDistance: 0,
  };
  const lengthOkVariants = [];
  for (const count of relevantCounts) {
    for (const variant of compiled.byTokenCount.get(count) ?? []) {
      if (Math.abs(approxLen - variant.phoneLen) > variant.thresholdK + approxPhoneSlack(variant)) {
        outcome.rejectedByLength += 1;
        continue;
      }
      lengthOkVariants.push(variant);
    }
  }
  outcome.considered = lengthOkVariants.length;
  if (lengthOkVariants.length === 0) {
    return outcome;
  }
  const approxPhones = [];
  for (let unit = startUnit; unit < endUnit; unit += 1) {
    approxPhones.push(...unitPhones[unit]);
  }
  const approxQfreq = qgramFrequency(approxPhones, options.qgramSize, qgramEncoder);
  for (const variant of lengthOkVariants) {
    const slackK = variant.thresholdK + approxPhoneSlack(variant);
    const required = requiredOverlap(variant.phoneLen, approxPhones.length, options.qgramSize, slackK);
    if (qgramOverlap(approxQfreq, variant.qgramFreq) < required) {
      outcome.rejectedByQgram += 1;
      continue;
    }
    if (verifyDistance(variant.phones, approxPhones, slackK, options.verifier) == null) {
      outcome.rejectedByDistance += 1;
      continue;
    }
    // One plausible variant is enough to justify a real prediction.
    outcome.plausible = true;
    break;
  }
  return outcome;
};

/**
 * Runs the exact length and q-gram filters for a phonemized window and returns
 * the ids of the variants that still need distance verification, in ascending
 * id order. Updates the length, q-gram, considered and verified counters on
 * `stats` as it goes.
 */
const selectCharWindowCandidates = (window, relevantCounts, compiled, variantById, options, qgramEncoder, stats) => {
  const lengthOkIds = new Set();
  for (const count of relevantCounts) {
    for (const variant of compiled.byTokenCount.get(count) ?? []) {
      if (Math.abs(window.phones.length - variant.phoneLen) > variant.thresholdK) {
        bumpCharScanStat(stats, "rejectedByLength");
        continue;
      }
      lengthOkIds.add(variant.variantId);
    }
  }
  if (lengthOkIds.size === 0) {
    return [];
  }
  bumpCharScanStat(stats, "candidateVariantsConsidered", lengthOkIds.size);
  const windowQfreq = qgramFrequency(window.phones, options.qgramSize, qgramEncoder);
  const candidateOverlap = new Map();
  for (const count of relevantCounts) {
    const postings = compiled.indexByTokenCount.get(count) ?? new Map();
    for (const [qgramId, windowCount] of windowQfreq.entries()) {
      for (const [variantId, termCount] of postings.get(qgramId) ?? []) {
        if (!lengthOkIds.has(variantId)) {
          continue;
        }
        candidateOverlap.set(variantId, (candidateOverlap.get(variantId) ?? 0) + Math.min(windowCount, termCount));
      }
    }
  }
  const verifiedIds = [];
  for (const variantId of [...lengthOkIds].sort((a, b) => a - b)) {
    const variant = variantById.get(variantId);
    const required = requiredOverlap(variant.phoneLen, window.phones.length, options.qgramSize, variant.thresholdK);
    if ((candidateOverlap.get(variantId) ?? 0) < required) {
      bumpCharScanStat(stats, "rejectedByQgram");
      continue;
    }
    verifiedIds.push(variantId);
  }
  if (verifiedIds.length > 0) {
    bumpCharScanStat(stats, "candidateVariantsVerified", verifiedIds.length);
  }
  return verifiedIds;
};

/**
 * Scores a variant against a window whose phoneme distance is already known
 * and builds the match record, or returns null when the score is below
 * `options.minScore`.
 */
const scoreCharWindowMatch = (variant, window, distance, options) => {
  const phonemeSimilarity = similarity(distance, variant.phones.length, window.phones.length);
  const textDistance = levenshteinDistance(variant.surfaceCompact, window.surfaceCompact);
  const textSimilarity = similarity(textDistance, variant.surfaceCompact.length, window.surfaceCompact.length);
  const score = options.scoring === "phoneme"
    ? phonemeSimilarity
    : options.phonemeWeight * phonemeSimilarity + options.textWeight * textSimilarity;
  if (score < options.minScore) {
    return null;
  }
  return {
    termId: variant.termId,
    termText: variant.termText,
    canonical: variant.canonical,
    aliasText: variant.aliasText,
    matchedText: window.matchedText,
    startChar: window.startChar,
    endChar: window.endChar,
    startToken: window.startToken,
    endToken: window.endToken,
    score,
    phonemeDistance: distance,
    phonemeThreshold: variant.thresholdK,
    phonemeSimilarity,
    textDistance,
    textSimilarity,
    termPronunciation: options.returnPhonemes ? [...variant.phoneTokens] : null,
    matchedPronunciation: options.returnPhonemes ? [...window.phoneTokens] : null,
    metadata: variant.metadata,
  };
};

/**
 * Scans `text` with windows measured in character units instead of tokens.
 *
 * For every start unit and every candidate window length the scan:
 *   1. skips windows whose estimated predictor input exceeds `maxInputLen`;
 *   2. when every unit of the window has approximate phones, rejects windows
 *      that no variant can plausibly match, using pure-JS filters;
 *   3. phonemizes the remaining windows through `buildCharacterWindow`;
 *   4. applies the exact length, q-gram and distance filters and scores the
 *      surviving variants.
 *
 * @returns `{ matches, stats }`, where `matches` has been passed through
 *   `dedupeScanMatches` and `stats` holds the counters collected on the way.
 */
const scanCompiledByCharacters = async (text, tokens, compiled, options, qgramEncoder, model, phoneEncoder, maxInputLen) => {
  const stats = emptyScanStats(tokens.length);
  const charUnits = buildCharacterUnits(text, tokens);
  if (compiled.variants.length === 0 || charUnits.length === 0) {
    return { matches: [], stats };
  }
  const tokenCounts = [...compiled.byTokenCount.keys()].sort((a, b) => a - b);
  const variantById = new Map(compiled.variants.map((variant) => [variant.variantId, variant]));
  const lengths = windowLengths(tokenCounts, options);
  const rawMatches = [];
  const windowCache = new Map();
  // Approximate phones per character unit let most candidate windows be
  // rejected before paying for a real G2P inference.
  const phoneIndex = indexCharUnitPhones(charUnits, tokens);
  const { prefixUnmappable } = phoneIndex;
  for (let startUnit = 0; startUnit < charUnits.length; startUnit += 1) {
    for (const windowLength of lengths) {
      const endUnit = startUnit + windowLength;
      if (endUnit > charUnits.length) {
        continue;
      }
      bumpCharScanStat(stats, "windowCount");
      const windowText = text.slice(charUnits[startUnit].startCodeUnit, charUnits[endUnit - 1].endCodeUnit);
      const windowNorm = normalizeForMatch(windowText);
      if (maxInputLen != null &&
        estimatePredictorInputLength(windowNorm || windowText) > maxInputLen) {
        bumpCharScanStat(stats, "rejectedByInputLimit");
        continue;
      }
      const relevantCounts = candidateTokenBuckets(windowLength, tokenCounts, options);
      if (relevantCounts.length === 0) {
        continue;
      }
      // The prefilter only applies when every unit in the window has phones;
      // otherwise the window goes straight to the predictor.
      const windowMappable = prefixUnmappable[endUnit] - prefixUnmappable[startUnit] === 0;
      if (windowMappable) {
        const outcome = prefilterCharWindow(startUnit, endUnit, phoneIndex, relevantCounts, compiled, options, qgramEncoder);
        if (!outcome.plausible) {
          bumpCharScanStat(stats, "candidateVariantsConsidered", outcome.considered);
          bumpCharScanStat(stats, "rejectedByLength", outcome.rejectedByLength);
          bumpCharScanStat(stats, "rejectedByQgram", outcome.rejectedByQgram);
          bumpCharScanStat(stats, "rejectedByDistance", outcome.rejectedByDistance);
          continue;
        }
      }
      const window = await buildCharacterWindow(text, charUnits, startUnit, endUnit, model, phoneEncoder, windowCache, maxInputLen);
      if (!window) {
        bumpCharScanStat(stats, "rejectedByInputLimit");
        continue;
      }
      const verifiedIds = selectCharWindowCandidates(window, relevantCounts, compiled, variantById, options, qgramEncoder, stats);
      for (const variantId of verifiedIds) {
        const variant = variantById.get(variantId);
        const distance = verifyDistance(variant.phones, window.phones, variant.thresholdK, options.verifier);
        if (distance == null) {
          bumpCharScanStat(stats, "rejectedByDistance");
          continue;
        }
        const match = scoreCharWindowMatch(variant, window, distance, options);
        if (match) {
          rawMatches.push(match);
        }
      }
    }
  }
  return { matches: dedupeScanMatches(rawMatches), stats };
};
391
620
  const windowLengths = (termTokenCounts, options) => {
392
621
  if (termTokenCounts.length === 0)
393
622
  return [];
@@ -429,6 +658,62 @@ const buildWindow = (text, tokens, startToken, endToken) => {
429
658
  phoneTokens,
430
659
  };
431
660
  };
661
/**
 * Builds the window covering character units `startUnit` up to (but not
 * including) `endUnit`, phonemizing its text through `phonemizeText`.
 *
 * Results are memoized in `cache` under the normalized window text (or the
 * raw text when normalization yields an empty string). Inputs whose estimated
 * predictor length exceeds `maxInputLen` are cached as null.
 *
 * @returns The window description with fresh copies of the phone arrays, or
 *   null when the window text exceeds the predictor input limit.
 */
const buildCharacterWindow = async (text, charUnits, startUnit, endUnit, model, phoneEncoder, cache, maxInputLen) => {
  const head = charUnits[startUnit];
  const tail = charUnits[endUnit - 1];
  const matchedText = text.slice(head.startCodeUnit, tail.endCodeUnit);
  const surfaceNorm = normalizeForMatch(matchedText);
  const predictorInput = surfaceNorm || matchedText;
  if (!cache.has(predictorInput)) {
    const overLimit = maxInputLen != null && estimatePredictorInputLength(predictorInput) > maxInputLen;
    let fresh = null;
    if (!overLimit) {
      const phoneTokens = await phonemizeText(predictorInput, model);
      fresh = {
        surfaceNorm,
        phoneTokens,
        phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
      };
    }
    cache.set(predictorInput, fresh);
  }
  const entry = cache.get(predictorInput);
  if (!entry) {
    return null;
  }
  const { surfaceNorm: cachedNorm, phones, phoneTokens } = entry;
  return {
    startToken: head.tokenIndex,
    endToken: tail.tokenIndex + 1,
    startChar: head.startChar,
    endChar: tail.endChar,
    matchedText,
    surfaceNorm: cachedNorm,
    surfaceCompact: compactSurface(cachedNorm),
    phones: [...phones],
    phoneTokens: [...phoneTokens],
  };
};
694
/**
 * Resolves the predictor's maximum input length from `model`.
 *
 * Sources are consulted in order and the first usable one wins:
 *   1. `model.getMaxInputLen()` when that method exists,
 *   2. `model.maxInputLen`,
 *   3. `model.options.maxInputLen`.
 *
 * A source is usable when it is a finite number whose truncated value is at
 * least 1. Validating after truncation matters: a fractional value such as
 * 0.5 would otherwise become a limit of 0, and a zero limit makes callers
 * reject every input because the estimated input length is never below 1.
 *
 * @param model Predictor-like object to inspect.
 * @returns The limit as a positive integer, or null when no source is usable.
 */
const resolvePredictorMaxInputLen = (model) => {
  const toLimit = (candidate) => {
    if (typeof candidate !== "number" || !Number.isFinite(candidate)) {
      return null;
    }
    const truncated = Math.trunc(candidate);
    return truncated >= 1 ? truncated : null;
  };
  const fromGetter = typeof model.getMaxInputLen === "function"
    ? toLimit(model.getMaxInputLen())
    : null;
  return fromGetter ?? toLimit(model.maxInputLen) ?? toLimit(model.options?.maxInputLen);
};
713
/**
 * Estimates the predictor input length of `text` as the number of tokens
 * returned by `splitTextToJamo`, never reporting less than 1.
 */
const estimatePredictorInputLength = (text) => {
  const sequence = splitTextToJamo(text);
  if (sequence.tokens.length > 0) {
    return sequence.tokens.length;
  }
  return 1;
};
432
717
  const requiredOverlap = (termLen, windowLen, q, thresholdK) => Math.max(0, Math.max(termLen, windowLen) - q + 1 - thresholdK * q);
433
718
  const verifyDistance = (pattern, text, thresholdK, verifier) => {
434
719
  if (Math.abs(pattern.length - text.length) > thresholdK)
@@ -869,8 +1154,12 @@ const coerceTerm = (term) => {
869
1154
  };
870
1155
  };
871
1156
  const phonemizeText = async (text, model) => {
1157
+ const { phoneTokens } = await phonemizeTextAligned(text, model);
1158
+ return phoneTokens;
1159
+ };
1160
+ const phonemizeTextAligned = async (text, model) => {
872
1161
  const normalized = normalizeForMatch(text);
873
- const fallback = pseudoPhones(normalized);
1162
+ const fallback = pseudoPhonesAligned(normalized);
874
1163
  if (!normalized)
875
1164
  return fallback;
876
1165
  try {
@@ -879,13 +1168,106 @@ const phonemizeText = async (text, model) => {
879
1168
  outputDelimiter: "",
880
1169
  preserveLiterals: "none",
881
1170
  });
882
- const phones = result.alignments.map((alignment) => alignment.phoneme);
883
- return phones.length > 0 ? phones : fallback;
1171
+ // Some languages decode with literal separator tokens between phonemes.
1172
+ // They carry no pronunciation signal, so drop them before matching.
1173
+ const aligned = result.alignments.filter((alignment) => alignment.phoneme.trim() !== "");
1174
+ if (aligned.length === 0)
1175
+ return fallback;
1176
+ return {
1177
+ phoneTokens: aligned.map((alignment) => alignment.phoneme),
1178
+ charIndexes: aligned.map((alignment) => alignment.charIndex),
1179
+ };
884
1180
  }
885
1181
  catch {
886
1182
  return fallback;
887
1183
  }
888
1184
  };
1185
/**
 * Fallback phonemization that keeps the source character index of every
 * pseudo-phone: each code point that is not whitespace, a hyphen or an
 * apostrophe becomes its own phone. When nothing is left, a single "<unk>"
 * phone with index -1 is returned.
 */
const pseudoPhonesAligned = (text) => {
  const isSeparator = (ch) => /\s/u.test(ch) || ch === "-" || ch === "'";
  const kept = Array.from(text, (ch, idx) => ({ ch, idx }))
    .filter((entry) => !isSeparator(entry.ch));
  if (kept.length === 0) {
    return { phoneTokens: ["<unk>"], charIndexes: [-1] };
  }
  return {
    phoneTokens: kept.map((entry) => entry.ch),
    charIndexes: kept.map((entry) => entry.idx),
  };
};
1200
/**
 * Groups a token's encoded phones by the raw character they align to, using
 * the predictor's character indexes into the normalized text.
 *
 * Returns null instead of buckets when per-character slices would be
 * unreliable:
 *   - the raw text is empty;
 *   - every phone token equals its aligned normalized character (the
 *     predictor is naming letters rather than pronouncing the word);
 *   - three or more trailing normalized characters have no aligned phone
 *     (the prediction was probably truncated);
 *   - the raw→normalized mapping is not compositional, i.e. normalizing the
 *     raw characters one by one does not add up to the normalized length.
 *
 * @returns One array of encoded phones per raw character, or null.
 */
const buildTokenCharPhones = (rawText, normText, phones, phoneTokens, charIndexes) => {
  const sourceChars = Array.from(rawText);
  if (sourceChars.length === 0) {
    return null;
  }
  const normalizedChars = Array.from(normText);
  const normalizedLength = normalizedChars.length;

  // Spell-out detection: true only if every phone is literally its source character.
  const namesLettersOnly = () => {
    if (phoneTokens.length === 0) {
      return false;
    }
    for (let position = 0; position < phoneTokens.length; position += 1) {
      const at = charIndexes[position];
      if (at < 0 || at >= normalizedChars.length || phoneTokens[position] !== normalizedChars[at]) {
        return false;
      }
    }
    return true;
  };
  if (namesLettersOnly()) {
    return null;
  }

  // A long unaligned tail would make windows over it look spuriously short.
  const furthestAligned = charIndexes.reduce((best, at) => (at > best ? at : best), -1);
  if (normalizedLength - 1 - furthestAligned >= 3) {
    return null;
  }

  // normOffsets[i] is the normalized offset at which raw character i starts.
  const normOffsets = [0];
  for (const ch of sourceChars) {
    const width = Array.from(normalizeForMatch(ch)).length;
    normOffsets.push(normOffsets[normOffsets.length - 1] + width);
  }
  if (normOffsets[sourceChars.length] !== normalizedLength) {
    return null;
  }

  const lastRaw = sourceChars.length - 1;
  // Maps a normalized character index to the raw character that produced it.
  // Out-of-range indexes clamp to the first or last raw character.
  const rawIndexFor = (normIndex) => {
    if (normIndex < 0) {
      return 0;
    }
    if (normIndex >= normalizedLength) {
      return lastRaw;
    }
    let low = 0;
    let high = lastRaw;
    while (low <= high) {
      const middle = (low + high) >> 1;
      const insideSlot = normOffsets[middle] <= normIndex && normIndex < normOffsets[middle + 1];
      if (insideSlot) {
        return middle;
      }
      if (normIndex < normOffsets[middle]) {
        high = middle - 1;
      }
      else {
        low = middle + 1;
      }
    }
    return lastRaw;
  };

  const buckets = Array.from(sourceChars, () => []);
  for (const [position, phone] of phones.entries()) {
    buckets[rawIndexFor(charIndexes[position] ?? -1)].push(phone);
  }
  return buckets;
};
889
1271
  const pseudoPhones = (text) => {
890
1272
  const compact = compactSurface(text);
891
1273
  return Array.from(compact).filter((ch) => !/\s/u.test(ch)).length > 0
@@ -915,6 +1297,16 @@ const effectiveThreshold = (length, maxDistanceRatio, minDistance, maxDistance,
915
1297
  }
916
1298
  return threshold;
917
1299
  };
1300
/**
 * Sums, over the q-gram ids present in both frequency maps, the smaller of
 * the two counts. Iterates the map with fewer entries and probes the other.
 */
const qgramOverlap = (left, right) => {
  const leftIsSmaller = left.size <= right.size;
  const probe = leftIsSmaller ? left : right;
  const lookup = leftIsSmaller ? right : left;
  let shared = 0;
  probe.forEach((count, qgramId) => {
    const other = lookup.get(qgramId);
    if (other != null) {
      shared += Math.min(count, other);
    }
  });
  return shared;
};
918
1310
  const qgramFrequency = (sequence, q, encoder) => {
919
1311
  const freq = new Map();
920
1312
  if (q <= 0 || sequence.length < q)