hama-js 1.3.9 → 1.3.11

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,993 @@
/**
 * Baseline option values for a pronunciation scan.
 * `mergeScanOptions` overlays caller-supplied options on top of these.
 */
const DEFAULT_SCAN_OPTIONS = {
  language: "en",

  // Inputs to the per-variant edit-distance threshold (passed to effectiveThreshold).
  maxDistanceRatio: 0.2,
  minDistance: 0,
  maxDistance: null,
  // "phonemes" sizes the threshold by encoded phoneme count; anything else
  // uses the length of the normalized surface text.
  thresholdBasis: "phonemes",

  // "strict" only scans windows whose token count equals a term's token count;
  // otherwise windows may differ from a term by up to `tokenSlack` tokens.
  wordBoundaryMode: "flexible",
  tokenSlack: 1,

  // Size of the q-grams used by the overlap pre-filter.
  qgramSize: 2,
  // Cap on how many explicit pronunciations are compiled per term.
  maxTermPronunciations: 4,
  // "myers", or "auto" (Myers for patterns up to 64 symbols); other values use the banded verifier.
  verifier: "auto",

  // "phoneme" scores by phoneme similarity alone; otherwise the score is
  // phonemeWeight * phonemeSimilarity + textWeight * textSimilarity.
  scoring: "hybrid",
  phonemeWeight: 0.85,
  textWeight: 0.15,
  // Matches scoring below this value are dropped.
  minScore: 0,

  // "all", "per_term_best", or (any other value) greedy best-score non-overlapping selection.
  resolveOverlaps: "best_non_overlapping",
  allowShortFuzzy: false,
  // When true, matches carry copies of the term and window phoneme tokens.
  returnPhonemes: false,
  debug: false,
};
/**
 * Baseline option values for pronunciation-driven replacement.
 * `mergeReplaceOptions` overlays caller-supplied options on top of these.
 * The scan-related keys are forwarded to the underlying scan.
 */
const DEFAULT_REPLACE_OPTIONS = {
  // --- scan behaviour ---
  language: "en",
  maxDistanceRatio: 0.2,
  minDistance: 0,
  maxDistance: null,
  thresholdBasis: "phonemes",
  wordBoundaryMode: "flexible",
  tokenSlack: 1,
  qgramSize: 2,
  maxTermPronunciations: 4,
  verifier: "auto",
  scoring: "hybrid",
  phonemeWeight: 0.85,
  textWeight: 0.15,
  // Replacement uses a non-zero score floor, unlike plain scanning.
  minScore: 0.72,
  allowShortFuzzy: false,
  returnPhonemes: false,
  debug: false,

  // --- replacement behaviour ---
  // "term_text", "alias_text", or (any other value) the match's canonical text.
  replacementSource: "canonical",
  // "match_simple" mirrors the matched text's casing; other values leave the replacement as is.
  caseStrategy: "canonical",
  // "greedy_left_to_right", "error", or (any other value) weighted-interval selection.
  conflictPolicy: "weighted_interval",
  // "keep_best" keeps the top candidate of an ambiguous span; other values discard the whole group.
  ambiguousPolicy: "skip",
  // Two different canonicals on one span are ambiguous when their scores differ by less than this.
  ambiguityMargin: 0.05,
  // Whether "unchanged" patches are reported alongside applied ones.
  includeUnchanged: false,
  // Whether discarded candidates are reported.
  includeDiscarded: true,
  // Whether the raw scan matches are returned with the result.
  keepScanMatches: false,
};
// Apostrophe look-alikes; normalizeForMatch folds each of them to "'".
// (A string is iterated per code point, so each character becomes one Set entry.)
const APOSTROPHE_CHARS = new Set("'\u2019\u2018\u02bc`\u00b4\uff07");
// Hyphen/dash look-alikes; normalizeForMatch folds each of them to "-".
const DASH_CHARS = new Set("-\u2010\u2011\u2012\u2013\u2014\u2015\u2212\ufe63\uff0d");
// Characters that may sit between two word characters without ending a token.
const WORD_JOINERS = new Set(APOSTROPHE_CHARS);
for (const dash of DASH_CHARS) {
  WORD_JOINERS.add(dash);
}
/**
 * Scans `text` for spans whose pronunciation is close to one of `terms`.
 *
 * @param {object} model - Phonemizer handed through to token and term preparation.
 * @param {string} text - Text to scan.
 * @param {Array} terms - Terms (strings or term objects) to look for.
 * @param {object} [options] - Overrides for the default scan options.
 * @returns {Promise<{matches: Array, stats: object}>} Resolved matches plus scan counters.
 */
export async function pronunciationScanWithModel(model, text, terms, options = {}) {
  const settings = mergeScanOptions(options);

  // Nothing to scan, or nothing to scan for.
  if (!text || terms.length === 0) {
    return { matches: [], stats: emptyScanStats(0) };
  }

  // One encoder pair is shared by the text tokens and the term variants.
  const phoneEncoder = new PhoneEncoder();
  const qgramEncoder = new QGramEncoder();

  const tokens = await prepareTokens(text, model, settings, phoneEncoder);
  const compiled = await compileVariants(terms, model, settings, phoneEncoder, qgramEncoder);
  if (compiled.variants.length === 0) {
    return { matches: [], stats: emptyScanStats(tokens.length) };
  }

  const scan = scanCompiled(text, tokens, compiled, settings, qgramEncoder);
  const resolved = resolveScanMatches(scan.matches, settings.resolveOverlaps);
  scan.stats.matchesReturned = resolved.length;
  return { matches: resolved, stats: scan.stats };
}
/**
 * Scans `text` for `terms` and rewrites the accepted matches.
 *
 * Pipeline: scan (keeping every overlap) -> patch candidates -> drop duplicates
 * -> resolve same-span ambiguity -> pick a non-overlapping set -> apply.
 *
 * @param {object} model - Phonemizer handed through to the scan.
 * @param {string} text - Text to rewrite.
 * @param {Array} terms - Terms (strings or term objects) to look for.
 * @param {object} [options] - Overrides for the default replace options.
 * @returns {Promise<object>} Original and rewritten text, applied/discarded patches, and stats.
 */
export async function pronunciationReplaceWithModel(model, text, terms, options = {}) {
  const settings = mergeReplaceOptions(options);

  // Overlap handling happens below, so the scan itself must return everything.
  const scanOptions = { ...settings, resolveOverlaps: "all" };
  const rawScan = await pronunciationScanWithModel(model, text, terms, scanOptions);
  const rawMatches = [...rawScan.matches];

  const candidates = convertMatchesToPatchCandidates(rawMatches, text, settings);
  const dedupeResult = dedupePatchCandidates(candidates);
  const ambiguityResult = markAmbiguous(dedupeResult.survivors, settings);
  const selection = selectNonOverlapping(ambiguityResult.survivors, settings);
  const patched = applyPatches(text, selection.selected);

  const duplicateDiscarded = dedupeResult.discarded;
  const ambiguousDiscarded = ambiguityResult.discarded;
  const overlapDiscarded = selection.discarded;
  const appliedSelected = patched.patches;

  const applied = appliedSelected.filter((patch) => {
    if (patch.status === "applied") {
      return true;
    }
    return patch.status === "unchanged" && settings.includeUnchanged;
  });

  let discarded = [];
  if (settings.includeDiscarded) {
    discarded = [...duplicateDiscarded, ...ambiguousDiscarded, ...overlapDiscarded].sort(
      (left, right) =>
        left.startChar - right.startChar ||
        left.endChar - right.endChar ||
        right.score - left.score,
    );
  }

  // Kept patches sort ahead of discarded ones that start at the same position.
  const statusPriority = (patch) =>
    patch.status === "applied" || patch.status === "unchanged" ? 0 : 1;
  const patches = [...applied, ...discarded].sort(
    (left, right) =>
      left.startChar - right.startChar ||
      statusPriority(left) - statusPriority(right) ||
      right.score - left.score,
  );

  const countWithStatus = (status) =>
    appliedSelected.filter((patch) => patch.status === status).length;

  return {
    originalText: text,
    text: patched.text,
    applied,
    discarded,
    patches,
    stats: {
      ...rawScan.stats,
      rawMatches: rawMatches.length,
      dedupedMatches: dedupeResult.survivors.length,
      ambiguousDiscarded: ambiguousDiscarded.length,
      overlapDiscarded: overlapDiscarded.length,
      duplicateDiscarded: duplicateDiscarded.length,
      appliedCount: countWithStatus("applied"),
      unchangedCount: countWithStatus("unchanged"),
    },
    rawMatches: settings.keepScanMatches ? rawMatches : null,
  };
}
/**
 * Overlays caller options on the scan defaults.
 *
 * Options whose value is `undefined` are ignored so they cannot erase a
 * default (for example `{ tokenSlack: undefined }` would otherwise turn the
 * window bounds into NaN). `null` is still honoured because it is a
 * meaningful value for some options (e.g. `maxDistance`).
 *
 * @param {object} [options] - Partial scan options.
 * @returns {object} A fresh object holding every default plus the provided overrides.
 */
export const mergeScanOptions = (options = {}) => {
  const provided = Object.fromEntries(
    Object.entries(options ?? {}).filter(([, value]) => value !== undefined),
  );
  return {
    ...DEFAULT_SCAN_OPTIONS,
    ...provided,
  };
};
/**
 * Overlays caller options on the replace defaults.
 *
 * Options whose value is `undefined` are ignored so they cannot erase a
 * default (for example `{ minScore: undefined }` would otherwise disable the
 * replacement score floor, since `score < undefined` is always false).
 * `null` is still honoured because it is a meaningful value for some options.
 *
 * @param {object} [options] - Partial replace options.
 * @returns {object} A fresh object holding every default plus the provided overrides.
 */
export const mergeReplaceOptions = (options = {}) => {
  const provided = Object.fromEntries(
    Object.entries(options ?? {}).filter(([, value]) => value !== undefined),
  );
  return {
    ...DEFAULT_REPLACE_OPTIONS,
    ...provided,
  };
};
/**
 * Creates a scan-statistics record with every counter at zero.
 *
 * @param {number} tokenCount - Number of text tokens the scan covers.
 * @returns {object} Fresh stats object.
 */
const emptyScanStats = (tokenCount) => {
  const stats = {
    tokenCount,
    windowCount: 0,
    candidateVariantsConsidered: 0,
    candidateVariantsVerified: 0,
    matchesReturned: 0,
    rejectedByLength: 0,
    rejectedByQgram: 0,
    rejectedByDistance: 0,
  };
  return stats;
};
/**
 * Canonicalizes text for comparison: NFKC, apostrophe/dash look-alikes folded
 * to "'" and "-", lower-cased, diacritics removed, whitespace collapsed and trimmed.
 *
 * @param {string} text - Raw text.
 * @returns {string} Normalized text.
 */
const normalizeForMatch = (text) => {
  // Fold punctuation look-alikes, one code point at a time.
  let unified = "";
  for (const ch of text.normalize("NFKC")) {
    if (APOSTROPHE_CHARS.has(ch)) {
      unified += "'";
    } else if (DASH_CHARS.has(ch)) {
      unified += "-";
    } else {
      unified += ch;
    }
  }

  // Decompose so accents become separate combining marks, then drop the marks.
  const decomposed = unified.toLocaleLowerCase("und").normalize("NFKD");
  let unaccented = "";
  for (const ch of decomposed) {
    if (!/\p{M}/u.test(ch)) {
      unaccented += ch;
    }
  }

  return unaccented.replace(/\s+/gu, " ").trim();
};
/** Removes every space, hyphen and apostrophe from already-normalized text. */
const compactSurface = (text) => text.replace(/[ '-]/gu, "");
/** True when `ch` contains a Unicode letter or number. */
const isWordChar = (ch) => /[\p{L}\p{N}]/u.test(ch);
/**
 * Splits text into code points, recording for each its code-point index and
 * its UTF-16 code-unit range within the original string.
 *
 * @param {string} text - Text to split.
 * @returns {Array<{ch: string, charIndex: number, codeUnitStart: number, codeUnitEnd: number}>}
 */
const toCodePoints = (text) => {
  const entries = [];
  let offset = 0;
  // String iteration yields whole code points (surrogate pairs stay together).
  for (const ch of text) {
    const next = offset + ch.length;
    entries.push({
      ch,
      charIndex: entries.length,
      codeUnitStart: offset,
      codeUnitEnd: next,
    });
    offset = next;
  }
  return entries;
};
/**
 * Splits text into word tokens and reports each token's position both in code
 * points (`startChar`/`endChar`) and in UTF-16 code units
 * (`startCodeUnit`/`endCodeUnit`); end offsets are exclusive.
 *
 * A token starts at a letter or number and continues through letters,
 * numbers, combining marks, and single joiners (apostrophes/dashes) that sit
 * directly between two word characters.
 *
 * Combining marks are kept inside the token: otherwise decomposed (NFD) text
 * such as "e" + U+0301, and scripts whose vowel signs are marks, would be cut
 * apart in the middle of a word. A mark never starts a token on its own.
 *
 * @param {string} text - Text to tokenize.
 * @returns {Array<{rawText: string, startChar: number, endChar: number, startCodeUnit: number, endCodeUnit: number}>}
 */
const tokenizeWithOffsets = (text) => {
  const combiningMark = /\p{M}/u;
  const isMark = (ch) => combiningMark.test(ch);
  const codePoints = toCodePoints(text);
  const tokens = [];
  let idx = 0;
  while (idx < codePoints.length) {
    if (!isWordChar(codePoints[idx].ch)) {
      idx += 1;
      continue;
    }
    const start = idx;
    idx += 1;
    while (idx < codePoints.length) {
      const { ch } = codePoints[idx];
      if (isWordChar(ch) || isMark(ch)) {
        idx += 1;
        continue;
      }
      // A joiner is only part of the word when flanked by word content:
      // the previous character may be a word character or a mark attached
      // to one, and the next character must start more word content.
      const previous = codePoints[idx - 1].ch;
      const following = codePoints[idx + 1]?.ch;
      if (WORD_JOINERS.has(ch) &&
        following !== undefined &&
        (isWordChar(previous) || isMark(previous)) &&
        isWordChar(following)) {
        idx += 1;
        continue;
      }
      break;
    }
    const first = codePoints[start];
    const last = codePoints[idx - 1];
    tokens.push({
      rawText: text.slice(first.codeUnitStart, last.codeUnitEnd),
      startChar: first.charIndex,
      endChar: last.charIndex + 1,
      startCodeUnit: first.codeUnitStart,
      endCodeUnit: last.codeUnitEnd,
    });
  }
  return tokens;
};
/**
 * Tokenizes `text` and attaches phonemes (raw and encoded) to every token.
 * Tokens that share the same normalized text are phonemized only once.
 *
 * @param {string} text - Text to tokenize.
 * @param {object} model - Phonemizer passed to phonemizeText.
 * @param {object} options - Scan options (not read here).
 * @param {object} phoneEncoder - Encoder whose `encode` maps a phoneme to its id.
 * @returns {Promise<Array<object>>} Tokens with offsets, `normText`, `phones`, `phoneTokens`.
 */
const prepareTokens = async (text, model, options, phoneEncoder) => {
  const phonemeCache = new Map();
  const result = [];
  for (const { rawText, startChar, endChar, startCodeUnit, endCodeUnit } of tokenizeWithOffsets(text)) {
    const normText = normalizeForMatch(rawText);
    // Fall back to the raw text when normalization leaves nothing.
    const lookupKey = normText || rawText;
    let entry = phonemeCache.get(lookupKey);
    if (!entry) {
      const phoneTokens = await phonemizeText(lookupKey, model);
      const phones = phoneTokens.map((phone) => phoneEncoder.encode(phone));
      entry = { phoneTokens, phones };
      phonemeCache.set(lookupKey, entry);
    }
    // Copy the cached arrays so tokens never share mutable state.
    result.push({
      rawText,
      normText,
      startChar,
      endChar,
      startCodeUnit,
      endCodeUnit,
      phones: [...entry.phones],
      phoneTokens: [...entry.phoneTokens],
    });
  }
  return result;
};
/**
 * Expands every term into searchable variants: one per (surface, pronunciation)
 * pair, where the surfaces are the term text plus each alias.
 *
 * @param {Array} terms - Raw terms (strings or term objects).
 * @param {object} model - Phonemizer used when a term has no explicit pronunciation.
 * @param {object} options - Scan options (threshold, q-gram and pronunciation settings).
 * @param {object} phoneEncoder - Encoder whose `encode` maps a phoneme to its id.
 * @param {object} qgramEncoder - Encoder passed to qgramFrequency.
 * @returns {Promise<{variants: Array, byTokenCount: Map, indexByTokenCount: Map}>}
 *   All variants, variants grouped by token count, and per-token-count q-gram
 *   posting lists of `[variantId, count]` pairs.
 */
const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder) => {
  const variants = [];
  const byTokenCount = new Map();
  const indexByTokenCount = new Map();

  for (const rawTerm of terms) {
    const term = coerceTerm(rawTerm);

    // [surface text, alias text]; the term's own text carries a null alias.
    const surfaces = [[term.text, null]];
    for (const alias of term.aliases) {
      surfaces.push([alias, alias]);
    }

    // Explicit pronunciations (capped) or a single "derive from the surface" marker.
    const hasExplicit = term.pronunciations.length > 0;
    const pronunciationInputs = hasExplicit
      ? term.pronunciations.slice(0, options.maxTermPronunciations)
      : [null];

    for (const [surfaceText, aliasText] of surfaces) {
      const surfaceNorm = normalizeForMatch(surfaceText);
      const tokenCount =
        tokenizeWithOffsets(surfaceText).length ||
        Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);

      for (const pronunciationInput of pronunciationInputs) {
        const derived = pronunciationInput == null;
        const phoneTokens = derived
          ? await phonemizeText(surfaceNorm, model)
          : parseExplicitPronunciation(pronunciationInput);
        const phones = phoneTokens.map((phone) => phoneEncoder.encode(phone));

        const basisLength = options.thresholdBasis === "phonemes" ? phones.length : surfaceNorm.length;
        const thresholdK = effectiveThreshold(
          basisLength,
          options.maxDistanceRatio,
          options.minDistance,
          options.maxDistance,
          options.allowShortFuzzy,
        );
        const qgramFreq = qgramFrequency(phones, options.qgramSize, qgramEncoder);

        // Ids are assigned in creation order, so the next id is the current count.
        const variantId = variants.length;
        const variant = {
          variantId,
          termId: term.id ?? null,
          termText: term.text,
          canonical: term.canonical,
          aliasText,
          metadata: term.metadata ?? null,
          tokenCount,
          surfaceNorm,
          surfaceCompact: compactSurface(surfaceNorm),
          phones,
          phoneTokens,
          phoneLen: phones.length,
          thresholdK,
          qgramFreq,
          pronunciationValue: derived ? [...phoneTokens] : pronunciationInput,
        };
        variants.push(variant);

        let bucket = byTokenCount.get(tokenCount);
        if (!bucket) {
          bucket = [];
          byTokenCount.set(tokenCount, bucket);
        }
        bucket.push(variant);

        let postings = indexByTokenCount.get(tokenCount);
        if (!postings) {
          postings = new Map();
          indexByTokenCount.set(tokenCount, postings);
        }
        for (const [qgramId, qgramCount] of qgramFreq.entries()) {
          let posting = postings.get(qgramId);
          if (!posting) {
            posting = [];
            postings.set(qgramId, posting);
          }
          posting.push([variantId, qgramCount]);
        }
      }
    }
  }

  return { variants, byTokenCount, indexByTokenCount };
};
/**
 * Slides token windows over the text and matches each against the compiled
 * variants through three successively more expensive filters: phoneme-length
 * difference, q-gram overlap, and bounded edit distance.
 *
 * @param {string} text - Original text.
 * @param {Array} tokens - Prepared tokens for `text`.
 * @param {{variants: Array, byTokenCount: Map, indexByTokenCount: Map}} compiled - Compiled variants.
 * @param {object} options - Merged scan options.
 * @param {object} qgramEncoder - Encoder passed to qgramFrequency.
 * @returns {{matches: Array, stats: object}} De-duplicated matches and scan counters.
 */
const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
  const stats = emptyScanStats(tokens.length);
  if (compiled.variants.length === 0 || tokens.length === 0) {
    return { matches: [], stats };
  }

  // Adds to a counter, treating a missing counter as zero.
  const bump = (field, amount = 1) => {
    stats[field] = (stats[field] ?? 0) + amount;
  };

  const tokenCounts = [...compiled.byTokenCount.keys()].sort((a, b) => a - b);
  const variantById = new Map();
  for (const variant of compiled.variants) {
    variantById.set(variant.variantId, variant);
  }
  const lengths = windowLengths(tokenCounts, options);
  const found = [];

  for (let startToken = 0; startToken < tokens.length; startToken += 1) {
    for (const windowLength of lengths) {
      const endToken = startToken + windowLength;
      if (endToken > tokens.length) {
        continue;
      }
      bump("windowCount");
      const window = buildWindow(text, tokens, startToken, endToken);
      const buckets = candidateTokenBuckets(windowLength, tokenCounts, options);
      if (buckets.length === 0) {
        continue;
      }

      // Filter 1: phoneme counts may differ by at most the variant's threshold.
      const lengthOkIds = new Set();
      for (const count of buckets) {
        for (const variant of compiled.byTokenCount.get(count) ?? []) {
          const lengthGap = Math.abs(window.phones.length - variant.phoneLen);
          if (lengthGap > variant.thresholdK) {
            bump("rejectedByLength");
            continue;
          }
          lengthOkIds.add(variant.variantId);
        }
      }
      if (lengthOkIds.size === 0) {
        continue;
      }
      bump("candidateVariantsConsidered", lengthOkIds.size);

      // Filter 2: count q-grams shared between the window and each surviving variant.
      const windowQfreq = qgramFrequency(window.phones, options.qgramSize, qgramEncoder);
      const sharedById = new Map();
      for (const count of buckets) {
        const postings = compiled.indexByTokenCount.get(count) ?? new Map();
        for (const [qgramId, windowCount] of windowQfreq.entries()) {
          for (const [variantId, termCount] of postings.get(qgramId) ?? []) {
            if (!lengthOkIds.has(variantId)) {
              continue;
            }
            const soFar = sharedById.get(variantId) ?? 0;
            sharedById.set(variantId, soFar + Math.min(windowCount, termCount));
          }
        }
      }
      const verified = [];
      for (const variantId of [...lengthOkIds].sort((a, b) => a - b)) {
        const variant = variantById.get(variantId);
        const needed = requiredOverlap(variant.phoneLen, window.phones.length, options.qgramSize, variant.thresholdK);
        const shared = sharedById.get(variantId) ?? 0;
        if (shared < needed) {
          bump("rejectedByQgram");
          continue;
        }
        verified.push(variant);
      }
      if (verified.length === 0) {
        continue;
      }
      bump("candidateVariantsVerified", verified.length);

      // Filter 3: bounded edit distance, then scoring.
      for (const variant of verified) {
        const distance = verifyDistance(variant.phones, window.phones, variant.thresholdK, options.verifier);
        if (distance == null) {
          bump("rejectedByDistance");
          continue;
        }
        const phonemeSimilarity = similarity(distance, variant.phones.length, window.phones.length);
        const textDistance = levenshteinDistance(variant.surfaceCompact, window.surfaceCompact);
        const textSimilarity = similarity(textDistance, variant.surfaceCompact.length, window.surfaceCompact.length);
        let score;
        if (options.scoring === "phoneme") {
          score = phonemeSimilarity;
        } else {
          score = options.phonemeWeight * phonemeSimilarity + options.textWeight * textSimilarity;
        }
        if (score < options.minScore) {
          continue;
        }
        found.push({
          termId: variant.termId,
          termText: variant.termText,
          canonical: variant.canonical,
          aliasText: variant.aliasText,
          matchedText: window.matchedText,
          startChar: window.startChar,
          endChar: window.endChar,
          startToken: window.startToken,
          endToken: window.endToken,
          score,
          phonemeDistance: distance,
          phonemeThreshold: variant.thresholdK,
          phonemeSimilarity,
          textDistance,
          textSimilarity,
          termPronunciation: options.returnPhonemes ? [...variant.phoneTokens] : null,
          matchedPronunciation: options.returnPhonemes ? [...window.phoneTokens] : null,
          metadata: variant.metadata,
        });
      }
    }
  }

  return { matches: dedupeScanMatches(found), stats };
};
/**
 * Lists the window sizes (in tokens) worth scanning for the given term sizes.
 *
 * In "strict" mode these are exactly the distinct positive term token counts;
 * otherwise every size from `min - tokenSlack` (at least 1) to `max + tokenSlack`.
 *
 * @param {number[]} termTokenCounts - Token counts of the compiled terms.
 * @param {object} options - Reads `wordBoundaryMode` and `tokenSlack`.
 * @returns {number[]} Ascending window sizes.
 */
const windowLengths = (termTokenCounts, options) => {
  if (termTokenCounts.length === 0) {
    return [];
  }
  if (options.wordBoundaryMode === "strict") {
    const positive = termTokenCounts.filter((count) => count > 0);
    return Array.from(new Set(positive)).sort((a, b) => a - b);
  }
  const { tokenSlack } = options;
  const shortest = Math.max(1, Math.min(...termTokenCounts) - tokenSlack);
  const longest = Math.max(...termTokenCounts) + tokenSlack;
  const sizes = [];
  let size = shortest;
  while (size <= longest) {
    sizes.push(size);
    size += 1;
  }
  return sizes;
};
/**
 * Picks which term token counts a window of `windowLength` tokens may match.
 *
 * @param {number} windowLength - Window size in tokens.
 * @param {number[]} termTokenCounts - Token counts present among the compiled terms.
 * @param {object} options - Reads `wordBoundaryMode` and `tokenSlack`.
 * @returns {number[]} Matching token counts (only an exact match in "strict" mode).
 */
const candidateTokenBuckets = (windowLength, termTokenCounts, options) => {
  if (options.wordBoundaryMode !== "strict") {
    return termTokenCounts.filter(
      (count) => Math.abs(count - windowLength) <= options.tokenSlack,
    );
  }
  if (termTokenCounts.includes(windowLength)) {
    return [windowLength];
  }
  return [];
};
/**
 * Assembles the window covering tokens `[startToken, endToken)`: its character
 * span, the matching slice of the original text, its normalized/compacted
 * surface, and the concatenated phonemes of its tokens.
 *
 * @param {string} text - Original text.
 * @param {Array} tokens - Prepared tokens.
 * @param {number} startToken - Index of the first token (inclusive).
 * @param {number} endToken - Index past the last token (exclusive).
 * @returns {object} Window description.
 */
const buildWindow = (text, tokens, startToken, endToken) => {
  const span = tokens.slice(startToken, endToken);
  const first = span[0];
  const last = span[span.length - 1];
  // Tokens whose normalized text is empty contribute nothing to the surface.
  const surfaceNorm = span
    .map((token) => token.normText)
    .filter(Boolean)
    .join(" ");
  return {
    startToken,
    endToken,
    startChar: first.startChar,
    endChar: last.endChar,
    matchedText: text.slice(first.startCodeUnit, last.endCodeUnit),
    surfaceNorm,
    surfaceCompact: compactSurface(surfaceNorm),
    phones: span.flatMap((token) => token.phones),
    phoneTokens: span.flatMap((token) => token.phoneTokens),
  };
};
/**
 * Minimum number of shared q-grams two sequences must have to be within
 * `thresholdK` edits of each other: max(len) - q + 1 - k * q, floored at zero.
 */
const requiredOverlap = (termLen, windowLen, q, thresholdK) => {
  const longer = Math.max(termLen, windowLen);
  const bound = longer - q + 1 - thresholdK * q;
  return Math.max(0, bound);
};
/**
 * Computes the edit distance between `pattern` and `text` if it is at most
 * `thresholdK`, using the verifier selected by `verifier`.
 *
 * @param {Array} pattern - Encoded term phonemes.
 * @param {Array} text - Encoded window phonemes.
 * @param {number} thresholdK - Maximum accepted distance.
 * @param {string} verifier - "myers", "auto" (Myers for patterns up to 64 symbols), or anything else for the banded verifier.
 * @returns {number|null} The distance, or null when it exceeds the threshold.
 */
const verifyDistance = (pattern, text, thresholdK, verifier) => {
  // The length difference alone is a lower bound on the distance.
  const lengthGap = Math.abs(pattern.length - text.length);
  if (lengthGap > thresholdK) {
    return null;
  }
  const useMyers = verifier === "myers" || (verifier === "auto" && pattern.length <= 64);
  if (!useMyers) {
    return ukkonenDistance(pattern, text, thresholdK);
  }
  const distance = myersDistance(pattern, text);
  return distance <= thresholdK ? distance : null;
};
/**
 * Edit distance between two symbol sequences using a bit-parallel (bit-vector)
 * dynamic program. BigInt bit-vectors are used, one bit per pattern position.
 *
 * @param {Array} pattern - First sequence (indexed by position).
 * @param {Iterable} text - Second sequence (iterated symbol by symbol).
 * @returns {number} Edit distance.
 */
const myersDistance = (pattern, text) => {
  const patternLen = pattern.length;
  if (patternLen === 0) {
    return text.length;
  }
  if (text.length === 0) {
    return patternLen;
  }

  // For each symbol, a bitmask of the pattern positions where it occurs.
  const positionsBySymbol = new Map();
  pattern.forEach;
  for (let pos = 0; pos < patternLen; pos += 1) {
    const symbol = pattern[pos];
    const bit = 1n << BigInt(pos);
    positionsBySymbol.set(symbol, (positionsBySymbol.get(symbol) ?? 0n) | bit);
  }

  const allBits = (1n << BigInt(patternLen)) - 1n;
  const topBit = 1n << BigInt(patternLen - 1);
  let positiveV = allBits; // vertical +1 deltas
  let negativeV = 0n; // vertical -1 deltas
  let distance = patternLen;

  for (const symbol of text) {
    const eq = positionsBySymbol.get(symbol) ?? 0n;
    const xv = eq | negativeV;
    const xh = (((eq & positiveV) + positiveV) ^ positiveV) | eq;
    let positiveH = negativeV | (~(xh | positiveV) & allBits);
    let negativeH = positiveV & xh;

    // The top bit of the horizontal deltas tells how the last row changed.
    if ((positiveH & topBit) !== 0n) {
      distance += 1;
    } else if ((negativeH & topBit) !== 0n) {
      distance -= 1;
    }

    positiveH = ((positiveH << 1n) | 1n) & allBits;
    negativeH = (negativeH << 1n) & allBits;
    positiveV = (negativeH | (~(xv | positiveH) & allBits)) & allBits;
    negativeV = positiveH & xv;
  }
  return distance;
};
/**
 * Threshold-bounded edit distance: a row-by-row dynamic program that only
 * fills cells within `thresholdK` of the diagonal and stops as soon as a whole
 * row exceeds the threshold.
 *
 * @param {Array} pattern - First sequence.
 * @param {Array} text - Second sequence.
 * @param {number} thresholdK - Maximum accepted distance.
 * @returns {number|null} The distance, or null when it exceeds `thresholdK`.
 */
const ukkonenDistance = (pattern, text, thresholdK) => {
  const patternLen = pattern.length;
  const textLen = text.length;
  if (patternLen === 0) {
    return textLen <= thresholdK ? textLen : null;
  }
  if (textLen === 0) {
    return patternLen <= thresholdK ? patternLen : null;
  }

  // Stand-in for "already over budget" in cells outside the band.
  const overBudget = thresholdK + 1;
  let previousRow = Array.from({ length: patternLen + 1 }, (_, col) => col);

  for (let row = 1; row <= textLen; row += 1) {
    const bandStart = Math.max(1, row - thresholdK);
    const bandEnd = Math.min(patternLen, row + thresholdK);
    const currentRow = new Array(patternLen + 1).fill(overBudget);
    if (bandStart === 1) {
      currentRow[0] = row;
    }
    for (let col = bandStart; col <= bandEnd; col += 1) {
      const substitution = previousRow[col - 1] + (pattern[col - 1] === text[row - 1] ? 0 : 1);
      currentRow[col] = Math.min(previousRow[col] + 1, currentRow[col - 1] + 1, substitution);
    }
    // Give up early once nothing inside the band can still meet the threshold.
    const bestInBand = Math.min(overBudget, ...currentRow.slice(bandStart, bandEnd + 1));
    if (bestInBand > thresholdK) {
      return null;
    }
    previousRow = currentRow;
  }

  const total = previousRow[patternLen];
  return total <= thresholdK ? total : null;
};
/**
 * Plain Levenshtein distance between two strings, compared by UTF-16 code
 * unit, using a two-row dynamic program.
 *
 * @param {string} left - First string.
 * @param {string} right - Second string.
 * @returns {number} Edit distance.
 */
const levenshteinDistance = (left, right) => {
  if (left === right) {
    return 0;
  }
  if (!left) {
    return right.length;
  }
  if (!right) {
    return left.length;
  }

  const width = right.length + 1;
  let above = Array.from({ length: width }, (_, col) => col);
  for (let row = 1; row <= left.length; row += 1) {
    const current = new Array(width);
    current[0] = row;
    for (let col = 1; col < width; col += 1) {
      const substitution = above[col - 1] + (left[row - 1] === right[col - 1] ? 0 : 1);
      current[col] = Math.min(above[col] + 1, current[col - 1] + 1, substitution);
    }
    above = current;
  }
  return above[width - 1];
};
/**
 * Converts an edit distance into a similarity: 1 minus the distance divided by
 * the longer length (the divisor is at least 1, so two empty inputs score 1).
 */
const similarity = (distance, leftLen, rightLen) => {
  const scale = Math.max(leftLen, rightLen, 1);
  return 1 - distance / scale;
};
/**
 * Keeps one match per (start, end, canonical) triple, the one that
 * compareScanMatch ranks first, and returns the survivors ordered by start,
 * then end, then descending score.
 *
 * @param {Array} matches - Raw scan matches.
 * @returns {Array} De-duplicated, position-ordered matches.
 */
const dedupeScanMatches = (matches) => {
  const bestByKey = new Map();
  for (const match of matches) {
    const key = `${match.startChar}:${match.endChar}:${match.canonical}`;
    const incumbent = bestByKey.get(key);
    const replaces = !incumbent || compareScanMatch(match, incumbent) < 0;
    if (replaces) {
      bestByKey.set(key, match);
    }
  }
  const byPosition = (left, right) =>
    left.startChar - right.startChar ||
    left.endChar - right.endChar ||
    right.score - left.score;
  return Array.from(bestByKey.values()).sort(byPosition);
};
/**
 * Applies the overlap-resolution mode to scan matches.
 *
 * - "all": keep everything.
 * - "per_term_best": keep the match compareScanMatch ranks first per canonical.
 * - anything else: greedily keep the highest-ranked matches that do not
 *   overlap an already kept match.
 *
 * The result is always ordered by start, then end, then descending score.
 *
 * @param {Array} matches - Scan matches.
 * @param {string} mode - Overlap-resolution mode.
 * @returns {Array} The resolved matches (a new array).
 */
const resolveScanMatches = (matches, mode) => {
  const byPosition = (left, right) =>
    left.startChar - right.startChar ||
    left.endChar - right.endChar ||
    right.score - left.score;

  if (mode === "all") {
    return [...matches].sort(byPosition);
  }

  if (mode === "per_term_best") {
    const bestByCanonical = new Map();
    for (const match of matches) {
      const incumbent = bestByCanonical.get(match.canonical);
      if (!incumbent || compareScanMatch(match, incumbent) < 0) {
        bestByCanonical.set(match.canonical, match);
      }
    }
    return [...bestByCanonical.values()].sort(byPosition);
  }

  // Rank: higher score, then smaller phoneme distance, then longer span, then earlier start.
  const spanLength = (match) => match.endChar - match.startChar;
  const ranked = [...matches].sort(
    (left, right) =>
      right.score - left.score ||
      left.phonemeDistance - right.phonemeDistance ||
      spanLength(right) - spanLength(left) ||
      left.startChar - right.startChar,
  );
  const kept = [];
  for (const match of ranked) {
    const collides = kept.some((existing) => overlaps(match, existing));
    if (!collides) {
      kept.push(match);
    }
  }
  return kept.sort(byPosition);
};
/**
 * Turns scan matches into replacement candidates. Matches that resolve to an
 * empty replacement are dropped.
 *
 * @param {Array} matches - Scan matches.
 * @param {string} originalText - Text the matches refer to.
 * @param {object} options - Merged replace options.
 * @returns {Array} Candidates: each match plus status, replacement text,
 *   whether the text would change, and the code-point length delta.
 */
const convertMatchesToPatchCandidates = (matches, originalText, options) => {
  const boundaries = buildCharBoundaries(originalText);
  const candidates = [];
  for (const match of matches) {
    const rawReplacement = resolveReplacementText(match, options);
    if (!rawReplacement) {
      continue;
    }
    const replacementText = applyCaseStrategy(rawReplacement, match.matchedText, options);
    const sourceText = sliceByCharRange(originalText, boundaries, match.startChar, match.endChar);
    const matchedChars = match.endChar - match.startChar;
    candidates.push({
      ...match,
      status: "applied",
      discardReason: null,
      replacementText,
      // Output offsets are only known once patches are applied.
      outputStartChar: null,
      outputEndChar: null,
      changed: sourceText !== replacementText,
      deltaChars: charLength(replacementText) - matchedChars,
    });
  }
  return candidates;
};
/**
 * Chooses the text that should replace a match, based on `replacementSource`:
 * "term_text" -> the term text; "alias_text" -> the matched alias (term text
 * when there is none); anything else -> the canonical form (term text when
 * the canonical is empty).
 */
const resolveReplacementText = (match, options) => {
  switch (options.replacementSource) {
    case "term_text":
      return match.termText;
    case "alias_text":
      return match.aliasText ?? match.termText;
    default:
      return match.canonical || match.termText;
  }
};
/**
 * Adjusts the replacement's casing to mirror the matched text when
 * `caseStrategy` is "match_simple": upper-cased for all-uppercase matches,
 * title-cased for title-case matches, otherwise left untouched.
 */
const applyCaseStrategy = (replacementText, matchedText, options) => {
  const mirrorsCasing = options.caseStrategy === "match_simple";
  if (mirrorsCasing && isAllUpper(matchedText)) {
    return replacementText.toUpperCase();
  }
  if (mirrorsCasing && isTitleCase(matchedText)) {
    return titleCaseWords(replacementText);
  }
  return replacementText;
};
/**
 * True when the text contains at least one cased letter and every letter in
 * it is already in upper-case form.
 *
 * Letters without case (e.g. CJK ideographs) equal their own upper-case form,
 * so they never disqualify the text, but on their own they do not make it
 * "all upper" either; otherwise a caseless match would force the replacement
 * to be upper-cased.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean}
 */
const isAllUpper = (text) => {
  let sawCased = false;
  for (const ch of text) {
    if (!/\p{L}/u.test(ch)) {
      continue;
    }
    if (ch !== ch.toUpperCase()) {
      return false;
    }
    // Upper-case form differs from lower-case form: this letter really has case.
    if (ch !== ch.toLowerCase()) {
      sawCased = true;
    }
  }
  return sawCased;
};
/**
 * True when the text has at least one whitespace-separated word and every
 * word passes isTitleCaseWord.
 */
const isTitleCase = (text) => {
  const words = text.trim().split(/\s+/u).filter(Boolean);
  if (words.length === 0) {
    return false;
  }
  for (const word of words) {
    if (!isTitleCaseWord(word)) {
      return false;
    }
  }
  return true;
};
/**
 * True when every "-"/"'"-separated part of the word that contains letters
 * starts with a letter in upper-case form followed only by letters in
 * lower-case form, and the word contains at least one cased letter.
 *
 * Caseless letters (e.g. CJK ideographs) satisfy both per-letter checks, so
 * they are tolerated inside a word, but a word made only of them is not title
 * case; otherwise a caseless match would trigger title-casing of the
 * replacement.
 *
 * @param {string} word - A single whitespace-free word.
 * @returns {boolean}
 */
const isTitleCaseWord = (word) => {
  const hasCase = (ch) => ch.toLowerCase() !== ch.toUpperCase();
  const parts = word.split(/([-'])/u).filter(Boolean);
  let sawCased = false;
  for (const part of parts) {
    if (part === "-" || part === "'") {
      continue;
    }
    const letters = Array.from(part).filter((ch) => /\p{L}/u.test(ch));
    if (letters.length === 0) {
      continue;
    }
    if (letters[0] !== letters[0].toUpperCase()) {
      return false;
    }
    if (letters.slice(1).some((ch) => ch !== ch.toLowerCase())) {
      return false;
    }
    if (letters.some(hasCase)) {
      sawCased = true;
    }
  }
  return sawCased;
};
/**
 * Title-cases text: within each space-separated word, every "-"/"'"-separated
 * segment gets its first character upper-cased and the rest lower-cased.
 *
 * The first character is taken as a whole code point, so letters outside the
 * Basic Multilingual Plane (stored as a surrogate pair) are capitalised
 * correctly instead of being split in half.
 *
 * @param {string} text - Text to title-case.
 * @returns {string} Title-cased text with the same separators.
 */
const titleCaseWords = (text) => {
  const capitalize = (segment) => {
    if (!segment) {
      return "";
    }
    // String destructuring iterates by code point, not by UTF-16 code unit.
    const [first] = segment;
    return first.toUpperCase() + segment.slice(first.length).toLowerCase();
  };
  return text
    .split(" ")
    .map((word) => word
      .split("-")
      .map((part) => part
        .split("'")
        .map(capitalize)
        .join("'"))
      .join("-"))
    .join(" ");
};
/**
 * Collapses candidates that share span, canonical and replacement text,
 * keeping the one comparePatch ranks first and reporting every loser as a
 * discarded duplicate.
 *
 * @param {Array} candidates - Patch candidates.
 * @returns {{survivors: Array, discarded: Array}}
 */
const dedupePatchCandidates = (candidates) => {
  const bestByKey = new Map();
  const discarded = [];
  for (const candidate of candidates) {
    const key = `${candidate.startChar}:${candidate.endChar}:${candidate.canonical}:${candidate.replacementText}`;
    const incumbent = bestByKey.get(key);
    if (!incumbent) {
      bestByKey.set(key, candidate);
      continue;
    }
    // Ties keep the candidate that was seen first.
    const candidateWins = comparePatch(candidate, incumbent) < 0;
    const [winner, loser] = candidateWins ? [candidate, incumbent] : [incumbent, candidate];
    bestByKey.set(key, winner);
    discarded.push({
      ...loser,
      status: "discarded_duplicate",
      discardReason: "duplicate_of_better_variant",
      outputStartChar: null,
      outputEndChar: null,
    });
  }
  return { survivors: Array.from(bestByKey.values()), discarded };
};
/**
 * Handles spans claimed by competing canonicals. Candidates are grouped by
 * exact span; a group is ambiguous when its two best candidates (per
 * comparePatch) have different canonicals and scores closer than
 * `ambiguityMargin`. With `ambiguousPolicy` "keep_best" only the best
 * candidate of an ambiguous group survives; otherwise the whole group is
 * discarded. Unambiguous groups pass through untouched.
 *
 * @param {Array} candidates - Patch candidates.
 * @param {object} options - Reads `ambiguityMargin` and `ambiguousPolicy`.
 * @returns {{survivors: Array, discarded: Array}}
 */
const markAmbiguous = (candidates, options) => {
  const bySpan = new Map();
  for (const candidate of candidates) {
    const key = `${candidate.startChar}:${candidate.endChar}`;
    const members = bySpan.get(key);
    if (members) {
      members.push(candidate);
    } else {
      bySpan.set(key, [candidate]);
    }
  }

  const asDiscarded = (candidate) => ({
    ...candidate,
    status: "discarded_ambiguous",
    discardReason: "same_span_competing_canonicals",
    outputStartChar: null,
    outputEndChar: null,
  });

  const survivors = [];
  const discarded = [];
  for (const members of bySpan.values()) {
    const ranked = [...members].sort(comparePatch);
    const isAmbiguous =
      ranked.length >= 2 &&
      ranked[0].canonical !== ranked[1].canonical &&
      !(Math.abs(ranked[0].score - ranked[1].score) >= options.ambiguityMargin);

    if (!isAmbiguous) {
      survivors.push(...ranked);
    } else if (options.ambiguousPolicy === "keep_best") {
      const [best, ...rest] = ranked;
      survivors.push(best);
      discarded.push(...rest.map(asDiscarded));
    } else {
      discarded.push(...ranked.map(asDiscarded));
    }
  }
  return { survivors, discarded };
};
/**
 * Reduces candidates to a set that does not overlap, following `conflictPolicy`:
 * "greedy_left_to_right" -> selectGreedy; "error" -> throw if any overlap
 * remains; anything else -> selectWeightedInterval.
 *
 * @param {Array} candidates - Patch candidates.
 * @param {object} options - Reads `conflictPolicy`.
 * @returns {{selected: Array, discarded: Array}}
 * @throws {Error} Under the "error" policy when two candidates overlap.
 */
const selectNonOverlapping = (candidates, options) => {
  if (candidates.length === 0) {
    return { selected: [], discarded: [] };
  }
  switch (options.conflictPolicy) {
    case "greedy_left_to_right":
      return selectGreedy(candidates);
    case "error": {
      const ordered = [...candidates].sort(
        (left, right) => left.startChar - right.startChar || left.endChar - right.endChar,
      );
      // In start order it is enough to compare each candidate with its neighbour.
      ordered.forEach((candidate, position) => {
        if (position > 0 && overlaps(ordered[position - 1], candidate)) {
          throw new Error("Overlapping pronunciation replacements remain after ambiguity resolution");
        }
      });
      return { selected: ordered, discarded: [] };
    }
    default:
      return selectWeightedInterval(candidates);
  }
};
/**
 * Left-to-right greedy selection: candidates are visited by start position
 * (then higher score, then longer span) and each is kept unless it overlaps
 * one already kept.
 *
 * @param {Array} candidates - Patch candidates.
 * @returns {{selected: Array, discarded: Array}} Kept candidates in position
 *   order, and the rest marked as overlap discards.
 */
const selectGreedy = (candidates) => {
  const spanLength = (candidate) => candidate.endChar - candidate.startChar;
  const visitOrder = [...candidates].sort(
    (left, right) =>
      left.startChar - right.startChar ||
      right.score - left.score ||
      spanLength(right) - spanLength(left),
  );

  const selected = [];
  const discarded = [];
  for (const candidate of visitOrder) {
    const collides = selected.some((existing) => overlaps(candidate, existing));
    if (!collides) {
      selected.push(candidate);
      continue;
    }
    discarded.push({
      ...candidate,
      status: "discarded_overlap",
      discardReason: "lost_to_higher_value_non_overlapping_set",
      outputStartChar: null,
      outputEndChar: null,
    });
  }

  selected.sort((left, right) => left.startChar - right.startChar || left.endChar - right.endChar);
  return { selected, discarded };
};
/**
 * Chooses the best-valued set of non-overlapping candidates with a
 * weighted-interval-scheduling dynamic program. Set values are the 4-tuples
 * built by addPatchValue and compared by compareState.
 *
 * @param {Array} candidates - Patch candidates.
 * @returns {{selected: Array, discarded: Array}} Chosen candidates (in end
 *   order) and the rest marked as overlap discards.
 */
const selectWeightedInterval = (candidates) => {
  const spanLength = (candidate) => candidate.endChar - candidate.startChar;
  const ordered = [...candidates].sort(
    (left, right) =>
      left.endChar - right.endChar ||
      left.startChar - right.startChar ||
      right.score - left.score ||
      spanLength(right) - spanLength(left),
  );
  const total = ordered.length;
  const endPositions = ordered.map((candidate) => candidate.endChar);

  // For each candidate: index of the last earlier candidate it can coexist with (-1 if none).
  const predecessors = ordered.map((candidate) => {
    let position = bisectRight(endPositions, candidate.startChar) - 1;
    while (position >= 0 && overlaps(ordered[position], candidate)) {
      position -= 1;
    }
    return position;
  });

  // states[i] is the best value achievable with the first i candidates.
  const states = [[0, 0, 0, 0]];
  const takeFlags = [];
  for (let count = 1; count <= total; count += 1) {
    const withCandidate = addPatchValue(states[predecessors[count - 1] + 1], ordered[count - 1]);
    const withoutCandidate = states[count - 1];
    const taken = compareState(withCandidate, withoutCandidate) > 0;
    takeFlags.push(taken);
    states.push(taken ? withCandidate : withoutCandidate);
  }

  // Walk back from the end to recover which candidates were taken.
  const selectedIndexes = new Set();
  let count = total;
  while (count > 0) {
    const resumeAt = predecessors[count - 1] + 1;
    const withCandidate = addPatchValue(states[resumeAt], ordered[count - 1]);
    if (takeFlags[count - 1] && compareState(withCandidate, states[count]) === 0) {
      selectedIndexes.add(count - 1);
      count = resumeAt;
    } else {
      count -= 1;
    }
  }

  const selected = [];
  const discarded = [];
  ordered.forEach((candidate, index) => {
    if (selectedIndexes.has(index)) {
      selected.push(candidate);
      return;
    }
    discarded.push({
      ...candidate,
      status: "discarded_overlap",
      discardReason: "lost_to_higher_value_non_overlapping_set",
      outputStartChar: null,
      outputEndChar: null,
    });
  });
  return { selected, discarded };
};
/**
 * Returns the value 4-tuple of a set after adding `patch` to it:
 * [score in millionths, covered characters, minus total phoneme distance, minus patch count].
 * Larger is better in every position.
 */
const addPatchValue = (state, patch) => {
  const [scoreSum, coveredChars, negDistance, negCount] = state;
  return [
    scoreSum + Math.round(patch.score * 1_000_000),
    coveredChars + (patch.endChar - patch.startChar),
    negDistance - patch.phonemeDistance,
    negCount - 1,
  ];
};
/**
 * Lexicographic comparison of two value tuples.
 *
 * @returns {number} 1 when `left` is larger at the first differing position,
 *   -1 when it is not, 0 when no position differs.
 */
const compareState = (left, right) => {
  const firstDifference = left.findIndex((value, position) => value !== right[position]);
  if (firstDifference === -1) {
    return 0;
  }
  return left[firstDifference] > right[firstDifference] ? 1 : -1;
};
/**
 * Rewrites `originalText` by substituting each selected patch's replacement
 * for its span, and records where every replacement landed in the output
 * (offsets counted with charLength).
 *
 * @param {string} originalText - Text to rewrite.
 * @param {Array} selected - Non-overlapping patches.
 * @returns {{text: string, patches: Array}} New text and the patches in
 *   position order, each with status "applied"/"unchanged" and output offsets.
 */
const applyPatches = (originalText, selected) => {
  const ordered = [...selected].sort(
    (left, right) => left.startChar - right.startChar || left.endChar - right.endChar,
  );
  const boundaries = buildCharBoundaries(originalText);
  const pieces = [];
  const patches = [];
  let cursorChar = 0; // position in the original text
  let outputCharLength = 0; // length of the output produced so far

  for (const patch of ordered) {
    // Copy the text between the previous patch and this one unchanged.
    const untouched = sliceByCharRange(originalText, boundaries, cursorChar, patch.startChar);
    const outputStartChar = outputCharLength + charLength(untouched);
    const outputEndChar = outputStartChar + charLength(patch.replacementText);
    pieces.push(untouched, patch.replacementText);
    patches.push({
      ...patch,
      status: patch.changed ? "applied" : "unchanged",
      discardReason: null,
      outputStartChar,
      outputEndChar,
    });
    outputCharLength = outputEndChar;
    cursorChar = patch.endChar;
  }

  // Whatever follows the last patch.
  pieces.push(sliceByCharRange(originalText, boundaries, cursorChar, boundaries.length - 1));
  return { text: pieces.join(""), patches };
};
851
/**
 * Normalizes a term given either as a bare string or as a partial term object
 * into a fully populated term record.
 *
 * @param {string|object} term - A string (used as both `text` and `canonical`)
 *   or an object with `text` and optional `id`, `canonical`, `pronunciations`,
 *   `aliases`, `metadata`.
 * @returns {{id: string, text: string, canonical: string, pronunciations: Array, aliases: Array, metadata: object}}
 *   Missing (null/undefined) fields default to "" for `id`, `text` for
 *   `canonical`, [] for the lists and {} for `metadata`.
 */
const coerceTerm = (term) => {
    // Treat a bare string as an object that only specifies `text`.
    const source = typeof term === "string" ? { text: term } : term;
    const { id, text, canonical, pronunciations, aliases, metadata } = source;
    return {
        id: id ?? "",
        text,
        canonical: canonical ?? text,
        pronunciations: pronunciations ?? [],
        aliases: aliases ?? [],
        metadata: metadata ?? {},
    };
};
871
/**
 * Converts text into a phoneme sequence using `model`, falling back to
 * pseudoPhones of the normalized text whenever the model cannot help.
 *
 * The fallback is returned when the normalized text is empty, when
 * `model.predict` throws or rejects, or when it yields no alignments.
 *
 * @param {string} text - Raw input text; normalized with normalizeForMatch first.
 * @param {{predict: Function}} model - Object whose `predict(text, options)`
 *   resolves to a result exposing an `alignments` array of `{ phoneme }` items.
 * @returns {Promise<string[]>} Phonemes from the model, or the fallback.
 */
const phonemizeText = async (text, model) => {
    const normalized = normalizeForMatch(text);
    const fallback = pseudoPhones(normalized);
    if (normalized) {
        try {
            const { alignments } = await model.predict(normalized, {
                splitDelimiter: null,
                outputDelimiter: "",
                preserveLiterals: "none",
            });
            const phones = alignments.map(({ phoneme }) => phoneme);
            if (phones.length > 0) {
                return phones;
            }
        }
        catch {
            // Best effort: any model failure degrades to the fallback below.
        }
    }
    return fallback;
};
889
/**
 * Builds a stand-in phoneme sequence from the text itself: one entry per
 * non-whitespace character of the compacted surface form.
 *
 * @param {string} text - Text to convert; passed through compactSurface first.
 * @returns {string[]} The non-whitespace characters of the compacted text, or
 *   ["<unk>"] when none remain.
 */
const pseudoPhones = (text) => {
    // Build the character list once instead of recomputing it for the return value.
    const phones = Array.from(compactSurface(text)).filter((ch) => !/\s/u.test(ch));
    return phones.length > 0 ? phones : ["<unk>"];
};
895
/**
 * Turns a user-supplied pronunciation into a phoneme list.
 *
 * - Non-string input is treated as a list and stripped of falsy entries.
 * - A blank string yields ["<unk>"].
 * - A string containing whitespace is split on whitespace runs.
 * - Any other string is expanded through pseudoPhones.
 *
 * @param {string|Array} value - Explicit pronunciation.
 * @returns {string[]} Phoneme tokens.
 */
const parseExplicitPronunciation = (value) => {
    if (typeof value !== "string") {
        return value.filter(Boolean);
    }
    const stripped = value.trim();
    if (!stripped) {
        return ["<unk>"];
    }
    return /\s/u.test(stripped)
        ? stripped.split(/\s+/u).filter(Boolean)
        : pseudoPhones(stripped);
};
906
/**
 * Computes the maximum edit distance allowed for a sequence of `length` items.
 *
 * The base value is floor(length * maxDistanceRatio), raised to at least
 * `minDistance` and then capped by `maxDistance` when one is given. Unless
 * `allowShortFuzzy` is set, short sequences are tightened further:
 * length <= 3 allows no edits and length <= 6 allows at most one.
 *
 * @param {number} length - Sequence length.
 * @param {number} maxDistanceRatio - Fraction of the length allowed as distance.
 * @param {number} minDistance - Lower bound applied before the cap.
 * @param {?number} maxDistance - Upper cap; null/undefined disables it.
 * @param {boolean} allowShortFuzzy - Skip the short-sequence tightening.
 * @returns {number} The allowed distance.
 */
const effectiveThreshold = (length, maxDistanceRatio, minDistance, maxDistance, allowShortFuzzy) => {
    const proportional = Math.floor(length * maxDistanceRatio);
    const raised = Math.max(proportional, minDistance);
    const capped = maxDistance != null ? Math.min(raised, maxDistance) : raised;
    if (allowShortFuzzy) {
        return capped;
    }
    if (length <= 3) {
        return 0;
    }
    return length <= 6 ? Math.min(capped, 1) : capped;
};
918
/**
 * Counts how often each q-gram (window of `q` consecutive items) occurs in
 * `sequence`, keyed by the id that `encoder.encode` returns for the window.
 *
 * @param {Array} sequence - Items to slide the window over.
 * @param {number} q - Window size; non-positive or larger than the sequence
 *   yields an empty map.
 * @param {{encode: Function}} encoder - Maps a q-gram array to a map key.
 * @returns {Map} Encoded q-gram id -> occurrence count.
 */
const qgramFrequency = (sequence, q, encoder) => {
    const counts = new Map();
    if (q <= 0 || sequence.length < q) {
        return counts;
    }
    const lastStart = sequence.length - q;
    for (let start = 0; start <= lastStart; start += 1) {
        const window = sequence.slice(start, start + q);
        const id = encoder.encode(window);
        const seen = counts.get(id) ?? 0;
        counts.set(id, seen + 1);
    }
    return counts;
};
928
/**
 * Sort comparator for scan matches: higher `score` first, then lower
 * `phonemeDistance`, then higher `textSimilarity` (a missing value counts as 0).
 *
 * @param {object} left
 * @param {object} right
 * @returns {number} Negative when `left` sorts first, positive when `right` does.
 */
const compareScanMatch = (left, right) => {
    const byScore = right.score - left.score;
    if (byScore) {
        return byScore;
    }
    const byDistance = left.phonemeDistance - right.phonemeDistance;
    if (byDistance) {
        return byDistance;
    }
    return (right.textSimilarity ?? 0) - (left.textSimilarity ?? 0);
};
931
/**
 * Sort comparator for patches: higher `score` first, then lower
 * `phonemeDistance`, then higher `textSimilarity` (a missing value counts as 0).
 *
 * @param {object} left
 * @param {object} right
 * @returns {number} Negative when `left` sorts first, positive when `right` does.
 */
const comparePatch = (left, right) => {
    const scoreGap = right.score - left.score;
    if (scoreGap) {
        return scoreGap;
    }
    const distanceGap = left.phonemeDistance - right.phonemeDistance;
    if (distanceGap) {
        return distanceGap;
    }
    const leftSimilarity = left.textSimilarity ?? 0;
    const rightSimilarity = right.textSimilarity ?? 0;
    return rightSimilarity - leftSimilarity;
};
934
/**
 * Tells whether two half-open character ranges [startChar, endChar) intersect.
 * Ranges that merely touch (one ends where the other starts) do not overlap.
 *
 * @param {{startChar: number, endChar: number}} left
 * @param {{startChar: number, endChar: number}} right
 * @returns {boolean}
 */
const overlaps = (left, right) => {
    if (left.endChar <= right.startChar) {
        return false; // left ends at or before right begins
    }
    if (right.endChar <= left.startChar) {
        return false; // right ends at or before left begins
    }
    return true;
};
935
/**
 * Builds a lookup table from character (code point) index to UTF-16 string
 * offset. Entry `i` is the offset at which the i-th code point starts; the
 * final entry equals `text.length`.
 *
 * @param {string} text
 * @returns {number[]} Offsets, always starting with 0; length is code point count + 1.
 */
const buildCharBoundaries = (text) => {
    const boundaries = [0];
    let offset = 0;
    // String iteration yields whole code points, so surrogate pairs advance by 2.
    for (const ch of text) {
        offset += ch.length;
        boundaries.push(offset);
    }
    return boundaries;
};
946
/**
 * Extracts the substring covering characters [startChar, endChar), translating
 * character indexes to string offsets through `boundaries`.
 *
 * @param {string} text - Source string.
 * @param {number[]} boundaries - Character index -> string offset table.
 * @param {number} startChar - First character; an index missing from the table
 *   falls back to offset 0.
 * @param {number} endChar - One past the last character; an index missing from
 *   the table falls back to the end of `text`.
 * @returns {string}
 */
const sliceByCharRange = (text, boundaries, startChar, endChar) => {
    const fromOffset = boundaries[startChar] ?? 0;
    const toOffset = boundaries[endChar] ?? text.length;
    return text.slice(fromOffset, toOffset);
};
947
/**
 * Counts the characters of `text` as code points rather than UTF-16 units,
 * so a surrogate pair counts once.
 *
 * @param {string} text
 * @returns {number}
 */
const charLength = (text) => {
    const codePoints = Array.from(text);
    return codePoints.length;
};
948
/**
 * Binary-searches an ascending array for the insertion point that lies to the
 * right of any entries equal to `target`.
 *
 * @param {Array} values - Values sorted in ascending order.
 * @param {*} target - Value to locate.
 * @returns {number} Index of the first element greater than `target`, or
 *   `values.length` when there is none.
 */
const bisectRight = (values, target) => {
    let low = 0;
    for (let high = values.length; low < high;) {
        const middle = (low + high) >> 1;
        if (target < values[middle]) {
            high = middle;
        }
        else {
            low = middle + 1;
        }
    }
    return low;
};
962
/**
 * Assigns a stable integer id to each distinct phone it is asked to encode.
 * Ids start at 1 and are handed out in first-seen order.
 */
class PhoneEncoder {
    constructor() {
        this.nextId = 1;
        this.mapping = new Map();
    }
    /**
     * Returns the id for `phone`, allocating the next free id on first use.
     *
     * @param {*} phone - Value used directly as the map key.
     * @returns {number} The id associated with `phone`.
     */
    encode(phone) {
        let id = this.mapping.get(phone);
        if (id == null) {
            id = this.nextId;
            this.mapping.set(phone, id);
            this.nextId += 1;
        }
        return id;
    }
}
977
/**
 * Assigns a stable integer id to each distinct q-gram (array of items).
 * Ids start at 1 and are handed out in first-seen order.
 */
class QGramEncoder {
    constructor() {
        this.nextId = 1;
        this.mapping = new Map();
    }
    /**
     * Returns the id for `qgram`, allocating the next free id on first use.
     *
     * @param {Array} qgram - Items of the q-gram; they are joined with "," to
     *   form the lookup key.
     * @returns {number} The id associated with the joined key.
     */
    encode(qgram) {
        // NOTE(review): a comma-joined key would collide if an item itself
        // contained "," — presumably items are numeric ids; confirm with callers.
        const key = qgram.join(",");
        let id = this.mapping.get(key);
        if (id == null) {
            id = this.nextId;
            this.mapping.set(key, id);
            this.nextId += 1;
        }
        return id;
    }
}
993
+ //# sourceMappingURL=pronunciation.js.map