npm - hama-js - Versions diffs - 1.3.11 → 1.3.13 - Mend

hama-js 1.3.11 → 1.3.13

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (12)

package/dist/browser/browser.js +3 -0
package/dist/browser/browser.js.map +1 -1
package/dist/browser/pronunciation.js +404 -12
package/dist/browser/pronunciation.js.map +1 -1
package/dist/node/index.js +3 -0
package/dist/node/index.js.map +1 -1
package/dist/node/pronunciation.js +404 -12
package/dist/node/pronunciation.js.map +1 -1
package/dist/types/browser.d.ts +1 -0
package/dist/types/index.d.ts +1 -0
package/dist/types/pronunciation.d.ts +4 -0
package/package.json +1 -1

package/dist/node/pronunciation.js CHANGED

@@ -1,5 +1,7 @@
+import { splitTextToJamo } from "./jamo.js";
 const DEFAULT_SCAN_OPTIONS = {
     language: "en",
+    spanUnit: "character",
     maxDistanceRatio: 0.2,
     minDistance: 0,
     maxDistance: null,
@@ -20,6 +22,7 @@ const DEFAULT_SCAN_OPTIONS = {
 };
 const DEFAULT_REPLACE_OPTIONS = {
     language: "en",
+    spanUnit: "character",
     maxDistanceRatio: 0.2,
     minDistance: 0,
     maxDistance: null,
@@ -56,11 +59,17 @@ export async function pronunciationScanWithModel(model, text, terms, options = {
     const phoneEncoder = new PhoneEncoder();
     const qgramEncoder = new QGramEncoder();
     const tokens = await prepareTokens(text, model, merged, phoneEncoder);
-    const compiled = await compileVariants(terms, model, merged, phoneEncoder, qgramEncoder);
+    const maxInputLen = resolvePredictorMaxInputLen(model);
+    const compiled = await compileVariants(terms, model, merged, phoneEncoder, qgramEncoder, maxInputLen);
+    const baseStats = emptyScanStats(tokens.length);
+    baseStats.rejectedByInputLimit = compiled.rejectedByInputLimit;
     if (compiled.variants.length === 0) {
-        return { matches: [], stats: emptyScanStats(tokens.length) };
+        return { matches: [], stats: baseStats };
     }
-    const { matches, stats } = scanCompiled(text, tokens, compiled, merged, qgramEncoder);
+    const { matches, stats } = merged.spanUnit === "character"
+        ? await scanCompiledByCharacters(text, tokens, compiled, merged, qgramEncoder, model, phoneEncoder, maxInputLen)
+        : scanCompiled(text, tokens, compiled, merged, qgramEncoder);
+    stats.rejectedByInputLimit = (stats.rejectedByInputLimit ?? 0) + compiled.rejectedByInputLimit;
     const resolved = resolveScanMatches(matches, merged.resolveOverlaps);
     stats.matchesReturned = resolved.length;
     return { matches: resolved, stats };
@@ -124,6 +133,7 @@ const emptyScanStats = (tokenCount) => ({
     candidateVariantsVerified: 0,
     matchesReturned: 0,
     rejectedByLength: 0,
+    rejectedByInputLimit: 0,
     rejectedByQgram: 0,
     rejectedByDistance: 0,
 });
@@ -147,6 +157,7 @@ const normalizeForMatch = (text) => {
 };
 const compactSurface = (text) => text.replace(/ /gu, "").replace(/-/gu, "").replace(/'/gu, "");
 const isWordChar = (ch) => /\p{L}|\p{N}/u.test(ch);
+const characterUnitCount = (text) => toCodePoints(text).filter((codePoint) => isWordChar(codePoint.ch)).length;
 const toCodePoints = (text) => {
     const result = [];
     let codeUnitOffset = 0;
@@ -211,10 +222,11 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
         const cacheKey = normText || token.rawText;
         let cached = tokenCache.get(cacheKey);
         if (!cached) {
-            const phoneTokens = await phonemizeText(normText || token.rawText, model);
+            const aligned = await phonemizeTextAligned(normText || token.rawText, model);
             cached = {
-                phoneTokens,
-                phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
+                phoneTokens: aligned.phoneTokens,
+                phones: aligned.phoneTokens.map((phone) => phoneEncoder.encode(phone)),
+                charIndexes: aligned.charIndexes,
             };
             tokenCache.set(cacheKey, cached);
         }
@@ -227,14 +239,42 @@ const prepareTokens = async (text, model, options, phoneEncoder) => {
             endCodeUnit: token.endCodeUnit,
             phones: [...cached.phones],
             phoneTokens: [...cached.phoneTokens],
+            charPhones: buildTokenCharPhones(token.rawText, normText, cached.phones, cached.phoneTokens, cached.charIndexes),
         });
     }
     return prepared;
 };
-const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder) => {
+const buildCharacterUnits = (text, tokens) => {
+    const codePoints = toCodePoints(text);
+    const units = [];
+    let tokenIndex = 0;
+    for (const codePoint of codePoints) {
+        if (!isWordChar(codePoint.ch))
+            continue;
+        while (tokenIndex < tokens.length && codePoint.charIndex >= tokens[tokenIndex].endChar) {
+            tokenIndex += 1;
+        }
+        if (tokenIndex >= tokens.length)
+            break;
+        if (codePoint.charIndex < tokens[tokenIndex].startChar ||
+            codePoint.charIndex >= tokens[tokenIndex].endChar) {
+            continue;
+        }
+        units.push({
+            startChar: codePoint.charIndex,
+            endChar: codePoint.charIndex + 1,
+            startCodeUnit: codePoint.codeUnitStart,
+            endCodeUnit: codePoint.codeUnitEnd,
+            tokenIndex,
+        });
+    }
+    return units;
+};
+const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder, maxInputLen) => {
     const variants = [];
     const byTokenCount = new Map();
     const indexByTokenCount = new Map();
+    let rejectedByInputLimit = 0;
     let variantId = 0;
     for (const rawTerm of terms) {
         const term = coerceTerm(rawTerm);
@@ -244,11 +284,19 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
         }
         for (const [surfaceText, aliasText] of surfaces) {
             const surfaceNorm = normalizeForMatch(surfaceText);
-            const tokenCount = tokenizeWithOffsets(surfaceText).length || Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);
+            const tokenCount = options.spanUnit === "character"
+                ? Math.max(1, characterUnitCount(surfaceText))
+                : tokenizeWithOffsets(surfaceText).length || Math.max(1, surfaceNorm.split(" ").filter(Boolean).length);
             const pronunciationInputs = term.pronunciations.length > 0
                 ? term.pronunciations.slice(0, options.maxTermPronunciations)
                 : [null];
             for (const pronunciationInput of pronunciationInputs) {
+                if (pronunciationInput == null &&
+                    maxInputLen != null &&
+                    estimatePredictorInputLength(surfaceNorm) > maxInputLen) {
+                    rejectedByInputLimit += 1;
+                    continue;
+                }
                 const phoneTokens = pronunciationInput == null
                     ? await phonemizeText(surfaceNorm, model)
                     : parseExplicitPronunciation(pronunciationInput);
@@ -289,7 +337,7 @@ const compileVariants = async (terms, model, options, phoneEncoder, qgramEncoder
             }
         }
     }
-    return { variants, byTokenCount, indexByTokenCount };
+    return { variants, byTokenCount, indexByTokenCount, rejectedByInputLimit };
 };
 const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
     const stats = emptyScanStats(tokens.length);
@@ -388,6 +436,187 @@ const scanCompiled = (text, tokens, compiled, options, qgramEncoder) => {
     }
     return { matches: dedupeScanMatches(rawMatches), stats };
 };
+const scanCompiledByCharacters = async (text, tokens, compiled, options, qgramEncoder, model, phoneEncoder, maxInputLen) => {
+    const stats = emptyScanStats(tokens.length);
+    const charUnits = buildCharacterUnits(text, tokens);
+    if (compiled.variants.length === 0 || charUnits.length === 0) {
+        return { matches: [], stats };
+    }
+    const tokenCounts = [...compiled.byTokenCount.keys()].sort((a, b) => a - b);
+    const variantById = new Map(compiled.variants.map((variant) => [variant.variantId, variant]));
+    const lengths = windowLengths(tokenCounts, options);
+    const rawMatches = [];
+    const windowCache = new Map();
+    // Approximate phones per character unit, sliced out of each token's
+    // alignment buckets. They let us reject the vast majority of candidate
+    // windows with pure-JS filters before paying for a real G2P inference.
+    const unitPhones = charUnits.map((unit) => {
+        const token = tokens[unit.tokenIndex];
+        if (!token.charPhones)
+            return null;
+        return token.charPhones[unit.startChar - token.startChar] ?? null;
+    });
+    const prefixPhoneCounts = [0];
+    const prefixUnmappable = [0];
+    for (let idx = 0; idx < charUnits.length; idx += 1) {
+        const phones = unitPhones[idx];
+        prefixPhoneCounts.push(prefixPhoneCounts[idx] + (phones ? phones.length : 0));
+        prefixUnmappable.push(prefixUnmappable[idx] + (phones ? 0 : 1));
+    }
+    // Concatenated per-character phones approximate the true G2P output of the
+    // window text, so the prefilter widens every variant threshold by this slack
+    // before discarding a window without verification.
+    const approxSlackFor = (variant) => Math.max(2, Math.ceil(variant.phoneLen * 0.25));
+    for (let startUnit = 0; startUnit < charUnits.length; startUnit += 1) {
+        for (const windowLength of lengths) {
+            const endUnit = startUnit + windowLength;
+            if (endUnit > charUnits.length)
+                continue;
+            stats.windowCount = (stats.windowCount ?? 0) + 1;
+            const firstUnit = charUnits[startUnit];
+            const lastUnit = charUnits[endUnit - 1];
+            const windowText = text.slice(firstUnit.startCodeUnit, lastUnit.endCodeUnit);
+            const windowNorm = normalizeForMatch(windowText);
+            if (maxInputLen != null &&
+                estimatePredictorInputLength(windowNorm || windowText) > maxInputLen) {
+                stats.rejectedByInputLimit = (stats.rejectedByInputLimit ?? 0) + 1;
+                continue;
+            }
+            const relevantCounts = candidateTokenBuckets(windowLength, tokenCounts, options);
+            if (relevantCounts.length === 0)
+                continue;
+            // Alignment-based prefilter: only windows that plausibly match some
+            // variant (under slackened thresholds) are sent to the predictor.
+            const windowMappable = prefixUnmappable[endUnit] - prefixUnmappable[startUnit] === 0;
+            if (windowMappable) {
+                const approxLen = prefixPhoneCounts[endUnit] - prefixPhoneCounts[startUnit];
+                let approxLengthRejected = 0;
+                let approxQgramRejected = 0;
+                let approxDistanceRejected = 0;
+                const lengthOkVariants = [];
+                for (const count of relevantCounts) {
+                    for (const variant of compiled.byTokenCount.get(count) ?? []) {
+                        if (Math.abs(approxLen - variant.phoneLen) > variant.thresholdK + approxSlackFor(variant)) {
+                            approxLengthRejected += 1;
+                            continue;
+                        }
+                        lengthOkVariants.push(variant);
+                    }
+                }
+                let plausible = false;
+                if (lengthOkVariants.length > 0) {
+                    const approxPhones = [];
+                    for (let unit = startUnit; unit < endUnit; unit += 1) {
+                        approxPhones.push(...unitPhones[unit]);
+                    }
+                    const approxQfreq = qgramFrequency(approxPhones, options.qgramSize, qgramEncoder);
+                    for (const variant of lengthOkVariants) {
+                        const slackK = variant.thresholdK + approxSlackFor(variant);
+                        const required = requiredOverlap(variant.phoneLen, approxPhones.length, options.qgramSize, slackK);
+                        if (qgramOverlap(approxQfreq, variant.qgramFreq) < required) {
+                            approxQgramRejected += 1;
+                            continue;
+                        }
+                        if (verifyDistance(variant.phones, approxPhones, slackK, options.verifier) == null) {
+                            approxDistanceRejected += 1;
+                            continue;
+                        }
+                        plausible = true;
+                        break;
+                    }
+                }
+                if (!plausible) {
+                    stats.candidateVariantsConsidered =
+                        (stats.candidateVariantsConsidered ?? 0) + lengthOkVariants.length;
+                    stats.rejectedByLength = (stats.rejectedByLength ?? 0) + approxLengthRejected;
+                    stats.rejectedByQgram = (stats.rejectedByQgram ?? 0) + approxQgramRejected;
+                    stats.rejectedByDistance = (stats.rejectedByDistance ?? 0) + approxDistanceRejected;
+                    continue;
+                }
+            }
+            const window = await buildCharacterWindow(text, charUnits, startUnit, endUnit, model, phoneEncoder, windowCache, maxInputLen);
+            if (!window) {
+                stats.rejectedByInputLimit = (stats.rejectedByInputLimit ?? 0) + 1;
+                continue;
+            }
+            const lengthOkIds = new Set();
+            for (const count of relevantCounts) {
+                for (const variant of compiled.byTokenCount.get(count) ?? []) {
+                    if (Math.abs(window.phones.length - variant.phoneLen) > variant.thresholdK) {
+                        stats.rejectedByLength = (stats.rejectedByLength ?? 0) + 1;
+                        continue;
+                    }
+                    lengthOkIds.add(variant.variantId);
+                }
+            }
+            if (lengthOkIds.size === 0)
+                continue;
+            stats.candidateVariantsConsidered = (stats.candidateVariantsConsidered ?? 0) + lengthOkIds.size;
+            const windowQfreq = qgramFrequency(window.phones, options.qgramSize, qgramEncoder);
+            const candidateOverlap = new Map();
+            for (const count of relevantCounts) {
+                const postings = compiled.indexByTokenCount.get(count) ?? new Map();
+                for (const [qgramId, windowCount] of windowQfreq.entries()) {
+                    for (const [variantId, termCount] of postings.get(qgramId) ?? []) {
+                        if (!lengthOkIds.has(variantId))
+                            continue;
+                        candidateOverlap.set(variantId, (candidateOverlap.get(variantId) ?? 0) + Math.min(windowCount, termCount));
+                    }
+                }
+            }
+            const verifiedIds = [];
+            for (const variantId of [...lengthOkIds].sort((a, b) => a - b)) {
+                const variant = variantById.get(variantId);
+                const required = requiredOverlap(variant.phoneLen, window.phones.length, options.qgramSize, variant.thresholdK);
+                if ((candidateOverlap.get(variantId) ?? 0) < required) {
+                    stats.rejectedByQgram = (stats.rejectedByQgram ?? 0) + 1;
+                    continue;
+                }
+                verifiedIds.push(variantId);
+            }
+            if (verifiedIds.length === 0)
+                continue;
+            stats.candidateVariantsVerified = (stats.candidateVariantsVerified ?? 0) + verifiedIds.length;
+            for (const variantId of verifiedIds) {
+                const variant = variantById.get(variantId);
+                const distance = verifyDistance(variant.phones, window.phones, variant.thresholdK, options.verifier);
+                if (distance == null) {
+                    stats.rejectedByDistance = (stats.rejectedByDistance ?? 0) + 1;
+                    continue;
+                }
+                const phonemeSimilarity = similarity(distance, variant.phones.length, window.phones.length);
+                const textDistance = levenshteinDistance(variant.surfaceCompact, window.surfaceCompact);
+                const textSimilarity = similarity(textDistance, variant.surfaceCompact.length, window.surfaceCompact.length);
+                const score = options.scoring === "phoneme"
+                    ? phonemeSimilarity
+                    : options.phonemeWeight * phonemeSimilarity + options.textWeight * textSimilarity;
+                if (score < options.minScore)
+                    continue;
+                rawMatches.push({
+                    termId: variant.termId,
+                    termText: variant.termText,
+                    canonical: variant.canonical,
+                    aliasText: variant.aliasText,
+                    matchedText: window.matchedText,
+                    startChar: window.startChar,
+                    endChar: window.endChar,
+                    startToken: window.startToken,
+                    endToken: window.endToken,
+                    score,
+                    phonemeDistance: distance,
+                    phonemeThreshold: variant.thresholdK,
+                    phonemeSimilarity,
+                    textDistance,
+                    textSimilarity,
+                    termPronunciation: options.returnPhonemes ? [...variant.phoneTokens] : null,
+                    matchedPronunciation: options.returnPhonemes ? [...window.phoneTokens] : null,
+                    metadata: variant.metadata,
+                });
+            }
+        }
+    }
+    return { matches: dedupeScanMatches(rawMatches), stats };
+};
 const windowLengths = (termTokenCounts, options) => {
     if (termTokenCounts.length === 0)
         return [];
@@ -429,6 +658,62 @@ const buildWindow = (text, tokens, startToken, endToken) => {
         phoneTokens,
     };
 };
+const buildCharacterWindow = async (text, charUnits, startUnit, endUnit, model, phoneEncoder, cache, maxInputLen) => {
+    const first = charUnits[startUnit];
+    const last = charUnits[endUnit - 1];
+    const matchedText = text.slice(first.startCodeUnit, last.endCodeUnit);
+    const surfaceNorm = normalizeForMatch(matchedText);
+    const cacheKey = surfaceNorm || matchedText;
+    if (!cache.has(cacheKey)) {
+        if (maxInputLen != null && estimatePredictorInputLength(surfaceNorm || matchedText) > maxInputLen) {
+            cache.set(cacheKey, null);
+            return null;
+        }
+        const phoneTokens = await phonemizeText(surfaceNorm || matchedText, model);
+        cache.set(cacheKey, {
+            surfaceNorm,
+            phoneTokens,
+            phones: phoneTokens.map((phone) => phoneEncoder.encode(phone)),
+        });
+    }
+    const cached = cache.get(cacheKey);
+    if (!cached)
+        return null;
+    return {
+        startToken: first.tokenIndex,
+        endToken: last.tokenIndex + 1,
+        startChar: first.startChar,
+        endChar: last.endChar,
+        matchedText,
+        surfaceNorm: cached.surfaceNorm,
+        surfaceCompact: compactSurface(cached.surfaceNorm),
+        phones: [...cached.phones],
+        phoneTokens: [...cached.phoneTokens],
+    };
+};
+const resolvePredictorMaxInputLen = (model) => {
+    if (typeof model.getMaxInputLen === "function") {
+        const value = model.getMaxInputLen();
+        if (typeof value === "number" && Number.isFinite(value) && value > 0) {
+            return Math.trunc(value);
+        }
+    }
+    const carrier = model;
+    if (typeof carrier.maxInputLen === "number" &&
+        Number.isFinite(carrier.maxInputLen) &&
+        carrier.maxInputLen > 0) {
+        return Math.trunc(carrier.maxInputLen);
+    }
+    const optionValue = carrier.options?.maxInputLen;
+    if (typeof optionValue === "number" && Number.isFinite(optionValue) && optionValue > 0) {
+        return Math.trunc(optionValue);
+    }
+    return null;
+};
+const estimatePredictorInputLength = (text) => {
+    const sequence = splitTextToJamo(text);
+    return sequence.tokens.length > 0 ? sequence.tokens.length : 1;
+};
 const requiredOverlap = (termLen, windowLen, q, thresholdK) => Math.max(0, Math.max(termLen, windowLen) - q + 1 - thresholdK * q);
 const verifyDistance = (pattern, text, thresholdK, verifier) => {
     if (Math.abs(pattern.length - text.length) > thresholdK)
@@ -869,8 +1154,12 @@ const coerceTerm = (term) => {
     };
 };
 const phonemizeText = async (text, model) => {
+    const { phoneTokens } = await phonemizeTextAligned(text, model);
+    return phoneTokens;
+};
+const phonemizeTextAligned = async (text, model) => {
     const normalized = normalizeForMatch(text);
-    const fallback = pseudoPhones(normalized);
+    const fallback = pseudoPhonesAligned(normalized);
     if (!normalized)
         return fallback;
     try {
@@ -879,13 +1168,106 @@ const phonemizeText = async (text, model) => {
             outputDelimiter: "",
             preserveLiterals: "none",
         });
-        const phones = result.alignments.map((alignment) => alignment.phoneme);
-        return phones.length > 0 ? phones : fallback;
+        // Some languages decode with literal separator tokens between phonemes.
+        // They carry no pronunciation signal, so drop them before matching.
+        const aligned = result.alignments.filter((alignment) => alignment.phoneme.trim() !== "");
+        if (aligned.length === 0)
+            return fallback;
+        return {
+            phoneTokens: aligned.map((alignment) => alignment.phoneme),
+            charIndexes: aligned.map((alignment) => alignment.charIndex),
+        };
     }
     catch {
         return fallback;
     }
 };
+/** Mirrors pseudoPhones but keeps the source character index per pseudo-phone. */
+const pseudoPhonesAligned = (text) => {
+    const phoneTokens = [];
+    const charIndexes = [];
+    Array.from(text).forEach((ch, idx) => {
+        if (/\s/u.test(ch) || ch === "-" || ch === "'")
+            return;
+        phoneTokens.push(ch);
+        charIndexes.push(idx);
+    });
+    if (phoneTokens.length === 0) {
+        return { phoneTokens: ["<unk>"], charIndexes: [-1] };
+    }
+    return { phoneTokens, charIndexes };
+};
+/**
+ * Buckets a token's encoded phones by raw character offset using the
+ * predictor's normalized-input character alignments. Returns null when the
+ * raw→normalized offset mapping is not compositional (per-character
+ * normalization lengths fail to add up to the normalized string).
+ */
+const buildTokenCharPhones = (rawText, normText, phones, phoneTokens, charIndexes) => {
+    const rawChars = Array.from(rawText);
+    if (rawChars.length === 0)
+        return null;
+    const normLen = Array.from(normText).length;
+    // Spell-out mode: when every phone is literally its source character the
+    // predictor is naming letters, not pronouncing the word, so per-character
+    // slices would not approximate how a sub-span is actually pronounced.
+    const normChars = Array.from(normText);
+    let letterIdentity = phoneTokens.length > 0;
+    for (let idx = 0; idx < phoneTokens.length && letterIdentity; idx += 1) {
+        const charIndex = charIndexes[idx];
+        if (charIndex < 0 || charIndex >= normChars.length || phoneTokens[idx] !== normChars[charIndex]) {
+            letterIdentity = false;
+        }
+    }
+    if (letterIdentity)
+        return null;
+    // Trailing characters with no aligned phones usually mean the prediction was
+    // truncated; windows over the tail would look spuriously short.
+    let maxAligned = -1;
+    for (const charIndex of charIndexes) {
+        if (charIndex > maxAligned)
+            maxAligned = charIndex;
+    }
+    if (normLen - 1 - maxAligned >= 3)
+        return null;
+    const cum = [0];
+    for (const ch of rawChars) {
+        cum.push(cum[cum.length - 1] + Array.from(normalizeForMatch(ch)).length);
+    }
+    if (cum[cum.length - 1] !== normLen)
+        return null;
+    const buckets = rawChars.map(() => []);
+    for (let idx = 0; idx < phones.length; idx += 1) {
+        const charIndex = charIndexes[idx] ?? -1;
+        let target;
+        if (charIndex < 0) {
+            target = 0;
+        }
+        else if (charIndex >= normLen) {
+            target = rawChars.length - 1;
+        }
+        else {
+            let lo = 0;
+            let hi = rawChars.length - 1;
+            target = rawChars.length - 1;
+            while (lo <= hi) {
+                const mid = (lo + hi) >> 1;
+                if (cum[mid] <= charIndex && charIndex < cum[mid + 1]) {
+                    target = mid;
+                    break;
+                }
+                if (charIndex < cum[mid]) {
+                    hi = mid - 1;
+                }
+                else {
+                    lo = mid + 1;
+                }
+            }
+        }
+        buckets[target].push(phones[idx]);
+    }
+    return buckets;
+};
 const pseudoPhones = (text) => {
     const compact = compactSurface(text);
     return Array.from(compact).filter((ch) => !/\s/u.test(ch)).length > 0
@@ -915,6 +1297,16 @@ const effectiveThreshold = (length, maxDistanceRatio, minDistance, maxDistance,
     }
     return threshold;
 };
+const qgramOverlap = (left, right) => {
+    const [small, large] = left.size <= right.size ? [left, right] : [right, left];
+    let overlap = 0;
+    for (const [qgramId, count] of small.entries()) {
+        const other = large.get(qgramId);
+        if (other != null)
+            overlap += Math.min(count, other);
+    }
+    return overlap;
+};
 const qgramFrequency = (sequence, q, encoder) => {
     const freq = new Map();
     if (q <= 0 || sequence.length < q)