npm - @dev-pi2pie/word-counter - Versions diffs - 0.1.3-canary.0 → 0.1.3-canary.2 - Mend

@dev-pi2pie/word-counter 0.1.3-canary.0 → 0.1.3-canary.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md CHANGED

@@ -69,6 +69,24 @@ word-counter --latin-language en "Hello world"
 word-counter --latin-tag en "Hello world"
 ```
+Add custom Latin hint rules (repeatable) or load from JSON:
+```bash
+word-counter --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć gęślą jaźń"
+word-counter --latin-hint 'tr=[çğıöşüÇĞİÖŞÜ]' --latin-hint 'ro=[ăâîșțĂÂÎȘȚ]' "șță"
+word-counter --latin-hints-file ./examples/latin-hints.json "Zażółć Știință Iğdır"
+word-counter --no-default-latin-hints --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć"
+```
+`examples/latin-hints.json` format:
+```json
+[
+  { "tag": "pl", "pattern": "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" },
+  { "tag": "tr", "pattern": "[çğıöşüÇĞİÖŞÜ]", "priority": 1 }
+]
+```
 Hint a language tag for Han fallback:
 ```bash
@@ -245,6 +263,10 @@ import wordCounter, {
 wordCounter("Hello world", { latinLanguageHint: "en" });
 wordCounter("Hello world", { latinTagHint: "en" });
+wordCounter("Zażółć gęślą jaźń", {
+  latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
+});
+wordCounter("Über", { useDefaultLatinHints: false });
 wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
 wordCounter("Hi 👋, world!", { nonWords: true });
 wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
@@ -295,6 +317,10 @@ const {
 wordCounter("Hello world", { latinLanguageHint: "en" });
 wordCounter("Hello world", { latinTagHint: "en" });
+wordCounter("Zażółć gęślą jaźń", {
+  latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
+});
+wordCounter("Über", { useDefaultLatinHints: false });
 wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
 wordCounter("Hi 👋, world!", { nonWords: true });
 wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
@@ -568,15 +594,22 @@ Example JSON (trimmed):
 - Detection is regex/script based (Unicode script checks), not a statistical language-ID model.
 - Ambiguous Latin text uses `und-Latn` unless a Latin hint is provided.
 - Han-script fallback uses `und-Hani` by default because regex script checks cannot natively distinguish `zh-Hans` vs `zh-Hant`.
-- Current built-in Latin diacritic heuristics are intentionally limited:
+- Current built-in Latin diacritic heuristics include:
   - `de`: `äöüÄÖÜß`
   - `es`: `ñÑ¿¡`
   - `pt`: `ãõÃÕ`
   - `fr`: `œŒæÆ`
+  - `pl`: `ąćęłńśźżĄĆĘŁŃŚŹŻ`
+  - `tr`: `ıİğĞşŞ`
+  - `ro`: `ăĂâÂîÎșȘțȚ`
+  - `hu`: `őŐűŰ`
+  - `is`: `ðÐþÞ`
 - Latin text with other European diacritics may still remain in `und-Latn` unless a hint is provided.
 - Use `--mode chunk`/`--mode segments` or `--format json` to see the exact locale tag assigned to each chunk.
 - Regex/script-only detection cannot reliably identify English vs. other Latin-script languages; 100% certainty requires explicit metadata (document language tags, user-provided locale, headers) or a language-ID model.
 - Use `--latin-language <tag>` or `--latin-tag <tag>` for ambiguous Latin text.
+- Use `--latin-hint <tag>=<pattern>` (repeatable) and `--latin-hints-file <path>` to add custom Latin rules.
+- Use `--no-default-latin-hints` to disable built-in Latin diacritic rules.
 - Use `--han-language <tag>` or `--han-tag <tag>` for Han-script fallback.
 - `--latin-locale` remains supported as a legacy alias for now and is planned for future deprecation.

package/dist/cjs/index.cjs CHANGED

@@ -322,10 +322,53 @@ function resolveMode(input, fallback = "chunk") {
 	return normalizeMode(input) ?? fallback;
 }
+//#endregion
+//#region src/wc/latin-hints.ts
+const DEFAULT_LATIN_HINT_RULES_SOURCE = [
+	{
+		tag: "de",
+		pattern: "[äöüÄÖÜß]"
+	},
+	{
+		tag: "es",
+		pattern: "[ñÑ¿¡]"
+	},
+	{
+		tag: "pt",
+		pattern: "[ãõÃÕ]"
+	},
+	{
+		tag: "fr",
+		pattern: "[œŒæÆ]"
+	},
+	{
+		tag: "pl",
+		pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]"
+	},
+	{
+		tag: "tr",
+		pattern: "[ıİğĞşŞ]"
+	},
+	{
+		tag: "ro",
+		pattern: "[ăĂâÂîÎșȘțȚ]"
+	},
+	{
+		tag: "hu",
+		pattern: "[őŐűŰ]"
+	},
+	{
+		tag: "is",
+		pattern: "[ðÐþÞ]"
+	}
+];
+const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
 //#endregion
 //#region src/wc/locale-detect.ts
 const DEFAULT_LOCALE = "und-Latn";
 const DEFAULT_HAN_TAG = "und-Hani";
+const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
 const regex = {
 	hiragana: /\p{Script=Hiragana}/u,
 	katakana: /\p{Script=Katakana}/u,
@@ -337,31 +380,10 @@ const regex = {
 	devanagari: /\p{Script=Devanagari}/u,
 	thai: /\p{Script=Thai}/u
 };
-const latinLocaleHints = [
-	{
-		locale: "de",
-		regex: /[äöüÄÖÜß]/
-	},
-	{
-		locale: "es",
-		regex: /[ñÑ¿¡]/
-	},
-	{
-		locale: "pt",
-		regex: /[ãõÃÕ]/
-	},
-	{
-		locale: "fr",
-		regex: /[œŒæÆ]/
-	}
-];
-const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
-function isLatinLocale(locale) {
-	return latinLocales.has(locale);
-}
-function detectLatinLocale(char) {
-	for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
-	return DEFAULT_LOCALE;
+const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
+function isLatinLocale(locale, context) {
+	if (context) return context.latinLocales.has(locale);
+	return defaultLatinLocales.has(locale);
 }
 function resolveLatinHint(options) {
 	const latinTagHint = options.latinTagHint?.trim();
@@ -377,7 +399,82 @@ function resolveHanHint(options) {
 	const hanLanguageHint = options.hanLanguageHint?.trim();
 	if (hanLanguageHint) return hanLanguageHint;
 }
-function detectLocaleForChar(char, previousLocale, options = {}) {
+function compileLatinHintPattern(pattern, label) {
+	const source = typeof pattern === "string" ? pattern : pattern.source;
+	const hasUnicodeMode = typeof pattern !== "string" && (pattern.flags.includes("u") || pattern.flags.includes("v"));
+	const flags = typeof pattern === "string" ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;
+	if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
+	if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);
+	try {
+		return new RegExp(source, flags);
+	} catch (error) {
+		const message = error instanceof Error ? error.message : String(error);
+		throw new Error(`${label}: invalid Unicode regex pattern (${message}).`);
+	}
+}
+function normalizeLatinHintPriority(priority, label) {
+	if (priority === void 0) return 0;
+	if (typeof priority !== "number" || !Number.isFinite(priority)) throw new Error(`${label}: priority must be a finite number when provided.`);
+	return priority;
+}
+function compileLatinHintRule(rule, order, label) {
+	const tag = typeof rule.tag === "string" ? rule.tag.trim() : "";
+	if (!tag) throw new Error(`${label}: tag must be a non-empty string.`);
+	return {
+		tag,
+		pattern: compileLatinHintPattern(rule.pattern, label),
+		priority: normalizeLatinHintPriority(rule.priority, label),
+		order
+	};
+}
+function resolveLatinHintRules(options) {
+	const useDefaultLatinHints = options.useDefaultLatinHints !== false;
+	const customRules = options.latinHintRules ?? [];
+	const combinedRules = [];
+	for (let index = 0; index < customRules.length; index += 1) {
+		const rule = customRules[index];
+		if (!rule) continue;
+		combinedRules.push({
+			rule,
+			label: `Invalid custom Latin hint rule at index ${index}`
+		});
+	}
+	if (useDefaultLatinHints) for (let index = 0; index < DEFAULT_LATIN_HINT_RULES.length; index += 1) {
+		const rule = DEFAULT_LATIN_HINT_RULES[index];
+		if (!rule) continue;
+		combinedRules.push({
+			rule,
+			label: `Invalid default Latin hint rule at index ${index}`
+		});
+	}
+	const resolvedRules = combinedRules.map((entry, index) => compileLatinHintRule(entry.rule, index, entry.label));
+	resolvedRules.sort((left, right) => {
+		if (left.priority !== right.priority) return right.priority - left.priority;
+		return left.order - right.order;
+	});
+	return resolvedRules;
+}
+function resolveLocaleDetectContext(options = {}) {
+	const latinHint = resolveLatinHint(options);
+	const latinHintRules = resolveLatinHintRules(options);
+	const latinLocales = new Set([DEFAULT_LOCALE]);
+	for (const rule of latinHintRules) latinLocales.add(rule.tag);
+	if (latinHint) latinLocales.add(latinHint);
+	return {
+		latinHint,
+		hanHint: resolveHanHint(options),
+		latinHintRules,
+		latinLocales
+	};
+}
+function detectLatinLocale(char, context) {
+	for (const hint of context.latinHintRules) {
+		hint.pattern.lastIndex = 0;
+		if (hint.pattern.test(char)) return hint.tag;
+	}
+	return DEFAULT_LOCALE;
+}
+function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options), allowLatinLocaleCarry = true, allowJapaneseHanCarry = true) {
 	if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
 	if (regex.hangul.test(char)) return "ko";
 	if (regex.arabic.test(char)) return "ar";
@@ -385,15 +482,14 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
 	if (regex.devanagari.test(char)) return "hi";
 	if (regex.thai.test(char)) return "th";
 	if (regex.han.test(char)) {
-		if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
-		return resolveHanHint(options) ?? DEFAULT_HAN_TAG;
+		if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
+		return context.hanHint ?? DEFAULT_HAN_TAG;
 	}
 	if (regex.latin.test(char)) {
-		const hintedLocale = detectLatinLocale(char);
+		const hintedLocale = detectLatinLocale(char, context);
 		if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
-		if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
-		const latinHint = resolveLatinHint(options);
-		if (latinHint) return latinHint;
+		if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
+		if (context.latinHint) return context.latinHint;
 		return DEFAULT_LOCALE;
 	}
 	return null;
@@ -401,31 +497,59 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
 //#endregion
 //#region src/wc/segment.ts
+const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:，、。！？；：．｡､]/u;
+const LATIN_PROMOTION_BREAK_REGEX = /[\s,.!?;:，、。！？；：．｡､]/u;
 function segmentTextByLocale(text, options = {}) {
+	const context = resolveLocaleDetectContext(options);
 	const chunks = [];
 	let currentLocale = DEFAULT_LOCALE;
 	let buffer = "";
 	let bufferHasScript = false;
+	let sawCarryBoundary = false;
+	const updateCarryBoundaryState = (detected, char) => {
+		if (detected !== null) {
+			sawCarryBoundary = false;
+			return;
+		}
+		if (HARD_BOUNDARY_REGEX.test(char)) sawCarryBoundary = true;
+	};
 	for (const char of text) {
-		const detected = detectLocaleForChar(char, currentLocale, options);
+		const detected = detectLocaleForChar(char, currentLocale, options, context, !sawCarryBoundary, !sawCarryBoundary);
 		const targetLocale = detected ?? currentLocale;
 		if (buffer === "") {
 			currentLocale = targetLocale;
 			buffer = char;
 			bufferHasScript = detected !== null;
+			updateCarryBoundaryState(detected, char);
 			continue;
 		}
 		if (detected !== null && !bufferHasScript) {
 			currentLocale = targetLocale;
 			buffer += char;
 			bufferHasScript = true;
+			updateCarryBoundaryState(detected, char);
 			continue;
 		}
 		if (targetLocale !== currentLocale && detected !== null) {
-			if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
+			if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
+				const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
+				if (promotionBreakIndex === -1) {
+					currentLocale = targetLocale;
+					buffer += char;
+					bufferHasScript = true;
+					updateCarryBoundaryState(detected, char);
+					continue;
+				}
+				const prefix = buffer.slice(0, promotionBreakIndex + 1);
+				const suffix = buffer.slice(promotionBreakIndex + 1);
+				if (prefix.length > 0) chunks.push({
+					locale: currentLocale,
+					text: prefix
+				});
 				currentLocale = targetLocale;
-				buffer += char;
+				buffer = `${suffix}${char}`;
 				bufferHasScript = true;
+				updateCarryBoundaryState(detected, char);
 				continue;
 			}
 			chunks.push({
@@ -435,10 +559,12 @@ function segmentTextByLocale(text, options = {}) {
 			currentLocale = targetLocale;
 			buffer = char;
 			bufferHasScript = true;
+			updateCarryBoundaryState(detected, char);
 			continue;
 		}
 		buffer += char;
 		if (detected !== null) bufferHasScript = true;
+		updateCarryBoundaryState(detected, char);
 	}
 	if (buffer.length > 0) chunks.push({
 		locale: currentLocale,
@@ -446,6 +572,14 @@ function segmentTextByLocale(text, options = {}) {
 	});
 	return mergeAdjacentChunks(chunks);
 }
+function findLastLatinPromotionBreakIndex(buffer) {
+	for (let index = buffer.length - 1; index >= 0; index -= 1) {
+		const char = buffer[index];
+		if (!char) continue;
+		if (LATIN_PROMOTION_BREAK_REGEX.test(char)) return index;
+	}
+	return -1;
+}
 function mergeAdjacentChunks(chunks) {
 	if (chunks.length === 0) return chunks;
 	const merged = [];
@@ -475,6 +609,8 @@ function wordCounter(text, options = {}) {
 		latinLanguageHint: options.latinLanguageHint,
 		latinTagHint: options.latinTagHint,
 		latinLocaleHint: options.latinLocaleHint,
+		latinHintRules: options.latinHintRules,
+		useDefaultLatinHints: options.useDefaultLatinHints,
 		hanLanguageHint: options.hanLanguageHint,
 		hanTagHint: options.hanTagHint
 	});