@dev-pi2pie/word-counter 0.1.3-canary.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -21
- package/dist/cjs/index.cjs +42 -5
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/esm/bin.mjs +43 -6
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/index.mjs +42 -5
- package/dist/esm/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/esm/bin.mjs
CHANGED
|
@@ -421,7 +421,7 @@ var require_picocolors = /* @__PURE__ */ __commonJSMin(((exports, module) => {
|
|
|
421
421
|
//#endregion
|
|
422
422
|
//#region src/cli/program/version-embedded.ts
|
|
423
423
|
var import_picocolors = /* @__PURE__ */ __toESM(require_picocolors(), 1);
|
|
424
|
-
const EMBEDDED_PACKAGE_VERSION = "0.1.3-canary.1";
|
|
424
|
+
const EMBEDDED_PACKAGE_VERSION = "0.1.3";
|
|
425
425
|
|
|
426
426
|
//#endregion
|
|
427
427
|
//#region src/cli/program/version.ts
|
|
@@ -1760,7 +1760,7 @@ function detectLatinLocale(char, context) {
|
|
|
1760
1760
|
}
|
|
1761
1761
|
return DEFAULT_LOCALE;
|
|
1762
1762
|
}
|
|
1763
|
-
function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options)) {
|
|
1763
|
+
function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options), allowLatinLocaleCarry = true, allowJapaneseHanCarry = true) {
|
|
1764
1764
|
if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
|
|
1765
1765
|
if (regex.hangul.test(char)) return "ko";
|
|
1766
1766
|
if (regex.arabic.test(char)) return "ar";
|
|
@@ -1768,13 +1768,13 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
|
|
|
1768
1768
|
if (regex.devanagari.test(char)) return "hi";
|
|
1769
1769
|
if (regex.thai.test(char)) return "th";
|
|
1770
1770
|
if (regex.han.test(char)) {
|
|
1771
|
-
if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
1771
|
+
if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
1772
1772
|
return context.hanHint ?? DEFAULT_HAN_TAG;
|
|
1773
1773
|
}
|
|
1774
1774
|
if (regex.latin.test(char)) {
|
|
1775
1775
|
const hintedLocale = detectLatinLocale(char, context);
|
|
1776
1776
|
if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
|
|
1777
|
-
if (previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
1777
|
+
if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
1778
1778
|
if (context.latinHint) return context.latinHint;
|
|
1779
1779
|
return DEFAULT_LOCALE;
|
|
1780
1780
|
}
|
|
@@ -1783,32 +1783,59 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
|
|
|
1783
1783
|
|
|
1784
1784
|
//#endregion
|
|
1785
1785
|
//#region src/wc/segment.ts
|
|
1786
|
+
const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
|
|
1787
|
+
const LATIN_PROMOTION_BREAK_REGEX = /[\s,.!?;:,、。!?;:.。、]/u;
|
|
1786
1788
|
function segmentTextByLocale(text, options = {}) {
|
|
1787
1789
|
const context = resolveLocaleDetectContext(options);
|
|
1788
1790
|
const chunks = [];
|
|
1789
1791
|
let currentLocale = DEFAULT_LOCALE;
|
|
1790
1792
|
let buffer = "";
|
|
1791
1793
|
let bufferHasScript = false;
|
|
1794
|
+
let sawCarryBoundary = false;
|
|
1795
|
+
const updateCarryBoundaryState = (detected, char) => {
|
|
1796
|
+
if (detected !== null) {
|
|
1797
|
+
sawCarryBoundary = false;
|
|
1798
|
+
return;
|
|
1799
|
+
}
|
|
1800
|
+
if (HARD_BOUNDARY_REGEX.test(char)) sawCarryBoundary = true;
|
|
1801
|
+
};
|
|
1792
1802
|
for (const char of text) {
|
|
1793
|
-
const detected = detectLocaleForChar(char, currentLocale, options, context);
|
|
1803
|
+
const detected = detectLocaleForChar(char, currentLocale, options, context, !sawCarryBoundary, !sawCarryBoundary);
|
|
1794
1804
|
const targetLocale = detected ?? currentLocale;
|
|
1795
1805
|
if (buffer === "") {
|
|
1796
1806
|
currentLocale = targetLocale;
|
|
1797
1807
|
buffer = char;
|
|
1798
1808
|
bufferHasScript = detected !== null;
|
|
1809
|
+
updateCarryBoundaryState(detected, char);
|
|
1799
1810
|
continue;
|
|
1800
1811
|
}
|
|
1801
1812
|
if (detected !== null && !bufferHasScript) {
|
|
1802
1813
|
currentLocale = targetLocale;
|
|
1803
1814
|
buffer += char;
|
|
1804
1815
|
bufferHasScript = true;
|
|
1816
|
+
updateCarryBoundaryState(detected, char);
|
|
1805
1817
|
continue;
|
|
1806
1818
|
}
|
|
1807
1819
|
if (targetLocale !== currentLocale && detected !== null) {
|
|
1808
1820
|
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
|
|
1821
|
+
const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
|
|
1822
|
+
if (promotionBreakIndex === -1) {
|
|
1823
|
+
currentLocale = targetLocale;
|
|
1824
|
+
buffer += char;
|
|
1825
|
+
bufferHasScript = true;
|
|
1826
|
+
updateCarryBoundaryState(detected, char);
|
|
1827
|
+
continue;
|
|
1828
|
+
}
|
|
1829
|
+
const prefix = buffer.slice(0, promotionBreakIndex + 1);
|
|
1830
|
+
const suffix = buffer.slice(promotionBreakIndex + 1);
|
|
1831
|
+
if (prefix.length > 0) chunks.push({
|
|
1832
|
+
locale: currentLocale,
|
|
1833
|
+
text: prefix
|
|
1834
|
+
});
|
|
1809
1835
|
currentLocale = targetLocale;
|
|
1810
|
-
buffer += char;
|
|
1836
|
+
buffer = `${suffix}${char}`;
|
|
1811
1837
|
bufferHasScript = true;
|
|
1838
|
+
updateCarryBoundaryState(detected, char);
|
|
1812
1839
|
continue;
|
|
1813
1840
|
}
|
|
1814
1841
|
chunks.push({
|
|
@@ -1818,10 +1845,12 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
1818
1845
|
currentLocale = targetLocale;
|
|
1819
1846
|
buffer = char;
|
|
1820
1847
|
bufferHasScript = true;
|
|
1848
|
+
updateCarryBoundaryState(detected, char);
|
|
1821
1849
|
continue;
|
|
1822
1850
|
}
|
|
1823
1851
|
buffer += char;
|
|
1824
1852
|
if (detected !== null) bufferHasScript = true;
|
|
1853
|
+
updateCarryBoundaryState(detected, char);
|
|
1825
1854
|
}
|
|
1826
1855
|
if (buffer.length > 0) chunks.push({
|
|
1827
1856
|
locale: currentLocale,
|
|
@@ -1829,6 +1858,14 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
1829
1858
|
});
|
|
1830
1859
|
return mergeAdjacentChunks(chunks);
|
|
1831
1860
|
}
|
|
1861
|
+
function findLastLatinPromotionBreakIndex(buffer) {
|
|
1862
|
+
for (let index = buffer.length - 1; index >= 0; index -= 1) {
|
|
1863
|
+
const char = buffer[index];
|
|
1864
|
+
if (!char) continue;
|
|
1865
|
+
if (LATIN_PROMOTION_BREAK_REGEX.test(char)) return index;
|
|
1866
|
+
}
|
|
1867
|
+
return -1;
|
|
1868
|
+
}
|
|
1832
1869
|
function mergeAdjacentChunks(chunks) {
|
|
1833
1870
|
if (chunks.length === 0) return chunks;
|
|
1834
1871
|
const merged = [];
|