@dev-pi2pie/word-counter 0.1.3-canary.1 → 0.1.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/esm/bin.mjs CHANGED
@@ -421,7 +421,7 @@ var require_picocolors = /* @__PURE__ */ __commonJSMin(((exports, module) => {
421
421
  //#endregion
422
422
  //#region src/cli/program/version-embedded.ts
423
423
  var import_picocolors = /* @__PURE__ */ __toESM(require_picocolors(), 1);
424
- const EMBEDDED_PACKAGE_VERSION = "0.1.3-canary.1";
424
+ const EMBEDDED_PACKAGE_VERSION = "0.1.3";
425
425
 
426
426
  //#endregion
427
427
  //#region src/cli/program/version.ts
@@ -1760,7 +1760,7 @@ function detectLatinLocale(char, context) {
1760
1760
  }
1761
1761
  return DEFAULT_LOCALE;
1762
1762
  }
1763
- function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options)) {
1763
+ function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options), allowLatinLocaleCarry = true, allowJapaneseHanCarry = true) {
1764
1764
  if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
1765
1765
  if (regex.hangul.test(char)) return "ko";
1766
1766
  if (regex.arabic.test(char)) return "ar";
@@ -1768,13 +1768,13 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
1768
1768
  if (regex.devanagari.test(char)) return "hi";
1769
1769
  if (regex.thai.test(char)) return "th";
1770
1770
  if (regex.han.test(char)) {
1771
- if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
1771
+ if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
1772
1772
  return context.hanHint ?? DEFAULT_HAN_TAG;
1773
1773
  }
1774
1774
  if (regex.latin.test(char)) {
1775
1775
  const hintedLocale = detectLatinLocale(char, context);
1776
1776
  if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
1777
- if (previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
1777
+ if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
1778
1778
  if (context.latinHint) return context.latinHint;
1779
1779
  return DEFAULT_LOCALE;
1780
1780
  }
@@ -1783,32 +1783,59 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
1783
1783
 
1784
1784
  //#endregion
1785
1785
  //#region src/wc/segment.ts
1786
+ const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
1787
+ const LATIN_PROMOTION_BREAK_REGEX = /[\s,.!?;:,、。!?;:.。、]/u;
1786
1788
  function segmentTextByLocale(text, options = {}) {
1787
1789
  const context = resolveLocaleDetectContext(options);
1788
1790
  const chunks = [];
1789
1791
  let currentLocale = DEFAULT_LOCALE;
1790
1792
  let buffer = "";
1791
1793
  let bufferHasScript = false;
1794
+ let sawCarryBoundary = false;
1795
+ const updateCarryBoundaryState = (detected, char) => {
1796
+ if (detected !== null) {
1797
+ sawCarryBoundary = false;
1798
+ return;
1799
+ }
1800
+ if (HARD_BOUNDARY_REGEX.test(char)) sawCarryBoundary = true;
1801
+ };
1792
1802
  for (const char of text) {
1793
- const detected = detectLocaleForChar(char, currentLocale, options, context);
1803
+ const detected = detectLocaleForChar(char, currentLocale, options, context, !sawCarryBoundary, !sawCarryBoundary);
1794
1804
  const targetLocale = detected ?? currentLocale;
1795
1805
  if (buffer === "") {
1796
1806
  currentLocale = targetLocale;
1797
1807
  buffer = char;
1798
1808
  bufferHasScript = detected !== null;
1809
+ updateCarryBoundaryState(detected, char);
1799
1810
  continue;
1800
1811
  }
1801
1812
  if (detected !== null && !bufferHasScript) {
1802
1813
  currentLocale = targetLocale;
1803
1814
  buffer += char;
1804
1815
  bufferHasScript = true;
1816
+ updateCarryBoundaryState(detected, char);
1805
1817
  continue;
1806
1818
  }
1807
1819
  if (targetLocale !== currentLocale && detected !== null) {
1808
1820
  if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
1821
+ const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
1822
+ if (promotionBreakIndex === -1) {
1823
+ currentLocale = targetLocale;
1824
+ buffer += char;
1825
+ bufferHasScript = true;
1826
+ updateCarryBoundaryState(detected, char);
1827
+ continue;
1828
+ }
1829
+ const prefix = buffer.slice(0, promotionBreakIndex + 1);
1830
+ const suffix = buffer.slice(promotionBreakIndex + 1);
1831
+ if (prefix.length > 0) chunks.push({
1832
+ locale: currentLocale,
1833
+ text: prefix
1834
+ });
1809
1835
  currentLocale = targetLocale;
1810
- buffer += char;
1836
+ buffer = `${suffix}${char}`;
1811
1837
  bufferHasScript = true;
1838
+ updateCarryBoundaryState(detected, char);
1812
1839
  continue;
1813
1840
  }
1814
1841
  chunks.push({
@@ -1818,10 +1845,12 @@ function segmentTextByLocale(text, options = {}) {
1818
1845
  currentLocale = targetLocale;
1819
1846
  buffer = char;
1820
1847
  bufferHasScript = true;
1848
+ updateCarryBoundaryState(detected, char);
1821
1849
  continue;
1822
1850
  }
1823
1851
  buffer += char;
1824
1852
  if (detected !== null) bufferHasScript = true;
1853
+ updateCarryBoundaryState(detected, char);
1825
1854
  }
1826
1855
  if (buffer.length > 0) chunks.push({
1827
1856
  locale: currentLocale,
@@ -1829,6 +1858,14 @@ function segmentTextByLocale(text, options = {}) {
1829
1858
  });
1830
1859
  return mergeAdjacentChunks(chunks);
1831
1860
  }
1861
+ function findLastLatinPromotionBreakIndex(buffer) {
1862
+ for (let index = buffer.length - 1; index >= 0; index -= 1) {
1863
+ const char = buffer[index];
1864
+ if (!char) continue;
1865
+ if (LATIN_PROMOTION_BREAK_REGEX.test(char)) return index;
1866
+ }
1867
+ return -1;
1868
+ }
1832
1869
  function mergeAdjacentChunks(chunks) {
1833
1870
  if (chunks.length === 0) return chunks;
1834
1871
  const merged = [];