@dev-pi2pie/word-counter 0.1.2 → 0.1.3-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/esm/bin.mjs CHANGED
@@ -146,7 +146,7 @@ function createDebugChannel(options) {
146
146
 
147
147
  //#endregion
148
148
  //#region src/cli/path/filter.ts
149
- const DEFAULT_INCLUDE_EXTENSIONS = new Set([
149
+ const DEFAULT_INCLUDE_EXTENSIONS = Object.freeze([
150
150
  ".md",
151
151
  ".markdown",
152
152
  ".mdx",
@@ -223,13 +223,13 @@ function shouldIncludeFromDirectoryRegex(relativePath, filter) {
223
223
 
224
224
  //#endregion
225
225
  //#region src/cli/total-of.ts
226
- const TOTAL_OF_PARTS = [
226
+ const TOTAL_OF_PARTS = Object.freeze([
227
227
  "words",
228
228
  "emoji",
229
229
  "symbols",
230
230
  "punctuation",
231
231
  "whitespace"
232
- ];
232
+ ]);
233
233
  const TOTAL_OF_PART_ALIASES = {
234
234
  word: "words",
235
235
  words: "words",
@@ -342,8 +342,11 @@ const PATH_MODE_CHOICES = ["auto", "manual"];
342
342
  function collectPathValue(value, previous = []) {
343
343
  return [...previous, value];
344
344
  }
345
+ function collectLatinHintValue(value, previous = []) {
346
+ return [...previous, value];
347
+ }
345
348
  function configureProgramOptions(program, parseMode) {
346
- program.addOption(new Option("-m, --mode <mode>", "breakdown mode").choices(MODE_CHOICES).argParser(parseMode).default("chunk")).addOption(new Option("-f, --format <format>", "output format").choices(FORMAT_CHOICES).default("standard")).addOption(new Option("--section <section>", "document section mode").choices(SECTION_CHOICES).default("all")).addOption(new Option("--path-mode <mode>", "path resolution mode: auto (default) expands directories; manual treats --path values as literal files").choices(PATH_MODE_CHOICES).default("auto")).option("--latin-language <language>", "hint a language tag for Latin script text").option("--latin-tag <tag>", "hint a BCP 47 tag for Latin script text").option("--latin-locale <locale>", "legacy alias of --latin-language").option("--han-language <language>", "hint a language tag for Han script text").option("--han-tag <tag>", "hint a BCP 47 tag for Han script text").option("--non-words", "collect emoji, symbols, and punctuation (excludes whitespace)").option("--include-whitespace", "include whitespace counts (implies with --non-words; same as --misc)").option("--misc", "collect non-words plus whitespace (alias for --include-whitespace)").option("--total-of <parts>", "override total composition (comma-separated): words,emoji,symbols,punctuation,whitespace", parseTotalOfOption).option("--pretty", "pretty print JSON output", false).option("--debug", "enable debug diagnostics on stderr").option("--verbose", "emit verbose per-file debug diagnostics (requires --debug)").option("--debug-report [path]", "write debug diagnostics to a report file").option("--debug-report-tee", "mirror debug diagnostics to both report file and stderr").option("--debug-tee", "alias of --debug-report-tee").option("--merged", "show merged aggregate output (default)").option("--per-file", "show per-file output plus merged summary").option("--no-progress", "disable batch progress indicator").option("--keep-progress", "keep final batch progress line visible in standard mode").option("--no-recursive", "disable recursive directory traversal").option("--quiet-skips", "hide skip diagnostics (applies when --debug is enabled)").option("--include-ext <exts>", "comma-separated extensions to include during directory scanning", collectExtensionOption, []).option("--exclude-ext <exts>", "comma-separated extensions to exclude during directory scanning", collectExtensionOption, []).option("--regex <pattern>", "regex filter for directory-scanned paths (applies to --path directories only)").option("-p, --path <path>", "read input from file or directory (directories expand in auto mode by default)", collectPathValue, []).argument("[text...]", "text to count").showHelpAfterError();
349
+ program.addOption(new Option("-m, --mode <mode>", "breakdown mode").choices(MODE_CHOICES).argParser(parseMode).default("chunk")).addOption(new Option("-f, --format <format>", "output format").choices(FORMAT_CHOICES).default("standard")).addOption(new Option("--section <section>", "document section mode").choices(SECTION_CHOICES).default("all")).addOption(new Option("--path-mode <mode>", "path resolution mode: auto (default) expands directories; manual treats --path values as literal files").choices(PATH_MODE_CHOICES).default("auto")).option("--latin-language <language>", "hint a language tag for Latin script text").option("--latin-tag <tag>", "hint a BCP 47 tag for Latin script text").option("--latin-locale <locale>", "legacy alias of --latin-language").option("--latin-hint <tag>=<pattern>", "add a custom Latin hint rule (repeatable)", collectLatinHintValue, []).option("--latin-hints-file <path>", "load custom Latin hint rules from a JSON file").option("--no-default-latin-hints", "disable built-in Latin hint rules").option("--han-language <language>", "hint a language tag for Han script text").option("--han-tag <tag>", "hint a BCP 47 tag for Han script text").option("--non-words", "collect emoji, symbols, and punctuation (excludes whitespace)").option("--include-whitespace", "include whitespace counts (implies with --non-words; same as --misc)").option("--misc", "collect non-words plus whitespace (alias for --include-whitespace)").option("--total-of <parts>", "override total composition (comma-separated): words,emoji,symbols,punctuation,whitespace", parseTotalOfOption).option("--pretty", "pretty print JSON output", false).option("--debug", "enable debug diagnostics on stderr").option("--verbose", "emit verbose per-file debug diagnostics (requires --debug)").option("--debug-report [path]", "write debug diagnostics to a report file").option("--debug-report-tee", "mirror debug diagnostics to both report file and stderr").option("--debug-tee", "alias of --debug-report-tee").option("--merged", "show merged aggregate output (default)").option("--per-file", "show per-file output plus merged summary").option("--no-progress", "disable batch progress indicator").option("--keep-progress", "keep final batch progress line visible in standard mode").option("--no-recursive", "disable recursive directory traversal").option("--quiet-skips", "hide skip diagnostics (applies when --debug is enabled)").option("--include-ext <exts>", "comma-separated extensions to include during directory scanning", collectExtensionOption, []).option("--exclude-ext <exts>", "comma-separated extensions to exclude during directory scanning", collectExtensionOption, []).option("--regex <pattern>", "regex filter for directory-scanned paths (applies to --path directories only)").option("-p, --path <path>", "read input from file or directory (directories expand in auto mode by default)", collectPathValue, []).argument("[text...]", "text to count").showHelpAfterError();
347
350
  }
348
351
 
349
352
  //#endregion
@@ -418,7 +421,7 @@ var require_picocolors = /* @__PURE__ */ __commonJSMin(((exports, module) => {
418
421
  //#endregion
419
422
  //#region src/cli/program/version-embedded.ts
420
423
  var import_picocolors = /* @__PURE__ */ __toESM(require_picocolors(), 1);
421
- const EMBEDDED_PACKAGE_VERSION = "0.1.2";
424
+ const EMBEDDED_PACKAGE_VERSION = "0.1.3-canary.1";
422
425
 
423
426
  //#endregion
424
427
  //#region src/cli/program/version.ts
@@ -1605,10 +1608,53 @@ function resolveMode(input, fallback = "chunk") {
1605
1608
  return normalizeMode(input) ?? fallback;
1606
1609
  }
1607
1610
 
1611
+ //#endregion
1612
+ //#region src/wc/latin-hints.ts
1613
+ const DEFAULT_LATIN_HINT_RULES_SOURCE = [
1614
+ {
1615
+ tag: "de",
1616
+ pattern: "[äöüÄÖÜß]"
1617
+ },
1618
+ {
1619
+ tag: "es",
1620
+ pattern: "[ñÑ¿¡]"
1621
+ },
1622
+ {
1623
+ tag: "pt",
1624
+ pattern: "[ãõÃÕ]"
1625
+ },
1626
+ {
1627
+ tag: "fr",
1628
+ pattern: "[œŒæÆ]"
1629
+ },
1630
+ {
1631
+ tag: "pl",
1632
+ pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]"
1633
+ },
1634
+ {
1635
+ tag: "tr",
1636
+ pattern: "[ıİğĞşŞ]"
1637
+ },
1638
+ {
1639
+ tag: "ro",
1640
+ pattern: "[ăĂâÂîÎșȘțȚ]"
1641
+ },
1642
+ {
1643
+ tag: "hu",
1644
+ pattern: "[őŐűŰ]"
1645
+ },
1646
+ {
1647
+ tag: "is",
1648
+ pattern: "[ðÐþÞ]"
1649
+ }
1650
+ ];
1651
+ const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
1652
+
1608
1653
  //#endregion
1609
1654
  //#region src/wc/locale-detect.ts
1610
1655
  const DEFAULT_LOCALE = "und-Latn";
1611
- const DEFAULT_HAN_TAG = "zh-Hani";
1656
+ const DEFAULT_HAN_TAG = "und-Hani";
1657
+ const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
1612
1658
  const regex = {
1613
1659
  hiragana: /\p{Script=Hiragana}/u,
1614
1660
  katakana: /\p{Script=Katakana}/u,
@@ -1620,31 +1666,10 @@ const regex = {
1620
1666
  devanagari: /\p{Script=Devanagari}/u,
1621
1667
  thai: /\p{Script=Thai}/u
1622
1668
  };
1623
- const latinLocaleHints = [
1624
- {
1625
- locale: "de",
1626
- regex: /[äöüÄÖÜß]/
1627
- },
1628
- {
1629
- locale: "es",
1630
- regex: /[ñÑ¿¡]/
1631
- },
1632
- {
1633
- locale: "pt",
1634
- regex: /[ãõÃÕ]/
1635
- },
1636
- {
1637
- locale: "fr",
1638
- regex: /[œŒæÆ]/
1639
- }
1640
- ];
1641
- const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
1642
- function isLatinLocale(locale) {
1643
- return latinLocales.has(locale);
1644
- }
1645
- function detectLatinLocale(char) {
1646
- for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
1647
- return DEFAULT_LOCALE;
1669
+ const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
1670
+ function isLatinLocale(locale, context) {
1671
+ if (context) return context.latinLocales.has(locale);
1672
+ return defaultLatinLocales.has(locale);
1648
1673
  }
1649
1674
  function resolveLatinHint(options) {
1650
1675
  const latinTagHint = options.latinTagHint?.trim();
@@ -1660,7 +1685,82 @@ function resolveHanHint(options) {
1660
1685
  const hanLanguageHint = options.hanLanguageHint?.trim();
1661
1686
  if (hanLanguageHint) return hanLanguageHint;
1662
1687
  }
1663
- function detectLocaleForChar(char, previousLocale, options = {}) {
1688
+ function compileLatinHintPattern(pattern, label) {
1689
+ const source = typeof pattern === "string" ? pattern : pattern.source;
1690
+ const hasUnicodeMode = typeof pattern !== "string" && (pattern.flags.includes("u") || pattern.flags.includes("v"));
1691
+ const flags = typeof pattern === "string" ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;
1692
+ if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
1693
+ if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);
1694
+ try {
1695
+ return new RegExp(source, flags);
1696
+ } catch (error) {
1697
+ const message = error instanceof Error ? error.message : String(error);
1698
+ throw new Error(`${label}: invalid Unicode regex pattern (${message}).`);
1699
+ }
1700
+ }
1701
+ function normalizeLatinHintPriority(priority, label) {
1702
+ if (priority === void 0) return 0;
1703
+ if (typeof priority !== "number" || !Number.isFinite(priority)) throw new Error(`${label}: priority must be a finite number when provided.`);
1704
+ return priority;
1705
+ }
1706
+ function compileLatinHintRule(rule, order, label) {
1707
+ const tag = typeof rule.tag === "string" ? rule.tag.trim() : "";
1708
+ if (!tag) throw new Error(`${label}: tag must be a non-empty string.`);
1709
+ return {
1710
+ tag,
1711
+ pattern: compileLatinHintPattern(rule.pattern, label),
1712
+ priority: normalizeLatinHintPriority(rule.priority, label),
1713
+ order
1714
+ };
1715
+ }
1716
+ function resolveLatinHintRules$1(options) {
1717
+ const useDefaultLatinHints = options.useDefaultLatinHints !== false;
1718
+ const customRules = options.latinHintRules ?? [];
1719
+ const combinedRules = [];
1720
+ for (let index = 0; index < customRules.length; index += 1) {
1721
+ const rule = customRules[index];
1722
+ if (!rule) continue;
1723
+ combinedRules.push({
1724
+ rule,
1725
+ label: `Invalid custom Latin hint rule at index ${index}`
1726
+ });
1727
+ }
1728
+ if (useDefaultLatinHints) for (let index = 0; index < DEFAULT_LATIN_HINT_RULES.length; index += 1) {
1729
+ const rule = DEFAULT_LATIN_HINT_RULES[index];
1730
+ if (!rule) continue;
1731
+ combinedRules.push({
1732
+ rule,
1733
+ label: `Invalid default Latin hint rule at index ${index}`
1734
+ });
1735
+ }
1736
+ const resolvedRules = combinedRules.map((entry, index) => compileLatinHintRule(entry.rule, index, entry.label));
1737
+ resolvedRules.sort((left, right) => {
1738
+ if (left.priority !== right.priority) return right.priority - left.priority;
1739
+ return left.order - right.order;
1740
+ });
1741
+ return resolvedRules;
1742
+ }
1743
+ function resolveLocaleDetectContext(options = {}) {
1744
+ const latinHint = resolveLatinHint(options);
1745
+ const latinHintRules = resolveLatinHintRules$1(options);
1746
+ const latinLocales = new Set([DEFAULT_LOCALE]);
1747
+ for (const rule of latinHintRules) latinLocales.add(rule.tag);
1748
+ if (latinHint) latinLocales.add(latinHint);
1749
+ return {
1750
+ latinHint,
1751
+ hanHint: resolveHanHint(options),
1752
+ latinHintRules,
1753
+ latinLocales
1754
+ };
1755
+ }
1756
+ function detectLatinLocale(char, context) {
1757
+ for (const hint of context.latinHintRules) {
1758
+ hint.pattern.lastIndex = 0;
1759
+ if (hint.pattern.test(char)) return hint.tag;
1760
+ }
1761
+ return DEFAULT_LOCALE;
1762
+ }
1763
+ function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options)) {
1664
1764
  if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
1665
1765
  if (regex.hangul.test(char)) return "ko";
1666
1766
  if (regex.arabic.test(char)) return "ar";
@@ -1669,14 +1769,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
1669
1769
  if (regex.thai.test(char)) return "th";
1670
1770
  if (regex.han.test(char)) {
1671
1771
  if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
1672
- return resolveHanHint(options) ?? DEFAULT_HAN_TAG;
1772
+ return context.hanHint ?? DEFAULT_HAN_TAG;
1673
1773
  }
1674
1774
  if (regex.latin.test(char)) {
1675
- const hintedLocale = detectLatinLocale(char);
1775
+ const hintedLocale = detectLatinLocale(char, context);
1676
1776
  if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
1677
- if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
1678
- const latinHint = resolveLatinHint(options);
1679
- if (latinHint) return latinHint;
1777
+ if (previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
1778
+ if (context.latinHint) return context.latinHint;
1680
1779
  return DEFAULT_LOCALE;
1681
1780
  }
1682
1781
  return null;
@@ -1685,12 +1784,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
1685
1784
  //#endregion
1686
1785
  //#region src/wc/segment.ts
1687
1786
  function segmentTextByLocale(text, options = {}) {
1787
+ const context = resolveLocaleDetectContext(options);
1688
1788
  const chunks = [];
1689
1789
  let currentLocale = DEFAULT_LOCALE;
1690
1790
  let buffer = "";
1691
1791
  let bufferHasScript = false;
1692
1792
  for (const char of text) {
1693
- const detected = detectLocaleForChar(char, currentLocale, options);
1793
+ const detected = detectLocaleForChar(char, currentLocale, options, context);
1694
1794
  const targetLocale = detected ?? currentLocale;
1695
1795
  if (buffer === "") {
1696
1796
  currentLocale = targetLocale;
@@ -1705,7 +1805,7 @@ function segmentTextByLocale(text, options = {}) {
1705
1805
  continue;
1706
1806
  }
1707
1807
  if (targetLocale !== currentLocale && detected !== null) {
1708
- if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
1808
+ if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
1709
1809
  currentLocale = targetLocale;
1710
1810
  buffer += char;
1711
1811
  bufferHasScript = true;
@@ -1758,6 +1858,8 @@ function wordCounter(text, options = {}) {
1758
1858
  latinLanguageHint: options.latinLanguageHint,
1759
1859
  latinTagHint: options.latinTagHint,
1760
1860
  latinLocaleHint: options.latinLocaleHint,
1861
+ latinHintRules: options.latinHintRules,
1862
+ useDefaultLatinHints: options.useDefaultLatinHints,
1761
1863
  hanLanguageHint: options.hanLanguageHint,
1762
1864
  hanTagHint: options.hanTagHint
1763
1865
  });
@@ -2477,6 +2579,57 @@ function resolveDebugReportPathOption(rawValue) {
2477
2579
  if (rawValue === void 0 || rawValue === false) return;
2478
2580
  if (typeof rawValue === "string") return rawValue;
2479
2581
  }
2582
+ function parseInlineLatinHintRule(value) {
2583
+ const separatorIndex = value.indexOf("=");
2584
+ if (separatorIndex <= 0) throw new Error("`--latin-hint` must use `<tag>=<pattern>` format.");
2585
+ const tag = value.slice(0, separatorIndex).trim();
2586
+ const pattern = value.slice(separatorIndex + 1);
2587
+ if (!tag) throw new Error("`--latin-hint` tag must be non-empty.");
2588
+ if (!pattern) throw new Error("`--latin-hint` pattern must be non-empty.");
2589
+ return {
2590
+ tag,
2591
+ pattern
2592
+ };
2593
+ }
2594
+ function parseLatinHintsFileRule(value, index, sourcePath) {
2595
+ if (typeof value !== "object" || value === null) throw new Error(`Invalid Latin hint rule at ${sourcePath}#${index}: rule must be an object.`);
2596
+ const tag = "tag" in value ? value.tag : void 0;
2597
+ const pattern = "pattern" in value ? value.pattern : void 0;
2598
+ const priority = "priority" in value ? value.priority : void 0;
2599
+ if (typeof tag !== "string" || tag.trim().length === 0) throw new Error(`Invalid Latin hint rule at ${sourcePath}#${index}: tag must be a non-empty string.`);
2600
+ if (typeof pattern !== "string") throw new Error(`Invalid Latin hint rule at ${sourcePath}#${index}: pattern must be a string.`);
2601
+ if (priority !== void 0 && (typeof priority !== "number" || !Number.isFinite(priority))) throw new Error(`Invalid Latin hint rule at ${sourcePath}#${index}: priority must be a finite number.`);
2602
+ return {
2603
+ tag,
2604
+ pattern,
2605
+ ...priority !== void 0 ? { priority } : {}
2606
+ };
2607
+ }
2608
+ function parseLatinHintsFile(path) {
2609
+ let raw;
2610
+ try {
2611
+ raw = readFileSync(path, "utf8");
2612
+ } catch (error) {
2613
+ const message = error instanceof Error ? error.message : String(error);
2614
+ throw new Error(`Failed to read Latin hint file (${path}): ${message}`);
2615
+ }
2616
+ let parsed;
2617
+ try {
2618
+ parsed = JSON.parse(raw);
2619
+ } catch (error) {
2620
+ const message = error instanceof Error ? error.message : String(error);
2621
+ throw new Error(`Invalid JSON in Latin hint file (${path}): ${message}`);
2622
+ }
2623
+ if (!Array.isArray(parsed)) throw new Error(`Latin hint file (${path}) must contain a JSON array.`);
2624
+ return parsed.map((rule, index) => parseLatinHintsFileRule(rule, index, path));
2625
+ }
2626
+ function resolveLatinHintRules(options) {
2627
+ const inlineRules = (options.latinHint ?? []).map((value) => parseInlineLatinHintRule(value));
2628
+ const fileRules = typeof options.latinHintsFile === "string" && options.latinHintsFile.length > 0 ? parseLatinHintsFile(options.latinHintsFile) : [];
2629
+ const mergedRules = [...inlineRules, ...fileRules];
2630
+ if (mergedRules.length === 0) return;
2631
+ return mergedRules;
2632
+ }
2480
2633
  function resolveCountRunOptions(options) {
2481
2634
  const useSection = options.section !== "all";
2482
2635
  const totalOfParts = options.totalOf;
@@ -2495,6 +2648,8 @@ function resolveCountRunOptions(options) {
2495
2648
  latinLanguageHint: options.latinLanguage,
2496
2649
  latinTagHint: options.latinTag,
2497
2650
  latinLocaleHint: options.latinLocale,
2651
+ latinHintRules: resolveLatinHintRules(options),
2652
+ useDefaultLatinHints: options.defaultLatinHints !== false,
2498
2653
  hanLanguageHint: options.hanLanguage,
2499
2654
  hanTagHint: options.hanTag,
2500
2655
  nonWords: enableNonWords,