@dev-pi2pie/word-counter 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -47,29 +47,9 @@ npm unlink --global @dev-pi2pie/word-counter
  npm install -g @dev-pi2pie/word-counter@latest
  ```

- ### From GitHub Packages
-
- If your scope is configured to use GitHub Packages:
-
- ```bash
- # ~/.npmrc
- @dev-pi2pie:registry=https://npm.pkg.github.com
- ```
-
- ```bash
- npm install -g @dev-pi2pie/word-counter@latest
- ```
-
- If your scope is configured to use npmjs instead, the same scoped package name
- will resolve from npmjs.com (see the npm registry section above).
-
- > [!note]
- > **npm** may show newer releases (for example, `v0.0.6`) while GitHub Packages still lists `v0.0.5`.
- > This is historical; releases kept in sync starting with `v0.0.6`.
-
  ## Usage

- Once installed (via `npm link`, npm registry, or GitHub Packages), you can use the CLI directly:
+ Once installed (via `npm link` or the npm registry), you can use the CLI directly:

  ```bash
  word-counter "Hello 世界 안녕"
@@ -109,12 +89,13 @@ word-counter --path ./fixtures/sample.txt

  ## Library Usage

- The package exports can be used after installing from GitHub Packages or linking locally with `npm link`.
+ The package exports can be used after installing from the npm registry or linking locally with `npm link`.

  ### ESM

  ```js
  import wordCounter, {
+ countCharsForLocale,
  countWordsForLocale,
  countSections,
  parseMarkdown,
@@ -124,6 +105,36 @@ import wordCounter, {

  wordCounter("Hello world", { latinLocaleHint: "en" });
  wordCounter("Hi 👋, world!", { nonWords: true });
+ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
+ wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
+ countCharsForLocale("👋", "en");
+ ```
+
+ Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
+
+ Sample output (with `nonWords: true` and `includeWhitespace: true`):
+
+ ```json
+ {
+ "total": 4,
+ "counts": { "words": 2, "nonWords": 2, "total": 4 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ // ...
+ "words": 2,
+ "nonWords": {
+ "emoji": [],
+ "symbols": [],
+ "punctuation": [],
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
+ "whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
  ```

  ### CJS
@@ -131,6 +142,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
  ```js
  const wordCounter = require("@dev-pi2pie/word-counter");
  const {
+ countCharsForLocale,
  countWordsForLocale,
  countSections,
  parseMarkdown,
@@ -140,6 +152,36 @@ const {

  wordCounter("Hello world", { latinLocaleHint: "en" });
  wordCounter("Hi 👋, world!", { nonWords: true });
+ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
+ wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
+ countCharsForLocale("👋", "en");
+ ```
+
+ Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
+
+ Sample output (with `nonWords: true` and `includeWhitespace: true`):
+
+ ```json
+ {
+ "total": 4,
+ "counts": { "words": 2, "nonWords": 2, "total": 4 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ // ...
+ "words": 2,
+ "nonWords": {
+ "emoji": [],
+ "symbols": [],
+ "punctuation": [],
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
+ "whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
  ```

  ### Export Summary
@@ -150,6 +192,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
  | --------------------- | -------- | -------------------------------------------------- |
  | `default` | function | `wordCounter(text, options?) -> WordCounterResult` |
  | `wordCounter` | function | Alias of the default export. |
+ | `countCharsForLocale` | function | Low-level helper for per-locale char counts. |
  | `countWordsForLocale` | function | Low-level helper for per-locale counts. |
  | `segmentTextByLocale` | function | Low-level helper for locale-aware segmentation. |

@@ -168,13 +211,13 @@ wordCounter("Hi 👋, world!", { nonWords: true });

  #### Types

- | Export | Kind | Notes |
- | ---------------------- | ---- | ----------------------------------------- |
- | `WordCounterOptions` | type | Options for the `wordCounter` function. |
- | `WordCounterResult` | type | Returned by `wordCounter`. |
- | `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
- | `WordCounterMode` | type | `"chunk" \| "segments" \| "collector"`. |
- | `NonWordCollection` | type | Non-word segments + counts payload. |
+ | Export | Kind | Notes |
+ | ---------------------- | ---- | ------------------------------------------------- |
+ | `WordCounterOptions` | type | Options for the `wordCounter` function. |
+ | `WordCounterResult` | type | Returned by `wordCounter`. |
+ | `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
+ | `WordCounterMode` | type | `"chunk" \| "segments" \| "collector" \| "char"`. |
+ | `NonWordCollection` | type | Non-word segments + counts payload. |

  ### Display Modes

@@ -183,6 +226,14 @@ Choose a breakdown style with `--mode` (or `-m`):
  - `chunk` (default) – list each contiguous locale block in order of appearance.
  - `segments` – show the actual wordlike segments used for counting.
  - `collector` – aggregate counts per locale regardless of text position.
+ - `char` – count grapheme clusters (user-perceived characters) per locale.
+
+ Aliases are normalized for CLI + API:
+
+ - `chunk`, `chunks`
+ - `segments`, `segment`, `seg`
+ - `collector`, `collect`, `colle`
+ - `char`, `chars`, `character`, `characters`

  Examples:

@@ -195,6 +246,9 @@ word-counter --mode segments "飛鳥 bird 貓 cat; how do you do?"

  # aggregate per locale
  word-counter -m collector "飛鳥 bird 貓 cat; how do you do?"
+
+ # grapheme-aware character count
+ word-counter -m char "Hi 👋, world!"
  ```

  ### Section Modes (Frontmatter)
@@ -268,6 +322,37 @@ word-counter --non-words "Hi 👋, world!"
  Example: `total = words + emoji + symbols + punctuation` when enabled.
  Standard output labels this as `Total count` to reflect the combined total; `--format raw` still prints a single number.

+ Include whitespace-like characters in the non-words bucket (API: `includeWhitespace: true`):
+
+ ```bash
+ word-counter --include-whitespace "Hi\tthere\n"
+ word-counter --misc "Hi\tthere\n"
+ ```
+
+ In the CLI, `--include-whitespace` implies `--non-words` (same behavior as `--misc`). `--non-words` alone does not include whitespace. When enabled, whitespace counts appear under `nonWords.whitespace`, and `total = words + nonWords` (emoji + symbols + punctuation + whitespace). JSON output also includes top-level `counts` when `nonWords` is enabled. See `docs/schemas/whitespace-categories.md` for how whitespace is categorized.
+
+ Example JSON (trimmed):
+
+ ```json
+ {
+ "total": 5,
+ "counts": { "words": 2, "nonWords": 3, "total": 5 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ "locale": "und-Latn",
+ "words": 2,
+ "nonWords": {
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 3 },
+ "whitespace": { "spaces": 1, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
+ ```
+
  > [!Note]
  > Text-default symbols (e.g. ©) count as `symbols` unless explicitly emoji-presented (e.g. ©️ with VS16).

@@ -2,6 +2,7 @@ let yaml = require("yaml");

  //#region src/wc/segmenter.ts
  const segmenterCache = /* @__PURE__ */ new Map();
+ const graphemeSegmenterCache = /* @__PURE__ */ new Map();
  function getSegmenter(locale) {
  const cached = segmenterCache.get(locale);
  if (cached) return cached;
@@ -9,12 +10,29 @@ function getSegmenter(locale) {
  segmenterCache.set(locale, segmenter);
  return segmenter;
  }
+ function getGraphemeSegmenter(locale) {
+ const cached = graphemeSegmenterCache.get(locale);
+ if (cached) return cached;
+ const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
+ graphemeSegmenterCache.set(locale, segmenter);
+ return segmenter;
+ }
+ function supportsSegmenter() {
+ return typeof Intl !== "undefined" && typeof Intl.Segmenter === "function";
+ }
  function countWordsForLocale(text, locale) {
  const segmenter = getSegmenter(locale);
  let count = 0;
  for (const segment of segmenter.segment(text)) if (segment.isWordLike) count++;
  return count;
  }
+ function countCharsForLocale(text, locale) {
+ if (!supportsSegmenter()) return Array.from(text).length;
+ const segmenter = getGraphemeSegmenter(locale);
+ let count = 0;
+ for (const _segment of segmenter.segment(text)) count++;
+ return count;
+ }

  //#endregion
  //#region src/wc/non-words.ts
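A note on the new grapheme path above: `Intl.Segmenter` with `granularity: "grapheme"` counts user-perceived characters, while the `Array.from(text).length` fallback (used when the segmenter is unavailable) counts Unicode code points, so the two can disagree on emoji sequences. A minimal standalone sketch of the difference, not part of the package itself:

```js
// Grapheme clusters vs. code points, the distinction behind countCharsForLocale.
const grapheme = new Intl.Segmenter("en", { granularity: "grapheme" });

function countGraphemes(text) {
  let count = 0;
  for (const _ of grapheme.segment(text)) count++;
  return count;
}

const sample = "Hi 👋🏽"; // waving hand + skin-tone modifier
console.log(countGraphemes(sample));    // 4 graphemes: "H", "i", " ", "👋🏽"
console.log(Array.from(sample).length); // 5 code points: the skin tone is a separate code point
```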
@@ -23,6 +41,13 @@ const emojiPresentationRegex = /\p{Emoji_Presentation}/u;
  const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
  const symbolRegex = /\p{S}/u;
  const punctuationRegex = /\p{P}/u;
+ const whitespaceRegex = /\s/u;
+ const newlineChars = new Set([
+ "\n",
+ "\r",
+ "\u2028",
+ "\u2029"
+ ]);
  function createNonWordCollection() {
  return {
  emoji: [],
@@ -49,6 +74,40 @@ function addNonWord(collection, category, segment) {
  collection.punctuation.push(segment);
  collection.counts.punctuation += 1;
  }
+ function addWhitespace(collection, segment) {
+ let whitespace = collection.whitespace;
+ let count = 0;
+ for (const char of segment) {
+ if (char === " ") {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.spaces += 1;
+ count += 1;
+ continue;
+ }
+ if (char === "\t") {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.tabs += 1;
+ count += 1;
+ continue;
+ }
+ if (newlineChars.has(char)) {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.newlines += 1;
+ count += 1;
+ continue;
+ }
+ if (whitespaceRegex.test(char)) {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.other += 1;
+ count += 1;
+ }
+ }
+ if (count > 0) {
+ collection.whitespace = whitespace ?? createWhitespaceCounts();
+ collection.counts.whitespace = (collection.counts.whitespace ?? 0) + count;
+ }
+ return count;
+ }
  function classifyNonWordSegment(segment) {
  const hasEmojiVariationSelector = segment.includes("️");
  if (keycapEmojiRegex.test(segment) || emojiPresentationRegex.test(segment) || hasEmojiVariationSelector && emojiRegex.test(segment)) return "emoji";
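For readers skimming `addWhitespace` above: each character in a non-word segment is bucketed as a space, a tab, a newline (`\n`, `\r`, U+2028, U+2029 — so `"\r\n"` adds two to `newlines`), or, if it still matches `/\s/u` (for example a non-breaking space), as `other`. A standalone restatement of that bucketing, for illustration only:

```js
// Mirrors the whitespace categorization in addWhitespace above (illustrative only).
const NEWLINE_CHARS = new Set(["\n", "\r", "\u2028", "\u2029"]);

function categorizeWhitespace(text) {
  const counts = { spaces: 0, tabs: 0, newlines: 0, other: 0 };
  for (const char of text) {
    if (char === " ") counts.spaces += 1;
    else if (char === "\t") counts.tabs += 1;
    else if (NEWLINE_CHARS.has(char)) counts.newlines += 1;
    else if (/\s/u.test(char)) counts.other += 1;
  }
  return counts;
}

console.log(categorizeWhitespace("a\tb\r\nc\u00A0d"));
// { spaces: 0, tabs: 1, newlines: 2, other: 1 }
```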
@@ -69,17 +128,35 @@ function mergeNonWordCollections(target, source) {
  target.punctuation.push(...source.punctuation);
  target.counts.punctuation += source.counts.punctuation;
  }
+ if (source.counts.whitespace && source.counts.whitespace > 0 && source.whitespace) {
+ const whitespace = target.whitespace ?? createWhitespaceCounts();
+ whitespace.spaces += source.whitespace.spaces;
+ whitespace.tabs += source.whitespace.tabs;
+ whitespace.newlines += source.whitespace.newlines;
+ whitespace.other += source.whitespace.other;
+ target.whitespace = whitespace;
+ target.counts.whitespace = (target.counts.whitespace ?? 0) + source.counts.whitespace;
+ }
  return target;
  }
+ function createWhitespaceCounts() {
+ return {
+ spaces: 0,
+ tabs: 0,
+ newlines: 0,
+ other: 0
+ };
+ }

  //#endregion
  //#region src/wc/analyze.ts
- function analyzeChunk(chunk, collectNonWords) {
+ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
  const segmenter = getSegmenter(chunk.locale);
  const segments = [];
  const nonWords = collectNonWords ? createNonWordCollection() : null;
  for (const part of segmenter.segment(chunk.text)) if (part.isWordLike) segments.push(part.segment);
  else if (collectNonWords && nonWords) {
+ if (includeWhitespace) addWhitespace(nonWords, part.segment);
  const category = classifyNonWordSegment(part.segment);
  if (category) addNonWord(nonWords, category, part.segment);
  }
@@ -91,6 +168,40 @@ function analyzeChunk(chunk, collectNonWords) {
  nonWords: nonWords ?? void 0
  };
  }
+ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
+ const segmenter = getSegmenter(chunk.locale);
+ const nonWords = collectNonWords ? createNonWordCollection() : null;
+ let chars = 0;
+ let wordChars = 0;
+ let nonWordChars = 0;
+ for (const part of segmenter.segment(chunk.text)) {
+ if (part.isWordLike) {
+ const count = countCharsForLocale(part.segment, chunk.locale);
+ chars += count;
+ wordChars += count;
+ continue;
+ }
+ if (collectNonWords && nonWords) {
+ let whitespaceCount = 0;
+ if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
+ const category = classifyNonWordSegment(part.segment);
+ if (category) addNonWord(nonWords, category, part.segment);
+ if (category || whitespaceCount > 0) {
+ const count = countCharsForLocale(part.segment, chunk.locale);
+ chars += count;
+ nonWordChars += count;
+ }
+ }
+ }
+ return {
+ locale: chunk.locale,
+ text: chunk.text,
+ chars,
+ wordChars,
+ nonWordChars,
+ nonWords: nonWords ?? void 0
+ };
+ }
  function aggregateByLocale(chunks) {
  const order = [];
  const map = /* @__PURE__ */ new Map();
@@ -111,6 +222,30 @@ function aggregateByLocale(chunks) {
  return order.map((locale) => map.get(locale));
  }

+ //#endregion
+ //#region src/wc/mode.ts
+ const MODE_ALIASES = {
+ chunk: "chunk",
+ chunks: "chunk",
+ segments: "segments",
+ segment: "segments",
+ seg: "segments",
+ collector: "collector",
+ collect: "collector",
+ colle: "collector",
+ char: "char",
+ chars: "char",
+ character: "char",
+ characters: "char"
+ };
+ function normalizeMode(input) {
+ if (!input) return null;
+ return MODE_ALIASES[input.trim().toLowerCase()] ?? null;
+ }
+ function resolveMode(input, fallback = "chunk") {
+ return normalizeMode(input) ?? fallback;
+ }
+
  //#endregion
  //#region src/wc/locale-detect.ts
  const DEFAULT_LOCALE = "und-Latn";
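The `src/wc/mode.ts` block above is what backs the alias list in the README: mode strings are trimmed, lowercased, and mapped through `MODE_ALIASES`, with unknown values falling back to `chunk`. A quick check through the public API, assuming the package is installed:

```js
const wordCounter = require("@dev-pi2pie/word-counter");

// "Characters" is trimmed, lowercased, and normalized to "char";
// an unrecognized mode string falls back to "chunk".
console.log(wordCounter("Hello 世界", { mode: "Characters" }).breakdown.mode); // "char"
console.log(wordCounter("Hello 世界", { mode: "bogus" }).breakdown.mode);      // "chunk"
```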
@@ -241,16 +376,51 @@ function mergeAdjacentChunks(chunks) {
  //#endregion
  //#region src/wc/wc.ts
  function wordCounter(text, options = {}) {
- const mode = options.mode ?? "chunk";
+ const mode = resolveMode(options.mode, "chunk");
  const collectNonWords = Boolean(options.nonWords);
- const analyzed = segmentTextByLocale(text, { latinLocaleHint: options.latinLocaleHint }).map((chunk) => analyzeChunk(chunk, collectNonWords));
+ const includeWhitespace = Boolean(options.includeWhitespace);
+ const chunks = segmentTextByLocale(text, { latinLocaleHint: options.latinLocaleHint });
+ if (mode === "char") {
+ const analyzed$1 = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
+ const total$1 = analyzed$1.reduce((sum, chunk) => sum + chunk.chars, 0);
+ const items = analyzed$1.map((chunk) => ({
+ locale: chunk.locale,
+ text: chunk.text,
+ chars: chunk.chars,
+ nonWords: chunk.nonWords
+ }));
+ return {
+ total: total$1,
+ counts: collectNonWords ? {
+ words: analyzed$1.reduce((sum, chunk) => sum + chunk.wordChars, 0),
+ nonWords: analyzed$1.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
+ total: total$1
+ } : void 0,
+ breakdown: {
+ mode,
+ items
+ }
+ };
+ }
+ const analyzed = chunks.map((chunk) => analyzeChunk(chunk, collectNonWords, includeWhitespace));
+ const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
+ const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
+ if (!chunk.nonWords) return sum;
+ return sum + getNonWordTotal(chunk.nonWords);
+ }, 0) : 0;
  const total = analyzed.reduce((sum, chunk) => {
  let chunkTotal = chunk.words;
- if (collectNonWords && chunk.nonWords) chunkTotal += chunk.nonWords.counts.emoji + chunk.nonWords.counts.symbols + chunk.nonWords.counts.punctuation;
+ if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
  return sum + chunkTotal;
  }, 0);
+ const counts = collectNonWords ? {
+ words: wordsTotal,
+ nonWords: nonWordsTotal,
+ total
+ } : void 0;
  if (mode === "segments") return {
  total,
+ counts,
  breakdown: {
  mode,
  items: analyzed.map((chunk) => ({
@@ -264,6 +434,7 @@ function wordCounter(text, options = {}) {
  };
  if (mode === "collector") return {
  total,
+ counts,
  breakdown: {
  mode,
  items: aggregateByLocale(analyzed),
@@ -272,6 +443,7 @@ function wordCounter(text, options = {}) {
  };
  return {
  total,
+ counts,
  breakdown: {
  mode,
  items: analyzed.map((chunk) => ({
@@ -283,6 +455,9 @@ function wordCounter(text, options = {}) {
  }
  };
  }
+ function getNonWordTotal(nonWords) {
+ return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
+ }
  function collectNonWordsAggregate(analyzed, enabled) {
  if (!enabled) return;
  const collection = createNonWordCollection();
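Pulling the `wordCounter` changes above together: in `char` mode each breakdown item carries a per-locale grapheme count in `chars`, and whenever `nonWords` is enabled a top-level `counts` object splits the total into word and non-word tallies (it is `undefined` otherwise). A small sketch of reading those fields, assuming the package is installed; exact numbers depend on locale segmentation:

```js
const wordCounter = require("@dev-pi2pie/word-counter");

const result = wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });

console.log(result.breakdown.mode); // "char"
for (const item of result.breakdown.items) {
  // Each item: { locale, text, chars, nonWords } with chars = grapheme count.
  console.log(item.locale, item.chars);
}
// With nonWords enabled: { words, nonWords, total }, where total === result.total.
console.log(result.counts);
```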
@@ -834,6 +1009,7 @@ function showSingularOrPluralWord(count, word) {
  const cjsExports = Object.assign(wc_default, {
  default: wc_default,
  wordCounter: wc_default,
+ countCharsForLocale,
  countWordsForLocale,
  segmentTextByLocale,
  parseMarkdown,