@dev-pi2pie/word-counter 0.0.8 → 0.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -47,29 +47,9 @@ npm unlink --global @dev-pi2pie/word-counter
  npm install -g @dev-pi2pie/word-counter@latest
  ```

- ### From GitHub Packages
-
- If your scope is configured to use GitHub Packages:
-
- ```bash
- # ~/.npmrc
- @dev-pi2pie:registry=https://npm.pkg.github.com
- ```
-
- ```bash
- npm install -g @dev-pi2pie/word-counter@latest
- ```
-
- If your scope is configured to use npmjs instead, the same scoped package name
- will resolve from npmjs.com (see the npm registry section above).
-
- > [!note]
- > **npm** may show newer releases (for example, `v0.0.6`) while GitHub Packages still lists `v0.0.5`.
- > This is historical; releases kept in sync starting with `v0.0.6`.
-
  ## Usage

- Once installed (via `npm link`, npm registry, or GitHub Packages), you can use the CLI directly:
+ Once installed (via `npm link` or the npm registry), you can use the CLI directly:

  ```bash
  word-counter "Hello 世界 안녕"
@@ -109,12 +89,13 @@ word-counter --path ./fixtures/sample.txt

  ## Library Usage

- The package exports can be used after installing from GitHub Packages or linking locally with `npm link`.
+ The package exports can be used after installing from the npm registry or linking locally with `npm link`.

  ### ESM

  ```js
  import wordCounter, {
+ countCharsForLocale,
  countWordsForLocale,
  countSections,
  parseMarkdown,
@@ -124,6 +105,36 @@ import wordCounter, {

  wordCounter("Hello world", { latinLocaleHint: "en" });
  wordCounter("Hi 👋, world!", { nonWords: true });
+ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
+ wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
+ countCharsForLocale("👋", "en");
+ ```
+
+ Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
+
+ Sample output (with `nonWords: true` and `includeWhitespace: true`):
+
+ ```json
+ {
+ "total": 4,
+ "counts": { "words": 2, "nonWords": 2, "total": 4 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ // ...
+ "words": 2,
+ "nonWords": {
+ "emoji": [],
+ "symbols": [],
+ "punctuation": [],
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
+ "whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
  ```

  ### CJS
@@ -131,6 +142,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
  ```js
  const wordCounter = require("@dev-pi2pie/word-counter");
  const {
+ countCharsForLocale,
  countWordsForLocale,
  countSections,
  parseMarkdown,
@@ -140,6 +152,36 @@ const {

  wordCounter("Hello world", { latinLocaleHint: "en" });
  wordCounter("Hi 👋, world!", { nonWords: true });
+ wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
+ wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
+ countCharsForLocale("👋", "en");
+ ```
+
+ Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
+
+ Sample output (with `nonWords: true` and `includeWhitespace: true`):
+
+ ```json
+ {
+ "total": 4,
+ "counts": { "words": 2, "nonWords": 2, "total": 4 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ // ...
+ "words": 2,
+ "nonWords": {
+ "emoji": [],
+ "symbols": [],
+ "punctuation": [],
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
+ "whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
  ```

  ### Export Summary
@@ -150,6 +192,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
  | --------------------- | -------- | -------------------------------------------------- |
  | `default` | function | `wordCounter(text, options?) -> WordCounterResult` |
  | `wordCounter` | function | Alias of the default export. |
+ | `countCharsForLocale` | function | Low-level helper for per-locale char counts. |
  | `countWordsForLocale` | function | Low-level helper for per-locale counts. |
  | `segmentTextByLocale` | function | Low-level helper for locale-aware segmentation. |

@@ -168,13 +211,13 @@ wordCounter("Hi 👋, world!", { nonWords: true });

  #### Types

- | Export | Kind | Notes |
- | ---------------------- | ---- | ----------------------------------------- |
- | `WordCounterOptions` | type | Options for the `wordCounter` function. |
- | `WordCounterResult` | type | Returned by `wordCounter`. |
- | `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
- | `WordCounterMode` | type | `"chunk" \| "segments" \| "collector"`. |
- | `NonWordCollection` | type | Non-word segments + counts payload. |
+ | Export | Kind | Notes |
+ | ---------------------- | ---- | ------------------------------------------------- |
+ | `WordCounterOptions` | type | Options for the `wordCounter` function. |
+ | `WordCounterResult` | type | Returned by `wordCounter`. |
+ | `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
+ | `WordCounterMode` | type | `"chunk" \| "segments" \| "collector" \| "char"`. |
+ | `NonWordCollection` | type | Non-word segments + counts payload. |

  ### Display Modes

@@ -183,6 +226,14 @@ Choose a breakdown style with `--mode` (or `-m`):
  - `chunk` (default) – list each contiguous locale block in order of appearance.
  - `segments` – show the actual wordlike segments used for counting.
  - `collector` – aggregate counts per locale regardless of text position.
+ - `char` – count grapheme clusters (user-perceived characters) per locale.
+
+ Aliases are normalized for CLI + API:
+
+ - `chunk`, `chunks`
+ - `segments`, `segment`, `seg`
+ - `collector`, `collect`, `colle`
+ - `char`, `chars`, `character`, `characters`

  Examples:

@@ -195,6 +246,9 @@ word-counter --mode segments "飛鳥 bird 貓 cat; how do you do?"

  # aggregate per locale
  word-counter -m collector "飛鳥 bird 貓 cat; how do you do?"
+
+ # grapheme-aware character count
+ word-counter -m char "Hi 👋, world!"
  ```

  ### Section Modes (Frontmatter)
@@ -268,6 +322,37 @@ word-counter --non-words "Hi 👋, world!"
  Example: `total = words + emoji + symbols + punctuation` when enabled.
  Standard output labels this as `Total count` to reflect the combined total; `--format raw` still prints a single number.

+ Include whitespace-like characters in the non-words bucket (API: `includeWhitespace: true`):
+
+ ```bash
+ word-counter --include-whitespace "Hi\tthere\n"
+ word-counter --misc "Hi\tthere\n"
+ ```
+
+ In the CLI, `--include-whitespace` implies `--non-words` (same behavior as `--misc`). `--non-words` alone does not include whitespace. When enabled, whitespace counts appear under `nonWords.whitespace`, and `total = words + nonWords` (emoji + symbols + punctuation + whitespace). JSON output also includes top-level `counts` when `nonWords` is enabled. See `docs/schemas/whitespace-categories.md` for how whitespace is categorized.
+
+ Example JSON (trimmed):
+
+ ```json
+ {
+ "total": 5,
+ "counts": { "words": 2, "nonWords": 3, "total": 5 },
+ "breakdown": {
+ "mode": "chunk",
+ "items": [
+ {
+ "locale": "und-Latn",
+ "words": 2,
+ "nonWords": {
+ "counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 3 },
+ "whitespace": { "spaces": 1, "tabs": 1, "newlines": 1, "other": 0 }
+ }
+ }
+ ]
+ }
+ }
+ ```
+
  > [!Note]
  > Text-default symbols (e.g. ©) count as `symbols` unless explicitly emoji-presented (e.g. ©️ with VS16).

@@ -2,6 +2,7 @@ let yaml = require("yaml");

  //#region src/wc/segmenter.ts
  const segmenterCache = /* @__PURE__ */ new Map();
+ const graphemeSegmenterCache = /* @__PURE__ */ new Map();
  function getSegmenter(locale) {
  const cached = segmenterCache.get(locale);
  if (cached) return cached;
@@ -9,12 +10,29 @@ function getSegmenter(locale) {
  segmenterCache.set(locale, segmenter);
  return segmenter;
  }
+ function getGraphemeSegmenter(locale) {
+ const cached = graphemeSegmenterCache.get(locale);
+ if (cached) return cached;
+ const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
+ graphemeSegmenterCache.set(locale, segmenter);
+ return segmenter;
+ }
+ function supportsSegmenter() {
+ return typeof Intl !== "undefined" && typeof Intl.Segmenter === "function";
+ }
  function countWordsForLocale(text, locale) {
  const segmenter = getSegmenter(locale);
  let count = 0;
  for (const segment of segmenter.segment(text)) if (segment.isWordLike) count++;
  return count;
  }
+ function countCharsForLocale(text, locale) {
+ if (!supportsSegmenter()) return Array.from(text).length;
+ const segmenter = getGraphemeSegmenter(locale);
+ let count = 0;
+ for (const _segment of segmenter.segment(text)) count++;
+ return count;
+ }

  //#endregion
  //#region src/wc/non-words.ts
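A note on the new grapheme path above: `Intl.Segmenter` with `granularity: "grapheme"` counts user-perceived characters, while the `Array.from(text).length` fallback (used when the segmenter is unavailable) counts Unicode code points, so the two can disagree on emoji sequences. A minimal standalone sketch of the difference, not part of the package itself:

```js
// Grapheme clusters vs. code points, the distinction behind countCharsForLocale.
const grapheme = new Intl.Segmenter("en", { granularity: "grapheme" });

function countGraphemes(text) {
  let count = 0;
  for (const _ of grapheme.segment(text)) count++;
  return count;
}

const sample = "Hi 👋🏽"; // waving hand + skin-tone modifier
console.log(countGraphemes(sample));    // 4 graphemes: "H", "i", " ", "👋🏽"
console.log(Array.from(sample).length); // 5 code points: the skin tone is a separate code point
```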
@@ -23,6 +41,13 @@ const emojiPresentationRegex = /\p{Emoji_Presentation}/u;
  const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
  const symbolRegex = /\p{S}/u;
  const punctuationRegex = /\p{P}/u;
+ const whitespaceRegex = /\s/u;
+ const newlineChars = new Set([
+ "\n",
+ "\r",
+ "\u2028",
+ "\u2029"
+ ]);
  function createNonWordCollection() {
  return {
  emoji: [],
@@ -49,6 +74,40 @@ function addNonWord(collection, category, segment) {
  collection.punctuation.push(segment);
  collection.counts.punctuation += 1;
  }
+ function addWhitespace(collection, segment) {
+ let whitespace = collection.whitespace;
+ let count = 0;
+ for (const char of segment) {
+ if (char === " ") {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.spaces += 1;
+ count += 1;
+ continue;
+ }
+ if (char === "\t") {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.tabs += 1;
+ count += 1;
+ continue;
+ }
+ if (newlineChars.has(char)) {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.newlines += 1;
+ count += 1;
+ continue;
+ }
+ if (whitespaceRegex.test(char)) {
+ whitespace = whitespace ?? createWhitespaceCounts();
+ whitespace.other += 1;
+ count += 1;
+ }
+ }
+ if (count > 0) {
+ collection.whitespace = whitespace ?? createWhitespaceCounts();
+ collection.counts.whitespace = (collection.counts.whitespace ?? 0) + count;
+ }
+ return count;
+ }
  function classifyNonWordSegment(segment) {
  const hasEmojiVariationSelector = segment.includes("️");
  if (keycapEmojiRegex.test(segment) || emojiPresentationRegex.test(segment) || hasEmojiVariationSelector && emojiRegex.test(segment)) return "emoji";
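For readers skimming `addWhitespace` above: each character in a non-word segment is bucketed as a space, a tab, a newline (`\n`, `\r`, U+2028, U+2029 — so `"\r\n"` adds two to `newlines`), or, if it still matches `/\s/u` (for example a non-breaking space), as `other`. A standalone restatement of that bucketing, for illustration only:

```js
// Mirrors the whitespace categorization in addWhitespace above (illustrative only).
const NEWLINE_CHARS = new Set(["\n", "\r", "\u2028", "\u2029"]);

function categorizeWhitespace(text) {
  const counts = { spaces: 0, tabs: 0, newlines: 0, other: 0 };
  for (const char of text) {
    if (char === " ") counts.spaces += 1;
    else if (char === "\t") counts.tabs += 1;
    else if (NEWLINE_CHARS.has(char)) counts.newlines += 1;
    else if (/\s/u.test(char)) counts.other += 1;
  }
  return counts;
}

console.log(categorizeWhitespace("a\tb\r\nc\u00A0d"));
// { spaces: 0, tabs: 1, newlines: 2, other: 1 }
```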
@@ -69,17 +128,35 @@ function mergeNonWordCollections(target, source) {
  target.punctuation.push(...source.punctuation);
  target.counts.punctuation += source.counts.punctuation;
  }
+ if (source.counts.whitespace && source.counts.whitespace > 0 && source.whitespace) {
+ const whitespace = target.whitespace ?? createWhitespaceCounts();
+ whitespace.spaces += source.whitespace.spaces;
+ whitespace.tabs += source.whitespace.tabs;
+ whitespace.newlines += source.whitespace.newlines;
+ whitespace.other += source.whitespace.other;
+ target.whitespace = whitespace;
+ target.counts.whitespace = (target.counts.whitespace ?? 0) + source.counts.whitespace;
+ }
  return target;
  }
+ function createWhitespaceCounts() {
+ return {
+ spaces: 0,
+ tabs: 0,
+ newlines: 0,
+ other: 0
+ };
+ }

  //#endregion
  //#region src/wc/analyze.ts
- function analyzeChunk(chunk, collectNonWords) {
+ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
  const segmenter = getSegmenter(chunk.locale);
  const segments = [];
  const nonWords = collectNonWords ? createNonWordCollection() : null;
  for (const part of segmenter.segment(chunk.text)) if (part.isWordLike) segments.push(part.segment);
  else if (collectNonWords && nonWords) {
+ if (includeWhitespace) addWhitespace(nonWords, part.segment);
  const category = classifyNonWordSegment(part.segment);
  if (category) addNonWord(nonWords, category, part.segment);
  }
@@ -91,6 +168,40 @@ function analyzeChunk(chunk, collectNonWords) {
  nonWords: nonWords ?? void 0
  };
  }
+ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
+ const segmenter = getSegmenter(chunk.locale);
+ const nonWords = collectNonWords ? createNonWordCollection() : null;
+ let chars = 0;
+ let wordChars = 0;
+ let nonWordChars = 0;
+ for (const part of segmenter.segment(chunk.text)) {
+ if (part.isWordLike) {
+ const count = countCharsForLocale(part.segment, chunk.locale);
+ chars += count;
+ wordChars += count;
+ continue;
+ }
+ if (collectNonWords && nonWords) {
+ let whitespaceCount = 0;
+ if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
+ const category = classifyNonWordSegment(part.segment);
+ if (category) addNonWord(nonWords, category, part.segment);
+ if (category || whitespaceCount > 0) {
+ const count = countCharsForLocale(part.segment, chunk.locale);
+ chars += count;
+ nonWordChars += count;
+ }
+ }
+ }
+ return {
+ locale: chunk.locale,
+ text: chunk.text,
+ chars,
+ wordChars,
+ nonWordChars,
+ nonWords: nonWords ?? void 0
+ };
+ }
  function aggregateByLocale(chunks) {
  const order = [];
  const map = /* @__PURE__ */ new Map();
@@ -111,6 +222,30 @@ function aggregateByLocale(chunks) {
  return order.map((locale) => map.get(locale));
  }

+ //#endregion
+ //#region src/wc/mode.ts
+ const MODE_ALIASES = {
+ chunk: "chunk",
+ chunks: "chunk",
+ segments: "segments",
+ segment: "segments",
+ seg: "segments",
+ collector: "collector",
+ collect: "collector",
+ colle: "collector",
+ char: "char",
+ chars: "char",
+ character: "char",
+ characters: "char"
+ };
+ function normalizeMode(input) {
+ if (!input) return null;
+ return MODE_ALIASES[input.trim().toLowerCase()] ?? null;
+ }
+ function resolveMode(input, fallback = "chunk") {
+ return normalizeMode(input) ?? fallback;
+ }
+
  //#endregion
  //#region src/wc/locale-detect.ts
  const DEFAULT_LOCALE = "und-Latn";
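The `src/wc/mode.ts` block above is what backs the alias list in the README: mode strings are trimmed, lowercased, and mapped through `MODE_ALIASES`, with unknown values falling back to `chunk`. A quick check through the public API, assuming the package is installed:

```js
const wordCounter = require("@dev-pi2pie/word-counter");

// "Characters" is trimmed, lowercased, and normalized to "char";
// an unrecognized mode string falls back to "chunk".
console.log(wordCounter("Hello 世界", { mode: "Characters" }).breakdown.mode); // "char"
console.log(wordCounter("Hello 世界", { mode: "bogus" }).breakdown.mode);      // "chunk"
```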
@@ -241,16 +376,51 @@ function mergeAdjacentChunks(chunks) {
  //#endregion
  //#region src/wc/wc.ts
  function wordCounter(text, options = {}) {
- const mode = options.mode ?? "chunk";
+ const mode = resolveMode(options.mode, "chunk");
  const collectNonWords = Boolean(options.nonWords);
- const analyzed = segmentTextByLocale(text, { latinLocaleHint: options.latinLocaleHint }).map((chunk) => analyzeChunk(chunk, collectNonWords));
+ const includeWhitespace = Boolean(options.includeWhitespace);
+ const chunks = segmentTextByLocale(text, { latinLocaleHint: options.latinLocaleHint });
+ if (mode === "char") {
+ const analyzed$1 = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
+ const total$1 = analyzed$1.reduce((sum, chunk) => sum + chunk.chars, 0);
+ const items = analyzed$1.map((chunk) => ({
+ locale: chunk.locale,
+ text: chunk.text,
+ chars: chunk.chars,
+ nonWords: chunk.nonWords
+ }));
+ return {
+ total: total$1,
+ counts: collectNonWords ? {
+ words: analyzed$1.reduce((sum, chunk) => sum + chunk.wordChars, 0),
+ nonWords: analyzed$1.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
+ total: total$1
+ } : void 0,
+ breakdown: {
+ mode,
+ items
+ }
+ };
+ }
+ const analyzed = chunks.map((chunk) => analyzeChunk(chunk, collectNonWords, includeWhitespace));
+ const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
+ const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
+ if (!chunk.nonWords) return sum;
+ return sum + getNonWordTotal(chunk.nonWords);
+ }, 0) : 0;
  const total = analyzed.reduce((sum, chunk) => {
  let chunkTotal = chunk.words;
- if (collectNonWords && chunk.nonWords) chunkTotal += chunk.nonWords.counts.emoji + chunk.nonWords.counts.symbols + chunk.nonWords.counts.punctuation;
+ if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
  return sum + chunkTotal;
  }, 0);
+ const counts = collectNonWords ? {
+ words: wordsTotal,
+ nonWords: nonWordsTotal,
+ total
+ } : void 0;
  if (mode === "segments") return {
  total,
+ counts,
  breakdown: {
  mode,
  items: analyzed.map((chunk) => ({
@@ -264,6 +434,7 @@ function wordCounter(text, options = {}) {
  };
  if (mode === "collector") return {
  total,
+ counts,
  breakdown: {
  mode,
  items: aggregateByLocale(analyzed),
@@ -272,6 +443,7 @@ function wordCounter(text, options = {}) {
  };
  return {
  total,
+ counts,
  breakdown: {
  mode,
  items: analyzed.map((chunk) => ({
@@ -283,6 +455,9 @@ function wordCounter(text, options = {}) {
  }
  };
  }
+ function getNonWordTotal(nonWords) {
+ return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
+ }
  function collectNonWordsAggregate(analyzed, enabled) {
  if (!enabled) return;
  const collection = createNonWordCollection();
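Pulling the `wordCounter` changes above together: in `char` mode each breakdown item carries a per-locale grapheme count in `chars`, and whenever `nonWords` is enabled a top-level `counts` object splits the total into word and non-word tallies (it is `undefined` otherwise). A small sketch of reading those fields, assuming the package is installed; exact numbers depend on locale segmentation:

```js
const wordCounter = require("@dev-pi2pie/word-counter");

const result = wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });

console.log(result.breakdown.mode); // "char"
for (const item of result.breakdown.items) {
  // Each item: { locale, text, chars, nonWords } with chars = grapheme count.
  console.log(item.locale, item.chars);
}
// With nonWords enabled: { words, nonWords, total }, where total === result.total.
console.log(result.counts);
```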
@@ -834,6 +1009,7 @@ function showSingularOrPluralWord(count, word) {
  const cjsExports = Object.assign(wc_default, {
  default: wc_default,
  wordCounter: wc_default,
+ countCharsForLocale,
  countWordsForLocale,
  segmentTextByLocale,
  parseMarkdown,