@dev-pi2pie/word-counter 0.0.8 → 0.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +114 -29
- package/dist/cjs/index.cjs +180 -4
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/esm/bin.mjs +236 -19
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/index.d.mts +25 -2
- package/dist/esm/index.mjs +180 -5
- package/dist/esm/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -47,29 +47,9 @@ npm unlink --global @dev-pi2pie/word-counter
|
|
|
47
47
|
npm install -g @dev-pi2pie/word-counter@latest
|
|
48
48
|
```
|
|
49
49
|
|
|
50
|
-
### From GitHub Packages
|
|
51
|
-
|
|
52
|
-
If your scope is configured to use GitHub Packages:
|
|
53
|
-
|
|
54
|
-
```bash
|
|
55
|
-
# ~/.npmrc
|
|
56
|
-
@dev-pi2pie:registry=https://npm.pkg.github.com
|
|
57
|
-
```
|
|
58
|
-
|
|
59
|
-
```bash
|
|
60
|
-
npm install -g @dev-pi2pie/word-counter@latest
|
|
61
|
-
```
|
|
62
|
-
|
|
63
|
-
If your scope is configured to use npmjs instead, the same scoped package name
|
|
64
|
-
will resolve from npmjs.com (see the npm registry section above).
|
|
65
|
-
|
|
66
|
-
> [!note]
|
|
67
|
-
> **npm** may show newer releases (for example, `v0.0.6`) while GitHub Packages still lists `v0.0.5`.
|
|
68
|
-
> This is historical; releases kept in sync starting with `v0.0.6`.
|
|
69
|
-
|
|
70
50
|
## Usage
|
|
71
51
|
|
|
72
|
-
Once installed (via `npm link
|
|
52
|
+
Once installed (via `npm link` or the npm registry), you can use the CLI directly:
|
|
73
53
|
|
|
74
54
|
```bash
|
|
75
55
|
word-counter "Hello 世界 안녕"
|
|
@@ -109,12 +89,13 @@ word-counter --path ./fixtures/sample.txt
|
|
|
109
89
|
|
|
110
90
|
## Library Usage
|
|
111
91
|
|
|
112
|
-
The package exports can be used after installing from
|
|
92
|
+
The package exports can be used after installing from the npm registry or linking locally with `npm link`.
|
|
113
93
|
|
|
114
94
|
### ESM
|
|
115
95
|
|
|
116
96
|
```js
|
|
117
97
|
import wordCounter, {
|
|
98
|
+
countCharsForLocale,
|
|
118
99
|
countWordsForLocale,
|
|
119
100
|
countSections,
|
|
120
101
|
parseMarkdown,
|
|
@@ -124,6 +105,36 @@ import wordCounter, {
|
|
|
124
105
|
|
|
125
106
|
wordCounter("Hello world", { latinLocaleHint: "en" });
|
|
126
107
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
108
|
+
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
109
|
+
wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
|
|
110
|
+
countCharsForLocale("👋", "en");
|
|
111
|
+
```
|
|
112
|
+
|
|
113
|
+
Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
|
|
114
|
+
|
|
115
|
+
Sample output (with `nonWords: true` and `includeWhitespace: true`):
|
|
116
|
+
|
|
117
|
+
```json
|
|
118
|
+
{
|
|
119
|
+
"total": 4,
|
|
120
|
+
"counts": { "words": 2, "nonWords": 2, "total": 4 },
|
|
121
|
+
"breakdown": {
|
|
122
|
+
"mode": "chunk",
|
|
123
|
+
"items": [
|
|
124
|
+
{
|
|
125
|
+
// ...
|
|
126
|
+
"words": 2,
|
|
127
|
+
"nonWords": {
|
|
128
|
+
"emoji": [],
|
|
129
|
+
"symbols": [],
|
|
130
|
+
"punctuation": [],
|
|
131
|
+
"counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
|
|
132
|
+
"whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
|
|
133
|
+
}
|
|
134
|
+
}
|
|
135
|
+
]
|
|
136
|
+
}
|
|
137
|
+
}
|
|
127
138
|
```
|
|
128
139
|
|
|
129
140
|
### CJS
|
|
@@ -131,6 +142,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
|
|
|
131
142
|
```js
|
|
132
143
|
const wordCounter = require("@dev-pi2pie/word-counter");
|
|
133
144
|
const {
|
|
145
|
+
countCharsForLocale,
|
|
134
146
|
countWordsForLocale,
|
|
135
147
|
countSections,
|
|
136
148
|
parseMarkdown,
|
|
@@ -140,6 +152,36 @@ const {
|
|
|
140
152
|
|
|
141
153
|
wordCounter("Hello world", { latinLocaleHint: "en" });
|
|
142
154
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
155
|
+
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
156
|
+
wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
|
|
157
|
+
countCharsForLocale("👋", "en");
|
|
158
|
+
```
|
|
159
|
+
|
|
160
|
+
Note: `includeWhitespace` only affects results when `nonWords: true` is enabled.
|
|
161
|
+
|
|
162
|
+
Sample output (with `nonWords: true` and `includeWhitespace: true`):
|
|
163
|
+
|
|
164
|
+
```json
|
|
165
|
+
{
|
|
166
|
+
"total": 4,
|
|
167
|
+
"counts": { "words": 2, "nonWords": 2, "total": 4 },
|
|
168
|
+
"breakdown": {
|
|
169
|
+
"mode": "chunk",
|
|
170
|
+
"items": [
|
|
171
|
+
{
|
|
172
|
+
// ...
|
|
173
|
+
"words": 2,
|
|
174
|
+
"nonWords": {
|
|
175
|
+
"emoji": [],
|
|
176
|
+
"symbols": [],
|
|
177
|
+
"punctuation": [],
|
|
178
|
+
"counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 2 },
|
|
179
|
+
"whitespace": { "spaces": 0, "tabs": 1, "newlines": 1, "other": 0 }
|
|
180
|
+
}
|
|
181
|
+
}
|
|
182
|
+
]
|
|
183
|
+
}
|
|
184
|
+
}
|
|
143
185
|
```
|
|
144
186
|
|
|
145
187
|
### Export Summary
|
|
@@ -150,6 +192,7 @@ wordCounter("Hi 👋, world!", { nonWords: true });
|
|
|
150
192
|
| --------------------- | -------- | -------------------------------------------------- |
|
|
151
193
|
| `default` | function | `wordCounter(text, options?) -> WordCounterResult` |
|
|
152
194
|
| `wordCounter` | function | Alias of the default export. |
|
|
195
|
+
| `countCharsForLocale` | function | Low-level helper for per-locale char counts. |
|
|
153
196
|
| `countWordsForLocale` | function | Low-level helper for per-locale counts. |
|
|
154
197
|
| `segmentTextByLocale` | function | Low-level helper for locale-aware segmentation. |
|
|
155
198
|
|
|
@@ -168,13 +211,13 @@ wordCounter("Hi 👋, world!", { nonWords: true });
|
|
|
168
211
|
|
|
169
212
|
#### Types
|
|
170
213
|
|
|
171
|
-
| Export | Kind | Notes
|
|
172
|
-
| ---------------------- | ---- |
|
|
173
|
-
| `WordCounterOptions` | type | Options for the `wordCounter` function.
|
|
174
|
-
| `WordCounterResult` | type | Returned by `wordCounter`.
|
|
175
|
-
| `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`.
|
|
176
|
-
| `WordCounterMode` | type | `"chunk" \| "segments" \| "collector"`.
|
|
177
|
-
| `NonWordCollection` | type | Non-word segments + counts payload.
|
|
214
|
+
| Export | Kind | Notes |
|
|
215
|
+
| ---------------------- | ---- | ------------------------------------------------- |
|
|
216
|
+
| `WordCounterOptions` | type | Options for the `wordCounter` function. |
|
|
217
|
+
| `WordCounterResult` | type | Returned by `wordCounter`. |
|
|
218
|
+
| `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
|
|
219
|
+
| `WordCounterMode` | type | `"chunk" \| "segments" \| "collector" \| "char"`. |
|
|
220
|
+
| `NonWordCollection` | type | Non-word segments + counts payload. |
|
|
178
221
|
|
|
179
222
|
### Display Modes
|
|
180
223
|
|
|
@@ -183,6 +226,14 @@ Choose a breakdown style with `--mode` (or `-m`):
|
|
|
183
226
|
- `chunk` (default) – list each contiguous locale block in order of appearance.
|
|
184
227
|
- `segments` – show the actual wordlike segments used for counting.
|
|
185
228
|
- `collector` – aggregate counts per locale regardless of text position.
|
|
229
|
+
- `char` – count grapheme clusters (user-perceived characters) per locale.
|
|
230
|
+
|
|
231
|
+
Aliases are normalized for CLI + API:
|
|
232
|
+
|
|
233
|
+
- `chunk`, `chunks`
|
|
234
|
+
- `segments`, `segment`, `seg`
|
|
235
|
+
- `collector`, `collect`, `colle`
|
|
236
|
+
- `char`, `chars`, `character`, `characters`
|
|
186
237
|
|
|
187
238
|
Examples:
|
|
188
239
|
|
|
@@ -195,6 +246,9 @@ word-counter --mode segments "飛鳥 bird 貓 cat; how do you do?"
|
|
|
195
246
|
|
|
196
247
|
# aggregate per locale
|
|
197
248
|
word-counter -m collector "飛鳥 bird 貓 cat; how do you do?"
|
|
249
|
+
|
|
250
|
+
# grapheme-aware character count
|
|
251
|
+
word-counter -m char "Hi 👋, world!"
|
|
198
252
|
```
|
|
199
253
|
|
|
200
254
|
### Section Modes (Frontmatter)
|
|
@@ -268,6 +322,37 @@ word-counter --non-words "Hi 👋, world!"
|
|
|
268
322
|
Example: `total = words + emoji + symbols + punctuation` when enabled.
|
|
269
323
|
Standard output labels this as `Total count` to reflect the combined total; `--format raw` still prints a single number.
|
|
270
324
|
|
|
325
|
+
Include whitespace-like characters in the non-words bucket (API: `includeWhitespace: true`):
|
|
326
|
+
|
|
327
|
+
```bash
|
|
328
|
+
word-counter --include-whitespace "Hi\tthere\n"
|
|
329
|
+
word-counter --misc "Hi\tthere\n"
|
|
330
|
+
```
|
|
331
|
+
|
|
332
|
+
In the CLI, `--include-whitespace` implies `--non-words` (same behavior as `--misc`). `--non-words` alone does not include whitespace. When enabled, whitespace counts appear under `nonWords.whitespace`, and `total = words + nonWords` (emoji + symbols + punctuation + whitespace). JSON output also includes top-level `counts` when `nonWords` is enabled. See `docs/schemas/whitespace-categories.md` for how whitespace is categorized.
|
|
333
|
+
|
|
334
|
+
Example JSON (trimmed):
|
|
335
|
+
|
|
336
|
+
```json
|
|
337
|
+
{
|
|
338
|
+
"total": 5,
|
|
339
|
+
"counts": { "words": 2, "nonWords": 3, "total": 5 },
|
|
340
|
+
"breakdown": {
|
|
341
|
+
"mode": "chunk",
|
|
342
|
+
"items": [
|
|
343
|
+
{
|
|
344
|
+
"locale": "und-Latn",
|
|
345
|
+
"words": 2,
|
|
346
|
+
"nonWords": {
|
|
347
|
+
"counts": { "emoji": 0, "symbols": 0, "punctuation": 0, "whitespace": 3 },
|
|
348
|
+
"whitespace": { "spaces": 1, "tabs": 1, "newlines": 1, "other": 0 }
|
|
349
|
+
}
|
|
350
|
+
}
|
|
351
|
+
]
|
|
352
|
+
}
|
|
353
|
+
}
|
|
354
|
+
```
|
|
355
|
+
|
|
271
356
|
> [!NOTE]
|
|
272
357
|
> Text-default symbols (e.g. ©) count as `symbols` unless explicitly emoji-presented (e.g. ©️ with VS16).
|
|
273
358
|
|
package/dist/cjs/index.cjs
CHANGED
|
@@ -2,6 +2,7 @@ let yaml = require("yaml");
|
|
|
2
2
|
|
|
3
3
|
//#region src/wc/segmenter.ts
|
|
4
4
|
const segmenterCache = /* @__PURE__ */ new Map();
|
|
5
|
+
const graphemeSegmenterCache = /* @__PURE__ */ new Map();
|
|
5
6
|
function getSegmenter(locale) {
|
|
6
7
|
const cached = segmenterCache.get(locale);
|
|
7
8
|
if (cached) return cached;
|
|
@@ -9,12 +10,29 @@ function getSegmenter(locale) {
|
|
|
9
10
|
segmenterCache.set(locale, segmenter);
|
|
10
11
|
return segmenter;
|
|
11
12
|
}
|
|
13
|
+
function getGraphemeSegmenter(locale) {
|
|
14
|
+
const cached = graphemeSegmenterCache.get(locale);
|
|
15
|
+
if (cached) return cached;
|
|
16
|
+
const segmenter = new Intl.Segmenter(locale, { granularity: "grapheme" });
|
|
17
|
+
graphemeSegmenterCache.set(locale, segmenter);
|
|
18
|
+
return segmenter;
|
|
19
|
+
}
|
|
20
|
+
function supportsSegmenter() {
|
|
21
|
+
return typeof Intl !== "undefined" && typeof Intl.Segmenter === "function";
|
|
22
|
+
}
|
|
12
23
|
function countWordsForLocale(text, locale) {
|
|
13
24
|
const segmenter = getSegmenter(locale);
|
|
14
25
|
let count = 0;
|
|
15
26
|
for (const segment of segmenter.segment(text)) if (segment.isWordLike) count++;
|
|
16
27
|
return count;
|
|
17
28
|
}
|
|
29
|
+
function countCharsForLocale(text, locale) {
|
|
30
|
+
if (!supportsSegmenter()) return Array.from(text).length;
|
|
31
|
+
const segmenter = getGraphemeSegmenter(locale);
|
|
32
|
+
let count = 0;
|
|
33
|
+
for (const _segment of segmenter.segment(text)) count++;
|
|
34
|
+
return count;
|
|
35
|
+
}
|
|
18
36
|
|
|
19
37
|
//#endregion
|
|
20
38
|
//#region src/wc/non-words.ts
|
|
@@ -23,6 +41,13 @@ const emojiPresentationRegex = /\p{Emoji_Presentation}/u;
|
|
|
23
41
|
const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
|
|
24
42
|
const symbolRegex = /\p{S}/u;
|
|
25
43
|
const punctuationRegex = /\p{P}/u;
|
|
44
|
+
const whitespaceRegex = /\s/u;
|
|
45
|
+
const newlineChars = new Set([
|
|
46
|
+
"\n",
|
|
47
|
+
"\r",
|
|
48
|
+
"\u2028",
|
|
49
|
+
"\u2029"
|
|
50
|
+
]);
|
|
26
51
|
function createNonWordCollection() {
|
|
27
52
|
return {
|
|
28
53
|
emoji: [],
|
|
@@ -49,6 +74,40 @@ function addNonWord(collection, category, segment) {
|
|
|
49
74
|
collection.punctuation.push(segment);
|
|
50
75
|
collection.counts.punctuation += 1;
|
|
51
76
|
}
|
|
77
|
+
function addWhitespace(collection, segment) {
|
|
78
|
+
let whitespace = collection.whitespace;
|
|
79
|
+
let count = 0;
|
|
80
|
+
for (const char of segment) {
|
|
81
|
+
if (char === " ") {
|
|
82
|
+
whitespace = whitespace ?? createWhitespaceCounts();
|
|
83
|
+
whitespace.spaces += 1;
|
|
84
|
+
count += 1;
|
|
85
|
+
continue;
|
|
86
|
+
}
|
|
87
|
+
if (char === " ") {
|
|
88
|
+
whitespace = whitespace ?? createWhitespaceCounts();
|
|
89
|
+
whitespace.tabs += 1;
|
|
90
|
+
count += 1;
|
|
91
|
+
continue;
|
|
92
|
+
}
|
|
93
|
+
if (newlineChars.has(char)) {
|
|
94
|
+
whitespace = whitespace ?? createWhitespaceCounts();
|
|
95
|
+
whitespace.newlines += 1;
|
|
96
|
+
count += 1;
|
|
97
|
+
continue;
|
|
98
|
+
}
|
|
99
|
+
if (whitespaceRegex.test(char)) {
|
|
100
|
+
whitespace = whitespace ?? createWhitespaceCounts();
|
|
101
|
+
whitespace.other += 1;
|
|
102
|
+
count += 1;
|
|
103
|
+
}
|
|
104
|
+
}
|
|
105
|
+
if (count > 0) {
|
|
106
|
+
collection.whitespace = whitespace ?? createWhitespaceCounts();
|
|
107
|
+
collection.counts.whitespace = (collection.counts.whitespace ?? 0) + count;
|
|
108
|
+
}
|
|
109
|
+
return count;
|
|
110
|
+
}
|
|
52
111
|
function classifyNonWordSegment(segment) {
|
|
53
112
|
const hasEmojiVariationSelector = segment.includes("️");
|
|
54
113
|
if (keycapEmojiRegex.test(segment) || emojiPresentationRegex.test(segment) || hasEmojiVariationSelector && emojiRegex.test(segment)) return "emoji";
|
|
@@ -69,17 +128,35 @@ function mergeNonWordCollections(target, source) {
|
|
|
69
128
|
target.punctuation.push(...source.punctuation);
|
|
70
129
|
target.counts.punctuation += source.counts.punctuation;
|
|
71
130
|
}
|
|
131
|
+
if (source.counts.whitespace && source.counts.whitespace > 0 && source.whitespace) {
|
|
132
|
+
const whitespace = target.whitespace ?? createWhitespaceCounts();
|
|
133
|
+
whitespace.spaces += source.whitespace.spaces;
|
|
134
|
+
whitespace.tabs += source.whitespace.tabs;
|
|
135
|
+
whitespace.newlines += source.whitespace.newlines;
|
|
136
|
+
whitespace.other += source.whitespace.other;
|
|
137
|
+
target.whitespace = whitespace;
|
|
138
|
+
target.counts.whitespace = (target.counts.whitespace ?? 0) + source.counts.whitespace;
|
|
139
|
+
}
|
|
72
140
|
return target;
|
|
73
141
|
}
|
|
142
|
+
function createWhitespaceCounts() {
|
|
143
|
+
return {
|
|
144
|
+
spaces: 0,
|
|
145
|
+
tabs: 0,
|
|
146
|
+
newlines: 0,
|
|
147
|
+
other: 0
|
|
148
|
+
};
|
|
149
|
+
}
|
|
74
150
|
|
|
75
151
|
//#endregion
|
|
76
152
|
//#region src/wc/analyze.ts
|
|
77
|
-
function analyzeChunk(chunk, collectNonWords) {
|
|
153
|
+
function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
78
154
|
const segmenter = getSegmenter(chunk.locale);
|
|
79
155
|
const segments = [];
|
|
80
156
|
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
81
157
|
for (const part of segmenter.segment(chunk.text)) if (part.isWordLike) segments.push(part.segment);
|
|
82
158
|
else if (collectNonWords && nonWords) {
|
|
159
|
+
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
83
160
|
const category = classifyNonWordSegment(part.segment);
|
|
84
161
|
if (category) addNonWord(nonWords, category, part.segment);
|
|
85
162
|
}
|
|
@@ -91,6 +168,40 @@ function analyzeChunk(chunk, collectNonWords) {
|
|
|
91
168
|
nonWords: nonWords ?? void 0
|
|
92
169
|
};
|
|
93
170
|
}
|
|
171
|
+
function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
172
|
+
const segmenter = getSegmenter(chunk.locale);
|
|
173
|
+
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
174
|
+
let chars = 0;
|
|
175
|
+
let wordChars = 0;
|
|
176
|
+
let nonWordChars = 0;
|
|
177
|
+
for (const part of segmenter.segment(chunk.text)) {
|
|
178
|
+
if (part.isWordLike) {
|
|
179
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
180
|
+
chars += count;
|
|
181
|
+
wordChars += count;
|
|
182
|
+
continue;
|
|
183
|
+
}
|
|
184
|
+
if (collectNonWords && nonWords) {
|
|
185
|
+
let whitespaceCount = 0;
|
|
186
|
+
if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
|
|
187
|
+
const category = classifyNonWordSegment(part.segment);
|
|
188
|
+
if (category) addNonWord(nonWords, category, part.segment);
|
|
189
|
+
if (category || whitespaceCount > 0) {
|
|
190
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
191
|
+
chars += count;
|
|
192
|
+
nonWordChars += count;
|
|
193
|
+
}
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
return {
|
|
197
|
+
locale: chunk.locale,
|
|
198
|
+
text: chunk.text,
|
|
199
|
+
chars,
|
|
200
|
+
wordChars,
|
|
201
|
+
nonWordChars,
|
|
202
|
+
nonWords: nonWords ?? void 0
|
|
203
|
+
};
|
|
204
|
+
}
|
|
94
205
|
function aggregateByLocale(chunks) {
|
|
95
206
|
const order = [];
|
|
96
207
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -111,6 +222,30 @@ function aggregateByLocale(chunks) {
|
|
|
111
222
|
return order.map((locale) => map.get(locale));
|
|
112
223
|
}
|
|
113
224
|
|
|
225
|
+
//#endregion
|
|
226
|
+
//#region src/wc/mode.ts
|
|
227
|
+
const MODE_ALIASES = {
|
|
228
|
+
chunk: "chunk",
|
|
229
|
+
chunks: "chunk",
|
|
230
|
+
segments: "segments",
|
|
231
|
+
segment: "segments",
|
|
232
|
+
seg: "segments",
|
|
233
|
+
collector: "collector",
|
|
234
|
+
collect: "collector",
|
|
235
|
+
colle: "collector",
|
|
236
|
+
char: "char",
|
|
237
|
+
chars: "char",
|
|
238
|
+
character: "char",
|
|
239
|
+
characters: "char"
|
|
240
|
+
};
|
|
241
|
+
function normalizeMode(input) {
|
|
242
|
+
if (!input) return null;
|
|
243
|
+
return MODE_ALIASES[input.trim().toLowerCase()] ?? null;
|
|
244
|
+
}
|
|
245
|
+
function resolveMode(input, fallback = "chunk") {
|
|
246
|
+
return normalizeMode(input) ?? fallback;
|
|
247
|
+
}
|
|
248
|
+
|
|
114
249
|
//#endregion
|
|
115
250
|
//#region src/wc/locale-detect.ts
|
|
116
251
|
const DEFAULT_LOCALE = "und-Latn";
|
|
@@ -241,16 +376,51 @@ function mergeAdjacentChunks(chunks) {
|
|
|
241
376
|
//#endregion
|
|
242
377
|
//#region src/wc/wc.ts
|
|
243
378
|
function wordCounter(text, options = {}) {
|
|
244
|
-
const mode = options.mode
|
|
379
|
+
const mode = resolveMode(options.mode, "chunk");
|
|
245
380
|
const collectNonWords = Boolean(options.nonWords);
|
|
246
|
-
const
|
|
381
|
+
const includeWhitespace = Boolean(options.includeWhitespace);
|
|
382
|
+
const chunks = segmentTextByLocale(text, { latinLocaleHint: options.latinLocaleHint });
|
|
383
|
+
if (mode === "char") {
|
|
384
|
+
const analyzed$1 = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
|
|
385
|
+
const total$1 = analyzed$1.reduce((sum, chunk) => sum + chunk.chars, 0);
|
|
386
|
+
const items = analyzed$1.map((chunk) => ({
|
|
387
|
+
locale: chunk.locale,
|
|
388
|
+
text: chunk.text,
|
|
389
|
+
chars: chunk.chars,
|
|
390
|
+
nonWords: chunk.nonWords
|
|
391
|
+
}));
|
|
392
|
+
return {
|
|
393
|
+
total: total$1,
|
|
394
|
+
counts: collectNonWords ? {
|
|
395
|
+
words: analyzed$1.reduce((sum, chunk) => sum + chunk.wordChars, 0),
|
|
396
|
+
nonWords: analyzed$1.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
|
|
397
|
+
total: total$1
|
|
398
|
+
} : void 0,
|
|
399
|
+
breakdown: {
|
|
400
|
+
mode,
|
|
401
|
+
items
|
|
402
|
+
}
|
|
403
|
+
};
|
|
404
|
+
}
|
|
405
|
+
const analyzed = chunks.map((chunk) => analyzeChunk(chunk, collectNonWords, includeWhitespace));
|
|
406
|
+
const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
|
|
407
|
+
const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
|
|
408
|
+
if (!chunk.nonWords) return sum;
|
|
409
|
+
return sum + getNonWordTotal(chunk.nonWords);
|
|
410
|
+
}, 0) : 0;
|
|
247
411
|
const total = analyzed.reduce((sum, chunk) => {
|
|
248
412
|
let chunkTotal = chunk.words;
|
|
249
|
-
if (collectNonWords && chunk.nonWords) chunkTotal += chunk.nonWords
|
|
413
|
+
if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
|
|
250
414
|
return sum + chunkTotal;
|
|
251
415
|
}, 0);
|
|
416
|
+
const counts = collectNonWords ? {
|
|
417
|
+
words: wordsTotal,
|
|
418
|
+
nonWords: nonWordsTotal,
|
|
419
|
+
total
|
|
420
|
+
} : void 0;
|
|
252
421
|
if (mode === "segments") return {
|
|
253
422
|
total,
|
|
423
|
+
counts,
|
|
254
424
|
breakdown: {
|
|
255
425
|
mode,
|
|
256
426
|
items: analyzed.map((chunk) => ({
|
|
@@ -264,6 +434,7 @@ function wordCounter(text, options = {}) {
|
|
|
264
434
|
};
|
|
265
435
|
if (mode === "collector") return {
|
|
266
436
|
total,
|
|
437
|
+
counts,
|
|
267
438
|
breakdown: {
|
|
268
439
|
mode,
|
|
269
440
|
items: aggregateByLocale(analyzed),
|
|
@@ -272,6 +443,7 @@ function wordCounter(text, options = {}) {
|
|
|
272
443
|
};
|
|
273
444
|
return {
|
|
274
445
|
total,
|
|
446
|
+
counts,
|
|
275
447
|
breakdown: {
|
|
276
448
|
mode,
|
|
277
449
|
items: analyzed.map((chunk) => ({
|
|
@@ -283,6 +455,9 @@ function wordCounter(text, options = {}) {
|
|
|
283
455
|
}
|
|
284
456
|
};
|
|
285
457
|
}
|
|
458
|
+
function getNonWordTotal(nonWords) {
|
|
459
|
+
return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
|
|
460
|
+
}
|
|
286
461
|
function collectNonWordsAggregate(analyzed, enabled) {
|
|
287
462
|
if (!enabled) return;
|
|
288
463
|
const collection = createNonWordCollection();
|
|
@@ -834,6 +1009,7 @@ function showSingularOrPluralWord(count, word) {
|
|
|
834
1009
|
const cjsExports = Object.assign(wc_default, {
|
|
835
1010
|
default: wc_default,
|
|
836
1011
|
wordCounter: wc_default,
|
|
1012
|
+
countCharsForLocale,
|
|
837
1013
|
countWordsForLocale,
|
|
838
1014
|
segmentTextByLocale,
|
|
839
1015
|
parseMarkdown,
|