@dev-pi2pie/word-counter 0.1.0-canary.4 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +113 -70
- package/dist/cjs/index.cjs +104 -22
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/esm/bin.mjs +1216 -784
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/index.d.mts +9 -2
- package/dist/esm/index.mjs +104 -22
- package/dist/esm/index.mjs.map +1 -1
- package/package.json +9 -6
package/README.md
CHANGED
|
@@ -2,130 +2,116 @@
|
|
|
2
2
|
|
|
3
3
|
Locale-aware word counting powered by the Web API [`Intl.Segmenter`](https://developer.mozilla.org/docs/Web/JavaScript/Reference/Global_Objects/Intl/Segmenter). The script automatically detects the primary writing system for each portion of the input, segments the text with matching BCP 47 locale tags, and reports word totals per locale.
|
|
4
4
|
|
|
5
|
-
##
|
|
5
|
+
## Quick Start (npx)
|
|
6
6
|
|
|
7
|
-
|
|
8
|
-
- Adjacent characters that share the same locale tag are grouped into a chunk.
|
|
9
|
-
- Each chunk is counted with `Intl.Segmenter` at `granularity: "word"`, caching segmenters to avoid re-instantiation.
|
|
10
|
-
- Per-locale counts are summed into a overall total and printed to stdout.
|
|
7
|
+
Runtime requirement: Node.js `>=20`.
|
|
11
8
|
|
|
12
|
-
|
|
13
|
-
|
|
14
|
-
- Output keeps the field name `locale` for compatibility.
|
|
15
|
-
- In this project, locale values are BCP 47 tags and are often language/script focused (for example: `en`, `und-Latn`, `zh-Hani`) rather than region-specific tags (for example: `en-US`, `zh-TW`).
|
|
16
|
-
- Default detection prefers language/script tags to avoid incorrect region assumptions.
|
|
17
|
-
- You can still provide region-specific locale tags through hint flags when needed.
|
|
18
|
-
|
|
19
|
-
## Installation
|
|
20
|
-
|
|
21
|
-
### For Development
|
|
22
|
-
|
|
23
|
-
Clone the repository and set up locally:
|
|
9
|
+
Run without installing:
|
|
24
10
|
|
|
25
11
|
```bash
|
|
26
|
-
|
|
27
|
-
cd word-counter
|
|
28
|
-
bun install
|
|
29
|
-
bun run build
|
|
30
|
-
npm link
|
|
12
|
+
npx @dev-pi2pie/word-counter "Hello 世界 안녕"
|
|
31
13
|
```
|
|
32
14
|
|
|
33
|
-
|
|
15
|
+
Pipe stdin:
|
|
34
16
|
|
|
35
17
|
```bash
|
|
36
|
-
|
|
18
|
+
echo "こんにちは world مرحبا" | npx @dev-pi2pie/word-counter
|
|
37
19
|
```
|
|
38
20
|
|
|
39
|
-
|
|
21
|
+
File input:
|
|
40
22
|
|
|
41
23
|
```bash
|
|
42
|
-
|
|
24
|
+
npx @dev-pi2pie/word-counter --path ./examples/yaml-basic.md
|
|
43
25
|
```
|
|
44
26
|
|
|
45
|
-
|
|
27
|
+
## Install and Usage Paths
|
|
46
28
|
|
|
47
|
-
|
|
48
|
-
npm unlink --global @dev-pi2pie/word-counter
|
|
49
|
-
```
|
|
29
|
+
Pick one path based on how often you use it:
|
|
50
30
|
|
|
51
|
-
|
|
31
|
+
1. One-off use: `npx @dev-pi2pie/word-counter ...` (no install, best for quick checks and CI snippets).
|
|
32
|
+
2. Frequent CLI use: `npm install -g @dev-pi2pie/word-counter@latest` then run `word-counter ...`.
|
|
33
|
+
3. Library use in code: `npm install @dev-pi2pie/word-counter` and import from your app/scripts.
|
|
34
|
+
|
|
35
|
+
For local development in this repository:
|
|
52
36
|
|
|
53
37
|
```bash
|
|
54
|
-
|
|
38
|
+
git clone https://github.com/dev-pi2pie/word-counter.git
|
|
39
|
+
cd word-counter
|
|
40
|
+
bun install
|
|
41
|
+
bun run build
|
|
42
|
+
npm link
|
|
55
43
|
```
|
|
56
44
|
|
|
57
|
-
|
|
58
|
-
|
|
59
|
-
Once installed (via `npm link` or the npm registry), you can use the CLI directly:
|
|
45
|
+
Then:
|
|
60
46
|
|
|
61
47
|
```bash
|
|
62
48
|
word-counter "Hello 世界 안녕"
|
|
63
49
|
```
|
|
64
50
|
|
|
65
|
-
|
|
51
|
+
To remove the global link:
|
|
66
52
|
|
|
67
53
|
```bash
|
|
68
|
-
|
|
54
|
+
npm unlink --global @dev-pi2pie/word-counter
|
|
69
55
|
```
|
|
70
56
|
|
|
71
|
-
|
|
57
|
+
## CLI Usage
|
|
58
|
+
|
|
59
|
+
Basic text:
|
|
72
60
|
|
|
73
61
|
```bash
|
|
74
|
-
|
|
62
|
+
word-counter "Hello 世界 안녕"
|
|
75
63
|
```
|
|
76
64
|
|
|
77
|
-
Hint a
|
|
65
|
+
Hint a language tag for ambiguous Latin text:
|
|
78
66
|
|
|
79
67
|
```bash
|
|
80
68
|
word-counter --latin-language en "Hello world"
|
|
81
69
|
word-counter --latin-tag en "Hello world"
|
|
82
70
|
```
|
|
83
71
|
|
|
84
|
-
Hint a
|
|
72
|
+
Hint a language tag for Han fallback:
|
|
85
73
|
|
|
86
74
|
```bash
|
|
87
75
|
word-counter --han-language zh-Hant "漢字測試"
|
|
88
76
|
word-counter --han-tag zh-Hans "汉字测试"
|
|
89
77
|
```
|
|
90
78
|
|
|
91
|
-
Collect non-
|
|
79
|
+
Collect non-words (emoji/symbols/punctuation):
|
|
92
80
|
|
|
93
81
|
```bash
|
|
94
82
|
word-counter --non-words "Hi 👋, world!"
|
|
95
83
|
```
|
|
96
84
|
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
Or read from a file:
|
|
85
|
+
Override total composition:
|
|
100
86
|
|
|
101
87
|
```bash
|
|
102
|
-
word-counter --
|
|
88
|
+
word-counter --non-words --total-of words "Hi 👋, world!"
|
|
89
|
+
word-counter --total-of punctuation --format raw "Hi, world!"
|
|
90
|
+
word-counter --total-of words,emoji --format json "Hi 👋, world!"
|
|
103
91
|
```
|
|
104
92
|
|
|
105
|
-
`--path`
|
|
106
|
-
Such files are treated as valid inputs and contribute zero words by default.
|
|
107
|
-
|
|
108
|
-
### Batch Counting
|
|
93
|
+
## Batch Counting (`--path`)
|
|
109
94
|
|
|
110
|
-
|
|
95
|
+
Repeat `--path` for mixed inputs (files and/or directories):
|
|
111
96
|
|
|
112
97
|
```bash
|
|
113
|
-
word-counter --path ./docs/a.md --path ./docs
|
|
98
|
+
word-counter --path ./docs/a.md --path ./docs --path ./notes.txt
|
|
114
99
|
```
|
|
115
100
|
|
|
116
|
-
|
|
101
|
+
Directory scans are recursive by default:
|
|
117
102
|
|
|
118
103
|
```bash
|
|
119
104
|
word-counter --path ./examples/test-case-multi-files-support
|
|
105
|
+
word-counter --path ./examples/test-case-multi-files-support --no-recursive
|
|
120
106
|
```
|
|
121
107
|
|
|
122
|
-
Show per-file
|
|
108
|
+
Show per-file plus merged summary:
|
|
123
109
|
|
|
124
110
|
```bash
|
|
125
111
|
word-counter --path ./examples/test-case-multi-files-support --per-file
|
|
126
112
|
```
|
|
127
113
|
|
|
128
|
-
|
|
114
|
+
Progress behavior in standard batch mode:
|
|
129
115
|
|
|
130
116
|
```bash
|
|
131
117
|
word-counter --path ./examples/test-case-multi-files-support
|
|
@@ -133,33 +119,83 @@ word-counter --path ./examples/test-case-multi-files-support --no-progress
|
|
|
133
119
|
word-counter --path ./examples/test-case-multi-files-support --keep-progress
|
|
134
120
|
```
|
|
135
121
|
|
|
136
|
-
Progress
|
|
122
|
+
Progress is transient by default, auto-disabled for single-input runs, and suppressed in `--format raw` and `--format json`.
|
|
137
123
|
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
124
|
+
### Stable Path Resolution Contract (`#26`)
|
|
125
|
+
|
|
126
|
+
- Repeated `--path` values are accepted as mixed inputs (file + directory).
|
|
127
|
+
- In `--path-mode auto` (default), directory inputs are expanded to files (recursive unless `--no-recursive`).
|
|
128
|
+
- In `--path-mode manual`, directory inputs are not expanded and are skipped as non-regular files.
|
|
129
|
+
- Extension filters apply only to files discovered from directory expansion.
|
|
130
|
+
- Direct file inputs are always considered regardless of `--include-ext` / `--exclude-ext`.
|
|
131
|
+
- Overlap dedupe is by resolved absolute file path.
|
|
132
|
+
- If the same file is discovered multiple ways (repeated roots, nested roots, explicit file + directory), it is counted once.
|
|
133
|
+
- Final processing order is deterministic: resolved files are sorted by absolute path ascending before load/count.
|
|
143
134
|
|
|
144
|
-
|
|
145
|
-
Use `--keep-progress` when you want the final progress line to stay visible after completion.
|
|
135
|
+
### Extension Filters
|
|
146
136
|
|
|
147
|
-
|
|
137
|
+
Use include/exclude filters for directory scans:
|
|
148
138
|
|
|
149
139
|
```bash
|
|
150
140
|
word-counter --path ./examples/test-case-multi-files-support --include-ext .md,.mdx
|
|
151
141
|
word-counter --path ./examples/test-case-multi-files-support --include-ext .md,.txt --exclude-ext .txt
|
|
152
142
|
```
|
|
153
143
|
|
|
154
|
-
|
|
155
|
-
|
|
144
|
+
Direct file path example (filters do not block explicit file inputs):
|
|
145
|
+
|
|
146
|
+
```bash
|
|
147
|
+
word-counter --path ./examples/test-case-multi-files-support/ignored.js --include-ext .md --exclude-ext .md
|
|
148
|
+
```
|
|
149
|
+
|
|
150
|
+
### Debugging Diagnostics (`--debug`)
|
|
151
|
+
|
|
152
|
+
`--debug` remains the diagnostics gate and now defaults to `compact` event volume:
|
|
153
|
+
|
|
154
|
+
- lifecycle/stage timing events
|
|
155
|
+
- resolved/skipped summary events
|
|
156
|
+
- dedupe/filter summary counts
|
|
157
|
+
|
|
158
|
+
Use `--verbose` to include per-file/per-path events:
|
|
159
|
+
|
|
160
|
+
```bash
|
|
161
|
+
word-counter --path ./examples/test-case-multi-files-support --debug --verbose
|
|
162
|
+
```
|
|
163
|
+
|
|
164
|
+
Use `--debug-report [path]` to route debug diagnostics to a JSONL report file:
|
|
165
|
+
|
|
166
|
+
- no path: writes to current working directory with pattern `wc-debug-YYYYMMDD-HHmmss-<pid>.jsonl`
|
|
167
|
+
- path provided: writes to the specified location
|
|
168
|
+
- default-name collision handling: appends `-<n>` suffix to avoid overwriting existing files
|
|
169
|
+
- explicit path validation: existing directories are rejected (explicit paths are treated as file targets)
|
|
170
|
+
|
|
171
|
+
By default with `--debug-report`, debug lines are file-only (not mirrored to terminal).
|
|
172
|
+
Use `--debug-report-tee` (alias: `--debug-tee`) to mirror to both file and `stderr`.
|
|
173
|
+
Flag dependencies: `--verbose` requires `--debug`; `--debug-report` requires `--debug`; `--debug-report-tee`/`--debug-tee` requires `--debug-report`.
|
|
174
|
+
|
|
175
|
+
Examples:
|
|
156
176
|
|
|
157
177
|
```bash
|
|
158
|
-
word-counter --path ./examples/test-case-multi-files-support --debug
|
|
178
|
+
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report
|
|
179
|
+
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl
|
|
180
|
+
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-report-tee
|
|
181
|
+
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-tee
|
|
159
182
|
```
|
|
160
183
|
|
|
161
|
-
|
|
162
|
-
|
|
184
|
+
Skip details stay debug-gated and can still be suppressed with `--quiet-skips`.
|
|
185
|
+
|
|
186
|
+
## How It Works
|
|
187
|
+
|
|
188
|
+
- The runtime inspects each character's Unicode script to infer its likely locale tag (e.g., `und-Latn`, `zh-Hani`, `ja`).
|
|
189
|
+
- Adjacent characters that share the same locale tag are grouped into a chunk.
|
|
190
|
+
- Each chunk is counted with `Intl.Segmenter` at `granularity: "word"`, caching segmenters to avoid re-instantiation.
|
|
191
|
+
- Per-locale counts are summed into an overall total and printed to stdout.
|
|
192
|
+
|
|
193
|
+
## Locale vs Language Code
|
|
194
|
+
|
|
195
|
+
- Output keeps the field name `locale` for compatibility.
|
|
196
|
+
- In this project, locale values are BCP 47 tags and are often language/script focused (for example: `en`, `und-Latn`, `zh-Hani`) rather than region-specific tags (for example: `en-US`, `zh-TW`).
|
|
197
|
+
- Default detection prefers language/script tags to avoid incorrect region assumptions.
|
|
198
|
+
- You can still provide region-specific locale tags through hint flags when needed.
|
|
163
199
|
|
|
164
200
|
## Library Usage
|
|
165
201
|
|
|
@@ -182,6 +218,7 @@ wordCounter("Hello world", { latinTagHint: "en" });
|
|
|
182
218
|
wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
|
|
183
219
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
184
220
|
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
221
|
+
wordCounter("飛鳥 bird 貓 cat", { mode: "char-collector" });
|
|
185
222
|
wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
|
|
186
223
|
countCharsForLocale("👋", "en");
|
|
187
224
|
```
|
|
@@ -231,6 +268,7 @@ wordCounter("Hello world", { latinTagHint: "en" });
|
|
|
231
268
|
wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
|
|
232
269
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
233
270
|
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
271
|
+
wordCounter("飛鳥 bird 貓 cat", { mode: "char-collector" });
|
|
234
272
|
wordCounter("Hi\tthere\n", { nonWords: true, includeWhitespace: true });
|
|
235
273
|
countCharsForLocale("👋", "en");
|
|
236
274
|
```
|
|
@@ -294,7 +332,7 @@ Sample output (with `nonWords: true` and `includeWhitespace: true`):
|
|
|
294
332
|
| `WordCounterOptions` | type | Options for the `wordCounter` function. |
|
|
295
333
|
| `WordCounterResult` | type | Returned by `wordCounter`. |
|
|
296
334
|
| `WordCounterBreakdown` | type | Breakdown payload in `WordCounterResult`. |
|
|
297
|
-
| `WordCounterMode` | type | `"chunk" \| "segments" \| "collector" \| "char"`. |
|
|
335
|
+
| `WordCounterMode` | type | `"chunk" \| "segments" \| "collector" \| "char" \| "char-collector"`. |
|
|
298
336
|
| `NonWordCollection` | type | Non-word segments + counts payload. |
|
|
299
337
|
|
|
300
338
|
### Display Modes
|
|
@@ -306,6 +344,7 @@ Choose a breakdown style with `--mode` (or `-m`):
|
|
|
306
344
|
- `collector` – aggregate counts per locale regardless of text position.
|
|
307
345
|
Keeps per-locale segment lists in memory, so very large corpora can use noticeably more memory than `chunk` mode.
|
|
308
346
|
- `char` – count grapheme clusters (user-perceived characters) per locale.
|
|
347
|
+
- `char-collector` – aggregate grapheme-cluster counts per locale (collector-style char mode).
|
|
309
348
|
|
|
310
349
|
Aliases are normalized for CLI + API:
|
|
311
350
|
|
|
@@ -313,6 +352,7 @@ Aliases are normalized for CLI + API:
|
|
|
313
352
|
- `segments`, `segment`, `seg`
|
|
314
353
|
- `collector`, `collect`, `colle`
|
|
315
354
|
- `char`, `chars`, `character`, `characters`
|
|
355
|
+
- `char-collector`, `charcollector`, `char-collect`, `collector-char`, `characters-collector`, `colchar`, `charcol`, `char-col`, `char-colle`
|
|
316
356
|
|
|
317
357
|
Examples:
|
|
318
358
|
|
|
@@ -328,6 +368,9 @@ word-counter -m collector "飛鳥 bird 貓 cat; how do you do?"
|
|
|
328
368
|
|
|
329
369
|
# grapheme-aware character count
|
|
330
370
|
word-counter -m char "Hi 👋, world!"
|
|
371
|
+
|
|
372
|
+
# aggregate grapheme-aware character counts per locale
|
|
373
|
+
word-counter -m char-collector "飛鳥 bird 貓 cat; how do you do?"
|
|
331
374
|
```
|
|
332
375
|
|
|
333
376
|
### Section Modes (Frontmatter)
|
package/dist/cjs/index.cjs
CHANGED
|
@@ -208,6 +208,32 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
208
208
|
nonWords: nonWords ?? void 0
|
|
209
209
|
};
|
|
210
210
|
}
|
|
211
|
+
function aggregateCharsByLocale(chunks) {
|
|
212
|
+
const order = [];
|
|
213
|
+
const map = /* @__PURE__ */ new Map();
|
|
214
|
+
for (const chunk of chunks) {
|
|
215
|
+
const existing = map.get(chunk.locale);
|
|
216
|
+
if (existing) {
|
|
217
|
+
existing.chars += chunk.chars;
|
|
218
|
+
existing.wordChars += chunk.wordChars;
|
|
219
|
+
existing.nonWordChars += chunk.nonWordChars;
|
|
220
|
+
if (chunk.nonWords) {
|
|
221
|
+
if (!existing.nonWords) existing.nonWords = createNonWordCollection();
|
|
222
|
+
mergeNonWordCollections(existing.nonWords, chunk.nonWords);
|
|
223
|
+
}
|
|
224
|
+
continue;
|
|
225
|
+
}
|
|
226
|
+
order.push(chunk.locale);
|
|
227
|
+
map.set(chunk.locale, {
|
|
228
|
+
locale: chunk.locale,
|
|
229
|
+
chars: chunk.chars,
|
|
230
|
+
wordChars: chunk.wordChars,
|
|
231
|
+
nonWordChars: chunk.nonWordChars,
|
|
232
|
+
nonWords: chunk.nonWords ? mergeNonWordCollections(createNonWordCollection(), chunk.nonWords) : void 0
|
|
233
|
+
});
|
|
234
|
+
}
|
|
235
|
+
return order.map((locale) => map.get(locale));
|
|
236
|
+
}
|
|
211
237
|
function aggregateByLocale(chunks) {
|
|
212
238
|
const order = [];
|
|
213
239
|
const map = /* @__PURE__ */ new Map();
|
|
@@ -242,11 +268,55 @@ const MODE_ALIASES = {
|
|
|
242
268
|
char: "char",
|
|
243
269
|
chars: "char",
|
|
244
270
|
character: "char",
|
|
245
|
-
characters: "char"
|
|
271
|
+
characters: "char",
|
|
272
|
+
"char-collector": "char-collector"
|
|
246
273
|
};
|
|
274
|
+
const CHAR_MODE_ALIASES = new Set([
|
|
275
|
+
"char",
|
|
276
|
+
"chars",
|
|
277
|
+
"character",
|
|
278
|
+
"characters"
|
|
279
|
+
]);
|
|
280
|
+
const COLLECTOR_MODE_ALIASES = new Set([
|
|
281
|
+
"collector",
|
|
282
|
+
"collect",
|
|
283
|
+
"colle",
|
|
284
|
+
"col"
|
|
285
|
+
]);
|
|
286
|
+
function collapseSeparators(value) {
|
|
287
|
+
return value.replace(/[-_\s]+/g, "");
|
|
288
|
+
}
|
|
289
|
+
function isComposedCharCollectorFromTokens(value) {
|
|
290
|
+
const tokens = value.split(/[-_\s]+/).map((token) => token.trim()).filter((token) => token.length > 0);
|
|
291
|
+
if (tokens.length < 2) return false;
|
|
292
|
+
let hasCharAlias = false;
|
|
293
|
+
let hasCollectorAlias = false;
|
|
294
|
+
for (const token of tokens) {
|
|
295
|
+
if (CHAR_MODE_ALIASES.has(token)) {
|
|
296
|
+
hasCharAlias = true;
|
|
297
|
+
continue;
|
|
298
|
+
}
|
|
299
|
+
if (COLLECTOR_MODE_ALIASES.has(token)) {
|
|
300
|
+
hasCollectorAlias = true;
|
|
301
|
+
continue;
|
|
302
|
+
}
|
|
303
|
+
return false;
|
|
304
|
+
}
|
|
305
|
+
return hasCharAlias && hasCollectorAlias;
|
|
306
|
+
}
|
|
307
|
+
function isComposedCharCollectorCompact(value) {
|
|
308
|
+
for (const charAlias of CHAR_MODE_ALIASES) for (const collectorAlias of COLLECTOR_MODE_ALIASES) if (value === `${charAlias}${collectorAlias}` || value === `${collectorAlias}${charAlias}`) return true;
|
|
309
|
+
return false;
|
|
310
|
+
}
|
|
247
311
|
function normalizeMode(input) {
|
|
248
312
|
if (!input) return null;
|
|
249
|
-
|
|
313
|
+
const normalized = input.trim().toLowerCase();
|
|
314
|
+
const direct = MODE_ALIASES[normalized];
|
|
315
|
+
if (direct) return direct;
|
|
316
|
+
if (isComposedCharCollectorFromTokens(normalized)) return "char-collector";
|
|
317
|
+
const compact = collapseSeparators(normalized);
|
|
318
|
+
if (isComposedCharCollectorCompact(compact)) return "char-collector";
|
|
319
|
+
return MODE_ALIASES[compact] ?? null;
|
|
250
320
|
}
|
|
251
321
|
function resolveMode(input, fallback = "chunk") {
|
|
252
322
|
return normalizeMode(input) ?? fallback;
|
|
@@ -408,25 +478,37 @@ function wordCounter(text, options = {}) {
|
|
|
408
478
|
hanLanguageHint: options.hanLanguageHint,
|
|
409
479
|
hanTagHint: options.hanTagHint
|
|
410
480
|
});
|
|
411
|
-
if (mode === "char") {
|
|
412
|
-
const analyzed
|
|
413
|
-
const total
|
|
414
|
-
const
|
|
415
|
-
|
|
416
|
-
|
|
417
|
-
|
|
418
|
-
|
|
419
|
-
|
|
481
|
+
if (mode === "char" || mode === "char-collector") {
|
|
482
|
+
const analyzed = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
|
|
483
|
+
const total = analyzed.reduce((sum, chunk) => sum + chunk.chars, 0);
|
|
484
|
+
const counts = collectNonWords ? {
|
|
485
|
+
words: analyzed.reduce((sum, chunk) => sum + chunk.wordChars, 0),
|
|
486
|
+
nonWords: analyzed.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
|
|
487
|
+
total
|
|
488
|
+
} : void 0;
|
|
489
|
+
if (mode === "char") return {
|
|
490
|
+
total,
|
|
491
|
+
counts,
|
|
492
|
+
breakdown: {
|
|
493
|
+
mode,
|
|
494
|
+
items: analyzed.map((chunk) => ({
|
|
495
|
+
locale: chunk.locale,
|
|
496
|
+
text: chunk.text,
|
|
497
|
+
chars: chunk.chars,
|
|
498
|
+
nonWords: chunk.nonWords
|
|
499
|
+
}))
|
|
500
|
+
}
|
|
501
|
+
};
|
|
420
502
|
return {
|
|
421
|
-
total
|
|
422
|
-
counts
|
|
423
|
-
words: analyzed$1.reduce((sum, chunk) => sum + chunk.wordChars, 0),
|
|
424
|
-
nonWords: analyzed$1.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
|
|
425
|
-
total: total$1
|
|
426
|
-
} : void 0,
|
|
503
|
+
total,
|
|
504
|
+
counts,
|
|
427
505
|
breakdown: {
|
|
428
506
|
mode,
|
|
429
|
-
items
|
|
507
|
+
items: aggregateCharsByLocale(analyzed).map((chunk) => ({
|
|
508
|
+
locale: chunk.locale,
|
|
509
|
+
chars: chunk.chars,
|
|
510
|
+
nonWords: chunk.nonWords
|
|
511
|
+
}))
|
|
430
512
|
}
|
|
431
513
|
};
|
|
432
514
|
}
|
|
@@ -796,7 +878,7 @@ function parseTomlFrontmatter(frontmatter) {
|
|
|
796
878
|
index += 1;
|
|
797
879
|
const nextLine = lines[index] ?? "";
|
|
798
880
|
combined += `\n${nextLine}`;
|
|
799
|
-
if (
|
|
881
|
+
if (new RegExp(`${delimiter}\\s*$`).test(nextLine)) {
|
|
800
882
|
closed = true;
|
|
801
883
|
break;
|
|
802
884
|
}
|
|
@@ -924,10 +1006,10 @@ function parseMarkdown(input) {
|
|
|
924
1006
|
data: null,
|
|
925
1007
|
frontmatterType: null
|
|
926
1008
|
};
|
|
927
|
-
const frontmatter
|
|
1009
|
+
const frontmatter = jsonBlock.jsonText;
|
|
928
1010
|
let content = normalizedWithoutBom.slice(jsonBlock.endIndex + 1);
|
|
929
1011
|
if (content.startsWith("\n")) content = content.slice(1);
|
|
930
|
-
const data = parseFrontmatter(frontmatter
|
|
1012
|
+
const data = parseFrontmatter(frontmatter, "json");
|
|
931
1013
|
if (!data) return {
|
|
932
1014
|
frontmatter: null,
|
|
933
1015
|
content: normalizedWithoutBom,
|
|
@@ -935,7 +1017,7 @@ function parseMarkdown(input) {
|
|
|
935
1017
|
frontmatterType: null
|
|
936
1018
|
};
|
|
937
1019
|
return {
|
|
938
|
-
frontmatter
|
|
1020
|
+
frontmatter,
|
|
939
1021
|
content,
|
|
940
1022
|
data,
|
|
941
1023
|
frontmatterType: "json"
|