@dev-pi2pie/word-counter 0.1.6-canary.1 → 0.1.7-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -0
- package/dist/cjs/detector.cjs +2 -1
- package/dist/cjs/markdown.cjs +25 -12
- package/dist/esm/bin.mjs +42 -19
- package/dist/esm/detector.mjs +2 -1
- package/dist/esm/index2.d.mts +1 -1
- package/dist/esm/markdown.mjs +25 -12
- package/dist/esm/worker/count-worker.mjs +27 -13
- package/dist/wasm-language-detector/language_detector.js +5 -6
- package/dist/wasm-language-detector/language_detector_bg.wasm +0 -0
- package/dist/wasm-language-detector/package.json +1 -1
- package/package.json +9 -12
package/README.md
CHANGED
|
@@ -111,6 +111,7 @@ Inspect detector behavior without count output:
|
|
|
111
111
|
```bash
|
|
112
112
|
word-counter inspect "こんにちは、世界!これはテストです。"
|
|
113
113
|
word-counter inspect --detector wasm --view engine "This sentence should clearly be detected as English for the wasm detector path."
|
|
114
|
+
word-counter inspect --detector wasm --view engine --content-gate strict "Readers understand this behavior."
|
|
114
115
|
word-counter inspect --detector regex -f json "こんにちは、世界!これはテストです。"
|
|
115
116
|
word-counter inspect --detector regex -f json --pretty "こんにちは、世界!これはテストです。"
|
|
116
117
|
word-counter inspect --detector wasm --content-gate off "mode: debug\ntee: true\npath: logs\nUse this for testing."
|
|
@@ -144,6 +145,11 @@ Detector mode notes:
|
|
|
144
145
|
- Technical-noise-heavy Latin windows stay conservative and may remain `und-Latn` even when the detector produces a wrong-but-confident language guess.
|
|
145
146
|
- inspect/debug disclosure uses `contentGate` as the canonical gate field.
|
|
146
147
|
- legacy debug/evidence payloads still emit `qualityGate` as a compatibility alias derived from `contentGate.passed`.
|
|
148
|
+
- `inspect --view engine` stays raw:
|
|
149
|
+
- it shows the detector sample plus raw/normalized/remapped Whatlang output
|
|
150
|
+
- it does not apply `eligibility` or `contentGate` policy decisions
|
|
151
|
+
- if engine view uses an explicit or effective non-default content-gate mode, the CLI emits a cyan info note and points to `--view pipeline`
|
|
152
|
+
- `inspect --view pipeline` is the inspect surface for `eligibility`, `contentGate`, acceptance, and fallback reasoning.
|
|
147
153
|
- for practical verification, use `inspect` to compare direct mode outcomes across `default`, `strict`, `loose`, and `off`; use `--debug --detector-evidence` when you specifically need counting-flow event details or legacy `qualityGate` compatibility
|
|
148
154
|
- `word-counter inspect` supports:
|
|
149
155
|
- positional text input
|
package/dist/cjs/detector.cjs
CHANGED
|
@@ -966,7 +966,8 @@ function resolveWhatlangWasmModulePath() {
|
|
|
966
966
|
}
|
|
967
967
|
async function loadWhatlangWasmModule() {
|
|
968
968
|
if (!modulePromise) modulePromise = (async () => {
|
|
969
|
-
|
|
969
|
+
const modulePath = resolveWhatlangWasmModulePath();
|
|
970
|
+
return requireFromHere(modulePath);
|
|
970
971
|
})();
|
|
971
972
|
return modulePromise;
|
|
972
973
|
}
|
package/dist/cjs/markdown.cjs
CHANGED
|
@@ -45,7 +45,7 @@ const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
|
|
|
45
45
|
const symbolRegex = /\p{S}/u;
|
|
46
46
|
const punctuationRegex = /\p{P}/u;
|
|
47
47
|
const whitespaceRegex = /\s/u;
|
|
48
|
-
const newlineChars = new Set([
|
|
48
|
+
const newlineChars = /* @__PURE__ */ new Set([
|
|
49
49
|
"\n",
|
|
50
50
|
"\r",
|
|
51
51
|
"\u2028",
|
|
@@ -156,11 +156,16 @@ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
156
156
|
const segmenter = getSegmenter(chunk.locale);
|
|
157
157
|
const segments = [];
|
|
158
158
|
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
159
|
-
for (const part of segmenter.segment(chunk.text))
|
|
160
|
-
else if (collectNonWords && nonWords) {
|
|
161
|
-
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
159
|
+
for (const part of segmenter.segment(chunk.text)) {
|
|
162
160
|
const category = classifyNonWordSegment(part.segment);
|
|
163
|
-
if (category)
|
|
161
|
+
if (category) {
|
|
162
|
+
if (collectNonWords && nonWords) addNonWord(nonWords, category, part.segment);
|
|
163
|
+
continue;
|
|
164
|
+
}
|
|
165
|
+
if (part.isWordLike) segments.push(part.segment);
|
|
166
|
+
else if (collectNonWords && nonWords) {
|
|
167
|
+
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
168
|
+
}
|
|
164
169
|
}
|
|
165
170
|
return {
|
|
166
171
|
locale: chunk.locale,
|
|
@@ -177,6 +182,16 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
177
182
|
let wordChars = 0;
|
|
178
183
|
let nonWordChars = 0;
|
|
179
184
|
for (const part of segmenter.segment(chunk.text)) {
|
|
185
|
+
const category = classifyNonWordSegment(part.segment);
|
|
186
|
+
if (category) {
|
|
187
|
+
if (collectNonWords && nonWords) {
|
|
188
|
+
addNonWord(nonWords, category, part.segment);
|
|
189
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
190
|
+
chars += count;
|
|
191
|
+
nonWordChars += count;
|
|
192
|
+
}
|
|
193
|
+
continue;
|
|
194
|
+
}
|
|
180
195
|
if (part.isWordLike) {
|
|
181
196
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
182
197
|
chars += count;
|
|
@@ -186,9 +201,7 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
186
201
|
if (collectNonWords && nonWords) {
|
|
187
202
|
let whitespaceCount = 0;
|
|
188
203
|
if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
|
|
189
|
-
|
|
190
|
-
if (category) addNonWord(nonWords, category, part.segment);
|
|
191
|
-
if (category || whitespaceCount > 0) {
|
|
204
|
+
if (whitespaceCount > 0) {
|
|
192
205
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
193
206
|
chars += count;
|
|
194
207
|
nonWordChars += count;
|
|
@@ -266,13 +279,13 @@ const MODE_ALIASES = {
|
|
|
266
279
|
characters: "char",
|
|
267
280
|
"char-collector": "char-collector"
|
|
268
281
|
};
|
|
269
|
-
const CHAR_MODE_ALIASES = new Set([
|
|
282
|
+
const CHAR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
270
283
|
"char",
|
|
271
284
|
"chars",
|
|
272
285
|
"character",
|
|
273
286
|
"characters"
|
|
274
287
|
]);
|
|
275
|
-
const COLLECTOR_MODE_ALIASES = new Set([
|
|
288
|
+
const COLLECTOR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
276
289
|
"collector",
|
|
277
290
|
"collect",
|
|
278
291
|
"colle",
|
|
@@ -370,7 +383,7 @@ const regex = {
|
|
|
370
383
|
devanagari: /\p{Script=Devanagari}/u,
|
|
371
384
|
thai: /\p{Script=Thai}/u
|
|
372
385
|
};
|
|
373
|
-
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
386
|
+
const defaultLatinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
374
387
|
function isLatinLocale(locale, context) {
|
|
375
388
|
if (context) return context.latinLocales.has(locale);
|
|
376
389
|
return defaultLatinLocales.has(locale);
|
|
@@ -447,7 +460,7 @@ function resolveLatinHintRules(options) {
|
|
|
447
460
|
function resolveLocaleDetectContext(options = {}) {
|
|
448
461
|
const latinHint = resolveLatinHint(options);
|
|
449
462
|
const latinHintRules = resolveLatinHintRules(options);
|
|
450
|
-
const latinLocales = new Set([DEFAULT_LOCALE]);
|
|
463
|
+
const latinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE]);
|
|
451
464
|
for (const rule of latinHintRules) latinLocales.add(rule.tag);
|
|
452
465
|
if (latinHint) latinLocales.add(latinHint);
|
|
453
466
|
return {
|
package/dist/esm/bin.mjs
CHANGED
|
@@ -15,7 +15,7 @@ var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
|
|
|
15
15
|
var __getOwnPropNames = Object.getOwnPropertyNames;
|
|
16
16
|
var __getProtoOf = Object.getPrototypeOf;
|
|
17
17
|
var __hasOwnProp = Object.prototype.hasOwnProperty;
|
|
18
|
-
var __commonJSMin = (cb, mod) => () => (mod || cb((mod = { exports: {} }).exports, mod), mod.exports);
|
|
18
|
+
var __commonJSMin = (cb, mod) => () => (mod || (cb((mod = { exports: {} }).exports, mod), cb = null), mod.exports);
|
|
19
19
|
var __copyProps = (to, from, except, desc) => {
|
|
20
20
|
if (from && typeof from === "object" || typeof from === "function") for (var keys = __getOwnPropNames(from), i = 0, n = keys.length, key; i < n; i++) {
|
|
21
21
|
key = keys[i];
|
|
@@ -262,7 +262,8 @@ function collectTotalOfCounts(result) {
|
|
|
262
262
|
return counts;
|
|
263
263
|
}
|
|
264
264
|
function parseTotalOfToken(token) {
|
|
265
|
-
const
|
|
265
|
+
const normalized = token.trim().toLowerCase();
|
|
266
|
+
const canonical = TOTAL_OF_PART_ALIASES[normalized];
|
|
266
267
|
if (canonical) return canonical;
|
|
267
268
|
throw new Error(`Invalid --total-of part: ${token}. Allowed: ${TOTAL_OF_PARTS.join(", ")}.`);
|
|
268
269
|
}
|
|
@@ -1271,7 +1272,7 @@ function meetsRequiredNodeVersion(version) {
|
|
|
1271
1272
|
return version.patch >= REQUIRED_NODE_VERSION.patch;
|
|
1272
1273
|
}
|
|
1273
1274
|
function resolveRuntimeSummary(overrides = {}) {
|
|
1274
|
-
const packageVersion = normalizePackageVersion(overrides.packageVersion ?? "0.1.6-canary.1");
|
|
1275
|
+
const packageVersion = normalizePackageVersion(overrides.packageVersion ?? "0.1.7-canary.1");
|
|
1275
1276
|
const nodeVersion = overrides.nodeVersion ?? process.version;
|
|
1276
1277
|
const parsedNodeVersion = parseNodeVersion(nodeVersion);
|
|
1277
1278
|
return {
|
|
@@ -2027,7 +2028,7 @@ const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
|
|
|
2027
2028
|
const symbolRegex = /\p{S}/u;
|
|
2028
2029
|
const punctuationRegex = /\p{P}/u;
|
|
2029
2030
|
const whitespaceRegex = /\s/u;
|
|
2030
|
-
const newlineChars = new Set([
|
|
2031
|
+
const newlineChars = /* @__PURE__ */ new Set([
|
|
2031
2032
|
"\n",
|
|
2032
2033
|
"\r",
|
|
2033
2034
|
"\u2028",
|
|
@@ -2138,11 +2139,16 @@ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
2138
2139
|
const segmenter = getSegmenter(chunk.locale);
|
|
2139
2140
|
const segments = [];
|
|
2140
2141
|
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
2141
|
-
for (const part of segmenter.segment(chunk.text))
|
|
2142
|
-
else if (collectNonWords && nonWords) {
|
|
2143
|
-
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
2142
|
+
for (const part of segmenter.segment(chunk.text)) {
|
|
2144
2143
|
const category = classifyNonWordSegment(part.segment);
|
|
2145
|
-
if (category)
|
|
2144
|
+
if (category) {
|
|
2145
|
+
if (collectNonWords && nonWords) addNonWord(nonWords, category, part.segment);
|
|
2146
|
+
continue;
|
|
2147
|
+
}
|
|
2148
|
+
if (part.isWordLike) segments.push(part.segment);
|
|
2149
|
+
else if (collectNonWords && nonWords) {
|
|
2150
|
+
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
2151
|
+
}
|
|
2146
2152
|
}
|
|
2147
2153
|
return {
|
|
2148
2154
|
locale: chunk.locale,
|
|
@@ -2159,6 +2165,16 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
2159
2165
|
let wordChars = 0;
|
|
2160
2166
|
let nonWordChars = 0;
|
|
2161
2167
|
for (const part of segmenter.segment(chunk.text)) {
|
|
2168
|
+
const category = classifyNonWordSegment(part.segment);
|
|
2169
|
+
if (category) {
|
|
2170
|
+
if (collectNonWords && nonWords) {
|
|
2171
|
+
addNonWord(nonWords, category, part.segment);
|
|
2172
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
2173
|
+
chars += count;
|
|
2174
|
+
nonWordChars += count;
|
|
2175
|
+
}
|
|
2176
|
+
continue;
|
|
2177
|
+
}
|
|
2162
2178
|
if (part.isWordLike) {
|
|
2163
2179
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
2164
2180
|
chars += count;
|
|
@@ -2168,9 +2184,7 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
2168
2184
|
if (collectNonWords && nonWords) {
|
|
2169
2185
|
let whitespaceCount = 0;
|
|
2170
2186
|
if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
|
|
2171
|
-
|
|
2172
|
-
if (category) addNonWord(nonWords, category, part.segment);
|
|
2173
|
-
if (category || whitespaceCount > 0) {
|
|
2187
|
+
if (whitespaceCount > 0) {
|
|
2174
2188
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
2175
2189
|
chars += count;
|
|
2176
2190
|
nonWordChars += count;
|
|
@@ -2248,13 +2262,13 @@ const MODE_ALIASES = {
|
|
|
2248
2262
|
characters: "char",
|
|
2249
2263
|
"char-collector": "char-collector"
|
|
2250
2264
|
};
|
|
2251
|
-
const CHAR_MODE_ALIASES = new Set([
|
|
2265
|
+
const CHAR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
2252
2266
|
"char",
|
|
2253
2267
|
"chars",
|
|
2254
2268
|
"character",
|
|
2255
2269
|
"characters"
|
|
2256
2270
|
]);
|
|
2257
|
-
const COLLECTOR_MODE_ALIASES = new Set([
|
|
2271
|
+
const COLLECTOR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
2258
2272
|
"collector",
|
|
2259
2273
|
"collect",
|
|
2260
2274
|
"colle",
|
|
@@ -2352,7 +2366,7 @@ const regex = {
|
|
|
2352
2366
|
devanagari: /\p{Script=Devanagari}/u,
|
|
2353
2367
|
thai: /\p{Script=Thai}/u
|
|
2354
2368
|
};
|
|
2355
|
-
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
2369
|
+
const defaultLatinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
2356
2370
|
function isLatinLocale(locale, context) {
|
|
2357
2371
|
if (context) return context.latinLocales.has(locale);
|
|
2358
2372
|
return defaultLatinLocales.has(locale);
|
|
@@ -2429,7 +2443,7 @@ function resolveLatinHintRules$1(options) {
|
|
|
2429
2443
|
function resolveLocaleDetectContext(options = {}) {
|
|
2430
2444
|
const latinHint = resolveLatinHint(options);
|
|
2431
2445
|
const latinHintRules = resolveLatinHintRules$1(options);
|
|
2432
|
-
const latinLocales = new Set([DEFAULT_LOCALE]);
|
|
2446
|
+
const latinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE]);
|
|
2433
2447
|
for (const rule of latinHintRules) latinLocales.add(rule.tag);
|
|
2434
2448
|
if (latinHint) latinLocales.add(latinHint);
|
|
2435
2449
|
return {
|
|
@@ -3766,7 +3780,8 @@ function resolveWhatlangWasmModulePath() {
|
|
|
3766
3780
|
}
|
|
3767
3781
|
async function loadWhatlangWasmModule() {
|
|
3768
3782
|
if (!modulePromise) modulePromise = (async () => {
|
|
3769
|
-
|
|
3783
|
+
const modulePath = resolveWhatlangWasmModulePath();
|
|
3784
|
+
return requireFromHere(modulePath);
|
|
3770
3785
|
})();
|
|
3771
3786
|
return modulePromise;
|
|
3772
3787
|
}
|
|
@@ -4968,7 +4983,7 @@ const INSPECT_HELP_LINES = [
|
|
|
4968
4983
|
"",
|
|
4969
4984
|
"Options:",
|
|
4970
4985
|
" -d, --detector <mode> inspect detector mode (wasm, regex) (default: regex)",
|
|
4971
|
-
" --content-gate <mode> content gate mode (default, strict, loose, off) (default: default)",
|
|
4986
|
+
" --content-gate <mode> content gate mode for pipeline policy inspection (default, strict, loose, off) (default: default)",
|
|
4972
4987
|
" --view <view> inspect view (pipeline, engine) (default: pipeline)",
|
|
4973
4988
|
" -f, --format <format> inspect output format (standard, json) (default: standard)",
|
|
4974
4989
|
" --pretty pretty print inspect JSON output",
|
|
@@ -5223,6 +5238,13 @@ function emitConfigNotes$1(notes) {
|
|
|
5223
5238
|
console.error(import_picocolors.default.yellow(warningLine));
|
|
5224
5239
|
}
|
|
5225
5240
|
}
|
|
5241
|
+
function shouldEmitEngineContentGateInfo(validated) {
|
|
5242
|
+
if (validated.view !== "engine" || validated.detector !== "wasm") return false;
|
|
5243
|
+
return validated.sources.contentGate || validated.contentGateMode !== "default";
|
|
5244
|
+
}
|
|
5245
|
+
function emitEngineContentGateInfo() {
|
|
5246
|
+
console.error(import_picocolors.default.cyan("Info: `--content-gate` does not affect `inspect --view engine`; engine view shows raw detector output. Use `--view pipeline` to inspect eligibility and content-gate restrictions."));
|
|
5247
|
+
}
|
|
5226
5248
|
async function executeInspectCommand({ argv, runtime }) {
|
|
5227
5249
|
const parsed = validateInspectInvocation(argv);
|
|
5228
5250
|
if (!parsed.ok) {
|
|
@@ -5254,6 +5276,7 @@ async function executeInspectCommand({ argv, runtime }) {
|
|
|
5254
5276
|
process.exitCode = 1;
|
|
5255
5277
|
return;
|
|
5256
5278
|
}
|
|
5279
|
+
if (shouldEmitEngineContentGateInfo(validated)) emitEngineContentGateInfo();
|
|
5257
5280
|
try {
|
|
5258
5281
|
if (validated.paths.length === 0) {
|
|
5259
5282
|
const input = await loadSingleInspectInput(void 0, validated.textTokens, validated.section);
|
|
@@ -5362,7 +5385,7 @@ function normalizeVersion(value) {
|
|
|
5362
5385
|
return trimmed;
|
|
5363
5386
|
}
|
|
5364
5387
|
function resolvePackageVersion(options = {}) {
|
|
5365
|
-
const embeddedVersion = normalizeVersion(options.embeddedVersion ?? "0.1.6-canary.1");
|
|
5388
|
+
const embeddedVersion = normalizeVersion(options.embeddedVersion ?? "0.1.7-canary.1");
|
|
5366
5389
|
if (embeddedVersion) return embeddedVersion;
|
|
5367
5390
|
const maxLevels = options.maxLevels ?? 8;
|
|
5368
5391
|
const resolveFromPath = options.resolveFromPath ?? resolveVersionFromPath;
|
|
@@ -5560,7 +5583,7 @@ function aggregateSectionedResults(results, preserveCollectorSegments) {
|
|
|
5560
5583
|
existing.items.push(item.result);
|
|
5561
5584
|
}
|
|
5562
5585
|
}
|
|
5563
|
-
const sourceOrder = new Map([["frontmatter", 0], ["content", 1]]);
|
|
5586
|
+
const sourceOrder = /* @__PURE__ */ new Map([["frontmatter", 0], ["content", 1]]);
|
|
5564
5587
|
const items = [...grouped.values()].sort((left, right) => {
|
|
5565
5588
|
const sourceDiff = (sourceOrder.get(left.source) ?? 0) - (sourceOrder.get(right.source) ?? 0);
|
|
5566
5589
|
if (sourceDiff !== 0) return sourceDiff;
|
package/dist/esm/detector.mjs
CHANGED
|
@@ -966,7 +966,8 @@ function resolveWhatlangWasmModulePath() {
|
|
|
966
966
|
}
|
|
967
967
|
async function loadWhatlangWasmModule() {
|
|
968
968
|
if (!modulePromise) modulePromise = (async () => {
|
|
969
|
-
|
|
969
|
+
const modulePath = resolveWhatlangWasmModulePath();
|
|
970
|
+
return requireFromHere(modulePath);
|
|
970
971
|
})();
|
|
971
972
|
return modulePromise;
|
|
972
973
|
}
|
package/dist/esm/index2.d.mts
CHANGED
|
@@ -1,2 +1,2 @@
|
|
|
1
1
|
import { _ as NonWordCollection, a as SectionMode, b as WordCounterOptions, c as appendAll, d as countCharsForLocale, f as countWordsForLocale, h as LatinHintRule, i as ParsedMarkdown, l as wordCounter, n as parseMarkdown, o as SectionedResult, p as segmentTextByLocale, r as FrontmatterType, s as showSingularOrPluralWord, t as countSections, u as DEFAULT_LATIN_HINT_RULES, v as WordCounterBreakdown, x as WordCounterResult, y as WordCounterMode } from "./index.mjs";
|
|
2
|
-
export { DEFAULT_LATIN_HINT_RULES, FrontmatterType, LatinHintRule, NonWordCollection, ParsedMarkdown, SectionMode, SectionedResult, WordCounterBreakdown, WordCounterMode, WordCounterOptions, WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
|
2
|
+
export { DEFAULT_LATIN_HINT_RULES, type FrontmatterType, type LatinHintRule, type NonWordCollection, type ParsedMarkdown, type SectionMode, type SectionedResult, type WordCounterBreakdown, type WordCounterMode, type WordCounterOptions, type WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
package/dist/esm/markdown.mjs
CHANGED
|
@@ -45,7 +45,7 @@ const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
|
|
|
45
45
|
const symbolRegex = /\p{S}/u;
|
|
46
46
|
const punctuationRegex = /\p{P}/u;
|
|
47
47
|
const whitespaceRegex = /\s/u;
|
|
48
|
-
const newlineChars = new Set([
|
|
48
|
+
const newlineChars = /* @__PURE__ */ new Set([
|
|
49
49
|
"\n",
|
|
50
50
|
"\r",
|
|
51
51
|
"\u2028",
|
|
@@ -156,11 +156,16 @@ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
156
156
|
const segmenter = getSegmenter(chunk.locale);
|
|
157
157
|
const segments = [];
|
|
158
158
|
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
159
|
-
for (const part of segmenter.segment(chunk.text))
|
|
160
|
-
else if (collectNonWords && nonWords) {
|
|
161
|
-
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
159
|
+
for (const part of segmenter.segment(chunk.text)) {
|
|
162
160
|
const category = classifyNonWordSegment(part.segment);
|
|
163
|
-
if (category)
|
|
161
|
+
if (category) {
|
|
162
|
+
if (collectNonWords && nonWords) addNonWord(nonWords, category, part.segment);
|
|
163
|
+
continue;
|
|
164
|
+
}
|
|
165
|
+
if (part.isWordLike) segments.push(part.segment);
|
|
166
|
+
else if (collectNonWords && nonWords) {
|
|
167
|
+
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
168
|
+
}
|
|
164
169
|
}
|
|
165
170
|
return {
|
|
166
171
|
locale: chunk.locale,
|
|
@@ -177,6 +182,16 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
177
182
|
let wordChars = 0;
|
|
178
183
|
let nonWordChars = 0;
|
|
179
184
|
for (const part of segmenter.segment(chunk.text)) {
|
|
185
|
+
const category = classifyNonWordSegment(part.segment);
|
|
186
|
+
if (category) {
|
|
187
|
+
if (collectNonWords && nonWords) {
|
|
188
|
+
addNonWord(nonWords, category, part.segment);
|
|
189
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
190
|
+
chars += count;
|
|
191
|
+
nonWordChars += count;
|
|
192
|
+
}
|
|
193
|
+
continue;
|
|
194
|
+
}
|
|
180
195
|
if (part.isWordLike) {
|
|
181
196
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
182
197
|
chars += count;
|
|
@@ -186,9 +201,7 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
186
201
|
if (collectNonWords && nonWords) {
|
|
187
202
|
let whitespaceCount = 0;
|
|
188
203
|
if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
|
|
189
|
-
|
|
190
|
-
if (category) addNonWord(nonWords, category, part.segment);
|
|
191
|
-
if (category || whitespaceCount > 0) {
|
|
204
|
+
if (whitespaceCount > 0) {
|
|
192
205
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
193
206
|
chars += count;
|
|
194
207
|
nonWordChars += count;
|
|
@@ -266,13 +279,13 @@ const MODE_ALIASES = {
|
|
|
266
279
|
characters: "char",
|
|
267
280
|
"char-collector": "char-collector"
|
|
268
281
|
};
|
|
269
|
-
const CHAR_MODE_ALIASES = new Set([
|
|
282
|
+
const CHAR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
270
283
|
"char",
|
|
271
284
|
"chars",
|
|
272
285
|
"character",
|
|
273
286
|
"characters"
|
|
274
287
|
]);
|
|
275
|
-
const COLLECTOR_MODE_ALIASES = new Set([
|
|
288
|
+
const COLLECTOR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
276
289
|
"collector",
|
|
277
290
|
"collect",
|
|
278
291
|
"colle",
|
|
@@ -370,7 +383,7 @@ const regex = {
|
|
|
370
383
|
devanagari: /\p{Script=Devanagari}/u,
|
|
371
384
|
thai: /\p{Script=Thai}/u
|
|
372
385
|
};
|
|
373
|
-
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
386
|
+
const defaultLatinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
374
387
|
function isLatinLocale(locale, context) {
|
|
375
388
|
if (context) return context.latinLocales.has(locale);
|
|
376
389
|
return defaultLatinLocales.has(locale);
|
|
@@ -447,7 +460,7 @@ function resolveLatinHintRules(options) {
|
|
|
447
460
|
function resolveLocaleDetectContext(options = {}) {
|
|
448
461
|
const latinHint = resolveLatinHint(options);
|
|
449
462
|
const latinHintRules = resolveLatinHintRules(options);
|
|
450
|
-
const latinLocales = new Set([DEFAULT_LOCALE]);
|
|
463
|
+
const latinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE]);
|
|
451
464
|
for (const rule of latinHintRules) latinLocales.add(rule.tag);
|
|
452
465
|
if (latinHint) latinLocales.add(latinHint);
|
|
453
466
|
return {
|
package/dist/esm/worker/count-worker.mjs
CHANGED
|
@@ -500,7 +500,7 @@ const keycapEmojiRegex = /[0-9#*]\uFE0F?\u20E3/u;
|
|
|
500
500
|
const symbolRegex = /\p{S}/u;
|
|
501
501
|
const punctuationRegex = /\p{P}/u;
|
|
502
502
|
const whitespaceRegex = /\s/u;
|
|
503
|
-
const newlineChars = new Set([
|
|
503
|
+
const newlineChars = /* @__PURE__ */ new Set([
|
|
504
504
|
"\n",
|
|
505
505
|
"\r",
|
|
506
506
|
"\u2028",
|
|
@@ -611,11 +611,16 @@ function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
611
611
|
const segmenter = getSegmenter(chunk.locale);
|
|
612
612
|
const segments = [];
|
|
613
613
|
const nonWords = collectNonWords ? createNonWordCollection() : null;
|
|
614
|
-
for (const part of segmenter.segment(chunk.text))
|
|
615
|
-
else if (collectNonWords && nonWords) {
|
|
616
|
-
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
614
|
+
for (const part of segmenter.segment(chunk.text)) {
|
|
617
615
|
const category = classifyNonWordSegment(part.segment);
|
|
618
|
-
if (category)
|
|
616
|
+
if (category) {
|
|
617
|
+
if (collectNonWords && nonWords) addNonWord(nonWords, category, part.segment);
|
|
618
|
+
continue;
|
|
619
|
+
}
|
|
620
|
+
if (part.isWordLike) segments.push(part.segment);
|
|
621
|
+
else if (collectNonWords && nonWords) {
|
|
622
|
+
if (includeWhitespace) addWhitespace(nonWords, part.segment);
|
|
623
|
+
}
|
|
619
624
|
}
|
|
620
625
|
return {
|
|
621
626
|
locale: chunk.locale,
|
|
@@ -632,6 +637,16 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
632
637
|
let wordChars = 0;
|
|
633
638
|
let nonWordChars = 0;
|
|
634
639
|
for (const part of segmenter.segment(chunk.text)) {
|
|
640
|
+
const category = classifyNonWordSegment(part.segment);
|
|
641
|
+
if (category) {
|
|
642
|
+
if (collectNonWords && nonWords) {
|
|
643
|
+
addNonWord(nonWords, category, part.segment);
|
|
644
|
+
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
645
|
+
chars += count;
|
|
646
|
+
nonWordChars += count;
|
|
647
|
+
}
|
|
648
|
+
continue;
|
|
649
|
+
}
|
|
635
650
|
if (part.isWordLike) {
|
|
636
651
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
637
652
|
chars += count;
|
|
@@ -641,9 +656,7 @@ function analyzeCharChunk(chunk, collectNonWords, includeWhitespace) {
|
|
|
641
656
|
if (collectNonWords && nonWords) {
|
|
642
657
|
let whitespaceCount = 0;
|
|
643
658
|
if (includeWhitespace) whitespaceCount = addWhitespace(nonWords, part.segment);
|
|
644
|
-
|
|
645
|
-
if (category) addNonWord(nonWords, category, part.segment);
|
|
646
|
-
if (category || whitespaceCount > 0) {
|
|
659
|
+
if (whitespaceCount > 0) {
|
|
647
660
|
const count = countCharsForLocale(part.segment, chunk.locale);
|
|
648
661
|
chars += count;
|
|
649
662
|
nonWordChars += count;
|
|
@@ -721,13 +734,13 @@ const MODE_ALIASES = {
|
|
|
721
734
|
characters: "char",
|
|
722
735
|
"char-collector": "char-collector"
|
|
723
736
|
};
|
|
724
|
-
const CHAR_MODE_ALIASES = new Set([
|
|
737
|
+
const CHAR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
725
738
|
"char",
|
|
726
739
|
"chars",
|
|
727
740
|
"character",
|
|
728
741
|
"characters"
|
|
729
742
|
]);
|
|
730
|
-
const COLLECTOR_MODE_ALIASES = new Set([
|
|
743
|
+
const COLLECTOR_MODE_ALIASES = /* @__PURE__ */ new Set([
|
|
731
744
|
"collector",
|
|
732
745
|
"collect",
|
|
733
746
|
"colle",
|
|
@@ -825,7 +838,7 @@ const regex = {
|
|
|
825
838
|
devanagari: /\p{Script=Devanagari}/u,
|
|
826
839
|
thai: /\p{Script=Thai}/u
|
|
827
840
|
};
|
|
828
|
-
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
841
|
+
const defaultLatinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
829
842
|
function isLatinLocale(locale, context) {
|
|
830
843
|
if (context) return context.latinLocales.has(locale);
|
|
831
844
|
return defaultLatinLocales.has(locale);
|
|
@@ -902,7 +915,7 @@ function resolveLatinHintRules(options) {
|
|
|
902
915
|
function resolveLocaleDetectContext(options = {}) {
|
|
903
916
|
const latinHint = resolveLatinHint(options);
|
|
904
917
|
const latinHintRules = resolveLatinHintRules(options);
|
|
905
|
-
const latinLocales = new Set([DEFAULT_LOCALE]);
|
|
918
|
+
const latinLocales = /* @__PURE__ */ new Set([DEFAULT_LOCALE]);
|
|
906
919
|
for (const rule of latinHintRules) latinLocales.add(rule.tag);
|
|
907
920
|
if (latinHint) latinLocales.add(latinHint);
|
|
908
921
|
return {
|
|
@@ -1921,7 +1934,8 @@ function resolveWhatlangWasmModulePath() {
|
|
|
1921
1934
|
}
|
|
1922
1935
|
async function loadWhatlangWasmModule() {
|
|
1923
1936
|
if (!modulePromise) modulePromise = (async () => {
|
|
1924
|
-
|
|
1937
|
+
const modulePath = resolveWhatlangWasmModulePath();
|
|
1938
|
+
return requireFromHere(modulePath);
|
|
1925
1939
|
})();
|
|
1926
1940
|
return modulePromise;
|
|
1927
1941
|
}
|
package/dist/wasm-language-detector/language_detector.js
CHANGED
|
@@ -14,14 +14,13 @@ function detect_language(text, _route_tag) {
|
|
|
14
14
|
return ret;
|
|
15
15
|
}
|
|
16
16
|
exports.detect_language = detect_language;
|
|
17
|
-
|
|
18
17
|
function __wbg_get_imports() {
|
|
19
18
|
const import0 = {
|
|
20
19
|
__proto__: null,
|
|
21
|
-
|
|
20
|
+
__wbg___wbindgen_throw_344f42d3211c4765: function(arg0, arg1) {
|
|
22
21
|
throw new Error(getStringFromWasm0(arg0, arg1));
|
|
23
22
|
},
|
|
24
|
-
|
|
23
|
+
__wbg_new_da52cf8fe3429cb2: function() {
|
|
25
24
|
const ret = new Object();
|
|
26
25
|
return ret;
|
|
27
26
|
},
|
|
@@ -55,8 +54,7 @@ function __wbg_get_imports() {
|
|
|
55
54
|
}
|
|
56
55
|
|
|
57
56
|
function getStringFromWasm0(ptr, len) {
|
|
58
|
-
|
|
59
|
-
return decodeText(ptr, len);
|
|
57
|
+
return decodeText(ptr >>> 0, len);
|
|
60
58
|
}
|
|
61
59
|
|
|
62
60
|
let cachedUint8ArrayMemory0 = null;
|
|
@@ -128,5 +126,6 @@ let WASM_VECTOR_LEN = 0;
|
|
|
128
126
|
const wasmPath = `${__dirname}/language_detector_bg.wasm`;
|
|
129
127
|
const wasmBytes = require('fs').readFileSync(wasmPath);
|
|
130
128
|
const wasmModule = new WebAssembly.Module(wasmBytes);
|
|
131
|
-
let
|
|
129
|
+
let wasmInstance = new WebAssembly.Instance(wasmModule, __wbg_get_imports());
|
|
130
|
+
let wasm = wasmInstance.exports;
|
|
132
131
|
wasm.__wbindgen_start();
|
package/dist/wasm-language-detector/language_detector_bg.wasm
Binary file
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@dev-pi2pie/word-counter",
|
|
3
|
-
"version": "0.1.6-canary.1",
|
|
3
|
+
"version": "0.1.7-canary.1",
|
|
4
4
|
"keywords": [
|
|
5
5
|
"cli",
|
|
6
6
|
"intl-segmenter",
|
|
@@ -56,24 +56,21 @@
|
|
|
56
56
|
"format:check": "oxfmt --check src test scripts package.json tsconfig.json tsconfig.test.json .oxlintrc.json .oxfmtrc.json"
|
|
57
57
|
},
|
|
58
58
|
"dependencies": {
|
|
59
|
-
"commander": "^
|
|
60
|
-
"yaml": "^2.
|
|
59
|
+
"commander": "^15.0.0",
|
|
60
|
+
"yaml": "^2.9.0"
|
|
61
61
|
},
|
|
62
62
|
"devDependencies": {
|
|
63
|
-
"@types/bun": "^1.3.
|
|
64
|
-
"@types/node": "^
|
|
65
|
-
"oxfmt": "^0.
|
|
66
|
-
"oxlint": "^1.
|
|
63
|
+
"@types/bun": "^1.3.14",
|
|
64
|
+
"@types/node": "^26.1.0",
|
|
65
|
+
"oxfmt": "^0.57.0",
|
|
66
|
+
"oxlint": "^1.72.0",
|
|
67
67
|
"picocolors": "^1.1.1",
|
|
68
|
-
"tsdown": "^0.
|
|
69
|
-
"typescript": "^6.0.
|
|
68
|
+
"tsdown": "^0.22.3",
|
|
69
|
+
"typescript": "^6.0.3"
|
|
70
70
|
},
|
|
71
71
|
"peerDependencies": {
|
|
72
72
|
"typescript": "^5 || ^6"
|
|
73
73
|
},
|
|
74
|
-
"overrides": {
|
|
75
|
-
"picomatch": "4.0.4"
|
|
76
|
-
},
|
|
77
74
|
"engines": {
|
|
78
75
|
"node": ">=22.18.0"
|
|
79
76
|
}
|