@dev-pi2pie/word-counter 0.1.4 → 0.1.5-canary.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +75 -0
- package/dist/cjs/detector.cjs +427 -0
- package/dist/cjs/detector.cjs.map +1 -0
- package/dist/cjs/index.cjs +10 -1257
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/cjs/markdown.cjs +1318 -0
- package/dist/cjs/markdown.cjs.map +1 -0
- package/dist/esm/bin.mjs +966 -298
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/detector.d.mts +37 -0
- package/dist/esm/detector.mjs +412 -0
- package/dist/esm/detector.mjs.map +1 -0
- package/dist/esm/index.d.mts +1 -1
- package/dist/esm/index.mjs +2 -1248
- package/dist/esm/index.mjs.map +1 -1
- package/dist/esm/index2.d.mts +2 -0
- package/dist/esm/markdown.mjs +1229 -0
- package/dist/esm/markdown.mjs.map +1 -0
- package/dist/esm/worker/count-worker.mjs +412 -47
- package/dist/esm/worker/count-worker.mjs.map +1 -1
- package/dist/esm/worker-pool.mjs +6 -3
- package/dist/esm/worker-pool.mjs.map +1 -1
- package/dist/wasm-language-detector/LICENSE +21 -0
- package/dist/wasm-language-detector/language_detector.d.ts +4 -0
- package/dist/wasm-language-detector/language_detector.js +132 -0
- package/dist/wasm-language-detector/language_detector_bg.wasm +0 -0
- package/dist/wasm-language-detector/language_detector_bg.wasm.d.ts +8 -0
- package/dist/wasm-language-detector/package.json +17 -0
- package/package.json +18 -10
|
@@ -1,7 +1,10 @@
|
|
|
1
|
+
import { createRequire } from "node:module";
|
|
1
2
|
import { readFile } from "node:fs/promises";
|
|
2
3
|
import { parentPort, workerData } from "node:worker_threads";
|
|
3
4
|
import { parseDocument } from "yaml";
|
|
4
|
-
|
|
5
|
+
import { existsSync } from "node:fs";
|
|
6
|
+
import { dirname, join } from "node:path";
|
|
7
|
+
import { fileURLToPath } from "node:url";
|
|
5
8
|
//#region src/markdown/toml/arrays.ts
|
|
6
9
|
function ensureArrayContainer(result, key) {
|
|
7
10
|
const existing = result[key];
|
|
@@ -16,7 +19,6 @@ function flattenArrayTables(result) {
|
|
|
16
19
|
result[key] = value.map((entry) => Object.entries(entry).map(([entryKey, entryValue]) => `${entryKey}=${entryValue}`).join(", ")).join(" | ");
|
|
17
20
|
}
|
|
18
21
|
}
|
|
19
|
-
|
|
20
22
|
//#endregion
|
|
21
23
|
//#region src/markdown/toml/keys.ts
|
|
22
24
|
function stripKeyQuotes(key) {
|
|
@@ -35,7 +37,6 @@ function normalizeKeyPath(key) {
|
|
|
35
37
|
if (segments.some((segment) => !segment)) return null;
|
|
36
38
|
return segments.join(".");
|
|
37
39
|
}
|
|
38
|
-
|
|
39
40
|
//#endregion
|
|
40
41
|
//#region src/markdown/toml/strings.ts
|
|
41
42
|
function stripInlineComment(line) {
|
|
@@ -84,7 +85,6 @@ function parseStringLiteral(value) {
|
|
|
84
85
|
if (value.startsWith("'") && value.endsWith("'")) return value.slice(1, -1);
|
|
85
86
|
return null;
|
|
86
87
|
}
|
|
87
|
-
|
|
88
88
|
//#endregion
|
|
89
89
|
//#region src/markdown/toml/values.ts
|
|
90
90
|
function parsePrimitive(raw) {
|
|
@@ -242,7 +242,6 @@ function toPlainText(value) {
|
|
|
242
242
|
if (Array.isArray(value)) return value.map((item) => String(item)).join(", ");
|
|
243
243
|
return String(value);
|
|
244
244
|
}
|
|
245
|
-
|
|
246
245
|
//#endregion
|
|
247
246
|
//#region src/markdown/toml/parse-frontmatter.ts
|
|
248
247
|
function parseTomlFrontmatter(frontmatter) {
|
|
@@ -326,7 +325,6 @@ function parseTomlFrontmatter(frontmatter) {
|
|
|
326
325
|
flattenArrayTables(result);
|
|
327
326
|
return result;
|
|
328
327
|
}
|
|
329
|
-
|
|
330
328
|
//#endregion
|
|
331
329
|
//#region src/markdown/parse-markdown.ts
|
|
332
330
|
const FENCE_TO_TYPE = {
|
|
@@ -461,7 +459,6 @@ function parseMarkdown(input) {
|
|
|
461
459
|
frontmatterType: openingType
|
|
462
460
|
};
|
|
463
461
|
}
|
|
464
|
-
|
|
465
462
|
//#endregion
|
|
466
463
|
//#region src/wc/segmenter.ts
|
|
467
464
|
const segmenterCache = /* @__PURE__ */ new Map();
|
|
@@ -490,13 +487,11 @@ function countCharsForLocale(text, locale) {
|
|
|
490
487
|
for (const _segment of segmenter.segment(text)) count++;
|
|
491
488
|
return count;
|
|
492
489
|
}
|
|
493
|
-
|
|
494
490
|
//#endregion
|
|
495
491
|
//#region src/utils/append-all.ts
|
|
496
492
|
function appendAll(target, source) {
|
|
497
493
|
for (const item of source) target.push(item);
|
|
498
494
|
}
|
|
499
|
-
|
|
500
495
|
//#endregion
|
|
501
496
|
//#region src/wc/non-words.ts
|
|
502
497
|
const emojiRegex = /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})/u;
|
|
@@ -610,7 +605,6 @@ function createWhitespaceCounts() {
|
|
|
610
605
|
other: 0
|
|
611
606
|
};
|
|
612
607
|
}
|
|
613
|
-
|
|
614
608
|
//#endregion
|
|
615
609
|
//#region src/wc/analyze.ts
|
|
616
610
|
function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
|
|
@@ -710,7 +704,6 @@ function aggregateByLocale(chunks) {
|
|
|
710
704
|
}
|
|
711
705
|
return order.map((locale) => map.get(locale));
|
|
712
706
|
}
|
|
713
|
-
|
|
714
707
|
//#endregion
|
|
715
708
|
//#region src/wc/mode.ts
|
|
716
709
|
const MODE_ALIASES = {
|
|
@@ -778,10 +771,7 @@ function normalizeMode(input) {
|
|
|
778
771
|
function resolveMode(input, fallback = "chunk") {
|
|
779
772
|
return normalizeMode(input) ?? fallback;
|
|
780
773
|
}
|
|
781
|
-
|
|
782
|
-
//#endregion
|
|
783
|
-
//#region src/wc/latin-hints.ts
|
|
784
|
-
const DEFAULT_LATIN_HINT_RULES_SOURCE = [
|
|
774
|
+
const DEFAULT_LATIN_HINT_RULES = Object.freeze([
|
|
785
775
|
{
|
|
786
776
|
tag: "de",
|
|
787
777
|
pattern: "[äöüÄÖÜß]"
|
|
@@ -818,9 +808,7 @@ const DEFAULT_LATIN_HINT_RULES_SOURCE = [
|
|
|
818
808
|
tag: "is",
|
|
819
809
|
pattern: "[ðÐþÞ]"
|
|
820
810
|
}
|
|
821
|
-
];
|
|
822
|
-
const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
|
|
823
|
-
|
|
811
|
+
].map((rule) => Object.freeze({ ...rule })));
|
|
824
812
|
//#endregion
|
|
825
813
|
//#region src/wc/locale-detect.ts
|
|
826
814
|
const DEFAULT_LOCALE = "und-Latn";
|
|
@@ -940,18 +928,17 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
|
|
|
940
928
|
if (regex.thai.test(char)) return "th";
|
|
941
929
|
if (regex.han.test(char)) {
|
|
942
930
|
if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
943
|
-
return context.hanHint ?? DEFAULT_HAN_TAG;
|
|
931
|
+
return context.hanHint ?? "und-Hani";
|
|
944
932
|
}
|
|
945
933
|
if (regex.latin.test(char)) {
|
|
946
934
|
const hintedLocale = detectLatinLocale(char, context);
|
|
947
|
-
if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
|
|
948
|
-
if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
935
|
+
if (hintedLocale !== "und-Latn") return hintedLocale;
|
|
936
|
+
if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== "und-Latn") return previousLocale;
|
|
949
937
|
if (context.latinHint) return context.latinHint;
|
|
950
938
|
return DEFAULT_LOCALE;
|
|
951
939
|
}
|
|
952
940
|
return null;
|
|
953
941
|
}
|
|
954
|
-
|
|
955
942
|
//#endregion
|
|
956
943
|
//#region src/wc/segment.ts
|
|
957
944
|
const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
|
|
@@ -988,7 +975,7 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
988
975
|
continue;
|
|
989
976
|
}
|
|
990
977
|
if (targetLocale !== currentLocale && detected !== null) {
|
|
991
|
-
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
|
|
978
|
+
if (currentLocale === "und-Latn" && isLatinLocale(targetLocale, context)) {
|
|
992
979
|
const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
|
|
993
980
|
if (promotionBreakIndex === -1) {
|
|
994
981
|
currentLocale = targetLocale;
|
|
@@ -1055,7 +1042,6 @@ function mergeAdjacentChunks(chunks) {
|
|
|
1055
1042
|
merged.push(last);
|
|
1056
1043
|
return merged;
|
|
1057
1044
|
}
|
|
1058
|
-
|
|
1059
1045
|
//#endregion
|
|
1060
1046
|
//#region src/wc/wc.ts
|
|
1061
1047
|
function wordCounter(text, options = {}) {
|
|
@@ -1109,11 +1095,11 @@ function wordCounter(text, options = {}) {
|
|
|
1109
1095
|
const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
|
|
1110
1096
|
const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
|
|
1111
1097
|
if (!chunk.nonWords) return sum;
|
|
1112
|
-
return sum + getNonWordTotal(chunk.nonWords);
|
|
1098
|
+
return sum + getNonWordTotal$1(chunk.nonWords);
|
|
1113
1099
|
}, 0) : 0;
|
|
1114
1100
|
const total = analyzed.reduce((sum, chunk) => {
|
|
1115
1101
|
let chunkTotal = chunk.words;
|
|
1116
|
-
if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
|
|
1102
|
+
if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal$1(chunk.nonWords);
|
|
1117
1103
|
return sum + chunkTotal;
|
|
1118
1104
|
}, 0);
|
|
1119
1105
|
const counts = collectNonWords ? {
|
|
@@ -1141,7 +1127,7 @@ function wordCounter(text, options = {}) {
|
|
|
1141
1127
|
breakdown: {
|
|
1142
1128
|
mode,
|
|
1143
1129
|
items: aggregateByLocale(analyzed),
|
|
1144
|
-
nonWords: collectNonWordsAggregate(analyzed, collectNonWords)
|
|
1130
|
+
nonWords: collectNonWordsAggregate$1(analyzed, collectNonWords)
|
|
1145
1131
|
}
|
|
1146
1132
|
};
|
|
1147
1133
|
return {
|
|
@@ -1158,10 +1144,10 @@ function wordCounter(text, options = {}) {
|
|
|
1158
1144
|
}
|
|
1159
1145
|
};
|
|
1160
1146
|
}
|
|
1161
|
-
function getNonWordTotal(nonWords) {
|
|
1147
|
+
function getNonWordTotal$1(nonWords) {
|
|
1162
1148
|
return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
|
|
1163
1149
|
}
|
|
1164
|
-
function collectNonWordsAggregate(analyzed, enabled) {
|
|
1150
|
+
function collectNonWordsAggregate$1(analyzed, enabled) {
|
|
1165
1151
|
if (!enabled) return;
|
|
1166
1152
|
const collection = createNonWordCollection();
|
|
1167
1153
|
for (const chunk of analyzed) {
|
|
@@ -1170,14 +1156,12 @@ function collectNonWordsAggregate(analyzed, enabled) {
|
|
|
1170
1156
|
}
|
|
1171
1157
|
return collection;
|
|
1172
1158
|
}
|
|
1173
|
-
|
|
1174
1159
|
//#endregion
|
|
1175
1160
|
//#region src/wc/index.ts
|
|
1176
1161
|
var wc_default = wordCounter;
|
|
1177
|
-
|
|
1178
1162
|
//#endregion
|
|
1179
1163
|
//#region src/markdown/section-count.ts
|
|
1180
|
-
function normalizeText(value) {
|
|
1164
|
+
function normalizeText$1(value) {
|
|
1181
1165
|
if (value == null) return "";
|
|
1182
1166
|
if (typeof value === "string") return value;
|
|
1183
1167
|
if (typeof value === "number" || typeof value === "boolean") return String(value);
|
|
@@ -1187,10 +1171,10 @@ function normalizeText(value) {
|
|
|
1187
1171
|
return String(value);
|
|
1188
1172
|
}
|
|
1189
1173
|
}
|
|
1190
|
-
function buildPerKeyItems(data, mode, options) {
|
|
1174
|
+
function buildPerKeyItems$1(data, mode, options) {
|
|
1191
1175
|
if (!data || typeof data !== "object" || Array.isArray(data)) return [];
|
|
1192
1176
|
return Object.entries(data).map(([key, value]) => {
|
|
1193
|
-
const valueText = normalizeText(value);
|
|
1177
|
+
const valueText = normalizeText$1(value);
|
|
1194
1178
|
return {
|
|
1195
1179
|
name: key,
|
|
1196
1180
|
source: "frontmatter",
|
|
@@ -1198,14 +1182,14 @@ function buildPerKeyItems(data, mode, options) {
|
|
|
1198
1182
|
};
|
|
1199
1183
|
});
|
|
1200
1184
|
}
|
|
1201
|
-
function buildSingleItem(name, text, mode, options, source) {
|
|
1185
|
+
function buildSingleItem$1(name, text, mode, options, source) {
|
|
1202
1186
|
return [{
|
|
1203
1187
|
name,
|
|
1204
1188
|
source,
|
|
1205
1189
|
result: wc_default(text, options)
|
|
1206
1190
|
}];
|
|
1207
1191
|
}
|
|
1208
|
-
function sumTotals(items) {
|
|
1192
|
+
function sumTotals$1(items) {
|
|
1209
1193
|
return items.reduce((sum, item) => sum + item.result.total, 0);
|
|
1210
1194
|
}
|
|
1211
1195
|
function countSections(input, section, options = {}) {
|
|
@@ -1227,11 +1211,188 @@ function countSections(input, section, options = {}) {
|
|
|
1227
1211
|
const frontmatterText = parsed.frontmatter ?? "";
|
|
1228
1212
|
const contentText = parsed.content ?? "";
|
|
1229
1213
|
let items = [];
|
|
1230
|
-
if (section === "frontmatter") items = buildSingleItem("frontmatter", frontmatterText, mode, options, "frontmatter");
|
|
1231
|
-
else if (section === "content") items = buildSingleItem("content", contentText, mode, options, "content");
|
|
1232
|
-
else if (section === "split") items = [...buildSingleItem("frontmatter", frontmatterText, mode, options, "frontmatter"), ...buildSingleItem("content", contentText, mode, options, "content")];
|
|
1233
|
-
else if (section === "per-key") items = buildPerKeyItems(parsed.data, mode, options);
|
|
1234
|
-
else if (section === "split-per-key") items = [...buildPerKeyItems(parsed.data, mode, options), ...buildSingleItem("content", contentText, mode, options, "content")];
|
|
1214
|
+
if (section === "frontmatter") items = buildSingleItem$1("frontmatter", frontmatterText, mode, options, "frontmatter");
|
|
1215
|
+
else if (section === "content") items = buildSingleItem$1("content", contentText, mode, options, "content");
|
|
1216
|
+
else if (section === "split") items = [...buildSingleItem$1("frontmatter", frontmatterText, mode, options, "frontmatter"), ...buildSingleItem$1("content", contentText, mode, options, "content")];
|
|
1217
|
+
else if (section === "per-key") items = buildPerKeyItems$1(parsed.data, mode, options);
|
|
1218
|
+
else if (section === "split-per-key") items = [...buildPerKeyItems$1(parsed.data, mode, options), ...buildSingleItem$1("content", contentText, mode, options, "content")];
|
|
1219
|
+
return {
|
|
1220
|
+
section,
|
|
1221
|
+
total: sumTotals$1(items),
|
|
1222
|
+
frontmatterType: parsed.frontmatterType,
|
|
1223
|
+
items
|
|
1224
|
+
};
|
|
1225
|
+
}
|
|
1226
|
+
//#endregion
|
|
1227
|
+
//#region src/detector/none.ts
|
|
1228
|
+
async function wordCounterWithRegexDetector(text, options = {}) {
|
|
1229
|
+
return wc_default(text, options);
|
|
1230
|
+
}
|
|
1231
|
+
async function countSectionsWithRegexDetector(input, section, options = {}) {
|
|
1232
|
+
return countSections(input, section, options);
|
|
1233
|
+
}
|
|
1234
|
+
//#endregion
|
|
1235
|
+
//#region src/detector/result-builder.ts
|
|
1236
|
+
function getNonWordTotal(nonWords) {
|
|
1237
|
+
return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
|
|
1238
|
+
}
|
|
1239
|
+
function collectNonWordsAggregate(analyzed, enabled) {
|
|
1240
|
+
if (!enabled) return;
|
|
1241
|
+
const collection = createNonWordCollection();
|
|
1242
|
+
for (const chunk of analyzed) {
|
|
1243
|
+
if (!chunk.nonWords) continue;
|
|
1244
|
+
mergeNonWordCollections(collection, chunk.nonWords);
|
|
1245
|
+
}
|
|
1246
|
+
return collection;
|
|
1247
|
+
}
|
|
1248
|
+
function buildWordCounterResultFromChunks(chunks, options = {}) {
|
|
1249
|
+
const mode = resolveMode(options.mode, "chunk");
|
|
1250
|
+
const collectNonWords = Boolean(options.nonWords);
|
|
1251
|
+
const includeWhitespace = Boolean(options.includeWhitespace);
|
|
1252
|
+
if (mode === "char" || mode === "char-collector") {
|
|
1253
|
+
const analyzed = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
|
|
1254
|
+
const total = analyzed.reduce((sum, chunk) => sum + chunk.chars, 0);
|
|
1255
|
+
const counts = collectNonWords ? {
|
|
1256
|
+
words: analyzed.reduce((sum, chunk) => sum + chunk.wordChars, 0),
|
|
1257
|
+
nonWords: analyzed.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
|
|
1258
|
+
total
|
|
1259
|
+
} : void 0;
|
|
1260
|
+
if (mode === "char") return {
|
|
1261
|
+
total,
|
|
1262
|
+
counts,
|
|
1263
|
+
breakdown: {
|
|
1264
|
+
mode,
|
|
1265
|
+
items: analyzed.map((chunk) => ({
|
|
1266
|
+
locale: chunk.locale,
|
|
1267
|
+
text: chunk.text,
|
|
1268
|
+
chars: chunk.chars,
|
|
1269
|
+
nonWords: chunk.nonWords
|
|
1270
|
+
}))
|
|
1271
|
+
}
|
|
1272
|
+
};
|
|
1273
|
+
return {
|
|
1274
|
+
total,
|
|
1275
|
+
counts,
|
|
1276
|
+
breakdown: {
|
|
1277
|
+
mode,
|
|
1278
|
+
items: aggregateCharsByLocale(analyzed).map((chunk) => ({
|
|
1279
|
+
locale: chunk.locale,
|
|
1280
|
+
chars: chunk.chars,
|
|
1281
|
+
nonWords: chunk.nonWords
|
|
1282
|
+
}))
|
|
1283
|
+
}
|
|
1284
|
+
};
|
|
1285
|
+
}
|
|
1286
|
+
const analyzed = chunks.map((chunk) => analyzeChunk(chunk, collectNonWords, includeWhitespace));
|
|
1287
|
+
const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
|
|
1288
|
+
const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
|
|
1289
|
+
if (!chunk.nonWords) return sum;
|
|
1290
|
+
return sum + getNonWordTotal(chunk.nonWords);
|
|
1291
|
+
}, 0) : 0;
|
|
1292
|
+
const total = analyzed.reduce((sum, chunk) => {
|
|
1293
|
+
let chunkTotal = chunk.words;
|
|
1294
|
+
if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
|
|
1295
|
+
return sum + chunkTotal;
|
|
1296
|
+
}, 0);
|
|
1297
|
+
const counts = collectNonWords ? {
|
|
1298
|
+
words: wordsTotal,
|
|
1299
|
+
nonWords: nonWordsTotal,
|
|
1300
|
+
total
|
|
1301
|
+
} : void 0;
|
|
1302
|
+
if (mode === "segments") return {
|
|
1303
|
+
total,
|
|
1304
|
+
counts,
|
|
1305
|
+
breakdown: {
|
|
1306
|
+
mode,
|
|
1307
|
+
items: analyzed.map((chunk) => ({
|
|
1308
|
+
locale: chunk.locale,
|
|
1309
|
+
text: chunk.text,
|
|
1310
|
+
words: chunk.words,
|
|
1311
|
+
segments: chunk.segments,
|
|
1312
|
+
nonWords: chunk.nonWords
|
|
1313
|
+
}))
|
|
1314
|
+
}
|
|
1315
|
+
};
|
|
1316
|
+
if (mode === "collector") return {
|
|
1317
|
+
total,
|
|
1318
|
+
counts,
|
|
1319
|
+
breakdown: {
|
|
1320
|
+
mode,
|
|
1321
|
+
items: aggregateByLocale(analyzed),
|
|
1322
|
+
nonWords: collectNonWordsAggregate(analyzed, collectNonWords)
|
|
1323
|
+
}
|
|
1324
|
+
};
|
|
1325
|
+
return {
|
|
1326
|
+
total,
|
|
1327
|
+
counts,
|
|
1328
|
+
breakdown: {
|
|
1329
|
+
mode,
|
|
1330
|
+
items: analyzed.map((chunk) => ({
|
|
1331
|
+
locale: chunk.locale,
|
|
1332
|
+
text: chunk.text,
|
|
1333
|
+
words: chunk.words,
|
|
1334
|
+
nonWords: chunk.nonWords
|
|
1335
|
+
}))
|
|
1336
|
+
}
|
|
1337
|
+
};
|
|
1338
|
+
}
|
|
1339
|
+
//#endregion
|
|
1340
|
+
//#region src/detector/sections.ts
|
|
1341
|
+
function normalizeText(value) {
|
|
1342
|
+
if (value == null) return "";
|
|
1343
|
+
if (typeof value === "string") return value;
|
|
1344
|
+
if (typeof value === "number" || typeof value === "boolean") return String(value);
|
|
1345
|
+
try {
|
|
1346
|
+
return JSON.stringify(value);
|
|
1347
|
+
} catch {
|
|
1348
|
+
return String(value);
|
|
1349
|
+
}
|
|
1350
|
+
}
|
|
1351
|
+
async function buildPerKeyItems(data, options) {
|
|
1352
|
+
if (!data || typeof data !== "object" || Array.isArray(data)) return [];
|
|
1353
|
+
return Promise.all(Object.entries(data).map(async ([key, value]) => {
|
|
1354
|
+
const valueText = normalizeText(value);
|
|
1355
|
+
return {
|
|
1356
|
+
name: key,
|
|
1357
|
+
source: "frontmatter",
|
|
1358
|
+
result: await wordCounterWithDetector(valueText ? `${key}: ${valueText}` : key, options)
|
|
1359
|
+
};
|
|
1360
|
+
}));
|
|
1361
|
+
}
|
|
1362
|
+
async function buildSingleItem(name, text, options, source) {
|
|
1363
|
+
return [{
|
|
1364
|
+
name,
|
|
1365
|
+
source,
|
|
1366
|
+
result: await wordCounterWithDetector(text, options)
|
|
1367
|
+
}];
|
|
1368
|
+
}
|
|
1369
|
+
function sumTotals(items) {
|
|
1370
|
+
return items.reduce((sum, item) => sum + item.result.total, 0);
|
|
1371
|
+
}
|
|
1372
|
+
async function countSectionsWithResolvedDetector(input, section, options = {}) {
|
|
1373
|
+
options.mode;
|
|
1374
|
+
if (section === "all") {
|
|
1375
|
+
const result = await wordCounterWithDetector(input, options);
|
|
1376
|
+
return {
|
|
1377
|
+
section,
|
|
1378
|
+
total: result.total,
|
|
1379
|
+
frontmatterType: null,
|
|
1380
|
+
items: [{
|
|
1381
|
+
name: "all",
|
|
1382
|
+
source: "content",
|
|
1383
|
+
result
|
|
1384
|
+
}]
|
|
1385
|
+
};
|
|
1386
|
+
}
|
|
1387
|
+
const parsed = parseMarkdown(input);
|
|
1388
|
+
const frontmatterText = parsed.frontmatter ?? "";
|
|
1389
|
+
const contentText = parsed.content ?? "";
|
|
1390
|
+
let items = [];
|
|
1391
|
+
if (section === "frontmatter") items = await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter");
|
|
1392
|
+
else if (section === "content") items = await buildSingleItem("content", contentText, options, "content");
|
|
1393
|
+
else if (section === "split") items = [...await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter"), ...await buildSingleItem("content", contentText, options, "content")];
|
|
1394
|
+
else if (section === "per-key") items = await buildPerKeyItems(parsed.data, options);
|
|
1395
|
+
else if (section === "split-per-key") items = [...await buildPerKeyItems(parsed.data, options), ...await buildSingleItem("content", contentText, options, "content")];
|
|
1235
1396
|
return {
|
|
1236
1397
|
section,
|
|
1237
1398
|
total: sumTotals(items),
|
|
@@ -1239,7 +1400,207 @@ function countSections(input, section, options = {}) {
|
|
|
1239
1400
|
items
|
|
1240
1401
|
};
|
|
1241
1402
|
}
|
|
1242
|
-
|
|
1403
|
+
const LATIN_WASM_MIN_CONFIDENCE = .75;
|
|
1404
|
+
const HANI_WASM_MIN_CONFIDENCE = .9;
|
|
1405
|
+
const LATIN_SCRIPT_REGEX = /\p{Script=Latin}/u;
|
|
1406
|
+
const HAN_SCRIPT_REGEX = /\p{Script=Han}/u;
|
|
1407
|
+
const DETECTOR_ROUTE_POLICIES = {
|
|
1408
|
+
[DEFAULT_LOCALE]: {
|
|
1409
|
+
routeTag: DEFAULT_LOCALE,
|
|
1410
|
+
minScriptChars: 24,
|
|
1411
|
+
minConfidence: LATIN_WASM_MIN_CONFIDENCE,
|
|
1412
|
+
requireReliable: true
|
|
1413
|
+
},
|
|
1414
|
+
[DEFAULT_HAN_TAG]: {
|
|
1415
|
+
routeTag: DEFAULT_HAN_TAG,
|
|
1416
|
+
minScriptChars: 12,
|
|
1417
|
+
minConfidence: HANI_WASM_MIN_CONFIDENCE,
|
|
1418
|
+
requireReliable: true
|
|
1419
|
+
}
|
|
1420
|
+
};
|
|
1421
|
+
function isAmbiguousDetectorRoute(locale) {
|
|
1422
|
+
return locale === "und-Latn" || locale === "und-Hani";
|
|
1423
|
+
}
|
|
1424
|
+
function countScriptBearingCharsForRoute(text, routeTag) {
|
|
1425
|
+
const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
|
|
1426
|
+
let count = 0;
|
|
1427
|
+
for (const char of text) if (matcher.test(char)) count += 1;
|
|
1428
|
+
return count;
|
|
1429
|
+
}
|
|
1430
|
+
function shouldRunWasmDetector(text, routeTag) {
|
|
1431
|
+
const policy = DETECTOR_ROUTE_POLICIES[routeTag];
|
|
1432
|
+
return countScriptBearingCharsForRoute(text, routeTag) >= policy.minScriptChars;
|
|
1433
|
+
}
|
|
1434
|
+
function normalizeDetectorSampleForRoute(text, routeTag) {
|
|
1435
|
+
const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
|
|
1436
|
+
return [...text].map((char) => {
|
|
1437
|
+
if (matcher.test(char)) return char;
|
|
1438
|
+
if (/\s/u.test(char)) return " ";
|
|
1439
|
+
return " ";
|
|
1440
|
+
}).join("").replace(/\s+/g, " ").trim();
|
|
1441
|
+
}
|
|
1442
|
+
//#endregion
|
|
1443
|
+
//#region src/detector/whatlang-wasm.ts
|
|
1444
|
+
const GENERATED_FOLDER_NAME = "wasm-language-detector";
|
|
1445
|
+
const GENERATED_MODULE_FILE = "language_detector.js";
|
|
1446
|
+
const MAX_SEARCH_DEPTH = 8;
|
|
1447
|
+
const requireFromHere = createRequire(import.meta.url);
|
|
1448
|
+
const WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE = "WASM detector runtime is unavailable. Run `bun run build:wasm` to generate it.";
|
|
1449
|
+
let modulePromise = null;
|
|
1450
|
+
function resolveCandidateModulePaths() {
|
|
1451
|
+
const moduleDir = dirname(fileURLToPath(import.meta.url));
|
|
1452
|
+
const candidates = /* @__PURE__ */ new Set();
|
|
1453
|
+
let currentDir = moduleDir;
|
|
1454
|
+
for (let depth = 0; depth < MAX_SEARCH_DEPTH; depth += 1) {
|
|
1455
|
+
candidates.add(join(currentDir, GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE));
|
|
1456
|
+
candidates.add(join(currentDir, "generated", GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE));
|
|
1457
|
+
const parentDir = dirname(currentDir);
|
|
1458
|
+
if (parentDir === currentDir) break;
|
|
1459
|
+
currentDir = parentDir;
|
|
1460
|
+
}
|
|
1461
|
+
return [...candidates];
|
|
1462
|
+
}
|
|
1463
|
+
function resolveWhatlangWasmModulePath() {
|
|
1464
|
+
for (const candidate of resolveCandidateModulePaths()) if (existsSync(candidate)) return candidate;
|
|
1465
|
+
throw new Error(WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE);
|
|
1466
|
+
}
|
|
1467
|
+
async function loadWhatlangWasmModule() {
|
|
1468
|
+
if (!modulePromise) modulePromise = (async () => {
|
|
1469
|
+
return requireFromHere(resolveWhatlangWasmModulePath());
|
|
1470
|
+
})();
|
|
1471
|
+
return modulePromise;
|
|
1472
|
+
}
|
|
1473
|
+
async function detectWithWhatlangWasm(text, routeTag) {
|
|
1474
|
+
return (await loadWhatlangWasmModule()).detect_language(text, routeTag);
|
|
1475
|
+
}
|
|
1476
|
+
//#endregion
|
|
1477
|
+
//#region src/detector/whatlang-map.ts
|
|
1478
|
+
const LATIN_LANGUAGE_TAGS = {
|
|
1479
|
+
cat: "ca",
|
|
1480
|
+
ces: "cs",
|
|
1481
|
+
dan: "da",
|
|
1482
|
+
deu: "de",
|
|
1483
|
+
eng: "en",
|
|
1484
|
+
fin: "fi",
|
|
1485
|
+
fra: "fr",
|
|
1486
|
+
hun: "hu",
|
|
1487
|
+
ita: "it",
|
|
1488
|
+
lat: "la",
|
|
1489
|
+
nld: "nl",
|
|
1490
|
+
pol: "pl",
|
|
1491
|
+
por: "pt",
|
|
1492
|
+
ron: "ro",
|
|
1493
|
+
spa: "es",
|
|
1494
|
+
swe: "sv",
|
|
1495
|
+
tur: "tr"
|
|
1496
|
+
};
|
|
1497
|
+
const HANI_LANGUAGE_TAGS = {
|
|
1498
|
+
cmn: "zh",
|
|
1499
|
+
jpn: "ja"
|
|
1500
|
+
};
|
|
1501
|
+
function hasSupportedScript(result, routeTag) {
|
|
1502
|
+
if (routeTag === "und-Latn") return result.script === "Latin";
|
|
1503
|
+
return result.script === "Mandarin";
|
|
1504
|
+
}
|
|
1505
|
+
function remapLanguageTag(lang, routeTag) {
|
|
1506
|
+
if (routeTag === "und-Latn") return LATIN_LANGUAGE_TAGS[lang];
|
|
1507
|
+
return HANI_LANGUAGE_TAGS[lang];
|
|
1508
|
+
}
|
|
1509
|
+
function remapWhatlangResult(result, routeTag) {
|
|
1510
|
+
if (!hasSupportedScript(result, routeTag)) return null;
|
|
1511
|
+
const tag = remapLanguageTag(result.lang, routeTag);
|
|
1512
|
+
if (!tag) return null;
|
|
1513
|
+
return {
|
|
1514
|
+
tag,
|
|
1515
|
+
confidence: result.confidence,
|
|
1516
|
+
reliable: result.reliable,
|
|
1517
|
+
source: "wasm"
|
|
1518
|
+
};
|
|
1519
|
+
}
|
|
1520
|
+
function getDetectorFallbackTag(routeTag) {
|
|
1521
|
+
return routeTag === "und-Hani" ? DEFAULT_HAN_TAG : DEFAULT_LOCALE;
|
|
1522
|
+
}
|
|
1523
|
+
//#endregion
|
|
1524
|
+
//#region src/detector/wasm.ts
|
|
1525
|
+
function shouldAcceptDetectorTag(routeTag, confidence, reliable) {
|
|
1526
|
+
const policy = DETECTOR_ROUTE_POLICIES[routeTag];
|
|
1527
|
+
if (policy.requireReliable && reliable !== true) return false;
|
|
1528
|
+
if (confidence === void 0) return false;
|
|
1529
|
+
return confidence >= policy.minConfidence;
|
|
1530
|
+
}
|
|
1531
|
+
function buildDetectorWindows(chunks) {
|
|
1532
|
+
const windows = [];
|
|
1533
|
+
for (let index = 0; index < chunks.length; index += 1) {
|
|
1534
|
+
const chunk = chunks[index];
|
|
1535
|
+
if (!chunk || !isAmbiguousDetectorRoute(chunk.locale)) continue;
|
|
1536
|
+
const previousWindow = windows[windows.length - 1];
|
|
1537
|
+
if (previousWindow && previousWindow.routeTag === chunk.locale && previousWindow.endIndex === index - 1) {
|
|
1538
|
+
previousWindow.endIndex = index;
|
|
1539
|
+
previousWindow.text += chunk.text;
|
|
1540
|
+
continue;
|
|
1541
|
+
}
|
|
1542
|
+
windows.push({
|
|
1543
|
+
routeTag: chunk.locale,
|
|
1544
|
+
startIndex: index,
|
|
1545
|
+
endIndex: index,
|
|
1546
|
+
text: chunk.text
|
|
1547
|
+
});
|
|
1548
|
+
}
|
|
1549
|
+
return windows;
|
|
1550
|
+
}
|
|
1551
|
+
async function resolveWindowLocale(window) {
|
|
1552
|
+
if (!shouldRunWasmDetector(window.text, window.routeTag)) return window.routeTag;
|
|
1553
|
+
const rawResult = await detectWithWhatlangWasm(window.text, window.routeTag);
|
|
1554
|
+
const rawRemapped = rawResult ? remapWhatlangResult(rawResult, window.routeTag) : null;
|
|
1555
|
+
const normalizedSample = normalizeDetectorSampleForRoute(window.text, window.routeTag);
|
|
1556
|
+
const normalizedResult = normalizedSample.length > 0 && normalizedSample !== window.text ? await detectWithWhatlangWasm(normalizedSample, window.routeTag) : null;
|
|
1557
|
+
const normalizedRemapped = normalizedResult ? remapWhatlangResult(normalizedResult, window.routeTag) : null;
|
|
1558
|
+
const candidates = [rawRemapped, normalizedRemapped].filter((value) => value !== null);
|
|
1559
|
+
if (candidates.length === 0) return getDetectorFallbackTag(window.routeTag);
|
|
1560
|
+
const strongestCandidate = candidates.reduce((best, current) => {
|
|
1561
|
+
if (!best) return current;
|
|
1562
|
+
return (current.confidence ?? 0) > (best.confidence ?? 0) ? current : best;
|
|
1563
|
+
}, candidates[0]);
|
|
1564
|
+
if (strongestCandidate && shouldAcceptDetectorTag(window.routeTag, strongestCandidate.confidence, strongestCandidate.reliable)) return strongestCandidate.tag;
|
|
1565
|
+
if (window.routeTag === "und-Latn" && rawRemapped && normalizedRemapped && rawRemapped.tag === normalizedRemapped.tag) {
|
|
1566
|
+
if (Math.max(rawRemapped.confidence ?? 0, normalizedRemapped.confidence ?? 0) >= .7) return rawRemapped.tag;
|
|
1567
|
+
}
|
|
1568
|
+
return getDetectorFallbackTag(window.routeTag);
|
|
1569
|
+
}
|
|
1570
|
+
async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
|
|
1571
|
+
const chunks = segmentTextByLocale(text, options);
|
|
1572
|
+
const resolved = [...chunks];
|
|
1573
|
+
const windows = buildDetectorWindows(chunks);
|
|
1574
|
+
for (const window of windows) {
|
|
1575
|
+
const resolvedLocale = await resolveWindowLocale(window);
|
|
1576
|
+
for (let index = window.startIndex; index <= window.endIndex; index += 1) {
|
|
1577
|
+
const chunk = resolved[index];
|
|
1578
|
+
if (!chunk) continue;
|
|
1579
|
+
resolved[index] = {
|
|
1580
|
+
...chunk,
|
|
1581
|
+
locale: resolvedLocale
|
|
1582
|
+
};
|
|
1583
|
+
}
|
|
1584
|
+
}
|
|
1585
|
+
return resolved;
|
|
1586
|
+
}
|
|
1587
|
+
async function wordCounterWithWasmDetector(text, options = {}) {
|
|
1588
|
+
return buildWordCounterResultFromChunks(await segmentTextByLocaleWithWasmDetector(text, options), options);
|
|
1589
|
+
}
|
|
1590
|
+
async function countSectionsWithWasmDetector(input, section, options = {}) {
|
|
1591
|
+
return countSectionsWithResolvedDetector(input, section, options);
|
|
1592
|
+
}
|
|
1593
|
+
function resolveDetectorMode(mode) {
|
|
1594
|
+
return mode ?? "regex";
|
|
1595
|
+
}
|
|
1596
|
+
async function wordCounterWithDetector(text, options = {}) {
|
|
1597
|
+
if (resolveDetectorMode(options.detector) === "wasm") return wordCounterWithWasmDetector(text, options);
|
|
1598
|
+
return wordCounterWithRegexDetector(text, options);
|
|
1599
|
+
}
|
|
1600
|
+
async function countSectionsWithDetector(input, section, options = {}) {
|
|
1601
|
+
if (resolveDetectorMode(options.detector) === "wasm") return countSectionsWithWasmDetector(input, section, options);
|
|
1602
|
+
return countSectionsWithRegexDetector(input, section, options);
|
|
1603
|
+
}
|
|
1243
1604
|
//#endregion
|
|
1244
1605
|
//#region src/cli/batch/aggregate.ts
|
|
1245
1606
|
function stripCollectorSegmentsFromWordCounterResult(result) {
|
|
@@ -1256,7 +1617,6 @@ function compactCollectorSegmentsInCountResult(result) {
|
|
|
1256
1617
|
}
|
|
1257
1618
|
stripCollectorSegmentsFromWordCounterResult(result);
|
|
1258
1619
|
}
|
|
1259
|
-
|
|
1260
1620
|
//#endregion
|
|
1261
1621
|
//#region src/cli/path/load.ts
|
|
1262
1622
|
function isProbablyBinary(buffer) {
|
|
@@ -1273,7 +1633,6 @@ function isProbablyBinary(buffer) {
|
|
|
1273
1633
|
}
|
|
1274
1634
|
return suspicious / sampleSize > .3;
|
|
1275
1635
|
}
|
|
1276
|
-
|
|
1277
1636
|
//#endregion
|
|
1278
1637
|
//#region src/cli/batch/jobs/worker/count-worker.ts
|
|
1279
1638
|
const config = workerData;
|
|
@@ -1335,7 +1694,13 @@ parentPort.on("message", async (message) => {
|
|
|
1335
1694
|
}
|
|
1336
1695
|
try {
|
|
1337
1696
|
const content = buffer.toString("utf8");
|
|
1338
|
-
const result = config.section === "all" ? wc_default(content, config.wcOptions) : countSections(content, config.section, config.wcOptions);
|
|
1697
|
+
const result = config.detectorMode === "regex" ? config.section === "all" ? wc_default(content, config.wcOptions) : countSections(content, config.section, config.wcOptions) : config.section === "all" ? await wordCounterWithDetector(content, {
|
|
1698
|
+
...config.wcOptions,
|
|
1699
|
+
detector: config.detectorMode
|
|
1700
|
+
}) : await countSectionsWithDetector(content, config.section, {
|
|
1701
|
+
...config.wcOptions,
|
|
1702
|
+
detector: config.detectorMode
|
|
1703
|
+
});
|
|
1339
1704
|
if (!config.preserveCollectorSegments) compactCollectorSegmentsInCountResult(result);
|
|
1340
1705
|
const response = {
|
|
1341
1706
|
type: "result",
|
|
@@ -1364,7 +1729,7 @@ parentPort.on("message", async (message) => {
|
|
|
1364
1729
|
parentPort?.postMessage(response);
|
|
1365
1730
|
}
|
|
1366
1731
|
});
|
|
1367
|
-
|
|
1368
1732
|
//#endregion
|
|
1369
|
-
export { };
|
|
1733
|
+
export {};
|
|
1734
|
+
|
|
1370
1735
|
//# sourceMappingURL=count-worker.mjs.map
|