@dev-pi2pie/word-counter 0.1.4 → 0.1.5-canary.2

This diff shows the differences between the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,7 +1,10 @@
1
+ import { createRequire } from "node:module";
1
2
  import { readFile } from "node:fs/promises";
2
3
  import { parentPort, workerData } from "node:worker_threads";
3
4
  import { parseDocument } from "yaml";
4
-
5
+ import { existsSync } from "node:fs";
6
+ import { dirname, join } from "node:path";
7
+ import { fileURLToPath } from "node:url";
5
8
  //#region src/markdown/toml/arrays.ts
6
9
  function ensureArrayContainer(result, key) {
7
10
  const existing = result[key];
@@ -16,7 +19,6 @@ function flattenArrayTables(result) {
16
19
  result[key] = value.map((entry) => Object.entries(entry).map(([entryKey, entryValue]) => `${entryKey}=${entryValue}`).join(", ")).join(" | ");
17
20
  }
18
21
  }
19
-
20
22
  //#endregion
21
23
  //#region src/markdown/toml/keys.ts
22
24
  function stripKeyQuotes(key) {
@@ -35,7 +37,6 @@ function normalizeKeyPath(key) {
35
37
  if (segments.some((segment) => !segment)) return null;
36
38
  return segments.join(".");
37
39
  }
38
-
39
40
  //#endregion
40
41
  //#region src/markdown/toml/strings.ts
41
42
  function stripInlineComment(line) {
@@ -84,7 +85,6 @@ function parseStringLiteral(value) {
84
85
  if (value.startsWith("'") && value.endsWith("'")) return value.slice(1, -1);
85
86
  return null;
86
87
  }
87
-
88
88
  //#endregion
89
89
  //#region src/markdown/toml/values.ts
90
90
  function parsePrimitive(raw) {
@@ -242,7 +242,6 @@ function toPlainText(value) {
242
242
  if (Array.isArray(value)) return value.map((item) => String(item)).join(", ");
243
243
  return String(value);
244
244
  }
245
-
246
245
  //#endregion
247
246
  //#region src/markdown/toml/parse-frontmatter.ts
248
247
  function parseTomlFrontmatter(frontmatter) {
@@ -326,7 +325,6 @@ function parseTomlFrontmatter(frontmatter) {
326
325
  flattenArrayTables(result);
327
326
  return result;
328
327
  }
329
-
330
328
  //#endregion
331
329
  //#region src/markdown/parse-markdown.ts
332
330
  const FENCE_TO_TYPE = {
@@ -461,7 +459,6 @@ function parseMarkdown(input) {
461
459
  frontmatterType: openingType
462
460
  };
463
461
  }
464
-
465
462
  //#endregion
466
463
  //#region src/wc/segmenter.ts
467
464
  const segmenterCache = /* @__PURE__ */ new Map();
@@ -490,13 +487,11 @@ function countCharsForLocale(text, locale) {
490
487
  for (const _segment of segmenter.segment(text)) count++;
491
488
  return count;
492
489
  }
493
-
494
490
  //#endregion
495
491
  //#region src/utils/append-all.ts
496
492
  function appendAll(target, source) {
497
493
  for (const item of source) target.push(item);
498
494
  }
499
-
500
495
  //#endregion
501
496
  //#region src/wc/non-words.ts
502
497
  const emojiRegex = /(?:\p{Extended_Pictographic}|\p{Emoji_Presentation})/u;
@@ -610,7 +605,6 @@ function createWhitespaceCounts() {
610
605
  other: 0
611
606
  };
612
607
  }
613
-
614
608
  //#endregion
615
609
  //#region src/wc/analyze.ts
616
610
  function analyzeChunk(chunk, collectNonWords, includeWhitespace) {
@@ -710,7 +704,6 @@ function aggregateByLocale(chunks) {
710
704
  }
711
705
  return order.map((locale) => map.get(locale));
712
706
  }
713
-
714
707
  //#endregion
715
708
  //#region src/wc/mode.ts
716
709
  const MODE_ALIASES = {
@@ -778,10 +771,7 @@ function normalizeMode(input) {
778
771
  function resolveMode(input, fallback = "chunk") {
779
772
  return normalizeMode(input) ?? fallback;
780
773
  }
781
-
782
- //#endregion
783
- //#region src/wc/latin-hints.ts
784
- const DEFAULT_LATIN_HINT_RULES_SOURCE = [
774
+ const DEFAULT_LATIN_HINT_RULES = Object.freeze([
785
775
  {
786
776
  tag: "de",
787
777
  pattern: "[äöüÄÖÜß]"
@@ -818,9 +808,7 @@ const DEFAULT_LATIN_HINT_RULES_SOURCE = [
818
808
  tag: "is",
819
809
  pattern: "[ðÐþÞ]"
820
810
  }
821
- ];
822
- const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
823
-
811
+ ].map((rule) => Object.freeze({ ...rule })));
824
812
  //#endregion
825
813
  //#region src/wc/locale-detect.ts
826
814
  const DEFAULT_LOCALE = "und-Latn";
@@ -940,18 +928,17 @@ function detectLocaleForChar(char, previousLocale, options = {}, context = resol
940
928
  if (regex.thai.test(char)) return "th";
941
929
  if (regex.han.test(char)) {
942
930
  if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
943
- return context.hanHint ?? DEFAULT_HAN_TAG;
931
+ return context.hanHint ?? "und-Hani";
944
932
  }
945
933
  if (regex.latin.test(char)) {
946
934
  const hintedLocale = detectLatinLocale(char, context);
947
- if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
948
- if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
935
+ if (hintedLocale !== "und-Latn") return hintedLocale;
936
+ if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== "und-Latn") return previousLocale;
949
937
  if (context.latinHint) return context.latinHint;
950
938
  return DEFAULT_LOCALE;
951
939
  }
952
940
  return null;
953
941
  }
954
-
955
942
  //#endregion
956
943
  //#region src/wc/segment.ts
957
944
  const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
@@ -988,7 +975,7 @@ function segmentTextByLocale(text, options = {}) {
988
975
  continue;
989
976
  }
990
977
  if (targetLocale !== currentLocale && detected !== null) {
991
- if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
978
+ if (currentLocale === "und-Latn" && isLatinLocale(targetLocale, context)) {
992
979
  const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
993
980
  if (promotionBreakIndex === -1) {
994
981
  currentLocale = targetLocale;
@@ -1055,7 +1042,6 @@ function mergeAdjacentChunks(chunks) {
1055
1042
  merged.push(last);
1056
1043
  return merged;
1057
1044
  }
1058
-
1059
1045
  //#endregion
1060
1046
  //#region src/wc/wc.ts
1061
1047
  function wordCounter(text, options = {}) {
@@ -1109,11 +1095,11 @@ function wordCounter(text, options = {}) {
1109
1095
  const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
1110
1096
  const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
1111
1097
  if (!chunk.nonWords) return sum;
1112
- return sum + getNonWordTotal(chunk.nonWords);
1098
+ return sum + getNonWordTotal$1(chunk.nonWords);
1113
1099
  }, 0) : 0;
1114
1100
  const total = analyzed.reduce((sum, chunk) => {
1115
1101
  let chunkTotal = chunk.words;
1116
- if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
1102
+ if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal$1(chunk.nonWords);
1117
1103
  return sum + chunkTotal;
1118
1104
  }, 0);
1119
1105
  const counts = collectNonWords ? {
@@ -1141,7 +1127,7 @@ function wordCounter(text, options = {}) {
1141
1127
  breakdown: {
1142
1128
  mode,
1143
1129
  items: aggregateByLocale(analyzed),
1144
- nonWords: collectNonWordsAggregate(analyzed, collectNonWords)
1130
+ nonWords: collectNonWordsAggregate$1(analyzed, collectNonWords)
1145
1131
  }
1146
1132
  };
1147
1133
  return {
@@ -1158,10 +1144,10 @@ function wordCounter(text, options = {}) {
1158
1144
  }
1159
1145
  };
1160
1146
  }
1161
- function getNonWordTotal(nonWords) {
1147
+ function getNonWordTotal$1(nonWords) {
1162
1148
  return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
1163
1149
  }
1164
- function collectNonWordsAggregate(analyzed, enabled) {
1150
+ function collectNonWordsAggregate$1(analyzed, enabled) {
1165
1151
  if (!enabled) return;
1166
1152
  const collection = createNonWordCollection();
1167
1153
  for (const chunk of analyzed) {
@@ -1170,14 +1156,12 @@ function collectNonWordsAggregate(analyzed, enabled) {
1170
1156
  }
1171
1157
  return collection;
1172
1158
  }
1173
-
1174
1159
  //#endregion
1175
1160
  //#region src/wc/index.ts
1176
1161
  var wc_default = wordCounter;
1177
-
1178
1162
  //#endregion
1179
1163
  //#region src/markdown/section-count.ts
1180
- function normalizeText(value) {
1164
+ function normalizeText$1(value) {
1181
1165
  if (value == null) return "";
1182
1166
  if (typeof value === "string") return value;
1183
1167
  if (typeof value === "number" || typeof value === "boolean") return String(value);
@@ -1187,10 +1171,10 @@ function normalizeText(value) {
1187
1171
  return String(value);
1188
1172
  }
1189
1173
  }
1190
- function buildPerKeyItems(data, mode, options) {
1174
+ function buildPerKeyItems$1(data, mode, options) {
1191
1175
  if (!data || typeof data !== "object" || Array.isArray(data)) return [];
1192
1176
  return Object.entries(data).map(([key, value]) => {
1193
- const valueText = normalizeText(value);
1177
+ const valueText = normalizeText$1(value);
1194
1178
  return {
1195
1179
  name: key,
1196
1180
  source: "frontmatter",
@@ -1198,14 +1182,14 @@ function buildPerKeyItems(data, mode, options) {
1198
1182
  };
1199
1183
  });
1200
1184
  }
1201
- function buildSingleItem(name, text, mode, options, source) {
1185
+ function buildSingleItem$1(name, text, mode, options, source) {
1202
1186
  return [{
1203
1187
  name,
1204
1188
  source,
1205
1189
  result: wc_default(text, options)
1206
1190
  }];
1207
1191
  }
1208
- function sumTotals(items) {
1192
+ function sumTotals$1(items) {
1209
1193
  return items.reduce((sum, item) => sum + item.result.total, 0);
1210
1194
  }
1211
1195
  function countSections(input, section, options = {}) {
@@ -1227,11 +1211,188 @@ function countSections(input, section, options = {}) {
1227
1211
  const frontmatterText = parsed.frontmatter ?? "";
1228
1212
  const contentText = parsed.content ?? "";
1229
1213
  let items = [];
1230
- if (section === "frontmatter") items = buildSingleItem("frontmatter", frontmatterText, mode, options, "frontmatter");
1231
- else if (section === "content") items = buildSingleItem("content", contentText, mode, options, "content");
1232
- else if (section === "split") items = [...buildSingleItem("frontmatter", frontmatterText, mode, options, "frontmatter"), ...buildSingleItem("content", contentText, mode, options, "content")];
1233
- else if (section === "per-key") items = buildPerKeyItems(parsed.data, mode, options);
1234
- else if (section === "split-per-key") items = [...buildPerKeyItems(parsed.data, mode, options), ...buildSingleItem("content", contentText, mode, options, "content")];
1214
+ if (section === "frontmatter") items = buildSingleItem$1("frontmatter", frontmatterText, mode, options, "frontmatter");
1215
+ else if (section === "content") items = buildSingleItem$1("content", contentText, mode, options, "content");
1216
+ else if (section === "split") items = [...buildSingleItem$1("frontmatter", frontmatterText, mode, options, "frontmatter"), ...buildSingleItem$1("content", contentText, mode, options, "content")];
1217
+ else if (section === "per-key") items = buildPerKeyItems$1(parsed.data, mode, options);
1218
+ else if (section === "split-per-key") items = [...buildPerKeyItems$1(parsed.data, mode, options), ...buildSingleItem$1("content", contentText, mode, options, "content")];
1219
+ return {
1220
+ section,
1221
+ total: sumTotals$1(items),
1222
+ frontmatterType: parsed.frontmatterType,
1223
+ items
1224
+ };
1225
+ }
1226
+ //#endregion
1227
+ //#region src/detector/none.ts
1228
+ async function wordCounterWithRegexDetector(text, options = {}) {
1229
+ return wc_default(text, options);
1230
+ }
1231
+ async function countSectionsWithRegexDetector(input, section, options = {}) {
1232
+ return countSections(input, section, options);
1233
+ }
1234
+ //#endregion
1235
+ //#region src/detector/result-builder.ts
1236
+ function getNonWordTotal(nonWords) {
1237
+ return nonWords.counts.emoji + nonWords.counts.symbols + nonWords.counts.punctuation + (nonWords.counts.whitespace ?? 0);
1238
+ }
1239
+ function collectNonWordsAggregate(analyzed, enabled) {
1240
+ if (!enabled) return;
1241
+ const collection = createNonWordCollection();
1242
+ for (const chunk of analyzed) {
1243
+ if (!chunk.nonWords) continue;
1244
+ mergeNonWordCollections(collection, chunk.nonWords);
1245
+ }
1246
+ return collection;
1247
+ }
1248
+ function buildWordCounterResultFromChunks(chunks, options = {}) {
1249
+ const mode = resolveMode(options.mode, "chunk");
1250
+ const collectNonWords = Boolean(options.nonWords);
1251
+ const includeWhitespace = Boolean(options.includeWhitespace);
1252
+ if (mode === "char" || mode === "char-collector") {
1253
+ const analyzed = chunks.map((chunk) => analyzeCharChunk(chunk, collectNonWords, includeWhitespace));
1254
+ const total = analyzed.reduce((sum, chunk) => sum + chunk.chars, 0);
1255
+ const counts = collectNonWords ? {
1256
+ words: analyzed.reduce((sum, chunk) => sum + chunk.wordChars, 0),
1257
+ nonWords: analyzed.reduce((sum, chunk) => sum + chunk.nonWordChars, 0),
1258
+ total
1259
+ } : void 0;
1260
+ if (mode === "char") return {
1261
+ total,
1262
+ counts,
1263
+ breakdown: {
1264
+ mode,
1265
+ items: analyzed.map((chunk) => ({
1266
+ locale: chunk.locale,
1267
+ text: chunk.text,
1268
+ chars: chunk.chars,
1269
+ nonWords: chunk.nonWords
1270
+ }))
1271
+ }
1272
+ };
1273
+ return {
1274
+ total,
1275
+ counts,
1276
+ breakdown: {
1277
+ mode,
1278
+ items: aggregateCharsByLocale(analyzed).map((chunk) => ({
1279
+ locale: chunk.locale,
1280
+ chars: chunk.chars,
1281
+ nonWords: chunk.nonWords
1282
+ }))
1283
+ }
1284
+ };
1285
+ }
1286
+ const analyzed = chunks.map((chunk) => analyzeChunk(chunk, collectNonWords, includeWhitespace));
1287
+ const wordsTotal = analyzed.reduce((sum, chunk) => sum + chunk.words, 0);
1288
+ const nonWordsTotal = collectNonWords ? analyzed.reduce((sum, chunk) => {
1289
+ if (!chunk.nonWords) return sum;
1290
+ return sum + getNonWordTotal(chunk.nonWords);
1291
+ }, 0) : 0;
1292
+ const total = analyzed.reduce((sum, chunk) => {
1293
+ let chunkTotal = chunk.words;
1294
+ if (collectNonWords && chunk.nonWords) chunkTotal += getNonWordTotal(chunk.nonWords);
1295
+ return sum + chunkTotal;
1296
+ }, 0);
1297
+ const counts = collectNonWords ? {
1298
+ words: wordsTotal,
1299
+ nonWords: nonWordsTotal,
1300
+ total
1301
+ } : void 0;
1302
+ if (mode === "segments") return {
1303
+ total,
1304
+ counts,
1305
+ breakdown: {
1306
+ mode,
1307
+ items: analyzed.map((chunk) => ({
1308
+ locale: chunk.locale,
1309
+ text: chunk.text,
1310
+ words: chunk.words,
1311
+ segments: chunk.segments,
1312
+ nonWords: chunk.nonWords
1313
+ }))
1314
+ }
1315
+ };
1316
+ if (mode === "collector") return {
1317
+ total,
1318
+ counts,
1319
+ breakdown: {
1320
+ mode,
1321
+ items: aggregateByLocale(analyzed),
1322
+ nonWords: collectNonWordsAggregate(analyzed, collectNonWords)
1323
+ }
1324
+ };
1325
+ return {
1326
+ total,
1327
+ counts,
1328
+ breakdown: {
1329
+ mode,
1330
+ items: analyzed.map((chunk) => ({
1331
+ locale: chunk.locale,
1332
+ text: chunk.text,
1333
+ words: chunk.words,
1334
+ nonWords: chunk.nonWords
1335
+ }))
1336
+ }
1337
+ };
1338
+ }
1339
+ //#endregion
1340
+ //#region src/detector/sections.ts
1341
+ function normalizeText(value) {
1342
+ if (value == null) return "";
1343
+ if (typeof value === "string") return value;
1344
+ if (typeof value === "number" || typeof value === "boolean") return String(value);
1345
+ try {
1346
+ return JSON.stringify(value);
1347
+ } catch {
1348
+ return String(value);
1349
+ }
1350
+ }
1351
+ async function buildPerKeyItems(data, options) {
1352
+ if (!data || typeof data !== "object" || Array.isArray(data)) return [];
1353
+ return Promise.all(Object.entries(data).map(async ([key, value]) => {
1354
+ const valueText = normalizeText(value);
1355
+ return {
1356
+ name: key,
1357
+ source: "frontmatter",
1358
+ result: await wordCounterWithDetector(valueText ? `${key}: ${valueText}` : key, options)
1359
+ };
1360
+ }));
1361
+ }
1362
+ async function buildSingleItem(name, text, options, source) {
1363
+ return [{
1364
+ name,
1365
+ source,
1366
+ result: await wordCounterWithDetector(text, options)
1367
+ }];
1368
+ }
1369
+ function sumTotals(items) {
1370
+ return items.reduce((sum, item) => sum + item.result.total, 0);
1371
+ }
1372
+ async function countSectionsWithResolvedDetector(input, section, options = {}) {
1373
+ options.mode;
1374
+ if (section === "all") {
1375
+ const result = await wordCounterWithDetector(input, options);
1376
+ return {
1377
+ section,
1378
+ total: result.total,
1379
+ frontmatterType: null,
1380
+ items: [{
1381
+ name: "all",
1382
+ source: "content",
1383
+ result
1384
+ }]
1385
+ };
1386
+ }
1387
+ const parsed = parseMarkdown(input);
1388
+ const frontmatterText = parsed.frontmatter ?? "";
1389
+ const contentText = parsed.content ?? "";
1390
+ let items = [];
1391
+ if (section === "frontmatter") items = await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter");
1392
+ else if (section === "content") items = await buildSingleItem("content", contentText, options, "content");
1393
+ else if (section === "split") items = [...await buildSingleItem("frontmatter", frontmatterText, options, "frontmatter"), ...await buildSingleItem("content", contentText, options, "content")];
1394
+ else if (section === "per-key") items = await buildPerKeyItems(parsed.data, options);
1395
+ else if (section === "split-per-key") items = [...await buildPerKeyItems(parsed.data, options), ...await buildSingleItem("content", contentText, options, "content")];
1235
1396
  return {
1236
1397
  section,
1237
1398
  total: sumTotals(items),
@@ -1239,7 +1400,207 @@ function countSections(input, section, options = {}) {
1239
1400
  items
1240
1401
  };
1241
1402
  }
1242
-
1403
+ const LATIN_WASM_MIN_CONFIDENCE = .75;
1404
+ const HANI_WASM_MIN_CONFIDENCE = .9;
1405
+ const LATIN_SCRIPT_REGEX = /\p{Script=Latin}/u;
1406
+ const HAN_SCRIPT_REGEX = /\p{Script=Han}/u;
1407
+ const DETECTOR_ROUTE_POLICIES = {
1408
+ [DEFAULT_LOCALE]: {
1409
+ routeTag: DEFAULT_LOCALE,
1410
+ minScriptChars: 24,
1411
+ minConfidence: LATIN_WASM_MIN_CONFIDENCE,
1412
+ requireReliable: true
1413
+ },
1414
+ [DEFAULT_HAN_TAG]: {
1415
+ routeTag: DEFAULT_HAN_TAG,
1416
+ minScriptChars: 12,
1417
+ minConfidence: HANI_WASM_MIN_CONFIDENCE,
1418
+ requireReliable: true
1419
+ }
1420
+ };
1421
+ function isAmbiguousDetectorRoute(locale) {
1422
+ return locale === "und-Latn" || locale === "und-Hani";
1423
+ }
1424
+ function countScriptBearingCharsForRoute(text, routeTag) {
1425
+ const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
1426
+ let count = 0;
1427
+ for (const char of text) if (matcher.test(char)) count += 1;
1428
+ return count;
1429
+ }
1430
+ function shouldRunWasmDetector(text, routeTag) {
1431
+ const policy = DETECTOR_ROUTE_POLICIES[routeTag];
1432
+ return countScriptBearingCharsForRoute(text, routeTag) >= policy.minScriptChars;
1433
+ }
1434
+ function normalizeDetectorSampleForRoute(text, routeTag) {
1435
+ const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
1436
+ return [...text].map((char) => {
1437
+ if (matcher.test(char)) return char;
1438
+ if (/\s/u.test(char)) return " ";
1439
+ return " ";
1440
+ }).join("").replace(/\s+/g, " ").trim();
1441
+ }
1442
+ //#endregion
1443
+ //#region src/detector/whatlang-wasm.ts
1444
+ const GENERATED_FOLDER_NAME = "wasm-language-detector";
1445
+ const GENERATED_MODULE_FILE = "language_detector.js";
1446
+ const MAX_SEARCH_DEPTH = 8;
1447
+ const requireFromHere = createRequire(import.meta.url);
1448
+ const WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE = "WASM detector runtime is unavailable. Run `bun run build:wasm` to generate it.";
1449
+ let modulePromise = null;
1450
+ function resolveCandidateModulePaths() {
1451
+ const moduleDir = dirname(fileURLToPath(import.meta.url));
1452
+ const candidates = /* @__PURE__ */ new Set();
1453
+ let currentDir = moduleDir;
1454
+ for (let depth = 0; depth < MAX_SEARCH_DEPTH; depth += 1) {
1455
+ candidates.add(join(currentDir, GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE));
1456
+ candidates.add(join(currentDir, "generated", GENERATED_FOLDER_NAME, GENERATED_MODULE_FILE));
1457
+ const parentDir = dirname(currentDir);
1458
+ if (parentDir === currentDir) break;
1459
+ currentDir = parentDir;
1460
+ }
1461
+ return [...candidates];
1462
+ }
1463
+ function resolveWhatlangWasmModulePath() {
1464
+ for (const candidate of resolveCandidateModulePaths()) if (existsSync(candidate)) return candidate;
1465
+ throw new Error(WASM_DETECTOR_RUNTIME_UNAVAILABLE_MESSAGE);
1466
+ }
1467
+ async function loadWhatlangWasmModule() {
1468
+ if (!modulePromise) modulePromise = (async () => {
1469
+ return requireFromHere(resolveWhatlangWasmModulePath());
1470
+ })();
1471
+ return modulePromise;
1472
+ }
1473
+ async function detectWithWhatlangWasm(text, routeTag) {
1474
+ return (await loadWhatlangWasmModule()).detect_language(text, routeTag);
1475
+ }
1476
+ //#endregion
1477
+ //#region src/detector/whatlang-map.ts
1478
+ const LATIN_LANGUAGE_TAGS = {
1479
+ cat: "ca",
1480
+ ces: "cs",
1481
+ dan: "da",
1482
+ deu: "de",
1483
+ eng: "en",
1484
+ fin: "fi",
1485
+ fra: "fr",
1486
+ hun: "hu",
1487
+ ita: "it",
1488
+ lat: "la",
1489
+ nld: "nl",
1490
+ pol: "pl",
1491
+ por: "pt",
1492
+ ron: "ro",
1493
+ spa: "es",
1494
+ swe: "sv",
1495
+ tur: "tr"
1496
+ };
1497
+ const HANI_LANGUAGE_TAGS = {
1498
+ cmn: "zh",
1499
+ jpn: "ja"
1500
+ };
1501
+ function hasSupportedScript(result, routeTag) {
1502
+ if (routeTag === "und-Latn") return result.script === "Latin";
1503
+ return result.script === "Mandarin";
1504
+ }
1505
+ function remapLanguageTag(lang, routeTag) {
1506
+ if (routeTag === "und-Latn") return LATIN_LANGUAGE_TAGS[lang];
1507
+ return HANI_LANGUAGE_TAGS[lang];
1508
+ }
1509
+ function remapWhatlangResult(result, routeTag) {
1510
+ if (!hasSupportedScript(result, routeTag)) return null;
1511
+ const tag = remapLanguageTag(result.lang, routeTag);
1512
+ if (!tag) return null;
1513
+ return {
1514
+ tag,
1515
+ confidence: result.confidence,
1516
+ reliable: result.reliable,
1517
+ source: "wasm"
1518
+ };
1519
+ }
1520
+ function getDetectorFallbackTag(routeTag) {
1521
+ return routeTag === "und-Hani" ? DEFAULT_HAN_TAG : DEFAULT_LOCALE;
1522
+ }
1523
+ //#endregion
1524
+ //#region src/detector/wasm.ts
1525
+ function shouldAcceptDetectorTag(routeTag, confidence, reliable) {
1526
+ const policy = DETECTOR_ROUTE_POLICIES[routeTag];
1527
+ if (policy.requireReliable && reliable !== true) return false;
1528
+ if (confidence === void 0) return false;
1529
+ return confidence >= policy.minConfidence;
1530
+ }
1531
+ function buildDetectorWindows(chunks) {
1532
+ const windows = [];
1533
+ for (let index = 0; index < chunks.length; index += 1) {
1534
+ const chunk = chunks[index];
1535
+ if (!chunk || !isAmbiguousDetectorRoute(chunk.locale)) continue;
1536
+ const previousWindow = windows[windows.length - 1];
1537
+ if (previousWindow && previousWindow.routeTag === chunk.locale && previousWindow.endIndex === index - 1) {
1538
+ previousWindow.endIndex = index;
1539
+ previousWindow.text += chunk.text;
1540
+ continue;
1541
+ }
1542
+ windows.push({
1543
+ routeTag: chunk.locale,
1544
+ startIndex: index,
1545
+ endIndex: index,
1546
+ text: chunk.text
1547
+ });
1548
+ }
1549
+ return windows;
1550
+ }
1551
+ async function resolveWindowLocale(window) {
1552
+ if (!shouldRunWasmDetector(window.text, window.routeTag)) return window.routeTag;
1553
+ const rawResult = await detectWithWhatlangWasm(window.text, window.routeTag);
1554
+ const rawRemapped = rawResult ? remapWhatlangResult(rawResult, window.routeTag) : null;
1555
+ const normalizedSample = normalizeDetectorSampleForRoute(window.text, window.routeTag);
1556
+ const normalizedResult = normalizedSample.length > 0 && normalizedSample !== window.text ? await detectWithWhatlangWasm(normalizedSample, window.routeTag) : null;
1557
+ const normalizedRemapped = normalizedResult ? remapWhatlangResult(normalizedResult, window.routeTag) : null;
1558
+ const candidates = [rawRemapped, normalizedRemapped].filter((value) => value !== null);
1559
+ if (candidates.length === 0) return getDetectorFallbackTag(window.routeTag);
1560
+ const strongestCandidate = candidates.reduce((best, current) => {
1561
+ if (!best) return current;
1562
+ return (current.confidence ?? 0) > (best.confidence ?? 0) ? current : best;
1563
+ }, candidates[0]);
1564
+ if (strongestCandidate && shouldAcceptDetectorTag(window.routeTag, strongestCandidate.confidence, strongestCandidate.reliable)) return strongestCandidate.tag;
1565
+ if (window.routeTag === "und-Latn" && rawRemapped && normalizedRemapped && rawRemapped.tag === normalizedRemapped.tag) {
1566
+ if (Math.max(rawRemapped.confidence ?? 0, normalizedRemapped.confidence ?? 0) >= .7) return rawRemapped.tag;
1567
+ }
1568
+ return getDetectorFallbackTag(window.routeTag);
1569
+ }
1570
+ async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
1571
+ const chunks = segmentTextByLocale(text, options);
1572
+ const resolved = [...chunks];
1573
+ const windows = buildDetectorWindows(chunks);
1574
+ for (const window of windows) {
1575
+ const resolvedLocale = await resolveWindowLocale(window);
1576
+ for (let index = window.startIndex; index <= window.endIndex; index += 1) {
1577
+ const chunk = resolved[index];
1578
+ if (!chunk) continue;
1579
+ resolved[index] = {
1580
+ ...chunk,
1581
+ locale: resolvedLocale
1582
+ };
1583
+ }
1584
+ }
1585
+ return resolved;
1586
+ }
1587
+ async function wordCounterWithWasmDetector(text, options = {}) {
1588
+ return buildWordCounterResultFromChunks(await segmentTextByLocaleWithWasmDetector(text, options), options);
1589
+ }
1590
+ async function countSectionsWithWasmDetector(input, section, options = {}) {
1591
+ return countSectionsWithResolvedDetector(input, section, options);
1592
+ }
1593
+ function resolveDetectorMode(mode) {
1594
+ return mode ?? "regex";
1595
+ }
1596
+ async function wordCounterWithDetector(text, options = {}) {
1597
+ if (resolveDetectorMode(options.detector) === "wasm") return wordCounterWithWasmDetector(text, options);
1598
+ return wordCounterWithRegexDetector(text, options);
1599
+ }
1600
+ async function countSectionsWithDetector(input, section, options = {}) {
1601
+ if (resolveDetectorMode(options.detector) === "wasm") return countSectionsWithWasmDetector(input, section, options);
1602
+ return countSectionsWithRegexDetector(input, section, options);
1603
+ }
1243
1604
  //#endregion
1244
1605
  //#region src/cli/batch/aggregate.ts
1245
1606
  function stripCollectorSegmentsFromWordCounterResult(result) {
@@ -1256,7 +1617,6 @@ function compactCollectorSegmentsInCountResult(result) {
1256
1617
  }
1257
1618
  stripCollectorSegmentsFromWordCounterResult(result);
1258
1619
  }
1259
-
1260
1620
  //#endregion
1261
1621
  //#region src/cli/path/load.ts
1262
1622
  function isProbablyBinary(buffer) {
@@ -1273,7 +1633,6 @@ function isProbablyBinary(buffer) {
1273
1633
  }
1274
1634
  return suspicious / sampleSize > .3;
1275
1635
  }
1276
-
1277
1636
  //#endregion
1278
1637
  //#region src/cli/batch/jobs/worker/count-worker.ts
1279
1638
  const config = workerData;
@@ -1335,7 +1694,13 @@ parentPort.on("message", async (message) => {
1335
1694
  }
1336
1695
  try {
1337
1696
  const content = buffer.toString("utf8");
1338
- const result = config.section === "all" ? wc_default(content, config.wcOptions) : countSections(content, config.section, config.wcOptions);
1697
+ const result = config.detectorMode === "regex" ? config.section === "all" ? wc_default(content, config.wcOptions) : countSections(content, config.section, config.wcOptions) : config.section === "all" ? await wordCounterWithDetector(content, {
1698
+ ...config.wcOptions,
1699
+ detector: config.detectorMode
1700
+ }) : await countSectionsWithDetector(content, config.section, {
1701
+ ...config.wcOptions,
1702
+ detector: config.detectorMode
1703
+ });
1339
1704
  if (!config.preserveCollectorSegments) compactCollectorSegmentsInCountResult(result);
1340
1705
  const response = {
1341
1706
  type: "result",
@@ -1364,7 +1729,7 @@ parentPort.on("message", async (message) => {
1364
1729
  parentPort?.postMessage(response);
1365
1730
  }
1366
1731
  });
1367
-
1368
1732
  //#endregion
1369
- export { };
1733
+ export {};
1734
+
1370
1735
  //# sourceMappingURL=count-worker.mjs.map