@dev-pi2pie/word-counter 0.1.5-canary.2 → 0.1.5-canary.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -3
- package/dist/cjs/detector.cjs +429 -15
- package/dist/cjs/detector.cjs.map +1 -1
- package/dist/cjs/markdown.cjs +6 -0
- package/dist/esm/bin.mjs +788 -209
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/detector.d.mts +39 -1
- package/dist/esm/detector.mjs +430 -16
- package/dist/esm/detector.mjs.map +1 -1
- package/dist/esm/index.mjs +1 -1
- package/dist/esm/markdown.mjs +1 -1
- package/dist/esm/worker/count-worker.mjs +480 -20
- package/dist/esm/worker/count-worker.mjs.map +1 -1
- package/dist/esm/worker-pool.mjs +16 -2
- package/dist/esm/worker-pool.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -109,8 +109,10 @@ Detector mode notes:
|
|
|
109
109
|
- `--detector wasm` only runs for ambiguous `und-Latn` and `und-Hani` chunks.
|
|
110
110
|
- `--detector regex` keeps the original script/regex chunk-first detection path.
|
|
111
111
|
- `--detector wasm` uses a detector-oriented ambiguous-window scoring pass before accepted tags are projected back onto the counting chunks.
|
|
112
|
+
- In `--detector wasm` mode, Latin hint rules and explicit Latin hint flags are deferred until after detector evaluation and only relabel unresolved `und-Latn` output.
|
|
112
113
|
- Very short chunks stay on the original `und-*` fallback.
|
|
113
114
|
- Low-confidence or unsupported detector results fall back to `und-*`.
|
|
115
|
+
- Technical-noise-heavy Latin windows stay conservative and may remain `und-Latn` even when the detector produces a wrong-but-confident language guess.
|
|
114
116
|
|
|
115
117
|
Collect non-words (emoji/symbols/punctuation):
|
|
116
118
|
|
|
@@ -285,14 +287,24 @@ word-counter --path ./examples/test-case-multi-files-support --debug --verbose
|
|
|
285
287
|
|
|
286
288
|
Use `--debug-report [path]` to route debug diagnostics to a JSONL report file:
|
|
287
289
|
|
|
288
|
-
- no path: writes to current working directory with pattern `wc-debug-YYYYMMDD-HHmmss-<pid>.jsonl`
|
|
290
|
+
- no path: writes to current working directory with pattern `wc-debug-YYYYMMDD-HHmmss-utc-<pid>.jsonl`
|
|
291
|
+
- no path with `--detector-evidence`: writes with pattern `wc-detector-evidence-YYYYMMDD-HHmmss-utc-<pid>.jsonl`
|
|
289
292
|
- path provided: writes to the specified location
|
|
290
293
|
- default-name collision handling: appends `-<n>` suffix to avoid overwriting existing files
|
|
291
294
|
- explicit path validation: existing directories are rejected (explicit paths are treated as file targets)
|
|
295
|
+
- compatibility note: the autogenerated filename moved from the older local-time pattern to the new UTC `...-utc-...jsonl` pattern
|
|
292
296
|
|
|
293
297
|
By default with `--debug-report`, debug lines are file-only (not mirrored to terminal).
|
|
294
298
|
Use `--debug-report-tee` (alias: `--debug-tee`) to mirror to both file and `stderr`.
|
|
295
|
-
Flag dependencies: `--verbose` requires `--debug`; `--debug-report` requires `--debug`; `--debug-report-tee`/`--debug-tee` requires `--debug-report`.
|
|
299
|
+
Flag dependencies: `--verbose` requires `--debug`; `--detector-evidence` requires `--debug` and `--detector wasm`; `--debug-report` requires `--debug`; `--debug-report-tee`/`--debug-tee` requires `--debug-report`.
|
|
300
|
+
|
|
301
|
+
Use `--detector-evidence` to add per-window detector evidence onto the same debug stream:
|
|
302
|
+
|
|
303
|
+
- only meaningful with `--detector wasm`
|
|
304
|
+
- compact mode emits bounded single-line previews plus detector decision metadata
|
|
305
|
+
- verbose mode emits full raw detector windows and full normalized samples
|
|
306
|
+
- evidence remains detector-window based even when output mode changes to `collector`, `char`, or another counting mode
|
|
307
|
+
- fallback evidence reports the post-fallback final tag used by downstream counting output; in rare split-relabel cases it may also include `finalLocales`
|
|
296
308
|
|
|
297
309
|
Examples:
|
|
298
310
|
|
|
@@ -301,17 +313,26 @@ word-counter --path ./examples/test-case-multi-files-support --debug --debug-rep
|
|
|
301
313
|
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl
|
|
302
314
|
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-report-tee
|
|
303
315
|
word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-tee
|
|
316
|
+
word-counter --detector wasm --debug --detector-evidence "This sentence should clearly be detected as English for the wasm detector path."
|
|
317
|
+
word-counter --detector wasm --debug --verbose --detector-evidence "This sentence should clearly be detected as English for the wasm detector path."
|
|
318
|
+
word-counter --detector wasm --debug --detector-evidence --debug-report
|
|
304
319
|
```
|
|
305
320
|
|
|
306
321
|
Skip details stay debug-gated and can be suppressed with `--quiet-skips`.
|
|
307
322
|
|
|
323
|
+
When `--format json` is combined with `--debug`, debug-only diagnostics are emitted under `debug.*`:
|
|
324
|
+
|
|
325
|
+
- single input and merged batch may include `debug.detector`
|
|
326
|
+
- per-file batch may include `debug.skipped`, `debug.detector`, and per-entry `files[i].debug.detector`
|
|
327
|
+
- per-file top-level `skipped` is still emitted temporarily for compatibility
|
|
328
|
+
|
|
308
329
|
## How It Works
|
|
309
330
|
|
|
310
331
|
- The runtime inspects each character's Unicode script to infer its likely locale tag (e.g., `und-Latn`, `und-Hani`, `ja`).
|
|
311
332
|
- Adjacent characters that share the same locale tag are grouped into a chunk.
|
|
312
333
|
- Each chunk is counted with `Intl.Segmenter` at `granularity: "word"`, caching segmenters to avoid re-instantiation.
|
|
313
334
|
- Per-locale counts are summed into an overall total and printed to stdout.
|
|
314
|
-
- With `--detector wasm`, ambiguous `und-Latn` and `und-Hani` chunks can be relabeled through the optional WASM detector before counting.
|
|
335
|
+
- With `--detector wasm`, ambiguous `und-Latn` and `und-Hani` chunks can be relabeled through the optional WASM detector before counting; unresolved `und-Latn` chunks then fall back to the existing Latin hint rules and explicit Latin hint precedence.
|
|
315
336
|
|
|
316
337
|
## Locale vs Language Code
|
|
317
338
|
|
|
@@ -696,6 +717,7 @@ Example JSON (trimmed):
|
|
|
696
717
|
- Detection is regex/script based, not statistical language-ID.
|
|
697
718
|
- Ambiguous Latin defaults to `und-Latn`; Han fallback defaults to `und-Hani`.
|
|
698
719
|
- `--detector wasm` is optional and conservative; it only runs for ambiguous chunks that meet minimum script-bearing length thresholds.
|
|
720
|
+
- In `--detector wasm` mode, ambiguous Latin stays on `und-Latn` for detector eligibility first, then built-in/custom Latin rules and explicit Latin hints are applied only if the detector leaves that chunk unresolved.
|
|
699
721
|
- The current first WASM engine is `whatlang`, remapped into this package's public tags.
|
|
700
722
|
- The npm package ships one portable WASM artifact; users do not install per-OS detector packages.
|
|
701
723
|
- Use explicit tag and hint flags when you need deterministic tagging.
|
package/dist/cjs/detector.cjs
CHANGED
|
@@ -118,6 +118,41 @@ function buildWordCounterResultFromChunks(chunks, options = {}) {
|
|
|
118
118
|
}
|
|
119
119
|
};
|
|
120
120
|
}
|
|
121
|
+
/**
 * Tally one detector window on the debug summary.
 *
 * Always increments `windowsTotal`. The "und-Latn" route additionally
 * increments `routes.latin` and the "und-Hani" route increments `routes.han`;
 * any other route tag only contributes to the total.
 *
 * @param {object} [summary] - Mutable counters (`windowsTotal`, `routes.latin`, `routes.han`); falsy means "not collecting".
 * @param {string} routeTag - Route tag of the window being recorded.
 * @returns {void}
 */
function recordDetectorWindow(summary, routeTag) {
  if (!summary) return;
  summary.windowsTotal += 1;
  switch (routeTag) {
    case "und-Latn":
      summary.routes.latin += 1;
      break;
    case "und-Hani":
      summary.routes.han += 1;
      break;
    default:
      // Unknown routes are still counted in windowsTotal only.
      break;
  }
}
|
|
130
|
+
/**
 * Tally one accepted detector decision on the debug summary.
 *
 * Increments `accepted` and exactly one acceptance-path counter: `reliable`
 * when `path` is the string "reliable", otherwise `corroborated`.
 *
 * @param {object} [summary] - Mutable counters (`accepted`, `acceptancePaths.reliable`, `acceptancePaths.corroborated`); falsy means "not collecting".
 * @param {string} path - Acceptance path label.
 * @returns {void}
 */
function recordDetectorAccepted(summary, path) {
  if (!summary) return;
  summary.accepted += 1;
  // Every path other than "reliable" is attributed to the corroborated bucket.
  const bucket = path === "reliable" ? "reliable" : "corroborated";
  summary.acceptancePaths[bucket] += 1;
}
|
|
139
|
+
/**
 * Tally one detector fallback on the debug summary.
 *
 * Increments `fallback` and the per-reason counter under `fallbackReasons`.
 * A reason that has no counter yet starts from zero, so an unexpected reason
 * string produces a count of 1 instead of `NaN`.
 *
 * @param {object} [summary] - Mutable counters (`fallback`, `fallbackReasons`); falsy means "not collecting".
 * @param {string} reason - Fallback reason key, e.g. "notEligible" or "qualityGate".
 * @returns {void}
 */
function recordDetectorFallback(summary, reason) {
  if (!summary) return;
  summary.fallback += 1;
  // `undefined + 1` is NaN; seed missing reason counters with 0 first.
  summary.fallbackReasons[reason] = (summary.fallbackReasons[reason] ?? 0) + 1;
}
|
|
144
|
+
/**
 * Build a bounded single-line preview of a detector window.
 *
 * Whitespace runs are collapsed to single spaces and the result is trimmed.
 * The preview is then cut to at most `maxCodePoints` Unicode code points
 * (not UTF-16 units), so a surrogate pair is never split.
 *
 * @param {string} text - Raw text to preview.
 * @param {number} [maxCodePoints=160] - Maximum number of code points kept.
 * @returns {{preview: string, truncated: boolean}} The preview and whether anything was cut off.
 */
function createDetectorEvidencePreview(text, maxCodePoints = 160) {
  const collapsed = text.replace(/\s+/gu, " ").trim();
  // A string never has more code points than UTF-16 units, so short strings
  // cannot need truncation.
  if (collapsed.length <= maxCodePoints) {
    return { preview: collapsed, truncated: false };
  }
  // Walk code points only until the limit is exceeded instead of
  // materialising the whole (possibly very large) window as an array.
  let keptCodePoints = 0;
  let keptUnits = 0;
  for (const codePoint of collapsed) {
    if (keptCodePoints === maxCodePoints) {
      return { preview: collapsed.slice(0, keptUnits), truncated: true };
    }
    keptCodePoints += 1;
    keptUnits += codePoint.length;
  }
  return { preview: collapsed, truncated: false };
}
|
|
121
156
|
//#endregion
|
|
122
157
|
//#region src/detector/sections.ts
|
|
123
158
|
function normalizeText(value) {
|
|
@@ -186,6 +221,7 @@ const LATIN_WASM_MIN_CONFIDENCE = .75;
|
|
|
186
221
|
const HANI_WASM_MIN_CONFIDENCE = .9;
|
|
187
222
|
const LATIN_SCRIPT_REGEX = /\p{Script=Latin}/u;
|
|
188
223
|
const HAN_SCRIPT_REGEX = /\p{Script=Han}/u;
|
|
224
|
+
const LATIN_WORD_REGEX = /\p{Script=Latin}+/gu;
|
|
189
225
|
const DETECTOR_ROUTE_POLICIES = {
|
|
190
226
|
[require_markdown.DEFAULT_LOCALE]: {
|
|
191
227
|
routeTag: require_markdown.DEFAULT_LOCALE,
|
|
@@ -209,10 +245,6 @@ function countScriptBearingCharsForRoute(text, routeTag) {
|
|
|
209
245
|
for (const char of text) if (matcher.test(char)) count += 1;
|
|
210
246
|
return count;
|
|
211
247
|
}
|
|
212
|
-
function shouldRunWasmDetector(text, routeTag) {
|
|
213
|
-
const policy = DETECTOR_ROUTE_POLICIES[routeTag];
|
|
214
|
-
return countScriptBearingCharsForRoute(text, routeTag) >= policy.minScriptChars;
|
|
215
|
-
}
|
|
216
248
|
function normalizeDetectorSampleForRoute(text, routeTag) {
|
|
217
249
|
const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
|
|
218
250
|
return [...text].map((char) => {
|
|
@@ -221,6 +253,57 @@ function normalizeDetectorSampleForRoute(text, routeTag) {
|
|
|
221
253
|
return " ";
|
|
222
254
|
}).join("").replace(/\s+/g, " ").trim();
|
|
223
255
|
}
|
|
256
|
+
/**
 * Count the matches of the module-level `LATIN_WORD_REGEX` in `text`.
 *
 * @param {string} text - Text to scan.
 * @returns {number} Number of matches, or 0 when there is none.
 */
function countLatinWords(text) {
  const latinRuns = text.match(LATIN_WORD_REGEX);
  if (!latinRuns) return 0;
  return latinRuns.length;
}
|
|
259
|
+
/**
 * Heuristically decide whether a line looks like technical noise rather than prose.
 *
 * A trimmed, non-empty line is technical-like when it:
 * - starts with `>`, `#` or `$`;
 * - contains a `--flag` style token;
 * - contains a backtick-delimited inline span;
 * - contains a token shaped like a file name or path with a 1-6 character extension;
 * - or has a `key: value` shape and at most 8 Latin words.
 *
 * @param {string} line - Line to classify (trimmed internally).
 * @param {number} latinWords - Latin word count of the line, used only by the `key: value` rule.
 * @returns {boolean} `true` when the line matches any rule above.
 */
function isTechnicalLikeLatinLine(line, latinWords) {
  const candidate = line.trim();
  if (candidate.length === 0) return false;
  // Rules that apply regardless of how many words the line has.
  const unconditionalMarkers = [
    /^[>#$]/u,
    /(^|\s)--[a-z0-9][a-z0-9-]*/iu,
    /`[^`]+`/u,
    /(^|[\s"'`])(?:\.{0,2}\/|\/)?[\w./-]+\.[a-z0-9]{1,6}(?=$|[\s"'`])/iu
  ];
  if (unconditionalMarkers.some((marker) => marker.test(candidate))) return true;
  // "key: value" only counts as technical for short lines.
  const looksLikeKeyValue = /^[\-\*\d.)\s]*[\p{L}\p{N}_.-]+:\s+\S/iu.test(candidate);
  return looksLikeKeyValue && latinWords <= 8;
}
|
|
269
|
+
/**
 * Decide whether a block of consecutive prose lines is sentence-like.
 *
 * Blocks with fewer than 4 Latin words never qualify. Blocks containing
 * sentence punctuation qualify from 4 words. Without punctuation, a block of
 * at most one line needs 5 words and a longer block needs 8.
 *
 * @param {number} latinWords - Latin word count of the block.
 * @param {number} lineCount - Number of lines in the block.
 * @param {boolean} hasSentencePunctuation - Whether any line contained sentence punctuation.
 * @returns {boolean} `true` when the block counts as prose.
 */
function shouldTreatLatinProseBlockAsSentenceLike(latinWords, lineCount, hasSentencePunctuation) {
  if (latinWords < 4) return false;
  if (hasSentencePunctuation) return true;
  const requiredWords = lineCount <= 1 ? 5 : 8;
  return latinWords >= requiredWords;
}
|
|
274
|
+
/**
 * Quality gate for Latin detector windows: accept only prose-dominated text.
 *
 * The normalized sample must contain at least 4 Latin words. The raw text is
 * then scanned line by line. Technical-like lines add their words to a
 * technical total. Other lines with Latin words accumulate into a prose
 * block, which is closed by an empty line, `---`, a bare code fence, or a
 * technical-like line. A closed block contributes its words to the prose
 * total only when it is sentence-like. The window passes when the prose total
 * is at least 4 and not smaller than the technical total.
 *
 * @param {string} text - Raw window text (may span several lines).
 * @param {string} normalizedSample - Normalized detector sample for the same window.
 * @returns {boolean} `true` when the window may be trusted to the detector.
 */
function shouldAcceptLatinDetectorWindow(text, normalizedSample) {
  if (countLatinWords(normalizedSample) < 4) return false;
  const totals = { prose: 0, technical: 0 };
  let block = { words: 0, lines: 0, punctuated: false };
  // Commit the pending prose block (if it qualifies) and start a fresh one.
  const closeBlock = () => {
    if (shouldTreatLatinProseBlockAsSentenceLike(block.words, block.lines, block.punctuated)) {
      totals.prose += block.words;
    }
    block = { words: 0, lines: 0, punctuated: false };
  };
  for (const rawLine of text.split(/\r?\n/u)) {
    const line = rawLine.trim();
    const isSeparator = line === "" || line === "---" || line === "```";
    if (isSeparator) {
      closeBlock();
      continue;
    }
    const wordCount = countLatinWords(line);
    // Lines without Latin words neither extend nor close the current block.
    if (wordCount === 0) continue;
    if (isTechnicalLikeLatinLine(line, wordCount)) {
      closeBlock();
      totals.technical += wordCount;
    } else {
      block.words += wordCount;
      block.lines += 1;
      block.punctuated = block.punctuated || /[.!?]/u.test(line);
    }
  }
  closeBlock();
  return totals.prose >= 4 && totals.prose >= totals.technical;
}
|
|
224
307
|
//#endregion
|
|
225
308
|
//#region src/detector/whatlang-wasm.ts
|
|
226
309
|
const GENERATED_FOLDER_NAME = "wasm-language-detector";
|
|
@@ -304,12 +387,142 @@ function getDetectorFallbackTag(routeTag) {
|
|
|
304
387
|
}
|
|
305
388
|
//#endregion
|
|
306
389
|
//#region src/detector/wasm.ts
|
|
390
|
+
/**
 * Copy `options` with every Latin hint input switched off.
 *
 * The explicit Latin hints and custom hint rules are set to `undefined` and
 * default Latin hints are disabled. The input object is not mutated.
 *
 * @param {object} options - Caller-supplied segmentation options.
 * @returns {object} Shallow copy of `options` with the Latin hint fields overridden.
 */
function createDeferredLatinPreSegmentOptions(options) {
  const latinHintsDisabled = {
    latinLanguageHint: void 0,
    latinTagHint: void 0,
    latinLocaleHint: void 0,
    latinHintRules: void 0,
    useDefaultLatinHints: false
  };
  return { ...options, ...latinHintsDisabled };
}
|
|
400
|
+
/**
 * Copy `options` with the explicit Latin hints cleared.
 *
 * Only `latinLanguageHint`, `latinTagHint` and `latinLocaleHint` are set to
 * `undefined`; all other fields (including `latinHintRules` and
 * `useDefaultLatinHints`) pass through untouched. The input is not mutated.
 *
 * @param {object} options - Caller-supplied segmentation options.
 * @returns {object} Shallow copy of `options` without explicit Latin hints.
 */
function createRuleOnlyLatinOptions(options) {
  const explicitHintsCleared = {
    latinLanguageHint: void 0,
    latinTagHint: void 0,
    latinLocaleHint: void 0
  };
  return { ...options, ...explicitHintsCleared };
}
|
|
408
|
+
/**
 * Merge runs of neighbouring chunks that share the same locale.
 *
 * Chunks are compared with their immediate predecessor only; equal locales
 * are fused into a new `{ locale, text }` object whose text is the
 * concatenation. Chunks that are not merged are passed through by reference.
 * An empty input is returned as-is.
 *
 * @param {Array<{locale: string, text: string}>} chunks - Ordered chunks.
 * @returns {Array<{locale: string, text: string}>} Chunks with adjacent duplicates merged.
 */
function mergeAdjacentChunks(chunks) {
  if (chunks.length === 0) return chunks;
  const [first, ...remaining] = chunks;
  const merged = [];
  let pending = first;
  for (const next of remaining) {
    if (next.locale !== pending.locale) {
      // Locale changed: the pending run is complete.
      merged.push(pending);
      pending = next;
      continue;
    }
    pending = {
      locale: pending.locale,
      text: pending.text + next.text
    };
  }
  merged.push(pending);
  return merged;
}
|
|
427
|
+
/**
 * Re-segment every chunk still tagged "und-Latn" with the given options.
 *
 * Chunks with any other locale pass through unchanged. Each "und-Latn" chunk
 * is replaced by the chunks `segmentTextByLocale` produces for its text, and
 * adjacent chunks with equal locales are merged afterwards.
 *
 * @param {Array<{locale: string, text: string}>} chunks - Chunks after detector resolution.
 * @param {object} options - Options forwarded to `segmentTextByLocale`.
 * @returns {Array<{locale: string, text: string}>} Relabeled, merged chunks.
 */
function reapplyDeferredLatinFallback(chunks, options) {
  const relabeled = chunks.flatMap((chunk) => {
    if (chunk.locale !== "und-Latn") return [chunk];
    return require_markdown.segmentTextByLocale(chunk.text, options);
  });
  return mergeAdjacentChunks(relabeled);
}
|
|
438
|
+
/**
 * Re-run Latin hint rules on chunks the detector relabeled away from "und-Latn".
 *
 * Chunks are paired by index with their pre-detector originals. Pairs with a
 * missing member are dropped. A chunk is passed through untouched unless its
 * original was "und-Latn" and the resolved locale is something else. In that
 * case its text is re-segmented with the explicit Latin hints cleared, and
 * any piece that comes back as "und-Latn" keeps the detector's locale.
 * Adjacent chunks with equal locales are merged at the end.
 *
 * @param {Array<{locale: string, text: string}>} resolvedChunks - Chunks after detector resolution.
 * @param {Array<{locale: string, text: string}>} originalChunks - Chunks before detector resolution, index-aligned.
 * @param {object} options - Caller-supplied segmentation options.
 * @returns {Array<{locale: string, text: string}>} Relabeled, merged chunks.
 */
function reapplyResolvedLatinHintRules(resolvedChunks, originalChunks, options) {
  const ruleOnlyOptions = createRuleOnlyLatinOptions(options);
  const relabeled = [];
  for (const [index, resolvedChunk] of resolvedChunks.entries()) {
    const sourceChunk = originalChunks[index];
    if (!resolvedChunk || !sourceChunk) continue;
    const detectorRelabeledLatin = sourceChunk.locale === "und-Latn" && resolvedChunk.locale !== "und-Latn";
    if (!detectorRelabeledLatin) {
      relabeled.push(resolvedChunk);
      continue;
    }
    const pieces = require_markdown.segmentTextByLocale(resolvedChunk.text, ruleOnlyOptions);
    for (const piece of pieces) {
      relabeled.push({
        // Pieces the rules leave unresolved keep the detector's locale.
        locale: piece.locale === "und-Latn" ? resolvedChunk.locale : piece.locale,
        text: piece.text
      });
    }
  }
  return mergeAdjacentChunks(relabeled);
}
|
|
307
457
|
/**
 * Check a detector result against the acceptance policy of its route.
 *
 * Rejects when the route policy requires a reliable result and `reliable` is
 * not exactly `true`, or when `confidence` is `undefined`. Otherwise accepts
 * when `confidence` reaches the policy's `minConfidence`.
 *
 * @param {string} routeTag - Key into `DETECTOR_ROUTE_POLICIES`.
 * @param {number} [confidence] - Detector confidence.
 * @param {boolean} [reliable] - Detector reliability flag.
 * @returns {boolean} `true` when the detector tag may be accepted.
 */
function shouldAcceptDetectorTag(routeTag, confidence, reliable) {
  const { requireReliable, minConfidence } = DETECTOR_ROUTE_POLICIES[routeTag];
  const reliabilityMissing = requireReliable && reliable !== true;
  if (reliabilityMissing || confidence === void 0) return false;
  return confidence >= minConfidence;
}
|
|
463
|
+
/**
 * Work out which tag(s) a fallen-back window ends up with, for debug output.
 *
 * Non-Latin routes simply report the route's fallback tag. For "und-Latn" the
 * window text is run through the deferred Latin fallback. A single resulting
 * locale becomes `finalTag`. Several locales keep the fallback tag as
 * `finalTag` and are listed in `finalLocales`.
 *
 * @param {{routeTag: string, text: string}} window - Detector window that fell back.
 * @param {object} options - Options forwarded to the deferred Latin fallback.
 * @returns {{finalTag: string, finalLocales?: string[]}} Post-fallback outcome.
 */
function resolveFallbackDebugOutcome(window, options) {
  const fallbackTag = getDetectorFallbackTag(window.routeTag);
  if (window.routeTag !== "und-Latn") return { finalTag: fallbackTag };
  const fallbackChunks = reapplyDeferredLatinFallback([{
    locale: fallbackTag,
    text: window.text
  }], options);
  const finalLocales = fallbackChunks.map((chunk) => chunk.locale);
  switch (finalLocales.length) {
    case 0:
      return { finalTag: fallbackTag };
    case 1:
      return { finalTag: finalLocales[0] };
    default:
      // Split relabel: report the route fallback plus every resulting locale.
      return { finalTag: fallbackTag, finalLocales };
  }
}
|
|
476
|
+
/**
 * Shape one detector result for the evidence payload.
 *
 * Missing results and missing fields are reported as `null`; falsy but
 * present values such as `0` or `false` are kept. `remappedTag` is passed
 * through unchanged.
 *
 * @param {?object} result - Detector result with optional `lang`, `script`, `confidence`, `reliable`.
 * @param {?string} remappedTag - Tag the result was remapped to, if any.
 * @returns {{lang: *, script: *, confidence: *, reliable: *, remappedTag: *}} Evidence sample.
 */
function buildEvidenceSample(result, remappedTag) {
  const { lang, script, confidence, reliable } = result ?? {};
  return {
    lang: lang ?? null,
    script: script ?? null,
    confidence: confidence ?? null,
    reliable: reliable ?? null,
    remappedTag
  };
}
|
|
485
|
+
/**
 * Emit one "detector.window.evidence" debug event for a detector window.
 *
 * Does nothing unless `debug.evidence` is set and `debug.emit` exists. The
 * payload always carries the window position, sample statistics, both
 * detector samples and the decision. In "verbose" evidence mode the full raw
 * and normalized texts are attached; otherwise bounded previews with
 * truncation flags are attached and the event is emitted as "compact".
 *
 * @param {object} params
 * @param {{routeTag: string, text: string, startIndex: number, endIndex: number}} params.window - Window under evaluation.
 * @param {number} params.windowIndex - Index of the window.
 * @param {string} params.normalizedSample - Normalized detector sample.
 * @param {boolean} params.eligible - Whether the window met the script-char minimum.
 * @param {boolean} params.qualityGate - Quality gate result for the window.
 * @param {?object} params.rawResult - Detector result for the raw text.
 * @param {?string} params.rawRemappedTag - Remapped tag of the raw result.
 * @param {?object} params.normalizedResult - Detector result for the normalized sample.
 * @param {?string} params.normalizedRemappedTag - Remapped tag of the normalized result.
 * @param {object} params.decision - Decision metadata to report.
 * @param {object} [params.debug] - Debug channel with `evidence` settings and `emit`.
 * @returns {void}
 */
function emitDetectorWindowEvidence({ window, windowIndex, normalizedSample, eligible, qualityGate, rawResult, rawRemappedTag, normalizedResult, normalizedRemappedTag, decision, debug }) {
  const evidence = debug?.evidence;
  if (!evidence || !debug.emit) return;
  const { routeTag, text, startIndex, endIndex } = window;
  const payload = {
    engine: "whatlang-wasm",
    routeTag,
    windowIndex,
    startIndex,
    endIndex,
    mode: evidence.mode,
    section: evidence.section,
    textLength: text.length,
    normalizedLength: normalizedSample.length,
    normalizedApplied: normalizedSample !== text,
    scriptChars: countScriptBearingCharsForRoute(text, routeTag),
    minScriptChars: DETECTOR_ROUTE_POLICIES[routeTag].minScriptChars,
    eligible,
    qualityGate,
    raw: buildEvidenceSample(rawResult, rawRemappedTag),
    normalized: buildEvidenceSample(normalizedResult, normalizedRemappedTag),
    decision
  };
  const verbosity = evidence.verbosity === "verbose" ? "verbose" : "compact";
  if (verbosity === "verbose") {
    payload.text = text;
    payload.normalizedText = normalizedSample;
  } else {
    const rawPreview = createDetectorEvidencePreview(text);
    const samplePreview = createDetectorEvidencePreview(normalizedSample);
    payload.textPreview = rawPreview.preview;
    payload.textPreviewTruncated = rawPreview.truncated;
    payload.normalizedPreview = samplePreview.preview;
    payload.normalizedPreviewTruncated = samplePreview.truncated;
  }
  debug.emit("detector.window.evidence", payload, { verbosity });
}
|
|
313
526
|
function buildDetectorWindows(chunks) {
|
|
314
527
|
const windows = [];
|
|
315
528
|
for (let index = 0; index < chunks.length; index += 1) {
|
|
@@ -330,31 +543,231 @@ function buildDetectorWindows(chunks) {
|
|
|
330
543
|
}
|
|
331
544
|
return windows;
|
|
332
545
|
}
|
|
333
|
-
async function resolveWindowLocale(window) {
|
|
334
|
-
|
|
546
|
+
/**
 * Resolve the locale tag for one detector window, reporting debug diagnostics.
 *
 * Decision order:
 * 1. Windows below the route's script-char minimum fall back ("notEligible")
 *    without running the detector.
 * 2. The detector runs on the raw text and, when it differs and is non-empty,
 *    on the normalized sample. No usable result falls back ("noCandidate").
 * 3. The highest-confidence candidate is accepted ("reliable") when the
 *    quality gate passes and the route policy accepts it.
 * 4. For "und-Latn" only: when raw and normalized results agree on a tag with
 *    confidence >= 0.7, the tag is accepted ("corroborated") if either result
 *    is reliable, otherwise the window falls back ("corroborationUnreliable").
 * 5. Everything else falls back ("belowThreshold", or "qualityGate" when the
 *    quality gate failed).
 *
 * The post-fallback outcome is needed only for debug events, so it is
 * computed only when `debug.emit` exists.
 *
 * @param {{routeTag: string, text: string, startIndex: number, endIndex: number}} window - Window to resolve.
 * @param {number} windowIndex - Index of the window, reported in evidence.
 * @param {object} options - Segmentation options, used for the debug fallback outcome.
 * @param {object} [debug] - Optional debug channel (`summary`, `emit`, `evidence`).
 * @returns {Promise<string>} The accepted detector tag or the fallback tag.
 */
async function resolveWindowLocale(window, windowIndex, options, debug) {
  const CORROBORATED_MIN_CONFIDENCE = .7;
  const { routeTag } = window;
  recordDetectorWindow(debug?.summary, routeTag);
  debug?.emit?.("detector.window.start", {
    routeTag,
    startIndex: window.startIndex,
    endIndex: window.endIndex,
    textLength: window.text.length
  }, { verbosity: "verbose" });
  const routePolicy = DETECTOR_ROUTE_POLICIES[routeTag];
  const eligible = countScriptBearingCharsForRoute(window.text, routeTag) >= routePolicy.minScriptChars;
  const normalizedSample = normalizeDetectorSampleForRoute(window.text, routeTag);
  const passesLatinQualityGate = routeTag !== "und-Latn" || shouldAcceptLatinDetectorWindow(window.text, normalizedSample);

  // Emit the evidence event for this window with the given samples and decision.
  const emitEvidence = (samples, decision) => emitDetectorWindowEvidence({
    window,
    windowIndex,
    normalizedSample,
    eligible,
    qualityGate: passesLatinQualityGate,
    rawResult: samples.rawResult,
    rawRemappedTag: samples.rawRemappedTag,
    normalizedResult: samples.normalizedResult,
    normalizedRemappedTag: samples.normalizedRemappedTag,
    decision,
    debug
  });

  // Count a fallback and emit its evidence and fallback events.
  const reportFallback = (reason, samples) => {
    recordDetectorFallback(debug?.summary, reason);
    // Without an emitter nothing below is observable; skip the re-segmentation.
    if (!debug?.emit) return;
    const { finalTag, finalLocales } = resolveFallbackDebugOutcome(window, options);
    const outcome = finalLocales ? { finalTag, finalLocales } : { finalTag };
    emitEvidence(samples, {
      accepted: false,
      path: null,
      ...outcome,
      fallbackReason: reason
    });
    debug.emit("detector.window.fallback", {
      routeTag,
      ...outcome,
      reason
    });
  };

  // Count an acceptance, emit its evidence and accepted events, return the tag.
  const reportAccepted = (path, samples, finalTag, confidence, reliable) => {
    recordDetectorAccepted(debug?.summary, path);
    emitEvidence(samples, {
      accepted: true,
      path,
      finalTag,
      fallbackReason: null
    });
    debug?.emit?.("detector.window.accepted", {
      routeTag,
      finalTag,
      acceptancePath: path,
      confidence,
      reliable
    });
    return finalTag;
  };

  if (!eligible) {
    reportFallback("notEligible", {
      rawResult: null,
      rawRemappedTag: null,
      normalizedResult: null,
      normalizedRemappedTag: null
    });
    // NOTE(review): this path returns the route tag itself, while every other
    // fallback returns getDetectorFallbackTag(routeTag). Kept as-is; confirm
    // the two are always equal.
    return routeTag;
  }

  const rawResult = await detectWithWhatlangWasm(window.text, routeTag);
  const rawRemapped = rawResult ? remapWhatlangResult(rawResult, routeTag) : null;
  const normalizedApplied = normalizedSample.length > 0 && normalizedSample !== window.text;
  const normalizedResult = normalizedApplied ? await detectWithWhatlangWasm(normalizedSample, routeTag) : null;
  debug?.emit?.("detector.window.sample", {
    routeTag,
    normalizedApplied,
    normalizedLength: normalizedSample.length,
    qualityGate: passesLatinQualityGate,
    rawTag: rawRemapped?.tag ?? null,
    rawConfidence: rawRemapped?.confidence ?? null,
    rawReliable: rawRemapped?.reliable ?? null
  }, { verbosity: "verbose" });
  const normalizedRemapped = normalizedResult ? remapWhatlangResult(normalizedResult, routeTag) : null;
  debug?.emit?.("detector.window.candidates", {
    routeTag,
    normalizedTag: normalizedRemapped?.tag ?? null,
    normalizedConfidence: normalizedRemapped?.confidence ?? null,
    normalizedReliable: normalizedRemapped?.reliable ?? null
  }, { verbosity: "verbose" });

  const samples = {
    rawResult,
    rawRemappedTag: rawRemapped?.tag ?? null,
    normalizedResult,
    normalizedRemappedTag: normalizedRemapped?.tag ?? null
  };

  const candidates = [rawRemapped, normalizedRemapped].filter((value) => value !== null);
  if (candidates.length === 0) {
    reportFallback("noCandidate", samples);
    return getDetectorFallbackTag(routeTag);
  }

  // Ties keep the earlier candidate, i.e. the raw result wins over the normalized one.
  const strongestCandidate = candidates.reduce(
    (best, current) => (current.confidence ?? 0) > (best.confidence ?? 0) ? current : best,
    candidates[0]
  );
  if (passesLatinQualityGate && shouldAcceptDetectorTag(routeTag, strongestCandidate.confidence, strongestCandidate.reliable)) {
    return reportAccepted(
      "reliable",
      samples,
      strongestCandidate.tag,
      strongestCandidate.confidence ?? null,
      strongestCandidate.reliable ?? null
    );
  }

  const samplesAgree = rawRemapped !== null && normalizedRemapped !== null && rawRemapped.tag === normalizedRemapped.tag;
  if (routeTag === "und-Latn" && passesLatinQualityGate && samplesAgree) {
    const corroboratedConfidence = Math.max(rawRemapped.confidence ?? 0, normalizedRemapped.confidence ?? 0);
    const hasReliableCorroboration = rawRemapped.reliable === true || normalizedRemapped.reliable === true;
    if (corroboratedConfidence >= CORROBORATED_MIN_CONFIDENCE) {
      if (hasReliableCorroboration) {
        return reportAccepted("corroborated", samples, rawRemapped.tag, corroboratedConfidence, hasReliableCorroboration);
      }
      reportFallback("corroborationUnreliable", samples);
      return getDetectorFallbackTag(routeTag);
    }
  }

  reportFallback(passesLatinQualityGate ? "belowThreshold" : "qualityGate", samples);
  return getDetectorFallbackTag(routeTag);
}
|
|
352
764
|
async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
|
|
353
|
-
|
|
765
|
+
require_markdown.resolveLocaleDetectContext(options);
|
|
766
|
+
const chunks = require_markdown.segmentTextByLocale(text, createDeferredLatinPreSegmentOptions(options));
|
|
354
767
|
const resolved = [...chunks];
|
|
355
768
|
const windows = buildDetectorWindows(chunks);
|
|
356
|
-
for (const window of windows) {
|
|
357
|
-
const resolvedLocale = await resolveWindowLocale(window);
|
|
769
|
+
for (const [windowIndex, window] of windows.entries()) {
|
|
770
|
+
const resolvedLocale = await resolveWindowLocale(window, windowIndex, options, options.detectorDebug);
|
|
358
771
|
for (let index = window.startIndex; index <= window.endIndex; index += 1) {
|
|
359
772
|
const chunk = resolved[index];
|
|
360
773
|
if (!chunk) continue;
|
|
@@ -364,7 +777,8 @@ async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
|
|
|
364
777
|
};
|
|
365
778
|
}
|
|
366
779
|
}
|
|
367
|
-
|
|
780
|
+
options.detectorDebug?.emit?.("detector.summary", options.detectorDebug.summary, { verbosity: "compact" });
|
|
781
|
+
return reapplyDeferredLatinFallback(reapplyResolvedLatinHintRules(resolved, chunks, options), options);
|
|
368
782
|
}
|
|
369
783
|
async function wordCounterWithWasmDetector(text, options = {}) {
|
|
370
784
|
return buildWordCounterResultFromChunks(await segmentTextByLocaleWithWasmDetector(text, options), options);
|