@dev-pi2pie/word-counter 0.1.5-canary.2 → 0.1.5-canary.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -109,8 +109,10 @@ Detector mode notes:
109
109
  - `--detector wasm` only runs for ambiguous `und-Latn` and `und-Hani` chunks.
110
110
  - `--detector regex` keeps the original script/regex chunk-first detection path.
111
111
  - `--detector wasm` uses a detector-oriented ambiguous-window scoring pass before accepted tags are projected back onto the counting chunks.
112
+ - In `--detector wasm` mode, Latin hint rules and explicit Latin hint flags are deferred until after detector evaluation and only relabel unresolved `und-Latn` output.
112
113
  - Very short chunks stay on the original `und-*` fallback.
113
114
  - Low-confidence or unsupported detector results fall back to `und-*`.
115
+ - Technical-noise-heavy Latin windows stay conservative and may remain `und-Latn` even when the detector produces a wrong-but-confident language guess.
114
116
 
115
117
  Collect non-words (emoji/symbols/punctuation):
116
118
 
@@ -285,14 +287,24 @@ word-counter --path ./examples/test-case-multi-files-support --debug --verbose
285
287
 
286
288
  Use `--debug-report [path]` to route debug diagnostics to a JSONL report file:
287
289
 
288
- - no path: writes to current working directory with pattern `wc-debug-YYYYMMDD-HHmmss-<pid>.jsonl`
290
+ - no path: writes to current working directory with pattern `wc-debug-YYYYMMDD-HHmmss-utc-<pid>.jsonl`
291
+ - no path with `--detector-evidence`: writes with pattern `wc-detector-evidence-YYYYMMDD-HHmmss-utc-<pid>.jsonl`
289
292
  - path provided: writes to the specified location
290
293
  - default-name collision handling: appends `-<n>` suffix to avoid overwriting existing files
291
294
  - explicit path validation: existing directories are rejected (explicit paths are treated as file targets)
295
+ - compatibility note: the autogenerated filename moved from the older local-time pattern to the new UTC `...-utc-...jsonl` pattern
292
296
 
293
297
  By default with `--debug-report`, debug lines are file-only (not mirrored to terminal).
294
298
  Use `--debug-report-tee` (alias: `--debug-tee`) to mirror to both file and `stderr`.
295
- Flag dependencies: `--verbose` requires `--debug`; `--debug-report` requires `--debug`; `--debug-report-tee`/`--debug-tee` requires `--debug-report`.
299
+ Flag dependencies: `--verbose` requires `--debug`; `--detector-evidence` requires `--debug` and `--detector wasm`; `--debug-report` requires `--debug`; `--debug-report-tee`/`--debug-tee` requires `--debug-report`.
300
+
301
+ Use `--detector-evidence` to add per-window detector evidence onto the same debug stream:
302
+
303
+ - only meaningful with `--detector wasm`
304
+ - compact mode emits bounded single-line previews plus detector decision metadata
305
+ - verbose mode emits full raw detector windows and full normalized samples
306
+ - evidence remains detector-window based even when output mode changes to `collector`, `char`, or another counting mode
307
+ - fallback evidence reports the post-fallback final tag used by downstream counting output; in rare split-relabel cases it may also include `finalLocales`
296
308
 
297
309
  Examples:
298
310
 
@@ -301,17 +313,26 @@ word-counter --path ./examples/test-case-multi-files-support --debug --debug-rep
301
313
  word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl
302
314
  word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-report-tee
303
315
  word-counter --path ./examples/test-case-multi-files-support --debug --debug-report ./logs/debug.jsonl --debug-tee
316
+ word-counter --detector wasm --debug --detector-evidence "This sentence should clearly be detected as English for the wasm detector path."
317
+ word-counter --detector wasm --debug --verbose --detector-evidence "This sentence should clearly be detected as English for the wasm detector path."
318
+ word-counter --detector wasm --debug --detector-evidence --debug-report
304
319
  ```
305
320
 
306
321
  Skip details stay debug-gated and can be suppressed with `--quiet-skips`.
307
322
 
323
+ When `--format json` is combined with `--debug`, debug-only diagnostics are emitted under `debug.*`:
324
+
325
+ - single input and merged batch may include `debug.detector`
326
+ - per-file batch may include `debug.skipped`, `debug.detector`, and per-entry `files[i].debug.detector`
327
+ - per-file top-level `skipped` is still emitted temporarily for compatibility
328
+
308
329
  ## How It Works
309
330
 
310
331
  - The runtime inspects each character's Unicode script to infer its likely locale tag (e.g., `und-Latn`, `und-Hani`, `ja`).
311
332
  - Adjacent characters that share the same locale tag are grouped into a chunk.
312
333
  - Each chunk is counted with `Intl.Segmenter` at `granularity: "word"`, caching segmenters to avoid re-instantiation.
313
334
  - Per-locale counts are summed into an overall total and printed to stdout.
314
- - With `--detector wasm`, ambiguous `und-Latn` and `und-Hani` chunks can be relabeled through the optional WASM detector before counting.
335
+ - With `--detector wasm`, ambiguous `und-Latn` and `und-Hani` chunks can be relabeled through the optional WASM detector before counting; unresolved `und-Latn` chunks then fall back to the existing Latin hint rules and explicit Latin hint precedence.
315
336
 
316
337
  ## Locale vs Language Code
317
338
 
@@ -696,6 +717,7 @@ Example JSON (trimmed):
696
717
  - Detection is regex/script based, not statistical language-ID.
697
718
  - Ambiguous Latin defaults to `und-Latn`; Han fallback defaults to `und-Hani`.
698
719
  - `--detector wasm` is optional and conservative; it only runs for ambiguous chunks that meet minimum script-bearing length thresholds.
720
+ - In `--detector wasm` mode, ambiguous Latin stays on `und-Latn` for detector eligibility first, then built-in/custom Latin rules and explicit Latin hints are applied only if the detector leaves that chunk unresolved.
699
721
  - The first (and current) WASM engine is `whatlang`; its results are remapped into this package's public tags.
700
722
  - The npm package ships one portable WASM artifact; users do not install per-OS detector packages.
701
723
  - Use explicit tag and hint flags when you need deterministic tagging.
@@ -118,6 +118,41 @@ function buildWordCounterResultFromChunks(chunks, options = {}) {
118
118
  }
119
119
  };
120
120
  }
121
/**
 * Counts one detector window in the debug summary and buckets it by route.
 *
 * Does nothing when no summary object is supplied. Route tags other than
 * "und-Latn" and "und-Hani" only increase the overall window total.
 *
 * @param {object|null|undefined} summary Mutable summary with `windowsTotal` and `routes.{latin,han}` counters.
 * @param {string} routeTag Route tag of the window being recorded.
 * @returns {void}
 */
function recordDetectorWindow(summary, routeTag) {
  if (summary) {
    summary.windowsTotal += 1;
    switch (routeTag) {
      case "und-Latn":
        summary.routes.latin += 1;
        break;
      case "und-Hani":
        summary.routes.han += 1;
        break;
      default:
        break;
    }
  }
}
130
/**
 * Counts one accepted detector decision in the debug summary.
 *
 * Does nothing when no summary object is supplied. Any path other than
 * "reliable" is counted under the "corroborated" bucket.
 *
 * @param {object|null|undefined} summary Mutable summary with `accepted` and `acceptancePaths.{reliable,corroborated}` counters.
 * @param {string} path Acceptance path that produced the decision.
 * @returns {void}
 */
function recordDetectorAccepted(summary, path) {
  if (!summary) return;
  summary.accepted += 1;
  const bucket = path === "reliable" ? "reliable" : "corroborated";
  summary.acceptancePaths[bucket] += 1;
}
139
/**
 * Counts one fallback decision in the debug summary, keyed by its reason.
 *
 * Does nothing when no summary object is supplied. A reason that has no
 * counter yet starts from zero, so an unseeded key yields 1 instead of NaN.
 *
 * @param {object|null|undefined} summary Mutable summary with a `fallback` counter and a `fallbackReasons` map.
 * @param {string} reason Fallback reason key (for example "notEligible" or "qualityGate").
 * @returns {void}
 */
function recordDetectorFallback(summary, reason) {
  if (!summary) return;
  summary.fallback += 1;
  // `undefined + 1` is NaN; default missing reason counters to 0 first.
  summary.fallbackReasons[reason] = (summary.fallbackReasons[reason] ?? 0) + 1;
}
144
/**
 * Builds a bounded single-line preview of a detector window.
 *
 * Whitespace runs are collapsed to one space and the result is trimmed.
 * The preview is limited by code points (not UTF-16 units), so surrogate
 * pairs are never split.
 *
 * @param {string} text Raw text to preview.
 * @param {number} [maxCodePoints=160] Maximum number of code points kept in the preview.
 * @returns {{preview: string, truncated: boolean}} The preview and whether anything was cut off.
 */
function createDetectorEvidencePreview(text, maxCodePoints = 160) {
  const collapsed = text.replace(/\s+/gu, " ").trim();
  // UTF-16 length is an upper bound on the code-point count, so short
  // strings can be returned without iterating them.
  if (collapsed.length <= maxCodePoints) {
    return { preview: collapsed, truncated: false };
  }
  // Walk only as many code points as the limit needs instead of
  // materialising the whole window as an array.
  let keptUnits = 0;
  let keptCodePoints = 0;
  for (const codePoint of collapsed) {
    if (keptCodePoints >= maxCodePoints) {
      return { preview: collapsed.slice(0, keptUnits), truncated: true };
    }
    keptUnits += codePoint.length;
    keptCodePoints += 1;
  }
  return { preview: collapsed, truncated: false };
}
121
156
  //#endregion
122
157
  //#region src/detector/sections.ts
123
158
  function normalizeText(value) {
@@ -186,6 +221,7 @@ const LATIN_WASM_MIN_CONFIDENCE = .75;
186
221
  const HANI_WASM_MIN_CONFIDENCE = .9;
187
222
  const LATIN_SCRIPT_REGEX = /\p{Script=Latin}/u;
188
223
  const HAN_SCRIPT_REGEX = /\p{Script=Han}/u;
224
+ const LATIN_WORD_REGEX = /\p{Script=Latin}+/gu;
189
225
  const DETECTOR_ROUTE_POLICIES = {
190
226
  [require_markdown.DEFAULT_LOCALE]: {
191
227
  routeTag: require_markdown.DEFAULT_LOCALE,
@@ -209,10 +245,6 @@ function countScriptBearingCharsForRoute(text, routeTag) {
209
245
  for (const char of text) if (matcher.test(char)) count += 1;
210
246
  return count;
211
247
  }
212
- function shouldRunWasmDetector(text, routeTag) {
213
- const policy = DETECTOR_ROUTE_POLICIES[routeTag];
214
- return countScriptBearingCharsForRoute(text, routeTag) >= policy.minScriptChars;
215
- }
216
248
  function normalizeDetectorSampleForRoute(text, routeTag) {
217
249
  const matcher = routeTag === "und-Hani" ? HAN_SCRIPT_REGEX : LATIN_SCRIPT_REGEX;
218
250
  return [...text].map((char) => {
@@ -221,6 +253,57 @@ function normalizeDetectorSampleForRoute(text, routeTag) {
221
253
  return " ";
222
254
  }).join("").replace(/\s+/g, " ").trim();
223
255
  }
256
/**
 * Counts the runs of Latin-script characters in a string.
 *
 * @param {string} text Text to scan.
 * @returns {number} Number of matches of LATIN_WORD_REGEX, or 0 when there are none.
 */
function countLatinWords(text) {
  const latinRuns = text.match(LATIN_WORD_REGEX);
  return latinRuns === null || latinRuns === undefined ? 0 : latinRuns.length;
}
259
/**
 * Heuristically decides whether a line looks like technical content
 * rather than prose.
 *
 * A non-blank line is treated as technical when it starts with `>`, `#`
 * or `$`, contains a `--flag`-style token, contains an inline backtick
 * span, contains a token that looks like a file name with a short
 * extension, or looks like a `key: value` entry with at most 8 Latin words.
 *
 * @param {string} line Line to classify (trimmed internally).
 * @param {number} latinWords Number of Latin words already counted for the line.
 * @returns {boolean} True when the line matches one of the technical patterns.
 */
function isTechnicalLikeLatinLine(line, latinWords) {
  const candidate = line.trim();
  if (candidate.length === 0) return false;
  const technicalPatterns = [
    /^[>#$]/u,
    /(^|\s)--[a-z0-9][a-z0-9-]*/iu,
    /`[^`]+`/u,
    /(^|[\s"'`])(?:\.{0,2}\/|\/)?[\w./-]+\.[a-z0-9]{1,6}(?=$|[\s"'`])/iu
  ];
  if (technicalPatterns.some((pattern) => pattern.test(candidate))) return true;
  // `key: value` lines only count as technical while they stay short.
  const looksLikeKeyValue = /^[\-\*\d.)\s]*[\p{L}\p{N}_.-]+:\s+\S/iu.test(candidate);
  return looksLikeKeyValue && latinWords <= 8;
}
269
/**
 * Decides whether a block of consecutive prose lines is sentence-like.
 *
 * Blocks with fewer than 4 Latin words never qualify. Otherwise sentence
 * punctuation is sufficient; without it, a single-line block needs at
 * least 5 words and a multi-line block at least 8.
 *
 * @param {number} latinWords Latin words accumulated in the block.
 * @param {number} lineCount Number of lines in the block.
 * @param {boolean} hasSentencePunctuation Whether any line contained `.`, `!` or `?`.
 * @returns {boolean} True when the block should count as prose.
 */
function shouldTreatLatinProseBlockAsSentenceLike(latinWords, lineCount, hasSentencePunctuation) {
  if (latinWords < 4) return false;
  if (hasSentencePunctuation) return true;
  const requiredWords = lineCount <= 1 ? 5 : 8;
  return latinWords >= requiredWords;
}
274
/**
 * Quality gate for Latin detector windows: decides whether the window
 * contains enough sentence-like prose relative to technical lines.
 *
 * The window is rejected outright when the normalized sample has fewer
 * than 4 Latin words. Otherwise the raw text is scanned line by line:
 * blank lines, `---` and ``` lines close the current prose block,
 * technical-looking lines close it and add to the technical word count,
 * lines without Latin words are ignored, and everything else extends the
 * current prose block. A closed block only contributes its words when it
 * is sentence-like.
 *
 * @param {string} text Raw window text.
 * @param {string} normalizedSample Normalized detector sample for the same window.
 * @returns {boolean} True when prose words number at least 4 and are not outnumbered by technical words.
 */
function shouldAcceptLatinDetectorWindow(text, normalizedSample) {
  if (countLatinWords(normalizedSample) < 4) return false;
  const totals = { prose: 0, technical: 0 };
  const createEmptyBlock = () => ({ words: 0, lines: 0, hasSentencePunctuation: false });
  let block = createEmptyBlock();
  const closeBlock = () => {
    const sentenceLike = shouldTreatLatinProseBlockAsSentenceLike(block.words, block.lines, block.hasSentencePunctuation);
    if (sentenceLike) totals.prose += block.words;
    block = createEmptyBlock();
  };
  const trimmedLines = text.split(/\r?\n/u).map((rawLine) => rawLine.trim());
  for (const line of trimmedLines) {
    const isSeparator = line === "" || line === "---" || line === "```";
    if (isSeparator) {
      closeBlock();
      continue;
    }
    const latinWords = countLatinWords(line);
    // Lines without Latin words neither extend nor close the current block.
    if (latinWords === 0) continue;
    if (isTechnicalLikeLatinLine(line, latinWords)) {
      closeBlock();
      totals.technical += latinWords;
      continue;
    }
    block.words += latinWords;
    block.lines += 1;
    if (!block.hasSentencePunctuation) block.hasSentencePunctuation = /[.!?]/u.test(line);
  }
  closeBlock();
  return totals.prose >= 4 && totals.prose >= totals.technical;
}
224
307
  //#endregion
225
308
  //#region src/detector/whatlang-wasm.ts
226
309
  const GENERATED_FOLDER_NAME = "wasm-language-detector";
@@ -304,12 +387,142 @@ function getDetectorFallbackTag(routeTag) {
304
387
  }
305
388
  //#endregion
306
389
  //#region src/detector/wasm.ts
390
/**
 * Copies the options with every Latin hint input cleared and the default
 * Latin hints switched off, leaving all other options untouched.
 *
 * @param {object} options Caller-supplied options.
 * @returns {object} A new options object; the input is not mutated.
 */
function createDeferredLatinPreSegmentOptions(options) {
  const clearedLatinHints = {
    latinLanguageHint: undefined,
    latinTagHint: undefined,
    latinLocaleHint: undefined,
    latinHintRules: undefined,
    useDefaultLatinHints: false
  };
  return Object.assign({}, options, clearedLatinHints);
}
400
/**
 * Copies the options with the explicit Latin hints (language, tag and
 * locale) cleared, leaving hint rules and all other options untouched.
 *
 * @param {object} options Caller-supplied options.
 * @returns {object} A new options object; the input is not mutated.
 */
function createRuleOnlyLatinOptions(options) {
  const clearedExplicitHints = {
    latinLanguageHint: undefined,
    latinTagHint: undefined,
    latinLocaleHint: undefined
  };
  return Object.assign({}, options, clearedExplicitHints);
}
408
/**
 * Joins neighbouring chunks that share the same locale.
 *
 * An empty input array is returned as-is. Otherwise a new array is
 * built; merged entries are fresh `{ locale, text }` objects with the
 * texts concatenated in order, and unmerged entries are the original
 * chunk objects.
 *
 * @param {Array<{locale: string, text: string}>} chunks Chunks in document order.
 * @returns {Array<{locale: string, text: string}>} Chunks with adjacent same-locale runs merged.
 */
function mergeAdjacentChunks(chunks) {
  if (chunks.length === 0) return chunks;
  const merged = [];
  for (const chunk of chunks) {
    const lastIndex = merged.length - 1;
    if (lastIndex >= 0 && merged[lastIndex].locale === chunk.locale) {
      merged[lastIndex] = {
        locale: merged[lastIndex].locale,
        text: merged[lastIndex].text + chunk.text
      };
    } else {
      merged.push(chunk);
    }
  }
  return merged;
}
427
/**
 * Re-segments every chunk still tagged "und-Latn" with the given options
 * and merges adjacent same-locale chunks in the result.
 *
 * Chunks with any other locale pass through unchanged.
 *
 * @param {Array<{locale: string, text: string}>} chunks Chunks in document order.
 * @param {object} options Options forwarded to `segmentTextByLocale`.
 * @returns {Array<{locale: string, text: string}>} Relabelled and merged chunks.
 */
function reapplyDeferredLatinFallback(chunks, options) {
  const relabeled = chunks.flatMap((chunk) => {
    if (chunk.locale !== "und-Latn") return [chunk];
    return require_markdown.segmentTextByLocale(chunk.text, options);
  });
  return mergeAdjacentChunks(relabeled);
}
438
/**
 * Re-runs Latin hint rules over chunks that started as "und-Latn" and
 * were relabelled to another locale.
 *
 * Each such chunk is re-segmented with the explicit Latin hints cleared
 * (see `createRuleOnlyLatinOptions`). Pieces that come back as "und-Latn"
 * keep the chunk's resolved locale; other pieces take the locale from the
 * re-segmentation. All other chunks pass through unchanged. Index
 * positions where either array has no chunk are skipped. Adjacent
 * same-locale chunks are merged at the end.
 *
 * @param {Array<{locale: string, text: string}>} resolvedChunks Chunks after detector relabelling.
 * @param {Array<{locale: string, text: string}>} originalChunks Chunks before relabelling, index-aligned with `resolvedChunks`.
 * @param {object} options Caller-supplied options.
 * @returns {Array<{locale: string, text: string}>} Relabelled and merged chunks.
 */
function reapplyResolvedLatinHintRules(resolvedChunks, originalChunks, options) {
  const ruleOnlyOptions = createRuleOnlyLatinOptions(options);
  const relabeled = [];
  resolvedChunks.forEach((chunk, index) => {
    const originalChunk = originalChunks[index];
    if (!chunk || !originalChunk) return;
    const wasRelabeledFromLatin = originalChunk.locale === "und-Latn" && chunk.locale !== "und-Latn";
    if (!wasRelabeledFromLatin) {
      relabeled.push(chunk);
      return;
    }
    for (const hintedChunk of require_markdown.segmentTextByLocale(chunk.text, ruleOnlyOptions)) {
      relabeled.push({
        locale: hintedChunk.locale === "und-Latn" ? chunk.locale : hintedChunk.locale,
        text: hintedChunk.text
      });
    }
  });
  return mergeAdjacentChunks(relabeled);
}
307
457
  function shouldAcceptDetectorTag(routeTag, confidence, reliable) {
308
458
  const policy = DETECTOR_ROUTE_POLICIES[routeTag];
309
459
  if (policy.requireReliable && reliable !== true) return false;
310
460
  if (confidence === void 0) return false;
311
461
  return confidence >= policy.minConfidence;
312
462
  }
463
/**
 * Works out which tag(s) a window ends up with after a detector fallback,
 * for use in debug output.
 *
 * Non-Latin routes report the route's fallback tag. For "und-Latn" the
 * window text is passed through `reapplyDeferredLatinFallback`: a single
 * resulting locale becomes `finalTag`; several resulting locales keep the
 * fallback tag as `finalTag` and are listed in `finalLocales`.
 *
 * @param {{routeTag: string, text: string}} window Detector window.
 * @param {object} options Options forwarded to the Latin fallback pass.
 * @returns {{finalTag: string, finalLocales?: string[]}} Post-fallback tag information.
 */
function resolveFallbackDebugOutcome(window, options) {
  const fallbackTag = getDetectorFallbackTag(window.routeTag);
  if (window.routeTag !== "und-Latn") return { finalTag: fallbackTag };
  const seedChunk = { locale: fallbackTag, text: window.text };
  const finalLocales = reapplyDeferredLatinFallback([seedChunk], options).map(({ locale }) => locale);
  switch (finalLocales.length) {
    case 0:
      return { finalTag: fallbackTag };
    case 1:
      return { finalTag: finalLocales[0] };
    default:
      return { finalTag: fallbackTag, finalLocales };
  }
}
476
/**
 * Shapes one detector result for evidence output.
 *
 * Missing results and missing fields are reported as `null`.
 *
 * @param {object|null|undefined} result Detector result with optional `lang`, `script`, `confidence` and `reliable` fields.
 * @param {*} remappedTag Tag the result was remapped to; passed through unchanged.
 * @returns {{lang: *, script: *, confidence: *, reliable: *, remappedTag: *}} Evidence sample.
 */
function buildEvidenceSample(result, remappedTag) {
  const source = result ?? {};
  const fieldOrNull = (key) => source[key] ?? null;
  return {
    lang: fieldOrNull("lang"),
    script: fieldOrNull("script"),
    confidence: fieldOrNull("confidence"),
    reliable: fieldOrNull("reliable"),
    remappedTag
  };
}
485
/**
 * Emits one "detector.window.evidence" debug event for a detector window.
 *
 * Nothing is emitted unless `debug.evidence` and `debug.emit` are both
 * present. When `debug.evidence.verbosity` is "verbose" the event carries
 * the full window text and normalized sample; otherwise it carries
 * bounded previews plus their truncation flags.
 *
 * @param {object} details
 * @param {{routeTag: string, startIndex: number, endIndex: number, text: string}} details.window Detector window.
 * @param {number} details.windowIndex Position of the window in the window list.
 * @param {string} details.normalizedSample Normalized detector sample for the window.
 * @param {boolean} details.eligible Whether the window met the route's minimum script-character count.
 * @param {boolean} details.qualityGate Whether the window passed the Latin quality gate.
 * @param {object|null} details.rawResult Detector result for the raw text.
 * @param {string|null} details.rawRemappedTag Remapped tag for the raw result.
 * @param {object|null} details.normalizedResult Detector result for the normalized sample.
 * @param {string|null} details.normalizedRemappedTag Remapped tag for the normalized result.
 * @param {object} details.decision Decision metadata, included as-is.
 * @param {object|undefined} details.debug Debug sink with optional `evidence` settings and `emit` function.
 * @returns {void}
 */
function emitDetectorWindowEvidence({ window, windowIndex, normalizedSample, eligible, qualityGate, rawResult, rawRemappedTag, normalizedResult, normalizedRemappedTag, decision, debug }) {
  const evidence = debug?.evidence;
  if (!evidence || !debug.emit) return;
  const routePolicy = DETECTOR_ROUTE_POLICIES[window.routeTag];
  const sharedDetails = {
    engine: "whatlang-wasm",
    routeTag: window.routeTag,
    windowIndex,
    startIndex: window.startIndex,
    endIndex: window.endIndex,
    mode: evidence.mode,
    section: evidence.section,
    textLength: window.text.length,
    normalizedLength: normalizedSample.length,
    normalizedApplied: normalizedSample !== window.text,
    scriptChars: countScriptBearingCharsForRoute(window.text, window.routeTag),
    minScriptChars: routePolicy.minScriptChars,
    eligible,
    qualityGate,
    raw: buildEvidenceSample(rawResult, rawRemappedTag),
    normalized: buildEvidenceSample(normalizedResult, normalizedRemappedTag),
    decision
  };
  const isVerbose = evidence.verbosity === "verbose";
  let payload;
  if (isVerbose) {
    payload = {
      ...sharedDetails,
      text: window.text,
      normalizedText: normalizedSample
    };
  } else {
    const textPreview = createDetectorEvidencePreview(window.text);
    const normalizedPreview = createDetectorEvidencePreview(normalizedSample);
    payload = {
      ...sharedDetails,
      textPreview: textPreview.preview,
      textPreviewTruncated: textPreview.truncated,
      normalizedPreview: normalizedPreview.preview,
      normalizedPreviewTruncated: normalizedPreview.truncated
    };
  }
  debug.emit("detector.window.evidence", payload, { verbosity: isVerbose ? "verbose" : "compact" });
}
313
526
  function buildDetectorWindows(chunks) {
314
527
  const windows = [];
315
528
  for (let index = 0; index < chunks.length; index += 1) {
@@ -330,31 +543,231 @@ function buildDetectorWindows(chunks) {
330
543
  }
331
544
  return windows;
332
545
  }
333
/**
 * Resolves the locale tag for one detector window, recording debug
 * counters and events along the way.
 *
 * Flow:
 *  1. Windows below the route's minimum script-character count fall back
 *     immediately ("notEligible") and return the route tag.
 *  2. The detector runs on the raw text and, when normalization changed
 *     the text, on the normalized sample as well.
 *  3. With no usable candidate the window falls back ("noCandidate").
 *  4. The strongest candidate is accepted when the Latin quality gate
 *     passes and the route policy accepts it ("reliable").
 *  5. For "und-Latn", matching raw/normalized tags with a combined
 *     confidence of at least 0.7 are accepted when either result is
 *     reliable ("corroborated"); otherwise they fall back
 *     ("corroborationUnreliable").
 *  6. Everything else falls back as "belowThreshold", or "qualityGate"
 *     when the Latin quality gate failed.
 *
 * @param {{routeTag: string, startIndex: number, endIndex: number, text: string}} window Detector window.
 * @param {number} windowIndex Position of the window in the window list (used for evidence output).
 * @param {object} options Caller options, forwarded to the fallback debug outcome.
 * @param {object} [debug] Optional debug sink with `summary`, `emit` and `evidence` members.
 * @returns {Promise<string>} The accepted detector tag, or the fallback tag for the route.
 */
async function resolveWindowLocale(window, windowIndex, options, debug) {
  const { routeTag } = window;
  recordDetectorWindow(debug?.summary, routeTag);
  debug?.emit?.("detector.window.start", {
    routeTag,
    startIndex: window.startIndex,
    endIndex: window.endIndex,
    textLength: window.text.length
  }, { verbosity: "verbose" });
  const routePolicy = DETECTOR_ROUTE_POLICIES[routeTag];
  const eligible = countScriptBearingCharsForRoute(window.text, routeTag) >= routePolicy.minScriptChars;
  const normalizedSample = normalizeDetectorSampleForRoute(window.text, routeTag);
  const passesLatinQualityGate = routeTag !== "und-Latn" || shouldAcceptLatinDetectorWindow(window.text, normalizedSample);

  // Emits the per-window evidence record with the fields shared by every decision.
  const emitEvidence = (samples, decision) => emitDetectorWindowEvidence({
    window,
    windowIndex,
    normalizedSample,
    eligible,
    qualityGate: passesLatinQualityGate,
    rawResult: samples.rawResult,
    rawRemappedTag: samples.rawRemappedTag,
    normalizedResult: samples.normalizedResult,
    normalizedRemappedTag: samples.normalizedRemappedTag,
    decision,
    debug
  });

  // Records a fallback: summary counter, evidence record, then the fallback event.
  const reportFallback = (reason, samples) => {
    recordDetectorFallback(debug?.summary, reason);
    // The post-fallback outcome re-segments the window and is only consumed by
    // emitted debug events, so skip it when nothing can be emitted.
    if (!debug?.emit) return;
    const outcome = resolveFallbackDebugOutcome(window, options);
    const finalLocales = outcome.finalLocales ? { finalLocales: outcome.finalLocales } : {};
    emitEvidence(samples, {
      accepted: false,
      path: null,
      finalTag: outcome.finalTag,
      ...finalLocales,
      fallbackReason: reason
    });
    debug.emit("detector.window.fallback", {
      routeTag,
      finalTag: outcome.finalTag,
      ...finalLocales,
      reason
    });
  };

  // Records an acceptance: summary counter, evidence record, then the accepted event.
  const reportAccepted = (path, finalTag, samples, confidence, reliable) => {
    recordDetectorAccepted(debug?.summary, path);
    emitEvidence(samples, {
      accepted: true,
      path,
      finalTag,
      fallbackReason: null
    });
    debug?.emit?.("detector.window.accepted", {
      routeTag,
      finalTag,
      acceptancePath: path,
      confidence,
      reliable
    });
  };

  if (!eligible) {
    reportFallback("notEligible", {
      rawResult: null,
      rawRemappedTag: null,
      normalizedResult: null,
      normalizedRemappedTag: null
    });
    // The detector never ran, so the window keeps its route tag.
    return routeTag;
  }

  const rawResult = await detectWithWhatlangWasm(window.text, routeTag);
  const rawRemapped = rawResult ? remapWhatlangResult(rawResult, routeTag) : null;
  // Only run the detector a second time when normalization changed the text.
  const normalizedApplied = normalizedSample.length > 0 && normalizedSample !== window.text;
  const normalizedResult = normalizedApplied ? await detectWithWhatlangWasm(normalizedSample, routeTag) : null;
  debug?.emit?.("detector.window.sample", {
    routeTag,
    normalizedApplied,
    normalizedLength: normalizedSample.length,
    qualityGate: passesLatinQualityGate,
    rawTag: rawRemapped?.tag ?? null,
    rawConfidence: rawRemapped?.confidence ?? null,
    rawReliable: rawRemapped?.reliable ?? null
  }, { verbosity: "verbose" });
  const normalizedRemapped = normalizedResult ? remapWhatlangResult(normalizedResult, routeTag) : null;
  debug?.emit?.("detector.window.candidates", {
    routeTag,
    normalizedTag: normalizedRemapped?.tag ?? null,
    normalizedConfidence: normalizedRemapped?.confidence ?? null,
    normalizedReliable: normalizedRemapped?.reliable ?? null
  }, { verbosity: "verbose" });

  const samples = {
    rawResult,
    rawRemappedTag: rawRemapped?.tag ?? null,
    normalizedResult,
    normalizedRemappedTag: normalizedRemapped?.tag ?? null
  };
  const candidates = [rawRemapped, normalizedRemapped].filter((value) => value !== null);
  if (candidates.length === 0) {
    reportFallback("noCandidate", samples);
    return getDetectorFallbackTag(routeTag);
  }

  const strongestCandidate = candidates.reduce((best, current) => {
    if (!best) return current;
    return (current.confidence ?? 0) > (best.confidence ?? 0) ? current : best;
  }, candidates[0]);
  if (strongestCandidate && passesLatinQualityGate && shouldAcceptDetectorTag(routeTag, strongestCandidate.confidence, strongestCandidate.reliable)) {
    reportAccepted("reliable", strongestCandidate.tag, samples, strongestCandidate.confidence ?? null, strongestCandidate.reliable ?? null);
    return strongestCandidate.tag;
  }

  // Latin-only second chance: raw and normalized runs agree on the tag.
  if (routeTag === "und-Latn" && passesLatinQualityGate && rawRemapped && normalizedRemapped && rawRemapped.tag === normalizedRemapped.tag) {
    const corroboratedConfidence = Math.max(rawRemapped.confidence ?? 0, normalizedRemapped.confidence ?? 0);
    const hasReliableCorroboration = rawRemapped.reliable === true || normalizedRemapped.reliable === true;
    if (corroboratedConfidence >= .7) {
      if (hasReliableCorroboration) {
        reportAccepted("corroborated", rawRemapped.tag, samples, corroboratedConfidence, hasReliableCorroboration);
        return rawRemapped.tag;
      }
      reportFallback("corroborationUnreliable", samples);
      return getDetectorFallbackTag(routeTag);
    }
  }

  const fallbackReason = passesLatinQualityGate ? "belowThreshold" : "qualityGate";
  reportFallback(fallbackReason, samples);
  return getDetectorFallbackTag(routeTag);
}
352
764
  async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
353
- const chunks = require_markdown.segmentTextByLocale(text, options);
765
+ require_markdown.resolveLocaleDetectContext(options);
766
+ const chunks = require_markdown.segmentTextByLocale(text, createDeferredLatinPreSegmentOptions(options));
354
767
  const resolved = [...chunks];
355
768
  const windows = buildDetectorWindows(chunks);
356
- for (const window of windows) {
357
- const resolvedLocale = await resolveWindowLocale(window);
769
+ for (const [windowIndex, window] of windows.entries()) {
770
+ const resolvedLocale = await resolveWindowLocale(window, windowIndex, options, options.detectorDebug);
358
771
  for (let index = window.startIndex; index <= window.endIndex; index += 1) {
359
772
  const chunk = resolved[index];
360
773
  if (!chunk) continue;
@@ -364,7 +777,8 @@ async function segmentTextByLocaleWithWasmDetector(text, options = {}) {
364
777
  };
365
778
  }
366
779
  }
367
- return resolved;
780
+ options.detectorDebug?.emit?.("detector.summary", options.detectorDebug.summary, { verbosity: "compact" });
781
+ return reapplyDeferredLatinFallback(reapplyResolvedLatinHintRules(resolved, chunks, options), options);
368
782
  }
369
783
  async function wordCounterWithWasmDetector(text, options = {}) {
370
784
  return buildWordCounterResultFromChunks(await segmentTextByLocaleWithWasmDetector(text, options), options);