@dev-pi2pie/word-counter 0.1.3-canary.0 → 0.1.3-canary.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -69,6 +69,24 @@ word-counter --latin-language en "Hello world"
69
69
  word-counter --latin-tag en "Hello world"
70
70
  ```
71
71
 
72
+ Add custom Latin hint rules (repeatable) or load from JSON:
73
+
74
+ ```bash
75
+ word-counter --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć gęślą jaźń"
76
+ word-counter --latin-hint 'tr=[çğıöşüÇĞİÖŞÜ]' --latin-hint 'ro=[ăâîșțĂÂÎȘȚ]' "șță"
77
+ word-counter --latin-hints-file ./examples/latin-hints.json "Zażółć Știință Iğdır"
78
+ word-counter --no-default-latin-hints --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć"
79
+ ```
80
+
81
+ `examples/latin-hints.json` format:
82
+
83
+ ```json
84
+ [
85
+ { "tag": "pl", "pattern": "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" },
86
+ { "tag": "tr", "pattern": "[çğıöşüÇĞİÖŞÜ]", "priority": 1 }
87
+ ]
88
+ ```
89
+
72
90
  Hint a language tag for Han fallback:
73
91
 
74
92
  ```bash
@@ -245,6 +263,10 @@ import wordCounter, {
245
263
 
246
264
  wordCounter("Hello world", { latinLanguageHint: "en" });
247
265
  wordCounter("Hello world", { latinTagHint: "en" });
266
+ wordCounter("Zażółć gęślą jaźń", {
267
+ latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
268
+ });
269
+ wordCounter("Über", { useDefaultLatinHints: false });
248
270
  wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
249
271
  wordCounter("Hi 👋, world!", { nonWords: true });
250
272
  wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
@@ -295,6 +317,10 @@ const {
295
317
 
296
318
  wordCounter("Hello world", { latinLanguageHint: "en" });
297
319
  wordCounter("Hello world", { latinTagHint: "en" });
320
+ wordCounter("Zażółć gęślą jaźń", {
321
+ latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
322
+ });
323
+ wordCounter("Über", { useDefaultLatinHints: false });
298
324
  wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
299
325
  wordCounter("Hi 👋, world!", { nonWords: true });
300
326
  wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
@@ -568,15 +594,22 @@ Example JSON (trimmed):
568
594
  - Detection is regex/script based (Unicode script checks), not a statistical language-ID model.
569
595
  - Ambiguous Latin text uses `und-Latn` unless a Latin hint is provided.
570
596
  - Han-script fallback uses `und-Hani` by default because regex script checks cannot natively distinguish `zh-Hans` vs `zh-Hant`.
571
- - Current built-in Latin diacritic heuristics are intentionally limited:
597
+ - Current built-in Latin diacritic heuristics include:
572
598
  - `de`: `äöüÄÖÜß`
573
599
  - `es`: `ñÑ¿¡`
574
600
  - `pt`: `ãõÃÕ`
575
601
  - `fr`: `œŒæÆ`
602
+ - `pl`: `ąćęłńśźżĄĆĘŁŃŚŹŻ`
603
+ - `tr`: `ıİğĞşŞ`
604
+ - `ro`: `ăĂâÂîÎșȘțȚ`
605
+ - `hu`: `őŐűŰ`
606
+ - `is`: `ðÐþÞ`
576
607
  - Latin text with other European diacritics may still remain in `und-Latn` unless a hint is provided.
577
608
  - Use `--mode chunk`/`--mode segments` or `--format json` to see the exact locale tag assigned to each chunk.
578
609
  - Regex/script-only detection cannot reliably identify English vs. other Latin-script languages; 100% certainty requires explicit metadata (document language tags, user-provided locale, headers) or a language-ID model.
579
610
  - Use `--latin-language <tag>` or `--latin-tag <tag>` for ambiguous Latin text.
611
+ - Use `--latin-hint <tag>=<pattern>` (repeatable) and `--latin-hints-file <path>` to add custom Latin rules.
612
+ - Use `--no-default-latin-hints` to disable built-in Latin diacritic rules.
580
613
  - Use `--han-language <tag>` or `--han-tag <tag>` for Han-script fallback.
581
614
  - `--latin-locale` remains supported as a legacy alias for now and is planned for future deprecation.
582
615
 
@@ -322,10 +322,53 @@ function resolveMode(input, fallback = "chunk") {
322
322
  return normalizeMode(input) ?? fallback;
323
323
  }
324
324
 
325
+ //#endregion
326
+ //#region src/wc/latin-hints.ts
327
+ const DEFAULT_LATIN_HINT_RULES_SOURCE = [
328
+ {
329
+ tag: "de",
330
+ pattern: "[äöüÄÖÜß]"
331
+ },
332
+ {
333
+ tag: "es",
334
+ pattern: "[ñÑ¿¡]"
335
+ },
336
+ {
337
+ tag: "pt",
338
+ pattern: "[ãõÃÕ]"
339
+ },
340
+ {
341
+ tag: "fr",
342
+ pattern: "[œŒæÆ]"
343
+ },
344
+ {
345
+ tag: "pl",
346
+ pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]"
347
+ },
348
+ {
349
+ tag: "tr",
350
+ pattern: "[ıİğĞşŞ]"
351
+ },
352
+ {
353
+ tag: "ro",
354
+ pattern: "[ăĂâÂîÎșȘțȚ]"
355
+ },
356
+ {
357
+ tag: "hu",
358
+ pattern: "[őŐűŰ]"
359
+ },
360
+ {
361
+ tag: "is",
362
+ pattern: "[ðÐþÞ]"
363
+ }
364
+ ];
365
+ const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
366
+
325
367
  //#endregion
326
368
  //#region src/wc/locale-detect.ts
327
369
  const DEFAULT_LOCALE = "und-Latn";
328
370
  const DEFAULT_HAN_TAG = "und-Hani";
371
+ const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
329
372
  const regex = {
330
373
  hiragana: /\p{Script=Hiragana}/u,
331
374
  katakana: /\p{Script=Katakana}/u,
@@ -337,31 +380,10 @@ const regex = {
337
380
  devanagari: /\p{Script=Devanagari}/u,
338
381
  thai: /\p{Script=Thai}/u
339
382
  };
340
- const latinLocaleHints = [
341
- {
342
- locale: "de",
343
- regex: /[äöüÄÖÜß]/
344
- },
345
- {
346
- locale: "es",
347
- regex: /[ñÑ¿¡]/
348
- },
349
- {
350
- locale: "pt",
351
- regex: /[ãõÃÕ]/
352
- },
353
- {
354
- locale: "fr",
355
- regex: /[œŒæÆ]/
356
- }
357
- ];
358
- const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
359
- function isLatinLocale(locale) {
360
- return latinLocales.has(locale);
361
- }
362
- function detectLatinLocale(char) {
363
- for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
364
- return DEFAULT_LOCALE;
383
+ const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
384
+ function isLatinLocale(locale, context) {
385
+ if (context) return context.latinLocales.has(locale);
386
+ return defaultLatinLocales.has(locale);
365
387
  }
366
388
  function resolveLatinHint(options) {
367
389
  const latinTagHint = options.latinTagHint?.trim();
@@ -377,7 +399,82 @@ function resolveHanHint(options) {
377
399
  const hanLanguageHint = options.hanLanguageHint?.trim();
378
400
  if (hanLanguageHint) return hanLanguageHint;
379
401
  }
380
- function detectLocaleForChar(char, previousLocale, options = {}) {
402
+ function compileLatinHintPattern(pattern, label) {
403
+ const source = typeof pattern === "string" ? pattern : pattern.source;
404
+ const hasUnicodeMode = typeof pattern !== "string" && (pattern.flags.includes("u") || pattern.flags.includes("v"));
405
+ const flags = typeof pattern === "string" ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;
406
+ if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
407
+ if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);
408
+ try {
409
+ return new RegExp(source, flags);
410
+ } catch (error) {
411
+ const message = error instanceof Error ? error.message : String(error);
412
+ throw new Error(`${label}: invalid Unicode regex pattern (${message}).`);
413
+ }
414
+ }
415
+ function normalizeLatinHintPriority(priority, label) {
416
+ if (priority === void 0) return 0;
417
+ if (typeof priority !== "number" || !Number.isFinite(priority)) throw new Error(`${label}: priority must be a finite number when provided.`);
418
+ return priority;
419
+ }
420
+ function compileLatinHintRule(rule, order, label) {
421
+ const tag = typeof rule.tag === "string" ? rule.tag.trim() : "";
422
+ if (!tag) throw new Error(`${label}: tag must be a non-empty string.`);
423
+ return {
424
+ tag,
425
+ pattern: compileLatinHintPattern(rule.pattern, label),
426
+ priority: normalizeLatinHintPriority(rule.priority, label),
427
+ order
428
+ };
429
+ }
430
+ function resolveLatinHintRules(options) {
431
+ const useDefaultLatinHints = options.useDefaultLatinHints !== false;
432
+ const customRules = options.latinHintRules ?? [];
433
+ const combinedRules = [];
434
+ for (let index = 0; index < customRules.length; index += 1) {
435
+ const rule = customRules[index];
436
+ if (!rule) continue;
437
+ combinedRules.push({
438
+ rule,
439
+ label: `Invalid custom Latin hint rule at index ${index}`
440
+ });
441
+ }
442
+ if (useDefaultLatinHints) for (let index = 0; index < DEFAULT_LATIN_HINT_RULES.length; index += 1) {
443
+ const rule = DEFAULT_LATIN_HINT_RULES[index];
444
+ if (!rule) continue;
445
+ combinedRules.push({
446
+ rule,
447
+ label: `Invalid default Latin hint rule at index ${index}`
448
+ });
449
+ }
450
+ const resolvedRules = combinedRules.map((entry, index) => compileLatinHintRule(entry.rule, index, entry.label));
451
+ resolvedRules.sort((left, right) => {
452
+ if (left.priority !== right.priority) return right.priority - left.priority;
453
+ return left.order - right.order;
454
+ });
455
+ return resolvedRules;
456
+ }
457
+ function resolveLocaleDetectContext(options = {}) {
458
+ const latinHint = resolveLatinHint(options);
459
+ const latinHintRules = resolveLatinHintRules(options);
460
+ const latinLocales = new Set([DEFAULT_LOCALE]);
461
+ for (const rule of latinHintRules) latinLocales.add(rule.tag);
462
+ if (latinHint) latinLocales.add(latinHint);
463
+ return {
464
+ latinHint,
465
+ hanHint: resolveHanHint(options),
466
+ latinHintRules,
467
+ latinLocales
468
+ };
469
+ }
470
+ function detectLatinLocale(char, context) {
471
+ for (const hint of context.latinHintRules) {
472
+ hint.pattern.lastIndex = 0;
473
+ if (hint.pattern.test(char)) return hint.tag;
474
+ }
475
+ return DEFAULT_LOCALE;
476
+ }
477
+ function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options), allowLatinLocaleCarry = true, allowJapaneseHanCarry = true) {
381
478
  if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
382
479
  if (regex.hangul.test(char)) return "ko";
383
480
  if (regex.arabic.test(char)) return "ar";
@@ -385,15 +482,14 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
385
482
  if (regex.devanagari.test(char)) return "hi";
386
483
  if (regex.thai.test(char)) return "th";
387
484
  if (regex.han.test(char)) {
388
- if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
389
- return resolveHanHint(options) ?? DEFAULT_HAN_TAG;
485
+ if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
486
+ return context.hanHint ?? DEFAULT_HAN_TAG;
390
487
  }
391
488
  if (regex.latin.test(char)) {
392
- const hintedLocale = detectLatinLocale(char);
489
+ const hintedLocale = detectLatinLocale(char, context);
393
490
  if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
394
- if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
395
- const latinHint = resolveLatinHint(options);
396
- if (latinHint) return latinHint;
491
+ if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
492
+ if (context.latinHint) return context.latinHint;
397
493
  return DEFAULT_LOCALE;
398
494
  }
399
495
  return null;
@@ -401,31 +497,59 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
401
497
 
402
498
  //#endregion
403
499
  //#region src/wc/segment.ts
500
+ const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
501
+ const LATIN_PROMOTION_BREAK_REGEX = /[\s,.!?;:,、。!?;:.。、]/u;
404
502
  function segmentTextByLocale(text, options = {}) {
503
+ const context = resolveLocaleDetectContext(options);
405
504
  const chunks = [];
406
505
  let currentLocale = DEFAULT_LOCALE;
407
506
  let buffer = "";
408
507
  let bufferHasScript = false;
508
+ let sawCarryBoundary = false;
509
+ const updateCarryBoundaryState = (detected, char) => {
510
+ if (detected !== null) {
511
+ sawCarryBoundary = false;
512
+ return;
513
+ }
514
+ if (HARD_BOUNDARY_REGEX.test(char)) sawCarryBoundary = true;
515
+ };
409
516
  for (const char of text) {
410
- const detected = detectLocaleForChar(char, currentLocale, options);
517
+ const detected = detectLocaleForChar(char, currentLocale, options, context, !sawCarryBoundary, !sawCarryBoundary);
411
518
  const targetLocale = detected ?? currentLocale;
412
519
  if (buffer === "") {
413
520
  currentLocale = targetLocale;
414
521
  buffer = char;
415
522
  bufferHasScript = detected !== null;
523
+ updateCarryBoundaryState(detected, char);
416
524
  continue;
417
525
  }
418
526
  if (detected !== null && !bufferHasScript) {
419
527
  currentLocale = targetLocale;
420
528
  buffer += char;
421
529
  bufferHasScript = true;
530
+ updateCarryBoundaryState(detected, char);
422
531
  continue;
423
532
  }
424
533
  if (targetLocale !== currentLocale && detected !== null) {
425
- if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
534
+ if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
535
+ const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
536
+ if (promotionBreakIndex === -1) {
537
+ currentLocale = targetLocale;
538
+ buffer += char;
539
+ bufferHasScript = true;
540
+ updateCarryBoundaryState(detected, char);
541
+ continue;
542
+ }
543
+ const prefix = buffer.slice(0, promotionBreakIndex + 1);
544
+ const suffix = buffer.slice(promotionBreakIndex + 1);
545
+ if (prefix.length > 0) chunks.push({
546
+ locale: currentLocale,
547
+ text: prefix
548
+ });
426
549
  currentLocale = targetLocale;
427
- buffer += char;
550
+ buffer = `${suffix}${char}`;
428
551
  bufferHasScript = true;
552
+ updateCarryBoundaryState(detected, char);
429
553
  continue;
430
554
  }
431
555
  chunks.push({
@@ -435,10 +559,12 @@ function segmentTextByLocale(text, options = {}) {
435
559
  currentLocale = targetLocale;
436
560
  buffer = char;
437
561
  bufferHasScript = true;
562
+ updateCarryBoundaryState(detected, char);
438
563
  continue;
439
564
  }
440
565
  buffer += char;
441
566
  if (detected !== null) bufferHasScript = true;
567
+ updateCarryBoundaryState(detected, char);
442
568
  }
443
569
  if (buffer.length > 0) chunks.push({
444
570
  locale: currentLocale,
@@ -446,6 +572,14 @@ function segmentTextByLocale(text, options = {}) {
446
572
  });
447
573
  return mergeAdjacentChunks(chunks);
448
574
  }
575
+ function findLastLatinPromotionBreakIndex(buffer) {
576
+ for (let index = buffer.length - 1; index >= 0; index -= 1) {
577
+ const char = buffer[index];
578
+ if (!char) continue;
579
+ if (LATIN_PROMOTION_BREAK_REGEX.test(char)) return index;
580
+ }
581
+ return -1;
582
+ }
449
583
  function mergeAdjacentChunks(chunks) {
450
584
  if (chunks.length === 0) return chunks;
451
585
  const merged = [];
@@ -475,6 +609,8 @@ function wordCounter(text, options = {}) {
475
609
  latinLanguageHint: options.latinLanguageHint,
476
610
  latinTagHint: options.latinTagHint,
477
611
  latinLocaleHint: options.latinLocaleHint,
612
+ latinHintRules: options.latinHintRules,
613
+ useDefaultLatinHints: options.useDefaultLatinHints,
478
614
  hanLanguageHint: options.hanLanguageHint,
479
615
  hanTagHint: options.hanTagHint
480
616
  });