@dev-pi2pie/word-counter 0.1.3-canary.0 → 0.1.3-canary.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +34 -1
- package/dist/cjs/index.cjs +171 -35
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/esm/bin.mjs +232 -40
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/index.d.mts +22 -10
- package/dist/esm/index.mjs +172 -36
- package/dist/esm/index.mjs.map +1 -1
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -69,6 +69,24 @@ word-counter --latin-language en "Hello world"
|
|
|
69
69
|
word-counter --latin-tag en "Hello world"
|
|
70
70
|
```
|
|
71
71
|
|
|
72
|
+
Add custom Latin hint rules (repeatable) or load from JSON:
|
|
73
|
+
|
|
74
|
+
```bash
|
|
75
|
+
word-counter --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć gęślą jaźń"
|
|
76
|
+
word-counter --latin-hint 'tr=[çğıöşüÇĞİÖŞÜ]' --latin-hint 'ro=[ăâîșțĂÂÎȘȚ]' "șță"
|
|
77
|
+
word-counter --latin-hints-file ./examples/latin-hints.json "Zażółć Știință Iğdır"
|
|
78
|
+
word-counter --no-default-latin-hints --latin-hint 'pl=[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]' "Zażółć"
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
`examples/latin-hints.json` format:
|
|
82
|
+
|
|
83
|
+
```json
|
|
84
|
+
[
|
|
85
|
+
{ "tag": "pl", "pattern": "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" },
|
|
86
|
+
{ "tag": "tr", "pattern": "[çğıöşüÇĞİÖŞÜ]", "priority": 1 }
|
|
87
|
+
]
|
|
88
|
+
```
|
|
89
|
+
|
|
72
90
|
Hint a language tag for Han fallback:
|
|
73
91
|
|
|
74
92
|
```bash
|
|
@@ -245,6 +263,10 @@ import wordCounter, {
|
|
|
245
263
|
|
|
246
264
|
wordCounter("Hello world", { latinLanguageHint: "en" });
|
|
247
265
|
wordCounter("Hello world", { latinTagHint: "en" });
|
|
266
|
+
wordCounter("Zażółć gęślą jaźń", {
|
|
267
|
+
latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
|
|
268
|
+
});
|
|
269
|
+
wordCounter("Über", { useDefaultLatinHints: false });
|
|
248
270
|
wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
|
|
249
271
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
250
272
|
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
@@ -295,6 +317,10 @@ const {
|
|
|
295
317
|
|
|
296
318
|
wordCounter("Hello world", { latinLanguageHint: "en" });
|
|
297
319
|
wordCounter("Hello world", { latinTagHint: "en" });
|
|
320
|
+
wordCounter("Zażółć gęślą jaźń", {
|
|
321
|
+
latinHintRules: [{ tag: "pl", pattern: "[ąćęłńóśźżĄĆĘŁŃÓŚŹŻ]" }],
|
|
322
|
+
});
|
|
323
|
+
wordCounter("Über", { useDefaultLatinHints: false });
|
|
298
324
|
wordCounter("漢字測試", { hanTagHint: "zh-Hant" });
|
|
299
325
|
wordCounter("Hi 👋, world!", { nonWords: true });
|
|
300
326
|
wordCounter("Hi 👋, world!", { mode: "char", nonWords: true });
|
|
@@ -568,15 +594,22 @@ Example JSON (trimmed):
|
|
|
568
594
|
- Detection is regex/script based (Unicode script checks), not a statistical language-ID model.
|
|
569
595
|
- Ambiguous Latin text uses `und-Latn` unless a Latin hint is provided.
|
|
570
596
|
- Han-script fallback uses `und-Hani` by default because regex script checks cannot natively distinguish `zh-Hans` vs `zh-Hant`.
|
|
571
|
-
- Current built-in Latin diacritic heuristics
|
|
597
|
+
- Current built-in Latin diacritic heuristics include:
|
|
572
598
|
- `de`: `äöüÄÖÜß`
|
|
573
599
|
- `es`: `ñÑ¿¡`
|
|
574
600
|
- `pt`: `ãõÃÕ`
|
|
575
601
|
- `fr`: `œŒæÆ`
|
|
602
|
+
- `pl`: `ąćęłńśźżĄĆĘŁŃŚŹŻ`
|
|
603
|
+
- `tr`: `ıİğĞşŞ`
|
|
604
|
+
- `ro`: `ăĂâÂîÎșȘțȚ`
|
|
605
|
+
- `hu`: `őŐűŰ`
|
|
606
|
+
- `is`: `ðÐþÞ`
|
|
576
607
|
- Latin text with other European diacritics may still remain in `und-Latn` unless a hint is provided.
|
|
577
608
|
- Use `--mode chunk`/`--mode segments` or `--format json` to see the exact locale tag assigned to each chunk.
|
|
578
609
|
- Regex/script-only detection cannot reliably identify English vs. other Latin-script languages; 100% certainty requires explicit metadata (document language tags, user-provided locale, headers) or a language-ID model.
|
|
579
610
|
- Use `--latin-language <tag>` or `--latin-tag <tag>` for ambiguous Latin text.
|
|
611
|
+
- Use `--latin-hint <tag>=<pattern>` (repeatable) and `--latin-hints-file <path>` to add custom Latin rules.
|
|
612
|
+
- Use `--no-default-latin-hints` to disable built-in Latin diacritic rules.
|
|
580
613
|
- Use `--han-language <tag>` or `--han-tag <tag>` for Han-script fallback.
|
|
581
614
|
- `--latin-locale` remains supported as a legacy alias for now and is planned for future deprecation.
|
|
582
615
|
|
package/dist/cjs/index.cjs
CHANGED
|
@@ -322,10 +322,53 @@ function resolveMode(input, fallback = "chunk") {
|
|
|
322
322
|
return normalizeMode(input) ?? fallback;
|
|
323
323
|
}
|
|
324
324
|
|
|
325
|
+
//#endregion
|
|
326
|
+
//#region src/wc/latin-hints.ts
|
|
327
|
+
/**
 * Built-in Latin diacritic hints. Each entry pairs a language tag with the
 * source of a character-class pattern whose characters suggest that language.
 * Array order is preserved into DEFAULT_LATIN_HINT_RULES.
 */
const DEFAULT_LATIN_HINT_RULES_SOURCE = [
  { tag: "de", pattern: "[äöüÄÖÜß]" },
  { tag: "es", pattern: "[ñÑ¿¡]" },
  { tag: "pt", pattern: "[ãõÃÕ]" },
  { tag: "fr", pattern: "[œŒæÆ]" },
  { tag: "pl", pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]" },
  { tag: "tr", pattern: "[ıİğĞşŞ]" },
  { tag: "ro", pattern: "[ăĂâÂîÎșȘțȚ]" },
  { tag: "hu", pattern: "[őŐűŰ]" },
  { tag: "is", pattern: "[ðÐþÞ]" }
];

/**
 * Frozen copy of the built-in rules: the outer array and every rule object
 * are frozen, and each rule is a fresh object rather than a reference into
 * DEFAULT_LATIN_HINT_RULES_SOURCE.
 */
const DEFAULT_LATIN_HINT_RULES = Object.freeze(
  DEFAULT_LATIN_HINT_RULES_SOURCE.map(({ tag, pattern }) => Object.freeze({ tag, pattern }))
);
|
|
366
|
+
|
|
325
367
|
//#endregion
|
|
326
368
|
//#region src/wc/locale-detect.ts
|
|
327
369
|
const DEFAULT_LOCALE = "und-Latn";
|
|
328
370
|
const DEFAULT_HAN_TAG = "und-Hani";
|
|
371
|
+
// Upper bound on the source length of a Latin hint pattern; compileLatinHintPattern rejects anything longer.
const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
|
|
329
372
|
const regex = {
|
|
330
373
|
hiragana: /\p{Script=Hiragana}/u,
|
|
331
374
|
katakana: /\p{Script=Katakana}/u,
|
|
@@ -337,31 +380,10 @@ const regex = {
|
|
|
337
380
|
devanagari: /\p{Script=Devanagari}/u,
|
|
338
381
|
thai: /\p{Script=Thai}/u
|
|
339
382
|
};
|
|
340
|
-
const
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
},
|
|
345
|
-
{
|
|
346
|
-
locale: "es",
|
|
347
|
-
regex: /[ñÑ¿¡]/
|
|
348
|
-
},
|
|
349
|
-
{
|
|
350
|
-
locale: "pt",
|
|
351
|
-
regex: /[ãõÃÕ]/
|
|
352
|
-
},
|
|
353
|
-
{
|
|
354
|
-
locale: "fr",
|
|
355
|
-
regex: /[œŒæÆ]/
|
|
356
|
-
}
|
|
357
|
-
];
|
|
358
|
-
const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
|
|
359
|
-
function isLatinLocale(locale) {
|
|
360
|
-
return latinLocales.has(locale);
|
|
361
|
-
}
|
|
362
|
-
function detectLatinLocale(char) {
|
|
363
|
-
for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
|
|
364
|
-
return DEFAULT_LOCALE;
|
|
383
|
+
// Latin tags recognised when isLatinLocale is called without a detection context:
// the DEFAULT_LOCALE fallback plus the tag of every built-in hint rule.
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
384
|
+
/**
 * Reports whether a locale tag counts as a Latin-script tag.
 *
 * @param {string} locale - Tag to look up.
 * @param {{ latinLocales: Set<string> }} [context] - Detection context; when
 *   given, its latinLocales set is consulted instead of the built-in set.
 * @returns {boolean} True when the tag is in the applicable set.
 */
function isLatinLocale(locale, context) {
  const knownLocales = context ? context.latinLocales : defaultLatinLocales;
  return knownLocales.has(locale);
}
|
|
366
388
|
function resolveLatinHint(options) {
|
|
367
389
|
const latinTagHint = options.latinTagHint?.trim();
|
|
@@ -377,7 +399,82 @@ function resolveHanHint(options) {
|
|
|
377
399
|
const hanLanguageHint = options.hanLanguageHint?.trim();
|
|
378
400
|
if (hanLanguageHint) return hanLanguageHint;
|
|
379
401
|
}
|
|
380
|
-
function
|
|
402
|
+
/**
 * Compiles a user- or default-supplied Latin hint pattern into a Unicode-aware RegExp.
 *
 * Strings are compiled with the `u` flag. RegExp inputs keep their flags and
 * gain `u` unless they already carry `u` or `v`.
 *
 * @param {string | RegExp} pattern - Pattern source or an existing RegExp.
 * @param {string} label - Prefix for error messages, identifying the offending rule.
 * @returns {RegExp} The compiled pattern.
 * @throws {Error} When the pattern is not a string/RegExp, is empty, exceeds
 *   MAX_LATIN_HINT_PATTERN_LENGTH, or is not a valid Unicode regex.
 */
function compileLatinHintPattern(pattern, label) {
  const isString = typeof pattern === "string";
  // Duck-typed check so RegExp objects from another realm are still accepted.
  const isRegExpLike =
    !isString &&
    pattern !== null &&
    typeof pattern === "object" &&
    typeof pattern.source === "string" &&
    typeof pattern.flags === "string";
  // Without this guard a missing or mistyped pattern surfaced as an unlabeled
  // TypeError from `pattern.source`, hiding which rule was at fault.
  if (!isString && !isRegExpLike) throw new Error(`${label}: pattern must be a string or RegExp.`);

  const source = isString ? pattern : pattern.source;
  const hasUnicodeMode = !isString && (pattern.flags.includes("u") || pattern.flags.includes("v"));
  const flags = isString ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;

  if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
  if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);

  try {
    return new RegExp(source, flags);
  } catch (error) {
    const message = error instanceof Error ? error.message : String(error);
    // Keep the engine's original error reachable for debugging.
    throw new Error(`${label}: invalid Unicode regex pattern (${message}).`, { cause: error });
  }
}
|
|
415
|
+
/**
 * Validates the optional priority of a Latin hint rule.
 *
 * @param {number | undefined} priority - Priority as supplied on the rule.
 * @param {string} label - Prefix for the error message.
 * @returns {number} 0 when the priority is undefined, otherwise the priority itself.
 * @throws {Error} When a priority is given but is not a finite number.
 */
function normalizeLatinHintPriority(priority, label) {
  if (priority === undefined) {
    return 0;
  }
  const isFiniteNumber = typeof priority === "number" && Number.isFinite(priority);
  if (isFiniteNumber) {
    return priority;
  }
  throw new Error(`${label}: priority must be a finite number when provided.`);
}
|
|
420
|
+
/**
 * Turns a raw Latin hint rule into its compiled form.
 *
 * @param {{ tag: unknown, pattern: unknown, priority?: unknown }} rule - Raw rule.
 * @param {number} order - Position used later as the tie-breaker when sorting.
 * @param {string} label - Prefix for validation error messages.
 * @returns {{ tag: string, pattern: RegExp, priority: number, order: number }}
 *   The trimmed tag, compiled pattern, normalized priority and the given order.
 * @throws {Error} When the tag is not a non-blank string, or when pattern or
 *   priority validation fails.
 */
function compileLatinHintRule(rule, order, label) {
  const rawTag = rule.tag;
  const tag = typeof rawTag === "string" ? rawTag.trim() : "";
  if (tag === "") {
    throw new Error(`${label}: tag must be a non-empty string.`);
  }
  // Pattern is validated before priority, so a rule with both invalid reports the pattern first.
  const pattern = compileLatinHintPattern(rule.pattern, label);
  const priority = normalizeLatinHintPriority(rule.priority, label);
  return { tag, pattern, priority, order };
}
|
|
430
|
+
/**
 * Builds the ordered list of compiled Latin hint rules for a detection run.
 *
 * Custom rules (options.latinHintRules) are queued first, followed by the
 * built-in rules unless options.useDefaultLatinHints is exactly false. Falsy
 * entries are skipped. The result is sorted by descending priority; ties keep
 * queue order, so custom rules precede built-in rules of equal priority.
 *
 * @param {{ latinHintRules?: Array<object>, useDefaultLatinHints?: boolean }} options
 * @returns {Array<object>} Rules as returned by compileLatinHintRule, sorted.
 * @throws {Error} Whatever compileLatinHintRule throws for an invalid rule.
 */
function resolveLatinHintRules(options) {
  const includeDefaults = options.useDefaultLatinHints !== false;
  const pending = [];

  // Labels carry each rule's index within its own source list, not the combined queue.
  const enqueue = (rules, kind) => {
    for (let position = 0; position < rules.length; position += 1) {
      const candidate = rules[position];
      if (candidate) {
        pending.push({
          rule: candidate,
          label: `Invalid ${kind} Latin hint rule at index ${position}`
        });
      }
    }
  };

  enqueue(options.latinHintRules ?? [], "custom");
  if (includeDefaults) {
    enqueue(DEFAULT_LATIN_HINT_RULES, "default");
  }

  const compiled = pending.map(({ rule, label }, position) => compileLatinHintRule(rule, position, label));
  compiled.sort((left, right) =>
    left.priority === right.priority ? left.order - right.order : right.priority - left.priority
  );
  return compiled;
}
|
|
457
|
+
/**
 * Resolves everything locale detection needs from the caller's options once,
 * so per-character detection does not recompute it.
 *
 * @param {object} [options={}] - Word-counter options carrying hint settings.
 * @returns {{ latinHint: (string|undefined), hanHint: (string|undefined),
 *   latinHintRules: Array<object>, latinLocales: Set<string> }}
 *   latinLocales holds DEFAULT_LOCALE, every resolved rule tag, and the Latin
 *   hint when one is set.
 */
function resolveLocaleDetectContext(options = {}) {
  const latinHint = resolveLatinHint(options);
  const latinHintRules = resolveLatinHintRules(options);

  const latinLocales = new Set([DEFAULT_LOCALE, ...latinHintRules.map(({ tag }) => tag)]);
  if (latinHint) {
    latinLocales.add(latinHint);
  }

  const hanHint = resolveHanHint(options);
  return { latinHint, hanHint, latinHintRules, latinLocales };
}
|
|
470
|
+
/**
 * Finds the first hint rule, in context order, whose pattern matches the character.
 *
 * @param {string} char - Character to classify.
 * @param {{ latinHintRules: Iterable<{ tag: string, pattern: RegExp }> }} context
 * @returns {string} The matching rule's tag, or DEFAULT_LOCALE when none match.
 */
function detectLatinLocale(char, context) {
  for (const { pattern, tag } of context.latinHintRules) {
    // Caller-supplied patterns may carry g/y flags; rewind so test() starts at index 0.
    pattern.lastIndex = 0;
    if (pattern.test(char)) {
      return tag;
    }
  }
  return DEFAULT_LOCALE;
}
|
|
477
|
+
function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options), allowLatinLocaleCarry = true, allowJapaneseHanCarry = true) {
|
|
381
478
|
if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
|
|
382
479
|
if (regex.hangul.test(char)) return "ko";
|
|
383
480
|
if (regex.arabic.test(char)) return "ar";
|
|
@@ -385,15 +482,14 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
|
|
|
385
482
|
if (regex.devanagari.test(char)) return "hi";
|
|
386
483
|
if (regex.thai.test(char)) return "th";
|
|
387
484
|
if (regex.han.test(char)) {
|
|
388
|
-
if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
389
|
-
return
|
|
485
|
+
if (allowJapaneseHanCarry && previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
486
|
+
return context.hanHint ?? DEFAULT_HAN_TAG;
|
|
390
487
|
}
|
|
391
488
|
if (regex.latin.test(char)) {
|
|
392
|
-
const hintedLocale = detectLatinLocale(char);
|
|
489
|
+
const hintedLocale = detectLatinLocale(char, context);
|
|
393
490
|
if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
|
|
394
|
-
if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
395
|
-
|
|
396
|
-
if (latinHint) return latinHint;
|
|
491
|
+
if (allowLatinLocaleCarry && previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
492
|
+
if (context.latinHint) return context.latinHint;
|
|
397
493
|
return DEFAULT_LOCALE;
|
|
398
494
|
}
|
|
399
495
|
return null;
|
|
@@ -401,31 +497,59 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
|
|
|
401
497
|
|
|
402
498
|
//#endregion
|
|
403
499
|
//#region src/wc/segment.ts
|
|
500
|
+
// Line breaks and sentence/clause punctuation. segmentTextByLocale sets its carry-boundary
// flag when a character with no detected script matches, which turns off locale carry
// for the following detection.
// NOTE(review): several characters appear twice in this class; the repeats may be
// fullwidth/halfwidth forms flattened by width normalization — confirm against upstream.
const HARD_BOUNDARY_REGEX = /[\r\n,.!?;:,、。!?;:.。、]/u;
|
|
501
|
+
// Whitespace and punctuation that findLastLatinPromotionBreakIndex looks for when deciding
// how much of the buffered und-Latn text is promoted to a newly detected Latin tag.
// NOTE(review): several characters appear twice in this class; the repeats may be
// fullwidth/halfwidth forms flattened by width normalization — confirm against upstream.
const LATIN_PROMOTION_BREAK_REGEX = /[\s,.!?;:,、。!?;:.。、]/u;
|
|
404
502
|
function segmentTextByLocale(text, options = {}) {
|
|
503
|
+
const context = resolveLocaleDetectContext(options);
|
|
405
504
|
const chunks = [];
|
|
406
505
|
let currentLocale = DEFAULT_LOCALE;
|
|
407
506
|
let buffer = "";
|
|
408
507
|
let bufferHasScript = false;
|
|
508
|
+
let sawCarryBoundary = false;
|
|
509
|
+
const updateCarryBoundaryState = (detected, char) => {
|
|
510
|
+
if (detected !== null) {
|
|
511
|
+
sawCarryBoundary = false;
|
|
512
|
+
return;
|
|
513
|
+
}
|
|
514
|
+
if (HARD_BOUNDARY_REGEX.test(char)) sawCarryBoundary = true;
|
|
515
|
+
};
|
|
409
516
|
for (const char of text) {
|
|
410
|
-
const detected = detectLocaleForChar(char, currentLocale, options);
|
|
517
|
+
const detected = detectLocaleForChar(char, currentLocale, options, context, !sawCarryBoundary, !sawCarryBoundary);
|
|
411
518
|
const targetLocale = detected ?? currentLocale;
|
|
412
519
|
if (buffer === "") {
|
|
413
520
|
currentLocale = targetLocale;
|
|
414
521
|
buffer = char;
|
|
415
522
|
bufferHasScript = detected !== null;
|
|
523
|
+
updateCarryBoundaryState(detected, char);
|
|
416
524
|
continue;
|
|
417
525
|
}
|
|
418
526
|
if (detected !== null && !bufferHasScript) {
|
|
419
527
|
currentLocale = targetLocale;
|
|
420
528
|
buffer += char;
|
|
421
529
|
bufferHasScript = true;
|
|
530
|
+
updateCarryBoundaryState(detected, char);
|
|
422
531
|
continue;
|
|
423
532
|
}
|
|
424
533
|
if (targetLocale !== currentLocale && detected !== null) {
|
|
425
|
-
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
|
|
534
|
+
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
|
|
535
|
+
const promotionBreakIndex = findLastLatinPromotionBreakIndex(buffer);
|
|
536
|
+
if (promotionBreakIndex === -1) {
|
|
537
|
+
currentLocale = targetLocale;
|
|
538
|
+
buffer += char;
|
|
539
|
+
bufferHasScript = true;
|
|
540
|
+
updateCarryBoundaryState(detected, char);
|
|
541
|
+
continue;
|
|
542
|
+
}
|
|
543
|
+
const prefix = buffer.slice(0, promotionBreakIndex + 1);
|
|
544
|
+
const suffix = buffer.slice(promotionBreakIndex + 1);
|
|
545
|
+
if (prefix.length > 0) chunks.push({
|
|
546
|
+
locale: currentLocale,
|
|
547
|
+
text: prefix
|
|
548
|
+
});
|
|
426
549
|
currentLocale = targetLocale;
|
|
427
|
-
buffer
|
|
550
|
+
buffer = `${suffix}${char}`;
|
|
428
551
|
bufferHasScript = true;
|
|
552
|
+
updateCarryBoundaryState(detected, char);
|
|
429
553
|
continue;
|
|
430
554
|
}
|
|
431
555
|
chunks.push({
|
|
@@ -435,10 +559,12 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
435
559
|
currentLocale = targetLocale;
|
|
436
560
|
buffer = char;
|
|
437
561
|
bufferHasScript = true;
|
|
562
|
+
updateCarryBoundaryState(detected, char);
|
|
438
563
|
continue;
|
|
439
564
|
}
|
|
440
565
|
buffer += char;
|
|
441
566
|
if (detected !== null) bufferHasScript = true;
|
|
567
|
+
updateCarryBoundaryState(detected, char);
|
|
442
568
|
}
|
|
443
569
|
if (buffer.length > 0) chunks.push({
|
|
444
570
|
locale: currentLocale,
|
|
@@ -446,6 +572,14 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
446
572
|
});
|
|
447
573
|
return mergeAdjacentChunks(chunks);
|
|
448
574
|
}
|
|
575
|
+
/**
 * Scans the buffer from its end for the last character matching
 * LATIN_PROMOTION_BREAK_REGEX.
 *
 * @param {string} buffer - Text accumulated for the current chunk.
 * @returns {number} Index of the last matching character, or -1 when none matches.
 */
function findLastLatinPromotionBreakIndex(buffer) {
  let position = buffer.length - 1;
  while (position >= 0) {
    const unit = buffer[position];
    if (unit && LATIN_PROMOTION_BREAK_REGEX.test(unit)) {
      return position;
    }
    position -= 1;
  }
  return -1;
}
|
|
449
583
|
function mergeAdjacentChunks(chunks) {
|
|
450
584
|
if (chunks.length === 0) return chunks;
|
|
451
585
|
const merged = [];
|
|
@@ -475,6 +609,8 @@ function wordCounter(text, options = {}) {
|
|
|
475
609
|
latinLanguageHint: options.latinLanguageHint,
|
|
476
610
|
latinTagHint: options.latinTagHint,
|
|
477
611
|
latinLocaleHint: options.latinLocaleHint,
|
|
612
|
+
latinHintRules: options.latinHintRules,
|
|
613
|
+
useDefaultLatinHints: options.useDefaultLatinHints,
|
|
478
614
|
hanLanguageHint: options.hanLanguageHint,
|
|
479
615
|
hanTagHint: options.hanTagHint
|
|
480
616
|
});
|