@dev-pi2pie/word-counter 0.1.2 → 0.1.3-canary.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +44 -4
- package/dist/cjs/index.cjs +133 -34
- package/dist/cjs/index.cjs.map +1 -1
- package/dist/esm/bin.mjs +194 -39
- package/dist/esm/bin.mjs.map +1 -1
- package/dist/esm/index.d.mts +22 -10
- package/dist/esm/index.mjs +134 -35
- package/dist/esm/index.mjs.map +1 -1
- package/package.json +1 -1
package/dist/esm/index.d.mts
CHANGED
|
@@ -1,12 +1,3 @@
|
|
|
1
|
-
//#region src/wc/locale-detect.d.ts
|
|
2
|
-
interface LocaleDetectOptions {
|
|
3
|
-
latinLanguageHint?: string;
|
|
4
|
-
latinTagHint?: string;
|
|
5
|
-
latinLocaleHint?: string;
|
|
6
|
-
hanLanguageHint?: string;
|
|
7
|
-
hanTagHint?: string;
|
|
8
|
-
}
|
|
9
|
-
//#endregion
|
|
10
1
|
//#region src/wc/types.d.ts
|
|
11
2
|
interface LocaleChunk {
|
|
12
3
|
locale: string;
|
|
@@ -53,11 +44,18 @@ interface NonWordCollection {
|
|
|
53
44
|
whitespace?: WhitespaceCounts;
|
|
54
45
|
counts: NonWordCounts;
|
|
55
46
|
}
|
|
47
|
+
interface LatinHintRule {
|
|
48
|
+
tag: string;
|
|
49
|
+
pattern: string | RegExp;
|
|
50
|
+
priority?: number;
|
|
51
|
+
}
|
|
56
52
|
interface WordCounterOptions {
|
|
57
53
|
mode?: WordCounterMode;
|
|
58
54
|
latinLanguageHint?: string;
|
|
59
55
|
latinTagHint?: string;
|
|
60
56
|
latinLocaleHint?: string;
|
|
57
|
+
latinHintRules?: LatinHintRule[];
|
|
58
|
+
useDefaultLatinHints?: boolean;
|
|
61
59
|
hanLanguageHint?: string;
|
|
62
60
|
hanTagHint?: string;
|
|
63
61
|
nonWords?: boolean;
|
|
@@ -91,6 +89,17 @@ interface WordCounterResult {
|
|
|
91
89
|
breakdown: WordCounterBreakdown;
|
|
92
90
|
}
|
|
93
91
|
//#endregion
|
|
92
|
+
//#region src/wc/locale-detect.d.ts
|
|
93
|
+
interface LocaleDetectOptions {
|
|
94
|
+
latinLanguageHint?: string;
|
|
95
|
+
latinTagHint?: string;
|
|
96
|
+
latinLocaleHint?: string;
|
|
97
|
+
latinHintRules?: LatinHintRule[];
|
|
98
|
+
useDefaultLatinHints?: boolean;
|
|
99
|
+
hanLanguageHint?: string;
|
|
100
|
+
hanTagHint?: string;
|
|
101
|
+
}
|
|
102
|
+
//#endregion
|
|
94
103
|
//#region src/wc/segment.d.ts
|
|
95
104
|
declare function segmentTextByLocale(text: string, options?: LocaleDetectOptions): LocaleChunk[];
|
|
96
105
|
//#endregion
|
|
@@ -98,6 +107,9 @@ declare function segmentTextByLocale(text: string, options?: LocaleDetectOptions
|
|
|
98
107
|
declare function countWordsForLocale(text: string, locale: string): number;
|
|
99
108
|
declare function countCharsForLocale(text: string, locale: string): number;
|
|
100
109
|
//#endregion
|
|
110
|
+
//#region src/wc/latin-hints.d.ts
|
|
111
|
+
declare const DEFAULT_LATIN_HINT_RULES: ReadonlyArray<Readonly<LatinHintRule>>;
|
|
112
|
+
//#endregion
|
|
101
113
|
//#region src/wc/wc.d.ts
|
|
102
114
|
declare function wordCounter(text: string, options?: WordCounterOptions): WordCounterResult;
|
|
103
115
|
//#endregion
|
|
@@ -133,5 +145,5 @@ declare function parseMarkdown(input: string): ParsedMarkdown;
|
|
|
133
145
|
//#region src/markdown/section-count.d.ts
|
|
134
146
|
declare function countSections(input: string, section: SectionMode, options?: WordCounterOptions): SectionedResult;
|
|
135
147
|
//#endregion
|
|
136
|
-
export { FrontmatterType, type NonWordCollection, ParsedMarkdown, SectionMode, SectionedResult, type WordCounterBreakdown, type WordCounterMode, type WordCounterOptions, type WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
|
148
|
+
export { DEFAULT_LATIN_HINT_RULES, FrontmatterType, type LatinHintRule, type NonWordCollection, ParsedMarkdown, SectionMode, SectionedResult, type WordCounterBreakdown, type WordCounterMode, type WordCounterOptions, type WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
|
137
149
|
//# sourceMappingURL=index.d.mts.map
|
package/dist/esm/index.mjs
CHANGED
|
@@ -322,10 +322,53 @@ function resolveMode(input, fallback = "chunk") {
|
|
|
322
322
|
return normalizeMode(input) ?? fallback;
|
|
323
323
|
}
|
|
324
324
|
|
|
325
|
+
//#endregion
|
|
326
|
+
//#region src/wc/latin-hints.ts
|
|
327
|
+
const DEFAULT_LATIN_HINT_RULES_SOURCE = [
|
|
328
|
+
{
|
|
329
|
+
tag: "de",
|
|
330
|
+
pattern: "[äöüÄÖÜß]"
|
|
331
|
+
},
|
|
332
|
+
{
|
|
333
|
+
tag: "es",
|
|
334
|
+
pattern: "[ñÑ¿¡]"
|
|
335
|
+
},
|
|
336
|
+
{
|
|
337
|
+
tag: "pt",
|
|
338
|
+
pattern: "[ãõÃÕ]"
|
|
339
|
+
},
|
|
340
|
+
{
|
|
341
|
+
tag: "fr",
|
|
342
|
+
pattern: "[œŒæÆ]"
|
|
343
|
+
},
|
|
344
|
+
{
|
|
345
|
+
tag: "pl",
|
|
346
|
+
pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]"
|
|
347
|
+
},
|
|
348
|
+
{
|
|
349
|
+
tag: "tr",
|
|
350
|
+
pattern: "[ıİğĞşŞ]"
|
|
351
|
+
},
|
|
352
|
+
{
|
|
353
|
+
tag: "ro",
|
|
354
|
+
pattern: "[ăĂâÂîÎșȘțȚ]"
|
|
355
|
+
},
|
|
356
|
+
{
|
|
357
|
+
tag: "hu",
|
|
358
|
+
pattern: "[őŐűŰ]"
|
|
359
|
+
},
|
|
360
|
+
{
|
|
361
|
+
tag: "is",
|
|
362
|
+
pattern: "[ðÐþÞ]"
|
|
363
|
+
}
|
|
364
|
+
];
|
|
365
|
+
const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
|
|
366
|
+
|
|
325
367
|
//#endregion
|
|
326
368
|
//#region src/wc/locale-detect.ts
|
|
327
369
|
const DEFAULT_LOCALE = "und-Latn";
|
|
328
|
-
const DEFAULT_HAN_TAG = "
|
|
370
|
+
const DEFAULT_HAN_TAG = "und-Hani";
|
|
371
|
+
const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
|
|
329
372
|
const regex = {
|
|
330
373
|
hiragana: /\p{Script=Hiragana}/u,
|
|
331
374
|
katakana: /\p{Script=Katakana}/u,
|
|
@@ -337,31 +380,10 @@ const regex = {
|
|
|
337
380
|
devanagari: /\p{Script=Devanagari}/u,
|
|
338
381
|
thai: /\p{Script=Thai}/u
|
|
339
382
|
};
|
|
340
|
-
const latinLocaleHints = [
|
|
341
|
-
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
},
|
|
345
|
-
{
|
|
346
|
-
locale: "es",
|
|
347
|
-
regex: /[ñÑ¿¡]/
|
|
348
|
-
},
|
|
349
|
-
{
|
|
350
|
-
locale: "pt",
|
|
351
|
-
regex: /[ãõÃÕ]/
|
|
352
|
-
},
|
|
353
|
-
{
|
|
354
|
-
locale: "fr",
|
|
355
|
-
regex: /[œŒæÆ]/
|
|
356
|
-
}
|
|
357
|
-
];
|
|
358
|
-
const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
|
|
359
|
-
function isLatinLocale(locale) {
|
|
360
|
-
return latinLocales.has(locale);
|
|
361
|
-
}
|
|
362
|
-
function detectLatinLocale(char) {
|
|
363
|
-
for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
|
|
364
|
-
return DEFAULT_LOCALE;
|
|
383
|
+
const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
|
|
384
|
+
function isLatinLocale(locale, context) {
|
|
385
|
+
if (context) return context.latinLocales.has(locale);
|
|
386
|
+
return defaultLatinLocales.has(locale);
|
|
365
387
|
}
|
|
366
388
|
function resolveLatinHint(options) {
|
|
367
389
|
const latinTagHint = options.latinTagHint?.trim();
|
|
@@ -377,7 +399,82 @@ function resolveHanHint(options) {
|
|
|
377
399
|
const hanLanguageHint = options.hanLanguageHint?.trim();
|
|
378
400
|
if (hanLanguageHint) return hanLanguageHint;
|
|
379
401
|
}
|
|
380
|
-
function detectLocaleForChar(char, previousLocale, options = {}) {
|
|
402
|
+
function compileLatinHintPattern(pattern, label) {
|
|
403
|
+
const source = typeof pattern === "string" ? pattern : pattern.source;
|
|
404
|
+
const hasUnicodeMode = typeof pattern !== "string" && (pattern.flags.includes("u") || pattern.flags.includes("v"));
|
|
405
|
+
const flags = typeof pattern === "string" ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;
|
|
406
|
+
if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
|
|
407
|
+
if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);
|
|
408
|
+
try {
|
|
409
|
+
return new RegExp(source, flags);
|
|
410
|
+
} catch (error) {
|
|
411
|
+
const message = error instanceof Error ? error.message : String(error);
|
|
412
|
+
throw new Error(`${label}: invalid Unicode regex pattern (${message}).`);
|
|
413
|
+
}
|
|
414
|
+
}
|
|
415
|
+
function normalizeLatinHintPriority(priority, label) {
|
|
416
|
+
if (priority === void 0) return 0;
|
|
417
|
+
if (typeof priority !== "number" || !Number.isFinite(priority)) throw new Error(`${label}: priority must be a finite number when provided.`);
|
|
418
|
+
return priority;
|
|
419
|
+
}
|
|
420
|
+
function compileLatinHintRule(rule, order, label) {
|
|
421
|
+
const tag = typeof rule.tag === "string" ? rule.tag.trim() : "";
|
|
422
|
+
if (!tag) throw new Error(`${label}: tag must be a non-empty string.`);
|
|
423
|
+
return {
|
|
424
|
+
tag,
|
|
425
|
+
pattern: compileLatinHintPattern(rule.pattern, label),
|
|
426
|
+
priority: normalizeLatinHintPriority(rule.priority, label),
|
|
427
|
+
order
|
|
428
|
+
};
|
|
429
|
+
}
|
|
430
|
+
function resolveLatinHintRules(options) {
|
|
431
|
+
const useDefaultLatinHints = options.useDefaultLatinHints !== false;
|
|
432
|
+
const customRules = options.latinHintRules ?? [];
|
|
433
|
+
const combinedRules = [];
|
|
434
|
+
for (let index = 0; index < customRules.length; index += 1) {
|
|
435
|
+
const rule = customRules[index];
|
|
436
|
+
if (!rule) continue;
|
|
437
|
+
combinedRules.push({
|
|
438
|
+
rule,
|
|
439
|
+
label: `Invalid custom Latin hint rule at index ${index}`
|
|
440
|
+
});
|
|
441
|
+
}
|
|
442
|
+
if (useDefaultLatinHints) for (let index = 0; index < DEFAULT_LATIN_HINT_RULES.length; index += 1) {
|
|
443
|
+
const rule = DEFAULT_LATIN_HINT_RULES[index];
|
|
444
|
+
if (!rule) continue;
|
|
445
|
+
combinedRules.push({
|
|
446
|
+
rule,
|
|
447
|
+
label: `Invalid default Latin hint rule at index ${index}`
|
|
448
|
+
});
|
|
449
|
+
}
|
|
450
|
+
const resolvedRules = combinedRules.map((entry, index) => compileLatinHintRule(entry.rule, index, entry.label));
|
|
451
|
+
resolvedRules.sort((left, right) => {
|
|
452
|
+
if (left.priority !== right.priority) return right.priority - left.priority;
|
|
453
|
+
return left.order - right.order;
|
|
454
|
+
});
|
|
455
|
+
return resolvedRules;
|
|
456
|
+
}
|
|
457
|
+
function resolveLocaleDetectContext(options = {}) {
|
|
458
|
+
const latinHint = resolveLatinHint(options);
|
|
459
|
+
const latinHintRules = resolveLatinHintRules(options);
|
|
460
|
+
const latinLocales = new Set([DEFAULT_LOCALE]);
|
|
461
|
+
for (const rule of latinHintRules) latinLocales.add(rule.tag);
|
|
462
|
+
if (latinHint) latinLocales.add(latinHint);
|
|
463
|
+
return {
|
|
464
|
+
latinHint,
|
|
465
|
+
hanHint: resolveHanHint(options),
|
|
466
|
+
latinHintRules,
|
|
467
|
+
latinLocales
|
|
468
|
+
};
|
|
469
|
+
}
|
|
470
|
+
function detectLatinLocale(char, context) {
|
|
471
|
+
for (const hint of context.latinHintRules) {
|
|
472
|
+
hint.pattern.lastIndex = 0;
|
|
473
|
+
if (hint.pattern.test(char)) return hint.tag;
|
|
474
|
+
}
|
|
475
|
+
return DEFAULT_LOCALE;
|
|
476
|
+
}
|
|
477
|
+
function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options)) {
|
|
381
478
|
if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
|
|
382
479
|
if (regex.hangul.test(char)) return "ko";
|
|
383
480
|
if (regex.arabic.test(char)) return "ar";
|
|
@@ -386,14 +483,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
|
|
|
386
483
|
if (regex.thai.test(char)) return "th";
|
|
387
484
|
if (regex.han.test(char)) {
|
|
388
485
|
if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
|
|
389
|
-
return
|
|
486
|
+
return context.hanHint ?? DEFAULT_HAN_TAG;
|
|
390
487
|
}
|
|
391
488
|
if (regex.latin.test(char)) {
|
|
392
|
-
const hintedLocale = detectLatinLocale(char);
|
|
489
|
+
const hintedLocale = detectLatinLocale(char, context);
|
|
393
490
|
if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
|
|
394
|
-
if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
395
|
-
|
|
396
|
-
if (latinHint) return latinHint;
|
|
491
|
+
if (previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
|
|
492
|
+
if (context.latinHint) return context.latinHint;
|
|
397
493
|
return DEFAULT_LOCALE;
|
|
398
494
|
}
|
|
399
495
|
return null;
|
|
@@ -402,12 +498,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
|
|
|
402
498
|
//#endregion
|
|
403
499
|
//#region src/wc/segment.ts
|
|
404
500
|
function segmentTextByLocale(text, options = {}) {
|
|
501
|
+
const context = resolveLocaleDetectContext(options);
|
|
405
502
|
const chunks = [];
|
|
406
503
|
let currentLocale = DEFAULT_LOCALE;
|
|
407
504
|
let buffer = "";
|
|
408
505
|
let bufferHasScript = false;
|
|
409
506
|
for (const char of text) {
|
|
410
|
-
const detected = detectLocaleForChar(char, currentLocale, options);
|
|
507
|
+
const detected = detectLocaleForChar(char, currentLocale, options, context);
|
|
411
508
|
const targetLocale = detected ?? currentLocale;
|
|
412
509
|
if (buffer === "") {
|
|
413
510
|
currentLocale = targetLocale;
|
|
@@ -422,7 +519,7 @@ function segmentTextByLocale(text, options = {}) {
|
|
|
422
519
|
continue;
|
|
423
520
|
}
|
|
424
521
|
if (targetLocale !== currentLocale && detected !== null) {
|
|
425
|
-
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
|
|
522
|
+
if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
|
|
426
523
|
currentLocale = targetLocale;
|
|
427
524
|
buffer += char;
|
|
428
525
|
bufferHasScript = true;
|
|
@@ -475,6 +572,8 @@ function wordCounter(text, options = {}) {
|
|
|
475
572
|
latinLanguageHint: options.latinLanguageHint,
|
|
476
573
|
latinTagHint: options.latinTagHint,
|
|
477
574
|
latinLocaleHint: options.latinLocaleHint,
|
|
575
|
+
latinHintRules: options.latinHintRules,
|
|
576
|
+
useDefaultLatinHints: options.useDefaultLatinHints,
|
|
478
577
|
hanLanguageHint: options.hanLanguageHint,
|
|
479
578
|
hanTagHint: options.hanTagHint
|
|
480
579
|
});
|
|
@@ -1115,5 +1214,5 @@ function countSections(input, section, options = {}) {
|
|
|
1115
1214
|
}
|
|
1116
1215
|
|
|
1117
1216
|
//#endregion
|
|
1118
|
-
export { appendAll, countCharsForLocale, countSections, countWordsForLocale, wc_default as default, wc_default as wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
|
1217
|
+
export { DEFAULT_LATIN_HINT_RULES, appendAll, countCharsForLocale, countSections, countWordsForLocale, wc_default as default, wc_default as wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
|
|
1119
1218
|
//# sourceMappingURL=index.mjs.map
|