@dev-pi2pie/word-counter 0.1.2 → 0.1.3-canary.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,12 +1,3 @@
1
- //#region src/wc/locale-detect.d.ts
2
- interface LocaleDetectOptions {
3
- latinLanguageHint?: string;
4
- latinTagHint?: string;
5
- latinLocaleHint?: string;
6
- hanLanguageHint?: string;
7
- hanTagHint?: string;
8
- }
9
- //#endregion
10
1
  //#region src/wc/types.d.ts
11
2
  interface LocaleChunk {
12
3
  locale: string;
@@ -53,11 +44,18 @@ interface NonWordCollection {
53
44
  whitespace?: WhitespaceCounts;
54
45
  counts: NonWordCounts;
55
46
  }
47
+ interface LatinHintRule {
48
+ tag: string;
49
+ pattern: string | RegExp;
50
+ priority?: number;
51
+ }
56
52
  interface WordCounterOptions {
57
53
  mode?: WordCounterMode;
58
54
  latinLanguageHint?: string;
59
55
  latinTagHint?: string;
60
56
  latinLocaleHint?: string;
57
+ latinHintRules?: LatinHintRule[];
58
+ useDefaultLatinHints?: boolean;
61
59
  hanLanguageHint?: string;
62
60
  hanTagHint?: string;
63
61
  nonWords?: boolean;
@@ -91,6 +89,17 @@ interface WordCounterResult {
91
89
  breakdown: WordCounterBreakdown;
92
90
  }
93
91
  //#endregion
92
+ //#region src/wc/locale-detect.d.ts
93
+ interface LocaleDetectOptions {
94
+ latinLanguageHint?: string;
95
+ latinTagHint?: string;
96
+ latinLocaleHint?: string;
97
+ latinHintRules?: LatinHintRule[];
98
+ useDefaultLatinHints?: boolean;
99
+ hanLanguageHint?: string;
100
+ hanTagHint?: string;
101
+ }
102
+ //#endregion
94
103
  //#region src/wc/segment.d.ts
95
104
  declare function segmentTextByLocale(text: string, options?: LocaleDetectOptions): LocaleChunk[];
96
105
  //#endregion
@@ -98,6 +107,9 @@ declare function segmentTextByLocale(text: string, options?: LocaleDetectOptions
98
107
  declare function countWordsForLocale(text: string, locale: string): number;
99
108
  declare function countCharsForLocale(text: string, locale: string): number;
100
109
  //#endregion
110
+ //#region src/wc/latin-hints.d.ts
111
+ declare const DEFAULT_LATIN_HINT_RULES: ReadonlyArray<Readonly<LatinHintRule>>;
112
+ //#endregion
101
113
  //#region src/wc/wc.d.ts
102
114
  declare function wordCounter(text: string, options?: WordCounterOptions): WordCounterResult;
103
115
  //#endregion
@@ -133,5 +145,5 @@ declare function parseMarkdown(input: string): ParsedMarkdown;
133
145
  //#region src/markdown/section-count.d.ts
134
146
  declare function countSections(input: string, section: SectionMode, options?: WordCounterOptions): SectionedResult;
135
147
  //#endregion
136
- export { FrontmatterType, type NonWordCollection, ParsedMarkdown, SectionMode, SectionedResult, type WordCounterBreakdown, type WordCounterMode, type WordCounterOptions, type WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
148
+ export { DEFAULT_LATIN_HINT_RULES, FrontmatterType, type LatinHintRule, type NonWordCollection, ParsedMarkdown, SectionMode, SectionedResult, type WordCounterBreakdown, type WordCounterMode, type WordCounterOptions, type WordCounterResult, appendAll, countCharsForLocale, countSections, countWordsForLocale, wordCounter as default, wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
137
149
  //# sourceMappingURL=index.d.mts.map
@@ -322,10 +322,53 @@ function resolveMode(input, fallback = "chunk") {
322
322
  return normalizeMode(input) ?? fallback;
323
323
  }
324
324
 
325
+ //#endregion
326
+ //#region src/wc/latin-hints.ts
327
+ const DEFAULT_LATIN_HINT_RULES_SOURCE = [
328
+ {
329
+ tag: "de",
330
+ pattern: "[äöüÄÖÜß]"
331
+ },
332
+ {
333
+ tag: "es",
334
+ pattern: "[ñÑ¿¡]"
335
+ },
336
+ {
337
+ tag: "pt",
338
+ pattern: "[ãõÃÕ]"
339
+ },
340
+ {
341
+ tag: "fr",
342
+ pattern: "[œŒæÆ]"
343
+ },
344
+ {
345
+ tag: "pl",
346
+ pattern: "[ąćęłńśźżĄĆĘŁŃŚŹŻ]"
347
+ },
348
+ {
349
+ tag: "tr",
350
+ pattern: "[ıİğĞşŞ]"
351
+ },
352
+ {
353
+ tag: "ro",
354
+ pattern: "[ăĂâÂîÎșȘțȚ]"
355
+ },
356
+ {
357
+ tag: "hu",
358
+ pattern: "[őŐűŰ]"
359
+ },
360
+ {
361
+ tag: "is",
362
+ pattern: "[ðÐþÞ]"
363
+ }
364
+ ];
365
+ const DEFAULT_LATIN_HINT_RULES = Object.freeze(DEFAULT_LATIN_HINT_RULES_SOURCE.map((rule) => Object.freeze({ ...rule })));
366
+
325
367
  //#endregion
326
368
  //#region src/wc/locale-detect.ts
327
369
  const DEFAULT_LOCALE = "und-Latn";
328
- const DEFAULT_HAN_TAG = "zh-Hani";
370
+ const DEFAULT_HAN_TAG = "und-Hani";
371
+ const MAX_LATIN_HINT_PATTERN_LENGTH = 256;
329
372
  const regex = {
330
373
  hiragana: /\p{Script=Hiragana}/u,
331
374
  katakana: /\p{Script=Katakana}/u,
@@ -337,31 +380,10 @@ const regex = {
337
380
  devanagari: /\p{Script=Devanagari}/u,
338
381
  thai: /\p{Script=Thai}/u
339
382
  };
340
- const latinLocaleHints = [
341
- {
342
- locale: "de",
343
- regex: /[äöüÄÖÜß]/
344
- },
345
- {
346
- locale: "es",
347
- regex: /[ñÑ¿¡]/
348
- },
349
- {
350
- locale: "pt",
351
- regex: /[ãõÃÕ]/
352
- },
353
- {
354
- locale: "fr",
355
- regex: /[œŒæÆ]/
356
- }
357
- ];
358
- const latinLocales = new Set([DEFAULT_LOCALE, ...latinLocaleHints.map((hint) => hint.locale)]);
359
- function isLatinLocale(locale) {
360
- return latinLocales.has(locale);
361
- }
362
- function detectLatinLocale(char) {
363
- for (const hint of latinLocaleHints) if (hint.regex.test(char)) return hint.locale;
364
- return DEFAULT_LOCALE;
383
+ const defaultLatinLocales = new Set([DEFAULT_LOCALE, ...DEFAULT_LATIN_HINT_RULES.map((hint) => hint.tag)]);
384
+ function isLatinLocale(locale, context) {
385
+ if (context) return context.latinLocales.has(locale);
386
+ return defaultLatinLocales.has(locale);
365
387
  }
366
388
  function resolveLatinHint(options) {
367
389
  const latinTagHint = options.latinTagHint?.trim();
@@ -377,7 +399,82 @@ function resolveHanHint(options) {
377
399
  const hanLanguageHint = options.hanLanguageHint?.trim();
378
400
  if (hanLanguageHint) return hanLanguageHint;
379
401
  }
380
- function detectLocaleForChar(char, previousLocale, options = {}) {
402
+ function compileLatinHintPattern(pattern, label) {
403
+ const source = typeof pattern === "string" ? pattern : pattern.source;
404
+ const hasUnicodeMode = typeof pattern !== "string" && (pattern.flags.includes("u") || pattern.flags.includes("v"));
405
+ const flags = typeof pattern === "string" ? "u" : hasUnicodeMode ? pattern.flags : `${pattern.flags}u`;
406
+ if (source.length === 0) throw new Error(`${label}: pattern must not be empty.`);
407
+ if (source.length > MAX_LATIN_HINT_PATTERN_LENGTH) throw new Error(`${label}: pattern must be at most ${MAX_LATIN_HINT_PATTERN_LENGTH} characters.`);
408
+ try {
409
+ return new RegExp(source, flags);
410
+ } catch (error) {
411
+ const message = error instanceof Error ? error.message : String(error);
412
+ throw new Error(`${label}: invalid Unicode regex pattern (${message}).`);
413
+ }
414
+ }
415
+ function normalizeLatinHintPriority(priority, label) {
416
+ if (priority === void 0) return 0;
417
+ if (typeof priority !== "number" || !Number.isFinite(priority)) throw new Error(`${label}: priority must be a finite number when provided.`);
418
+ return priority;
419
+ }
420
+ function compileLatinHintRule(rule, order, label) {
421
+ const tag = typeof rule.tag === "string" ? rule.tag.trim() : "";
422
+ if (!tag) throw new Error(`${label}: tag must be a non-empty string.`);
423
+ return {
424
+ tag,
425
+ pattern: compileLatinHintPattern(rule.pattern, label),
426
+ priority: normalizeLatinHintPriority(rule.priority, label),
427
+ order
428
+ };
429
+ }
430
+ function resolveLatinHintRules(options) {
431
+ const useDefaultLatinHints = options.useDefaultLatinHints !== false;
432
+ const customRules = options.latinHintRules ?? [];
433
+ const combinedRules = [];
434
+ for (let index = 0; index < customRules.length; index += 1) {
435
+ const rule = customRules[index];
436
+ if (!rule) continue;
437
+ combinedRules.push({
438
+ rule,
439
+ label: `Invalid custom Latin hint rule at index ${index}`
440
+ });
441
+ }
442
+ if (useDefaultLatinHints) for (let index = 0; index < DEFAULT_LATIN_HINT_RULES.length; index += 1) {
443
+ const rule = DEFAULT_LATIN_HINT_RULES[index];
444
+ if (!rule) continue;
445
+ combinedRules.push({
446
+ rule,
447
+ label: `Invalid default Latin hint rule at index ${index}`
448
+ });
449
+ }
450
+ const resolvedRules = combinedRules.map((entry, index) => compileLatinHintRule(entry.rule, index, entry.label));
451
+ resolvedRules.sort((left, right) => {
452
+ if (left.priority !== right.priority) return right.priority - left.priority;
453
+ return left.order - right.order;
454
+ });
455
+ return resolvedRules;
456
+ }
457
+ function resolveLocaleDetectContext(options = {}) {
458
+ const latinHint = resolveLatinHint(options);
459
+ const latinHintRules = resolveLatinHintRules(options);
460
+ const latinLocales = new Set([DEFAULT_LOCALE]);
461
+ for (const rule of latinHintRules) latinLocales.add(rule.tag);
462
+ if (latinHint) latinLocales.add(latinHint);
463
+ return {
464
+ latinHint,
465
+ hanHint: resolveHanHint(options),
466
+ latinHintRules,
467
+ latinLocales
468
+ };
469
+ }
470
+ function detectLatinLocale(char, context) {
471
+ for (const hint of context.latinHintRules) {
472
+ hint.pattern.lastIndex = 0;
473
+ if (hint.pattern.test(char)) return hint.tag;
474
+ }
475
+ return DEFAULT_LOCALE;
476
+ }
477
+ function detectLocaleForChar(char, previousLocale, options = {}, context = resolveLocaleDetectContext(options)) {
381
478
  if (regex.hiragana.test(char) || regex.katakana.test(char)) return "ja";
382
479
  if (regex.hangul.test(char)) return "ko";
383
480
  if (regex.arabic.test(char)) return "ar";
@@ -386,14 +483,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
386
483
  if (regex.thai.test(char)) return "th";
387
484
  if (regex.han.test(char)) {
388
485
  if (previousLocale && previousLocale.startsWith("ja")) return previousLocale;
389
- return resolveHanHint(options) ?? DEFAULT_HAN_TAG;
486
+ return context.hanHint ?? DEFAULT_HAN_TAG;
390
487
  }
391
488
  if (regex.latin.test(char)) {
392
- const hintedLocale = detectLatinLocale(char);
489
+ const hintedLocale = detectLatinLocale(char, context);
393
490
  if (hintedLocale !== DEFAULT_LOCALE) return hintedLocale;
394
- if (previousLocale && isLatinLocale(previousLocale) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
395
- const latinHint = resolveLatinHint(options);
396
- if (latinHint) return latinHint;
491
+ if (previousLocale && isLatinLocale(previousLocale, context) && previousLocale !== DEFAULT_LOCALE) return previousLocale;
492
+ if (context.latinHint) return context.latinHint;
397
493
  return DEFAULT_LOCALE;
398
494
  }
399
495
  return null;
@@ -402,12 +498,13 @@ function detectLocaleForChar(char, previousLocale, options = {}) {
402
498
  //#endregion
403
499
  //#region src/wc/segment.ts
404
500
  function segmentTextByLocale(text, options = {}) {
501
+ const context = resolveLocaleDetectContext(options);
405
502
  const chunks = [];
406
503
  let currentLocale = DEFAULT_LOCALE;
407
504
  let buffer = "";
408
505
  let bufferHasScript = false;
409
506
  for (const char of text) {
410
- const detected = detectLocaleForChar(char, currentLocale, options);
507
+ const detected = detectLocaleForChar(char, currentLocale, options, context);
411
508
  const targetLocale = detected ?? currentLocale;
412
509
  if (buffer === "") {
413
510
  currentLocale = targetLocale;
@@ -422,7 +519,7 @@ function segmentTextByLocale(text, options = {}) {
422
519
  continue;
423
520
  }
424
521
  if (targetLocale !== currentLocale && detected !== null) {
425
- if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale)) {
522
+ if (currentLocale === DEFAULT_LOCALE && isLatinLocale(targetLocale, context)) {
426
523
  currentLocale = targetLocale;
427
524
  buffer += char;
428
525
  bufferHasScript = true;
@@ -475,6 +572,8 @@ function wordCounter(text, options = {}) {
475
572
  latinLanguageHint: options.latinLanguageHint,
476
573
  latinTagHint: options.latinTagHint,
477
574
  latinLocaleHint: options.latinLocaleHint,
575
+ latinHintRules: options.latinHintRules,
576
+ useDefaultLatinHints: options.useDefaultLatinHints,
478
577
  hanLanguageHint: options.hanLanguageHint,
479
578
  hanTagHint: options.hanTagHint
480
579
  });
@@ -1115,5 +1214,5 @@ function countSections(input, section, options = {}) {
1115
1214
  }
1116
1215
 
1117
1216
  //#endregion
1118
- export { appendAll, countCharsForLocale, countSections, countWordsForLocale, wc_default as default, wc_default as wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
1217
+ export { DEFAULT_LATIN_HINT_RULES, appendAll, countCharsForLocale, countSections, countWordsForLocale, wc_default as default, wc_default as wordCounter, parseMarkdown, segmentTextByLocale, showSingularOrPluralWord };
1119
1218
  //# sourceMappingURL=index.mjs.map