gs-tokenizer 0.1.19 → 0.1.20
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/core.cjs +193 -44
- package/lib/core.d.ts +57 -1
- package/lib/core.js +197 -43
- package/lib/type.cjs +1 -1
- package/lib/type.d.ts +21 -16
- package/lib/type.js +1 -1
- package/package.json +1 -1
package/lib/core.cjs
CHANGED
|
@@ -528,7 +528,7 @@ function tokenText(tokens, exclude = ["punctuation", "space"]) {
|
|
|
528
528
|
function detectLang(str, fastCJK = !0) {
|
|
529
529
|
if (/^[a-zA-Z0-9]+$/.test(str) && /[a-zA-Z]/.test(str))
|
|
530
530
|
return type.Lang.EN;
|
|
531
|
-
let lang = type.Lang.NONE;
|
|
531
|
+
let lang = type.Lang.NONE, secondLang = type.Lang.NONE;
|
|
532
532
|
if (/^\s+$/.test(str))
|
|
533
533
|
return type.Lang.WHITESPACE;
|
|
534
534
|
for (let i = 0; i < str.length; ) {
|
|
@@ -540,40 +540,60 @@ function detectLang(str, fastCJK = !0) {
|
|
|
540
540
|
if (cp >= 48 && cp <= 57) {
|
|
541
541
|
if (lang === type.Lang.NONE)
|
|
542
542
|
lang = type.Lang.NUMERIC_HALF;
|
|
543
|
-
else if (lang !== type.Lang.NUMERIC_HALF)
|
|
544
|
-
|
|
543
|
+
else if (lang !== type.Lang.NUMERIC_HALF) {
|
|
544
|
+
if (secondLang === type.Lang.NONE)
|
|
545
|
+
secondLang = lang;
|
|
546
|
+
else if (secondLang !== type.Lang.NUMERIC_HALF)
|
|
547
|
+
return secondLang | lang | type.Lang.NUMERIC_HALF;
|
|
548
|
+
}
|
|
545
549
|
i++;
|
|
546
550
|
continue;
|
|
547
551
|
}
|
|
548
552
|
if (cp >= 65296 && cp <= 65305) {
|
|
549
553
|
if (lang === type.Lang.NONE)
|
|
550
554
|
lang = type.Lang.NUMERIC_FULL;
|
|
551
|
-
else if (lang !== type.Lang.NUMERIC_FULL)
|
|
552
|
-
|
|
555
|
+
else if (lang !== type.Lang.NUMERIC_FULL) {
|
|
556
|
+
if (secondLang === type.Lang.NONE)
|
|
557
|
+
secondLang = lang;
|
|
558
|
+
else if (secondLang !== type.Lang.NUMERIC_FULL)
|
|
559
|
+
return secondLang | lang | type.Lang.NUMERIC_FULL;
|
|
560
|
+
}
|
|
553
561
|
i += cp > 65535 ? 2 : 1;
|
|
554
562
|
continue;
|
|
555
563
|
}
|
|
556
564
|
if (cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543) {
|
|
557
565
|
if (lang === type.Lang.NONE)
|
|
558
566
|
lang = type.Lang.NUMERIC_OTHER;
|
|
559
|
-
else if (lang !== type.Lang.NUMERIC_OTHER)
|
|
560
|
-
|
|
567
|
+
else if (lang !== type.Lang.NUMERIC_OTHER) {
|
|
568
|
+
if (secondLang === type.Lang.NONE)
|
|
569
|
+
secondLang = lang;
|
|
570
|
+
else if (secondLang !== type.Lang.NUMERIC_OTHER)
|
|
571
|
+
return secondLang | lang | type.Lang.NUMERIC_OTHER;
|
|
572
|
+
}
|
|
561
573
|
i += cp > 65535 ? 2 : 1;
|
|
562
574
|
continue;
|
|
563
575
|
}
|
|
564
576
|
if (cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991) {
|
|
565
577
|
if (lang === type.Lang.NONE)
|
|
566
578
|
lang = type.Lang.NUMERIC_OTHER;
|
|
567
|
-
else if (lang !== type.Lang.NUMERIC_OTHER)
|
|
568
|
-
|
|
579
|
+
else if (lang !== type.Lang.NUMERIC_OTHER) {
|
|
580
|
+
if (secondLang === type.Lang.NONE)
|
|
581
|
+
secondLang = lang;
|
|
582
|
+
else if (secondLang !== type.Lang.NUMERIC_OTHER)
|
|
583
|
+
return secondLang | lang | type.Lang.NUMERIC_OTHER;
|
|
584
|
+
}
|
|
569
585
|
i += cp > 65535 ? 2 : 1;
|
|
570
586
|
continue;
|
|
571
587
|
}
|
|
572
588
|
if (cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185) {
|
|
573
589
|
if (lang === type.Lang.NONE)
|
|
574
590
|
lang = type.Lang.NUMERIC_OTHER;
|
|
575
|
-
else if (lang !== type.Lang.NUMERIC_OTHER)
|
|
576
|
-
|
|
591
|
+
else if (lang !== type.Lang.NUMERIC_OTHER) {
|
|
592
|
+
if (secondLang === type.Lang.NONE)
|
|
593
|
+
secondLang = lang;
|
|
594
|
+
else if (secondLang !== type.Lang.NUMERIC_OTHER)
|
|
595
|
+
return secondLang | lang | type.Lang.NUMERIC_OTHER;
|
|
596
|
+
}
|
|
577
597
|
i += cp > 65535 ? 2 : 1;
|
|
578
598
|
continue;
|
|
579
599
|
}
|
|
@@ -582,24 +602,57 @@ function detectLang(str, fastCJK = !0) {
|
|
|
582
602
|
if (lang === type.Lang.NONE)
|
|
583
603
|
lang = type.Lang.EN;
|
|
584
604
|
else if (lang !== type.Lang.EN)
|
|
585
|
-
|
|
605
|
+
if (secondLang === type.Lang.NONE)
|
|
606
|
+
secondLang = lang, lang = type.Lang.EN;
|
|
607
|
+
else
|
|
608
|
+
return secondLang | lang | type.Lang.EN;
|
|
586
609
|
i++;
|
|
587
610
|
continue;
|
|
588
611
|
}
|
|
589
612
|
if (lang === type.Lang.NONE)
|
|
590
613
|
lang = type.Lang.SYMBOL_HALF;
|
|
591
614
|
else if (lang !== type.Lang.SYMBOL_HALF)
|
|
592
|
-
|
|
615
|
+
if (secondLang === type.Lang.NONE)
|
|
616
|
+
secondLang = lang, lang = type.Lang.SYMBOL_HALF;
|
|
617
|
+
else
|
|
618
|
+
return secondLang | lang | type.Lang.SYMBOL_HALF;
|
|
593
619
|
i++;
|
|
594
620
|
continue;
|
|
595
621
|
}
|
|
596
|
-
if (cp >= 19968 && cp <= 40959 ||
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
if (lang
|
|
602
|
-
|
|
622
|
+
if (cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903) {
|
|
623
|
+
const currentCJKLang = type.Lang.ZH;
|
|
624
|
+
if (fastCJK) return currentCJKLang;
|
|
625
|
+
if (lang === type.Lang.NONE)
|
|
626
|
+
lang = currentCJKLang;
|
|
627
|
+
else if (lang !== currentCJKLang)
|
|
628
|
+
if (secondLang === type.Lang.NONE)
|
|
629
|
+
secondLang = lang, lang = currentCJKLang;
|
|
630
|
+
else
|
|
631
|
+
return secondLang | lang | currentCJKLang;
|
|
632
|
+
i += cp > 65535 ? 2 : 1;
|
|
633
|
+
continue;
|
|
634
|
+
} else if (cp >= 12352 && cp <= 12543) {
|
|
635
|
+
const currentCJKLang = type.Lang.JA;
|
|
636
|
+
if (fastCJK) return currentCJKLang;
|
|
637
|
+
if (lang === type.Lang.NONE)
|
|
638
|
+
lang = currentCJKLang;
|
|
639
|
+
else if (lang !== currentCJKLang)
|
|
640
|
+
if (secondLang === type.Lang.NONE)
|
|
641
|
+
secondLang = lang, lang = currentCJKLang;
|
|
642
|
+
else
|
|
643
|
+
return secondLang | lang | currentCJKLang;
|
|
644
|
+
i += cp > 65535 ? 2 : 1;
|
|
645
|
+
continue;
|
|
646
|
+
} else if (cp >= 44032 && cp <= 55215) {
|
|
647
|
+
const currentCJKLang = type.Lang.KO;
|
|
648
|
+
if (fastCJK) return currentCJKLang;
|
|
649
|
+
if (lang === type.Lang.NONE)
|
|
650
|
+
lang = currentCJKLang;
|
|
651
|
+
else if (lang !== currentCJKLang)
|
|
652
|
+
if (secondLang === type.Lang.NONE)
|
|
653
|
+
secondLang = lang, lang = currentCJKLang;
|
|
654
|
+
else
|
|
655
|
+
return secondLang | lang | currentCJKLang;
|
|
603
656
|
i += cp > 65535 ? 2 : 1;
|
|
604
657
|
continue;
|
|
605
658
|
}
|
|
@@ -608,44 +661,79 @@ function detectLang(str, fastCJK = !0) {
|
|
|
608
661
|
cp >= 127995 && cp <= 127999 || // Emoji组合标记 (零宽连接符)
|
|
609
662
|
cp >= 8205 && cp <= 8205 || // 变体选择器
|
|
610
663
|
cp >= 65024 && cp <= 65039) {
|
|
611
|
-
if (lang === type.Lang.NONE)
|
|
612
|
-
|
|
664
|
+
if (lang === type.Lang.NONE)
|
|
665
|
+
lang = type.Lang.EMOJI;
|
|
666
|
+
else if (lang !== type.Lang.EMOJI)
|
|
667
|
+
if (secondLang === type.Lang.NONE)
|
|
668
|
+
secondLang = lang, lang = type.Lang.EMOJI;
|
|
669
|
+
else
|
|
670
|
+
return secondLang | lang | type.Lang.EMOJI;
|
|
613
671
|
i += cp > 65535 ? 2 : 1;
|
|
614
672
|
continue;
|
|
615
673
|
}
|
|
616
674
|
if (cp >= 1024 && cp <= 1279) {
|
|
617
|
-
if (lang === type.Lang.NONE)
|
|
618
|
-
|
|
675
|
+
if (lang === type.Lang.NONE)
|
|
676
|
+
lang = type.Lang.RU;
|
|
677
|
+
else if (lang !== type.Lang.RU)
|
|
678
|
+
if (secondLang === type.Lang.NONE)
|
|
679
|
+
secondLang = lang, lang = type.Lang.RU;
|
|
680
|
+
else
|
|
681
|
+
return secondLang | lang | type.Lang.RU;
|
|
619
682
|
i += cp > 65535 ? 2 : 1;
|
|
620
683
|
continue;
|
|
621
684
|
}
|
|
622
685
|
if (cp >= 1536 && cp <= 1791) {
|
|
623
|
-
if (lang === type.Lang.NONE)
|
|
624
|
-
|
|
686
|
+
if (lang === type.Lang.NONE)
|
|
687
|
+
lang = type.Lang.AR;
|
|
688
|
+
else if (lang !== type.Lang.AR)
|
|
689
|
+
if (secondLang === type.Lang.NONE)
|
|
690
|
+
secondLang = lang, lang = type.Lang.AR;
|
|
691
|
+
else
|
|
692
|
+
return secondLang | lang | type.Lang.AR;
|
|
625
693
|
i += cp > 65535 ? 2 : 1;
|
|
626
694
|
continue;
|
|
627
695
|
}
|
|
628
696
|
if (cp >= 2304 && cp <= 2431) {
|
|
629
|
-
if (lang === type.Lang.NONE)
|
|
630
|
-
|
|
697
|
+
if (lang === type.Lang.NONE)
|
|
698
|
+
lang = type.Lang.HI;
|
|
699
|
+
else if (lang !== type.Lang.HI)
|
|
700
|
+
if (secondLang === type.Lang.NONE)
|
|
701
|
+
secondLang = lang, lang = type.Lang.HI;
|
|
702
|
+
else
|
|
703
|
+
return secondLang | lang | type.Lang.HI;
|
|
631
704
|
i += cp > 65535 ? 2 : 1;
|
|
632
705
|
continue;
|
|
633
706
|
}
|
|
634
707
|
if (cp >= 3584 && cp <= 3711) {
|
|
635
|
-
if (lang === type.Lang.NONE)
|
|
636
|
-
|
|
708
|
+
if (lang === type.Lang.NONE)
|
|
709
|
+
lang = type.Lang.TH;
|
|
710
|
+
else if (lang !== type.Lang.TH)
|
|
711
|
+
if (secondLang === type.Lang.NONE)
|
|
712
|
+
secondLang = lang, lang = type.Lang.TH;
|
|
713
|
+
else
|
|
714
|
+
return secondLang | lang | type.Lang.TH;
|
|
637
715
|
i += cp > 65535 ? 2 : 1;
|
|
638
716
|
continue;
|
|
639
717
|
}
|
|
640
718
|
if (cp >= 1424 && cp <= 1535) {
|
|
641
|
-
if (lang === type.Lang.NONE)
|
|
642
|
-
|
|
719
|
+
if (lang === type.Lang.NONE)
|
|
720
|
+
lang = type.Lang.HE;
|
|
721
|
+
else if (lang !== type.Lang.HE)
|
|
722
|
+
if (secondLang === type.Lang.NONE)
|
|
723
|
+
secondLang = lang, lang = type.Lang.HE;
|
|
724
|
+
else
|
|
725
|
+
return secondLang | lang | type.Lang.HE;
|
|
643
726
|
i += cp > 65535 ? 2 : 1;
|
|
644
727
|
continue;
|
|
645
728
|
}
|
|
646
729
|
if (cp >= 880 && cp <= 1023) {
|
|
647
|
-
if (lang === type.Lang.NONE)
|
|
648
|
-
|
|
730
|
+
if (lang === type.Lang.NONE)
|
|
731
|
+
lang = type.Lang.EL;
|
|
732
|
+
else if (lang !== type.Lang.EL)
|
|
733
|
+
if (secondLang === type.Lang.NONE)
|
|
734
|
+
secondLang = lang, lang = type.Lang.EL;
|
|
735
|
+
else
|
|
736
|
+
return secondLang | lang | type.Lang.EL;
|
|
649
737
|
i += cp > 65535 ? 2 : 1;
|
|
650
738
|
continue;
|
|
651
739
|
}
|
|
@@ -658,13 +746,18 @@ function detectLang(str, fastCJK = !0) {
|
|
|
658
746
|
if (lang === type.Lang.NONE)
|
|
659
747
|
lang = type.Lang.SYMBOL_FULL;
|
|
660
748
|
else if (lang !== type.Lang.SYMBOL_FULL)
|
|
661
|
-
|
|
749
|
+
if (secondLang === type.Lang.NONE)
|
|
750
|
+
secondLang = lang, lang = type.Lang.SYMBOL_FULL;
|
|
751
|
+
else
|
|
752
|
+
return secondLang | lang | type.Lang.SYMBOL_FULL;
|
|
662
753
|
i += cp > 65535 ? 2 : 1;
|
|
663
754
|
continue;
|
|
664
755
|
}
|
|
665
|
-
|
|
756
|
+
if (secondLang !== type.Lang.NONE)
|
|
757
|
+
return secondLang | lang | type.Lang.OTHER;
|
|
758
|
+
lang !== type.Lang.NONE && (secondLang = lang), lang = type.Lang.OTHER;
|
|
666
759
|
}
|
|
667
|
-
return lang === type.Lang.NONE ? type.Lang.EN : lang;
|
|
760
|
+
return secondLang !== type.Lang.NONE ? secondLang | lang : lang === type.Lang.NONE ? type.Lang.EN : lang;
|
|
668
761
|
}
|
|
669
762
|
class MultilingualTokenizer {
|
|
670
763
|
wordIndex = new FirstCharWordIndex();
|
|
@@ -791,8 +884,8 @@ class MultilingualTokenizer {
|
|
|
791
884
|
result.push(token);
|
|
792
885
|
continue;
|
|
793
886
|
}
|
|
794
|
-
const
|
|
795
|
-
|
|
887
|
+
const isCJK2 = cjkRegex.test(token.txt) || token.meta?.lang?.startsWith("zh") || token.meta?.lang?.startsWith("ja") || token.meta?.lang?.startsWith("ko");
|
|
888
|
+
isCJK2 && tokenLength > cjkTokenLengthLimit ? token.txt = token.txt.slice(0, cjkTokenLengthLimit) : !isCJK2 && tokenLength > enTokenLengthLimit && (token.txt = token.txt.slice(0, enTokenLengthLimit)), result.push(token);
|
|
796
889
|
}
|
|
797
890
|
return result;
|
|
798
891
|
}
|
|
@@ -836,10 +929,7 @@ class MultilingualTokenizer {
|
|
|
836
929
|
}
|
|
837
930
|
}
|
|
838
931
|
function detectChar(cp) {
|
|
839
|
-
return cp <= 127 && (cp === 32 || cp === 9 || cp === 10 || cp === 13) ? type.Lang.WHITESPACE : cp >= 48 && cp <= 57 ? type.Lang.NUMERIC_HALF : cp >= 65296 && cp <= 65305 ? type.Lang.NUMERIC_FULL : cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543 || cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991 || cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185 ? type.Lang.NUMERIC_OTHER : cp <= 127 ? cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122 ? type.Lang.EN : type.Lang.SYMBOL_HALF : cp >= 19968 && cp <= 40959 || //
|
|
840
|
-
cp >= 13312 && cp <= 19903 || // 汉字扩展A
|
|
841
|
-
cp >= 12352 && cp <= 12543 || // 日文假名
|
|
842
|
-
cp >= 44032 && cp <= 55215 ? type.Lang.CJK : cp >= 126976 && cp <= 129791 || // 经典emoji
|
|
932
|
+
return cp <= 127 && (cp === 32 || cp === 9 || cp === 10 || cp === 13) ? type.Lang.WHITESPACE : cp >= 48 && cp <= 57 ? type.Lang.NUMERIC_HALF : cp >= 65296 && cp <= 65305 ? type.Lang.NUMERIC_FULL : cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543 || cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991 || cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185 ? type.Lang.NUMERIC_OTHER : cp <= 127 ? cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122 ? type.Lang.EN : type.Lang.SYMBOL_HALF : cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903 ? type.Lang.ZH : cp >= 12352 && cp <= 12543 ? type.Lang.JA : cp >= 44032 && cp <= 55215 ? type.Lang.KO : cp >= 126976 && cp <= 129791 || // 经典emoji
|
|
843
933
|
cp >= 9728 && cp <= 10175 || // Emoji修饰符
|
|
844
934
|
cp >= 127995 && cp <= 127999 || // Emoji组合标记 (零宽连接符)
|
|
845
935
|
cp >= 8205 && cp <= 8205 || // 变体选择器
|
|
@@ -850,4 +940,63 @@ function detectChar(cp) {
|
|
|
850
940
|
cp >= 8192 && cp <= 8303 || cp >= 8448 && cp <= 9983 ? type.Lang.SYMBOL_FULL : type.Lang.OTHER
|
|
851
941
|
);
|
|
852
942
|
}
|
|
853
|
-
|
|
943
|
+
class LangHelper {
|
|
944
|
+
static is(lang, n) {
|
|
945
|
+
return (lang & n) !== 0;
|
|
946
|
+
}
|
|
947
|
+
/**
|
|
948
|
+
* 判断是否为数字类型
|
|
949
|
+
* @param lang 语言类型
|
|
950
|
+
* @returns 是否为数字类型
|
|
951
|
+
*/
|
|
952
|
+
static isNumeric(lang) {
|
|
953
|
+
return (lang & type.Lang.NUMERIC) !== 0;
|
|
954
|
+
}
|
|
955
|
+
/**
|
|
956
|
+
* 判断是否为符号类型
|
|
957
|
+
* @param lang 语言类型
|
|
958
|
+
* @returns 是否为符号类型
|
|
959
|
+
*/
|
|
960
|
+
static isSymbol(lang) {
|
|
961
|
+
return (lang & type.Lang.SYMBOL) !== 0;
|
|
962
|
+
}
|
|
963
|
+
/**
|
|
964
|
+
* 判断是否为CJK(中、日、韩)
|
|
965
|
+
* @param lang 语言类型
|
|
966
|
+
* @returns 是否为CJK
|
|
967
|
+
*/
|
|
968
|
+
static isCJK(lang) {
|
|
969
|
+
return (lang & type.Lang.CJK) !== 0;
|
|
970
|
+
}
|
|
971
|
+
}
|
|
972
|
+
class DetectLangHelper {
|
|
973
|
+
/**
|
|
974
|
+
* 判断文本是否为数字类型
|
|
975
|
+
* @param str 输入文本
|
|
976
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
977
|
+
* @returns 是否为数字类型
|
|
978
|
+
*/
|
|
979
|
+
static isNumeric(str, fastCJK = !0) {
|
|
980
|
+
return LangHelper.isNumeric(detectLang(str, fastCJK));
|
|
981
|
+
}
|
|
982
|
+
/**
|
|
983
|
+
* 判断文本是否为符号类型
|
|
984
|
+
* @param str 输入文本
|
|
985
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
986
|
+
* @returns 是否为符号类型
|
|
987
|
+
*/
|
|
988
|
+
static isSymbol(str, fastCJK = !0) {
|
|
989
|
+
return LangHelper.isSymbol(detectLang(str, fastCJK));
|
|
990
|
+
}
|
|
991
|
+
/**
|
|
992
|
+
* 判断文本是否为CJK(中、日、韩)
|
|
993
|
+
* @param str 输入文本
|
|
994
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
995
|
+
* @returns 是否为CJK
|
|
996
|
+
*/
|
|
997
|
+
static isCJK(str, fastCJK = !0) {
|
|
998
|
+
return LangHelper.isCJK(detectLang(str, fastCJK));
|
|
999
|
+
}
|
|
1000
|
+
}
|
|
1001
|
+
const { isNumeric, isSymbol, isCJK } = DetectLangHelper;
|
|
1002
|
+
exports.DetectLangHelper = DetectLangHelper, exports.LangHelper = LangHelper, exports.MultilingualTokenizer = MultilingualTokenizer, exports.detectChar = detectChar, exports.detectLang = detectLang, exports.isCJK = isCJK, exports.isNumeric = isNumeric, exports.isSymbol = isSymbol, exports.tokenText = tokenText;
|
package/lib/core.d.ts
CHANGED
|
@@ -91,6 +91,62 @@ declare function detectLang(str: string, fastCJK?: boolean): Lang;
|
|
|
91
91
|
*/
|
|
92
92
|
declare function detectChar(cp: number): Lang;
|
|
93
93
|
|
|
94
|
+
/**
|
|
95
|
+
* Lang类型辅助工具类
|
|
96
|
+
* 提供Lang类型的快速判断和操作方法
|
|
97
|
+
*/
|
|
98
|
+
declare class LangHelper {
|
|
99
|
+
static is(lang: Lang, n: number | Lang): boolean;
|
|
100
|
+
/**
|
|
101
|
+
* 判断是否为数字类型
|
|
102
|
+
* @param lang 语言类型
|
|
103
|
+
* @returns 是否为数字类型
|
|
104
|
+
*/
|
|
105
|
+
static isNumeric(lang: Lang): boolean;
|
|
106
|
+
/**
|
|
107
|
+
* 判断是否为符号类型
|
|
108
|
+
* @param lang 语言类型
|
|
109
|
+
* @returns 是否为符号类型
|
|
110
|
+
*/
|
|
111
|
+
static isSymbol(lang: Lang): boolean;
|
|
112
|
+
/**
|
|
113
|
+
* 判断是否为CJK(中、日、韩)
|
|
114
|
+
* @param lang 语言类型
|
|
115
|
+
* @returns 是否为CJK
|
|
116
|
+
*/
|
|
117
|
+
static isCJK(lang: Lang): boolean;
|
|
118
|
+
}
|
|
119
|
+
/**
|
|
120
|
+
* DetectLang辅助工具类
|
|
121
|
+
* 提供对detectLang函数的快速使用方式,参数为字符串
|
|
122
|
+
*/
|
|
123
|
+
declare class DetectLangHelper {
|
|
124
|
+
/**
|
|
125
|
+
* 判断文本是否为数字类型
|
|
126
|
+
* @param str 输入文本
|
|
127
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
128
|
+
* @returns 是否为数字类型
|
|
129
|
+
*/
|
|
130
|
+
static isNumeric(str: string, fastCJK?: boolean): boolean;
|
|
131
|
+
/**
|
|
132
|
+
* 判断文本是否为符号类型
|
|
133
|
+
* @param str 输入文本
|
|
134
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
135
|
+
* @returns 是否为符号类型
|
|
136
|
+
*/
|
|
137
|
+
static isSymbol(str: string, fastCJK?: boolean): boolean;
|
|
138
|
+
/**
|
|
139
|
+
* 判断文本是否为CJK(中、日、韩)
|
|
140
|
+
* @param str 输入文本
|
|
141
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
142
|
+
* @returns 是否为CJK
|
|
143
|
+
*/
|
|
144
|
+
static isCJK(str: string, fastCJK?: boolean): boolean;
|
|
145
|
+
}
|
|
146
|
+
declare const isNumeric: typeof DetectLangHelper.isNumeric;
|
|
147
|
+
declare const isSymbol: typeof DetectLangHelper.isSymbol;
|
|
148
|
+
declare const isCJK: typeof DetectLangHelper.isCJK;
|
|
149
|
+
|
|
94
150
|
declare function tokenText(tokens: IToken[], exclude?: TokenType[]): string[];
|
|
95
151
|
|
|
96
|
-
export { MultilingualTokenizer, detectChar, detectLang, tokenText };
|
|
152
|
+
export { DetectLangHelper, LangHelper, MultilingualTokenizer, detectChar, detectLang, isCJK, isNumeric, isSymbol, tokenText };
|
package/lib/core.js
CHANGED
|
@@ -527,7 +527,7 @@ function tokenText(tokens, exclude = ["punctuation", "space"]) {
|
|
|
527
527
|
function detectLang(str, fastCJK = !0) {
|
|
528
528
|
if (/^[a-zA-Z0-9]+$/.test(str) && /[a-zA-Z]/.test(str))
|
|
529
529
|
return Lang.EN;
|
|
530
|
-
let lang = Lang.NONE;
|
|
530
|
+
let lang = Lang.NONE, secondLang = Lang.NONE;
|
|
531
531
|
if (/^\s+$/.test(str))
|
|
532
532
|
return Lang.WHITESPACE;
|
|
533
533
|
for (let i = 0; i < str.length; ) {
|
|
@@ -539,40 +539,60 @@ function detectLang(str, fastCJK = !0) {
|
|
|
539
539
|
if (cp >= 48 && cp <= 57) {
|
|
540
540
|
if (lang === Lang.NONE)
|
|
541
541
|
lang = Lang.NUMERIC_HALF;
|
|
542
|
-
else if (lang !== Lang.NUMERIC_HALF)
|
|
543
|
-
|
|
542
|
+
else if (lang !== Lang.NUMERIC_HALF) {
|
|
543
|
+
if (secondLang === Lang.NONE)
|
|
544
|
+
secondLang = lang;
|
|
545
|
+
else if (secondLang !== Lang.NUMERIC_HALF)
|
|
546
|
+
return secondLang | lang | Lang.NUMERIC_HALF;
|
|
547
|
+
}
|
|
544
548
|
i++;
|
|
545
549
|
continue;
|
|
546
550
|
}
|
|
547
551
|
if (cp >= 65296 && cp <= 65305) {
|
|
548
552
|
if (lang === Lang.NONE)
|
|
549
553
|
lang = Lang.NUMERIC_FULL;
|
|
550
|
-
else if (lang !== Lang.NUMERIC_FULL)
|
|
551
|
-
|
|
554
|
+
else if (lang !== Lang.NUMERIC_FULL) {
|
|
555
|
+
if (secondLang === Lang.NONE)
|
|
556
|
+
secondLang = lang;
|
|
557
|
+
else if (secondLang !== Lang.NUMERIC_FULL)
|
|
558
|
+
return secondLang | lang | Lang.NUMERIC_FULL;
|
|
559
|
+
}
|
|
552
560
|
i += cp > 65535 ? 2 : 1;
|
|
553
561
|
continue;
|
|
554
562
|
}
|
|
555
563
|
if (cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543) {
|
|
556
564
|
if (lang === Lang.NONE)
|
|
557
565
|
lang = Lang.NUMERIC_OTHER;
|
|
558
|
-
else if (lang !== Lang.NUMERIC_OTHER)
|
|
559
|
-
|
|
566
|
+
else if (lang !== Lang.NUMERIC_OTHER) {
|
|
567
|
+
if (secondLang === Lang.NONE)
|
|
568
|
+
secondLang = lang;
|
|
569
|
+
else if (secondLang !== Lang.NUMERIC_OTHER)
|
|
570
|
+
return secondLang | lang | Lang.NUMERIC_OTHER;
|
|
571
|
+
}
|
|
560
572
|
i += cp > 65535 ? 2 : 1;
|
|
561
573
|
continue;
|
|
562
574
|
}
|
|
563
575
|
if (cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991) {
|
|
564
576
|
if (lang === Lang.NONE)
|
|
565
577
|
lang = Lang.NUMERIC_OTHER;
|
|
566
|
-
else if (lang !== Lang.NUMERIC_OTHER)
|
|
567
|
-
|
|
578
|
+
else if (lang !== Lang.NUMERIC_OTHER) {
|
|
579
|
+
if (secondLang === Lang.NONE)
|
|
580
|
+
secondLang = lang;
|
|
581
|
+
else if (secondLang !== Lang.NUMERIC_OTHER)
|
|
582
|
+
return secondLang | lang | Lang.NUMERIC_OTHER;
|
|
583
|
+
}
|
|
568
584
|
i += cp > 65535 ? 2 : 1;
|
|
569
585
|
continue;
|
|
570
586
|
}
|
|
571
587
|
if (cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185) {
|
|
572
588
|
if (lang === Lang.NONE)
|
|
573
589
|
lang = Lang.NUMERIC_OTHER;
|
|
574
|
-
else if (lang !== Lang.NUMERIC_OTHER)
|
|
575
|
-
|
|
590
|
+
else if (lang !== Lang.NUMERIC_OTHER) {
|
|
591
|
+
if (secondLang === Lang.NONE)
|
|
592
|
+
secondLang = lang;
|
|
593
|
+
else if (secondLang !== Lang.NUMERIC_OTHER)
|
|
594
|
+
return secondLang | lang | Lang.NUMERIC_OTHER;
|
|
595
|
+
}
|
|
576
596
|
i += cp > 65535 ? 2 : 1;
|
|
577
597
|
continue;
|
|
578
598
|
}
|
|
@@ -581,24 +601,57 @@ function detectLang(str, fastCJK = !0) {
|
|
|
581
601
|
if (lang === Lang.NONE)
|
|
582
602
|
lang = Lang.EN;
|
|
583
603
|
else if (lang !== Lang.EN)
|
|
584
|
-
|
|
604
|
+
if (secondLang === Lang.NONE)
|
|
605
|
+
secondLang = lang, lang = Lang.EN;
|
|
606
|
+
else
|
|
607
|
+
return secondLang | lang | Lang.EN;
|
|
585
608
|
i++;
|
|
586
609
|
continue;
|
|
587
610
|
}
|
|
588
611
|
if (lang === Lang.NONE)
|
|
589
612
|
lang = Lang.SYMBOL_HALF;
|
|
590
613
|
else if (lang !== Lang.SYMBOL_HALF)
|
|
591
|
-
|
|
614
|
+
if (secondLang === Lang.NONE)
|
|
615
|
+
secondLang = lang, lang = Lang.SYMBOL_HALF;
|
|
616
|
+
else
|
|
617
|
+
return secondLang | lang | Lang.SYMBOL_HALF;
|
|
592
618
|
i++;
|
|
593
619
|
continue;
|
|
594
620
|
}
|
|
595
|
-
if (cp >= 19968 && cp <= 40959 ||
|
|
596
|
-
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
if (lang
|
|
601
|
-
|
|
621
|
+
if (cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903) {
|
|
622
|
+
const currentCJKLang = Lang.ZH;
|
|
623
|
+
if (fastCJK) return currentCJKLang;
|
|
624
|
+
if (lang === Lang.NONE)
|
|
625
|
+
lang = currentCJKLang;
|
|
626
|
+
else if (lang !== currentCJKLang)
|
|
627
|
+
if (secondLang === Lang.NONE)
|
|
628
|
+
secondLang = lang, lang = currentCJKLang;
|
|
629
|
+
else
|
|
630
|
+
return secondLang | lang | currentCJKLang;
|
|
631
|
+
i += cp > 65535 ? 2 : 1;
|
|
632
|
+
continue;
|
|
633
|
+
} else if (cp >= 12352 && cp <= 12543) {
|
|
634
|
+
const currentCJKLang = Lang.JA;
|
|
635
|
+
if (fastCJK) return currentCJKLang;
|
|
636
|
+
if (lang === Lang.NONE)
|
|
637
|
+
lang = currentCJKLang;
|
|
638
|
+
else if (lang !== currentCJKLang)
|
|
639
|
+
if (secondLang === Lang.NONE)
|
|
640
|
+
secondLang = lang, lang = currentCJKLang;
|
|
641
|
+
else
|
|
642
|
+
return secondLang | lang | currentCJKLang;
|
|
643
|
+
i += cp > 65535 ? 2 : 1;
|
|
644
|
+
continue;
|
|
645
|
+
} else if (cp >= 44032 && cp <= 55215) {
|
|
646
|
+
const currentCJKLang = Lang.KO;
|
|
647
|
+
if (fastCJK) return currentCJKLang;
|
|
648
|
+
if (lang === Lang.NONE)
|
|
649
|
+
lang = currentCJKLang;
|
|
650
|
+
else if (lang !== currentCJKLang)
|
|
651
|
+
if (secondLang === Lang.NONE)
|
|
652
|
+
secondLang = lang, lang = currentCJKLang;
|
|
653
|
+
else
|
|
654
|
+
return secondLang | lang | currentCJKLang;
|
|
602
655
|
i += cp > 65535 ? 2 : 1;
|
|
603
656
|
continue;
|
|
604
657
|
}
|
|
@@ -607,44 +660,79 @@ function detectLang(str, fastCJK = !0) {
|
|
|
607
660
|
cp >= 127995 && cp <= 127999 || // Emoji组合标记 (零宽连接符)
|
|
608
661
|
cp >= 8205 && cp <= 8205 || // 变体选择器
|
|
609
662
|
cp >= 65024 && cp <= 65039) {
|
|
610
|
-
if (lang === Lang.NONE)
|
|
611
|
-
|
|
663
|
+
if (lang === Lang.NONE)
|
|
664
|
+
lang = Lang.EMOJI;
|
|
665
|
+
else if (lang !== Lang.EMOJI)
|
|
666
|
+
if (secondLang === Lang.NONE)
|
|
667
|
+
secondLang = lang, lang = Lang.EMOJI;
|
|
668
|
+
else
|
|
669
|
+
return secondLang | lang | Lang.EMOJI;
|
|
612
670
|
i += cp > 65535 ? 2 : 1;
|
|
613
671
|
continue;
|
|
614
672
|
}
|
|
615
673
|
if (cp >= 1024 && cp <= 1279) {
|
|
616
|
-
if (lang === Lang.NONE)
|
|
617
|
-
|
|
674
|
+
if (lang === Lang.NONE)
|
|
675
|
+
lang = Lang.RU;
|
|
676
|
+
else if (lang !== Lang.RU)
|
|
677
|
+
if (secondLang === Lang.NONE)
|
|
678
|
+
secondLang = lang, lang = Lang.RU;
|
|
679
|
+
else
|
|
680
|
+
return secondLang | lang | Lang.RU;
|
|
618
681
|
i += cp > 65535 ? 2 : 1;
|
|
619
682
|
continue;
|
|
620
683
|
}
|
|
621
684
|
if (cp >= 1536 && cp <= 1791) {
|
|
622
|
-
if (lang === Lang.NONE)
|
|
623
|
-
|
|
685
|
+
if (lang === Lang.NONE)
|
|
686
|
+
lang = Lang.AR;
|
|
687
|
+
else if (lang !== Lang.AR)
|
|
688
|
+
if (secondLang === Lang.NONE)
|
|
689
|
+
secondLang = lang, lang = Lang.AR;
|
|
690
|
+
else
|
|
691
|
+
return secondLang | lang | Lang.AR;
|
|
624
692
|
i += cp > 65535 ? 2 : 1;
|
|
625
693
|
continue;
|
|
626
694
|
}
|
|
627
695
|
if (cp >= 2304 && cp <= 2431) {
|
|
628
|
-
if (lang === Lang.NONE)
|
|
629
|
-
|
|
696
|
+
if (lang === Lang.NONE)
|
|
697
|
+
lang = Lang.HI;
|
|
698
|
+
else if (lang !== Lang.HI)
|
|
699
|
+
if (secondLang === Lang.NONE)
|
|
700
|
+
secondLang = lang, lang = Lang.HI;
|
|
701
|
+
else
|
|
702
|
+
return secondLang | lang | Lang.HI;
|
|
630
703
|
i += cp > 65535 ? 2 : 1;
|
|
631
704
|
continue;
|
|
632
705
|
}
|
|
633
706
|
if (cp >= 3584 && cp <= 3711) {
|
|
634
|
-
if (lang === Lang.NONE)
|
|
635
|
-
|
|
707
|
+
if (lang === Lang.NONE)
|
|
708
|
+
lang = Lang.TH;
|
|
709
|
+
else if (lang !== Lang.TH)
|
|
710
|
+
if (secondLang === Lang.NONE)
|
|
711
|
+
secondLang = lang, lang = Lang.TH;
|
|
712
|
+
else
|
|
713
|
+
return secondLang | lang | Lang.TH;
|
|
636
714
|
i += cp > 65535 ? 2 : 1;
|
|
637
715
|
continue;
|
|
638
716
|
}
|
|
639
717
|
if (cp >= 1424 && cp <= 1535) {
|
|
640
|
-
if (lang === Lang.NONE)
|
|
641
|
-
|
|
718
|
+
if (lang === Lang.NONE)
|
|
719
|
+
lang = Lang.HE;
|
|
720
|
+
else if (lang !== Lang.HE)
|
|
721
|
+
if (secondLang === Lang.NONE)
|
|
722
|
+
secondLang = lang, lang = Lang.HE;
|
|
723
|
+
else
|
|
724
|
+
return secondLang | lang | Lang.HE;
|
|
642
725
|
i += cp > 65535 ? 2 : 1;
|
|
643
726
|
continue;
|
|
644
727
|
}
|
|
645
728
|
if (cp >= 880 && cp <= 1023) {
|
|
646
|
-
if (lang === Lang.NONE)
|
|
647
|
-
|
|
729
|
+
if (lang === Lang.NONE)
|
|
730
|
+
lang = Lang.EL;
|
|
731
|
+
else if (lang !== Lang.EL)
|
|
732
|
+
if (secondLang === Lang.NONE)
|
|
733
|
+
secondLang = lang, lang = Lang.EL;
|
|
734
|
+
else
|
|
735
|
+
return secondLang | lang | Lang.EL;
|
|
648
736
|
i += cp > 65535 ? 2 : 1;
|
|
649
737
|
continue;
|
|
650
738
|
}
|
|
@@ -657,13 +745,18 @@ function detectLang(str, fastCJK = !0) {
|
|
|
657
745
|
if (lang === Lang.NONE)
|
|
658
746
|
lang = Lang.SYMBOL_FULL;
|
|
659
747
|
else if (lang !== Lang.SYMBOL_FULL)
|
|
660
|
-
|
|
748
|
+
if (secondLang === Lang.NONE)
|
|
749
|
+
secondLang = lang, lang = Lang.SYMBOL_FULL;
|
|
750
|
+
else
|
|
751
|
+
return secondLang | lang | Lang.SYMBOL_FULL;
|
|
661
752
|
i += cp > 65535 ? 2 : 1;
|
|
662
753
|
continue;
|
|
663
754
|
}
|
|
664
|
-
|
|
755
|
+
if (secondLang !== Lang.NONE)
|
|
756
|
+
return secondLang | lang | Lang.OTHER;
|
|
757
|
+
lang !== Lang.NONE && (secondLang = lang), lang = Lang.OTHER;
|
|
665
758
|
}
|
|
666
|
-
return lang === Lang.NONE ? Lang.EN : lang;
|
|
759
|
+
return secondLang !== Lang.NONE ? secondLang | lang : lang === Lang.NONE ? Lang.EN : lang;
|
|
667
760
|
}
|
|
668
761
|
class MultilingualTokenizer {
|
|
669
762
|
wordIndex = new FirstCharWordIndex();
|
|
@@ -790,8 +883,8 @@ class MultilingualTokenizer {
|
|
|
790
883
|
result.push(token);
|
|
791
884
|
continue;
|
|
792
885
|
}
|
|
793
|
-
const
|
|
794
|
-
|
|
886
|
+
const isCJK2 = cjkRegex.test(token.txt) || token.meta?.lang?.startsWith("zh") || token.meta?.lang?.startsWith("ja") || token.meta?.lang?.startsWith("ko");
|
|
887
|
+
isCJK2 && tokenLength > cjkTokenLengthLimit ? token.txt = token.txt.slice(0, cjkTokenLengthLimit) : !isCJK2 && tokenLength > enTokenLengthLimit && (token.txt = token.txt.slice(0, enTokenLengthLimit)), result.push(token);
|
|
795
888
|
}
|
|
796
889
|
return result;
|
|
797
890
|
}
|
|
@@ -835,10 +928,7 @@ class MultilingualTokenizer {
|
|
|
835
928
|
}
|
|
836
929
|
}
|
|
837
930
|
function detectChar(cp) {
|
|
838
|
-
return cp <= 127 && (cp === 32 || cp === 9 || cp === 10 || cp === 13) ? Lang.WHITESPACE : cp >= 48 && cp <= 57 ? Lang.NUMERIC_HALF : cp >= 65296 && cp <= 65305 ? Lang.NUMERIC_FULL : cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543 ? Lang.NUMERIC_OTHER : cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991 ? Lang.NUMERIC_OTHER : cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185 ? Lang.NUMERIC_OTHER : cp <= 127 ? cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122 ? Lang.EN : Lang.SYMBOL_HALF : cp >= 19968 && cp <= 40959 || //
|
|
839
|
-
cp >= 13312 && cp <= 19903 || // 汉字扩展A
|
|
840
|
-
cp >= 12352 && cp <= 12543 || // 日文假名
|
|
841
|
-
cp >= 44032 && cp <= 55215 ? Lang.CJK : cp >= 126976 && cp <= 129791 || // 经典emoji
|
|
931
|
+
return cp <= 127 && (cp === 32 || cp === 9 || cp === 10 || cp === 13) ? Lang.WHITESPACE : cp >= 48 && cp <= 57 ? Lang.NUMERIC_HALF : cp >= 65296 && cp <= 65305 ? Lang.NUMERIC_FULL : cp >= 8544 && cp <= 8584 || cp >= 8528 && cp <= 8543 ? Lang.NUMERIC_OTHER : cp >= 9312 && cp <= 9371 || cp >= 12832 && cp <= 12895 || cp >= 12977 && cp <= 12991 ? Lang.NUMERIC_OTHER : cp >= 8304 && cp <= 8351 || cp >= 178 && cp <= 179 || cp === 185 ? Lang.NUMERIC_OTHER : cp <= 127 ? cp >= 65 && cp <= 90 || cp >= 97 && cp <= 122 ? Lang.EN : Lang.SYMBOL_HALF : cp >= 19968 && cp <= 40959 || cp >= 13312 && cp <= 19903 ? Lang.ZH : cp >= 12352 && cp <= 12543 ? Lang.JA : cp >= 44032 && cp <= 55215 ? Lang.KO : cp >= 126976 && cp <= 129791 || // 经典emoji
|
|
842
932
|
cp >= 9728 && cp <= 10175 || // Emoji修饰符
|
|
843
933
|
cp >= 127995 && cp <= 127999 || // Emoji组合标记 (零宽连接符)
|
|
844
934
|
cp >= 8205 && cp <= 8205 || // 变体选择器
|
|
@@ -849,9 +939,73 @@ function detectChar(cp) {
|
|
|
849
939
|
cp >= 8192 && cp <= 8303 || cp >= 8448 && cp <= 9983 ? Lang.SYMBOL_FULL : Lang.OTHER
|
|
850
940
|
);
|
|
851
941
|
}
|
|
942
|
+
class LangHelper {
|
|
943
|
+
static is(lang, n) {
|
|
944
|
+
return (lang & n) !== 0;
|
|
945
|
+
}
|
|
946
|
+
/**
|
|
947
|
+
* 判断是否为数字类型
|
|
948
|
+
* @param lang 语言类型
|
|
949
|
+
* @returns 是否为数字类型
|
|
950
|
+
*/
|
|
951
|
+
static isNumeric(lang) {
|
|
952
|
+
return (lang & Lang.NUMERIC) !== 0;
|
|
953
|
+
}
|
|
954
|
+
/**
|
|
955
|
+
* 判断是否为符号类型
|
|
956
|
+
* @param lang 语言类型
|
|
957
|
+
* @returns 是否为符号类型
|
|
958
|
+
*/
|
|
959
|
+
static isSymbol(lang) {
|
|
960
|
+
return (lang & Lang.SYMBOL) !== 0;
|
|
961
|
+
}
|
|
962
|
+
/**
|
|
963
|
+
* 判断是否为CJK(中、日、韩)
|
|
964
|
+
* @param lang 语言类型
|
|
965
|
+
* @returns 是否为CJK
|
|
966
|
+
*/
|
|
967
|
+
static isCJK(lang) {
|
|
968
|
+
return (lang & Lang.CJK) !== 0;
|
|
969
|
+
}
|
|
970
|
+
}
|
|
971
|
+
class DetectLangHelper {
|
|
972
|
+
/**
|
|
973
|
+
* 判断文本是否为数字类型
|
|
974
|
+
* @param str 输入文本
|
|
975
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
976
|
+
* @returns 是否为数字类型
|
|
977
|
+
*/
|
|
978
|
+
static isNumeric(str, fastCJK = !0) {
|
|
979
|
+
return LangHelper.isNumeric(detectLang(str, fastCJK));
|
|
980
|
+
}
|
|
981
|
+
/**
|
|
982
|
+
* 判断文本是否为符号类型
|
|
983
|
+
* @param str 输入文本
|
|
984
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
985
|
+
* @returns 是否为符号类型
|
|
986
|
+
*/
|
|
987
|
+
static isSymbol(str, fastCJK = !0) {
|
|
988
|
+
return LangHelper.isSymbol(detectLang(str, fastCJK));
|
|
989
|
+
}
|
|
990
|
+
/**
|
|
991
|
+
* 判断文本是否为CJK(中、日、韩)
|
|
992
|
+
* @param str 输入文本
|
|
993
|
+
* @param fastCJK 是否快速检测CJK语言
|
|
994
|
+
* @returns 是否为CJK
|
|
995
|
+
*/
|
|
996
|
+
static isCJK(str, fastCJK = !0) {
|
|
997
|
+
return LangHelper.isCJK(detectLang(str, fastCJK));
|
|
998
|
+
}
|
|
999
|
+
}
|
|
1000
|
+
const { isNumeric, isSymbol, isCJK } = DetectLangHelper;
|
|
852
1001
|
export {
|
|
1002
|
+
DetectLangHelper,
|
|
1003
|
+
LangHelper,
|
|
853
1004
|
MultilingualTokenizer,
|
|
854
1005
|
detectChar,
|
|
855
1006
|
detectLang,
|
|
1007
|
+
isCJK,
|
|
1008
|
+
isNumeric,
|
|
1009
|
+
isSymbol,
|
|
856
1010
|
tokenText
|
|
857
1011
|
};
|
package/lib/type.cjs
CHANGED
|
@@ -14,5 +14,5 @@ const SUPPORTED_LANGUAGES = [
|
|
|
14
14
|
urlQueryLengthLimit: 64,
|
|
15
15
|
lowercaseEnglish: !1
|
|
16
16
|
};
|
|
17
|
-
var Lang = /* @__PURE__ */ ((Lang2) => (Lang2[Lang2.NONE =
|
|
17
|
+
var Lang = /* @__PURE__ */ ((Lang2) => (Lang2[Lang2.NONE = 1] = "NONE", Lang2[Lang2.EN = 2] = "EN", Lang2[Lang2.RU = 4] = "RU", Lang2[Lang2.AR = 8] = "AR", Lang2[Lang2.HI = 16] = "HI", Lang2[Lang2.TH = 32] = "TH", Lang2[Lang2.HE = 64] = "HE", Lang2[Lang2.EL = 128] = "EL", Lang2[Lang2.WHITESPACE = 256] = "WHITESPACE", Lang2[Lang2.OTHER = 512] = "OTHER", Lang2[Lang2.CJK = 7168] = "CJK", Lang2[Lang2.ZH = 1024] = "ZH", Lang2[Lang2.JA = 2048] = "JA", Lang2[Lang2.KO = 4096] = "KO", Lang2[Lang2.NUMERIC = 57344] = "NUMERIC", Lang2[Lang2.NUMERIC_HALF = 8192] = "NUMERIC_HALF", Lang2[Lang2.NUMERIC_FULL = 16384] = "NUMERIC_FULL", Lang2[Lang2.NUMERIC_OTHER = 32768] = "NUMERIC_OTHER", Lang2[Lang2.SYMBOL = 458752] = "SYMBOL", Lang2[Lang2.SYMBOL_HALF = 65536] = "SYMBOL_HALF", Lang2[Lang2.SYMBOL_FULL = 131072] = "SYMBOL_FULL", Lang2[Lang2.EMOJI = 262144] = "EMOJI", Lang2))(Lang || {});
|
|
18
18
|
exports.DefaultTokenizerOption = DefaultTokenizerOption, exports.Lang = Lang, exports.SUPPORTED_LANGUAGES = SUPPORTED_LANGUAGES;
|
package/lib/type.d.ts
CHANGED
|
@@ -2,23 +2,28 @@ declare const SUPPORTED_LANGUAGES: readonly ["zh", "zh-CN", "zh-TW", "en", "ja",
|
|
|
2
2
|
type SupportedLanguage = typeof SUPPORTED_LANGUAGES[number];
|
|
3
3
|
|
|
4
4
|
declare const enum Lang {
|
|
5
|
-
NONE = 0
|
|
6
|
-
EN =
|
|
7
|
-
CJK = 2,
|
|
8
|
-
EMOJI = 3,
|
|
5
|
+
NONE = 1,// 无类型(使用明确的位掩码避免与0的歧义)
|
|
6
|
+
EN = 2,
|
|
9
7
|
RU = 4,
|
|
10
|
-
AR =
|
|
11
|
-
HI =
|
|
12
|
-
TH =
|
|
13
|
-
HE =
|
|
14
|
-
EL =
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
8
|
+
AR = 8,
|
|
9
|
+
HI = 16,
|
|
10
|
+
TH = 32,
|
|
11
|
+
HE = 64,
|
|
12
|
+
EL = 128,
|
|
13
|
+
WHITESPACE = 256,
|
|
14
|
+
OTHER = 512,
|
|
15
|
+
CJK = 7168,// 中日韩统一表意文字 (仅包含已定义的 ZH, JA, KO)
|
|
16
|
+
ZH = 1024,// 中文
|
|
17
|
+
JA = 2048,// 日文
|
|
18
|
+
KO = 4096,// 韩文
|
|
19
|
+
NUMERIC = 57344,// 数字父类别 (仅包含已定义的子类别)
|
|
20
|
+
NUMERIC_HALF = 8192,// 半角数字 (0-9)
|
|
21
|
+
NUMERIC_FULL = 16384,// 全角数字 (0-9)
|
|
22
|
+
NUMERIC_OTHER = 32768,// 其他数字(如罗马数字、分数等)
|
|
23
|
+
SYMBOL = 458752,// 符号父类别 (包含 EMOJI, SYMBOL_HALF, SYMBOL_FULL)
|
|
24
|
+
SYMBOL_HALF = 65536,// 半角符号
|
|
25
|
+
SYMBOL_FULL = 131072,// 全角符号
|
|
26
|
+
EMOJI = 262144
|
|
22
27
|
}
|
|
23
28
|
|
|
24
29
|
type TokenType = 'word' | 'name' | 'title' | 'hashtag' | 'mention' | 'host' | 'email' | 'url' | 'ip' | 'date' | 'number' | 'emoji' | 'punctuation' | 'space' | 'other';
|
package/lib/type.js
CHANGED
|
@@ -13,7 +13,7 @@ const SUPPORTED_LANGUAGES = [
|
|
|
13
13
|
urlQueryLengthLimit: 64,
|
|
14
14
|
lowercaseEnglish: !1
|
|
15
15
|
};
|
|
16
|
-
var Lang = /* @__PURE__ */ ((Lang2) => (Lang2[Lang2.NONE =
|
|
16
|
+
var Lang = /* @__PURE__ */ ((Lang2) => (Lang2[Lang2.NONE = 1] = "NONE", Lang2[Lang2.EN = 2] = "EN", Lang2[Lang2.RU = 4] = "RU", Lang2[Lang2.AR = 8] = "AR", Lang2[Lang2.HI = 16] = "HI", Lang2[Lang2.TH = 32] = "TH", Lang2[Lang2.HE = 64] = "HE", Lang2[Lang2.EL = 128] = "EL", Lang2[Lang2.WHITESPACE = 256] = "WHITESPACE", Lang2[Lang2.OTHER = 512] = "OTHER", Lang2[Lang2.CJK = 7168] = "CJK", Lang2[Lang2.ZH = 1024] = "ZH", Lang2[Lang2.JA = 2048] = "JA", Lang2[Lang2.KO = 4096] = "KO", Lang2[Lang2.NUMERIC = 57344] = "NUMERIC", Lang2[Lang2.NUMERIC_HALF = 8192] = "NUMERIC_HALF", Lang2[Lang2.NUMERIC_FULL = 16384] = "NUMERIC_FULL", Lang2[Lang2.NUMERIC_OTHER = 32768] = "NUMERIC_OTHER", Lang2[Lang2.SYMBOL = 458752] = "SYMBOL", Lang2[Lang2.SYMBOL_HALF = 65536] = "SYMBOL_HALF", Lang2[Lang2.SYMBOL_FULL = 131072] = "SYMBOL_FULL", Lang2[Lang2.EMOJI = 262144] = "EMOJI", Lang2))(Lang || {});
|
|
17
17
|
export {
|
|
18
18
|
DefaultTokenizerOption,
|
|
19
19
|
Lang,
|