gs-tokenizer 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.cn.md +262 -0
- package/README.ja.md +262 -0
- package/README.ko.md +262 -0
- package/README.md +262 -0
- package/lib/core.cjs +1 -0
- package/lib/core.d.ts +230 -0
- package/lib/core.js +1 -0
- package/lib/index.cjs +1 -0
- package/lib/index.d.ts +116 -0
- package/lib/index.js +1 -0
- package/lib/lexicon.cjs +1 -0
- package/lib/lexicon.d.ts +221 -0
- package/lib/lexicon.js +1 -0
- package/package.json +51 -0
package/lib/core.d.ts
ADDED
@@ -0,0 +1,230 @@
/**
 * Token interface for segmentation results
 * @interface Token
 */
interface Token {
    /** Text content of the token */
    txt: string;
    /** Token type: word, punctuation, space, other, emoji, date, URL, or IP address */
    type: 'word' | 'punctuation' | 'space' | 'other' | 'emoji' | 'date' | 'url' | 'ip';
    /** Language code of the token (optional) */
    lang?: string;
    /** Source of the token (optional), e.g. the name of a custom lexicon */
    src?: string;
}
/**
 * Custom lexicon entry interface
 * @interface LexiconEntry
 */
interface LexiconEntry {
    /** Lexicon priority; higher values take precedence */
    priority: number;
    /** Set of words in the lexicon */
    data: Set<string>;
    /** Lexicon name */
    name: string;
    /** Language code the lexicon applies to */
    lang: string;
}
/**
 * Tokenizer configuration options interface
 * @interface TokenizerOptions
 */
interface TokenizerOptions {
    /** Custom lexicons; keys are language codes, values are arrays of lexicon entries for that language */
    customDictionaries?: Record<string, LexiconEntry[]>;
    /** Segmentation granularity: word, grapheme, or sentence */
    granularity?: 'word' | 'grapheme' | 'sentence';
    /** Default language code, used when the language cannot be detected */
    defaultLanguage?: string;
}
/**
 * Text tokenization options interface
 * @interface TokenizeTextOptions
 */
interface TokenizeTextOptions {
    /** Optional; language code of the text */
    language?: string;
    /** Optional; token types to include */
    includeTypes?: Token['type'][];
    /** Optional; token types to exclude */
    excludeTypes?: Token['type'][];
}

/**
 * Language tokenizer interface
 * @interface LanguageTokenizer
 */
interface LanguageTokenizer {
    /**
     * Detect the language of the text
     * @param text - Text whose language should be detected
     * @returns The detected language code
     */
    detectLanguage(text: string): string;
    /**
     * Tokenize the text
     * @param text - Text to tokenize
     * @param language - Language code to use
     * @returns Array of resulting tokens
     */
    tokenize(text: string, language: string): Token[];
}

/**
 * English tokenizer class; implements LanguageTokenizer to handle English text
 * @class EnglishTokenizer
 * @implements {LanguageTokenizer}
 */
declare class EnglishTokenizer implements LanguageTokenizer {
    /** Custom lexicons; keys are language codes, values are arrays of lexicon entries for that language */
    private customDictionaries;
    /**
     * Constructor
     * @param customDictionaries - Custom lexicons; defaults to an empty object
     */
    constructor(customDictionaries?: Record<string, LexiconEntry[]>);
    /**
     * Detect whether the text is English
     * @param text - Text whose language should be detected
     * @returns 'en' if the text is English, otherwise an empty string
     */
    detectLanguage(text: string): string;
    /**
     * Tokenize English text
     * @param text - English text to tokenize
     * @param language - Language code (usually 'en')
     * @returns Array of resulting tokens
     */
    tokenize(text: string, language: string): Token[];
    private tagNameTokens;
}

/**
 * CJK tokenizer class; implements LanguageTokenizer to handle Chinese, Japanese, and Korean text
 * @class CJKTokenizer
 * @implements {LanguageTokenizer}
 */
declare class CJKTokenizer implements LanguageTokenizer {
    /** Map of segmenter instances, keyed by language code plus granularity */
    private segmenters;
    /** Custom lexicons; keys are language codes, values are arrays of lexicon entries for that language */
    private customDictionaries;
    /**
     * Constructor
     * @param customDictionaries - Custom lexicons; defaults to an empty object
     */
    constructor(customDictionaries?: Record<string, LexiconEntry[]>);
    /**
     * Detect whether the text is Chinese, Japanese, or Korean
     * @param text - Text whose language should be detected
     * @returns 'zh' for Chinese, 'ja' for Japanese, 'ko' for Korean, otherwise an empty string
     */
    detectLanguage(text: string): string;
    /**
     * Tokenize CJK text
     * @param text - CJK text to tokenize
     * @param language - Language code ('zh', 'ja', or 'ko')
     * @returns Array of resulting tokens
     */
    tokenize(text: string, language: string): Token[];
    private getSegmenter;
    private applyCustomDictionary;
}

/**
 * Date tokenizer class; implements LanguageTokenizer to recognize and tokenize date formats in text
 * @class DateTokenizer
 * @implements {LanguageTokenizer}
 */
declare class DateTokenizer implements LanguageTokenizer {
    /** Comprehensive date regular expression matching multiple date formats */
    private comprehensiveDatePattern;
    /**
     * Detect the language of the text
     * @param text - Text whose language should be detected
     * @returns The date tokenizer does not detect languages and always returns an empty string
     */
    detectLanguage(text: string): string;
    /**
     * Tokenize the text, recognizing and extracting date formats
     * @param text - Text to tokenize
     * @param language - Language code; defaults to 'zh' (Chinese)
     * @returns Array of resulting tokens; date tokens have type 'date'
     */
    tokenize(text: string, language?: string): Token[];
    private isValidDate;
    private isValidDateComponents;
}

/**
 * Language detection class for determining the language of a text
 * @class LanguageDetector
 */
declare class LanguageDetector {
    /**
     * Detect the language of the text
     * @param text - Text whose language should be detected
     * @returns Detected language code: 'ja' (Japanese), 'ko' (Korean), 'zh' (Chinese), or the default 'en' (English)
     */
    static detectLanguage(text: string): string;
}

/**
 * Multilingual tokenizer class supporting segmentation of Chinese, English, Japanese, Korean, and other languages
 * @class MultilingualTokenizer
 */
declare class MultilingualTokenizer {
    /** Array of language tokenizers */
    private tokenizers;
    /** Custom lexicons; keys are language codes, values are arrays of lexicon entries for that language */
    private customDictionaries;
    /** Default language code */
    private defaultLanguage;
    /**
     * Constructor
     * @param options - Tokenizer configuration options
     */
    constructor(options?: TokenizerOptions);
    /**
     * Add a custom lexicon
     * @param words - Words to add
     * @param language - Language code the lexicon applies to
     * @param priority - Lexicon priority; higher values take precedence
     * @param name - Lexicon name, used to identify and manage the lexicon
     */
    addCustomDictionary(words: string[], language: string, priority: number, name: string): void;
    /**
     * Remove a word from the custom lexicons
     * @param word - Word to remove
     * @param language - Optional; restrict the operation to the lexicons of this language
     * @param lexiconName - Optional; restrict the operation to the lexicon with this name
     */
    removeCustomWord(word: string, language?: string, lexiconName?: string): void;
    /**
     * Main tokenization method; performs multilingual segmentation of the input text
     * @param text - Text to tokenize
     * @param language - Optional; language code of the text, auto-detected when omitted
     * @returns Array of resulting tokens
     */
    tokenize(text: string, language?: string): Token[];
    /**
     * Get plain-text tokenization results, with configurable included or excluded token types
     * @param text - Text to tokenize
     * @param options - Optional configuration
     * @param options.language - Optional; language code of the text
     * @param options.includeTypes - Optional; token types to include
     * @param options.excludeTypes - Optional; token types to exclude
     * @returns Array of strings
     */
    tokenizeText(text: string, options?: TokenizeTextOptions): string[];
}
/**
 * Factory function that creates a MultilingualTokenizer instance
 * @param options - Tokenizer configuration options
 * @returns A MultilingualTokenizer instance
 */
declare function createTokenizer(options?: TokenizerOptions): MultilingualTokenizer;

export { CJKTokenizer, DateTokenizer, EnglishTokenizer, LanguageDetector, MultilingualTokenizer, createTokenizer };
export type { LanguageTokenizer, LexiconEntry, Token, TokenizeTextOptions, TokenizerOptions };
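Taken together, these declarations are the core API surface of the package. Below is a minimal usage sketch written only against the types above; the relative import path, sample text, and lexicon contents are illustrative assumptions, not code shipped with the package.

import { createTokenizer } from './core';

// Create a tokenizer; defaultLanguage is used when the language cannot be detected.
const tokenizer = createTokenizer({ defaultLanguage: 'zh' });

// Hypothetical custom lexicon entry so the phrase below can be kept as a single token.
tokenizer.addCustomDictionary(['机器学习'], 'zh', 100, 'demo_terms');

// Full tokens carry txt, type and lang; src optionally names the source, e.g. a custom lexicon.
const tokens = tokenizer.tokenize('机器学习发布于2024-01-01');

// Plain-text results, dropping token types that are not wanted.
const words = tokenizer.tokenizeText('机器学习 is fun.', { excludeTypes: ['punctuation', 'space'] });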
package/lib/core.js
ADDED
@@ -0,0 +1 @@
class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const s=[];let n=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const s=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,s)),index:o.index,end:Math.max(o.end,s)}:(r.push({text:o.text,index:o.index}),o={...e,end:s}):o={...e,end:s}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>n){const r=t.slice(n,i.index);s.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?s.push({txt:i.text,type:"date",lang:e,src:""}):s.push({txt:i.text,type:"other",lang:e,src:""}),n=i.index+i.text.length}if(n<t.length){const i=t.slice(n);s.push({txt:i,type:"other",lang:e,src:""})}return s}isValidDate(t){let 
e,s,n;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),s=parseInt(t.slice(4,6)),n=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],s=i[1],n=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],s=i[1],n=i[2]):i[2]>31?(s=i[0],n=i[1],e=i[2]):(s=i[0],n=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;s=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,n=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),s=parseInt(i[2]),n=parseInt(i[3])}}return this.isValidDateComponents(e,s,n)}isValidDateComponents(t,e,s){if(e<1||e>12||s<1||s>31)return!1;if(2===e){if(s>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(s>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class n{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const s=[];let n=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,r=/(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"url",text:a[0]});o.sort((t,e)=>t.index-e.index);const c=[];for(let t=0;t<o.length;t++){let e=!1;for(let s=0;s<o.length;s++)if(t!==s){const n=o[t].index===o[s].index&&o[t].endIndex===o[s].endIndex,i=o[t].index>=o[s].index&&o[t].endIndex<=o[s].endIndex;if(n){if("ip"===o[s].type){e=!0;break}}else if(i){e=!0;break}}e||c.push(o[t])}for(const i of c)i.index>n&&s.push({txt:t.substring(n,i.index),type:"other",lang:e}),s.push({txt:i.text,type:i.type,lang:"en"}),n=i.endIndex;return n<t.length&&s.push({txt:t.substring(n),type:"other",lang:e}),s}}class r{tokenizers;customDictionaries;defaultLanguage;constructor(n={}){this.customDictionaries=n.customDictionaries||{},this.defaultLanguage=n.defaultLanguage||"en",this.tokenizers=[new s,new i,new t,new e(this.customDictionaries)]}addCustomDictionary(t,e,s,n){const i=e||this.defaultLanguage;this.customDictionaries[i]||(this.customDictionaries[i]=[]);const r=this.customDictionaries[i].findIndex(t=>t.name===n&&t.lang===i&&t.priority===s);if(r>=0){const e=this.customDictionaries[i][r];t.forEach(t=>e.data.add(t))}else this.customDictionaries[i].push({priority:s,data:new Set(t),name:n,lang:i})}removeCustomWord(t,e,s){if(e){if(this.customDictionaries[e])if(s){const n=this.customDictionaries[e].find(t=>t.name===s);n&&n.data.delete(t)}else this.customDictionaries[e].forEach(e=>{e.data.delete(t)})}else 
Object.values(this.customDictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(r,o){const a=o||n.detectLanguage(r),c=this.tokenizers.find(t=>t instanceof s);if(!c)return[];const u=c.tokenize(r,a),d=[],l=this.tokenizers.find(t=>t instanceof i);if(!l)return d;for(const s of u)if("date"===s.type)d.push(s);else{const n=l.tokenize(s.txt,a);for(const s of n)if("url"===s.type||"ip"===s.type)d.push(s);else{let n=[];if("en"===a){const e=this.tokenizers.find(e=>e instanceof t);e&&(n=e.tokenize(s.txt,a))}else if(["zh","ja","ko"].includes(a)){const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}n.length>0?d.push(...n):d.push(s)}}return d}tokenizeText(t,e){const s=this.tokenize(t,e?.language),n=["punctuation","space","other",...e?.excludeTypes||[]];return s.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type))&&!n.includes(t.type)).map(t=>t.txt)}}function o(t){return new r(t)}export{e as CJKTokenizer,s as DateTokenizer,t as EnglishTokenizer,n as LanguageDetector,r as MultilingualTokenizer,o as createTokenizer};
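For CJK segmentation, the minified implementation above delegates to the built-in Intl.Segmenter (see the new Intl.Segmenter(t,{granularity:e}) call) and caches one segmenter per language/granularity pair. A stand-alone sketch of that underlying standard API, shown here outside the package:

// Word-granularity segmenter for Chinese; the package keeps instances like this in a Map.
const segmenter = new Intl.Segmenter('zh', { granularity: 'word' });
for (const { segment, isWordLike } of segmenter.segment('今天天气不错')) {
  // isWordLike is true for word-like segments and false for spaces/punctuation.
  console.log(segment, isWordLike);
}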
package/lib/index.cjs
ADDED
@@ -0,0 +1 @@
"use strict";var e=require("./core"),t=require("./lexicon");function n(e){var t=Object.create(null);return e&&Object.keys(e).forEach(function(n){if("default"!==n){var s=Object.getOwnPropertyDescriptor(e,n);Object.defineProperty(t,n,s.get?s:{enumerable:!0,get:function(){return e[n]}})}}),t.default=e,Object.freeze(t)}var s=n(t);class a{static instances=new Map;lexicons=[];constructor(e){this.loadLexicons(e)}static getInstance(e){const t=`${JSON.stringify(e.types)}-${JSON.stringify(e.languages)}`;return a.instances.has(t)||a.instances.set(t,new a(e)),a.instances.get(t)}loadLexicons(e){const{languages:t,types:n}=e;this.lexicons=[];const a={lastName:100,firstName:100,famousName:100,famousWorks:100,country:80,computerTerm:75,city:70,networkTerms:70,medicines:70,transportation:70,luxury:65,pronouns:65,address:60,foods:60,appliances:60,honorific:50,nickname:50,title:50,kinship:50,organization:50,furniture:55,pets:55};t.forEach(e=>{const t=e.replace("-","_");n.forEach(n=>{const r=`${t}_${n.charAt(0).toUpperCase()+n.slice(1)}`,o=s[r]||"",i=this.parseLexiconString(o);this.lexicons.push({priority:a[n]||50,data:i,name:`${t}_${n}`,lang:e})})})}parseLexiconString(e){const t=e.split("").map(e=>e.trim()).filter(e=>e.length>0);return new Set(t)}checkWord(e){for(const t of this.lexicons)if(t.data.has(e))return{found:!0,lexiconName:t.name};return{found:!1,lexiconName:""}}addCustomLexicon(e){this.lexicons.push(e),this.lexicons.sort((e,t)=>t.priority-e.priority)}getLexicons(){return this.lexicons}static clearAllInstances(){a.instances.clear()}}class r{static instance=null;static defaultLanguages=["zh-CN","zh-TW","en-US","ja-JP","ko-KR"];static defaultTypes=["lastName","firstName","famousName","famousWorks","honorific","nickname","title","kinship","organization","country","city","address","computerTerm","networkTerms","pronouns","foods","medicines","luxury","transportation","appliances","furniture","pets"];static getInstance(){if(!r.instance){const t={defaultLanguage:"zh"};r.instance=e.createTokenizer(t)}return r.instance}static setDefaultLanguages(e){r.defaultLanguages=e,r.instance=null}static setDefaultTypes(e){r.defaultTypes=e,r.instance=null}static tokenize(e,t){return r.getInstance().tokenize(e,t)}static tokenizeText(e,t){return r.getInstance().tokenizeText(e,{language:t})}static addCustomDictionary(e,t,n,s){r.getInstance().addCustomDictionary(e,t,n,s)}static removeCustomWord(e,t,n){r.getInstance().removeCustomWord(e,t,n)}static getLexiconLoader(){const e={languages:r.defaultLanguages,types:r.defaultTypes};return a.getInstance(e)}}Object.defineProperty(exports,"MultilingualTokenizer",{enumerable:!0,get:function(){return e.MultilingualTokenizer}}),Object.defineProperty(exports,"createTokenizer",{enumerable:!0,get:function(){return e.createTokenizer}}),exports.LexiconLoader=a,exports.QuickUseTokenizer=r,exports.addCustomDictionary=(e,t,n,s)=>r.addCustomDictionary(e,t,n,s),exports.removeCustomWord=(e,t,n)=>r.removeCustomWord(e,t,n),exports.setDefaultLanguages=e=>r.setDefaultLanguages(e),exports.setDefaultTypes=e=>r.setDefaultTypes(e),exports.tokenize=(e,t)=>r.tokenize(e,t),exports.tokenizeText=(e,t)=>r.getInstance().tokenizeText(e,{language:t}),exports.tokenizeToText=(e,t)=>r.tokenizeText(e,t),Object.keys(t).forEach(function(e){"default"!==e&&!Object.prototype.hasOwnProperty.call(exports,e)&&Object.defineProperty(exports,e,{enumerable:!0,get:function(){return t[e]}})});
package/lib/index.d.ts
ADDED
@@ -0,0 +1,116 @@
import { LexiconEntry, MultilingualTokenizer, Token } from './core';
export { MultilingualTokenizer, Token, TokenizerOptions, createTokenizer } from './core';
export * from './lexicon';

interface LexiconConfig {
    types: string[];
    languages: string[];
}
declare class LexiconLoader {
    private static instances;
    private lexicons;
    private constructor();
    /**
     * Get a lexicon loader instance
     */
    static getInstance(config: LexiconConfig): LexiconLoader;
    /**
     * Load the lexicons for the given languages and types
     */
    private loadLexicons;
    /**
     * Parse a lexicon string into a Set
     */
    private parseLexiconString;
    /**
     * Check whether a word is in the lexicons and return which lexicon it came from
     */
    checkWord(word: string): {
        found: boolean;
        lexiconName: string;
    };
    /**
     * Add a custom lexicon
     */
    addCustomLexicon(lexicon: LexiconEntry): void;
    /**
     * Get all lexicon entries
     */
    getLexicons(): LexiconEntry[];
    /**
     * Clear all cached instances
     */
    static clearAllInstances(): void;
}

/**
 * Quick-use multilingual tokenizer class providing a static instance and convenience methods
 * @class QuickUseTokenizer
 */
declare class QuickUseTokenizer {
    /** Static tokenizer instance */
    private static instance;
    /** Languages loaded by default */
    private static defaultLanguages;
    /** Lexicon types loaded by default */
    private static defaultTypes;
    /**
     * Get the tokenizer instance (singleton pattern)
     * @returns A MultilingualTokenizer instance
     */
    static getInstance(): MultilingualTokenizer;
    /**
     * Set the languages loaded by default
     * @param languages - Language codes to load
     */
    static setDefaultLanguages(languages: string[]): void;
    /**
     * Set the lexicon types loaded by default
     * @param types - Lexicon types to load
     */
    static setDefaultTypes(types: string[]): void;
    /**
     * Tokenize text
     * @param text - Text to tokenize
     * @param language - Optional; language code of the text
     * @returns Array of resulting tokens
     */
    static tokenize(text: string, language?: string): Token[];
    /**
     * Get plain-text tokenization results
     * @param text - Text to tokenize
     * @param language - Optional; language code of the text
     * @returns Array of words
     */
    static tokenizeText(text: string, language?: string): string[];
    /**
     * Add a custom lexicon
     * @param words - Words to add
     * @param language - Language code the lexicon applies to
     * @param priority - Lexicon priority
     * @param name - Lexicon name
     */
    static addCustomDictionary(words: string[], language: string, priority: number, name: string): void;
    /**
     * Remove a word from the custom lexicons
     * @param word - Word to remove
     * @param language - Optional; restrict the operation to the lexicons of this language
     * @param lexiconName - Optional; restrict the operation to the lexicon with this name
     */
    static removeCustomWord(word: string, language?: string, lexiconName?: string): void;
    /**
     * Get the lexicon loader instance
     * @returns A LexiconLoader instance
     */
    static getLexiconLoader(): LexiconLoader;
}
declare const tokenize: (text: string, language?: string) => Token[];
declare const tokenizeToText: (text: string, language?: string) => string[];
declare const tokenizeText: (text: string, language?: string) => string[];
declare const addCustomDictionary: (words: string[], language: string, priority: number, name: string) => void;
declare const removeCustomWord: (word: string, language?: string, lexiconName?: string) => void;
declare const setDefaultLanguages: (languages: string[]) => void;
declare const setDefaultTypes: (types: string[]) => void;

export { LexiconLoader, QuickUseTokenizer, addCustomDictionary, removeCustomWord, setDefaultLanguages, setDefaultTypes, tokenize, tokenizeText, tokenizeToText };
export type { LexiconConfig };
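The module-level constants at the end mirror the QuickUseTokenizer statics and share its lazily created singleton tokenizer. A quick-use sketch against the declarations above; the relative import path and sample words are illustrative assumptions, not code from the package:

import { addCustomDictionary, removeCustomWord, tokenize, tokenizeText } from './index';

// Register a hypothetical lexicon entry, then tokenize with auto-detected language.
addCustomDictionary(['深度学习'], 'zh', 90, 'demo_terms');
const tokens = tokenize('深度学习很有趣');           // Token[]
const words = tokenizeText('深度学习很有趣', 'zh');  // string[] of word texts only
removeCustomWord('深度学习', 'zh', 'demo_terms');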
package/lib/index.js
ADDED
@@ -0,0 +1 @@
import{createTokenizer as e}from"./core";import{MultilingualTokenizer as t,createTokenizer as n}from"./core";import*as s from"./lexicon";export*from"./lexicon";class a{static instances=new Map;lexicons=[];constructor(e){this.loadLexicons(e)}static getInstance(e){const t=`${JSON.stringify(e.types)}-${JSON.stringify(e.languages)}`;return a.instances.has(t)||a.instances.set(t,new a(e)),a.instances.get(t)}loadLexicons(e){const{languages:t,types:n}=e;this.lexicons=[];const a={lastName:100,firstName:100,famousName:100,famousWorks:100,country:80,computerTerm:75,city:70,networkTerms:70,medicines:70,transportation:70,luxury:65,pronouns:65,address:60,foods:60,appliances:60,honorific:50,nickname:50,title:50,kinship:50,organization:50,furniture:55,pets:55};t.forEach(e=>{const t=e.replace("-","_");n.forEach(n=>{const i=`${t}_${n.charAt(0).toUpperCase()+n.slice(1)}`,o=s[i]||"",r=this.parseLexiconString(o);this.lexicons.push({priority:a[n]||50,data:r,name:`${t}_${n}`,lang:e})})})}parseLexiconString(e){const t=e.split("").map(e=>e.trim()).filter(e=>e.length>0);return new Set(t)}checkWord(e){for(const t of this.lexicons)if(t.data.has(e))return{found:!0,lexiconName:t.name};return{found:!1,lexiconName:""}}addCustomLexicon(e){this.lexicons.push(e),this.lexicons.sort((e,t)=>t.priority-e.priority)}getLexicons(){return this.lexicons}static clearAllInstances(){a.instances.clear()}}class i{static instance=null;static defaultLanguages=["zh-CN","zh-TW","en-US","ja-JP","ko-KR"];static defaultTypes=["lastName","firstName","famousName","famousWorks","honorific","nickname","title","kinship","organization","country","city","address","computerTerm","networkTerms","pronouns","foods","medicines","luxury","transportation","appliances","furniture","pets"];static getInstance(){if(!i.instance){const t={defaultLanguage:"zh"};i.instance=e(t)}return i.instance}static setDefaultLanguages(e){i.defaultLanguages=e,i.instance=null}static setDefaultTypes(e){i.defaultTypes=e,i.instance=null}static tokenize(e,t){return i.getInstance().tokenize(e,t)}static tokenizeText(e,t){return i.getInstance().tokenizeText(e,{language:t})}static addCustomDictionary(e,t,n,s){i.getInstance().addCustomDictionary(e,t,n,s)}static removeCustomWord(e,t,n){i.getInstance().removeCustomWord(e,t,n)}static getLexiconLoader(){const e={languages:i.defaultLanguages,types:i.defaultTypes};return a.getInstance(e)}}const o=(e,t)=>i.tokenize(e,t),r=(e,t)=>i.tokenizeText(e,t),c=(e,t)=>i.getInstance().tokenizeText(e,{language:t}),u=(e,t,n,s)=>i.addCustomDictionary(e,t,n,s),l=(e,t,n)=>i.removeCustomWord(e,t,n),m=e=>i.setDefaultLanguages(e),g=e=>i.setDefaultTypes(e);export{a as LexiconLoader,t as MultilingualTokenizer,i as QuickUseTokenizer,u as addCustomDictionary,n as createTokenizer,l as removeCustomWord,m as setDefaultLanguages,g as setDefaultTypes,o as tokenize,c as tokenizeText,r as tokenizeToText};