gs-tokenizer 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.cn.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # gs-tokenizer - 多语言分词器
2
2
 
3
- 一个功能强大且轻量级的多语言分词器库,为英语、中文、日语和韩语等多种语言提供自然语言处理能力。
3
+ 这是一个纯前端的极小体积的简单分词器,受限于体积有各种不完善,好处是能够以极少体积在浏览器运行。
4
4
 
5
5
  ## 文档
6
6
 
package/README.md CHANGED
@@ -1,6 +1,6 @@
1
1
  # gs-tokenizer
2
2
 
3
- A powerful and lightweight multilingual tokenizer library that provides natural language processing capabilities for multiple languages including English, Chinese, Japanese, and Korean.
3
+ A simple and extremely lightweight frontend tokenizer with some limitations due to its size, but capable of running in browsers with minimal footprint.
4
4
 
5
5
  ## Documentation
6
6
 
package/lib/core.cjs CHANGED
@@ -1 +1 @@
1
- "use strict";class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let 
o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const s=[];let n=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const s=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,s)),index:o.index,end:Math.max(o.end,s)}:(r.push({text:o.text,index:o.index}),o={...e,end:s}):o={...e,end:s}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>n){const r=t.slice(n,i.index);s.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?s.push({txt:i.text,type:"date",lang:e,src:""}):s.push({txt:i.text,type:"other",lang:e,src:""}),n=i.index+i.text.length}if(n<t.length){const i=t.slice(n);s.push({txt:i,type:"other",lang:e,src:""})}return s}isValidDate(t){let e,s,n;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),s=parseInt(t.slice(4,6)),n=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],s=i[1],n=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],s=i[1],n=i[2]):i[2]>31?(s=i[0],n=i[1],e=i[2]):(s=i[0],n=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else 
if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;s=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,n=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),s=parseInt(i[2]),n=parseInt(i[3])}}return this.isValidDateComponents(e,s,n)}isValidDateComponents(t,e,s){if(e<1||e>12||s<1||s>31)return!1;if(2===e){if(s>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(s>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class n{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const s=[];let n=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,r=/(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"url",text:a[0]});o.sort((t,e)=>t.index-e.index);const c=[];for(let t=0;t<o.length;t++){let e=!1;for(let s=0;s<o.length;s++)if(t!==s){const n=o[t].index===o[s].index&&o[t].endIndex===o[s].endIndex,i=o[t].index>=o[s].index&&o[t].endIndex<=o[s].endIndex;if(n){if("ip"===o[s].type){e=!0;break}}else 
if(i){e=!0;break}}e||c.push(o[t])}for(const i of c)i.index>n&&s.push({txt:t.substring(n,i.index),type:"other",lang:e}),s.push({txt:i.text,type:i.type,lang:"en"}),n=i.endIndex;return n<t.length&&s.push({txt:t.substring(n),type:"other",lang:e}),s}}class r{tokenizers;customDictionaries;defaultLanguage;constructor(n={}){this.customDictionaries=n.customDictionaries||{},this.defaultLanguage=n.defaultLanguage||"en",this.tokenizers=[new s,new i,new t,new e(this.customDictionaries)]}addCustomDictionary(t,e,s,n){const i=e||this.defaultLanguage;this.customDictionaries[i]||(this.customDictionaries[i]=[]);const r=this.customDictionaries[i].findIndex(t=>t.name===n&&t.lang===i&&t.priority===s);if(r>=0){const e=this.customDictionaries[i][r];t.forEach(t=>e.data.add(t))}else this.customDictionaries[i].push({priority:s,data:new Set(t),name:n,lang:i})}removeCustomWord(t,e,s){if(e){if(this.customDictionaries[e])if(s){const n=this.customDictionaries[e].find(t=>t.name===s);n&&n.data.delete(t)}else this.customDictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.customDictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(r,o){const a=o||n.detectLanguage(r),c=this.tokenizers.find(t=>t instanceof s);if(!c)return[];const u=c.tokenize(r,a),d=[],l=this.tokenizers.find(t=>t instanceof i);if(!l)return d;for(const s of u)if("date"===s.type)d.push(s);else{const n=l.tokenize(s.txt,a);for(const s of n)if("url"===s.type||"ip"===s.type)d.push(s);else{let n=[];if("en"===a){const e=this.tokenizers.find(e=>e instanceof t);e&&(n=e.tokenize(s.txt,a))}else if(["zh","ja","ko"].includes(a)){const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}n.length>0?d.push(...n):d.push(s)}}return d}tokenizeText(t,e){const s=this.tokenize(t,e?.language),n=["punctuation","space","other",...e?.excludeTypes||[]];return 
s.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type))&&!n.includes(t.type)).map(t=>t.txt)}}exports.CJKTokenizer=e,exports.DateTokenizer=s,exports.EnglishTokenizer=t,exports.LanguageDetector=n,exports.MultilingualTokenizer=r,exports.createTokenizer=function(t){return new r(t)};
1
+ "use strict";class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const n=[],s=t.split(/\b/);for(const t of s)t&&(t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?n.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(n,e)}tagNameTokens(t,e){const n=[];let s=0;for(;s<t.length;){if(s<t.length&&"word"===t[s].type){const i=t[s].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){n.push({txt:i,type:"word",lang:e,src:r.name}),s++,t=!0;break}if(t)continue}}n.push({txt:t[s].txt,type:t[s].type,lang:e,src:t[s].src||""}),s++}return n}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const n=[],s=this.getSegmenter(e);for(const i of s.segment(t)){const{segment:t,isWordLike:s}=i;t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?n.push({txt:t,type:"punctuation",lang:e,src:""}):s?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(n,e)}getSegmenter(t,e="word"){const n=`${t}-${e}`;return this.segmenters.has(n)||this.segmenters.set(n,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(n)}applyCustomDictionary(t,e){const n=this.customDictionaries[e]||[];let s=t;if(n.length>0){const t=[];let i=0;for(;i<s.length;){let r=null,o=-1;for(let 
t=Math.min(5,s.length-i);t>=1;t--){if(t>1&&s.slice(i,i+t).some(t=>"word"!==t.type))continue;const e=s.slice(i,i+t).map(t=>t.txt).join("");for(const s of n)s.data.has(e)&&(!r||t>r.length||t===r.length&&s.priority>o)&&(r={length:t,text:e},o=s.priority)}r?(t.push({txt:r.text,type:"word",lang:e,src:""}),i+=r.length):(t.push({...s[i],src:""}),i++)}s=t}return s}}class n{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const n=[];let s=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const n=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,n)),index:o.index,end:Math.max(o.end,n)}:(r.push({text:o.text,index:o.index}),o={...e,end:n}):o={...e,end:n}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>s){const r=t.slice(s,i.index);n.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?n.push({txt:i.text,type:"date",lang:e,src:""}):n.push({txt:i.text,type:"other",lang:e,src:""}),s=i.index+i.text.length}if(s<t.length){const i=t.slice(s);n.push({txt:i,type:"other",lang:e,src:""})}return n}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t)||/^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(t))return!0;let e,n,s;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),n=parseInt(t.slice(4,6)),s=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const 
i=t.split(/[-.]/).map(Number);e=i[0],n=i[1],s=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],n=i[1],s=i[2]):i[2]>31?(n=i[0],s=i[1],e=i[2]):(n=i[0],s=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;n=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,s=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),n=parseInt(i[2]),s=parseInt(i[3])}}return this.isValidDateComponents(e,n,s)}isValidChineseNumberTime(t){const e=/(小时|分钟|秒|毫秒|天|周|月|年)$/,n=t.replace(e,""),s=t.match(e)?.[1]||"";return!(!n||!s)&&/^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(n)}isValidDateComponents(t,e,n){if(e<1||e>12||n<1||n>31)return!1;if(2===e){if(n>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(n>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class s{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const 
i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,r=/(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"url",text:a[0]});o.sort((t,e)=>t.index-e.index);const u=[];for(let t=0;t<o.length;t++){let e=!1;for(let n=0;n<o.length;n++)if(t!==n){const s=o[t].index===o[n].index&&o[t].endIndex===o[n].endIndex,i=o[t].index>=o[n].index&&o[t].endIndex<=o[n].endIndex;if(s){if("ip"===o[n].type){e=!0;break}}else if(i){e=!0;break}}e||u.push(o[t])}for(const i of u)i.index>s&&n.push({txt:t.substring(s,i.index),type:"other",lang:e}),n.push({txt:i.text,type:i.type,lang:"en"}),s=i.endIndex;return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e}),n}}class r{CHINESE_NUMBERS=new Set(["零","一","二","三","四","五","六","七","八","九","十","百","千","万","亿","壹","贰","叁","肆","伍","陆","柒","捌","玖","拾","佰","仟"]);CHINESE_SURNAMES=new Set(["张","李","王","刘","陈","杨","赵","黄","周","吴","徐","孙","胡","朱","高","林","何","郭","马","罗","梁","宋","郑","谢","韩","唐","冯","于","董","萧","程","曹","袁","邓","许","傅","沈","曾","彭","吕","苏","卢","蒋","蔡","贾","丁","魏","薛","叶","阎","余","潘","杜","戴","夏","钟","汪","田","任","姜","范","方","石","姚","谭","廖","邹","熊","金","陆","郝","孔","白","崔","康","毛","邱","秦","江","史","顾","侯","邵","孟","龙","万","段","雷","钱","汤","尹","黎","易","常","武","乔","贺","赖","龚","文"]);UNITS=new 
Set(["公斤","英里","克","千克","吨","米","厘米","毫米","公里","斤","两","元","角","分","小时","分钟","秒","折","折扣","卷","券","美元","人民币","元","角","分","亩","公顷","平方米","平方分米","平方厘米","立方厘米","升","毫升","天","周","月","年","岁","度","瓦","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛","牛顿","帕斯卡","巴","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","g","mg","t","km","m","cm","mm","μm","nm","L","mL","l","ml","h","min","s","d","w","y","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","B","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"]);ORDINAL_PREFIXES=new Set(["第","No.","No","no.","no"]);detectLanguage(t){return""}tokenize(t,e){if(!t)return[];const n=[];let s=0;const i=t.length;for(;s<i;){let e=!1,r=this.findOrdinalPrefix(t,s);if(r){const{prefix:o,prefixEnd:a}=r;let u=a;for(;u<i&&""===t[u].trim();)u++;const c=this.findNumber(t,u);if(c){let r=c.end;for(;r<i&&""===t[r].trim();)r++;const o=this.findUnit(t,r);o&&(r=o.end),n.push({start:s,end:r,txt:t.substring(s,r)}),s=r,e=!0}}if(!e){const i=this.findNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}e||s++}if(0===n.length)return[{txt:t,type:"other",lang:e,src:""}];const r=[];let o=0;const a=[];for(const t of n){let e=!1;for(const[n,s]of a.entries())if(t.start<=s.end&&t.end>=s.start){t.end-t.start>s.end-s.start&&(a[n]=t),e=!0;break}e||a.push(t)}a.sort((t,e)=>t.start-e.start);for(const n of a)n.start>o&&r.push({txt:t.substring(o,n.start),type:"other",lang:e,src:""}),r.push({txt:n.txt,type:"number",lang:e,src:""}),o=n.end;return o<i&&r.push({txt:t.substring(o),type:"other",lang:e,src:""}),r}findOrdinalPrefix(t,e){for(const n of 
this.ORDINAL_PREFIXES)if(t.startsWith(n,e))return{prefix:n,prefixEnd:e+n.length};return null}findNumber(t,e){const n=[/^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,/^[+-]?\d+\.\d+/,/^[+-]?\.\d+/,/^[+-]?\d+/];for(const s of n){const n=s.exec(t.substring(e));if(n)return{end:e+n[0].length}}return null}findChineseNumber(t,e){let n=e;for(;n<t.length&&this.CHINESE_NUMBERS.has(t[n]);)n++;const s=n-e;if(s>1)return{end:n};if(1===s){t[e];const s=e>0?t[e-1]:"",i=e+1<t.length?t[e+1]:"",r=this.findUnit(t,e+1);if(s&&this.isNumberRelatedChar(s)||r)return{end:n};if(i&&this.CHINESE_NUMBERS.has(i))return{end:n}}return null}isNumberRelatedChar(t){return this.CHINESE_NUMBERS.has(t)||this.UNITS.has(t)||t>="0"&&t<="9"||"."===t||"e"===t||"E"===t||"+"===t||"-"===t}findUnit(t,e){let n=0;for(let s=e;s<Math.min(e+5,t.length);s++){const i=t.substring(e,s+1);if(this.UNITS.has(i)){const i=t[s+1];(!i||!/[a-zA-Z0-9]/.test(i))&&(n=s+1-e)}}return n>0?{end:e+n}:null}findNumberWithUnit(t,e){const n=this.findNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}findChineseNumberWithUnit(t,e){const n=this.findChineseNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}isNumberChar(t,e,n){if(t>="0"&&t<="9")return!0;if("."===t){const t=e>0?n[e-1]:"",s=e<n.length-1?n[e+1]:"";return t>="0"&&t<="9"||s>="0"&&s<="9"}if(("e"===t||"E"===t)&&e>0){const t=n[e-1];return t>="0"&&t<="9"||"."===t}if(("+"===t||"-"===t)&&e>0){const t=n[e-1];return"e"===t||"E"===t}if(this.CHINESE_NUMBERS.has(t))return!0;for(const s of this.ORDINAL_PREFIXES)if(s.includes(t)&&n.startsWith(s,e))return!0;for(const t of this.UNITS)if(n.startsWith(t,e))return!0;if(("+"===t||"-"===t)&&e>1){const t=n[e-1],s=n[e-2];return("e"===t||"E"===t)&&(s>="0"&&s<="9"||"."===s)}return!1}}class 
o{tokenizers;customDictionaries;defaultLanguage;constructor(s={}){this.customDictionaries=s.customDictionaries||{},this.defaultLanguage=s.defaultLanguage||"en",this.tokenizers=[new n,new i,new r,new t,new e(this.customDictionaries)]}addCustomDictionary(t,n,s,i){const r=n||this.defaultLanguage;this.customDictionaries[r]||(this.customDictionaries[r]=[]);const o=this.customDictionaries[r].findIndex(t=>t.name===i&&t.lang===r&&t.priority===s);if(o>=0){const e=this.customDictionaries[r][o];t.forEach(t=>e.data.add(t))}else this.customDictionaries[r].push({priority:s,data:new Set(t),name:i,lang:r});const a=this.tokenizers.find(t=>t instanceof e);a&&(a.customDictionaries=this.customDictionaries)}removeCustomWord(t,e,n){if(e){if(this.customDictionaries[e])if(n){const s=this.customDictionaries[e].find(t=>t.name===n);s&&s.data.delete(t)}else this.customDictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.customDictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(o,a){const u=a||s.detectLanguage(o),c=this.tokenizers.find(t=>t instanceof n);if(!c)return[];const d=c.tokenize(o,u),h=[],l=this.tokenizers.find(t=>t instanceof i);if(!l)return h;for(const n of d)if("date"===n.type)h.push(n);else{const s=l.tokenize(n.txt,u),i=this.tokenizers.find(t=>t instanceof r);if(!i)return h;for(const n of s)if("url"===n.type||"ip"===n.type)h.push(n);else{const s=i.tokenize(n.txt,u);for(const n of s)if("number"===n.type)h.push(n);else{let s=[];if("en"===u){const e=this.tokenizers.find(e=>e instanceof t);e&&(s=e.tokenize(n.txt,u))}else if(["zh","ja","ko"].includes(u)){const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,u))}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,u))}s.length>0?h.push(...s):h.push(n)}}}return h}tokenizeText(t,e){const n=this.tokenize(t,e?.language),s=["punctuation","space","other",...e?.excludeTypes||[]];return 
n.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type))&&!s.includes(t.type)).map(t=>t.txt)}}exports.CJKTokenizer=e,exports.DateTokenizer=n,exports.EnglishTokenizer=t,exports.LanguageDetector=s,exports.MultilingualTokenizer=o,exports.createTokenizer=function(t){return new o(t)};
package/lib/core.d.ts CHANGED
@@ -5,8 +5,8 @@
5
5
  interface Token {
6
6
  /** Token的文本内容 */
7
7
  txt: string;
8
- /** Token类型:单词、标点符号、空格、其他、表情符号、日期、URL、IP地址 */
9
- type: 'word' | 'punctuation' | 'space' | 'other' | 'emoji' | 'date' | 'url' | 'ip';
8
+ /** Token类型:单词、标点符号、空格、其他、表情符号、日期、URL、IP地址、数字 */
9
+ type: 'word' | 'punctuation' | 'space' | 'other' | 'emoji' | 'date' | 'url' | 'ip' | 'number';
10
10
  /** Token的语言代码(可选) */
11
11
  lang?: string;
12
12
  /** Token的来源(可选),如自定义词库名称 */
@@ -138,7 +138,7 @@ declare class CJKTokenizer implements LanguageTokenizer {
138
138
  * @implements {LanguageTokenizer}
139
139
  */
140
140
  declare class DateTokenizer implements LanguageTokenizer {
141
- /** 综合日期正则表达式,用于匹配多种日期格式 */
141
+ /** 综合日期时间正则表达式,用于匹配多种日期和时间格式 */
142
142
  private comprehensiveDatePattern;
143
143
  /**
144
144
  * 检测文本的语言
@@ -154,6 +154,7 @@ declare class DateTokenizer implements LanguageTokenizer {
154
154
  */
155
155
  tokenize(text: string, language?: string): Token[];
156
156
  private isValidDate;
157
+ private isValidChineseNumberTime;
157
158
  private isValidDateComponents;
158
159
  }
159
160
 
package/lib/core.js CHANGED
@@ -1 +1 @@
1
- class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let 
o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const s=[];let n=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const s=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,s)),index:o.index,end:Math.max(o.end,s)}:(r.push({text:o.text,index:o.index}),o={...e,end:s}):o={...e,end:s}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>n){const r=t.slice(n,i.index);s.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?s.push({txt:i.text,type:"date",lang:e,src:""}):s.push({txt:i.text,type:"other",lang:e,src:""}),n=i.index+i.text.length}if(n<t.length){const i=t.slice(n);s.push({txt:i,type:"other",lang:e,src:""})}return s}isValidDate(t){let e,s,n;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),s=parseInt(t.slice(4,6)),n=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],s=i[1],n=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],s=i[1],n=i[2]):i[2]>31?(s=i[0],n=i[1],e=i[2]):(s=i[0],n=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else 
if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;s=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,n=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),s=parseInt(i[2]),n=parseInt(i[3])}}return this.isValidDateComponents(e,s,n)}isValidDateComponents(t,e,s){if(e<1||e>12||s<1||s>31)return!1;if(2===e){if(s>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(s>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class n{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const s=[];let n=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,r=/(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"url",text:a[0]});o.sort((t,e)=>t.index-e.index);const c=[];for(let t=0;t<o.length;t++){let e=!1;for(let s=0;s<o.length;s++)if(t!==s){const n=o[t].index===o[s].index&&o[t].endIndex===o[s].endIndex,i=o[t].index>=o[s].index&&o[t].endIndex<=o[s].endIndex;if(n){if("ip"===o[s].type){e=!0;break}}else 
if(i){e=!0;break}}e||c.push(o[t])}for(const i of c)i.index>n&&s.push({txt:t.substring(n,i.index),type:"other",lang:e}),s.push({txt:i.text,type:i.type,lang:"en"}),n=i.endIndex;return n<t.length&&s.push({txt:t.substring(n),type:"other",lang:e}),s}}class r{tokenizers;customDictionaries;defaultLanguage;constructor(n={}){this.customDictionaries=n.customDictionaries||{},this.defaultLanguage=n.defaultLanguage||"en",this.tokenizers=[new s,new i,new t,new e(this.customDictionaries)]}addCustomDictionary(t,e,s,n){const i=e||this.defaultLanguage;this.customDictionaries[i]||(this.customDictionaries[i]=[]);const r=this.customDictionaries[i].findIndex(t=>t.name===n&&t.lang===i&&t.priority===s);if(r>=0){const e=this.customDictionaries[i][r];t.forEach(t=>e.data.add(t))}else this.customDictionaries[i].push({priority:s,data:new Set(t),name:n,lang:i})}removeCustomWord(t,e,s){if(e){if(this.customDictionaries[e])if(s){const n=this.customDictionaries[e].find(t=>t.name===s);n&&n.data.delete(t)}else this.customDictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.customDictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(r,o){const a=o||n.detectLanguage(r),c=this.tokenizers.find(t=>t instanceof s);if(!c)return[];const u=c.tokenize(r,a),d=[],l=this.tokenizers.find(t=>t instanceof i);if(!l)return d;for(const s of u)if("date"===s.type)d.push(s);else{const n=l.tokenize(s.txt,a);for(const s of n)if("url"===s.type||"ip"===s.type)d.push(s);else{let n=[];if("en"===a){const e=this.tokenizers.find(e=>e instanceof t);e&&(n=e.tokenize(s.txt,a))}else if(["zh","ja","ko"].includes(a)){const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}n.length>0?d.push(...n):d.push(s)}}return d}tokenizeText(t,e){const s=this.tokenize(t,e?.language),n=["punctuation","space","other",...e?.excludeTypes||[]];return 
s.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type))&&!n.includes(t.type)).map(t=>t.txt)}}function o(t){return new r(t)}export{e as CJKTokenizer,s as DateTokenizer,t as EnglishTokenizer,n as LanguageDetector,r as MultilingualTokenizer,o as createTokenizer};
1
/**
 * Word-level tokenizer for Latin-script text.
 *
 * Splits on regex word boundaries, classifies every piece, and tags words
 * found in a custom dictionary with the name of that dictionary.
 */
class EnglishTokenizer {
  customDictionaries;

  /**
   * @param {Object<string, Array<{priority:number,data:Set<string>,name:string,lang:string}>>} [customDictionaries]
   *   Dictionaries keyed by language code.
   */
  constructor(customDictionaries = {}) {
    this.customDictionaries = customDictionaries;
  }

  /**
   * @param {string} text
   * @returns {string} "en" when the text has ASCII letters and no CJK ideographs, otherwise "".
   */
  detectLanguage(text) {
    return /[a-zA-Z]/.test(text) && !/[\u4e00-\u9fff]/.test(text) ? "en" : "";
  }

  /**
   * @param {string} text
   * @param {string} lang Language code copied onto every token.
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   */
  tokenize(text, lang) {
    const tokens = [];
    for (const piece of text.split(/\b/)) {
      if (!piece) continue;
      let type;
      if (/^\s+$/.test(piece)) {
        type = "space";
      } else if (/^\p{Emoji}+$/u.test(piece) && !/[0-9]/.test(piece)) {
        // Digits carry the Emoji property, hence the explicit digit exclusion.
        type = "emoji";
      } else if (/^[^a-zA-Z0-9]+$/.test(piece)) {
        type = "punctuation";
      } else if (/^[a-zA-Z0-9]+$/.test(piece)) {
        type = "word";
      } else {
        type = "other";
      }
      tokens.push({ txt: piece, type, lang, src: "" });
    }
    return this.tagNameTokens(tokens, lang);
  }

  /**
   * Sets `src` on word tokens that appear in a custom dictionary; the
   * highest-priority dictionary containing the word wins.
   *
   * @param {Array<{txt:string,type:string,src?:string}>} tokens
   * @param {string} lang
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>} New token array.
   */
  tagNameTokens(tokens, lang) {
    const dictionaries = this.customDictionaries[lang];
    // Sort a copy, once: the dictionary array is shared with other tokenizers
    // and must not be reordered as a side effect of tokenizing.
    const ranked = dictionaries
      ? [...dictionaries].sort((a, b) => b.priority - a.priority)
      : [];
    return tokens.map((token) => {
      if (token.type === "word") {
        const hit = ranked.find((dictionary) => dictionary.data.has(token.txt));
        if (hit) {
          return { txt: token.txt, type: "word", lang, src: hit.name };
        }
      }
      return { txt: token.txt, type: token.type, lang, src: token.src || "" };
    });
  }
}

/**
 * Tokenizer for Chinese/Japanese/Korean text built on `Intl.Segmenter`.
 */
class CJKTokenizer {
  segmenters;
  customDictionaries;

  /**
   * @param {Object<string, Array<{priority:number,data:Set<string>,name:string,lang:string}>>} [customDictionaries]
   */
  constructor(customDictionaries = {}) {
    this.segmenters = new Map();
    this.customDictionaries = customDictionaries;
  }

  /**
   * @param {string} text
   * @returns {string} "zh", "ja", "ko", or "" when no CJK character is present.
   */
  detectLanguage(text) {
    if (/[\u4e00-\u9fff]/.test(text)) return "zh";
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) return "ja";
    if (/[\uac00-\ud7af]/.test(text)) return "ko";
    return "";
  }

  /**
   * @param {string} text
   * @param {string} lang Locale handed to `Intl.Segmenter`.
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   */
  tokenize(text, lang) {
    const tokens = [];
    const segmenter = this.getSegmenter(lang);
    for (const { segment, isWordLike } of segmenter.segment(text)) {
      let type;
      if (/^\s+$/.test(segment)) {
        type = "space";
      } else if (/^\p{Emoji}+$/u.test(segment) && !/[0-9]/.test(segment)) {
        type = "emoji";
      } else if (/^[^\p{L}\p{N}]+$/u.test(segment)) {
        type = "punctuation";
      } else if (isWordLike) {
        type = "word";
      } else {
        type = "other";
      }
      tokens.push({ txt: segment, type, lang, src: "" });
    }
    return this.applyCustomDictionary(tokens, lang);
  }

  /**
   * Returns a cached segmenter for the locale/granularity pair.
   *
   * @param {string} locale
   * @param {string} [granularity="word"]
   * @returns {Intl.Segmenter}
   */
  getSegmenter(locale, granularity = "word") {
    const key = `${locale}-${granularity}`;
    if (!this.segmenters.has(key)) {
      this.segmenters.set(key, new Intl.Segmenter(locale, { granularity }));
    }
    return this.segmenters.get(key);
  }

  /**
   * Re-joins runs of up to five adjacent tokens whose concatenation is a
   * dictionary entry. The longest run wins; ties go to the higher priority.
   *
   * @param {Array<{txt:string,type:string}>} tokens
   * @param {string} lang
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   */
  applyCustomDictionary(tokens, lang) {
    const dictionaries = this.customDictionaries[lang] || [];
    if (dictionaries.length === 0) return tokens;

    const merged = [];
    let pos = 0;
    while (pos < tokens.length) {
      let best = null;
      let bestPriority = -1;
      for (let span = Math.min(5, tokens.length - pos); span >= 1; span--) {
        const run = tokens.slice(pos, pos + span);
        // Multi-token runs may only be glued together from word tokens.
        if (span > 1 && run.some((token) => token.type !== "word")) continue;
        const text = run.map((token) => token.txt).join("");
        for (const dictionary of dictionaries) {
          const improves =
            !best ||
            span > best.length ||
            (span === best.length && dictionary.priority > bestPriority);
          if (dictionary.data.has(text) && improves) {
            best = { length: span, text };
            bestPriority = dictionary.priority;
          }
        }
      }
      if (best) {
        merged.push({ txt: best.text, type: "word", lang, src: "" });
        pos += best.length;
      } else {
        merged.push({ ...tokens[pos], src: "" });
        pos++;
      }
    }
    return merged;
  }
}

/**
 * Finds dates and durations (numeric, English month names, Chinese formats)
 * and emits them as "date" tokens; everything else becomes "other".
 */
class DateTokenizer {
  comprehensiveDatePattern = /\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;

  /** @returns {string} Always "". */
  detectLanguage(text) {
    return "";
  }

  /**
   * @param {string} text
   * @param {string} [lang="zh"]
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   *   "date" tokens for validated matches, "other" for the rest of the text.
   */
  tokenize(text, lang = "zh") {
    const candidates = [];
    for (const match of text.matchAll(this.comprehensiveDatePattern)) {
      if (match.index !== undefined && match[0]) {
        candidates.push({ text: match[0], index: match.index });
      }
    }
    candidates.sort((a, b) => a.index - b.index);

    // Fuse touching/overlapping candidates into a single span.
    const spans = [];
    let current = null;
    for (const candidate of candidates) {
      const end = candidate.index + candidate.text.length;
      if (current && candidate.index <= current.end) {
        const fusedEnd = Math.max(current.end, end);
        current = {
          text: text.slice(current.index, fusedEnd),
          index: current.index,
          end: fusedEnd,
        };
      } else {
        if (current) spans.push({ text: current.text, index: current.index });
        current = { ...candidate, end };
      }
    }
    if (current) spans.push({ text: current.text, index: current.index });

    const tokens = [];
    let cursor = 0;
    for (const span of spans) {
      if (span.index > cursor) {
        tokens.push({ txt: text.slice(cursor, span.index), type: "other", lang, src: "" });
      }
      const type = this.isValidDate(span.text) ? "date" : "other";
      tokens.push({ txt: span.text, type, lang, src: "" });
      cursor = span.index + span.text.length;
    }
    if (cursor < text.length) {
      tokens.push({ txt: text.slice(cursor), type: "other", lang, src: "" });
    }
    return tokens;
  }

  /**
   * @param {string} text Candidate produced by `comprehensiveDatePattern`.
   * @returns {boolean} True for durations and for calendar dates whose
   *   month/day combination exists.
   */
  isValidDate(text) {
    const isDuration =
      /^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(text) ||
      /^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(text);
    if (isDuration) return true;

    let year;
    let month;
    let day;
    if (/^\d{8}$/.test(text)) {
      year = parseInt(text.slice(0, 4), 10);
      month = parseInt(text.slice(4, 6), 10);
      day = parseInt(text.slice(6, 8), 10);
    } else if (/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(text)) {
      [year, month, day] = text.split(/[-.]/).map(Number);
    } else if (
      /^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(text) ||
      /^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(text)
    ) {
      const parts = text.split(/[-/.]/).map(Number);
      if (parts[0] >= 1000) {
        [year, month, day] = parts;
      } else {
        // Month-first order; a trailing value <= 31 is read as a two-digit
        // year pivoting at 50 (00-49 => 20xx, otherwise 19xx).
        month = parts[0];
        day = parts[1];
        if (parts[2] > 31) {
          year = parts[2];
        } else {
          year = parts[2] < 50 ? 2000 + parts[2] : 1900 + parts[2];
        }
      }
    } else {
      const named = text.match(
        /^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i
      );
      if (named) {
        const abbreviations = ["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"];
        const prefix = named[1].substring(0, 3).toLowerCase();
        month = abbreviations.findIndex((name) => name.toLowerCase() === prefix) + 1;
        day = parseInt(named[2], 10);
        // A missing year defaults to the current year.
        year = named[3] ? parseInt(named[3], 10) : new Date().getFullYear();
      } else {
        const chinese = text.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);
        if (!chinese) return false;
        year = parseInt(chinese[1] || chinese[4] || new Date().getFullYear().toString(), 10);
        month = parseInt(chinese[2], 10);
        day = parseInt(chinese[3], 10);
      }
    }
    return this.isValidDateComponents(year, month, day);
  }

  /**
   * @param {string} text Chinese numeral followed by a time unit.
   * @returns {boolean} True when the numeral part is non-empty and matches
   *   the numeral grammar below.
   */
  isValidChineseNumberTime(text) {
    const unitPattern = /(小时|分钟|秒|毫秒|天|周|月|年)$/;
    const numeral = text.replace(unitPattern, "");
    const unit = text.match(unitPattern)?.[1] || "";
    if (!numeral || !unit) return false;
    return /^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(numeral);
  }

  /**
   * @param {number} year
   * @param {number} month 1-12
   * @param {number} day 1-31
   * @returns {boolean} Whether the day exists in that month (leap years honoured).
   */
  isValidDateComponents(year, month, day) {
    if (month < 1 || month > 12 || day < 1 || day > 31) return false;
    if (month === 2) {
      const isLeap = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
      if (day > (isLeap ? 29 : 28)) return false;
    } else if (day > [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31][month - 1]) {
      return false;
    }
    return true;
  }
}

/** Script-based language guesser used when no language is supplied. */
class LanguageDetector {
  /**
   * @param {string} text
   * @returns {string} "ja" for kana, "ko" for hangul, "zh" for CJK ideographs, else "en".
   */
  static detectLanguage(text) {
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) return "ja";
    if (/[\uac00-\ud7af]/.test(text)) return "ko";
    if (/[\u4e00-\u9fff]/.test(text)) return "zh";
    return "en";
  }
}

/**
 * Extracts IPv4 addresses and URLs; the remaining text becomes "other" tokens.
 */
class UrlIpTokenizer {
  /** @returns {string} Always "en". */
  detectLanguage(text) {
    return "en";
  }

  /**
   * @param {string} text
   * @param {string} lang Language assigned to the non-URL remainder.
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   */
  tokenize(text, lang) {
    const ipPattern = /\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g;
    const urlPattern = /(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g;

    const candidates = [];
    for (const [pattern, type] of [[ipPattern, "ip"], [urlPattern, "url"]]) {
      let match;
      while ((match = pattern.exec(text)) !== null) {
        candidates.push({
          index: match.index,
          endIndex: match.index + match[0].length,
          type,
          text: match[0],
        });
      }
    }
    candidates.sort((a, b) => a.index - b.index);

    // Drop a candidate nested inside another one; when an IP and a URL
    // cover the identical span the IP reading is the one kept.
    const kept = candidates.filter((candidate, i) =>
      !candidates.some((other, j) => {
        if (i === j) return false;
        const sameSpan = candidate.index === other.index && candidate.endIndex === other.endIndex;
        if (sameSpan) return other.type === "ip";
        return candidate.index >= other.index && candidate.endIndex <= other.endIndex;
      })
    );

    const tokens = [];
    let cursor = 0;
    for (const item of kept) {
      if (item.index > cursor) {
        tokens.push({ txt: text.substring(cursor, item.index), type: "other", lang, src: "" });
      }
      // URL/IP tokens are always labelled "en"; `src` is included so these
      // tokens have the same shape as every other tokenizer's output.
      tokens.push({ txt: item.text, type: item.type, lang: "en", src: "" });
      cursor = item.endIndex;
    }
    if (cursor < text.length) {
      tokens.push({ txt: text.substring(cursor), type: "other", lang, src: "" });
    }
    return tokens;
  }
}

/**
 * Recognises Arabic and Chinese numerals, optionally with an ordinal prefix
 * and/or a trailing unit, and emits them as "number" tokens.
 */
class NumberTokenizer {
  CHINESE_NUMBERS = new Set(["零", "一", "二", "三", "四", "五", "六", "七", "八", "九", "十", "百", "千", "万", "亿", "壹", "贰", "叁", "肆", "伍", "陆", "柒", "捌", "玖", "拾", "佰", "仟"]);

  CHINESE_SURNAMES = new Set(["张", "李", "王", "刘", "陈", "杨", "赵", "黄", "周", "吴", "徐", "孙", "胡", "朱", "高", "林", "何", "郭", "马", "罗", "梁", "宋", "郑", "谢", "韩", "唐", "冯", "于", "董", "萧", "程", "曹", "袁", "邓", "许", "傅", "沈", "曾", "彭", "吕", "苏", "卢", "蒋", "蔡", "贾", "丁", "魏", "薛", "叶", "阎", "余", "潘", "杜", "戴", "夏", "钟", "汪", "田", "任", "姜", "范", "方", "石", "姚", "谭", "廖", "邹", "熊", "金", "陆", "郝", "孔", "白", "崔", "康", "毛", "邱", "秦", "江", "史", "顾", "侯", "邵", "孟", "龙", "万", "段", "雷", "钱", "汤", "尹", "黎", "易", "常", "武", "乔", "贺", "赖", "龚", "文"]);

  UNITS = new Set(["公斤", "英里", "克", "千克", "吨", "米", "厘米", "毫米", "公里", "斤", "两", "元", "角", "分", "小时", "分钟", "秒", "折", "折扣", "卷", "券", "美元", "人民币", "元", "角", "分", "亩", "公顷", "平方米", "平方分米", "平方厘米", "立方厘米", "升", "毫升", "天", "周", "月", "年", "岁", "度", "瓦", "千瓦", "安培", "伏特", "欧姆", "焦耳", "卡路里", "千克力", "牛", "牛顿", "帕斯卡", "巴", "标准大气压", "毫米汞柱", "摄氏度", "华氏度", "弧度", "角度", "kg", "g", "mg", "t", "km", "m", "cm", "mm", "μm", "nm", "L", "mL", "l", "ml", "h", "min", "s", "d", "w", "y", "°C", "°F", "rad", "deg", "Hz", "kHz", "MHz", "GHz", "bit", "Byte", "B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "lb", "oz", "pound", "pounds"]);

  // Length of the longest entry in UNITS. The unit scan previously used a
  // fixed window of 5 characters, which made "pounds" impossible to match.
  MAX_UNIT_LENGTH = Math.max(...Array.from(this.UNITS, (unit) => unit.length));

  ORDINAL_PREFIXES = new Set(["第", "No.", "No", "no.", "no"]);

  /** @returns {string} Always "". */
  detectLanguage(text) {
    return "";
  }

  /**
   * @param {string} text
   * @param {string} lang
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   *   Empty for empty input; otherwise "number" and "other" tokens that
   *   together cover the text.
   */
  tokenize(text, lang) {
    if (!text) return [];
    const length = text.length;
    const found = [];
    let pos = 0;
    while (pos < length) {
      const end = this.matchNumberAt(text, pos);
      if (end === null) {
        pos++;
      } else {
        found.push({ start: pos, end, txt: text.substring(pos, end) });
        pos = end;
      }
    }
    if (found.length === 0) {
      return [{ txt: text, type: "other", lang, src: "" }];
    }

    // NOTE(review): spans that merely touch (start === previous end) count as
    // overlapping here, so the later, not-longer span is dropped and its text
    // is emitted as "other". Behaviour kept as-is; confirm it is intended.
    const spans = [];
    for (const span of found) {
      const clash = spans.findIndex(
        (existing) => span.start <= existing.end && span.end >= existing.start
      );
      if (clash === -1) {
        spans.push(span);
      } else if (span.end - span.start > spans[clash].end - spans[clash].start) {
        spans[clash] = span;
      }
    }
    spans.sort((a, b) => a.start - b.start);

    const tokens = [];
    let cursor = 0;
    for (const span of spans) {
      if (span.start > cursor) {
        tokens.push({ txt: text.substring(cursor, span.start), type: "other", lang, src: "" });
      }
      tokens.push({ txt: span.txt, type: "number", lang, src: "" });
      cursor = span.end;
    }
    if (cursor < length) {
      tokens.push({ txt: text.substring(cursor), type: "other", lang, src: "" });
    }
    return tokens;
  }

  /**
   * Tries each number shape at `pos`, in precedence order.
   *
   * @param {string} text
   * @param {number} pos
   * @returns {number|null} End index of the match, or null when nothing matches.
   */
  matchNumberAt(text, pos) {
    const ordinal = this.findOrdinalPrefix(text, pos);
    if (ordinal) {
      const numberStart = this.skipWhitespace(text, ordinal.prefixEnd);
      const number = this.findNumber(text, numberStart);
      if (number) {
        // Whitespace after the number is consumed even when no unit follows.
        const afterNumber = this.skipWhitespace(text, number.end);
        const unit = this.findUnit(text, afterNumber);
        return unit ? unit.end : afterNumber;
      }
    }
    const match =
      this.findNumberWithUnit(text, pos) ||
      this.findNumber(text, pos) ||
      this.findChineseNumberWithUnit(text, pos) ||
      this.findChineseNumber(text, pos);
    return match ? match.end : null;
  }

  /**
   * @param {string} text
   * @param {number} pos
   * @returns {number} First index >= pos holding a non-whitespace character.
   */
  skipWhitespace(text, pos) {
    let index = pos;
    while (index < text.length && text[index].trim() === "") index++;
    return index;
  }

  /**
   * @param {string} text
   * @param {number} pos
   * @returns {{prefix:string,prefixEnd:number}|null}
   */
  findOrdinalPrefix(text, pos) {
    for (const prefix of this.ORDINAL_PREFIXES) {
      if (text.startsWith(prefix, pos)) {
        return { prefix, prefixEnd: pos + prefix.length };
      }
    }
    return null;
  }

  /**
   * Matches scientific, decimal, leading-dot, or integer notation at `pos`.
   *
   * @param {string} text
   * @param {number} pos
   * @returns {{end:number}|null}
   */
  findNumber(text, pos) {
    const patterns = [
      /^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,
      /^[+-]?\d+\.\d+/,
      /^[+-]?\.\d+/,
      /^[+-]?\d+/,
    ];
    const rest = text.substring(pos);
    for (const pattern of patterns) {
      const match = pattern.exec(rest);
      if (match) return { end: pos + match[0].length };
    }
    return null;
  }

  /**
   * Matches a run of Chinese numeral characters. A single character only
   * counts when the previous character is number-related or a unit follows.
   *
   * @param {string} text
   * @param {number} pos
   * @returns {{end:number}|null}
   */
  findChineseNumber(text, pos) {
    let end = pos;
    while (end < text.length && this.CHINESE_NUMBERS.has(text[end])) end++;
    const runLength = end - pos;
    if (runLength > 1) return { end };
    if (runLength === 1) {
      const previous = pos > 0 ? text[pos - 1] : "";
      const unit = this.findUnit(text, pos + 1);
      if ((previous && this.isNumberRelatedChar(previous)) || unit) return { end };
    }
    return null;
  }

  /**
   * @param {string} char
   * @returns {boolean} True for numerals, single-character units, ASCII
   *   digits, and the characters `.`, `e`, `E`, `+`, `-`.
   */
  isNumberRelatedChar(char) {
    return (
      this.CHINESE_NUMBERS.has(char) ||
      this.UNITS.has(char) ||
      (char >= "0" && char <= "9") ||
      char === "." ||
      char === "e" ||
      char === "E" ||
      char === "+" ||
      char === "-"
    );
  }

  /**
   * Finds the longest unit starting at `pos` that is not immediately
   * followed by an ASCII letter or digit.
   *
   * @param {string} text
   * @param {number} pos
   * @returns {{end:number}|null}
   */
  findUnit(text, pos) {
    let matched = 0;
    const limit = Math.min(pos + this.MAX_UNIT_LENGTH, text.length);
    for (let end = pos + 1; end <= limit; end++) {
      if (!this.UNITS.has(text.substring(pos, end))) continue;
      const following = text[end];
      if (!following || !/[a-zA-Z0-9]/.test(following)) matched = end - pos;
    }
    return matched > 0 ? { end: pos + matched } : null;
  }

  /**
   * @param {string} text
   * @param {number} pos
   * @returns {{end:number}|null} Match only when a unit directly follows the number.
   */
  findNumberWithUnit(text, pos) {
    const number = this.findNumber(text, pos);
    if (!number) return null;
    const unit = this.findUnit(text, number.end);
    return unit && unit.end > number.end ? { end: unit.end } : null;
  }

  /**
   * @param {string} text
   * @param {number} pos
   * @returns {{end:number}|null} Match only when a unit directly follows the numeral.
   */
  findChineseNumberWithUnit(text, pos) {
    const number = this.findChineseNumber(text, pos);
    if (!number) return null;
    const unit = this.findUnit(text, number.end);
    return unit && unit.end > number.end ? { end: unit.end } : null;
  }

  /**
   * @param {string} char Character at `index`.
   * @param {number} index
   * @param {string} text
   * @returns {boolean} Whether the character can belong to a number expression.
   */
  isNumberChar(char, index, text) {
    const isDigit = (c) => c >= "0" && c <= "9";
    if (isDigit(char)) return true;
    if (char === ".") {
      const previous = index > 0 ? text[index - 1] : "";
      const next = index < text.length - 1 ? text[index + 1] : "";
      return isDigit(previous) || isDigit(next);
    }
    if ((char === "e" || char === "E") && index > 0) {
      const previous = text[index - 1];
      return isDigit(previous) || previous === ".";
    }
    if ((char === "+" || char === "-") && index > 0) {
      const previous = text[index - 1];
      return previous === "e" || previous === "E";
    }
    if (this.CHINESE_NUMBERS.has(char)) return true;
    for (const prefix of this.ORDINAL_PREFIXES) {
      if (prefix.includes(char) && text.startsWith(prefix, index)) return true;
    }
    for (const unit of this.UNITS) {
      if (text.startsWith(unit, index)) return true;
    }
    return false;
  }
}

/**
 * Pipeline tokenizer: dates first, then URLs/IPs, then numbers, and finally
 * word segmentation with the English or CJK tokenizer.
 */
class MultilingualTokenizer {
  tokenizers;
  customDictionaries;
  defaultLanguage;

  /**
   * @param {{customDictionaries?:Object, defaultLanguage?:string}} [options]
   */
  constructor(options = {}) {
    this.customDictionaries = options.customDictionaries || {};
    this.defaultLanguage = options.defaultLanguage || "en";
    this.tokenizers = [
      new DateTokenizer(),
      new UrlIpTokenizer(),
      new NumberTokenizer(),
      // The English tokenizer used to be built without the dictionaries, so
      // custom "en" entries were never applied.
      new EnglishTokenizer(this.customDictionaries),
      new CJKTokenizer(this.customDictionaries),
    ];
  }

  /**
   * Adds words to the dictionary identified by (name, lang, priority),
   * creating it when it does not exist yet.
   *
   * @param {string[]} words
   * @param {string} [lang] Falls back to the default language.
   * @param {number} priority
   * @param {string} name
   */
  addCustomDictionary(words, lang, priority, name) {
    const language = lang || this.defaultLanguage;
    if (!this.customDictionaries[language]) {
      this.customDictionaries[language] = [];
    }
    const dictionaries = this.customDictionaries[language];
    const existing = dictionaries.find(
      (d) => d.name === name && d.lang === language && d.priority === priority
    );
    if (existing) {
      words.forEach((word) => existing.data.add(word));
    } else {
      dictionaries.push({ priority, data: new Set(words), name, lang: language });
    }
    // Keep both word tokenizers pointed at the current dictionary object.
    for (const tokenizer of this.tokenizers) {
      if (tokenizer instanceof CJKTokenizer || tokenizer instanceof EnglishTokenizer) {
        tokenizer.customDictionaries = this.customDictionaries;
      }
    }
  }

  /**
   * @param {string} word
   * @param {string} [lang] Restrict removal to this language; all languages when omitted.
   * @param {string} [name] Restrict removal to the first dictionary with this name
   *   (only honoured together with `lang`).
   */
  removeCustomWord(word, lang, name) {
    if (!lang) {
      for (const dictionaries of Object.values(this.customDictionaries)) {
        dictionaries.forEach((dictionary) => dictionary.data.delete(word));
      }
      return;
    }
    const dictionaries = this.customDictionaries[lang];
    if (!dictionaries) return;
    if (name) {
      const target = dictionaries.find((dictionary) => dictionary.name === name);
      if (target) target.data.delete(word);
    } else {
      dictionaries.forEach((dictionary) => dictionary.data.delete(word));
    }
  }

  /**
   * @param {string} text
   * @param {string} [language] Detected from the text when omitted.
   * @returns {Array<{txt:string,type:string,lang:string,src:string}>}
   */
  tokenize(text, language) {
    const lang = language || LanguageDetector.detectLanguage(text);
    const locate = (Type) => this.tokenizers.find((tokenizer) => tokenizer instanceof Type);

    const dateTokenizer = locate(DateTokenizer);
    if (!dateTokenizer) return [];
    const dateTokens = dateTokenizer.tokenize(text, lang);

    const result = [];
    const urlTokenizer = locate(UrlIpTokenizer);
    if (!urlTokenizer) return result;

    for (const dateToken of dateTokens) {
      if (dateToken.type === "date") {
        result.push(dateToken);
        continue;
      }
      const urlTokens = urlTokenizer.tokenize(dateToken.txt, lang);
      const numberTokenizer = locate(NumberTokenizer);
      if (!numberTokenizer) return result;

      for (const urlToken of urlTokens) {
        if (urlToken.type === "url" || urlToken.type === "ip") {
          result.push(urlToken);
          continue;
        }
        for (const numberToken of numberTokenizer.tokenize(urlToken.txt, lang)) {
          if (numberToken.type === "number") {
            result.push(numberToken);
            continue;
          }
          // Everything that is not English goes through the CJK tokenizer.
          const wordTokenizer = locate(lang === "en" ? EnglishTokenizer : CJKTokenizer);
          const words = wordTokenizer ? wordTokenizer.tokenize(numberToken.txt, lang) : [];
          if (words.length > 0) {
            result.push(...words);
          } else {
            result.push(numberToken);
          }
        }
      }
    }
    return result;
  }

  /**
   * Tokenizes and returns only the token texts, minus punctuation, spaces,
   * "other" tokens, and any extra excluded types.
   *
   * @param {string} text
   * @param {{language?:string, includeTypes?:string[], excludeTypes?:string[]}} [options]
   * @returns {string[]}
   */
  tokenizeText(text, options) {
    const tokens = this.tokenize(text, options?.language);
    const excluded = ["punctuation", "space", "other", ...(options?.excludeTypes || [])];
    const included = options?.includeTypes;
    return tokens
      .filter((token) => {
        if (included && included.length > 0 && !included.includes(token.type)) return false;
        return !excluded.includes(token.type);
      })
      .map((token) => token.txt);
  }
}

/**
 * @param {{customDictionaries?:Object, defaultLanguage?:string}} [options]
 * @returns {MultilingualTokenizer}
 */
function createTokenizer(options) {
  return new MultilingualTokenizer(options);
}

export {
  CJKTokenizer,
  DateTokenizer,
  EnglishTokenizer,
  LanguageDetector,
  MultilingualTokenizer,
  createTokenizer,
};
package/lib/index.cjs CHANGED
@@ -1 +1 @@
1
"use strict";

var core = require("./core");
var lexiconExports = require("./lexicon");

/**
 * Builds a frozen, prototype-less namespace object that mirrors the
 * enumerable keys of `mod` through getters and exposes `mod` as `default`.
 */
function toNamespace(mod) {
  var namespace = Object.create(null);
  if (mod) {
    Object.keys(mod).forEach(function (key) {
      if (key === "default") {
        return;
      }
      var descriptor = Object.getOwnPropertyDescriptor(mod, key);
      Object.defineProperty(
        namespace,
        key,
        descriptor.get
          ? descriptor
          : {
              enumerable: true,
              get: function () {
                return mod[key];
              },
            }
      );
    });
  }
  namespace.default = mod;
  return Object.freeze(namespace);
}

var lexiconNamespace = toNamespace(lexiconExports);

/**
 * Loads the lexicon strings exported by "./lexicon" for a set of languages
 * and types; one loader is cached per (types, languages) combination.
 */
class LexiconLoader {
  static instances = new Map();
  lexicons = [];

  constructor(config) {
    this.loadLexicons(config);
  }

  /** Returns the cached loader for this configuration, creating it on first use. */
  static getInstance(config) {
    const cacheKey = `${JSON.stringify(config.types)}-${JSON.stringify(config.languages)}`;
    if (!LexiconLoader.instances.has(cacheKey)) {
      LexiconLoader.instances.set(cacheKey, new LexiconLoader(config));
    }
    return LexiconLoader.instances.get(cacheKey);
  }

  /** Rebuilds `this.lexicons` with one entry per language/type pair. */
  loadLexicons(config) {
    const { languages, types } = config;
    this.lexicons = [];
    // Priority per lexicon type; types not listed fall back to 50.
    const priorities = {
      lastName: 100,
      firstName: 100,
      famousName: 100,
      famousWorks: 100,
      country: 80,
      computerTerm: 75,
      city: 70,
      networkTerms: 70,
      medicines: 70,
      transportation: 70,
      luxury: 65,
      pronouns: 65,
      address: 60,
      foods: 60,
      appliances: 60,
      honorific: 50,
      nickname: 50,
      title: 50,
      kinship: 50,
      organization: 50,
      furniture: 55,
      pets: 55,
    };
    for (const language of languages) {
      const prefix = language.replace("-", "_");
      for (const type of types) {
        // Export names look like "zh_CN_LastName".
        const exportName = `${prefix}_${type.charAt(0).toUpperCase() + type.slice(1)}`;
        const raw = lexiconNamespace[exportName] || "";
        this.lexicons.push({
          priority: priorities[type] || 50,
          data: this.parseLexiconString(raw),
          name: `${prefix}_${type}`,
          lang: language,
        });
      }
    }
  }

  /** Splits a lexicon string into a Set of trimmed, non-empty entries. */
  parseLexiconString(raw) {
    // NOTE(review): the separator is an empty string as shown here — confirm
    // a delimiter character was not lost when the bundle was minified.
    const entries = new Set();
    for (const piece of raw.split("")) {
      const trimmed = piece.trim();
      if (trimmed.length > 0) {
        entries.add(trimmed);
      }
    }
    return entries;
  }

  /** Reports the first lexicon (in stored order) that contains `word`. */
  checkWord(word) {
    const hit = this.lexicons.find((lexicon) => lexicon.data.has(word));
    if (hit) {
      return { found: true, lexiconName: hit.name };
    }
    return { found: false, lexiconName: "" };
  }

  /** Appends a lexicon and re-sorts all lexicons by descending priority. */
  addCustomLexicon(lexicon) {
    this.lexicons.push(lexicon);
    this.lexicons.sort((a, b) => b.priority - a.priority);
  }

  getLexicons() {
    return this.lexicons;
  }

  static clearAllInstances() {
    LexiconLoader.instances.clear();
  }
}

/**
 * Static facade around one shared tokenizer created with default language "zh".
 */
class QuickUseTokenizer {
  static instance = null;
  static defaultLanguages = ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"];
  static defaultTypes = [
    "lastName",
    "firstName",
    "famousName",
    "famousWorks",
    "honorific",
    "nickname",
    "title",
    "kinship",
    "organization",
    "country",
    "city",
    "address",
    "computerTerm",
    "networkTerms",
    "pronouns",
    "foods",
    "medicines",
    "luxury",
    "transportation",
    "appliances",
    "furniture",
    "pets",
  ];

  /** Returns the shared tokenizer, creating it on first use. */
  static getInstance() {
    if (!QuickUseTokenizer.instance) {
      const options = { defaultLanguage: "zh" };
      QuickUseTokenizer.instance = core.createTokenizer(options);
    }
    return QuickUseTokenizer.instance;
  }

  /** Replaces the default languages and discards the shared tokenizer. */
  static setDefaultLanguages(languages) {
    QuickUseTokenizer.defaultLanguages = languages;
    QuickUseTokenizer.instance = null;
  }

  /** Replaces the default lexicon types and discards the shared tokenizer. */
  static setDefaultTypes(types) {
    QuickUseTokenizer.defaultTypes = types;
    QuickUseTokenizer.instance = null;
  }

  static tokenize(text, language) {
    return QuickUseTokenizer.getInstance().tokenize(text, language);
  }

  static tokenizeText(text, language) {
    return QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
  }

  static addCustomDictionary(words, language, priority, name) {
    QuickUseTokenizer.getInstance().addCustomDictionary(words, language, priority, name);
  }

  static removeCustomWord(word, language, name) {
    QuickUseTokenizer.getInstance().removeCustomWord(word, language, name);
  }

  /** Returns the lexicon loader for the current default languages and types. */
  static getLexiconLoader() {
    const config = {
      languages: QuickUseTokenizer.defaultLanguages,
      types: QuickUseTokenizer.defaultTypes,
    };
    return LexiconLoader.getInstance(config);
  }
}

// Live re-exports from "./core".
Object.defineProperty(exports, "MultilingualTokenizer", {
  enumerable: true,
  get: function () {
    return core.MultilingualTokenizer;
  },
});
Object.defineProperty(exports, "createTokenizer", {
  enumerable: true,
  get: function () {
    return core.createTokenizer;
  },
});

exports.LexiconLoader = LexiconLoader;
exports.QuickUseTokenizer = QuickUseTokenizer;
exports.addCustomDictionary = (words, language, priority, name) =>
  QuickUseTokenizer.addCustomDictionary(words, language, priority, name);
exports.removeCustomWord = (word, language, name) =>
  QuickUseTokenizer.removeCustomWord(word, language, name);
exports.setDefaultLanguages = (languages) => QuickUseTokenizer.setDefaultLanguages(languages);
exports.setDefaultTypes = (types) => QuickUseTokenizer.setDefaultTypes(types);
exports.tokenize = (text, language) => QuickUseTokenizer.tokenize(text, language);
exports.tokenizeText = (text, language) =>
  QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
exports.tokenizeToText = (text, language) => QuickUseTokenizer.tokenizeText(text, language);

// Re-export everything from "./lexicon" that does not collide with a name above.
Object.keys(lexiconExports).forEach(function (key) {
  if (key === "default" || Object.prototype.hasOwnProperty.call(exports, key)) {
    return;
  }
  Object.defineProperty(exports, key, {
    enumerable: true,
    get: function () {
      return lexiconExports[key];
    },
  });
});
1
"use strict";

var core = require("./core");
var lexiconExports = require("./lexicon");

/**
 * Builds a frozen, prototype-less namespace object that mirrors the
 * enumerable keys of `mod` through getters and exposes `mod` as `default`.
 */
function toNamespace(mod) {
  var namespace = Object.create(null);
  if (mod) {
    Object.keys(mod).forEach(function (key) {
      if (key === "default") {
        return;
      }
      var descriptor = Object.getOwnPropertyDescriptor(mod, key);
      Object.defineProperty(
        namespace,
        key,
        descriptor.get
          ? descriptor
          : {
              enumerable: true,
              get: function () {
                return mod[key];
              },
            }
      );
    });
  }
  namespace.default = mod;
  return Object.freeze(namespace);
}

var lexiconNamespace = toNamespace(lexiconExports);

/**
 * Loads the lexicon strings exported by "./lexicon" for a set of languages
 * and types; one loader is cached per (types, languages) combination.
 */
class LexiconLoader {
  static instances = new Map();
  lexicons = [];

  /** @param {{languages:string[], types:string[]}} config */
  constructor(config) {
    this.loadLexicons(config);
  }

  /**
   * @param {{languages:string[], types:string[]}} config
   * @returns {LexiconLoader} Cached loader for this configuration.
   */
  static getInstance(config) {
    const cacheKey = `${JSON.stringify(config.types)}-${JSON.stringify(config.languages)}`;
    if (!LexiconLoader.instances.has(cacheKey)) {
      LexiconLoader.instances.set(cacheKey, new LexiconLoader(config));
    }
    return LexiconLoader.instances.get(cacheKey);
  }

  /**
   * Rebuilds `this.lexicons` with one entry per language/type pair.
   *
   * @param {{languages:string[], types:string[]}} config
   */
  loadLexicons(config) {
    const { languages, types } = config;
    this.lexicons = [];
    // Priority per lexicon type; types not listed fall back to 50.
    const priorities = {
      lastName: 100,
      firstName: 100,
      famousName: 100,
      famousWorks: 100,
      country: 80,
      computerTerm: 75,
      city: 70,
      networkTerms: 70,
      medicines: 70,
      transportation: 70,
      luxury: 65,
      pronouns: 65,
      address: 60,
      foods: 60,
      appliances: 60,
      honorific: 50,
      nickname: 50,
      title: 50,
      kinship: 50,
      organization: 50,
      furniture: 55,
      pets: 55,
    };
    languages.forEach((language) => {
      const prefix = language.replace("-", "_");
      types.forEach((type) => {
        // Export names look like "zh_CN_LastName".
        const exportName = `${prefix}_${type.charAt(0).toUpperCase() + type.slice(1)}`;
        const raw = lexiconNamespace[exportName] || "";
        this.lexicons.push({
          priority: priorities[type] || 50,
          data: this.parseLexiconString(raw),
          name: `${prefix}_${type}`,
          lang: language,
        });
      });
    });
  }

  /**
   * @param {string} raw
   * @returns {Set<string>} Trimmed, non-empty entries.
   */
  parseLexiconString(raw) {
    // NOTE(review): the separator is an empty string as shown here — confirm
    // a delimiter character was not lost when the bundle was minified.
    const entries = raw
      .split("")
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0);
    return new Set(entries);
  }

  /**
   * @param {string} word
   * @returns {{found:boolean, lexiconName:string}} First lexicon (in stored order) containing the word.
   */
  checkWord(word) {
    for (const lexicon of this.lexicons) {
      if (lexicon.data.has(word)) {
        return { found: true, lexiconName: lexicon.name };
      }
    }
    return { found: false, lexiconName: "" };
  }

  /** Appends a lexicon and re-sorts all lexicons by descending priority. */
  addCustomLexicon(lexicon) {
    this.lexicons.push(lexicon);
    this.lexicons.sort((a, b) => b.priority - a.priority);
  }

  getLexicons() {
    return this.lexicons;
  }

  static clearAllInstances() {
    LexiconLoader.instances.clear();
  }
}

/**
 * Static facade around one shared tokenizer (default language "zh") that is
 * pre-loaded with the built-in lexicons on first use.
 */
class QuickUseTokenizer {
  static instance = null;
  static defaultLanguages = ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"];
  static defaultTypes = [
    "lastName",
    "firstName",
    "famousName",
    "famousWorks",
    "honorific",
    "nickname",
    "title",
    "kinship",
    "organization",
    "country",
    "city",
    "address",
    "computerTerm",
    "networkTerms",
    "pronouns",
    "foods",
    "medicines",
    "luxury",
    "transportation",
    "appliances",
    "furniture",
    "pets",
    "otherNames",
  ];

  /**
   * Returns the shared tokenizer. On first use every loaded lexicon is
   * registered as a custom dictionary under its two-letter language code.
   */
  static getInstance() {
    if (!QuickUseTokenizer.instance) {
      const tokenizer = core.createTokenizer({ defaultLanguage: "zh" });
      // Debug console.log output (including a hard-coded lookup of one
      // specific word) was removed: a library must not write to the
      // console as a side effect of tokenizing.
      for (const lexicon of QuickUseTokenizer.getLexiconLoader().getLexicons()) {
        // "zh-CN" -> "zh", "en-US" -> "en", ...
        const language = lexicon.lang.replace("-", "").toLowerCase().slice(0, 2);
        tokenizer.addCustomDictionary(
          Array.from(lexicon.data),
          language,
          lexicon.priority,
          lexicon.name
        );
      }
      // Published only after loading succeeded, so a failure cannot leave a
      // half-initialised singleton behind.
      QuickUseTokenizer.instance = tokenizer;
    }
    return QuickUseTokenizer.instance;
  }

  /** Replaces the default languages and discards the shared tokenizer. */
  static setDefaultLanguages(languages) {
    QuickUseTokenizer.defaultLanguages = languages;
    QuickUseTokenizer.instance = null;
  }

  /** Replaces the default lexicon types and discards the shared tokenizer. */
  static setDefaultTypes(types) {
    QuickUseTokenizer.defaultTypes = types;
    QuickUseTokenizer.instance = null;
  }

  static tokenize(text, language) {
    return QuickUseTokenizer.getInstance().tokenize(text, language);
  }

  static tokenizeText(text, language) {
    return QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
  }

  static addCustomDictionary(words, language, priority, name) {
    QuickUseTokenizer.getInstance().addCustomDictionary(words, language, priority, name);
  }

  static removeCustomWord(word, language, name) {
    QuickUseTokenizer.getInstance().removeCustomWord(word, language, name);
  }

  /** Returns the lexicon loader for the current default languages and types. */
  static getLexiconLoader() {
    return LexiconLoader.getInstance({
      languages: QuickUseTokenizer.defaultLanguages,
      types: QuickUseTokenizer.defaultTypes,
    });
  }
}

// Live re-exports from "./core".
Object.defineProperty(exports, "MultilingualTokenizer", {
  enumerable: true,
  get: function () {
    return core.MultilingualTokenizer;
  },
});
Object.defineProperty(exports, "createTokenizer", {
  enumerable: true,
  get: function () {
    return core.createTokenizer;
  },
});

exports.LexiconLoader = LexiconLoader;
exports.QuickUseTokenizer = QuickUseTokenizer;
exports.addCustomDictionary = (words, language, priority, name) =>
  QuickUseTokenizer.addCustomDictionary(words, language, priority, name);
exports.removeCustomWord = (word, language, name) =>
  QuickUseTokenizer.removeCustomWord(word, language, name);
exports.setDefaultLanguages = (languages) => QuickUseTokenizer.setDefaultLanguages(languages);
exports.setDefaultTypes = (types) => QuickUseTokenizer.setDefaultTypes(types);
exports.tokenize = (text, language) => QuickUseTokenizer.tokenize(text, language);
exports.tokenizeText = (text, language) =>
  QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
exports.tokenizeToText = (text, language) => QuickUseTokenizer.tokenizeText(text, language);

// Re-export everything from "./lexicon" that does not collide with a name above.
Object.keys(lexiconExports).forEach(function (key) {
  if (key !== "default" && !Object.prototype.hasOwnProperty.call(exports, key)) {
    Object.defineProperty(exports, key, {
      enumerable: true,
      get: function () {
        return lexiconExports[key];
      },
    });
  }
});
package/lib/index.js CHANGED
@@ -1 +1 @@
1
import { MultilingualTokenizer, createTokenizer } from "./core";
import * as lexiconNamespace from "./lexicon";

export * from "./lexicon";

/**
 * Loads the lexicon strings exported by "./lexicon" for a set of languages
 * and types; one loader is cached per (types, languages) combination.
 */
class LexiconLoader {
  static instances = new Map();
  lexicons = [];

  constructor(config) {
    this.loadLexicons(config);
  }

  /** Returns the cached loader for this configuration, creating it on first use. */
  static getInstance(config) {
    const cacheKey = `${JSON.stringify(config.types)}-${JSON.stringify(config.languages)}`;
    if (!LexiconLoader.instances.has(cacheKey)) {
      LexiconLoader.instances.set(cacheKey, new LexiconLoader(config));
    }
    return LexiconLoader.instances.get(cacheKey);
  }

  /** Rebuilds `this.lexicons` with one entry per language/type pair. */
  loadLexicons(config) {
    const { languages, types } = config;
    this.lexicons = [];
    // Priority per lexicon type; types not listed fall back to 50.
    const priorities = {
      lastName: 100,
      firstName: 100,
      famousName: 100,
      famousWorks: 100,
      country: 80,
      computerTerm: 75,
      city: 70,
      networkTerms: 70,
      medicines: 70,
      transportation: 70,
      luxury: 65,
      pronouns: 65,
      address: 60,
      foods: 60,
      appliances: 60,
      honorific: 50,
      nickname: 50,
      title: 50,
      kinship: 50,
      organization: 50,
      furniture: 55,
      pets: 55,
    };
    for (const language of languages) {
      const prefix = language.replace("-", "_");
      for (const type of types) {
        // Export names look like "zh_CN_LastName".
        const exportName = `${prefix}_${type.charAt(0).toUpperCase() + type.slice(1)}`;
        const raw = lexiconNamespace[exportName] || "";
        this.lexicons.push({
          priority: priorities[type] || 50,
          data: this.parseLexiconString(raw),
          name: `${prefix}_${type}`,
          lang: language,
        });
      }
    }
  }

  /** Splits a lexicon string into a Set of trimmed, non-empty entries. */
  parseLexiconString(raw) {
    // NOTE(review): the separator is an empty string as shown here — confirm
    // a delimiter character was not lost when the bundle was minified.
    const entries = new Set();
    for (const piece of raw.split("")) {
      const trimmed = piece.trim();
      if (trimmed.length > 0) {
        entries.add(trimmed);
      }
    }
    return entries;
  }

  /** Reports the first lexicon (in stored order) that contains `word`. */
  checkWord(word) {
    const hit = this.lexicons.find((lexicon) => lexicon.data.has(word));
    if (hit) {
      return { found: true, lexiconName: hit.name };
    }
    return { found: false, lexiconName: "" };
  }

  /** Appends a lexicon and re-sorts all lexicons by descending priority. */
  addCustomLexicon(lexicon) {
    this.lexicons.push(lexicon);
    this.lexicons.sort((a, b) => b.priority - a.priority);
  }

  getLexicons() {
    return this.lexicons;
  }

  static clearAllInstances() {
    LexiconLoader.instances.clear();
  }
}

/**
 * Static facade around one shared tokenizer created with default language "zh".
 */
class QuickUseTokenizer {
  static instance = null;
  static defaultLanguages = ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"];
  static defaultTypes = [
    "lastName",
    "firstName",
    "famousName",
    "famousWorks",
    "honorific",
    "nickname",
    "title",
    "kinship",
    "organization",
    "country",
    "city",
    "address",
    "computerTerm",
    "networkTerms",
    "pronouns",
    "foods",
    "medicines",
    "luxury",
    "transportation",
    "appliances",
    "furniture",
    "pets",
  ];

  /** Returns the shared tokenizer, creating it on first use. */
  static getInstance() {
    if (!QuickUseTokenizer.instance) {
      const options = { defaultLanguage: "zh" };
      QuickUseTokenizer.instance = createTokenizer(options);
    }
    return QuickUseTokenizer.instance;
  }

  /** Replaces the default languages and discards the shared tokenizer. */
  static setDefaultLanguages(languages) {
    QuickUseTokenizer.defaultLanguages = languages;
    QuickUseTokenizer.instance = null;
  }

  /** Replaces the default lexicon types and discards the shared tokenizer. */
  static setDefaultTypes(types) {
    QuickUseTokenizer.defaultTypes = types;
    QuickUseTokenizer.instance = null;
  }

  static tokenize(text, language) {
    return QuickUseTokenizer.getInstance().tokenize(text, language);
  }

  static tokenizeText(text, language) {
    return QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
  }

  static addCustomDictionary(words, language, priority, name) {
    QuickUseTokenizer.getInstance().addCustomDictionary(words, language, priority, name);
  }

  static removeCustomWord(word, language, name) {
    QuickUseTokenizer.getInstance().removeCustomWord(word, language, name);
  }

  /** Returns the lexicon loader for the current default languages and types. */
  static getLexiconLoader() {
    const config = {
      languages: QuickUseTokenizer.defaultLanguages,
      types: QuickUseTokenizer.defaultTypes,
    };
    return LexiconLoader.getInstance(config);
  }
}

// Module-level shortcuts that delegate to the shared QuickUseTokenizer.
const tokenize = (text, language) => QuickUseTokenizer.tokenize(text, language);
const tokenizeToText = (text, language) => QuickUseTokenizer.tokenizeText(text, language);
const tokenizeText = (text, language) =>
  QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
const addCustomDictionary = (words, language, priority, name) =>
  QuickUseTokenizer.addCustomDictionary(words, language, priority, name);
const removeCustomWord = (word, language, name) =>
  QuickUseTokenizer.removeCustomWord(word, language, name);
const setDefaultLanguages = (languages) => QuickUseTokenizer.setDefaultLanguages(languages);
const setDefaultTypes = (types) => QuickUseTokenizer.setDefaultTypes(types);

export {
  LexiconLoader,
  MultilingualTokenizer,
  QuickUseTokenizer,
  addCustomDictionary,
  createTokenizer,
  removeCustomWord,
  setDefaultLanguages,
  setDefaultTypes,
  tokenize,
  tokenizeText,
  tokenizeToText,
};
1
// Single import from "./core" (the bundle imported it twice).
import { MultilingualTokenizer, createTokenizer } from "./core";
import * as lexiconNamespace from "./lexicon";

export * from "./lexicon";

/**
 * Loads the lexicon strings exported by "./lexicon" for a set of languages
 * and types; one loader is cached per (types, languages) combination.
 */
class LexiconLoader {
  static instances = new Map();
  lexicons = [];

  /** @param {{languages:string[], types:string[]}} config */
  constructor(config) {
    this.loadLexicons(config);
  }

  /**
   * @param {{languages:string[], types:string[]}} config
   * @returns {LexiconLoader} Cached loader for this configuration.
   */
  static getInstance(config) {
    const cacheKey = `${JSON.stringify(config.types)}-${JSON.stringify(config.languages)}`;
    if (!LexiconLoader.instances.has(cacheKey)) {
      LexiconLoader.instances.set(cacheKey, new LexiconLoader(config));
    }
    return LexiconLoader.instances.get(cacheKey);
  }

  /**
   * Rebuilds `this.lexicons` with one entry per language/type pair.
   *
   * @param {{languages:string[], types:string[]}} config
   */
  loadLexicons(config) {
    const { languages, types } = config;
    this.lexicons = [];
    // Priority per lexicon type; types not listed fall back to 50.
    const priorities = {
      lastName: 100,
      firstName: 100,
      famousName: 100,
      famousWorks: 100,
      country: 80,
      computerTerm: 75,
      city: 70,
      networkTerms: 70,
      medicines: 70,
      transportation: 70,
      luxury: 65,
      pronouns: 65,
      address: 60,
      foods: 60,
      appliances: 60,
      honorific: 50,
      nickname: 50,
      title: 50,
      kinship: 50,
      organization: 50,
      furniture: 55,
      pets: 55,
    };
    languages.forEach((language) => {
      const prefix = language.replace("-", "_");
      types.forEach((type) => {
        // Export names look like "zh_CN_LastName".
        const exportName = `${prefix}_${type.charAt(0).toUpperCase() + type.slice(1)}`;
        const raw = lexiconNamespace[exportName] || "";
        this.lexicons.push({
          priority: priorities[type] || 50,
          data: this.parseLexiconString(raw),
          name: `${prefix}_${type}`,
          lang: language,
        });
      });
    });
  }

  /**
   * @param {string} raw
   * @returns {Set<string>} Trimmed, non-empty entries.
   */
  parseLexiconString(raw) {
    // NOTE(review): the separator is an empty string as shown here — confirm
    // a delimiter character was not lost when the bundle was minified.
    const entries = raw
      .split("")
      .map((entry) => entry.trim())
      .filter((entry) => entry.length > 0);
    return new Set(entries);
  }

  /**
   * @param {string} word
   * @returns {{found:boolean, lexiconName:string}} First lexicon (in stored order) containing the word.
   */
  checkWord(word) {
    for (const lexicon of this.lexicons) {
      if (lexicon.data.has(word)) {
        return { found: true, lexiconName: lexicon.name };
      }
    }
    return { found: false, lexiconName: "" };
  }

  /** Appends a lexicon and re-sorts all lexicons by descending priority. */
  addCustomLexicon(lexicon) {
    this.lexicons.push(lexicon);
    this.lexicons.sort((a, b) => b.priority - a.priority);
  }

  getLexicons() {
    return this.lexicons;
  }

  static clearAllInstances() {
    LexiconLoader.instances.clear();
  }
}

/**
 * Static facade around one shared tokenizer (default language "zh") that is
 * pre-loaded with the built-in lexicons on first use.
 */
class QuickUseTokenizer {
  static instance = null;
  static defaultLanguages = ["zh-CN", "zh-TW", "en-US", "ja-JP", "ko-KR"];
  static defaultTypes = [
    "lastName",
    "firstName",
    "famousName",
    "famousWorks",
    "honorific",
    "nickname",
    "title",
    "kinship",
    "organization",
    "country",
    "city",
    "address",
    "computerTerm",
    "networkTerms",
    "pronouns",
    "foods",
    "medicines",
    "luxury",
    "transportation",
    "appliances",
    "furniture",
    "pets",
    "otherNames",
  ];

  /**
   * Returns the shared tokenizer. On first use every loaded lexicon is
   * registered as a custom dictionary under its two-letter language code.
   */
  static getInstance() {
    if (!QuickUseTokenizer.instance) {
      const tokenizer = createTokenizer({ defaultLanguage: "zh" });
      // Debug console.log output (including a hard-coded lookup of one
      // specific word) was removed: a library must not write to the
      // console as a side effect of tokenizing.
      for (const lexicon of QuickUseTokenizer.getLexiconLoader().getLexicons()) {
        // "zh-CN" -> "zh", "en-US" -> "en", ...
        const language = lexicon.lang.replace("-", "").toLowerCase().slice(0, 2);
        tokenizer.addCustomDictionary(
          Array.from(lexicon.data),
          language,
          lexicon.priority,
          lexicon.name
        );
      }
      // Published only after loading succeeded, so a failure cannot leave a
      // half-initialised singleton behind.
      QuickUseTokenizer.instance = tokenizer;
    }
    return QuickUseTokenizer.instance;
  }

  /** Replaces the default languages and discards the shared tokenizer. */
  static setDefaultLanguages(languages) {
    QuickUseTokenizer.defaultLanguages = languages;
    QuickUseTokenizer.instance = null;
  }

  /** Replaces the default lexicon types and discards the shared tokenizer. */
  static setDefaultTypes(types) {
    QuickUseTokenizer.defaultTypes = types;
    QuickUseTokenizer.instance = null;
  }

  static tokenize(text, language) {
    return QuickUseTokenizer.getInstance().tokenize(text, language);
  }

  static tokenizeText(text, language) {
    return QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
  }

  static addCustomDictionary(words, language, priority, name) {
    QuickUseTokenizer.getInstance().addCustomDictionary(words, language, priority, name);
  }

  static removeCustomWord(word, language, name) {
    QuickUseTokenizer.getInstance().removeCustomWord(word, language, name);
  }

  /** Returns the lexicon loader for the current default languages and types. */
  static getLexiconLoader() {
    return LexiconLoader.getInstance({
      languages: QuickUseTokenizer.defaultLanguages,
      types: QuickUseTokenizer.defaultTypes,
    });
  }
}

// Module-level shortcuts that delegate to the shared QuickUseTokenizer.
const tokenize = (text, language) => QuickUseTokenizer.tokenize(text, language);
const tokenizeToText = (text, language) => QuickUseTokenizer.tokenizeText(text, language);
const tokenizeText = (text, language) =>
  QuickUseTokenizer.getInstance().tokenizeText(text, { language: language });
const addCustomDictionary = (words, language, priority, name) =>
  QuickUseTokenizer.addCustomDictionary(words, language, priority, name);
const removeCustomWord = (word, language, name) =>
  QuickUseTokenizer.removeCustomWord(word, language, name);
const setDefaultLanguages = (languages) => QuickUseTokenizer.setDefaultLanguages(languages);
const setDefaultTypes = (types) => QuickUseTokenizer.setDefaultTypes(types);

export {
  LexiconLoader,
  MultilingualTokenizer,
  QuickUseTokenizer,
  addCustomDictionary,
  createTokenizer,
  removeCustomWord,
  setDefaultLanguages,
  setDefaultTypes,
  tokenize,
  tokenizeText,
  tokenizeToText,
};