gs-tokenizer 0.1.7 → 0.1.8
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/core.cjs +1 -1
- package/lib/core.d.ts +1 -7
- package/lib/core.js +1 -1
- package/lib/index.cjs +1 -1
- package/lib/index.d.ts +2 -69
- package/lib/index.js +1 -1
- package/lib/lexicon.cjs +1 -1
- package/lib/lexicon.d.ts +66 -2
- package/lib/lexicon.js +1 -1
- package/package.json +1 -1
package/lib/core.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";class t{dictionaries;constructor(t={}){this.dictionaries=t}detectLanguage(t){return/^[a-zA-Z0-9\s!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]+$/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const n=[],s=t.split(/\b/);for(const t of s)t&&(t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?n.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(n,e)}tagNameTokens(t,e){const n=[];let s=0;for(;s<t.length;){if(s<t.length&&"word"===t[s].type){const i=t[s].txt;if(this.dictionaries[e]){let t=!1;for(const r of this.dictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){n.push({txt:i,type:"word",lang:e,src:r.name}),s++,t=!0;break}if(t)continue}}n.push({txt:t[s].txt,type:t[s].type,lang:e,src:t[s].src||""}),s++}return n}}class e{segmenters;dictionaries;constructor(t={}){this.segmenters=new Map,this.dictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const n=[],s=this.getSegmenter(e);for(const i of s.segment(t)){const{segment:t,isWordLike:s}=i;t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?n.push({txt:t,type:"punctuation",lang:e,src:""}):s?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(n,e)}getSegmenter(t,e="word"){const n=`${t}-${e}`;return this.segmenters.has(n)||this.segmenters.set(n,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(n)}applyCustomDictionary(t,e){const n=this.dictionaries[e]||[];let s=t;if(n.length>0){const t=[];let i=0;for(;i<s.length;){let r=null,o=-1;for(let t=Math.min(5,s.length-i);t>=1;t--){const e=s.slice(i,i+t).map(t=>t.txt).join("");for(const s of n)s.data.has(e)&&(!r||t>r.length||t===r.length&&s.priority>o)&&(r={length:t,text:e},o=s.priority)}r?(t.push({txt:r.text,type:"word",lang:e,src:""}),i+=r.length):(t.push({...s[i],src:""}),i++)}s=t}return s}}class n{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const n=[];let s=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const n=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,n)),index:o.index,end:Math.max(o.end,n)}:(r.push({text:o.text,index:o.index}),o={...e,end:n}):o={...e,end:n}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>s){const r=t.slice(s,i.index);n.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?n.push({txt:i.text,type:"date",lang:e,src:""}):n.push({txt:i.text,type:"other",lang:e,src:""}),s=i.index+i.text.length}if(s<t.length){const i=t.slice(s);n.push({txt:i,type:"other",lang:e,src:""})}return n}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t)||/^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(t))return!0;let e,n,s;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),n=parseInt(t.slice(4,6)),s=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],n=i[1],s=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],n=i[1],s=i[2]):i[2]>31?(n=i[0],s=i[1],e=i[2]):(n=i[0],s=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;n=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,s=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),n=parseInt(i[2]),s=parseInt(i[3])}}return this.isValidDateComponents(e,n,s)}isValidChineseNumberTime(t){const e=/(小时|分钟|秒|毫秒|天|周|月|年)$/,n=t.replace(e,""),s=t.match(e)?.[1]||"";return!(!n||!s)&&/^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(n)}isValidDateComponents(t,e,n){if(e<1||e>12||n<1||n>31)return!1;if(2===e){if(n>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(n>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class s{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?\b/g,r=/\b(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|localhost)(?::\d{1,5})?\b/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"host",text:a[0]});o.sort((t,e)=>t.index-e.index);const d=[];for(let t=0;t<o.length;t++){let e=!1;for(let n=0;n<o.length;n++)if(t!==n){const s=o[t].index===o[n].index&&o[t].endIndex===o[n].endIndex,i=o[t].index>=o[n].index&&o[t].endIndex<=o[n].endIndex;if(s){if("ip"===o[n].type){e=!0;break}}else if(i){e=!0;break}}e||d.push(o[t])}for(const i of d)i.index>s&&n.push({txt:t.substring(s,i.index),type:"other",lang:e}),n.push({txt:i.text,type:i.type,lang:"en"}),s=i.endIndex;return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e}),n}}class r{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/(#[\p{L}\p{N}_]+)|(@[\p{L}\p{N}_]+)/gu;let r;for(;null!==(r=i.exec(t));){r.index>s&&n.push({txt:t.substring(s,r.index),type:"other",lang:e,src:""});const i=r[0],o=i.startsWith("#")?"hashtag":"mention";n.push({txt:i,type:o,lang:e,src:"social"}),s=r.index+i.length}return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e,src:""}),n}}class o{CHINESE_NUMBERS=new Set(["零","一","二","三","四","五","六","七","八","九","十","百","千","万","亿","壹","贰","叁","肆","伍","陆","柒","捌","玖","拾","佰","仟"]);CHINESE_SURNAMES=new Set(["张","李","王","刘","陈","杨","赵","黄","周","吴","徐","孙","胡","朱","高","林","何","郭","马","罗","梁","宋","郑","谢","韩","唐","冯","于","董","萧","程","曹","袁","邓","许","傅","沈","曾","彭","吕","苏","卢","蒋","蔡","贾","丁","魏","薛","叶","阎","余","潘","杜","戴","夏","钟","汪","田","任","姜","范","方","石","姚","谭","廖","邹","熊","金","陆","郝","孔","白","崔","康","毛","邱","秦","江","史","顾","侯","邵","孟","龙","万","段","雷","钱","汤","尹","黎","易","常","武","乔","贺","赖","龚","文"]);UNITS=new Set(["公斤","英里","克","千克","吨","米","厘米","毫米","公里","斤","两","元","角","分","小时","分钟","秒","折","折扣","卷","券","美元","人民币","元","角","分","亩","公顷","平方米","平方分米","平方厘米","立方厘米","升","毫升","天","周","月","年","岁","度","瓦","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛","牛顿","帕斯卡","巴","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","g","mg","t","km","m","cm","mm","μm","nm","L","mL","l","ml","h","min","s","d","w","y","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","B","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"]);ORDINAL_PREFIXES=new Set(["第","No.","No","no.","no"]);detectLanguage(t){return""}tokenize(t,e){if(!t)return[];const n=[];let s=0;const i=t.length;for(;s<i;){let e=!1,r=this.findOrdinalPrefix(t,s);if(r){const{prefix:o,prefixEnd:a}=r;let d=a;for(;d<i&&""===t[d].trim();)d++;const c=this.findNumber(t,d);if(c){let r=c.end;for(;r<i&&""===t[r].trim();)r++;const o=this.findUnit(t,r);o&&(r=o.end),n.push({start:s,end:r,txt:t.substring(s,r)}),s=r,e=!0}}if(!e){const i=this.findNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}e||s++}if(0===n.length)return[{txt:t,type:"other",lang:e,src:""}];const r=[];let o=0;const a=[];for(const t of n){let e=!1;for(const[n,s]of a.entries())if(t.start<=s.end&&t.end>=s.start){t.end-t.start>s.end-s.start&&(a[n]=t),e=!0;break}e||a.push(t)}a.sort((t,e)=>t.start-e.start);for(const n of a)n.start>o&&r.push({txt:t.substring(o,n.start),type:"other",lang:e,src:""}),r.push({txt:n.txt,type:"number",lang:e,src:""}),o=n.end;return o<i&&r.push({txt:t.substring(o),type:"other",lang:e,src:""}),r}findOrdinalPrefix(t,e){for(const n of this.ORDINAL_PREFIXES)if(t.startsWith(n,e))return{prefix:n,prefixEnd:e+n.length};return null}findNumber(t,e){const n=[/^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,/^[+-]?\d+(\.\d+)?%/,/^[+-]?\d+(\.\d+)?‰/,/^[+-]?\d+\.\d+/,/^[+-]?\.\d+/,/^[+-]?\d+/];for(const s of n){const i=s.exec(t.substring(e));if(i){const r=e+i[0].length;if(r<t.length&&/[a-zA-Z]/.test(t[r])){if(s===n[0])return{end:r};continue}return{end:r}}}return null}findChineseNumber(t,e){let n=e;for(;n<t.length&&this.CHINESE_NUMBERS.has(t[n]);)n++;const s=n-e;if(s>1)return{end:n};if(1===s){t[e];const s=e>0?t[e-1]:"",i=e+1<t.length?t[e+1]:"",r=this.findUnit(t,e+1);if(s&&this.isNumberRelatedChar(s)||r)return{end:n};if(i&&this.CHINESE_NUMBERS.has(i))return{end:n}}return null}isNumberRelatedChar(t){return this.CHINESE_NUMBERS.has(t)||this.UNITS.has(t)||t>="0"&&t<="9"||"."===t||"e"===t||"E"===t||"+"===t||"-"===t}findUnit(t,e){let n=0;for(let s=e;s<Math.min(e+5,t.length);s++){const i=t.substring(e,s+1);if(this.UNITS.has(i)){const i=t[s+1];(!i||!/[a-zA-Z0-9]/.test(i))&&(n=s+1-e)}}return n>0?{end:e+n}:null}findNumberWithUnit(t,e){const n=this.findNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}findChineseNumberWithUnit(t,e){const n=this.findChineseNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}isNumberChar(t,e,n){if(t>="0"&&t<="9")return!0;if("."===t){const t=e>0?n[e-1]:"",s=e<n.length-1?n[e+1]:"";return t>="0"&&t<="9"||s>="0"&&s<="9"}if(("e"===t||"E"===t)&&e>0){const t=n[e-1];return t>="0"&&t<="9"||"."===t}if(("+"===t||"-"===t)&&e>0){const t=n[e-1];return"e"===t||"E"===t}if(this.CHINESE_NUMBERS.has(t))return!0;for(const s of this.ORDINAL_PREFIXES)if(s.includes(t)&&n.startsWith(s,e))return!0;for(const t of this.UNITS)if(n.startsWith(t,e))return!0;if(("+"===t||"-"===t)&&e>1){const t=n[e-1],s=n[e-2];return("e"===t||"E"===t)&&(s>="0"&&s<="9"||"."===s)}return!1}}class a{tokenizers;dictionaries;defaultLanguage;constructor(s={}){this.dictionaries=s.dictionaries||{},this.defaultLanguage=s.defaultLanguage||"en",this.tokenizers=[new n,new i,new r,new o,new t,new e(this.dictionaries)]}addDictionary(t,n,i,r){const o=void 0!==i?i:200;let a=r;if(!a&&t.length>0){const e=t.find(t=>""!==t.trim())||"";a=s.detectLanguage(e)}const d=a||this.defaultLanguage;this.dictionaries[d]||(this.dictionaries[d]=[]);const c=this.dictionaries[d].findIndex(t=>t.name===n&&t.lang===d&&t.priority===o);if(c>=0){const e=this.dictionaries[d][c];t.forEach(t=>e.data.add(t))}else this.dictionaries[d].push({priority:o,data:new Set(t),name:n,lang:d});const u=this.tokenizers.find(t=>t instanceof e);u&&(u.dictionaries=this.dictionaries)}removeCustomWord(t,e,n){if(e){if(this.dictionaries[e])if(n){const s=this.dictionaries[e].find(t=>t.name===n);s&&s.data.delete(t)}else this.dictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.dictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(a,d){const c=d||s.detectLanguage(a),u=this.tokenizers.find(t=>t instanceof n);if(!u)return[];const h=u.tokenize(a,c),l=[],f=this.tokenizers.find(t=>t instanceof i);if(!f)return l;for(const n of h)if("date"===n.type)l.push(n);else{const s=f.tokenize(n.txt,c),i=this.tokenizers.find(t=>t instanceof r);if(!i)return l;for(const n of s)if("host"===n.type||"ip"===n.type)l.push(n);else{const s=i.tokenize(n.txt,c),r=this.tokenizers.find(t=>t instanceof o);if(!r)return l;for(const n of s)if("hashtag"===n.type||"mention"===n.type)l.push(n);else{const s=r.tokenize(n.txt,c);for(const n of s)if("number"===n.type)l.push(n);else{let s=[];if("en"===c){const e=this.tokenizers.find(e=>e instanceof t);e&&(s=e.tokenize(n.txt,c))}else if(["zh","ja","ko"].includes(c)){const i=n.txt,r=this.tokenizers.find(t=>t instanceof e);if(r){const e=r.tokenize(i,c);if(e.length>0)s=e;else{const e=[];let n=0;const o=/[a-zA-Z0-9]+[a-zA-Z0-9_-]*|[a-zA-Z0-9]|[a-zA-Z]+/g;let a;for(;null!==(a=o.exec(i));){if(a.index>n){const t=i.substring(n,a.index),s=r.tokenize(t,c);e.push(...s)}const s=a[0],o=this.tokenizers.find(e=>e instanceof t);if(o){const t=o.tokenize(s,"en");e.push(...t)}n=a.index+a[0].length}if(n<i.length){const t=i.substring(n),s=r.tokenize(t,c);e.push(...s)}s=e}}}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,c))}s.length>0?l.push(...s):l.push(n)}}}}const p=[];let g=null;for(const t of l)"punctuation"===t.type?g?g.txt+=t.txt:g={...t}:(g&&(p.push(g),g=null),p.push(t));return g&&p.push(g),p}tokenizeText(t,e){const n=this.tokenize(t,e?.language),s=["space",...e?.excludeTypes||[]];return n.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type)||s.includes(t.type)||"punctuation"===t.type&&t.txt.replace(/\s/g,"").length<=1)).map(t=>t.txt)}get loadedLexiconNames(){const t=new Set;for(const e in this.dictionaries)Object.prototype.hasOwnProperty.call(this.dictionaries,e)&&this.dictionaries[e].forEach(e=>{t.add(e.name)});return Array.from(t)}}exports.CJKTokenizer=e,exports.DateTokenizer=n,exports.EnglishTokenizer=t,exports.LanguageDetector=s,exports.MultilingualTokenizer=a,exports.createTokenizer=function(t){return new a(t)};
|
|
1
|
+
"use strict";class t{dictionaries;constructor(t={}){this.dictionaries=t}detectLanguage(t){return/^[a-zA-Z0-9\s!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]+$/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const n=[],s=t.split(/\b/);for(const t of s)t&&(t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?n.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(n,e)}tagNameTokens(t,e){const n=[];let s=0;for(;s<t.length;){if(s<t.length&&"word"===t[s].type){const i=t[s].txt;if(this.dictionaries[e]){let t=!1;for(const r of this.dictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){n.push({txt:i,type:"word",lang:e,src:r.name}),s++,t=!0;break}if(t)continue}}n.push({txt:t[s].txt,type:t[s].type,lang:e,src:t[s].src||""}),s++}return n}}class e{segmenters;dictionaries;constructor(t={}){this.segmenters=new Map,this.dictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const n=[],s=this.getSegmenter(e);for(const i of s.segment(t)){const{segment:t,isWordLike:s}=i;t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?n.push({txt:t,type:"punctuation",lang:e,src:""}):s?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(n,e)}getSegmenter(t,e="word"){const n=`${t}-${e}`;return this.segmenters.has(n)||this.segmenters.set(n,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(n)}applyCustomDictionary(t,e){const n=this.dictionaries[e]||[];let s=t;if(n.length>0){const t=[];let i=0;for(;i<s.length;){let r=null,o=-1;for(let t=Math.min(5,s.length-i);t>=1;t--){const e=s.slice(i,i+t).map(t=>t.txt).join("");for(const s of n)s.data.has(e)&&(!r||t>r.length||t===r.length&&s.priority>o)&&(r={length:t,text:e},o=s.priority)}r?(t.push({txt:r.text,type:"word",lang:e,src:""}),i+=r.length):(t.push({...s[i],src:""}),i++)}s=t}return s}}class n{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const n=[];let s=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const n=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,n)),index:o.index,end:Math.max(o.end,n)}:(r.push({text:o.text,index:o.index}),o={...e,end:n}):o={...e,end:n}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>s){const r=t.slice(s,i.index);n.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?n.push({txt:i.text,type:"date",lang:e,src:""}):n.push({txt:i.text,type:"other",lang:e,src:""}),s=i.index+i.text.length}if(s<t.length){const i=t.slice(s);n.push({txt:i,type:"other",lang:e,src:""})}return n}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t)||/^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(t))return!0;let e,n,s;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),n=parseInt(t.slice(4,6)),s=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],n=i[1],s=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],n=i[1],s=i[2]):i[2]>31?(n=i[0],s=i[1],e=i[2]):(n=i[0],s=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;n=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,s=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),n=parseInt(i[2]),s=parseInt(i[3])}}return this.isValidDateComponents(e,n,s)}isValidChineseNumberTime(t){const e=/(小时|分钟|秒|毫秒|天|周|月|年)$/,n=t.replace(e,""),s=t.match(e)?.[1]||"";return!(!n||!s)&&/^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(n)}isValidDateComponents(t,e,n){if(e<1||e>12||n<1||n>31)return!1;if(2===e){if(n>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(n>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class s{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?\b/g,r=/\b(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|localhost)(?::\d{1,5})?\b/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"host",text:a[0]});o.sort((t,e)=>t.index-e.index);const d=[];for(let t=0;t<o.length;t++){let e=!1;for(let n=0;n<o.length;n++)if(t!==n){const s=o[t].index===o[n].index&&o[t].endIndex===o[n].endIndex,i=o[t].index>=o[n].index&&o[t].endIndex<=o[n].endIndex;if(s){if("ip"===o[n].type){e=!0;break}}else if(i){e=!0;break}}e||d.push(o[t])}for(const i of d)i.index>s&&n.push({txt:t.substring(s,i.index),type:"other",lang:e}),n.push({txt:i.text,type:i.type,lang:"en"}),s=i.endIndex;return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e}),n}}class r{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/(#[\p{L}\p{N}_]+)|(@[\p{L}\p{N}_]+)/gu;let r;for(;null!==(r=i.exec(t));){r.index>s&&n.push({txt:t.substring(s,r.index),type:"other",lang:e,src:""});const i=r[0],o=i.startsWith("#")?"hashtag":"mention";n.push({txt:i,type:o,lang:e,src:"social"}),s=r.index+i.length}return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e,src:""}),n}}class o{CHINESE_NUMBERS=new Set(["零","一","二","三","四","五","六","七","八","九","十","百","千","万","亿","壹","贰","叁","肆","伍","陆","柒","捌","玖","拾","佰","仟"]);CHINESE_SURNAMES=new Set(["张","李","王","刘","陈","杨","赵","黄","周","吴","徐","孙","胡","朱","高","林","何","郭","马","罗","梁","宋","郑","谢","韩","唐","冯","于","董","萧","程","曹","袁","邓","许","傅","沈","曾","彭","吕","苏","卢","蒋","蔡","贾","丁","魏","薛","叶","阎","余","潘","杜","戴","夏","钟","汪","田","任","姜","范","方","石","姚","谭","廖","邹","熊","金","陆","郝","孔","白","崔","康","毛","邱","秦","江","史","顾","侯","邵","孟","龙","万","段","雷","钱","汤","尹","黎","易","常","武","乔","贺","赖","龚","文"]);UNITS=new Set(["公斤","英里","克","千克","吨","米","厘米","毫米","公里","斤","两","元","角","分","小时","分钟","秒","折","折扣","卷","券","美元","人民币","元","角","分","亩","公顷","平方米","平方分米","平方厘米","立方厘米","升","毫升","天","周","月","年","岁","度","瓦","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛","牛顿","帕斯卡","巴","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","g","mg","t","km","m","cm","mm","μm","nm","L","mL","l","ml","h","min","s","d","w","y","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","B","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"]);ORDINAL_PREFIXES=new Set(["第","No.","No","no.","no"]);detectLanguage(t){return""}tokenize(t,e){if(!t)return[];const n=[];let s=0;const i=t.length;for(;s<i;){let e=!1,r=this.findOrdinalPrefix(t,s);if(r){const{prefix:o,prefixEnd:a}=r;let d=a;for(;d<i&&""===t[d].trim();)d++;const c=this.findNumber(t,d);if(c){let r=c.end;for(;r<i&&""===t[r].trim();)r++;const o=this.findUnit(t,r);o&&(r=o.end),n.push({start:s,end:r,txt:t.substring(s,r)}),s=r,e=!0}}if(!e){const i=this.findNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}e||s++}if(0===n.length)return[{txt:t,type:"other",lang:e,src:""}];const r=[];let o=0;const a=[];for(const t of n){let e=!1;for(const[n,s]of a.entries())if(t.start<=s.end&&t.end>=s.start){t.end-t.start>s.end-s.start&&(a[n]=t),e=!0;break}e||a.push(t)}a.sort((t,e)=>t.start-e.start);for(const n of a)n.start>o&&r.push({txt:t.substring(o,n.start),type:"other",lang:e,src:""}),r.push({txt:n.txt,type:"number",lang:e,src:""}),o=n.end;return o<i&&r.push({txt:t.substring(o),type:"other",lang:e,src:""}),r}findOrdinalPrefix(t,e){for(const n of this.ORDINAL_PREFIXES)if(t.startsWith(n,e))return{prefix:n,prefixEnd:e+n.length};return null}findNumber(t,e){const n=[/^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,/^[+-]?\d+(\.\d+)?%/,/^[+-]?\d+(\.\d+)?‰/,/^[+-]?\d+\.\d+/,/^[+-]?\.\d+/,/^[+-]?\d+/];for(const s of n){const i=s.exec(t.substring(e));if(i){const r=e+i[0].length;if(r<t.length&&/[a-zA-Z]/.test(t[r])){if(s===n[0])return{end:r};continue}return{end:r}}}return null}findChineseNumber(t,e){let n=e;for(;n<t.length&&this.CHINESE_NUMBERS.has(t[n]);)n++;const s=n-e;if(s>1)return{end:n};if(1===s){t[e];const s=e>0?t[e-1]:"",i=e+1<t.length?t[e+1]:"",r=this.findUnit(t,e+1);if(s&&this.isNumberRelatedChar(s)||r)return{end:n};if(i&&this.CHINESE_NUMBERS.has(i))return{end:n}}return null}isNumberRelatedChar(t){return this.CHINESE_NUMBERS.has(t)||this.UNITS.has(t)||t>="0"&&t<="9"||"."===t||"e"===t||"E"===t||"+"===t||"-"===t}findUnit(t,e){let n=0;for(let s=e;s<Math.min(e+5,t.length);s++){const i=t.substring(e,s+1);if(this.UNITS.has(i)){const i=t[s+1];(!i||!/[a-zA-Z0-9]/.test(i))&&(n=s+1-e)}}return n>0?{end:e+n}:null}findNumberWithUnit(t,e){const n=this.findNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}findChineseNumberWithUnit(t,e){const n=this.findChineseNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}isNumberChar(t,e,n){if(t>="0"&&t<="9")return!0;if("."===t){const t=e>0?n[e-1]:"",s=e<n.length-1?n[e+1]:"";return t>="0"&&t<="9"||s>="0"&&s<="9"}if(("e"===t||"E"===t)&&e>0){const t=n[e-1];return t>="0"&&t<="9"||"."===t}if(("+"===t||"-"===t)&&e>0){const t=n[e-1];return"e"===t||"E"===t}if(this.CHINESE_NUMBERS.has(t))return!0;for(const s of this.ORDINAL_PREFIXES)if(s.includes(t)&&n.startsWith(s,e))return!0;for(const t of this.UNITS)if(n.startsWith(t,e))return!0;if(("+"===t||"-"===t)&&e>1){const t=n[e-1],s=n[e-2];return("e"===t||"E"===t)&&(s>="0"&&s<="9"||"."===s)}return!1}}exports.CJKTokenizer=e,exports.DateTokenizer=n,exports.EnglishTokenizer=t,exports.LanguageDetector=s,exports.MultilingualTokenizer=class{tokenizers;dictionaries;defaultLanguage;constructor(s={}){this.dictionaries=s.dictionaries||{},this.defaultLanguage=s.defaultLanguage||"en",this.tokenizers=[new n,new i,new r,new o,new t,new e(this.dictionaries)]}addDictionary(t,n,i,r){const o=void 0!==i?i:200;let a=r;if(!a&&t.length>0){const e=t.find(t=>""!==t.trim())||"";a=s.detectLanguage(e)}const d=a||this.defaultLanguage;this.dictionaries[d]||(this.dictionaries[d]=[]);const c=this.dictionaries[d].findIndex(t=>t.name===n&&t.lang===d&&t.priority===o);if(c>=0){const e=this.dictionaries[d][c];t.forEach(t=>e.data.add(t))}else this.dictionaries[d].push({priority:o,data:new Set(t),name:n,lang:d});const u=this.tokenizers.find(t=>t instanceof e);u&&(u.dictionaries=this.dictionaries)}removeCustomWord(t,e,n){if(e){if(this.dictionaries[e])if(n){const s=this.dictionaries[e].find(t=>t.name===n);s&&s.data.delete(t)}else this.dictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.dictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(a,d){const c=d||s.detectLanguage(a),u=this.tokenizers.find(t=>t instanceof n);if(!u)return[];const h=u.tokenize(a,c),l=[],f=this.tokenizers.find(t=>t instanceof i);if(!f)return l;for(const n of h)if("date"===n.type)l.push(n);else{const s=f.tokenize(n.txt,c),i=this.tokenizers.find(t=>t instanceof r);if(!i)return l;for(const n of s)if("host"===n.type||"ip"===n.type)l.push(n);else{const s=i.tokenize(n.txt,c),r=this.tokenizers.find(t=>t instanceof o);if(!r)return l;for(const n of s)if("hashtag"===n.type||"mention"===n.type)l.push(n);else{const s=r.tokenize(n.txt,c);for(const n of s)if("number"===n.type)l.push(n);else{let s=[];if("en"===c){const e=this.tokenizers.find(e=>e instanceof t);e&&(s=e.tokenize(n.txt,c))}else if(["zh","ja","ko"].includes(c)){const i=n.txt,r=this.tokenizers.find(t=>t instanceof e);if(r){const e=r.tokenize(i,c);if(e.length>0)s=e;else{const e=[];let n=0;const o=/[a-zA-Z0-9]+[a-zA-Z0-9_-]*|[a-zA-Z0-9]|[a-zA-Z]+/g;let a;for(;null!==(a=o.exec(i));){if(a.index>n){const t=i.substring(n,a.index),s=r.tokenize(t,c);e.push(...s)}const s=a[0],o=this.tokenizers.find(e=>e instanceof t);if(o){const t=o.tokenize(s,"en");e.push(...t)}n=a.index+a[0].length}if(n<i.length){const t=i.substring(n),s=r.tokenize(t,c);e.push(...s)}s=e}}}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,c))}s.length>0?l.push(...s):l.push(n)}}}}const p=[];let g=null;for(const t of l)"punctuation"===t.type?g?g.txt+=t.txt:g={...t}:(g&&(p.push(g),g=null),p.push(t));return g&&p.push(g),p}tokenizeText(t,e){const n=this.tokenize(t,e?.language),s=["space",...e?.excludeTypes||[]];return n.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type)||s.includes(t.type)||"punctuation"===t.type&&t.txt.replace(/\s/g,"").length<=1)).map(t=>t.txt)}get loadedLexiconNames(){const t=new Set;for(const e in this.dictionaries)Object.prototype.hasOwnProperty.call(this.dictionaries,e)&&this.dictionaries[e].forEach(e=>{t.add(e.name)});return Array.from(t)}};
|
package/lib/core.d.ts
CHANGED
|
@@ -225,12 +225,6 @@ declare class MultilingualTokenizer {
|
|
|
225
225
|
*/
|
|
226
226
|
get loadedLexiconNames(): string[];
|
|
227
227
|
}
|
|
228
|
-
/**
|
|
229
|
-
* 创建多语言分词器实例的工厂函数
|
|
230
|
-
* @param options - 分词器配置选项
|
|
231
|
-
* @returns MultilingualTokenizer实例
|
|
232
|
-
*/
|
|
233
|
-
declare function createTokenizer(options?: TokenizerOptions): MultilingualTokenizer;
|
|
234
228
|
|
|
235
|
-
export { CJKTokenizer, DateTokenizer, EnglishTokenizer, LanguageDetector, MultilingualTokenizer
|
|
229
|
+
export { CJKTokenizer, DateTokenizer, EnglishTokenizer, LanguageDetector, MultilingualTokenizer };
|
|
236
230
|
export type { LanguageTokenizer, LexiconEntry, Token, TokenizeTextOptions, TokenizerOptions };
|
package/lib/core.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
class t{dictionaries;constructor(t={}){this.dictionaries=t}detectLanguage(t){return/^[a-zA-Z0-9\s!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]+$/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const n=[],s=t.split(/\b/);for(const t of s)t&&(t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?n.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(n,e)}tagNameTokens(t,e){const n=[];let s=0;for(;s<t.length;){if(s<t.length&&"word"===t[s].type){const i=t[s].txt;if(this.dictionaries[e]){let t=!1;for(const r of this.dictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){n.push({txt:i,type:"word",lang:e,src:r.name}),s++,t=!0;break}if(t)continue}}n.push({txt:t[s].txt,type:t[s].type,lang:e,src:t[s].src||""}),s++}return n}}class e{segmenters;dictionaries;constructor(t={}){this.segmenters=new Map,this.dictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const n=[],s=this.getSegmenter(e);for(const i of s.segment(t)){const{segment:t,isWordLike:s}=i;t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?n.push({txt:t,type:"punctuation",lang:e,src:""}):s?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(n,e)}getSegmenter(t,e="word"){const n=`${t}-${e}`;return this.segmenters.has(n)||this.segmenters.set(n,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(n)}applyCustomDictionary(t,e){const n=this.dictionaries[e]||[];let s=t;if(n.length>0){const t=[];let i=0;for(;i<s.length;){let r=null,o=-1;for(let t=Math.min(5,s.length-i);t>=1;t--){const e=s.slice(i,i+t).map(t=>t.txt).join("");for(const s of n)s.data.has(e)&&(!r||t>r.length||t===r.length&&s.priority>o)&&(r={length:t,text:e},o=s.priority)}r?(t.push({txt:r.text,type:"word",lang:e,src:""}),i+=r.length):(t.push({...s[i],src:""}),i++)}s=t}return s}}class n{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const n=[];let s=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const n=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,n)),index:o.index,end:Math.max(o.end,n)}:(r.push({text:o.text,index:o.index}),o={...e,end:n}):o={...e,end:n}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>s){const r=t.slice(s,i.index);n.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?n.push({txt:i.text,type:"date",lang:e,src:""}):n.push({txt:i.text,type:"other",lang:e,src:""}),s=i.index+i.text.length}if(s<t.length){const i=t.slice(s);n.push({txt:i,type:"other",lang:e,src:""})}return n}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t)||/^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(t))return!0;let e,n,s;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),n=parseInt(t.slice(4,6)),s=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],n=i[1],s=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],n=i[1],s=i[2]):i[2]>31?(n=i[0],s=i[1],e=i[2]):(n=i[0],s=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;n=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,s=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),n=parseInt(i[2]),s=parseInt(i[3])}}return this.isValidDateComponents(e,n,s)}isValidChineseNumberTime(t){const e=/(小时|分钟|秒|毫秒|天|周|月|年)$/,n=t.replace(e,""),s=t.match(e)?.[1]||"";return!(!n||!s)&&/^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(n)}isValidDateComponents(t,e,n){if(e<1||e>12||n<1||n>31)return!1;if(2===e){if(n>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(n>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class s{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?\b/g,r=/\b(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|localhost)(?::\d{1,5})?\b/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"host",text:a[0]});o.sort((t,e)=>t.index-e.index);const d=[];for(let t=0;t<o.length;t++){let e=!1;for(let n=0;n<o.length;n++)if(t!==n){const s=o[t].index===o[n].index&&o[t].endIndex===o[n].endIndex,i=o[t].index>=o[n].index&&o[t].endIndex<=o[n].endIndex;if(s){if("ip"===o[n].type){e=!0;break}}else if(i){e=!0;break}}e||d.push(o[t])}for(const i of d)i.index>s&&n.push({txt:t.substring(s,i.index),type:"other",lang:e}),n.push({txt:i.text,type:i.type,lang:"en"}),s=i.endIndex;return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e}),n}}class r{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/(#[\p{L}\p{N}_]+)|(@[\p{L}\p{N}_]+)/gu;let r;for(;null!==(r=i.exec(t));){r.index>s&&n.push({txt:t.substring(s,r.index),type:"other",lang:e,src:""});const i=r[0],o=i.startsWith("#")?"hashtag":"mention";n.push({txt:i,type:o,lang:e,src:"social"}),s=r.index+i.length}return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e,src:""}),n}}class o{CHINESE_NUMBERS=new Set(["零","一","二","三","四","五","六","七","八","九","十","百","千","万","亿","壹","贰","叁","肆","伍","陆","柒","捌","玖","拾","佰","仟"]);CHINESE_SURNAMES=new Set(["张","李","王","刘","陈","杨","赵","黄","周","吴","徐","孙","胡","朱","高","林","何","郭","马","罗","梁","宋","郑","谢","韩","唐","冯","于","董","萧","程","曹","袁","邓","许","傅","沈","曾","彭","吕","苏","卢","蒋","蔡","贾","丁","魏","薛","叶","阎","余","潘","杜","戴","夏","钟","汪","田","任","姜","范","方","石","姚","谭","廖","邹","熊","金","陆","郝","孔","白","崔","康","毛","邱","秦","江","史","顾","侯","邵","孟","龙","万","段","雷","钱","汤","尹","黎","易","常","武","乔","贺","赖","龚","文"]);UNITS=new Set(["公斤","英里","克","千克","吨","米","厘米","毫米","公里","斤","两","元","角","分","小时","分钟","秒","折","折扣","卷","券","美元","人民币","元","角","分","亩","公顷","平方米","平方分米","平方厘米","立方厘米","升","毫升","天","周","月","年","岁","度","瓦","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛","牛顿","帕斯卡","巴","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","g","mg","t","km","m","cm","mm","μm","nm","L","mL","l","ml","h","min","s","d","w","y","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","B","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"]);ORDINAL_PREFIXES=new Set(["第","No.","No","no.","no"]);detectLanguage(t){return""}tokenize(t,e){if(!t)return[];const n=[];let s=0;const i=t.length;for(;s<i;){let e=!1,r=this.findOrdinalPrefix(t,s);if(r){const{prefix:o,prefixEnd:a}=r;let d=a;for(;d<i&&""===t[d].trim();)d++;const c=this.findNumber(t,d);if(c){let r=c.end;for(;r<i&&""===t[r].trim();)r++;const o=this.findUnit(t,r);o&&(r=o.end),n.push({start:s,end:r,txt:t.substring(s,r)}),s=r,e=!0}}if(!e){const i=this.findNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}e||s++}if(0===n.length)return[{txt:t,type:"other",lang:e,src:""}];const r=[];let o=0;const a=[];for(const t of n){let e=!1;for(const[n,s]of a.entries())if(t.start<=s.end&&t.end>=s.start){t.end-t.start>s.end-s.start&&(a[n]=t),e=!0;break}e||a.push(t)}a.sort((t,e)=>t.start-e.start);for(const n of a)n.start>o&&r.push({txt:t.substring(o,n.start),type:"other",lang:e,src:""}),r.push({txt:n.txt,type:"number",lang:e,src:""}),o=n.end;return o<i&&r.push({txt:t.substring(o),type:"other",lang:e,src:""}),r}findOrdinalPrefix(t,e){for(const n of this.ORDINAL_PREFIXES)if(t.startsWith(n,e))return{prefix:n,prefixEnd:e+n.length};return null}findNumber(t,e){const n=[/^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,/^[+-]?\d+(\.\d+)?%/,/^[+-]?\d+(\.\d+)?‰/,/^[+-]?\d+\.\d+/,/^[+-]?\.\d+/,/^[+-]?\d+/];for(const s of n){const i=s.exec(t.substring(e));if(i){const r=e+i[0].length;if(r<t.length&&/[a-zA-Z]/.test(t[r])){if(s===n[0])return{end:r};continue}return{end:r}}}return null}findChineseNumber(t,e){let n=e;for(;n<t.length&&this.CHINESE_NUMBERS.has(t[n]);)n++;const s=n-e;if(s>1)return{end:n};if(1===s){t[e];const s=e>0?t[e-1]:"",i=e+1<t.length?t[e+1]:"",r=this.findUnit(t,e+1);if(s&&this.isNumberRelatedChar(s)||r)return{end:n};if(i&&this.CHINESE_NUMBERS.has(i))return{end:n}}return null}isNumberRelatedChar(t){return this.CHINESE_NUMBERS.has(t)||this.UNITS.has(t)||t>="0"&&t<="9"||"."===t||"e"===t||"E"===t||"+"===t||"-"===t}findUnit(t,e){let n=0;for(let s=e;s<Math.min(e+5,t.length);s++){const i=t.substring(e,s+1);if(this.UNITS.has(i)){const i=t[s+1];(!i||!/[a-zA-Z0-9]/.test(i))&&(n=s+1-e)}}return n>0?{end:e+n}:null}findNumberWithUnit(t,e){const n=this.findNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}findChineseNumberWithUnit(t,e){const n=this.findChineseNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}isNumberChar(t,e,n){if(t>="0"&&t<="9")return!0;if("."===t){const t=e>0?n[e-1]:"",s=e<n.length-1?n[e+1]:"";return t>="0"&&t<="9"||s>="0"&&s<="9"}if(("e"===t||"E"===t)&&e>0){const t=n[e-1];return t>="0"&&t<="9"||"."===t}if(("+"===t||"-"===t)&&e>0){const t=n[e-1];return"e"===t||"E"===t}if(this.CHINESE_NUMBERS.has(t))return!0;for(const s of this.ORDINAL_PREFIXES)if(s.includes(t)&&n.startsWith(s,e))return!0;for(const t of this.UNITS)if(n.startsWith(t,e))return!0;if(("+"===t||"-"===t)&&e>1){const t=n[e-1],s=n[e-2];return("e"===t||"E"===t)&&(s>="0"&&s<="9"||"."===s)}return!1}}class a{tokenizers;dictionaries;defaultLanguage;constructor(s={}){this.dictionaries=s.dictionaries||{},this.defaultLanguage=s.defaultLanguage||"en",this.tokenizers=[new n,new i,new r,new o,new t,new e(this.dictionaries)]}addDictionary(t,n,i,r){const o=void 0!==i?i:200;let a=r;if(!a&&t.length>0){const e=t.find(t=>""!==t.trim())||"";a=s.detectLanguage(e)}const d=a||this.defaultLanguage;this.dictionaries[d]||(this.dictionaries[d]=[]);const c=this.dictionaries[d].findIndex(t=>t.name===n&&t.lang===d&&t.priority===o);if(c>=0){const e=this.dictionaries[d][c];t.forEach(t=>e.data.add(t))}else this.dictionaries[d].push({priority:o,data:new Set(t),name:n,lang:d});const u=this.tokenizers.find(t=>t instanceof e);u&&(u.dictionaries=this.dictionaries)}removeCustomWord(t,e,n){if(e){if(this.dictionaries[e])if(n){const s=this.dictionaries[e].find(t=>t.name===n);s&&s.data.delete(t)}else this.dictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.dictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(a,d){const c=d||s.detectLanguage(a),u=this.tokenizers.find(t=>t instanceof n);if(!u)return[];const h=u.tokenize(a,c),l=[],f=this.tokenizers.find(t=>t instanceof i);if(!f)return l;for(const n of h)if("date"===n.type)l.push(n);else{const s=f.tokenize(n.txt,c),i=this.tokenizers.find(t=>t instanceof r);if(!i)return l;for(const n of s)if("host"===n.type||"ip"===n.type)l.push(n);else{const s=i.tokenize(n.txt,c),r=this.tokenizers.find(t=>t instanceof o);if(!r)return l;for(const n of s)if("hashtag"===n.type||"mention"===n.type)l.push(n);else{const s=r.tokenize(n.txt,c);for(const n of s)if("number"===n.type)l.push(n);else{let s=[];if("en"===c){const e=this.tokenizers.find(e=>e instanceof t);e&&(s=e.tokenize(n.txt,c))}else if(["zh","ja","ko"].includes(c)){const i=n.txt,r=this.tokenizers.find(t=>t instanceof e);if(r){const e=r.tokenize(i,c);if(e.length>0)s=e;else{const e=[];let n=0;const o=/[a-zA-Z0-9]+[a-zA-Z0-9_-]*|[a-zA-Z0-9]|[a-zA-Z]+/g;let a;for(;null!==(a=o.exec(i));){if(a.index>n){const t=i.substring(n,a.index),s=r.tokenize(t,c);e.push(...s)}const s=a[0],o=this.tokenizers.find(e=>e instanceof t);if(o){const t=o.tokenize(s,"en");e.push(...t)}n=a.index+a[0].length}if(n<i.length){const t=i.substring(n),s=r.tokenize(t,c);e.push(...s)}s=e}}}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,c))}s.length>0?l.push(...s):l.push(n)}}}}const p=[];let g=null;for(const t of l)"punctuation"===t.type?g?g.txt+=t.txt:g={...t}:(g&&(p.push(g),g=null),p.push(t));return g&&p.push(g),p}tokenizeText(t,e){const n=this.tokenize(t,e?.language),s=["space",...e?.excludeTypes||[]];return n.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type)||s.includes(t.type)||"punctuation"===t.type&&t.txt.replace(/\s/g,"").length<=1)).map(t=>t.txt)}get loadedLexiconNames(){const t=new Set;for(const e in this.dictionaries)Object.prototype.hasOwnProperty.call(this.dictionaries,e)&&this.dictionaries[e].forEach(e=>{t.add(e.name)});return Array.from(t)}}function d(t){return new a(t)}export{e as CJKTokenizer,n as DateTokenizer,t as EnglishTokenizer,s as LanguageDetector,a as MultilingualTokenizer,d as createTokenizer};
|
|
1
|
+
class t{dictionaries;constructor(t={}){this.dictionaries=t}detectLanguage(t){return/^[a-zA-Z0-9\s!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~]+$/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const n=[],s=t.split(/\b/);for(const t of s)t&&(t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?n.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(n,e)}tagNameTokens(t,e){const n=[];let s=0;for(;s<t.length;){if(s<t.length&&"word"===t[s].type){const i=t[s].txt;if(this.dictionaries[e]){let t=!1;for(const r of this.dictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){n.push({txt:i,type:"word",lang:e,src:r.name}),s++,t=!0;break}if(t)continue}}n.push({txt:t[s].txt,type:t[s].type,lang:e,src:t[s].src||""}),s++}return n}}class e{segmenters;dictionaries;constructor(t={}){this.segmenters=new Map,this.dictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const n=[],s=this.getSegmenter(e);for(const i of s.segment(t)){const{segment:t,isWordLike:s}=i;t.match(/^\s+$/)?n.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9#]/.test(t)?n.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?n.push({txt:t,type:"punctuation",lang:e,src:""}):s?n.push({txt:t,type:"word",lang:e,src:""}):n.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(n,e)}getSegmenter(t,e="word"){const n=`${t}-${e}`;return this.segmenters.has(n)||this.segmenters.set(n,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(n)}applyCustomDictionary(t,e){const n=this.dictionaries[e]||[];let s=t;if(n.length>0){const t=[];let i=0;for(;i<s.length;){let r=null,o=-1;for(let t=Math.min(5,s.length-i);t>=1;t--){const e=s.slice(i,i+t).map(t=>t.txt).join("");for(const s of n)s.data.has(e)&&(!r||t>r.length||t===r.length&&s.priority>o)&&(r={length:t,text:e},o=s.priority)}r?(t.push({txt:r.text,type:"word",lang:e,src:""}),i+=r.length):(t.push({...s[i],src:""}),i++)}s=t}return s}}class n{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))|(?:[零一二三四五六七八九十百千万亿]+)(?:小时|分钟|秒|毫秒|天|周|月|年)/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const n=[];let s=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const n=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,n)),index:o.index,end:Math.max(o.end,n)}:(r.push({text:o.text,index:o.index}),o={...e,end:n}):o={...e,end:n}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>s){const r=t.slice(s,i.index);n.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?n.push({txt:i.text,type:"date",lang:e,src:""}):n.push({txt:i.text,type:"other",lang:e,src:""}),s=i.index+i.text.length}if(s<t.length){const i=t.slice(s);n.push({txt:i,type:"other",lang:e,src:""})}return n}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t)||/^[零一二三四五六七八九十百千万亿]+(?:小时|分钟|秒|毫秒|天|周|月|年)$/.test(t))return!0;let e,n,s;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),n=parseInt(t.slice(4,6)),s=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],n=i[1],s=i[2]}else if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],n=i[1],s=i[2]):i[2]>31?(n=i[0],s=i[1],e=i[2]):(n=i[0],s=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;n=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,s=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),n=parseInt(i[2]),s=parseInt(i[3])}}return this.isValidDateComponents(e,n,s)}isValidChineseNumberTime(t){const e=/(小时|分钟|秒|毫秒|天|周|月|年)$/,n=t.replace(e,""),s=t.match(e)?.[1]||"";return!(!n||!s)&&/^(?:[零一二三四五六七八九]|十[零一二三四五六七八九]?|百[零一二三四五六七八九]?|千[零一二三四五六七八九]?|万[零一二三四五六七八九]?|亿[零一二三四五六七八九]?)+$/.test(n)}isValidDateComponents(t,e,n){if(e<1||e>12||n<1||n>31)return!1;if(2===e){if(n>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(n>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class s{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(?::\d{1,5})?\b/g,r=/\b(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|localhost)(?::\d{1,5})?\b/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"host",text:a[0]});o.sort((t,e)=>t.index-e.index);const d=[];for(let t=0;t<o.length;t++){let e=!1;for(let n=0;n<o.length;n++)if(t!==n){const s=o[t].index===o[n].index&&o[t].endIndex===o[n].endIndex,i=o[t].index>=o[n].index&&o[t].endIndex<=o[n].endIndex;if(s){if("ip"===o[n].type){e=!0;break}}else if(i){e=!0;break}}e||d.push(o[t])}for(const i of d)i.index>s&&n.push({txt:t.substring(s,i.index),type:"other",lang:e}),n.push({txt:i.text,type:i.type,lang:"en"}),s=i.endIndex;return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e}),n}}class r{detectLanguage(t){return"en"}tokenize(t,e){const n=[];let s=0;const i=/(#[\p{L}\p{N}_]+)|(@[\p{L}\p{N}_]+)/gu;let r;for(;null!==(r=i.exec(t));){r.index>s&&n.push({txt:t.substring(s,r.index),type:"other",lang:e,src:""});const i=r[0],o=i.startsWith("#")?"hashtag":"mention";n.push({txt:i,type:o,lang:e,src:"social"}),s=r.index+i.length}return s<t.length&&n.push({txt:t.substring(s),type:"other",lang:e,src:""}),n}}class o{CHINESE_NUMBERS=new Set(["零","一","二","三","四","五","六","七","八","九","十","百","千","万","亿","壹","贰","叁","肆","伍","陆","柒","捌","玖","拾","佰","仟"]);CHINESE_SURNAMES=new Set(["张","李","王","刘","陈","杨","赵","黄","周","吴","徐","孙","胡","朱","高","林","何","郭","马","罗","梁","宋","郑","谢","韩","唐","冯","于","董","萧","程","曹","袁","邓","许","傅","沈","曾","彭","吕","苏","卢","蒋","蔡","贾","丁","魏","薛","叶","阎","余","潘","杜","戴","夏","钟","汪","田","任","姜","范","方","石","姚","谭","廖","邹","熊","金","陆","郝","孔","白","崔","康","毛","邱","秦","江","史","顾","侯","邵","孟","龙","万","段","雷","钱","汤","尹","黎","易","常","武","乔","贺","赖","龚","文"]);UNITS=new Set(["公斤","英里","克","千克","吨","米","厘米","毫米","公里","斤","两","元","角","分","小时","分钟","秒","折","折扣","卷","券","美元","人民币","元","角","分","亩","公顷","平方米","平方分米","平方厘米","立方厘米","升","毫升","天","周","月","年","岁","度","瓦","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛","牛顿","帕斯卡","巴","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","g","mg","t","km","m","cm","mm","μm","nm","L","mL","l","ml","h","min","s","d","w","y","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","B","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"]);ORDINAL_PREFIXES=new Set(["第","No.","No","no.","no"]);detectLanguage(t){return""}tokenize(t,e){if(!t)return[];const n=[];let s=0;const i=t.length;for(;s<i;){let e=!1,r=this.findOrdinalPrefix(t,s);if(r){const{prefix:o,prefixEnd:a}=r;let d=a;for(;d<i&&""===t[d].trim();)d++;const c=this.findNumber(t,d);if(c){let r=c.end;for(;r<i&&""===t[r].trim();)r++;const o=this.findUnit(t,r);o&&(r=o.end),n.push({start:s,end:r,txt:t.substring(s,r)}),s=r,e=!0}}if(!e){const i=this.findNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumberWithUnit(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}if(!e){const i=this.findChineseNumber(t,s);i&&(n.push({start:s,end:i.end,txt:t.substring(s,i.end)}),s=i.end,e=!0)}e||s++}if(0===n.length)return[{txt:t,type:"other",lang:e,src:""}];const r=[];let o=0;const a=[];for(const t of n){let e=!1;for(const[n,s]of a.entries())if(t.start<=s.end&&t.end>=s.start){t.end-t.start>s.end-s.start&&(a[n]=t),e=!0;break}e||a.push(t)}a.sort((t,e)=>t.start-e.start);for(const n of a)n.start>o&&r.push({txt:t.substring(o,n.start),type:"other",lang:e,src:""}),r.push({txt:n.txt,type:"number",lang:e,src:""}),o=n.end;return o<i&&r.push({txt:t.substring(o),type:"other",lang:e,src:""}),r}findOrdinalPrefix(t,e){for(const n of this.ORDINAL_PREFIXES)if(t.startsWith(n,e))return{prefix:n,prefixEnd:e+n.length};return null}findNumber(t,e){const n=[/^[+-]?\d+(\.\d+)?[eE][+-]?\d+/,/^[+-]?\d+(\.\d+)?%/,/^[+-]?\d+(\.\d+)?‰/,/^[+-]?\d+\.\d+/,/^[+-]?\.\d+/,/^[+-]?\d+/];for(const s of n){const i=s.exec(t.substring(e));if(i){const r=e+i[0].length;if(r<t.length&&/[a-zA-Z]/.test(t[r])){if(s===n[0])return{end:r};continue}return{end:r}}}return null}findChineseNumber(t,e){let n=e;for(;n<t.length&&this.CHINESE_NUMBERS.has(t[n]);)n++;const s=n-e;if(s>1)return{end:n};if(1===s){t[e];const s=e>0?t[e-1]:"",i=e+1<t.length?t[e+1]:"",r=this.findUnit(t,e+1);if(s&&this.isNumberRelatedChar(s)||r)return{end:n};if(i&&this.CHINESE_NUMBERS.has(i))return{end:n}}return null}isNumberRelatedChar(t){return this.CHINESE_NUMBERS.has(t)||this.UNITS.has(t)||t>="0"&&t<="9"||"."===t||"e"===t||"E"===t||"+"===t||"-"===t}findUnit(t,e){let n=0;for(let s=e;s<Math.min(e+5,t.length);s++){const i=t.substring(e,s+1);if(this.UNITS.has(i)){const i=t[s+1];(!i||!/[a-zA-Z0-9]/.test(i))&&(n=s+1-e)}}return n>0?{end:e+n}:null}findNumberWithUnit(t,e){const n=this.findNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}findChineseNumberWithUnit(t,e){const n=this.findChineseNumber(t,e);if(!n)return null;let s=n.end;const i=this.findUnit(t,s);return i&&(s=i.end),s>n.end?{end:s}:null}isNumberChar(t,e,n){if(t>="0"&&t<="9")return!0;if("."===t){const t=e>0?n[e-1]:"",s=e<n.length-1?n[e+1]:"";return t>="0"&&t<="9"||s>="0"&&s<="9"}if(("e"===t||"E"===t)&&e>0){const t=n[e-1];return t>="0"&&t<="9"||"."===t}if(("+"===t||"-"===t)&&e>0){const t=n[e-1];return"e"===t||"E"===t}if(this.CHINESE_NUMBERS.has(t))return!0;for(const s of this.ORDINAL_PREFIXES)if(s.includes(t)&&n.startsWith(s,e))return!0;for(const t of this.UNITS)if(n.startsWith(t,e))return!0;if(("+"===t||"-"===t)&&e>1){const t=n[e-1],s=n[e-2];return("e"===t||"E"===t)&&(s>="0"&&s<="9"||"."===s)}return!1}}class a{tokenizers;dictionaries;defaultLanguage;constructor(s={}){this.dictionaries=s.dictionaries||{},this.defaultLanguage=s.defaultLanguage||"en",this.tokenizers=[new n,new i,new r,new o,new t,new e(this.dictionaries)]}addDictionary(t,n,i,r){const o=void 0!==i?i:200;let a=r;if(!a&&t.length>0){const e=t.find(t=>""!==t.trim())||"";a=s.detectLanguage(e)}const d=a||this.defaultLanguage;this.dictionaries[d]||(this.dictionaries[d]=[]);const c=this.dictionaries[d].findIndex(t=>t.name===n&&t.lang===d&&t.priority===o);if(c>=0){const e=this.dictionaries[d][c];t.forEach(t=>e.data.add(t))}else this.dictionaries[d].push({priority:o,data:new Set(t),name:n,lang:d});const u=this.tokenizers.find(t=>t instanceof e);u&&(u.dictionaries=this.dictionaries)}removeCustomWord(t,e,n){if(e){if(this.dictionaries[e])if(n){const s=this.dictionaries[e].find(t=>t.name===n);s&&s.data.delete(t)}else this.dictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.dictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(a,d){const c=d||s.detectLanguage(a),u=this.tokenizers.find(t=>t instanceof n);if(!u)return[];const h=u.tokenize(a,c),l=[],f=this.tokenizers.find(t=>t instanceof i);if(!f)return l;for(const n of h)if("date"===n.type)l.push(n);else{const s=f.tokenize(n.txt,c),i=this.tokenizers.find(t=>t instanceof r);if(!i)return l;for(const n of s)if("host"===n.type||"ip"===n.type)l.push(n);else{const s=i.tokenize(n.txt,c),r=this.tokenizers.find(t=>t instanceof o);if(!r)return l;for(const n of s)if("hashtag"===n.type||"mention"===n.type)l.push(n);else{const s=r.tokenize(n.txt,c);for(const n of s)if("number"===n.type)l.push(n);else{let s=[];if("en"===c){const e=this.tokenizers.find(e=>e instanceof t);e&&(s=e.tokenize(n.txt,c))}else if(["zh","ja","ko"].includes(c)){const i=n.txt,r=this.tokenizers.find(t=>t instanceof e);if(r){const e=r.tokenize(i,c);if(e.length>0)s=e;else{const e=[];let n=0;const o=/[a-zA-Z0-9]+[a-zA-Z0-9_-]*|[a-zA-Z0-9]|[a-zA-Z]+/g;let a;for(;null!==(a=o.exec(i));){if(a.index>n){const t=i.substring(n,a.index),s=r.tokenize(t,c);e.push(...s)}const s=a[0],o=this.tokenizers.find(e=>e instanceof t);if(o){const t=o.tokenize(s,"en");e.push(...t)}n=a.index+a[0].length}if(n<i.length){const t=i.substring(n),s=r.tokenize(t,c);e.push(...s)}s=e}}}else{const t=this.tokenizers.find(t=>t instanceof e);t&&(s=t.tokenize(n.txt,c))}s.length>0?l.push(...s):l.push(n)}}}}const p=[];let g=null;for(const t of l)"punctuation"===t.type?g?g.txt+=t.txt:g={...t}:(g&&(p.push(g),g=null),p.push(t));return g&&p.push(g),p}tokenizeText(t,e){const n=this.tokenize(t,e?.language),s=["space",...e?.excludeTypes||[]];return n.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type)||s.includes(t.type)||"punctuation"===t.type&&t.txt.replace(/\s/g,"").length<=1)).map(t=>t.txt)}get loadedLexiconNames(){const t=new Set;for(const e in this.dictionaries)Object.prototype.hasOwnProperty.call(this.dictionaries,e)&&this.dictionaries[e].forEach(e=>{t.add(e.name)});return Array.from(t)}}export{e as CJKTokenizer,n as DateTokenizer,t as EnglishTokenizer,s as LanguageDetector,a as MultilingualTokenizer};
|
package/lib/index.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";var e=require("./
|
|
1
|
+
"use strict";var e=require("./lexicon"),t=require("./core");Object.keys(e).forEach(function(t){"default"!==t&&!Object.prototype.hasOwnProperty.call(exports,t)&&Object.defineProperty(exports,t,{enumerable:!0,get:function(){return e[t]}})}),Object.keys(t).forEach(function(e){"default"!==e&&!Object.prototype.hasOwnProperty.call(exports,e)&&Object.defineProperty(exports,e,{enumerable:!0,get:function(){return t[e]}})});
|
package/lib/index.d.ts
CHANGED
|
@@ -1,69 +1,2 @@
|
|
|
1
|
-
|
|
2
|
-
export
|
|
3
|
-
import { SupportedLanguage, SupportedType } from './lexicon';
|
|
4
|
-
export { LexiconConfig, LexiconLoader, SUPPORTED_LANGUAGES, SUPPORTED_TYPES, SupportedLanguage, SupportedType } from './lexicon';
|
|
5
|
-
|
|
6
|
-
/**
|
|
7
|
-
* 快速使用多语言分词器类,提供静态实例和便捷方法
|
|
8
|
-
* @class QuickUseTokenizer
|
|
9
|
-
*/
|
|
10
|
-
declare class QuickUseTokenizer {
|
|
11
|
-
/** 静态分词器实例 */
|
|
12
|
-
private static instance;
|
|
13
|
-
/** 默认加载的语言 */
|
|
14
|
-
private static defaultLanguages;
|
|
15
|
-
/** 默认加载的词库类型 */
|
|
16
|
-
private static defaultTypes;
|
|
17
|
-
/**
|
|
18
|
-
* 获取分词器实例(单例模式)
|
|
19
|
-
* @returns MultilingualTokenizer实例
|
|
20
|
-
*/
|
|
21
|
-
static getInstance(): MultilingualTokenizer;
|
|
22
|
-
/**
|
|
23
|
-
* 设置默认加载的语言
|
|
24
|
-
* @param languages - 要加载的语言代码数组
|
|
25
|
-
*/
|
|
26
|
-
static setDefaultLanguages(languages: SupportedLanguage[]): void;
|
|
27
|
-
/**
|
|
28
|
-
* 设置默认加载的词库类型
|
|
29
|
-
* @param types - 要加载的词库类型数组
|
|
30
|
-
*/
|
|
31
|
-
static setDefaultTypes(types: SupportedType[]): void;
|
|
32
|
-
/**
|
|
33
|
-
* 分词方法
|
|
34
|
-
* @param text - 要分词的文本
|
|
35
|
-
* @param language - 可选,指定文本语言代码
|
|
36
|
-
* @returns 分词结果的Token数组
|
|
37
|
-
*/
|
|
38
|
-
static tokenize(text: string, language?: string): Token[];
|
|
39
|
-
/**
|
|
40
|
-
* 获取纯文本分词结果
|
|
41
|
-
* @param text - 要分词的文本
|
|
42
|
-
* @param language - 可选,指定文本语言代码
|
|
43
|
-
* @returns 单词数组
|
|
44
|
-
*/
|
|
45
|
-
static tokenizeText(text: string, language?: string): string[];
|
|
46
|
-
/**
|
|
47
|
-
* 添加自定义词库
|
|
48
|
-
* @param words - 要添加的单词数组
|
|
49
|
-
* @param name - 词库名称
|
|
50
|
-
* @param priority - 词库优先级,值越高优先级越高,默认比内置词库最高优先级大100
|
|
51
|
-
* @param language - 词库对应的语言代码,未指定时自动根据words判断
|
|
52
|
-
*/
|
|
53
|
-
static addDictionary(words: string[], name: string, priority?: number, language?: string): void;
|
|
54
|
-
/**
|
|
55
|
-
* 移除自定义词库中的指定单词
|
|
56
|
-
* @param word - 要移除的单词
|
|
57
|
-
* @param language - 可选,指定要操作的语言词库
|
|
58
|
-
* @param lexiconName - 可选,指定要操作的词库名称
|
|
59
|
-
*/
|
|
60
|
-
static removeCustomWord(word: string, language?: string, lexiconName?: string): void;
|
|
61
|
-
}
|
|
62
|
-
declare const tokenize: (text: string, language?: string) => Token[];
|
|
63
|
-
declare const tokenizeText: (text: string, language?: string) => string[];
|
|
64
|
-
declare const addDictionary: (words: string[], name: string, priority?: number, language?: string) => void;
|
|
65
|
-
declare const removeCustomWord: (word: string, language?: string, lexiconName?: string) => void;
|
|
66
|
-
declare const setDefaultLanguages: (languages: SupportedLanguage[]) => void;
|
|
67
|
-
declare const setDefaultTypes: (types: SupportedType[]) => void;
|
|
68
|
-
|
|
69
|
-
export { QuickUseTokenizer, addDictionary, removeCustomWord, setDefaultLanguages, setDefaultTypes, tokenize, tokenizeText };
|
|
1
|
+
export * from './lexicon';
|
|
2
|
+
export * from './core';
|
package/lib/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
|
|
1
|
+
export*from"./lexicon";export*from"./core";
|