gs-tokenizer 0.1.13 → 0.1.14
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/core.cjs +1 -1
- package/lib/core.js +1 -1
- package/package.json +1 -1
package/lib/core.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";

/** Characters treated as numerals by the number and date patterns. */
const CN_NUMERALS = "壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九";

/** Multi-character unit words that may directly follow a number. */
const UNIT_WORDS = ["公斤", "英里", "千克", "厘米", "毫米", "公里", "小时", "分钟", "折扣", "美元", "人民币", "公顷", "平方米", "平方分米", "平方厘米", "立方厘米", "毫升", "千瓦", "安培", "伏特", "欧姆", "焦耳", "卡路里", "千克力", "牛顿", "帕斯卡", "标准大气压", "毫米汞柱", "摄氏度", "华氏度", "弧度", "角度", "kg", "mg", "km", "cm", "mm", "μm", "nm", "mL", "ml", "min", "°C", "°F", "rad", "deg", "Hz", "kHz", "MHz", "GHz", "bit", "Byte", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "lb", "oz", "pound", "pounds"];

/**
 * Unit words as a regex alternation, longest first. An alternation takes the first
 * alternative that matches, so in list order "pound" shadows "pounds" and "千克"
 * shadows "千克力".
 */
const UNIT_ALTERNATION = [...UNIT_WORDS].sort((a, b) => b.length - a.length).join("|");

/** Result a stage's `best` returns when nothing matches at `pos`. */
function noMatch(pos) {
  return { tokens: [], unprocessedStart: pos, consumed: false };
}

/** Length of the longest token text, or 0 for an empty list. */
function longestTextLength(tokens) {
  return tokens.reduce((max, token) => Math.max(max, token.txt.length), 0);
}

/** Dictionary index: words bucketed by first UTF-16 unit and length, plus per-word metadata. */
class WordIndex {
  // A Map (not a plain object) so that a word such as "__proto__" is stored like any other key.
  #metaByWord = new Map();
  // head character -> { [length]: string[] }; integer-like keys enumerate in ascending order.
  #wordsByHead = new Map();

  /** Adds a single word with its metadata. */
  add(word, meta) {
    this.addBatch([{ word, meta }]);
  }

  /**
   * Adds many `{ word, meta }` entries. Works on copies and swaps them in at the end,
   * so an exception part-way through leaves the index untouched.
   */
  addBatch(entries) {
    const metaByWord = new Map(this.#metaByWord);
    const wordsByHead = new Map();
    for (const [head, byLength] of this.#wordsByHead) {
      const copy = {};
      for (const [length, words] of Object.entries(byLength)) {
        copy[length] = [...words];
      }
      wordsByHead.set(head, copy);
    }
    for (const { word, meta } of entries) {
      const head = word[0];
      let byLength = wordsByHead.get(head);
      if (!byLength) {
        byLength = {};
        wordsByHead.set(head, byLength);
      }
      const bucket = (byLength[word.length] ??= []);
      // A word is in its bucket exactly when it has metadata, so this replaces a linear scan.
      if (!metaByWord.has(word)) {
        bucket.push(word);
      }
      metaByWord.set(word, meta);
    }
    this.#metaByWord = metaByWord;
    this.#wordsByHead = wordsByHead;
  }

  /**
   * Returns every indexed word that occurs in `text` starting exactly at `pos`,
   * as `{ word, meta }`, shortest first.
   */
  match(text, pos) {
    if (pos >= text.length) {
      return [];
    }
    const byLength = this.#wordsByHead.get(text[pos]);
    const hits = [];
    if (!byLength) {
      return hits;
    }
    for (const [length, words] of Object.entries(byLength)) {
      const end = pos + Number(length);
      if (end > text.length) {
        continue;
      }
      const slice = text.slice(pos, end);
      for (const word of words) {
        if (word === slice) {
          hits.push({ word, meta: this.#metaByWord.get(word) });
        }
      }
    }
    return hits;
  }

  /** Word tokens for every indexed word that prefixes `text`. */
  matches(text) {
    return this.match(text, 0).map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name }));
  }
}

/** Stage that matches words from the tokenizer's dictionary index. */
class DictionaryStage {
  id = "dictionary";
  order = 1;
  priority = 0;
  index;

  initialize(tokenizer) {
    this.index = tokenizer.wordIndex;
  }

  /** Longest dictionary word at `pos`; ties go to the higher lexicon priority. */
  best(text, pos) {
    const hits = this.index.match(text, pos);
    if (!hits.length) {
      return noMatch(pos);
    }
    const [top] = hits.sort((a, b) =>
      b.word.length !== a.word.length ? b.word.length - a.word.length : b.meta.priority - a.meta.priority,
    );
    return {
      tokens: [{ txt: top.word, type: "word", lang: top.meta.lang, src: top.meta.name }],
      unprocessedStart: pos + top.word.length,
      consumed: true,
    };
  }

  /** Every dictionary word that prefixes `text`; `end` is the longest match length. */
  all(text) {
    const hits = this.index.match(text, 0);
    if (!hits.length) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: hits.map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name })),
      end: Math.max(...hits.map((hit) => hit.word.length)),
    };
  }
}

/** Shared state for name stages: the language tag and the name lists of one dictionary. */
class NameStage {
  lang;
  id;
  order = 2;
  priority = 0;
  last;
  first;
  title;

  constructor(dictionary, lang) {
    this.lang = lang;
    this.last = dictionary.lastName;
    this.first = dictionary.firstName;
    this.title = dictionary.title;
    this.id = `name,${this.lang}`;
  }

  /** Wraps matched text in a name token tagged with this stage's language and id. */
  nameToken(txt) {
    return { txt, type: "name", lang: this.lang, src: this.id };
  }
}

/** Chinese names: optional "老"/"小" prefix, then a last name, then an optional first name. */
class ChineseNameStage extends NameStage {
  priority = 1e3;

  best(text, pos) {
    let cursor = pos;
    let prefix = "";
    const head = text[pos];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor++;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(prefix + last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
      }
      // A bare last name only counts as a name behind a prefix. Testing the cursor here
      // (as before) accepted any bare last name at every position except 0.
      if (prefix) {
        const txt = prefix + last;
        return { tokens: [this.nameToken(txt)], unprocessedStart: pos + txt.length, consumed: true };
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    let cursor = 0;
    let prefix = "";
    const head = text[0];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor = 1;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      if (prefix) {
        tokens.push(this.nameToken(prefix + last));
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(prefix + last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written last-then-first, either joined or separated by a single space. */
class JoinedOrSpacedNameStage extends NameStage {
  order = 3;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
        // The token text claims a space, so require one, and count it in the consumed length.
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last)) {
        continue;
      }
      const afterLast = last.length;
      for (const first of this.first) {
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written as two parts separated by exactly one space. */
class SpacedNameStage extends NameStage {
  order = 4;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      if (text[afterLast] !== " ") {
        continue; // the emitted text contains a space, so the input must too
      }
      for (const first of this.first) {
        if (text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last) || text[last.length] !== " ") {
        continue;
      }
      for (const first of this.first) {
        if (text.startsWith(first, last.length + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/**
 * Base for stages driven by an ordered list of anchored regexes (`RegexArray`, supplied by
 * the subclass together with `id`). The first regex that matches wins.
 */
class RegexStage {
  skipOwnLastMax = true;
  groupTypes = undefined;
  groupSources = undefined;
  types = undefined;
  mainGroup = 0;

  best(text, pos) {
    const rest = text.slice(pos);
    for (let i = 0; i < this.RegexArray.length; i++) {
      const match = this.RegexArray[i].exec(rest);
      // An empty main group would report a token without advancing, so treat it as no match.
      if (!match?.[this.mainGroup]) {
        continue;
      }
      const type = this.types?.[i] || this.groupTypes?.[this.mainGroup] || this.id;
      return {
        tokens: [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }],
        unprocessedStart: pos + match[this.mainGroup].length,
        consumed: true,
      };
    }
    return noMatch(pos);
  }

  /** Main token plus one token per non-empty capture group after `mainGroup`. */
  all(text) {
    let type;
    let match = null;
    for (let i = 0; i < this.RegexArray.length; i++) {
      match = this.RegexArray[i].exec(text);
      if (match) {
        type = this.types?.[i];
        break;
      }
    }
    type ||= this.groupTypes?.[this.mainGroup] || this.id;
    if (!match) {
      return { tokens: [], end: 0 };
    }
    const tokens = [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }];
    // Visit every group: stopping at the first empty one dropped later groups
    // (e.g. the query string of a URL that has no path).
    for (let g = this.mainGroup + 1; g < match.length; g++) {
      if (!match[g]) {
        continue;
      }
      const groupType = this.groupTypes?.[g];
      tokens.push({ txt: match[g], type: groupType || type, src: this.groupSources?.[g] || groupType || `${type}-sub` });
    }
    return { tokens, end: match[0].length };
  }
}

/** Numbers, optionally prefixed by "第"/"No" and followed by a unit. */
class NumberStage extends RegexStage {
  id = "number";
  order = 9;
  priority = 10;
  skipOwnLastMax = true;
  RegexArray = [
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})?`, "i"),
    // NOTE(review): the pattern above shares this one's mandatory head and also has an optional
    // tail, so it matches whenever this one would and this entry is never reached.
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`, "i"),
    // NOTE(review): only reached for input starting with a sign; a leading digit is taken by the first pattern.
    /^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i,
  ];
}

/** Runs of whitespace. */
class SpaceStage extends RegexStage {
  id = "space";
  order = 100;
  priority = 0;
  RegexArray = [/^\s+/];
}

/** Emoji runs, then runs of anything that is not ASCII alphanumeric, U+4E00–U+9FFF or whitespace. */
class PunctuationStage extends RegexStage {
  id = "punctuation";
  order = 10;
  priority = 0;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  types = ["emoji", "punctuation"];
  RegexArray = [/^\p{Emoji_Presentation}+/u, /^[^0-9A-Za-z\u4e00-\u9fff\s]+/];
}

/** "@name" mentions and "#name" hashtags. */
class SocialStage {
  id = "social";
  order = 5;
  priority = 10;
  skipOwnLastMax = true;
  nameRe = /^[\p{L}\p{N}_\-]+/u;

  best(text, pos) {
    const sigil = text[pos];
    if (sigil !== "@" && sigil !== "#") {
      return noMatch(pos);
    }
    const nameStart = pos + 1;
    const match = this.nameRe.exec(text.slice(nameStart));
    if (!match) {
      return noMatch(pos);
    }
    return {
      tokens: [{ txt: sigil + match[0], type: sigil === "@" ? "mention" : "hashtag", src: "social" }],
      unprocessedStart: nameStart + match[0].length,
      consumed: true,
    };
  }

  all(text) {
    const { tokens } = this.best(text, 0);
    return { tokens, end: tokens.length > 0 ? tokens[0].txt.length : 0 };
  }
}

/** E-mail addresses; `all` additionally emits the local part as a word token. */
class EmailStage {
  id = "email";
  order = 6;
  priority = 20;
  skipOwnLastMax = true;
  re = /^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  best(text, pos) {
    const match = this.re.exec(text.slice(pos));
    if (!match) {
      return noMatch(pos);
    }
    return { tokens: [{ txt: match[0], type: "email", src: "email" }], unprocessedStart: pos + match[0].length, consumed: true };
  }

  all(text) {
    const match = this.re.exec(text);
    if (!match) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: [
        { txt: match[0], type: "email", src: "email" },
        { txt: match[1], type: "word", src: "email-sub" },
      ],
      end: match[0].length,
    };
  }
}

/** Dates written with 年/月/日 or with "-", "/" or "." separators. */
class DateStage extends RegexStage {
  id = "date";
  order = 8;
  priority = 0;
  RegexArray = [
    /^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,
    // "\\s" is required: inside a template literal a lone "\s" collapses to a literal "s".
    new RegExp(`^([${CN_NUMERALS}]{4}年)\\s*([${CN_NUMERALS}]{1,2}月)\\s*([${CN_NUMERALS}]{1,2}日)`),
    /^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,
    /^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,
    // "\d", not "d": the unescaped form only matched the literal letter.
    /^(?:\d{4}年|\d{1,2}[月日])/,
    new RegExp(`^(?:[${CN_NUMERALS}]{4}年|[${CN_NUMERALS}]{1,2}[月日])`),
  ];
}

/** URLs; `all` also emits host, path, query string and hash sub-tokens. */
class UrlStage extends RegexStage {
  id = "url";
  order = 7;
  priority = 1e3;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  groupTypes = { 1: "host", 2: "other", 3: "other", 4: "other" };
  groupSources = { 2: "url-path", 3: "url-query-string", 4: "url-hash" };
  RegexArray = [/^\s*(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i];
}

/** Colon-separated hex addresses (optionally bracketed) and dotted-quad addresses, with optional port. */
class IpStage extends RegexStage {
  id = "ip";
  order = 7;
  priority = 100;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  RegexArray = [/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/, /^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/];
}

/**
 * Tokenizer that runs an ordered list of stages over the text and hands any stretch no stage
 * claimed to `Intl.Segmenter` when it is available.
 */
class MultilingualTokenizer {
  wordIndex = new WordIndex();
  #stages = [];
  #lexiconNames = new Set();
  #nameLexiconNames = [];
  nativeSegmenter = typeof Intl !== "undefined" && "Segmenter" in Intl ? new Intl.Segmenter("und", { granularity: "word" }) : null;

  constructor() {
    this.addStage(new DictionaryStage());
    this.addStage(new SocialStage());
    this.addStage(new EmailStage());
    this.addStage(new UrlStage());
    this.addStage(new IpStage());
    this.addStage(new DateStage());
    this.addStage(new NumberStage());
    this.addStage(new PunctuationStage());
    this.addStage(new SpaceStage());
  }

  /** Names passed to `addDictionary`, without duplicates. */
  get loadedLexiconNames() {
    return [...this.#lexiconNames];
  }

  /** Language tags passed to `setNameDictionary`, in call order (a copy). */
  get loadedNameLexiconNames() {
    return [...this.#nameLexiconNames];
  }

  /** Indexes `words` under lexicon `name` with the given priority and language. */
  addDictionary(words, name, priority = 0, lang) {
    this.#lexiconNames.add(name);
    this.wordIndex.addBatch(words.map((word) => ({ word, meta: { name, priority, lang } })));
  }

  /**
   * Registers a name dictionary (`lastName`, `firstName`, `title`). The stage is chosen from
   * the language tag: "zh…" gets the Chinese rules, "ko…"/"ja…"/"jp…" the joined-or-spaced
   * rules, anything else the space-separated rules.
   */
  setNameDictionary(dictionary, lang) {
    this.#nameLexiconNames.push(lang);
    if (/^zh/i.test(lang)) {
      this.addStage(new ChineseNameStage(dictionary, lang));
    } else if (/^(ko|ja|jp)/i.test(lang)) {
      // "ja" is the ISO 639-1 code for Japanese; "jp" stays accepted for existing callers.
      this.addStage(new JoinedOrSpacedNameStage(dictionary, lang));
    } else {
      this.addStage(new SpacedNameStage(dictionary, lang));
    }
  }

  /** Adds a stage, keeping stages sorted by ascending `order`, then descending `priority`. */
  addStage(stage) {
    this.#stages.push(stage);
    this.#stages.sort((a, b) => a.order - b.order || b.priority - a.priority);
    stage.initialize?.(this);
  }

  /** Non-overlapping tokens: at each position the first stage whose `best` matches wins. */
  tokenize(text) {
    const tokens = [];
    let pos = 0;
    while (pos < text.length) {
      let advanced = false;
      for (const stage of this.#stages) {
        const result = stage.best(text, pos);
        if (result.tokens.length === 0) {
          continue;
        }
        for (const token of result.tokens) {
          tokens.push({ ...token, start: pos, end: result.unprocessedStart });
        }
        pos = result.unprocessedStart;
        advanced = true;
        break;
      }
      if (!advanced) {
        pos++;
      }
    }
    return this.#fillFlat(text, tokens);
  }

  /** Every token each stage's `all` finds at each position (tokens may overlap). */
  tokenizeAll(text) {
    const groups = [];
    const ownEndByStage = new Map();
    let pos = 0;
    let covered = 0;
    while (pos < text.length) {
      const rest = text.slice(pos);
      const found = [];
      let step = 1;
      for (const stage of this.#stages) {
        if (covered >= pos && stage.unprocessedOnly) {
          continue;
        }
        // The stored end is exclusive, so only positions strictly before it lie inside the
        // stage's previous match; `<=` also skipped a fresh match that starts right after it.
        if (stage.skipOwnLastMax && pos < ownEndByStage.get(stage)) {
          continue;
        }
        const result = stage.all(rest);
        if (!result.end) {
          continue;
        }
        found.push(...result.tokens);
        covered = Math.max(covered, pos + result.end);
        if (stage.skipOwnLastMax) {
          ownEndByStage.set(stage, pos + result.end);
        }
        if (stage.breakIfProcessed) {
          step = result.end;
          break;
        }
      }
      if (found.length) {
        groups.push([{ start: pos, end: covered }, found]);
      }
      pos += step;
    }
    return this.#fillGrouped(text, groups);
  }

  /** Token texts of `tokenize`. */
  tokenizeText(text) {
    return this.tokenize(text).map((token) => token.txt);
  }

  /** Token texts of `tokenizeAll`, without whitespace and single-character punctuation. */
  tokenizeTextAll(text) {
    // Previously `A && B || C`: for punctuation C was always true, so the length test was dead.
    return this.tokenizeAll(text)
      .filter((token) => (token.type === "punctuation" ? token.txt.length > 1 : token.type !== "space"))
      .map((token) => token.txt);
  }

  /** Flattens `[range, tokens]` groups, segmenting the stretches between ranges. */
  #fillGrouped(text, groups) {
    const out = [];
    let cursor = 0;
    for (const [range, tokens] of groups) {
      if (cursor < range.start) {
        out.push(...this.#segment(text, cursor, range.start));
      }
      out.push(...tokens);
      cursor = range.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Inserts segmenter tokens into the gaps between positioned tokens. */
  #fillFlat(text, tokens) {
    if (!this.nativeSegmenter) {
      return tokens;
    }
    const out = [];
    let cursor = 0;
    for (const token of tokens) {
      if (cursor < token.start) {
        out.push(...this.#segment(text, cursor, token.start));
      }
      out.push(token);
      cursor = token.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Segments `text[from, to)` with the native segmenter; yields nothing when there is none. */
  #segment(text, from, to) {
    if (!this.nativeSegmenter) {
      return []; // tokenizeAll used to throw here when Intl.Segmenter is unavailable
    }
    const out = [];
    for (const part of this.nativeSegmenter.segment(text.slice(from, to))) {
      const start = from + part.index;
      out.push({ txt: part.segment, type: "word", src: "native", start, end: start + part.segment.length });
    }
    return out;
  }
}

// Guarded so that evaluating this file where no `exports` binding exists does not throw.
if (typeof exports !== "undefined") {
  exports.MultilingualTokenizer = MultilingualTokenizer;
}
|
|
1
|
+
"use strict";

/** Characters treated as numerals by the number and date patterns. */
const CN_NUMERALS = "壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九";

/** Multi-character unit words that may directly follow a number. */
const UNIT_WORDS = ["公斤", "英里", "千克", "厘米", "毫米", "公里", "小时", "分钟", "折扣", "美元", "人民币", "公顷", "平方米", "平方分米", "平方厘米", "立方厘米", "毫升", "千瓦", "安培", "伏特", "欧姆", "焦耳", "卡路里", "千克力", "牛顿", "帕斯卡", "标准大气压", "毫米汞柱", "摄氏度", "华氏度", "弧度", "角度", "kg", "mg", "km", "cm", "mm", "μm", "nm", "mL", "ml", "min", "°C", "°F", "rad", "deg", "Hz", "kHz", "MHz", "GHz", "bit", "Byte", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "lb", "oz", "pound", "pounds"];

/** Single characters accepted as a unit directly after a number. */
const SINGLE_CHAR_UNITS = "元名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB";

/**
 * Unit words as a regex alternation, longest first. An alternation takes the first
 * alternative that matches, so in list order "pound" shadows "pounds" and "千克"
 * shadows "千克力".
 */
const UNIT_ALTERNATION = [...UNIT_WORDS].sort((a, b) => b.length - a.length).join("|");

/** Result a stage's `best` returns when nothing matches at `pos`. */
function noMatch(pos) {
  return { tokens: [], unprocessedStart: pos, consumed: false };
}

/** Length of the longest token text, or 0 for an empty list. */
function longestTextLength(tokens) {
  return tokens.reduce((max, token) => Math.max(max, token.txt.length), 0);
}

/** Unique token texts in first-seen order, without whitespace and single-character punctuation. */
function toUniqueTexts(tokens) {
  const kept = tokens.filter((token) => (token.type === "punctuation" ? token.txt.length > 1 : token.type !== "space"));
  return Array.from(new Set(kept.map((token) => token.txt)));
}

/** Dictionary index: words bucketed by first UTF-16 unit and length, plus per-word metadata. */
class WordIndex {
  // A Map (not a plain object) so that a word such as "__proto__" is stored like any other key.
  #metaByWord = new Map();
  // head character -> { [length]: string[] }; integer-like keys enumerate in ascending order.
  #wordsByHead = new Map();

  /** Adds a single word with its metadata. */
  add(word, meta) {
    this.addBatch([{ word, meta }]);
  }

  /**
   * Adds many `{ word, meta }` entries. Works on copies and swaps them in at the end,
   * so an exception part-way through leaves the index untouched.
   */
  addBatch(entries) {
    const metaByWord = new Map(this.#metaByWord);
    const wordsByHead = new Map();
    for (const [head, byLength] of this.#wordsByHead) {
      const copy = {};
      for (const [length, words] of Object.entries(byLength)) {
        copy[length] = [...words];
      }
      wordsByHead.set(head, copy);
    }
    for (const { word, meta } of entries) {
      const head = word[0];
      let byLength = wordsByHead.get(head);
      if (!byLength) {
        byLength = {};
        wordsByHead.set(head, byLength);
      }
      const bucket = (byLength[word.length] ??= []);
      // A word is in its bucket exactly when it has metadata, so this replaces a linear scan.
      if (!metaByWord.has(word)) {
        bucket.push(word);
      }
      metaByWord.set(word, meta);
    }
    this.#metaByWord = metaByWord;
    this.#wordsByHead = wordsByHead;
  }

  /**
   * Returns every indexed word that occurs in `text` starting exactly at `pos`,
   * as `{ word, meta }`, shortest first.
   */
  match(text, pos) {
    if (pos >= text.length) {
      return [];
    }
    const byLength = this.#wordsByHead.get(text[pos]);
    const hits = [];
    if (!byLength) {
      return hits;
    }
    for (const [length, words] of Object.entries(byLength)) {
      const end = pos + Number(length);
      if (end > text.length) {
        continue;
      }
      const slice = text.slice(pos, end);
      for (const word of words) {
        if (word === slice) {
          hits.push({ word, meta: this.#metaByWord.get(word) });
        }
      }
    }
    return hits;
  }

  /** Word tokens for every indexed word that prefixes `text`. */
  matches(text) {
    return this.match(text, 0).map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name }));
  }
}

/** Stage that matches words from the tokenizer's dictionary index. */
class DictionaryStage {
  id = "dictionary";
  order = 1;
  priority = 0;
  index;

  initialize(tokenizer) {
    this.index = tokenizer.wordIndex;
  }

  /** Longest dictionary word at `pos`; ties go to the higher lexicon priority. */
  best(text, pos) {
    const hits = this.index.match(text, pos);
    if (!hits.length) {
      return noMatch(pos);
    }
    const [top] = hits.sort((a, b) =>
      b.word.length !== a.word.length ? b.word.length - a.word.length : b.meta.priority - a.meta.priority,
    );
    return {
      tokens: [{ txt: top.word, type: "word", lang: top.meta.lang, src: top.meta.name }],
      unprocessedStart: pos + top.word.length,
      consumed: true,
    };
  }

  /** Every dictionary word that prefixes `text`; `end` is the longest match length. */
  all(text) {
    const hits = this.index.match(text, 0);
    if (!hits.length) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: hits.map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name })),
      end: Math.max(...hits.map((hit) => hit.word.length)),
    };
  }
}

/** Shared state for name stages: the language tag and the name lists of one dictionary. */
class NameStage {
  lang;
  id;
  order = 2;
  priority = 0;
  last;
  first;
  title;

  constructor(dictionary, lang) {
    this.lang = lang;
    this.last = dictionary.lastName;
    this.first = dictionary.firstName;
    this.title = dictionary.title;
    this.id = `name,${this.lang}`;
  }

  /** Wraps matched text in a name token tagged with this stage's language and id. */
  nameToken(txt) {
    return { txt, type: "name", lang: this.lang, src: this.id };
  }
}

/** Chinese names: optional "老"/"小" prefix, then a last name, then an optional first name. */
class ChineseNameStage extends NameStage {
  priority = 1e3;

  best(text, pos) {
    let cursor = pos;
    let prefix = "";
    const head = text[pos];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor++;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(prefix + last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
      }
      // A bare last name only counts as a name behind a prefix. Testing the cursor here
      // (as before) accepted any bare last name at every position except 0.
      if (prefix) {
        const txt = prefix + last;
        return { tokens: [this.nameToken(txt)], unprocessedStart: pos + txt.length, consumed: true };
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    let cursor = 0;
    let prefix = "";
    const head = text[0];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor = 1;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      if (prefix) {
        tokens.push(this.nameToken(prefix + last));
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(prefix + last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written last-then-first, either joined or separated by a single space. */
class JoinedOrSpacedNameStage extends NameStage {
  order = 3;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
        // The token text claims a space, so require one, and count it in the consumed length.
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last)) {
        continue;
      }
      const afterLast = last.length;
      for (const first of this.first) {
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written as two parts separated by exactly one space. */
class SpacedNameStage extends NameStage {
  order = 4;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      if (text[afterLast] !== " ") {
        continue; // the emitted text contains a space, so the input must too
      }
      for (const first of this.first) {
        if (text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last) || text[last.length] !== " ") {
        continue;
      }
      for (const first of this.first) {
        if (text.startsWith(first, last.length + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/**
 * Base for stages driven by an ordered list of anchored regexes (`RegexArray`, supplied by
 * the subclass together with `id`). The first regex that matches wins.
 */
class RegexStage {
  skipOwnLastMax = true;
  groupTypes = undefined;
  groupSources = undefined;
  types = undefined;
  mainGroup = 0;

  best(text, pos) {
    const rest = text.slice(pos);
    for (let i = 0; i < this.RegexArray.length; i++) {
      const match = this.RegexArray[i].exec(rest);
      // An empty main group would report a token without advancing, so treat it as no match.
      if (!match?.[this.mainGroup]) {
        continue;
      }
      const type = this.types?.[i] || this.groupTypes?.[this.mainGroup] || this.id;
      return {
        tokens: [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }],
        unprocessedStart: pos + match[this.mainGroup].length,
        consumed: true,
      };
    }
    return noMatch(pos);
  }

  /** Main token plus one token per non-empty capture group after `mainGroup`. */
  all(text) {
    let type;
    let match = null;
    for (let i = 0; i < this.RegexArray.length; i++) {
      match = this.RegexArray[i].exec(text);
      if (match) {
        type = this.types?.[i];
        break;
      }
    }
    type ||= this.groupTypes?.[this.mainGroup] || this.id;
    if (!match) {
      return { tokens: [], end: 0 };
    }
    const tokens = [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }];
    // Visit every group: stopping at the first empty one dropped later groups
    // (e.g. the query string of a URL that has no path).
    for (let g = this.mainGroup + 1; g < match.length; g++) {
      if (!match[g]) {
        continue;
      }
      const groupType = this.groupTypes?.[g];
      tokens.push({ txt: match[g], type: groupType || type, src: this.groupSources?.[g] || groupType || `${type}-sub` });
    }
    return { tokens, end: match[0].length };
  }
}

/**
 * Numbers. Patterns that require a unit come first (with and without the "第"/"No" prefix),
 * then plain decimal numbers, then bare runs of Chinese numerals.
 */
class NumberStage extends RegexStage {
  id = "number";
  order = 9;
  priority = 10;
  skipOwnLastMax = true;
  RegexArray = [
    new RegExp(`^(?:第|No)[${CN_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})`, "i"),
    new RegExp(`^(?:第|No)[${CN_NUMERALS}0-9]+[${SINGLE_CHAR_UNITS}]`, "i"),
    new RegExp(`^[${CN_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})`, "i"),
    new RegExp(`^[0-9${CN_NUMERALS}]+[${SINGLE_CHAR_UNITS}]`, "i"),
    /^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i,
    new RegExp(`^[${CN_NUMERALS}]+`, "i"),
  ];
}

/** Runs of whitespace. */
class SpaceStage extends RegexStage {
  id = "space";
  order = 100;
  priority = 0;
  RegexArray = [/^\s+/];
}

/** Emoji runs, then runs of anything that is not ASCII alphanumeric, U+4E00–U+9FFF or whitespace. */
class PunctuationStage extends RegexStage {
  id = "punctuation";
  order = 10;
  priority = 0;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  types = ["emoji", "punctuation"];
  RegexArray = [/^\p{Emoji_Presentation}+/u, /^[^0-9A-Za-z\u4e00-\u9fff\s]+/];
}

/** "@name" mentions and "#name" hashtags. */
class SocialStage {
  id = "social";
  order = 5;
  priority = 10;
  skipOwnLastMax = true;
  nameRe = /^[\p{L}\p{N}_\-]+/u;

  best(text, pos) {
    const sigil = text[pos];
    if (sigil !== "@" && sigil !== "#") {
      return noMatch(pos);
    }
    const nameStart = pos + 1;
    const match = this.nameRe.exec(text.slice(nameStart));
    if (!match) {
      return noMatch(pos);
    }
    return {
      tokens: [{ txt: sigil + match[0], type: sigil === "@" ? "mention" : "hashtag", src: "social" }],
      unprocessedStart: nameStart + match[0].length,
      consumed: true,
    };
  }

  all(text) {
    const { tokens } = this.best(text, 0);
    return { tokens, end: tokens.length > 0 ? tokens[0].txt.length : 0 };
  }
}

/** E-mail addresses; `all` additionally emits the local part as a word token. */
class EmailStage {
  id = "email";
  order = 6;
  priority = 20;
  skipOwnLastMax = true;
  re = /^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  best(text, pos) {
    const match = this.re.exec(text.slice(pos));
    if (!match) {
      return noMatch(pos);
    }
    return { tokens: [{ txt: match[0], type: "email", src: "email" }], unprocessedStart: pos + match[0].length, consumed: true };
  }

  all(text) {
    const match = this.re.exec(text);
    if (!match) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: [
        { txt: match[0], type: "email", src: "email" },
        { txt: match[1], type: "word", src: "email-sub" },
      ],
      end: match[0].length,
    };
  }
}

/** Dates written with 年/月/日 or with "-", "/" or "." separators. */
class DateStage extends RegexStage {
  id = "date";
  order = 8;
  priority = 0;
  RegexArray = [
    /^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,
    // "\\s" is required: inside a template literal a lone "\s" collapses to a literal "s".
    new RegExp(`^([${CN_NUMERALS}]{4}年)\\s*([${CN_NUMERALS}]{1,2}月)\\s*([${CN_NUMERALS}]{1,2}日)`),
    /^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,
    /^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,
    // "\d", not "d": the unescaped form only matched the literal letter.
    /^(?:\d{4}年|\d{1,2}[月日])/,
    new RegExp(`^(?:[${CN_NUMERALS}]{4}年|[${CN_NUMERALS}]{1,2}[月日])`),
  ];
}

/** URLs; `all` also emits host, path, query string and hash sub-tokens. */
class UrlStage extends RegexStage {
  id = "url";
  order = 7;
  priority = 1e3;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  groupTypes = { 1: "host", 2: "other", 3: "other", 4: "other" };
  groupSources = { 2: "url-path", 3: "url-query-string", 4: "url-hash" };
  RegexArray = [/^\s*(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i];
}

/** Colon-separated hex addresses (optionally bracketed) and dotted-quad addresses, with optional port. */
class IpStage extends RegexStage {
  id = "ip";
  order = 7;
  priority = 100;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  RegexArray = [/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/, /^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/];
}

/**
 * Tokenizer that runs an ordered list of stages over the text and hands any stretch no stage
 * claimed to `Intl.Segmenter` when it is available.
 */
class MultilingualTokenizer {
  wordIndex = new WordIndex();
  #stages = [];
  #lexiconNames = new Set();
  #nameLexiconNames = [];
  nativeSegmenter = typeof Intl !== "undefined" && "Segmenter" in Intl ? new Intl.Segmenter("und", { granularity: "word" }) : null;

  constructor() {
    this.addStage(new DictionaryStage());
    this.addStage(new SocialStage());
    this.addStage(new EmailStage());
    this.addStage(new UrlStage());
    this.addStage(new IpStage());
    this.addStage(new DateStage());
    this.addStage(new NumberStage());
    this.addStage(new PunctuationStage());
    this.addStage(new SpaceStage());
  }

  /** Names passed to `addDictionary`, without duplicates. */
  get loadedLexiconNames() {
    return [...this.#lexiconNames];
  }

  /** Language tags passed to `setNameDictionary`, in call order (a copy). */
  get loadedNameLexiconNames() {
    return [...this.#nameLexiconNames];
  }

  /** Indexes `words` under lexicon `name` with the given priority and language. */
  addDictionary(words, name, priority = 0, lang) {
    this.#lexiconNames.add(name);
    this.wordIndex.addBatch(words.map((word) => ({ word, meta: { name, priority, lang } })));
  }

  /**
   * Registers a name dictionary (`lastName`, `firstName`, `title`). The stage is chosen from
   * the language tag: "zh…" gets the Chinese rules, "ko…"/"ja…"/"jp…" the joined-or-spaced
   * rules, anything else the space-separated rules.
   */
  setNameDictionary(dictionary, lang) {
    this.#nameLexiconNames.push(lang);
    if (/^zh/i.test(lang)) {
      this.addStage(new ChineseNameStage(dictionary, lang));
    } else if (/^(ko|ja|jp)/i.test(lang)) {
      // "ja" is the ISO 639-1 code for Japanese; "jp" stays accepted for existing callers.
      this.addStage(new JoinedOrSpacedNameStage(dictionary, lang));
    } else {
      this.addStage(new SpacedNameStage(dictionary, lang));
    }
  }

  /** Adds a stage, keeping stages sorted by ascending `order`, then descending `priority`. */
  addStage(stage) {
    this.#stages.push(stage);
    this.#stages.sort((a, b) => a.order - b.order || b.priority - a.priority);
    stage.initialize?.(this);
  }

  /** Non-overlapping tokens: at each position the first stage whose `best` matches wins. */
  tokenize(text) {
    const tokens = [];
    let pos = 0;
    while (pos < text.length) {
      let advanced = false;
      for (const stage of this.#stages) {
        const result = stage.best(text, pos);
        if (result.tokens.length === 0) {
          continue;
        }
        for (const token of result.tokens) {
          tokens.push({ ...token, start: pos, end: result.unprocessedStart });
        }
        pos = result.unprocessedStart;
        advanced = true;
        break;
      }
      if (!advanced) {
        pos++;
      }
    }
    return this.#fillFlat(text, tokens);
  }

  /** Every token each stage's `all` finds at each position (tokens may overlap). */
  tokenizeAll(text) {
    const groups = [];
    const ownEndByStage = new Map();
    let pos = 0;
    let covered = 0;
    while (pos < text.length) {
      const rest = text.slice(pos);
      const found = [];
      let step = 1;
      for (const stage of this.#stages) {
        if (covered >= pos && stage.unprocessedOnly) {
          continue;
        }
        // The stored end is exclusive, so only positions strictly before it lie inside the
        // stage's previous match; `<=` also skipped a fresh match that starts right after it.
        if (stage.skipOwnLastMax && pos < ownEndByStage.get(stage)) {
          continue;
        }
        const result = stage.all(rest);
        if (!result.end) {
          continue;
        }
        found.push(...result.tokens);
        covered = Math.max(covered, pos + result.end);
        if (stage.skipOwnLastMax) {
          ownEndByStage.set(stage, pos + result.end);
        }
        if (stage.breakIfProcessed) {
          step = result.end;
          break;
        }
      }
      if (found.length) {
        groups.push([{ start: pos, end: covered }, found]);
      }
      pos += step;
    }
    return this.#fillGrouped(text, groups);
  }

  /** Unique token texts of `tokenize`, without whitespace and single-character punctuation. */
  tokenizeText(text) {
    return toUniqueTexts(this.tokenize(text));
  }

  /** Unique token texts of `tokenizeAll`, without whitespace and single-character punctuation. */
  tokenizeTextAll(text) {
    return toUniqueTexts(this.tokenizeAll(text));
  }

  /** Flattens `[range, tokens]` groups, segmenting the stretches between ranges. */
  #fillGrouped(text, groups) {
    const out = [];
    let cursor = 0;
    for (const [range, tokens] of groups) {
      if (cursor < range.start) {
        out.push(...this.#segment(text, cursor, range.start));
      }
      out.push(...tokens);
      cursor = range.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Inserts segmenter tokens into the gaps between positioned tokens. */
  #fillFlat(text, tokens) {
    if (!this.nativeSegmenter) {
      return tokens;
    }
    const out = [];
    let cursor = 0;
    for (const token of tokens) {
      if (cursor < token.start) {
        out.push(...this.#segment(text, cursor, token.start));
      }
      out.push(token);
      cursor = token.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Segments `text[from, to)` with the native segmenter; yields nothing when there is none. */
  #segment(text, from, to) {
    if (!this.nativeSegmenter) {
      return []; // tokenizeAll used to throw here when Intl.Segmenter is unavailable
    }
    const out = [];
    for (const part of this.nativeSegmenter.segment(text.slice(from, to))) {
      const start = from + part.index;
      out.push({ txt: part.segment, type: "word", src: "native", start, end: start + part.segment.length });
    }
    return out;
  }
}

// Guarded so that evaluating this file where no `exports` binding exists does not throw.
if (typeof exports !== "undefined") {
  exports.MultilingualTokenizer = MultilingualTokenizer;
}
|
package/lib/core.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
/** Characters treated as numerals by the number and date patterns. */
const CN_NUMERALS = "壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九";

/** Multi-character unit words that may directly follow a number. */
const UNIT_WORDS = ["公斤", "英里", "千克", "厘米", "毫米", "公里", "小时", "分钟", "折扣", "美元", "人民币", "公顷", "平方米", "平方分米", "平方厘米", "立方厘米", "毫升", "千瓦", "安培", "伏特", "欧姆", "焦耳", "卡路里", "千克力", "牛顿", "帕斯卡", "标准大气压", "毫米汞柱", "摄氏度", "华氏度", "弧度", "角度", "kg", "mg", "km", "cm", "mm", "μm", "nm", "mL", "ml", "min", "°C", "°F", "rad", "deg", "Hz", "kHz", "MHz", "GHz", "bit", "Byte", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "lb", "oz", "pound", "pounds"];

/**
 * Unit words as a regex alternation, longest first. An alternation takes the first
 * alternative that matches, so in list order "pound" shadows "pounds" and "千克"
 * shadows "千克力".
 */
const UNIT_ALTERNATION = [...UNIT_WORDS].sort((a, b) => b.length - a.length).join("|");

/** Result a stage's `best` returns when nothing matches at `pos`. */
function noMatch(pos) {
  return { tokens: [], unprocessedStart: pos, consumed: false };
}

/** Length of the longest token text, or 0 for an empty list. */
function longestTextLength(tokens) {
  return tokens.reduce((max, token) => Math.max(max, token.txt.length), 0);
}

/** Dictionary index: words bucketed by first UTF-16 unit and length, plus per-word metadata. */
class WordIndex {
  // A Map (not a plain object) so that a word such as "__proto__" is stored like any other key.
  #metaByWord = new Map();
  // head character -> { [length]: string[] }; integer-like keys enumerate in ascending order.
  #wordsByHead = new Map();

  /** Adds a single word with its metadata. */
  add(word, meta) {
    this.addBatch([{ word, meta }]);
  }

  /**
   * Adds many `{ word, meta }` entries. Works on copies and swaps them in at the end,
   * so an exception part-way through leaves the index untouched.
   */
  addBatch(entries) {
    const metaByWord = new Map(this.#metaByWord);
    const wordsByHead = new Map();
    for (const [head, byLength] of this.#wordsByHead) {
      const copy = {};
      for (const [length, words] of Object.entries(byLength)) {
        copy[length] = [...words];
      }
      wordsByHead.set(head, copy);
    }
    for (const { word, meta } of entries) {
      const head = word[0];
      let byLength = wordsByHead.get(head);
      if (!byLength) {
        byLength = {};
        wordsByHead.set(head, byLength);
      }
      const bucket = (byLength[word.length] ??= []);
      // A word is in its bucket exactly when it has metadata, so this replaces a linear scan.
      if (!metaByWord.has(word)) {
        bucket.push(word);
      }
      metaByWord.set(word, meta);
    }
    this.#metaByWord = metaByWord;
    this.#wordsByHead = wordsByHead;
  }

  /**
   * Returns every indexed word that occurs in `text` starting exactly at `pos`,
   * as `{ word, meta }`, shortest first.
   */
  match(text, pos) {
    if (pos >= text.length) {
      return [];
    }
    const byLength = this.#wordsByHead.get(text[pos]);
    const hits = [];
    if (!byLength) {
      return hits;
    }
    for (const [length, words] of Object.entries(byLength)) {
      const end = pos + Number(length);
      if (end > text.length) {
        continue;
      }
      const slice = text.slice(pos, end);
      for (const word of words) {
        if (word === slice) {
          hits.push({ word, meta: this.#metaByWord.get(word) });
        }
      }
    }
    return hits;
  }

  /** Word tokens for every indexed word that prefixes `text`. */
  matches(text) {
    return this.match(text, 0).map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name }));
  }
}

/** Stage that matches words from the tokenizer's dictionary index. */
class DictionaryStage {
  id = "dictionary";
  order = 1;
  priority = 0;
  index;

  initialize(tokenizer) {
    this.index = tokenizer.wordIndex;
  }

  /** Longest dictionary word at `pos`; ties go to the higher lexicon priority. */
  best(text, pos) {
    const hits = this.index.match(text, pos);
    if (!hits.length) {
      return noMatch(pos);
    }
    const [top] = hits.sort((a, b) =>
      b.word.length !== a.word.length ? b.word.length - a.word.length : b.meta.priority - a.meta.priority,
    );
    return {
      tokens: [{ txt: top.word, type: "word", lang: top.meta.lang, src: top.meta.name }],
      unprocessedStart: pos + top.word.length,
      consumed: true,
    };
  }

  /** Every dictionary word that prefixes `text`; `end` is the longest match length. */
  all(text) {
    const hits = this.index.match(text, 0);
    if (!hits.length) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: hits.map((hit) => ({ txt: hit.word, type: "word", lang: hit.meta.lang, src: hit.meta.name })),
      end: Math.max(...hits.map((hit) => hit.word.length)),
    };
  }
}

/** Shared state for name stages: the language tag and the name lists of one dictionary. */
class NameStage {
  lang;
  id;
  order = 2;
  priority = 0;
  last;
  first;
  title;

  constructor(dictionary, lang) {
    this.lang = lang;
    this.last = dictionary.lastName;
    this.first = dictionary.firstName;
    this.title = dictionary.title;
    this.id = `name,${this.lang}`;
  }

  /** Wraps matched text in a name token tagged with this stage's language and id. */
  nameToken(txt) {
    return { txt, type: "name", lang: this.lang, src: this.id };
  }
}

/** Chinese names: optional "老"/"小" prefix, then a last name, then an optional first name. */
class ChineseNameStage extends NameStage {
  priority = 1e3;

  best(text, pos) {
    let cursor = pos;
    let prefix = "";
    const head = text[pos];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor++;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(prefix + last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
      }
      // A bare last name only counts as a name behind a prefix. Testing the cursor here
      // (as before) accepted any bare last name at every position except 0.
      if (prefix) {
        const txt = prefix + last;
        return { tokens: [this.nameToken(txt)], unprocessedStart: pos + txt.length, consumed: true };
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    let cursor = 0;
    let prefix = "";
    const head = text[0];
    if (head === "老" || head === "小") {
      prefix = head;
      cursor = 1;
    }
    for (const last of this.last) {
      if (!text.startsWith(last, cursor)) {
        continue;
      }
      if (prefix) {
        tokens.push(this.nameToken(prefix + last));
      }
      const afterLast = cursor + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(prefix + last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written last-then-first, either joined or separated by a single space. */
class JoinedOrSpacedNameStage extends NameStage {
  order = 3;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, afterLast)) {
          return { tokens: [this.nameToken(last + first)], unprocessedStart: afterLast + first.length, consumed: true };
        }
        // The token text claims a space, so require one, and count it in the consumed length.
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last)) {
        continue;
      }
      const afterLast = last.length;
      for (const first of this.first) {
        if (text[afterLast] === " " && text.startsWith(first, afterLast + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
        if (text.startsWith(first, afterLast)) {
          tokens.push(this.nameToken(last + first));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/** Names written as two parts separated by exactly one space. */
class SpacedNameStage extends NameStage {
  order = 4;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) {
        continue;
      }
      const afterLast = pos + last.length;
      if (text[afterLast] !== " ") {
        continue; // the emitted text contains a space, so the input must too
      }
      for (const first of this.first) {
        if (text.startsWith(first, afterLast + 1)) {
          return { tokens: [this.nameToken(`${last} ${first}`)], unprocessedStart: afterLast + 1 + first.length, consumed: true };
        }
      }
    }
    return noMatch(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last) || text[last.length] !== " ") {
        continue;
      }
      for (const first of this.first) {
        if (text.startsWith(first, last.length + 1)) {
          tokens.push(this.nameToken(`${last} ${first}`));
        }
      }
    }
    return { tokens, end: longestTextLength(tokens) };
  }
}

/**
 * Base for stages driven by an ordered list of anchored regexes (`RegexArray`, supplied by
 * the subclass together with `id`). The first regex that matches wins.
 */
class RegexStage {
  skipOwnLastMax = true;
  groupTypes = undefined;
  groupSources = undefined;
  types = undefined;
  mainGroup = 0;

  best(text, pos) {
    const rest = text.slice(pos);
    for (let i = 0; i < this.RegexArray.length; i++) {
      const match = this.RegexArray[i].exec(rest);
      // An empty main group would report a token without advancing, so treat it as no match.
      if (!match?.[this.mainGroup]) {
        continue;
      }
      const type = this.types?.[i] || this.groupTypes?.[this.mainGroup] || this.id;
      return {
        tokens: [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }],
        unprocessedStart: pos + match[this.mainGroup].length,
        consumed: true,
      };
    }
    return noMatch(pos);
  }

  /** Main token plus one token per non-empty capture group after `mainGroup`. */
  all(text) {
    let type;
    let match = null;
    for (let i = 0; i < this.RegexArray.length; i++) {
      match = this.RegexArray[i].exec(text);
      if (match) {
        type = this.types?.[i];
        break;
      }
    }
    type ||= this.groupTypes?.[this.mainGroup] || this.id;
    if (!match) {
      return { tokens: [], end: 0 };
    }
    const tokens = [{ txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type }];
    // Visit every group: stopping at the first empty one dropped later groups
    // (e.g. the query string of a URL that has no path).
    for (let g = this.mainGroup + 1; g < match.length; g++) {
      if (!match[g]) {
        continue;
      }
      const groupType = this.groupTypes?.[g];
      tokens.push({ txt: match[g], type: groupType || type, src: this.groupSources?.[g] || groupType || `${type}-sub` });
    }
    return { tokens, end: match[0].length };
  }
}

/** Numbers, optionally prefixed by "第"/"No" and followed by a unit. */
class NumberStage extends RegexStage {
  id = "number";
  order = 9;
  priority = 10;
  skipOwnLastMax = true;
  RegexArray = [
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})?`, "i"),
    // NOTE(review): the pattern above shares this one's mandatory head and also has an optional
    // tail, so it matches whenever this one would and this entry is never reached.
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`, "i"),
    // NOTE(review): only reached for input starting with a sign; a leading digit is taken by the first pattern.
    /^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i,
  ];
}

/** Runs of whitespace. */
class SpaceStage extends RegexStage {
  id = "space";
  order = 100;
  priority = 0;
  RegexArray = [/^\s+/];
}

/** Emoji runs, then runs of anything that is not ASCII alphanumeric, U+4E00–U+9FFF or whitespace. */
class PunctuationStage extends RegexStage {
  id = "punctuation";
  order = 10;
  priority = 0;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  types = ["emoji", "punctuation"];
  RegexArray = [/^\p{Emoji_Presentation}+/u, /^[^0-9A-Za-z\u4e00-\u9fff\s]+/];
}

/** "@name" mentions and "#name" hashtags. */
class SocialStage {
  id = "social";
  order = 5;
  priority = 10;
  skipOwnLastMax = true;
  nameRe = /^[\p{L}\p{N}_\-]+/u;

  best(text, pos) {
    const sigil = text[pos];
    if (sigil !== "@" && sigil !== "#") {
      return noMatch(pos);
    }
    const nameStart = pos + 1;
    const match = this.nameRe.exec(text.slice(nameStart));
    if (!match) {
      return noMatch(pos);
    }
    return {
      tokens: [{ txt: sigil + match[0], type: sigil === "@" ? "mention" : "hashtag", src: "social" }],
      unprocessedStart: nameStart + match[0].length,
      consumed: true,
    };
  }

  all(text) {
    const { tokens } = this.best(text, 0);
    return { tokens, end: tokens.length > 0 ? tokens[0].txt.length : 0 };
  }
}

/** E-mail addresses; `all` additionally emits the local part as a word token. */
class EmailStage {
  id = "email";
  order = 6;
  priority = 20;
  skipOwnLastMax = true;
  re = /^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  best(text, pos) {
    const match = this.re.exec(text.slice(pos));
    if (!match) {
      return noMatch(pos);
    }
    return { tokens: [{ txt: match[0], type: "email", src: "email" }], unprocessedStart: pos + match[0].length, consumed: true };
  }

  all(text) {
    const match = this.re.exec(text);
    if (!match) {
      return { tokens: [], end: 0 };
    }
    return {
      tokens: [
        { txt: match[0], type: "email", src: "email" },
        { txt: match[1], type: "word", src: "email-sub" },
      ],
      end: match[0].length,
    };
  }
}

/** Dates written with 年/月/日 or with "-", "/" or "." separators. */
class DateStage extends RegexStage {
  id = "date";
  order = 8;
  priority = 0;
  RegexArray = [
    /^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,
    // "\\s" is required: inside a template literal a lone "\s" collapses to a literal "s".
    new RegExp(`^([${CN_NUMERALS}]{4}年)\\s*([${CN_NUMERALS}]{1,2}月)\\s*([${CN_NUMERALS}]{1,2}日)`),
    /^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,
    /^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,
    // "\d", not "d": the unescaped form only matched the literal letter.
    /^(?:\d{4}年|\d{1,2}[月日])/,
    new RegExp(`^(?:[${CN_NUMERALS}]{4}年|[${CN_NUMERALS}]{1,2}[月日])`),
  ];
}

/** URLs; `all` also emits host, path, query string and hash sub-tokens. */
class UrlStage extends RegexStage {
  id = "url";
  order = 7;
  priority = 1e3;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  groupTypes = { 1: "host", 2: "other", 3: "other", 4: "other" };
  groupSources = { 2: "url-path", 3: "url-query-string", 4: "url-hash" };
  RegexArray = [/^\s*(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i];
}

/** Colon-separated hex addresses (optionally bracketed) and dotted-quad addresses, with optional port. */
class IpStage extends RegexStage {
  id = "ip";
  order = 7;
  priority = 100;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  RegexArray = [/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/, /^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/];
}

/**
 * Tokenizer that runs an ordered list of stages over the text and hands any stretch no stage
 * claimed to `Intl.Segmenter` when it is available.
 */
class MultilingualTokenizer {
  wordIndex = new WordIndex();
  #stages = [];
  #lexiconNames = new Set();
  #nameLexiconNames = [];
  nativeSegmenter = typeof Intl !== "undefined" && "Segmenter" in Intl ? new Intl.Segmenter("und", { granularity: "word" }) : null;

  constructor() {
    this.addStage(new DictionaryStage());
    this.addStage(new SocialStage());
    this.addStage(new EmailStage());
    this.addStage(new UrlStage());
    this.addStage(new IpStage());
    this.addStage(new DateStage());
    this.addStage(new NumberStage());
    this.addStage(new PunctuationStage());
    this.addStage(new SpaceStage());
  }

  /** Names passed to `addDictionary`, without duplicates. */
  get loadedLexiconNames() {
    return [...this.#lexiconNames];
  }

  /** Language tags passed to `setNameDictionary`, in call order (a copy). */
  get loadedNameLexiconNames() {
    return [...this.#nameLexiconNames];
  }

  /** Indexes `words` under lexicon `name` with the given priority and language. */
  addDictionary(words, name, priority = 0, lang) {
    this.#lexiconNames.add(name);
    this.wordIndex.addBatch(words.map((word) => ({ word, meta: { name, priority, lang } })));
  }

  /**
   * Registers a name dictionary (`lastName`, `firstName`, `title`). The stage is chosen from
   * the language tag: "zh…" gets the Chinese rules, "ko…"/"ja…"/"jp…" the joined-or-spaced
   * rules, anything else the space-separated rules.
   */
  setNameDictionary(dictionary, lang) {
    this.#nameLexiconNames.push(lang);
    if (/^zh/i.test(lang)) {
      this.addStage(new ChineseNameStage(dictionary, lang));
    } else if (/^(ko|ja|jp)/i.test(lang)) {
      // "ja" is the ISO 639-1 code for Japanese; "jp" stays accepted for existing callers.
      this.addStage(new JoinedOrSpacedNameStage(dictionary, lang));
    } else {
      this.addStage(new SpacedNameStage(dictionary, lang));
    }
  }

  /** Adds a stage, keeping stages sorted by ascending `order`, then descending `priority`. */
  addStage(stage) {
    this.#stages.push(stage);
    this.#stages.sort((a, b) => a.order - b.order || b.priority - a.priority);
    stage.initialize?.(this);
  }

  /** Non-overlapping tokens: at each position the first stage whose `best` matches wins. */
  tokenize(text) {
    const tokens = [];
    let pos = 0;
    while (pos < text.length) {
      let advanced = false;
      for (const stage of this.#stages) {
        const result = stage.best(text, pos);
        if (result.tokens.length === 0) {
          continue;
        }
        for (const token of result.tokens) {
          tokens.push({ ...token, start: pos, end: result.unprocessedStart });
        }
        pos = result.unprocessedStart;
        advanced = true;
        break;
      }
      if (!advanced) {
        pos++;
      }
    }
    return this.#fillFlat(text, tokens);
  }

  /** Every token each stage's `all` finds at each position (tokens may overlap). */
  tokenizeAll(text) {
    const groups = [];
    const ownEndByStage = new Map();
    let pos = 0;
    let covered = 0;
    while (pos < text.length) {
      const rest = text.slice(pos);
      const found = [];
      let step = 1;
      for (const stage of this.#stages) {
        if (covered >= pos && stage.unprocessedOnly) {
          continue;
        }
        // The stored end is exclusive, so only positions strictly before it lie inside the
        // stage's previous match; `<=` also skipped a fresh match that starts right after it.
        if (stage.skipOwnLastMax && pos < ownEndByStage.get(stage)) {
          continue;
        }
        const result = stage.all(rest);
        if (!result.end) {
          continue;
        }
        found.push(...result.tokens);
        covered = Math.max(covered, pos + result.end);
        if (stage.skipOwnLastMax) {
          ownEndByStage.set(stage, pos + result.end);
        }
        if (stage.breakIfProcessed) {
          step = result.end;
          break;
        }
      }
      if (found.length) {
        groups.push([{ start: pos, end: covered }, found]);
      }
      pos += step;
    }
    return this.#fillGrouped(text, groups);
  }

  /** Token texts of `tokenize`. */
  tokenizeText(text) {
    return this.tokenize(text).map((token) => token.txt);
  }

  /** Token texts of `tokenizeAll`, without whitespace and single-character punctuation. */
  tokenizeTextAll(text) {
    // Previously `A && B || C`: for punctuation C was always true, so the length test was dead.
    return this.tokenizeAll(text)
      .filter((token) => (token.type === "punctuation" ? token.txt.length > 1 : token.type !== "space"))
      .map((token) => token.txt);
  }

  /** Flattens `[range, tokens]` groups, segmenting the stretches between ranges. */
  #fillGrouped(text, groups) {
    const out = [];
    let cursor = 0;
    for (const [range, tokens] of groups) {
      if (cursor < range.start) {
        out.push(...this.#segment(text, cursor, range.start));
      }
      out.push(...tokens);
      cursor = range.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Inserts segmenter tokens into the gaps between positioned tokens. */
  #fillFlat(text, tokens) {
    if (!this.nativeSegmenter) {
      return tokens;
    }
    const out = [];
    let cursor = 0;
    for (const token of tokens) {
      if (cursor < token.start) {
        out.push(...this.#segment(text, cursor, token.start));
      }
      out.push(token);
      cursor = token.end;
    }
    if (cursor < text.length) {
      out.push(...this.#segment(text, cursor, text.length));
    }
    return out;
  }

  /** Segments `text[from, to)` with the native segmenter; yields nothing when there is none. */
  #segment(text, from, to) {
    if (!this.nativeSegmenter) {
      return []; // tokenizeAll used to throw here when Intl.Segmenter is unavailable
    }
    const out = [];
    for (const part of this.nativeSegmenter.segment(text.slice(from, to))) {
      const start = from + part.index;
      out.push({ txt: part.segment, type: "word", src: "native", start, end: start + part.segment.length });
    }
    return out;
  }
}

export { MultilingualTokenizer };
|
|
1
|
+
/**
 * Multilingual tokenizer core.
 *
 * A tokenizer owns an ordered list of "stages". Every stage exposes
 *   - best(text, start) -> { tokens, unprocessedStart, consumed }
 *       the single preferred match beginning at `start`;
 *   - all(text)         -> { tokens, end }
 *       every match beginning at offset 0 of `text`; `end` is the length
 *       covered by the longest one (0 means "no match").
 * Stages are sorted by ascending `order`, then descending `priority`.
 */

/** Result returned by `best()` when a stage does not match at `start`. */
function noMatch(start) {
  return { tokens: [], unprocessedStart: start, consumed: false };
}

/** Length of the longest token text, or 0 for an empty list. */
function longestTokenLength(tokens) {
  return tokens.reduce((longest, token) => Math.max(longest, token.txt.length), 0);
}

/** Converts a WordIndex match into a dictionary "word" token. */
function wordToken(match) {
  return { txt: match.word, type: "word", lang: match.meta.lang, src: match.meta.name };
}

/** Builds a "name" token attributed to the given name stage. */
function nameToken(stage, txt) {
  return { txt, type: "name", lang: stage.lang, src: stage.id };
}

/**
 * Dictionary index answering "which registered words start at this offset?".
 *
 * Words are stored in a Map (so words such as "__proto__" are safe) and, per
 * first character, the distinct word lengths are kept in ascending order. A
 * lookup slices the text once per candidate length and does a hash lookup,
 * instead of scanning every word of that length.
 */
class WordIndex {
  /** word -> metadata of its most recent registration */
  #metaByWord = new Map();
  /** first character -> ascending list of distinct word lengths */
  #lengthsByFirstChar = new Map();

  /**
   * Registers a single word.
   * @param {string} word
   * @param {object} meta metadata returned with every match of `word`
   */
  add(word, meta) {
    this.addBatch([{ word, meta }]);
  }

  /**
   * Registers many words. Re-adding a word replaces its metadata.
   * Empty strings are ignored because they can never start a match.
   * @param {Iterable<{word: string, meta: object}>} entries
   */
  addBatch(entries) {
    const touched = new Set();
    for (const { word, meta } of entries) {
      if (word === "") continue;
      const firstChar = word[0];
      this.#metaByWord.set(word, meta);
      let lengths = this.#lengthsByFirstChar.get(firstChar);
      if (!lengths) {
        lengths = [];
        this.#lengthsByFirstChar.set(firstChar, lengths);
      }
      if (!lengths.includes(word.length)) {
        lengths.push(word.length);
        touched.add(lengths);
      }
    }
    // Keep lengths ascending so matches come back shortest-first.
    for (const lengths of touched) lengths.sort((a, b) => a - b);
  }

  /**
   * Finds every registered word that begins exactly at `start`.
   * @param {string} text
   * @param {number} start
   * @returns {{word: string, meta: object}[]} matches, shortest first
   */
  match(text, start) {
    if (start >= text.length) return [];
    const lengths = this.#lengthsByFirstChar.get(text[start]);
    if (!lengths) return [];
    const found = [];
    for (const length of lengths) {
      const end = start + length;
      if (end > text.length) break; // lengths are ascending
      const candidate = text.slice(start, end);
      if (this.#metaByWord.has(candidate)) {
        found.push({ word: candidate, meta: this.#metaByWord.get(candidate) });
      }
    }
    return found;
  }

  /**
   * Word tokens for every registered word that is a prefix of `text`.
   * @param {string} text
   */
  matches(text) {
    return this.match(text, 0).map(wordToken);
  }
}

/** Stage that emits words found in the tokenizer's WordIndex. */
class DictionaryStage {
  id = "dictionary";
  order = 1;
  priority = 0;
  index;

  /** Called by the tokenizer when the stage is added; binds the shared index. */
  initialize(tokenizer) {
    this.index = tokenizer.wordIndex;
  }

  /** Longest word at `start`; ties are broken by higher `meta.priority`. */
  best(text, start) {
    const candidates = this.index.match(text, start);
    if (!candidates.length) return noMatch(start);
    let winner = candidates[0];
    for (const candidate of candidates) {
      const longer = candidate.word.length > winner.word.length;
      const sameLength = candidate.word.length === winner.word.length;
      if (longer || (sameLength && candidate.meta.priority > winner.meta.priority)) {
        winner = candidate;
      }
    }
    return {
      tokens: [wordToken(winner)],
      unprocessedStart: start + winner.word.length,
      consumed: true,
    };
  }

  /** Every dictionary word that is a prefix of `text`. */
  all(text) {
    const candidates = this.index.match(text, 0);
    if (!candidates.length) return { tokens: [], end: 0 };
    const tokens = candidates.map(wordToken);
    return { tokens, end: longestTokenLength(tokens) };
  }
}

/** Shared state of the person-name stages. */
class NameStage {
  lang;
  id;
  order = 2;
  priority = 0;
  last;
  first;
  title;

  /**
   * @param {{lastName: string[], firstName: string[], title?: string[]}} lexicon
   * @param {string} lang language tag of the lexicon; also part of the stage id
   */
  constructor(lexicon, lang) {
    this.lang = lang;
    this.last = lexicon.lastName;
    this.first = lexicon.firstName;
    this.title = lexicon.title;
    this.id = `name,${this.lang}`;
  }
}

/** Returns the 老/小 nickname prefix found at `index`, or "" when absent. */
function nicknamePrefixAt(text, index) {
  const lead = text[index];
  return lead === "老" || lead === "小" ? lead : "";
}

/**
 * Chinese names: [老|小] + surname + given name, with no separators.
 * A prefixed surname without a given name (e.g. "老王") is also a name.
 */
class ChineseNameStage extends NameStage {
  priority = 1e3;

  best(text, start) {
    const prefix = nicknamePrefixAt(text, start);
    const surnameStart = start + prefix.length;
    for (const surname of this.last) {
      if (!text.startsWith(surname, surnameStart)) continue;
      const givenStart = surnameStart + surname.length;
      for (const given of this.first) {
        if (text.startsWith(given, givenStart)) {
          return {
            tokens: [nameToken(this, prefix + surname + given)],
            unprocessedStart: givenStart + given.length,
            consumed: true,
          };
        }
      }
      // A bare surname only counts as a name when it carries the 老/小
      // prefix. Testing the prefix (not the absolute position) keeps the
      // result independent of where in the text the surname occurs.
      if (prefix) {
        const nickname = prefix + surname;
        return {
          tokens: [nameToken(this, nickname)],
          unprocessedStart: start + nickname.length,
          consumed: true,
        };
      }
    }
    return noMatch(start);
  }

  all(text) {
    const tokens = [];
    const prefix = nicknamePrefixAt(text, 0);
    const surnameStart = prefix.length;
    for (const surname of this.last) {
      if (!text.startsWith(surname, surnameStart)) continue;
      if (prefix) tokens.push(nameToken(this, prefix + surname));
      const givenStart = surnameStart + surname.length;
      for (const given of this.first) {
        if (text.startsWith(given, givenStart)) {
          tokens.push(nameToken(this, prefix + surname + given));
        }
      }
    }
    return { tokens, end: longestTokenLength(tokens) };
  }
}

/**
 * Korean/Japanese names: surname followed by the given name, either
 * adjacent or with exactly one separating character (emitted as a space).
 */
class KoreanJapaneseNameStage extends NameStage {
  order = 3;

  best(text, start) {
    for (const surname of this.last) {
      if (!text.startsWith(surname, start)) continue;
      const givenStart = start + surname.length;
      for (const given of this.first) {
        if (text.startsWith(given, givenStart)) {
          return {
            tokens: [nameToken(this, surname + given)],
            unprocessedStart: givenStart + given.length,
            consumed: true,
          };
        }
        if (text.startsWith(given, givenStart + 1)) {
          return {
            tokens: [nameToken(this, `${surname} ${given}`)],
            // +1 accounts for the separator between surname and given name.
            unprocessedStart: givenStart + 1 + given.length,
            consumed: true,
          };
        }
      }
    }
    return noMatch(start);
  }

  all(text) {
    const tokens = [];
    for (const surname of this.last) {
      if (!text.startsWith(surname)) continue;
      for (const given of this.first) {
        if (text.startsWith(given, surname.length + 1)) {
          tokens.push(nameToken(this, `${surname} ${given}`));
        }
        if (text.startsWith(given, surname.length)) {
          tokens.push(nameToken(this, surname + given));
        }
      }
    }
    return { tokens, end: longestTokenLength(tokens) };
  }
}

/**
 * Names in other languages: a `lastName` entry, one separating character,
 * then a `firstName` entry.
 */
class SpacedNameStage extends NameStage {
  order = 4;

  best(text, start) {
    for (const surname of this.last) {
      if (!text.startsWith(surname, start)) continue;
      const givenStart = start + surname.length + 1;
      for (const given of this.first) {
        if (text.startsWith(given, givenStart)) {
          return {
            tokens: [nameToken(this, `${surname} ${given}`)],
            unprocessedStart: givenStart + given.length,
            consumed: true,
          };
        }
      }
    }
    return noMatch(start);
  }

  all(text) {
    const tokens = [];
    for (const surname of this.last) {
      if (!text.startsWith(surname)) continue;
      const givenStart = surname.length + 1;
      for (const given of this.first) {
        if (text.startsWith(given, givenStart)) {
          tokens.push(nameToken(this, `${surname} ${given}`));
        }
      }
    }
    return { tokens, end: longestTokenLength(tokens) };
  }
}

/**
 * Base class for stages driven by a list of `^`-anchored regular expressions.
 *
 * Subclasses provide `id` and `RegexArray` and may override:
 *   - `types`        token type per regex index,
 *   - `groupTypes`   token type per capture-group index,
 *   - `groupSources` token `src` per capture-group index,
 *   - `mainGroup`    capture group that holds the main token text.
 */
class RegexStage {
  skipOwnLastMax = true;
  groupTypes = undefined;
  groupSources = undefined;
  types = undefined;
  mainGroup = 0;

  /** Main token of the first pattern that matches at `start`. */
  best(text, start) {
    const rest = text.slice(start);
    for (let i = 0; i < this.RegexArray.length; i++) {
      const match = this.RegexArray[i].exec(rest);
      if (!match) continue;
      const type = this.types?.[i] || this.groupTypes?.[this.mainGroup] || this.id;
      const txt = match[this.mainGroup];
      return {
        tokens: [{ txt, type, src: this.groupSources?.[this.mainGroup] || type }],
        unprocessedStart: start + txt.length,
        consumed: true,
      };
    }
    return noMatch(start);
  }

  /**
   * Main token plus one token per non-empty capture group of the first
   * pattern that matches the beginning of `text`.
   */
  all(text) {
    let match = null;
    let type;
    for (let i = 0; i < this.RegexArray.length; i++) {
      match = this.RegexArray[i].exec(text);
      if (match) {
        if (this.types?.[i]) type = this.types[i];
        break;
      }
    }
    if (!match) return { tokens: [], end: 0 };
    if (!type) type = this.groupTypes?.[this.mainGroup] || this.id;

    const tokens = [
      { txt: match[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type },
    ];
    // Visit every group: an empty or unmatched group (e.g. a URL without a
    // path) must not hide the groups that follow it.
    for (let group = this.mainGroup + 1; group < match.length; group++) {
      if (!match[group]) continue;
      const groupType = this.groupTypes?.[group];
      tokens.push({
        txt: match[group],
        type: groupType || type,
        src: this.groupSources?.[group] || groupType || `${type}-sub`,
      });
    }
    return { tokens, end: match[0].length };
  }
}

/** CJK numeral characters accepted wherever ASCII digits are. */
const CJK_NUMERALS = "壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九";

/** Multi-character units that may directly follow a number. */
const UNITS = ["公斤", "英里", "千克", "厘米", "毫米", "公里", "小时", "分钟", "折扣", "美元", "人民币", "公顷", "平方米", "平方分米", "平方厘米", "立方厘米", "毫升", "千瓦", "安培", "伏特", "欧姆", "焦耳", "卡路里", "千克力", "牛顿", "帕斯卡", "标准大气压", "毫米汞柱", "摄氏度", "华氏度", "弧度", "角度", "kg", "mg", "km", "cm", "mm", "μm", "nm", "mL", "ml", "min", "°C", "°F", "rad", "deg", "Hz", "kHz", "MHz", "GHz", "bit", "Byte", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB", "lb", "oz", "pound", "pounds"];

/** Single-character units that may directly follow a number. */
const UNIT_SUFFIX_CHARS = "元名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB";

/**
 * Regex alternation of UNITS, longest first. Alternation is ordered choice,
 * so a unit listed before a longer unit it prefixes ("pound"/"pounds",
 * "千克"/"千克力") would otherwise always win and truncate the token.
 */
const UNIT_ALTERNATION = [...UNITS].sort((a, b) => b.length - a.length).join("|");

/** Numbers, ordinals (第…/No…), and numbers followed by a unit. */
class NumberStage extends RegexStage {
  id = "number";
  order = 9;
  priority = 10;
  skipOwnLastMax = true;
  RegexArray = [
    new RegExp(`^(?:第|No)[${CJK_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})`, "i"),
    new RegExp(`^(?:第|No)[${CJK_NUMERALS}0-9]+[${UNIT_SUFFIX_CHARS}]`, "i"),
    new RegExp(`^[${CJK_NUMERALS}0-9]+(?:${UNIT_ALTERNATION})`, "i"),
    new RegExp(`^[0-9${CJK_NUMERALS}]+[${UNIT_SUFFIX_CHARS}]`, "i"),
    /^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i,
    new RegExp(`^[${CJK_NUMERALS}]+`, "i"),
  ];
}

/** Runs of whitespace. */
class SpaceStage extends RegexStage {
  id = "space";
  order = 100;
  priority = 0;
  RegexArray = [/^\s+/];
}

/** Emoji runs (type "emoji") and other non-alphanumeric, non-CJK runs. */
class PunctuationStage extends RegexStage {
  id = "punctuation";
  order = 10;
  priority = 0;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  types = ["emoji", "punctuation"];
  RegexArray = [/^\p{Emoji_Presentation}+/u, /^[^0-9A-Za-z\u4e00-\u9fff\s]+/];
}

/** @mentions and #hashtags. */
class SocialStage {
  id = "social";
  order = 5;
  priority = 10;
  skipOwnLastMax = true;
  nameRe = /^[\p{L}\p{N}_\-]+/u;

  best(text, start) {
    const marker = text[start];
    if (marker !== "@" && marker !== "#") return noMatch(start);
    const nameStart = start + 1;
    const match = this.nameRe.exec(text.slice(nameStart));
    if (!match) return noMatch(start);
    return {
      tokens: [{
        txt: marker + match[0],
        type: marker === "@" ? "mention" : "hashtag",
        src: "social",
      }],
      unprocessedStart: nameStart + match[0].length,
      consumed: true,
    };
  }

  all(text) {
    const { tokens } = this.best(text, 0);
    return { tokens, end: tokens.length > 0 ? tokens[0].txt.length : 0 };
  }
}

/** E-mail addresses; `all()` additionally emits the local part as a word. */
class EmailStage {
  id = "email";
  order = 6;
  priority = 20;
  skipOwnLastMax = true;
  re = /^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  best(text, start) {
    const match = this.re.exec(text.slice(start));
    if (!match) return noMatch(start);
    return {
      tokens: [{ txt: match[0], type: "email", src: "email" }],
      unprocessedStart: start + match[0].length,
      consumed: true,
    };
  }

  all(text) {
    const match = this.re.exec(text);
    if (!match) return { tokens: [], end: 0 };
    return {
      tokens: [
        { txt: match[0], type: "email", src: "email" },
        { txt: match[1], type: "word", src: "email-sub" },
      ],
      end: match[0].length,
    };
  }
}

/** Calendar dates written with 年/月/日 or with -, / or . separators. */
class DateStage extends RegexStage {
  id = "date";
  order = 8;
  priority = 0;
  RegexArray = [
    /^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,
    // `\\s` is required: inside a template literal a bare `\s` collapses to
    // the letter "s" and the pattern would demand literal "s" characters.
    new RegExp(`^([${CJK_NUMERALS}]{4}年)\\s*([${CJK_NUMERALS}]{1,2}月)\\s*([${CJK_NUMERALS}]{1,2}日)`),
    /^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,
    /^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,
    /^(?:\d{4}年|\d{1,2}[月日])/,
    new RegExp(`^(?:[${CJK_NUMERALS}]{4}年|[${CJK_NUMERALS}]{1,2}[月日])`),
  ];
}

/** URLs; `all()` also emits host, path, query string and hash tokens. */
class UrlStage extends RegexStage {
  id = "url";
  order = 7;
  priority = 1e3;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  groupTypes = { 1: "host", 2: "other", 3: "other", 4: "other" };
  groupSources = { 2: "url-path", 3: "url-query-string", 4: "url-hash" };
  // No leading `\s*`: whitespace in front of a URL belongs to the space
  // stage, not to the URL token text.
  RegexArray = [/^(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i];
}

/** IPv6-like and IPv4 addresses with an optional port. */
class IpStage extends RegexStage {
  id = "ip";
  order = 7;
  priority = 100;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  RegexArray = [
    /^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/,
    /^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/,
  ];
}

/**
 * Distinct token texts in first-seen order, dropping whitespace tokens and
 * single-character punctuation tokens.
 */
function uniqueTexts(tokens) {
  const kept = tokens.filter((token) =>
    token.type === "punctuation" ? token.txt.length > 1 : token.type !== "space",
  );
  return Array.from(new Set(kept.map((token) => token.txt)));
}

/** Tokenizer combining dictionary, name and pattern stages. */
class MultilingualTokenizer {
  wordIndex = new WordIndex();
  #stages = [];
  #lexiconNames = new Set();
  #nameLexiconNames = [];
  /** Fallback segmenter for text no stage claimed; null when unsupported. */
  nativeSegmenter =
    typeof Intl !== "undefined" && "Segmenter" in Intl
      ? new Intl.Segmenter("und", { granularity: "word" })
      : null;

  constructor() {
    this.addStage(new DictionaryStage());
    this.addStage(new SocialStage());
    this.addStage(new EmailStage());
    this.addStage(new UrlStage());
    this.addStage(new IpStage());
    this.addStage(new DateStage());
    this.addStage(new NumberStage());
    this.addStage(new PunctuationStage());
    this.addStage(new SpaceStage());
  }

  /** Names of the word dictionaries added so far. */
  get loadedLexiconNames() {
    return [...this.#lexiconNames];
  }

  /** Names (language tags) of the name dictionaries added so far. */
  get loadedNameLexiconNames() {
    return this.#nameLexiconNames;
  }

  /**
   * Adds a word list to the dictionary stage.
   * @param {string[]} words
   * @param {string} name reported as `src` on matching tokens
   * @param {number} [priority=0] tie-breaker between equally long matches
   * @param {string} [lang] reported as `lang` on matching tokens
   */
  addDictionary(words, name, priority = 0, lang) {
    this.#lexiconNames.add(name);
    this.wordIndex.addBatch(
      words.map((word) => ({ word, meta: { name, priority, lang } })),
    );
  }

  /**
   * Adds a person-name stage. The stage flavour is chosen from the language
   * tag: `zh*` -> Chinese, `ko*`/`jp*` -> Korean/Japanese, otherwise spaced.
   * @param {{lastName: string[], firstName: string[], title?: string[]}} lexicon
   * @param {string} lang
   */
  setNameDictionary(lexicon, lang) {
    this.#nameLexiconNames.push(lang);
    let stage;
    if (/^zh/i.test(lang)) {
      stage = new ChineseNameStage(lexicon, lang);
    } else if (/^(ko|jp)/i.test(lang)) {
      stage = new KoreanJapaneseNameStage(lexicon, lang);
    } else {
      stage = new SpacedNameStage(lexicon, lang);
    }
    this.addStage(stage);
  }

  /** Registers a stage and re-sorts by ascending order, descending priority. */
  addStage(stage) {
    this.#stages.push(stage);
    this.#stages.sort((a, b) => a.order - b.order || b.priority - a.priority);
    stage.initialize?.(this);
  }

  /**
   * Greedy single-pass tokenization: at each position the first stage that
   * matches wins. Unclaimed text is split by the native segmenter when one
   * is available.
   * @param {string} text
   * @returns {object[]} tokens carrying `start`/`end` offsets into `text`
   */
  tokenize(text) {
    const tokens = [];
    let pos = 0;
    while (pos < text.length) {
      let advanced = false;
      for (const stage of this.#stages) {
        const result = stage.best(text, pos);
        // A result that does not move forward is ignored; accepting it
        // would repeat the same position forever.
        if (result.tokens.length && result.unprocessedStart > pos) {
          for (const token of result.tokens) {
            tokens.push({ ...token, start: pos, end: result.unprocessedStart });
          }
          pos = result.unprocessedStart;
          advanced = true;
          break;
        }
      }
      if (!advanced) pos++;
    }
    return this.#fillGaps(text, tokens);
  }

  /**
   * Exhaustive tokenization: at each position every eligible stage
   * contributes all of its matches, so overlapping tokens are returned.
   * @param {string} text
   * @returns {object[]} stage tokens interleaved with segmenter tokens
   */
  tokenizeAll(text) {
    const groups = [];
    const lastEndByStage = new Map();
    let coveredEnd = 0;
    let pos = 0;
    while (pos < text.length) {
      const rest = text.slice(pos);
      const found = [];
      let step = 1;
      for (const stage of this.#stages) {
        // End offsets are exclusive: a position equal to an end offset is
        // not covered, so the comparisons are strict.
        if (stage.unprocessedOnly && coveredEnd > pos) continue;
        if (stage.skipOwnLastMax && pos < lastEndByStage.get(stage)) continue;
        const result = stage.all(rest);
        if (!result.end) continue;
        found.push(...result.tokens);
        const end = pos + result.end;
        coveredEnd = Math.max(coveredEnd, end);
        if (stage.skipOwnLastMax) lastEndByStage.set(stage, end);
        if (stage.breakIfProcessed) {
          step = result.end;
          break;
        }
      }
      if (found.length) groups.push([{ start: pos, end: coveredEnd }, found]);
      pos += step;
    }
    return this.#mergeGroups(text, groups);
  }

  /** Distinct, filtered token texts of `tokenize(text)`. */
  tokenizeText(text) {
    return uniqueTexts(this.tokenize(text));
  }

  /** Distinct, filtered token texts of `tokenizeAll(text)`. */
  tokenizeTextAll(text) {
    return uniqueTexts(this.tokenizeAll(text));
  }

  /** Flattens `tokenizeAll` groups, segmenting text between covered ranges. */
  #mergeGroups(text, groups) {
    const merged = [];
    let cursor = 0;
    for (const [range, tokens] of groups) {
      if (cursor < range.start) merged.push(...this.#segmentGap(text, cursor, range.start));
      merged.push(...tokens);
      cursor = range.end;
    }
    if (cursor < text.length) merged.push(...this.#segmentGap(text, cursor, text.length));
    return merged;
  }

  /** Inserts segmenter tokens between the positioned tokens of `tokenize`. */
  #fillGaps(text, tokens) {
    if (!this.nativeSegmenter) return tokens;
    const merged = [];
    let cursor = 0;
    for (const token of tokens) {
      if (cursor < token.start) merged.push(...this.#segmentGap(text, cursor, token.start));
      merged.push(token);
      cursor = token.end;
    }
    if (cursor < text.length) merged.push(...this.#segmentGap(text, cursor, text.length));
    return merged;
  }

  /**
   * Segments `text[from, to)` with the native segmenter. Returns no tokens
   * when the runtime has no `Intl.Segmenter`, instead of throwing.
   */
  #segmentGap(text, from, to) {
    if (!this.nativeSegmenter) return [];
    const tokens = [];
    for (const part of this.nativeSegmenter.segment(text.slice(from, to))) {
      const start = from + part.index;
      tokens.push({
        txt: part.segment,
        type: "word",
        src: "native",
        start,
        end: start + part.segment.length,
      });
    }
    return tokens;
  }
}

export { MultilingualTokenizer };
|