gs-tokenizer 0.1.10 → 0.1.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.cn.md CHANGED
@@ -28,7 +28,7 @@
28
28
 
29
29
  | 模块 | 稳定性 | 速度 | 分词准确性 | 新特性 |
30
30
  |------|--------|------|------------|--------|
31
- | old-core | ✅ 更稳定 | ⚡️ 较慢 | ✅ 更准确 | ❌ 无新特性 |
31
+ | old | ✅ 更稳定 | ⚡️ 较慢 | ✅ 更准确 | ❌ 无新特性 |
32
32
  | core | ⚠️ 较不稳定 | ⚡️ 更快 | ⚠️ 可能不够准确 | ✅ tokenizeAll, 基于Stage的架构 |
33
33
 
34
34
  ## 安装
@@ -131,14 +131,14 @@ const tokens = tokenizer.tokenize(text);
131
131
  const allTokens = tokenizer.tokenizeAll(text);
132
132
  ```
133
133
 
134
- ### 使用Old-Core模块
134
+ ### 使用Old模块
135
135
 
136
136
  ```javascript
137
- import { OldMultilingualTokenizer } from 'gs-tokenizer/old-core';
137
+ import { OldMultilingualTokenizer } from 'gs-tokenizer/old';
138
138
 
139
139
  const tokenizer = new OldMultilingualTokenizer();
140
140
 
141
- // 分词文本 (old-core更稳定但速度较慢)
141
+ // 分词文本 (old更稳定但速度较慢)
142
142
  const text = '我爱北京天安门';
143
143
  const tokens = tokenizer.tokenize(text);
144
144
  ```
package/README.ja.md CHANGED
@@ -28,7 +28,7 @@
28
28
 
29
29
  | モジュール | 安定性 | 速度 | トークン化精度 | 新機能 |
30
30
  |----------|--------|------|----------------|--------|
31
- | old-core | ✅ より安定 | ⚡️ より遅い | ✅ より正確 | ❌ 新機能なし |
31
+ | old | ✅ より安定 | ⚡️ より遅い | ✅ より正確 | ❌ 新機能なし |
32
32
  | core | ⚠️ 安定性が低い | ⚡️ より速い | ⚠️ 精度が低い可能性あり | ✅ tokenizeAll、ステージベースのアーキテクチャ |
33
33
 
34
34
  ## インストール
@@ -131,14 +131,14 @@ const tokens = tokenizer.tokenize(text);
131
131
  const allTokens = tokenizer.tokenizeAll(text);
132
132
  ```
133
133
 
134
- ### Old-Coreモジュールの使用
134
+ ### Oldモジュールの使用
135
135
 
136
136
  ```javascript
137
- import { OldMultilingualTokenizer } from 'gs-tokenizer/old-core';
137
+ import { OldMultilingualTokenizer } from 'gs-tokenizer/old';
138
138
 
139
139
  const tokenizer = new OldMultilingualTokenizer();
140
140
 
141
- // テキストをトークン化 (old-coreはより安定だが速度が遅い)
141
+ // テキストをトークン化 (oldはより安定だが速度が遅い)
142
142
  const text = '私は北京の天安門が好きです';
143
143
  const tokens = tokenizer.tokenize(text);
144
144
  ```
package/README.ko.md CHANGED
@@ -28,7 +28,7 @@
28
28
 
29
29
  | 모듈 | 안정성 | 속도 | 토큰화 정확도 | 새로운 기능 |
30
30
  |------|--------|------|--------------|------------|
31
- | old-core | ✅ 보다 안정 | ⚡️ 보다 느림 | ✅ 보다 정확 | ❌ 새로운 기능 없음 |
31
+ | old | ✅ 보다 안정 | ⚡️ 보다 느림 | ✅ 보다 정확 | ❌ 새로운 기능 없음 |
32
32
  | core | ⚠️ 보다 불안정 | ⚡️ 보다 빠름 | ⚠️ 정확도가 낮을 수 있음 | ✅ tokenizeAll, 스테이지 기반 아키텍처 |
33
33
 
34
34
  ## 설치
@@ -131,14 +131,14 @@ const tokens = tokenizer.tokenize(text);
131
131
  const allTokens = tokenizer.tokenizeAll(text);
132
132
  ```
133
133
 
134
- ### Old-Core 모듈 사용
134
+ ### Old 모듈 사용
135
135
 
136
136
  ```javascript
137
- import { OldMultilingualTokenizer } from 'gs-tokenizer/old-core';
137
+ import { OldMultilingualTokenizer } from 'gs-tokenizer/old';
138
138
 
139
139
  const tokenizer = new OldMultilingualTokenizer();
140
140
 
141
- // 텍스트 토큰화 (old-core가 더 안정적이지만 더 느림)
141
+ // 텍스트 토큰화 (old가 더 안정적이지만 더 느림)
142
142
  const text = '나는 북경 천안문을 좋아합니다';
143
143
  const tokens = tokenizer.tokenize(text);
144
144
  ```
package/README.md CHANGED
@@ -28,7 +28,7 @@ A powerful and lightweight multilingual tokenizer library that provides natural
28
28
 
29
29
  | Module | Stability | Speed | Tokenization Accuracy | New Features |
30
30
  |--------|-----------|-------|-----------------------|--------------|
31
- | old-core | ✅ More stable | ⚡️ Slower | ✅ More accurate | ❌ No new features |
31
+ | old | ✅ More stable | ⚡️ Slower | ✅ More accurate | ❌ No new features |
32
32
  | core | ⚠️ Less stable | ⚡️ Faster | ⚠️ May be less accurate | ✅ tokenizeAll, Stage-based architecture |
33
33
 
34
34
  ## Installation
@@ -131,14 +131,14 @@ const tokens = tokenizer.tokenize(text);
131
131
  const allTokens = tokenizer.tokenizeAll(text);
132
132
  ```
133
133
 
134
- ### Using Old-Core Module
134
+ ### Using Old Module
135
135
 
136
136
  ```javascript
137
- import { OldMultilingualTokenizer } from 'gs-tokenizer/old-core';
137
+ import { OldMultilingualTokenizer } from 'gs-tokenizer/old';
138
138
 
139
139
  const tokenizer = new OldMultilingualTokenizer();
140
140
 
141
- // Tokenize text (old-core is more stable but slower)
141
+ // Tokenize text (old is more stable but slower)
142
142
  const text = '我爱北京天安门';
143
143
  const tokens = tokenizer.tokenize(text);
144
144
  ```
package/lib/core.cjs CHANGED
@@ -1 +1 @@
1
- "use strict";class t{#t=new Map;#e=new Map;#s=new Map;add(t,e){const s=t[0];if(this.#s.has(s))throw new Error(`FirstCharWordIndex: add word ${t} with meta ${e.name} failed, because it has been added before`);this.#t.set(t,e);const n=t.length;let r=this.#e.get(s);r||(r=new Map,this.#e.set(s,r));let i=r.get(n);i||(i=[],r.set(n,i)),i.push(t)}getLenCache(t){if(this.#s.has(t))return this.#s.get(t);if(!this.#e.has(t))return this.#s.set(t,[]),[];const e=Array.from(this.#e.get(t));return this.#e.delete(t),this.#s.set(t,e.sort((t,e)=>e[0]-t[0])),e}match(t,e){const s=t[e],n=this.getLenCache(s);if(!n)return[];const r=[];for(const[s,i]of n){const n=e+s;if(n>t.length)continue;const o=t.slice(e,n);for(const t of i)t===o&&r.push({word:t,meta:this.#t.get(t)})}return r}matches(t){const e=[],s=t[0],n=this.getLenCache(s);if(!n)return[];for(const[,s]of n)for(const n of s)if(t.startsWith(n)){const t=this.#t.get(n);e.push({txt:n,type:"word",lang:t.lang,src:t.name})}return e}}class e{id="dictionary";order=1;priority=0;index;initialize(t){this.index=t.wordIndex}best(t,e){const s=this.index.match(t,e);if(!s.length)return{tokens:[],unprocessedStart:e,consumed:!1};const n=s.sort((t,e)=>e.word.length!==t.word.length?e.word.length-t.word.length:e.meta.priority-t.meta.priority)[0];return{tokens:[{txt:n.word,type:"word",lang:n.meta.lang,src:n.meta.name}],unprocessedStart:e+n.word.length,consumed:!0}}all(t){return this.index.matches(t)}}class s{lang;id;order=2;priority=0;last;first;title;constructor(t,e){this.lang=e,this.last=t.lastName,this.first=t.firstName,this.title=t.title,this.id=`name,${this.lang}`}}class n extends s{best(t,e){let s=e,n="";const r=t[e];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;const i=s+r.length;for(const e of this.first)if(t.startsWith(e,i))return{tokens:[{txt:n+r+e,type:"name",lang:this.lang,src:this.id}],unprocessedStart:i+e.length,consumed:!0};if(s){const 
t=n+r;return{tokens:[{txt:t,type:"name",lang:this.lang,src:this.id}],unprocessedStart:e+t.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];let s=0,n="";const r=t[0];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;if(s){const t=n+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}const i=s+r.length;for(const s of this.first){if(!t.startsWith(s,i))continue;const o=n+r+s;e.push({txt:o,type:"name",lang:this.lang,src:this.id})}}return e}}class r extends s{order=3;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length;for(const s of this.first){if(t.startsWith(s,n))return{tokens:[{txt:e+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0};if(t.startsWith(s,n+1))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(t.startsWith(r,n)){const t=s+" "+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}if(t.startsWith(r,s.length)){const t=s+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}}}return e}}class i extends s{order=4;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length+1;for(const s of this.first)if(t.startsWith(s,n))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(!t.startsWith(r,n))continue;const i=s+" "+r;e.push({txt:i,type:"name",lang:this.lang,src:this.id})}}return e}}class o{skipOwnLastMax=!0;types=void 0;best(t,e){const s=t.slice(e);for(let t=0;t<this.RegexArray.length;t++){const 
n=this.RegexArray[t].exec(s);if(n){const s=this.types?.[t]||this.id;return{tokens:[{txt:n[0],type:s,src:s}],unprocessedStart:e+n[0].length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){let e=null,s=this.id;for(let n=0;n<this.RegexArray.length;n++)if(e=this.RegexArray[n].exec(t),e){this.types?.[n]&&(s=this.types[n]);break}if(!e)return[];const n={txt:e[0],type:s,src:s};if(!e[1])return[n];const r=[n];for(let t=1;e[t];t++)r.push({txt:e[t],type:s,src:`${s}-sub`});return r}}const a="壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九",c=["公斤","英里","千克","厘米","毫米","公里","小时","分钟","折扣","美元","人民币","公顷","平方米","平方分米","平方厘米","立方厘米","毫升","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛顿","帕斯卡","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","mg","km","cm","mm","μm","nm","mL","ml","min","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"];class h extends o{id="number";order=9;priority=10;skipOwnLastMax=!0;RegexArray=[new RegExp(`^(?:第|No)?[${a}0-9]+(?:${c.join("|")})?`,"i"),new RegExp(`^(?:第|No)?[${a}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`,"i"),/^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i]}class d extends o{id="space";order=100;priority=0;RegexArray=[/^\s+/]}class l extends o{id="punctuation";order=10;priority=0;types=["emoji","punctuation"];RegexArray=[/^\p{Emoji_Presentation}+/u,/^[^0-9A-Za-z\u4e00-\u9fff\s]+/]}class u{id="social";order=5;priority=10;skipOwnLastMax=!0;nameRe=/^[\p{L}\p{N}_\-]+/u;best(t,e){const s=t[e];if("@"!==s&&"#"!==s)return{tokens:[],unprocessedStart:e,consumed:!1};let n=e+1;const r=this.nameRe.exec(t.slice(n));return r?{tokens:[{txt:s+r[0],type:"@"===s?"mention":"hashtag",src:"social"}],unprocessedStart:n+r[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){return this.best(t,0).tokens}}class p{id="email";order=6;priority=20;skipOwnLastMax=!0;re=/^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;best(t,e){const s=this.re.exec(t.slice(e));return 
s?{tokens:[{txt:s[0],type:"email",src:"email"}],unprocessedStart:e+s[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=this.re.exec(t);return e?[{txt:e[0],type:"email",src:"email"},{txt:e[1],type:"word",src:"email-sub"}]:[]}}class g{static IPV4=/^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/;static IPV6=/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/;static HOST=/^(?:https?|ftp\/\/)?(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?/;id="host-ip";order=7;priority=10;skipOwnLastMax=!0;#n=/[^a-zA-Z0-9-]+/;best(t,e){const s=t.slice(e);let n,r=null;if(n=g.HOST.exec(s),n&&(r="host"),n||(n=g.IPV6.exec(s),n&&(r="ip")),n||(n=g.IPV4.exec(s),n&&(r="ip")),!n)return{tokens:[],unprocessedStart:e,consumed:!1};let i=n[0];return i=i.replace(/^https\/\//,""),"ip"===r&&i.startsWith("[")&&(i=i.slice(1,i.indexOf("]"))+i.slice(i.indexOf("]")+1)),{tokens:[{txt:i,type:r}],unprocessedStart:e+n[0].length,consumed:!0}}all(t){let e,s;if(e=g.HOST.exec(t),e&&(s="host"),e||(e=g.IPV6.exec(t),e&&(s="ip")),e||(e=g.IPV4.exec(t),e&&(s="ip")),!e)return[];const n=e[0],r={txt:n,type:s,src:s};return this.#n.test(n)?[r,...n.split(this.#n).map(t=>({txt:t,type:"word",src:`${s}-sub`}))]:[r]}}class f extends o{id="date";order=8;priority=0;RegexArray=[/^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,new RegExp(`^([${a}]{4}年)s*([${a}]{1,2}月)s*([${a}]{1,2}日)`),/^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,/^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,/^(?:\d{4}年|d{1,2}[月日])/,new RegExp(`^(?:[${a}]{4}年|[${a}]{1,2}[月日])`)]}exports.MultilingualTokenizer=class{wordIndex=new t;#r=[];#i=new Set;#o=[];nativeSegmenter=typeof Intl<"u"&&"Segmenter"in Intl?new Intl.Segmenter("und",{granularity:"word"}):null;constructor(){this.addStage(new e),this.addStage(new u),this.addStage(new p),this.addStage(new g),this.addStage(new f),this.addStage(new h),this.addStage(new l),this.addStage(new d)}get loadedLexiconNames(){return[...this.#i]}get loadedNameLexiconNames(){return 
this.#o}addDictionary(t,e,s=0,n){this.#i.add(e);for(const r of t)this.wordIndex.add(r,{name:e,priority:s,lang:n})}setNameDictionary(t,e){this.#o.push(e),/^zh/i.test(e)?this.addStage(new n(t,e)):/^(ko|jp)/i.test(e)?this.addStage(new r(t,e)):this.addStage(new i(t,e))}addStage(t){this.#r.push(t),this.#r.sort((t,e)=>t.order-e.order||e.priority-t.priority),t.initialize?.(this)}tokenize(t){const e=[],s=t.length;let n=0;for(;n<s;){let s=!1;for(const r of this.#r){const i=r.best(t,n);if(i.tokens.length){for(const t of i.tokens)e.push({...t,start:n,end:i.unprocessedStart});n=i.unprocessedStart,s=!0;break}}s||n++}return this.#a(t,e)}tokenizeAll(t){let e=0;const s=[],n=new Map;for(;e<t.length;){const r=t.slice(e),i=[];for(const t of this.#r){if(!t.skipOwnLastMax){i.push(...t.all(r));continue}if(e<=n.get(t))continue;const s=t.all(r);if(!s.length)continue;i.push(...s);let o=0;for(const t of s)t.txt.length>o&&(o=t.txt.length);n.set(t,e+o)}if(i.length){let t=0;for(const e of i)e.txt.length>t&&(t=e.txt.length);const n={start:e,end:e+t};s.push([n,i])}e++}return this.#c(t,s)}tokenizeText(t){return this.tokenize(t).map(t=>t.txt)}tokenizeTextAll(t){return this.tokenizeAll(t).filter(t=>"punctuation"===t.type&&t.txt.length>1||"space"!==t.type).map(t=>t.txt)}#c(t,e){const s=[];let n=0;if(e.length)for(const[r,i]of e)n<r.start&&s.push(...this.#h(t,n,r.start)),s.push(...i),n=r.end;return n<t.length&&s.push(...this.#h(t,n,t.length)),s}#a(t,e){if(!this.nativeSegmenter)return e;const s=[];let n=0;for(const r of e)n<r.start&&s.push(...this.#h(t,n,r.start)),s.push(r),n=r.end;return n<t.length&&s.push(...this.#h(t,n,t.length)),s}#h(t,e,s){const n=t.slice(e,s),r=[];for(const t of this.nativeSegmenter.segment(n)){const s=e+t.index,n=s+t.segment.length;r.push({txt:t.segment,type:"word",src:"native",start:s,end:n})}return r}};
1
+ "use strict";class t{#t=new Map;#e=new Map;add(t,e){const s=t[0];this.#t.set(t,e);const n=t.length;let r=this.#e.get(s);r||(r=new Map,this.#e.set(s,r));let i=r.get(n);i||(i=[],r.set(n,i)),i.push(t)}match(t,e){const s=t[e],n=this.#e.get(s);if(!n)return[];const r=[];for(const[s,i]of n){const n=e+s;if(n>t.length)continue;const o=t.slice(e,n);for(const t of i)t===o&&r.push({word:t,meta:this.#t.get(t)})}return r}matches(t){const e=[],s=t[0],n=this.#e.get(s);if(!n)return[];for(const[,s]of n)for(const n of s)if(t.startsWith(n)){const t=this.#t.get(n);e.push({txt:n,type:"word",lang:t.lang,src:t.name})}return e}}class e{id="dictionary";order=1;priority=0;index;initialize(t){this.index=t.wordIndex}best(t,e){const s=this.index.match(t,e);if(!s.length)return{tokens:[],unprocessedStart:e,consumed:!1};const n=s.sort((t,e)=>e.word.length!==t.word.length?e.word.length-t.word.length:e.meta.priority-t.meta.priority)[0];return{tokens:[{txt:n.word,type:"word",lang:n.meta.lang,src:n.meta.name}],unprocessedStart:e+n.word.length,consumed:!0}}all(t){const e=this.index.matches(t);return{tokens:e,end:e.length>0?e[0].txt.length:0}}}class s{lang;id;order=2;priority=0;last;first;title;constructor(t,e){this.lang=e,this.last=t.lastName,this.first=t.firstName,this.title=t.title,this.id=`name,${this.lang}`}}class n extends s{priority=1e3;best(t,e){let s=e,n="";const r=t[e];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;const i=s+r.length;for(const e of this.first)if(t.startsWith(e,i))return{tokens:[{txt:n+r+e,type:"name",lang:this.lang,src:this.id}],unprocessedStart:i+e.length,consumed:!0};if(s){const t=n+r;return{tokens:[{txt:t,type:"name",lang:this.lang,src:this.id}],unprocessedStart:e+t.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];let s=0,n="";const r=t[0];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;if(s){const 
t=n+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}const i=s+r.length;for(const s of this.first){if(!t.startsWith(s,i))continue;const o=n+r+s;e.push({txt:o,type:"name",lang:this.lang,src:this.id})}}const i=e.length>0?Math.max(...e.map(t=>t.txt.length)):0;return{tokens:e,end:i}}}class r extends s{order=3;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length;for(const s of this.first){if(t.startsWith(s,n))return{tokens:[{txt:e+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0};if(t.startsWith(s,n+1))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(t.startsWith(r,n)){const t=s+" "+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}if(t.startsWith(r,s.length)){const t=s+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}}}const s=e.length>0?Math.max(...e.map(t=>t.txt.length)):0;return{tokens:e,end:s}}}class i extends s{order=4;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length+1;for(const s of this.first)if(t.startsWith(s,n))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(!t.startsWith(r,n))continue;const i=s+" "+r;e.push({txt:i,type:"name",lang:this.lang,src:this.id})}}const s=e.length>0?Math.max(...e.map(t=>t.txt.length)):0;return{tokens:e,end:s}}}class o{skipOwnLastMax=!0;groupTypes=void 0;groupSources=void 0;types=void 0;mainGroup=0;best(t,e){const s=t.slice(e);for(let t=0;t<this.RegexArray.length;t++){const n=this.RegexArray[t].exec(s);if(n){const 
s=this.types?.[t]||this.groupTypes?.[this.mainGroup]||this.id;return{tokens:[{txt:n[this.mainGroup],type:s,src:this.groupSources?.[this.mainGroup]||s}],unprocessedStart:e+n[this.mainGroup].length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){let e,s=null;for(let n=0;n<this.RegexArray.length;n++)if(s=this.RegexArray[n].exec(t),s){this.types?.[n]&&(e=this.types[n]);break}if(e||(e=this.groupTypes?.[this.mainGroup]||this.id),!s)return{tokens:[],end:0};const n={txt:s[this.mainGroup],type:e,src:this.groupSources?.[this.mainGroup]||e};let r;if(s[this.mainGroup+1]){r=[n];for(let t=this.mainGroup+1;s[t];t++){const n=this.groupTypes?.[t];r.push({txt:s[t],type:n||e,src:this.groupSources?.[t]||n||`${e}-sub`})}}else r=[n];return{tokens:r,end:s[0].length}}}const a="壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九",h=["公斤","英里","千克","厘米","毫米","公里","小时","分钟","折扣","美元","人民币","公顷","平方米","平方分米","平方厘米","立方厘米","毫升","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛顿","帕斯卡","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","mg","km","cm","mm","μm","nm","mL","ml","min","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"];class c extends o{id="number";order=9;priority=10;skipOwnLastMax=!0;RegexArray=[new RegExp(`^(?:第|No)?[${a}0-9]+(?:${h.join("|")})?`,"i"),new RegExp(`^(?:第|No)?[${a}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`,"i"),/^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i]}class d extends o{id="space";order=100;priority=0;RegexArray=[/^\s+/]}class l extends o{id="punctuation";order=10;priority=0;skipOwnLastMax=!0;breakIfProcessed=!0;types=["emoji","punctuation"];RegexArray=[/^\p{Emoji_Presentation}+/u,/^[^0-9A-Za-z\u4e00-\u9fff\s]+/]}class u{id="social";order=5;priority=10;skipOwnLastMax=!0;nameRe=/^[\p{L}\p{N}_\-]+/u;best(t,e){const s=t[e];if("@"!==s&&"#"!==s)return{tokens:[],unprocessedStart:e,consumed:!1};let n=e+1;const r=this.nameRe.exec(t.slice(n));return 
r?{tokens:[{txt:s+r[0],type:"@"===s?"mention":"hashtag",src:"social"}],unprocessedStart:n+r[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=this.best(t,0);return{tokens:e.tokens,end:e.tokens.length>0?e.tokens[0].txt.length:0}}}class p{id="email";order=6;priority=20;skipOwnLastMax=!0;re=/^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;best(t,e){const s=this.re.exec(t.slice(e));return s?{tokens:[{txt:s[0],type:"email",src:"email"}],unprocessedStart:e+s[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=this.re.exec(t);return e?{tokens:[{txt:e[0],type:"email",src:"email"},{txt:e[1],type:"word",src:"email-sub"}],end:e[0].length}:{tokens:[],end:0}}}class g extends o{id="date";order=8;priority=0;RegexArray=[/^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,new RegExp(`^([${a}]{4}年)s*([${a}]{1,2}月)s*([${a}]{1,2}日)`),/^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,/^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,/^(?:\d{4}年|d{1,2}[月日])/,new RegExp(`^(?:[${a}]{4}年|[${a}]{1,2}[月日])`)]}class f extends o{id="url";order=7;priority=1e3;skipOwnLastMax=!0;breakIfProcessed=!0;groupTypes={1:"host",2:"other",3:"other",4:"other"};groupSources={2:"url-path",3:"url-query-string",4:"url-hash"};RegexArray=[/^\s*(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i]}class m extends o{id="ip";order=7;priority=100;skipOwnLastMax=!0;breakIfProcessed=!0;RegexArray=[/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/,/^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/]}exports.MultilingualTokenizer=class{wordIndex=new t;#s=[];#n=new Set;#r=[];nativeSegmenter=typeof Intl<"u"&&"Segmenter"in Intl?new Intl.Segmenter("und",{granularity:"word"}):null;constructor(){this.addStage(new e),this.addStage(new u),this.addStage(new p),this.addStage(new f),this.addStage(new m),this.addStage(new g),this.addStage(new c),this.addStage(new l),this.addStage(new d)}get 
loadedLexiconNames(){return[...this.#n]}get loadedNameLexiconNames(){return this.#r}addDictionary(t,e,s=0,n){this.#n.add(e);for(const r of t)this.wordIndex.add(r,{name:e,priority:s,lang:n})}setNameDictionary(t,e){this.#r.push(e),/^zh/i.test(e)?this.addStage(new n(t,e)):/^(ko|jp)/i.test(e)?this.addStage(new r(t,e)):this.addStage(new i(t,e))}addStage(t){this.#s.push(t),this.#s.sort((t,e)=>t.order-e.order||e.priority-t.priority),t.initialize?.(this)}tokenize(t){const e=[],s=t.length;let n=0;for(;n<s;){let s=!1;for(const r of this.#s){const i=r.best(t,n);if(i.tokens.length){for(const t of i.tokens)e.push({...t,start:n,end:i.unprocessedStart});n=i.unprocessedStart,s=!0;break}}s||n++}return this.#i(t,e)}tokenizeAll(t){let e=0;const s=[],n=new Map;let r=0;for(;e<t.length;){const i=t.slice(e),o=[];let a=1;for(const t of this.#s){if(r>=e&&t.unprocessedOnly||t.skipOwnLastMax&&e<=n.get(t))continue;const s=t.all(i);if(!s.end)continue;o.push(...s.tokens);let h=s.end;if(r=Math.max(r,e+h),t.skipOwnLastMax&&n.set(t,e+h),t.breakIfProcessed){a=h;break}}if(o.length){const t={start:e,end:r};s.push([t,o])}e+=a}return this.#o(t,s)}tokenizeText(t){return this.tokenize(t).map(t=>t.txt)}tokenizeTextAll(t){return this.tokenizeAll(t).filter(t=>"punctuation"===t.type&&t.txt.length>1||"space"!==t.type).map(t=>t.txt)}#o(t,e){const s=[];let n=0;if(e.length)for(const[r,i]of e)n<r.start&&s.push(...this.#a(t,n,r.start)),s.push(...i),n=r.end;return n<t.length&&s.push(...this.#a(t,n,t.length)),s}#i(t,e){if(!this.nativeSegmenter)return e;const s=[];let n=0;for(const r of e)n<r.start&&s.push(...this.#a(t,n,r.start)),s.push(r),n=r.end;return n<t.length&&s.push(...this.#a(t,n,t.length)),s}#a(t,e,s){const n=t.slice(e,s),r=[];for(const t of this.nativeSegmenter.segment(n)){const s=e+t.index,n=s+t.segment.length;r.push({txt:t.segment,type:"word",src:"native",start:s,end:n})}return r}};
package/lib/core.d.ts CHANGED
@@ -3,7 +3,6 @@ import { IWordIndex, LexiconMeta, IWordMatch, IToken, IMultilingualTokenizer, Su
3
3
  declare class FirstCharWordIndex implements IWordIndex {
4
4
  #private;
5
5
  add(word: string, meta: LexiconMeta): void;
6
- getLenCache(ch: string): [number, string[]][];
7
6
  match(text: string, pos: number): IWordMatch[];
8
7
  matches(text: string): IToken[];
9
8
  }
package/lib/core.js CHANGED
@@ -1 +1 @@
1
- class t{#t=new Map;#e=new Map;#s=new Map;add(t,e){const s=t[0];if(this.#s.has(s))throw new Error(`FirstCharWordIndex: add word ${t} with meta ${e.name} failed, because it has been added before`);this.#t.set(t,e);const n=t.length;let r=this.#e.get(s);r||(r=new Map,this.#e.set(s,r));let i=r.get(n);i||(i=[],r.set(n,i)),i.push(t)}getLenCache(t){if(this.#s.has(t))return this.#s.get(t);if(!this.#e.has(t))return this.#s.set(t,[]),[];const e=Array.from(this.#e.get(t));return this.#e.delete(t),this.#s.set(t,e.sort((t,e)=>e[0]-t[0])),e}match(t,e){const s=t[e],n=this.getLenCache(s);if(!n)return[];const r=[];for(const[s,i]of n){const n=e+s;if(n>t.length)continue;const o=t.slice(e,n);for(const t of i)t===o&&r.push({word:t,meta:this.#t.get(t)})}return r}matches(t){const e=[],s=t[0],n=this.getLenCache(s);if(!n)return[];for(const[,s]of n)for(const n of s)if(t.startsWith(n)){const t=this.#t.get(n);e.push({txt:n,type:"word",lang:t.lang,src:t.name})}return e}}class e{id="dictionary";order=1;priority=0;index;initialize(t){this.index=t.wordIndex}best(t,e){const s=this.index.match(t,e);if(!s.length)return{tokens:[],unprocessedStart:e,consumed:!1};const n=s.sort((t,e)=>e.word.length!==t.word.length?e.word.length-t.word.length:e.meta.priority-t.meta.priority)[0];return{tokens:[{txt:n.word,type:"word",lang:n.meta.lang,src:n.meta.name}],unprocessedStart:e+n.word.length,consumed:!0}}all(t){return this.index.matches(t)}}class s{lang;id;order=2;priority=0;last;first;title;constructor(t,e){this.lang=e,this.last=t.lastName,this.first=t.firstName,this.title=t.title,this.id=`name,${this.lang}`}}class n extends s{best(t,e){let s=e,n="";const r=t[e];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;const i=s+r.length;for(const e of this.first)if(t.startsWith(e,i))return{tokens:[{txt:n+r+e,type:"name",lang:this.lang,src:this.id}],unprocessedStart:i+e.length,consumed:!0};if(s){const 
t=n+r;return{tokens:[{txt:t,type:"name",lang:this.lang,src:this.id}],unprocessedStart:e+t.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];let s=0,n="";const r=t[0];("老"===r||"小"===r)&&(n=r,s++);for(const r of this.last){if(!t.startsWith(r,s))continue;if(s){const t=n+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}const i=s+r.length;for(const s of this.first){if(!t.startsWith(s,i))continue;const o=n+r+s;e.push({txt:o,type:"name",lang:this.lang,src:this.id})}}return e}}class r extends s{order=3;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length;for(const s of this.first){if(t.startsWith(s,n))return{tokens:[{txt:e+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0};if(t.startsWith(s,n+1))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(t.startsWith(r,n)){const t=s+" "+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}if(t.startsWith(r,s.length)){const t=s+r;e.push({txt:t,type:"name",lang:this.lang,src:this.id})}}}return e}}class i extends s{order=4;best(t,e){let s=e;for(const e of this.last){if(!t.startsWith(e,s))continue;const n=s+e.length+1;for(const s of this.first)if(t.startsWith(s,n))return{tokens:[{txt:e+" "+s,type:"name",lang:this.lang,src:this.id}],unprocessedStart:n+s.length,consumed:!0}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=[];for(const s of this.last){if(!t.startsWith(s))continue;const n=s.length+1;for(const r of this.first){if(!t.startsWith(r,n))continue;const i=s+" "+r;e.push({txt:i,type:"name",lang:this.lang,src:this.id})}}return e}}class o{skipOwnLastMax=!0;types=void 0;best(t,e){const s=t.slice(e);for(let t=0;t<this.RegexArray.length;t++){const 
n=this.RegexArray[t].exec(s);if(n){const s=this.types?.[t]||this.id;return{tokens:[{txt:n[0],type:s,src:s}],unprocessedStart:e+n[0].length,consumed:!0}}}return{tokens:[],unprocessedStart:e,consumed:!1}}all(t){let e=null,s=this.id;for(let n=0;n<this.RegexArray.length;n++)if(e=this.RegexArray[n].exec(t),e){this.types?.[n]&&(s=this.types[n]);break}if(!e)return[];const n={txt:e[0],type:s,src:s};if(!e[1])return[n];const r=[n];for(let t=1;e[t];t++)r.push({txt:e[t],type:s,src:`${s}-sub`});return r}}const a="壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九",c=["公斤","英里","千克","厘米","毫米","公里","小时","分钟","折扣","美元","人民币","公顷","平方米","平方分米","平方厘米","立方厘米","毫升","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛顿","帕斯卡","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","mg","km","cm","mm","μm","nm","mL","ml","min","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"];class h extends o{id="number";order=9;priority=10;skipOwnLastMax=!0;RegexArray=[new RegExp(`^(?:第|No)?[${a}0-9]+(?:${c.join("|")})?`,"i"),new RegExp(`^(?:第|No)?[${a}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`,"i"),/^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i]}class d extends o{id="space";order=100;priority=0;RegexArray=[/^\s+/]}class l extends o{id="punctuation";order=10;priority=0;types=["emoji","punctuation"];RegexArray=[/^\p{Emoji_Presentation}+/u,/^[^0-9A-Za-z\u4e00-\u9fff\s]+/]}class u{id="social";order=5;priority=10;skipOwnLastMax=!0;nameRe=/^[\p{L}\p{N}_\-]+/u;best(t,e){const s=t[e];if("@"!==s&&"#"!==s)return{tokens:[],unprocessedStart:e,consumed:!1};let n=e+1;const r=this.nameRe.exec(t.slice(n));return r?{tokens:[{txt:s+r[0],type:"@"===s?"mention":"hashtag",src:"social"}],unprocessedStart:n+r[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){return this.best(t,0).tokens}}class p{id="email";order=6;priority=20;skipOwnLastMax=!0;re=/^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;best(t,e){const s=this.re.exec(t.slice(e));return 
s?{tokens:[{txt:s[0],type:"email",src:"email"}],unprocessedStart:e+s[0].length,consumed:!0}:{tokens:[],unprocessedStart:e,consumed:!1}}all(t){const e=this.re.exec(t);return e?[{txt:e[0],type:"email",src:"email"},{txt:e[1],type:"word",src:"email-sub"}]:[]}}class g{static IPV4=/^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/;static IPV6=/^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/;static HOST=/^(?:https?|ftp\/\/)?(?:(?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?/;id="host-ip";order=7;priority=10;skipOwnLastMax=!0;#n=/[^a-zA-Z0-9-]+/;best(t,e){const s=t.slice(e);let n,r=null;if(n=g.HOST.exec(s),n&&(r="host"),n||(n=g.IPV6.exec(s),n&&(r="ip")),n||(n=g.IPV4.exec(s),n&&(r="ip")),!n)return{tokens:[],unprocessedStart:e,consumed:!1};let i=n[0];return i=i.replace(/^https\/\//,""),"ip"===r&&i.startsWith("[")&&(i=i.slice(1,i.indexOf("]"))+i.slice(i.indexOf("]")+1)),{tokens:[{txt:i,type:r}],unprocessedStart:e+n[0].length,consumed:!0}}all(t){let e,s;if(e=g.HOST.exec(t),e&&(s="host"),e||(e=g.IPV6.exec(t),e&&(s="ip")),e||(e=g.IPV4.exec(t),e&&(s="ip")),!e)return[];const n=e[0],r={txt:n,type:s,src:s};return this.#n.test(n)?[r,...n.split(this.#n).map(t=>({txt:t,type:"word",src:`${s}-sub`}))]:[r]}}class f extends o{id="date";order=8;priority=0;RegexArray=[/^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,new RegExp(`^([${a}]{4}年)s*([${a}]{1,2}月)s*([${a}]{1,2}日)`),/^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,/^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,/^(?:\d{4}年|d{1,2}[月日])/,new RegExp(`^(?:[${a}]{4}年|[${a}]{1,2}[月日])`)]}class m{wordIndex=new t;#r=[];#i=new Set;#o=[];nativeSegmenter=typeof Intl<"u"&&"Segmenter"in Intl?new Intl.Segmenter("und",{granularity:"word"}):null;constructor(){this.addStage(new e),this.addStage(new u),this.addStage(new p),this.addStage(new g),this.addStage(new f),this.addStage(new h),this.addStage(new l),this.addStage(new d)}get loadedLexiconNames(){return[...this.#i]}get loadedNameLexiconNames(){return 
this.#o}addDictionary(t,e,s=0,n){this.#i.add(e);for(const r of t)this.wordIndex.add(r,{name:e,priority:s,lang:n})}setNameDictionary(t,e){this.#o.push(e),/^zh/i.test(e)?this.addStage(new n(t,e)):/^(ko|jp)/i.test(e)?this.addStage(new r(t,e)):this.addStage(new i(t,e))}addStage(t){this.#r.push(t),this.#r.sort((t,e)=>t.order-e.order||e.priority-t.priority),t.initialize?.(this)}tokenize(t){const e=[],s=t.length;let n=0;for(;n<s;){let s=!1;for(const r of this.#r){const i=r.best(t,n);if(i.tokens.length){for(const t of i.tokens)e.push({...t,start:n,end:i.unprocessedStart});n=i.unprocessedStart,s=!0;break}}s||n++}return this.#a(t,e)}tokenizeAll(t){let e=0;const s=[],n=new Map;for(;e<t.length;){const r=t.slice(e),i=[];for(const t of this.#r){if(!t.skipOwnLastMax){i.push(...t.all(r));continue}if(e<=n.get(t))continue;const s=t.all(r);if(!s.length)continue;i.push(...s);let o=0;for(const t of s)t.txt.length>o&&(o=t.txt.length);n.set(t,e+o)}if(i.length){let t=0;for(const e of i)e.txt.length>t&&(t=e.txt.length);const n={start:e,end:e+t};s.push([n,i])}e++}return this.#c(t,s)}tokenizeText(t){return this.tokenize(t).map(t=>t.txt)}tokenizeTextAll(t){return this.tokenizeAll(t).filter(t=>"punctuation"===t.type&&t.txt.length>1||"space"!==t.type).map(t=>t.txt)}#c(t,e){const s=[];let n=0;if(e.length)for(const[r,i]of e)n<r.start&&s.push(...this.#h(t,n,r.start)),s.push(...i),n=r.end;return n<t.length&&s.push(...this.#h(t,n,t.length)),s}#a(t,e){if(!this.nativeSegmenter)return e;const s=[];let n=0;for(const r of e)n<r.start&&s.push(...this.#h(t,n,r.start)),s.push(r),n=r.end;return n<t.length&&s.push(...this.#h(t,n,t.length)),s}#h(t,e,s){const n=t.slice(e,s),r=[];for(const t of this.nativeSegmenter.segment(n)){const s=e+t.index,n=s+t.segment.length;r.push({txt:t.segment,type:"word",src:"native",start:s,end:n})}return r}}export{m as MultilingualTokenizer};
1
/** Result of a stage's `best` call that matched nothing at `pos`. */
const missAt = (pos) => ({ tokens: [], unprocessedStart: pos, consumed: false });

/** Result of a stage's `best` call that produced one token and consumed text up to `unprocessedStart`. */
const hitAt = (token, unprocessedStart) => ({ tokens: [token], unprocessedStart, consumed: true });

/** Length of the longest `txt` among `tokens` (0 for an empty list). */
const longestTxt = (tokens) => tokens.reduce((longest, token) => Math.max(longest, token.txt.length), 0);

/** Builds the token a name stage emits for the matched text `txt`. */
const nameToken = (stage, txt) => ({ txt, type: "name", lang: stage.lang, src: stage.id });

/**
 * Dictionary index that buckets words by their first character and then by
 * their length, so a lookup only has to test one candidate per known length.
 */
class FirstCharWordIndex {
  /** word -> lexicon meta; the most recent `add` for a word wins. */
  #metaByWord = new Map();
  /** first character -> Map(word length -> words of that length). */
  #buckets = new Map();

  /**
   * Registers `word` with its lexicon `meta`.
   * Empty words are ignored (they could never consume text) and re-adding a
   * known word only refreshes its meta instead of creating a duplicate entry.
   */
  add(word, meta) {
    if (!word) return;
    const alreadyIndexed = this.#metaByWord.has(word);
    this.#metaByWord.set(word, meta);
    if (alreadyIndexed) return;

    const firstChar = word[0];
    let byLength = this.#buckets.get(firstChar);
    if (!byLength) {
      byLength = new Map();
      this.#buckets.set(firstChar, byLength);
    }
    const sameLength = byLength.get(word.length);
    if (sameLength) {
      sameLength.push(word);
    } else {
      byLength.set(word.length, [word]);
    }
  }

  /**
   * Returns every indexed word that occurs in `text` exactly at `pos`,
   * as `{ word, meta }` records (one per distinct word).
   */
  match(text, pos) {
    const byLength = this.#buckets.get(text[pos]);
    if (!byLength) return [];

    const found = [];
    for (const length of byLength.keys()) {
      const end = pos + length;
      if (end > text.length) continue;
      // Every indexed word of this length and first char lives in this bucket,
      // so a map lookup is equivalent to scanning the bucket.
      const candidate = text.slice(pos, end);
      if (this.#metaByWord.has(candidate)) {
        found.push({ word: candidate, meta: this.#metaByWord.get(candidate) });
      }
    }
    return found;
  }

  /** Returns a `word` token for every indexed word that is a prefix of `text`. */
  matches(text) {
    return this.match(text, 0).map(({ word, meta }) => ({
      txt: word,
      type: "word",
      lang: meta.lang,
      src: meta.name,
    }));
  }
}

/** Stage that matches words from the tokenizer's shared word index. */
class DictionaryStage {
  id = "dictionary";
  order = 1;
  priority = 0;
  index;

  /** Called by `addStage`; binds the stage to the tokenizer's word index. */
  initialize(tokenizer) {
    this.index = tokenizer.wordIndex;
  }

  /** Picks the longest word at `pos`; ties are broken by higher lexicon priority. */
  best(text, pos) {
    const candidates = this.index.match(text, pos);
    if (!candidates.length) return missAt(pos);

    const [winner] = candidates.sort(
      (a, b) => b.word.length - a.word.length || b.meta.priority - a.meta.priority,
    );
    return hitAt(
      { txt: winner.word, type: "word", lang: winner.meta.lang, src: winner.meta.name },
      pos + winner.word.length,
    );
  }

  /**
   * Returns every dictionary word that prefixes `text`.
   * `end` is the length of the longest one, matching what the name stages
   * report, so the covered range does not depend on bucket insertion order.
   */
  all(text) {
    const tokens = this.index.matches(text);
    return { tokens, end: longestTxt(tokens) };
  }
}

/** Shared state of the name stages: the name lists and the stage identity. */
class NameStage {
  lang;
  id;
  order = 2;
  priority = 0;
  last;
  first;
  title;

  /**
   * @param dictionary object with `lastName`, `firstName` and `title` lists
   * @param lang language tag of the dictionary; becomes part of the stage id
   */
  constructor(dictionary, lang) {
    this.lang = lang;
    // A missing list is treated as empty so matching never iterates `undefined`.
    this.last = dictionary.lastName ?? [];
    this.first = dictionary.firstName ?? [];
    this.title = dictionary.title;
    this.id = `name,${this.lang}`;
  }
}

/**
 * Chinese names: optional "老"/"小" prefix, then last name, then first name,
 * all without separators. A prefix followed by a bare last name also counts.
 */
class ZhNameStage extends NameStage {
  priority = 1000;

  best(text, pos) {
    const lead = text[pos];
    const prefix = lead === "老" || lead === "小" ? lead : "";
    const lastStart = pos + prefix.length;

    for (const last of this.last) {
      if (!text.startsWith(last, lastStart)) continue;
      const firstStart = lastStart + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, firstStart)) {
          return hitAt(nameToken(this, prefix + last + first), firstStart + first.length);
        }
      }
      // A bare last name is only a name when it carries the prefix. Testing the
      // prefix (not the position) keeps this consistent with `all` and makes
      // the result independent of where in the text the name occurs.
      if (prefix) {
        const txt = prefix + last;
        return hitAt(nameToken(this, txt), pos + txt.length);
      }
    }
    return missAt(pos);
  }

  all(text) {
    const tokens = [];
    const lead = text[0];
    const prefix = lead === "老" || lead === "小" ? lead : "";
    const lastStart = prefix.length;

    for (const last of this.last) {
      if (!text.startsWith(last, lastStart)) continue;
      if (prefix) tokens.push(nameToken(this, prefix + last));
      const firstStart = lastStart + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, firstStart)) {
          tokens.push(nameToken(this, prefix + last + first));
        }
      }
    }
    return { tokens, end: longestTxt(tokens) };
  }
}

/**
 * Names whose last and first parts are either adjacent or separated by exactly
 * one character (used for the ko / ja / jp dictionaries).
 * NOTE(review): the separator is not checked to be a space although the
 * emitted `txt` always contains " " — confirm that is intended.
 */
class AdjacentOrSpacedNameStage extends NameStage {
  order = 3;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) continue;
      const firstStart = pos + last.length;
      for (const first of this.first) {
        if (text.startsWith(first, firstStart)) {
          return hitAt(nameToken(this, last + first), firstStart + first.length);
        }
        if (text.startsWith(first, firstStart + 1)) {
          // +1 accounts for the separator, which is part of the consumed text.
          return hitAt(nameToken(this, `${last} ${first}`), firstStart + 1 + first.length);
        }
      }
    }
    return missAt(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last)) continue;
      for (const first of this.first) {
        if (text.startsWith(first, last.length + 1)) {
          tokens.push(nameToken(this, `${last} ${first}`));
        }
        if (text.startsWith(first, last.length)) {
          tokens.push(nameToken(this, last + first));
        }
      }
    }
    return { tokens, end: longestTxt(tokens) };
  }
}

/** Names whose last and first parts are separated by exactly one character. */
class SpacedNameStage extends NameStage {
  order = 4;

  best(text, pos) {
    for (const last of this.last) {
      if (!text.startsWith(last, pos)) continue;
      const firstStart = pos + last.length + 1;
      for (const first of this.first) {
        if (text.startsWith(first, firstStart)) {
          return hitAt(nameToken(this, `${last} ${first}`), firstStart + first.length);
        }
      }
    }
    return missAt(pos);
  }

  all(text) {
    const tokens = [];
    for (const last of this.last) {
      if (!text.startsWith(last)) continue;
      for (const first of this.first) {
        if (text.startsWith(first, last.length + 1)) {
          tokens.push(nameToken(this, `${last} ${first}`));
        }
      }
    }
    return { tokens, end: longestTxt(tokens) };
  }
}

/**
 * Base class for stages driven by a list of anchored regular expressions
 * (`RegexArray`, supplied by the subclass). The first expression that matches
 * wins. `types[i]` overrides the token type for expression `i`; `groupTypes`
 * and `groupSources` map capture-group numbers to the type / src of the
 * sub-token emitted for that group.
 */
class RegexStage {
  skipOwnLastMax = true;
  groupTypes = void 0;
  groupSources = void 0;
  types = void 0;
  mainGroup = 0;

  /** Returns the main-group match of the first expression matching at `pos`. */
  best(text, pos) {
    const rest = text.slice(pos);
    for (let i = 0; i < this.RegexArray.length; i++) {
      const found = this.RegexArray[i].exec(rest);
      if (!found) continue;
      const type = this.types?.[i] || this.groupTypes?.[this.mainGroup] || this.id;
      const txt = found[this.mainGroup];
      return hitAt(
        { txt, type, src: this.groupSources?.[this.mainGroup] || type },
        pos + txt.length,
      );
    }
    return missAt(pos);
  }

  /**
   * Returns the main token plus one sub-token per non-empty capture group
   * after the main group. `end` is the length of the whole match.
   */
  all(text) {
    let found = null;
    let type;
    for (let i = 0; i < this.RegexArray.length; i++) {
      found = this.RegexArray[i].exec(text);
      if (found) {
        type = this.types?.[i];
        break;
      }
    }
    if (!found) return { tokens: [], end: 0 };
    type ||= this.groupTypes?.[this.mainGroup] || this.id;

    const tokens = [
      { txt: found[this.mainGroup], type, src: this.groupSources?.[this.mainGroup] || type },
    ];
    // Visit every group: an empty or unmatched group in the middle (e.g. a URL
    // without a path) must not hide the groups that follow it.
    for (let group = this.mainGroup + 1; group < found.length; group++) {
      if (!found[group]) continue;
      const groupType = this.groupTypes?.[group];
      tokens.push({
        txt: found[group],
        type: groupType || type,
        src: this.groupSources?.[group] || groupType || `${type}-sub`,
      });
    }
    return { tokens, end: found[0].length };
  }
}

/** Characters accepted as Chinese numerals by the number and date stages. */
const CN_NUMERALS = "壹贰叁肆伍陆柒捌玖拾佰仟十百千万亿萬億兆零一二三四五六七八九";

/** Multi-character units that may directly follow a number. */
const UNITS = ["公斤","英里","千克","厘米","毫米","公里","小时","分钟","折扣","美元","人民币","公顷","平方米","平方分米","平方厘米","立方厘米","毫升","千瓦","安培","伏特","欧姆","焦耳","卡路里","千克力","牛顿","帕斯卡","标准大气压","毫米汞柱","摄氏度","华氏度","弧度","角度","kg","mg","km","cm","mm","μm","nm","mL","ml","min","°C","°F","rad","deg","Hz","kHz","MHz","GHz","bit","Byte","KB","MB","GB","TB","PB","EB","ZB","YB","lb","oz","pound","pounds"];

// Regex alternation takes the first alternative that matches, so longer units
// must come first or "pound" would shadow "pounds" and "千克" would shadow "千克力".
const UNIT_PATTERN = [...UNITS].sort((a, b) => b.length - a.length).join("|");

/** Numbers (Arabic or Chinese numerals), optionally with an ordinal prefix and a unit. */
class NumberStage extends RegexStage {
  id = "number";
  order = 9;
  priority = 10;
  skipOwnLastMax = true;
  RegexArray = [
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+(?:${UNIT_PATTERN})?`, "i"),
    new RegExp(`^(?:第|No)?[${CN_NUMERALS}0-9]+[名場场个克吨米斤两元角分秒折卷券元角分亩升天周月年岁度瓦牛巴gtmLlhsdwyB]?`, "i"),
    /^[+-]?(?:\d{1,3}(?:,\d{3})+|\d+)(?:\.\d+)?(?:e[+-]?\d+)?%?/i,
  ];
}

/** Runs of whitespace. */
class SpaceStage extends RegexStage {
  id = "space";
  order = 100;
  priority = 0;
  RegexArray = [/^\s+/];
}

/** Emoji runs (type "emoji") and runs of other non-alphanumeric, non-CJK characters (type "punctuation"). */
class PunctuationStage extends RegexStage {
  id = "punctuation";
  order = 10;
  priority = 0;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  types = ["emoji", "punctuation"];
  RegexArray = [/^\p{Emoji_Presentation}+/u, /^[^0-9A-Za-z\u4e00-\u9fff\s]+/];
}

/** "@mention" and "#hashtag" tokens. */
class SocialStage {
  id = "social";
  order = 5;
  priority = 10;
  skipOwnLastMax = true;
  nameRe = /^[\p{L}\p{N}_\-]+/u;

  best(text, pos) {
    const marker = text[pos];
    if (marker !== "@" && marker !== "#") return missAt(pos);

    const nameStart = pos + 1;
    const name = this.nameRe.exec(text.slice(nameStart));
    if (!name) return missAt(pos);
    return hitAt(
      { txt: marker + name[0], type: marker === "@" ? "mention" : "hashtag", src: "social" },
      nameStart + name[0].length,
    );
  }

  all(text) {
    const { tokens } = this.best(text, 0);
    return { tokens, end: tokens.length > 0 ? tokens[0].txt.length : 0 };
  }
}

/** E-mail addresses; `all` additionally emits the local part as a word. */
class EmailStage {
  id = "email";
  order = 6;
  priority = 20;
  skipOwnLastMax = true;
  re = /^([a-zA-Z0-9._%+-]+)@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/;

  best(text, pos) {
    const found = this.re.exec(text.slice(pos));
    if (!found) return missAt(pos);
    return hitAt({ txt: found[0], type: "email", src: "email" }, pos + found[0].length);
  }

  all(text) {
    const found = this.re.exec(text);
    if (!found) return { tokens: [], end: 0 };
    return {
      tokens: [
        { txt: found[0], type: "email", src: "email" },
        { txt: found[1], type: "word", src: "email-sub" },
      ],
      end: found[0].length,
    };
  }
}

/** Calendar dates written with digits or Chinese numerals. */
class DateStage extends RegexStage {
  id = "date";
  order = 8;
  priority = 0;
  RegexArray = [
    /^(\d{4}年)\s*(\d{1,2}月)\s*(\d{1,2}日)/,
    // `\\s` is required: inside a template literal a lone `\s` collapses to the
    // letter "s", which would make the pattern demand literal "s" characters.
    new RegExp(`^([${CN_NUMERALS}]{4}年)\\s*([${CN_NUMERALS}]{1,2}月)\\s*([${CN_NUMERALS}]{1,2}日)`),
    /^(\d{4})\s*[-/.]\s*(\d{1,2})\s*[-/.]\s*(\d{1,2})/,
    /^(\d{1,2})\s*[-/.](\d{1,2})\s*[-/.]\s*(\d{4})/,
    /^(?:\d{4}年|\d{1,2}[月日])/,
    new RegExp(`^(?:[${CN_NUMERALS}]{4}年|[${CN_NUMERALS}]{1,2}[月日])`),
  ];
}

/** URLs; `all` also emits host, path, query string and hash as sub-tokens. */
class UrlStage extends RegexStage {
  id = "url";
  order = 7;
  priority = 1000;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  groupTypes = { 1: "host", 2: "other", 3: "other", 4: "other" };
  groupSources = { 2: "url-path", 3: "url-query-string", 4: "url-hash" };
  RegexArray = [
    /^\s*(?:https?|ftp)?:[/]+((?:[a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+|localhost)(?::\d{1,5})?((?:[/][^/?#\s]*)*\/?)(?:[?]([^#\s]*)*)?(?:#(\S*))?/i,
  ];
}

/** IPv6 (optionally bracketed) and IPv4 addresses with an optional port. */
class IpStage extends RegexStage {
  id = "ip";
  order = 7;
  priority = 100;
  skipOwnLastMax = true;
  breakIfProcessed = true;
  RegexArray = [
    /^(?:\[[0-9a-fA-F:]*:[0-9a-fA-F:]+]|[0-9a-fA-F]*:[0-9a-fA-F:]+)(?::\d{1,5})?/,
    /^(?:\d{1,3}\.){3}\d{1,3}(?::\d{1,5})?/,
  ];
}

/**
 * Tokenizer built from an ordered pipeline of stages. Stages are sorted by
 * ascending `order`, then descending `priority`. Text no stage claims is
 * segmented with `Intl.Segmenter` when the runtime provides it.
 */
class MultilingualTokenizer {
  wordIndex = new FirstCharWordIndex();
  #stages = [];
  #lexiconNames = new Set();
  #nameLexiconNames = [];
  nativeSegmenter =
    typeof Intl !== "undefined" && "Segmenter" in Intl
      ? new Intl.Segmenter("und", { granularity: "word" })
      : null;

  constructor() {
    this.addStage(new DictionaryStage());
    this.addStage(new SocialStage());
    this.addStage(new EmailStage());
    this.addStage(new UrlStage());
    this.addStage(new IpStage());
    this.addStage(new DateStage());
    this.addStage(new NumberStage());
    this.addStage(new PunctuationStage());
    this.addStage(new SpaceStage());
  }

  /** Names passed to `addDictionary` so far (a fresh array on every read). */
  get loadedLexiconNames() {
    return [...this.#lexiconNames];
  }

  /** Languages passed to `setNameDictionary` so far (a copy, so callers cannot mutate internal state). */
  get loadedNameLexiconNames() {
    return [...this.#nameLexiconNames];
  }

  /**
   * Adds every word of `words` to the shared word index.
   * @param words iterable of words
   * @param name lexicon name, recorded as the `src` of produced tokens
   * @param priority tie-breaker between equally long words (higher wins)
   * @param lang language tag copied onto produced tokens
   */
  addDictionary(words, name, priority = 0, lang) {
    this.#lexiconNames.add(name);
    for (const word of words) {
      this.wordIndex.add(word, { name, priority, lang });
    }
  }

  /**
   * Registers a name dictionary and adds the name stage matching `lang`.
   * `ja` is the ISO 639-1 code for Japanese; `jp` stays accepted for
   * backward compatibility.
   */
  setNameDictionary(dictionary, lang) {
    this.#nameLexiconNames.push(lang);
    if (/^zh/i.test(lang)) {
      this.addStage(new ZhNameStage(dictionary, lang));
    } else if (/^(ko|ja|jp)/i.test(lang)) {
      this.addStage(new AdjacentOrSpacedNameStage(dictionary, lang));
    } else {
      this.addStage(new SpacedNameStage(dictionary, lang));
    }
  }

  /** Adds `stage`, re-sorts the pipeline and lets the stage bind to this tokenizer. */
  addStage(stage) {
    this.#stages.push(stage);
    this.#stages.sort((a, b) => a.order - b.order || b.priority - a.priority);
    stage.initialize?.(this);
  }

  /**
   * Greedy tokenization: at each position the first stage (in pipeline order)
   * that matches wins and the scan resumes after its match.
   * Tokens carry `start` / `end` offsets into `text`.
   */
  tokenize(text) {
    const tokens = [];
    let pos = 0;
    while (pos < text.length) {
      let next = pos + 1; // no stage matched: leave this character for gap filling
      for (const stage of this.#stages) {
        const result = stage.best(text, pos);
        // A match that does not advance would loop forever; treat it as a miss.
        if (!result.tokens.length || !(result.unprocessedStart > pos)) continue;
        for (const token of result.tokens) {
          tokens.push({ ...token, start: pos, end: result.unprocessedStart });
        }
        next = result.unprocessedStart;
        break;
      }
      pos = next;
    }
    return this.#fillGaps(text, tokens);
  }

  /**
   * Exhaustive tokenization: at each position every eligible stage contributes
   * all of its matches. A stage with `skipOwnLastMax` is not retried inside its
   * own previous match, a stage with `unprocessedOnly` only runs on text no
   * stage has covered yet, and a stage with `breakIfProcessed` ends the
   * position and jumps past its match.
   */
  tokenizeAll(text) {
    const ranges = [];
    const ownEnd = new Map(); // stage -> exclusive end of its last match
    let processedEnd = 0; // exclusive end of the text covered by any stage
    let pos = 0;

    while (pos < text.length) {
      const rest = text.slice(pos);
      const found = [];
      let step = 1;

      for (const stage of this.#stages) {
        // Both ends are exclusive, so a position equal to an end lies outside it.
        if (stage.unprocessedOnly && pos < processedEnd) continue;
        if (stage.skipOwnLastMax && pos < ownEnd.get(stage)) continue;

        const result = stage.all(rest);
        if (!result.end) continue;

        found.push(...result.tokens);
        const matchEnd = pos + result.end;
        processedEnd = Math.max(processedEnd, matchEnd);
        if (stage.skipOwnLastMax) ownEnd.set(stage, matchEnd);
        if (stage.breakIfProcessed) {
          step = result.end;
          break;
        }
      }

      if (found.length) ranges.push([{ start: pos, end: processedEnd }, found]);
      pos += step;
    }
    return this.#assembleAll(text, ranges);
  }

  /** `tokenize`, reduced to the token texts. */
  tokenizeText(text) {
    return this.tokenize(text).map((token) => token.txt);
  }

  /**
   * `tokenizeAll`, reduced to the token texts of the tokens that pass the filter.
   * NOTE(review): the first clause is subsumed by the second, so this only
   * drops "space" tokens — confirm whether single-character punctuation was
   * meant to be dropped as well.
   */
  tokenizeTextAll(text) {
    return this.tokenizeAll(text)
      .filter((token) => "punctuation" === token.type && token.txt.length > 1 || "space" !== token.type)
      .map((token) => token.txt);
  }

  /** Flattens the `[range, tokens]` pairs of `tokenizeAll`, segmenting uncovered text in between. */
  #assembleAll(text, ranges) {
    const out = [];
    let cursor = 0;
    for (const [range, tokens] of ranges) {
      if (cursor < range.start) out.push(...this.#segmentGap(text, cursor, range.start));
      out.push(...tokens);
      cursor = range.end;
    }
    if (cursor < text.length) out.push(...this.#segmentGap(text, cursor, text.length));
    return out;
  }

  /** Inserts natively segmented tokens for the stretches between `tokens`; returns `tokens` unchanged without a segmenter. */
  #fillGaps(text, tokens) {
    if (!this.nativeSegmenter) return tokens;
    const out = [];
    let cursor = 0;
    for (const token of tokens) {
      if (cursor < token.start) out.push(...this.#segmentGap(text, cursor, token.start));
      out.push(token);
      cursor = token.end;
    }
    if (cursor < text.length) out.push(...this.#segmentGap(text, cursor, text.length));
    return out;
  }

  /**
   * Segments `text[from, to)` with the native segmenter.
   * Returns no tokens when the runtime has no `Intl.Segmenter`, so callers
   * skip the gap instead of throwing.
   */
  #segmentGap(text, from, to) {
    if (!this.nativeSegmenter) return [];
    const tokens = [];
    for (const part of this.nativeSegmenter.segment(text.slice(from, to))) {
      const start = from + part.index;
      tokens.push({
        txt: part.segment,
        type: "word",
        src: "native",
        start,
        end: start + part.segment.length,
      });
    }
    return tokens;
  }
}

export { MultilingualTokenizer };
package/lib/index.cjs CHANGED
@@ -1 +1 @@
1
- "use strict";var e=require('./type'),t=require('./lexicon'),r=require('./core'),o=require('./old-core');Object.keys(e).forEach(function(t){"default"!==t&&!Object.prototype.hasOwnProperty.call(exports,t)&&Object.defineProperty(exports,t,{enumerable:!0,get:function(){return e[t]}})}),Object.keys(t).forEach(function(e){"default"!==e&&!Object.prototype.hasOwnProperty.call(exports,e)&&Object.defineProperty(exports,e,{enumerable:!0,get:function(){return t[e]}})}),Object.keys(r).forEach(function(e){"default"!==e&&!Object.prototype.hasOwnProperty.call(exports,e)&&Object.defineProperty(exports,e,{enumerable:!0,get:function(){return r[e]}})}),Object.keys(o).forEach(function(e){"default"!==e&&!Object.prototype.hasOwnProperty.call(exports,e)&&Object.defineProperty(exports,e,{enumerable:!0,get:function(){return o[e]}})});
1
"use strict";

// CommonJS entry point: re-export every named binding of the sub-modules.
// Modules are processed in this order and the first module to define a name
// keeps it; "default" is never forwarded.
const sources = [
  require('./type'),
  require('./lexicon'),
  require('./core'),
  require('./old'),
];

for (const source of sources) {
  for (const key of Object.keys(source)) {
    const taken = Object.prototype.hasOwnProperty.call(exports, key);
    if (key === "default" || taken) continue;
    // A getter (not a copied value) keeps the export live with its source module.
    Object.defineProperty(exports, key, {
      enumerable: true,
      get: () => source[key],
    });
  }
}
package/lib/index.d.ts CHANGED
@@ -1,4 +1,4 @@
1
1
  export * from './type';
2
2
  export * from './lexicon';
3
3
  export * from './core';
4
- export * from './old-core';
4
+ export * from './old';
package/lib/index.js CHANGED
@@ -1 +1 @@
1
- export*from'./type';export*from'./lexicon';export*from'./core';export*from'./old-core';
1
// ES module entry point: re-export every named binding of the sub-modules.
export * from './type';
export * from './lexicon';
export * from './core';
export * from './old';