gs-tokenizer 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/core.cjs +1 -1
- package/lib/core.d.ts +1 -1
- package/lib/core.js +1 -1
- package/lib/lexicon.cjs +1 -1
- package/lib/lexicon.js +1 -1
- package/package.json +1 -1
package/lib/core.cjs
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
"use strict";class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let 
o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)
|
|
1
|
+
"use strict";class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let 
o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))/gi;detectLanguage(t){return""}tokenize(t,e="zh"){const s=[];let n=0;const i=[];for(const e of t.matchAll(this.comprehensiveDatePattern))void 0!==e.index&&e[0]&&i.push({text:e[0],index:e.index});i.sort((t,e)=>t.index-e.index);const r=[];let o=null;for(const e of i){const s=e.index+e.text.length;o?e.index<=o.end?o={text:t.slice(o.index,Math.max(o.end,s)),index:o.index,end:Math.max(o.end,s)}:(r.push({text:o.text,index:o.index}),o={...e,end:s}):o={...e,end:s}}o&&r.push({text:o.text,index:o.index});for(const i of r){if(i.index>n){const r=t.slice(n,i.index);s.push({txt:r,type:"other",lang:e,src:""})}this.isValidDate(i.text)?s.push({txt:i.text,type:"date",lang:e,src:""}):s.push({txt:i.text,type:"other",lang:e,src:""}),n=i.index+i.text.length}if(n<t.length){const i=t.slice(n);s.push({txt:i,type:"other",lang:e,src:""})}return s}isValidDate(t){if(/^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/.test(t))return!0;let e,s,n;if(/^\d{8}$/.test(t))e=parseInt(t.slice(0,4)),s=parseInt(t.slice(4,6)),n=parseInt(t.slice(6,8));else if(/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(t)){const i=t.split(/[-.]/).map(Number);e=i[0],s=i[1],n=i[2]}else 
if(/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(t)||/^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(t)){const i=t.split(/[-/.]/).map(Number);i[0]>=1e3?(e=i[0],s=i[1],n=i[2]):i[2]>31?(s=i[0],n=i[1],e=i[2]):(s=i[0],n=i[1],e=i[2]<50?2e3+i[2]:1900+i[2])}else if(/^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i.test(t)){const i=["Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"],r=t.match(/^(\w+)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i);if(!r)return!1;s=i.findIndex(t=>t.toLowerCase()===r[1].substring(0,3).toLowerCase())+1,n=parseInt(r[2]),e=r[3]?parseInt(r[3]):(new Date).getFullYear()}else{if(!/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/.test(t))return!1;{const i=t.match(/^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(\d{4})?$/);if(!i)return!1;e=parseInt(i[1]||i[4]||(new Date).getFullYear().toString()),s=parseInt(i[2]),n=parseInt(i[3])}}return this.isValidDateComponents(e,s,n)}isValidDateComponents(t,e,s){if(e<1||e>12||s<1||s>31)return!1;if(2===e){if(s>(t%4==0&&t%100!=0||t%400==0?29:28))return!1}else if(s>[31,28,31,30,31,30,31,31,30,31,30,31][e-1])return!1;return!0}}class n{static detectLanguage(t){return/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":/[\u4e00-\u9fff]/.test(t)?"zh":"en"}}class i{detectLanguage(t){return"en"}tokenize(t,e){const s=[];let n=0;const i=/\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g,r=/(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g,o=[];let a;for(;null!==(a=i.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"ip",text:a[0]});for(;null!==(a=r.exec(t));)o.push({index:a.index,endIndex:a.index+a[0].length,type:"url",text:a[0]});o.sort((t,e)=>t.index-e.index);const c=[];for(let 
t=0;t<o.length;t++){let e=!1;for(let s=0;s<o.length;s++)if(t!==s){const n=o[t].index===o[s].index&&o[t].endIndex===o[s].endIndex,i=o[t].index>=o[s].index&&o[t].endIndex<=o[s].endIndex;if(n){if("ip"===o[s].type){e=!0;break}}else if(i){e=!0;break}}e||c.push(o[t])}for(const i of c)i.index>n&&s.push({txt:t.substring(n,i.index),type:"other",lang:e}),s.push({txt:i.text,type:i.type,lang:"en"}),n=i.endIndex;return n<t.length&&s.push({txt:t.substring(n),type:"other",lang:e}),s}}class r{tokenizers;customDictionaries;defaultLanguage;constructor(n={}){this.customDictionaries=n.customDictionaries||{},this.defaultLanguage=n.defaultLanguage||"en",this.tokenizers=[new s,new i,new t,new e(this.customDictionaries)]}addCustomDictionary(t,e,s,n){const i=e||this.defaultLanguage;this.customDictionaries[i]||(this.customDictionaries[i]=[]);const r=this.customDictionaries[i].findIndex(t=>t.name===n&&t.lang===i&&t.priority===s);if(r>=0){const e=this.customDictionaries[i][r];t.forEach(t=>e.data.add(t))}else this.customDictionaries[i].push({priority:s,data:new Set(t),name:n,lang:i})}removeCustomWord(t,e,s){if(e){if(this.customDictionaries[e])if(s){const n=this.customDictionaries[e].find(t=>t.name===s);n&&n.data.delete(t)}else this.customDictionaries[e].forEach(e=>{e.data.delete(t)})}else Object.values(this.customDictionaries).forEach(e=>{e.forEach(e=>{e.data.has(t)&&e.data.delete(t)})})}tokenize(r,o){const a=o||n.detectLanguage(r),c=this.tokenizers.find(t=>t instanceof s);if(!c)return[];const u=c.tokenize(r,a),d=[],l=this.tokenizers.find(t=>t instanceof i);if(!l)return d;for(const s of u)if("date"===s.type)d.push(s);else{const n=l.tokenize(s.txt,a);for(const s of n)if("url"===s.type||"ip"===s.type)d.push(s);else{let n=[];if("en"===a){const e=this.tokenizers.find(e=>e instanceof t);e&&(n=e.tokenize(s.txt,a))}else if(["zh","ja","ko"].includes(a)){const t=this.tokenizers.find(t=>t instanceof e);t&&(n=t.tokenize(s.txt,a))}else{const t=this.tokenizers.find(t=>t instanceof 
e);t&&(n=t.tokenize(s.txt,a))}n.length>0?d.push(...n):d.push(s)}}return d}tokenizeText(t,e){const s=this.tokenize(t,e?.language),n=["punctuation","space","other",...e?.excludeTypes||[]];return s.filter(t=>!(e?.includeTypes&&e.includeTypes.length>0&&!e.includeTypes.includes(t.type))&&!n.includes(t.type)).map(t=>t.txt)}}exports.CJKTokenizer=e,exports.DateTokenizer=s,exports.EnglishTokenizer=t,exports.LanguageDetector=n,exports.MultilingualTokenizer=r,exports.createTokenizer=function(t){return new r(t)};
|
package/lib/core.d.ts
CHANGED
|
@@ -138,7 +138,7 @@ declare class CJKTokenizer implements LanguageTokenizer {
|
|
|
138
138
|
* @implements {LanguageTokenizer}
|
|
139
139
|
*/
|
|
140
140
|
declare class DateTokenizer implements LanguageTokenizer {
|
|
141
|
-
/**
|
|
141
|
+
/** 综合日期时间正则表达式,用于匹配多种日期和时间格式 */
|
|
142
142
|
private comprehensiveDatePattern;
|
|
143
143
|
/**
|
|
144
144
|
* 检测文本的语言
|
package/lib/core.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
class t{customDictionaries;constructor(t={}){this.customDictionaries=t}detectLanguage(t){return/[a-zA-Z]/.test(t)&&!/[一-鿿]/.test(t)?"en":""}tokenize(t,e){const s=[],n=t.split(/\b/);for(const t of n)t&&(t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^a-zA-Z0-9]+$/)?s.push({txt:t,type:"punctuation",lang:e,src:""}):t.match(/^[a-zA-Z0-9]+$/)?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""}));return this.tagNameTokens(s,e)}tagNameTokens(t,e){const s=[];let n=0;for(;n<t.length;){if(n<t.length&&"word"===t[n].type){const i=t[n].txt;if(this.customDictionaries[e]){let t=!1;for(const r of this.customDictionaries[e].sort((t,e)=>e.priority-t.priority))if(r.data.has(i)){s.push({txt:i,type:"word",lang:e,src:r.name}),n++,t=!0;break}if(t)continue}}s.push({txt:t[n].txt,type:t[n].type,lang:e,src:t[n].src||""}),n++}return s}}class e{segmenters;customDictionaries;constructor(t={}){this.segmenters=new Map,this.customDictionaries=t}detectLanguage(t){return/[\u4e00-\u9fff]/.test(t)?"zh":/[\u3040-\u309f\u30a0-\u30ff]/.test(t)?"ja":/[\uac00-\ud7af]/.test(t)?"ko":""}tokenize(t,e){const s=[],n=this.getSegmenter(e);for(const i of n.segment(t)){const{segment:t,isWordLike:n}=i;t.match(/^\s+$/)?s.push({txt:t,type:"space",lang:e,src:""}):/^\p{Emoji}+$/u.test(t)&&!/[0-9]/.test(t)?s.push({txt:t,type:"emoji",lang:e,src:""}):t.match(/^[^\p{L}\p{N}]+$/u)?s.push({txt:t,type:"punctuation",lang:e,src:""}):n?s.push({txt:t,type:"word",lang:e,src:""}):s.push({txt:t,type:"other",lang:e,src:""})}return this.applyCustomDictionary(s,e)}getSegmenter(t,e="word"){const s=`${t}-${e}`;return this.segmenters.has(s)||this.segmenters.set(s,new Intl.Segmenter(t,{granularity:e})),this.segmenters.get(s)}applyCustomDictionary(t,e){const s=this.customDictionaries[e]||[];let n=t;if(s.length>0){const t=[];let i=0;for(;i<n.length;){let r=!1;for(let 
o=Math.min(5,n.length-i);o>=1;o--){if(o>1&&n.slice(i,i+o).some(t=>"word"!==t.type))continue;const a=n.slice(i,i+o).map(t=>t.txt).join("");for(const n of s.sort((t,e)=>e.priority-t.priority))if(n.data.has(a)){t.push({txt:a,type:"word",lang:e,src:""}),i+=o,r=!0;break}if(r)break}r||(t.push({...n[i],src:""}),i++)}n=t}return n}}class s{comprehensiveDatePattern=/\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)
|
|
1
|
+
/**
 * gs-tokenizer core (ES module build).
 *
 * Text is tokenized in three passes: date/duration spans are extracted first,
 * URLs and IPv4 addresses second, and whatever text remains is handed to a
 * language-specific tokenizer (plain regex splitting for English,
 * `Intl.Segmenter` for every other language).
 *
 * Tokens are plain objects of the form `{ txt, type, lang, src }`.
 */

/** Largest number of consecutive word tokens merged when matching a custom dictionary entry. */
const MAX_DICTIONARY_WINDOW = 5;

/** Matches text made up solely of code points carrying the Unicode `Emoji` property. */
const EMOJI_ONLY_PATTERN = /^\p{Emoji}+$/u;

/**
 * ASCII characters that carry the `Emoji` property (keycap bases) but are not
 * emoji on their own. The original check only excluded digits, so a lone `#`
 * or `*` was reported as an emoji.
 */
const ASCII_EMOJI_BASE_PATTERN = /[0-9#*]/;

/**
 * A bare duration such as `3天` or `3 hours`. Case-insensitive so that it
 * accepts everything the (case-insensitive) scanning pattern can produce.
 */
const DURATION_PATTERN = /^\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))$/i;

/** `Jan 5`, `January 5, 2024`, ... — groups: month name, day, optional year. */
const MONTH_NAME_DATE_PATTERN = /^(Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+(\d{1,2})(?:,\s+(\d{4}))?$/i;

/**
 * `2024年1月1日`, `1月1日`, `1月1日2024年` — groups: leading year, month, day,
 * trailing year. The trailing `年` is optional so that the scanner's
 * `\d{1,2}月\d{1,2}日\d{4}年` form validates as well.
 */
const CJK_DATE_PATTERN = /^(\d{4})?年?(\d{1,2})月(\d{1,2})日?(?:(\d{4})年?)?$/;

/** Three-letter month prefixes, index + 1 is the month number. */
const MONTH_PREFIXES = ["jan", "feb", "mar", "apr", "may", "jun", "jul", "aug", "sep", "oct", "nov", "dec"];

/** Days per month in a non-leap year. */
const DAYS_IN_MONTH = [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];

/**
 * Returns the dictionaries ordered from highest to lowest priority.
 * Works on a copy: the caller-owned array is never reordered.
 *
 * @param {Array<{priority: number}>} dictionaries
 * @returns {Array<{priority: number}>} new, sorted array
 */
function sortByPriority(dictionaries) {
  return [...dictionaries].sort((a, b) => b.priority - a.priority);
}

/**
 * @param {string} text
 * @returns {boolean} true when `text` consists only of emoji code points and
 *   contains none of the ASCII keycap bases (`0-9`, `#`, `*`).
 */
function isStandaloneEmoji(text) {
  return EMOJI_ONLY_PATTERN.test(text) && !ASCII_EMOJI_BASE_PATTERN.test(text);
}

/**
 * Expands a two-digit year with a pivot at 50 (00-49 → 20xx, 50-99 → 19xx);
 * larger values are taken literally.
 *
 * @param {number} value
 * @returns {number}
 */
function expandShortYear(value) {
  if (value > 99) {
    return value;
  }
  return value < 50 ? 2000 + value : 1900 + value;
}

/** Tokenizer for English text: splits on ASCII word boundaries. */
class EnglishTokenizer {
  customDictionaries;

  /**
   * @param {Object<string, Array<{priority: number, data: Set<string>, name: string}>>} [customDictionaries]
   *   dictionaries keyed by language code
   */
  constructor(customDictionaries = {}) {
    this.customDictionaries = customDictionaries;
  }

  /**
   * @param {string} text
   * @returns {string} `"en"` when the text has Latin letters and no CJK ideographs, else `""`
   */
  detectLanguage(text) {
    return /[a-zA-Z]/.test(text) && !/[\u4e00-\u9fff]/.test(text) ? "en" : "";
  }

  /**
   * @param {string} text
   * @param {string} lang language code stamped on every token
   * @returns {Array<{txt: string, type: string, lang: string, src: string}>}
   */
  tokenize(text, lang) {
    const tokens = [];
    for (const piece of text.split(/\b/)) {
      if (!piece) {
        continue;
      }
      let type = "other";
      if (/^\s+$/.test(piece)) {
        type = "space";
      } else if (isStandaloneEmoji(piece)) {
        type = "emoji";
      } else if (/^[^a-zA-Z0-9]+$/.test(piece)) {
        type = "punctuation";
      } else if (/^[a-zA-Z0-9]+$/.test(piece)) {
        type = "word";
      }
      tokens.push({ txt: piece, type, lang, src: "" });
    }
    return this.tagNameTokens(tokens, lang);
  }

  /**
   * Tags word tokens found in a custom dictionary with that dictionary's
   * name (`src`). The highest-priority dictionary containing the word wins.
   *
   * @param {Array<{txt: string, type: string, src?: string}>} tokens
   * @param {string} lang
   * @returns {Array<{txt: string, type: string, lang: string, src: string}>} new token list
   */
  tagNameTokens(tokens, lang) {
    const dictionaries = this.customDictionaries[lang];
    // Sorted once per call instead of once per token.
    const ordered = dictionaries ? sortByPriority(dictionaries) : [];
    return tokens.map((token) => {
      if (token.type === "word") {
        const hit = ordered.find((dictionary) => dictionary.data.has(token.txt));
        if (hit) {
          return { txt: token.txt, type: "word", lang, src: hit.name ?? "" };
        }
      }
      return { txt: token.txt, type: token.type, lang, src: token.src || "" };
    });
  }
}

/** Tokenizer backed by `Intl.Segmenter`; used for CJK and any non-English language. */
class CJKTokenizer {
  segmenters;
  customDictionaries;

  /**
   * @param {Object<string, Array<{priority: number, data: Set<string>, name: string}>>} [customDictionaries]
   *   dictionaries keyed by language code
   */
  constructor(customDictionaries = {}) {
    this.segmenters = new Map();
    this.customDictionaries = customDictionaries;
  }

  /**
   * @param {string} text
   * @returns {string} `"zh"`, `"ja"`, `"ko"` or `""`; ideographs are checked before kana and hangul
   */
  detectLanguage(text) {
    if (/[\u4e00-\u9fff]/.test(text)) {
      return "zh";
    }
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) {
      return "ja";
    }
    if (/[\uac00-\ud7af]/.test(text)) {
      return "ko";
    }
    return "";
  }

  /**
   * @param {string} text
   * @param {string} lang locale passed to `Intl.Segmenter` and stamped on every token
   * @returns {Array<{txt: string, type: string, lang: string, src: string}>}
   */
  tokenize(text, lang) {
    const tokens = [];
    for (const { segment, isWordLike } of this.getSegmenter(lang).segment(text)) {
      let type;
      if (/^\s+$/.test(segment)) {
        type = "space";
      } else if (isStandaloneEmoji(segment)) {
        type = "emoji";
      } else if (/^[^\p{L}\p{N}]+$/u.test(segment)) {
        type = "punctuation";
      } else {
        type = isWordLike ? "word" : "other";
      }
      tokens.push({ txt: segment, type, lang, src: "" });
    }
    return this.applyCustomDictionary(tokens, lang);
  }

  /**
   * Returns a cached segmenter for the locale/granularity pair, creating it on first use.
   *
   * @param {string} lang
   * @param {string} [granularity="word"]
   * @returns {Intl.Segmenter}
   */
  getSegmenter(lang, granularity = "word") {
    const key = `${lang}-${granularity}`;
    if (!this.segmenters.has(key)) {
      this.segmenters.set(key, new Intl.Segmenter(lang, { granularity }));
    }
    return this.segmenters.get(key);
  }

  /**
   * Greedily re-merges up to {@link MAX_DICTIONARY_WINDOW} adjacent word
   * tokens whenever their concatenation is a custom dictionary entry, longest
   * window first. A merged token carries the matching dictionary's name in
   * `src`, consistent with `EnglishTokenizer.tagNameTokens`.
   *
   * @param {Array<{txt: string, type: string}>} tokens
   * @param {string} lang
   * @returns {Array<{txt: string, type: string, lang: string, src: string}>}
   *   the input array itself when no dictionary exists for `lang`
   */
  applyCustomDictionary(tokens, lang) {
    const dictionaries = this.customDictionaries[lang] || [];
    if (dictionaries.length === 0) {
      return tokens;
    }
    const ordered = sortByPriority(dictionaries);
    const merged = [];
    let start = 0;
    while (start < tokens.length) {
      let matched = false;
      for (let size = Math.min(MAX_DICTIONARY_WINDOW, tokens.length - start); size >= 1; size--) {
        const candidates = tokens.slice(start, start + size);
        // Multi-token windows may only span word tokens.
        if (size > 1 && candidates.some((token) => token.type !== "word")) {
          continue;
        }
        const joined = candidates.map((token) => token.txt).join("");
        const hit = ordered.find((dictionary) => dictionary.data.has(joined));
        if (hit) {
          merged.push({ txt: joined, type: "word", lang, src: hit.name ?? "" });
          start += size;
          matched = true;
          break;
        }
      }
      if (!matched) {
        merged.push({ ...tokens[start], src: "" });
        start++;
      }
    }
    return merged;
  }
}

/** Extracts date and duration expressions; everything else is returned as `"other"`. */
class DateTokenizer {
  /** Scanning pattern covering numeric, English month-name and CJK dates plus durations. */
  comprehensiveDatePattern = /\d{8}|\d{4}[-/.]\d{2}[-/.]\d{2}|\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}|(?:Jan(?:uary)?|Feb(?:ruary)?|Mar(?:ch)?|Apr(?:il)?|May|June?|July?|Aug(?:ust)?|Sep(?:tember)?|Oct(?:ober)?|Nov(?:ember)?|Dec(?:ember)?)\s+\d{1,2}(?:,\s+\d{4})?|\d{4}年\d{1,2}月(?:\d{1,2}日)?|\d{1,2}月\d{1,2}日(?:\d{4}年)?|\d+(?:小时|分钟|秒|毫秒|天|周|月|年|\s+(?:hours?|minutes?|seconds?|milliseconds?|days?|weeks?|months?|years?))/gi;

  /**
   * Dates are language-neutral, so this tokenizer never claims a language.
   *
   * @param {string} text
   * @returns {string} always `""`
   */
  detectLanguage(text) {
    return "";
  }

  /**
   * @param {string} text
   * @param {string} [lang="zh"] language code stamped on every token
   * @returns {Array<{txt: string, type: string, lang: string, src: string}>}
   *   `"date"` tokens for valid dates, `"other"` for everything else
   */
  tokenize(text, lang = "zh") {
    const matches = [];
    for (const match of text.matchAll(this.comprehensiveDatePattern)) {
      if (match.index !== undefined && match[0]) {
        matches.push({ text: match[0], index: match.index });
      }
    }
    matches.sort((a, b) => a.index - b.index);

    // Fuse matches that touch or overlap into a single span.
    const spans = [];
    let current = null;
    for (const match of matches) {
      const end = match.index + match.text.length;
      if (current && match.index <= current.end) {
        const fusedEnd = Math.max(current.end, end);
        current = { text: text.slice(current.index, fusedEnd), index: current.index, end: fusedEnd };
      } else {
        if (current) {
          spans.push({ text: current.text, index: current.index });
        }
        current = { ...match, end };
      }
    }
    if (current) {
      spans.push({ text: current.text, index: current.index });
    }

    const tokens = [];
    let cursor = 0;
    for (const span of spans) {
      if (span.index > cursor) {
        tokens.push({ txt: text.slice(cursor, span.index), type: "other", lang, src: "" });
      }
      const type = this.isValidDate(span.text) ? "date" : "other";
      tokens.push({ txt: span.text, type, lang, src: "" });
      cursor = span.index + span.text.length;
    }
    if (cursor < text.length) {
      tokens.push({ txt: text.slice(cursor), type: "other", lang, src: "" });
    }
    return tokens;
  }

  /**
   * @param {string} text a candidate produced by the scanning pattern
   * @returns {boolean} true for durations and for dates whose components form
   *   a real calendar day. Dates without a year are checked against the
   *   current year.
   */
  isValidDate(text) {
    if (DURATION_PATTERN.test(text)) {
      return true;
    }

    let year;
    let month;
    let day;

    if (/^\d{8}$/.test(text)) {
      year = Number.parseInt(text.slice(0, 4), 10);
      month = Number.parseInt(text.slice(4, 6), 10);
      day = Number.parseInt(text.slice(6, 8), 10);
    } else if (/^\d{4}[-.]\d{2}[-.]\d{2}$/.test(text)) {
      [year, month, day] = text.split(/[-.]/).map(Number);
    } else if (/^\d{1,2}[-/.]\d{1,2}[-/.]\d{2,4}$/.test(text) || /^\d{4}[-/.]\d{1,2}[-/.]\d{1,2}$/.test(text)) {
      const [first, second, third] = text.split(/[-/.]/).map(Number);
      if (first >= 1000) {
        // Year-first form.
        year = first;
        month = second;
        day = third;
      } else {
        // Month/day/year form.
        month = first;
        day = second;
        year = expandShortYear(third);
      }
    } else {
      const named = MONTH_NAME_DATE_PATTERN.exec(text);
      if (named) {
        month = MONTH_PREFIXES.indexOf(named[1].substring(0, 3).toLowerCase()) + 1;
        day = Number.parseInt(named[2], 10);
        year = named[3] ? Number.parseInt(named[3], 10) : new Date().getFullYear();
      } else {
        const cjk = CJK_DATE_PATTERN.exec(text);
        if (!cjk) {
          return false;
        }
        year = Number.parseInt(cjk[1] || cjk[4] || new Date().getFullYear().toString(), 10);
        month = Number.parseInt(cjk[2], 10);
        day = Number.parseInt(cjk[3], 10);
      }
    }
    return this.isValidDateComponents(year, month, day);
  }

  /**
   * @param {number} year
   * @param {number} month 1-12
   * @param {number} day 1-31
   * @returns {boolean} whether the day exists in that month (Gregorian leap-year rule)
   */
  isValidDateComponents(year, month, day) {
    if (month < 1 || month > 12 || day < 1 || day > 31) {
      return false;
    }
    const isLeapYear = (year % 4 === 0 && year % 100 !== 0) || year % 400 === 0;
    const daysInMonth = month === 2 && isLeapYear ? 29 : DAYS_IN_MONTH[month - 1];
    if (day > daysInMonth) {
      return false;
    }
    return true;
  }
}

/** Script-based language guess used when the caller supplies no language. */
class LanguageDetector {
  /**
   * @param {string} text
   * @returns {string} `"ja"` for kana, `"ko"` for hangul, `"zh"` for ideographs, otherwise `"en"`
   */
  static detectLanguage(text) {
    if (/[\u3040-\u309f\u30a0-\u30ff]/.test(text)) {
      return "ja";
    }
    if (/[\uac00-\ud7af]/.test(text)) {
      return "ko";
    }
    if (/[\u4e00-\u9fff]/.test(text)) {
      return "zh";
    }
    return "en";
  }
}

/** Extracts URLs and IPv4 addresses (module-private; not exported). */
class UrlIpTokenizer {
  /**
   * @param {string} text
   * @returns {string} always `"en"`
   */
  detectLanguage(text) {
    return "en";
  }

  /**
   * @param {string} text
   * @param {string} lang language code stamped on the non-URL/IP remainder
   * @returns {Array<{txt: string, type: string, lang: string}>} `"ip"`, `"url"`
   *   (both always `lang: "en"`) and `"other"` tokens in text order
   */
  tokenize(text, lang) {
    const ipPattern = /\b(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\b/g;
    const urlPattern = /(?:https?:\/\/)?(?:[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}(?:\.[a-zA-Z]{2,})?|(?:localhost|127\.0\.0\.1)|(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?))(?::\d{1,5})?(?:\/[^\s]*)?/g;

    const toCandidate = (type) => (match) => ({
      index: match.index,
      endIndex: match.index + match[0].length,
      type,
      text: match[0],
    });
    // IPs are listed first so that the stable sort keeps them ahead of an
    // identical-span URL match.
    const candidates = [
      ...Array.from(text.matchAll(ipPattern), toCandidate("ip")),
      ...Array.from(text.matchAll(urlPattern), toCandidate("url")),
    ];
    candidates.sort((a, b) => a.index - b.index);

    // Drop a candidate when another one covers it: an identical span is
    // resolved in favour of the IP, a strictly larger span always wins.
    const kept = candidates.filter((candidate, position) => {
      const shadowed = candidates.some((other, otherPosition) => {
        if (position === otherPosition) {
          return false;
        }
        const sameSpan = candidate.index === other.index && candidate.endIndex === other.endIndex;
        if (sameSpan) {
          return other.type === "ip";
        }
        return candidate.index >= other.index && candidate.endIndex <= other.endIndex;
      });
      return !shadowed;
    });

    const tokens = [];
    let cursor = 0;
    for (const item of kept) {
      if (item.index > cursor) {
        tokens.push({ txt: text.substring(cursor, item.index), type: "other", lang });
      }
      tokens.push({ txt: item.text, type: item.type, lang: "en" });
      cursor = item.endIndex;
    }
    if (cursor < text.length) {
      tokens.push({ txt: text.substring(cursor), type: "other", lang });
    }
    return tokens;
  }
}

/** Facade that chains the date, URL/IP and language tokenizers. */
class MultilingualTokenizer {
  tokenizers;
  customDictionaries;
  defaultLanguage;

  /**
   * @param {{customDictionaries?: Object, defaultLanguage?: string}} [options]
   */
  constructor(options = {}) {
    this.customDictionaries = options.customDictionaries || {};
    this.defaultLanguage = options.defaultLanguage || "en";
    // Both language tokenizers share the same dictionary object so that
    // addCustomDictionary/removeCustomWord take effect for English too
    // (the English tokenizer used to be created without it).
    this.tokenizers = [
      new DateTokenizer(),
      new UrlIpTokenizer(),
      new EnglishTokenizer(this.customDictionaries),
      new CJKTokenizer(this.customDictionaries),
    ];
  }

  /**
   * Adds words to the dictionary identified by (name, language, priority),
   * creating that dictionary when it does not exist yet.
   *
   * @param {Iterable<string>} words
   * @param {string} lang falls back to `defaultLanguage` when falsy
   * @param {number} priority higher priorities are consulted first
   * @param {string} name reported as `src` on matching tokens
   */
  addCustomDictionary(words, lang, priority, name) {
    const targetLang = lang || this.defaultLanguage;
    if (!this.customDictionaries[targetLang]) {
      this.customDictionaries[targetLang] = [];
    }
    const dictionaries = this.customDictionaries[targetLang];
    const existing = dictionaries.find(
      (dictionary) => dictionary.name === name && dictionary.lang === targetLang && dictionary.priority === priority,
    );
    if (existing) {
      words.forEach((word) => existing.data.add(word));
    } else {
      dictionaries.push({ priority, data: new Set(words), name, lang: targetLang });
    }
  }

  /**
   * Removes a word from custom dictionaries.
   *
   * @param {string} word
   * @param {string} [lang] restrict to one language; all languages when omitted
   * @param {string} [name] restrict to the first dictionary with this name (only honoured together with `lang`)
   */
  removeCustomWord(word, lang, name) {
    if (!lang) {
      for (const dictionaries of Object.values(this.customDictionaries)) {
        dictionaries.forEach((dictionary) => dictionary.data.delete(word));
      }
      return;
    }
    const dictionaries = this.customDictionaries[lang];
    if (!dictionaries) {
      return;
    }
    if (name) {
      dictionaries.find((dictionary) => dictionary.name === name)?.data.delete(word);
    } else {
      dictionaries.forEach((dictionary) => dictionary.data.delete(word));
    }
  }

  /**
   * @param {string} text
   * @param {string} [language] detected from the text when omitted
   * @returns {Array<{txt: string, type: string, lang: string, src?: string}>}
   */
  tokenize(text, language) {
    const lang = language || LanguageDetector.detectLanguage(text);
    const dateTokenizer = this.tokenizers.find((tokenizer) => tokenizer instanceof DateTokenizer);
    if (!dateTokenizer) {
      return [];
    }
    const dateTokens = dateTokenizer.tokenize(text, lang);
    const result = [];
    const urlTokenizer = this.tokenizers.find((tokenizer) => tokenizer instanceof UrlIpTokenizer);
    if (!urlTokenizer) {
      return result;
    }
    // English text uses the regex tokenizer; every other language goes
    // through the Intl.Segmenter based one.
    const languageTokenizer = this.tokenizers.find((tokenizer) =>
      lang === "en" ? tokenizer instanceof EnglishTokenizer : tokenizer instanceof CJKTokenizer,
    );

    for (const dateToken of dateTokens) {
      if (dateToken.type === "date") {
        result.push(dateToken);
        continue;
      }
      for (const segment of urlTokenizer.tokenize(dateToken.txt, lang)) {
        if (segment.type === "url" || segment.type === "ip") {
          result.push(segment);
          continue;
        }
        const pieces = languageTokenizer ? languageTokenizer.tokenize(segment.txt, lang) : [];
        if (pieces.length > 0) {
          result.push(...pieces);
        } else {
          result.push(segment);
        }
      }
    }
    return result;
  }

  /**
   * Tokenizes and returns only the token texts. Punctuation, space and
   * "other" tokens are always dropped.
   *
   * @param {string} text
   * @param {{language?: string, includeTypes?: string[], excludeTypes?: string[]}} [options]
   * @returns {string[]}
   */
  tokenizeText(text, options) {
    const excluded = ["punctuation", "space", "other", ...(options?.excludeTypes || [])];
    const included = options?.includeTypes;
    return this.tokenize(text, options?.language)
      .filter((token) => {
        if (included && included.length > 0 && !included.includes(token.type)) {
          return false;
        }
        return !excluded.includes(token.type);
      })
      .map((token) => token.txt);
  }
}

/**
 * @param {{customDictionaries?: Object, defaultLanguage?: string}} [options]
 * @returns {MultilingualTokenizer}
 */
function createTokenizer(options) {
  return new MultilingualTokenizer(options);
}

export {
  CJKTokenizer,
  DateTokenizer,
  EnglishTokenizer,
  LanguageDetector,
  MultilingualTokenizer,
  createTokenizer,
};
|