thai-cut-browser 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/LICENSE +165 -0
- package/README.md +92 -0
- package/lib/acceptors.d.ts +23 -0
- package/lib/acceptors.js +35 -0
- package/lib/default_dict_words.d.ts +2 -0
- package/lib/default_dict_words.js +24131 -0
- package/lib/dict.d.ts +16 -0
- package/lib/dict.js +73 -0
- package/lib/index.d.ts +27 -0
- package/lib/index.js +39 -0
- package/lib/latin_rules.d.ts +3 -0
- package/lib/latin_rules.js +98 -0
- package/lib/path_info_builder.d.ts +7 -0
- package/lib/path_info_builder.js +57 -0
- package/lib/path_selector.d.ts +12 -0
- package/lib/path_selector.js +28 -0
- package/lib/prefixtree.d.ts +6 -0
- package/lib/prefixtree.js +31 -0
- package/lib/thai_rules.d.ts +3 -0
- package/lib/thai_rules.js +62 -0
- package/lib/wordcut.d.ts +21 -0
- package/lib/wordcut.js +37 -0
- package/lib/wordcut_core.d.ts +32 -0
- package/lib/wordcut_core.js +61 -0
- package/package.json +57 -0
package/lib/dict.d.ts
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
1
|
+
import type { Acceptor } from "./acceptors";
|
|
2
|
+
/**
 * Acceptor produced by the dictionary: a regular acceptor extended with the
 * prefix-tree cursor it needs to continue a lookup.
 */
type DictAcceptor = Acceptor & {
    /** Id of the prefix-tree node reached so far (0 for a fresh acceptor). */
    nodeId: number;
    /** The dictionary object whose `transit` advances this acceptor. */
    dict: typeof WordcutDict;
};
|
|
6
|
+
/** Shape of the dictionary object exported by this module. */
interface WordcutDictShape {
    /** Word list backing the prefix tree. */
    dict: string[];
    /** Prefix tree rebuilt from `dict` by `finalizeDict()`. */
    tree: import("./prefixtree").PrefixTree<string>;
    /**
     * Resets the word list, then loads `words`, the bundled default words
     * (unless `withDefault` is false) and `additionalWords`, and finalizes.
     * @throws Error when `words` or `additionalWords` is given but is not an array.
     */
    init(words?: string[], withDefault?: boolean, additionalWords?: string[]): void;
    /** Appends `words`; finalizes afterwards unless `finalize` is false. */
    addWords(words: string[], finalize?: boolean): void;
    /** Sorts and de-duplicates the word list and rebuilds the prefix tree. */
    finalizeDict(): void;
    /** Creates a fresh acceptor positioned at the root of the prefix tree. */
    createAcceptor(): DictAcceptor;
    /** Advances `acceptor` by one character, or flags it as errored when no edge matches. */
    transit(acceptor: DictAcceptor, ch: string): DictAcceptor;
    /** Returns the words sorted with adjacent duplicates removed. */
    sortuniq(a: string[]): string[];
}
declare const WordcutDict: WordcutDictShape;
|
|
16
|
+
export = WordcutDict;
|
package/lib/dict.js
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Compiler-emitted interop helper: returns ES-module namespaces untouched and
// wraps anything else as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = Boolean(mod && mod.__esModule);
    return isEsModule ? mod : { "default": mod };
};
|
|
5
|
+
const prefixtree_1 = require("./prefixtree");
|
|
6
|
+
const default_dict_words_1 = __importDefault(require("./default_dict_words"));
|
|
7
|
+
/**
 * Dictionary used by the segmenter: keeps a word list, a prefix tree built
 * from it, and creates acceptors that walk that tree one character at a time.
 */
const WordcutDict = {
    /** Word list; sorted and de-duplicated after `finalizeDict()`. */
    dict: [],
    /** Prefix tree over `dict`; rebuilt by `finalizeDict()`. */
    tree: (0, prefixtree_1.createPrefixTree)([]),
    /**
     * Rebuilds the dictionary from scratch.
     * @param words optional custom word list
     * @param withDefault when true (the default) the bundled word list is added
     * @param additionalWords optional extra words appended after the others
     * @throws Error when `words` or `additionalWords` is given but is not an array
     */
    init(words, withDefault = true, additionalWords) {
        // Validate every argument before touching state, so a bad argument
        // cannot leave the dictionary reset but only partially rebuilt.
        if (words !== undefined && !Array.isArray(words)) {
            throw new Error("words must be a string[]");
        }
        if (additionalWords !== undefined && !Array.isArray(additionalWords)) {
            throw new Error("additionalWords must be a string[]");
        }
        this.dict = [];
        if (words !== undefined) {
            this.addWords(words, false);
        }
        if (withDefault) {
            this.addWords(Array.from(default_dict_words_1.default), false);
        }
        if (additionalWords !== undefined) {
            this.addWords(additionalWords, false);
        }
        this.finalizeDict();
    },
    /**
     * Appends `words` to the word list.
     * @param words words to add
     * @param finalize when omitted or truthy, the dictionary is finalized afterwards
     */
    addWords(words, finalize) {
        const shouldFinalize = finalize === undefined || finalize;
        // Push one by one: `push(...words)` passes every word as a call
        // argument and can throw a RangeError on very large word lists.
        for (const word of words) {
            this.dict.push(word);
        }
        if (shouldFinalize) {
            this.finalizeDict();
        }
    },
    /** Sorts and de-duplicates the word list, then rebuilds the prefix tree. */
    finalizeDict() {
        this.dict = this.sortuniq(this.dict);
        this.tree = (0, prefixtree_1.createPrefixTree)(this.dict.map((w) => [w, null]));
    },
    /** Creates a fresh acceptor positioned at the root (node 0, offset 0). */
    createAcceptor() {
        const dict = this;
        return {
            nodeId: 0,
            strOffset: 0,
            isFinal: false,
            isError: false,
            tag: "DICT",
            w: 1,
            type: "DICT",
            dict,
            transit(ch) {
                return this.dict.transit(this, ch);
            }
        };
    },
    /**
     * Advances `acceptor` by the character `ch`. On a matching tree edge the
     * acceptor moves to the child node and takes over its final flag;
     * otherwise it is flagged as errored. The acceptor is mutated and returned.
     */
    transit(acceptor, ch) {
        const child = this.tree.lookup(acceptor.nodeId, acceptor.strOffset, ch);
        if (child !== null) {
            const [nodeId, isFinal] = child;
            acceptor.nodeId = nodeId;
            acceptor.strOffset++;
            acceptor.isFinal = isFinal;
        }
        else {
            acceptor.isError = true;
        }
        return acceptor;
    },
    /**
     * Returns a new array with the items of `a` in default sort order and
     * adjacent duplicates removed. The input array is left untouched.
     */
    sortuniq(a) {
        const sorted = [...a].sort();
        return sorted.filter((item, pos) => pos === 0 || item !== sorted[pos - 1]);
    },
};
|
|
73
|
+
module.exports = WordcutDict;
|
package/lib/index.d.ts
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/** A ready-to-use word segmenter. */
export interface WordcutInstance {
    /** Initializes the segmenter with the rule-based acceptors only (no dictionary). */
    initNoDict(): void;
    /** Initializes the segmenter with the rule-based acceptors plus a dictionary. */
    init(words?: string[], withDefault?: boolean, additionalWords?: string[]): void;
    /** Segments `text` and joins the pieces with `delimiter` ("|" when omitted). */
    cut(text: string, delimiter?: string): string;
    /** Segments `text` and returns the pieces. */
    cutIntoArray(text: string): string[];
    /**
     * Segments `text` into ranges: `s` is the start offset, `e` the end offset
     * (exclusive). `text` is filled in unless `noText` is truthy.
     */
    cutIntoRanges(text: string, noText?: boolean): {
        s: number;
        e: number;
        text?: string;
    }[];
}
|
|
12
|
+
/** Options accepted by `createWordcut`. */
export interface CreateWordcutOptions {
    /** Custom word list passed to `init` as its `words` argument. */
    dictionaryWords?: string[];
    /** Passed to `init` as `withDefault`; the bundled dictionary is used unless this is `false`. */
    withDefaultDict?: boolean;
    /** Extra words passed to `init` as its `additionalWords` argument. */
    additionalWords?: string[];
    /** When truthy, only the rule-based acceptors are installed (no dictionary). */
    noDict?: boolean;
    /**
     * @deprecated No longer supported: `createWordcut` throws when this is
     * set. Pass `dictionaryWords` instead.
     */
    dictPath?: string | string[];
}
|
|
19
|
+
/**
 * Creates and initializes a new segmenter.
 * @throws Error when `options.dictPath` is provided.
 */
export declare function createWordcut(options?: CreateWordcutOptions): WordcutInstance;
/** Segments `text` with the shared default instance and joins the pieces with `delimiter`. */
export declare function cut(text: string, delimiter?: string): string;
/** Segments `text` with the shared default instance and returns the pieces. */
export declare function cutIntoArray(text: string): string[];
/** Segments `text` with the shared default instance and returns the ranges. */
export declare function cutIntoRanges(text: string, noText?: boolean): {
    s: number;
    e: number;
    text?: string;
}[];
|
|
27
|
+
export default createWordcut;
|
package/lib/index.js
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createWordcut = createWordcut;
|
|
4
|
+
exports.cut = cut;
|
|
5
|
+
exports.cutIntoArray = cutIntoArray;
|
|
6
|
+
exports.cutIntoRanges = cutIntoRanges;
|
|
7
|
+
const BaseWordcut = require("./wordcut");
|
|
8
|
+
/** Creates a new object that inherits from the base Wordcut object. */
function createInstance() {
    const instance = Object.create(BaseWordcut);
    return instance;
}
|
|
11
|
+
/**
 * Creates and initializes a new segmenter instance.
 *
 * @param options see `CreateWordcutOptions`; `undefined` and `null` both mean "no options"
 * @returns the initialized instance
 * @throws Error when `options.dictPath` is provided (no longer supported)
 */
function createWordcut(options = {}) {
    // A default parameter only covers `undefined`; accept an explicit `null`
    // as "no options" too instead of crashing on the property reads below.
    const opts = options ?? {};
    const instance = createInstance();
    if (opts.dictPath !== undefined) {
        throw new Error("dictPath is no longer supported. Pass dictionaryWords: string[] instead.");
    }
    if (opts.noDict) {
        // Rule-based acceptors only; no dictionary is built.
        instance.initNoDict();
        return instance;
    }
    instance.init(opts.dictionaryWords, opts.withDefaultDict, opts.additionalWords);
    return instance;
}
|
|
23
|
+
// Shared instance behind the module-level helpers; stays null until it is first needed.
let defaultInstance = null;
|
|
24
|
+
/** Returns the shared default instance, creating it with default options on first use. */
function getDefaultInstance() {
    if (defaultInstance !== null) {
        return defaultInstance;
    }
    defaultInstance = createWordcut();
    return defaultInstance;
}
|
|
30
|
+
/** Segments `text` with the shared default instance and joins the pieces with `delimiter`. */
function cut(text, delimiter) {
    const wordcut = getDefaultInstance();
    return wordcut.cut(text, delimiter);
}
|
|
33
|
+
/** Segments `text` with the shared default instance and returns the pieces as an array. */
function cutIntoArray(text) {
    const wordcut = getDefaultInstance();
    return wordcut.cutIntoArray(text);
}
|
|
36
|
+
/** Segments `text` with the shared default instance and returns the resulting ranges. */
function cutIntoRanges(text, noText) {
    const wordcut = getDefaultInstance();
    return wordcut.cutIntoRanges(text, noText);
}
|
|
39
|
+
exports.default = createWordcut;
|
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
const WordRule = {
|
|
3
|
+
createAcceptor(tag) {
|
|
4
|
+
if (tag.WORD_RULE)
|
|
5
|
+
return null;
|
|
6
|
+
return {
|
|
7
|
+
strOffset: 0,
|
|
8
|
+
isFinal: false,
|
|
9
|
+
isError: false,
|
|
10
|
+
tag: "WORD_RULE",
|
|
11
|
+
type: "WORD_RULE",
|
|
12
|
+
w: 1,
|
|
13
|
+
transit(ch) {
|
|
14
|
+
const lch = ch.toLowerCase();
|
|
15
|
+
if (lch >= "a" && lch <= "z") {
|
|
16
|
+
this.isFinal = true;
|
|
17
|
+
this.strOffset++;
|
|
18
|
+
}
|
|
19
|
+
else {
|
|
20
|
+
this.isError = true;
|
|
21
|
+
}
|
|
22
|
+
return this;
|
|
23
|
+
}
|
|
24
|
+
};
|
|
25
|
+
}
|
|
26
|
+
};
|
|
27
|
+
const NumberRule = {
|
|
28
|
+
createAcceptor(tag) {
|
|
29
|
+
if (tag.NUMBER_RULE)
|
|
30
|
+
return null;
|
|
31
|
+
return {
|
|
32
|
+
strOffset: 0,
|
|
33
|
+
isFinal: false,
|
|
34
|
+
isError: false,
|
|
35
|
+
tag: "NUMBER_RULE",
|
|
36
|
+
type: "NUMBER_RULE",
|
|
37
|
+
w: 1,
|
|
38
|
+
transit(ch) {
|
|
39
|
+
if (ch >= "0" && ch <= "9") {
|
|
40
|
+
this.isFinal = true;
|
|
41
|
+
this.strOffset++;
|
|
42
|
+
}
|
|
43
|
+
else {
|
|
44
|
+
this.isError = true;
|
|
45
|
+
}
|
|
46
|
+
return this;
|
|
47
|
+
}
|
|
48
|
+
};
|
|
49
|
+
}
|
|
50
|
+
};
|
|
51
|
+
const SpaceRule = {
|
|
52
|
+
createAcceptor(tag) {
|
|
53
|
+
if (tag.SPACE_RULE)
|
|
54
|
+
return null;
|
|
55
|
+
return {
|
|
56
|
+
strOffset: 0,
|
|
57
|
+
isFinal: false,
|
|
58
|
+
isError: false,
|
|
59
|
+
tag: "SPACE_RULE",
|
|
60
|
+
type: "SPACE_RULE",
|
|
61
|
+
w: 1,
|
|
62
|
+
transit(ch) {
|
|
63
|
+
if (ch === " " || ch === "\t" || ch === "\r" || ch === "\n" || ch === "\u00A0" || ch === "\u2003") {
|
|
64
|
+
this.isFinal = true;
|
|
65
|
+
this.strOffset++;
|
|
66
|
+
}
|
|
67
|
+
else {
|
|
68
|
+
this.isError = true;
|
|
69
|
+
}
|
|
70
|
+
return this;
|
|
71
|
+
}
|
|
72
|
+
};
|
|
73
|
+
}
|
|
74
|
+
};
|
|
75
|
+
const SingleSymbolRule = {
|
|
76
|
+
createAcceptor() {
|
|
77
|
+
return {
|
|
78
|
+
strOffset: 0,
|
|
79
|
+
isFinal: false,
|
|
80
|
+
isError: false,
|
|
81
|
+
tag: "SINSYM",
|
|
82
|
+
type: "SINSYM",
|
|
83
|
+
w: 1,
|
|
84
|
+
transit(ch) {
|
|
85
|
+
if (this.strOffset === 0 && ch.match(/^[\@\(\)\/\,\-\.\?"`]$/)) {
|
|
86
|
+
this.isFinal = true;
|
|
87
|
+
this.strOffset++;
|
|
88
|
+
}
|
|
89
|
+
else {
|
|
90
|
+
this.isError = true;
|
|
91
|
+
}
|
|
92
|
+
return this;
|
|
93
|
+
}
|
|
94
|
+
};
|
|
95
|
+
}
|
|
96
|
+
};
|
|
97
|
+
// Acceptor creators for non-Thai input: letters, whitespace, single symbols and digits.
const LatinRules = [WordRule, SpaceRule, SingleSymbolRule, NumberRule];
|
|
98
|
+
module.exports = LatinRules;
|
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
import type { Acceptor } from "./acceptors";
|
|
2
|
+
import type { PathInfo } from "./path_selector";
|
|
3
|
+
/** Creates the builder that turns acceptor results into candidate path entries. */
export default function createPathInfoBuilder(): {
    /** Builds one candidate entry per final acceptor ending at index `i`. */
    buildByAcceptors(path: Array<PathInfo>, finalAcceptors: Array<Acceptor>, i: number): Array<PathInfo>;
    /** Builds the "UNK" entry used when no acceptor is final at index `i`. */
    fallback(path: Array<PathInfo>, leftBoundary: number, text: string, i: number): PathInfo;
    /** Returns the acceptor-based candidates, or a single fallback entry when there are none. */
    build(path: Array<PathInfo>, finalAcceptors: Array<Acceptor>, i: number, leftBoundary: number, text: string): Array<PathInfo>;
};
|
|
@@ -0,0 +1,57 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.default = createPathInfoBuilder;
|
|
4
|
+
function createPathInfoBuilder() {
|
|
5
|
+
return {
|
|
6
|
+
buildByAcceptors(path, finalAcceptors, i) {
|
|
7
|
+
const infos = finalAcceptors.map((acceptor) => {
|
|
8
|
+
const p = i - acceptor.strOffset + 1;
|
|
9
|
+
const base = path[p];
|
|
10
|
+
const info = {
|
|
11
|
+
p,
|
|
12
|
+
mw: base.mw + (acceptor.mw ?? 0),
|
|
13
|
+
w: acceptor.w + base.w,
|
|
14
|
+
unk: (acceptor.unk ?? 0) + base.unk,
|
|
15
|
+
type: acceptor.type
|
|
16
|
+
};
|
|
17
|
+
if (acceptor.type === "PART") {
|
|
18
|
+
for (let j = p + 1; j <= i; j++) {
|
|
19
|
+
path[j].merge = p;
|
|
20
|
+
}
|
|
21
|
+
info.merge = p;
|
|
22
|
+
}
|
|
23
|
+
return info;
|
|
24
|
+
});
|
|
25
|
+
return infos.filter(Boolean);
|
|
26
|
+
},
|
|
27
|
+
fallback(path, leftBoundary, text, i) {
|
|
28
|
+
const base = path[leftBoundary];
|
|
29
|
+
if (text[i].match(/[\u0E48-\u0E4E]/)) {
|
|
30
|
+
if (leftBoundary !== 0) {
|
|
31
|
+
leftBoundary = path[leftBoundary].p;
|
|
32
|
+
}
|
|
33
|
+
return {
|
|
34
|
+
p: leftBoundary,
|
|
35
|
+
mw: 0,
|
|
36
|
+
w: 1 + base.w,
|
|
37
|
+
unk: 1 + base.unk,
|
|
38
|
+
type: "UNK"
|
|
39
|
+
};
|
|
40
|
+
}
|
|
41
|
+
return {
|
|
42
|
+
p: leftBoundary,
|
|
43
|
+
mw: base.mw,
|
|
44
|
+
w: 1 + base.w,
|
|
45
|
+
unk: 1 + base.unk,
|
|
46
|
+
type: "UNK"
|
|
47
|
+
};
|
|
48
|
+
},
|
|
49
|
+
build(path, finalAcceptors, i, leftBoundary, text) {
|
|
50
|
+
const basicPathInfos = this.buildByAcceptors(path, finalAcceptors, i);
|
|
51
|
+
if (basicPathInfos.length > 0) {
|
|
52
|
+
return basicPathInfos;
|
|
53
|
+
}
|
|
54
|
+
return [this.fallback(path, leftBoundary, text, i)];
|
|
55
|
+
}
|
|
56
|
+
};
|
|
57
|
+
}
|
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.default = createPathSelector;
|
|
4
|
+
/** Creates the object that seeds a path and picks the best candidate entry. */
function createPathSelector() {
    // True when `candidate` strictly beats `current`, comparing `unk` first,
    // then `mw`, then `w` (smaller wins at every level).
    const beats = (candidate, current) => {
        if (candidate.unk !== current.unk) {
            return candidate.unk < current.unk;
        }
        if (candidate.mw !== current.mw) {
            return candidate.mw < current.mw;
        }
        return candidate.w < current.w;
    };
    return {
        /**
         * Returns the best entry of `paths`, or null when `paths` is empty.
         * On a full tie the earlier entry is kept.
         */
        selectPath(paths) {
            let best = null;
            paths.forEach((candidate) => {
                if (best === null || beats(candidate, best)) {
                    best = candidate;
                }
            });
            return best;
        },
        /** Returns a new path containing only the initial "INIT" entry. */
        createPath() {
            const initial = { p: null, w: 0, unk: 0, type: "INIT", mw: 0 };
            return [initial];
        }
    };
}
|
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/** A tree edge target: `[child node id, is the end of a word, payload or null]`. */
export type PrefixTreeChild<T = unknown> = [number, boolean, T | null];
/** Prefix tree stored as a flat table of edges. */
export interface PrefixTree<T = unknown> {
    /** Edge table keyed by the stringified `[nodeId, offset, ch]` triple. */
    tab: Record<string, PrefixTreeChild<T>>;
    /** Returns the child reached from `nodeId` at depth `offset` via `ch`, or null when there is none. */
    lookup(nodeId: number, offset: number, ch: string): PrefixTreeChild<T> | null;
}
/** Builds a prefix tree from `[word, payload]` pairs; a null or undefined list yields an empty tree. */
export declare function createPrefixTree<T = unknown>(wordPayloads: Array<[string, T]> | null | undefined): PrefixTree<T>;
|
|
@@ -0,0 +1,31 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
Object.defineProperty(exports, "__esModule", { value: true });
|
|
3
|
+
exports.createPrefixTree = createPrefixTree;
|
|
4
|
+
function createPrefixTree(wordPayloads) {
|
|
5
|
+
const tab = {};
|
|
6
|
+
if (wordPayloads) {
|
|
7
|
+
wordPayloads.forEach(([word, payload], i) => {
|
|
8
|
+
let rowNo = 0;
|
|
9
|
+
for (let j = 0; j < word.length; j++) {
|
|
10
|
+
const ch = word[j];
|
|
11
|
+
const key = String([rowNo, j, ch]);
|
|
12
|
+
const child = tab[key];
|
|
13
|
+
if (child) {
|
|
14
|
+
rowNo = child[0];
|
|
15
|
+
}
|
|
16
|
+
else {
|
|
17
|
+
const isFinal = j + 1 === word.length;
|
|
18
|
+
tab[key] = [i, isFinal, isFinal ? payload : null];
|
|
19
|
+
rowNo = i;
|
|
20
|
+
}
|
|
21
|
+
}
|
|
22
|
+
});
|
|
23
|
+
}
|
|
24
|
+
return {
|
|
25
|
+
tab,
|
|
26
|
+
lookup(nodeId, offset, ch) {
|
|
27
|
+
const child = this.tab[String([nodeId, offset, ch])];
|
|
28
|
+
return child ?? null;
|
|
29
|
+
}
|
|
30
|
+
};
|
|
31
|
+
}
|
|
@@ -0,0 +1,62 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
/**
 * Tests whether `ch` fits the pattern character at `offset`.
 *
 * Besides a literal match, the pattern characters ก ข ม บ stand for any
 * character in ก-ฮ, and U+0E49 stands for any mark in U+0E48..U+0E4B.
 * Returns false when `offset` is past the end of the pattern.
 */
function isMatch(pat, offset, ch) {
    if (offset >= pat.length) {
        return false;
    }
    const expected = pat[offset];
    if (expected === ch) {
        return true;
    }
    const wantsConsonant = !!expected.match(/[กข]/) || !!expected.match(/[มบ]/);
    if (wantsConsonant && !!ch.match(/[ก-ฮ]/)) {
        return true;
    }
    return !!expected.match(/\u0E49/) && !!ch.match(/[\u0E48-\u0E4B]/);
}
|
|
11
|
+
/** Rule accepting text that fits the fixed five-character pattern below. */
const Rule0 = {
    /** Always returns a fresh acceptor. */
    createAcceptor() {
        const pattern = "เหก็ม";
        return {
            strOffset: 0,
            isFinal: false,
            isError: false,
            tag: "THAI_RULE",
            type: "THAI_RULE",
            w: 1,
            /** Consumes one character; final only once the whole pattern has been consumed. */
            transit(ch) {
                if (!isMatch(pattern, this.strOffset, ch)) {
                    this.isError = true;
                    return this;
                }
                const consumed = this.strOffset + 1;
                this.isFinal = consumed === pattern.length;
                this.strOffset++;
                return this;
            }
        };
    }
};
|
|
34
|
+
/**
 * Rule accepting short fragments that fit one of several patterns. Its
 * acceptors carry `unk: 1` and the type "PART".
 */
const PartRule = {
    /** Always returns a fresh acceptor holding the full pattern list. */
    createAcceptor() {
        return {
            strOffset: 0,
            isFinal: false,
            isError: false,
            tag: "PART",
            type: "PART",
            unk: 1,
            w: 1,
            patterns: ["แก", "เก", "ก้", "กก์", "กา", "กี", "กิ", "กืก"],
            /**
             * Consumes one character: keeps only the patterns that still fit,
             * errors when none is left, and is final when a surviving pattern
             * has been consumed completely.
             */
            transit(ch) {
                const offset = this.strOffset;
                const survivors = this.patterns.filter((pat) => isMatch(pat, offset, ch));
                this.patterns = survivors;
                if (survivors.length === 0) {
                    this.isError = true;
                    return this;
                }
                const consumed = 1 + offset;
                this.isFinal = survivors.some((pat) => pat.length === consumed);
                this.strOffset++;
                return this;
            }
        };
    }
};
|
|
61
|
+
// Acceptor creators for the Thai pattern rules exported by this module.
const ThaiRules = [Rule0, PartRule];
|
|
62
|
+
module.exports = ThaiRules;
|
package/lib/wordcut.d.ts
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
1
|
+
import WordcutDict = require("./dict");
|
|
2
|
+
import createPathInfoBuilder from "./path_info_builder";
|
|
3
|
+
import createPathSelector from "./path_selector";
|
|
4
|
+
import createAcceptors from "./acceptors";
|
|
5
|
+
import latinRules = require("./latin_rules");
|
|
6
|
+
import thaiRules = require("./thai_rules");
|
|
7
|
+
/** Members that the Wordcut object adds on top of the core segmenter. */
interface WordcutType {
    /** Factory used by `initNoDict` to create `pathInfoBuilder`. */
    defaultPathInfoBuilder: typeof createPathInfoBuilder;
    /** Factory used by `initNoDict` to create `pathSelector`. */
    defaultPathSelector: typeof createPathSelector;
    /** Factory used by `initNoDict` to create `acceptors`. */
    defaultAcceptors: typeof createAcceptors;
    /** Latin rules registered by `initNoDict`. */
    defaultLatinRules: typeof latinRules;
    /** Thai rules registered by `initNoDict`. */
    defaultThaiRules: typeof thaiRules;
    /** Dictionary object that `init` copies for each initialization. */
    defaultDict: typeof WordcutDict;
    /** Set by `initNoDict`. */
    pathInfoBuilder: ReturnType<typeof createPathInfoBuilder>;
    /** Set by `initNoDict`. */
    pathSelector: ReturnType<typeof createPathSelector>;
    /** Set by `initNoDict`. */
    acceptors: ReturnType<typeof createAcceptors>;
    /** Installs the collaborators and the Latin and Thai rules, without a dictionary. */
    initNoDict(): void;
    /** Runs `initNoDict` and then registers a freshly initialized dictionary. */
    init(words?: string[], withDefault?: boolean, additionalWords?: string[]): void;
}
|
|
20
|
+
/** The Wordcut object exported by this module. */
declare const Wordcut: WordcutType;
|
|
21
|
+
export = Wordcut;
|
package/lib/wordcut.js
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
// Compiler-emitted interop helper: returns ES-module namespaces untouched and
// wraps anything else as `{ default: mod }`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    const isEsModule = Boolean(mod && mod.__esModule);
    return isEsModule ? mod : { "default": mod };
};
|
|
5
|
+
const WordcutDict = require("./dict");
|
|
6
|
+
const WordcutCore = require("./wordcut_core");
|
|
7
|
+
const path_info_builder_1 = __importDefault(require("./path_info_builder"));
|
|
8
|
+
const path_selector_1 = __importDefault(require("./path_selector"));
|
|
9
|
+
const acceptors_1 = __importDefault(require("./acceptors"));
|
|
10
|
+
const latinRules = require("./latin_rules");
|
|
11
|
+
const thaiRules = require("./thai_rules");
|
|
12
|
+
// The concrete segmenter: inherits the core methods from WordcutCore through
// its prototype and carries the default collaborators as own properties.
const Wordcut = Object.assign(Object.create(WordcutCore), {
    defaultPathInfoBuilder: path_info_builder_1.default,
    defaultPathSelector: path_selector_1.default,
    defaultAcceptors: acceptors_1.default,
    defaultLatinRules: latinRules,
    defaultThaiRules: thaiRules,
    defaultDict: WordcutDict,
});
/**
 * Sets up a segmenter without a dictionary: creates fresh collaborators and
 * registers the Latin rules followed by the Thai rules as acceptor creators.
 */
Wordcut.initNoDict = function initNoDict() {
    this.pathInfoBuilder = this.defaultPathInfoBuilder();
    this.pathSelector = this.defaultPathSelector();
    this.acceptors = this.defaultAcceptors();
    const creators = this.acceptors.creators;
    this.defaultLatinRules.concat(this.defaultThaiRules).forEach((rule) => {
        creators.push(rule);
    });
};
/**
 * Sets up a segmenter with a dictionary: runs `initNoDict`, then initializes
 * a shallow copy of the default dictionary and registers it as an additional
 * acceptor creator. The bundled word list is used unless `withDefault` is
 * exactly `false`.
 */
Wordcut.init = function init(words, withDefault, additionalWords) {
    const useDefault = withDefault !== false;
    this.initNoDict();
    // Copy the dictionary object so this initialization gets its own
    // `dict` / `tree` properties instead of writing to the shared default.
    const dict = Object.assign({}, this.defaultDict);
    dict.init(words, useDefault, additionalWords);
    this.acceptors.creators.push(dict);
};
|
|
37
|
+
module.exports = Wordcut;
|
|
@@ -0,0 +1,32 @@
|
|
|
1
|
+
import type { PathInfo } from "./path_selector";
|
|
2
|
+
/** A segment of the input text. */
interface Range {
    /** Start offset of the segment. */
    s: number;
    /** End offset of the segment (exclusive; used as the `substring` end). */
    e: number;
    /** The segment's text, when it was requested. */
    text?: string;
}
|
|
7
|
+
/** What the core methods expect to find on `this`. */
interface WordcutCoreLike {
    buildPath(text: string): Array<PathInfo>;
    pathToRanges(path: Array<PathInfo>): Array<Range>;
    rangesToText(text: string, ranges: Array<Range>, delimiter: string): string;
    /** Seeds a new path and picks the best candidate entry at each position. */
    pathSelector: {
        createPath(): Array<PathInfo>;
        selectPath(paths: Array<PathInfo>): PathInfo;
    };
    /** Produces the candidate entries for a position. */
    pathInfoBuilder: {
        build(path: Array<PathInfo>, finalAcceptors: Array<unknown>, i: number, leftBoundary: number, text: string): Array<PathInfo>;
    };
    /** The acceptor set that is fed the text one character at a time. */
    acceptors: {
        reset(): void;
        transit(ch: string): void;
        getFinalAcceptors(): Array<unknown>;
    };
}
|
|
24
|
+
/** The core segmentation methods. */
declare const WordcutCore: {
    /** Builds the path for `text`: the initial entry plus one entry per character. */
    buildPath(this: WordcutCoreLike, text: string): Array<PathInfo>;
    /** Walks `path` backwards and returns the segments in text order. */
    pathToRanges(path: Array<PathInfo>): Array<Range>;
    /** Joins the text of each range with `delimiter`. */
    rangesToText(text: string, ranges: Array<Range>, delimiter: string): string;
    /** Segments `text` and joins the pieces with `delimiter` ("|" when omitted). */
    cut(this: WordcutCoreLike, text: string, delimiter?: string): string;
    /** Segments `text` into ranges; `text` is filled in unless `noText` is truthy. */
    cutIntoRanges(this: WordcutCoreLike, text: string, noText?: boolean): Array<Range>;
    /** Segments `text` and returns the pieces. */
    cutIntoArray(this: WordcutCoreLike, text: string): string[];
};
|
|
32
|
+
export = WordcutCore;
|
|
@@ -0,0 +1,61 @@
|
|
|
1
|
+
"use strict";
|
|
2
|
+
const WordcutCore = {
|
|
3
|
+
buildPath(text) {
|
|
4
|
+
const path = this.pathSelector.createPath();
|
|
5
|
+
let leftBoundary = 0;
|
|
6
|
+
this.acceptors.reset();
|
|
7
|
+
for (let i = 0; i < text.length; i++) {
|
|
8
|
+
const ch = text[i];
|
|
9
|
+
this.acceptors.transit(ch);
|
|
10
|
+
const possiblePathInfos = this.pathInfoBuilder.build(path, this.acceptors.getFinalAcceptors(), i, leftBoundary, text);
|
|
11
|
+
const selectedPath = this.pathSelector.selectPath(possiblePathInfos);
|
|
12
|
+
path.push(selectedPath);
|
|
13
|
+
if (selectedPath.type !== "UNK") {
|
|
14
|
+
leftBoundary = i;
|
|
15
|
+
}
|
|
16
|
+
}
|
|
17
|
+
return path;
|
|
18
|
+
},
|
|
19
|
+
pathToRanges(path) {
|
|
20
|
+
let e = path.length - 1;
|
|
21
|
+
const ranges = [];
|
|
22
|
+
while (e > 0) {
|
|
23
|
+
const info = path[e];
|
|
24
|
+
let s = info.p;
|
|
25
|
+
if (info.merge !== undefined && ranges.length > 0) {
|
|
26
|
+
const r = ranges[ranges.length - 1];
|
|
27
|
+
r.s = info.merge;
|
|
28
|
+
s = r.s;
|
|
29
|
+
}
|
|
30
|
+
else {
|
|
31
|
+
ranges.push({ s, e });
|
|
32
|
+
}
|
|
33
|
+
e = s;
|
|
34
|
+
}
|
|
35
|
+
return ranges.reverse();
|
|
36
|
+
},
|
|
37
|
+
rangesToText(text, ranges, delimiter) {
|
|
38
|
+
return ranges.map((r) => text.substring(r.s, r.e)).join(delimiter);
|
|
39
|
+
},
|
|
40
|
+
cut(text, delimiter) {
|
|
41
|
+
const path = this.buildPath(text);
|
|
42
|
+
const ranges = this.pathToRanges(path);
|
|
43
|
+
return this.rangesToText(text, ranges, delimiter === undefined ? "|" : delimiter);
|
|
44
|
+
},
|
|
45
|
+
cutIntoRanges(text, noText) {
|
|
46
|
+
const path = this.buildPath(text);
|
|
47
|
+
const ranges = this.pathToRanges(path);
|
|
48
|
+
if (!noText) {
|
|
49
|
+
ranges.forEach((r) => {
|
|
50
|
+
r.text = text.substring(r.s, r.e);
|
|
51
|
+
});
|
|
52
|
+
}
|
|
53
|
+
return ranges;
|
|
54
|
+
},
|
|
55
|
+
cutIntoArray(text) {
|
|
56
|
+
const path = this.buildPath(text);
|
|
57
|
+
const ranges = this.pathToRanges(path);
|
|
58
|
+
return ranges.map((r) => text.substring(r.s, r.e));
|
|
59
|
+
}
|
|
60
|
+
};
|
|
61
|
+
module.exports = WordcutCore;
|