sudachi-ts 0.1.15 → 0.1.16
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/build/src/core/japaneseTokenizer.js +3 -0
- package/build/src/plugins/pathRewrite/base.d.ts +2 -0
- package/build/src/plugins/pathRewrite/base.js +1 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts +2 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js +6 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -281,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
|
|
|
281
281
|
`TokenChunkerPlugin` is designed and validated against the full Sudachi system
|
|
282
282
|
dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
|
|
283
283
|
when adding or tuning chunk rules.
|
|
284
|
+
`TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
|
|
285
|
+
`tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
|
|
286
|
+
enabled throws an error.
|
|
284
287
|
|
|
285
288
|
## Dictionary Building
|
|
286
289
|
|
|
package/build/src/core/japaneseTokenizer.js
CHANGED
|
@@ -140,6 +140,9 @@ export class JapaneseTokenizer {
|
|
|
140
140
|
return builder.build();
|
|
141
141
|
}
|
|
142
142
|
tokenizeSentence(mode, input) {
|
|
143
|
+
for (const plugin of this.pathRewritePlugins) {
|
|
144
|
+
plugin.validateSplitMode(mode);
|
|
145
|
+
}
|
|
143
146
|
this.buildLattice(input);
|
|
144
147
|
const path = this.lattice.getBestPath();
|
|
145
148
|
for (const plugin of this.pathRewritePlugins) {
|
|
package/build/src/plugins/pathRewrite/base.d.ts
CHANGED
|
@@ -1,10 +1,12 @@
|
|
|
1
1
|
import type { InputText } from '../../core/inputText.js';
|
|
2
2
|
import type { Lattice, LatticeNode } from '../../core/lattice.js';
|
|
3
|
+
import type { SplitMode } from '../../core/tokenizer.js';
|
|
3
4
|
import type { CategoryType } from '../../dictionary/categoryType.js';
|
|
4
5
|
import type { Grammar } from '../../dictionary/grammar.js';
|
|
5
6
|
import { Plugin } from '../base.js';
|
|
6
7
|
export declare abstract class PathRewritePlugin extends Plugin {
|
|
7
8
|
setUp(_grammar: Grammar): void;
|
|
9
|
+
validateSplitMode(_mode: SplitMode): void;
|
|
8
10
|
abstract rewrite(text: InputText, path: LatticeNode[], lattice: Lattice): void;
|
|
9
11
|
concatenate(path: LatticeNode[], begin: number, end: number, lattice: Lattice, normalizedForm?: string | null): LatticeNode;
|
|
10
12
|
concatenateOov(path: LatticeNode[], begin: number, end: number, posId: number, lattice: Lattice): LatticeNode;
|
|
package/build/src/plugins/pathRewrite/base.js
CHANGED
|
@@ -2,6 +2,7 @@ import { WordInfo } from '../../dictionary/wordInfo.js';
|
|
|
2
2
|
import { Plugin } from '../base.js';
|
|
3
3
|
export class PathRewritePlugin extends Plugin {
|
|
4
4
|
setUp(_grammar) { }
|
|
5
|
+
validateSplitMode(_mode) { }
|
|
5
6
|
concatenate(path, begin, end, lattice, normalizedForm = null) {
|
|
6
7
|
if (begin >= end) {
|
|
7
8
|
throw new Error('begin >= end');
|
|
package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
|
1
1
|
import type { InputText } from '../../core/inputText.js';
|
|
2
2
|
import type { Lattice, LatticeNode } from '../../core/lattice.js';
|
|
3
|
+
import { SplitMode } from '../../core/tokenizer.js';
|
|
3
4
|
import type { Grammar } from '../../dictionary/grammar.js';
|
|
4
5
|
import { PathRewritePlugin } from './base.js';
|
|
5
6
|
export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
@@ -7,6 +8,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
7
8
|
private enablePatternRules;
|
|
8
9
|
private enableBroadRules;
|
|
9
10
|
setUp(grammar: Grammar): void;
|
|
11
|
+
validateSplitMode(mode: SplitMode): void;
|
|
10
12
|
rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
|
|
11
13
|
private toInitialChunks;
|
|
12
14
|
private applyPatternStage;
|
|
package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { SplitMode } from '../../core/tokenizer.js';
|
|
1
2
|
import { WordInfo } from '../../dictionary/wordInfo.js';
|
|
2
3
|
import { PathRewritePlugin } from './base.js';
|
|
3
4
|
export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
@@ -9,6 +10,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
9
10
|
this.enablePatternRules = this.settings.getBoolean('enablePatternRules', true);
|
|
10
11
|
this.enableBroadRules = this.settings.getBoolean('enableBroadRules', false);
|
|
11
12
|
}
|
|
13
|
+
validateSplitMode(mode) {
|
|
14
|
+
if (mode !== SplitMode.C) {
|
|
15
|
+
throw new Error('TokenChunkerPlugin requires SplitMode.C. Use tokenizer.tokenize(text) or tokenizer.tokenize(SplitMode.C, text).');
|
|
16
|
+
}
|
|
17
|
+
}
|
|
12
18
|
rewrite(_text, path, lattice) {
|
|
13
19
|
if (path.length === 0) {
|
|
14
20
|
return;
|