sudachi-ts 0.1.11 → 0.1.15
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -32
- package/build/bin/sudachi-build-system.js +0 -0
- package/build/bin/sudachi-build-user.js +0 -0
- package/build/bin/sudachi-print-dict.js +0 -0
- package/build/bin/sudachi-print-header.js +0 -0
- package/build/bin/sudachi.js +0 -0
- package/build/src/config/config.js +10 -3
- package/build/src/config/settings.d.ts +8 -2
- package/build/src/config/settings.js +29 -7
- package/build/src/dictionary/characterCategory.js +2 -2
- package/build/src/dictionary/dictionaryFactory.js +2 -2
- package/build/src/plugins/inputText/defaultInputTextPlugin.js +1 -1
- package/build/src/plugins/loader.d.ts +5 -0
- package/build/src/plugins/loader.js +27 -1
- package/build/src/plugins/oov/meCabOovProviderPlugin.js +2 -2
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts +6 -6
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js +967 -147
- package/package.json +58 -54
package/README.md
CHANGED
|
@@ -105,7 +105,7 @@ const config = await loadConfig('./sudachi.json');
|
|
|
105
105
|
const dict = Dictionary.create();
|
|
106
106
|
```
|
|
107
107
|
|
|
108
|
-
Example `sudachi.json`:
|
|
108
|
+
Example `sudachi.json`:
|
|
109
109
|
|
|
110
110
|
```json
|
|
111
111
|
{
|
|
@@ -120,10 +120,14 @@ Example `sudachi.json`:
|
|
|
120
120
|
}
|
|
121
121
|
}
|
|
122
122
|
]
|
|
123
|
-
}
|
|
124
|
-
```
|
|
125
|
-
|
|
126
|
-
|
|
123
|
+
}
|
|
124
|
+
```
|
|
125
|
+
|
|
126
|
+
For non-absolute file references in config (dictionary files, plugin module paths,
|
|
127
|
+
and built-in plugin file settings), Sudachi-TS tries paths relative to the config
|
|
128
|
+
file first, then relative to the current working directory.
|
|
129
|
+
|
|
130
|
+
By default, Sudachi-TS enables a built-in compound-particle lexicon
|
|
127
131
|
(`"enableDefaultCompoundParticles": true`) so forms such as `かも`, `のか`,
|
|
128
132
|
and `だから` are tokenized as single morphemes. Set it to `false` to disable:
|
|
129
133
|
|
|
@@ -266,16 +270,19 @@ const plugin = await loader.loadInputTextPlugin(
|
|
|
266
270
|
|
|
267
271
|
See [PLUGINS.md](./PLUGINS.md) for detailed plugin development guide.
|
|
268
272
|
|
|
269
|
-
Quick local comparison for the PoC token chunker plugin:
|
|
270
|
-
|
|
271
|
-
```bash
|
|
272
|
-
|
|
273
|
-
```
|
|
274
|
-
|
|
275
|
-
This example prints each token as `surface/reading` so the chunking impact on
|
|
276
|
-
readings is visible in the baseline vs plugin outputs.
|
|
277
|
-
|
|
278
|
-
|
|
273
|
+
Quick local comparison for the PoC token chunker plugin:
|
|
274
|
+
|
|
275
|
+
```bash
|
|
276
|
+
npm exec tsx examples/token-chunker-plugin.ts /path/to/system.dic "東京大学"
|
|
277
|
+
```
|
|
278
|
+
|
|
279
|
+
This example prints each token as `surface/reading` so the chunking impact on
|
|
280
|
+
readings is visible in the baseline vs plugin outputs.
|
|
281
|
+
`TokenChunkerPlugin` is designed and validated against the full Sudachi system
|
|
282
|
+
dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
|
|
283
|
+
when adding or tuning chunk rules.
|
|
284
|
+
|
|
285
|
+
## Dictionary Building
|
|
279
286
|
|
|
280
287
|
Build binary dictionaries from CSV source:
|
|
281
288
|
|
|
@@ -325,23 +332,23 @@ See [CONFIG.md](./CONFIG.md) for detailed configuration options.
|
|
|
325
332
|
|
|
326
333
|
## Development
|
|
327
334
|
|
|
328
|
-
```bash
|
|
329
|
-
# Clone repository
|
|
330
|
-
git clone https://github.com/your-org/sudachi-ts.git
|
|
331
|
-
cd sudachi-ts
|
|
332
|
-
|
|
333
|
-
# Install dependencies
|
|
334
|
-
|
|
335
|
-
|
|
336
|
-
# Type check
|
|
337
|
-
|
|
338
|
-
|
|
339
|
-
# Run tests
|
|
340
|
-
|
|
341
|
-
|
|
342
|
-
# Lint
|
|
343
|
-
|
|
344
|
-
```
|
|
335
|
+
```bash
|
|
336
|
+
# Clone repository
|
|
337
|
+
git clone https://github.com/your-org/sudachi-ts.git
|
|
338
|
+
cd sudachi-ts
|
|
339
|
+
|
|
340
|
+
# Install dependencies
|
|
341
|
+
npm install
|
|
342
|
+
|
|
343
|
+
# Type check
|
|
344
|
+
npm run typecheck
|
|
345
|
+
|
|
346
|
+
# Run tests
|
|
347
|
+
npm test
|
|
348
|
+
|
|
349
|
+
# Lint
|
|
350
|
+
npm run check:fix
|
|
351
|
+
```
|
|
345
352
|
|
|
346
353
|
## Architecture
|
|
347
354
|
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
package/build/bin/sudachi.js
CHANGED
|
File without changes
|
|
@@ -15,8 +15,8 @@ export class Config {
|
|
|
15
15
|
static async fromFile(filePath) {
|
|
16
16
|
const content = await readFile(filePath, 'utf-8');
|
|
17
17
|
const baseDir = dirname(filePath);
|
|
18
|
-
const anchor = PathAnchor.filesystem(baseDir);
|
|
19
|
-
return new Config(Settings.parse(content), anchor);
|
|
18
|
+
const anchor = PathAnchor.filesystem(baseDir).andThen(PathAnchor.none());
|
|
19
|
+
return new Config(Settings.parse(content, anchor), anchor);
|
|
20
20
|
}
|
|
21
21
|
static parse(json) {
|
|
22
22
|
return new Config(Settings.parse(json));
|
|
@@ -61,7 +61,14 @@ export class Config {
|
|
|
61
61
|
return this.settings.getIntList(key);
|
|
62
62
|
}
|
|
63
63
|
getPlugins(key) {
|
|
64
|
-
|
|
64
|
+
const plugins = this.settings.getPlugins(key);
|
|
65
|
+
if (!plugins) {
|
|
66
|
+
return null;
|
|
67
|
+
}
|
|
68
|
+
return plugins.map((plugin) => ({
|
|
69
|
+
className: plugin.className,
|
|
70
|
+
settings: plugin.settings.withAnchor(this.anchor),
|
|
71
|
+
}));
|
|
65
72
|
}
|
|
66
73
|
}
|
|
67
74
|
export async function loadConfig(configPath) {
|
|
@@ -1,12 +1,18 @@
|
|
|
1
|
+
import { PathAnchor } from './pathAnchor.js';
|
|
1
2
|
export type PluginConf<_T> = {
|
|
2
3
|
className: string;
|
|
3
4
|
settings: Settings;
|
|
4
5
|
};
|
|
5
6
|
export declare class Settings {
|
|
6
7
|
private readonly data;
|
|
7
|
-
|
|
8
|
+
private readonly anchor;
|
|
9
|
+
constructor(data?: Record<string, unknown>, anchor?: PathAnchor);
|
|
8
10
|
static empty(): Settings;
|
|
9
|
-
static parse(json: string,
|
|
11
|
+
static parse(json: string, basePathOrAnchor?: string | PathAnchor): Settings;
|
|
12
|
+
getAnchor(): PathAnchor;
|
|
13
|
+
withAnchor(anchor: PathAnchor): Settings;
|
|
14
|
+
getPath(key: string, defaultValue?: string): Promise<string | null>;
|
|
15
|
+
toObject(): Record<string, unknown>;
|
|
10
16
|
getString(key: string, defaultValue?: string): string | null;
|
|
11
17
|
getInt(key: string, defaultValue?: number): number;
|
|
12
18
|
getBoolean(key: string, defaultValue: boolean): boolean;
|
|
@@ -1,17 +1,39 @@
|
|
|
1
|
+
import { PathAnchor } from './pathAnchor.js';
|
|
1
2
|
export class Settings {
|
|
2
3
|
data;
|
|
3
|
-
|
|
4
|
+
anchor;
|
|
5
|
+
constructor(data = {}, anchor = PathAnchor.none()) {
|
|
4
6
|
this.data = { ...data };
|
|
7
|
+
this.anchor = anchor;
|
|
5
8
|
}
|
|
6
9
|
static empty() {
|
|
7
|
-
return new Settings({});
|
|
10
|
+
return new Settings({}, PathAnchor.none());
|
|
8
11
|
}
|
|
9
|
-
static parse(json,
|
|
12
|
+
static parse(json, basePathOrAnchor) {
|
|
10
13
|
const data = JSON.parse(json);
|
|
11
14
|
if (typeof data !== 'object' || data === null) {
|
|
12
15
|
throw new Error('root must be an object');
|
|
13
16
|
}
|
|
14
|
-
|
|
17
|
+
if (typeof basePathOrAnchor === 'string') {
|
|
18
|
+
return new Settings(data, PathAnchor.filesystem(basePathOrAnchor).andThen(PathAnchor.none()));
|
|
19
|
+
}
|
|
20
|
+
return new Settings(data, basePathOrAnchor ?? PathAnchor.none());
|
|
21
|
+
}
|
|
22
|
+
getAnchor() {
|
|
23
|
+
return this.anchor;
|
|
24
|
+
}
|
|
25
|
+
withAnchor(anchor) {
|
|
26
|
+
return new Settings(this.data, anchor);
|
|
27
|
+
}
|
|
28
|
+
async getPath(key, defaultValue) {
|
|
29
|
+
const value = this.getString(key, defaultValue);
|
|
30
|
+
if (value === null) {
|
|
31
|
+
return null;
|
|
32
|
+
}
|
|
33
|
+
return await this.anchor.resolve(value);
|
|
34
|
+
}
|
|
35
|
+
toObject() {
|
|
36
|
+
return { ...this.data };
|
|
15
37
|
}
|
|
16
38
|
getString(key, defaultValue) {
|
|
17
39
|
const value = this.data[key];
|
|
@@ -70,7 +92,7 @@ export class Settings {
|
|
|
70
92
|
const obj = item;
|
|
71
93
|
return {
|
|
72
94
|
className: obj.class,
|
|
73
|
-
settings: new Settings({ ...obj }),
|
|
95
|
+
settings: new Settings({ ...obj }, this.anchor),
|
|
74
96
|
};
|
|
75
97
|
}
|
|
76
98
|
throw new Error(`sub-object for ${key} didn't have class key`);
|
|
@@ -79,9 +101,9 @@ export class Settings {
|
|
|
79
101
|
return null;
|
|
80
102
|
}
|
|
81
103
|
withFallback(other) {
|
|
82
|
-
return new Settings({ ...other.data, ...this.data });
|
|
104
|
+
return new Settings({ ...other.data, ...this.data }, this.anchor.andThen(other.anchor));
|
|
83
105
|
}
|
|
84
106
|
merge(overrides) {
|
|
85
|
-
return new Settings({ ...this.data, ...overrides });
|
|
107
|
+
return new Settings({ ...this.data, ...overrides }, this.anchor);
|
|
86
108
|
}
|
|
87
109
|
}
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { readFile } from 'node:fs/promises';
|
|
1
2
|
import { CategoryType } from './categoryType.js';
|
|
2
3
|
export class CharacterCategory {
|
|
3
4
|
static PATTERN_SPACES = /\s+/;
|
|
@@ -67,8 +68,7 @@ export class CharacterCategory {
|
|
|
67
68
|
static async loadDefault() {
|
|
68
69
|
const charCategory = new CharacterCategory();
|
|
69
70
|
try {
|
|
70
|
-
const
|
|
71
|
-
const content = await response.text();
|
|
71
|
+
const content = await readFile(new URL('../resources/char.def', import.meta.url), 'utf-8');
|
|
72
72
|
charCategory.readCharacterDefinition(content);
|
|
73
73
|
}
|
|
74
74
|
catch (e) {
|
|
@@ -44,8 +44,8 @@ export class DictionaryFactory {
|
|
|
44
44
|
ensureLexiconSet().add(userDict.getLexicon());
|
|
45
45
|
}
|
|
46
46
|
}
|
|
47
|
-
const loader = new PluginLoader();
|
|
48
|
-
const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON);
|
|
47
|
+
const loader = new PluginLoader(anchor);
|
|
48
|
+
const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON).setAnchor(anchor);
|
|
49
49
|
let inputTextPluginConfs = config.getPlugins('inputTextPlugin');
|
|
50
50
|
if (!inputTextPluginConfs || inputTextPluginConfs.length === 0) {
|
|
51
51
|
inputTextPluginConfs = defaultConfig.getPlugins('inputTextPlugin');
|
|
@@ -9,7 +9,7 @@ export class DefaultInputTextPlugin extends InputTextPlugin {
|
|
|
9
9
|
if (this.initialized) {
|
|
10
10
|
return;
|
|
11
11
|
}
|
|
12
|
-
const rewriteDefPath = this.settings.
|
|
12
|
+
const rewriteDefPath = await this.settings.getPath('rewriteDef');
|
|
13
13
|
if (rewriteDefPath) {
|
|
14
14
|
const content = await readFully(rewriteDefPath);
|
|
15
15
|
this.readRewriteLists(content);
|
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { PathAnchor } from '../config/pathAnchor.js';
|
|
1
2
|
import type { Settings } from '../config/settings.js';
|
|
2
3
|
import type { Grammar } from '../dictionary/grammar.js';
|
|
3
4
|
import type { Plugin } from './base.js';
|
|
@@ -11,6 +12,8 @@ export interface LoadedPlugin<T extends Plugin> {
|
|
|
11
12
|
className: string;
|
|
12
13
|
}
|
|
13
14
|
export declare class PluginLoader {
|
|
15
|
+
private readonly anchor;
|
|
16
|
+
constructor(anchor?: PathAnchor);
|
|
14
17
|
loadInputTextPlugin(className: string, settings: Settings): Promise<LoadedPlugin<InputTextPlugin>>;
|
|
15
18
|
loadOovProviderPlugin(className: string, settings: Settings): Promise<LoadedPlugin<OovProviderPlugin>>;
|
|
16
19
|
loadPathRewritePlugin(className: string, settings: Settings): Promise<LoadedPlugin<PathRewritePlugin>>;
|
|
@@ -35,6 +38,8 @@ export declare class PluginLoader {
|
|
|
35
38
|
private loadPlugin;
|
|
36
39
|
private findPluginClass;
|
|
37
40
|
private isPluginConstructor;
|
|
41
|
+
private resolveClassSpecifier;
|
|
42
|
+
private isPathLikeSpecifier;
|
|
38
43
|
private isBuiltIn;
|
|
39
44
|
private getBuiltIn;
|
|
40
45
|
}
|
|
@@ -1,3 +1,6 @@
|
|
|
1
|
+
import { isAbsolute, resolve } from 'node:path';
|
|
2
|
+
import { pathToFileURL } from 'node:url';
|
|
3
|
+
import { PathAnchor } from '../config/pathAnchor.js';
|
|
1
4
|
import { InhibitConnectionPlugin } from './connection/inhibitConnectionPlugin.js';
|
|
2
5
|
import { DefaultInputTextPlugin } from './inputText/defaultInputTextPlugin.js';
|
|
3
6
|
import { IgnoreYomiganaPlugin } from './inputText/ignoreYomiganaPlugin.js';
|
|
@@ -9,6 +12,10 @@ import { JoinKatakanaOovPlugin } from './pathRewrite/joinKatakanaOovPlugin.js';
|
|
|
9
12
|
import { JoinNumericPlugin } from './pathRewrite/joinNumericPlugin.js';
|
|
10
13
|
import { TokenChunkerPlugin } from './pathRewrite/tokenChunkerPlugin.js';
|
|
11
14
|
export class PluginLoader {
|
|
15
|
+
anchor;
|
|
16
|
+
constructor(anchor = PathAnchor.none()) {
|
|
17
|
+
this.anchor = anchor;
|
|
18
|
+
}
|
|
12
19
|
async loadInputTextPlugin(className, settings) {
|
|
13
20
|
const plugin = await this.loadPlugin(className, settings);
|
|
14
21
|
return { plugin, className };
|
|
@@ -72,7 +79,8 @@ export class PluginLoader {
|
|
|
72
79
|
PluginClass = this.getBuiltIn(className);
|
|
73
80
|
}
|
|
74
81
|
else {
|
|
75
|
-
const
|
|
82
|
+
const classSpecifier = await this.resolveClassSpecifier(className);
|
|
83
|
+
const module = await import(classSpecifier);
|
|
76
84
|
PluginClass = this.findPluginClass(module, className);
|
|
77
85
|
}
|
|
78
86
|
const plugin = new PluginClass();
|
|
@@ -112,6 +120,24 @@ export class PluginLoader {
|
|
|
112
120
|
return false;
|
|
113
121
|
}
|
|
114
122
|
}
|
|
123
|
+
async resolveClassSpecifier(className) {
|
|
124
|
+
if (this.anchor === PathAnchor.none() ||
|
|
125
|
+
!this.isPathLikeSpecifier(className)) {
|
|
126
|
+
return className;
|
|
127
|
+
}
|
|
128
|
+
const resolvedPath = await this.anchor.resolve(className);
|
|
129
|
+
const absolutePath = isAbsolute(resolvedPath)
|
|
130
|
+
? resolvedPath
|
|
131
|
+
: resolve(resolvedPath);
|
|
132
|
+
return pathToFileURL(absolutePath).href;
|
|
133
|
+
}
|
|
134
|
+
isPathLikeSpecifier(className) {
|
|
135
|
+
return (className.startsWith('./') ||
|
|
136
|
+
className.startsWith('../') ||
|
|
137
|
+
className.startsWith('.\\') ||
|
|
138
|
+
className.startsWith('..\\') ||
|
|
139
|
+
isAbsolute(className));
|
|
140
|
+
}
|
|
115
141
|
isBuiltIn(name) {
|
|
116
142
|
return (name in BUILT_IN_PLUGINS || name.split('.').pop() in BUILT_IN_PLUGINS);
|
|
117
143
|
}
|
|
@@ -11,12 +11,12 @@ export class MeCabOovProviderPlugin extends OovProviderPlugin {
|
|
|
11
11
|
if (this.initialized) {
|
|
12
12
|
return;
|
|
13
13
|
}
|
|
14
|
-
const charDefPath = this.settings.
|
|
14
|
+
const charDefPath = await this.settings.getPath('charDef');
|
|
15
15
|
if (charDefPath) {
|
|
16
16
|
const content = await readFully(charDefPath);
|
|
17
17
|
this.readCharacterProperty(content);
|
|
18
18
|
}
|
|
19
|
-
const unkDefPath = this.settings.
|
|
19
|
+
const unkDefPath = await this.settings.getPath('unkDef');
|
|
20
20
|
const userPosMode = this.settings.getString(OovProviderPlugin.USER_POS, OovProviderPlugin.USER_POS_FORBID) ?? OovProviderPlugin.USER_POS_FORBID;
|
|
21
21
|
if (unkDefPath) {
|
|
22
22
|
const content = await readFully(unkDefPath);
|
|
@@ -6,9 +6,6 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
6
6
|
private grammar;
|
|
7
7
|
private enablePatternRules;
|
|
8
8
|
private enableBroadRules;
|
|
9
|
-
private enableCompoundNouns;
|
|
10
|
-
private minCompoundLength;
|
|
11
|
-
private excludedNounSubcategories;
|
|
12
9
|
setUp(grammar: Grammar): void;
|
|
13
10
|
rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
|
|
14
11
|
private toInitialChunks;
|
|
@@ -25,12 +22,15 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
25
22
|
private isNumericCommaChunk;
|
|
26
23
|
private isNumericDotChunk;
|
|
27
24
|
private isNumericSignChunk;
|
|
25
|
+
private isLatinTextChunk;
|
|
28
26
|
private isCounterChunk;
|
|
29
27
|
private mergeCounterChunks;
|
|
30
28
|
private applyMergeStage;
|
|
31
|
-
private
|
|
32
|
-
private
|
|
33
|
-
private
|
|
29
|
+
private applyInlineRubyExactStage;
|
|
30
|
+
private applyInlineRubyPrefixStage;
|
|
31
|
+
private shouldMergeInlineRubyExact;
|
|
32
|
+
private shouldMergeInlineRubyPrefix;
|
|
33
|
+
private toHiragana;
|
|
34
34
|
private getPosById;
|
|
35
35
|
private mergeChunks;
|
|
36
36
|
private readingPart;
|