sudachi-ts 0.1.20-beta.7 → 0.1.22
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +102 -137
- package/build/src/dictionary/dictionaryFactory.js +9 -10
- package/build/src/index.d.ts +1 -1
- package/build/src/index.js +1 -1
- package/build/src/plugins/base.d.ts +1 -1
- package/build/src/plugins/base.js +2 -1
- package/build/src/plugins/connection/base.d.ts +2 -1
- package/build/src/plugins/connection/base.js +1 -1
- package/build/src/plugins/connection/targetedConnectionCostPlugin.d.ts +13 -0
- package/build/src/plugins/connection/targetedConnectionCostPlugin.js +102 -0
- package/build/src/plugins/index.d.ts +1 -1
- package/build/src/plugins/index.js +1 -1
- package/build/src/plugins/loader.d.ts +11 -17
- package/build/src/plugins/loader.js +21 -32
- package/package.json +12 -12
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts +0 -51
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js +0 -2808
package/README.md
CHANGED
|
@@ -12,10 +12,10 @@ TypeScript port of [Sudachi](https://github.com/WorksApplications/Sudachi) Japan
|
|
|
12
12
|
- **Binary Dictionary Compatibility**: Load and use pre-built Sudachi dictionaries
|
|
13
13
|
- **Dynamic Plugin System**: Extensible architecture with runtime plugin loading
|
|
14
14
|
- **Dictionary Building**: Complete CSV to binary dictionary conversion
|
|
15
|
-
- **Sentence Detection**: Multi-sentence text processing
|
|
16
|
-
- **UTF-8 Handling**: Proper Japanese text normalization and character encoding
|
|
17
|
-
- **POS Matching**: Flexible part-of-speech filtering and matching
|
|
18
|
-
- **Counter Alias Recovery**: Resolves numeric kana counters such as `1こ` to the canonical counter lattice before best-path selection
|
|
15
|
+
- **Sentence Detection**: Multi-sentence text processing
|
|
16
|
+
- **UTF-8 Handling**: Proper Japanese text normalization and character encoding
|
|
17
|
+
- **POS Matching**: Flexible part-of-speech filtering and matching
|
|
18
|
+
- **Counter Alias Recovery**: Resolves numeric kana counters such as `1こ` to the canonical counter lattice before best-path selection
|
|
19
19
|
|
|
20
20
|
## Requirements
|
|
21
21
|
|
|
@@ -106,7 +106,7 @@ const config = await loadConfig('./sudachi.json');
|
|
|
106
106
|
const dict = Dictionary.create();
|
|
107
107
|
```
|
|
108
108
|
|
|
109
|
-
Example `sudachi.json`:
|
|
109
|
+
Example `sudachi.json`:
|
|
110
110
|
|
|
111
111
|
```json
|
|
112
112
|
{
|
|
@@ -121,31 +121,31 @@ Example `sudachi.json`:
|
|
|
121
121
|
}
|
|
122
122
|
}
|
|
123
123
|
]
|
|
124
|
-
}
|
|
125
|
-
```
|
|
126
|
-
|
|
127
|
-
For non-absolute file references in config (dictionary files, plugin module paths,
|
|
128
|
-
and built-in plugin file settings), Sudachi-TS tries paths relative to the config
|
|
129
|
-
file first, then relative to the current working directory.
|
|
130
|
-
|
|
131
|
-
By default, Sudachi-TS enables a built-in compound-particle lexicon
|
|
132
|
-
(`"enableDefaultCompoundParticles": true`) so forms such as `かも`, `のか`,
|
|
133
|
-
and `だから` are tokenized as single morphemes. Set it to `false` to disable:
|
|
124
|
+
}
|
|
125
|
+
```
|
|
126
|
+
|
|
127
|
+
For non-absolute file references in config (dictionary files, plugin module paths,
|
|
128
|
+
and built-in plugin file settings), Sudachi-TS tries paths relative to the config
|
|
129
|
+
file first, then relative to the current working directory.
|
|
130
|
+
|
|
131
|
+
By default, Sudachi-TS enables a built-in compound-particle lexicon
|
|
132
|
+
(`"enableDefaultCompoundParticles": true`) so forms such as `かも`, `のか`,
|
|
133
|
+
and `だから` are tokenized as single morphemes. Set it to `false` to disable:
|
|
134
134
|
|
|
135
135
|
```json
|
|
136
|
-
{
|
|
137
|
-
"enableDefaultCompoundParticles": false
|
|
138
|
-
}
|
|
139
|
-
```
|
|
140
|
-
|
|
141
|
-
The default OOV plugin stack also injects counter aliases in numeric contexts,
|
|
142
|
-
so kana counters such as `りんごを1こください。` are analyzed as
|
|
143
|
-
`りんご / を / 1 / こ / ください / 。` with the counter normalized to `個`
|
|
144
|
-
instead of falling through to unrelated dictionary entries.
|
|
145
|
-
|
|
146
|
-
## Working with Morphemes
|
|
147
|
-
|
|
148
|
-
Access detailed morpheme information:
|
|
136
|
+
{
|
|
137
|
+
"enableDefaultCompoundParticles": false
|
|
138
|
+
}
|
|
139
|
+
```
|
|
140
|
+
|
|
141
|
+
The default OOV plugin stack also injects counter aliases in numeric contexts,
|
|
142
|
+
so kana counters such as `りんごを1こください。` are analyzed as
|
|
143
|
+
`りんご / を / 1 / こ / ください / 。` with the counter normalized to `個`
|
|
144
|
+
instead of falling through to unrelated dictionary entries.
|
|
145
|
+
|
|
146
|
+
## Working with Morphemes
|
|
147
|
+
|
|
148
|
+
Access detailed morpheme information:
|
|
149
149
|
|
|
150
150
|
```typescript
|
|
151
151
|
const morpheme = result[0];
|
|
@@ -171,34 +171,34 @@ console.log(morpheme.end());
|
|
|
171
171
|
console.log(morpheme.length());
|
|
172
172
|
|
|
173
173
|
// Check morpheme properties
|
|
174
|
-
console.log(morpheme.isOov()); // True if out-of-vocabulary
|
|
175
|
-
```
|
|
176
|
-
|
|
177
|
-
## Public Dictionary Access
|
|
178
|
-
|
|
179
|
-
`DictionaryFactory` returns a public `Dictionary` that now exposes stable
|
|
180
|
-
dictionary metadata APIs without requiring internal imports.
|
|
181
|
-
|
|
182
|
-
```typescript
|
|
183
|
-
import { DictionaryFactory } from 'sudachi-ts';
|
|
184
|
-
|
|
185
|
-
const dictionary = await new DictionaryFactory().create('./sudachi.json');
|
|
186
|
-
|
|
187
|
-
const grammar = dictionary.getGrammar();
|
|
188
|
-
const lexicon = dictionary.getLexicon();
|
|
189
|
-
|
|
190
|
-
const kyotoId = lexicon.getWordId('京都', 3, 'キョウト');
|
|
191
|
-
const kyotoInfo = lexicon.getWordInfo(kyotoId);
|
|
192
|
-
|
|
193
|
-
console.log(grammar.getPartOfSpeechString(kyotoInfo.getPOSId()));
|
|
194
|
-
console.log(kyotoInfo.getSynonymGroupIds());
|
|
195
|
-
```
|
|
196
|
-
|
|
197
|
-
When user dictionaries are configured, `dictionary.getLexicon()` exposes the
|
|
198
|
-
merged lexicon view used by tokenization, so downstream plugins can look up both
|
|
199
|
-
system and user dictionary entries through the same public API.
|
|
200
|
-
|
|
201
|
-
## Splitting Morphemes
|
|
174
|
+
console.log(morpheme.isOov()); // True if out-of-vocabulary
|
|
175
|
+
```
|
|
176
|
+
|
|
177
|
+
## Public Dictionary Access
|
|
178
|
+
|
|
179
|
+
`DictionaryFactory` returns a public `Dictionary` that now exposes stable
|
|
180
|
+
dictionary metadata APIs without requiring internal imports.
|
|
181
|
+
|
|
182
|
+
```typescript
|
|
183
|
+
import { DictionaryFactory } from 'sudachi-ts';
|
|
184
|
+
|
|
185
|
+
const dictionary = await new DictionaryFactory().create('./sudachi.json');
|
|
186
|
+
|
|
187
|
+
const grammar = dictionary.getGrammar();
|
|
188
|
+
const lexicon = dictionary.getLexicon();
|
|
189
|
+
|
|
190
|
+
const kyotoId = lexicon.getWordId('京都', 3, 'キョウト');
|
|
191
|
+
const kyotoInfo = lexicon.getWordInfo(kyotoId);
|
|
192
|
+
|
|
193
|
+
console.log(grammar.getPartOfSpeechString(kyotoInfo.getPOSId()));
|
|
194
|
+
console.log(kyotoInfo.getSynonymGroupIds());
|
|
195
|
+
```
|
|
196
|
+
|
|
197
|
+
When user dictionaries are configured, `dictionary.getLexicon()` exposes the
|
|
198
|
+
merged lexicon view used by tokenization, so downstream plugins can look up both
|
|
199
|
+
system and user dictionary entries through the same public API.
|
|
200
|
+
|
|
201
|
+
## Splitting Morphemes
|
|
202
202
|
|
|
203
203
|
Use the split method to change granularity:
|
|
204
204
|
|
|
@@ -221,18 +221,18 @@ import { SentenceDetector } from 'sudachi-ts/sentdetect/sentenceDetector.js';
|
|
|
221
221
|
|
|
222
222
|
const sentences = tokenizer.tokenizeSentences('東京都は日本の首都です。大阪は商業都市です。');
|
|
223
223
|
|
|
224
|
-
for (const sentence of sentences) {
|
|
225
|
-
console.log('--- Sentence ---');
|
|
226
|
-
for (const morpheme of sentence) {
|
|
227
|
-
console.log(morpheme.surface());
|
|
228
|
-
}
|
|
229
|
-
}
|
|
230
|
-
```
|
|
231
|
-
|
|
232
|
-
`tokenizeSentences(...)` treats standalone quoted dialogue endings (for example
|
|
233
|
-
`「...!」`) as sentence boundaries, but keeps quoted speech attached to following
|
|
234
|
-
reporting clauses such as `「...。」と言いました。`. It also skips leading
|
|
235
|
-
inter-sentence whitespace such as newlines before tokenization.
|
|
224
|
+
for (const sentence of sentences) {
|
|
225
|
+
console.log('--- Sentence ---');
|
|
226
|
+
for (const morpheme of sentence) {
|
|
227
|
+
console.log(morpheme.surface());
|
|
228
|
+
}
|
|
229
|
+
}
|
|
230
|
+
```
|
|
231
|
+
|
|
232
|
+
`tokenizeSentences(...)` treats standalone quoted dialogue endings (for example
|
|
233
|
+
`「...!」`) as sentence boundaries, but keeps quoted speech attached to following
|
|
234
|
+
reporting clauses such as `「...。」と言いました。`. It also skips leading
|
|
235
|
+
inter-sentence whitespace such as newlines before tokenization.
|
|
236
236
|
|
|
237
237
|
Lazy sentence processing for streaming:
|
|
238
238
|
|
|
@@ -305,61 +305,26 @@ const plugin = await loader.loadInputTextPlugin(
|
|
|
305
305
|
|
|
306
306
|
See [PLUGINS.md](./PLUGINS.md) for detailed plugin development guide.
|
|
307
307
|
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
This example prints each token as `surface/reading` so the chunking impact on
|
|
315
|
-
readings is visible in the baseline vs plugin outputs.
|
|
316
|
-
`TokenChunkerPlugin` is designed and validated against the full Sudachi system
|
|
317
|
-
dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
|
|
318
|
-
when adding or tuning chunk rules.
|
|
319
|
-
`TokenChunkerPlugin` requires `enableDefaultCompoundParticles: true`. Dictionary
|
|
320
|
-
creation throws an error when this plugin is configured with default compound
|
|
321
|
-
particles disabled.
|
|
322
|
-
`TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
|
|
323
|
-
`tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
|
|
324
|
-
enabled throws an error.
|
|
325
|
-
When the lattice already contains a lexicalized compound candidate, the chunker
|
|
326
|
-
also prefers learner-facing noun compounds such as `学校` over split analyses
|
|
327
|
-
like `学` + `校`.
|
|
328
|
-
The chunker also handles polite progressive colloquial forms where `て/で` is
|
|
329
|
-
an auxiliary (`てる/でる`) such as `残ってます` and `残ってますよ`, plus
|
|
330
|
-
polite colloquial contraction forms like `太っちゃいます` and
|
|
331
|
-
`太っちゃいますよ`, and colloquial `〜てく` past contractions like
|
|
332
|
-
`持ってった`, colloquial `〜ておく` past contractions like `やめといた`, plus discourse chunks like `だなって` and contractions like
|
|
333
|
-
`してんだ`, `あっけど`, particle chunks like `とか`, sentence-final turns like `いいよな`, copula quote
|
|
334
|
-
spans like `ヒマだって`, and quoted reason clauses like `言ってたし`. It also chunks causative auxiliaries such as
|
|
335
|
-
`打たせる` / `内させる` into a single learner-facing token, along with
|
|
336
|
-
polite connective forms such as `込めまして`, negative connective forms such as
|
|
337
|
-
`遣わなくて`, lexicalized adverbials such as `別に`, conversational turns such as
|
|
338
|
-
`いいよ`, and causative te-forms such as `させて`. For learner-facing output it
|
|
339
|
-
also prefers more natural alternate dictionary readings when the lattice already
|
|
340
|
-
contains them, such as `明日` -> `アシタ`, `明後日` -> `アサッテ`, and
|
|
341
|
-
`私` -> `ワタシ`, and it supports additional `preferredReadings` overrides
|
|
342
|
-
via plugin settings using entries like `"私=ワタシ"`. It also normalizes
|
|
343
|
-
mixed-script weekday compounds such as `火よう日` -> `カヨウビ`.
|
|
344
|
-
The core tokenizer also rewrites sentence-ending ambiguities such as
|
|
345
|
-
`ね | こと | ね` into `ねこ | と | ね` when the lattice supports that path.
|
|
346
|
-
|
|
347
|
-
## Dictionary Building
|
|
308
|
+
|
|
309
|
+
The core tokenizer also rewrites sentence-ending ambiguities such as
|
|
310
|
+
`ね | こと | ね` into `ねこ | と | ね` when the lattice supports that path.
|
|
311
|
+
|
|
312
|
+
## Dictionary Building
|
|
348
313
|
|
|
349
314
|
Build binary dictionaries from CSV source:
|
|
350
315
|
|
|
351
|
-
```typescript
|
|
352
|
-
import { systemBuilder } from 'sudachi-ts/dictionary-build';
|
|
353
|
-
|
|
354
|
-
const builder = systemBuilder();
|
|
355
|
-
|
|
356
|
-
// Add lexicon entries from CSV
|
|
357
|
-
await builder.matrix(matrixDefContents);
|
|
358
|
-
await builder.lexicon(lexiconCsvContents, 'lexicon.csv');
|
|
359
|
-
|
|
360
|
-
// Build binary dictionary
|
|
361
|
-
const { buffer } = await builder.build();
|
|
362
|
-
```
|
|
316
|
+
```typescript
|
|
317
|
+
import { systemBuilder } from 'sudachi-ts/dictionary-build';
|
|
318
|
+
|
|
319
|
+
const builder = systemBuilder();
|
|
320
|
+
|
|
321
|
+
// Add lexicon entries from CSV
|
|
322
|
+
await builder.matrix(matrixDefContents);
|
|
323
|
+
await builder.lexicon(lexiconCsvContents, 'lexicon.csv');
|
|
324
|
+
|
|
325
|
+
// Build binary dictionary
|
|
326
|
+
const { buffer } = await builder.build();
|
|
327
|
+
```
|
|
363
328
|
|
|
364
329
|
CSV format:
|
|
365
330
|
|
|
@@ -395,23 +360,23 @@ See [CONFIG.md](./CONFIG.md) for detailed configuration options.
|
|
|
395
360
|
|
|
396
361
|
## Development
|
|
397
362
|
|
|
398
|
-
```bash
|
|
399
|
-
# Clone repository
|
|
400
|
-
git clone https://github.com/your-org/sudachi-ts.git
|
|
401
|
-
cd sudachi-ts
|
|
402
|
-
|
|
403
|
-
# Install dependencies
|
|
404
|
-
npm install
|
|
405
|
-
|
|
406
|
-
# Type check
|
|
407
|
-
npm run typecheck
|
|
408
|
-
|
|
409
|
-
# Run tests
|
|
410
|
-
npm test
|
|
411
|
-
|
|
412
|
-
# Lint
|
|
413
|
-
npm run check:fix
|
|
414
|
-
```
|
|
363
|
+
```bash
|
|
364
|
+
# Clone repository
|
|
365
|
+
git clone https://github.com/your-org/sudachi-ts.git
|
|
366
|
+
cd sudachi-ts
|
|
367
|
+
|
|
368
|
+
# Install dependencies
|
|
369
|
+
npm install
|
|
370
|
+
|
|
371
|
+
# Type check
|
|
372
|
+
npm run typecheck
|
|
373
|
+
|
|
374
|
+
# Run tests
|
|
375
|
+
npm test
|
|
376
|
+
|
|
377
|
+
# Lint
|
|
378
|
+
npm run check:fix
|
|
379
|
+
```
|
|
415
380
|
|
|
416
381
|
## Architecture
|
|
417
382
|
|
|
@@ -6,12 +6,6 @@ import { PluginLoader } from '../plugins/loader.js';
|
|
|
6
6
|
import { BinaryDictionary } from './binaryDictionary.js';
|
|
7
7
|
import { loadDefaultCompoundLexicon } from './defaultCompoundLexicon.js';
|
|
8
8
|
import { LexiconSet } from './lexiconSet.js';
|
|
9
|
-
// Reports whether a configured plugin class name refers to TokenChunkerPlugin.
// Returns true for the fully-qualified name
// 'com.worksap.nlp.sudachi.TokenChunkerPlugin', and also for any name whose last
// dot-separated segment is 'TokenChunkerPlugin' (which includes the bare name).
// `className` must be a string: `.split` is called on it without a guard.
function isTokenChunkerPlugin(className) {
|
|
10
|
-
if (className === 'com.worksap.nlp.sudachi.TokenChunkerPlugin') {
|
|
11
|
-
return true;
|
|
12
|
-
}
|
|
13
|
-
// Fall back to comparing only the final dot-separated segment of the name.
return className.split('.').pop() === 'TokenChunkerPlugin';
|
|
14
|
-
}
|
|
15
9
|
export class DictionaryFactory {
|
|
16
10
|
async create(configPath, customConfig) {
|
|
17
11
|
const config = customConfig || (await loadConfig(configPath));
|
|
@@ -52,6 +46,15 @@ export class DictionaryFactory {
|
|
|
52
46
|
}
|
|
53
47
|
const loader = new PluginLoader(anchor);
|
|
54
48
|
const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON).setAnchor(anchor);
|
|
49
|
+
let editConnectionPluginConfs = config.getPlugins('editConnectionCostPlugin');
|
|
50
|
+
if (!editConnectionPluginConfs || editConnectionPluginConfs.length === 0) {
|
|
51
|
+
editConnectionPluginConfs = defaultConfig.getPlugins('editConnectionCostPlugin');
|
|
52
|
+
}
|
|
53
|
+
if (editConnectionPluginConfs && editConnectionPluginConfs.length > 0) {
|
|
54
|
+
for (const loaded of await loader.loadEditConnectionCostPlugins(editConnectionPluginConfs, grammar, lexicon)) {
|
|
55
|
+
loaded.plugin.edit(grammar);
|
|
56
|
+
}
|
|
57
|
+
}
|
|
55
58
|
let inputTextPluginConfs = config.getPlugins('inputTextPlugin');
|
|
56
59
|
if (!inputTextPluginConfs || inputTextPluginConfs.length === 0) {
|
|
57
60
|
inputTextPluginConfs = defaultConfig.getPlugins('inputTextPlugin');
|
|
@@ -82,10 +85,6 @@ export class DictionaryFactory {
|
|
|
82
85
|
if (!pathRewritePluginConfs || pathRewritePluginConfs.length === 0) {
|
|
83
86
|
pathRewritePluginConfs = defaultConfig.getPlugins('pathRewritePlugin');
|
|
84
87
|
}
|
|
85
|
-
if (!enableDefaultCompoundParticles &&
|
|
86
|
-
(pathRewritePluginConfs || []).some((conf) => isTokenChunkerPlugin(conf.className))) {
|
|
87
|
-
throw new Error('TokenChunkerPlugin is only compatible when enableDefaultCompoundParticles is true.');
|
|
88
|
-
}
|
|
89
88
|
const pathRewritePlugins = (await loader.loadPathRewritePlugins(pathRewritePluginConfs || [], grammar)).map((p) => p.plugin);
|
|
90
89
|
return new Dictionary(grammar, lexicon, inputTextPlugins, oovProviderPlugins, pathRewritePlugins);
|
|
91
90
|
}
|
package/build/src/index.d.ts
CHANGED
|
@@ -24,7 +24,7 @@ export { DEPTH, MAX_COMPONENT_LENGTH, POS } from './dictionary/pos.js';
|
|
|
24
24
|
export { PartialPOS, PosMatcher } from './dictionary/posMatcher.js';
|
|
25
25
|
export { WordInfo } from './dictionary/wordInfo.js';
|
|
26
26
|
export { DoubleArray } from './dictionary-build/doubleArray.js';
|
|
27
|
-
export { EditConnectionCostPlugin, InputTextPlugin, type LoadedPlugin, MorphemeFormatterPlugin, OovProviderPlugin, PathRewritePlugin, Plugin, PluginLoader, } from './plugins/index.js';
|
|
27
|
+
export { EditConnectionCostPlugin, InputTextPlugin, type LoadedPlugin, MorphemeFormatterPlugin, OovProviderPlugin, PathRewritePlugin, Plugin, PluginLoader, TargetedConnectionCostPlugin, } from './plugins/index.js';
|
|
28
28
|
export type { NonBreakChecker } from './sentdetect/sentenceDetector.js';
|
|
29
29
|
export { DEFAULT_LIMIT, SentenceDetector, } from './sentdetect/sentenceDetector.js';
|
|
30
30
|
export { applyMask, dic, dicIdMask, MAX_DIC_ID, MAX_WORD_ID, make, word, } from './utils/wordId.js';
|
package/build/src/index.js
CHANGED
|
@@ -15,7 +15,7 @@ export { DEPTH, MAX_COMPONENT_LENGTH, POS } from './dictionary/pos.js';
|
|
|
15
15
|
export { PartialPOS, PosMatcher } from './dictionary/posMatcher.js';
|
|
16
16
|
export { WordInfo } from './dictionary/wordInfo.js';
|
|
17
17
|
export { DoubleArray } from './dictionary-build/doubleArray.js';
|
|
18
|
-
export { EditConnectionCostPlugin, InputTextPlugin, MorphemeFormatterPlugin, OovProviderPlugin, PathRewritePlugin, Plugin, PluginLoader, } from './plugins/index.js';
|
|
18
|
+
export { EditConnectionCostPlugin, InputTextPlugin, MorphemeFormatterPlugin, OovProviderPlugin, PathRewritePlugin, Plugin, PluginLoader, TargetedConnectionCostPlugin, } from './plugins/index.js';
|
|
19
19
|
export { DEFAULT_LIMIT, SentenceDetector, } from './sentdetect/sentenceDetector.js';
|
|
20
20
|
export { applyMask, dic, dicIdMask, MAX_DIC_ID, MAX_WORD_ID, make, word, } from './utils/wordId.js';
|
|
21
21
|
export { addNth, hasNth, MAX_LENGTH, nth } from './utils/wordMask.js';
|
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import type { Grammar } from '../../dictionary/grammar.js';
|
|
2
|
+
import type { Lexicon } from '../../dictionary/lexicon.js';
|
|
2
3
|
import { Plugin } from '../base.js';
|
|
3
4
|
/**
 * Abstract plugin base (extends `Plugin`) declaring the connection-cost hooks:
 * a `setUp` step, the abstract `edit(grammar)` that subclasses must implement,
 * and an `inhibitConnection(grammar, left, right)` helper.
 */
export declare abstract class EditConnectionCostPlugin extends Plugin {
|
|
4
|
-
setUp(_grammar: Grammar): void;
|
|
5
|
+
/** Setup hook; this revision adds the optional `_lexicon` second parameter. */
setUp(_grammar: Grammar, _lexicon?: Lexicon): void;
|
|
5
6
|
/** Must be implemented by subclasses; receives the grammar to edit. */
abstract edit(grammar: Grammar): void;
|
|
6
7
|
/**
 * Takes the grammar plus numeric `left` and `right` ids.
 * NOTE(review): presumably forbids that left/right connection; the
 * implementation is not visible in this declaration, so confirm in base.js.
 */
inhibitConnection(grammar: Grammar, left: number, right: number): void;
|
|
7
8
|
}
|
|
@@ -0,0 +1,13 @@
|
|
|
1
|
+
import type { Grammar } from '../../dictionary/grammar.js';
|
|
2
|
+
import type { Lexicon } from '../../dictionary/lexicon.js';
|
|
3
|
+
import { EditConnectionCostPlugin } from './base.js';
|
|
4
|
+
/**
 * Declaration of the concrete `EditConnectionCostPlugin` subclass.
 * Its public surface is `setUp(grammar, lexicon?)` and `edit(grammar)`;
 * `rules` and the resolve/normalize/require helpers are private members.
 */
export declare class TargetedConnectionCostPlugin extends EditConnectionCostPlugin {
|
|
5
|
+
private rules;
|
|
6
|
+
/** Setup hook taking the grammar and an optionally-typed lexicon. */
setUp(grammar: Grammar, lexicon?: Lexicon): void;
|
|
7
|
+
/** Implements the abstract `edit` hook of the base class. */
edit(grammar: Grammar): void;
|
|
8
|
+
private resolveRule;
|
|
9
|
+
private resolveWordId;
|
|
10
|
+
private normalizePos;
|
|
11
|
+
private requireConnectionRule;
|
|
12
|
+
private requireEntryTarget;
|
|
13
|
+
}
|
|
@@ -0,0 +1,102 @@
|
|
|
1
|
+
import { EditConnectionCostPlugin } from './base.js';
|
|
2
|
+
/**
 * Connection-cost plugin configured through a `rules` list in its settings.
 *
 * Each rule names a `left` and a `right` lexicon entry (surface, POS list,
 * reading) plus a numeric `cost`. During `setUp` every rule is validated and
 * resolved to a pair of connection ids (the right id of the left entry and
 * the left id of the right entry). `edit` then writes each cost into the
 * grammar with `grammar.setConnectCost`.
 */
export class TargetedConnectionCostPlugin extends EditConnectionCostPlugin {
    /** Resolved rules: `{ leftRightId, rightLeftId, cost }`, filled by `setUp`. */
    rules = [];

    /**
     * Validates and resolves the configured rules.
     *
     * @param grammar grammar used to look up part-of-speech ids
     * @param lexicon lexicon used to look up word ids and connection ids
     * @throws Error when the lexicon is missing, when `rules` is not a
     *   non-empty array, or when any rule fails validation or lookup
     */
    setUp(grammar, lexicon) {
        if (!lexicon) {
            throw new Error('TargetedConnectionCostPlugin requires the lexicon during setup');
        }
        const configured = this.settings.toObject().rules;
        const hasRules = Array.isArray(configured) && configured.length > 0;
        if (!hasRules) {
            throw new Error('rules is undefined');
        }
        // Rule numbers in error messages are 1-based.
        this.rules = configured.map((entry, offset) => this.resolveRule(grammar, lexicon, entry, offset + 1));
    }

    /**
     * Applies every resolved rule to the grammar's connection costs.
     *
     * @param grammar grammar whose connection costs are overwritten
     */
    edit(grammar) {
        for (const { leftRightId, rightLeftId, cost } of this.rules) {
            grammar.setConnectCost(leftRightId, rightLeftId, cost);
        }
    }

    /**
     * Turns one raw rule into `{ leftRightId, rightLeftId, cost }`.
     * The left entry is resolved before the right one, and both word ids are
     * resolved before any connection id is read.
     */
    resolveRule(grammar, lexicon, rule, ruleIndex) {
        const { left, right, cost } = this.requireConnectionRule(rule, ruleIndex);
        const lookUp = (target, side) => this.resolveWordId(grammar, lexicon, target, 'rule', ruleIndex, side);
        const leftWordId = lookUp(left, 'left');
        const rightWordId = lookUp(right, 'right');
        const leftRightId = lexicon.getRightId(leftWordId);
        const rightLeftId = lexicon.getLeftId(rightWordId);
        return { leftRightId, rightLeftId, cost };
    }

    /**
     * Looks up the word id of `target` (surface + POS + reading).
     *
     * @throws Error when surface or reading is not a non-empty string, when
     *   the POS is invalid or unknown to the grammar, or when the lexicon has
     *   no matching entry
     */
    resolveWordId(grammar, lexicon, target, ruleLabel, ruleIndex, side) {
        const where = `${ruleLabel} ${ruleIndex} ${side}`;
        const requireText = (value, field) => {
            if (typeof value !== 'string' || value === '') {
                throw new Error(`${where} ${field} must be a non-empty string`);
            }
        };
        requireText(target.surface, 'surface');
        requireText(target.reading, 'reading');

        const pos = this.normalizePos(target.pos, ruleLabel, ruleIndex, side);
        const posId = grammar.getPartOfSpeechId(pos);
        if (posId < 0) {
            throw new Error(`${where} POS ${pos.join(',')} was not found in the loaded grammar`);
        }

        const wordId = lexicon.getWordId(target.surface, posId, target.reading);
        if (wordId < 0) {
            throw new Error(`${where} entry ${target.surface} (${pos.join(',')} / ${target.reading}) was not found in the loaded lexicon`);
        }
        return wordId;
    }

    /**
     * Returns a six-element copy of `pos`: shorter lists are padded with '*',
     * longer lists are cut after the sixth element.
     *
     * @throws Error when `pos` is not a non-empty array of strings
     */
    normalizePos(pos, ruleLabel, ruleIndex, side) {
        if (!Array.isArray(pos) || pos.length === 0) {
            throw new Error(`${ruleLabel} ${ruleIndex} ${side} pos must be a non-empty string list`);
        }
        pos.forEach((item) => {
            if (typeof item !== 'string') {
                throw new Error(`${ruleLabel} ${ruleIndex} ${side} pos must contain only strings`);
            }
        });
        const head = pos.slice(0, 6);
        return head.concat(new Array(6 - head.length).fill('*'));
    }

    /**
     * Checks the outer shape of a rule and returns `{ left, right, cost }`.
     * The cost is checked before either side is inspected.
     */
    requireConnectionRule(rule, ruleIndex) {
        if (rule === null || typeof rule !== 'object') {
            throw new Error(`rule ${ruleIndex} must be an object`);
        }
        const { left, right, cost } = rule;
        if (typeof cost !== 'number') {
            throw new Error(`rule ${ruleIndex} cost must be a number`);
        }
        const leftTarget = this.requireEntryTarget(left, ruleIndex, 'left');
        const rightTarget = this.requireEntryTarget(right, ruleIndex, 'right');
        return { left: leftTarget, right: rightTarget, cost };
    }

    /**
     * Checks the field types of one side of a rule and returns
     * `{ surface, pos, reading }`. Emptiness is checked later, in
     * `resolveWordId` and `normalizePos`.
     */
    requireEntryTarget(value, ruleIndex, side) {
        if (value === null || typeof value !== 'object') {
            throw new Error(`rule ${ruleIndex} ${side} must be an object`);
        }
        const { surface, pos, reading } = value;
        if (typeof surface !== 'string') {
            throw new Error(`rule ${ruleIndex} ${side} surface must be a string`);
        }
        if (!Array.isArray(pos)) {
            throw new Error(`rule ${ruleIndex} ${side} pos must be a string list`);
        }
        if (typeof reading !== 'string') {
            throw new Error(`rule ${ruleIndex} ${side} reading must be a string`);
        }
        return { surface, pos, reading };
    }
}
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { Plugin } from './base.js';
|
|
2
2
|
export { EditConnectionCostPlugin } from './connection/base.js';
|
|
3
3
|
export { InhibitConnectionPlugin } from './connection/inhibitConnectionPlugin.js';
|
|
4
|
+
export { TargetedConnectionCostPlugin } from './connection/targetedConnectionCostPlugin.js';
|
|
4
5
|
export { MorphemeFormatterPlugin } from './formatter/base.js';
|
|
5
6
|
export { SimpleMorphemeFormatter } from './formatter/simpleMorphemeFormatter.js';
|
|
6
7
|
export { WordSegmentationFormatter } from './formatter/wordSegmentationFormatter.js';
|
|
@@ -17,4 +18,3 @@ export { SimpleOovProviderPlugin } from './oov/simpleOovProviderPlugin.js';
|
|
|
17
18
|
export { PathRewritePlugin } from './pathRewrite/base.js';
|
|
18
19
|
export { JoinKatakanaOovPlugin } from './pathRewrite/joinKatakanaOovPlugin.js';
|
|
19
20
|
export { JoinNumericPlugin } from './pathRewrite/joinNumericPlugin.js';
|
|
20
|
-
export { TokenChunkerPlugin } from './pathRewrite/tokenChunkerPlugin.js';
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
export { Plugin } from './base.js';
|
|
2
2
|
export { EditConnectionCostPlugin } from './connection/base.js';
|
|
3
3
|
export { InhibitConnectionPlugin } from './connection/inhibitConnectionPlugin.js';
|
|
4
|
+
export { TargetedConnectionCostPlugin } from './connection/targetedConnectionCostPlugin.js';
|
|
4
5
|
export { MorphemeFormatterPlugin } from './formatter/base.js';
|
|
5
6
|
export { SimpleMorphemeFormatter } from './formatter/simpleMorphemeFormatter.js';
|
|
6
7
|
export { WordSegmentationFormatter } from './formatter/wordSegmentationFormatter.js';
|
|
@@ -17,4 +18,3 @@ export { SimpleOovProviderPlugin } from './oov/simpleOovProviderPlugin.js';
|
|
|
17
18
|
export { PathRewritePlugin } from './pathRewrite/base.js';
|
|
18
19
|
export { JoinKatakanaOovPlugin } from './pathRewrite/joinKatakanaOovPlugin.js';
|
|
19
20
|
export { JoinNumericPlugin } from './pathRewrite/joinNumericPlugin.js';
|
|
20
|
-
export { TokenChunkerPlugin } from './pathRewrite/tokenChunkerPlugin.js';
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { PathAnchor } from '../config/pathAnchor.js';
|
|
2
2
|
import type { Settings } from '../config/settings.js';
|
|
3
3
|
import type { Grammar } from '../dictionary/grammar.js';
|
|
4
|
+
import type { Lexicon } from '../dictionary/lexicon.js';
|
|
4
5
|
import type { Plugin } from './base.js';
|
|
5
6
|
import type { EditConnectionCostPlugin } from './connection/base.js';
|
|
6
7
|
import type { MorphemeFormatterPlugin } from './formatter/base.js';
|
|
@@ -11,6 +12,10 @@ export interface LoadedPlugin<T extends Plugin> {
|
|
|
11
12
|
plugin: T;
|
|
12
13
|
className: string;
|
|
13
14
|
}
|
|
15
|
+
/**
 * One plugin configuration entry: the plugin's class name and its `Settings`.
 * This is the element type of the `configs` parameter on the `load*Plugins`
 * methods of `PluginLoader`.
 */
type PluginConfig = {
|
|
16
|
+
className: string;
|
|
17
|
+
settings: Settings;
|
|
18
|
+
};
|
|
14
19
|
export declare class PluginLoader {
|
|
15
20
|
private readonly anchor;
|
|
16
21
|
constructor(anchor?: PathAnchor);
|
|
@@ -19,27 +24,16 @@ export declare class PluginLoader {
|
|
|
19
24
|
loadPathRewritePlugin(className: string, settings: Settings): Promise<LoadedPlugin<PathRewritePlugin>>;
|
|
20
25
|
loadEditConnectionCostPlugin(className: string, settings: Settings): Promise<LoadedPlugin<EditConnectionCostPlugin>>;
|
|
21
26
|
loadMorphemeFormatterPlugin(className: string, settings: Settings): Promise<LoadedPlugin<MorphemeFormatterPlugin>>;
|
|
22
|
-
loadInputTextPlugins(configs:
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
loadOovProviderPlugins(configs: {
|
|
27
|
-
className: string;
|
|
28
|
-
settings: Settings;
|
|
29
|
-
}[], grammar: Grammar): Promise<LoadedPlugin<OovProviderPlugin>[]>;
|
|
30
|
-
loadPathRewritePlugins(configs: {
|
|
31
|
-
className: string;
|
|
32
|
-
settings: Settings;
|
|
33
|
-
}[], grammar: Grammar): Promise<LoadedPlugin<PathRewritePlugin>[]>;
|
|
34
|
-
loadEditConnectionCostPlugins(configs: {
|
|
35
|
-
className: string;
|
|
36
|
-
settings: Settings;
|
|
37
|
-
}[], grammar: Grammar): Promise<LoadedPlugin<EditConnectionCostPlugin>[]>;
|
|
27
|
+
loadInputTextPlugins(configs: PluginConfig[], grammar: Grammar): Promise<LoadedPlugin<InputTextPlugin>[]>;
|
|
28
|
+
loadOovProviderPlugins(configs: PluginConfig[], grammar: Grammar): Promise<LoadedPlugin<OovProviderPlugin>[]>;
|
|
29
|
+
loadPathRewritePlugins(configs: PluginConfig[], grammar: Grammar): Promise<LoadedPlugin<PathRewritePlugin>[]>;
|
|
30
|
+
loadEditConnectionCostPlugins(configs: PluginConfig[], grammar: Grammar, lexicon: Lexicon): Promise<LoadedPlugin<EditConnectionCostPlugin>[]>;
|
|
38
31
|
private loadPlugin;
|
|
32
|
+
private loadConfiguredPlugins;
|
|
39
33
|
private findPluginClass;
|
|
40
34
|
private isPluginConstructor;
|
|
41
35
|
private resolveClassSpecifier;
|
|
42
36
|
private isPathLikeSpecifier;
|
|
43
|
-
private isBuiltIn;
|
|
44
37
|
private getBuiltIn;
|
|
45
38
|
}
|
|
39
|
+
export {};
|