sudachi-ts 0.1.18 → 0.1.19
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md
CHANGED
|
@@ -281,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
|
|
|
281
281
|
`TokenChunkerPlugin` is designed and validated against the full Sudachi system
|
|
282
282
|
dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
|
|
283
283
|
when adding or tuning chunk rules.
|
|
284
|
+
`TokenChunkerPlugin` requires `enableDefaultCompoundParticles: true`. Dictionary
|
|
285
|
+
creation throws an error when this plugin is configured with default compound
|
|
286
|
+
particles disabled.
|
|
284
287
|
`TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
|
|
285
288
|
`tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
|
|
286
289
|
enabled throws an error.
|
|
@@ -6,6 +6,12 @@ import { PluginLoader } from '../plugins/loader.js';
|
|
|
6
6
|
import { BinaryDictionary } from './binaryDictionary.js';
|
|
7
7
|
import { loadDefaultCompoundLexicon } from './defaultCompoundLexicon.js';
|
|
8
8
|
import { LexiconSet } from './lexiconSet.js';
|
|
9
|
+
/**
 * Reports whether a plugin class name refers to TokenChunkerPlugin.
 *
 * The decision is made on the last dot-separated segment of the name, so the
 * fully qualified `com.worksap.nlp.sudachi.TokenChunkerPlugin`, a bare
 * `TokenChunkerPlugin`, and any other namespace ending in that segment all
 * match.
 *
 * @param {unknown} className - Class name taken from a plugin configuration.
 * @returns {boolean} `true` when the final segment is exactly
 *   `TokenChunkerPlugin`; `false` otherwise, including for non-string input.
 */
function isTokenChunkerPlugin(className) {
    // A configuration entry without a usable class name cannot be the chunker;
    // answer `false` instead of throwing a TypeError from `.split`.
    if (typeof className !== 'string') {
        return false;
    }
    return className.split('.').pop() === 'TokenChunkerPlugin';
}
|
|
9
15
|
export class DictionaryFactory {
|
|
10
16
|
async create(configPath, customConfig) {
|
|
11
17
|
const config = customConfig || (await loadConfig(configPath));
|
|
@@ -76,6 +82,10 @@ export class DictionaryFactory {
|
|
|
76
82
|
if (!pathRewritePluginConfs || pathRewritePluginConfs.length === 0) {
|
|
77
83
|
pathRewritePluginConfs = defaultConfig.getPlugins('pathRewritePlugin');
|
|
78
84
|
}
|
|
85
|
+
if (!enableDefaultCompoundParticles &&
|
|
86
|
+
(pathRewritePluginConfs || []).some((conf) => isTokenChunkerPlugin(conf.className))) {
|
|
87
|
+
throw new Error('TokenChunkerPlugin is only compatible when enableDefaultCompoundParticles is true.');
|
|
88
|
+
}
|
|
79
89
|
const pathRewritePlugins = (await loader.loadPathRewritePlugins(pathRewritePluginConfs || [], grammar)).map((p) => p.plugin);
|
|
80
90
|
return new Dictionary(grammar, lexicon, inputTextPlugins, oovProviderPlugins, pathRewritePlugins);
|
|
81
91
|
}
|
|
@@ -28,6 +28,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
28
28
|
private isCounterChunk;
|
|
29
29
|
private mergeCounterChunks;
|
|
30
30
|
private applyMergeStage;
|
|
31
|
+
private shouldMergeAttributiveKana;
|
|
31
32
|
private applyInlineRubyExactStage;
|
|
32
33
|
private applyInlineRubyPrefixStage;
|
|
33
34
|
private shouldMergeInlineRubyExact;
|
|
@@ -294,10 +294,16 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
294
294
|
while (i < chunks.length - 1) {
|
|
295
295
|
const current = chunks[i];
|
|
296
296
|
const next = chunks[i + 1];
|
|
297
|
+
const following = chunks[i + 2];
|
|
297
298
|
if (!current || !next) {
|
|
298
299
|
i++;
|
|
299
300
|
continue;
|
|
300
301
|
}
|
|
302
|
+
if (this.shouldMergeAttributiveKana(current, next, following)) {
|
|
303
|
+
const merged = this.mergeChunks([current, next], 'phrase');
|
|
304
|
+
chunks.splice(i, 2, merged);
|
|
305
|
+
continue;
|
|
306
|
+
}
|
|
301
307
|
if ((current.chunkType === 'te_form' ||
|
|
302
308
|
current.chunkType === 'suru_verb_te_form') &&
|
|
303
309
|
(next.chunkType === 'single_token' ||
|
|
@@ -321,6 +327,24 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
321
327
|
}
|
|
322
328
|
return chunks;
|
|
323
329
|
}
|
|
330
|
+
shouldMergeAttributiveKana(current, next, following) {
|
|
331
|
+
if (next.surface !== 'かな') {
|
|
332
|
+
return false;
|
|
333
|
+
}
|
|
334
|
+
const nextPos = this.getPosById(next.posId);
|
|
335
|
+
if (!nextPos || nextPos[0] !== '助詞' || nextPos[1] !== '終助詞') {
|
|
336
|
+
return false;
|
|
337
|
+
}
|
|
338
|
+
const currentPos0 = this.getPosById(current.posId)?.[0] ?? '';
|
|
339
|
+
if (!['名詞', '形状詞'].includes(currentPos0)) {
|
|
340
|
+
return false;
|
|
341
|
+
}
|
|
342
|
+
if (!following) {
|
|
343
|
+
return false;
|
|
344
|
+
}
|
|
345
|
+
const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
|
|
346
|
+
return ['名詞', '代名詞'].includes(followingPos0);
|
|
347
|
+
}
|
|
324
348
|
applyInlineRubyExactStage(source) {
|
|
325
349
|
const chunks = [...source];
|
|
326
350
|
let i = 0;
|