sudachi-ts 0.1.18 → 0.1.20-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +14 -7
- package/build/src/core/japaneseTokenizer.js +8 -0
- package/build/src/core/sentenceSplittingLazyAnalysis.js +10 -0
- package/build/src/dictionary/dictionaryFactory.js +10 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts +2 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js +102 -1
- package/build/src/sentdetect/sentenceDetector.js +20 -19
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -191,13 +191,17 @@ import { SentenceDetector } from 'sudachi-ts/sentdetect/sentenceDetector.js';
|
|
|
191
191
|
|
|
192
192
|
const sentences = tokenizer.tokenizeSentences('東京都は日本の首都です。大阪は商業都市です。');
|
|
193
193
|
|
|
194
|
-
for (const sentence of sentences) {
|
|
195
|
-
console.log('--- Sentence ---');
|
|
196
|
-
for (const morpheme of sentence) {
|
|
197
|
-
console.log(morpheme.surface());
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
```
|
|
194
|
+
for (const sentence of sentences) {
|
|
195
|
+
console.log('--- Sentence ---');
|
|
196
|
+
for (const morpheme of sentence) {
|
|
197
|
+
console.log(morpheme.surface());
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
`tokenizeSentences(...)` treats quoted dialogue endings (for example `「...!」` and
|
|
203
|
+
`「...。」`) as sentence boundaries and skips leading inter-sentence whitespace such
|
|
204
|
+
as newlines before tokenization.
|
|
201
205
|
|
|
202
206
|
Lazy sentence processing for streaming:
|
|
203
207
|
|
|
@@ -281,6 +285,9 @@ readings is visible in the baseline vs plugin outputs.
|
|
|
281
285
|
`TokenChunkerPlugin` is designed and validated against the full Sudachi system
|
|
282
286
|
dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
|
|
283
287
|
when adding or tuning chunk rules.
|
|
288
|
+
`TokenChunkerPlugin` requires `enableDefaultCompoundParticles: true`. Dictionary
|
|
289
|
+
creation throws an error when this plugin is configured with default compound
|
|
290
|
+
particles disabled.
|
|
284
291
|
`TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
|
|
285
292
|
`tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
|
|
286
293
|
enabled throws an error.
|
|
@@ -7,6 +7,7 @@ import { MorphemeList as MorphemeListImpl } from './morphemeList.js';
|
|
|
7
7
|
import { SentenceSplittingLazyAnalysis } from './sentenceSplittingLazyAnalysis.js';
|
|
8
8
|
import { SplitMode } from './tokenizer.js';
|
|
9
9
|
import { UTF8InputTextBuilder } from './utf8InputText.js';
|
|
10
|
+
const LEADING_WHITESPACE_PATTERN = /^\s+/u;
|
|
10
11
|
export class JapaneseTokenizer {
|
|
11
12
|
grammar;
|
|
12
13
|
lexicon;
|
|
@@ -57,6 +58,13 @@ export class JapaneseTokenizer {
|
|
|
57
58
|
const sentences = [];
|
|
58
59
|
let remaining = inputText;
|
|
59
60
|
while (remaining.length > 0) {
|
|
61
|
+
const leadingWhitespace = remaining.match(LEADING_WHITESPACE_PATTERN);
|
|
62
|
+
if (leadingWhitespace) {
|
|
63
|
+
remaining = remaining.slice(leadingWhitespace[0].length);
|
|
64
|
+
if (remaining.length === 0) {
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
60
68
|
const checker = {
|
|
61
69
|
hasNonBreakWord: (eos) => {
|
|
62
70
|
const bytes = this.buildInputText(remaining.slice(0, eos)).getByteText();
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { SentenceDetector, } from '../sentdetect/sentenceDetector.js';
|
|
2
2
|
import { UTF8InputTextBuilder } from './utf8InputText.js';
|
|
3
3
|
const BUFFER_SIZE = 4096;
|
|
4
|
+
const LEADING_WHITESPACE_PATTERN = /^\s+/u;
|
|
4
5
|
export class SentenceSplittingLazyAnalysis {
|
|
5
6
|
mode;
|
|
6
7
|
grammar;
|
|
@@ -68,6 +69,15 @@ export class SentenceSplittingLazyAnalysis {
|
|
|
68
69
|
return this.buffer.length;
|
|
69
70
|
}
|
|
70
71
|
processNextSentence() {
|
|
72
|
+
const leadingWhitespace = this.normalized.match(LEADING_WHITESPACE_PATTERN);
|
|
73
|
+
if (leadingWhitespace) {
|
|
74
|
+
const skipped = leadingWhitespace[0].length;
|
|
75
|
+
this.bos += skipped;
|
|
76
|
+
this.normalized = this.normalized.slice(skipped);
|
|
77
|
+
if (this.normalized.length === 0) {
|
|
78
|
+
return null;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
71
81
|
const detector = new SentenceDetector();
|
|
72
82
|
const eosLength = detector.getEos(this.normalized, this);
|
|
73
83
|
if (eosLength > 0) {
|
|
@@ -6,6 +6,12 @@ import { PluginLoader } from '../plugins/loader.js';
|
|
|
6
6
|
import { BinaryDictionary } from './binaryDictionary.js';
|
|
7
7
|
import { loadDefaultCompoundLexicon } from './defaultCompoundLexicon.js';
|
|
8
8
|
import { LexiconSet } from './lexiconSet.js';
|
|
9
|
+
function isTokenChunkerPlugin(className) {
|
|
10
|
+
if (className === 'com.worksap.nlp.sudachi.TokenChunkerPlugin') {
|
|
11
|
+
return true;
|
|
12
|
+
}
|
|
13
|
+
return className.split('.').pop() === 'TokenChunkerPlugin';
|
|
14
|
+
}
|
|
9
15
|
export class DictionaryFactory {
|
|
10
16
|
async create(configPath, customConfig) {
|
|
11
17
|
const config = customConfig || (await loadConfig(configPath));
|
|
@@ -76,6 +82,10 @@ export class DictionaryFactory {
|
|
|
76
82
|
if (!pathRewritePluginConfs || pathRewritePluginConfs.length === 0) {
|
|
77
83
|
pathRewritePluginConfs = defaultConfig.getPlugins('pathRewritePlugin');
|
|
78
84
|
}
|
|
85
|
+
if (!enableDefaultCompoundParticles &&
|
|
86
|
+
(pathRewritePluginConfs || []).some((conf) => isTokenChunkerPlugin(conf.className))) {
|
|
87
|
+
throw new Error('TokenChunkerPlugin is only compatible when enableDefaultCompoundParticles is true.');
|
|
88
|
+
}
|
|
79
89
|
const pathRewritePlugins = (await loader.loadPathRewritePlugins(pathRewritePluginConfs || [], grammar)).map((p) => p.plugin);
|
|
80
90
|
return new Dictionary(grammar, lexicon, inputTextPlugins, oovProviderPlugins, pathRewritePlugins);
|
|
81
91
|
}
|
|
@@ -28,6 +28,8 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
28
28
|
private isCounterChunk;
|
|
29
29
|
private mergeCounterChunks;
|
|
30
30
|
private applyMergeStage;
|
|
31
|
+
private shouldMergeAttributiveKana;
|
|
32
|
+
private shouldMergeSmallKana;
|
|
31
33
|
private applyInlineRubyExactStage;
|
|
32
34
|
private applyInlineRubyPrefixStage;
|
|
33
35
|
private shouldMergeInlineRubyExact;
|
|
@@ -294,10 +294,21 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
294
294
|
while (i < chunks.length - 1) {
|
|
295
295
|
const current = chunks[i];
|
|
296
296
|
const next = chunks[i + 1];
|
|
297
|
+
const following = chunks[i + 2];
|
|
297
298
|
if (!current || !next) {
|
|
298
299
|
i++;
|
|
299
300
|
continue;
|
|
300
301
|
}
|
|
302
|
+
if (this.shouldMergeSmallKana(current, next)) {
|
|
303
|
+
const merged = this.mergeChunks([current, next], 'fixed_expression');
|
|
304
|
+
chunks.splice(i, 2, merged);
|
|
305
|
+
continue;
|
|
306
|
+
}
|
|
307
|
+
if (this.shouldMergeAttributiveKana(current, next, following)) {
|
|
308
|
+
const merged = this.mergeChunks([current, next], 'phrase');
|
|
309
|
+
chunks.splice(i, 2, merged);
|
|
310
|
+
continue;
|
|
311
|
+
}
|
|
301
312
|
if ((current.chunkType === 'te_form' ||
|
|
302
313
|
current.chunkType === 'suru_verb_te_form') &&
|
|
303
314
|
(next.chunkType === 'single_token' ||
|
|
@@ -312,7 +323,10 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
312
323
|
continue;
|
|
313
324
|
}
|
|
314
325
|
if (current.surface.endsWith('んだ') &&
|
|
315
|
-
(next.surface === 'よ' ||
|
|
326
|
+
(next.surface === 'よ' ||
|
|
327
|
+
next.surface === 'よっ' ||
|
|
328
|
+
next.surface === 'ね' ||
|
|
329
|
+
next.surface === 'ねっ')) {
|
|
316
330
|
const merged = this.mergeChunks([current, next], 'fixed_expression');
|
|
317
331
|
chunks.splice(i, 2, merged);
|
|
318
332
|
continue;
|
|
@@ -321,6 +335,32 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
321
335
|
}
|
|
322
336
|
return chunks;
|
|
323
337
|
}
|
|
338
|
+
shouldMergeAttributiveKana(current, next, following) {
|
|
339
|
+
if (next.surface !== 'かな') {
|
|
340
|
+
return false;
|
|
341
|
+
}
|
|
342
|
+
const nextPos = this.getPosById(next.posId);
|
|
343
|
+
if (!nextPos || nextPos[0] !== '助詞' || nextPos[1] !== '終助詞') {
|
|
344
|
+
return false;
|
|
345
|
+
}
|
|
346
|
+
const currentPos0 = this.getPosById(current.posId)?.[0] ?? '';
|
|
347
|
+
if (!['名詞', '形状詞'].includes(currentPos0)) {
|
|
348
|
+
return false;
|
|
349
|
+
}
|
|
350
|
+
if (!following) {
|
|
351
|
+
return false;
|
|
352
|
+
}
|
|
353
|
+
const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
|
|
354
|
+
return ['名詞', '代名詞'].includes(followingPos0);
|
|
355
|
+
}
|
|
356
|
+
shouldMergeSmallKana(current, next) {
|
|
357
|
+
if (current.chunkType !== 'single_token' ||
|
|
358
|
+
next.chunkType !== 'single_token') {
|
|
359
|
+
return false;
|
|
360
|
+
}
|
|
361
|
+
return (SINGLE_KANA_PATTERN.test(current.surface) &&
|
|
362
|
+
SMALL_KANA_SUFFIXES.has(next.surface));
|
|
363
|
+
}
|
|
324
364
|
applyInlineRubyExactStage(source) {
|
|
325
365
|
const chunks = [...source];
|
|
326
366
|
let i = 0;
|
|
@@ -486,6 +526,8 @@ const KANJI_PATTERN = /\p{Script=Han}/u;
|
|
|
486
526
|
const KANJI_ONLY_PATTERN = /^[\p{Script=Han}々〆ヵヶ]+$/u;
|
|
487
527
|
const KANA_PATTERN = /^[ぁ-ゖァ-ヺー]+$/u;
|
|
488
528
|
const HIRAGANA_PATTERN = /^[ぁ-ゖー]+$/u;
|
|
529
|
+
const SINGLE_KANA_PATTERN = /^[ぁ-ゖァ-ヺー]$/u;
|
|
530
|
+
const SMALL_KANA_SUFFIXES = new Set(['ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ']);
|
|
489
531
|
const COUNTER_WORDS = new Set([
|
|
490
532
|
'本',
|
|
491
533
|
'匹',
|
|
@@ -683,6 +725,25 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
683
725
|
{ surface: 'た', pos0: '助動詞' },
|
|
684
726
|
],
|
|
685
727
|
},
|
|
728
|
+
{
|
|
729
|
+
name: 'copula_desu_past_split',
|
|
730
|
+
priority: 101,
|
|
731
|
+
resultType: 'phrase',
|
|
732
|
+
pattern: [
|
|
733
|
+
{ surface: 'でし', dictionaryForm: 'です', pos0: '助動詞' },
|
|
734
|
+
{ surface: 'た', pos0: '助動詞' },
|
|
735
|
+
],
|
|
736
|
+
},
|
|
737
|
+
{
|
|
738
|
+
name: 'verb_masen',
|
|
739
|
+
priority: 101,
|
|
740
|
+
resultType: 'phrase',
|
|
741
|
+
pattern: [
|
|
742
|
+
{ pos0: '動詞' },
|
|
743
|
+
{ surface: 'ませ', dictionaryForm: 'ます', pos0: '助動詞' },
|
|
744
|
+
{ surface: 'ん', pos0: '助動詞' },
|
|
745
|
+
],
|
|
746
|
+
},
|
|
686
747
|
{
|
|
687
748
|
name: 'noun_suru_past',
|
|
688
749
|
priority: 101,
|
|
@@ -1338,6 +1399,12 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1338
1399
|
{ surface: 'いい' },
|
|
1339
1400
|
],
|
|
1340
1401
|
},
|
|
1402
|
+
{
|
|
1403
|
+
name: 'nakereba',
|
|
1404
|
+
priority: 95,
|
|
1405
|
+
resultType: 'phrase',
|
|
1406
|
+
pattern: [{ surface: 'なけれ' }, { surface: 'ば', pos0: '助詞' }],
|
|
1407
|
+
},
|
|
1341
1408
|
{
|
|
1342
1409
|
name: 'ja_ire_nai',
|
|
1343
1410
|
priority: 95,
|
|
@@ -1372,6 +1439,28 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1372
1439
|
resultType: 'phrase',
|
|
1373
1440
|
pattern: [{ pos0: '動詞' }, { surface: 'た', pos0: '助動詞' }],
|
|
1374
1441
|
},
|
|
1442
|
+
{
|
|
1443
|
+
name: 'verb_chatta_past',
|
|
1444
|
+
priority: 96,
|
|
1445
|
+
resultType: 'phrase',
|
|
1446
|
+
pattern: [
|
|
1447
|
+
{ pos0: '動詞' },
|
|
1448
|
+
{ surface: ['ちゃっ', 'じゃっ'] },
|
|
1449
|
+
{ surface: 'た', pos0: '助動詞' },
|
|
1450
|
+
],
|
|
1451
|
+
},
|
|
1452
|
+
{
|
|
1453
|
+
name: 'verb_past_n_da',
|
|
1454
|
+
priority: 96,
|
|
1455
|
+
resultType: 'phrase',
|
|
1456
|
+
pattern: [{ pos0: '動詞' }, { surface: 'ん' }, { surface: 'だ' }],
|
|
1457
|
+
},
|
|
1458
|
+
{
|
|
1459
|
+
name: 'verb_past_n_da_compact',
|
|
1460
|
+
priority: 96,
|
|
1461
|
+
resultType: 'phrase',
|
|
1462
|
+
pattern: [{ pos0: '動詞' }, { surface: 'んだ' }],
|
|
1463
|
+
},
|
|
1375
1464
|
{
|
|
1376
1465
|
name: 'verb_tara_compact',
|
|
1377
1466
|
priority: 96,
|
|
@@ -1876,6 +1965,18 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1876
1965
|
resultType: 'fixed_expression',
|
|
1877
1966
|
pattern: [{ surface: 'んだ' }, { surface: 'よ' }],
|
|
1878
1967
|
},
|
|
1968
|
+
{
|
|
1969
|
+
name: 'fixed_n_da_ne',
|
|
1970
|
+
priority: 94,
|
|
1971
|
+
resultType: 'fixed_expression',
|
|
1972
|
+
pattern: [{ surface: 'ん' }, { surface: 'だ' }, { surface: 'ね' }],
|
|
1973
|
+
},
|
|
1974
|
+
{
|
|
1975
|
+
name: 'fixed_nda_ne',
|
|
1976
|
+
priority: 94,
|
|
1977
|
+
resultType: 'fixed_expression',
|
|
1978
|
+
pattern: [{ surface: 'んだ' }, { surface: 'ね' }],
|
|
1979
|
+
},
|
|
1879
1980
|
{
|
|
1880
1981
|
name: 'fixed_de_su_yo',
|
|
1881
1982
|
priority: 94,
|
|
@@ -28,26 +28,27 @@ export class SentenceDetector {
|
|
|
28
28
|
let match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
29
29
|
while (match !== null) {
|
|
30
30
|
const eos = match.index + match[0].length;
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
adjustedEos += this.prohibitedBOS(s.slice(eos));
|
|
35
|
-
}
|
|
36
|
-
if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
|
|
37
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
38
|
-
continue;
|
|
39
|
-
}
|
|
40
|
-
if (eos < s.length && this.isContinuousPhrase(s, eos)) {
|
|
41
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
42
|
-
continue;
|
|
43
|
-
}
|
|
44
|
-
if (checker?.hasNonBreakWord(eos)) {
|
|
45
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
46
|
-
continue;
|
|
47
|
-
}
|
|
48
|
-
return adjustedEos;
|
|
31
|
+
let adjustedEos = eos;
|
|
32
|
+
if (eos < s.length) {
|
|
33
|
+
adjustedEos += this.prohibitedBOS(s.slice(eos));
|
|
49
34
|
}
|
|
50
|
-
|
|
35
|
+
if (this.parenthesisLevel(s.slice(0, adjustedEos)) !== 0) {
|
|
36
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
37
|
+
continue;
|
|
38
|
+
}
|
|
39
|
+
if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
|
|
40
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
if (eos < s.length && this.isContinuousPhrase(s, eos)) {
|
|
44
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
if (checker?.hasNonBreakWord(adjustedEos)) {
|
|
48
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
49
|
+
continue;
|
|
50
|
+
}
|
|
51
|
+
return adjustedEos;
|
|
51
52
|
}
|
|
52
53
|
if (input.length > this.limit) {
|
|
53
54
|
const spaces = s.match(/^.+\s+/);
|