sudachi-ts 0.1.19 → 0.1.20-beta.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -7
- package/build/src/core/japaneseTokenizer.js +8 -0
- package/build/src/core/sentenceSplittingLazyAnalysis.js +10 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts +1 -0
- package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js +78 -1
- package/build/src/sentdetect/sentenceDetector.js +20 -19
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -191,13 +191,17 @@ import { SentenceDetector } from 'sudachi-ts/sentdetect/sentenceDetector.js';
|
|
|
191
191
|
|
|
192
192
|
const sentences = tokenizer.tokenizeSentences('東京都は日本の首都です。大阪は商業都市です。');
|
|
193
193
|
|
|
194
|
-
for (const sentence of sentences) {
|
|
195
|
-
console.log('--- Sentence ---');
|
|
196
|
-
for (const morpheme of sentence) {
|
|
197
|
-
console.log(morpheme.surface());
|
|
198
|
-
}
|
|
199
|
-
}
|
|
200
|
-
```
|
|
194
|
+
for (const sentence of sentences) {
|
|
195
|
+
console.log('--- Sentence ---');
|
|
196
|
+
for (const morpheme of sentence) {
|
|
197
|
+
console.log(morpheme.surface());
|
|
198
|
+
}
|
|
199
|
+
}
|
|
200
|
+
```
|
|
201
|
+
|
|
202
|
+
`tokenizeSentences(...)` treats quoted dialogue endings (for example `「...!」` and
|
|
203
|
+
`「...。」`) as sentence boundaries and skips leading inter-sentence whitespace such
|
|
204
|
+
as newlines before tokenization.
|
|
201
205
|
|
|
202
206
|
Lazy sentence processing for streaming:
|
|
203
207
|
|
|
@@ -7,6 +7,7 @@ import { MorphemeList as MorphemeListImpl } from './morphemeList.js';
|
|
|
7
7
|
import { SentenceSplittingLazyAnalysis } from './sentenceSplittingLazyAnalysis.js';
|
|
8
8
|
import { SplitMode } from './tokenizer.js';
|
|
9
9
|
import { UTF8InputTextBuilder } from './utf8InputText.js';
|
|
10
|
+
const LEADING_WHITESPACE_PATTERN = /^\s+/u;
|
|
10
11
|
export class JapaneseTokenizer {
|
|
11
12
|
grammar;
|
|
12
13
|
lexicon;
|
|
@@ -57,6 +58,13 @@ export class JapaneseTokenizer {
|
|
|
57
58
|
const sentences = [];
|
|
58
59
|
let remaining = inputText;
|
|
59
60
|
while (remaining.length > 0) {
|
|
61
|
+
const leadingWhitespace = remaining.match(LEADING_WHITESPACE_PATTERN);
|
|
62
|
+
if (leadingWhitespace) {
|
|
63
|
+
remaining = remaining.slice(leadingWhitespace[0].length);
|
|
64
|
+
if (remaining.length === 0) {
|
|
65
|
+
break;
|
|
66
|
+
}
|
|
67
|
+
}
|
|
60
68
|
const checker = {
|
|
61
69
|
hasNonBreakWord: (eos) => {
|
|
62
70
|
const bytes = this.buildInputText(remaining.slice(0, eos)).getByteText();
|
|
@@ -1,6 +1,7 @@
|
|
|
1
1
|
import { SentenceDetector, } from '../sentdetect/sentenceDetector.js';
|
|
2
2
|
import { UTF8InputTextBuilder } from './utf8InputText.js';
|
|
3
3
|
const BUFFER_SIZE = 4096;
|
|
4
|
+
const LEADING_WHITESPACE_PATTERN = /^\s+/u;
|
|
4
5
|
export class SentenceSplittingLazyAnalysis {
|
|
5
6
|
mode;
|
|
6
7
|
grammar;
|
|
@@ -68,6 +69,15 @@ export class SentenceSplittingLazyAnalysis {
|
|
|
68
69
|
return this.buffer.length;
|
|
69
70
|
}
|
|
70
71
|
processNextSentence() {
|
|
72
|
+
const leadingWhitespace = this.normalized.match(LEADING_WHITESPACE_PATTERN);
|
|
73
|
+
if (leadingWhitespace) {
|
|
74
|
+
const skipped = leadingWhitespace[0].length;
|
|
75
|
+
this.bos += skipped;
|
|
76
|
+
this.normalized = this.normalized.slice(skipped);
|
|
77
|
+
if (this.normalized.length === 0) {
|
|
78
|
+
return null;
|
|
79
|
+
}
|
|
80
|
+
}
|
|
71
81
|
const detector = new SentenceDetector();
|
|
72
82
|
const eosLength = detector.getEos(this.normalized, this);
|
|
73
83
|
if (eosLength > 0) {
|
|
@@ -29,6 +29,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
29
29
|
private mergeCounterChunks;
|
|
30
30
|
private applyMergeStage;
|
|
31
31
|
private shouldMergeAttributiveKana;
|
|
32
|
+
private shouldMergeSmallKana;
|
|
32
33
|
private applyInlineRubyExactStage;
|
|
33
34
|
private applyInlineRubyPrefixStage;
|
|
34
35
|
private shouldMergeInlineRubyExact;
|
|
@@ -299,6 +299,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
299
299
|
i++;
|
|
300
300
|
continue;
|
|
301
301
|
}
|
|
302
|
+
if (this.shouldMergeSmallKana(current, next)) {
|
|
303
|
+
const merged = this.mergeChunks([current, next], 'fixed_expression');
|
|
304
|
+
chunks.splice(i, 2, merged);
|
|
305
|
+
continue;
|
|
306
|
+
}
|
|
302
307
|
if (this.shouldMergeAttributiveKana(current, next, following)) {
|
|
303
308
|
const merged = this.mergeChunks([current, next], 'phrase');
|
|
304
309
|
chunks.splice(i, 2, merged);
|
|
@@ -318,7 +323,10 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
318
323
|
continue;
|
|
319
324
|
}
|
|
320
325
|
if (current.surface.endsWith('んだ') &&
|
|
321
|
-
(next.surface === 'よ' ||
|
|
326
|
+
(next.surface === 'よ' ||
|
|
327
|
+
next.surface === 'よっ' ||
|
|
328
|
+
next.surface === 'ね' ||
|
|
329
|
+
next.surface === 'ねっ')) {
|
|
322
330
|
const merged = this.mergeChunks([current, next], 'fixed_expression');
|
|
323
331
|
chunks.splice(i, 2, merged);
|
|
324
332
|
continue;
|
|
@@ -345,6 +353,14 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
|
|
|
345
353
|
const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
|
|
346
354
|
return ['名詞', '代名詞'].includes(followingPos0);
|
|
347
355
|
}
|
|
356
|
+
shouldMergeSmallKana(current, next) {
|
|
357
|
+
if (current.chunkType !== 'single_token' ||
|
|
358
|
+
next.chunkType !== 'single_token') {
|
|
359
|
+
return false;
|
|
360
|
+
}
|
|
361
|
+
return (SINGLE_KANA_PATTERN.test(current.surface) &&
|
|
362
|
+
SMALL_KANA_SUFFIXES.has(next.surface));
|
|
363
|
+
}
|
|
348
364
|
applyInlineRubyExactStage(source) {
|
|
349
365
|
const chunks = [...source];
|
|
350
366
|
let i = 0;
|
|
@@ -510,6 +526,8 @@ const KANJI_PATTERN = /\p{Script=Han}/u;
|
|
|
510
526
|
const KANJI_ONLY_PATTERN = /^[\p{Script=Han}々〆ヵヶ]+$/u;
|
|
511
527
|
const KANA_PATTERN = /^[ぁ-ゖァ-ヺー]+$/u;
|
|
512
528
|
const HIRAGANA_PATTERN = /^[ぁ-ゖー]+$/u;
|
|
529
|
+
const SINGLE_KANA_PATTERN = /^[ぁ-ゖァ-ヺー]$/u;
|
|
530
|
+
const SMALL_KANA_SUFFIXES = new Set(['ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ']);
|
|
513
531
|
const COUNTER_WORDS = new Set([
|
|
514
532
|
'本',
|
|
515
533
|
'匹',
|
|
@@ -707,6 +725,25 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
707
725
|
{ surface: 'た', pos0: '助動詞' },
|
|
708
726
|
],
|
|
709
727
|
},
|
|
728
|
+
{
|
|
729
|
+
name: 'copula_desu_past_split',
|
|
730
|
+
priority: 101,
|
|
731
|
+
resultType: 'phrase',
|
|
732
|
+
pattern: [
|
|
733
|
+
{ surface: 'でし', dictionaryForm: 'です', pos0: '助動詞' },
|
|
734
|
+
{ surface: 'た', pos0: '助動詞' },
|
|
735
|
+
],
|
|
736
|
+
},
|
|
737
|
+
{
|
|
738
|
+
name: 'verb_masen',
|
|
739
|
+
priority: 101,
|
|
740
|
+
resultType: 'phrase',
|
|
741
|
+
pattern: [
|
|
742
|
+
{ pos0: '動詞' },
|
|
743
|
+
{ surface: 'ませ', dictionaryForm: 'ます', pos0: '助動詞' },
|
|
744
|
+
{ surface: 'ん', pos0: '助動詞' },
|
|
745
|
+
],
|
|
746
|
+
},
|
|
710
747
|
{
|
|
711
748
|
name: 'noun_suru_past',
|
|
712
749
|
priority: 101,
|
|
@@ -1362,6 +1399,12 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1362
1399
|
{ surface: 'いい' },
|
|
1363
1400
|
],
|
|
1364
1401
|
},
|
|
1402
|
+
{
|
|
1403
|
+
name: 'nakereba',
|
|
1404
|
+
priority: 95,
|
|
1405
|
+
resultType: 'phrase',
|
|
1406
|
+
pattern: [{ surface: 'なけれ' }, { surface: 'ば', pos0: '助詞' }],
|
|
1407
|
+
},
|
|
1365
1408
|
{
|
|
1366
1409
|
name: 'ja_ire_nai',
|
|
1367
1410
|
priority: 95,
|
|
@@ -1396,6 +1439,28 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1396
1439
|
resultType: 'phrase',
|
|
1397
1440
|
pattern: [{ pos0: '動詞' }, { surface: 'た', pos0: '助動詞' }],
|
|
1398
1441
|
},
|
|
1442
|
+
{
|
|
1443
|
+
name: 'verb_chatta_past',
|
|
1444
|
+
priority: 96,
|
|
1445
|
+
resultType: 'phrase',
|
|
1446
|
+
pattern: [
|
|
1447
|
+
{ pos0: '動詞' },
|
|
1448
|
+
{ surface: ['ちゃっ', 'じゃっ'] },
|
|
1449
|
+
{ surface: 'た', pos0: '助動詞' },
|
|
1450
|
+
],
|
|
1451
|
+
},
|
|
1452
|
+
{
|
|
1453
|
+
name: 'verb_past_n_da',
|
|
1454
|
+
priority: 96,
|
|
1455
|
+
resultType: 'phrase',
|
|
1456
|
+
pattern: [{ pos0: '動詞' }, { surface: 'ん' }, { surface: 'だ' }],
|
|
1457
|
+
},
|
|
1458
|
+
{
|
|
1459
|
+
name: 'verb_past_n_da_compact',
|
|
1460
|
+
priority: 96,
|
|
1461
|
+
resultType: 'phrase',
|
|
1462
|
+
pattern: [{ pos0: '動詞' }, { surface: 'んだ' }],
|
|
1463
|
+
},
|
|
1399
1464
|
{
|
|
1400
1465
|
name: 'verb_tara_compact',
|
|
1401
1466
|
priority: 96,
|
|
@@ -1900,6 +1965,18 @@ const COLLOQUIAL_SEQUENCE_RULES = [
|
|
|
1900
1965
|
resultType: 'fixed_expression',
|
|
1901
1966
|
pattern: [{ surface: 'んだ' }, { surface: 'よ' }],
|
|
1902
1967
|
},
|
|
1968
|
+
{
|
|
1969
|
+
name: 'fixed_n_da_ne',
|
|
1970
|
+
priority: 94,
|
|
1971
|
+
resultType: 'fixed_expression',
|
|
1972
|
+
pattern: [{ surface: 'ん' }, { surface: 'だ' }, { surface: 'ね' }],
|
|
1973
|
+
},
|
|
1974
|
+
{
|
|
1975
|
+
name: 'fixed_nda_ne',
|
|
1976
|
+
priority: 94,
|
|
1977
|
+
resultType: 'fixed_expression',
|
|
1978
|
+
pattern: [{ surface: 'んだ' }, { surface: 'ね' }],
|
|
1979
|
+
},
|
|
1903
1980
|
{
|
|
1904
1981
|
name: 'fixed_de_su_yo',
|
|
1905
1982
|
priority: 94,
|
|
@@ -28,26 +28,27 @@ export class SentenceDetector {
|
|
|
28
28
|
let match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
29
29
|
while (match !== null) {
|
|
30
30
|
const eos = match.index + match[0].length;
|
|
31
|
-
|
|
32
|
-
|
|
33
|
-
|
|
34
|
-
adjustedEos += this.prohibitedBOS(s.slice(eos));
|
|
35
|
-
}
|
|
36
|
-
if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
|
|
37
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
38
|
-
continue;
|
|
39
|
-
}
|
|
40
|
-
if (eos < s.length && this.isContinuousPhrase(s, eos)) {
|
|
41
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
42
|
-
continue;
|
|
43
|
-
}
|
|
44
|
-
if (checker?.hasNonBreakWord(eos)) {
|
|
45
|
-
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
46
|
-
continue;
|
|
47
|
-
}
|
|
48
|
-
return adjustedEos;
|
|
31
|
+
let adjustedEos = eos;
|
|
32
|
+
if (eos < s.length) {
|
|
33
|
+
adjustedEos += this.prohibitedBOS(s.slice(eos));
|
|
49
34
|
}
|
|
50
|
-
|
|
35
|
+
if (this.parenthesisLevel(s.slice(0, adjustedEos)) !== 0) {
|
|
36
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
37
|
+
continue;
|
|
38
|
+
}
|
|
39
|
+
if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
|
|
40
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
41
|
+
continue;
|
|
42
|
+
}
|
|
43
|
+
if (eos < s.length && this.isContinuousPhrase(s, eos)) {
|
|
44
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
45
|
+
continue;
|
|
46
|
+
}
|
|
47
|
+
if (checker?.hasNonBreakWord(adjustedEos)) {
|
|
48
|
+
match = SENTENCE_BREAKER_PATTERN.exec(s);
|
|
49
|
+
continue;
|
|
50
|
+
}
|
|
51
|
+
return adjustedEos;
|
|
51
52
|
}
|
|
52
53
|
if (input.length > this.limit) {
|
|
53
54
|
const spaces = s.match(/^.+\s+/);
|