sudachi-ts 0.1.19 → 0.1.20-beta.0

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
package/README.md CHANGED
@@ -191,13 +191,17 @@ import { SentenceDetector } from 'sudachi-ts/sentdetect/sentenceDetector.js';
191
191
 
192
192
  const sentences = tokenizer.tokenizeSentences('東京都は日本の首都です。大阪は商業都市です。');
193
193
 
194
- for (const sentence of sentences) {
195
- console.log('--- Sentence ---');
196
- for (const morpheme of sentence) {
197
- console.log(morpheme.surface());
198
- }
199
- }
200
- ```
194
+ for (const sentence of sentences) {
195
+ console.log('--- Sentence ---');
196
+ for (const morpheme of sentence) {
197
+ console.log(morpheme.surface());
198
+ }
199
+ }
200
+ ```
201
+
202
+ `tokenizeSentences(...)` treats quoted dialogue endings (for example `「...!」` and
203
+ `「...。」`) as sentence boundaries and skips leading inter-sentence whitespace such
204
+ as newlines before tokenization.
201
205
 
202
206
  Lazy sentence processing for streaming:
203
207
 
@@ -7,6 +7,7 @@ import { MorphemeList as MorphemeListImpl } from './morphemeList.js';
7
7
  import { SentenceSplittingLazyAnalysis } from './sentenceSplittingLazyAnalysis.js';
8
8
  import { SplitMode } from './tokenizer.js';
9
9
  import { UTF8InputTextBuilder } from './utf8InputText.js';
10
+ const LEADING_WHITESPACE_PATTERN = /^\s+/u;
10
11
  export class JapaneseTokenizer {
11
12
  grammar;
12
13
  lexicon;
@@ -57,6 +58,13 @@ export class JapaneseTokenizer {
57
58
  const sentences = [];
58
59
  let remaining = inputText;
59
60
  while (remaining.length > 0) {
61
+ const leadingWhitespace = remaining.match(LEADING_WHITESPACE_PATTERN);
62
+ if (leadingWhitespace) {
63
+ remaining = remaining.slice(leadingWhitespace[0].length);
64
+ if (remaining.length === 0) {
65
+ break;
66
+ }
67
+ }
60
68
  const checker = {
61
69
  hasNonBreakWord: (eos) => {
62
70
  const bytes = this.buildInputText(remaining.slice(0, eos)).getByteText();
@@ -1,6 +1,7 @@
1
1
  import { SentenceDetector, } from '../sentdetect/sentenceDetector.js';
2
2
  import { UTF8InputTextBuilder } from './utf8InputText.js';
3
3
  const BUFFER_SIZE = 4096;
4
+ const LEADING_WHITESPACE_PATTERN = /^\s+/u;
4
5
  export class SentenceSplittingLazyAnalysis {
5
6
  mode;
6
7
  grammar;
@@ -68,6 +69,15 @@ export class SentenceSplittingLazyAnalysis {
68
69
  return this.buffer.length;
69
70
  }
70
71
  processNextSentence() {
72
+ const leadingWhitespace = this.normalized.match(LEADING_WHITESPACE_PATTERN);
73
+ if (leadingWhitespace) {
74
+ const skipped = leadingWhitespace[0].length;
75
+ this.bos += skipped;
76
+ this.normalized = this.normalized.slice(skipped);
77
+ if (this.normalized.length === 0) {
78
+ return null;
79
+ }
80
+ }
71
81
  const detector = new SentenceDetector();
72
82
  const eosLength = detector.getEos(this.normalized, this);
73
83
  if (eosLength > 0) {
@@ -29,6 +29,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
29
29
  private mergeCounterChunks;
30
30
  private applyMergeStage;
31
31
  private shouldMergeAttributiveKana;
32
+ private shouldMergeSmallKana;
32
33
  private applyInlineRubyExactStage;
33
34
  private applyInlineRubyPrefixStage;
34
35
  private shouldMergeInlineRubyExact;
@@ -299,6 +299,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
299
299
  i++;
300
300
  continue;
301
301
  }
302
+ if (this.shouldMergeSmallKana(current, next)) {
303
+ const merged = this.mergeChunks([current, next], 'fixed_expression');
304
+ chunks.splice(i, 2, merged);
305
+ continue;
306
+ }
302
307
  if (this.shouldMergeAttributiveKana(current, next, following)) {
303
308
  const merged = this.mergeChunks([current, next], 'phrase');
304
309
  chunks.splice(i, 2, merged);
@@ -318,7 +323,10 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
318
323
  continue;
319
324
  }
320
325
  if (current.surface.endsWith('んだ') &&
321
- (next.surface === 'よ' || next.surface === 'よっ')) {
326
+ (next.surface === 'よ' ||
327
+ next.surface === 'よっ' ||
328
+ next.surface === 'ね' ||
329
+ next.surface === 'ねっ')) {
322
330
  const merged = this.mergeChunks([current, next], 'fixed_expression');
323
331
  chunks.splice(i, 2, merged);
324
332
  continue;
@@ -345,6 +353,14 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
345
353
  const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
346
354
  return ['名詞', '代名詞'].includes(followingPos0);
347
355
  }
356
+ shouldMergeSmallKana(current, next) {
357
+ if (current.chunkType !== 'single_token' ||
358
+ next.chunkType !== 'single_token') {
359
+ return false;
360
+ }
361
+ return (SINGLE_KANA_PATTERN.test(current.surface) &&
362
+ SMALL_KANA_SUFFIXES.has(next.surface));
363
+ }
348
364
  applyInlineRubyExactStage(source) {
349
365
  const chunks = [...source];
350
366
  let i = 0;
@@ -510,6 +526,8 @@ const KANJI_PATTERN = /\p{Script=Han}/u;
510
526
  const KANJI_ONLY_PATTERN = /^[\p{Script=Han}々〆ヵヶ]+$/u;
511
527
  const KANA_PATTERN = /^[ぁ-ゖァ-ヺー]+$/u;
512
528
  const HIRAGANA_PATTERN = /^[ぁ-ゖー]+$/u;
529
+ const SINGLE_KANA_PATTERN = /^[ぁ-ゖァ-ヺー]$/u;
530
+ const SMALL_KANA_SUFFIXES = new Set(['ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ']);
513
531
  const COUNTER_WORDS = new Set([
514
532
  '本',
515
533
  '匹',
@@ -707,6 +725,25 @@ const COLLOQUIAL_SEQUENCE_RULES = [
707
725
  { surface: 'た', pos0: '助動詞' },
708
726
  ],
709
727
  },
728
+ {
729
+ name: 'copula_desu_past_split',
730
+ priority: 101,
731
+ resultType: 'phrase',
732
+ pattern: [
733
+ { surface: 'でし', dictionaryForm: 'です', pos0: '助動詞' },
734
+ { surface: 'た', pos0: '助動詞' },
735
+ ],
736
+ },
737
+ {
738
+ name: 'verb_masen',
739
+ priority: 101,
740
+ resultType: 'phrase',
741
+ pattern: [
742
+ { pos0: '動詞' },
743
+ { surface: 'ませ', dictionaryForm: 'ます', pos0: '助動詞' },
744
+ { surface: 'ん', pos0: '助動詞' },
745
+ ],
746
+ },
710
747
  {
711
748
  name: 'noun_suru_past',
712
749
  priority: 101,
@@ -1362,6 +1399,12 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1362
1399
  { surface: 'いい' },
1363
1400
  ],
1364
1401
  },
1402
+ {
1403
+ name: 'nakereba',
1404
+ priority: 95,
1405
+ resultType: 'phrase',
1406
+ pattern: [{ surface: 'なけれ' }, { surface: 'ば', pos0: '助詞' }],
1407
+ },
1365
1408
  {
1366
1409
  name: 'ja_ire_nai',
1367
1410
  priority: 95,
@@ -1396,6 +1439,28 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1396
1439
  resultType: 'phrase',
1397
1440
  pattern: [{ pos0: '動詞' }, { surface: 'た', pos0: '助動詞' }],
1398
1441
  },
1442
+ {
1443
+ name: 'verb_chatta_past',
1444
+ priority: 96,
1445
+ resultType: 'phrase',
1446
+ pattern: [
1447
+ { pos0: '動詞' },
1448
+ { surface: ['ちゃっ', 'じゃっ'] },
1449
+ { surface: 'た', pos0: '助動詞' },
1450
+ ],
1451
+ },
1452
+ {
1453
+ name: 'verb_past_n_da',
1454
+ priority: 96,
1455
+ resultType: 'phrase',
1456
+ pattern: [{ pos0: '動詞' }, { surface: 'ん' }, { surface: 'だ' }],
1457
+ },
1458
+ {
1459
+ name: 'verb_past_n_da_compact',
1460
+ priority: 96,
1461
+ resultType: 'phrase',
1462
+ pattern: [{ pos0: '動詞' }, { surface: 'んだ' }],
1463
+ },
1399
1464
  {
1400
1465
  name: 'verb_tara_compact',
1401
1466
  priority: 96,
@@ -1900,6 +1965,18 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1900
1965
  resultType: 'fixed_expression',
1901
1966
  pattern: [{ surface: 'んだ' }, { surface: 'よ' }],
1902
1967
  },
1968
+ {
1969
+ name: 'fixed_n_da_ne',
1970
+ priority: 94,
1971
+ resultType: 'fixed_expression',
1972
+ pattern: [{ surface: 'ん' }, { surface: 'だ' }, { surface: 'ね' }],
1973
+ },
1974
+ {
1975
+ name: 'fixed_nda_ne',
1976
+ priority: 94,
1977
+ resultType: 'fixed_expression',
1978
+ pattern: [{ surface: 'んだ' }, { surface: 'ね' }],
1979
+ },
1903
1980
  {
1904
1981
  name: 'fixed_de_su_yo',
1905
1982
  priority: 94,
@@ -28,26 +28,27 @@ export class SentenceDetector {
28
28
  let match = SENTENCE_BREAKER_PATTERN.exec(s);
29
29
  while (match !== null) {
30
30
  const eos = match.index + match[0].length;
31
- if (this.parenthesisLevel(s.slice(0, eos)) === 0) {
32
- let adjustedEos = eos;
33
- if (eos < s.length) {
34
- adjustedEos += this.prohibitedBOS(s.slice(eos));
35
- }
36
- if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
37
- match = SENTENCE_BREAKER_PATTERN.exec(s);
38
- continue;
39
- }
40
- if (eos < s.length && this.isContinuousPhrase(s, eos)) {
41
- match = SENTENCE_BREAKER_PATTERN.exec(s);
42
- continue;
43
- }
44
- if (checker?.hasNonBreakWord(eos)) {
45
- match = SENTENCE_BREAKER_PATTERN.exec(s);
46
- continue;
47
- }
48
- return adjustedEos;
31
+ let adjustedEos = eos;
32
+ if (eos < s.length) {
33
+ adjustedEos += this.prohibitedBOS(s.slice(eos));
49
34
  }
50
- match = SENTENCE_BREAKER_PATTERN.exec(s);
35
+ if (this.parenthesisLevel(s.slice(0, adjustedEos)) !== 0) {
36
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
37
+ continue;
38
+ }
39
+ if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
40
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
41
+ continue;
42
+ }
43
+ if (eos < s.length && this.isContinuousPhrase(s, eos)) {
44
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
45
+ continue;
46
+ }
47
+ if (checker?.hasNonBreakWord(adjustedEos)) {
48
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
49
+ continue;
50
+ }
51
+ return adjustedEos;
51
52
  }
52
53
  if (input.length > this.limit) {
53
54
  const spaces = s.match(/^.+\s+/);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sudachi-ts",
3
- "version": "0.1.19",
3
+ "version": "0.1.20-beta.0",
4
4
  "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
5
  "keywords": [
6
6
  "morphological-analyzer",