sudachi-ts 0.1.18 → 0.1.20-beta.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -191,13 +191,17 @@ import { SentenceDetector } from 'sudachi-ts/sentdetect/sentenceDetector.js';
191
191
 
192
192
  const sentences = tokenizer.tokenizeSentences('東京都は日本の首都です。大阪は商業都市です。');
193
193
 
194
- for (const sentence of sentences) {
195
- console.log('--- Sentence ---');
196
- for (const morpheme of sentence) {
197
- console.log(morpheme.surface());
198
- }
199
- }
200
- ```
194
+ for (const sentence of sentences) {
195
+ console.log('--- Sentence ---');
196
+ for (const morpheme of sentence) {
197
+ console.log(morpheme.surface());
198
+ }
199
+ }
200
+ ```
201
+
202
+ `tokenizeSentences(...)` treats quoted dialogue endings (for example `「...!」` and
203
+ `「...。」`) as sentence boundaries and skips leading inter-sentence whitespace such
204
+ as newlines before tokenization.
201
205
 
202
206
  Lazy sentence processing for streaming:
203
207
 
@@ -281,6 +285,9 @@ readings is visible in the baseline vs plugin outputs.
281
285
  `TokenChunkerPlugin` is designed and validated against the full Sudachi system
282
286
  dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
283
287
  when adding or tuning chunk rules.
288
+ `TokenChunkerPlugin` requires `enableDefaultCompoundParticles: true`. Dictionary
289
+ creation throws an error when this plugin is configured with default compound
290
+ particles disabled.
284
291
  `TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
285
292
  `tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
286
293
  enabled throws an error.
@@ -7,6 +7,7 @@ import { MorphemeList as MorphemeListImpl } from './morphemeList.js';
7
7
  import { SentenceSplittingLazyAnalysis } from './sentenceSplittingLazyAnalysis.js';
8
8
  import { SplitMode } from './tokenizer.js';
9
9
  import { UTF8InputTextBuilder } from './utf8InputText.js';
10
+ const LEADING_WHITESPACE_PATTERN = /^\s+/u;
10
11
  export class JapaneseTokenizer {
11
12
  grammar;
12
13
  lexicon;
@@ -57,6 +58,13 @@ export class JapaneseTokenizer {
57
58
  const sentences = [];
58
59
  let remaining = inputText;
59
60
  while (remaining.length > 0) {
61
+ const leadingWhitespace = remaining.match(LEADING_WHITESPACE_PATTERN);
62
+ if (leadingWhitespace) {
63
+ remaining = remaining.slice(leadingWhitespace[0].length);
64
+ if (remaining.length === 0) {
65
+ break;
66
+ }
67
+ }
60
68
  const checker = {
61
69
  hasNonBreakWord: (eos) => {
62
70
  const bytes = this.buildInputText(remaining.slice(0, eos)).getByteText();
@@ -1,6 +1,7 @@
1
1
  import { SentenceDetector, } from '../sentdetect/sentenceDetector.js';
2
2
  import { UTF8InputTextBuilder } from './utf8InputText.js';
3
3
  const BUFFER_SIZE = 4096;
4
+ const LEADING_WHITESPACE_PATTERN = /^\s+/u;
4
5
  export class SentenceSplittingLazyAnalysis {
5
6
  mode;
6
7
  grammar;
@@ -68,6 +69,15 @@ export class SentenceSplittingLazyAnalysis {
68
69
  return this.buffer.length;
69
70
  }
70
71
  processNextSentence() {
72
+ const leadingWhitespace = this.normalized.match(LEADING_WHITESPACE_PATTERN);
73
+ if (leadingWhitespace) {
74
+ const skipped = leadingWhitespace[0].length;
75
+ this.bos += skipped;
76
+ this.normalized = this.normalized.slice(skipped);
77
+ if (this.normalized.length === 0) {
78
+ return null;
79
+ }
80
+ }
71
81
  const detector = new SentenceDetector();
72
82
  const eosLength = detector.getEos(this.normalized, this);
73
83
  if (eosLength > 0) {
@@ -6,6 +6,12 @@ import { PluginLoader } from '../plugins/loader.js';
6
6
  import { BinaryDictionary } from './binaryDictionary.js';
7
7
  import { loadDefaultCompoundLexicon } from './defaultCompoundLexicon.js';
8
8
  import { LexiconSet } from './lexiconSet.js';
9
+ function isTokenChunkerPlugin(className) {
10
+ if (className === 'com.worksap.nlp.sudachi.TokenChunkerPlugin') {
11
+ return true;
12
+ }
13
+ return className.split('.').pop() === 'TokenChunkerPlugin';
14
+ }
9
15
  export class DictionaryFactory {
10
16
  async create(configPath, customConfig) {
11
17
  const config = customConfig || (await loadConfig(configPath));
@@ -76,6 +82,10 @@ export class DictionaryFactory {
76
82
  if (!pathRewritePluginConfs || pathRewritePluginConfs.length === 0) {
77
83
  pathRewritePluginConfs = defaultConfig.getPlugins('pathRewritePlugin');
78
84
  }
85
+ if (!enableDefaultCompoundParticles &&
86
+ (pathRewritePluginConfs || []).some((conf) => isTokenChunkerPlugin(conf.className))) {
87
+ throw new Error('TokenChunkerPlugin is only compatible when enableDefaultCompoundParticles is true.');
88
+ }
79
89
  const pathRewritePlugins = (await loader.loadPathRewritePlugins(pathRewritePluginConfs || [], grammar)).map((p) => p.plugin);
80
90
  return new Dictionary(grammar, lexicon, inputTextPlugins, oovProviderPlugins, pathRewritePlugins);
81
91
  }
@@ -28,6 +28,8 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
28
28
  private isCounterChunk;
29
29
  private mergeCounterChunks;
30
30
  private applyMergeStage;
31
+ private shouldMergeAttributiveKana;
32
+ private shouldMergeSmallKana;
31
33
  private applyInlineRubyExactStage;
32
34
  private applyInlineRubyPrefixStage;
33
35
  private shouldMergeInlineRubyExact;
@@ -294,10 +294,21 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
294
294
  while (i < chunks.length - 1) {
295
295
  const current = chunks[i];
296
296
  const next = chunks[i + 1];
297
+ const following = chunks[i + 2];
297
298
  if (!current || !next) {
298
299
  i++;
299
300
  continue;
300
301
  }
302
+ if (this.shouldMergeSmallKana(current, next)) {
303
+ const merged = this.mergeChunks([current, next], 'fixed_expression');
304
+ chunks.splice(i, 2, merged);
305
+ continue;
306
+ }
307
+ if (this.shouldMergeAttributiveKana(current, next, following)) {
308
+ const merged = this.mergeChunks([current, next], 'phrase');
309
+ chunks.splice(i, 2, merged);
310
+ continue;
311
+ }
301
312
  if ((current.chunkType === 'te_form' ||
302
313
  current.chunkType === 'suru_verb_te_form') &&
303
314
  (next.chunkType === 'single_token' ||
@@ -312,7 +323,10 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
312
323
  continue;
313
324
  }
314
325
  if (current.surface.endsWith('んだ') &&
315
- (next.surface === 'よ' || next.surface === 'よっ')) {
326
+ (next.surface === 'よ' ||
327
+ next.surface === 'よっ' ||
328
+ next.surface === 'ね' ||
329
+ next.surface === 'ねっ')) {
316
330
  const merged = this.mergeChunks([current, next], 'fixed_expression');
317
331
  chunks.splice(i, 2, merged);
318
332
  continue;
@@ -321,6 +335,32 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
321
335
  }
322
336
  return chunks;
323
337
  }
338
+ shouldMergeAttributiveKana(current, next, following) {
339
+ if (next.surface !== 'かな') {
340
+ return false;
341
+ }
342
+ const nextPos = this.getPosById(next.posId);
343
+ if (!nextPos || nextPos[0] !== '助詞' || nextPos[1] !== '終助詞') {
344
+ return false;
345
+ }
346
+ const currentPos0 = this.getPosById(current.posId)?.[0] ?? '';
347
+ if (!['名詞', '形状詞'].includes(currentPos0)) {
348
+ return false;
349
+ }
350
+ if (!following) {
351
+ return false;
352
+ }
353
+ const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
354
+ return ['名詞', '代名詞'].includes(followingPos0);
355
+ }
356
+ shouldMergeSmallKana(current, next) {
357
+ if (current.chunkType !== 'single_token' ||
358
+ next.chunkType !== 'single_token') {
359
+ return false;
360
+ }
361
+ return (SINGLE_KANA_PATTERN.test(current.surface) &&
362
+ SMALL_KANA_SUFFIXES.has(next.surface));
363
+ }
324
364
  applyInlineRubyExactStage(source) {
325
365
  const chunks = [...source];
326
366
  let i = 0;
@@ -486,6 +526,8 @@ const KANJI_PATTERN = /\p{Script=Han}/u;
486
526
  const KANJI_ONLY_PATTERN = /^[\p{Script=Han}々〆ヵヶ]+$/u;
487
527
  const KANA_PATTERN = /^[ぁ-ゖァ-ヺー]+$/u;
488
528
  const HIRAGANA_PATTERN = /^[ぁ-ゖー]+$/u;
529
+ const SINGLE_KANA_PATTERN = /^[ぁ-ゖァ-ヺー]$/u;
530
+ const SMALL_KANA_SUFFIXES = new Set(['ゃ', 'ゅ', 'ょ', 'ャ', 'ュ', 'ョ']);
489
531
  const COUNTER_WORDS = new Set([
490
532
  '本',
491
533
  '匹',
@@ -683,6 +725,25 @@ const COLLOQUIAL_SEQUENCE_RULES = [
683
725
  { surface: 'た', pos0: '助動詞' },
684
726
  ],
685
727
  },
728
+ {
729
+ name: 'copula_desu_past_split',
730
+ priority: 101,
731
+ resultType: 'phrase',
732
+ pattern: [
733
+ { surface: 'でし', dictionaryForm: 'です', pos0: '助動詞' },
734
+ { surface: 'た', pos0: '助動詞' },
735
+ ],
736
+ },
737
+ {
738
+ name: 'verb_masen',
739
+ priority: 101,
740
+ resultType: 'phrase',
741
+ pattern: [
742
+ { pos0: '動詞' },
743
+ { surface: 'ませ', dictionaryForm: 'ます', pos0: '助動詞' },
744
+ { surface: 'ん', pos0: '助動詞' },
745
+ ],
746
+ },
686
747
  {
687
748
  name: 'noun_suru_past',
688
749
  priority: 101,
@@ -1338,6 +1399,12 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1338
1399
  { surface: 'いい' },
1339
1400
  ],
1340
1401
  },
1402
+ {
1403
+ name: 'nakereba',
1404
+ priority: 95,
1405
+ resultType: 'phrase',
1406
+ pattern: [{ surface: 'なけれ' }, { surface: 'ば', pos0: '助詞' }],
1407
+ },
1341
1408
  {
1342
1409
  name: 'ja_ire_nai',
1343
1410
  priority: 95,
@@ -1372,6 +1439,28 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1372
1439
  resultType: 'phrase',
1373
1440
  pattern: [{ pos0: '動詞' }, { surface: 'た', pos0: '助動詞' }],
1374
1441
  },
1442
+ {
1443
+ name: 'verb_chatta_past',
1444
+ priority: 96,
1445
+ resultType: 'phrase',
1446
+ pattern: [
1447
+ { pos0: '動詞' },
1448
+ { surface: ['ちゃっ', 'じゃっ'] },
1449
+ { surface: 'た', pos0: '助動詞' },
1450
+ ],
1451
+ },
1452
+ {
1453
+ name: 'verb_past_n_da',
1454
+ priority: 96,
1455
+ resultType: 'phrase',
1456
+ pattern: [{ pos0: '動詞' }, { surface: 'ん' }, { surface: 'だ' }],
1457
+ },
1458
+ {
1459
+ name: 'verb_past_n_da_compact',
1460
+ priority: 96,
1461
+ resultType: 'phrase',
1462
+ pattern: [{ pos0: '動詞' }, { surface: 'んだ' }],
1463
+ },
1375
1464
  {
1376
1465
  name: 'verb_tara_compact',
1377
1466
  priority: 96,
@@ -1876,6 +1965,18 @@ const COLLOQUIAL_SEQUENCE_RULES = [
1876
1965
  resultType: 'fixed_expression',
1877
1966
  pattern: [{ surface: 'んだ' }, { surface: 'よ' }],
1878
1967
  },
1968
+ {
1969
+ name: 'fixed_n_da_ne',
1970
+ priority: 94,
1971
+ resultType: 'fixed_expression',
1972
+ pattern: [{ surface: 'ん' }, { surface: 'だ' }, { surface: 'ね' }],
1973
+ },
1974
+ {
1975
+ name: 'fixed_nda_ne',
1976
+ priority: 94,
1977
+ resultType: 'fixed_expression',
1978
+ pattern: [{ surface: 'んだ' }, { surface: 'ね' }],
1979
+ },
1879
1980
  {
1880
1981
  name: 'fixed_de_su_yo',
1881
1982
  priority: 94,
@@ -28,26 +28,27 @@ export class SentenceDetector {
28
28
  let match = SENTENCE_BREAKER_PATTERN.exec(s);
29
29
  while (match !== null) {
30
30
  const eos = match.index + match[0].length;
31
- if (this.parenthesisLevel(s.slice(0, eos)) === 0) {
32
- let adjustedEos = eos;
33
- if (eos < s.length) {
34
- adjustedEos += this.prohibitedBOS(s.slice(eos));
35
- }
36
- if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
37
- match = SENTENCE_BREAKER_PATTERN.exec(s);
38
- continue;
39
- }
40
- if (eos < s.length && this.isContinuousPhrase(s, eos)) {
41
- match = SENTENCE_BREAKER_PATTERN.exec(s);
42
- continue;
43
- }
44
- if (checker?.hasNonBreakWord(eos)) {
45
- match = SENTENCE_BREAKER_PATTERN.exec(s);
46
- continue;
47
- }
48
- return adjustedEos;
31
+ let adjustedEos = eos;
32
+ if (eos < s.length) {
33
+ adjustedEos += this.prohibitedBOS(s.slice(eos));
49
34
  }
50
- match = SENTENCE_BREAKER_PATTERN.exec(s);
35
+ if (this.parenthesisLevel(s.slice(0, adjustedEos)) !== 0) {
36
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
37
+ continue;
38
+ }
39
+ if (ITEMIZE_HEADER_PATTERN.test(s.slice(0, eos))) {
40
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
41
+ continue;
42
+ }
43
+ if (eos < s.length && this.isContinuousPhrase(s, eos)) {
44
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
45
+ continue;
46
+ }
47
+ if (checker?.hasNonBreakWord(adjustedEos)) {
48
+ match = SENTENCE_BREAKER_PATTERN.exec(s);
49
+ continue;
50
+ }
51
+ return adjustedEos;
51
52
  }
52
53
  if (input.length > this.limit) {
53
54
  const spaces = s.match(/^.+\s+/);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sudachi-ts",
3
- "version": "0.1.18",
3
+ "version": "0.1.20-beta.0",
4
4
  "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
5
  "keywords": [
6
6
  "morphological-analyzer",