sudachi-ts 0.1.18 → 0.1.19

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -281,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
281
281
  `TokenChunkerPlugin` is designed and validated against the full Sudachi system
282
282
  dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
283
283
  when adding or tuning chunk rules.
284
+ `TokenChunkerPlugin` requires `enableDefaultCompoundParticles: true`. Dictionary
285
+ creation throws an error when this plugin is configured with default compound
286
+ particles disabled.
284
287
  `TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
285
288
  `tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
286
289
  enabled throws an error.
@@ -6,6 +6,12 @@ import { PluginLoader } from '../plugins/loader.js';
6
6
  import { BinaryDictionary } from './binaryDictionary.js';
7
7
  import { loadDefaultCompoundLexicon } from './defaultCompoundLexicon.js';
8
8
  import { LexiconSet } from './lexiconSet.js';
9
/**
 * Report whether a plugin class name refers to `TokenChunkerPlugin`.
 *
 * Only the last dot-separated segment is compared, so the bare name and any
 * package-qualified form (for example
 * `com.worksap.nlp.sudachi.TokenChunkerPlugin`) are both accepted.
 *
 * @param {unknown} className - Class name taken from a plugin configuration entry.
 * @returns {boolean} `true` when the final segment is exactly `TokenChunkerPlugin`.
 */
function isTokenChunkerPlugin(className) {
  // A missing or non-string class name cannot name the plugin; answer `false`
  // rather than failing with a TypeError from `.split`.
  if (typeof className !== 'string') {
    return false;
  }
  return className.split('.').pop() === 'TokenChunkerPlugin';
}
9
15
  export class DictionaryFactory {
10
16
  async create(configPath, customConfig) {
11
17
  const config = customConfig || (await loadConfig(configPath));
@@ -76,6 +82,10 @@ export class DictionaryFactory {
76
82
  if (!pathRewritePluginConfs || pathRewritePluginConfs.length === 0) {
77
83
  pathRewritePluginConfs = defaultConfig.getPlugins('pathRewritePlugin');
78
84
  }
85
+ if (!enableDefaultCompoundParticles &&
86
+ (pathRewritePluginConfs || []).some((conf) => isTokenChunkerPlugin(conf.className))) {
87
+ throw new Error('TokenChunkerPlugin is only compatible when enableDefaultCompoundParticles is true.');
88
+ }
79
89
  const pathRewritePlugins = (await loader.loadPathRewritePlugins(pathRewritePluginConfs || [], grammar)).map((p) => p.plugin);
80
90
  return new Dictionary(grammar, lexicon, inputTextPlugins, oovProviderPlugins, pathRewritePlugins);
81
91
  }
@@ -28,6 +28,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
28
28
  private isCounterChunk;
29
29
  private mergeCounterChunks;
30
30
  private applyMergeStage;
31
+ private shouldMergeAttributiveKana;
31
32
  private applyInlineRubyExactStage;
32
33
  private applyInlineRubyPrefixStage;
33
34
  private shouldMergeInlineRubyExact;
@@ -294,10 +294,16 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
294
294
  while (i < chunks.length - 1) {
295
295
  const current = chunks[i];
296
296
  const next = chunks[i + 1];
297
+ const following = chunks[i + 2];
297
298
  if (!current || !next) {
298
299
  i++;
299
300
  continue;
300
301
  }
302
+ if (this.shouldMergeAttributiveKana(current, next, following)) {
303
+ const merged = this.mergeChunks([current, next], 'phrase');
304
+ chunks.splice(i, 2, merged);
305
+ continue;
306
+ }
301
307
  if ((current.chunkType === 'te_form' ||
302
308
  current.chunkType === 'suru_verb_te_form') &&
303
309
  (next.chunkType === 'single_token' ||
@@ -321,6 +327,24 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
321
327
  }
322
328
  return chunks;
323
329
  }
330
+ shouldMergeAttributiveKana(current, next, following) {
331
+ if (next.surface !== 'かな') {
332
+ return false;
333
+ }
334
+ const nextPos = this.getPosById(next.posId);
335
+ if (!nextPos || nextPos[0] !== '助詞' || nextPos[1] !== '終助詞') {
336
+ return false;
337
+ }
338
+ const currentPos0 = this.getPosById(current.posId)?.[0] ?? '';
339
+ if (!['名詞', '形状詞'].includes(currentPos0)) {
340
+ return false;
341
+ }
342
+ if (!following) {
343
+ return false;
344
+ }
345
+ const followingPos0 = this.getPosById(following.posId)?.[0] ?? '';
346
+ return ['名詞', '代名詞'].includes(followingPos0);
347
+ }
324
348
  applyInlineRubyExactStage(source) {
325
349
  const chunks = [...source];
326
350
  let i = 0;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sudachi-ts",
3
- "version": "0.1.18",
3
+ "version": "0.1.19",
4
4
  "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
5
  "keywords": [
6
6
  "morphological-analyzer",