sudachi-ts 0.1.15 → 0.1.17

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package versions exactly as they appear in their respective public registries.
package/README.md CHANGED
@@ -281,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
281
281
  `TokenChunkerPlugin` is designed and validated against the full Sudachi system
282
282
  dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
283
283
  when adding or tuning chunk rules.
284
+ `TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
285
+ `tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
286
+ enabled throws an error.
284
287
 
285
288
  ## Dictionary Building
286
289
 
@@ -140,6 +140,9 @@ export class JapaneseTokenizer {
140
140
  return builder.build();
141
141
  }
142
142
  tokenizeSentence(mode, input) {
143
+ for (const plugin of this.pathRewritePlugins) {
144
+ plugin.validateSplitMode(mode);
145
+ }
143
146
  this.buildLattice(input);
144
147
  const path = this.lattice.getBestPath();
145
148
  for (const plugin of this.pathRewritePlugins) {
@@ -8,5 +8,5 @@ export declare class SimpleOovProviderPlugin extends OovProviderPlugin {
8
8
  private rightId;
9
9
  private cost;
10
10
  setUp(grammar: Grammar): void;
11
- provideOOV(inputText: InputText, _offset: number, otherWords: number, result: LatticeNodeImpl[]): number;
11
+ provideOOV(inputText: InputText, offset: number, otherWords: number, result: LatticeNodeImpl[]): number;
12
12
  }
@@ -18,12 +18,12 @@ export class SimpleOovProviderPlugin extends OovProviderPlugin {
18
18
  const userPosMode = this.settings.getString(OovProviderPlugin.USER_POS, OovProviderPlugin.USER_POS_FORBID) ?? OovProviderPlugin.USER_POS_FORBID;
19
19
  this.oovPOSId = this.posIdOf(grammar, pos, userPosMode);
20
20
  }
21
- provideOOV(inputText, _offset, otherWords, result) {
21
+ provideOOV(inputText, offset, otherWords, result) {
22
22
  if (otherWords === 0) {
23
23
  const node = this.createNode();
24
24
  node.setParameter(this.leftId, this.rightId, this.cost);
25
- const length = inputText.getWordCandidateLength(0);
26
- const s = inputText.getSubstring(0, length);
25
+ const length = inputText.getWordCandidateLength(offset);
26
+ const s = inputText.getSubstring(offset, offset + length);
27
27
  const info = new WordInfo(s, length, this.oovPOSId, s, s, '');
28
28
  node.setWordInfo(info);
29
29
  result.push(node);
@@ -1,10 +1,12 @@
1
1
  import type { InputText } from '../../core/inputText.js';
2
2
  import type { Lattice, LatticeNode } from '../../core/lattice.js';
3
+ import type { SplitMode } from '../../core/tokenizer.js';
3
4
  import type { CategoryType } from '../../dictionary/categoryType.js';
4
5
  import type { Grammar } from '../../dictionary/grammar.js';
5
6
  import { Plugin } from '../base.js';
6
7
  export declare abstract class PathRewritePlugin extends Plugin {
7
8
  setUp(_grammar: Grammar): void;
9
+ validateSplitMode(_mode: SplitMode): void;
8
10
  abstract rewrite(text: InputText, path: LatticeNode[], lattice: Lattice): void;
9
11
  concatenate(path: LatticeNode[], begin: number, end: number, lattice: Lattice, normalizedForm?: string | null): LatticeNode;
10
12
  concatenateOov(path: LatticeNode[], begin: number, end: number, posId: number, lattice: Lattice): LatticeNode;
@@ -2,6 +2,7 @@ import { WordInfo } from '../../dictionary/wordInfo.js';
2
2
  import { Plugin } from '../base.js';
3
3
  export class PathRewritePlugin extends Plugin {
4
4
  setUp(_grammar) { }
5
+ validateSplitMode(_mode) { }
5
6
  concatenate(path, begin, end, lattice, normalizedForm = null) {
6
7
  if (begin >= end) {
7
8
  throw new Error('begin >= end');
@@ -1,5 +1,6 @@
1
1
  import type { InputText } from '../../core/inputText.js';
2
2
  import type { Lattice, LatticeNode } from '../../core/lattice.js';
3
+ import { SplitMode } from '../../core/tokenizer.js';
3
4
  import type { Grammar } from '../../dictionary/grammar.js';
4
5
  import { PathRewritePlugin } from './base.js';
5
6
  export declare class TokenChunkerPlugin extends PathRewritePlugin {
@@ -7,6 +8,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
7
8
  private enablePatternRules;
8
9
  private enableBroadRules;
9
10
  setUp(grammar: Grammar): void;
11
+ validateSplitMode(mode: SplitMode): void;
10
12
  rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
11
13
  private toInitialChunks;
12
14
  private applyPatternStage;
@@ -1,3 +1,4 @@
1
+ import { SplitMode } from '../../core/tokenizer.js';
1
2
  import { WordInfo } from '../../dictionary/wordInfo.js';
2
3
  import { PathRewritePlugin } from './base.js';
3
4
  export class TokenChunkerPlugin extends PathRewritePlugin {
@@ -9,6 +10,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
9
10
  this.enablePatternRules = this.settings.getBoolean('enablePatternRules', true);
10
11
  this.enableBroadRules = this.settings.getBoolean('enableBroadRules', false);
11
12
  }
13
+ validateSplitMode(mode) {
14
+ if (mode !== SplitMode.C) {
15
+ throw new Error('TokenChunkerPlugin requires SplitMode.C. Use tokenizer.tokenize(text) or tokenizer.tokenize(SplitMode.C, text).');
16
+ }
17
+ }
12
18
  rewrite(_text, path, lattice) {
13
19
  if (path.length === 0) {
14
20
  return;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "sudachi-ts",
3
- "version": "0.1.15",
3
+ "version": "0.1.17",
4
4
  "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
5
  "keywords": [
6
6
  "morphological-analyzer",