npm - sudachi-ts - Versions diffs - 0.1.15 → 0.1.17 - Mend

sudachi-ts 0.1.15 → 0.1.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9)

package/README.md CHANGED Viewed

@@ -281,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
 `TokenChunkerPlugin` is designed and validated against the full Sudachi system
 dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
 when adding or tuning chunk rules.
+`TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
+`tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
+enabled throws an error.
 ## Dictionary Building

package/build/src/core/japaneseTokenizer.js CHANGED Viewed

@@ -140,6 +140,9 @@ export class JapaneseTokenizer {
         return builder.build();
     }
     tokenizeSentence(mode, input) {
+        for (const plugin of this.pathRewritePlugins) {
+            plugin.validateSplitMode(mode);
+        }
         this.buildLattice(input);
         const path = this.lattice.getBestPath();
         for (const plugin of this.pathRewritePlugins) {

package/build/src/plugins/oov/simpleOovProviderPlugin.d.ts CHANGED Viewed

@@ -8,5 +8,5 @@ export declare class SimpleOovProviderPlugin extends OovProviderPlugin {
     private rightId;
     private cost;
     setUp(grammar: Grammar): void;
-    provideOOV(inputText: InputText, _offset: number, otherWords: number, result: LatticeNodeImpl[]): number;
+    provideOOV(inputText: InputText, offset: number, otherWords: number, result: LatticeNodeImpl[]): number;
 }

package/build/src/plugins/oov/simpleOovProviderPlugin.js CHANGED Viewed

@@ -18,12 +18,12 @@ export class SimpleOovProviderPlugin extends OovProviderPlugin {
         const userPosMode = this.settings.getString(OovProviderPlugin.USER_POS, OovProviderPlugin.USER_POS_FORBID) ?? OovProviderPlugin.USER_POS_FORBID;
         this.oovPOSId = this.posIdOf(grammar, pos, userPosMode);
     }
-    provideOOV(inputText, _offset, otherWords, result) {
+    provideOOV(inputText, offset, otherWords, result) {
         if (otherWords === 0) {
             const node = this.createNode();
             node.setParameter(this.leftId, this.rightId, this.cost);
-            const length = inputText.getWordCandidateLength(0);
-            const s = inputText.getSubstring(0, length);
+            const length = inputText.getWordCandidateLength(offset);
+            const s = inputText.getSubstring(offset, offset + length);
             const info = new WordInfo(s, length, this.oovPOSId, s, s, '');
             node.setWordInfo(info);
             result.push(node);

package/build/src/plugins/pathRewrite/base.d.ts CHANGED Viewed

@@ -1,10 +1,12 @@
 import type { InputText } from '../../core/inputText.js';
 import type { Lattice, LatticeNode } from '../../core/lattice.js';
+import type { SplitMode } from '../../core/tokenizer.js';
 import type { CategoryType } from '../../dictionary/categoryType.js';
 import type { Grammar } from '../../dictionary/grammar.js';
 import { Plugin } from '../base.js';
 export declare abstract class PathRewritePlugin extends Plugin {
     setUp(_grammar: Grammar): void;
+    validateSplitMode(_mode: SplitMode): void;
     abstract rewrite(text: InputText, path: LatticeNode[], lattice: Lattice): void;
     concatenate(path: LatticeNode[], begin: number, end: number, lattice: Lattice, normalizedForm?: string | null): LatticeNode;
     concatenateOov(path: LatticeNode[], begin: number, end: number, posId: number, lattice: Lattice): LatticeNode;

package/build/src/plugins/pathRewrite/base.js CHANGED Viewed

@@ -2,6 +2,7 @@ import { WordInfo } from '../../dictionary/wordInfo.js';
 import { Plugin } from '../base.js';
 export class PathRewritePlugin extends Plugin {
     setUp(_grammar) { }
+    validateSplitMode(_mode) { }
     concatenate(path, begin, end, lattice, normalizedForm = null) {
         if (begin >= end) {
             throw new Error('begin >= end');

package/build/src/plugins/pathRewrite/tokenChunkerPlugin.d.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { InputText } from '../../core/inputText.js';
 import type { Lattice, LatticeNode } from '../../core/lattice.js';
+import { SplitMode } from '../../core/tokenizer.js';
 import type { Grammar } from '../../dictionary/grammar.js';
 import { PathRewritePlugin } from './base.js';
 export declare class TokenChunkerPlugin extends PathRewritePlugin {
@@ -7,6 +8,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
     private enablePatternRules;
     private enableBroadRules;
     setUp(grammar: Grammar): void;
+    validateSplitMode(mode: SplitMode): void;
     rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
     private toInitialChunks;
     private applyPatternStage;

package/build/src/plugins/pathRewrite/tokenChunkerPlugin.js CHANGED Viewed

@@ -1,3 +1,4 @@
+import { SplitMode } from '../../core/tokenizer.js';
 import { WordInfo } from '../../dictionary/wordInfo.js';
 import { PathRewritePlugin } from './base.js';
 export class TokenChunkerPlugin extends PathRewritePlugin {
@@ -9,6 +10,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
         this.enablePatternRules = this.settings.getBoolean('enablePatternRules', true);
         this.enableBroadRules = this.settings.getBoolean('enableBroadRules', false);
     }
+    validateSplitMode(mode) {
+        if (mode !== SplitMode.C) {
+            throw new Error('TokenChunkerPlugin requires SplitMode.C. Use tokenizer.tokenize(text) or tokenizer.tokenize(SplitMode.C, text).');
+        }
+    }
     rewrite(_text, path, lattice) {
         if (path.length === 0) {
             return;

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "sudachi-ts",
-  "version": "0.1.15",
+  "version": "0.1.17",
   "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
   "keywords": [
     "morphological-analyzer",