sudachi-ts 0.1.13 → 0.1.16

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -105,7 +105,7 @@ const config = await loadConfig('./sudachi.json');
105
105
  const dict = Dictionary.create();
106
106
  ```
107
107
 
108
- Example `sudachi.json`:
108
+ Example `sudachi.json`:
109
109
 
110
110
  ```json
111
111
  {
@@ -120,10 +120,14 @@ Example `sudachi.json`:
120
120
  }
121
121
  }
122
122
  ]
123
- }
124
- ```
125
-
126
- By default, Sudachi-TS enables a built-in compound-particle lexicon
123
+ }
124
+ ```
125
+
126
+ For non-absolute file references in config (dictionary files, plugin module paths,
127
+ and built-in plugin file settings), Sudachi-TS tries paths relative to the config
128
+ file first, then relative to the current working directory.
129
+
130
+ By default, Sudachi-TS enables a built-in compound-particle lexicon
127
131
  (`"enableDefaultCompoundParticles": true`) so forms such as `かも`, `のか`,
128
132
  and `だから` are tokenized as single morphemes. Set it to `false` to disable:
129
133
 
@@ -269,7 +273,7 @@ See [PLUGINS.md](./PLUGINS.md) for detailed plugin development guide.
269
273
  Quick local comparison for the PoC token chunker plugin:
270
274
 
271
275
  ```bash
272
- bun examples/token-chunker-plugin.ts /path/to/system.dic "東京大学"
276
+ npm exec tsx examples/token-chunker-plugin.ts /path/to/system.dic "東京大学"
273
277
  ```
274
278
 
275
279
  This example prints each token as `surface/reading` so the chunking impact on
@@ -277,6 +281,9 @@ readings is visible in the baseline vs plugin outputs.
277
281
  `TokenChunkerPlugin` is designed and validated against the full Sudachi system
278
282
  dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
279
283
  when adding or tuning chunk rules.
284
+ `TokenChunkerPlugin` is intended for `SplitMode.C` tokenization; calling
285
+ `tokenize(SplitMode.A, ...)` or `tokenize(SplitMode.B, ...)` with this plugin
286
+ enabled throws an error.
280
287
 
281
288
  ## Dictionary Building
282
289
 
@@ -328,23 +335,23 @@ See [CONFIG.md](./CONFIG.md) for detailed configuration options.
328
335
 
329
336
  ## Development
330
337
 
331
- ```bash
332
- # Clone repository
333
- git clone https://github.com/your-org/sudachi-ts.git
334
- cd sudachi-ts
335
-
336
- # Install dependencies
337
- bun install
338
-
339
- # Type check
340
- bun x tsc --noEmit
341
-
342
- # Run tests
343
- bun test
344
-
345
- # Lint
346
- bun x @biomejs/biome lint --write .
347
- ```
338
+ ```bash
339
+ # Clone repository
340
+ git clone https://github.com/your-org/sudachi-ts.git
341
+ cd sudachi-ts
342
+
343
+ # Install dependencies
344
+ npm install
345
+
346
+ # Type check
347
+ npm run typecheck
348
+
349
+ # Run tests
350
+ npm test
351
+
352
+ # Lint
353
+ npm run check:fix
354
+ ```
348
355
 
349
356
  ## Architecture
350
357
 
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -15,8 +15,8 @@ export class Config {
15
15
  static async fromFile(filePath) {
16
16
  const content = await readFile(filePath, 'utf-8');
17
17
  const baseDir = dirname(filePath);
18
- const anchor = PathAnchor.filesystem(baseDir);
19
- return new Config(Settings.parse(content), anchor);
18
+ const anchor = PathAnchor.filesystem(baseDir).andThen(PathAnchor.none());
19
+ return new Config(Settings.parse(content, anchor), anchor);
20
20
  }
21
21
  static parse(json) {
22
22
  return new Config(Settings.parse(json));
@@ -61,7 +61,14 @@ export class Config {
61
61
  return this.settings.getIntList(key);
62
62
  }
63
63
  getPlugins(key) {
64
- return this.settings.getPlugins(key);
64
+ const plugins = this.settings.getPlugins(key);
65
+ if (!plugins) {
66
+ return null;
67
+ }
68
+ return plugins.map((plugin) => ({
69
+ className: plugin.className,
70
+ settings: plugin.settings.withAnchor(this.anchor),
71
+ }));
65
72
  }
66
73
  }
67
74
  export async function loadConfig(configPath) {
@@ -1,12 +1,18 @@
1
+ import { PathAnchor } from './pathAnchor.js';
1
2
  export type PluginConf<_T> = {
2
3
  className: string;
3
4
  settings: Settings;
4
5
  };
5
6
  export declare class Settings {
6
7
  private readonly data;
7
- constructor(data?: Record<string, unknown>);
8
+ private readonly anchor;
9
+ constructor(data?: Record<string, unknown>, anchor?: PathAnchor);
8
10
  static empty(): Settings;
9
- static parse(json: string, _basePath?: string): Settings;
11
+ static parse(json: string, basePathOrAnchor?: string | PathAnchor): Settings;
12
+ getAnchor(): PathAnchor;
13
+ withAnchor(anchor: PathAnchor): Settings;
14
+ getPath(key: string, defaultValue?: string): Promise<string | null>;
15
+ toObject(): Record<string, unknown>;
10
16
  getString(key: string, defaultValue?: string): string | null;
11
17
  getInt(key: string, defaultValue?: number): number;
12
18
  getBoolean(key: string, defaultValue: boolean): boolean;
@@ -1,17 +1,39 @@
1
+ import { PathAnchor } from './pathAnchor.js';
1
2
  export class Settings {
2
3
  data;
3
- constructor(data = {}) {
4
+ anchor;
5
+ constructor(data = {}, anchor = PathAnchor.none()) {
4
6
  this.data = { ...data };
7
+ this.anchor = anchor;
5
8
  }
6
9
  static empty() {
7
- return new Settings({});
10
+ return new Settings({}, PathAnchor.none());
8
11
  }
9
- static parse(json, _basePath) {
12
+ static parse(json, basePathOrAnchor) {
10
13
  const data = JSON.parse(json);
11
14
  if (typeof data !== 'object' || data === null) {
12
15
  throw new Error('root must be an object');
13
16
  }
14
- return new Settings(data);
17
+ if (typeof basePathOrAnchor === 'string') {
18
+ return new Settings(data, PathAnchor.filesystem(basePathOrAnchor).andThen(PathAnchor.none()));
19
+ }
20
+ return new Settings(data, basePathOrAnchor ?? PathAnchor.none());
21
+ }
22
+ getAnchor() {
23
+ return this.anchor;
24
+ }
25
+ withAnchor(anchor) {
26
+ return new Settings(this.data, anchor);
27
+ }
28
+ async getPath(key, defaultValue) {
29
+ const value = this.getString(key, defaultValue);
30
+ if (value === null) {
31
+ return null;
32
+ }
33
+ return await this.anchor.resolve(value);
34
+ }
35
+ toObject() {
36
+ return { ...this.data };
15
37
  }
16
38
  getString(key, defaultValue) {
17
39
  const value = this.data[key];
@@ -70,7 +92,7 @@ export class Settings {
70
92
  const obj = item;
71
93
  return {
72
94
  className: obj.class,
73
- settings: new Settings({ ...obj }),
95
+ settings: new Settings({ ...obj }, this.anchor),
74
96
  };
75
97
  }
76
98
  throw new Error(`sub-object for ${key} didn't have class key`);
@@ -79,9 +101,9 @@ export class Settings {
79
101
  return null;
80
102
  }
81
103
  withFallback(other) {
82
- return new Settings({ ...other.data, ...this.data });
104
+ return new Settings({ ...other.data, ...this.data }, this.anchor.andThen(other.anchor));
83
105
  }
84
106
  merge(overrides) {
85
- return new Settings({ ...this.data, ...overrides });
107
+ return new Settings({ ...this.data, ...overrides }, this.anchor);
86
108
  }
87
109
  }
@@ -140,6 +140,9 @@ export class JapaneseTokenizer {
140
140
  return builder.build();
141
141
  }
142
142
  tokenizeSentence(mode, input) {
143
+ for (const plugin of this.pathRewritePlugins) {
144
+ plugin.validateSplitMode(mode);
145
+ }
143
146
  this.buildLattice(input);
144
147
  const path = this.lattice.getBestPath();
145
148
  for (const plugin of this.pathRewritePlugins) {
@@ -1,3 +1,4 @@
1
+ import { readFile } from 'node:fs/promises';
1
2
  import { CategoryType } from './categoryType.js';
2
3
  export class CharacterCategory {
3
4
  static PATTERN_SPACES = /\s+/;
@@ -67,8 +68,7 @@ export class CharacterCategory {
67
68
  static async loadDefault() {
68
69
  const charCategory = new CharacterCategory();
69
70
  try {
70
- const response = await fetch(new URL('../resources/char.def', import.meta.url));
71
- const content = await response.text();
71
+ const content = await readFile(new URL('../resources/char.def', import.meta.url), 'utf-8');
72
72
  charCategory.readCharacterDefinition(content);
73
73
  }
74
74
  catch (e) {
@@ -44,8 +44,8 @@ export class DictionaryFactory {
44
44
  ensureLexiconSet().add(userDict.getLexicon());
45
45
  }
46
46
  }
47
- const loader = new PluginLoader();
48
- const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON);
47
+ const loader = new PluginLoader(anchor);
48
+ const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON).setAnchor(anchor);
49
49
  let inputTextPluginConfs = config.getPlugins('inputTextPlugin');
50
50
  if (!inputTextPluginConfs || inputTextPluginConfs.length === 0) {
51
51
  inputTextPluginConfs = defaultConfig.getPlugins('inputTextPlugin');
@@ -9,7 +9,7 @@ export class DefaultInputTextPlugin extends InputTextPlugin {
9
9
  if (this.initialized) {
10
10
  return;
11
11
  }
12
- const rewriteDefPath = this.settings.getString('rewriteDef');
12
+ const rewriteDefPath = await this.settings.getPath('rewriteDef');
13
13
  if (rewriteDefPath) {
14
14
  const content = await readFully(rewriteDefPath);
15
15
  this.readRewriteLists(content);
@@ -1,3 +1,4 @@
1
+ import { PathAnchor } from '../config/pathAnchor.js';
1
2
  import type { Settings } from '../config/settings.js';
2
3
  import type { Grammar } from '../dictionary/grammar.js';
3
4
  import type { Plugin } from './base.js';
@@ -11,6 +12,8 @@ export interface LoadedPlugin<T extends Plugin> {
11
12
  className: string;
12
13
  }
13
14
  export declare class PluginLoader {
15
+ private readonly anchor;
16
+ constructor(anchor?: PathAnchor);
14
17
  loadInputTextPlugin(className: string, settings: Settings): Promise<LoadedPlugin<InputTextPlugin>>;
15
18
  loadOovProviderPlugin(className: string, settings: Settings): Promise<LoadedPlugin<OovProviderPlugin>>;
16
19
  loadPathRewritePlugin(className: string, settings: Settings): Promise<LoadedPlugin<PathRewritePlugin>>;
@@ -35,6 +38,8 @@ export declare class PluginLoader {
35
38
  private loadPlugin;
36
39
  private findPluginClass;
37
40
  private isPluginConstructor;
41
+ private resolveClassSpecifier;
42
+ private isPathLikeSpecifier;
38
43
  private isBuiltIn;
39
44
  private getBuiltIn;
40
45
  }
@@ -1,3 +1,6 @@
1
+ import { isAbsolute, resolve } from 'node:path';
2
+ import { pathToFileURL } from 'node:url';
3
+ import { PathAnchor } from '../config/pathAnchor.js';
1
4
  import { InhibitConnectionPlugin } from './connection/inhibitConnectionPlugin.js';
2
5
  import { DefaultInputTextPlugin } from './inputText/defaultInputTextPlugin.js';
3
6
  import { IgnoreYomiganaPlugin } from './inputText/ignoreYomiganaPlugin.js';
@@ -9,6 +12,10 @@ import { JoinKatakanaOovPlugin } from './pathRewrite/joinKatakanaOovPlugin.js';
9
12
  import { JoinNumericPlugin } from './pathRewrite/joinNumericPlugin.js';
10
13
  import { TokenChunkerPlugin } from './pathRewrite/tokenChunkerPlugin.js';
11
14
  export class PluginLoader {
15
+ anchor;
16
+ constructor(anchor = PathAnchor.none()) {
17
+ this.anchor = anchor;
18
+ }
12
19
  async loadInputTextPlugin(className, settings) {
13
20
  const plugin = await this.loadPlugin(className, settings);
14
21
  return { plugin, className };
@@ -72,7 +79,8 @@ export class PluginLoader {
72
79
  PluginClass = this.getBuiltIn(className);
73
80
  }
74
81
  else {
75
- const module = await import(className);
82
+ const classSpecifier = await this.resolveClassSpecifier(className);
83
+ const module = await import(classSpecifier);
76
84
  PluginClass = this.findPluginClass(module, className);
77
85
  }
78
86
  const plugin = new PluginClass();
@@ -112,6 +120,24 @@ export class PluginLoader {
112
120
  return false;
113
121
  }
114
122
  }
123
+ async resolveClassSpecifier(className) {
124
+ if (this.anchor === PathAnchor.none() ||
125
+ !this.isPathLikeSpecifier(className)) {
126
+ return className;
127
+ }
128
+ const resolvedPath = await this.anchor.resolve(className);
129
+ const absolutePath = isAbsolute(resolvedPath)
130
+ ? resolvedPath
131
+ : resolve(resolvedPath);
132
+ return pathToFileURL(absolutePath).href;
133
+ }
134
+ isPathLikeSpecifier(className) {
135
+ return (className.startsWith('./') ||
136
+ className.startsWith('../') ||
137
+ className.startsWith('.\\') ||
138
+ className.startsWith('..\\') ||
139
+ isAbsolute(className));
140
+ }
115
141
  isBuiltIn(name) {
116
142
  return (name in BUILT_IN_PLUGINS || name.split('.').pop() in BUILT_IN_PLUGINS);
117
143
  }
@@ -11,12 +11,12 @@ export class MeCabOovProviderPlugin extends OovProviderPlugin {
11
11
  if (this.initialized) {
12
12
  return;
13
13
  }
14
- const charDefPath = this.settings.getString('charDef');
14
+ const charDefPath = await this.settings.getPath('charDef');
15
15
  if (charDefPath) {
16
16
  const content = await readFully(charDefPath);
17
17
  this.readCharacterProperty(content);
18
18
  }
19
- const unkDefPath = this.settings.getString('unkDef');
19
+ const unkDefPath = await this.settings.getPath('unkDef');
20
20
  const userPosMode = this.settings.getString(OovProviderPlugin.USER_POS, OovProviderPlugin.USER_POS_FORBID) ?? OovProviderPlugin.USER_POS_FORBID;
21
21
  if (unkDefPath) {
22
22
  const content = await readFully(unkDefPath);
@@ -1,10 +1,12 @@
1
1
  import type { InputText } from '../../core/inputText.js';
2
2
  import type { Lattice, LatticeNode } from '../../core/lattice.js';
3
+ import type { SplitMode } from '../../core/tokenizer.js';
3
4
  import type { CategoryType } from '../../dictionary/categoryType.js';
4
5
  import type { Grammar } from '../../dictionary/grammar.js';
5
6
  import { Plugin } from '../base.js';
6
7
  export declare abstract class PathRewritePlugin extends Plugin {
7
8
  setUp(_grammar: Grammar): void;
9
+ validateSplitMode(_mode: SplitMode): void;
8
10
  abstract rewrite(text: InputText, path: LatticeNode[], lattice: Lattice): void;
9
11
  concatenate(path: LatticeNode[], begin: number, end: number, lattice: Lattice, normalizedForm?: string | null): LatticeNode;
10
12
  concatenateOov(path: LatticeNode[], begin: number, end: number, posId: number, lattice: Lattice): LatticeNode;
@@ -2,6 +2,7 @@ import { WordInfo } from '../../dictionary/wordInfo.js';
2
2
  import { Plugin } from '../base.js';
3
3
  export class PathRewritePlugin extends Plugin {
4
4
  setUp(_grammar) { }
5
+ validateSplitMode(_mode) { }
5
6
  concatenate(path, begin, end, lattice, normalizedForm = null) {
6
7
  if (begin >= end) {
7
8
  throw new Error('begin >= end');
@@ -1,5 +1,6 @@
1
1
  import type { InputText } from '../../core/inputText.js';
2
2
  import type { Lattice, LatticeNode } from '../../core/lattice.js';
3
+ import { SplitMode } from '../../core/tokenizer.js';
3
4
  import type { Grammar } from '../../dictionary/grammar.js';
4
5
  import { PathRewritePlugin } from './base.js';
5
6
  export declare class TokenChunkerPlugin extends PathRewritePlugin {
@@ -7,6 +8,7 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
7
8
  private enablePatternRules;
8
9
  private enableBroadRules;
9
10
  setUp(grammar: Grammar): void;
11
+ validateSplitMode(mode: SplitMode): void;
10
12
  rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
11
13
  private toInitialChunks;
12
14
  private applyPatternStage;
@@ -1,3 +1,4 @@
1
+ import { SplitMode } from '../../core/tokenizer.js';
1
2
  import { WordInfo } from '../../dictionary/wordInfo.js';
2
3
  import { PathRewritePlugin } from './base.js';
3
4
  export class TokenChunkerPlugin extends PathRewritePlugin {
@@ -9,6 +10,11 @@ export class TokenChunkerPlugin extends PathRewritePlugin {
9
10
  this.enablePatternRules = this.settings.getBoolean('enablePatternRules', true);
10
11
  this.enableBroadRules = this.settings.getBoolean('enableBroadRules', false);
11
12
  }
13
+ validateSplitMode(mode) {
14
+ if (mode !== SplitMode.C) {
15
+ throw new Error('TokenChunkerPlugin requires SplitMode.C. Use tokenizer.tokenize(text) or tokenizer.tokenize(SplitMode.C, text).');
16
+ }
17
+ }
12
18
  rewrite(_text, path, lattice) {
13
19
  if (path.length === 0) {
14
20
  return;
package/package.json CHANGED
@@ -1,65 +1,69 @@
1
- {
2
- "name": "sudachi-ts",
3
- "version": "0.1.13",
4
- "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
- "keywords": [
6
- "morphological-analyzer",
7
- "nlp",
8
- "japanese",
9
- "tokenization",
10
- "natural-language-processing",
11
- "text-processing"
12
- ],
13
- "author": "Glen Stampoultzis",
14
- "license": "Apache-2.0",
15
- "repository": {
16
- "type": "git",
17
- "url": "https://github.com/gstamp/sudachi-ts.git",
18
- "directory": "sudachi-ts"
19
- },
20
- "bugs": "https://github.com/gstamp/sudachi-ts/issues",
21
- "homepage": "https://github.com/gstamp/sudachi-ts#readme",
22
- "type": "module",
23
- "main": "./build/src/index.js",
24
- "types": "./build/src/index.d.ts",
25
- "exports": {
26
- ".": "./build/src/index.js",
27
- "./dictionary": "./build/src/dictionary/index.js",
28
- "./config": "./build/src/config/index.js",
29
- "./plugins": "./build/src/plugins/index.js"
30
- },
31
- "bin": {
32
- "sudachi": "./build/bin/sudachi.js",
33
- "sudachi-build-system": "./build/bin/sudachi-build-system.js",
34
- "sudachi-build-user": "./build/bin/sudachi-build-user.js",
35
- "sudachi-print-dict": "./build/bin/sudachi-print-dict.js",
36
- "sudachi-print-header": "./build/bin/sudachi-print-header.js"
37
- },
38
- "files": [
39
- "build/",
40
- "README.md",
41
- "LICENSE"
42
- ],
43
- "engines": {
44
- "node": ">=18.0.0"
45
- },
1
+ {
2
+ "name": "sudachi-ts",
3
+ "version": "0.1.16",
4
+ "description": "TypeScript port of Sudachi morphological analyzer for Japanese text",
5
+ "keywords": [
6
+ "morphological-analyzer",
7
+ "nlp",
8
+ "japanese",
9
+ "tokenization",
10
+ "natural-language-processing",
11
+ "text-processing"
12
+ ],
13
+ "author": "Glen Stampoultzis",
14
+ "license": "Apache-2.0",
15
+ "repository": {
16
+ "type": "git",
17
+ "url": "https://github.com/gstamp/sudachi-ts.git",
18
+ "directory": "sudachi-ts"
19
+ },
20
+ "bugs": "https://github.com/gstamp/sudachi-ts/issues",
21
+ "homepage": "https://github.com/gstamp/sudachi-ts#readme",
22
+ "type": "module",
23
+ "main": "./build/src/index.js",
24
+ "types": "./build/src/index.d.ts",
25
+ "exports": {
26
+ ".": "./build/src/index.js",
27
+ "./dictionary": "./build/src/dictionary/index.js",
28
+ "./config": "./build/src/config/index.js",
29
+ "./plugins": "./build/src/plugins/index.js"
30
+ },
31
+ "bin": {
32
+ "sudachi": "./build/bin/sudachi.js",
33
+ "sudachi-build-system": "./build/bin/sudachi-build-system.js",
34
+ "sudachi-build-user": "./build/bin/sudachi-build-user.js",
35
+ "sudachi-print-dict": "./build/bin/sudachi-print-dict.js",
36
+ "sudachi-print-header": "./build/bin/sudachi-print-header.js"
37
+ },
38
+ "files": [
39
+ "build/",
40
+ "README.md",
41
+ "LICENSE"
42
+ ],
43
+ "engines": {
44
+ "node": ">=18.0.0"
45
+ },
46
46
  "scripts": {
47
47
  "build": "tsc --project tsconfig.build.json",
48
- "build:clean": "rm -rf build && npm run build",
48
+ "build:clean": "node -e \"require('node:fs').rmSync('build', { recursive: true, force: true })\" && npm run build",
49
49
  "prepack": "npm run build:clean",
50
50
  "lint": "biome lint src/",
51
51
  "format": "biome format src/",
52
- "check": "biome check src/ && tsc --noEmit",
53
- "check:fix": "biome check --write src/ && tsc --noEmit",
52
+ "typecheck": "tsc --noEmit",
53
+ "check": "biome check src/ && npm run typecheck",
54
+ "check:fix": "biome check --write src/ && npm run typecheck",
55
+ "test": "vitest run",
56
+ "test:watch": "vitest",
54
57
  "release": "./scripts/release.sh"
55
58
  },
56
59
  "devDependencies": {
57
60
  "@biomejs/biome": "^2.3.14",
58
- "@types/bun": "^1.1.0",
59
61
  "@types/node": "^22.0.0",
60
- "typescript": "^5.7.0"
62
+ "tsx": "^4.20.6",
63
+ "typescript": "^5.7.0",
64
+ "vitest": "^3.2.4"
61
65
  },
62
- "peerDependencies": {
63
- "typescript": "^5.0.0"
64
- }
65
- }
66
+ "peerDependencies": {
67
+ "typescript": "^5.0.0"
68
+ }
69
+ }