sudachi-ts 0.1.11 → 0.1.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -105,7 +105,7 @@ const config = await loadConfig('./sudachi.json');
105
105
  const dict = Dictionary.create();
106
106
  ```
107
107
 
108
- Example `sudachi.json`:
108
+ Example `sudachi.json`:
109
109
 
110
110
  ```json
111
111
  {
@@ -120,10 +120,14 @@ Example `sudachi.json`:
120
120
  }
121
121
  }
122
122
  ]
123
- }
124
- ```
125
-
126
- By default, Sudachi-TS enables a built-in compound-particle lexicon
123
+ }
124
+ ```
125
+
126
+ For non-absolute file references in config (dictionary files, plugin module paths,
127
+ and built-in plugin file settings), Sudachi-TS tries paths relative to the config
128
+ file first, then relative to the current working directory.
129
+
130
+ By default, Sudachi-TS enables a built-in compound-particle lexicon
127
131
  (`"enableDefaultCompoundParticles": true`) so forms such as `かも`, `のか`,
128
132
  and `だから` are tokenized as single morphemes. Set it to `false` to disable:
129
133
 
@@ -266,16 +270,19 @@ const plugin = await loader.loadInputTextPlugin(
266
270
 
267
271
  See [PLUGINS.md](./PLUGINS.md) for a detailed plugin development guide.
268
272
 
269
- Quick local comparison for the PoC token chunker plugin:
270
-
271
- ```bash
272
- bun examples/token-chunker-plugin.ts /path/to/system.dic "東京大学"
273
- ```
274
-
275
- This example prints each token as `surface/reading` so the chunking impact on
276
- readings is visible in the baseline vs plugin outputs.
277
-
278
- ## Dictionary Building
273
+ Quick local comparison for the PoC token chunker plugin:
274
+
275
+ ```bash
276
+ npm exec tsx examples/token-chunker-plugin.ts /path/to/system.dic "東京大学"
277
+ ```
278
+
279
+ This example prints each token as `surface/reading` so the chunking impact on
280
+ readings is visible in the baseline vs plugin outputs.
281
+ `TokenChunkerPlugin` is designed and validated against the full Sudachi system
282
+ dictionary (`system_full.dic` / `system.dic`), so prefer full-dictionary checks
283
+ when adding or tuning chunk rules.
284
+
285
+ ## Dictionary Building
279
286
 
280
287
  Build binary dictionaries from CSV source:
281
288
 
@@ -325,23 +332,23 @@ See [CONFIG.md](./CONFIG.md) for detailed configuration options.
325
332
 
326
333
  ## Development
327
334
 
328
- ```bash
329
- # Clone repository
330
- git clone https://github.com/your-org/sudachi-ts.git
331
- cd sudachi-ts
332
-
333
- # Install dependencies
334
- bun install
335
-
336
- # Type check
337
- bun x tsc --noEmit
338
-
339
- # Run tests
340
- bun test
341
-
342
- # Lint
343
- bun x @biomejs/biome lint --write .
344
- ```
335
+ ```bash
336
+ # Clone repository
337
+ git clone https://github.com/your-org/sudachi-ts.git
338
+ cd sudachi-ts
339
+
340
+ # Install dependencies
341
+ npm install
342
+
343
+ # Type check
344
+ npm run typecheck
345
+
346
+ # Run tests
347
+ npm test
348
+
349
+ # Lint
350
+ npm run check:fix
351
+ ```
345
352
 
346
353
  ## Architecture
347
354
 
File without changes
File without changes
File without changes
File without changes
File without changes
@@ -15,8 +15,8 @@ export class Config {
15
15
  static async fromFile(filePath) {
16
16
  const content = await readFile(filePath, 'utf-8');
17
17
  const baseDir = dirname(filePath);
18
- const anchor = PathAnchor.filesystem(baseDir);
19
- return new Config(Settings.parse(content), anchor);
18
+ const anchor = PathAnchor.filesystem(baseDir).andThen(PathAnchor.none());
19
+ return new Config(Settings.parse(content, anchor), anchor);
20
20
  }
21
21
  static parse(json) {
22
22
  return new Config(Settings.parse(json));
@@ -61,7 +61,14 @@ export class Config {
61
61
  return this.settings.getIntList(key);
62
62
  }
63
63
  getPlugins(key) {
64
- return this.settings.getPlugins(key);
64
+ const plugins = this.settings.getPlugins(key);
65
+ if (!plugins) {
66
+ return null;
67
+ }
68
+ return plugins.map((plugin) => ({
69
+ className: plugin.className,
70
+ settings: plugin.settings.withAnchor(this.anchor),
71
+ }));
65
72
  }
66
73
  }
67
74
  export async function loadConfig(configPath) {
@@ -1,12 +1,18 @@
1
+ import { PathAnchor } from './pathAnchor.js';
1
2
  export type PluginConf<_T> = {
2
3
  className: string;
3
4
  settings: Settings;
4
5
  };
5
6
  export declare class Settings {
6
7
  private readonly data;
7
- constructor(data?: Record<string, unknown>);
8
+ private readonly anchor;
9
+ constructor(data?: Record<string, unknown>, anchor?: PathAnchor);
8
10
  static empty(): Settings;
9
- static parse(json: string, _basePath?: string): Settings;
11
+ static parse(json: string, basePathOrAnchor?: string | PathAnchor): Settings;
12
+ getAnchor(): PathAnchor;
13
+ withAnchor(anchor: PathAnchor): Settings;
14
+ getPath(key: string, defaultValue?: string): Promise<string | null>;
15
+ toObject(): Record<string, unknown>;
10
16
  getString(key: string, defaultValue?: string): string | null;
11
17
  getInt(key: string, defaultValue?: number): number;
12
18
  getBoolean(key: string, defaultValue: boolean): boolean;
@@ -1,17 +1,39 @@
1
+ import { PathAnchor } from './pathAnchor.js';
1
2
  export class Settings {
2
3
  data;
3
- constructor(data = {}) {
4
+ anchor;
5
+ constructor(data = {}, anchor = PathAnchor.none()) {
4
6
  this.data = { ...data };
7
+ this.anchor = anchor;
5
8
  }
6
9
  static empty() {
7
- return new Settings({});
10
+ return new Settings({}, PathAnchor.none());
8
11
  }
9
- static parse(json, _basePath) {
12
+ static parse(json, basePathOrAnchor) {
10
13
  const data = JSON.parse(json);
11
14
  if (typeof data !== 'object' || data === null) {
12
15
  throw new Error('root must be an object');
13
16
  }
14
- return new Settings(data);
17
+ if (typeof basePathOrAnchor === 'string') {
18
+ return new Settings(data, PathAnchor.filesystem(basePathOrAnchor).andThen(PathAnchor.none()));
19
+ }
20
+ return new Settings(data, basePathOrAnchor ?? PathAnchor.none());
21
+ }
22
+ getAnchor() {
23
+ return this.anchor;
24
+ }
25
+ withAnchor(anchor) {
26
+ return new Settings(this.data, anchor);
27
+ }
28
+ async getPath(key, defaultValue) {
29
+ const value = this.getString(key, defaultValue);
30
+ if (value === null) {
31
+ return null;
32
+ }
33
+ return await this.anchor.resolve(value);
34
+ }
35
+ toObject() {
36
+ return { ...this.data };
15
37
  }
16
38
  getString(key, defaultValue) {
17
39
  const value = this.data[key];
@@ -70,7 +92,7 @@ export class Settings {
70
92
  const obj = item;
71
93
  return {
72
94
  className: obj.class,
73
- settings: new Settings({ ...obj }),
95
+ settings: new Settings({ ...obj }, this.anchor),
74
96
  };
75
97
  }
76
98
  throw new Error(`sub-object for ${key} didn't have class key`);
@@ -79,9 +101,9 @@ export class Settings {
79
101
  return null;
80
102
  }
81
103
  withFallback(other) {
82
- return new Settings({ ...other.data, ...this.data });
104
+ return new Settings({ ...other.data, ...this.data }, this.anchor.andThen(other.anchor));
83
105
  }
84
106
  merge(overrides) {
85
- return new Settings({ ...this.data, ...overrides });
107
+ return new Settings({ ...this.data, ...overrides }, this.anchor);
86
108
  }
87
109
  }
@@ -1,3 +1,4 @@
1
+ import { readFile } from 'node:fs/promises';
1
2
  import { CategoryType } from './categoryType.js';
2
3
  export class CharacterCategory {
3
4
  static PATTERN_SPACES = /\s+/;
@@ -67,8 +68,7 @@ export class CharacterCategory {
67
68
  static async loadDefault() {
68
69
  const charCategory = new CharacterCategory();
69
70
  try {
70
- const response = await fetch(new URL('../resources/char.def', import.meta.url));
71
- const content = await response.text();
71
+ const content = await readFile(new URL('../resources/char.def', import.meta.url), 'utf-8');
72
72
  charCategory.readCharacterDefinition(content);
73
73
  }
74
74
  catch (e) {
@@ -44,8 +44,8 @@ export class DictionaryFactory {
44
44
  ensureLexiconSet().add(userDict.getLexicon());
45
45
  }
46
46
  }
47
- const loader = new PluginLoader();
48
- const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON);
47
+ const loader = new PluginLoader(anchor);
48
+ const defaultConfig = Config.parse(DEFAULT_CONFIG_JSON).setAnchor(anchor);
49
49
  let inputTextPluginConfs = config.getPlugins('inputTextPlugin');
50
50
  if (!inputTextPluginConfs || inputTextPluginConfs.length === 0) {
51
51
  inputTextPluginConfs = defaultConfig.getPlugins('inputTextPlugin');
@@ -9,7 +9,7 @@ export class DefaultInputTextPlugin extends InputTextPlugin {
9
9
  if (this.initialized) {
10
10
  return;
11
11
  }
12
- const rewriteDefPath = this.settings.getString('rewriteDef');
12
+ const rewriteDefPath = await this.settings.getPath('rewriteDef');
13
13
  if (rewriteDefPath) {
14
14
  const content = await readFully(rewriteDefPath);
15
15
  this.readRewriteLists(content);
@@ -1,3 +1,4 @@
1
+ import { PathAnchor } from '../config/pathAnchor.js';
1
2
  import type { Settings } from '../config/settings.js';
2
3
  import type { Grammar } from '../dictionary/grammar.js';
3
4
  import type { Plugin } from './base.js';
@@ -11,6 +12,8 @@ export interface LoadedPlugin<T extends Plugin> {
11
12
  className: string;
12
13
  }
13
14
  export declare class PluginLoader {
15
+ private readonly anchor;
16
+ constructor(anchor?: PathAnchor);
14
17
  loadInputTextPlugin(className: string, settings: Settings): Promise<LoadedPlugin<InputTextPlugin>>;
15
18
  loadOovProviderPlugin(className: string, settings: Settings): Promise<LoadedPlugin<OovProviderPlugin>>;
16
19
  loadPathRewritePlugin(className: string, settings: Settings): Promise<LoadedPlugin<PathRewritePlugin>>;
@@ -35,6 +38,8 @@ export declare class PluginLoader {
35
38
  private loadPlugin;
36
39
  private findPluginClass;
37
40
  private isPluginConstructor;
41
+ private resolveClassSpecifier;
42
+ private isPathLikeSpecifier;
38
43
  private isBuiltIn;
39
44
  private getBuiltIn;
40
45
  }
@@ -1,3 +1,6 @@
1
+ import { isAbsolute, resolve } from 'node:path';
2
+ import { pathToFileURL } from 'node:url';
3
+ import { PathAnchor } from '../config/pathAnchor.js';
1
4
  import { InhibitConnectionPlugin } from './connection/inhibitConnectionPlugin.js';
2
5
  import { DefaultInputTextPlugin } from './inputText/defaultInputTextPlugin.js';
3
6
  import { IgnoreYomiganaPlugin } from './inputText/ignoreYomiganaPlugin.js';
@@ -9,6 +12,10 @@ import { JoinKatakanaOovPlugin } from './pathRewrite/joinKatakanaOovPlugin.js';
9
12
  import { JoinNumericPlugin } from './pathRewrite/joinNumericPlugin.js';
10
13
  import { TokenChunkerPlugin } from './pathRewrite/tokenChunkerPlugin.js';
11
14
  export class PluginLoader {
15
+ anchor;
16
+ constructor(anchor = PathAnchor.none()) {
17
+ this.anchor = anchor;
18
+ }
12
19
  async loadInputTextPlugin(className, settings) {
13
20
  const plugin = await this.loadPlugin(className, settings);
14
21
  return { plugin, className };
@@ -72,7 +79,8 @@ export class PluginLoader {
72
79
  PluginClass = this.getBuiltIn(className);
73
80
  }
74
81
  else {
75
- const module = await import(className);
82
+ const classSpecifier = await this.resolveClassSpecifier(className);
83
+ const module = await import(classSpecifier);
76
84
  PluginClass = this.findPluginClass(module, className);
77
85
  }
78
86
  const plugin = new PluginClass();
@@ -112,6 +120,24 @@ export class PluginLoader {
112
120
  return false;
113
121
  }
114
122
  }
123
+ async resolveClassSpecifier(className) {
124
+ if (this.anchor === PathAnchor.none() ||
125
+ !this.isPathLikeSpecifier(className)) {
126
+ return className;
127
+ }
128
+ const resolvedPath = await this.anchor.resolve(className);
129
+ const absolutePath = isAbsolute(resolvedPath)
130
+ ? resolvedPath
131
+ : resolve(resolvedPath);
132
+ return pathToFileURL(absolutePath).href;
133
+ }
134
+ isPathLikeSpecifier(className) {
135
+ return (className.startsWith('./') ||
136
+ className.startsWith('../') ||
137
+ className.startsWith('.\\') ||
138
+ className.startsWith('..\\') ||
139
+ isAbsolute(className));
140
+ }
115
141
  isBuiltIn(name) {
116
142
  return (name in BUILT_IN_PLUGINS || name.split('.').pop() in BUILT_IN_PLUGINS);
117
143
  }
@@ -11,12 +11,12 @@ export class MeCabOovProviderPlugin extends OovProviderPlugin {
11
11
  if (this.initialized) {
12
12
  return;
13
13
  }
14
- const charDefPath = this.settings.getString('charDef');
14
+ const charDefPath = await this.settings.getPath('charDef');
15
15
  if (charDefPath) {
16
16
  const content = await readFully(charDefPath);
17
17
  this.readCharacterProperty(content);
18
18
  }
19
- const unkDefPath = this.settings.getString('unkDef');
19
+ const unkDefPath = await this.settings.getPath('unkDef');
20
20
  const userPosMode = this.settings.getString(OovProviderPlugin.USER_POS, OovProviderPlugin.USER_POS_FORBID) ?? OovProviderPlugin.USER_POS_FORBID;
21
21
  if (unkDefPath) {
22
22
  const content = await readFully(unkDefPath);
@@ -6,9 +6,6 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
6
6
  private grammar;
7
7
  private enablePatternRules;
8
8
  private enableBroadRules;
9
- private enableCompoundNouns;
10
- private minCompoundLength;
11
- private excludedNounSubcategories;
12
9
  setUp(grammar: Grammar): void;
13
10
  rewrite(_text: InputText, path: LatticeNode[], lattice: Lattice): void;
14
11
  private toInitialChunks;
@@ -25,12 +22,15 @@ export declare class TokenChunkerPlugin extends PathRewritePlugin {
25
22
  private isNumericCommaChunk;
26
23
  private isNumericDotChunk;
27
24
  private isNumericSignChunk;
25
+ private isLatinTextChunk;
28
26
  private isCounterChunk;
29
27
  private mergeCounterChunks;
30
28
  private applyMergeStage;
31
- private applyCompoundNounStage;
32
- private isChunkableNoun;
33
- private canMergeAsCompoundNoun;
29
+ private applyInlineRubyExactStage;
30
+ private applyInlineRubyPrefixStage;
31
+ private shouldMergeInlineRubyExact;
32
+ private shouldMergeInlineRubyPrefix;
33
+ private toHiragana;
34
34
  private getPosById;
35
35
  private mergeChunks;
36
36
  private readingPart;