opencc-wasm 0.6.3 → 0.8.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -73,17 +73,35 @@ const converter = OpenCC.Converter({ config: "s2twp" });
73
73
  const result = await converter("服务器软件"); // 伺服器軟體
74
74
  ```
75
75
 
76
+ `Converter()` also exposes an inspection helper:
77
+
78
+ ```javascript
79
+ const converter = OpenCC.Converter({ config: "s2twp" });
80
+ const inspected = await converter.inspect("勇敢的士兵");
81
+ console.log(inspected.segments); // Segmentation result
82
+ console.log(inspected.stages); // Per-stage conversion output
83
+ console.log(inspected.output); // Final converted output
84
+ ```
85
+
86
+ The same `Converter()` API also works with the CN Government Standard Jieba configs:
87
+
88
+ ```javascript
89
+ const converter = OpenCC.Converter({ config: "t2cngov_jieba" });
90
+ console.log(await converter("測試简体混繁體")); // 測試簡體混繁體
91
+
92
+ const keepSimp = OpenCC.Converter({ config: "t2cngov_keep_simp_jieba" });
93
+ console.log(await keepSimp("測試简体混繁體")); // 測試简体混繁體
94
+ ```
95
+
76
96
  **Supported configs:**
77
97
 
78
98
  | Config | Description | Example |
79
99
  |--------|-------------|---------|
80
100
  | `s2twp` | Simplified → Taiwan Traditional (with regional phrases) | 软件 → 軟體 |
81
- | `s2twp_jieba` | Simplified → Taiwan Traditional (jieba segmentation) | 城堡的士兵 → 城堡的士兵 |
82
101
  | `s2tw` | Simplified → Taiwan Traditional | 心里 → 心裡 |
83
102
  | `s2hk` | Simplified → Hong Kong Traditional | 心里 → 心裏 |
84
103
  | `s2t` | Simplified → OpenCC Standard Traditional | 简体 → 簡體 |
85
104
  | `tw2sp` | Taiwan → Simplified (with regional phrases) | 滑鼠 → 鼠标 |
86
- | `tw2sp_jieba` | Taiwan → Simplified (jieba segmentation) | 慰藉著 → 慰藉着 |
87
105
  | `tw2s` | Taiwan → Simplified | 軟體 → 软件 |
88
106
  | `tw2t` | Taiwan → OpenCC Standard Traditional | 吃飯 → 喫飯 |
89
107
  | `hk2s` | Hong Kong → Simplified | 打印機 → 打印机 |
@@ -95,6 +113,8 @@ const result = await converter("服务器软件"); // 伺服器軟體
95
113
  | `t2jp` | Traditional → Japanese Shinjitai | 櫻花 → 桜花 |
96
114
  | `t2cngov` | Traditional → CN Gov Standard | 潮溼 → 潮湿 |
97
115
  | `t2cngov_keep_simp` | Traditional → CN Gov (Keep Simp) | 简体繁體 → 简体繁體 |
116
+ | `t2cngov_jieba` | Traditional → CN Gov Standard (Jieba segmentation) | 測試简体混繁體 → 測試簡體混繁體 |
117
+ | `t2cngov_keep_simp_jieba` | Traditional → CN Gov (Keep Simp, Jieba segmentation) | 測試简体混繁體 → 測試简体混繁體 |
98
118
 
99
119
  #### Method 2: Using `from`/`to` parameters (compatible with `opencc-js`)
100
120
 
@@ -278,7 +298,7 @@ console.log(await t2s("繁體")); // 繁体
278
298
  ```typescript
279
299
  import OpenCC from 'opencc-wasm';
280
300
 
281
- type ConfigName = 's2t' | 's2tw' | 's2twp' | 's2twp_jieba' | 't2s' | 'tw2sp_jieba';
301
+ type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
282
302
 
283
303
  async function convert(config: ConfigName, text: string): Promise<string> {
284
304
  const converter = OpenCC.Converter({ config });
@@ -344,7 +364,7 @@ wasm-lib/
344
364
  │ │ ├── index.cjs
345
365
  │ │ ├── opencc-wasm.cjs
346
366
  │ │ └── opencc-wasm.wasm
347
- │ └── data/ ← OpenCC configs + dicts (+ jieba files if enabled)
367
+ │ └── data/ ← OpenCC configs + dicts
348
368
  ├── index.js ← Source API
349
369
  ├── index.d.ts ← TypeScript definitions
350
370
  └── scripts/
@@ -377,12 +397,30 @@ A: Initial load downloads configs + dicts (~1-2MB). Subsequent conversions are f
377
397
 
378
398
  - Uses persistent OpenCC handles to avoid reloading configs
379
399
  - Dictionaries stored in `/data/dict/` in virtual FS
380
- - Jieba assets stored in `/data/jieba_dict/` (dict, hmm_model, user dict, idf, stop_words)
381
400
  - Memory grows on demand (`ALLOW_MEMORY_GROWTH=1`)
382
401
  - Performance: Focuses on fidelity and compatibility with official OpenCC. May be slower than pure-JS implementations for raw throughput, but guarantees full OpenCC behavior.
383
402
 
384
403
  ## 📜 Changelog
385
404
 
405
+ ### 0.8.0 - 2026-04-22
406
+
407
+ - Added `converter.inspect(text)` to the WASM API, returning segmentation, per-stage conversion output, and the final output as structured JSON
408
+ - Exposed the new inspect entry point from the Emscripten module and regenerated publishable `dist/` artifacts
409
+ - Added typings, tests, and documentation for the inspect workflow
410
+
411
+ ### 0.7.0 - 2026-04-13
412
+
413
+ - Rebased the wasm branch onto upstream `master`
414
+ - Switched WASM Jieba support to reuse the upstream plugin implementation
415
+ - Registered the Jieba plugin statically inside the WASM module instead of using dynamic loading
416
+ - Restored bundled Jieba configs, dictionaries, and comparison tests in `wasm-lib` / `dist`
417
+ - Regenerated release artifacts and revalidated the full Node test suite
418
+
419
+ ### 0.6.3 - 2026-03-31
420
+
421
+ - Upstream alignment and cngov dictionary refresh
422
+ - Rebuilt publishable `dist/` artifacts after the dictionary sync
423
+
386
424
  ### 0.6.0 - 2026-01-17
387
425
 
388
426
  - Added Jieba segmentation support (cppjieba) for improved phrase handling
package/README.zh.md CHANGED
@@ -73,17 +73,35 @@ const converter = OpenCC.Converter({ config: "s2twp" });
73
73
  const result = await converter("服务器软件"); // 伺服器軟體
74
74
  ```
75
75
 
76
+ `Converter()` 也提供 inspect 輔助方法:
77
+
78
+ ```javascript
79
+ const converter = OpenCC.Converter({ config: "s2twp" });
80
+ const inspected = await converter.inspect("勇敢的士兵");
81
+ console.log(inspected.segments); // 分詞結果
82
+ console.log(inspected.stages); // 每一階段的轉換結果
83
+ console.log(inspected.output); // 最終輸出
84
+ ```
85
+
86
+ 同樣的 `Converter()` API 也可直接用在大陸政府標準繁體的 Jieba 設定:
87
+
88
+ ```javascript
89
+ const converter = OpenCC.Converter({ config: "t2cngov_jieba" });
90
+ console.log(await converter("測試简体混繁體")); // 測試簡體混繁體
91
+
92
+ const keepSimp = OpenCC.Converter({ config: "t2cngov_keep_simp_jieba" });
93
+ console.log(await keepSimp("測試简体混繁體")); // 測試简体混繁體
94
+ ```
95
+
76
96
  **支援的設定檔:**
77
97
 
78
98
  | 設定檔 | 說明 | 範例 |
79
99
  |--------|------|------|
80
100
  | `s2twp` | 簡體 → 台灣正體(含地域用詞轉換) | 软件 → 軟體 |
81
- | `s2twp_jieba` | 簡體 → 台灣正體(jieba 分詞) | 城堡的士兵 → 城堡的士兵 |
82
101
  | `s2tw` | 簡體 → 台灣正體 | 心里 → 心裡 |
83
102
  | `s2hk` | 簡體 → 香港繁體 | 心里 → 心裏 |
84
103
  | `s2t` | 簡體 → OpenCC 標準繁體 | 简体 → 簡體 |
85
104
  | `tw2sp` | 台灣正體 → 簡體(含地域用詞轉換) | 滑鼠 → 鼠标 |
86
- | `tw2sp_jieba` | 台灣正體 → 簡體(jieba 分詞) | 慰藉著 → 慰藉着 |
87
105
  | `tw2s` | 台灣正體 → 簡體 | 軟體 → 软件 |
88
106
  | `tw2t` | 台灣正體 → OpenCC 標準繁體 | 吃飯 → 喫飯 |
89
107
  | `hk2s` | 香港繁體 → 簡體 | 打印機 → 打印机 |
@@ -95,6 +113,8 @@ const result = await converter("服务器软件"); // 伺服器軟體
95
113
  | `t2jp` | 日文舊字體 → 日文新字體 | 櫻花 → 桜花 |
96
114
  | `t2cngov` | 繁體 → 大陸政府標準繁體 | 潮溼 → 潮湿 |
97
115
  | `t2cngov_keep_simp` | 繁體 → 大陸政府標準繁體(保留簡體) | 简体繁體 → 简体繁體 |
116
+ | `t2cngov_jieba` | 繁體 → 大陸政府標準繁體(Jieba 分詞) | 測試简体混繁體 → 測試簡體混繁體 |
117
+ | `t2cngov_keep_simp_jieba` | 繁體 → 大陸政府標準繁體(保留簡體,Jieba 分詞) | 測試简体混繁體 → 測試简体混繁體 |
98
118
 
99
119
  #### 方式 2:使用 `from`/`to` 參數(與 `opencc-js` 相容)
100
120
 
@@ -278,7 +298,7 @@ console.log(await t2s("繁體")); // 繁体
278
298
  ```typescript
279
299
  import OpenCC from 'opencc-wasm';
280
300
 
281
- type ConfigName = 's2t' | 's2tw' | 's2twp' | 's2twp_jieba' | 't2s' | 'tw2sp_jieba';
301
+ type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
282
302
 
283
303
  async function convert(config: ConfigName, text: string): Promise<string> {
284
304
  const converter = OpenCC.Converter({ config });
@@ -377,12 +397,30 @@ A:首次載入需要下載設定檔和字典檔(約 1-2MB)。後續轉換
377
397
 
378
398
  - 使用持久的 OpenCC 控制代碼避免重複載入設定
379
399
  - 字典儲存在虛擬檔案系統的 `/data/dict/` 中
380
- - Jieba 資產儲存在 `/data/jieba_dict/`(詞典、hmm_model、user dict、idf、stop_words)
381
400
  - 記憶體按需成長(`ALLOW_MEMORY_GROWTH=1`)
382
401
  - 效能:專注於精確度和與官方 OpenCC 的相容性。原始吞吐量可能比純 JavaScript 實作慢,但保證完整的 OpenCC 行為。
383
402
 
384
403
  ## 📜 變更歷史
385
404
 
405
+ ### 0.8.0 - 2026-04-22
406
+
407
+ - 在 WASM API 中新增 `converter.inspect(text)`,可回傳分詞結果、每個階段的轉換輸出,以及最終輸出的結構化 JSON
408
+ - 從 Emscripten 模組匯出新的 inspect 入口,並重新產生可發布的 `dist/` 產物
409
+ - 補上 inspect 流程的型別、測試與文件
410
+
411
+ ### 0.7.0 - 2026-04-13
412
+
413
+ - 將 wasm 分支重新 rebase 到 upstream `master`
414
+ - WASM 的 Jieba 支援改為直接復用上游插件實作
415
+ - 在 WASM 模組內靜態註冊 Jieba 插件,不再依賴動態載入
416
+ - 在 `wasm-lib` / `dist` 中重新納入 Jieba 設定、詞典與對照測試
417
+ - 重新產生發行產物並完整驗證 Node 測試套件
418
+
419
+ ### 0.6.3 - 2026-03-31
420
+
421
+ - 與上游狀態對齊並同步 cngov 詞典
422
+ - 在字典同步後重新產生可發布的 `dist/` 產物
423
+
386
424
  ### 0.6.0 - 2026-01-17
387
425
 
388
426
  - 新增 Jieba 分詞支援(cppjieba)以改善詞組切分效果
@@ -18,6 +18,7 @@ const CONFIG_MAP = {
18
18
 
19
19
  const loadedConfigs = new Set();
20
20
  const loadedDicts = new Set();
21
+ const loadedResources = new Set();
21
22
  const handles = new Map();
22
23
  let modulePromise = null;
23
24
  let api = null;
@@ -37,6 +38,7 @@ async function getApi() {
37
38
  api = {
38
39
  create: mod.cwrap("opencc_create", "number", ["string"]),
39
40
  convert: mod.cwrap("opencc_convert", "string", ["number", "string"]),
41
+ inspect: mod.cwrap("opencc_inspect", "string", ["number", "string"]),
40
42
  destroy: mod.cwrap("opencc_destroy", null, ["number"]),
41
43
  };
42
44
  }
@@ -59,6 +61,19 @@ function collectOcd2Files(node, acc) {
59
61
  }
60
62
  }
61
63
 
64
+ function collectSegmentationResources(segmentation, acc) {
65
+ if (!segmentation || typeof segmentation !== "object") return;
66
+ const resources = segmentation.resources;
67
+ if (!resources || typeof resources !== "object") return;
68
+ Object.values(resources).forEach((value) => {
69
+ if (typeof value === "string" && value) acc.add(value);
70
+ });
71
+ if (segmentation.type === "jieba") {
72
+ acc.add("jieba_dict/idf.utf8");
73
+ acc.add("jieba_dict/stop_words.utf8");
74
+ }
75
+ }
76
+
62
77
  async function fetchText(urlObj) {
63
78
  if (urlObj.protocol === "file:") return readFileText(urlObj);
64
79
  const resp = await fetch(urlObj.href);
@@ -81,7 +96,9 @@ async function ensureConfig(configName) {
81
96
  const cfgJson = JSON.parse(await fetchText(cfgUrl));
82
97
 
83
98
  const dicts = new Set();
99
+ const resources = new Set();
84
100
  collectOcd2Files(cfgJson.segmentation?.dict, dicts);
101
+ collectSegmentationResources(cfgJson.segmentation, resources);
85
102
  if (Array.isArray(cfgJson.conversion_chain)) {
86
103
  cfgJson.conversion_chain.forEach((item) => collectOcd2Files(item?.dict, dicts));
87
104
  }
@@ -94,6 +111,15 @@ async function ensureConfig(configName) {
94
111
  mod.FS.writeFile(dictPath, buf);
95
112
  loadedDicts.add(file);
96
113
  }
114
+ for (const file of resources) {
115
+ if (loadedResources.has(file)) continue;
116
+ const resourceUrl = new URL("../data/" + file, BASE_URL);
117
+ const buf = await fetchBuffer(resourceUrl);
118
+ const resourcePath = "/data/" + file;
119
+ ensureParentDir(mod, resourcePath);
120
+ mod.FS.writeFile(resourcePath, buf);
121
+ loadedResources.add(file);
122
+ }
97
123
  const patchPaths = (node) => {
98
124
  if (!node || typeof node !== "object") return;
99
125
  if (node.type === "ocd2" && node.file) node.file = "/data/dict/" + node.file;
@@ -121,7 +147,16 @@ function resolveConfig(from, to) {
121
147
  }
122
148
 
123
149
  function createConverter({ from, to, config }) {
124
- const configName = config || resolveConfig(from, to);
150
+ let configName;
151
+
152
+ if (config) {
153
+ configName = config.endsWith(".json") ? config : `${config}.json`;
154
+ } else if (from && to) {
155
+ configName = resolveConfig(from, to);
156
+ } else {
157
+ throw new Error('Either "config" or both "from" and "to" must be specified');
158
+ }
159
+
125
160
  return async (text) => {
126
161
  if (configName === null) return text;
127
162
  const handle = await ensureConfig(configName);
@@ -130,6 +165,32 @@ function createConverter({ from, to, config }) {
130
165
  };
131
166
  }
132
167
 
168
+ function createInspector({ from, to, config }) {
169
+ let configName;
170
+
171
+ if (config) {
172
+ configName = config.endsWith(".json") ? config : `${config}.json`;
173
+ } else if (from && to) {
174
+ configName = resolveConfig(from, to);
175
+ } else {
176
+ throw new Error('Either "config" or both "from" and "to" must be specified');
177
+ }
178
+
179
+ return async (text) => {
180
+ if (configName === null) {
181
+ return {
182
+ input: text,
183
+ segments: text.length === 0 ? [] : [text],
184
+ stages: [],
185
+ output: text,
186
+ };
187
+ }
188
+ const handle = await ensureConfig(configName);
189
+ const { api: apiFns } = await getApi();
190
+ return JSON.parse(apiFns.inspect(handle, text));
191
+ };
192
+ }
193
+
133
194
  function CustomConverter(dictOrString) {
134
195
  let pairs = [];
135
196
  if (typeof dictOrString === "string") {
@@ -168,7 +229,10 @@ function ConverterFactory(fromLocale, toLocale, extraDicts = []) {
168
229
  const OpenCC = {
169
230
  Converter(opts) {
170
231
  const fn = createConverter(opts);
171
- return (text) => fn(text);
232
+ const inspect = createInspector(opts);
233
+ const converter = (text) => fn(text);
234
+ converter.inspect = (text) => inspect(text);
235
+ return converter;
172
236
  },
173
237
  CustomConverter,
174
238
  ConverterFactory,