opencc-wasm 0.6.3 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +43 -5
- package/README.zh.md +42 -4
- package/dist/cjs/index.cjs +66 -2
- package/dist/cjs/opencc-wasm.cjs +1 -1
- package/dist/cjs/opencc-wasm.wasm +0 -0
- package/dist/data/config/hk2t.json +1 -1
- package/dist/data/config/s2twp_jieba.json +4 -3
- package/dist/data/config/t2cngov_jieba.json +28 -0
- package/dist/data/config/t2cngov_keep_simp_jieba.json +28 -0
- package/dist/data/config/tw2sp_jieba.json +4 -3
- package/dist/data/config/tw2t.json +1 -1
- package/dist/data/dict/HKVariantsRevPhrases.ocd2 +0 -0
- package/dist/data/dict/STPhrases.ocd2 +0 -0
- package/dist/data/dict/TWPhrases.ocd2 +0 -0
- package/dist/data/dict/TWPhrasesRev.ocd2 +0 -0
- package/dist/data/dict/cngov/STPhrases.ocd2 +0 -0
- package/dist/data/dict/cngov/TGPhrases.ocd2 +0 -0
- package/dist/data/jieba_dict/jieba.dict.utf8 +0 -1
- package/dist/data/jieba_dict/jieba_merged.ocd2 +0 -0
- package/dist/data/jieba_dict/user.dict.utf8 +1 -0
- package/dist/esm/index.js +55 -37
- package/dist/esm/opencc-wasm.js +1 -1
- package/dist/esm/opencc-wasm.wasm +0 -0
- package/dist/opencc-wasm.wasm +0 -0
- package/index.d.ts +16 -1
- package/package.json +1 -1
- package/dist/data/jieba_dict/BUILD.bazel +0 -6
- package/dist/data/jieba_dict/README.md +0 -45
package/README.md
CHANGED
|
@@ -73,17 +73,35 @@ const converter = OpenCC.Converter({ config: "s2twp" });
|
|
|
73
73
|
const result = await converter("服务器软件"); // 伺服器軟體
|
|
74
74
|
```
|
|
75
75
|
|
|
76
|
+
`Converter()` also exposes an inspection helper:
|
|
77
|
+
|
|
78
|
+
```javascript
|
|
79
|
+
const converter = OpenCC.Converter({ config: "s2twp" });
|
|
80
|
+
const inspected = await converter.inspect("勇敢的士兵");
|
|
81
|
+
console.log(inspected.segments); // Segmentation result
|
|
82
|
+
console.log(inspected.stages); // Per-stage conversion output
|
|
83
|
+
console.log(inspected.output); // Final converted output
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
The same API also works with the CN Government Standard Jieba configs:
|
|
87
|
+
|
|
88
|
+
```javascript
|
|
89
|
+
const converter = OpenCC.Converter({ config: "t2cngov_jieba" });
|
|
90
|
+
console.log(await converter("測試简体混繁體")); // 測試簡體混繁體
|
|
91
|
+
|
|
92
|
+
const keepSimp = OpenCC.Converter({ config: "t2cngov_keep_simp_jieba" });
|
|
93
|
+
console.log(await keepSimp("測試简体混繁體")); // 測試简体混繁體
|
|
94
|
+
```
|
|
95
|
+
|
|
76
96
|
**Supported configs:**
|
|
77
97
|
|
|
78
98
|
| Config | Description | Example |
|
|
79
99
|
|--------|-------------|---------|
|
|
80
100
|
| `s2twp` | Simplified → Taiwan Traditional (with regional phrases) | 软件 → 軟體 |
|
|
81
|
-
| `s2twp_jieba` | Simplified → Taiwan Traditional (jieba segmentation) | 城堡的士兵 → 城堡的士兵 |
|
|
82
101
|
| `s2tw` | Simplified → Taiwan Traditional | 心里 → 心裡 |
|
|
83
102
|
| `s2hk` | Simplified → Hong Kong Traditional | 心里 → 心裏 |
|
|
84
103
|
| `s2t` | Simplified → OpenCC Standard Traditional | 简体 → 簡體 |
|
|
85
104
|
| `tw2sp` | Taiwan → Simplified (with regional phrases) | 滑鼠 → 鼠标 |
|
|
86
|
-
| `tw2sp_jieba` | Taiwan → Simplified (jieba segmentation) | 慰藉著 → 慰藉着 |
|
|
87
105
|
| `tw2s` | Taiwan → Simplified | 軟體 → 软件 |
|
|
88
106
|
| `tw2t` | Taiwan → Traditional | 吃飯 → 喫飯 |
|
|
89
107
|
| `hk2s` | Hong Kong → Simplified | 打印機 → 打印机 |
|
|
@@ -95,6 +113,8 @@ const result = await converter("服务器软件"); // 伺服器軟體
|
|
|
95
113
|
| `t2jp` | Traditional → Japanese Shinjitai | 櫻花 → 桜花 |
|
|
96
114
|
| `t2cngov` | Traditional → CN Gov Standard | 潮溼 → 潮湿 |
|
|
97
115
|
| `t2cngov_keep_simp` | Traditional → CN Gov (Keep Simp) | 简体繁體 → 简体繁體 |
|
|
116
|
+
| `t2cngov_jieba` | Traditional → CN Gov Standard (Jieba segmentation) | 測試简体混繁體 → 測試簡體混繁體 |
|
|
117
|
+
| `t2cngov_keep_simp_jieba` | Traditional → CN Gov (Keep Simp, Jieba segmentation) | 測試简体混繁體 → 測試简体混繁體 |
|
|
98
118
|
|
|
99
119
|
#### Method 2: Using `from`/`to` parameters (compatible with `opencc-js`)
|
|
100
120
|
|
|
@@ -278,7 +298,7 @@ console.log(await t2s("繁體")); // 繁体
|
|
|
278
298
|
```typescript
|
|
279
299
|
import OpenCC from 'opencc-wasm';
|
|
280
300
|
|
|
281
|
-
type ConfigName = 's2t' | 's2tw' | 's2twp' | '
|
|
301
|
+
type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
|
|
282
302
|
|
|
283
303
|
async function convert(config: ConfigName, text: string): Promise<string> {
|
|
284
304
|
const converter = OpenCC.Converter({ config });
|
|
@@ -344,7 +364,7 @@ wasm-lib/
|
|
|
344
364
|
│ │ ├── index.cjs
|
|
345
365
|
│ │ ├── opencc-wasm.cjs
|
|
346
366
|
│ │ └── opencc-wasm.wasm
|
|
347
|
-
│ └── data/ ← OpenCC configs + dicts
|
|
367
|
+
│ └── data/ ← OpenCC configs + dicts
|
|
348
368
|
├── index.js ← Source API
|
|
349
369
|
├── index.d.ts ← TypeScript definitions
|
|
350
370
|
└── scripts/
|
|
@@ -377,12 +397,30 @@ A: Initial load downloads configs + dicts (~1-2MB). Subsequent conversions are f
|
|
|
377
397
|
|
|
378
398
|
- Uses persistent OpenCC handles to avoid reloading configs
|
|
379
399
|
- Dictionaries stored in `/data/dict/` in virtual FS
|
|
380
|
-
- Jieba assets stored in `/data/jieba_dict/` (dict, hmm_model, user dict, idf, stop_words)
|
|
381
400
|
- Memory grows on demand (`ALLOW_MEMORY_GROWTH=1`)
|
|
382
401
|
- Performance: Focuses on fidelity and compatibility with official OpenCC. May be slower than pure-JS implementations for raw throughput, but guarantees full OpenCC behavior.
|
|
383
402
|
|
|
384
403
|
## 📜 Changelog
|
|
385
404
|
|
|
405
|
+
### 0.8.0 - 2026-04-22
|
|
406
|
+
|
|
407
|
+
- Added `converter.inspect(text)` to the WASM API, returning segmentation, per-stage conversion output, and the final output as structured JSON
|
|
408
|
+
- Exposed the new inspect entry point from the Emscripten module and regenerated publishable `dist/` artifacts
|
|
409
|
+
- Added typings, tests, and documentation for the inspect workflow
|
|
410
|
+
|
|
411
|
+
### 0.7.0 - 2026-04-13
|
|
412
|
+
|
|
413
|
+
- Rebased the wasm branch onto upstream `master`
|
|
414
|
+
- Switched WASM Jieba support to reuse the upstream plugin implementation
|
|
415
|
+
- Registered the Jieba plugin statically inside the WASM module instead of using dynamic loading
|
|
416
|
+
- Restored bundled Jieba configs, dictionaries, and comparison tests in `wasm-lib` / `dist`
|
|
417
|
+
- Regenerated release artifacts and revalidated the full Node test suite
|
|
418
|
+
|
|
419
|
+
### 0.6.3 - 2026-03-31
|
|
420
|
+
|
|
421
|
+
- Upstream alignment and cngov dictionary refresh
|
|
422
|
+
- Rebuilt publishable `dist/` artifacts after the dictionary sync
|
|
423
|
+
|
|
386
424
|
### 0.6.0 - 2026-01-17
|
|
387
425
|
|
|
388
426
|
- Added Jieba segmentation support (cppjieba) for improved phrase handling
|
package/README.zh.md
CHANGED
|
@@ -73,17 +73,35 @@ const converter = OpenCC.Converter({ config: "s2twp" });
|
|
|
73
73
|
const result = await converter("服务器软件"); // 伺服器軟體
|
|
74
74
|
```
|
|
75
75
|
|
|
76
|
+
`Converter()` 也提供 inspect 輔助方法:
|
|
77
|
+
|
|
78
|
+
```javascript
|
|
79
|
+
const converter = OpenCC.Converter({ config: "s2twp" });
|
|
80
|
+
const inspected = await converter.inspect("勇敢的士兵");
|
|
81
|
+
console.log(inspected.segments); // 分詞結果
|
|
82
|
+
console.log(inspected.stages); // 每一階段的轉換結果
|
|
83
|
+
console.log(inspected.output); // 最終輸出
|
|
84
|
+
```
|
|
85
|
+
|
|
86
|
+
同樣的 API 也可直接用在大陸政府標準繁體的 Jieba 設定:
|
|
87
|
+
|
|
88
|
+
```javascript
|
|
89
|
+
const converter = OpenCC.Converter({ config: "t2cngov_jieba" });
|
|
90
|
+
console.log(await converter("測試简体混繁體")); // 測試簡體混繁體
|
|
91
|
+
|
|
92
|
+
const keepSimp = OpenCC.Converter({ config: "t2cngov_keep_simp_jieba" });
|
|
93
|
+
console.log(await keepSimp("測試简体混繁體")); // 測試简体混繁體
|
|
94
|
+
```
|
|
95
|
+
|
|
76
96
|
**支援的設定檔:**
|
|
77
97
|
|
|
78
98
|
| 設定檔 | 說明 | 範例 |
|
|
79
99
|
|--------|------|------|
|
|
80
100
|
| `s2twp` | 簡體 → 台灣正體(含地域用詞轉換) | 软件 → 軟體 |
|
|
81
|
-
| `s2twp_jieba` | 簡體 → 台灣正體(jieba 分詞) | 城堡的士兵 → 城堡的士兵 |
|
|
82
101
|
| `s2tw` | 簡體 → 台灣正體 | 心里 → 心裡 |
|
|
83
102
|
| `s2hk` | 簡體 → 香港繁體 | 心里 → 心裏 |
|
|
84
103
|
| `s2t` | 簡體 → OpenCC 標準繁體 | 简体 → 簡體 |
|
|
85
104
|
| `tw2sp` | 台灣正體 → 簡體(含地域用詞轉換) | 滑鼠 → 鼠标 |
|
|
86
|
-
| `tw2sp_jieba` | 台灣正體 → 簡體(jieba 分詞) | 慰藉著 → 慰藉着 |
|
|
87
105
|
| `tw2s` | 台灣正體 → 簡體 | 軟體 → 软件 |
|
|
88
106
|
| `tw2t` | 台灣正體 → OpenCC 標準繁體 | 吃飯 → 喫飯 |
|
|
89
107
|
| `hk2s` | 香港繁體 → 簡體 | 打印機 → 打印机 |
|
|
@@ -95,6 +113,8 @@ const result = await converter("服务器软件"); // 伺服器軟體
|
|
|
95
113
|
| `t2jp` | 日文舊字體 → 日文新字體 | 櫻花 → 桜花 |
|
|
96
114
|
| `t2cngov` | 繁體 → 大陸政府標準繁體 | 潮溼 → 潮湿 |
|
|
97
115
|
| `t2cngov_keep_simp` | 繁體 → 大陸政府標準繁體(保留簡體) | 简体繁體 → 简体繁體 |
|
|
116
|
+
| `t2cngov_jieba` | 繁體 → 大陸政府標準繁體(Jieba 分詞) | 測試简体混繁體 → 測試簡體混繁體 |
|
|
117
|
+
| `t2cngov_keep_simp_jieba` | 繁體 → 大陸政府標準繁體(保留簡體,Jieba 分詞) | 測試简体混繁體 → 測試简体混繁體 |
|
|
98
118
|
|
|
99
119
|
#### 方式 2:使用 `from`/`to` 參數(與 `opencc-js` 相容)
|
|
100
120
|
|
|
@@ -278,7 +298,7 @@ console.log(await t2s("繁體")); // 繁体
|
|
|
278
298
|
```typescript
|
|
279
299
|
import OpenCC from 'opencc-wasm';
|
|
280
300
|
|
|
281
|
-
type ConfigName = 's2t' | 's2tw' | 's2twp' | '
|
|
301
|
+
type ConfigName = 's2t' | 's2tw' | 's2twp' | 't2s';
|
|
282
302
|
|
|
283
303
|
async function convert(config: ConfigName, text: string): Promise<string> {
|
|
284
304
|
const converter = OpenCC.Converter({ config });
|
|
@@ -377,12 +397,30 @@ A:首次載入需要下載設定檔和字典檔(約 1-2MB)。後續轉換
|
|
|
377
397
|
|
|
378
398
|
- 使用持久的 OpenCC 控制代碼避免重複載入設定
|
|
379
399
|
- 字典儲存在虛擬檔案系統的 `/data/dict/` 中
|
|
380
|
-
- Jieba 資產儲存在 `/data/jieba_dict/`(詞典、hmm_model、user dict、idf、stop_words)
|
|
381
400
|
- 記憶體按需成長(`ALLOW_MEMORY_GROWTH=1`)
|
|
382
401
|
- 效能:專注於精確度和與官方 OpenCC 的相容性。原始吞吐量可能比純 JavaScript 實作慢,但保證完整的 OpenCC 行為。
|
|
383
402
|
|
|
384
403
|
## 📜 變更歷史
|
|
385
404
|
|
|
405
|
+
### 0.8.0 - 2026-04-22
|
|
406
|
+
|
|
407
|
+
- 在 WASM API 中新增 `converter.inspect(text)`,可回傳分詞結果、每個階段的轉換輸出,以及最終輸出的結構化 JSON
|
|
408
|
+
- 從 Emscripten 模組匯出新的 inspect 入口,並重新產生可發布的 `dist/` 產物
|
|
409
|
+
- 補上 inspect 流程的型別、測試與文件
|
|
410
|
+
|
|
411
|
+
### 0.7.0 - 2026-04-13
|
|
412
|
+
|
|
413
|
+
- 將 wasm 分支重新 rebase 到 upstream `master`
|
|
414
|
+
- WASM 的 Jieba 支援改為直接復用上游插件實作
|
|
415
|
+
- 在 WASM 模組內靜態註冊 Jieba 插件,不再依賴動態載入
|
|
416
|
+
- 在 `wasm-lib` / `dist` 中重新納入 Jieba 設定、詞典與對照測試
|
|
417
|
+
- 重新產生發行產物並完整驗證 Node 測試套件
|
|
418
|
+
|
|
419
|
+
### 0.6.3 - 2026-03-31
|
|
420
|
+
|
|
421
|
+
- 與上游狀態對齊並同步 cngov 詞典
|
|
422
|
+
- 在字典同步後重新產生可發布的 `dist/` 產物
|
|
423
|
+
|
|
386
424
|
### 0.6.0 - 2026-01-17
|
|
387
425
|
|
|
388
426
|
- 新增 Jieba 分詞支援(cppjieba)以改善詞組切分效果
|
package/dist/cjs/index.cjs
CHANGED
|
@@ -18,6 +18,7 @@ const CONFIG_MAP = {
|
|
|
18
18
|
|
|
19
19
|
const loadedConfigs = new Set();
|
|
20
20
|
const loadedDicts = new Set();
|
|
21
|
+
const loadedResources = new Set();
|
|
21
22
|
const handles = new Map();
|
|
22
23
|
let modulePromise = null;
|
|
23
24
|
let api = null;
|
|
@@ -37,6 +38,7 @@ async function getApi() {
|
|
|
37
38
|
api = {
|
|
38
39
|
create: mod.cwrap("opencc_create", "number", ["string"]),
|
|
39
40
|
convert: mod.cwrap("opencc_convert", "string", ["number", "string"]),
|
|
41
|
+
inspect: mod.cwrap("opencc_inspect", "string", ["number", "string"]),
|
|
40
42
|
destroy: mod.cwrap("opencc_destroy", null, ["number"]),
|
|
41
43
|
};
|
|
42
44
|
}
|
|
@@ -59,6 +61,19 @@ function collectOcd2Files(node, acc) {
|
|
|
59
61
|
}
|
|
60
62
|
}
|
|
61
63
|
|
|
64
|
+
function collectSegmentationResources(segmentation, acc) {
|
|
65
|
+
if (!segmentation || typeof segmentation !== "object") return;
|
|
66
|
+
const resources = segmentation.resources;
|
|
67
|
+
if (!resources || typeof resources !== "object") return;
|
|
68
|
+
Object.values(resources).forEach((value) => {
|
|
69
|
+
if (typeof value === "string" && value) acc.add(value);
|
|
70
|
+
});
|
|
71
|
+
if (segmentation.type === "jieba") {
|
|
72
|
+
acc.add("jieba_dict/idf.utf8");
|
|
73
|
+
acc.add("jieba_dict/stop_words.utf8");
|
|
74
|
+
}
|
|
75
|
+
}
|
|
76
|
+
|
|
62
77
|
async function fetchText(urlObj) {
|
|
63
78
|
if (urlObj.protocol === "file:") return readFileText(urlObj);
|
|
64
79
|
const resp = await fetch(urlObj.href);
|
|
@@ -81,7 +96,9 @@ async function ensureConfig(configName) {
|
|
|
81
96
|
const cfgJson = JSON.parse(await fetchText(cfgUrl));
|
|
82
97
|
|
|
83
98
|
const dicts = new Set();
|
|
99
|
+
const resources = new Set();
|
|
84
100
|
collectOcd2Files(cfgJson.segmentation?.dict, dicts);
|
|
101
|
+
collectSegmentationResources(cfgJson.segmentation, resources);
|
|
85
102
|
if (Array.isArray(cfgJson.conversion_chain)) {
|
|
86
103
|
cfgJson.conversion_chain.forEach((item) => collectOcd2Files(item?.dict, dicts));
|
|
87
104
|
}
|
|
@@ -94,6 +111,15 @@ async function ensureConfig(configName) {
|
|
|
94
111
|
mod.FS.writeFile(dictPath, buf);
|
|
95
112
|
loadedDicts.add(file);
|
|
96
113
|
}
|
|
114
|
+
for (const file of resources) {
|
|
115
|
+
if (loadedResources.has(file)) continue;
|
|
116
|
+
const resourceUrl = new URL("../data/" + file, BASE_URL);
|
|
117
|
+
const buf = await fetchBuffer(resourceUrl);
|
|
118
|
+
const resourcePath = "/data/" + file;
|
|
119
|
+
ensureParentDir(mod, resourcePath);
|
|
120
|
+
mod.FS.writeFile(resourcePath, buf);
|
|
121
|
+
loadedResources.add(file);
|
|
122
|
+
}
|
|
97
123
|
const patchPaths = (node) => {
|
|
98
124
|
if (!node || typeof node !== "object") return;
|
|
99
125
|
if (node.type === "ocd2" && node.file) node.file = "/data/dict/" + node.file;
|
|
@@ -121,7 +147,16 @@ function resolveConfig(from, to) {
|
|
|
121
147
|
}
|
|
122
148
|
|
|
123
149
|
function createConverter({ from, to, config }) {
|
|
124
|
-
|
|
150
|
+
let configName;
|
|
151
|
+
|
|
152
|
+
if (config) {
|
|
153
|
+
configName = config.endsWith(".json") ? config : `${config}.json`;
|
|
154
|
+
} else if (from && to) {
|
|
155
|
+
configName = resolveConfig(from, to);
|
|
156
|
+
} else {
|
|
157
|
+
throw new Error('Either "config" or both "from" and "to" must be specified');
|
|
158
|
+
}
|
|
159
|
+
|
|
125
160
|
return async (text) => {
|
|
126
161
|
if (configName === null) return text;
|
|
127
162
|
const handle = await ensureConfig(configName);
|
|
@@ -130,6 +165,32 @@ function createConverter({ from, to, config }) {
|
|
|
130
165
|
};
|
|
131
166
|
}
|
|
132
167
|
|
|
168
|
+
function createInspector({ from, to, config }) {
|
|
169
|
+
let configName;
|
|
170
|
+
|
|
171
|
+
if (config) {
|
|
172
|
+
configName = config.endsWith(".json") ? config : `${config}.json`;
|
|
173
|
+
} else if (from && to) {
|
|
174
|
+
configName = resolveConfig(from, to);
|
|
175
|
+
} else {
|
|
176
|
+
throw new Error('Either "config" or both "from" and "to" must be specified');
|
|
177
|
+
}
|
|
178
|
+
|
|
179
|
+
return async (text) => {
|
|
180
|
+
if (configName === null) {
|
|
181
|
+
return {
|
|
182
|
+
input: text,
|
|
183
|
+
segments: text.length === 0 ? [] : [text],
|
|
184
|
+
stages: [],
|
|
185
|
+
output: text,
|
|
186
|
+
};
|
|
187
|
+
}
|
|
188
|
+
const handle = await ensureConfig(configName);
|
|
189
|
+
const { api: apiFns } = await getApi();
|
|
190
|
+
return JSON.parse(apiFns.inspect(handle, text));
|
|
191
|
+
};
|
|
192
|
+
}
|
|
193
|
+
|
|
133
194
|
function CustomConverter(dictOrString) {
|
|
134
195
|
let pairs = [];
|
|
135
196
|
if (typeof dictOrString === "string") {
|
|
@@ -168,7 +229,10 @@ function ConverterFactory(fromLocale, toLocale, extraDicts = []) {
|
|
|
168
229
|
const OpenCC = {
|
|
169
230
|
Converter(opts) {
|
|
170
231
|
const fn = createConverter(opts);
|
|
171
|
-
|
|
232
|
+
const inspect = createInspector(opts);
|
|
233
|
+
const converter = (text) => fn(text);
|
|
234
|
+
converter.inspect = (text) => inspect(text);
|
|
235
|
+
return converter;
|
|
172
236
|
},
|
|
173
237
|
CustomConverter,
|
|
174
238
|
ConverterFactory,
|