yukichant 3.1.0 → 4.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +12 -2
- package/src/browser.js +3 -3
- package/src/cli.js +2 -2
- package/src/index.js +0 -3
- package/src/logger.js +1 -3
- package/src/typo-correction.js +57 -31
- package/.cursor/rules +0 -58
- package/.husky/commit-msg +0 -2
- package/.releaserc.json +0 -25
- package/AGENTS.md +0 -505
- package/CHANGELOG.md +0 -23
- package/__tests__/cli.js +0 -20
- package/__tests__/data.js +0 -41
- package/__tests__/fuzzy-kanji-match.js +0 -33
- package/__tests__/index.js +0 -42
- package/__tests__/machine-encrypt.js +0 -49
- package/__tests__/typo-correction.js +0 -56
- package/benchmark/CHATGPT_BENCHMARK.md +0 -90
- package/benchmark/README.md +0 -99
- package/benchmark/magi_ocr_data/README.md +0 -53
- package/benchmark/magi_ocr_data/dataset.tsv +0 -836
- package/benchmark/results/.gitkeep +0 -0
- package/benchmark/results/summary/latest_comparison.tsv +0 -9
- package/benchmark/scripts/compare-algorithms.js +0 -54
- package/benchmark/scripts/compare-and-report.js +0 -35
- package/benchmark/scripts/generate-report.js +0 -324
- package/benchmark/scripts/prompt-template.txt +0 -118
- package/benchmark/scripts/run-accuracy-test.js +0 -155
- package/benchmark/scripts/run-chatgpt-test.js +0 -284
- package/commitlint.config.js +0 -20
- package/doc/develop.md +0 -108
- package/doc/typo-correction-algorithm.md +0 -79
- package/jest.config.cjs +0 -185
- package/raw_data/json_generator +0 -49
- package/raw_data/meisi_json_generator +0 -53
- package/raw_data/spell.txt +0 -1011
- package/raw_data/spell_NG_word.txt +0 -4
- package/test_data/help_message.js +0 -19
package/package.json
CHANGED
|
@@ -1,9 +1,12 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "yukichant",
|
|
3
|
-
"version": "
|
|
3
|
+
"version": "4.1.0",
|
|
4
4
|
"description": "",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
|
-
"repository":
|
|
6
|
+
"repository": {
|
|
7
|
+
"type": "git",
|
|
8
|
+
"url": "git+https://github.com/amanoese/yukichant.git"
|
|
9
|
+
},
|
|
7
10
|
"author": "Seito Taka",
|
|
8
11
|
"main": "src/node.js",
|
|
9
12
|
"exports": {
|
|
@@ -45,6 +48,7 @@
|
|
|
45
48
|
},
|
|
46
49
|
"dependencies": {
|
|
47
50
|
"commander": "^8.3.0",
|
|
51
|
+
"diff": "^8.0.3",
|
|
48
52
|
"fastest-levenshtein": "^1.0.16",
|
|
49
53
|
"get-stdin": "^9.0.0",
|
|
50
54
|
"kuromoji": "^0.1.2",
|
|
@@ -53,6 +57,12 @@
|
|
|
53
57
|
"picocolors": "^1.0.0",
|
|
54
58
|
"yukidic": "git+https://github.com/amanoese/yukidic.git"
|
|
55
59
|
},
|
|
60
|
+
"files": [
|
|
61
|
+
"src/",
|
|
62
|
+
"data/",
|
|
63
|
+
"README.md",
|
|
64
|
+
"LICENSE"
|
|
65
|
+
],
|
|
56
66
|
"keywords": [
|
|
57
67
|
"cli",
|
|
58
68
|
"terminal",
|
package/src/browser.js
CHANGED
|
@@ -33,7 +33,7 @@ const KANJIVG_RADICAL_REPO = 'yagays/kanjivg-radical'
|
|
|
33
33
|
* @param {string} [version] - yukichantのリリースタグ(省略時はパッケージバージョン)
|
|
34
34
|
* @returns {Object} 各データのURL
|
|
35
35
|
*/
|
|
36
|
-
function getDefaultUrls(version = pkg.version) {
|
|
36
|
+
function getDefaultUrls(version = `v${pkg.version}`) {
|
|
37
37
|
return {
|
|
38
38
|
dataBaseUrl: `${GITHUB_RAW_BASE}/${YUKICHANT_REPO}/${version}/data`,
|
|
39
39
|
dicPath: `${GITHUB_RAW_BASE}/${YUKIDIC_REPO}/master/dic/`,
|
|
@@ -91,8 +91,8 @@ export async function initBrowser({
|
|
|
91
91
|
])
|
|
92
92
|
|
|
93
93
|
if (!TfIdf) {
|
|
94
|
-
const
|
|
95
|
-
TfIdf =
|
|
94
|
+
const tfidfModule = await import('natural/lib/natural/tfidf/index.js')
|
|
95
|
+
TfIdf = tfidfModule.TfIdf
|
|
96
96
|
}
|
|
97
97
|
|
|
98
98
|
initFuzzyKanjiMatch({ meisi, dousi, kanji2element, TfIdf })
|
package/src/cli.js
CHANGED
|
@@ -14,8 +14,8 @@ program
|
|
|
14
14
|
.version(version)
|
|
15
15
|
.argument('[text]','input text','')
|
|
16
16
|
.option('-d','decode flag')
|
|
17
|
-
.option('-s','strict decode mode
|
|
18
|
-
.option('--no-tfidf','disable tfidf mode
|
|
17
|
+
.option('-s','disable typo correction (strict decode mode)')
|
|
18
|
+
.option('--no-tfidf','disable tfidf mode for typo correction')
|
|
19
19
|
.option('--levenshtein','use Levenshtein distance algorithm instead of Jaro-Winkler')
|
|
20
20
|
.option('-v','verbose mode flag')
|
|
21
21
|
.option('-vv','more verbose') // なぜかVv
|
package/src/index.js
CHANGED
|
@@ -1,6 +1,4 @@
|
|
|
1
1
|
import simpleEnigma from './machine-encrypt.js'
|
|
2
|
-
import log from './logger.js'
|
|
3
|
-
|
|
4
2
|
export let default_encoder = (uint8text,{ meisi, dousi }) => {
|
|
5
3
|
|
|
6
4
|
//機械式暗号(ロータ型)の仕組みを利用したスクランブラーを配置
|
|
@@ -71,7 +69,6 @@ export let default_decoder = (typoCorrection) => async (encodeText,option = {} ,
|
|
|
71
69
|
cleanEncodeText = typoCorrection.exec(cleanEncodeText,option)
|
|
72
70
|
}
|
|
73
71
|
|
|
74
|
-
log.debug('修正後のテキスト:', cleanEncodeText)
|
|
75
72
|
// デコード用の正規表現に変換。
|
|
76
73
|
// ex: /さざ波|その者|ほうき星よ/g
|
|
77
74
|
let decodeRegExp = new RegExp(Object.keys(decodeHash).join('|'),'g')
|
package/src/logger.js
CHANGED
|
@@ -6,9 +6,7 @@ log.setLevel('warn');
|
|
|
6
6
|
// CLIオプションからログレベルを設定する関数
|
|
7
7
|
export function setLogLevel(option) {
|
|
8
8
|
if (option.Vv) {
|
|
9
|
-
log.setLevel('
|
|
10
|
-
} else if (option.v) {
|
|
11
|
-
log.setLevel('debug'); // デバッグ情報
|
|
9
|
+
log.setLevel('debug'); // アルゴリズム詳細を表示
|
|
12
10
|
} else {
|
|
13
11
|
log.setLevel('warn'); // 通常は警告とエラーのみ
|
|
14
12
|
}
|
package/src/typo-correction.js
CHANGED
|
@@ -1,9 +1,28 @@
|
|
|
1
1
|
import { distance, closest } from 'fastest-levenshtein';
|
|
2
|
+
import { diffChars } from 'diff';
|
|
2
3
|
import { JaroWinklerDistance } from './jaro-winkler.js';
|
|
3
4
|
import log from './logger.js';
|
|
5
|
+
import pc from 'picocolors';
|
|
4
6
|
|
|
5
7
|
const jaroWinkler = new JaroWinklerDistance();
|
|
6
8
|
|
|
9
|
+
function colorDiffLines(oldStr, newStr) {
|
|
10
|
+
const parts = diffChars(oldStr, newStr);
|
|
11
|
+
let origLine = '';
|
|
12
|
+
let fixLine = '';
|
|
13
|
+
for (const part of parts) {
|
|
14
|
+
if (part.added) {
|
|
15
|
+
fixLine += pc.green(pc.bold(part.value));
|
|
16
|
+
} else if (part.removed) {
|
|
17
|
+
origLine += pc.red(pc.strikethrough(part.value));
|
|
18
|
+
} else {
|
|
19
|
+
origLine += pc.dim(part.value);
|
|
20
|
+
fixLine += pc.dim(part.value);
|
|
21
|
+
}
|
|
22
|
+
}
|
|
23
|
+
return { origLine, fixLine };
|
|
24
|
+
}
|
|
25
|
+
|
|
7
26
|
let tokenizer = null;
|
|
8
27
|
let fkm = null;
|
|
9
28
|
|
|
@@ -183,11 +202,11 @@ const exec = (text, option = { is_tfidf: false, v: false, Vv: false, Levenshtein
|
|
|
183
202
|
return text;
|
|
184
203
|
}
|
|
185
204
|
|
|
186
|
-
log.
|
|
187
|
-
log.
|
|
188
|
-
log.
|
|
189
|
-
log.
|
|
190
|
-
log.
|
|
205
|
+
log.debug('----------------------------------');
|
|
206
|
+
log.debug('ntokens', ntokens.filter((token) => token.pos !== '記号'));
|
|
207
|
+
log.debug('----------------------------------');
|
|
208
|
+
log.debug('ptokens', ptokens);
|
|
209
|
+
log.debug('----------------------------------');
|
|
191
210
|
|
|
192
211
|
let fixTokens = organizeUnknownTokens(ntokens, option);
|
|
193
212
|
let fixedTokens = fixTokens
|
|
@@ -196,38 +215,45 @@ const exec = (text, option = { is_tfidf: false, v: false, Vv: false, Levenshtein
|
|
|
196
215
|
.map((token) => {
|
|
197
216
|
// 。で終わる文は、。を削除して修正する
|
|
198
217
|
let fixText = token.v.replace(/。$/, '');
|
|
218
|
+
const originalText = fixText;
|
|
199
219
|
if (option.is_tfidf === true) {
|
|
200
220
|
fixText = nearTokenMatch(fixText, option);
|
|
201
|
-
log.debug('fixText', fixText);
|
|
202
221
|
} else {
|
|
203
222
|
fixText = findClosestWord(fixText, fkm.allWord, option.Levenshtein, option);
|
|
204
223
|
}
|
|
224
|
+
if (originalText !== fixText) {
|
|
225
|
+
const { origLine, fixLine } = colorDiffLines(originalText, fixText);
|
|
226
|
+
log.debug('----------------------------------');
|
|
227
|
+
log.debug(origLine);
|
|
228
|
+
log.debug(fixLine);
|
|
229
|
+
log.debug('----------------------------------');
|
|
230
|
+
}
|
|
205
231
|
return { ...token, v: fixText };
|
|
206
232
|
});
|
|
207
233
|
|
|
208
234
|
let fixedTextTokens = [...ptokens, ...fixedTokens].sort((a, b) => a.i - b.i);
|
|
209
235
|
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
223
|
-
|
|
224
|
-
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
|
|
230
|
-
|
|
236
|
+
const hasChanges = fixedTextTokens.some((token) => token.old && token.old !== token.v);
|
|
237
|
+
if (hasChanges) {
|
|
238
|
+
const diffs = fixedTextTokens.map((token) => ({
|
|
239
|
+
old: token.old || token.v,
|
|
240
|
+
fixed: token.v,
|
|
241
|
+
changed: !!(token.old && token.old !== token.v),
|
|
242
|
+
}));
|
|
243
|
+
|
|
244
|
+
if (typeof option.onDiff === 'function') {
|
|
245
|
+
option.onDiff(diffs);
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
if (option.v) {
|
|
249
|
+
const oldText = diffs.map(d => d.old).join('');
|
|
250
|
+
const fixedText = diffs.map(d => d.fixed).join('');
|
|
251
|
+
const { origLine, fixLine } = colorDiffLines(oldText, fixedText);
|
|
252
|
+
console.error(origLine);
|
|
253
|
+
console.error(fixLine);
|
|
254
|
+
}
|
|
255
|
+
} else if (typeof option.onDiff === 'function') {
|
|
256
|
+
option.onDiff(null);
|
|
231
257
|
}
|
|
232
258
|
|
|
233
259
|
let fixedText = fixedTextTokens.map((token) => token.v).join('');
|
|
@@ -235,7 +261,7 @@ const exec = (text, option = { is_tfidf: false, v: false, Vv: false, Levenshtein
|
|
|
235
261
|
};
|
|
236
262
|
|
|
237
263
|
const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: false, Vv: false, Levenshtein: false }) => {
|
|
238
|
-
log.
|
|
264
|
+
log.debug('tokenStr', tokenStr);
|
|
239
265
|
|
|
240
266
|
let tokens = [...tokenStr];
|
|
241
267
|
let bestMatch = null;
|
|
@@ -248,7 +274,7 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
248
274
|
for (let i = 0; i < tokens.length; i++) {
|
|
249
275
|
let kanji = tokens[i];
|
|
250
276
|
if (fkm.han.test(kanji)) {
|
|
251
|
-
log.
|
|
277
|
+
log.debug('kanji', fkm.maxTfidfSocres(kanji));
|
|
252
278
|
|
|
253
279
|
let bestKanji = kanji;
|
|
254
280
|
let bestLocalDistance = Infinity;
|
|
@@ -265,7 +291,7 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
265
291
|
// 置き換えた後の文字列と最適なマッチの距離を計算
|
|
266
292
|
let d = calculateSimilarity(testText, bestMatchLocal, option.Levenshtein);
|
|
267
293
|
|
|
268
|
-
log.
|
|
294
|
+
log.debug({
|
|
269
295
|
'd' : d,
|
|
270
296
|
'bestLocalDistance': bestLocalDistance,
|
|
271
297
|
'testText' : testText,
|
|
@@ -318,7 +344,7 @@ const organizeUnknownTokens = (ntokens, option = { v: false, Vv: false }) => {
|
|
|
318
344
|
adverb = true;
|
|
319
345
|
}
|
|
320
346
|
|
|
321
|
-
log.
|
|
347
|
+
log.debug(
|
|
322
348
|
token.surface_form,
|
|
323
349
|
token.pos,
|
|
324
350
|
token.pos_detail_1,
|
package/.cursor/rules
DELETED
|
@@ -1,58 +0,0 @@
|
|
|
1
|
-
# yukichant プロジェクトルール
|
|
2
|
-
|
|
3
|
-
## 言語設定
|
|
4
|
-
|
|
5
|
-
**このプロジェクトでは、すべてのコミュニケーションを日本語で行ってください。**
|
|
6
|
-
|
|
7
|
-
- すべての説明・応答は日本語で記述する
|
|
8
|
-
- コミットメッセージは日本語で記述する
|
|
9
|
-
- コードコメントは日本語で記述する
|
|
10
|
-
- レビューコメントは日本語で記述する
|
|
11
|
-
- エラーメッセージの説明は日本語で記述する
|
|
12
|
-
|
|
13
|
-
## コミットメッセージフォーマット
|
|
14
|
-
|
|
15
|
-
コミットメッセージは以下のフォーマットで記述してください:
|
|
16
|
-
|
|
17
|
-
```
|
|
18
|
-
[種別] 簡潔な説明
|
|
19
|
-
```
|
|
20
|
-
|
|
21
|
-
### 種別の例
|
|
22
|
-
- `[feat]`: 新機能追加
|
|
23
|
-
- `[fix]`: バグ修正
|
|
24
|
-
- `[perf]`: パフォーマンス改善
|
|
25
|
-
- `[refactor]`: コード整理
|
|
26
|
-
- `[test]`: テスト追加/修正
|
|
27
|
-
- `[docs]`: ドキュメント更新
|
|
28
|
-
- `[deps]`: 依存関係の更新
|
|
29
|
-
- `[chore]`: その他の変更
|
|
30
|
-
|
|
31
|
-
### 例
|
|
32
|
-
```
|
|
33
|
-
[feat] Levenshtein距離アルゴリズムを追加
|
|
34
|
-
[fix] デコード時の形態素解析エラーを修正
|
|
35
|
-
[docs] 誤字修正アルゴリズムの使い方を追加
|
|
36
|
-
[refactor] 類似度計算処理を関数化
|
|
37
|
-
[test] typo-correctionのテストケースを追加
|
|
38
|
-
```
|
|
39
|
-
|
|
40
|
-
## コーディング規約
|
|
41
|
-
|
|
42
|
-
- ES Modules(`import`/`export`)を使用
|
|
43
|
-
- 非同期処理は`async`/`await`を使用
|
|
44
|
-
- 関数型プログラミングスタイル(`map`, `filter`, `reduce`を活用)
|
|
45
|
-
- Unicode正規表現(`\p{scx=Han}`など)を積極的に使用
|
|
46
|
-
|
|
47
|
-
## プロジェクト概要
|
|
48
|
-
|
|
49
|
-
yukichantは、テキストを日本語の詠唱呪文に変換し、元に戻すことができるNode.js製CLIツールです。
|
|
50
|
-
|
|
51
|
-
### 主要コンポーネント
|
|
52
|
-
- `src/index.js`: エンコード/デコードのコアロジック
|
|
53
|
-
- `src/typo-correction.js`: 誤字修正機能(Jaro-Winkler / Levenshtein)
|
|
54
|
-
- `src/machine-encrypt.js`: ローター型暗号実装
|
|
55
|
-
- `data/meisi.json`, `data/dousi.json`: 名詞・動詞辞書
|
|
56
|
-
|
|
57
|
-
詳細は `AGENTS.md` を参照してください。
|
|
58
|
-
|
package/.husky/commit-msg
DELETED
package/.releaserc.json
DELETED
|
@@ -1,25 +0,0 @@
|
|
|
1
|
-
{
|
|
2
|
-
"branches": ["master"],
|
|
3
|
-
"plugins": [
|
|
4
|
-
["@semantic-release/commit-analyzer", {
|
|
5
|
-
"preset": "conventionalcommits",
|
|
6
|
-
"releaseRules": [
|
|
7
|
-
{ "type": "feat", "release": "minor" },
|
|
8
|
-
{ "type": "fix", "release": "patch" },
|
|
9
|
-
{ "type": "perf", "release": "patch" },
|
|
10
|
-
{ "type": "refactor", "release": "patch" },
|
|
11
|
-
{ "type": "revert", "release": "patch" }
|
|
12
|
-
]
|
|
13
|
-
}],
|
|
14
|
-
["@semantic-release/release-notes-generator", {
|
|
15
|
-
"preset": "conventionalcommits"
|
|
16
|
-
}],
|
|
17
|
-
"@semantic-release/changelog",
|
|
18
|
-
"@semantic-release/npm",
|
|
19
|
-
["@semantic-release/git", {
|
|
20
|
-
"assets": ["package.json", "CHANGELOG.md"],
|
|
21
|
-
"message": "[release] ${nextRelease.version}"
|
|
22
|
-
}],
|
|
23
|
-
"@semantic-release/github"
|
|
24
|
-
]
|
|
25
|
-
}
|