yukichant 6.0.0 → 6.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +4 -0
- package/package.json +2 -2
- package/src/jaro-winkler.js +1 -1
- package/src/typo-correction.js +74 -110
package/README.md
CHANGED
|
@@ -6,6 +6,10 @@
|
|
|
6
6
|
yukichantは、テキストデータを詠唱呪文(魔法の言葉)に変換するコマンドです。
|
|
7
7
|
また変換した詠唱呪文は、元のテキストデータにデコードすることができます。
|
|
8
8
|
|
|
9
|
+
## Demo
|
|
10
|
+
|
|
11
|
+
- ブラウザで試せるデモページ: [https://amanoese.github.io/yukichant/](https://amanoese.github.io/yukichant/)
|
|
12
|
+
|
|
9
13
|
## 特徴
|
|
10
14
|
|
|
11
15
|
- **エンコード/デコード**: 任意のテキストを日本語の呪文風文章に変換し、復号できます。
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "yukichant",
|
|
3
|
-
"version": "6.0.0",
|
|
3
|
+
"version": "6.0.2",
|
|
4
4
|
"description": "",
|
|
5
5
|
"license": "Apache-2.0",
|
|
6
6
|
"repository": {
|
|
@@ -46,7 +46,7 @@
|
|
|
46
46
|
"@semantic-release/github": "^12.0.6",
|
|
47
47
|
"conventional-changelog-conventionalcommits": "^9.1.0",
|
|
48
48
|
"husky": "^9.1.7",
|
|
49
|
-
"jest": "^
|
|
49
|
+
"jest": "^30.3.0",
|
|
50
50
|
"jest-extended": "^1.2.0",
|
|
51
51
|
"semantic-release": "^25.0.3"
|
|
52
52
|
},
|
package/src/jaro-winkler.js
CHANGED
|
@@ -3,7 +3,7 @@
|
|
|
3
3
|
* 文字列の類似度を計算するアルゴリズム。値が1に近いほど類似していることを示す。
|
|
4
4
|
*/
|
|
5
5
|
export class JaroWinklerDistance {
|
|
6
|
-
constructor(prefixScale = 0.
|
|
6
|
+
constructor(prefixScale = 0.15, boostThreshold = 0.6, prefixLength = 5) {
|
|
7
7
|
this.prefixScale = prefixScale; // プレフィックス重み
|
|
8
8
|
this.boostThreshold = boostThreshold; // ブースト閾値
|
|
9
9
|
this.prefixLength = prefixLength; // プレフィックス最大長
|
package/src/typo-correction.js
CHANGED
|
@@ -70,114 +70,74 @@ function findClosestWord(word, wordList, useLevenshtein = false, option = { v: f
|
|
|
70
70
|
} else {
|
|
71
71
|
// Jaro-Winkler距離を使用
|
|
72
72
|
algorithmName = 'Jaro-Winkler';
|
|
73
|
-
|
|
74
|
-
|
|
75
|
-
|
|
76
|
-
|
|
77
|
-
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
81
|
-
|
|
82
|
-
|
|
83
|
-
score = maxSimilarity;
|
|
84
|
-
|
|
85
|
-
// Jaro-Winkler距離が低い場合(0.7未満)、Levenshtein距離も考慮する
|
|
86
|
-
// ただし、Jaro-Winklerの類似度が高い場合はそれを優先
|
|
87
|
-
if (maxSimilarity < 0.7) {
|
|
88
|
-
let minLevenshteinDistance = Infinity;
|
|
89
|
-
let bestLevenshteinWord = closestWord;
|
|
90
|
-
let bestLengthDiff = Infinity;
|
|
91
|
-
let bestPrefixMatch = -1;
|
|
92
|
-
|
|
93
|
-
for (const candidateWord of wordList) {
|
|
94
|
-
const levDistance = distance(word, candidateWord);
|
|
95
|
-
const lengthDiff = Math.abs(word.length - candidateWord.length);
|
|
96
|
-
|
|
97
|
-
// 先頭文字の一致数をカウント
|
|
98
|
-
let prefixMatch = 0;
|
|
99
|
-
for (let i = 0; i < Math.min(word.length, candidateWord.length); i++) {
|
|
100
|
-
if (word[i] === candidateWord[i]) {
|
|
101
|
-
prefixMatch++;
|
|
102
|
-
} else {
|
|
103
|
-
break;
|
|
104
|
-
}
|
|
73
|
+
// 1回目のスコアリング: 現在のJaro-Winkler設定で全候補を評価
|
|
74
|
+
const jaroScored = wordList
|
|
75
|
+
.map((candidateWord, index) => ({
|
|
76
|
+
index,
|
|
77
|
+
candidateWord,
|
|
78
|
+
similarity: jaroWinkler.similarity(word, candidateWord),
|
|
79
|
+
}))
|
|
80
|
+
.sort((a, b) => {
|
|
81
|
+
if (b.similarity !== a.similarity) {
|
|
82
|
+
return b.similarity - a.similarity;
|
|
105
83
|
}
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
|
|
117
|
-
|
|
118
|
-
|
|
119
|
-
|
|
120
|
-
|
|
121
|
-
|
|
122
|
-
|
|
123
|
-
|
|
124
|
-
|
|
125
|
-
|
|
126
|
-
|
|
127
|
-
|
|
128
|
-
|
|
129
|
-
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
134
|
-
|
|
135
|
-
|
|
136
|
-
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
84
|
+
return a.index - b.index;
|
|
85
|
+
});
|
|
86
|
+
|
|
87
|
+
const bestJaro = jaroScored[0];
|
|
88
|
+
const secondJaro = jaroScored[1];
|
|
89
|
+
const gapThreshold =
|
|
90
|
+
typeof option.jaroGapThreshold === 'number'
|
|
91
|
+
? option.jaroGapThreshold
|
|
92
|
+
: typeof option.jaroLevenshteinGapThreshold === 'number'
|
|
93
|
+
? option.jaroLevenshteinGapThreshold
|
|
94
|
+
: 0.02;
|
|
95
|
+
|
|
96
|
+
closestWord = bestJaro?.candidateWord ?? word;
|
|
97
|
+
score = bestJaro?.similarity ?? 0;
|
|
98
|
+
|
|
99
|
+
// 上位差が小さいときだけ「接頭辞一致を強めたJaro-Winkler」で再判定する
|
|
100
|
+
if (secondJaro && bestJaro.similarity - secondJaro.similarity <= gapThreshold) {
|
|
101
|
+
// 1位との差が閾値以内の候補だけを曖昧候補として再比較対象にする
|
|
102
|
+
const ambiguousCandidates = jaroScored
|
|
103
|
+
.filter((item) => bestJaro.similarity - item.similarity <= gapThreshold)
|
|
104
|
+
.map((item) => ({
|
|
105
|
+
index: item.index,
|
|
106
|
+
candidateWord: item.candidateWord,
|
|
107
|
+
baseSimilarity: item.similarity,
|
|
108
|
+
}));
|
|
109
|
+
|
|
110
|
+
const boostedPrefixScale =
|
|
111
|
+
typeof option.jaroBoostedPrefixScale === 'number'
|
|
112
|
+
? option.jaroBoostedPrefixScale
|
|
113
|
+
: 0.3;
|
|
114
|
+
const boostedJaroWinkler = new JaroWinklerDistance(
|
|
115
|
+
boostedPrefixScale,
|
|
116
|
+
jaroWinkler.boostThreshold,
|
|
117
|
+
jaroWinkler.prefixLength
|
|
118
|
+
);
|
|
119
|
+
|
|
120
|
+
// 2回目のスコアリング:
|
|
121
|
+
// 1) boostedSimilarity 2) baseSimilarity 3) 元の候補順 の順で決着
|
|
122
|
+
const boostedScored = ambiguousCandidates
|
|
123
|
+
.map((item) => ({
|
|
124
|
+
...item,
|
|
125
|
+
boostedSimilarity: boostedJaroWinkler.similarity(word, item.candidateWord),
|
|
126
|
+
}))
|
|
127
|
+
.sort((a, b) => {
|
|
128
|
+
if (b.boostedSimilarity !== a.boostedSimilarity) {
|
|
129
|
+
return b.boostedSimilarity - a.boostedSimilarity;
|
|
147
130
|
}
|
|
148
|
-
|
|
149
|
-
|
|
150
|
-
if (prefixMatch !== bestPrefixMatch) {
|
|
151
|
-
return false;
|
|
131
|
+
if (b.baseSimilarity !== a.baseSimilarity) {
|
|
132
|
+
return b.baseSimilarity - a.baseSimilarity;
|
|
152
133
|
}
|
|
153
|
-
|
|
154
|
-
|
|
155
|
-
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
bestLevenshteinWord = candidateWord;
|
|
161
|
-
bestLengthDiff = lengthDiff;
|
|
162
|
-
bestPrefixMatch = prefixMatch;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
// Jaro-Winklerの類似度が低い場合(0.5未満)は、Levenshtein距離の結果を優先
|
|
167
|
-
// それ以外の場合は、Jaro-Winklerの結果を優先
|
|
168
|
-
if (maxSimilarity < 0.5 || minLevenshteinDistance <= Math.max(2, word.length / 2)) {
|
|
169
|
-
// Jaro-Winklerの結果とLevenshteinの結果を比較
|
|
170
|
-
const jaroBestMatch = closestWord;
|
|
171
|
-
const jaroDistance = distance(word, jaroBestMatch);
|
|
172
|
-
|
|
173
|
-
// Levenshtein距離がより良い結果を提供する場合、それを使用する
|
|
174
|
-
if (minLevenshteinDistance < jaroDistance ||
|
|
175
|
-
(minLevenshteinDistance === jaroDistance && bestLevenshteinWord.length < jaroBestMatch.length)) {
|
|
176
|
-
closestWord = bestLevenshteinWord;
|
|
177
|
-
algorithmName = 'Jaro-Winkler+Levenshtein';
|
|
178
|
-
score = minLevenshteinDistance;
|
|
179
|
-
}
|
|
180
|
-
}
|
|
134
|
+
return a.index - b.index;
|
|
135
|
+
});
|
|
136
|
+
|
|
137
|
+
// 同点の最終タイブレークは「先に現れた候補」を採用する
|
|
138
|
+
closestWord = boostedScored[0]?.candidateWord ?? closestWord;
|
|
139
|
+
score = boostedScored[0]?.boostedSimilarity ?? score;
|
|
140
|
+
algorithmName = 'Jaro-Winkler(boosted)';
|
|
181
141
|
}
|
|
182
142
|
}
|
|
183
143
|
|
|
@@ -264,6 +224,8 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
264
224
|
log.debug('tokenStr', tokenStr);
|
|
265
225
|
|
|
266
226
|
let tokens = [...tokenStr];
|
|
227
|
+
// bestMatch: 現時点での全体最良候補(文字列)
|
|
228
|
+
// bestDistance: bestMatchと入力の距離(小さいほど良い)
|
|
267
229
|
let bestMatch = null;
|
|
268
230
|
let bestDistance = Infinity;
|
|
269
231
|
|
|
@@ -279,7 +241,7 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
279
241
|
let bestKanji = kanji;
|
|
280
242
|
let bestLocalDistance = Infinity;
|
|
281
243
|
|
|
282
|
-
//
|
|
244
|
+
// この位置だけを差し替えた候補文字列を作り、局所的に最良の漢字を選ぶ
|
|
283
245
|
for (const result of fkm.maxTfidfSocres(kanji)) {
|
|
284
246
|
let newKanji = result.kanji;
|
|
285
247
|
let testTokens = [...currentTokens];
|
|
@@ -307,10 +269,10 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
307
269
|
}
|
|
308
270
|
}
|
|
309
271
|
|
|
310
|
-
//
|
|
272
|
+
// 局所最適だった漢字を採用して、次の位置の探索に引き継ぐ
|
|
311
273
|
currentTokens[i] = bestKanji;
|
|
312
274
|
|
|
313
|
-
//
|
|
275
|
+
// 採用後の全文字列で再評価し、グローバル最良候補を更新する
|
|
314
276
|
let currentText = currentTokens.join('');
|
|
315
277
|
let currentBestMatch = findClosestWord(currentText, fkm.allWord, option.Levenshtein, option);
|
|
316
278
|
let currentDistance = calculateSimilarity(currentText, currentBestMatch, option.Levenshtein);
|
|
@@ -322,12 +284,12 @@ const nearTokenMatch = (tokenStr, option = { isJaroWinklerDistance: false, v: fa
|
|
|
322
284
|
}
|
|
323
285
|
}
|
|
324
286
|
|
|
325
|
-
//
|
|
287
|
+
// 逐次探索で最良候補が得られていれば、それを最終結果として返す
|
|
326
288
|
if (bestMatch !== null) {
|
|
327
289
|
return bestMatch;
|
|
328
290
|
}
|
|
329
291
|
|
|
330
|
-
//
|
|
292
|
+
// 漢字置換が1回も走らなかった場合のフォールバック(元文字列を直接マッチ)
|
|
331
293
|
return findClosestWord(tokens.join(''), fkm.allWord, option.Levenshtein, option);
|
|
332
294
|
};
|
|
333
295
|
|
|
@@ -356,6 +318,8 @@ const organizeUnknownTokens = (ntokens, option = { v: false, Vv: false }) => {
|
|
|
356
318
|
if (
|
|
357
319
|
list.length === 0 ||
|
|
358
320
|
(last.adverb === true && adverb === false) ||
|
|
321
|
+
// 動詞の直後に名詞が来た場合は連結しない
|
|
322
|
+
(last.pos === '動詞' && token.pos === '名詞') ||
|
|
359
323
|
(last.i + last.v.length !== token.word_position) ||
|
|
360
324
|
((/^[\p{scx=Han}]+$/u).test(token.pos) &&
|
|
361
325
|
last.length >= 2 &&
|