kanabarum 0.2.0 → 0.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +57 -53
- package/dist/index.cjs +940 -0
- package/dist/index.cjs.map +1 -0
- package/dist/index.d.cts +34 -0
- package/dist/index.d.ts +30 -5
- package/dist/index.js +36 -7
- package/dist/index.js.map +1 -1
- package/package.json +4 -2
package/dist/index.cjs
ADDED
|
@@ -0,0 +1,940 @@
|
|
|
1
|
+
'use strict';
|
|
2
|
+
|
|
3
|
+
var kuromoji = require('kuromoji');
|
|
4
|
+
var path = require('path');
|
|
5
|
+
var module$1 = require('module');
|
|
6
|
+
|
|
7
|
+
var _documentCurrentScript = typeof document !== 'undefined' ? document.currentScript : null;
|
|
8
|
+
/**
 * Bundler interop helper: hands back real ES-module namespaces untouched and
 * wraps every other value so it can be read through a `.default` property.
 *
 * @param {*} e - Value returned by `require`.
 * @returns {*} `e` itself when it is flagged `__esModule`, otherwise `{ default: e }`.
 */
function _interopDefault (e) {
  if (e && e.__esModule) {
    return e;
  }
  return { default: e };
}
|
|
9
|
+
|
|
10
|
+
var kuromoji__default = /*#__PURE__*/_interopDefault(kuromoji);
|
|
11
|
+
var path__default = /*#__PURE__*/_interopDefault(path);
|
|
12
|
+
|
|
13
|
+
// src/normalizer.ts
|
|
14
|
+
/**
 * Canonicalises raw input text before kana processing.
 *
 * Steps, in order:
 *  1. NFC-compose the string.
 *  2. Widen runs of halfwidth katakana (see normalizeHalfwidthKatakanaOnly).
 *  3. Turn U+2015 and U+2500 into the long-vowel mark U+30FC.
 *  4. Turn an ASCII hyphen into U+30FC when it directly follows, or directly
 *     precedes, a character in the katakana block U+30A0–U+30FF.
 *
 * @param {string} input - Raw text.
 * @returns {string} The normalised text.
 */
function normalizeInputText(input) {
  const composed = input.normalize("NFC");
  const widened = normalizeHalfwidthKatakanaOnly(composed);
  const barsFixed = widened.replace(/[\u2015\u2500]/g, "\u30FC");
  // Hyphen immediately after a katakana-block character.
  const trailingFixed = barsFixed.replace(/(?<=([\u30A0-\u30FF]))-/g, "\u30FC");
  // Hyphen immediately before a katakana-block character.
  return trailingFixed.replace(/-(?=[\u30A0-\u30FF])/g, "\u30FC");
}
|
|
21
|
+
/**
 * Applies NFKC normalisation only to runs of halfwidth katakana
 * (U+FF66–U+FF9F), leaving every other character exactly as written.
 *
 * @param {string} s - Text that may contain halfwidth katakana.
 * @returns {string} Text with those runs NFKC-normalised.
 */
function normalizeHalfwidthKatakanaOnly(s) {
  const halfwidthRun = /[\uFF66-\uFF9F\uFF70]+/g;
  const widen = (run) => run.normalize("NFKC");
  return s.replace(halfwidthRun, widen);
}
|
|
27
|
+
/**
 * Converts katakana letters U+30A1–U+30F6 to the hiragana letter 0x60 code
 * points below them. Everything else, including the long-vowel mark U+30FC,
 * is copied through unchanged.
 *
 * @param {string} input - Text to convert.
 * @returns {string} Text with katakana letters replaced by hiragana.
 */
function toHiragana(input) {
  const KATAKANA_FIRST = 0x30a1;
  const KATAKANA_LAST = 0x30f6;
  const BLOCK_OFFSET = 0x60;
  return Array.from(input, (ch) => {
    const cp = ch.codePointAt(0);
    return cp >= KATAKANA_FIRST && cp <= KATAKANA_LAST
      ? String.fromCodePoint(cp - BLOCK_OFFSET)
      : ch;
  }).join("");
}
|
|
43
|
+
|
|
44
|
+
// src/dictionary.ts
|
|
45
|
+
/**
 * Hand-maintained overrides: each entry maps a kana `word` to a fixed Hangul
 * `answer`. The optional `hira` / `kata` flags request extra lookup keys for
 * the word's hiragana / katakana spelling (see compileSpecialDictionary).
 *
 * Rows are [word, answer, flags?]; a previously disabled sample row was
 * ["とうきょう", "도쿄"].
 */
var SpecialDictionary = [
  ["\u3053\u3093\u306B\u3061\u306F", "\uACE4\uB2C8\uCE58\uC640", { hira: true, kata: true }],
  ["\u3053\u3093\u3070\u3093\u306F", "\uACF0\uBC29\uC640"],
  ["\u3059\u307F\u307E\u305B\u3093", "\uC2A4\uBBF8\uB9C8\uC14D"],
  ["\u306F\u3072\u3075\u3078\u307B", "\uD558\uD788\uD6C4\uD5E4\uD638"],
  ["\u304B\u308F\u3044\u3044", "\uCE74\uC640\uC774"],
  ["\u3064\u306A\u307F", "\uC4F0\uB098\uBBF8"],
  ["\u3086\u3046\u308A", "\uC720\uC6B0\uB9AC"],
  ["\u30DF\u30E5\u30FC\u30B8\u30C3\u30AF", "\uBBA4\uC9C0\uCFE0"],
  ["\u3061\u3083\u3093", "\uCA29"]
].map(([word, answer, flags]) => ({ word, answer, ...flags }));
|
|
57
|
+
|
|
58
|
+
// src/particleRewriter.ts
|
|
59
|
+
/**
 * Tells whether the first code point of `ch` lies in the katakana block
 * U+30A0–U+30FF (which includes the long-vowel mark U+30FC).
 *
 * @param {string} ch - A single character.
 * @returns {boolean} True when the code point is inside the block.
 */
function isKatakanaChar(ch) {
  const BLOCK_START = 0x30a0;
  const BLOCK_END = 0x30ff;
  const cp = ch.codePointAt(0);
  return cp >= BLOCK_START && cp <= BLOCK_END;
}
|
|
63
|
+
/**
 * Reports whether `s` contains at least one code point from
 * U+4E00–U+9FFF or U+3400–U+4DBF.
 *
 * @param {string} s - Text to scan.
 * @returns {boolean} True as soon as one such code point is found.
 */
function containsKanji(s) {
  return Array.from(s).some((ch) => {
    const cp = ch.codePointAt(0);
    const inMainRange = cp >= 0x4e00 && cp <= 0x9fff;
    const inExtensionRange = cp >= 0x3400 && cp <= 0x4dbf;
    return inMainRange || inExtensionRange;
  });
}
|
|
72
|
+
/**
 * Builds a hiragana lookup form of `s`: NFKC-normalises it, drops every
 * long-vowel mark U+30FC, and shifts katakana letters U+30A1–U+30F6 down by
 * 0x60 to their hiragana counterparts. Other characters pass through.
 *
 * @param {string} s - Source text.
 * @returns {string} The hiragana form with long-vowel marks removed.
 */
function toHiragana2(s) {
  let result = "";
  for (const ch of s.normalize("NFKC")) {
    if (ch === "\u30FC") {
      continue;
    }
    const cp = ch.codePointAt(0);
    result += cp >= 0x30a1 && cp <= 0x30f6 ? String.fromCodePoint(cp - 0x60) : ch;
  }
  return result;
}
|
|
86
|
+
/**
 * Collects the distinct, non-empty keys under which dictionary words may
 * appear: every entry's `word`, plus its toHiragana2 form when the entry
 * sets `hira`. First-seen order is preserved.
 *
 * @param {Iterable<{word: string, hira?: boolean}>} dict - Dictionary entries.
 * @returns {string[]} De-duplicated keys.
 */
function dictKeysForHiraganaText(dict) {
  const seen = new Set();
  for (const entry of dict) {
    seen.add(entry.word);
    if (entry.hira) {
      seen.add(toHiragana2(entry.word));
    }
  }
  return Array.from(seen).filter((key) => Boolean(key));
}
|
|
95
|
+
/**
 * Checks whether two half-open ranges `[start, end)` share any position.
 *
 * @param {{start: number, end: number}} a - First range.
 * @param {{start: number, end: number}} b - Second range.
 * @returns {boolean} True when the ranges intersect.
 */
function rangesOverlap(a, b) {
  const aBeginsBeforeBEnds = a.start < b.end;
  const bBeginsBeforeAEnds = b.start < a.end;
  return aBeginsBeforeBEnds && bBeginsBeforeAEnds;
}
|
|
98
|
+
/**
 * Finds every occurrence of the given keys in `text` and returns the
 * non-overlapping spans they cover, longest keys taking priority.
 *
 * Offsets are expressed in code points, not UTF-16 code units, so they line
 * up with callers that index `Array.from(text)`. Reporting `indexOf` offsets
 * directly would shift every range that follows an astral character such as
 * an emoji. For text without astral characters both agree.
 *
 * @param {string} text - Text to scan.
 * @param {Iterable<string>} keys - Candidate substrings; duplicates and empty
 *   values are ignored.
 * @returns {Array<{start: number, end: number}>} Half-open code-point ranges,
 *   sorted by `start`.
 */
function buildProtectedRanges(text, keys) {
  // cpIndexAt[u] = index of the code point that contains UTF-16 unit u.
  const cpIndexAt = new Array(text.length + 1);
  let cpCount = 0;
  for (let unit = 0; unit < text.length; cpCount += 1) {
    const width = text.codePointAt(unit) > 0xffff ? 2 : 1;
    cpIndexAt.fill(cpCount, unit, unit + width);
    unit += width;
  }
  cpIndexAt[text.length] = cpCount;

  // Longest keys claim their spans first; the sort is stable, so equally long
  // keys keep their first-seen order.
  const longestFirst = [...new Set(keys)]
    .filter(Boolean)
    .sort((a, b) => b.length - a.length);

  const ranges = [];
  for (const key of longestFirst) {
    for (let idx = text.indexOf(key); idx !== -1; idx = text.indexOf(key, idx + 1)) {
      const cand = { start: cpIndexAt[idx], end: cpIndexAt[idx + key.length] };
      const clashes = ranges.some((r) => r.start < cand.end && cand.start < r.end);
      if (!clashes) {
        ranges.push(cand);
      }
    }
  }
  ranges.sort((a, b) => a.start - b.start);
  return ranges;
}
|
|
119
|
+
/**
 * Tells whether the half-open span `[start, end)` intersects any of the
 * protected ranges.
 *
 * @param {Array<{start: number, end: number}>} protectedRanges - Ranges to test against.
 * @param {number} start - Inclusive start of the span.
 * @param {number} end - Exclusive end of the span.
 * @returns {boolean} True when at least one range overlaps the span.
 */
function isProtectedSpan(protectedRanges, start, end) {
  return protectedRanges.some((range) => start < range.end && end > range.start);
}
|
|
126
|
+
// Surface forms treated as hard boundaries: ideographic full stop and comma,
// fullwidth and ASCII "!" / "?", ASCII space, and ideographic space.
// A Set built from a string holds one entry per character.
var HARD_BOUNDARY_SURF = /* @__PURE__ */ new Set("\u3002\u3001\uFF01\uFF1F!? \u3000");
|
|
136
|
+
// `pos_detail_1` values that mark a symbol token as a hard boundary
// (句点, 読点, 括弧開, 括弧閉, 空白).
var HARD_BOUNDARY_DETAIL1 = /* @__PURE__ */ new Set(
  "\u53E5\u70B9,\u8AAD\u70B9,\u62EC\u5F27\u958B,\u62EC\u5F27\u9589,\u7A7A\u767D".split(",")
);
|
|
143
|
+
// Endings in which a trailing へ is kept as written rather than respelled as
// え. Each listed stem is completed with the final へ (U+3078).
var LEXICAL_HE_ENDINGS = [
  "\u3044\u306B\u3057",
  "\u304A\u304D",
  "\u3082\u3068",
  "\u3059\u3048",
  "\u3059\u3091",
  "\u304B\u307F",
  "\u304F\u306B",
  "\u304D\u3057"
].map((stem) => `${stem}\u3078`);
|
|
153
|
+
/**
 * A token is a hard boundary when its `pos` is 記号 and either its surface
 * form is in HARD_BOUNDARY_SURF or its `pos_detail_1` is in
 * HARD_BOUNDARY_DETAIL1.
 *
 * @param {{pos: string, surface_form: string, pos_detail_1?: string}} t - Tokenizer token.
 * @returns {boolean} True for hard-boundary symbol tokens.
 */
function isHardBoundaryToken(t) {
  const isSymbol = t.pos === "\u8A18\u53F7";
  if (!isSymbol) {
    return false;
  }
  return HARD_BOUNDARY_SURF.has(t.surface_form) || HARD_BOUNDARY_DETAIL1.has(t.pos_detail_1 ?? "");
}
|
|
160
|
+
/**
 * Every token counts as content except symbol (記号) tokens that are hard
 * boundaries.
 *
 * @param {{pos: string}} t - Tokenizer token.
 * @returns {boolean} True when the token is content.
 */
function isContentToken(t) {
  return t.pos !== "\u8A18\u53F7" || !isHardBoundaryToken(t);
}
|
|
165
|
+
/**
 * Scans backwards from just before index `i` for the nearest content token.
 *
 * @param {Array<object>} tokens - Tokenizer tokens.
 * @param {number} i - Index to search before.
 * @returns {number} Index of the nearest preceding content token, or -1.
 */
function prevContentIdx(tokens, i) {
  let j = i - 1;
  while (j >= 0 && !isContentToken(tokens[j])) {
    j -= 1;
  }
  return j >= 0 ? j : -1;
}
|
|
171
|
+
/**
 * Scans forwards from just after index `i` for the nearest content token.
 *
 * @param {Array<object>} tokens - Tokenizer tokens.
 * @param {number} i - Index to search after.
 * @returns {number} Index of the nearest following content token, or -1.
 */
function nextContentIdx(tokens, i) {
  let j = i + 1;
  while (j < tokens.length && !isContentToken(tokens[j])) {
    j += 1;
  }
  return j < tokens.length ? j : -1;
}
|
|
177
|
+
/**
 * Tells whether everything after index `i` is a hard-boundary token, which
 * includes the case where nothing follows at all.
 *
 * @param {Array<object>} tokens - Tokenizer tokens.
 * @param {number} i - Index of the token being examined.
 * @returns {boolean} False as soon as a non-boundary token follows.
 */
function nextBoundaryOrEnd(tokens, i) {
  let j = i + 1;
  while (j < tokens.length && isHardBoundaryToken(tokens[j])) {
    j += 1;
  }
  return j >= tokens.length;
}
|
|
185
|
+
/**
 * Rebuilds the text token by token in hiragana, respelling the particles
 * は and へ as わ and え where the rules below say they are read that way.
 *
 * Per token:
 *  - If the surface contains kanji and the token has a `pronunciation`, the
 *    hiragana form of the pronunciation is emitted; otherwise the matching
 *    slice of `hiraganaText` is emitted.
 *  - Tokens overlapping a special-dictionary match are emitted untouched.
 *  - A 助詞 token "は" becomes "わ" when it is a 係助詞, is not adjacent to
 *    another "は" token, and the previous content token is not itself a
 *    particle and its hiragana contains no "っ".
 *  - A 助詞 token "へ" becomes "え" when it is a 格助詞, is not at the start
 *    of the text or right after whitespace, and the previous content token
 *    neither ends in "の" nor completes one of LEXICAL_HE_ENDINGS.
 *
 * All offsets are code-point offsets. `hiraganaText` is sliced with offsets
 * taken from `originalText`, so the two are assumed to be aligned code point
 * for code point (NOTE(review): confirm against the caller).
 *
 * @param {string} originalText - Text that was tokenized.
 * @param {string} hiraganaText - Hiragana rendering of `originalText`.
 * @param {Array<object>} tokenizerTokens - Tokens exposing `surface_form`,
 *   `pos`, `pos_detail_1..3`, `pronunciation` and, optionally, a 1-based
 *   numeric `word_position`.
 * @returns {{rewritten: string, spans: Array<object>, rewrittenOriginal: string}}
 *   `rewritten` is the hiragana output, `spans` describes each token's slice
 *   of `rewritten` (code-point offsets plus part-of-speech fields), and
 *   `rewrittenOriginal` is the parallel output built from the original text.
 */
function rewriteParticlesFromTokenization(originalText, hiraganaText, tokenizerTokens) {
  const hiraChars = Array.from(hiraganaText);
  const originChars = Array.from(originalText);
  const protectedRanges = buildProtectedRanges(
    hiraganaText,
    dictKeysForHiraganaText(SpecialDictionary)
  );
  const SPACE_LIKE = new Set([" ", "\u3000", "\t", "\n", "\r"]);

  // Resolve each token's [start, end) once. Tokens without a numeric
  // word_position continue from where the previous token ended, and the same
  // offsets are reused when a later token looks back at this one.
  const bounds = [];
  let cursorInText = 0;
  for (const tok of tokenizerTokens) {
    const wp = tok.word_position;
    const start = typeof wp === "number" ? wp - 1 : cursorInText;
    const end = start + [...tok.surface_form].length;
    bounds.push({ start, end });
    cursorInText = end;
  }

  const hiraSliceOf = (idx) => hiraChars.slice(bounds[idx].start, bounds[idx].end).join("");
  const isHaAt = (idx) =>
    idx >= 0 && idx < tokenizerTokens.length && tokenizerTokens[idx].surface_form === "\u306F";

  // Decides whether the particle は at token index i is read "wa".
  function readsAsWa(i) {
    if (isHaAt(i - 1) || isHaAt(i + 1)) {
      return false;
    }
    const prevIdx = prevContentIdx(tokenizerTokens, i);
    if (prevIdx < 0) {
      return false;
    }
    const hasNextContent = nextContentIdx(tokenizerTokens, i) >= 0;
    const isEndOrPunct = nextBoundaryOrEnd(tokenizerTokens, i);
    return (
      !hiraSliceOf(prevIdx).includes("\u3063") &&
      tokenizerTokens[i].pos_detail_1 === "\u4FC2\u52A9\u8A5E" &&
      (hasNextContent || isEndOrPunct) &&
      tokenizerTokens[prevIdx].pos !== "\u52A9\u8A5E"
    );
  }

  // Decides whether the particle へ at token index i is read "e".
  function readsAsE(i) {
    const { start } = bounds[i];
    if (!(start > 0)) {
      return false;
    }
    if (SPACE_LIKE.has(hiraChars[start - 1])) {
      return false;
    }
    const prevIdx = prevContentIdx(tokenizerTokens, i);
    if (prevIdx < 0) {
      return false;
    }
    const prevHiraSurf = hiraSliceOf(prevIdx);
    if (LEXICAL_HE_ENDINGS.some((w) => `${prevHiraSurf}\u3078`.endsWith(w))) {
      return false;
    }
    if (prevHiraSurf.endsWith("\u306E")) {
      return false;
    }
    return tokenizerTokens[i].pos_detail_1 === "\u683C\u52A9\u8A5E";
  }

  let out = "";
  let origOut = "";
  let outCpLen = 0; // running code-point length of `out`
  const spans = [];
  for (let i = 0; i < tokenizerTokens.length; i += 1) {
    const tok = tokenizerTokens[i];
    const { start, end } = bounds[i];
    const originSurf = originChars.slice(start, end).join("");
    const useReading = containsKanji(tok.surface_form) && tok.pronunciation;
    const hiraSurf = useReading
      ? toHiragana2(tok.pronunciation)
      : hiraChars.slice(start, end).join("");
    const origSurfForOut = useReading ? tok.pronunciation.replace(/ー/g, "") : originSurf;

    let replaced = hiraSurf;
    const isFreeParticle =
      !isProtectedSpan(protectedRanges, start, end) && tok.pos === "\u52A9\u8A5E";
    if (isFreeParticle) {
      if (hiraSurf === "\u306F" && readsAsWa(i)) {
        replaced = "\u308F";
      } else if (hiraSurf === "\u3078" && readsAsE(i)) {
        replaced = "\u3048";
      }
    }

    const spanStart = outCpLen;
    out += replaced;
    origOut += origSurfForOut;
    outCpLen += [...replaced].length;
    spans.push({
      start: spanStart,
      end: outCpLen,
      surface: replaced,
      pos: tok.pos,
      pos1: tok.pos_detail_1 ?? void 0,
      pos2: tok.pos_detail_2 ?? void 0,
      pos3: tok.pos_detail_3 ?? void 0,
      originHadKatakana: /[\u30A0-\u30FF]/.test(originSurf)
    });
  }
  return { rewritten: out, spans, rewrittenOriginal: origOut };
}
|
|
278
|
+
/**
 * Tokenizes `originalText` and runs the particle rewrite on the result.
 *
 * @param {string} originalText - Text to tokenize.
 * @param {string} hiraganaText - Hiragana rendering of `originalText`.
 * @param {{tokenize: function(string): Array<object>}} tokenizer - Object
 *   providing a `tokenize` method.
 * @returns {{rewritten: string, spans: Array<object>, rewrittenOriginal: string, rawTokens: Array<object>}}
 *   The rewrite result together with the raw tokenizer output.
 */
function tokenizeAndRewriteParticles(originalText, hiraganaText, tokenizer) {
  const rawTokens = tokenizer.tokenize(originalText);
  const result = rewriteParticlesFromTokenization(originalText, hiraganaText, rawTokens);
  return {
    rewritten: result.rewritten,
    spans: result.spans,
    rewrittenOriginal: result.rewrittenOriginal,
    rawTokens
  };
}
|
|
283
|
+
|
|
284
|
+
// src/mora.ts
|
|
285
|
+
/**
 * Single-kana mora table: hiragana character -> { out, vowelMain, consClass }.
 * Entries in the vowel row additionally carry `vowelOnly: true`.
 *
 * Each row is [consClass, kana, vowels, hangul]; the three strings are read
 * position by position, so they must stay the same length.
 */
var SINGLE = (() => {
  const ROWS = [
    ["vowel", "\u3042\u3044\u3046\u3048\u304A", "aiueo", "\uC544\uC774\uC6B0\uC5D0\uC624"],
    ["k", "\u304B\u304D\u304F\u3051\u3053", "aiueo", "\uCE74\uD0A4\uCFE0\uCF00\uCF54"],
    ["s", "\u3055\u3057\u3059\u305B\u305D", "aiueo", "\uC0AC\uC2DC\uC2A4\uC138\uC18C"],
    ["t", "\u305F\u3061\u3064\u3066\u3068", "aiueo", "\uD0C0\uCE58\uCE20\uD14C\uD1A0"],
    ["n", "\u306A\u306B\u306C\u306D\u306E", "aiueo", "\uB098\uB2C8\uB204\uB124\uB178"],
    ["h", "\u306F\u3072\u3075\u3078\u307B", "aiueo", "\uD558\uD788\uD6C4\uD5E4\uD638"],
    ["m", "\u307E\u307F\u3080\u3081\u3082", "aiueo", "\uB9C8\uBBF8\uBB34\uBA54\uBAA8"],
    ["y", "\u3084\u3086\u3088", "auo", "\uC57C\uC720\uC694"],
    ["r", "\u3089\u308A\u308B\u308C\u308D", "aiueo", "\uB77C\uB9AC\uB8E8\uB808\uB85C"],
    ["w", "\u308F\u3092", "ao", "\uC640\uC624"],
    ["g", "\u304C\u304E\u3050\u3052\u3054", "aiueo", "\uAC00\uAE30\uAD6C\uAC8C\uACE0"],
    ["z", "\u3056\u3058\u305A\u305C\u305E", "aiueo", "\uC790\uC9C0\uC988\uC81C\uC870"],
    ["d", "\u3060\u3062\u3065\u3067\u3069", "aiueo", "\uB2E4\uC9C0\uC988\uB370\uB3C4"],
    ["b", "\u3070\u3073\u3076\u3079\u307C", "aiueo", "\uBC14\uBE44\uBD80\uBCA0\uBCF4"],
    ["p", "\u3071\u3074\u3077\u307A\u307D", "aiueo", "\uD30C\uD53C\uD478\uD398\uD3EC"],
    ["b", "\u3094", "u", "\uBD80"]
  ];
  const table = {};
  for (const [consClass, kana, vowels, hangul] of ROWS) {
    for (let k = 0; k < kana.length; k += 1) {
      const entry = { out: hangul[k], vowelMain: vowels[k], consClass };
      if (consClass === "vowel") {
        entry.vowelOnly = true;
      }
      table[kana[k]] = entry;
    }
  }
  return table;
})();
|
|
358
|
+
/**
 * Two-character mora table for a kana followed by small ゃ / ゅ / ょ:
 * key -> { out, vowelMain, consClass, wasYouon: true }.
 *
 * Each row is [base kana, consClass, small kana used, hangul]; the last two
 * strings are read position by position.
 */
var YOUON = (() => {
  const GLIDE_VOWEL = { "\u3083": "a", "\u3085": "u", "\u3087": "o" };
  const ALL_GLIDES = "\u3083\u3085\u3087";
  const ROWS = [
    ["\u304D", "k", ALL_GLIDES, "\uCEAC\uD050\uCFC4"],
    ["\u3057", "s", ALL_GLIDES, "\uC0E4\uC288\uC1FC"],
    ["\u3061", "t", ALL_GLIDES, "\uCC60\uCE04\uCD78"],
    ["\u3066", "t", "\u3085", "\uD29C"],
    ["\u3067", "d", "\u3085", "\uB4C0"],
    ["\u306B", "n", ALL_GLIDES, "\uB0D0\uB274\uB1E8"],
    ["\u3072", "h", ALL_GLIDES, "\uD590\uD734\uD6A8"],
    ["\u3075", "p", ALL_GLIDES, "\uD344\uD4E8\uD45C"],
    ["\u307F", "m", ALL_GLIDES, "\uBA00\uBBA4\uBB18"],
    ["\u308A", "r", ALL_GLIDES, "\uB7B4\uB958\uB8CC"],
    ["\u304E", "g", ALL_GLIDES, "\uAC38\uADDC\uAD50"],
    ["\u3058", "z", ALL_GLIDES, "\uC7C8\uC96C\uC8E0"],
    ["\u3073", "b", ALL_GLIDES, "\uBC4C\uBDF0\uBD64"],
    ["\u3074", "p", ALL_GLIDES, "\uD344\uD4E8\uD45C"]
  ];
  const table = {};
  for (const [base, consClass, smalls, hangul] of ROWS) {
    for (let k = 0; k < smalls.length; k += 1) {
      table[base + smalls[k]] = {
        out: hangul[k],
        vowelMain: GLIDE_VOWEL[smalls[k]],
        consClass,
        wasYouon: true
      };
    }
  }
  return table;
})();
|
|
398
|
+
/**
 * Two-character mora table for a kana followed by a small vowel
 * (ぁ ぃ ぅ ぇ ぉ): key -> { out, vowelMain, consClass }.
 *
 * Each row is [base kana, consClass, small vowels used, hangul]; the last
 * two strings are read position by position.
 */
var LOAN = (() => {
  const SMALL_VOWEL = {
    "\u3041": "a",
    "\u3043": "i",
    "\u3045": "u",
    "\u3047": "e",
    "\u3049": "o"
  };
  const A_I_E_O = "\u3041\u3043\u3047\u3049";
  const ROWS = [
    ["\u3066", "t", "\u3043", "\uD2F0"],
    ["\u3067", "d", "\u3043", "\uB514"],
    ["\u3061", "t", "\u3047", "\uCCB4"],
    ["\u3057", "s", "\u3047", "\uC170"],
    ["\u3058", "z", "\u3047", "\uC81C"],
    ["\u3064", "t", A_I_E_O, "\uCC28\uCE58\uCCB4\uCD08"],
    ["\u3075", "p", A_I_E_O, "\uD30C\uD53C\uD398\uD3EC"],
    ["\u3050", "g", A_I_E_O, "\uACFC\uADC0\uADA4\uAD88"],
    ["\u304F", "k", A_I_E_O, "\uCF70\uD034\uD018\uCFFC"],
    ["\u3069", "d", "\u3041\u3045\u3049", "\uB3E0\uB450\uB46C"],
    ["\u3094", "b", A_I_E_O, "\uBC14\uBE44\uBCA0\uBCF4"]
  ];
  const table = {};
  for (const [base, consClass, smalls, hangul] of ROWS) {
    for (let k = 0; k < smalls.length; k += 1) {
      table[base + smalls[k]] = {
        out: hangul[k],
        vowelMain: SMALL_VOWEL[smalls[k]],
        consClass
      };
    }
  }
  return table;
})();
|
|
428
|
+
// Small ゃ / ゅ / ょ; a Set built from a string holds one entry per character.
var SMALL_Y = /* @__PURE__ */ new Set("\u3083\u3085\u3087");
|
|
429
|
+
// Small ぁ / ぃ / ぅ / ぇ / ぉ; a Set built from a string holds one entry per character.
var SMALL_V = /* @__PURE__ */ new Set("\u3041\u3043\u3045\u3047\u3049");
|
|
430
|
+
// Mora keys in this set: ゆ itself, plus each listed base kana followed by
// small ゅ. (How the set is consumed is not visible in this part of the file.)
var U_DROP_KEYS = /* @__PURE__ */ new Set([
  "\u3086",
  ...Array.from(
    "\u304D\u3057\u3061\u306B\u3072\u307F\u308A\u304E\u3058\u3073\u3074",
    (base) => `${base}\u3085`
  )
]);
|
|
444
|
+
|
|
445
|
+
// src/coreConverter.ts
|
|
446
|
+
/**
 * Tells whether the first code point of `ch` lies in the hiragana block
 * U+3040–U+309F.
 *
 * @param {string} ch - A single character.
 * @returns {boolean} True when the code point is inside the block.
 */
function isHiraganaChar(ch) {
  const BLOCK_START = 0x3040;
  const BLOCK_END = 0x309f;
  const cp = ch.codePointAt(0);
  return cp >= BLOCK_START && cp <= BLOCK_END;
}
|
|
450
|
+
/**
 * Tells whether the first code point of `ch` lies in the katakana block
 * U+30A0–U+30FF.
 *
 * @param {string} ch - A single character.
 * @returns {boolean} True when the code point is inside the block.
 */
function isKatakanaChar2(ch) {
  const BLOCK_START = 0x30a0;
  const BLOCK_END = 0x30ff;
  const cp = ch.codePointAt(0);
  return cp >= BLOCK_START && cp <= BLOCK_END;
}
|
|
454
|
+
/**
 * Builds a hiragana dictionary key: NFKC-normalises `s` and shifts katakana
 * letters U+30A1–U+30F6 down by 0x60. Unlike toHiragana2, the long-vowel
 * mark U+30FC is kept.
 *
 * @param {string} s - Source text.
 * @returns {string} The hiragana key.
 */
function toHiraganaKey(s) {
  let key = "";
  for (const ch of s.normalize("NFKC")) {
    const cp = ch.codePointAt(0);
    key += cp >= 0x30a1 && cp <= 0x30f6 ? String.fromCodePoint(cp - 0x60) : ch;
  }
  return key;
}
|
|
468
|
+
/**
 * Builds a katakana dictionary key: NFKC-normalises `s` and shifts hiragana
 * up by 0x60 to the matching katakana.
 *
 * Only hiragana letters U+3041–U+3096 and the iteration marks U+309D/U+309E
 * are shifted. Shifting the whole U+3040–U+309F block would turn the
 * combining voicing marks U+3099/U+309A into the katakana letters ヹ/ヺ and
 * map unassigned code points onto katakana, so those are left as they are.
 *
 * @param {string} s - Source text.
 * @returns {string} The katakana key.
 */
function toKatakanaKey(s) {
  const BLOCK_OFFSET = 0x60;
  return Array.from(s.normalize("NFKC"), (ch) => {
    const cp = ch.codePointAt(0);
    const isLetter = cp >= 0x3041 && cp <= 0x3096;
    const isIterationMark = cp === 0x309d || cp === 0x309e;
    return isLetter || isIterationMark ? String.fromCodePoint(cp + BLOCK_OFFSET) : ch;
  }).join("");
}
|
|
474
|
+
/**
 * Expands dictionary entries into match items, longest key first.
 *
 * Every entry yields an item keyed by its literal `word` on the "orig"
 * stream. An entry with `hira` adds an item keyed by toHiraganaKey(word) on
 * the "rewritten" stream; an entry with `kata` adds one keyed by
 * toKatakanaKey(word) on the "orig" stream.
 *
 * @param {Iterable<{word: string, answer: string, hira?: boolean, kata?: boolean}>} dict
 *   Dictionary entries.
 * @returns {Array<{keyChars: string[], answer: string, stream: string}>}
 *   Items sorted by descending key length; `keyChars` is the key split into
 *   code points.
 */
function compileSpecialDictionary(dict) {
  const makeItem = (key, answer, stream) => ({ keyChars: Array.from(key), answer, stream });
  const compiled = Array.from(dict).flatMap((entry) => {
    const variants = [makeItem(entry.word, entry.answer, "orig")];
    if (entry.hira) {
      variants.push(makeItem(toHiraganaKey(entry.word), entry.answer, "rewritten"));
    }
    if (entry.kata) {
      variants.push(makeItem(toKatakanaKey(entry.word), entry.answer, "orig"));
    }
    return variants;
  });
  return compiled.sort((a, b) => b.keyChars.length - a.keyChars.length);
}
|
|
498
|
+
// Match items for the built-in SpecialDictionary, compiled once at module
// load and ordered longest key first by compileSpecialDictionary.
var COMPILED_SPECIAL_DICT = compileSpecialDictionary(SpecialDictionary);
|
|
499
|
+
function coreKanaToHangulConvert(s, opts) {
|
|
500
|
+
const chars = Array.from(s);
|
|
501
|
+
const origChars = Array.from(opts?.original ?? s);
|
|
502
|
+
const HANGUL_BASE = 44032;
|
|
503
|
+
const HANGUL_END = 55203;
|
|
504
|
+
function isHangulSyllable(ch) {
|
|
505
|
+
const c = ch.codePointAt(0);
|
|
506
|
+
return c >= HANGUL_BASE && c <= HANGUL_END;
|
|
507
|
+
}
|
|
508
|
+
const JONG = {
|
|
509
|
+
G: 1,
|
|
510
|
+
// ㄱ
|
|
511
|
+
N: 4,
|
|
512
|
+
// ㄴ
|
|
513
|
+
M: 16,
|
|
514
|
+
// ㅁ
|
|
515
|
+
B: 17,
|
|
516
|
+
// ㅂ
|
|
517
|
+
S: 19,
|
|
518
|
+
// ㅅ
|
|
519
|
+
NG: 21
|
|
520
|
+
// ㅇ
|
|
521
|
+
};
|
|
522
|
+
function addFinal(syl, jong) {
|
|
523
|
+
if (!isHangulSyllable(syl))
|
|
524
|
+
return syl;
|
|
525
|
+
const code = syl.codePointAt(0) - HANGUL_BASE;
|
|
526
|
+
const cho = Math.floor(code / 588);
|
|
527
|
+
const jung = Math.floor(code % 588 / 28);
|
|
528
|
+
return String.fromCodePoint(HANGUL_BASE + cho * 588 + jung * 28 + jong);
|
|
529
|
+
}
|
|
530
|
+
function replaceLastHangul(out2, jong) {
|
|
531
|
+
if (!out2)
|
|
532
|
+
return out2;
|
|
533
|
+
const last = out2[out2.length - 1];
|
|
534
|
+
if (!isHangulSyllable(last))
|
|
535
|
+
return out2;
|
|
536
|
+
return out2.slice(0, -1) + addFinal(last, jong);
|
|
537
|
+
}
|
|
538
|
+
function isHiragana(ch) {
|
|
539
|
+
const c = ch.codePointAt(0);
|
|
540
|
+
return c >= 12352 && c <= 12447;
|
|
541
|
+
}
|
|
542
|
+
function isKana(ch) {
|
|
543
|
+
return isHiragana(ch) || ch === "\u30FC";
|
|
544
|
+
}
|
|
545
|
+
function readMoraAt(idx) {
|
|
546
|
+
if (idx >= chars.length)
|
|
547
|
+
return null;
|
|
548
|
+
const c0 = chars[idx];
|
|
549
|
+
const c1 = chars[idx + 1];
|
|
550
|
+
if (c1 && SMALL_V.has(c1)) {
|
|
551
|
+
const key2 = c0 + c1;
|
|
552
|
+
const info2 = LOAN[key2];
|
|
553
|
+
if (info2)
|
|
554
|
+
return { key: key2, len: 2, info: info2 };
|
|
555
|
+
}
|
|
556
|
+
if (c1 && SMALL_Y.has(c1)) {
|
|
557
|
+
const key2 = c0 + c1;
|
|
558
|
+
const info2 = YOUON[key2];
|
|
559
|
+
if (info2)
|
|
560
|
+
return { key: key2, len: 2, info: info2 };
|
|
561
|
+
}
|
|
562
|
+
const info = SINGLE[c0];
|
|
563
|
+
if (info)
|
|
564
|
+
return { key: c0, len: 1, info };
|
|
565
|
+
return { key: c0, len: 1, info: void 0 };
|
|
566
|
+
}
|
|
567
|
+
function readOriginalMoraAt(idx) {
|
|
568
|
+
if (idx >= origChars.length)
|
|
569
|
+
return null;
|
|
570
|
+
const c0 = origChars[idx];
|
|
571
|
+
const c1 = origChars[idx + 1];
|
|
572
|
+
if (c1 && SMALL_V.has(c1)) {
|
|
573
|
+
const key2 = c0 + c1;
|
|
574
|
+
const info2 = LOAN[key2];
|
|
575
|
+
if (info2)
|
|
576
|
+
return { key: key2, len: 2, info: info2 };
|
|
577
|
+
}
|
|
578
|
+
if (c1 && SMALL_Y.has(c1)) {
|
|
579
|
+
const key2 = c0 + c1;
|
|
580
|
+
const info2 = YOUON[key2];
|
|
581
|
+
if (info2)
|
|
582
|
+
return { key: key2, len: 2, info: info2 };
|
|
583
|
+
}
|
|
584
|
+
const info = SINGLE[c0];
|
|
585
|
+
if (info)
|
|
586
|
+
return { key: c0, len: 1, info };
|
|
587
|
+
return { key: c0, len: 1, info: void 0 };
|
|
588
|
+
}
|
|
589
|
+
function isLabialStart(cons) {
|
|
590
|
+
return cons === "m" || cons === "b" || cons === "p";
|
|
591
|
+
}
|
|
592
|
+
const tokens = opts?.tokens ?? null;
|
|
593
|
+
let tokIdx = 0;
|
|
594
|
+
function syncTokenIndex(charIndex) {
|
|
595
|
+
if (!tokens)
|
|
596
|
+
return;
|
|
597
|
+
while (tokIdx + 1 < tokens.length && tokens[tokIdx].end <= charIndex) {
|
|
598
|
+
tokIdx++;
|
|
599
|
+
}
|
|
600
|
+
}
|
|
601
|
+
function curToken(charIndex) {
|
|
602
|
+
if (!tokens)
|
|
603
|
+
return null;
|
|
604
|
+
syncTokenIndex(charIndex);
|
|
605
|
+
const t = tokens[tokIdx];
|
|
606
|
+
if (t && t.start <= charIndex && charIndex < t.end)
|
|
607
|
+
return t;
|
|
608
|
+
return null;
|
|
609
|
+
}
|
|
610
|
+
function prevToken() {
|
|
611
|
+
if (!tokens)
|
|
612
|
+
return null;
|
|
613
|
+
return tokIdx - 1 >= 0 ? tokens[tokIdx - 1] : null;
|
|
614
|
+
}
|
|
615
|
+
function nextToken() {
|
|
616
|
+
if (!tokens)
|
|
617
|
+
return null;
|
|
618
|
+
return tokIdx + 1 < tokens.length ? tokens[tokIdx + 1] : null;
|
|
619
|
+
}
|
|
620
|
+
const INITIAL_VOICING_BLOCK_NEXT = /* @__PURE__ */ new Set(["\u3044", "\u3072", "\u3093", "\u3066"]);
|
|
621
|
+
function peekNextMoraKeySkippingChoonpu(fromIdx) {
|
|
622
|
+
let j = fromIdx;
|
|
623
|
+
while (j < chars.length && chars[j] === "\u30FC")
|
|
624
|
+
j++;
|
|
625
|
+
const m = readMoraAt(j);
|
|
626
|
+
return m?.key ?? null;
|
|
627
|
+
}
|
|
628
|
+
const SAN_PARTICLES = /* @__PURE__ */ new Set(["\u306F", "\u308F", "\u3078", "\u3048", "\u3092", "\u304A"]);
|
|
629
|
+
function isSanHonorificAt(cpIdx) {
|
|
630
|
+
const t = curToken(cpIdx);
|
|
631
|
+
if (!t)
|
|
632
|
+
return false;
|
|
633
|
+
if (cpIdx < 1 || chars[cpIdx - 1] !== "\u3055")
|
|
634
|
+
return false;
|
|
635
|
+
const local = chars.slice(t.start, cpIdx + 1).join("");
|
|
636
|
+
if (!local.endsWith("\u3055\u3093"))
|
|
637
|
+
return false;
|
|
638
|
+
const hasPrefixInsideToken = cpIdx - 1 > t.start;
|
|
639
|
+
const p = prevToken();
|
|
640
|
+
const prevIsAttachable = !!p && p.end === t.start && p.surface.length > 0 && !HARD_BOUNDARY_SURF.has(p.surface);
|
|
641
|
+
if (!hasPrefixInsideToken && !prevIsAttachable)
|
|
642
|
+
return false;
|
|
643
|
+
const n = nextToken();
|
|
644
|
+
if (!n)
|
|
645
|
+
return true;
|
|
646
|
+
if (n.pos === "\u8A18\u53F7" && HARD_BOUNDARY_SURF.has(n.surface))
|
|
647
|
+
return true;
|
|
648
|
+
if (n.pos === "\u52A9\u8A5E" && SAN_PARTICLES.has(n.surface))
|
|
649
|
+
return true;
|
|
650
|
+
return false;
|
|
651
|
+
}
|
|
652
|
+
let out = "";
|
|
653
|
+
let i = 0;
|
|
654
|
+
let lastMora = null;
|
|
655
|
+
while (i < chars.length) {
|
|
656
|
+
let atTokenStart = false;
|
|
657
|
+
let tokForI = null;
|
|
658
|
+
if (tokens) {
|
|
659
|
+
tokForI = curToken(i);
|
|
660
|
+
atTokenStart = !!tokForI && tokForI.start === i;
|
|
661
|
+
if (tokForI?.pos === "\u8A18\u53F7")
|
|
662
|
+
atTokenStart = false;
|
|
663
|
+
} else {
|
|
664
|
+
atTokenStart = i === 0;
|
|
665
|
+
}
|
|
666
|
+
let matchedSpecial = false;
|
|
667
|
+
for (const it of COMPILED_SPECIAL_DICT) {
|
|
668
|
+
const src = it.stream === "orig" ? origChars : chars;
|
|
669
|
+
const len = it.keyChars.length;
|
|
670
|
+
if (i + len > src.length)
|
|
671
|
+
continue;
|
|
672
|
+
let ok = true;
|
|
673
|
+
for (let k = 0; k < len; k++) {
|
|
674
|
+
if (src[i + k] !== it.keyChars[k]) {
|
|
675
|
+
ok = false;
|
|
676
|
+
break;
|
|
677
|
+
}
|
|
678
|
+
}
|
|
679
|
+
if (!ok)
|
|
680
|
+
continue;
|
|
681
|
+
out += it.answer;
|
|
682
|
+
i += len;
|
|
683
|
+
lastMora = null;
|
|
684
|
+
matchedSpecial = true;
|
|
685
|
+
break;
|
|
686
|
+
}
|
|
687
|
+
if (matchedSpecial)
|
|
688
|
+
continue;
|
|
689
|
+
const ch = chars[i];
|
|
690
|
+
if (ch === "\u30FC") {
|
|
691
|
+
i += 1;
|
|
692
|
+
continue;
|
|
693
|
+
}
|
|
694
|
+
if (!isKana(ch)) {
|
|
695
|
+
out += ch;
|
|
696
|
+
i += 1;
|
|
697
|
+
lastMora = null;
|
|
698
|
+
continue;
|
|
699
|
+
}
|
|
700
|
+
if (ch === "\u3063") {
|
|
701
|
+
if (!out || !isHangulSyllable(out[out.length - 1])) {
|
|
702
|
+
out += "\u30C3";
|
|
703
|
+
i += 1;
|
|
704
|
+
lastMora = null;
|
|
705
|
+
continue;
|
|
706
|
+
}
|
|
707
|
+
const next = readMoraAt(i + 1);
|
|
708
|
+
const prevV = lastMora?.vowelMain ?? "a";
|
|
709
|
+
const nextInfo = next?.info;
|
|
710
|
+
const nextCons = nextInfo?.consClass ?? "t";
|
|
711
|
+
let jong = JONG.S;
|
|
712
|
+
if (nextCons === "p" || nextCons === "b")
|
|
713
|
+
jong = JONG.B;
|
|
714
|
+
else if (nextCons === "k" || nextCons === "g") {
|
|
715
|
+
jong = prevV === "e" || prevV === "i" ? JONG.S : JONG.G;
|
|
716
|
+
} else {
|
|
717
|
+
jong = JONG.S;
|
|
718
|
+
}
|
|
719
|
+
out = replaceLastHangul(out, jong);
|
|
720
|
+
i += 1;
|
|
721
|
+
continue;
|
|
722
|
+
}
|
|
723
|
+
if (ch === "\u304A" && chars[i + 1] === "\u304A") {
|
|
724
|
+
let j = i;
|
|
725
|
+
while (chars[j] === "\u304A")
|
|
726
|
+
j++;
|
|
727
|
+
out += "\uC624";
|
|
728
|
+
i = j;
|
|
729
|
+
lastMora = {
|
|
730
|
+
out: "\uC624",
|
|
731
|
+
vowelMain: "o",
|
|
732
|
+
consClass: "vowel",
|
|
733
|
+
vowelOnly: true
|
|
734
|
+
};
|
|
735
|
+
continue;
|
|
736
|
+
}
|
|
737
|
+
const mora = readMoraAt(i);
|
|
738
|
+
const originalMora = readOriginalMoraAt(i);
|
|
739
|
+
if (!mora) {
|
|
740
|
+
out += chars[i];
|
|
741
|
+
i += 1;
|
|
742
|
+
lastMora = null;
|
|
743
|
+
continue;
|
|
744
|
+
}
|
|
745
|
+
if (!originalMora) {
|
|
746
|
+
throw Error("\uC6D0\uBCF8 \uBAA8\uB77C \uC190\uC2E4");
|
|
747
|
+
}
|
|
748
|
+
if (mora.key === "\u3093") {
|
|
749
|
+
const next = readMoraAt(i + 1);
|
|
750
|
+
const nextInfo = next?.info;
|
|
751
|
+
const hasPrevHangul = out.length > 0 && isHangulSyllable(out[out.length - 1]);
|
|
752
|
+
if (!hasPrevHangul) {
|
|
753
|
+
out += "\u3134";
|
|
754
|
+
i += 1;
|
|
755
|
+
lastMora = null;
|
|
756
|
+
continue;
|
|
757
|
+
}
|
|
758
|
+
if (lastMora?.out === "\uC0AC" && isSanHonorificAt(i)) {
|
|
759
|
+
out = replaceLastHangul(out, JONG.NG);
|
|
760
|
+
i += 1;
|
|
761
|
+
continue;
|
|
762
|
+
}
|
|
763
|
+
let jong = JONG.N;
|
|
764
|
+
if (!next || !nextInfo || !isKana(next.key[0])) {
|
|
765
|
+
jong = lastMora?.wasYouon ? JONG.NG : JONG.N;
|
|
766
|
+
} else {
|
|
767
|
+
const nc = nextInfo.consClass;
|
|
768
|
+
if (nc === "k" || nc === "g") {
|
|
769
|
+
jong = JONG.NG;
|
|
770
|
+
} else if (nc === "vowel" || nc === "y" || nc === "w") {
|
|
771
|
+
jong = JONG.N;
|
|
772
|
+
} else if (isLabialStart(nc)) {
|
|
773
|
+
if (lastMora?.vowelOnly)
|
|
774
|
+
jong = JONG.N;
|
|
775
|
+
else
|
|
776
|
+
jong = JONG.M;
|
|
777
|
+
} else {
|
|
778
|
+
jong = JONG.N;
|
|
779
|
+
}
|
|
780
|
+
}
|
|
781
|
+
out = replaceLastHangul(out, jong);
|
|
782
|
+
i += 1;
|
|
783
|
+
continue;
|
|
784
|
+
}
|
|
785
|
+
const info = mora.info;
|
|
786
|
+
if (!info) {
|
|
787
|
+
out += mora.key;
|
|
788
|
+
i += mora.len;
|
|
789
|
+
lastMora = null;
|
|
790
|
+
continue;
|
|
791
|
+
}
|
|
792
|
+
let outSyl = info.out;
|
|
793
|
+
if (atTokenStart && (mora.key === "\u3068" || mora.key === "\u3053")) {
|
|
794
|
+
const prevIsSokuon = i > 0 && chars[i - 1] === "\u3063";
|
|
795
|
+
const nextKey = peekNextMoraKeySkippingChoonpu(i + mora.len);
|
|
796
|
+
const blockedByNext = !!nextKey && INITIAL_VOICING_BLOCK_NEXT.has(nextKey);
|
|
797
|
+
const isKou = mora.key === "\u3053" && chars[i + mora.len] === "\u3046";
|
|
798
|
+
let blockedByParticleUsage = false;
|
|
799
|
+
if (tokens && tokForI) {
|
|
800
|
+
const isSingleCharToken = tokForI.surface.length === 1;
|
|
801
|
+
const tokenMatchesMora = tokForI.surface === mora.key;
|
|
802
|
+
if (isSingleCharToken && tokenMatchesMora) {
|
|
803
|
+
const p = prevToken();
|
|
804
|
+
const prevLooksLikeContent = !!p && p.pos !== "\u8A18\u53F7" && p.pos !== "\u52A9\u8A5E" && p.pos !== "\u52A9\u52D5\u8A5E" && !HARD_BOUNDARY_SURF.has(p.surface);
|
|
805
|
+
if (prevLooksLikeContent)
|
|
806
|
+
blockedByParticleUsage = true;
|
|
807
|
+
}
|
|
808
|
+
}
|
|
809
|
+
const allowVoicing = !prevIsSokuon && !blockedByNext && !isKou && !blockedByParticleUsage;
|
|
810
|
+
if (allowVoicing) {
|
|
811
|
+
if (mora.key === "\u3068")
|
|
812
|
+
outSyl = "\uB3C4";
|
|
813
|
+
else if (mora.key === "\u3053")
|
|
814
|
+
outSyl = "\uACE0";
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
out += outSyl;
|
|
818
|
+
lastMora = { ...info, out: outSyl };
|
|
819
|
+
const next1 = chars[i + mora.len];
|
|
820
|
+
chars[i + mora.len + 1];
|
|
821
|
+
if (next1 === "\u3046" && info.vowelMain === "o") {
|
|
822
|
+
i += mora.len + 1;
|
|
823
|
+
continue;
|
|
824
|
+
}
|
|
825
|
+
if (next1 === "\u3046" && U_DROP_KEYS.has(mora.key)) {
|
|
826
|
+
i += mora.len + 1;
|
|
827
|
+
continue;
|
|
828
|
+
}
|
|
829
|
+
if (next1 === "\u3044") {
|
|
830
|
+
if (mora.key === "\u305B") {
|
|
831
|
+
i += mora.len + 1;
|
|
832
|
+
continue;
|
|
833
|
+
} else if (mora.key === "\u3051") {
|
|
834
|
+
i += mora.len + 1;
|
|
835
|
+
continue;
|
|
836
|
+
} else if (mora.key === "\u3048") {
|
|
837
|
+
if (originalMora.key !== "\u3078") {
|
|
838
|
+
i += mora.len + 1;
|
|
839
|
+
continue;
|
|
840
|
+
}
|
|
841
|
+
} else if (mora.key === "\u3057") {
|
|
842
|
+
i += mora.len + 1;
|
|
843
|
+
continue;
|
|
844
|
+
}
|
|
845
|
+
}
|
|
846
|
+
if (next1 === "\u3048" && mora.key === "\u306D") {
|
|
847
|
+
i += mora.len + 1;
|
|
848
|
+
continue;
|
|
849
|
+
}
|
|
850
|
+
i += mora.len;
|
|
851
|
+
}
|
|
852
|
+
return out;
|
|
853
|
+
}
|
|
854
|
+
|
|
855
|
+
// src/kanaToHangul.ts
|
|
856
|
+
/**
 * Binds a tokenizer to the conversion pipeline.
 *
 * @param tokenizer tokenizer instance handed to convertWithTokenizer on every call
 * @returns a one-argument function mapping an input string to its converted form
 */
function createKanaToHangul(tokenizer) {
  const convert = (input) => {
    return convertWithTokenizer(input, tokenizer);
  };
  return convert;
}
|
|
859
|
+
/**
 * Runs the full conversion pipeline for one input string:
 * normalize the text, derive its hiragana form, let the tokenizer-driven
 * particle rewrite produce the working text plus token spans, then hand
 * everything to the core kana→Hangul converter.
 *
 * @param input raw text to convert
 * @param tokenizer tokenizer forwarded to tokenizeAndRewriteParticles
 * @returns whatever coreKanaToHangulConvert returns for the rewritten text
 */
function convertWithTokenizer(input, tokenizer) {
  const normalizedText = normalizeInputText(input);
  const hiraganaText = toHiragana(normalizedText);
  const {
    rewritten: workingText,
    spans: tokens,
    rewrittenOriginal: original
  } = tokenizeAndRewriteParticles(normalizedText, hiraganaText, tokenizer);
  return coreKanaToHangulConvert(workingText, { tokens, original });
}
|
|
872
|
+
// `require` function created via module.createRequire and anchored at this bundle's own location.
// Without a `document` (e.g. under Node) the anchor is the file URL of `__filename`; otherwise it is
// the current <script>'s `src`, falling back to `out.js` resolved against `document.baseURI`.
// buildTokenizer() uses it to resolve the installed "kuromoji" package.
var require2 = module$1.createRequire((typeof document === 'undefined' ? require('u' + 'rl').pathToFileURL(__filename).href : (_documentCurrentScript && _documentCurrentScript.tagName.toUpperCase() === 'SCRIPT' && _documentCurrentScript.src || new URL('out.js', document.baseURI).href)));
|
|
873
|
+
/**
 * Builds a kuromoji tokenizer using the dictionary located relative to the
 * resolved "kuromoji" entry file (two directories up, then `dict`).
 *
 * @returns a promise that resolves with the built tokenizer
 * @throws (as a rejection) the error reported by kuromoji, or an Error when
 *         kuromoji reports success but hands back no tokenizer
 */
async function buildTokenizer() {
  return new Promise((resolve, reject) => {
    const dicPath = path__default.default.join(require2.resolve("kuromoji"), "..", "..", "dict");
    kuromoji__default.default.builder({ dicPath }).build((err, tk) => {
      if (err) {
        reject(err);
        return;
      }
      if (!tk) {
        // Never reject with null/undefined: callers need a message and a stack to diagnose this.
        reject(new Error(`kuromoji did not return a tokenizer (dicPath: ${dicPath})`));
        return;
      }
      resolve(tk);
    });
  });
}
|
|
884
|
+
// In-flight or completed tokenizer build shared by all callers; null when no build is cached.
let tokenizerPromise = null;

/**
 * Returns the shared tokenizer, starting a build on first use.
 *
 * Concurrent callers share one build. A failed build is not cached: once the
 * promise rejects, the cache is cleared so the next call starts a fresh build
 * instead of replaying the old rejection forever.
 *
 * @returns a promise for the tokenizer produced by buildTokenizer()
 */
async function getTokenizer() {
  if (!tokenizerPromise) {
    const attempt = buildTokenizer();
    tokenizerPromise = attempt;
    // The caller still receives the rejection through the returned promise;
    // this handler only resets the cache (guarded so a newer attempt is never wiped).
    attempt.catch(() => {
      if (tokenizerPromise === attempt) {
        tokenizerPromise = null;
      }
    });
  }
  return tokenizerPromise;
}
|
|
891
|
+
|
|
892
|
+
// src/kanaBarum.ts
|
|
893
|
+
/**
 * Instance-based front end for the converter: call init() once, then
 * kanaToHangul() synchronously as often as needed.
 */
class Kanabarum {
  constructor() {
    // Conversion function; stays null until init() has completed.
    this.converter = null;
  }

  /**
   * Initializes the tokenizer. Must be called before converting.
   */
  async init() {
    this.converter = createKanaToHangul(await getTokenizer());
  }

  /**
   * Converts Japanese kana into Hangul.
   * @param input Japanese string to convert
   * @returns the string converted to Hangul
   * @throws Error when init() has not been called
   */
  kanaToHangul(input) {
    if (this.converter) {
      return this.converter(input);
    }
    throw new Error("Kanabarum is not initialized. Call init() first.");
  }
}
|
|
917
|
+
// Converter produced by the first successful initialization; reused by every later call.
let cachedConverter = null;
// Initialization currently in flight (or completed); null when none is cached.
let pendingInit = null;

/**
 * Lazily creates the module-wide converter.
 *
 * Concurrent callers share a single initialization. If that initialization
 * fails, the pending slot is cleared so a later call can try again rather
 * than receiving the same cached rejection indefinitely.
 *
 * @returns a promise for the conversion function (input string → converted string)
 */
async function initKanaToHangul() {
  if (cachedConverter) {
    return cachedConverter;
  }
  if (!pendingInit) {
    const attempt = (async () => {
      const tokenizer = await getTokenizer();
      const converter = createKanaToHangul(tokenizer);
      cachedConverter = converter;
      return converter;
    })();
    pendingInit = attempt;
    // Callers still observe the failure via the returned promise; this only
    // un-caches it (guarded so a newer attempt is never discarded).
    attempt.catch(() => {
      if (pendingInit === attempt) {
        pendingInit = null;
      }
    });
  }
  return pendingInit;
}
|
|
932
|
+
/**
 * One-shot convenience API: makes sure the shared converter exists, then
 * applies it to `input`.
 *
 * @param input string to convert
 * @returns a promise for the converter's result
 */
async function kanaToHangul(input) {
  return (await initKanaToHangul())(input);
}
|
|
936
|
+
|
|
937
|
+
exports.Kanabarum = Kanabarum;
|
|
938
|
+
exports.kanaToHangul = kanaToHangul;
|
|
939
|
+
//# sourceMappingURL=out.js.map
|
|
940
|
+
//# sourceMappingURL=index.cjs.map
|