henkan 0.3.3 → 0.4.0

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (58)
  1. package/dist/index.cjs.js +97 -124
  2. package/dist/index.cjs.js.map +3 -3
  3. package/dist/index.mjs +95 -120
  4. package/dist/index.mjs.map +2 -2
  5. package/dist/types/types.d.ts +16 -12
  6. package/dist/types/types.d.ts.map +1 -1
  7. package/dist/types/utils.d.ts +6 -6
  8. package/dist/types/utils.d.ts.map +1 -1
  9. package/docs/api/functions/capitalizeString.md +1 -1
  10. package/docs/api/functions/convertJMdict.md +1 -1
  11. package/docs/api/functions/convertKanjiDic.md +1 -1
  12. package/docs/api/functions/convertKradFile.md +4 -4
  13. package/docs/api/functions/convertRadkFile.md +4 -4
  14. package/docs/api/functions/convertTanakaCorpus.md +1 -1
  15. package/docs/api/functions/generateAnkiNote.md +1 -1
  16. package/docs/api/functions/generateAnkiNotesFile.md +1 -1
  17. package/docs/api/functions/getKanji.md +1 -1
  18. package/docs/api/functions/getKanjiExtended.md +1 -1
  19. package/docs/api/functions/getWord.md +1 -1
  20. package/docs/api/functions/isStringArray.md +1 -1
  21. package/docs/api/functions/isValidArray.md +1 -1
  22. package/docs/api/functions/isValidArrayWithFirstElement.md +1 -1
  23. package/docs/api/functions/makeSSML.md +1 -1
  24. package/docs/api/functions/shuffleArray.md +1 -1
  25. package/docs/api/functions/synthesizeSpeech.md +1 -1
  26. package/docs/api/interfaces/DictKanji.md +5 -5
  27. package/docs/api/interfaces/DictKanjiForm.md +4 -4
  28. package/docs/api/interfaces/DictKanjiMisc.md +5 -5
  29. package/docs/api/interfaces/DictKanjiReading.md +3 -3
  30. package/docs/api/interfaces/DictKanjiReadingMeaning.md +3 -3
  31. package/docs/api/interfaces/DictKanjiReadingMeaningGroup.md +3 -3
  32. package/docs/api/interfaces/DictKanjiWithRadicals.md +3 -3
  33. package/docs/api/interfaces/DictMeaning.md +11 -11
  34. package/docs/api/interfaces/DictRadical.md +4 -4
  35. package/docs/api/interfaces/DictReading.md +5 -5
  36. package/docs/api/interfaces/DictWord.md +29 -19
  37. package/docs/api/interfaces/ExamplePart.md +7 -7
  38. package/docs/api/interfaces/Grammar.md +15 -15
  39. package/docs/api/interfaces/GrammarMeaning.md +3 -3
  40. package/docs/api/interfaces/Kana.md +11 -11
  41. package/docs/api/interfaces/Kanji.md +22 -22
  42. package/docs/api/interfaces/KanjiComponent.md +3 -3
  43. package/docs/api/interfaces/KanjiForm.md +4 -4
  44. package/docs/api/interfaces/NoteAndTag.md +3 -3
  45. package/docs/api/interfaces/Phrase.md +4 -4
  46. package/docs/api/interfaces/Radical.md +16 -16
  47. package/docs/api/interfaces/Reading.md +5 -5
  48. package/docs/api/interfaces/ResultEntry.md +7 -7
  49. package/docs/api/interfaces/TanakaExample.md +17 -7
  50. package/docs/api/interfaces/Translation.md +3 -3
  51. package/docs/api/interfaces/UsefulRegExps.md +9 -9
  52. package/docs/api/interfaces/Word.md +18 -18
  53. package/docs/api/type-aliases/Dict.md +1 -1
  54. package/docs/api/type-aliases/DictName.md +1 -1
  55. package/docs/api/type-aliases/EntryType.md +1 -1
  56. package/docs/api/type-aliases/JLPT.md +1 -1
  57. package/docs/api/type-aliases/Result.md +1 -1
  58. package/package.json +6 -6
package/dist/index.mjs CHANGED
@@ -12,7 +12,7 @@ var regexps = {
12
12
  kanji: new RegExp("\\p{Script=Han}+", "u"),
13
13
  scriptSplit: /([\p{sc=Han}]+|[\p{sc=Hiragana}]+|[\p{sc=Katakana}]+|[^\p{sc=Han}\p{sc=Hiragana}\p{sc=Katakana}]+)/u,
14
14
  regExChars: /[-\/\\^$*+?.()|[\]{}]/,
15
- tanakaID: /#ID=\d+_\d+$/,
15
+ tanakaID: /#ID=(?<id>\d+_\d+)$/,
16
16
  tanakaPart: /(?<base>[^()\[\]\{\}\s]+)(?:\((?<reading>[\S]+)\))?(?:\[(?<glossnum>[\S]+)\])?(?:\{(?<inflection>[\S]+)\})?/,
17
17
  tanakaReferenceID: /#(?<entryid>[\d]+)/
18
18
  };
@@ -1171,15 +1171,6 @@ function convertJMdict(xmlString, examples) {
1171
1171
  const dict = [];
1172
1172
  xml.parseString(dictParsed, (err, result) => {
1173
1173
  if (err) throw err;
1174
- const tanakaParts = examples && examples.length > 0 ? new Set(
1175
- examples.map(
1176
- (example) => example.parts.map((part) => [
1177
- part.baseForm,
1178
- ...part.reading ? [part.reading] : [],
1179
- ...part.referenceID ? [part.referenceID] : []
1180
- ])
1181
- ).flat(2)
1182
- ) : void 0;
1183
1174
  if (result.JMdict && typeof result.JMdict === "object" && isValidArray(result.JMdict.entry))
1184
1175
  for (const entry of result.JMdict.entry) {
1185
1176
  const entryObj = {
@@ -1228,7 +1219,8 @@ function convertJMdict(xmlString, examples) {
1228
1219
  if (readingObj.reading.length > 0)
1229
1220
  entryObj.readings.push(readingObj);
1230
1221
  }
1231
- if (isValidArray(meanings))
1222
+ if (isValidArray(meanings)) {
1223
+ let usuallyInKanaMeanings = 0;
1232
1224
  for (const meaning of meanings) {
1233
1225
  const meaningObj = {};
1234
1226
  if (isStringArray(meaning.pos))
@@ -1254,12 +1246,21 @@ function convertJMdict(xmlString, examples) {
1254
1246
  if (isStringArray(meaning.field))
1255
1247
  meaningObj.fields = meaning.field;
1256
1248
  if (isStringArray(meaning.s_inf)) meaningObj.info = meaning.s_inf;
1257
- if (isStringArray(meaning.misc)) meaningObj.misc = meaning.misc;
1249
+ if (isStringArray(meaning.misc)) {
1250
+ meaningObj.misc = meaning.misc;
1251
+ if (meaningObj.misc && meaningObj.misc.includes(
1252
+ "word usually written using kana alone"
1253
+ ))
1254
+ usuallyInKanaMeanings++;
1255
+ }
1258
1256
  if (isStringArray(meaning.dial))
1259
1257
  meaningObj.dialects = meaning.dial;
1260
1258
  if (meaningObj.partOfSpeech && meaningObj.partOfSpeech.length > 0 || meaningObj.translations && meaningObj.translations.length > 0)
1261
1259
  entryObj.meanings.push(meaningObj);
1262
1260
  }
1261
+ if (entryObj.meanings.length === usuallyInKanaMeanings)
1262
+ entryObj.usuallyInKana = true;
1263
+ }
1263
1264
  if (examples) {
1264
1265
  const readings2 = new Set(
1265
1266
  entryObj.readings.filter(
@@ -1275,24 +1276,70 @@ function convertJMdict(xmlString, examples) {
1275
1276
  )) && (entryObj.isCommon === void 0 || kanjiForm.commonness && kanjiForm.commonness.length > 0)
1276
1277
  ).map((kanjiForm) => kanjiForm.form)
1277
1278
  ) : void 0;
1278
- let existsExample = false;
1279
- if (kanjiForms2 && kanjiForms2.size > 0 && tanakaParts) {
1280
- for (const kf of kanjiForms2)
1281
- if (tanakaParts.has(kf)) {
1282
- existsExample = true;
1279
+ const kanjiFormExamples = [];
1280
+ const readingMatchingKanjiFormExamples = [];
1281
+ const readingExamples = [];
1282
+ const partParts = /* @__PURE__ */ new Set();
1283
+ for (const example of examples)
1284
+ for (const part of example.parts) {
1285
+ const readingAsReadingMatch = part.reading !== void 0 && readings2.has(part.reading);
1286
+ if (kanjiForms2 && kanjiForms2.size > 0 && kanjiForms2.has(part.baseForm)) {
1287
+ if (readingAsReadingMatch) {
1288
+ readingMatchingKanjiFormExamples.push(example);
1289
+ partParts.add(part.baseForm).add(part.reading);
1290
+ } else {
1291
+ kanjiFormExamples.push(example);
1292
+ partParts.add(part.baseForm);
1293
+ }
1283
1294
  break;
1284
1295
  }
1285
- }
1286
- if (!existsExample && readings2.size > 0 && tanakaParts) {
1287
- for (const r of readings2)
1288
- if (tanakaParts.has(r)) {
1289
- existsExample = true;
1296
+ const readingAsBaseFormMatch = readings2.has(
1297
+ part.baseForm
1298
+ );
1299
+ const referenceIDMatch = part.referenceID !== void 0 && entryObj.id !== void 0 && part.referenceID === entryObj.id;
1300
+ if (readingAsReadingMatch || readingAsBaseFormMatch || referenceIDMatch) {
1301
+ readingExamples.push(example);
1302
+ if (readingAsReadingMatch) partParts.add(part.reading);
1303
+ if (readingAsBaseFormMatch) partParts.add(part.baseForm);
1304
+ if (referenceIDMatch) partParts.add(part.referenceID);
1290
1305
  break;
1291
1306
  }
1307
+ }
1308
+ const exampleSize = readingMatchingKanjiFormExamples.length + kanjiFormExamples.length + readingExamples.length;
1309
+ const includeKanjiFormExamples = readingMatchingKanjiFormExamples.length < Math.max(2, Math.round(exampleSize * 0.05));
1310
+ const includeReadingExamples = entryObj.usuallyInKana === void 0 && includeKanjiFormExamples && readingExamples.length >= Math.max(10, Math.round(exampleSize * 0.15)) || entryObj.usuallyInKana === true && readingExamples.length >= Math.max(2, Math.round(exampleSize * 0.5));
1311
+ let wordExamples = [
1312
+ ...readingMatchingKanjiFormExamples,
1313
+ ...includeKanjiFormExamples ? kanjiFormExamples : [],
1314
+ ...includeReadingExamples ? readingExamples : []
1315
+ ];
1316
+ const glossSpecificExamples = [];
1317
+ const seenPhrases = /* @__PURE__ */ new Set();
1318
+ for (let i = 0; i < entryObj.meanings.length; i++) {
1319
+ outer: for (const example of wordExamples) {
1320
+ if (seenPhrases.has(example.phrase)) continue;
1321
+ for (const part of example.parts)
1322
+ if (part.glossNumber === i + 1 && (partParts.has(part.baseForm) || part.reading && partParts.has(part.reading) || part.referenceID && partParts.has(part.referenceID))) {
1323
+ glossSpecificExamples.push(example);
1324
+ seenPhrases.add(example.phrase);
1325
+ break outer;
1326
+ }
1327
+ }
1328
+ if (glossSpecificExamples.length === 5) break;
1329
+ }
1330
+ if (glossSpecificExamples.length === 5)
1331
+ wordExamples = [...glossSpecificExamples];
1332
+ else if (glossSpecificExamples.length > 0) {
1333
+ const seenPhrases2 = new Set(
1334
+ glossSpecificExamples.map((ex) => ex.phrase)
1335
+ );
1336
+ wordExamples = [
1337
+ ...glossSpecificExamples,
1338
+ ...wordExamples.filter((ex) => !seenPhrases2.has(ex.phrase)).slice(0, 5 - glossSpecificExamples.length)
1339
+ ];
1292
1340
  }
1293
- if (!existsExample && tanakaParts && tanakaParts.has(entryObj.id))
1294
- existsExample = true;
1295
- if (existsExample) entryObj.hasPhrases = true;
1341
+ if (wordExamples.length > 0)
1342
+ entryObj.phraseIDs = (wordExamples.length > 5 ? wordExamples.slice(0, 5) : wordExamples).map((ex) => ex.id);
1296
1343
  }
1297
1344
  if (entryObj.id.length > 0 && entryObj.readings.length > 0 && entryObj.meanings.length > 0)
1298
1345
  dict.push(entryObj);
@@ -1391,9 +1438,12 @@ async function convertTanakaCorpus(tanakaString, generateFurigana) {
1391
1438
  let a = tanakaParsed[i];
1392
1439
  let b = tanakaParsed[i + 1];
1393
1440
  if (a && b && a.startsWith("A: ") && b.startsWith("B: ")) {
1394
- a = a.replace("A: ", "").replace(regexps.tanakaID, "");
1441
+ a = a.replace("A: ", "");
1395
1442
  b = b.replace("B: ", "");
1396
- const aParts = a.split(" ");
1443
+ const idMatch = regexps.tanakaID.exec(a);
1444
+ if (!idMatch || !idMatch.groups || !idMatch.groups["id"])
1445
+ throw new Error(`Invalid phrase ID for ${a}`);
1446
+ const aParts = a.replace(regexps.tanakaID, "").split(" ");
1397
1447
  const bParts = b.split(" ").filter((part) => part.trim().length !== 0).map((part) => {
1398
1448
  const partMatches = regexps.tanakaPart.exec(part);
1399
1449
  if (!partMatches || !partMatches.groups || partMatches.length === 0)
@@ -1431,8 +1481,9 @@ async function convertTanakaCorpus(tanakaString, generateFurigana) {
1431
1481
  mode: "furigana"
1432
1482
  });
1433
1483
  tanakaArray.push({
1434
- phrase,
1435
- translation,
1484
+ id: idMatch.groups["id"].trim(),
1485
+ phrase: phrase.trim(),
1486
+ translation: translation.trim(),
1436
1487
  parts: bParts,
1437
1488
  ...furigana ? { furigana } : {}
1438
1489
  });
@@ -1611,7 +1662,6 @@ function getWord(dict, id, kanjiDic, examples, dictWord, noteTypeName, deckPath)
1611
1662
  } : {},
1612
1663
  ...dictReading.commonness && dictReading.commonness.length > 0 ? { common: true } : {}
1613
1664
  }));
1614
- let usuallyInKanaMeanings = 0;
1615
1665
  word.translations = dictWord.meanings.map((dictMeaning) => {
1616
1666
  if (!dictMeaning.translations)
1617
1667
  throw new Error(`No translations for ${dictWord.id}`);
@@ -1664,11 +1714,10 @@ function getWord(dict, id, kanjiDic, examples, dictWord, noteTypeName, deckPath)
1664
1714
  dictMeaning.info,
1665
1715
  (info) => lookupWordNote(info, notes, word.tags, false, info)
1666
1716
  );
1667
- wordAddNoteArray(dictMeaning.misc, (misc) => {
1668
- lookupWordNote(misc, notes, word.tags, false, misc);
1669
- if (misc.toLowerCase() === "word usually written using kana alone")
1670
- usuallyInKanaMeanings++;
1671
- });
1717
+ wordAddNoteArray(
1718
+ dictMeaning.misc,
1719
+ (misc) => lookupWordNote(misc, notes, word.tags, false, misc)
1720
+ );
1672
1721
  for (let i = 0; i < notes.length; i++)
1673
1722
  notes[i] = capitalizeString(notes[i]);
1674
1723
  return {
@@ -1676,8 +1725,7 @@ function getWord(dict, id, kanjiDic, examples, dictWord, noteTypeName, deckPath)
1676
1725
  notes
1677
1726
  };
1678
1727
  });
1679
- if (word.translations && word.translations.length === usuallyInKanaMeanings)
1680
- word.usuallyInKana = true;
1728
+ if (dictWord.usuallyInKana === true) word.usuallyInKana = true;
1681
1729
  if (kanjiDic && word.kanjiForms) {
1682
1730
  word.kanji = [];
1683
1731
  for (const kanjiForm of word.kanjiForms)
@@ -1701,89 +1749,16 @@ function getWord(dict, id, kanjiDic, examples, dictWord, noteTypeName, deckPath)
1701
1749
  }
1702
1750
  if (word.kanji.length === 0) delete word.kanji;
1703
1751
  }
1704
- if (examples && dictWord.hasPhrases === true) {
1705
- const readings = new Set(
1706
- word.readings.filter(
1707
- (reading) => (!reading.notes || !reading.notes.some(
1708
- (note) => notSearchedForms.has(note)
1709
- )) && (word.common === void 0 || reading.common === true)
1710
- ).map((reading) => reading.reading)
1711
- );
1712
- const kanjiForms = word.kanjiForms ? new Set(
1713
- word.kanjiForms.filter(
1714
- (kanjiForm) => (!kanjiForm.notes || !kanjiForm.notes.some(
1715
- (note) => notSearchedForms.has(note)
1716
- )) && (word.common === void 0 || kanjiForm.common === true)
1717
- ).map((kanjiForm) => kanjiForm.kanjiForm)
1718
- ) : void 0;
1719
- const kanjiFormExamples = [];
1720
- const readingMatchingKanjiFormExamples = [];
1721
- const readingExamples = [];
1722
- const partParts = /* @__PURE__ */ new Set();
1723
- for (const example of examples)
1724
- for (const part of example.parts) {
1725
- const readingAsReadingMatch = part.reading !== void 0 && readings.has(part.reading);
1726
- if (kanjiForms && kanjiForms.size > 0 && kanjiForms.has(part.baseForm)) {
1727
- if (readingAsReadingMatch) {
1728
- readingMatchingKanjiFormExamples.push(example);
1729
- partParts.add(part.baseForm).add(part.reading);
1730
- } else {
1731
- kanjiFormExamples.push(example);
1732
- partParts.add(part.baseForm);
1733
- }
1734
- break;
1735
- }
1736
- const readingAsBaseFormMatch = readings.has(part.baseForm);
1737
- const referenceIDMatch = part.referenceID !== void 0 && word.id !== void 0 && part.referenceID === word.id;
1738
- if (readingAsReadingMatch || readingAsBaseFormMatch || referenceIDMatch) {
1739
- readingExamples.push(example);
1740
- if (readingAsReadingMatch) partParts.add(part.reading);
1741
- if (readingAsBaseFormMatch) partParts.add(part.baseForm);
1742
- if (referenceIDMatch) partParts.add(part.referenceID);
1743
- break;
1744
- }
1745
- }
1746
- const exampleSize = readingMatchingKanjiFormExamples.length + kanjiFormExamples.length + readingExamples.length;
1747
- const includeKanjiFormExamples = readingMatchingKanjiFormExamples.length < Math.max(2, Math.round(exampleSize * 0.05));
1748
- const includeReadingExamples = word.usuallyInKana === void 0 && includeKanjiFormExamples && readingExamples.length >= Math.max(10, Math.round(exampleSize * 0.15)) || word.usuallyInKana === true && readingExamples.length >= Math.max(2, Math.round(exampleSize * 0.5));
1749
- let wordExamples = [
1750
- ...readingMatchingKanjiFormExamples,
1751
- ...includeKanjiFormExamples ? kanjiFormExamples : [],
1752
- ...includeReadingExamples ? readingExamples : []
1753
- ];
1754
- if (word.translations) {
1755
- const glossSpecificExamples = [];
1756
- const seenPhrases = /* @__PURE__ */ new Set();
1757
- for (let i = 0; i < word.translations.length; i++) {
1758
- outer: for (const example of wordExamples) {
1759
- if (seenPhrases.has(example.phrase)) continue;
1760
- for (const part of example.parts)
1761
- if (part.glossNumber === i + 1 && (partParts.has(part.baseForm) || part.reading && partParts.has(part.reading) || part.referenceID && partParts.has(part.referenceID))) {
1762
- glossSpecificExamples.push(example);
1763
- seenPhrases.add(example.phrase);
1764
- break outer;
1765
- }
1766
- }
1767
- if (glossSpecificExamples.length === 5) break;
1768
- }
1769
- if (glossSpecificExamples.length === 5)
1770
- wordExamples = [...glossSpecificExamples];
1771
- else if (glossSpecificExamples.length > 0) {
1772
- const seenPhrases2 = new Set(
1773
- glossSpecificExamples.map((ex) => ex.phrase)
1774
- );
1775
- wordExamples = [
1776
- ...glossSpecificExamples,
1777
- ...wordExamples.filter((ex) => !seenPhrases2.has(ex.phrase)).slice(0, 5 - glossSpecificExamples.length)
1778
- ];
1779
- }
1780
- }
1781
- if (wordExamples.length > 0)
1782
- word.phrases = (wordExamples.length > 5 ? wordExamples.slice(0, 5) : wordExamples).map((ex) => ({
1783
- phrase: ex.furigana ?? ex.phrase,
1784
- translation: ex.translation,
1785
- originalPhrase: ex.phrase
1786
- }));
1752
+ if (examples && dictWord.phraseIDs && dictWord.phraseIDs.length > 0) {
1753
+ word.phrases = [];
1754
+ const phraseIDs = new Set(dictWord.phraseIDs);
1755
+ for (const ex of examples)
1756
+ if (phraseIDs.has(ex.id))
1757
+ word.phrases.push({
1758
+ phrase: ex.furigana ?? ex.phrase,
1759
+ translation: ex.translation,
1760
+ originalPhrase: ex.phrase
1761
+ });
1787
1762
  }
1788
1763
  return word;
1789
1764
  } else throw new Error(`Word${id ? ` ${id}` : ""} not found`);