twl-generator 1.3.6 → 1.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/cli.js +6 -1
- package/src/index.js +113 -141
- package/src/utils/twl-matcher.js +9 -0
- package/src/utils/usfm-alignment-remover.js +5 -3
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "twl-generator",
|
|
3
|
-
"version": "1.3.6",
|
|
3
|
+
"version": "1.4.0",
|
|
4
4
|
"description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
|
|
5
5
|
"main": "src/index.js",
|
|
6
6
|
"bin": {
|
package/src/cli.js
CHANGED
|
@@ -63,7 +63,12 @@ async function main() {
|
|
|
63
63
|
console.log(`Wrote ${out}`);
|
|
64
64
|
const dir = path.dirname(outPath);
|
|
65
65
|
const base = path.basename(outPath);
|
|
66
|
-
|
|
66
|
+
// Derive a sensible no-match filename when --out doesn't follow *.twl.tsv
|
|
67
|
+
let nmFile;
|
|
68
|
+
if (/\.twl\.tsv$/i.test(base)) nmFile = base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv');
|
|
69
|
+
else if (/\.tsv$/i.test(base)) nmFile = base.replace(/\.tsv$/i, '.no-match.twl.tsv');
|
|
70
|
+
else nmFile = base + '.no-match.twl.tsv';
|
|
71
|
+
const nmPath = path.join(dir, nmFile);
|
|
67
72
|
await fs.writeFile(nmPath, noMatchTsv, 'utf8');
|
|
68
73
|
console.log(`Wrote ${nmPath}`);
|
|
69
74
|
} else if (outDir) {
|
package/src/index.js
CHANGED
|
@@ -863,79 +863,37 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
|
|
|
863
863
|
}
|
|
864
864
|
|
|
865
865
|
export async function generateTwlByBook(bookCode, options = {}) {
|
|
866
|
-
//
|
|
867
|
-
|
|
868
|
-
|
|
869
|
-
const
|
|
870
|
-
|
|
871
|
-
|
|
872
|
-
const
|
|
873
|
-
|
|
866
|
+
// New: English-first matching (no Strong's), using ULT USFM verses
|
|
867
|
+
// Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
|
|
868
|
+
const twJson = await loadTwJsonLocal();
|
|
869
|
+
const termToArticles = {};
|
|
870
|
+
for (const [article, val] of Object.entries(twJson)) {
|
|
871
|
+
const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
|
|
872
|
+
for (const raw of terms) {
|
|
873
|
+
const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
|
|
874
|
+
if (!term) continue;
|
|
875
|
+
if (!termToArticles[term]) termToArticles[term] = [];
|
|
876
|
+
// Use slug as-is (e.g., kt/grace)
|
|
877
|
+
termToArticles[term].push(article);
|
|
878
|
+
}
|
|
874
879
|
}
|
|
880
|
+
|
|
881
|
+
// Build trie for fast scanning
|
|
882
|
+
const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
|
|
883
|
+
const trie = buildTermTrie(termToArticles);
|
|
884
|
+
|
|
885
|
+
// Fetch and parse ULT USFM into verses
|
|
886
|
+
const { processUsfmForBook } = await import('./utils/usfm-alignment-remover.js');
|
|
875
887
|
const bibleData = await readBooks();
|
|
876
888
|
const meta = findBookMeta(bibleData, bookCode);
|
|
877
889
|
if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
|
|
878
|
-
const
|
|
879
|
-
const twJson = await loadTwJsonLocal();
|
|
880
|
-
const strongPivot = pivotByStrong(twJson);
|
|
881
|
-
|
|
882
|
-
// 1) initial TSV
|
|
883
|
-
const baseTsv = buildInitialTsv(usfm, strongPivot, meta.key);
|
|
884
|
-
|
|
885
|
-
// 2) add GLQuote and GLOccurrence
|
|
886
|
-
const glRes = await addGLQuoteCols({
|
|
887
|
-
bibleLinks: ["unfoldingWord/en_ult/master"],
|
|
888
|
-
bookCode: meta.key,
|
|
889
|
-
tsvContent: baseTsv,
|
|
890
|
-
trySeparatorsAndOccurrences: true,
|
|
891
|
-
});
|
|
892
|
-
const withGl = glRes.output;
|
|
893
|
-
|
|
894
|
-
// 3) Convert GLQuote/GLOccurrence into OrigWords/Occurrence and convert to OL quotes BEFORE matching
|
|
895
|
-
const lines0 = withGl.split(/\r?\n/);
|
|
896
|
-
const header0 = lines0.shift();
|
|
897
|
-
const h0 = header0.split('\t');
|
|
898
|
-
const I0 = {
|
|
899
|
-
Reference: h0.indexOf('Reference'),
|
|
900
|
-
ID: h0.indexOf('ID'),
|
|
901
|
-
Tags: h0.indexOf('Tags'),
|
|
902
|
-
OrigWords: h0.indexOf('OrigWords'),
|
|
903
|
-
Occurrence: h0.indexOf('Occurrence'),
|
|
904
|
-
TWLink: h0.indexOf('TWLink'),
|
|
905
|
-
GLQuote: h0.indexOf('GLQuote'),
|
|
906
|
-
GLOccurrence: h0.indexOf('GLOccurrence'),
|
|
907
|
-
};
|
|
908
|
-
const rebuilt0 = [header0].concat(lines0.filter(Boolean).map(row => {
|
|
909
|
-
const c = row.split('\t');
|
|
910
|
-
const newCols = c.slice();
|
|
911
|
-
if (I0.GLQuote >= 0) newCols[I0.OrigWords] = c[I0.GLQuote];
|
|
912
|
-
if (I0.GLOccurrence >= 0) newCols[I0.Occurrence] = c[I0.GLOccurrence];
|
|
913
|
-
return newCols.join('\t');
|
|
914
|
-
})).join('\n');
|
|
915
|
-
const convEarly = await convertGLQuotes2OLQuotes({
|
|
916
|
-
bibleLinks: ["unfoldingWord/en_ult/master"],
|
|
917
|
-
bookCode: meta.key,
|
|
918
|
-
tsvContent: rebuilt0,
|
|
919
|
-
trySeparatorsAndOccurrences: true,
|
|
920
|
-
});
|
|
890
|
+
const versesByChapter = await processUsfmForBook(meta.key);
|
|
921
891
|
|
|
922
|
-
//
|
|
923
|
-
const
|
|
924
|
-
const
|
|
925
|
-
const aCols = headerA.split('\t');
|
|
926
|
-
const A = {
|
|
927
|
-
Reference: aCols.indexOf('Reference'),
|
|
928
|
-
ID: aCols.indexOf('ID'),
|
|
929
|
-
Tags: aCols.indexOf('Tags'),
|
|
930
|
-
OrigWords: aCols.indexOf('OrigWords'),
|
|
931
|
-
Occurrence: aCols.indexOf('Occurrence'),
|
|
932
|
-
TWLink: aCols.indexOf('TWLink'),
|
|
933
|
-
GLQuote: aCols.indexOf('GLQuote'),
|
|
934
|
-
GLOccurrence: aCols.indexOf('GLOccurrence'),
|
|
935
|
-
};
|
|
892
|
+
// Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
|
|
893
|
+
const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
|
|
894
|
+
const outRows = [header.join('\t')];
|
|
936
895
|
|
|
937
|
-
//
|
|
938
|
-
const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Strongs'];
|
|
896
|
+
// ID generator
|
|
939
897
|
const usedIds = new Set();
|
|
940
898
|
const genId = () => {
|
|
941
899
|
const letters = 'abcdefghijklmnopqrstuvwxyz';
|
|
@@ -949,86 +907,100 @@ export async function generateTwlByBook(bookCode, options = {}) {
|
|
|
949
907
|
}
|
|
950
908
|
};
|
|
951
909
|
|
|
952
|
-
|
|
953
|
-
|
|
954
|
-
if (
|
|
955
|
-
|
|
956
|
-
if (
|
|
957
|
-
|
|
958
|
-
|
|
959
|
-
|
|
960
|
-
|
|
961
|
-
|
|
962
|
-
|
|
963
|
-
|
|
964
|
-
|
|
965
|
-
|
|
966
|
-
|
|
967
|
-
|
|
968
|
-
|
|
969
|
-
|
|
970
|
-
|
|
971
|
-
|
|
972
|
-
|
|
973
|
-
|
|
974
|
-
|
|
975
|
-
|
|
976
|
-
|
|
977
|
-
|
|
978
|
-
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
GLQuote: 6,
|
|
982
|
-
GLOccurrence: 7,
|
|
983
|
-
Strongs: 8,
|
|
910
|
+
// Helpers for Variant of decision (allow only plural/-ed/-ing without marking variant)
|
|
911
|
+
const pluralizeWord = (w) => {
|
|
912
|
+
if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
|
|
913
|
+
if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
|
|
914
|
+
if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
|
|
915
|
+
if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
|
|
916
|
+
if (/o$/i.test(w)) return w + 'es';
|
|
917
|
+
return w + 's';
|
|
918
|
+
};
|
|
919
|
+
const isVowel = (ch) => /[aeiou]/i.test(ch);
|
|
920
|
+
const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
|
|
921
|
+
const endsWithCVC = (w) => w.length >= 3 && isConsonant(w[w.length - 3]) && isVowel(w[w.length - 2]) && isConsonant(w[w.length - 1]) && !/[wxy]/i.test(w[w.length - 1]);
|
|
922
|
+
const edForm = (w) => (/e$/i.test(w) ? w + 'd' : (/[^aeiou]y$/i.test(w) ? w.replace(/y$/i, 'ied') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ed' : w + 'ed')));
|
|
923
|
+
const ingForm = (w) => (/ie$/i.test(w) ? w.replace(/ie$/i, 'ying') : (/ee$/i.test(w) ? w + 'ing' : (/e$/i.test(w) ? w.replace(/e$/i, 'ing') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ing' : w + 'ing'))));
|
|
924
|
+
|
|
925
|
+
const allowNoVariant = (base, match) => {
|
|
926
|
+
const b = String(base || '');
|
|
927
|
+
const m = String(match || '');
|
|
928
|
+
if (!b || !m) return true;
|
|
929
|
+
if (b.toLowerCase() === m.toLowerCase()) return true;
|
|
930
|
+
const parts = b.trim().split(/\s+/);
|
|
931
|
+
const head = parts.length > 1 ? parts.slice(0, -1).join(' ') + ' ' : '';
|
|
932
|
+
const last = parts[parts.length - 1];
|
|
933
|
+
const allowed = new Set([
|
|
934
|
+
head + pluralizeWord(last),
|
|
935
|
+
head + edForm(last),
|
|
936
|
+
head + ingForm(last),
|
|
937
|
+
].map(x => x.toLowerCase()));
|
|
938
|
+
return allowed.has(m.toLowerCase());
|
|
984
939
|
};
|
|
985
940
|
|
|
986
|
-
//
|
|
987
|
-
const
|
|
988
|
-
const
|
|
989
|
-
|
|
990
|
-
|
|
991
|
-
|
|
992
|
-
|
|
993
|
-
|
|
994
|
-
|
|
995
|
-
|
|
996
|
-
|
|
997
|
-
|
|
998
|
-
|
|
999
|
-
|
|
1000
|
-
|
|
1001
|
-
|
|
1002
|
-
|
|
1003
|
-
const
|
|
1004
|
-
|
|
941
|
+
// Walk through verses in order
|
|
942
|
+
const chapterNums = Object.keys(versesByChapter).map(n => parseInt(n, 10)).sort((a, b) => a - b);
|
|
943
|
+
for (const c of chapterNums) {
|
|
944
|
+
const verses = versesByChapter[c] || {};
|
|
945
|
+
const verseNums = Object.keys(verses).map(n => parseInt(n, 10)).sort((a, b) => a - b);
|
|
946
|
+
for (const v of verseNums) {
|
|
947
|
+
const text = verses[v] || '';
|
|
948
|
+
const matches = scanVerseMatches(text, trie);
|
|
949
|
+
// Count occurrences per exact matchedText (case-sensitive)
|
|
950
|
+
const occMap = new Map();
|
|
951
|
+
for (const m of matches) {
|
|
952
|
+
const glq = m.matchedText;
|
|
953
|
+
const occ = (occMap.get(glq) || 0) + 1;
|
|
954
|
+
occMap.set(glq, occ);
|
|
955
|
+
|
|
956
|
+
const ref = `${c}:${v}`;
|
|
957
|
+
const id = genId();
|
|
958
|
+
const primaryArticle = (m.articles && m.articles[0]) || '';
|
|
959
|
+
let tag = '';
|
|
960
|
+
if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
|
|
961
|
+
else if (primaryArticle.startsWith('names/')) tag = 'name';
|
|
962
|
+
const twLink = primaryArticle ? `rc://*/tw/dict/bible/${primaryArticle}` : '';
|
|
963
|
+
|
|
964
|
+
// Variant of: only if beyond plural/-ed/-ing differences
|
|
965
|
+
const variantOf = allowNoVariant(m.term, glq) ? '' : m.term;
|
|
966
|
+
// Disambiguation: list all candidate articles for this match
|
|
967
|
+
const disamb = (m.articles && m.articles.length > 1) ? `(${m.articles.join(', ')})` : '';
|
|
968
|
+
|
|
969
|
+
// Set OrigWords/Occurrence equal to GLQuote/GLOccurrence for English-first output
|
|
970
|
+
outRows.push([
|
|
971
|
+
ref,
|
|
972
|
+
id,
|
|
973
|
+
tag,
|
|
974
|
+
glq,
|
|
975
|
+
String(occ),
|
|
976
|
+
twLink,
|
|
977
|
+
glq,
|
|
978
|
+
String(occ),
|
|
979
|
+
variantOf,
|
|
980
|
+
disamb,
|
|
981
|
+
].join('\t'));
|
|
1005
982
|
}
|
|
1006
|
-
const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
|
|
1007
|
-
const disambTried = tried.length ? `(${tried.join(', ')})` : '';
|
|
1008
|
-
noMatchRows.push(cols.join('\t') + '\t' + disambTried);
|
|
1009
|
-
continue;
|
|
1010
983
|
}
|
|
1011
|
-
const art = result.article;
|
|
1012
|
-
cols[H.TWLink] = `rc://*/tw/dict/bible/${art}`;
|
|
1013
|
-
// Update Tags based on selected article prefix
|
|
1014
|
-
let tag = '';
|
|
1015
|
-
if (art.startsWith('kt/')) tag = 'keyterm';
|
|
1016
|
-
else if (art.startsWith('names/')) tag = 'name';
|
|
1017
|
-
cols[H.Tags] = tag;
|
|
1018
|
-
if (result.disamb) multiDisambRows++;
|
|
1019
|
-
const variantOf = result.variantTerm || '';
|
|
1020
|
-
outRows.push(cols.join('\t') + '\t' + variantOf + '\t' + (result.disamb || ''));
|
|
1021
984
|
}
|
|
1022
985
|
|
|
1023
|
-
|
|
1024
|
-
|
|
1025
|
-
|
|
1026
|
-
|
|
1027
|
-
|
|
1028
|
-
|
|
986
|
+
// Build TSV and convert GL OrigWords back to OL using tsv-quote-converters
|
|
987
|
+
let matchedTsv = outRows.join('\n');
|
|
988
|
+
try {
|
|
989
|
+
const { convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
|
|
990
|
+
const conv = await convertGLQuotes2OLQuotes({
|
|
991
|
+
bibleLinks: ['unfoldingWord/en_ult/master'],
|
|
992
|
+
bookCode: String(meta.key || bookCode).toLowerCase(),
|
|
993
|
+
tsvContent: matchedTsv,
|
|
994
|
+
trySeparatorsAndOccurrences: true,
|
|
995
|
+
quiet: true,
|
|
996
|
+
});
|
|
997
|
+
if (conv && typeof conv.output === 'string' && conv.output.length) {
|
|
998
|
+
matchedTsv = conv.output;
|
|
999
|
+
}
|
|
1000
|
+
} catch (e) {
|
|
1001
|
+
// If conversion fails (e.g., no network), fall back to unconverted TSV
|
|
1029
1002
|
}
|
|
1030
|
-
|
|
1031
|
-
const
|
|
1032
|
-
const noMatchTsv = noMatchRows.join('\n');
|
|
1003
|
+
const noMatchHeader = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Disambiguation'];
|
|
1004
|
+
const noMatchTsv = [noMatchHeader.join('\t')].join('\n');
|
|
1033
1005
|
return { matchedTsv, noMatchTsv };
|
|
1034
1006
|
}
|
package/src/utils/twl-matcher.js
CHANGED
|
@@ -427,3 +427,12 @@ export function generateTWLMatches(twTerms, verses) {
|
|
|
427
427
|
|
|
428
428
|
return tsvRows.join('\n');
|
|
429
429
|
}
|
|
430
|
+
|
|
431
|
+
// Expose lightweight building and scanning APIs for reuse
|
|
432
|
+
export function buildTermTrie(twTerms) {
|
|
433
|
+
return createOptimizedTermMap(twTerms);
|
|
434
|
+
}
|
|
435
|
+
|
|
436
|
+
export function scanVerseMatches(verseText, termTrie) {
|
|
437
|
+
return findMatches(verseText, termTrie);
|
|
438
|
+
}
|
package/src/utils/usfm-alignment-remover.js
CHANGED
|
@@ -69,10 +69,12 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
|
|
|
69
69
|
* @return {Promise<Object>} - Object with chapters and verses
|
|
70
70
|
*/
|
|
71
71
|
export async function processUsfmForBook(book) {
|
|
72
|
-
|
|
72
|
+
// Normalize book key to lowercase to match BibleBookData keys
|
|
73
|
+
const key = String(book || '').toLowerCase();
|
|
74
|
+
if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
|
|
73
75
|
|
|
74
76
|
const fetch = await getFetch();
|
|
75
|
-
const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[
|
|
77
|
+
const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
|
|
76
78
|
const usfmRes = await fetch(usfmUrl);
|
|
77
79
|
if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
|
|
78
80
|
const usfmData = await usfmRes.json();
|
|
@@ -120,4 +122,4 @@ export function parseUsfmToVerses(usfm) {
|
|
|
120
122
|
}
|
|
121
123
|
|
|
122
124
|
return versesObj;
|
|
123
|
-
}
|
|
125
|
+
}
|