twl-generator 1.3.6 → 1.4.0

This diff shows the changes between publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the differences between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "twl-generator",
3
- "version": "1.3.6",
3
+ "version": "1.4.0",
4
4
  "description": "Generate term-to-article lists from unfoldingWord en_tw archive for Bible books. Works in both Node.js (CLI) and React.js (browser) environments.",
5
5
  "main": "src/index.js",
6
6
  "bin": {
package/src/cli.js CHANGED
@@ -63,7 +63,12 @@ async function main() {
63
63
  console.log(`Wrote ${out}`);
64
64
  const dir = path.dirname(outPath);
65
65
  const base = path.basename(outPath);
66
- const nmPath = path.join(dir, base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv'));
66
+ // Derive a sensible no-match filename when --out doesn't follow *.twl.tsv
67
+ let nmFile;
68
+ if (/\.twl\.tsv$/i.test(base)) nmFile = base.replace(/\.twl\.tsv$/i, '.no-match.twl.tsv');
69
+ else if (/\.tsv$/i.test(base)) nmFile = base.replace(/\.tsv$/i, '.no-match.twl.tsv');
70
+ else nmFile = base + '.no-match.twl.tsv';
71
+ const nmPath = path.join(dir, nmFile);
67
72
  await fs.writeFile(nmPath, noMatchTsv, 'utf8');
68
73
  console.log(`Wrote ${nmPath}`);
69
74
  } else if (outDir) {
package/src/index.js CHANGED
@@ -863,79 +863,37 @@ function chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twMap, opts
863
863
  }
864
864
 
865
865
  export async function generateTwlByBook(bookCode, options = {}) {
866
- // Import Node-specific modules conditionally
867
- const { addGLQuoteCols, convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
868
-
869
- const useCompromise = !!options.useCompromise;
870
- let nlp = null;
871
- if (useCompromise) {
872
- const mod = await import('compromise');
873
- nlp = mod.default || mod;
866
+ // New: English-first matching (no Strong's), using ULT USFM verses
867
+ // Build term -> [articles] from local tw_strongs_list.json (terms only; ignore Strong's)
868
+ const twJson = await loadTwJsonLocal();
869
+ const termToArticles = {};
870
+ for (const [article, val] of Object.entries(twJson)) {
871
+ const terms = (val && val.article && Array.isArray(val.article.terms)) ? val.article.terms : [];
872
+ for (const raw of terms) {
873
+ const term = String(raw || '').replace(/\s*\([^)]*\)\s*$/, '').trim();
874
+ if (!term) continue;
875
+ if (!termToArticles[term]) termToArticles[term] = [];
876
+ // Use slug as-is (e.g., kt/grace)
877
+ termToArticles[term].push(article);
878
+ }
874
879
  }
880
+
881
+ // Build trie for fast scanning
882
+ const { buildTermTrie, scanVerseMatches } = await import('./utils/twl-matcher.js');
883
+ const trie = buildTermTrie(termToArticles);
884
+
885
+ // Fetch and parse ULT USFM into verses
886
+ const { processUsfmForBook } = await import('./utils/usfm-alignment-remover.js');
875
887
  const bibleData = await readBooks();
876
888
  const meta = findBookMeta(bibleData, bookCode);
877
889
  if (!meta) throw new Error(`Unknown book code: ${bookCode}`);
878
- const usfm = await fetchUsfm(meta.usfm, meta.testament);
879
- const twJson = await loadTwJsonLocal();
880
- const strongPivot = pivotByStrong(twJson);
881
-
882
- // 1) initial TSV
883
- const baseTsv = buildInitialTsv(usfm, strongPivot, meta.key);
884
-
885
- // 2) add GLQuote and GLOccurrence
886
- const glRes = await addGLQuoteCols({
887
- bibleLinks: ["unfoldingWord/en_ult/master"],
888
- bookCode: meta.key,
889
- tsvContent: baseTsv,
890
- trySeparatorsAndOccurrences: true,
891
- });
892
- const withGl = glRes.output;
893
-
894
- // 3) Convert GLQuote/GLOccurrence into OrigWords/Occurrence and convert to OL quotes BEFORE matching
895
- const lines0 = withGl.split(/\r?\n/);
896
- const header0 = lines0.shift();
897
- const h0 = header0.split('\t');
898
- const I0 = {
899
- Reference: h0.indexOf('Reference'),
900
- ID: h0.indexOf('ID'),
901
- Tags: h0.indexOf('Tags'),
902
- OrigWords: h0.indexOf('OrigWords'),
903
- Occurrence: h0.indexOf('Occurrence'),
904
- TWLink: h0.indexOf('TWLink'),
905
- GLQuote: h0.indexOf('GLQuote'),
906
- GLOccurrence: h0.indexOf('GLOccurrence'),
907
- };
908
- const rebuilt0 = [header0].concat(lines0.filter(Boolean).map(row => {
909
- const c = row.split('\t');
910
- const newCols = c.slice();
911
- if (I0.GLQuote >= 0) newCols[I0.OrigWords] = c[I0.GLQuote];
912
- if (I0.GLOccurrence >= 0) newCols[I0.Occurrence] = c[I0.GLOccurrence];
913
- return newCols.join('\t');
914
- })).join('\n');
915
- const convEarly = await convertGLQuotes2OLQuotes({
916
- bibleLinks: ["unfoldingWord/en_ult/master"],
917
- bookCode: meta.key,
918
- tsvContent: rebuilt0,
919
- trySeparatorsAndOccurrences: true,
920
- });
890
+ const versesByChapter = await processUsfmForBook(meta.key);
921
891
 
922
- // 4) Reorder columns and add Strongs + randomized 4-char IDs before matching
923
- const linesA = convEarly.output.split(/\r?\n/);
924
- const headerA = linesA.shift();
925
- const aCols = headerA.split('\t');
926
- const A = {
927
- Reference: aCols.indexOf('Reference'),
928
- ID: aCols.indexOf('ID'),
929
- Tags: aCols.indexOf('Tags'),
930
- OrigWords: aCols.indexOf('OrigWords'),
931
- Occurrence: aCols.indexOf('Occurrence'),
932
- TWLink: aCols.indexOf('TWLink'),
933
- GLQuote: aCols.indexOf('GLQuote'),
934
- GLOccurrence: aCols.indexOf('GLOccurrence'),
935
- };
892
+ // Header without Strongs; keep GLQuote/GLOccurrence and add Variant of, Disambiguation
893
+ const header = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Variant of', 'Disambiguation'];
894
+ const outRows = [header.join('\t')];
936
895
 
937
- // New header order: Reference, ID, Tags, OrigWords, Occurrence, TWLink, Strongs, GLQuote, GLOccurrence
938
- const finalHeaderBase = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Strongs'];
896
+ // ID generator
939
897
  const usedIds = new Set();
940
898
  const genId = () => {
941
899
  const letters = 'abcdefghijklmnopqrstuvwxyz';
@@ -949,86 +907,100 @@ export async function generateTwlByBook(bookCode, options = {}) {
949
907
  }
950
908
  };
951
909
 
952
- const preparedRows = [];
953
- for (const ln of linesA) {
954
- if (!ln.trim()) continue;
955
- const c = ln.split('\t');
956
- if (c.length < 7) continue;
957
- const strongsVal = c[A.ID];
958
- const newId = genId();
959
- const newRow = [
960
- c[A.Reference],
961
- newId,
962
- c[A.Tags],
963
- c[A.OrigWords],
964
- c[A.Occurrence],
965
- c[A.TWLink],
966
- c[A.GLQuote],
967
- c[A.GLOccurrence],
968
- strongsVal,
969
- ];
970
- preparedRows.push(newRow);
971
- }
972
-
973
- // Indexes for prepared rows
974
- const H = {
975
- Reference: 0,
976
- ID: 1,
977
- Tags: 2,
978
- OrigWords: 3,
979
- Occurrence: 4,
980
- TWLink: 5,
981
- GLQuote: 6,
982
- GLOccurrence: 7,
983
- Strongs: 8,
910
+ // Helpers for Variant of decision (allow only plural/-ed/-ing without marking variant)
911
+ const pluralizeWord = (w) => {
912
+ if (/[^aeiou]y$/i.test(w)) return w.replace(/y$/i, 'ies');
913
+ if (/(s|x|z|ch|sh)$/i.test(w)) return w + 'es';
914
+ if (/f$/i.test(w) && !/(roof|belief|chief|proof)$/i.test(w)) return w.replace(/f$/i, 'ves');
915
+ if (/fe$/i.test(w)) return w.replace(/fe$/i, 'ves');
916
+ if (/o$/i.test(w)) return w + 'es';
917
+ return w + 's';
918
+ };
919
+ const isVowel = (ch) => /[aeiou]/i.test(ch);
920
+ const isConsonant = (ch) => /[a-z]/i.test(ch) && !isVowel(ch);
921
+ const endsWithCVC = (w) => w.length >= 3 && isConsonant(w[w.length - 3]) && isVowel(w[w.length - 2]) && isConsonant(w[w.length - 1]) && !/[wxy]/i.test(w[w.length - 1]);
922
+ const edForm = (w) => (/e$/i.test(w) ? w + 'd' : (/[^aeiou]y$/i.test(w) ? w.replace(/y$/i, 'ied') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ed' : w + 'ed')));
923
+ const ingForm = (w) => (/ie$/i.test(w) ? w.replace(/ie$/i, 'ying') : (/ee$/i.test(w) ? w + 'ing' : (/e$/i.test(w) ? w.replace(/e$/i, 'ing') : (endsWithCVC(w) ? w + w[w.length - 1] + 'ing' : w + 'ing'))));
924
+
925
+ const allowNoVariant = (base, match) => {
926
+ const b = String(base || '');
927
+ const m = String(match || '');
928
+ if (!b || !m) return true;
929
+ if (b.toLowerCase() === m.toLowerCase()) return true;
930
+ const parts = b.trim().split(/\s+/);
931
+ const head = parts.length > 1 ? parts.slice(0, -1).join(' ') + ' ' : '';
932
+ const last = parts[parts.length - 1];
933
+ const allowed = new Set([
934
+ head + pluralizeWord(last),
935
+ head + edForm(last),
936
+ head + ingForm(last),
937
+ ].map(x => x.toLowerCase()));
938
+ return allowed.has(m.toLowerCase());
984
939
  };
985
940
 
986
- // 5) pick best TWLink based on GLQuote terms using Strongs column; include Variant of column
987
- const termMap = buildArticleTermMap(twJson);
988
- const outRows = [finalHeaderBase.concat(['Variant of', 'Disambiguation']).join('\t')];
989
- const noMatchRows = [finalHeaderBase.concat(['Disambiguation']).join('\t')];
990
- let totalRows = 0;
991
- let droppedRows = 0;
992
- let multiDisambRows = 0;
993
- const noMatchSamples = [];
994
-
995
- for (const cols of preparedRows) {
996
- totalRows++;
997
- const strongId = cols[H.Strongs];
998
- const glq = cols[H.GLQuote] || '';
999
- const result = chooseArticleByGlQuote(glq, strongId, strongPivot, termMap, twJson, { useCompromise, nlp });
1000
- if (!result) {
1001
- droppedRows++;
1002
- if (noMatchSamples.length < 8) {
1003
- const ref = cols[H.Reference] || '';
1004
- noMatchSamples.push(`${ref}\t${strongId}\t${glq}`);
941
+ // Walk through verses in order
942
+ const chapterNums = Object.keys(versesByChapter).map(n => parseInt(n, 10)).sort((a, b) => a - b);
943
+ for (const c of chapterNums) {
944
+ const verses = versesByChapter[c] || {};
945
+ const verseNums = Object.keys(verses).map(n => parseInt(n, 10)).sort((a, b) => a - b);
946
+ for (const v of verseNums) {
947
+ const text = verses[v] || '';
948
+ const matches = scanVerseMatches(text, trie);
949
+ // Count occurrences per exact matchedText (case-sensitive)
950
+ const occMap = new Map();
951
+ for (const m of matches) {
952
+ const glq = m.matchedText;
953
+ const occ = (occMap.get(glq) || 0) + 1;
954
+ occMap.set(glq, occ);
955
+
956
+ const ref = `${c}:${v}`;
957
+ const id = genId();
958
+ const primaryArticle = (m.articles && m.articles[0]) || '';
959
+ let tag = '';
960
+ if (primaryArticle.startsWith('kt/')) tag = 'keyterm';
961
+ else if (primaryArticle.startsWith('names/')) tag = 'name';
962
+ const twLink = primaryArticle ? `rc://*/tw/dict/bible/${primaryArticle}` : '';
963
+
964
+ // Variant of: only if beyond plural/-ed/-ing differences
965
+ const variantOf = allowNoVariant(m.term, glq) ? '' : m.term;
966
+ // Disambiguation: list all candidate articles for this match
967
+ const disamb = (m.articles && m.articles.length > 1) ? `(${m.articles.join(', ')})` : '';
968
+
969
+ // Set OrigWords/Occurrence equal to GLQuote/GLOccurrence for English-first output
970
+ outRows.push([
971
+ ref,
972
+ id,
973
+ tag,
974
+ glq,
975
+ String(occ),
976
+ twLink,
977
+ glq,
978
+ String(occ),
979
+ variantOf,
980
+ disamb,
981
+ ].join('\t'));
1005
982
  }
1006
- const tried = prioritizeArticles(glq, strongId, strongPivot) || [];
1007
- const disambTried = tried.length ? `(${tried.join(', ')})` : '';
1008
- noMatchRows.push(cols.join('\t') + '\t' + disambTried);
1009
- continue;
1010
983
  }
1011
- const art = result.article;
1012
- cols[H.TWLink] = `rc://*/tw/dict/bible/${art}`;
1013
- // Update Tags based on selected article prefix
1014
- let tag = '';
1015
- if (art.startsWith('kt/')) tag = 'keyterm';
1016
- else if (art.startsWith('names/')) tag = 'name';
1017
- cols[H.Tags] = tag;
1018
- if (result.disamb) multiDisambRows++;
1019
- const variantOf = result.variantTerm || '';
1020
- outRows.push(cols.join('\t') + '\t' + variantOf + '\t' + (result.disamb || ''));
1021
984
  }
1022
985
 
1023
- const keptRows = totalRows - droppedRows;
1024
- const pct = totalRows ? ((keptRows / totalRows) * 100).toFixed(1) : '0.0';
1025
- console.log(`[TWL] ${bookCode.toUpperCase()}: kept ${keptRows}/${totalRows} (${pct}%), dropped ${droppedRows}, disambiguated ${multiDisambRows}`);
1026
- if (noMatchSamples.length) {
1027
- console.log(`[TWL] ${bookCode.toUpperCase()}: no-match samples (up to 8):`);
1028
- for (const s of noMatchSamples) console.log(` ${s}`);
986
+ // Build TSV and convert GL OrigWords back to OL using tsv-quote-converters
987
+ let matchedTsv = outRows.join('\n');
988
+ try {
989
+ const { convertGLQuotes2OLQuotes } = await import('tsv-quote-converters');
990
+ const conv = await convertGLQuotes2OLQuotes({
991
+ bibleLinks: ['unfoldingWord/en_ult/master'],
992
+ bookCode: String(meta.key || bookCode).toLowerCase(),
993
+ tsvContent: matchedTsv,
994
+ trySeparatorsAndOccurrences: true,
995
+ quiet: true,
996
+ });
997
+ if (conv && typeof conv.output === 'string' && conv.output.length) {
998
+ matchedTsv = conv.output;
999
+ }
1000
+ } catch (e) {
1001
+ // If conversion fails (e.g., no network), fall back to unconverted TSV
1029
1002
  }
1030
-
1031
- const matchedTsv = outRows.join('\n');
1032
- const noMatchTsv = noMatchRows.join('\n');
1003
+ const noMatchHeader = ['Reference', 'ID', 'Tags', 'OrigWords', 'Occurrence', 'TWLink', 'GLQuote', 'GLOccurrence', 'Disambiguation'];
1004
+ const noMatchTsv = [noMatchHeader.join('\t')].join('\n');
1033
1005
  return { matchedTsv, noMatchTsv };
1034
1006
  }
@@ -427,3 +427,12 @@ export function generateTWLMatches(twTerms, verses) {
427
427
 
428
428
  return tsvRows.join('\n');
429
429
  }
430
+
431
+ // Expose lightweight building and scanning APIs for reuse
432
+ export function buildTermTrie(twTerms) {
433
+ return createOptimizedTermMap(twTerms);
434
+ }
435
+
436
+ export function scanVerseMatches(verseText, termTrie) {
437
+ return findMatches(verseText, termTrie);
438
+ }
@@ -69,10 +69,12 @@ export const removeAllTagsExceptChapterVerse = (usfmContent) => {
69
69
  * @return {Promise<Object>} - Object with chapters and verses
70
70
  */
71
71
  export async function processUsfmForBook(book) {
72
- if (!BibleBookData[book]) throw new Error(`Unknown book: ${book}`);
72
+ // Normalize book key to lowercase to match BibleBookData keys
73
+ const key = String(book || '').toLowerCase();
74
+ if (!BibleBookData[key]) throw new Error(`Unknown book: ${book}`);
73
75
 
74
76
  const fetch = await getFetch();
75
- const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[book].usfm}.usfm?ref=master`;
77
+ const usfmUrl = `https://git.door43.org/api/v1/repos/unfoldingWord/en_ult/contents/${BibleBookData[key].usfm}.usfm?ref=master`;
76
78
  const usfmRes = await fetch(usfmUrl);
77
79
  if (!usfmRes.ok) throw new Error(`Failed to download USFM file for ${book}`);
78
80
  const usfmData = await usfmRes.json();
@@ -120,4 +122,4 @@ export function parseUsfmToVerses(usfm) {
120
122
  }
121
123
 
122
124
  return versesObj;
123
- }
125
+ }